diff --git a/.gitattributes b/.gitattributes index a5905e27279b18bd791be2190749cbaa579ae920..4a2228877876ecc93631f751e80f64e2ffbd901d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -38,3 +38,9 @@ checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text checkpoint-3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text checkpoint-4000/tokenizer.json filter=lfs diff=lfs merge=lfs -text checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-10650/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-6000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-7000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-8000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-9000/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md index 9296ccc9d50720d62d591e1ba43165033a5c8819..d314d80b1172dc083bc91587b90829a50c241b6f 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ print(output["generated_text"]) ## Training procedure -[Visualize in Weights & Biases](https://wandb.ai/ahmed-heakl/huggingface/runs/ma76091v) +[Visualize in Weights & Biases](https://wandb.ai/ahmed-heakl/huggingface/runs/8iigi6ha) This model was trained with SFT. diff --git a/checkpoint-1000/model-00002-of-00002.safetensors b/checkpoint-1000/model-00002-of-00002.safetensors index 2020cf600ecec2842d0feaff8c9e558bf58124db..7bafbefb10bbb5e23f7a86a86f8c0211292420d6 100644 --- a/checkpoint-1000/model-00002-of-00002.safetensors +++ b/checkpoint-1000/model-00002-of-00002.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7a9e6dc11d15833075e5e64306d68f8d615232768ab82e58d82d3867d872b08 +oid sha256:625c065241e49b903540eb6942c2c6fa3f781a3f8f221e7296e0dc0d0ad81a06 size 1481790520 diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt index 4961bdb2863a4f446133b24ecdb2aceb1ce82a9e..2a9bfa45bdb2517fe106b3fe388c992484f9ebf3 100644 --- a/checkpoint-1000/optimizer.pt +++ b/checkpoint-1000/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2550bac9fbfaea5f705116e90c64524a127406e9a64746d3d4b7f3ceff064aa +oid sha256:028ef400bd2e3463a5e55ce90dac80de32802b1d03b9b8071169baa5eb4412a4 size 44191162 diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json index df491ed1b26b682fae6d6b145e331669dde9f359..f72e44e98692c041300d1ddc1ace3acf35a9e913 100644 --- a/checkpoint-1000/trainer_state.json +++ b/checkpoint-1000/trainer_state.json @@ -12,18 +12,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 31.0, + "avg_layers": 25.0, "epoch": 0.009392427355444672, - "f1_execute": 0.4864864945411682, + "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 2.40625, + "grad_norm": 2.25, "learning_rate": 2e-06, - "loss": 0.5484, - "macro_f1": 0.1621621698141098, + "loss": 0.4974, + "macro_f1": 0.23255813121795654, "num_tokens": 3175.0, "repeat_count": 0.0, - "routers_loss": 0.503563642501831, + "routers_loss": 0.4339469373226166, "skip_count": 0.0, "step": 2, "text_loss": 0.3330848515033722 @@ -31,18 +31,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 23.0, "epoch": 0.018784854710889344, - "f1_execute": 0.4864864945411682, + "f1_execute": 0.7272726893424988, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.9140625, + "grad_norm": 1.8359375, "learning_rate": 6e-06, - "loss": 0.536, - "macro_f1": 0.1621621698141098, + "loss": 0.4988, + "macro_f1": 0.24242423474788666, "num_tokens": 5816.0, "repeat_count": 0.0, - "routers_loss": 0.4589468538761139, + "routers_loss": 0.4511934816837311, "skip_count": 1.0, "step": 4, "text_loss": 0.4571273922920227 @@ -50,37 +50,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 32.0, + "avg_layers": 28.0, "epoch": 0.02817728206633402, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 2.375, + "grad_norm": 2.234375, "learning_rate": 1e-05, - "loss": 0.5469, - "macro_f1": 0.19999998807907104, + "loss": 0.5113, + "macro_f1": 0.222222238779068, "num_tokens": 9739.0, "repeat_count": 0.0, - "routers_loss": 0.5736724138259888, + "routers_loss": 0.49306994676589966, "skip_count": 0.0, "step": 6, "text_loss": 0.41060560941696167 }, { - "acc_repeat": 1.0, - "acc_skip": 0.5, - "avg_layers": 33.0, + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 0.03756970942177869, - "f1_execute": 0.47058823704719543, - "f1_repeat": 0.1538461595773697, - "f1_skip": 0.222222238779068, - "grad_norm": 1.8515625, + "f1_execute": 0.5641025900840759, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7265625, "learning_rate": 1.4e-05, - "loss": 0.5291, - "macro_f1": 0.28221890330314636, + "loss": 0.4766, + "macro_f1": 0.18803420662879944, "num_tokens": 12869.0, "repeat_count": 1.0, - "routers_loss": 0.49970296025276184, + "routers_loss": 0.48872503638267517, "skip_count": 2.0, "step": 8, "text_loss": 0.36678561568260193 @@ -88,37 +88,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 27.0, "epoch": 0.046962136777223364, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.953125, + "grad_norm": 1.78125, "learning_rate": 1.8e-05, - "loss": 0.5316, - "macro_f1": 0.19999998807907104, + "loss": 0.4806, + "macro_f1": 0.23255813121795654, "num_tokens": 15845.0, "repeat_count": 0.0, - "routers_loss": 0.5153562426567078, + "routers_loss": 0.45077216625213623, "skip_count": 0.0, "step": 10, "text_loss": 0.5597779154777527 }, { - "acc_repeat": 0.0, + "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, - "avg_layers": 34.0, + "avg_layers": 26.0, "epoch": 0.05635456413266804, - "f1_execute": 0.5714285373687744, - "f1_repeat": 0.0, - "f1_skip": 0.25, - "grad_norm": 1.6328125, + "f1_execute": 0.7179487347602844, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.20000000298023224, + "grad_norm": 1.5390625, "learning_rate": 2.2e-05, - "loss": 0.5051, - "macro_f1": 0.2738095223903656, + "loss": 0.4557, + "macro_f1": 0.40122103691101074, "num_tokens": 19353.0, "repeat_count": 2.0, - "routers_loss": 0.46214747428894043, + "routers_loss": 0.4130440056324005, "skip_count": 3.0, "step": 12, "text_loss": 0.2056603729724884 @@ -126,37 +126,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 27.0, "epoch": 0.06574699148811271, - "f1_execute": 0.5263157486915588, + "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 2.671875, + "grad_norm": 2.4375, "learning_rate": 2.6e-05, - "loss": 0.5653, - "macro_f1": 0.17543858289718628, + "loss": 0.5129, + "macro_f1": 0.23255813121795654, "num_tokens": 22675.0, "repeat_count": 0.0, - "routers_loss": 0.5300976634025574, + "routers_loss": 0.4582902193069458, "skip_count": 0.0, "step": 14, "text_loss": 0.32989829778671265 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 34.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 0.07513941884355738, - "f1_execute": 0.6153846383094788, + "f1_execute": 0.6829268336296082, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 1.8828125, + "f1_skip": 0.2222222238779068, + "grad_norm": 1.7421875, "learning_rate": 3e-05, - "loss": 0.5225, - "macro_f1": 0.20512822270393372, + "loss": 0.4729, + "macro_f1": 0.3017163574695587, "num_tokens": 26022.0, "repeat_count": 0.0, - "routers_loss": 0.473240464925766, + "routers_loss": 0.42910993099212646, "skip_count": 1.0, "step": 16, "text_loss": 0.1353905349969864 @@ -164,18 +164,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 38.0, + "avg_layers": 27.0, "epoch": 0.08453184619900206, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.7555555105209351, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.6015625, + "grad_norm": 1.4765625, "learning_rate": 3.4000000000000007e-05, - "loss": 0.4867, - "macro_f1": 0.19999998807907104, + "loss": 0.4274, + "macro_f1": 0.2518518567085266, "num_tokens": 29251.0, "repeat_count": 0.0, - "routers_loss": 0.4795944094657898, + "routers_loss": 0.3990713059902191, "skip_count": 0.0, "step": 18, "text_loss": 0.3806765377521515 @@ -183,18 +183,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 36.0, + "avg_layers": 26.0, "epoch": 0.09392427355444673, - "f1_execute": 0.6153846383094788, - "f1_repeat": 0.1538461595773697, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.2857142984867096, "f1_skip": 0.0, - "grad_norm": 1.3984375, + "grad_norm": 1.3125, "learning_rate": 3.8e-05, - "loss": 0.4718, - "macro_f1": 0.25641027092933655, + "loss": 0.4261, + "macro_f1": 0.3228803873062134, "num_tokens": 32545.0, "repeat_count": 1.0, - "routers_loss": 0.41872408986091614, + "routers_loss": 0.40146592259407043, "skip_count": 0.0, "step": 20, "text_loss": 0.25648367404937744 @@ -202,18 +202,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 26.0, "epoch": 0.1033167009098914, - "f1_execute": 0.6341463327407837, + "f1_execute": 0.7272727489471436, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.7734375, + "grad_norm": 1.625, "learning_rate": 4.2000000000000004e-05, - "loss": 0.4472, - "macro_f1": 0.21138212084770203, + "loss": 0.404, + "macro_f1": 0.24242424964904785, "num_tokens": 36560.0, "repeat_count": 0.0, - "routers_loss": 0.4152105450630188, + "routers_loss": 0.372715026140213, "skip_count": 0.0, "step": 22, "text_loss": 0.2799522578716278 @@ -221,18 +221,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 32.0, + "avg_layers": 27.0, "epoch": 0.11270912826533608, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.7555555105209351, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.8046875, + "grad_norm": 1.6328125, "learning_rate": 4.6e-05, - "loss": 0.4554, - "macro_f1": 0.19999998807907104, + "loss": 0.4218, + "macro_f1": 0.2518518567085266, "num_tokens": 39597.0, "repeat_count": 0.0, - "routers_loss": 0.47541096806526184, + "routers_loss": 0.4504941403865814, "skip_count": 0.0, "step": 24, "text_loss": 0.6635695695877075 @@ -240,18 +240,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 34.0, + "avg_layers": 27.0, "epoch": 0.12210155562078075, - "f1_execute": 0.7826087474822998, + "f1_execute": 0.8085106015205383, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.875, + "grad_norm": 1.7109375, "learning_rate": 5e-05, - "loss": 0.4182, - "macro_f1": 0.2608695924282074, + "loss": 0.3886, + "macro_f1": 0.26950353384017944, "num_tokens": 43080.0, "repeat_count": 0.0, - "routers_loss": 0.37319275736808777, + "routers_loss": 0.3498791456222534, "skip_count": 0.0, "step": 26, "text_loss": 0.7035041451454163 @@ -259,18 +259,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 0.13149398297622542, - "f1_execute": 0.7826087474822998, + "f1_execute": 0.8085106015205383, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.4375, + "grad_norm": 1.34375, "learning_rate": 5.4e-05, - "loss": 0.3991, - "macro_f1": 0.2608695924282074, + "loss": 0.3724, + "macro_f1": 0.26950353384017944, "num_tokens": 46406.0, "repeat_count": 0.0, - "routers_loss": 0.3604123294353485, + "routers_loss": 0.31265875697135925, "skip_count": 0.0, "step": 28, "text_loss": 0.6388277411460876 @@ -280,16 +280,16 @@ "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.1408864103316701, - "f1_execute": 0.8979591727256775, + "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.421875, + "grad_norm": 1.2578125, "learning_rate": 5.800000000000001e-05, - "loss": 0.3827, - "macro_f1": 0.2993197441101074, + "loss": 0.341, + "macro_f1": 0.2857142686843872, "num_tokens": 49966.0, "repeat_count": 0.0, - "routers_loss": 0.35880225896835327, + "routers_loss": 0.3200918138027191, "skip_count": 2.0, "step": 30, "text_loss": 0.17372547090053558 @@ -297,18 +297,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 24.0, + "avg_layers": 25.0, "epoch": 0.15027883768711475, - "f1_execute": 0.9200000166893005, + "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.4609375, + "grad_norm": 1.4140625, "learning_rate": 6.2e-05, - "loss": 0.3452, - "macro_f1": 0.30666667222976685, + "loss": 0.3207, + "macro_f1": 0.2857142686843872, "num_tokens": 53378.0, "repeat_count": 1.0, - "routers_loss": 0.31086465716362, + "routers_loss": 0.32304447889328003, "skip_count": 1.0, "step": 32, "text_loss": 0.18196581304073334 @@ -316,18 +316,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 25.0, "epoch": 0.15967126504255943, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.3671875, + "grad_norm": 1.46875, "learning_rate": 6.6e-05, - "loss": 0.3283, - "macro_f1": 0.3144654333591461, + "loss": 0.3304, + "macro_f1": 0.3006536364555359, "num_tokens": 56933.0, "repeat_count": 0.0, - "routers_loss": 0.2674171030521393, + "routers_loss": 0.24814388155937195, "skip_count": 0.0, "step": 34, "text_loss": 0.28823015093803406 @@ -335,18 +335,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 0.16906369239800412, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.1015625, + "grad_norm": 1.1171875, "learning_rate": 7.000000000000001e-05, - "loss": 0.2849, - "macro_f1": 0.3205128312110901, + "loss": 0.2778, + "macro_f1": 0.3006536066532135, "num_tokens": 60744.0, "repeat_count": 1.0, - "routers_loss": 0.24587315320968628, + "routers_loss": 0.22411039471626282, "skip_count": 0.0, "step": 36, "text_loss": 0.5260357856750488 @@ -354,18 +354,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 31.0, + "avg_layers": 27.0, "epoch": 0.17845611975344877, - "f1_execute": 0.8085106015205383, + "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.3046875, + "grad_norm": 1.484375, "learning_rate": 7.4e-05, - "loss": 0.2616, - "macro_f1": 0.26950353384017944, + "loss": 0.2738, + "macro_f1": 0.2857142984867096, "num_tokens": 64900.0, "repeat_count": 0.0, - "routers_loss": 0.32050269842147827, + "routers_loss": 0.44355395436286926, "skip_count": 0.0, "step": 38, "text_loss": 0.5382097363471985 @@ -373,18 +373,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 0.18784854710889345, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.1796875, + "grad_norm": 1.3828125, "learning_rate": 7.8e-05, - "loss": 0.2084, - "macro_f1": 0.3144654333591461, + "loss": 0.2137, + "macro_f1": 0.3076923191547394, "num_tokens": 68000.0, "repeat_count": 0.0, - "routers_loss": 0.15196125209331512, + "routers_loss": 0.202330082654953, "skip_count": 0.0, "step": 40, "text_loss": 0.5946118831634521 @@ -392,18 +392,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 25.0, "epoch": 0.19724097446433814, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.61328125, + "grad_norm": 0.78125, "learning_rate": 8.2e-05, - "loss": 0.1947, + "loss": 0.21, "macro_f1": 0.3144654333591461, "num_tokens": 70529.0, "repeat_count": 0.0, - "routers_loss": 0.14121046662330627, + "routers_loss": 0.18023855984210968, "skip_count": 0.0, "step": 42, "text_loss": 0.5550904273986816 @@ -416,13 +416,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.50390625, + "grad_norm": 0.609375, "learning_rate": 8.599999999999999e-05, - "loss": 0.1884, + "loss": 0.1918, "macro_f1": 0.32098764181137085, "num_tokens": 73427.0, "repeat_count": 2.0, - "routers_loss": 0.21312278509140015, + "routers_loss": 0.2101590931415558, "skip_count": 0.0, "step": 44, "text_loss": 0.4636923372745514 @@ -435,13 +435,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.45703125, + "grad_norm": 0.53125, "learning_rate": 8.999999999999999e-05, - "loss": 0.166, + "loss": 0.1881, "macro_f1": 0.3333333432674408, "num_tokens": 76472.0, "repeat_count": 0.0, - "routers_loss": 0.1184137836098671, + "routers_loss": 0.11800424009561539, "skip_count": 0.0, "step": 46, "text_loss": 0.4187001883983612 @@ -454,13 +454,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.62890625, + "grad_norm": 0.953125, "learning_rate": 9.400000000000001e-05, - "loss": 0.1313, + "loss": 0.1446, "macro_f1": 0.3272727429866791, "num_tokens": 79124.0, "repeat_count": 1.0, - "routers_loss": 0.10897563397884369, + "routers_loss": 0.11632519960403442, "skip_count": 0.0, "step": 48, "text_loss": 0.2253919243812561 @@ -468,18 +468,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 0.2348106838861168, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.4375, + "grad_norm": 0.58984375, "learning_rate": 9.800000000000001e-05, - "loss": 0.1531, - "macro_f1": 0.3272727429866791, + "loss": 0.1543, + "macro_f1": 0.32098767161369324, "num_tokens": 81980.0, "repeat_count": 1.0, - "routers_loss": 0.09979952871799469, + "routers_loss": 0.09669367223978043, "skip_count": 0.0, "step": 50, "text_loss": 0.6053179502487183 @@ -487,18 +487,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 0.2442031112415615, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.515625, + "grad_norm": 0.8515625, "learning_rate": 0.000102, - "loss": 0.1265, - "macro_f1": 0.3272727429866791, + "loss": 0.1393, + "macro_f1": 0.32098764181137085, "num_tokens": 85236.0, "repeat_count": 0.0, - "routers_loss": 0.05543195456266403, + "routers_loss": 0.12471720576286316, "skip_count": 0.0, "step": 52, "text_loss": 0.6027331948280334 @@ -511,13 +511,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.328125, + "grad_norm": 0.421875, "learning_rate": 0.000106, - "loss": 0.1436, + "loss": 0.1473, "macro_f1": 0.32098764181137085, "num_tokens": 88238.0, "repeat_count": 0.0, - "routers_loss": 0.15049344301223755, + "routers_loss": 0.1376056969165802, "skip_count": 2.0, "step": 54, "text_loss": 0.2861751616001129 @@ -530,13 +530,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.263671875, + "grad_norm": 0.35546875, "learning_rate": 0.00011, - "loss": 0.1021, + "loss": 0.1082, "macro_f1": 0.3333333432674408, "num_tokens": 91056.0, "repeat_count": 0.0, - "routers_loss": 0.07367338240146637, + "routers_loss": 0.07449393719434738, "skip_count": 0.0, "step": 56, "text_loss": 0.48106974363327026 @@ -544,18 +544,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 26.0, "epoch": 0.2723803933078955, - "f1_execute": 1.0, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25, + "grad_norm": 0.271484375, "learning_rate": 0.000114, - "loss": 0.114, - "macro_f1": 0.3333333432674408, + "loss": 0.1123, + "macro_f1": 0.32098764181137085, "num_tokens": 94987.0, "repeat_count": 0.0, - "routers_loss": 0.03782692551612854, + "routers_loss": 0.07064720243215561, "skip_count": 0.0, "step": 58, "text_loss": 0.3554874658584595 @@ -568,13 +568,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.333984375, + "grad_norm": 0.5390625, "learning_rate": 0.000118, - "loss": 0.1197, + "loss": 0.1234, "macro_f1": 0.32098764181137085, "num_tokens": 97909.0, "repeat_count": 0.0, - "routers_loss": 0.14074955880641937, + "routers_loss": 0.16835889220237732, "skip_count": 2.0, "step": 60, "text_loss": 0.5475804805755615 @@ -587,13 +587,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.2353515625, "learning_rate": 0.000122, - "loss": 0.1174, + "loss": 0.1224, "macro_f1": 0.3333333432674408, "num_tokens": 101043.0, "repeat_count": 0.0, - "routers_loss": 0.058013737201690674, + "routers_loss": 0.06127442046999931, "skip_count": 0.0, "step": 62, "text_loss": 0.5966938734054565 @@ -606,13 +606,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.212890625, "learning_rate": 0.000126, - "loss": 0.0911, + "loss": 0.0931, "macro_f1": 0.3333333432674408, "num_tokens": 104103.0, "repeat_count": 0.0, - "routers_loss": 0.04936821386218071, + "routers_loss": 0.047825805842876434, "skip_count": 0.0, "step": 64, "text_loss": 0.5480486750602722 @@ -625,13 +625,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.220703125, + "grad_norm": 0.2294921875, "learning_rate": 0.00013000000000000002, - "loss": 0.1107, + "loss": 0.1088, "macro_f1": 0.3006536364555359, "num_tokens": 107009.0, "repeat_count": 1.0, - "routers_loss": 0.2628525495529175, + "routers_loss": 0.275174081325531, "skip_count": 4.0, "step": 66, "text_loss": 0.41714492440223694 @@ -644,13 +644,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.1923828125, "learning_rate": 0.000134, - "loss": 0.1109, + "loss": 0.1123, "macro_f1": 0.3333333432674408, "num_tokens": 110486.0, "repeat_count": 0.0, - "routers_loss": 0.02859785594046116, + "routers_loss": 0.029025178402662277, "skip_count": 0.0, "step": 68, "text_loss": 0.6775627732276917 @@ -663,13 +663,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.298828125, + "grad_norm": 0.314453125, "learning_rate": 0.00013800000000000002, - "loss": 0.1067, + "loss": 0.1049, "macro_f1": 0.3272727429866791, "num_tokens": 113878.0, "repeat_count": 0.0, - "routers_loss": 0.10459086298942566, + "routers_loss": 0.10141710191965103, "skip_count": 1.0, "step": 70, "text_loss": 0.6678873896598816 @@ -682,13 +682,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2109375, + "grad_norm": 0.248046875, "learning_rate": 0.00014199999999999998, - "loss": 0.1166, + "loss": 0.1119, "macro_f1": 0.3272727429866791, "num_tokens": 116989.0, "repeat_count": 0.0, - "routers_loss": 0.0718551054596901, + "routers_loss": 0.08002066612243652, "skip_count": 1.0, "step": 72, "text_loss": 0.405692994594574 @@ -701,13 +701,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1787109375, "learning_rate": 0.000146, - "loss": 0.1007, + "loss": 0.0944, "macro_f1": 0.3144654333591461, "num_tokens": 119883.0, "repeat_count": 0.0, - "routers_loss": 0.1850946843624115, + "routers_loss": 0.1867009848356247, "skip_count": 3.0, "step": 74, "text_loss": 0.44616150856018066 @@ -720,13 +720,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.34375, + "grad_norm": 0.333984375, "learning_rate": 0.00015, - "loss": 0.1019, + "loss": 0.1003, "macro_f1": 0.32098764181137085, "num_tokens": 123325.0, "repeat_count": 0.0, - "routers_loss": 0.09809529036283493, + "routers_loss": 0.07042168825864792, "skip_count": 2.0, "step": 76, "text_loss": 0.11340200901031494 @@ -739,13 +739,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.259765625, + "grad_norm": 0.26171875, "learning_rate": 0.000154, - "loss": 0.1088, + "loss": 0.1066, "macro_f1": 0.32098764181137085, "num_tokens": 126131.0, "repeat_count": 0.0, - "routers_loss": 0.11277207732200623, + "routers_loss": 0.11535373330116272, "skip_count": 2.0, "step": 78, "text_loss": 0.3269135355949402 @@ -758,13 +758,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2412109375, + "grad_norm": 0.255859375, "learning_rate": 0.000158, - "loss": 0.0866, + "loss": 0.0891, "macro_f1": 0.3272727429866791, "num_tokens": 130349.0, "repeat_count": 0.0, - "routers_loss": 0.09079254418611526, + "routers_loss": 0.09497501701116562, "skip_count": 1.0, "step": 80, "text_loss": 0.15273472666740417 @@ -777,13 +777,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1943359375, "learning_rate": 0.000162, - "loss": 0.0928, + "loss": 0.0929, "macro_f1": 0.3333333432674408, "num_tokens": 133607.0, "repeat_count": 0.0, - "routers_loss": 0.02900076098740101, + "routers_loss": 0.030639523640275, "skip_count": 0.0, "step": 82, "text_loss": 0.282884806394577 @@ -796,13 +796,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.1806640625, "learning_rate": 0.00016600000000000002, - "loss": 0.1251, + "loss": 0.1254, "macro_f1": 0.3272727429866791, "num_tokens": 136694.0, "repeat_count": 0.0, - "routers_loss": 0.0763339251279831, + "routers_loss": 0.07906441390514374, "skip_count": 1.0, "step": 84, "text_loss": 0.459094375371933 @@ -817,11 +817,11 @@ "f1_skip": 0.0, "grad_norm": 0.212890625, "learning_rate": 0.00017, - "loss": 0.1064, + "loss": 0.1071, "macro_f1": 0.3144654333591461, "num_tokens": 139966.0, "repeat_count": 1.0, - "routers_loss": 0.13191410899162292, + "routers_loss": 0.1124570444226265, "skip_count": 2.0, "step": 86, "text_loss": 0.29985448718070984 @@ -834,13 +834,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.25390625, "learning_rate": 0.000174, - "loss": 0.1055, + "loss": 0.1031, "macro_f1": 0.32098764181137085, "num_tokens": 142788.0, "repeat_count": 2.0, - "routers_loss": 0.21200031042099, + "routers_loss": 0.1966402679681778, "skip_count": 0.0, "step": 88, "text_loss": 0.6435291767120361 @@ -853,13 +853,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.318359375, + "grad_norm": 0.349609375, "learning_rate": 0.000178, - "loss": 0.0971, + "loss": 0.0963, "macro_f1": 0.3333333432674408, "num_tokens": 146192.0, "repeat_count": 0.0, - "routers_loss": 0.031911369413137436, + "routers_loss": 0.0325632207095623, "skip_count": 0.0, "step": 90, "text_loss": 0.35170626640319824 @@ -872,13 +872,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.216796875, + "grad_norm": 0.2265625, "learning_rate": 0.000182, - "loss": 0.1056, + "loss": 0.1073, "macro_f1": 0.32098764181137085, "num_tokens": 149792.0, "repeat_count": 1.0, - "routers_loss": 0.14131835103034973, + "routers_loss": 0.15115146338939667, "skip_count": 1.0, "step": 92, "text_loss": 0.83159339427948 @@ -891,13 +891,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.205078125, "learning_rate": 0.000186, - "loss": 0.1059, + "loss": 0.1073, "macro_f1": 0.3333333432674408, "num_tokens": 152766.0, "repeat_count": 0.0, - "routers_loss": 0.04137955233454704, + "routers_loss": 0.043313540518283844, "skip_count": 0.0, "step": 94, "text_loss": 0.49707934260368347 @@ -910,13 +910,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.2138671875, "learning_rate": 0.00019, - "loss": 0.0934, + "loss": 0.0947, "macro_f1": 0.3333333432674408, "num_tokens": 156112.0, "repeat_count": 0.0, - "routers_loss": 0.03163003921508789, + "routers_loss": 0.032021280378103256, "skip_count": 0.0, "step": 96, "text_loss": 0.27608928084373474 @@ -929,13 +929,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.2099609375, "learning_rate": 0.000194, - "loss": 0.0847, + "loss": 0.0846, "macro_f1": 0.3076923191547394, "num_tokens": 159454.0, "repeat_count": 2.0, - "routers_loss": 0.2567490339279175, + "routers_loss": 0.24473154544830322, "skip_count": 2.0, "step": 98, "text_loss": 0.6026689410209656 @@ -948,13 +948,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.30859375, + "grad_norm": 0.271484375, "learning_rate": 0.00019800000000000002, - "loss": 0.1077, + "loss": 0.1028, "macro_f1": 0.32098764181137085, "num_tokens": 163661.0, "repeat_count": 0.0, - "routers_loss": 0.11468870937824249, + "routers_loss": 0.11468276381492615, "skip_count": 2.0, "step": 100, "text_loss": 0.46733155846595764 @@ -967,13 +967,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.17578125, + "grad_norm": 0.1806640625, "learning_rate": 0.000202, - "loss": 0.1131, + "loss": 0.1089, "macro_f1": 0.3333333432674408, "num_tokens": 167134.0, "repeat_count": 0.0, - "routers_loss": 0.02124219387769699, + "routers_loss": 0.021144939586520195, "skip_count": 0.0, "step": 102, "text_loss": 0.6362994909286499 @@ -986,13 +986,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.1943359375, "learning_rate": 0.000206, - "loss": 0.0624, + "loss": 0.0621, "macro_f1": 0.3272727429866791, "num_tokens": 170433.0, "repeat_count": 0.0, - "routers_loss": 0.06983796507120132, + "routers_loss": 0.06594710797071457, "skip_count": 1.0, "step": 104, "text_loss": 0.4515477120876312 @@ -1005,13 +1005,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.1591796875, "learning_rate": 0.00021, - "loss": 0.0951, + "loss": 0.0929, "macro_f1": 0.3333333432674408, "num_tokens": 173387.0, "repeat_count": 0.0, - "routers_loss": 0.03467355668544769, + "routers_loss": 0.032923027873039246, "skip_count": 0.0, "step": 106, "text_loss": 0.6638453006744385 @@ -1024,13 +1024,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.240234375, "learning_rate": 0.000214, - "loss": 0.0881, + "loss": 0.0883, "macro_f1": 0.3272727429866791, "num_tokens": 176170.0, "repeat_count": 1.0, - "routers_loss": 0.08142061531543732, + "routers_loss": 0.08034781366586685, "skip_count": 0.0, "step": 108, "text_loss": 1.186936855316162 @@ -1043,13 +1043,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.267578125, "learning_rate": 0.000218, - "loss": 0.0795, + "loss": 0.0794, "macro_f1": 0.3272727429866791, "num_tokens": 179877.0, "repeat_count": 0.0, - "routers_loss": 0.08327355235815048, + "routers_loss": 0.07814185321331024, "skip_count": 1.0, "step": 110, "text_loss": 0.5488709211349487 @@ -1062,13 +1062,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.2353515625, "learning_rate": 0.000222, - "loss": 0.0943, + "loss": 0.0946, "macro_f1": 0.3333333432674408, "num_tokens": 182726.0, "repeat_count": 0.0, - "routers_loss": 0.019890006631612778, + "routers_loss": 0.01884695515036583, "skip_count": 0.0, "step": 112, "text_loss": 0.5195863842964172 @@ -1081,13 +1081,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2001953125, + "grad_norm": 0.19921875, "learning_rate": 0.00022600000000000002, - "loss": 0.0933, + "loss": 0.0974, "macro_f1": 0.32098764181137085, "num_tokens": 185624.0, "repeat_count": 0.0, - "routers_loss": 0.09992363303899765, + "routers_loss": 0.09657823294401169, "skip_count": 2.0, "step": 114, "text_loss": 0.43858134746551514 @@ -1100,13 +1100,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.3046875, "learning_rate": 0.00023, - "loss": 0.0762, + "loss": 0.0753, "macro_f1": 0.3333333432674408, "num_tokens": 188155.0, "repeat_count": 0.0, - "routers_loss": 0.014119029976427555, + "routers_loss": 0.01463601179420948, "skip_count": 0.0, "step": 116, "text_loss": 0.392981618642807 @@ -1119,13 +1119,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.423828125, + "grad_norm": 0.439453125, "learning_rate": 0.00023400000000000002, - "loss": 0.0842, + "loss": 0.0843, "macro_f1": 0.3333333432674408, "num_tokens": 190970.0, "repeat_count": 0.0, - "routers_loss": 0.03976766765117645, + "routers_loss": 0.03859659656882286, "skip_count": 0.0, "step": 118, "text_loss": 0.309179425239563 @@ -1138,13 +1138,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.2255859375, "learning_rate": 0.00023799999999999998, - "loss": 0.0517, + "loss": 0.053, "macro_f1": 0.3333333432674408, "num_tokens": 193988.0, "repeat_count": 0.0, - "routers_loss": 0.017428619787096977, + "routers_loss": 0.019092386588454247, "skip_count": 0.0, "step": 120, "text_loss": 0.48543134331703186 @@ -1157,13 +1157,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.296875, + "grad_norm": 0.35546875, "learning_rate": 0.000242, - "loss": 0.1134, + "loss": 0.1203, "macro_f1": 0.3272727429866791, "num_tokens": 196475.0, "repeat_count": 0.0, - "routers_loss": 0.06965513527393341, + "routers_loss": 0.0619138665497303, "skip_count": 1.0, "step": 122, "text_loss": 0.4615364074707031 @@ -1176,13 +1176,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1875, "learning_rate": 0.000246, - "loss": 0.0984, + "loss": 0.1002, "macro_f1": 0.3272727429866791, "num_tokens": 200045.0, "repeat_count": 1.0, - "routers_loss": 0.10476501286029816, + "routers_loss": 0.09752107411623001, "skip_count": 0.0, "step": 124, "text_loss": 0.15802054107189178 @@ -1195,13 +1195,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.1728515625, "learning_rate": 0.00025, - "loss": 0.0771, + "loss": 0.0773, "macro_f1": 0.3333333432674408, "num_tokens": 203214.0, "repeat_count": 0.0, - "routers_loss": 0.028317544609308243, + "routers_loss": 0.02896115928888321, "skip_count": 0.0, "step": 126, "text_loss": 0.4543360471725464 @@ -1214,13 +1214,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.390625, + "grad_norm": 0.4296875, "learning_rate": 0.000254, - "loss": 0.0933, + "loss": 0.0973, "macro_f1": 0.3333333432674408, "num_tokens": 206168.0, "repeat_count": 0.0, - "routers_loss": 0.012766432017087936, + "routers_loss": 0.011423567309975624, "skip_count": 0.0, "step": 128, "text_loss": 0.4730179011821747 @@ -1233,13 +1233,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.353515625, + "grad_norm": 0.365234375, "learning_rate": 0.00025800000000000004, - "loss": 0.0989, + "loss": 0.099, "macro_f1": 0.3333333432674408, "num_tokens": 209907.0, "repeat_count": 0.0, - "routers_loss": 0.021400077268481255, + "routers_loss": 0.01957600563764572, "skip_count": 0.0, "step": 130, "text_loss": 0.45122358202934265 @@ -1252,13 +1252,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.2060546875, "learning_rate": 0.000262, - "loss": 0.0873, + "loss": 0.0868, "macro_f1": 0.3272727429866791, "num_tokens": 213521.0, "repeat_count": 0.0, - "routers_loss": 0.05025051161646843, + "routers_loss": 0.04882373288273811, "skip_count": 1.0, "step": 132, "text_loss": 0.4341491758823395 @@ -1271,13 +1271,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1611328125, + "grad_norm": 0.1708984375, "learning_rate": 0.000266, - "loss": 0.085, + "loss": 0.0834, "macro_f1": 0.3333333432674408, "num_tokens": 216484.0, "repeat_count": 0.0, - "routers_loss": 0.017420046031475067, + "routers_loss": 0.016083380207419395, "skip_count": 0.0, "step": 134, "text_loss": 0.46990111470222473 @@ -1290,13 +1290,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2041015625, + "grad_norm": 0.220703125, "learning_rate": 0.00027, - "loss": 0.086, + "loss": 0.0863, "macro_f1": 0.3333333432674408, "num_tokens": 219398.0, "repeat_count": 0.0, - "routers_loss": 0.018217921257019043, + "routers_loss": 0.01733536459505558, "skip_count": 0.0, "step": 136, "text_loss": 0.4455361068248749 @@ -1309,13 +1309,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1806640625, "learning_rate": 0.00027400000000000005, - "loss": 0.0985, + "loss": 0.0997, "macro_f1": 0.3333333432674408, "num_tokens": 222430.0, "repeat_count": 0.0, - "routers_loss": 0.012350660748779774, + "routers_loss": 0.01332803163677454, "skip_count": 0.0, "step": 138, "text_loss": 0.47699397802352905 @@ -1328,13 +1328,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.302734375, + "grad_norm": 0.333984375, "learning_rate": 0.00027800000000000004, "loss": 0.0922, "macro_f1": 0.3144654333591461, "num_tokens": 225458.0, "repeat_count": 1.0, - "routers_loss": 0.14993029832839966, + "routers_loss": 0.14924728870391846, "skip_count": 2.0, "step": 140, "text_loss": 0.5858222842216492 @@ -1347,13 +1347,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.251953125, + "grad_norm": 0.25, "learning_rate": 0.00028199999999999997, - "loss": 0.0791, + "loss": 0.0798, "macro_f1": 0.3144654333591461, "num_tokens": 229365.0, "repeat_count": 1.0, - "routers_loss": 0.17921413481235504, + "routers_loss": 0.1860177218914032, "skip_count": 2.0, "step": 142, "text_loss": 0.5003137588500977 @@ -1366,13 +1366,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.2294921875, "learning_rate": 0.00028599999999999996, - "loss": 0.0535, + "loss": 0.054, "macro_f1": 0.32098764181137085, "num_tokens": 231787.0, "repeat_count": 1.0, - "routers_loss": 0.1420905590057373, + "routers_loss": 0.16498211026191711, "skip_count": 1.0, "step": 144, "text_loss": 0.5026470422744751 @@ -1385,13 +1385,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.29296875, + "grad_norm": 0.306640625, "learning_rate": 0.00029, - "loss": 0.0956, + "loss": 0.0936, "macro_f1": 0.32098764181137085, "num_tokens": 235014.0, "repeat_count": 1.0, - "routers_loss": 0.12468750029802322, + "routers_loss": 0.11801310628652573, "skip_count": 1.0, "step": 146, "text_loss": 0.611888587474823 @@ -1404,13 +1404,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1806640625, "learning_rate": 0.000294, - "loss": 0.0879, + "loss": 0.0878, "macro_f1": 0.3333333432674408, "num_tokens": 238210.0, "repeat_count": 0.0, - "routers_loss": 0.024295611307024956, + "routers_loss": 0.02422776259481907, "skip_count": 0.0, "step": 148, "text_loss": 0.2876914143562317 @@ -1423,13 +1423,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.1728515625, "learning_rate": 0.000298, - "loss": 0.087, + "loss": 0.0858, "macro_f1": 0.32098764181137085, "num_tokens": 241582.0, "repeat_count": 0.0, - "routers_loss": 0.07016433775424957, + "routers_loss": 0.07282499223947525, "skip_count": 2.0, "step": 150, "text_loss": 0.3919292390346527 @@ -1442,13 +1442,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3828125, + "grad_norm": 0.37890625, "learning_rate": 0.000302, - "loss": 0.0782, + "loss": 0.0797, "macro_f1": 0.32098764181137085, "num_tokens": 244621.0, "repeat_count": 1.0, - "routers_loss": 0.18942493200302124, + "routers_loss": 0.20659038424491882, "skip_count": 1.0, "step": 152, "text_loss": 0.4294498860836029 @@ -1461,13 +1461,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1787109375, "learning_rate": 0.000306, - "loss": 0.0713, + "loss": 0.072, "macro_f1": 0.3333333432674408, "num_tokens": 247833.0, "repeat_count": 0.0, - "routers_loss": 0.02319060079753399, + "routers_loss": 0.02428400330245495, "skip_count": 0.0, "step": 154, "text_loss": 0.5930765867233276 @@ -1480,13 +1480,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.15234375, + "grad_norm": 0.1533203125, "learning_rate": 0.00031, - "loss": 0.0778, + "loss": 0.0772, "macro_f1": 0.3333333432674408, "num_tokens": 251349.0, "repeat_count": 0.0, - "routers_loss": 0.01764747127890587, + "routers_loss": 0.0167869683355093, "skip_count": 0.0, "step": 156, "text_loss": 0.41063904762268066 @@ -1499,13 +1499,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1572265625, "learning_rate": 0.000314, - "loss": 0.0829, + "loss": 0.0821, "macro_f1": 0.3333333432674408, "num_tokens": 254886.0, "repeat_count": 0.0, - "routers_loss": 0.02268100716173649, + "routers_loss": 0.02531604655086994, "skip_count": 0.0, "step": 158, "text_loss": 0.6739020347595215 @@ -1518,13 +1518,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.201171875, "learning_rate": 0.00031800000000000003, - "loss": 0.0889, + "loss": 0.09, "macro_f1": 0.3333333432674408, "num_tokens": 258260.0, "repeat_count": 0.0, - "routers_loss": 0.016952091827988625, + "routers_loss": 0.017772775143384933, "skip_count": 0.0, "step": 160, "text_loss": 0.46873849630355835 @@ -1537,13 +1537,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2216796875, + "grad_norm": 0.224609375, "learning_rate": 0.000322, - "loss": 0.0923, + "loss": 0.0893, "macro_f1": 0.3272727429866791, "num_tokens": 261846.0, "repeat_count": 0.0, - "routers_loss": 0.03669808804988861, + "routers_loss": 0.034902360290288925, "skip_count": 1.0, "step": 162, "text_loss": 0.3727971017360687 @@ -1556,13 +1556,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.212890625, "learning_rate": 0.000326, - "loss": 0.0769, + "loss": 0.076, "macro_f1": 0.3333333432674408, "num_tokens": 264348.0, "repeat_count": 0.0, - "routers_loss": 0.012101447209715843, + "routers_loss": 0.013553355820477009, "skip_count": 0.0, "step": 164, "text_loss": 0.5798237323760986 @@ -1575,13 +1575,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.37109375, + "grad_norm": 0.408203125, "learning_rate": 0.00033, - "loss": 0.0897, + "loss": 0.0926, "macro_f1": 0.32098764181137085, "num_tokens": 267479.0, "repeat_count": 1.0, - "routers_loss": 0.1562056541442871, + "routers_loss": 0.13571743667125702, "skip_count": 1.0, "step": 166, "text_loss": 0.8084776997566223 @@ -1594,13 +1594,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.2431640625, "learning_rate": 0.00033400000000000004, - "loss": 0.0829, + "loss": 0.0817, "macro_f1": 0.32098764181137085, "num_tokens": 270268.0, "repeat_count": 2.0, - "routers_loss": 0.20807914435863495, + "routers_loss": 0.19884146749973297, "skip_count": 0.0, "step": 168, "text_loss": 0.7366134524345398 @@ -1613,13 +1613,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2236328125, + "grad_norm": 0.267578125, "learning_rate": 0.00033800000000000003, - "loss": 0.0987, + "loss": 0.1022, "macro_f1": 0.32098764181137085, "num_tokens": 273518.0, "repeat_count": 1.0, - "routers_loss": 0.1530539095401764, + "routers_loss": 0.15469175577163696, "skip_count": 1.0, "step": 170, "text_loss": 0.27204006910324097 @@ -1632,13 +1632,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.17578125, "learning_rate": 0.000342, - "loss": 0.087, + "loss": 0.0865, "macro_f1": 0.32098764181137085, "num_tokens": 277210.0, "repeat_count": 0.0, - "routers_loss": 0.08004544675350189, + "routers_loss": 0.08603330701589584, "skip_count": 2.0, "step": 172, "text_loss": 0.7137667536735535 @@ -1651,13 +1651,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1767578125, + "grad_norm": 0.189453125, "learning_rate": 0.000346, - "loss": 0.0916, + "loss": 0.0902, "macro_f1": 0.3076923191547394, "num_tokens": 280389.0, "repeat_count": 0.0, - "routers_loss": 0.19228078424930573, + "routers_loss": 0.17851492762565613, "skip_count": 4.0, "step": 174, "text_loss": 0.5148105621337891 @@ -1670,13 +1670,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1611328125, + "grad_norm": 0.1494140625, "learning_rate": 0.00035, - "loss": 0.0863, + "loss": 0.0853, "macro_f1": 0.3333333432674408, "num_tokens": 283501.0, "repeat_count": 0.0, - "routers_loss": 0.024507170543074608, + "routers_loss": 0.021331604570150375, "skip_count": 0.0, "step": 176, "text_loss": 0.301013320684433 @@ -1689,13 +1689,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.2158203125, "learning_rate": 0.000354, - "loss": 0.0898, + "loss": 0.0911, "macro_f1": 0.32098764181137085, "num_tokens": 287154.0, "repeat_count": 0.0, - "routers_loss": 0.05055495724081993, + "routers_loss": 0.057273946702480316, "skip_count": 2.0, "step": 178, "text_loss": 0.4740981459617615 @@ -1708,13 +1708,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.240234375, "learning_rate": 0.000358, - "loss": 0.0865, + "loss": 0.0904, "macro_f1": 0.3272727429866791, "num_tokens": 289929.0, "repeat_count": 0.0, - "routers_loss": 0.03999815881252289, + "routers_loss": 0.04116598889231682, "skip_count": 1.0, "step": 180, "text_loss": 0.4838573932647705 @@ -1727,13 +1727,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.14453125, "learning_rate": 0.000362, - "loss": 0.0983, + "loss": 0.0991, "macro_f1": 0.3333333432674408, "num_tokens": 294293.0, "repeat_count": 0.0, - "routers_loss": 0.025158070027828217, + "routers_loss": 0.027111956849694252, "skip_count": 0.0, "step": 182, "text_loss": 0.7495553493499756 @@ -1746,32 +1746,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.158203125, "learning_rate": 0.000366, - "loss": 0.1015, + "loss": 0.1038, "macro_f1": 0.3333333432674408, "num_tokens": 297730.0, "repeat_count": 0.0, - "routers_loss": 0.01825365424156189, + "routers_loss": 0.019166452810168266, "skip_count": 0.0, "step": 184, "text_loss": 0.534831166267395 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 0.8734957440563546, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.2158203125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2236328125, "learning_rate": 0.00037, - "loss": 0.0736, - "macro_f1": 0.3144654333591461, + "loss": 0.0784, + "macro_f1": 0.5427350401878357, "num_tokens": 300593.0, "repeat_count": 1.0, - "routers_loss": 0.22729666531085968, + "routers_loss": 0.2349659502506256, "skip_count": 2.0, "step": 186, "text_loss": 0.3549048602581024 @@ -1784,13 +1784,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.2041015625, "learning_rate": 0.000374, - "loss": 0.0838, + "loss": 0.0827, "macro_f1": 0.3076923191547394, "num_tokens": 303456.0, "repeat_count": 2.0, - "routers_loss": 0.24516475200653076, + "routers_loss": 0.22502389550209045, "skip_count": 2.0, "step": 188, "text_loss": 0.8837642073631287 @@ -1803,13 +1803,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2470703125, + "grad_norm": 0.271484375, "learning_rate": 0.000378, - "loss": 0.1056, + "loss": 0.1085, "macro_f1": 0.3272727429866791, "num_tokens": 306241.0, "repeat_count": 1.0, - "routers_loss": 0.1307530701160431, + "routers_loss": 0.12291611731052399, "skip_count": 0.0, "step": 190, "text_loss": 0.73353511095047 @@ -1822,13 +1822,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.15625, "learning_rate": 0.000382, - "loss": 0.0961, + "loss": 0.0969, "macro_f1": 0.3272727429866791, "num_tokens": 310606.0, "repeat_count": 0.0, - "routers_loss": 0.06541688740253448, + "routers_loss": 0.055988848209381104, "skip_count": 1.0, "step": 192, "text_loss": 0.6261917352676392 @@ -1841,13 +1841,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.333984375, + "grad_norm": 0.34375, "learning_rate": 0.000386, - "loss": 0.1058, + "loss": 0.1055, "macro_f1": 0.3144654333591461, "num_tokens": 313564.0, "repeat_count": 0.0, - "routers_loss": 0.12492545694112778, + "routers_loss": 0.12363404780626297, "skip_count": 3.0, "step": 194, "text_loss": 0.2790874242782593 @@ -1860,13 +1860,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28515625, + "grad_norm": 0.27734375, "learning_rate": 0.00039000000000000005, - "loss": 0.0966, + "loss": 0.0964, "macro_f1": 0.3076923191547394, "num_tokens": 316958.0, "repeat_count": 2.0, - "routers_loss": 0.2838033139705658, + "routers_loss": 0.2718356251716614, "skip_count": 2.0, "step": 196, "text_loss": 0.14428086578845978 @@ -1881,11 +1881,11 @@ "f1_skip": 0.0, "grad_norm": 0.2021484375, "learning_rate": 0.00039400000000000004, - "loss": 0.0929, + "loss": 0.0917, "macro_f1": 0.32098764181137085, "num_tokens": 320103.0, "repeat_count": 0.0, - "routers_loss": 0.07692629098892212, + "routers_loss": 0.07188102602958679, "skip_count": 2.0, "step": 198, "text_loss": 0.27155816555023193 @@ -1898,13 +1898,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.201171875, "learning_rate": 0.000398, "loss": 0.0809, "macro_f1": 0.32098764181137085, "num_tokens": 323566.0, "repeat_count": 1.0, - "routers_loss": 0.18504399061203003, + "routers_loss": 0.18038256466388702, "skip_count": 1.0, "step": 200, "text_loss": 0.8453494310379028 @@ -1917,13 +1917,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.2490234375, "learning_rate": 0.000402, - "loss": 0.078, + "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 326385.0, "repeat_count": 0.0, - "routers_loss": 0.014647359028458595, + "routers_loss": 0.014639763161540031, "skip_count": 0.0, "step": 202, "text_loss": 0.5733131766319275 @@ -1936,13 +1936,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2041015625, + "grad_norm": 0.21875, "learning_rate": 0.00040600000000000006, - "loss": 0.1028, + "loss": 0.104, "macro_f1": 0.3333333432674408, "num_tokens": 329266.0, "repeat_count": 0.0, - "routers_loss": 0.017848484218120575, + "routers_loss": 0.015269627794623375, "skip_count": 0.0, "step": 204, "text_loss": 0.7355639934539795 @@ -1955,13 +1955,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.27734375, "learning_rate": 0.00041, - "loss": 0.0832, + "loss": 0.0833, "macro_f1": 0.3333333432674408, "num_tokens": 332984.0, "repeat_count": 0.0, - "routers_loss": 0.01900508813560009, + "routers_loss": 0.018046971410512924, "skip_count": 0.0, "step": 206, "text_loss": 0.587641179561615 @@ -1974,13 +1974,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.166015625, + "grad_norm": 0.185546875, "learning_rate": 0.000414, "loss": 0.0588, "macro_f1": 0.3272727429866791, "num_tokens": 335739.0, "repeat_count": 1.0, - "routers_loss": 0.13018715381622314, + "routers_loss": 0.12791286408901215, "skip_count": 0.0, "step": 208, "text_loss": 0.6538406610488892 @@ -1993,13 +1993,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.24609375, "learning_rate": 0.00041799999999999997, - "loss": 0.0697, + "loss": 0.0732, "macro_f1": 0.3272727429866791, "num_tokens": 338966.0, "repeat_count": 0.0, - "routers_loss": 0.055288366973400116, + "routers_loss": 0.050490595400333405, "skip_count": 1.0, "step": 210, "text_loss": 0.4188295602798462 @@ -2012,13 +2012,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.271484375, "learning_rate": 0.000422, - "loss": 0.0576, + "loss": 0.0588, "macro_f1": 0.3144654333591461, "num_tokens": 342063.0, "repeat_count": 0.0, - "routers_loss": 0.10952572524547577, + "routers_loss": 0.11652113497257233, "skip_count": 3.0, "step": 212, "text_loss": 0.21822240948677063 @@ -2031,13 +2031,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.197265625, + "grad_norm": 0.2060546875, "learning_rate": 0.000426, - "loss": 0.062, + "loss": 0.0621, "macro_f1": 0.3333333432674408, "num_tokens": 344887.0, "repeat_count": 0.0, - "routers_loss": 0.02415696159005165, + "routers_loss": 0.023898238316178322, "skip_count": 0.0, "step": 214, "text_loss": 0.24692800641059875 @@ -2050,13 +2050,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.353515625, + "grad_norm": 0.3671875, "learning_rate": 0.00043, - "loss": 0.1011, + "loss": 0.1005, "macro_f1": 0.3272727429866791, "num_tokens": 348700.0, "repeat_count": 1.0, - "routers_loss": 0.06956391036510468, + "routers_loss": 0.06414655596017838, "skip_count": 0.0, "step": 216, "text_loss": 0.4744548797607422 @@ -2069,13 +2069,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1904296875, + "grad_norm": 0.1962890625, "learning_rate": 0.00043400000000000003, - "loss": 0.076, + "loss": 0.0753, "macro_f1": 0.32098764181137085, "num_tokens": 351507.0, "repeat_count": 1.0, - "routers_loss": 0.1140352189540863, + "routers_loss": 0.11702914535999298, "skip_count": 1.0, "step": 218, "text_loss": 0.5614864826202393 @@ -2090,11 +2090,11 @@ "f1_skip": 0.0, "grad_norm": 0.189453125, "learning_rate": 0.000438, - "loss": 0.0788, + "loss": 0.0792, "macro_f1": 0.3333333432674408, "num_tokens": 354484.0, "repeat_count": 0.0, - "routers_loss": 0.011621571145951748, + "routers_loss": 0.014991643838584423, "skip_count": 0.0, "step": 220, "text_loss": 0.47209832072257996 @@ -2107,13 +2107,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.240234375, + "grad_norm": 0.251953125, "learning_rate": 0.000442, "loss": 0.106, "macro_f1": 0.3272727429866791, "num_tokens": 357954.0, "repeat_count": 0.0, - "routers_loss": 0.05813701078295708, + "routers_loss": 0.04747112840414047, "skip_count": 1.0, "step": 222, "text_loss": 0.2968728244304657 @@ -2126,13 +2126,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.357421875, + "grad_norm": 0.40234375, "learning_rate": 0.000446, - "loss": 0.0827, + "loss": 0.0853, "macro_f1": 0.32098764181137085, "num_tokens": 360547.0, "repeat_count": 0.0, - "routers_loss": 0.0646885335445404, + "routers_loss": 0.06754162162542343, "skip_count": 2.0, "step": 224, "text_loss": 0.2364148646593094 @@ -2145,13 +2145,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.244140625, + "grad_norm": 0.2412109375, "learning_rate": 0.00045000000000000004, - "loss": 0.1011, + "loss": 0.1016, "macro_f1": 0.3272727429866791, "num_tokens": 364529.0, "repeat_count": 0.0, - "routers_loss": 0.07224348932504654, + "routers_loss": 0.07830183953046799, "skip_count": 1.0, "step": 226, "text_loss": 0.4787476360797882 @@ -2164,13 +2164,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.1953125, "learning_rate": 0.00045400000000000003, - "loss": 0.0781, + "loss": 0.0792, "macro_f1": 0.3333333432674408, "num_tokens": 367683.0, "repeat_count": 0.0, - "routers_loss": 0.015971746295690536, + "routers_loss": 0.015735948458313942, "skip_count": 0.0, "step": 228, "text_loss": 0.37148505449295044 @@ -2183,13 +2183,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.25, "learning_rate": 0.000458, - "loss": 0.099, + "loss": 0.0995, "macro_f1": 0.3333333432674408, "num_tokens": 371402.0, "repeat_count": 0.0, - "routers_loss": 0.017818331718444824, + "routers_loss": 0.013354359194636345, "skip_count": 0.0, "step": 230, "text_loss": 0.7464763522148132 @@ -2202,13 +2202,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1494140625, "learning_rate": 0.000462, - "loss": 0.0757, + "loss": 0.0731, "macro_f1": 0.3333333432674408, "num_tokens": 374587.0, "repeat_count": 0.0, - "routers_loss": 0.01582280732691288, + "routers_loss": 0.013763721100986004, "skip_count": 0.0, "step": 232, "text_loss": 0.8754443526268005 @@ -2221,13 +2221,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.42578125, + "grad_norm": 0.3984375, "learning_rate": 0.00046600000000000005, - "loss": 0.0876, + "loss": 0.0861, "macro_f1": 0.3333333432674408, "num_tokens": 377513.0, "repeat_count": 0.0, - "routers_loss": 0.011417915113270283, + "routers_loss": 0.010075435042381287, "skip_count": 0.0, "step": 234, "text_loss": 0.31534913182258606 @@ -2240,13 +2240,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.17578125, "learning_rate": 0.00047, - "loss": 0.0801, + "loss": 0.0791, "macro_f1": 0.3272727429866791, "num_tokens": 380736.0, "repeat_count": 0.0, - "routers_loss": 0.05787832289934158, + "routers_loss": 0.059825167059898376, "skip_count": 1.0, "step": 236, "text_loss": 0.5936337113380432 @@ -2259,13 +2259,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.236328125, + "grad_norm": 0.267578125, "learning_rate": 0.000474, - "loss": 0.0508, + "loss": 0.0514, "macro_f1": 0.32098764181137085, "num_tokens": 383236.0, "repeat_count": 0.0, - "routers_loss": 0.09476690739393234, + "routers_loss": 0.09134846180677414, "skip_count": 2.0, "step": 238, "text_loss": 0.5976157784461975 @@ -2278,13 +2278,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.208984375, "learning_rate": 0.00047799999999999996, - "loss": 0.0833, + "loss": 0.0858, "macro_f1": 0.32098764181137085, "num_tokens": 385778.0, "repeat_count": 1.0, - "routers_loss": 0.1099705696105957, + "routers_loss": 0.11989791691303253, "skip_count": 1.0, "step": 240, "text_loss": 0.3554210960865021 @@ -2297,13 +2297,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.171875, "learning_rate": 0.000482, - "loss": 0.0745, + "loss": 0.0734, "macro_f1": 0.3333333432674408, "num_tokens": 388777.0, "repeat_count": 0.0, - "routers_loss": 0.01269970741122961, + "routers_loss": 0.013591105118393898, "skip_count": 0.0, "step": 242, "text_loss": 0.4829460382461548 @@ -2316,13 +2316,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11962890625, + "grad_norm": 0.12060546875, "learning_rate": 0.000486, - "loss": 0.061, + "loss": 0.0625, "macro_f1": 0.32098764181137085, "num_tokens": 391797.0, "repeat_count": 0.0, - "routers_loss": 0.08505752682685852, + "routers_loss": 0.0920003354549408, "skip_count": 2.0, "step": 244, "text_loss": 0.3085818886756897 @@ -2335,13 +2335,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1552734375, "learning_rate": 0.00049, - "loss": 0.0504, + "loss": 0.0501, "macro_f1": 0.3333333432674408, "num_tokens": 396485.0, "repeat_count": 0.0, - "routers_loss": 0.012750142253935337, + "routers_loss": 0.0129330949857831, "skip_count": 0.0, "step": 246, "text_loss": 0.42803969979286194 @@ -2354,13 +2354,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.291015625, + "grad_norm": 0.296875, "learning_rate": 0.000494, - "loss": 0.0962, + "loss": 0.0945, "macro_f1": 0.3144654333591461, "num_tokens": 399923.0, "repeat_count": 0.0, - "routers_loss": 0.11287309974431992, + "routers_loss": 0.10677755624055862, "skip_count": 3.0, "step": 248, "text_loss": 0.2908555567264557 @@ -2373,32 +2373,32 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.203125, "learning_rate": 0.000498, - "loss": 0.0821, + "loss": 0.0812, "macro_f1": 0.3144654333591461, "num_tokens": 403647.0, "repeat_count": 0.0, - "routers_loss": 0.1486474722623825, + "routers_loss": 0.1504337340593338, "skip_count": 3.0, "step": 250, "text_loss": 0.333095908164978 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 1.183152333431171, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, + "f1_skip": 0.0, "grad_norm": 0.22265625, "learning_rate": 0.0005020000000000001, - "loss": 0.0832, - "macro_f1": 0.5492662787437439, + "loss": 0.0828, + "macro_f1": 0.32098764181137085, "num_tokens": 409147.0, "repeat_count": 0.0, - "routers_loss": 0.06636594980955124, + "routers_loss": 0.06503184884786606, "skip_count": 2.0, "step": 252, "text_loss": 0.16117942333221436 @@ -2411,13 +2411,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.267578125, + "grad_norm": 0.287109375, "learning_rate": 0.000506, - "loss": 0.1, + "loss": 0.0995, "macro_f1": 0.3333333432674408, "num_tokens": 412072.0, "repeat_count": 0.0, - "routers_loss": 0.015062150545418262, + "routers_loss": 0.016280122101306915, "skip_count": 0.0, "step": 254, "text_loss": 0.4217492640018463 @@ -2430,13 +2430,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2138671875, + "grad_norm": 0.21484375, "learning_rate": 0.00051, - "loss": 0.0808, + "loss": 0.0803, "macro_f1": 0.3144654333591461, "num_tokens": 415052.0, "repeat_count": 2.0, - "routers_loss": 0.2051105946302414, + "routers_loss": 0.2117508500814438, "skip_count": 1.0, "step": 256, "text_loss": 0.5795308947563171 @@ -2449,13 +2449,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2412109375, + "grad_norm": 0.2421875, "learning_rate": 0.000514, - "loss": 0.068, + "loss": 0.0668, "macro_f1": 0.3272727429866791, "num_tokens": 418099.0, "repeat_count": 1.0, - "routers_loss": 0.1467045396566391, + "routers_loss": 0.15002092719078064, "skip_count": 0.0, "step": 258, "text_loss": 0.4840938448905945 @@ -2468,13 +2468,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1533203125, "learning_rate": 0.000518, - "loss": 0.0543, + "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 422526.0, "repeat_count": 0.0, - "routers_loss": 0.013022038154304028, + "routers_loss": 0.012834074907004833, "skip_count": 0.0, "step": 260, "text_loss": 0.36141225695610046 @@ -2487,13 +2487,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.216796875, + "grad_norm": 0.2294921875, "learning_rate": 0.000522, - "loss": 0.0848, + "loss": 0.085, "macro_f1": 0.3076923191547394, "num_tokens": 425765.0, "repeat_count": 2.0, - "routers_loss": 0.2575930058956146, + "routers_loss": 0.23808011412620544, "skip_count": 2.0, "step": 262, "text_loss": 0.27572691440582275 @@ -2506,13 +2506,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.17578125, "learning_rate": 0.000526, - "loss": 0.07, + "loss": 0.0708, "macro_f1": 0.3272727429866791, "num_tokens": 429048.0, "repeat_count": 0.0, - "routers_loss": 0.0558602549135685, + "routers_loss": 0.055687375366687775, "skip_count": 1.0, "step": 264, "text_loss": 0.37020301818847656 @@ -2525,13 +2525,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.2080078125, "learning_rate": 0.0005300000000000001, - "loss": 0.082, + "loss": 0.0839, "macro_f1": 0.3272727429866791, "num_tokens": 431784.0, "repeat_count": 0.0, - "routers_loss": 0.09126655012369156, + "routers_loss": 0.0872957780957222, "skip_count": 1.0, "step": 266, "text_loss": 0.5937283039093018 @@ -2544,13 +2544,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.263671875, "learning_rate": 0.0005340000000000001, - "loss": 0.0764, + "loss": 0.0733, "macro_f1": 0.32098764181137085, "num_tokens": 434297.0, "repeat_count": 2.0, - "routers_loss": 0.24805288016796112, + "routers_loss": 0.23507654666900635, "skip_count": 0.0, "step": 268, "text_loss": 0.3367372453212738 @@ -2563,13 +2563,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.22265625, + "grad_norm": 0.2431640625, "learning_rate": 0.0005380000000000001, - "loss": 0.0686, + "loss": 0.0708, "macro_f1": 0.32098764181137085, "num_tokens": 437586.0, "repeat_count": 0.0, - "routers_loss": 0.13135533034801483, + "routers_loss": 0.12860390543937683, "skip_count": 2.0, "step": 270, "text_loss": 0.7149854302406311 @@ -2582,13 +2582,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.2451171875, "learning_rate": 0.0005420000000000001, - "loss": 0.1083, + "loss": 0.1072, "macro_f1": 0.3272727429866791, "num_tokens": 440649.0, "repeat_count": 0.0, - "routers_loss": 0.04991440102458, + "routers_loss": 0.044308312237262726, "skip_count": 1.0, "step": 272, "text_loss": 0.26778292655944824 @@ -2601,13 +2601,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.455078125, + "grad_norm": 0.44921875, "learning_rate": 0.000546, - "loss": 0.0991, + "loss": 0.0938, "macro_f1": 0.3144654333591461, "num_tokens": 443907.0, "repeat_count": 0.0, - "routers_loss": 0.12236632406711578, + "routers_loss": 0.11514109373092651, "skip_count": 3.0, "step": 274, "text_loss": 0.23578761518001556 @@ -2620,13 +2620,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.25, + "grad_norm": 0.2578125, "learning_rate": 0.00055, - "loss": 0.0936, + "loss": 0.0932, "macro_f1": 0.5492662787437439, "num_tokens": 447147.0, "repeat_count": 0.0, - "routers_loss": 0.053506772965192795, + "routers_loss": 0.055705297738313675, "skip_count": 2.0, "step": 276, "text_loss": 0.2513524889945984 @@ -2639,13 +2639,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.265625, + "grad_norm": 0.29296875, "learning_rate": 0.000554, - "loss": 0.066, + "loss": 0.0667, "macro_f1": 0.32098764181137085, "num_tokens": 450032.0, "repeat_count": 0.0, - "routers_loss": 0.13446088135242462, + "routers_loss": 0.13778971135616302, "skip_count": 2.0, "step": 278, "text_loss": 0.4857243597507477 @@ -2658,32 +2658,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.185546875, "learning_rate": 0.000558, - "loss": 0.0682, + "loss": 0.0672, "macro_f1": 0.3272727429866791, "num_tokens": 453195.0, "repeat_count": 1.0, - "routers_loss": 0.07270720601081848, + "routers_loss": 0.0700262188911438, "skip_count": 0.0, "step": 280, "text_loss": 0.7589789628982544 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 1.3240387437628411, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.943396270275116, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.28125, + "f1_skip": 0.0, + "grad_norm": 0.25, "learning_rate": 0.0005620000000000001, - "loss": 0.0648, - "macro_f1": 0.5427350401878357, + "loss": 0.0603, + "macro_f1": 0.3144654333591461, "num_tokens": 455942.0, "repeat_count": 1.0, - "routers_loss": 0.13866399228572845, + "routers_loss": 0.11706235259771347, "skip_count": 2.0, "step": 282, "text_loss": 0.4783432185649872 @@ -2696,13 +2696,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.236328125, + "grad_norm": 0.265625, "learning_rate": 0.000566, - "loss": 0.0782, + "loss": 0.0793, "macro_f1": 0.3272727429866791, "num_tokens": 458932.0, "repeat_count": 0.0, - "routers_loss": 0.0645354762673378, + "routers_loss": 0.07073967158794403, "skip_count": 1.0, "step": 284, "text_loss": 0.7117193937301636 @@ -2715,13 +2715,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1650390625, "learning_rate": 0.00057, - "loss": 0.0892, + "loss": 0.0915, "macro_f1": 0.3272727429866791, "num_tokens": 462650.0, "repeat_count": 0.0, - "routers_loss": 0.05967628210783005, + "routers_loss": 0.05301115661859512, "skip_count": 1.0, "step": 286, "text_loss": 0.4175460636615753 @@ -2734,13 +2734,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23828125, + "grad_norm": 0.2158203125, "learning_rate": 0.000574, - "loss": 0.0676, + "loss": 0.0675, "macro_f1": 0.3272727429866791, "num_tokens": 466290.0, "repeat_count": 0.0, - "routers_loss": 0.06438407301902771, + "routers_loss": 0.06356479972600937, "skip_count": 1.0, "step": 288, "text_loss": 0.5832946300506592 @@ -2753,13 +2753,13 @@ "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.28515625, "learning_rate": 0.000578, - "loss": 0.0781, + "loss": 0.0805, "macro_f1": 0.3006536066532135, "num_tokens": 469296.0, "repeat_count": 1.0, - "routers_loss": 0.21225209534168243, + "routers_loss": 0.21032999455928802, "skip_count": 3.0, "step": 290, "text_loss": 0.36023473739624023 @@ -2772,13 +2772,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.244140625, + "grad_norm": 0.27734375, "learning_rate": 0.0005819999999999999, - "loss": 0.0664, + "loss": 0.0685, "macro_f1": 0.32098764181137085, "num_tokens": 472272.0, "repeat_count": 1.0, - "routers_loss": 0.08085516840219498, + "routers_loss": 0.08062280714511871, "skip_count": 1.0, "step": 292, "text_loss": 0.37197956442832947 @@ -2791,13 +2791,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.28125, "learning_rate": 0.0005859999999999999, - "loss": 0.0874, + "loss": 0.0878, "macro_f1": 0.32098764181137085, "num_tokens": 475864.0, "repeat_count": 0.0, - "routers_loss": 0.05378658324480057, + "routers_loss": 0.05023600533604622, "skip_count": 2.0, "step": 294, "text_loss": 0.4765273630619049 @@ -2810,13 +2810,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.2177734375, "learning_rate": 0.00059, - "loss": 0.0715, + "loss": 0.0728, "macro_f1": 0.3333333432674408, "num_tokens": 478916.0, "repeat_count": 0.0, - "routers_loss": 0.01145261898636818, + "routers_loss": 0.011689410544931889, "skip_count": 0.0, "step": 296, "text_loss": 0.5878773927688599 @@ -2831,11 +2831,11 @@ "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.000594, - "loss": 0.0737, + "loss": 0.0727, "macro_f1": 0.3333333432674408, "num_tokens": 482369.0, "repeat_count": 0.0, - "routers_loss": 0.009397956542670727, + "routers_loss": 0.010772093199193478, "skip_count": 0.0, "step": 298, "text_loss": 0.4424116313457489 @@ -2848,13 +2848,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.181640625, "learning_rate": 0.000598, - "loss": 0.0802, + "loss": 0.0787, "macro_f1": 0.3076923191547394, "num_tokens": 486049.0, "repeat_count": 2.0, - "routers_loss": 0.2389357089996338, + "routers_loss": 0.23482851684093475, "skip_count": 2.0, "step": 300, "text_loss": 0.21217775344848633 @@ -2862,18 +2862,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 1.417963017317288, - "f1_execute": 0.9019607901573181, + "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.2080078125, "learning_rate": 0.000602, - "loss": 0.0745, - "macro_f1": 0.3006536066532135, + "loss": 0.073, + "macro_f1": 0.3076923191547394, "num_tokens": 488683.0, "repeat_count": 1.0, - "routers_loss": 0.18252353370189667, + "routers_loss": 0.18843084573745728, "skip_count": 3.0, "step": 302, "text_loss": 0.2109498232603073 @@ -2886,13 +2886,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.27734375, + "grad_norm": 0.279296875, "learning_rate": 0.000606, - "loss": 0.0935, + "loss": 0.0945, "macro_f1": 0.3144654333591461, "num_tokens": 492010.0, "repeat_count": 0.0, - "routers_loss": 0.18185268342494965, + "routers_loss": 0.17861786484718323, "skip_count": 3.0, "step": 304, "text_loss": 0.8446305394172668 @@ -2905,13 +2905,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.1943359375, "learning_rate": 0.00061, - "loss": 0.0853, + "loss": 0.0827, "macro_f1": 0.3333333432674408, "num_tokens": 494764.0, "repeat_count": 0.0, - "routers_loss": 0.013210167177021503, + "routers_loss": 0.014124520123004913, "skip_count": 0.0, "step": 306, "text_loss": 0.742735743522644 @@ -2924,13 +2924,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.26953125, "learning_rate": 0.000614, - "loss": 0.1089, + "loss": 0.1071, "macro_f1": 0.3333333432674408, "num_tokens": 497820.0, "repeat_count": 0.0, - "routers_loss": 0.016936838626861572, + "routers_loss": 0.017968112602829933, "skip_count": 0.0, "step": 308, "text_loss": 0.28305482864379883 @@ -2943,13 +2943,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.1689453125, "learning_rate": 0.0006180000000000001, - "loss": 0.077, + "loss": 0.0775, "macro_f1": 0.32098764181137085, "num_tokens": 500694.0, "repeat_count": 0.0, - "routers_loss": 0.08630389720201492, + "routers_loss": 0.08593655377626419, "skip_count": 2.0, "step": 310, "text_loss": 0.3496848940849304 @@ -2962,13 +2962,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.19140625, "learning_rate": 0.000622, - "loss": 0.0602, + "loss": 0.061, "macro_f1": 0.3333333432674408, "num_tokens": 503871.0, "repeat_count": 0.0, - "routers_loss": 0.013665963895618916, + "routers_loss": 0.016449492424726486, "skip_count": 0.0, "step": 312, "text_loss": 0.6691372990608215 @@ -2981,13 +2981,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.205078125, "learning_rate": 0.000626, - "loss": 0.0794, + "loss": 0.0815, "macro_f1": 0.3333333432674408, "num_tokens": 506730.0, "repeat_count": 0.0, - "routers_loss": 0.01584783010184765, + "routers_loss": 0.014532964676618576, "skip_count": 0.0, "step": 314, "text_loss": 0.6118118166923523 @@ -3000,13 +3000,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.224609375, + "grad_norm": 0.2216796875, "learning_rate": 0.00063, - "loss": 0.0762, + "loss": 0.0742, "macro_f1": 0.3333333432674408, "num_tokens": 510323.0, "repeat_count": 0.0, - "routers_loss": 0.01368923019617796, + "routers_loss": 0.013093139044940472, "skip_count": 0.0, "step": 316, "text_loss": 0.38126271963119507 @@ -3019,13 +3019,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.388671875, + "grad_norm": 0.400390625, "learning_rate": 0.000634, - "loss": 0.0908, + "loss": 0.0915, "macro_f1": 0.3333333432674408, "num_tokens": 514075.0, "repeat_count": 0.0, - "routers_loss": 0.009135022759437561, + "routers_loss": 0.008627045899629593, "skip_count": 0.0, "step": 318, "text_loss": 0.5983037948608398 @@ -3038,13 +3038,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.15234375, "learning_rate": 0.000638, - "loss": 0.0949, + "loss": 0.1008, "macro_f1": 0.3272727429866791, "num_tokens": 517418.0, "repeat_count": 0.0, - "routers_loss": 0.046641621738672256, + "routers_loss": 0.04561378434300423, "skip_count": 1.0, "step": 320, "text_loss": 0.767257034778595 @@ -3052,18 +3052,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 1.5118872908717347, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23046875, + "grad_norm": 0.259765625, "learning_rate": 0.000642, - "loss": 0.0925, - "macro_f1": 0.3333333432674408, + "loss": 0.0926, + "macro_f1": 0.3272727429866791, "num_tokens": 520443.0, "repeat_count": 0.0, - "routers_loss": 0.020637936890125275, + "routers_loss": 0.024372953921556473, "skip_count": 0.0, "step": 322, "text_loss": 0.6572105884552002 @@ -3076,13 +3076,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26953125, + "grad_norm": 0.30078125, "learning_rate": 0.000646, "loss": 0.0822, "macro_f1": 0.3272727429866791, "num_tokens": 523317.0, "repeat_count": 1.0, - "routers_loss": 0.08289298415184021, + "routers_loss": 0.08099937438964844, "skip_count": 0.0, "step": 324, "text_loss": 0.205499529838562 @@ -3090,18 +3090,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 1.530672145582624, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23828125, + "grad_norm": 0.2294921875, "learning_rate": 0.0006500000000000001, - "loss": 0.0823, - "macro_f1": 0.3272727429866791, + "loss": 0.0809, + "macro_f1": 0.32098767161369324, "num_tokens": 526355.0, "repeat_count": 0.0, - "routers_loss": 0.06960040330886841, + "routers_loss": 0.0657225176692009, "skip_count": 1.0, "step": 326, "text_loss": 0.2587239742279053 @@ -3114,13 +3114,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1162109375, + "grad_norm": 0.111328125, "learning_rate": 0.0006540000000000001, - "loss": 0.0799, + "loss": 0.0779, "macro_f1": 0.3333333432674408, "num_tokens": 529689.0, "repeat_count": 0.0, - "routers_loss": 0.02087482251226902, + "routers_loss": 0.01849208027124405, "skip_count": 0.0, "step": 328, "text_loss": 0.2172023057937622 @@ -3133,13 +3133,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.1845703125, "learning_rate": 0.0006580000000000001, - "loss": 0.0757, + "loss": 0.0758, "macro_f1": 0.3333333432674408, "num_tokens": 532603.0, "repeat_count": 0.0, - "routers_loss": 0.016592051833868027, + "routers_loss": 0.016184113919734955, "skip_count": 0.0, "step": 330, "text_loss": 0.5980568528175354 @@ -3152,32 +3152,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.22265625, + "grad_norm": 0.220703125, "learning_rate": 0.000662, - "loss": 0.0438, + "loss": 0.0439, "macro_f1": 0.3333333432674408, "num_tokens": 536056.0, "repeat_count": 0.0, - "routers_loss": 0.012950568459928036, + "routers_loss": 0.01303898449987173, "skip_count": 0.0, "step": 332, "text_loss": 0.5421966314315796 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, "epoch": 1.5682418550044028, - "f1_execute": 0.8799999952316284, + "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.310546875, + "f1_skip": 0.5, + "grad_norm": 0.296875, "learning_rate": 0.000666, - "loss": 0.0964, - "macro_f1": 0.29333335161209106, + "loss": 0.0963, + "macro_f1": 0.465986430644989, "num_tokens": 539231.0, "repeat_count": 3.0, - "routers_loss": 0.3373340964317322, + "routers_loss": 0.3075675964355469, "skip_count": 3.0, "step": 334, "text_loss": 0.19719554483890533 @@ -3190,13 +3190,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.171875, + "grad_norm": 0.173828125, "learning_rate": 0.00067, "loss": 0.0706, "macro_f1": 0.3333333432674408, "num_tokens": 542038.0, "repeat_count": 0.0, - "routers_loss": 0.008110735565423965, + "routers_loss": 0.009116224013268948, "skip_count": 0.0, "step": 336, "text_loss": 0.3407036066055298 @@ -3209,13 +3209,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.248046875, + "grad_norm": 0.2421875, "learning_rate": 0.000674, - "loss": 0.0771, + "loss": 0.0768, "macro_f1": 0.3333333432674408, "num_tokens": 545019.0, "repeat_count": 0.0, - "routers_loss": 0.01841609925031662, + "routers_loss": 0.021463042125105858, "skip_count": 0.0, "step": 338, "text_loss": 0.24486012756824493 @@ -3228,13 +3228,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1708984375, "learning_rate": 0.0006780000000000001, - "loss": 0.0894, + "loss": 0.0889, "macro_f1": 0.3333333432674408, "num_tokens": 548036.0, "repeat_count": 0.0, - "routers_loss": 0.01612614095211029, + "routers_loss": 0.01857556402683258, "skip_count": 0.0, "step": 340, "text_loss": 0.28140124678611755 @@ -3247,13 +3247,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.125, + "grad_norm": 0.130859375, "learning_rate": 0.0006820000000000001, - "loss": 0.0611, + "loss": 0.0617, "macro_f1": 0.3006536364555359, "num_tokens": 551419.0, "repeat_count": 2.0, - "routers_loss": 0.26202192902565, + "routers_loss": 0.27090007066726685, "skip_count": 3.0, "step": 342, "text_loss": 0.20690307021141052 @@ -3266,13 +3266,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.3046875, "learning_rate": 0.0006860000000000001, - "loss": 0.1013, + "loss": 0.1047, "macro_f1": 0.32098764181137085, "num_tokens": 554037.0, "repeat_count": 0.0, - "routers_loss": 0.09235779196023941, + "routers_loss": 0.09231195598840714, "skip_count": 2.0, "step": 344, "text_loss": 0.4479128420352936 @@ -3285,13 +3285,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.255859375, "learning_rate": 0.00069, - "loss": 0.0856, + "loss": 0.0883, "macro_f1": 0.3333333432674408, "num_tokens": 556672.0, "repeat_count": 0.0, - "routers_loss": 0.010735333897173405, + "routers_loss": 0.00935924518853426, "skip_count": 0.0, "step": 346, "text_loss": 0.6377320289611816 @@ -3304,13 +3304,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.2138671875, "learning_rate": 0.000694, - "loss": 0.0778, + "loss": 0.0781, "macro_f1": 0.32098764181137085, "num_tokens": 559756.0, "repeat_count": 0.0, - "routers_loss": 0.14742356538772583, + "routers_loss": 0.17641772329807281, "skip_count": 2.0, "step": 348, "text_loss": 0.6097636222839355 @@ -3323,13 +3323,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.30859375, + "grad_norm": 0.30078125, "learning_rate": 0.0006979999999999999, - "loss": 0.0614, + "loss": 0.0616, "macro_f1": 0.5492662787437439, "num_tokens": 563415.0, "repeat_count": 0.0, - "routers_loss": 0.06606879830360413, + "routers_loss": 0.06240406632423401, "skip_count": 2.0, "step": 350, "text_loss": 0.5291631817817688 @@ -3342,13 +3342,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.322265625, + "grad_norm": 0.296875, "learning_rate": 0.0007019999999999999, - "loss": 0.1033, + "loss": 0.1026, "macro_f1": 0.3333333432674408, "num_tokens": 566357.0, "repeat_count": 0.0, - "routers_loss": 0.012873432599008083, + "routers_loss": 0.012269247323274612, "skip_count": 0.0, "step": 352, "text_loss": 0.5170195698738098 @@ -3361,13 +3361,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.1435546875, "learning_rate": 0.0007059999999999999, - "loss": 0.0819, + "loss": 0.0815, "macro_f1": 0.32098764181137085, "num_tokens": 569449.0, "repeat_count": 0.0, - "routers_loss": 0.07853665202856064, + "routers_loss": 0.07515309751033783, "skip_count": 2.0, "step": 354, "text_loss": 0.34507250785827637 @@ -3380,13 +3380,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.251953125, + "grad_norm": 0.263671875, "learning_rate": 0.00071, - "loss": 0.0804, + "loss": 0.0791, "macro_f1": 0.3144654333591461, "num_tokens": 572761.0, "repeat_count": 1.0, - "routers_loss": 0.2216549813747406, + "routers_loss": 0.20768006145954132, "skip_count": 2.0, "step": 356, "text_loss": 0.3158532381057739 @@ -3399,13 +3399,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.185546875, + "grad_norm": 0.1884765625, "learning_rate": 0.000714, - "loss": 0.0675, + "loss": 0.0682, "macro_f1": 0.3333333432674408, "num_tokens": 575909.0, "repeat_count": 0.0, - "routers_loss": 0.02423691377043724, + "routers_loss": 0.025329967960715294, "skip_count": 0.0, "step": 358, "text_loss": 0.21455390751361847 @@ -3413,18 +3413,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 1.6903434106251836, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.21484375, "learning_rate": 0.000718, - "loss": 0.0781, - "macro_f1": 0.3272727429866791, + "loss": 0.0775, + "macro_f1": 0.32098767161369324, "num_tokens": 579186.0, "repeat_count": 1.0, - "routers_loss": 0.07496294379234314, + "routers_loss": 0.07676175981760025, "skip_count": 0.0, "step": 360, "text_loss": 0.61895352602005 @@ -3437,13 +3437,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2138671875, + "grad_norm": 0.197265625, "learning_rate": 0.000722, - "loss": 0.0778, + "loss": 0.0781, "macro_f1": 0.32098767161369324, "num_tokens": 582437.0, "repeat_count": 0.0, - "routers_loss": 0.08181872963905334, + "routers_loss": 0.08070661872625351, "skip_count": 1.0, "step": 362, "text_loss": 0.20557661354541779 @@ -3456,13 +3456,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.2216796875, "learning_rate": 0.000726, - "loss": 0.1112, + "loss": 0.11, "macro_f1": 0.3333333432674408, "num_tokens": 586096.0, "repeat_count": 0.0, - "routers_loss": 0.016959719359874725, + "routers_loss": 0.015891313552856445, "skip_count": 0.0, "step": 364, "text_loss": 0.597991943359375 @@ -3475,13 +3475,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.15625, "learning_rate": 0.00073, - "loss": 0.0577, + "loss": 0.0573, "macro_f1": 0.3076923191547394, "num_tokens": 589520.0, "repeat_count": 1.0, - "routers_loss": 0.13295969367027283, + "routers_loss": 0.12844261527061462, "skip_count": 3.0, "step": 366, "text_loss": 0.2944789230823517 @@ -3494,13 +3494,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1455078125, + "grad_norm": 0.150390625, "learning_rate": 0.000734, - "loss": 0.0986, + "loss": 0.1005, "macro_f1": 0.3333333432674408, "num_tokens": 592691.0, "repeat_count": 0.0, - "routers_loss": 0.02476893551647663, + "routers_loss": 0.02382199838757515, "skip_count": 0.0, "step": 368, "text_loss": 0.23989969491958618 @@ -3513,13 +3513,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1796875, "learning_rate": 0.000738, - "loss": 0.0682, + "loss": 0.0661, "macro_f1": 0.3333333432674408, "num_tokens": 596004.0, "repeat_count": 0.0, - "routers_loss": 0.019863395020365715, + "routers_loss": 0.018812084570527077, "skip_count": 0.0, "step": 370, "text_loss": 0.22111408412456512 @@ -3532,13 +3532,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.2412109375, "learning_rate": 0.000742, - "loss": 0.0663, + "loss": 0.0666, "macro_f1": 0.3272727429866791, "num_tokens": 599087.0, "repeat_count": 0.0, - "routers_loss": 0.07230417430400848, + "routers_loss": 0.08290331065654755, "skip_count": 1.0, "step": 372, "text_loss": 0.2567356526851654 @@ -3551,13 +3551,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.2412109375, "learning_rate": 0.000746, - "loss": 0.0986, + "loss": 0.0941, "macro_f1": 0.32098764181137085, "num_tokens": 602330.0, "repeat_count": 1.0, - "routers_loss": 0.11727793514728546, + "routers_loss": 0.11482042074203491, "skip_count": 1.0, "step": 374, "text_loss": 0.7217292785644531 @@ -3570,13 +3570,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.224609375, + "grad_norm": 0.2265625, "learning_rate": 0.00075, - "loss": 0.0724, + "loss": 0.0728, "macro_f1": 0.3272727429866791, "num_tokens": 605503.0, "repeat_count": 1.0, - "routers_loss": 0.13495951890945435, + "routers_loss": 0.11849870532751083, "skip_count": 0.0, "step": 376, "text_loss": 0.5122153759002686 @@ -3589,13 +3589,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23046875, + "grad_norm": 0.2333984375, "learning_rate": 0.000754, - "loss": 0.0823, + "loss": 0.0835, "macro_f1": 0.32098767161369324, "num_tokens": 608505.0, "repeat_count": 0.0, - "routers_loss": 0.07612533867359161, + "routers_loss": 0.07090992480516434, "skip_count": 1.0, "step": 378, "text_loss": 0.2204965502023697 @@ -3608,13 +3608,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.1826171875, "learning_rate": 0.000758, - "loss": 0.0803, + "loss": 0.0794, "macro_f1": 0.3272727429866791, "num_tokens": 611193.0, "repeat_count": 0.0, - "routers_loss": 0.0484120175242424, + "routers_loss": 0.03812089189887047, "skip_count": 1.0, "step": 380, "text_loss": 0.44909021258354187 @@ -3627,13 +3627,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1689453125, "learning_rate": 0.000762, - "loss": 0.0866, + "loss": 0.0882, "macro_f1": 0.3272727429866791, "num_tokens": 614231.0, "repeat_count": 1.0, - "routers_loss": 0.10939671844244003, + "routers_loss": 0.10270529240369797, "skip_count": 0.0, "step": 382, "text_loss": 0.13624964654445648 @@ -3646,13 +3646,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.326171875, + "grad_norm": 0.330078125, "learning_rate": 0.0007660000000000001, - "loss": 0.1083, + "loss": 0.1107, "macro_f1": 0.32098764181137085, "num_tokens": 617090.0, "repeat_count": 1.0, - "routers_loss": 0.11382336914539337, + "routers_loss": 0.11624004691839218, "skip_count": 1.0, "step": 384, "text_loss": 0.7314052581787109 @@ -3667,11 +3667,11 @@ "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.0007700000000000001, - "loss": 0.0616, + "loss": 0.0628, "macro_f1": 0.32098764181137085, "num_tokens": 620596.0, "repeat_count": 0.0, - "routers_loss": 0.07494530081748962, + "routers_loss": 0.07114322483539581, "skip_count": 2.0, "step": 386, "text_loss": 0.503322958946228 @@ -3684,13 +3684,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.298828125, + "grad_norm": 0.306640625, "learning_rate": 0.0007740000000000001, - "loss": 0.0816, + "loss": 0.0829, "macro_f1": 0.32098764181137085, "num_tokens": 624108.0, "repeat_count": 0.0, - "routers_loss": 0.05718417093157768, + "routers_loss": 0.06061873584985733, "skip_count": 2.0, "step": 388, "text_loss": 0.11481904983520508 @@ -3703,13 +3703,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1982421875, + "grad_norm": 0.2099609375, "learning_rate": 0.000778, - "loss": 0.0783, + "loss": 0.0791, "macro_f1": 0.3006536364555359, "num_tokens": 626895.0, "repeat_count": 1.0, - "routers_loss": 0.2848989963531494, + "routers_loss": 0.2921771705150604, "skip_count": 4.0, "step": 390, "text_loss": 0.3069624602794647 @@ -3722,13 +3722,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.30078125, + "grad_norm": 0.30859375, "learning_rate": 0.000782, - "loss": 0.0608, + "loss": 0.0605, "macro_f1": 0.3076923191547394, "num_tokens": 630204.0, "repeat_count": 0.0, - "routers_loss": 0.2050076276063919, + "routers_loss": 0.202707901597023, "skip_count": 4.0, "step": 392, "text_loss": 0.6022785305976868 @@ -3741,13 +3741,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28125, + "grad_norm": 0.29296875, "learning_rate": 0.000786, - "loss": 0.0863, + "loss": 0.0877, "macro_f1": 0.3333333432674408, "num_tokens": 634373.0, "repeat_count": 0.0, - "routers_loss": 0.020946886390447617, + "routers_loss": 0.0221510399132967, "skip_count": 0.0, "step": 394, "text_loss": 0.26787394285202026 @@ -3760,13 +3760,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.376953125, + "grad_norm": 0.37890625, "learning_rate": 0.00079, - "loss": 0.0798, + "loss": 0.0805, "macro_f1": 0.32098764181137085, "num_tokens": 637442.0, "repeat_count": 2.0, - "routers_loss": 0.1270289123058319, + "routers_loss": 0.12636390328407288, "skip_count": 0.0, "step": 396, "text_loss": 0.2799781560897827 @@ -3779,13 +3779,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.2080078125, "learning_rate": 0.0007940000000000001, - "loss": 0.0701, + "loss": 0.0724, "macro_f1": 0.32098764181137085, "num_tokens": 641231.0, "repeat_count": 0.0, - "routers_loss": 0.08012636005878448, + "routers_loss": 0.07933453470468521, "skip_count": 2.0, "step": 398, "text_loss": 0.2507784366607666 @@ -3798,13 +3798,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.2138671875, "learning_rate": 0.0007980000000000001, - "loss": 0.0901, + "loss": 0.0909, "macro_f1": 0.3272727429866791, "num_tokens": 644560.0, "repeat_count": 1.0, - "routers_loss": 0.09315784275531769, + "routers_loss": 0.10324911028146744, "skip_count": 0.0, "step": 400, "text_loss": 0.7756280303001404 @@ -3817,13 +3817,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2294921875, + "grad_norm": 0.2275390625, "learning_rate": 0.0008020000000000001, - "loss": 0.078, + "loss": 0.0783, "macro_f1": 0.3144654333591461, "num_tokens": 647393.0, "repeat_count": 1.0, - "routers_loss": 0.18492189049720764, + "routers_loss": 0.18546262383460999, "skip_count": 2.0, "step": 402, "text_loss": 0.5013328194618225 @@ -3836,13 +3836,13 @@ "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.283203125, "learning_rate": 0.0008060000000000001, - "loss": 0.0801, + "loss": 0.0787, "macro_f1": 0.2857142984867096, "num_tokens": 650355.0, "repeat_count": 3.0, - "routers_loss": 0.32641324400901794, + "routers_loss": 0.3280293643474579, "skip_count": 4.0, "step": 404, "text_loss": 0.2842077314853668 @@ -3855,13 +3855,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2080078125, + "grad_norm": 0.2138671875, "learning_rate": 0.0008100000000000001, - "loss": 0.0905, + "loss": 0.0901, "macro_f1": 0.3333333432674408, "num_tokens": 654280.0, "repeat_count": 0.0, - "routers_loss": 0.02722037397325039, + "routers_loss": 0.02623247355222702, "skip_count": 0.0, "step": 406, "text_loss": 0.46742817759513855 @@ -3874,13 +3874,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.216796875, "learning_rate": 0.0008139999999999999, - "loss": 0.0958, + "loss": 0.0945, "macro_f1": 0.3333333432674408, "num_tokens": 657568.0, "repeat_count": 0.0, - "routers_loss": 0.010129833593964577, + "routers_loss": 0.009744114242494106, "skip_count": 0.0, "step": 408, "text_loss": 0.7168047428131104 @@ -3893,13 +3893,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2373046875, + "grad_norm": 0.2158203125, "learning_rate": 0.0008179999999999999, - "loss": 0.1084, + "loss": 0.1065, "macro_f1": 0.32098764181137085, "num_tokens": 660593.0, "repeat_count": 0.0, - "routers_loss": 0.07298308610916138, + "routers_loss": 0.07591600716114044, "skip_count": 2.0, "step": 410, "text_loss": 0.449823260307312 @@ -3912,13 +3912,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.15625, + "grad_norm": 0.1396484375, "learning_rate": 0.0008219999999999999, - "loss": 0.0802, + "loss": 0.0795, "macro_f1": 0.3333333432674408, "num_tokens": 663916.0, "repeat_count": 0.0, - "routers_loss": 0.024257874116301537, + "routers_loss": 0.02076602540910244, "skip_count": 0.0, "step": 412, "text_loss": 0.4764713943004608 @@ -3931,13 +3931,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1904296875, + "grad_norm": 0.1650390625, "learning_rate": 0.000826, - "loss": 0.0842, + "loss": 0.0836, "macro_f1": 0.3272727429866791, "num_tokens": 667502.0, "repeat_count": 0.0, - "routers_loss": 0.048864223062992096, + "routers_loss": 0.049170155078172684, "skip_count": 1.0, "step": 414, "text_loss": 0.30333325266838074 @@ -3950,13 +3950,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1513671875, "learning_rate": 0.00083, - "loss": 0.1026, + "loss": 0.1021, "macro_f1": 0.3272727429866791, "num_tokens": 670510.0, "repeat_count": 1.0, - "routers_loss": 0.1592330038547516, + "routers_loss": 0.15554003417491913, "skip_count": 0.0, "step": 416, "text_loss": 0.3691870868206024 @@ -3969,13 +3969,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25390625, + "grad_norm": 0.263671875, "learning_rate": 0.000834, - "loss": 0.0963, + "loss": 0.1013, "macro_f1": 0.3333333432674408, "num_tokens": 674761.0, "repeat_count": 0.0, - "routers_loss": 0.02291976846754551, + "routers_loss": 0.024516675621271133, "skip_count": 0.0, "step": 418, "text_loss": 0.32850381731987 @@ -3988,13 +3988,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.10888671875, "learning_rate": 0.000838, - "loss": 0.0634, + "loss": 0.0649, "macro_f1": 0.3333333432674408, "num_tokens": 678055.0, "repeat_count": 0.0, - "routers_loss": 0.010272650048136711, + "routers_loss": 0.011026890948414803, "skip_count": 0.0, "step": 420, "text_loss": 0.6637290716171265 @@ -4007,13 +4007,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28125, + "grad_norm": 0.263671875, "learning_rate": 0.000842, - "loss": 0.0786, + "loss": 0.0771, "macro_f1": 0.3272727429866791, "num_tokens": 680979.0, "repeat_count": 0.0, - "routers_loss": 0.0692613497376442, + "routers_loss": 0.07451887428760529, "skip_count": 1.0, "step": 422, "text_loss": 0.27131685614585876 @@ -4026,13 +4026,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12890625, + "grad_norm": 0.1318359375, "learning_rate": 0.000846, - "loss": 0.0706, + "loss": 0.0714, "macro_f1": 0.32098764181137085, "num_tokens": 684144.0, "repeat_count": 1.0, - "routers_loss": 0.12713804841041565, + "routers_loss": 0.11341800540685654, "skip_count": 1.0, "step": 424, "text_loss": 0.652126669883728 @@ -4045,13 +4045,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.2158203125, "learning_rate": 0.00085, - "loss": 0.0758, + "loss": 0.0754, "macro_f1": 0.3272727429866791, "num_tokens": 687004.0, "repeat_count": 1.0, - "routers_loss": 0.08670130372047424, + "routers_loss": 0.08985847979784012, "skip_count": 0.0, "step": 426, "text_loss": 0.2589428424835205 @@ -4064,13 +4064,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.240234375, + "grad_norm": 0.23828125, "learning_rate": 0.000854, - "loss": 0.0857, + "loss": 0.0866, "macro_f1": 0.3333333432674408, "num_tokens": 689702.0, "repeat_count": 0.0, - "routers_loss": 0.01053862925618887, + "routers_loss": 0.011355436407029629, "skip_count": 0.0, "step": 428, "text_loss": 0.8909716010093689 @@ -4083,13 +4083,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1435546875, "learning_rate": 0.000858, - "loss": 0.0615, + "loss": 0.0623, "macro_f1": 0.3333333432674408, "num_tokens": 692698.0, "repeat_count": 0.0, - "routers_loss": 0.012946994043886662, + "routers_loss": 0.013788948766887188, "skip_count": 0.0, "step": 430, "text_loss": 0.19141142070293427 @@ -4102,13 +4102,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1552734375, "learning_rate": 0.000862, - "loss": 0.0498, + "loss": 0.0499, "macro_f1": 0.32098764181137085, "num_tokens": 696007.0, "repeat_count": 0.0, - "routers_loss": 0.08222822099924088, + "routers_loss": 0.07998392730951309, "skip_count": 2.0, "step": 432, "text_loss": 0.1611809879541397 @@ -4121,13 +4121,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.173828125, "learning_rate": 0.000866, - "loss": 0.0532, + "loss": 0.0541, "macro_f1": 0.32098764181137085, "num_tokens": 700271.0, "repeat_count": 0.0, - "routers_loss": 0.07086442410945892, + "routers_loss": 0.06988382339477539, "skip_count": 2.0, "step": 434, "text_loss": 0.37254223227500916 @@ -4140,13 +4140,13 @@ "f1_execute": 0.8333333730697632, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.1943359375, "learning_rate": 0.00087, - "loss": 0.0825, + "loss": 0.0834, "macro_f1": 0.2777777910232544, "num_tokens": 703519.0, "repeat_count": 3.0, - "routers_loss": 0.29007306694984436, + "routers_loss": 0.28240787982940674, "skip_count": 5.0, "step": 436, "text_loss": 0.29636648297309875 @@ -4159,13 +4159,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.396484375, + "grad_norm": 0.423828125, "learning_rate": 0.000874, - "loss": 0.0658, + "loss": 0.0657, "macro_f1": 0.3333333432674408, "num_tokens": 706826.0, "repeat_count": 0.0, - "routers_loss": 0.014652491547167301, + "routers_loss": 0.013924967497587204, "skip_count": 0.0, "step": 438, "text_loss": 0.20867908000946045 @@ -4178,13 +4178,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2294921875, + "grad_norm": 0.2353515625, "learning_rate": 0.000878, - "loss": 0.0685, + "loss": 0.0657, "macro_f1": 0.3333333432674408, "num_tokens": 710530.0, "repeat_count": 0.0, - "routers_loss": 0.013720969669520855, + "routers_loss": 0.01170142088085413, "skip_count": 0.0, "step": 440, "text_loss": 0.7273373007774353 @@ -4197,13 +4197,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.173828125, + "grad_norm": 0.171875, "learning_rate": 0.000882, - "loss": 0.0771, + "loss": 0.076, "macro_f1": 0.3333333432674408, "num_tokens": 713503.0, "repeat_count": 0.0, - "routers_loss": 0.011687638238072395, + "routers_loss": 0.011930872686207294, "skip_count": 0.0, "step": 442, "text_loss": 0.39314430952072144 @@ -4216,13 +4216,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.2490234375, "learning_rate": 0.0008860000000000001, - "loss": 0.0604, + "loss": 0.0592, "macro_f1": 0.3333333432674408, "num_tokens": 716582.0, "repeat_count": 0.0, - "routers_loss": 0.007869532331824303, + "routers_loss": 0.008630385622382164, "skip_count": 0.0, "step": 444, "text_loss": 0.5925271511077881 @@ -4230,18 +4230,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 2.0939242735544465, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.23046875, "learning_rate": 0.0008900000000000001, - "loss": 0.0797, - "macro_f1": 0.3076923191547394, + "loss": 0.0811, + "macro_f1": 0.3006536066532135, "num_tokens": 719941.0, "repeat_count": 3.0, - "routers_loss": 0.3034668564796448, + "routers_loss": 0.3015584945678711, "skip_count": 1.0, "step": 446, "text_loss": 0.5059905052185059 @@ -4254,13 +4254,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2314453125, + "grad_norm": 0.203125, "learning_rate": 0.000894, - "loss": 0.0823, + "loss": 0.0822, "macro_f1": 0.31446540355682373, "num_tokens": 723113.0, "repeat_count": 1.0, - "routers_loss": 0.11066079139709473, + "routers_loss": 0.10897493362426758, "skip_count": 1.0, "step": 448, "text_loss": 0.19616436958312988 @@ -4273,13 +4273,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3046875, + "grad_norm": 0.33984375, "learning_rate": 0.000898, - "loss": 0.0773, + "loss": 0.0782, "macro_f1": 0.32098764181137085, "num_tokens": 726193.0, "repeat_count": 0.0, - "routers_loss": 0.0755370482802391, + "routers_loss": 0.07236456125974655, "skip_count": 2.0, "step": 450, "text_loss": 0.1773054152727127 @@ -4292,13 +4292,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28125, + "grad_norm": 0.3203125, "learning_rate": 0.000902, - "loss": 0.0596, + "loss": 0.058, "macro_f1": 0.3272727429866791, "num_tokens": 729275.0, "repeat_count": 1.0, - "routers_loss": 0.08470689505338669, + "routers_loss": 0.08184371143579483, "skip_count": 0.0, "step": 452, "text_loss": 0.4927310049533844 @@ -4311,13 +4311,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19921875, + "grad_norm": 0.1953125, "learning_rate": 0.000906, - "loss": 0.0608, + "loss": 0.0607, "macro_f1": 0.3333333432674408, "num_tokens": 731948.0, "repeat_count": 0.0, - "routers_loss": 0.0130238626152277, + "routers_loss": 0.014033539220690727, "skip_count": 0.0, "step": 454, "text_loss": 0.4745742678642273 @@ -4330,13 +4330,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.154296875, "learning_rate": 0.00091, - "loss": 0.0652, + "loss": 0.0651, "macro_f1": 0.3333333432674408, "num_tokens": 735351.0, "repeat_count": 0.0, - "routers_loss": 0.007108641788363457, + "routers_loss": 0.0071774693205952644, "skip_count": 0.0, "step": 456, "text_loss": 0.18523462116718292 @@ -4351,11 +4351,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.400390625, "learning_rate": 0.0009140000000000001, - "loss": 0.0746, + "loss": 0.0738, "macro_f1": 0.5492662787437439, "num_tokens": 738587.0, "repeat_count": 0.0, - "routers_loss": 0.06834109872579575, + "routers_loss": 0.07781517505645752, "skip_count": 2.0, "step": 458, "text_loss": 0.3459635376930237 @@ -4368,13 +4368,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.28125, "learning_rate": 0.0009180000000000001, - "loss": 0.0733, + "loss": 0.0723, "macro_f1": 0.3076923191547394, "num_tokens": 741779.0, "repeat_count": 0.0, - "routers_loss": 0.10230778902769089, + "routers_loss": 0.09529037028551102, "skip_count": 2.0, "step": 460, "text_loss": 0.20197433233261108 @@ -4387,13 +4387,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.1865234375, "learning_rate": 0.0009220000000000001, - "loss": 0.0528, + "loss": 0.0519, "macro_f1": 0.3333333432674408, "num_tokens": 745355.0, "repeat_count": 0.0, - "routers_loss": 0.009987542405724525, + "routers_loss": 0.009765669703483582, "skip_count": 0.0, "step": 462, "text_loss": 0.7031404376029968 @@ -4406,13 +4406,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.125, + "grad_norm": 0.1298828125, "learning_rate": 0.0009260000000000001, - "loss": 0.0536, + "loss": 0.0527, "macro_f1": 0.3272727429866791, "num_tokens": 748628.0, "repeat_count": 0.0, - "routers_loss": 0.03448869287967682, + "routers_loss": 0.03344850242137909, "skip_count": 1.0, "step": 464, "text_loss": 0.21274663507938385 @@ -4425,13 +4425,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.173828125, "learning_rate": 0.00093, - "loss": 0.053, + "loss": 0.0534, "macro_f1": 0.3076923191547394, "num_tokens": 751472.0, "repeat_count": 2.0, - "routers_loss": 0.13631699979305267, + "routers_loss": 0.1354292333126068, "skip_count": 2.0, "step": 466, "text_loss": 0.5350717306137085 @@ -4444,13 +4444,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.142578125, "learning_rate": 0.000934, - "loss": 0.06, + "loss": 0.0598, "macro_f1": 0.3272727429866791, "num_tokens": 754479.0, "repeat_count": 0.0, - "routers_loss": 0.053951870650053024, + "routers_loss": 0.056420840322971344, "skip_count": 1.0, "step": 468, "text_loss": 0.28153330087661743 @@ -4463,13 +4463,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.228515625, + "grad_norm": 0.234375, "learning_rate": 0.0009379999999999999, - "loss": 0.059, + "loss": 0.0597, "macro_f1": 0.31446540355682373, "num_tokens": 757872.0, "repeat_count": 1.0, - "routers_loss": 0.14479905366897583, + "routers_loss": 0.1622387170791626, "skip_count": 1.0, "step": 470, "text_loss": 0.22956843674182892 @@ -4482,13 +4482,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.44140625, + "grad_norm": 0.5, "learning_rate": 0.000942, - "loss": 0.0913, + "loss": 0.0953, "macro_f1": 0.32098764181137085, "num_tokens": 760468.0, "repeat_count": 0.0, - "routers_loss": 0.056221429258584976, + "routers_loss": 0.05146972835063934, "skip_count": 2.0, "step": 472, "text_loss": 0.4513966739177704 @@ -4501,13 +4501,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1904296875, + "grad_norm": 0.212890625, "learning_rate": 0.000946, - "loss": 0.0591, + "loss": 0.0592, "macro_f1": 0.3272727429866791, "num_tokens": 763519.0, "repeat_count": 1.0, - "routers_loss": 0.09729792177677155, + "routers_loss": 0.09022669494152069, "skip_count": 0.0, "step": 474, "text_loss": 0.25758957862854004 @@ -4520,13 +4520,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12158203125, + "grad_norm": 0.1259765625, "learning_rate": 0.00095, - "loss": 0.0496, + "loss": 0.0498, "macro_f1": 0.3272727429866791, "num_tokens": 767391.0, "repeat_count": 0.0, - "routers_loss": 0.029447713866829872, + "routers_loss": 0.03044828027486801, "skip_count": 1.0, "step": 476, "text_loss": 0.21366681158542633 @@ -4539,13 +4539,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.271484375, + "grad_norm": 0.291015625, "learning_rate": 0.000954, - "loss": 0.0801, + "loss": 0.0802, "macro_f1": 0.3272727429866791, "num_tokens": 770338.0, "repeat_count": 0.0, - "routers_loss": 0.09337342530488968, + "routers_loss": 0.10397060960531235, "skip_count": 1.0, "step": 478, "text_loss": 1.0396177768707275 @@ -4560,11 +4560,11 @@ "f1_skip": 0.0, "grad_norm": 0.267578125, "learning_rate": 0.000958, - "loss": 0.1102, + "loss": 0.1099, "macro_f1": 0.285714328289032, "num_tokens": 773699.0, "repeat_count": 2.0, - "routers_loss": 0.23193210363388062, + "routers_loss": 0.22604143619537354, "skip_count": 4.0, "step": 480, "text_loss": 0.2570283114910126 @@ -4572,18 +4572,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 2.2629879659524508, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.146484375, "learning_rate": 0.000962, - "loss": 0.0669, - "macro_f1": 0.3272727429866791, + "loss": 0.0667, + "macro_f1": 0.32098767161369324, "num_tokens": 777473.0, "repeat_count": 0.0, - "routers_loss": 0.046257760375738144, + "routers_loss": 0.048258859664201736, "skip_count": 1.0, "step": 482, "text_loss": 0.2540103495121002 @@ -4596,13 +4596,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1708984375, + "grad_norm": 0.197265625, "learning_rate": 0.000966, - "loss": 0.0552, + "loss": 0.0592, "macro_f1": 0.3333333432674408, "num_tokens": 780833.0, "repeat_count": 0.0, - "routers_loss": 0.01683143898844719, + "routers_loss": 0.023018671199679375, "skip_count": 0.0, "step": 484, "text_loss": 0.38524550199508667 @@ -4615,13 +4615,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.326171875, + "grad_norm": 0.314453125, "learning_rate": 0.0009699999999999999, - "loss": 0.071, + "loss": 0.0709, "macro_f1": 0.3272727429866791, "num_tokens": 783656.0, "repeat_count": 0.0, - "routers_loss": 0.04129387438297272, + "routers_loss": 0.044845327734947205, "skip_count": 1.0, "step": 486, "text_loss": 0.5859048366546631 @@ -4634,13 +4634,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.263671875, "learning_rate": 0.000974, - "loss": 0.0605, + "loss": 0.0615, "macro_f1": 0.3333333432674408, "num_tokens": 787173.0, "repeat_count": 0.0, - "routers_loss": 0.01262948103249073, + "routers_loss": 0.010898692533373833, "skip_count": 0.0, "step": 488, "text_loss": 0.3456067442893982 @@ -4653,13 +4653,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.263671875, "learning_rate": 0.000978, - "loss": 0.081, + "loss": 0.0796, "macro_f1": 0.32098764181137085, "num_tokens": 790395.0, "repeat_count": 0.0, - "routers_loss": 0.07404553890228271, + "routers_loss": 0.06497956812381744, "skip_count": 2.0, "step": 490, "text_loss": 0.3751123249530792 @@ -4672,13 +4672,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.2158203125, "learning_rate": 0.000982, - "loss": 0.0751, + "loss": 0.0772, "macro_f1": 0.3272727429866791, "num_tokens": 793137.0, "repeat_count": 0.0, - "routers_loss": 0.06795930862426758, + "routers_loss": 0.07763728499412537, "skip_count": 1.0, "step": 492, "text_loss": 0.43296709656715393 @@ -4691,13 +4691,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.1416015625, "learning_rate": 0.0009860000000000001, - "loss": 0.0804, + "loss": 0.0819, "macro_f1": 0.3333333432674408, "num_tokens": 796497.0, "repeat_count": 0.0, - "routers_loss": 0.02233024686574936, + "routers_loss": 0.02127906307578087, "skip_count": 0.0, "step": 494, "text_loss": 0.4841311275959015 @@ -4710,13 +4710,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1953125, + "grad_norm": 0.2138671875, "learning_rate": 0.00099, - "loss": 0.0731, + "loss": 0.073, "macro_f1": 0.3272727429866791, "num_tokens": 799361.0, "repeat_count": 1.0, - "routers_loss": 0.07979031652212143, + "routers_loss": 0.09518691152334213, "skip_count": 0.0, "step": 496, "text_loss": 0.5094487071037292 @@ -4729,13 +4729,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1298828125, + "grad_norm": 0.130859375, "learning_rate": 0.000994, - "loss": 0.0795, + "loss": 0.0789, "macro_f1": 0.5492662787437439, "num_tokens": 802629.0, "repeat_count": 0.0, - "routers_loss": 0.045646365731954575, + "routers_loss": 0.0563947930932045, "skip_count": 2.0, "step": 498, "text_loss": 0.42783617973327637 @@ -4748,13 +4748,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1953125, + "grad_norm": 0.1865234375, "learning_rate": 0.000998, "loss": 0.0476, "macro_f1": 0.3272727429866791, "num_tokens": 805881.0, "repeat_count": 1.0, - "routers_loss": 0.09717849642038345, + "routers_loss": 0.10570426285266876, "skip_count": 0.0, "step": 500, "text_loss": 0.28395503759384155 @@ -4767,13 +4767,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.30078125, + "grad_norm": 0.2275390625, "learning_rate": 0.0009999999760498814, - "loss": 0.0894, + "loss": 0.0849, "macro_f1": 0.5492662787437439, "num_tokens": 809283.0, "repeat_count": 0.0, - "routers_loss": 0.03948225453495979, + "routers_loss": 0.031202208250761032, "skip_count": 2.0, "step": 502, "text_loss": 0.32970911264419556 @@ -4786,13 +4786,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.15625, + "grad_norm": 0.1455078125, "learning_rate": 0.0009999997844489475, - "loss": 0.0557, + "loss": 0.0574, "macro_f1": 0.3272727429866791, "num_tokens": 812440.0, "repeat_count": 0.0, - "routers_loss": 0.0742638111114502, + "routers_loss": 0.07647835463285446, "skip_count": 1.0, "step": 504, "text_loss": 0.4901447296142578 @@ -4805,13 +4805,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.25, "learning_rate": 0.000999999401247153, - "loss": 0.0682, + "loss": 0.0668, "macro_f1": 0.32098764181137085, "num_tokens": 815716.0, "repeat_count": 0.0, - "routers_loss": 0.08293049037456512, + "routers_loss": 0.08515176922082901, "skip_count": 2.0, "step": 506, "text_loss": 0.6157599687576294 @@ -4824,13 +4824,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.25390625, "learning_rate": 0.0009999988264446445, - "loss": 0.0697, + "loss": 0.0686, "macro_f1": 0.3333333432674408, "num_tokens": 819086.0, "repeat_count": 0.0, - "routers_loss": 0.010080376639962196, + "routers_loss": 0.00946938619017601, "skip_count": 0.0, "step": 508, "text_loss": 0.5053519010543823 @@ -4843,13 +4843,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1640625, "learning_rate": 0.0009999980600416424, - "loss": 0.0611, + "loss": 0.0574, "macro_f1": 0.3333333432674408, "num_tokens": 822268.0, "repeat_count": 0.0, - "routers_loss": 0.009179878048598766, + "routers_loss": 0.01058756373822689, "skip_count": 0.0, "step": 510, "text_loss": 0.5570021867752075 @@ -4862,13 +4862,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11083984375, + "grad_norm": 0.1240234375, "learning_rate": 0.000999997102038441, - "loss": 0.0689, + "loss": 0.0678, "macro_f1": 0.3333333432674408, "num_tokens": 825728.0, "repeat_count": 0.0, - "routers_loss": 0.006718529388308525, + "routers_loss": 0.008705209009349346, "skip_count": 0.0, "step": 512, "text_loss": 0.6519040465354919 @@ -4881,13 +4881,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.220703125, "learning_rate": 0.0009999959524354064, - "loss": 0.0826, + "loss": 0.083, "macro_f1": 0.3272727429866791, "num_tokens": 829459.0, "repeat_count": 0.0, - "routers_loss": 0.049344487488269806, + "routers_loss": 0.04024193435907364, "skip_count": 1.0, "step": 514, "text_loss": 0.5290043950080872 @@ -4900,13 +4900,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.259765625, + "grad_norm": 0.25390625, "learning_rate": 0.00099999461123298, - "loss": 0.0739, + "loss": 0.0727, "macro_f1": 0.3333333432674408, "num_tokens": 832291.0, "repeat_count": 0.0, - "routers_loss": 0.013402626849710941, + "routers_loss": 0.015742862597107887, "skip_count": 0.0, "step": 516, "text_loss": 0.7910057902336121 @@ -4919,13 +4919,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.2275390625, "learning_rate": 0.000999993078431675, - "loss": 0.0761, + "loss": 0.0759, "macro_f1": 0.3076923191547394, "num_tokens": 835399.0, "repeat_count": 1.0, - "routers_loss": 0.16964484751224518, + "routers_loss": 0.16753782331943512, "skip_count": 3.0, "step": 518, "text_loss": 0.45196083188056946 @@ -4938,13 +4938,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2236328125, + "grad_norm": 0.236328125, "learning_rate": 0.0009999913540320792, - "loss": 0.095, + "loss": 0.0968, "macro_f1": 0.31446540355682373, "num_tokens": 838993.0, "repeat_count": 0.0, - "routers_loss": 0.08609295636415482, + "routers_loss": 0.09357143193483353, "skip_count": 2.0, "step": 520, "text_loss": 0.5499435663223267 @@ -4957,13 +4957,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.2392578125, + "grad_norm": 0.2451171875, "learning_rate": 0.0009999894380348536, - "loss": 0.0816, + "loss": 0.0821, "macro_f1": 0.5492662787437439, "num_tokens": 842652.0, "repeat_count": 0.0, - "routers_loss": 0.05354784056544304, + "routers_loss": 0.056803856045007706, "skip_count": 2.0, "step": 522, "text_loss": 0.197520449757576 @@ -4976,13 +4976,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.2236328125, + "grad_norm": 0.2333984375, "learning_rate": 0.000999987330440732, - "loss": 0.0715, + "loss": 0.0725, "macro_f1": 0.4871794879436493, "num_tokens": 847061.0, "repeat_count": 0.0, - "routers_loss": 0.09146631509065628, + "routers_loss": 0.08962195366621017, "skip_count": 3.0, "step": 524, "text_loss": 0.27509039640426636 @@ -4995,13 +4995,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.189453125, "learning_rate": 0.000999985031250522, - "loss": 0.0574, + "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 850780.0, "repeat_count": 0.0, - "routers_loss": 0.02344255894422531, + "routers_loss": 0.022930558770895004, "skip_count": 0.0, "step": 526, "text_loss": 0.13291706144809723 @@ -5014,13 +5014,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1982421875, + "grad_norm": 0.197265625, "learning_rate": 0.0009999825404651053, - "loss": 0.0621, + "loss": 0.0614, "macro_f1": 0.3333333432674408, "num_tokens": 853886.0, "repeat_count": 0.0, - "routers_loss": 0.018271517008543015, + "routers_loss": 0.017097990959882736, "skip_count": 0.0, "step": 528, "text_loss": 0.21706295013427734 @@ -5033,13 +5033,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2060546875, + "grad_norm": 0.212890625, "learning_rate": 0.0009999798580854356, - "loss": 0.0717, + "loss": 0.0724, "macro_f1": 0.3333333432674408, "num_tokens": 857364.0, "repeat_count": 0.0, - "routers_loss": 0.026990914717316628, + "routers_loss": 0.02831801027059555, "skip_count": 0.0, "step": 530, "text_loss": 0.9035662412643433 @@ -5052,13 +5052,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16015625, + "grad_norm": 0.1591796875, "learning_rate": 0.000999976984112541, - "loss": 0.0681, + "loss": 0.0674, "macro_f1": 0.3333333432674408, "num_tokens": 860661.0, "repeat_count": 0.0, - "routers_loss": 0.019737249240279198, + "routers_loss": 0.019671892747282982, "skip_count": 0.0, "step": 532, "text_loss": 0.8354863524436951 @@ -5071,13 +5071,13 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.3046875, + "grad_norm": 0.2890625, "learning_rate": 0.0009999739185475231, - "loss": 0.0978, + "loss": 0.0963, "macro_f1": 0.47333335876464844, "num_tokens": 864124.0, "repeat_count": 2.0, - "routers_loss": 0.212640181183815, + "routers_loss": 0.21383361518383026, "skip_count": 3.0, "step": 534, "text_loss": 0.23422949016094208 @@ -5090,13 +5090,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.2490234375, "learning_rate": 0.0009999706613915565, - "loss": 0.0602, + "loss": 0.0598, "macro_f1": 0.32098767161369324, "num_tokens": 866976.0, "repeat_count": 0.0, - "routers_loss": 0.07302755117416382, + "routers_loss": 0.07158871740102768, "skip_count": 1.0, "step": 536, "text_loss": 0.11800774186849594 @@ -5109,13 +5109,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.296875, + "grad_norm": 0.26953125, "learning_rate": 0.0009999672126458894, - "loss": 0.0825, + "loss": 0.0822, "macro_f1": 0.3272727429866791, "num_tokens": 870549.0, "repeat_count": 0.0, - "routers_loss": 0.08667246252298355, + "routers_loss": 0.08185924589633942, "skip_count": 1.0, "step": 538, "text_loss": 0.19232480227947235 @@ -5128,13 +5128,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1318359375, + "grad_norm": 0.1396484375, "learning_rate": 0.000999963572311843, - "loss": 0.0597, + "loss": 0.0604, "macro_f1": 0.3333333432674408, "num_tokens": 873733.0, "repeat_count": 0.0, - "routers_loss": 0.015047167427837849, + "routers_loss": 0.01633382774889469, "skip_count": 0.0, "step": 540, "text_loss": 0.3725031912326813 @@ -5147,13 +5147,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.15234375, "learning_rate": 0.0009999597403908128, - "loss": 0.076, + "loss": 0.0761, "macro_f1": 0.3272727429866791, "num_tokens": 877099.0, "repeat_count": 0.0, - "routers_loss": 0.07481446117162704, + "routers_loss": 0.0782657191157341, "skip_count": 1.0, "step": 542, "text_loss": 0.17589199542999268 @@ -5166,13 +5166,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1943359375, + "grad_norm": 0.2177734375, "learning_rate": 0.0009999557168842669, - "loss": 0.0724, + "loss": 0.0716, "macro_f1": 0.5492662787437439, "num_tokens": 879883.0, "repeat_count": 0.0, - "routers_loss": 0.049495212733745575, + "routers_loss": 0.05275818333029747, "skip_count": 2.0, "step": 544, "text_loss": 0.26448264718055725 @@ -5185,13 +5185,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25390625, + "grad_norm": 0.2490234375, "learning_rate": 0.0009999515017937468, - "loss": 0.0718, + "loss": 0.071, "macro_f1": 0.32098764181137085, "num_tokens": 882223.0, "repeat_count": 0.0, - "routers_loss": 0.08043002337217331, + "routers_loss": 0.09335892647504807, "skip_count": 2.0, "step": 546, "text_loss": 0.208544060587883 @@ -5204,13 +5204,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.34765625, + "grad_norm": 0.376953125, "learning_rate": 0.0009999470951208684, - "loss": 0.086, + "loss": 0.0855, "macro_f1": 0.32098764181137085, "num_tokens": 885241.0, "repeat_count": 2.0, - "routers_loss": 0.22461950778961182, + "routers_loss": 0.22983254492282867, "skip_count": 0.0, "step": 548, "text_loss": 0.6612338423728943 @@ -5223,13 +5223,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.216796875, "learning_rate": 0.00099994249686732, - "loss": 0.0798, + "loss": 0.0786, "macro_f1": 0.3272727429866791, "num_tokens": 887897.0, "repeat_count": 1.0, - "routers_loss": 0.11754962801933289, + "routers_loss": 0.12858282029628754, "skip_count": 0.0, "step": 550, "text_loss": 0.4673548936843872 @@ -5242,13 +5242,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1611328125, + "grad_norm": 0.1591796875, "learning_rate": 0.0009999377070348638, - "loss": 0.0978, + "loss": 0.0944, "macro_f1": 0.3333333432674408, "num_tokens": 891224.0, "repeat_count": 0.0, - "routers_loss": 0.017412789165973663, + "routers_loss": 0.017421770840883255, "skip_count": 0.0, "step": 552, "text_loss": 0.6419258117675781 @@ -5261,13 +5261,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.15625, "learning_rate": 0.000999932725625335, - "loss": 0.0792, + "loss": 0.0791, "macro_f1": 0.32098764181137085, "num_tokens": 894578.0, "repeat_count": 0.0, - "routers_loss": 0.08969525247812271, + "routers_loss": 0.07890026271343231, "skip_count": 2.0, "step": 554, "text_loss": 0.5970752239227295 @@ -5280,13 +5280,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2158203125, + "grad_norm": 0.216796875, "learning_rate": 0.0009999275526406427, - "loss": 0.0803, + "loss": 0.0796, "macro_f1": 0.31446540355682373, "num_tokens": 897145.0, "repeat_count": 1.0, - "routers_loss": 0.09876437485218048, + "routers_loss": 0.09836960583925247, "skip_count": 1.0, "step": 556, "text_loss": 0.752425491809845 @@ -5299,13 +5299,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.1875, "learning_rate": 0.0009999221880827693, - "loss": 0.0887, + "loss": 0.0882, "macro_f1": 0.3333333432674408, "num_tokens": 900565.0, "repeat_count": 0.0, - "routers_loss": 0.019108204171061516, + "routers_loss": 0.017694659531116486, "skip_count": 0.0, "step": 558, "text_loss": 0.195619136095047 @@ -5318,32 +5318,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.2021484375, "learning_rate": 0.0009999166319537703, - "loss": 0.0573, + "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 903506.0, "repeat_count": 0.0, - "routers_loss": 0.019048813730478287, + "routers_loss": 0.019375264644622803, "skip_count": 0.0, "step": 560, "text_loss": 0.4603337347507477 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, "epoch": 2.638685060170238, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1435546875, + "f1_skip": 0.5, + "grad_norm": 0.146484375, "learning_rate": 0.0009999108842557748, - "loss": 0.0947, - "macro_f1": 0.3144654333591461, + "loss": 0.0953, + "macro_f1": 0.4871794879436493, "num_tokens": 906380.0, "repeat_count": 0.0, - "routers_loss": 0.11889495700597763, + "routers_loss": 0.12013207376003265, "skip_count": 3.0, "step": 562, "text_loss": 0.6279402375221252 @@ -5356,13 +5356,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.228515625, + "grad_norm": 0.255859375, "learning_rate": 0.0009999049449909854, - "loss": 0.0771, + "loss": 0.0799, "macro_f1": 0.3272727429866791, "num_tokens": 909116.0, "repeat_count": 0.0, - "routers_loss": 0.06202332302927971, + "routers_loss": 0.06441342830657959, "skip_count": 1.0, "step": 564, "text_loss": 0.23741699755191803 @@ -5375,13 +5375,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1513671875, + "grad_norm": 0.15234375, "learning_rate": 0.0009998988141616781, - "loss": 0.0623, + "loss": 0.064, "macro_f1": 0.32098767161369324, "num_tokens": 912189.0, "repeat_count": 0.0, - "routers_loss": 0.08294244855642319, + "routers_loss": 0.08309414982795715, "skip_count": 1.0, "step": 566, "text_loss": 0.27780941128730774 @@ -5394,13 +5394,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.1962890625, "learning_rate": 0.0009998924917702023, - "loss": 0.0885, + "loss": 0.0876, "macro_f1": 0.3272727429866791, "num_tokens": 916279.0, "repeat_count": 1.0, - "routers_loss": 0.07545182853937149, + "routers_loss": 0.07197169959545135, "skip_count": 0.0, "step": 568, "text_loss": 0.6371755599975586 @@ -5413,13 +5413,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.2255859375, "learning_rate": 0.0009998859778189806, - "loss": 0.0712, + "loss": 0.0706, "macro_f1": 0.3333333432674408, "num_tokens": 919490.0, "repeat_count": 0.0, - "routers_loss": 0.008711219765245914, + "routers_loss": 0.008022273890674114, "skip_count": 0.0, "step": 570, "text_loss": 0.6028938889503479 @@ -5432,13 +5432,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.1650390625, "learning_rate": 0.000999879272310509, - "loss": 0.0837, + "loss": 0.084, "macro_f1": 0.3333333432674408, "num_tokens": 923694.0, "repeat_count": 0.0, - "routers_loss": 0.01639273390173912, + "routers_loss": 0.01634674146771431, "skip_count": 0.0, "step": 572, "text_loss": 0.7177054286003113 @@ -5451,13 +5451,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1669921875, + "grad_norm": 0.17578125, "learning_rate": 0.0009998723752473574, - "loss": 0.0707, + "loss": 0.0716, "macro_f1": 0.3272727429866791, "num_tokens": 926933.0, "repeat_count": 0.0, - "routers_loss": 0.04997137933969498, + "routers_loss": 0.060559045523405075, "skip_count": 1.0, "step": 574, "text_loss": 0.5203254818916321 @@ -5470,13 +5470,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1845703125, + "grad_norm": 0.185546875, "learning_rate": 0.0009998652866321687, - "loss": 0.0799, + "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 929832.0, "repeat_count": 0.0, - "routers_loss": 0.011360209435224533, + "routers_loss": 0.011485611088573933, "skip_count": 0.0, "step": 576, "text_loss": 0.6147452592849731 @@ -5489,13 +5489,13 @@ "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1669921875, + "grad_norm": 0.1552734375, "learning_rate": 0.000999858006467659, - "loss": 0.0658, + "loss": 0.0649, "macro_f1": 0.29333335161209106, "num_tokens": 933266.0, "repeat_count": 2.0, - "routers_loss": 0.31349560618400574, + "routers_loss": 0.2929030954837799, "skip_count": 4.0, "step": 578, "text_loss": 0.1720666140317917 @@ -5508,13 +5508,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.24609375, "learning_rate": 0.0009998505347566186, - "loss": 0.0801, + "loss": 0.0782, "macro_f1": 0.32098764181137085, "num_tokens": 937545.0, "repeat_count": 0.0, - "routers_loss": 0.058660347014665604, + "routers_loss": 0.053780000656843185, "skip_count": 2.0, "step": 580, "text_loss": 0.3258405327796936 @@ -5527,13 +5527,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.1416015625, "learning_rate": 0.00099984287150191, - "loss": 0.0578, + "loss": 0.0582, "macro_f1": 0.3333333432674408, "num_tokens": 941001.0, "repeat_count": 0.0, - "routers_loss": 0.025836754590272903, + "routers_loss": 0.02637636847794056, "skip_count": 0.0, "step": 582, "text_loss": 0.23762771487236023 @@ -5546,13 +5546,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1552734375, "learning_rate": 0.0009998350167064705, - "loss": 0.0683, + "loss": 0.0672, "macro_f1": 0.3333333432674408, "num_tokens": 943989.0, "repeat_count": 0.0, - "routers_loss": 0.016504868865013123, + "routers_loss": 0.01637580618262291, "skip_count": 0.0, "step": 584, "text_loss": 0.7460582852363586 @@ -5565,13 +5565,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1787109375, + "grad_norm": 0.1884765625, "learning_rate": 0.0009998269703733096, - "loss": 0.0685, + "loss": 0.0686, "macro_f1": 0.3272727429866791, "num_tokens": 947245.0, "repeat_count": 1.0, - "routers_loss": 0.1379794180393219, + "routers_loss": 0.13934117555618286, "skip_count": 0.0, "step": 586, "text_loss": 0.5284690260887146 @@ -5584,13 +5584,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.13671875, "learning_rate": 0.0009998187325055106, - "loss": 0.0657, + "loss": 0.0667, "macro_f1": 0.3333333432674408, "num_tokens": 950116.0, "repeat_count": 0.0, - "routers_loss": 0.01802757754921913, + "routers_loss": 0.02138397842645645, "skip_count": 0.0, "step": 588, "text_loss": 0.3920256197452545 @@ -5603,13 +5603,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.1533203125, "learning_rate": 0.0009998103031062305, - "loss": 0.0762, + "loss": 0.0778, "macro_f1": 0.3333333432674408, "num_tokens": 953277.0, "repeat_count": 0.0, - "routers_loss": 0.006902900990098715, + "routers_loss": 0.007098200265318155, "skip_count": 0.0, "step": 590, "text_loss": 0.7472905516624451 @@ -5622,13 +5622,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3046875, + "grad_norm": 0.318359375, "learning_rate": 0.0009998016821786994, - "loss": 0.0912, + "loss": 0.0872, "macro_f1": 0.32098764181137085, "num_tokens": 958229.0, "repeat_count": 1.0, - "routers_loss": 0.08348741382360458, + "routers_loss": 0.07946522533893585, "skip_count": 1.0, "step": 592, "text_loss": 0.5506448745727539 @@ -5641,13 +5641,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1357421875, "learning_rate": 0.000999792869726221, - "loss": 0.0527, + "loss": 0.0523, "macro_f1": 0.3272727429866791, "num_tokens": 961016.0, "repeat_count": 0.0, - "routers_loss": 0.08290062099695206, + "routers_loss": 0.0850791186094284, "skip_count": 1.0, "step": 594, "text_loss": 0.3824431002140045 @@ -5660,13 +5660,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1650390625, "learning_rate": 0.0009997838657521717, - "loss": 0.0643, + "loss": 0.0632, "macro_f1": 0.3333333432674408, "num_tokens": 963847.0, "repeat_count": 0.0, - "routers_loss": 0.018620988354086876, + "routers_loss": 0.016370445489883423, "skip_count": 0.0, "step": 596, "text_loss": 0.2139475792646408 @@ -5679,13 +5679,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.12890625, "learning_rate": 0.0009997746702600026, - "loss": 0.073, + "loss": 0.0702, "macro_f1": 0.307692289352417, "num_tokens": 966619.0, "repeat_count": 0.0, - "routers_loss": 0.1211671382188797, + "routers_loss": 0.1310746818780899, "skip_count": 3.0, "step": 598, "text_loss": 0.3651018440723419 @@ -5698,13 +5698,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.23828125, "learning_rate": 0.0009997652832532372, - "loss": 0.079, + "loss": 0.0792, "macro_f1": 0.3272727429866791, "num_tokens": 970418.0, "repeat_count": 1.0, - "routers_loss": 0.15485027432441711, + "routers_loss": 0.14303378760814667, "skip_count": 0.0, "step": 600, "text_loss": 0.7094736099243164 @@ -5717,13 +5717,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009997557047354722, - "loss": 0.0562, + "loss": 0.0531, "macro_f1": 0.3272727429866791, "num_tokens": 973491.0, "repeat_count": 0.0, - "routers_loss": 0.036684274673461914, + "routers_loss": 0.03334212675690651, "skip_count": 1.0, "step": 602, "text_loss": 0.4812237024307251 @@ -5731,18 +5731,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 2.835926034634576, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.302734375, + "grad_norm": 0.2890625, "learning_rate": 0.0009997459347103783, - "loss": 0.0985, - "macro_f1": 0.3333333432674408, + "loss": 0.0956, + "macro_f1": 0.3272727429866791, "num_tokens": 976672.0, "repeat_count": 0.0, - "routers_loss": 0.026901578530669212, + "routers_loss": 0.02831871062517166, "skip_count": 0.0, "step": 604, "text_loss": 0.21737146377563477 @@ -5755,13 +5755,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12158203125, + "grad_norm": 0.1298828125, "learning_rate": 0.0009997359731816998, - "loss": 0.0632, + "loss": 0.0646, "macro_f1": 0.3333333432674408, "num_tokens": 979898.0, "repeat_count": 0.0, - "routers_loss": 0.01700405217707157, + "routers_loss": 0.017968013882637024, "skip_count": 0.0, "step": 606, "text_loss": 0.5458008050918579 @@ -5774,13 +5774,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2099609375, + "grad_norm": 0.224609375, "learning_rate": 0.0009997258201532536, - "loss": 0.0758, + "loss": 0.0751, "macro_f1": 0.3333333432674408, "num_tokens": 982811.0, "repeat_count": 0.0, - "routers_loss": 0.015013590455055237, + "routers_loss": 0.016256732866168022, "skip_count": 0.0, "step": 608, "text_loss": 0.8643257021903992 @@ -5793,13 +5793,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.2275390625, "learning_rate": 0.0009997154756289303, - "loss": 0.0576, + "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 985245.0, "repeat_count": 0.0, - "routers_loss": 0.02037946693599224, + "routers_loss": 0.021214161068201065, "skip_count": 0.0, "step": 610, "text_loss": 0.2204967886209488 @@ -5812,13 +5812,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.150390625, "learning_rate": 0.000999704939612694, - "loss": 0.0648, + "loss": 0.0636, "macro_f1": 0.3006536364555359, "num_tokens": 988539.0, "repeat_count": 3.0, - "routers_loss": 0.22834022343158722, + "routers_loss": 0.23249399662017822, "skip_count": 2.0, "step": 612, "text_loss": 0.32489025592803955 @@ -5831,13 +5831,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.095703125, "learning_rate": 0.0009996942121085824, - "loss": 0.0449, + "loss": 0.0445, "macro_f1": 0.3333333432674408, "num_tokens": 991660.0, "repeat_count": 0.0, - "routers_loss": 0.009838113561272621, + "routers_loss": 0.010706410743296146, "skip_count": 0.0, "step": 614, "text_loss": 0.4551754891872406 @@ -5850,13 +5850,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.353515625, + "grad_norm": 0.3671875, "learning_rate": 0.000999683293120706, - "loss": 0.1009, + "loss": 0.1016, "macro_f1": 0.3333333432674408, "num_tokens": 994828.0, "repeat_count": 0.0, - "routers_loss": 0.005943270865827799, + "routers_loss": 0.006676184479147196, "skip_count": 0.0, "step": 616, "text_loss": 0.6212068200111389 @@ -5869,13 +5869,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.38671875, + "grad_norm": 0.408203125, "learning_rate": 0.0009996721826532491, - "loss": 0.0941, + "loss": 0.0976, "macro_f1": 0.3076923191547394, "num_tokens": 997951.0, "repeat_count": 2.0, - "routers_loss": 0.21597740054130554, + "routers_loss": 0.2148125320672989, "skip_count": 2.0, "step": 618, "text_loss": 0.26514527201652527 @@ -5888,13 +5888,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.1904296875, "learning_rate": 0.000999660880710469, - "loss": 0.0896, + "loss": 0.0909, "macro_f1": 0.3333333432674408, "num_tokens": 1001139.0, "repeat_count": 0.0, - "routers_loss": 0.023726588115096092, + "routers_loss": 0.022332455962896347, "skip_count": 0.0, "step": 620, "text_loss": 0.26131340861320496 @@ -5907,13 +5907,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.169921875, "learning_rate": 0.0009996493872966971, "loss": 0.0732, "macro_f1": 0.3272727429866791, "num_tokens": 1003678.0, "repeat_count": 1.0, - "routers_loss": 0.08467255532741547, + "routers_loss": 0.08348730951547623, "skip_count": 0.0, "step": 622, "text_loss": 0.19151706993579865 @@ -5926,13 +5926,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.173828125, "learning_rate": 0.0009996377024163374, - "loss": 0.0816, + "loss": 0.0822, "macro_f1": 0.3333333432674408, "num_tokens": 1007082.0, "repeat_count": 0.0, - "routers_loss": 0.029468854889273643, + "routers_loss": 0.028577150776982307, "skip_count": 0.0, "step": 624, "text_loss": 0.305387407541275 @@ -5945,13 +5945,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.11279296875, "learning_rate": 0.0009996258260738676, - "loss": 0.0891, + "loss": 0.0892, "macro_f1": 0.3272727429866791, "num_tokens": 1010064.0, "repeat_count": 1.0, - "routers_loss": 0.09438466280698776, + "routers_loss": 0.08312026411294937, "skip_count": 0.0, "step": 626, "text_loss": 0.49436143040657043 @@ -5964,13 +5964,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1611328125, "learning_rate": 0.0009996137582738388, - "loss": 0.0581, + "loss": 0.0591, "macro_f1": 0.3333333432674408, "num_tokens": 1013462.0, "repeat_count": 0.0, - "routers_loss": 0.013679586350917816, + "routers_loss": 0.013337327167391777, "skip_count": 0.0, "step": 628, "text_loss": 0.6515294313430786 @@ -5983,13 +5983,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.140625, "learning_rate": 0.000999601499020875, - "loss": 0.0528, + "loss": 0.0537, "macro_f1": 0.3333333432674408, "num_tokens": 1016246.0, "repeat_count": 0.0, - "routers_loss": 0.029532987624406815, + "routers_loss": 0.029126765206456184, "skip_count": 0.0, "step": 630, "text_loss": 0.18834827840328217 @@ -6002,13 +6002,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.095703125, "learning_rate": 0.0009995890483196746, - "loss": 0.0601, + "loss": 0.0602, "macro_f1": 0.3272727429866791, "num_tokens": 1019286.0, "repeat_count": 0.0, - "routers_loss": 0.05516733601689339, + "routers_loss": 0.054844800382852554, "skip_count": 1.0, "step": 632, "text_loss": 0.6988179087638855 @@ -6021,13 +6021,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.357421875, + "grad_norm": 0.322265625, "learning_rate": 0.0009995764061750086, - "loss": 0.0785, + "loss": 0.0767, "macro_f1": 0.3333333432674408, "num_tokens": 1022207.0, "repeat_count": 0.0, - "routers_loss": 0.010254866443574429, + "routers_loss": 0.010095693171024323, "skip_count": 0.0, "step": 634, "text_loss": 0.558451771736145 @@ -6040,13 +6040,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.2890625, "learning_rate": 0.000999563572591721, - "loss": 0.0518, + "loss": 0.0521, "macro_f1": 0.32098764181137085, "num_tokens": 1025319.0, "repeat_count": 1.0, - "routers_loss": 0.07528360933065414, + "routers_loss": 0.0698433518409729, "skip_count": 1.0, "step": 636, "text_loss": 0.5961872935295105 @@ -6059,13 +6059,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1064453125, + "grad_norm": 0.11083984375, "learning_rate": 0.0009995505475747302, - "loss": 0.0844, + "loss": 0.0849, "macro_f1": 0.3272727429866791, "num_tokens": 1028362.0, "repeat_count": 0.0, - "routers_loss": 0.04301584139466286, + "routers_loss": 0.040211405605077744, "skip_count": 1.0, "step": 638, "text_loss": 0.546863317489624 @@ -6078,13 +6078,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11572265625, + "grad_norm": 0.119140625, "learning_rate": 0.0009995373311290272, - "loss": 0.0699, + "loss": 0.0709, "macro_f1": 0.3144654333591461, "num_tokens": 1032199.0, "repeat_count": 2.0, - "routers_loss": 0.14521080255508423, + "routers_loss": 0.1457643061876297, "skip_count": 1.0, "step": 640, "text_loss": 0.2137298285961151 @@ -6097,13 +6097,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1328125, + "grad_norm": 0.1279296875, "learning_rate": 0.0009995239232596764, - "loss": 0.0543, + "loss": 0.0545, "macro_f1": 0.3333333432674408, "num_tokens": 1035801.0, "repeat_count": 0.0, - "routers_loss": 0.01074797473847866, + "routers_loss": 0.011394930072128773, "skip_count": 0.0, "step": 642, "text_loss": 0.43054503202438354 @@ -6116,13 +6116,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1005859375, + "grad_norm": 0.1015625, "learning_rate": 0.0009995103239718163, - "loss": 0.0659, + "loss": 0.0665, "macro_f1": 0.3333333432674408, "num_tokens": 1039223.0, "repeat_count": 0.0, - "routers_loss": 0.009271817281842232, + "routers_loss": 0.00997432041913271, "skip_count": 0.0, "step": 644, "text_loss": 0.7749615907669067 @@ -6135,13 +6135,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1953125, + "grad_norm": 0.2275390625, "learning_rate": 0.0009994965332706573, - "loss": 0.0737, + "loss": 0.0755, "macro_f1": 0.3144654333591461, "num_tokens": 1042154.0, "repeat_count": 3.0, - "routers_loss": 0.10257050395011902, + "routers_loss": 0.10589150339365005, "skip_count": 0.0, "step": 646, "text_loss": 0.7812211513519287 @@ -6154,13 +6154,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.1943359375, "learning_rate": 0.0009994825511614846, - "loss": 0.0363, + "loss": 0.0383, "macro_f1": 0.3272727429866791, "num_tokens": 1045250.0, "repeat_count": 0.0, - "routers_loss": 0.07091924548149109, + "routers_loss": 0.0748734176158905, "skip_count": 1.0, "step": 648, "text_loss": 0.844803512096405 @@ -6173,13 +6173,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11572265625, + "grad_norm": 0.1220703125, "learning_rate": 0.0009994683776496562, - "loss": 0.0421, + "loss": 0.0433, "macro_f1": 0.3272727429866791, "num_tokens": 1048446.0, "repeat_count": 0.0, - "routers_loss": 0.034446243196725845, + "routers_loss": 0.03742415830492973, "skip_count": 1.0, "step": 650, "text_loss": 0.2098839282989502 @@ -6192,13 +6192,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.12890625, "learning_rate": 0.0009994540127406034, - "loss": 0.0593, + "loss": 0.0591, "macro_f1": 0.32098764181137085, "num_tokens": 1051840.0, "repeat_count": 0.0, - "routers_loss": 0.06077485531568527, + "routers_loss": 0.06025516986846924, "skip_count": 2.0, "step": 652, "text_loss": 0.27727583050727844 @@ -6211,13 +6211,13 @@ "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.2294921875, + "grad_norm": 0.181640625, "learning_rate": 0.0009994394564398306, - "loss": 0.0537, + "loss": 0.0519, "macro_f1": 0.521541953086853, "num_tokens": 1055142.0, "repeat_count": 4.0, - "routers_loss": 0.2382282167673111, + "routers_loss": 0.22807340323925018, "skip_count": 2.0, "step": 654, "text_loss": 0.9672397971153259 @@ -6230,13 +6230,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.142578125, + "grad_norm": 0.130859375, "learning_rate": 0.0009994247087529158, - "loss": 0.0613, + "loss": 0.0618, "macro_f1": 0.3333333432674408, "num_tokens": 1057698.0, "repeat_count": 0.0, - "routers_loss": 0.011971636675298214, + "routers_loss": 0.01348950993269682, "skip_count": 0.0, "step": 656, "text_loss": 0.6375506520271301 @@ -6249,13 +6249,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.212890625, + "grad_norm": 0.1953125, "learning_rate": 0.0009994097696855106, - "loss": 0.0414, + "loss": 0.0412, "macro_f1": 0.3333333432674408, "num_tokens": 1060624.0, "repeat_count": 0.0, - "routers_loss": 0.010221127420663834, + "routers_loss": 0.009649243205785751, "skip_count": 0.0, "step": 658, "text_loss": 0.5315385460853577 @@ -6268,13 +6268,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2265625, + "grad_norm": 0.2041015625, "learning_rate": 0.0009993946392433395, - "loss": 0.061, + "loss": 0.0609, "macro_f1": 0.307692289352417, "num_tokens": 1065076.0, "repeat_count": 0.0, - "routers_loss": 0.11860335618257523, + "routers_loss": 0.1250980943441391, "skip_count": 3.0, "step": 660, "text_loss": 0.25780341029167175 @@ -6287,13 +6287,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.1640625, "learning_rate": 0.0009993793174322006, - "loss": 0.0485, + "loss": 0.0471, "macro_f1": 0.3333333432674408, "num_tokens": 1068365.0, "repeat_count": 0.0, - "routers_loss": 0.011139829643070698, + "routers_loss": 0.011544390581548214, "skip_count": 0.0, "step": 662, "text_loss": 0.34876301884651184 @@ -6306,13 +6306,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.166015625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009993638042579654, - "loss": 0.0478, + "loss": 0.0473, "macro_f1": 0.3272727429866791, "num_tokens": 1071693.0, "repeat_count": 0.0, - "routers_loss": 0.03978770971298218, + "routers_loss": 0.03777370601892471, "skip_count": 1.0, "step": 664, "text_loss": 0.21811571717262268 @@ -6327,11 +6327,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.203125, "learning_rate": 0.0009993480997265783, - "loss": 0.0481, + "loss": 0.0475, "macro_f1": 0.5492662787437439, "num_tokens": 1074733.0, "repeat_count": 0.0, - "routers_loss": 0.051231011748313904, + "routers_loss": 0.049949806183576584, "skip_count": 2.0, "step": 666, "text_loss": 0.38410288095474243 @@ -6344,13 +6344,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.10302734375, "learning_rate": 0.0009993322038440572, - "loss": 0.0615, + "loss": 0.0605, "macro_f1": 0.3333333432674408, "num_tokens": 1077993.0, "repeat_count": 0.0, - "routers_loss": 0.024917088449001312, + "routers_loss": 0.0247171800583601, "skip_count": 0.0, "step": 668, "text_loss": 0.25576895475387573 @@ -6363,13 +6363,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1982421875, + "grad_norm": 0.216796875, "learning_rate": 0.000999316116616494, - "loss": 0.0627, + "loss": 0.0619, "macro_f1": 0.3333333432674408, "num_tokens": 1080491.0, "repeat_count": 0.0, - "routers_loss": 0.008834881708025932, + "routers_loss": 0.008118715137243271, "skip_count": 0.0, "step": 670, "text_loss": 0.6269792914390564 @@ -6382,13 +6382,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.173828125, "learning_rate": 0.0009992998380500527, "loss": 0.0462, "macro_f1": 0.3272727429866791, "num_tokens": 1083817.0, "repeat_count": 0.0, - "routers_loss": 0.033405229449272156, + "routers_loss": 0.03366057574748993, "skip_count": 1.0, "step": 672, "text_loss": 0.26891493797302246 @@ -6401,13 +6401,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.13671875, + "grad_norm": 0.1474609375, "learning_rate": 0.0009992833681509716, - "loss": 0.0523, + "loss": 0.0529, "macro_f1": 0.3333333432674408, "num_tokens": 1087368.0, "repeat_count": 0.0, - "routers_loss": 0.020753704011440277, + "routers_loss": 0.020552074536681175, "skip_count": 0.0, "step": 674, "text_loss": 0.14421936869621277 @@ -6420,13 +6420,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.18359375, "learning_rate": 0.0009992667069255619, - "loss": 0.0698, + "loss": 0.0696, "macro_f1": 0.31446540355682373, "num_tokens": 1090452.0, "repeat_count": 0.0, - "routers_loss": 0.06932353973388672, + "routers_loss": 0.06937336176633835, "skip_count": 2.0, "step": 676, "text_loss": 0.24999259412288666 @@ -6439,13 +6439,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.08740234375, "learning_rate": 0.0009992498543802085, - "loss": 0.059, + "loss": 0.0588, "macro_f1": 0.3272727429866791, "num_tokens": 1093996.0, "repeat_count": 1.0, - "routers_loss": 0.032903749495744705, + "routers_loss": 0.0380021296441555, "skip_count": 0.0, "step": 678, "text_loss": 0.42473849654197693 @@ -6458,32 +6458,32 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.2099609375, + "grad_norm": 0.2119140625, "learning_rate": 0.0009992328105213688, - "loss": 0.0417, + "loss": 0.0411, "macro_f1": 0.4400000274181366, "num_tokens": 1096837.0, "repeat_count": 1.0, - "routers_loss": 0.19733747839927673, + "routers_loss": 0.20885063707828522, "skip_count": 4.0, "step": 680, "text_loss": 0.3829527199268341 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 26.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 3.2019371881420606, - "f1_execute": 1.0, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.154296875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1474609375, "learning_rate": 0.0009992155753555747, - "loss": 0.0729, - "macro_f1": 0.6666666865348816, + "loss": 0.0722, + "macro_f1": 0.5492662787437439, "num_tokens": 1100320.0, "repeat_count": 0.0, - "routers_loss": 0.013452666811645031, + "routers_loss": 0.018230699002742767, "skip_count": 2.0, "step": 682, "text_loss": 0.6190969944000244 @@ -6496,13 +6496,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.30859375, "learning_rate": 0.0009991981488894303, "loss": 0.0681, "macro_f1": 0.32098767161369324, "num_tokens": 1103682.0, "repeat_count": 0.0, - "routers_loss": 0.05302857980132103, + "routers_loss": 0.05550144240260124, "skip_count": 1.0, "step": 684, "text_loss": 0.44418027997016907 @@ -6515,13 +6515,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.2158203125, "learning_rate": 0.0009991805311296133, - "loss": 0.0527, + "loss": 0.0507, "macro_f1": 0.32098764181137085, "num_tokens": 1106427.0, "repeat_count": 0.0, - "routers_loss": 0.08124994486570358, + "routers_loss": 0.07990608364343643, "skip_count": 2.0, "step": 686, "text_loss": 0.5577231645584106 @@ -6534,13 +6534,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.22265625, + "grad_norm": 0.1962890625, "learning_rate": 0.0009991627220828753, - "loss": 0.0579, + "loss": 0.0568, "macro_f1": 0.32098764181137085, "num_tokens": 1109314.0, "repeat_count": 0.0, - "routers_loss": 0.058633625507354736, + "routers_loss": 0.05167485028505325, "skip_count": 2.0, "step": 688, "text_loss": 0.27325430512428284 @@ -6553,13 +6553,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1142578125, + "grad_norm": 0.10693359375, "learning_rate": 0.0009991447217560408, - "loss": 0.0533, + "loss": 0.0521, "macro_f1": 0.5492662787437439, "num_tokens": 1112748.0, "repeat_count": 0.0, - "routers_loss": 0.04703643172979355, + "routers_loss": 0.04621964320540428, "skip_count": 2.0, "step": 690, "text_loss": 0.5288321375846863 @@ -6572,13 +6572,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.1962890625, "learning_rate": 0.000999126530156007, - "loss": 0.0485, + "loss": 0.0499, "macro_f1": 0.307692289352417, "num_tokens": 1116965.0, "repeat_count": 1.0, - "routers_loss": 0.11615128815174103, + "routers_loss": 0.11950276792049408, "skip_count": 2.0, "step": 692, "text_loss": 0.14215624332427979 @@ -6591,13 +6591,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2314453125, + "grad_norm": 0.2353515625, "learning_rate": 0.0009991081472897454, - "loss": 0.0718, + "loss": 0.0722, "macro_f1": 0.3333333432674408, "num_tokens": 1120570.0, "repeat_count": 0.0, - "routers_loss": 0.017403846606612206, + "routers_loss": 0.01905500330030918, "skip_count": 0.0, "step": 694, "text_loss": 0.41862696409225464 @@ -6610,13 +6610,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1357421875, "learning_rate": 0.0009990895731643002, - "loss": 0.0444, + "loss": 0.0464, "macro_f1": 0.3272727429866791, "num_tokens": 1124009.0, "repeat_count": 1.0, - "routers_loss": 0.07067303359508514, + "routers_loss": 0.06974572688341141, "skip_count": 0.0, "step": 696, "text_loss": 0.41160130500793457 @@ -6629,13 +6629,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1923828125, "learning_rate": 0.000999070807786789, - "loss": 0.0527, + "loss": 0.0531, "macro_f1": 0.3272727429866791, "num_tokens": 1127370.0, "repeat_count": 1.0, - "routers_loss": 0.07131028175354004, + "routers_loss": 0.07055293023586273, "skip_count": 0.0, "step": 698, "text_loss": 0.48068273067474365 @@ -6648,13 +6648,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.18359375, + "grad_norm": 0.197265625, "learning_rate": 0.000999051851164403, - "loss": 0.0629, + "loss": 0.0619, "macro_f1": 0.32098764181137085, "num_tokens": 1130234.0, "repeat_count": 1.0, - "routers_loss": 0.1152748316526413, + "routers_loss": 0.12506946921348572, "skip_count": 1.0, "step": 700, "text_loss": 0.47925490140914917 @@ -6667,13 +6667,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.216796875, + "grad_norm": 0.1943359375, "learning_rate": 0.000999032703304406, - "loss": 0.0663, + "loss": 0.0674, "macro_f1": 0.3333333432674408, "num_tokens": 1132874.0, "repeat_count": 0.0, - "routers_loss": 0.0077212234027683735, + "routers_loss": 0.00809287466108799, "skip_count": 0.0, "step": 702, "text_loss": 0.47433632612228394 @@ -6686,13 +6686,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.099609375, + "grad_norm": 0.1064453125, "learning_rate": 0.0009990133642141358, - "loss": 0.0494, + "loss": 0.0497, "macro_f1": 0.5492662787437439, "num_tokens": 1136011.0, "repeat_count": 0.0, - "routers_loss": 0.02726336568593979, + "routers_loss": 0.0319170281291008, "skip_count": 2.0, "step": 704, "text_loss": 0.6574832201004028 @@ -6705,13 +6705,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.32421875, + "grad_norm": 0.33984375, "learning_rate": 0.000998993833901003, - "loss": 0.0615, + "loss": 0.0619, "macro_f1": 0.32098764181137085, "num_tokens": 1139674.0, "repeat_count": 0.0, - "routers_loss": 0.0958542674779892, + "routers_loss": 0.09850362688302994, "skip_count": 2.0, "step": 706, "text_loss": 0.7660127282142639 @@ -6724,13 +6724,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.12158203125, "learning_rate": 0.0009989741123724919, - "loss": 0.0583, + "loss": 0.0574, "macro_f1": 0.3333333432674408, "num_tokens": 1143558.0, "repeat_count": 0.0, - "routers_loss": 0.007100600749254227, + "routers_loss": 0.006673311349004507, "skip_count": 0.0, "step": 708, "text_loss": 0.5976111888885498 @@ -6743,13 +6743,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.154296875, "learning_rate": 0.0009989541996361594, - "loss": 0.0445, + "loss": 0.045, "macro_f1": 0.3333333432674408, "num_tokens": 1146122.0, "repeat_count": 0.0, - "routers_loss": 0.0047812811098992825, + "routers_loss": 0.004988791421055794, "skip_count": 0.0, "step": 710, "text_loss": 0.5256119966506958 @@ -6762,13 +6762,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1015625, + "grad_norm": 0.1044921875, "learning_rate": 0.0009989340956996367, - "loss": 0.052, + "loss": 0.0528, "macro_f1": 0.3333333432674408, "num_tokens": 1149546.0, "repeat_count": 0.0, - "routers_loss": 0.006643407512456179, + "routers_loss": 0.0067769973538815975, "skip_count": 0.0, "step": 712, "text_loss": 0.5040497779846191 @@ -6781,13 +6781,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2890625, + "grad_norm": 0.26953125, "learning_rate": 0.0009989138005706273, - "loss": 0.0719, + "loss": 0.0735, "macro_f1": 0.32098764181137085, "num_tokens": 1153195.0, "repeat_count": 0.0, - "routers_loss": 0.0910436138510704, + "routers_loss": 0.09899546951055527, "skip_count": 2.0, "step": 714, "text_loss": 0.20803412795066833 @@ -6800,13 +6800,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1484375, + "grad_norm": 0.1396484375, "learning_rate": 0.000998893314256908, - "loss": 0.0649, + "loss": 0.064, "macro_f1": 0.3333333432674408, "num_tokens": 1157081.0, "repeat_count": 0.0, - "routers_loss": 0.010978946462273598, + "routers_loss": 0.010492355562746525, "skip_count": 0.0, "step": 716, "text_loss": 0.23077639937400818 @@ -6819,13 +6819,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.123046875, + "grad_norm": 0.1298828125, "learning_rate": 0.0009988726367663298, - "loss": 0.0543, + "loss": 0.0539, "macro_f1": 0.3333333432674408, "num_tokens": 1160079.0, "repeat_count": 0.0, - "routers_loss": 0.009956461377441883, + "routers_loss": 0.01063773687928915, "skip_count": 0.0, "step": 718, "text_loss": 0.6085864901542664 @@ -6838,13 +6838,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1640625, "learning_rate": 0.0009988517681068163, - "loss": 0.0412, + "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1163249.0, "repeat_count": 1.0, - "routers_loss": 0.057210199534893036, + "routers_loss": 0.05981874838471413, "skip_count": 0.0, "step": 720, "text_loss": 0.4047050476074219 @@ -6857,32 +6857,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.171875, "learning_rate": 0.0009988307082863638, - "loss": 0.0364, + "loss": 0.0361, "macro_f1": 0.3333333432674408, "num_tokens": 1166259.0, "repeat_count": 0.0, - "routers_loss": 0.01035996899008751, + "routers_loss": 0.009750043973326683, "skip_count": 0.0, "step": 722, "text_loss": 0.5306474566459656 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 3.3991781626063986, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.2412109375, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.240234375, "learning_rate": 0.0009988094573130434, - "loss": 0.0661, - "macro_f1": 0.3076923191547394, + "loss": 0.063, + "macro_f1": 0.5359477400779724, "num_tokens": 1168887.0, "repeat_count": 2.0, - "routers_loss": 0.18087820708751678, + "routers_loss": 0.18601104617118835, "skip_count": 2.0, "step": 724, "text_loss": 0.53528892993927 @@ -6895,32 +6895,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.142578125, "learning_rate": 0.0009987880151949974, - "loss": 0.0505, + "loss": 0.0496, "macro_f1": 0.3272727429866791, "num_tokens": 1172625.0, "repeat_count": 0.0, - "routers_loss": 0.04720238968729973, + "routers_loss": 0.02845010720193386, "skip_count": 1.0, "step": 726, "text_loss": 0.4760453701019287 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 26.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 3.417963017317288, - "f1_execute": 1.0, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.2216796875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, "learning_rate": 0.0009987663819404434, - "loss": 0.0603, - "macro_f1": 0.6666666865348816, + "loss": 0.06, + "macro_f1": 0.5492662787437439, "num_tokens": 1176580.0, "repeat_count": 0.0, - "routers_loss": 0.015407778322696686, + "routers_loss": 0.017596980556845665, "skip_count": 2.0, "step": 728, "text_loss": 0.5146099328994751 @@ -6933,13 +6933,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.1318359375, "learning_rate": 0.000998744557557671, - "loss": 0.0489, + "loss": 0.0484, "macro_f1": 0.3272727429866791, "num_tokens": 1179804.0, "repeat_count": 0.0, - "routers_loss": 0.060891781002283096, + "routers_loss": 0.0625474750995636, "skip_count": 1.0, "step": 730, "text_loss": 0.27738022804260254 @@ -6947,18 +6947,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 3.436747872028177, - "f1_execute": 0.943396270275116, + "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.203125, "learning_rate": 0.0009987225420550433, - "loss": 0.0825, - "macro_f1": 0.3144654333591461, + "loss": 0.0796, + "macro_f1": 0.307692289352417, "num_tokens": 1182658.0, "repeat_count": 1.0, - "routers_loss": 0.1661442220211029, + "routers_loss": 0.16188351809978485, "skip_count": 2.0, "step": 732, "text_loss": 0.23231445252895355 @@ -6966,18 +6966,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 3.446140299383622, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.2001953125, "learning_rate": 0.0009987003354409965, - "loss": 0.0634, - "macro_f1": 0.3333333432674408, + "loss": 0.0626, + "macro_f1": 0.3272727429866791, "num_tokens": 1185451.0, "repeat_count": 0.0, - "routers_loss": 0.02108248695731163, + "routers_loss": 0.02391529455780983, "skip_count": 0.0, "step": 734, "text_loss": 0.4496627151966095 @@ -6990,13 +6990,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.248046875, + "grad_norm": 0.234375, "learning_rate": 0.0009986779377240405, - "loss": 0.0534, + "loss": 0.0513, "macro_f1": 0.32098767161369324, "num_tokens": 1188666.0, "repeat_count": 0.0, - "routers_loss": 0.08318125456571579, + "routers_loss": 0.08435963839292526, "skip_count": 1.0, "step": 736, "text_loss": 0.4950787127017975 @@ -7009,13 +7009,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11962890625, + "grad_norm": 0.1220703125, "learning_rate": 0.000998655348912758, - "loss": 0.0514, + "loss": 0.0515, "macro_f1": 0.3333333432674408, "num_tokens": 1193035.0, "repeat_count": 0.0, - "routers_loss": 0.015889234840869904, + "routers_loss": 0.01648722216486931, "skip_count": 0.0, "step": 738, "text_loss": 0.24761848151683807 @@ -7028,13 +7028,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1513671875, "learning_rate": 0.0009986325690158051, "loss": 0.0435, "macro_f1": 0.3333333432674408, "num_tokens": 1196840.0, "repeat_count": 0.0, - "routers_loss": 0.01378484908491373, + "routers_loss": 0.013143910095095634, "skip_count": 0.0, "step": 740, "text_loss": 0.15662719309329987 @@ -7047,13 +7047,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1787109375, + "grad_norm": 0.1611328125, "learning_rate": 0.0009986095980419113, - "loss": 0.076, + "loss": 0.0757, "macro_f1": 0.3333333432674408, "num_tokens": 1200573.0, "repeat_count": 0.0, - "routers_loss": 0.02673683874309063, + "routers_loss": 0.026706280186772346, "skip_count": 0.0, "step": 742, "text_loss": 0.16725164651870728 @@ -7066,13 +7066,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.185546875, + "grad_norm": 0.1982421875, "learning_rate": 0.0009985864359998787, - "loss": 0.0778, + "loss": 0.0795, "macro_f1": 0.3006536364555359, "num_tokens": 1203589.0, "repeat_count": 2.0, - "routers_loss": 0.27776041626930237, + "routers_loss": 0.28607678413391113, "skip_count": 3.0, "step": 744, "text_loss": 0.6350882053375244 @@ -7085,13 +7085,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1513671875, + "grad_norm": 0.1474609375, "learning_rate": 0.0009985630828985835, - "loss": 0.0575, + "loss": 0.0572, "macro_f1": 0.3272727429866791, "num_tokens": 1206422.0, "repeat_count": 0.0, - "routers_loss": 0.0575483962893486, + "routers_loss": 0.05685260891914368, "skip_count": 1.0, "step": 746, "text_loss": 0.33779552578926086 @@ -7104,13 +7104,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1005859375, + "grad_norm": 0.09814453125, "learning_rate": 0.0009985395387469742, - "loss": 0.0478, + "loss": 0.0458, "macro_f1": 0.5492662787437439, "num_tokens": 1211588.0, "repeat_count": 0.0, - "routers_loss": 0.0458797849714756, + "routers_loss": 0.0437830351293087, "skip_count": 2.0, "step": 748, "text_loss": 0.28664472699165344 @@ -7123,13 +7123,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.15625, "learning_rate": 0.0009985158035540735, - "loss": 0.0701, + "loss": 0.0714, "macro_f1": 0.32098764181137085, "num_tokens": 1214580.0, "repeat_count": 2.0, - "routers_loss": 0.07850238680839539, + "routers_loss": 0.07074898481369019, "skip_count": 0.0, "step": 750, "text_loss": 0.3939313292503357 @@ -7142,13 +7142,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.21484375, "learning_rate": 0.0009984918773289762, - "loss": 0.0702, + "loss": 0.0699, "macro_f1": 0.3333333432674408, "num_tokens": 1217388.0, "repeat_count": 0.0, - "routers_loss": 0.009507967159152031, + "routers_loss": 0.009757856838405132, "skip_count": 0.0, "step": 752, "text_loss": 0.37641215324401855 @@ -7161,13 +7161,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1484375, + "grad_norm": 0.140625, "learning_rate": 0.0009984677600808512, - "loss": 0.0543, + "loss": 0.054, "macro_f1": 0.3333333432674408, "num_tokens": 1219960.0, "repeat_count": 0.0, - "routers_loss": 0.02620997279882431, + "routers_loss": 0.02515069581568241, "skip_count": 0.0, "step": 754, "text_loss": 0.155938982963562 @@ -7180,13 +7180,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3359375, + "grad_norm": 0.30078125, "learning_rate": 0.0009984434518189405, - "loss": 0.0791, + "loss": 0.0764, "macro_f1": 0.3333333432674408, "num_tokens": 1223234.0, "repeat_count": 0.0, - "routers_loss": 0.02798631228506565, + "routers_loss": 0.025766927748918533, "skip_count": 0.0, "step": 756, "text_loss": 0.691118061542511 @@ -7201,11 +7201,11 @@ "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.0009984189525525584, - "loss": 0.046, + "loss": 0.0451, "macro_f1": 0.5359477400779724, "num_tokens": 1225764.0, "repeat_count": 2.0, - "routers_loss": 0.16614431142807007, + "routers_loss": 0.1782722771167755, "skip_count": 2.0, "step": 758, "text_loss": 0.3592209219932556 @@ -7218,13 +7218,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.193359375, + "grad_norm": 0.189453125, "learning_rate": 0.0009983942622910935, - "loss": 0.0669, + "loss": 0.0659, "macro_f1": 0.3333333432674408, "num_tokens": 1230097.0, "repeat_count": 0.0, - "routers_loss": 0.008541896007955074, + "routers_loss": 0.00825568474829197, "skip_count": 0.0, "step": 760, "text_loss": 0.4646475315093994 @@ -7237,13 +7237,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.1962890625, "learning_rate": 0.0009983693810440074, - "loss": 0.0478, + "loss": 0.0477, "macro_f1": 0.32098764181137085, "num_tokens": 1233140.0, "repeat_count": 0.0, - "routers_loss": 0.045411624014377594, + "routers_loss": 0.04156976938247681, "skip_count": 2.0, "step": 762, "text_loss": 0.298682302236557 @@ -7256,13 +7256,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.380859375, + "grad_norm": 0.3515625, "learning_rate": 0.000998344308820834, - "loss": 0.0689, + "loss": 0.0666, "macro_f1": 0.3272727429866791, "num_tokens": 1236305.0, "repeat_count": 0.0, - "routers_loss": 0.052299100905656815, + "routers_loss": 0.05697929114103317, "skip_count": 1.0, "step": 764, "text_loss": 0.5249121189117432 @@ -7275,13 +7275,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.18359375, "learning_rate": 0.0009983190456311817, - "loss": 0.0602, + "loss": 0.0592, "macro_f1": 0.3144654333591461, "num_tokens": 1239673.0, "repeat_count": 0.0, - "routers_loss": 0.09140212833881378, + "routers_loss": 0.09547408670186996, "skip_count": 3.0, "step": 766, "text_loss": 0.41277334094047546 @@ -7294,13 +7294,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.201171875, + "grad_norm": 0.185546875, "learning_rate": 0.000998293591484731, - "loss": 0.0475, + "loss": 0.0484, "macro_f1": 0.5492662787437439, "num_tokens": 1242292.0, "repeat_count": 0.0, - "routers_loss": 0.030750583857297897, + "routers_loss": 0.030693158507347107, "skip_count": 2.0, "step": 768, "text_loss": 0.1583656519651413 @@ -7313,13 +7313,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16796875, + "grad_norm": 0.15234375, "learning_rate": 0.000998267946391236, - "loss": 0.052, + "loss": 0.051, "macro_f1": 0.3333333432674408, "num_tokens": 1244661.0, "repeat_count": 0.0, - "routers_loss": 0.010202950797975063, + "routers_loss": 0.01211300864815712, "skip_count": 0.0, "step": 770, "text_loss": 0.4629349112510681 @@ -7332,13 +7332,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.0927734375, "learning_rate": 0.0009982421103605238, - "loss": 0.0434, + "loss": 0.0441, "macro_f1": 0.32098764181137085, "num_tokens": 1248688.0, "repeat_count": 0.0, - "routers_loss": 0.07364192605018616, + "routers_loss": 0.0665968507528305, "skip_count": 2.0, "step": 772, "text_loss": 0.4019293785095215 @@ -7353,11 +7353,11 @@ "f1_skip": 0.0, "grad_norm": 0.2890625, "learning_rate": 0.000998216083402495, - "loss": 0.0606, + "loss": 0.0613, "macro_f1": 0.32098764181137085, "num_tokens": 1251395.0, "repeat_count": 0.0, - "routers_loss": 0.06553081423044205, + "routers_loss": 0.07186859846115112, "skip_count": 2.0, "step": 774, "text_loss": 0.4659276604652405 @@ -7370,13 +7370,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.302734375, "learning_rate": 0.0009981898655271235, - "loss": 0.0475, + "loss": 0.0488, "macro_f1": 0.3333333432674408, "num_tokens": 1254888.0, "repeat_count": 0.0, - "routers_loss": 0.008751659654080868, + "routers_loss": 0.007823926396667957, "skip_count": 0.0, "step": 776, "text_loss": 0.5160359740257263 @@ -7389,13 +7389,13 @@ "f1_execute": 0.9130434989929199, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.12060546875, + "grad_norm": 0.11962890625, "learning_rate": 0.0009981634567444557, - "loss": 0.0777, + "loss": 0.0775, "macro_f1": 0.590062141418457, "num_tokens": 1258250.0, "repeat_count": 3.0, - "routers_loss": 0.24522721767425537, + "routers_loss": 0.24624499678611755, "skip_count": 4.0, "step": 778, "text_loss": 0.29319918155670166 @@ -7408,13 +7408,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.263671875, "learning_rate": 0.0009981368570646115, "loss": 0.0885, "macro_f1": 0.3272727429866791, "num_tokens": 1260916.0, "repeat_count": 0.0, - "routers_loss": 0.03767623379826546, + "routers_loss": 0.030730176717042923, "skip_count": 1.0, "step": 780, "text_loss": 0.624981164932251 @@ -7427,13 +7427,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.142578125, "learning_rate": 0.0009981100664977838, - "loss": 0.0708, + "loss": 0.0699, "macro_f1": 0.3333333432674408, "num_tokens": 1264004.0, "repeat_count": 0.0, - "routers_loss": 0.006098059006035328, + "routers_loss": 0.006829176563769579, "skip_count": 0.0, "step": 782, "text_loss": 0.6137266159057617 @@ -7446,13 +7446,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1748046875, "learning_rate": 0.0009980830850542391, - "loss": 0.0589, + "loss": 0.058, "macro_f1": 0.3333333432674408, "num_tokens": 1267130.0, "repeat_count": 0.0, - "routers_loss": 0.01731623336672783, + "routers_loss": 0.018471000716090202, "skip_count": 0.0, "step": 784, "text_loss": 0.15213175117969513 @@ -7465,13 +7465,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2294921875, + "grad_norm": 0.2353515625, "learning_rate": 0.0009980559127443166, - "loss": 0.0526, + "loss": 0.052, "macro_f1": 0.3333333432674408, "num_tokens": 1271129.0, "repeat_count": 0.0, - "routers_loss": 0.0076471962966024876, + "routers_loss": 0.007903140969574451, "skip_count": 0.0, "step": 786, "text_loss": 0.5768613219261169 @@ -7484,13 +7484,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12353515625, + "grad_norm": 0.130859375, "learning_rate": 0.000998028549578429, - "loss": 0.0745, + "loss": 0.0719, "macro_f1": 0.307692289352417, "num_tokens": 1274232.0, "repeat_count": 0.0, - "routers_loss": 0.0637628585100174, + "routers_loss": 0.06737866252660751, "skip_count": 3.0, "step": 788, "text_loss": 0.2877073585987091 @@ -7503,13 +7503,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1748046875, "learning_rate": 0.0009980009955670615, - "loss": 0.0699, + "loss": 0.0698, "macro_f1": 0.3144654333591461, "num_tokens": 1277193.0, "repeat_count": 0.0, - "routers_loss": 0.10882514715194702, + "routers_loss": 0.10194934904575348, "skip_count": 3.0, "step": 790, "text_loss": 0.11860492825508118 @@ -7522,13 +7522,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1298828125, + "grad_norm": 0.126953125, "learning_rate": 0.000997973250720773, - "loss": 0.056, + "loss": 0.0552, "macro_f1": 0.32098764181137085, "num_tokens": 1280960.0, "repeat_count": 0.0, - "routers_loss": 0.10924118757247925, + "routers_loss": 0.10297708213329315, "skip_count": 2.0, "step": 792, "text_loss": 0.13477706909179688 @@ -7541,13 +7541,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1533203125, + "grad_norm": 0.1611328125, "learning_rate": 0.0009979453150501954, - "loss": 0.0664, + "loss": 0.0663, "macro_f1": 0.32098764181137085, "num_tokens": 1284611.0, "repeat_count": 1.0, - "routers_loss": 0.06571807712316513, + "routers_loss": 0.06122037023305893, "skip_count": 1.0, "step": 794, "text_loss": 0.40569379925727844 @@ -7560,13 +7560,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1181640625, + "grad_norm": 0.1279296875, "learning_rate": 0.000997917188566034, - "loss": 0.0616, + "loss": 0.062, "macro_f1": 0.32098764181137085, "num_tokens": 1287834.0, "repeat_count": 0.0, - "routers_loss": 0.058966971933841705, + "routers_loss": 0.061135001480579376, "skip_count": 2.0, "step": 796, "text_loss": 0.2829287648200989 @@ -7579,32 +7579,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.109375, "learning_rate": 0.0009978888712790664, - "loss": 0.067, + "loss": 0.0654, "macro_f1": 0.3272727429866791, "num_tokens": 1291666.0, "repeat_count": 0.0, - "routers_loss": 0.04844636470079422, + "routers_loss": 0.04841872677206993, "skip_count": 1.0, "step": 798, "text_loss": 1.011757254600525 }, { "acc_repeat": 0.0, - "acc_skip": 0.4000000059604645, - "avg_layers": 26.0, + "acc_skip": 0.20000000298023224, + "avg_layers": 27.0, "epoch": 3.756090402113296, - "f1_execute": 0.9166666865348816, + "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, - "f1_skip": 0.5714285969734192, - "grad_norm": 0.1416015625, + "f1_skip": 0.3333333134651184, + "grad_norm": 0.14453125, "learning_rate": 0.0009978603632001444, - "loss": 0.0634, - "macro_f1": 0.4960317611694336, + "loss": 0.0636, + "macro_f1": 0.4104308485984802, "num_tokens": 1294627.0, "repeat_count": 1.0, - "routers_loss": 0.1591777801513672, + "routers_loss": 0.15698759257793427, "skip_count": 5.0, "step": 800, "text_loss": 0.4457623362541199 @@ -7617,13 +7617,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.283203125, "learning_rate": 0.0009978316643401916, - "loss": 0.0694, + "loss": 0.0688, "macro_f1": 0.3333333432674408, "num_tokens": 1297711.0, "repeat_count": 0.0, - "routers_loss": 0.017735568806529045, + "routers_loss": 0.018952010199427605, "skip_count": 0.0, "step": 802, "text_loss": 0.2069481462240219 @@ -7636,13 +7636,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.14453125, "learning_rate": 0.0009978027747102062, - "loss": 0.0477, + "loss": 0.0479, "macro_f1": 0.3333333432674408, "num_tokens": 1300569.0, "repeat_count": 0.0, - "routers_loss": 0.012401525862514973, + "routers_loss": 0.014538386836647987, "skip_count": 0.0, "step": 804, "text_loss": 0.4983852505683899 @@ -7655,13 +7655,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2080078125, + "grad_norm": 0.2109375, "learning_rate": 0.0009977736943212584, - "loss": 0.0735, + "loss": 0.0721, "macro_f1": 0.32098764181137085, "num_tokens": 1303969.0, "repeat_count": 0.0, - "routers_loss": 0.10736164450645447, + "routers_loss": 0.11164087057113647, "skip_count": 2.0, "step": 806, "text_loss": 0.2910642921924591 @@ -7674,13 +7674,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2001953125, + "grad_norm": 0.1826171875, "learning_rate": 0.000997744423184492, - "loss": 0.0428, + "loss": 0.0424, "macro_f1": 0.3272727429866791, "num_tokens": 1307263.0, "repeat_count": 0.0, - "routers_loss": 0.0595436617732048, + "routers_loss": 0.06073406711220741, "skip_count": 1.0, "step": 808, "text_loss": 0.18831779062747955 @@ -7693,13 +7693,13 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.240234375, + "grad_norm": 0.26171875, "learning_rate": 0.0009977149613111236, - "loss": 0.0494, + "loss": 0.0486, "macro_f1": 0.4400000274181366, "num_tokens": 1309953.0, "repeat_count": 1.0, - "routers_loss": 0.12617000937461853, + "routers_loss": 0.11035524308681488, "skip_count": 4.0, "step": 810, "text_loss": 0.7872759699821472 @@ -7712,13 +7712,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1669921875, + "grad_norm": 0.1650390625, "learning_rate": 0.0009976853087124433, - "loss": 0.0537, + "loss": 0.0536, "macro_f1": 0.3333333432674408, "num_tokens": 1313243.0, "repeat_count": 0.0, - "routers_loss": 0.021242506802082062, + "routers_loss": 0.021804286167025566, "skip_count": 0.0, "step": 812, "text_loss": 0.22349292039871216 @@ -7731,13 +7731,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.318359375, + "grad_norm": 0.28125, "learning_rate": 0.0009976554653998138, - "loss": 0.0617, + "loss": 0.0612, "macro_f1": 0.31446540355682373, "num_tokens": 1316165.0, "repeat_count": 0.0, - "routers_loss": 0.10387415438890457, + "routers_loss": 0.10715524107217789, "skip_count": 2.0, "step": 814, "text_loss": 0.18035532534122467 @@ -7750,13 +7750,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.1279296875, "learning_rate": 0.000997625431384671, - "loss": 0.0565, + "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1319206.0, "repeat_count": 0.0, - "routers_loss": 0.007816939614713192, + "routers_loss": 0.007173649035394192, "skip_count": 0.0, "step": 816, "text_loss": 0.48928648233413696 @@ -7769,13 +7769,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.1357421875, "learning_rate": 0.0009975952066785243, - "loss": 0.0654, + "loss": 0.0655, "macro_f1": 0.3006536364555359, "num_tokens": 1322549.0, "repeat_count": 1.0, - "routers_loss": 0.22526368498802185, + "routers_loss": 0.22308112680912018, "skip_count": 4.0, "step": 818, "text_loss": 0.5211259722709656 @@ -7788,13 +7788,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.1337890625, "learning_rate": 0.0009975647912929557, - "loss": 0.056, + "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1325213.0, "repeat_count": 0.0, - "routers_loss": 0.010998851619660854, + "routers_loss": 0.00998698640614748, "skip_count": 0.0, "step": 820, "text_loss": 0.7117052674293518 @@ -7807,13 +7807,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.15234375, "learning_rate": 0.0009975341852396205, - "loss": 0.0712, + "loss": 0.0723, "macro_f1": 0.32098764181137085, "num_tokens": 1328383.0, "repeat_count": 0.0, - "routers_loss": 0.07115054875612259, + "routers_loss": 0.07454588264226913, "skip_count": 2.0, "step": 822, "text_loss": 0.34539610147476196 @@ -7826,13 +7826,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1630859375, "learning_rate": 0.0009975033885302469, - "loss": 0.0611, + "loss": 0.0604, "macro_f1": 0.3333333432674408, "num_tokens": 1331406.0, "repeat_count": 0.0, - "routers_loss": 0.008062695153057575, + "routers_loss": 0.009157589636743069, "skip_count": 0.0, "step": 824, "text_loss": 0.7484824657440186 @@ -7845,13 +7845,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1923828125, "learning_rate": 0.0009974724011766363, - "loss": 0.0496, + "loss": 0.0474, "macro_f1": 0.3272727429866791, "num_tokens": 1334410.0, "repeat_count": 1.0, - "routers_loss": 0.16666285693645477, + "routers_loss": 0.17149391770362854, "skip_count": 0.0, "step": 826, "text_loss": 0.5913820266723633 @@ -7864,13 +7864,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1708984375, + "grad_norm": 0.1884765625, "learning_rate": 0.0009974412231906632, - "loss": 0.0567, + "loss": 0.058, "macro_f1": 0.32098764181137085, "num_tokens": 1337653.0, "repeat_count": 1.0, - "routers_loss": 0.0908689796924591, + "routers_loss": 0.09743282198905945, "skip_count": 1.0, "step": 828, "text_loss": 0.2505693733692169 @@ -7883,13 +7883,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16015625, + "grad_norm": 0.1533203125, "learning_rate": 0.0009974098545842748, - "loss": 0.0648, + "loss": 0.0638, "macro_f1": 0.3272727429866791, "num_tokens": 1340860.0, "repeat_count": 0.0, - "routers_loss": 0.04364728182554245, + "routers_loss": 0.041490405797958374, "skip_count": 1.0, "step": 830, "text_loss": 0.5585370063781738 @@ -7897,18 +7897,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 3.906369239800411, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2060546875, + "grad_norm": 0.193359375, "learning_rate": 0.0009973782953694918, - "loss": 0.0772, - "macro_f1": 0.3076923191547394, + "loss": 0.0746, + "macro_f1": 0.3006536066532135, "num_tokens": 1344232.0, "repeat_count": 1.0, - "routers_loss": 0.15315109491348267, + "routers_loss": 0.16080693900585175, "skip_count": 3.0, "step": 832, "text_loss": 0.4782734513282776 @@ -7921,13 +7921,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.1298828125, "learning_rate": 0.000997346545558408, - "loss": 0.0527, + "loss": 0.0522, "macro_f1": 0.3333333432674408, "num_tokens": 1347667.0, "repeat_count": 0.0, - "routers_loss": 0.01342768594622612, + "routers_loss": 0.01173500344157219, "skip_count": 0.0, "step": 834, "text_loss": 0.25036177039146423 @@ -7940,13 +7940,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1748046875, + "grad_norm": 0.173828125, "learning_rate": 0.0009973146051631895, - "loss": 0.0513, + "loss": 0.0522, "macro_f1": 0.3333333432674408, "num_tokens": 1350707.0, "repeat_count": 0.0, - "routers_loss": 0.01158806961029768, + "routers_loss": 0.011477196589112282, "skip_count": 0.0, "step": 836, "text_loss": 0.5482863187789917 @@ -7959,13 +7959,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1650390625, "learning_rate": 0.0009972824741960764, - "loss": 0.0549, + "loss": 0.0536, "macro_f1": 0.3333333432674408, "num_tokens": 1353704.0, "repeat_count": 0.0, - "routers_loss": 0.01255605649203062, + "routers_loss": 0.010528896935284138, "skip_count": 0.0, "step": 838, "text_loss": 0.6732596158981323 @@ -7978,13 +7978,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12255859375, + "grad_norm": 0.1181640625, "learning_rate": 0.000997250152669381, - "loss": 0.0578, + "loss": 0.0573, "macro_f1": 0.3333333432674408, "num_tokens": 1356608.0, "repeat_count": 0.0, - "routers_loss": 0.010225459933280945, + "routers_loss": 0.010678744874894619, "skip_count": 0.0, "step": 840, "text_loss": 0.5479338765144348 @@ -7997,13 +7997,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.181640625, "learning_rate": 0.000997217640595489, - "loss": 0.0633, + "loss": 0.0631, "macro_f1": 0.3333333432674408, "num_tokens": 1359809.0, "repeat_count": 0.0, - "routers_loss": 0.007837744429707527, + "routers_loss": 0.00835978239774704, "skip_count": 0.0, "step": 842, "text_loss": 0.42543259263038635 @@ -8016,13 +8016,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.1923828125, "learning_rate": 0.0009971849379868593, - "loss": 0.0674, + "loss": 0.0653, "macro_f1": 0.3333333432674408, "num_tokens": 1362201.0, "repeat_count": 0.0, - "routers_loss": 0.008631376549601555, + "routers_loss": 0.009930923581123352, "skip_count": 0.0, "step": 844, "text_loss": 0.720462441444397 @@ -8035,13 +8035,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.1123046875, "learning_rate": 0.0009971520448560235, - "loss": 0.0612, + "loss": 0.0615, "macro_f1": 0.3272727429866791, "num_tokens": 1365790.0, "repeat_count": 0.0, - "routers_loss": 0.06206027418375015, + "routers_loss": 0.06344373524188995, "skip_count": 1.0, "step": 846, "text_loss": 0.8423607349395752 @@ -8049,18 +8049,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 3.9815086586439685, - "f1_execute": 0.9411765336990356, + "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, - "f1_skip": 0.5, - "grad_norm": 0.16015625, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.16796875, "learning_rate": 0.000997118961215586, - "loss": 0.0678, - "macro_f1": 0.480392187833786, + "loss": 0.0674, + "macro_f1": 0.4533333480358124, "num_tokens": 1368387.0, "repeat_count": 1.0, - "routers_loss": 0.1463794708251953, + "routers_loss": 0.14688406884670258, "skip_count": 3.0, "step": 848, "text_loss": 0.3933577537536621 @@ -8073,13 +8073,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.263671875, "learning_rate": 0.000997085687078225, - "loss": 0.052, + "loss": 0.0518, "macro_f1": 0.3333333432674408, "num_tokens": 1371189.0, "repeat_count": 0.0, - "routers_loss": 0.01140492781996727, + "routers_loss": 0.009953443892300129, "skip_count": 0.0, "step": 850, "text_loss": 0.41469162702560425 @@ -8092,13 +8092,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.15625, "learning_rate": 0.0009970522224566909, - "loss": 0.0563, + "loss": 0.0555, "macro_f1": 0.32098767161369324, "num_tokens": 1374008.0, "repeat_count": 0.0, - "routers_loss": 0.05136030167341232, + "routers_loss": 0.048870690166950226, "skip_count": 1.0, "step": 852, "text_loss": 0.613615870475769 @@ -8111,32 +8111,32 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25390625, + "grad_norm": 0.283203125, "learning_rate": 0.0009970185673638075, - "loss": 0.0627, + "loss": 0.0629, "macro_f1": 0.32098764181137085, "num_tokens": 1376662.0, "repeat_count": 1.0, - "routers_loss": 0.07274381071329117, + "routers_loss": 0.06865929812192917, "skip_count": 1.0, "step": 854, "text_loss": 0.4392736256122589 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 4.01878485471089, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.162109375, "learning_rate": 0.0009969847218124716, - "loss": 0.0503, - "macro_f1": 0.3272727429866791, + "loss": 0.0506, + "macro_f1": 0.5492662787437439, "num_tokens": 1380049.0, "repeat_count": 0.0, - "routers_loss": 0.024335317313671112, + "routers_loss": 0.02382219396531582, "skip_count": 1.0, "step": 856, "text_loss": 0.19115346670150757 @@ -8149,13 +8149,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.240234375, + "grad_norm": 0.1884765625, "learning_rate": 0.0009969506858156527, - "loss": 0.0359, + "loss": 0.0344, "macro_f1": 0.3272727429866791, "num_tokens": 1383008.0, "repeat_count": 0.0, - "routers_loss": 0.046614740043878555, + "routers_loss": 0.03907281160354614, "skip_count": 1.0, "step": 858, "text_loss": 0.34842637181282043 @@ -8168,13 +8168,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11181640625, + "grad_norm": 0.12060546875, "learning_rate": 0.0009969164593863935, - "loss": 0.0372, + "loss": 0.0365, "macro_f1": 0.3333333432674408, "num_tokens": 1387051.0, "repeat_count": 0.0, - "routers_loss": 0.006380240898579359, + "routers_loss": 0.007645803038030863, "skip_count": 0.0, "step": 860, "text_loss": 0.3810436725616455 @@ -8187,13 +8187,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.1484375, "learning_rate": 0.0009968820425378098, - "loss": 0.0473, + "loss": 0.0463, "macro_f1": 0.3272727429866791, "num_tokens": 1390244.0, "repeat_count": 1.0, - "routers_loss": 0.04770716652274132, + "routers_loss": 0.04435238987207413, "skip_count": 0.0, "step": 862, "text_loss": 0.34853485226631165 @@ -8206,32 +8206,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3359375, + "grad_norm": 0.28515625, "learning_rate": 0.00099684743528309, - "loss": 0.0434, + "loss": 0.0424, "macro_f1": 0.3333333432674408, "num_tokens": 1392976.0, "repeat_count": 0.0, - "routers_loss": 0.006983708590269089, + "routers_loss": 0.006071661598980427, "skip_count": 0.0, "step": 864, "text_loss": 0.6395178437232971 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 4.065746991488113, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.080078125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0810546875, "learning_rate": 0.0009968126376354958, - "loss": 0.0476, - "macro_f1": 0.32098764181137085, + "loss": 0.0477, + "macro_f1": 0.5492662787437439, "num_tokens": 1396061.0, "repeat_count": 0.0, - "routers_loss": 0.046313900500535965, + "routers_loss": 0.05011235550045967, "skip_count": 2.0, "step": 866, "text_loss": 0.09103966504335403 @@ -8244,32 +8244,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.154296875, "learning_rate": 0.0009967776496083616, "loss": 0.0509, "macro_f1": 0.3272727429866791, "num_tokens": 1398993.0, "repeat_count": 1.0, - "routers_loss": 0.0401870422065258, + "routers_loss": 0.03979124873876572, "skip_count": 0.0, "step": 868, "text_loss": 0.27257058024406433 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 4.084531846199002, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.14453125, "learning_rate": 0.000996742471215095, - "loss": 0.0505, - "macro_f1": 0.32098764181137085, + "loss": 0.0516, + "macro_f1": 0.5492662787437439, "num_tokens": 1402080.0, "repeat_count": 0.0, - "routers_loss": 0.03313451260328293, + "routers_loss": 0.030823837965726852, "skip_count": 2.0, "step": 870, "text_loss": 0.7047103047370911 @@ -8282,13 +8282,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16796875, + "grad_norm": 0.1611328125, "learning_rate": 0.0009967071024691763, - "loss": 0.0468, + "loss": 0.0461, "macro_f1": 0.3333333432674408, "num_tokens": 1404890.0, "repeat_count": 0.0, - "routers_loss": 0.010118982754647732, + "routers_loss": 0.009721715934574604, "skip_count": 0.0, "step": 872, "text_loss": 0.959106981754303 @@ -8301,13 +8301,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1142578125, "learning_rate": 0.000996671543384159, - "loss": 0.0498, + "loss": 0.05, "macro_f1": 0.3333333432674408, "num_tokens": 1407853.0, "repeat_count": 0.0, - "routers_loss": 0.005856200121343136, + "routers_loss": 0.006025883834809065, "skip_count": 0.0, "step": 874, "text_loss": 0.47571972012519836 @@ -8320,13 +8320,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.09765625, "learning_rate": 0.0009966357939736692, - "loss": 0.0417, + "loss": 0.0416, "macro_f1": 0.3272727429866791, "num_tokens": 1410723.0, "repeat_count": 0.0, - "routers_loss": 0.02768322452902794, + "routers_loss": 0.025964925065636635, "skip_count": 0.0, "step": 876, "text_loss": 0.4964611530303955 @@ -8339,13 +8339,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1025390625, + "grad_norm": 0.09423828125, "learning_rate": 0.0009965998542514065, - "loss": 0.0419, + "loss": 0.0415, "macro_f1": 0.32098764181137085, "num_tokens": 1414008.0, "repeat_count": 0.0, - "routers_loss": 0.09382032603025436, + "routers_loss": 0.09509637206792831, "skip_count": 2.0, "step": 878, "text_loss": 0.621494710445404 @@ -8358,32 +8358,32 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.103515625, + "grad_norm": 0.11083984375, "learning_rate": 0.0009965637242311427, - "loss": 0.0466, + "loss": 0.0472, "macro_f1": 0.542222261428833, "num_tokens": 1417447.0, "repeat_count": 0.0, - "routers_loss": 0.026867631822824478, + "routers_loss": 0.02520318515598774, "skip_count": 4.0, "step": 880, "text_loss": 0.40209758281707764 }, { "acc_repeat": 0.0, - "acc_skip": 0.6666666865348816, - "avg_layers": 24.0, + "acc_skip": 0.5, + "avg_layers": 25.0, "epoch": 4.14088641033167, - "f1_execute": 0.95652174949646, + "f1_execute": 0.936170220375061, "f1_repeat": 0.0, - "f1_skip": 0.800000011920929, - "grad_norm": 0.26171875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.263671875, "learning_rate": 0.000996527403926723, - "loss": 0.0496, - "macro_f1": 0.5855072736740112, + "loss": 0.0495, + "macro_f1": 0.5342789888381958, "num_tokens": 1419905.0, "repeat_count": 0.0, - "routers_loss": 0.12731307744979858, + "routers_loss": 0.13183781504631042, "skip_count": 6.0, "step": 882, "text_loss": 0.642185389995575 @@ -8396,13 +8396,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.1201171875, "learning_rate": 0.0009964908933520655, - "loss": 0.039, + "loss": 0.0375, "macro_f1": 0.3333333432674408, "num_tokens": 1423436.0, "repeat_count": 0.0, - "routers_loss": 0.008483970537781715, + "routers_loss": 0.009429510682821274, "skip_count": 0.0, "step": 884, "text_loss": 0.48232755064964294 @@ -8415,13 +8415,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.18359375, + "grad_norm": 0.1669921875, "learning_rate": 0.0009964541925211613, - "loss": 0.0348, + "loss": 0.0349, "macro_f1": 0.32098764181137085, "num_tokens": 1426842.0, "repeat_count": 0.0, - "routers_loss": 0.07847871631383896, + "routers_loss": 0.07629609107971191, "skip_count": 2.0, "step": 886, "text_loss": 0.16620934009552002 @@ -8434,13 +8434,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09716796875, + "grad_norm": 0.0927734375, "learning_rate": 0.0009964173014480738, - "loss": 0.036, + "loss": 0.0348, "macro_f1": 0.5492662787437439, "num_tokens": 1430430.0, "repeat_count": 0.0, - "routers_loss": 0.04574459046125412, + "routers_loss": 0.036814019083976746, "skip_count": 2.0, "step": 888, "text_loss": 0.4866008758544922 @@ -8453,13 +8453,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10595703125, + "grad_norm": 0.1123046875, "learning_rate": 0.0009963802201469398, - "loss": 0.0485, + "loss": 0.0476, "macro_f1": 0.3333333432674408, "num_tokens": 1433821.0, "repeat_count": 0.0, - "routers_loss": 0.004683624487370253, + "routers_loss": 0.0041250260546803474, "skip_count": 0.0, "step": 890, "text_loss": 0.578216552734375 @@ -8472,13 +8472,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2158203125, + "grad_norm": 0.2373046875, "learning_rate": 0.0009963429486319693, - "loss": 0.0476, + "loss": 0.0463, "macro_f1": 0.32098764181137085, "num_tokens": 1436976.0, "repeat_count": 0.0, - "routers_loss": 0.06499828398227692, + "routers_loss": 0.06213559955358505, "skip_count": 2.0, "step": 892, "text_loss": 0.221701517701149 @@ -8486,18 +8486,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.5, - "avg_layers": 25.0, + "avg_layers": 26.0, "epoch": 4.197240974464338, - "f1_execute": 0.9411764740943909, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.4000000059604645, - "grad_norm": 0.310546875, + "f1_skip": 0.5, + "grad_norm": 0.361328125, "learning_rate": 0.0009963054869174446, - "loss": 0.0326, - "macro_f1": 0.44705885648727417, + "loss": 0.0313, + "macro_f1": 0.4871794879436493, "num_tokens": 1440397.0, "repeat_count": 0.0, - "routers_loss": 0.08285653591156006, + "routers_loss": 0.07532428950071335, "skip_count": 2.0, "step": 894, "text_loss": 0.6922838091850281 @@ -8510,13 +8510,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.154296875, + "grad_norm": 0.1572265625, "learning_rate": 0.0009962678350177209, - "loss": 0.0497, + "loss": 0.0472, "macro_f1": 0.3272727429866791, "num_tokens": 1443604.0, "repeat_count": 0.0, - "routers_loss": 0.04252336546778679, + "routers_loss": 0.0419243648648262, "skip_count": 1.0, "step": 896, "text_loss": 0.22092342376708984 @@ -8524,18 +8524,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 4.216025829175227, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10302734375, + "grad_norm": 0.1015625, "learning_rate": 0.0009962299929472268, - "loss": 0.0349, - "macro_f1": 0.31446540355682373, + "loss": 0.034, + "macro_f1": 0.32098764181137085, "num_tokens": 1446257.0, "repeat_count": 2.0, - "routers_loss": 0.126711905002594, + "routers_loss": 0.10849297791719437, "skip_count": 0.0, "step": 898, "text_loss": 0.26394811272621155 @@ -8548,13 +8548,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10546875, + "grad_norm": 0.10205078125, "learning_rate": 0.000996191960720463, - "loss": 0.0392, + "loss": 0.0394, "macro_f1": 0.3333333432674408, "num_tokens": 1449669.0, "repeat_count": 0.0, - "routers_loss": 0.00955706462264061, + "routers_loss": 0.0092767970636487, "skip_count": 0.0, "step": 900, "text_loss": 0.5338577628135681 @@ -8567,13 +8567,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.154296875, "learning_rate": 0.0009961537383520042, - "loss": 0.0377, + "loss": 0.0354, "macro_f1": 0.3272727429866791, "num_tokens": 1452450.0, "repeat_count": 1.0, - "routers_loss": 0.03127318620681763, + "routers_loss": 0.02985367365181446, "skip_count": 0.0, "step": 902, "text_loss": 0.5875228047370911 @@ -8586,13 +8586,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.10205078125, "learning_rate": 0.0009961153258564966, - "loss": 0.0389, + "loss": 0.0378, "macro_f1": 0.3144654333591461, "num_tokens": 1456909.0, "repeat_count": 0.0, - "routers_loss": 0.06743519753217697, + "routers_loss": 0.06794842332601547, "skip_count": 3.0, "step": 904, "text_loss": 0.40959444642066956 @@ -8605,13 +8605,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009960767232486604, - "loss": 0.0477, + "loss": 0.0476, "macro_f1": 0.3333333432674408, "num_tokens": 1461712.0, "repeat_count": 0.0, - "routers_loss": 0.0025313226506114006, + "routers_loss": 0.0023562447167932987, "skip_count": 0.0, "step": 906, "text_loss": 0.3932875096797943 @@ -8624,13 +8624,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.08203125, "learning_rate": 0.000996037930543288, - "loss": 0.052, + "loss": 0.0505, "macro_f1": 0.3272727429866791, "num_tokens": 1464817.0, "repeat_count": 0.0, - "routers_loss": 0.037147488445043564, + "routers_loss": 0.03880339860916138, "skip_count": 1.0, "step": 908, "text_loss": 0.17482402920722961 @@ -8643,13 +8643,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.2119140625, "learning_rate": 0.000995998947755245, - "loss": 0.0501, + "loss": 0.0479, "macro_f1": 0.3272727429866791, "num_tokens": 1467810.0, "repeat_count": 0.0, - "routers_loss": 0.021232586354017258, + "routers_loss": 0.01736828312277794, "skip_count": 1.0, "step": 910, "text_loss": 0.4140470325946808 @@ -8662,13 +8662,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.169921875, "learning_rate": 0.0009959597748994695, - "loss": 0.0759, + "loss": 0.0752, "macro_f1": 0.3333333432674408, "num_tokens": 1470802.0, "repeat_count": 0.0, - "routers_loss": 0.010563847608864307, + "routers_loss": 0.011824851855635643, "skip_count": 0.0, "step": 912, "text_loss": 0.7153383493423462 @@ -8681,13 +8681,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1455078125, "learning_rate": 0.0009959204119909726, - "loss": 0.0425, + "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1474539.0, "repeat_count": 0.0, - "routers_loss": 0.0267612524330616, + "routers_loss": 0.025456594303250313, "skip_count": 0.0, "step": 914, "text_loss": 0.42812058329582214 @@ -8700,13 +8700,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1533203125, + "grad_norm": 0.142578125, "learning_rate": 0.0009958808590448385, - "loss": 0.0501, + "loss": 0.0489, "macro_f1": 0.3333333432674408, "num_tokens": 1477552.0, "repeat_count": 0.0, - "routers_loss": 0.005838244222104549, + "routers_loss": 0.006795851048082113, "skip_count": 0.0, "step": 916, "text_loss": 0.5402814149856567 @@ -8719,13 +8719,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.1083984375, "learning_rate": 0.0009958411160762234, - "loss": 0.0383, + "loss": 0.039, "macro_f1": 0.3333333432674408, "num_tokens": 1482547.0, "repeat_count": 0.0, - "routers_loss": 0.014642171561717987, + "routers_loss": 0.015615932643413544, "skip_count": 0.0, "step": 918, "text_loss": 0.3836168050765991 @@ -8738,32 +8738,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08984375, "learning_rate": 0.0009958011831003577, - "loss": 0.0457, + "loss": 0.0448, "macro_f1": 0.3272727429866791, "num_tokens": 1485807.0, "repeat_count": 0.0, - "routers_loss": 0.04119620472192764, + "routers_loss": 0.043541423976421356, "skip_count": 1.0, "step": 920, "text_loss": 0.4333936274051666 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 4.328734957440563, - "f1_execute": 0.943396270275116, - "f1_repeat": 0.0, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.154296875, + "grad_norm": 0.1337890625, "learning_rate": 0.000995761060132543, - "loss": 0.0433, - "macro_f1": 0.3144654333591461, + "loss": 0.0418, + "macro_f1": 0.6538461446762085, "num_tokens": 1488941.0, "repeat_count": 1.0, - "routers_loss": 0.06713195145130157, + "routers_loss": 0.05866432189941406, "skip_count": 2.0, "step": 922, "text_loss": 0.4106994867324829 @@ -8776,13 +8776,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1572265625, + "grad_norm": 0.1630859375, "learning_rate": 0.0009957207471881552, - "loss": 0.0533, + "loss": 0.0531, "macro_f1": 0.5492662787437439, "num_tokens": 1492026.0, "repeat_count": 0.0, - "routers_loss": 0.024023180827498436, + "routers_loss": 0.02714901603758335, "skip_count": 2.0, "step": 924, "text_loss": 0.542091429233551 @@ -8795,13 +8795,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.17578125, + "grad_norm": 0.1796875, "learning_rate": 0.0009956802442826415, - "loss": 0.0373, + "loss": 0.0386, "macro_f1": 0.3272727429866791, "num_tokens": 1494543.0, "repeat_count": 1.0, - "routers_loss": 0.05399841442704201, + "routers_loss": 0.0563737191259861, "skip_count": 0.0, "step": 926, "text_loss": 0.47209203243255615 @@ -8814,13 +8814,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1259765625, "learning_rate": 0.0009956395514315235, - "loss": 0.0488, + "loss": 0.0496, "macro_f1": 0.3272727429866791, "num_tokens": 1497831.0, "repeat_count": 1.0, - "routers_loss": 0.0299264844506979, + "routers_loss": 0.03285066783428192, "skip_count": 0.0, "step": 928, "text_loss": 0.6628931164741516 @@ -8833,13 +8833,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.154296875, "learning_rate": 0.0009955986686503943, - "loss": 0.0467, + "loss": 0.0466, "macro_f1": 0.3272727429866791, "num_tokens": 1501375.0, "repeat_count": 0.0, - "routers_loss": 0.023478010669350624, + "routers_loss": 0.024297121912240982, "skip_count": 1.0, "step": 930, "text_loss": 0.495676189661026 @@ -8852,13 +8852,13 @@ "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.1103515625, + "grad_norm": 0.11181640625, "learning_rate": 0.0009955575959549202, - "loss": 0.0447, + "loss": 0.0424, "macro_f1": 0.7795917987823486, "num_tokens": 1504363.0, "repeat_count": 1.0, - "routers_loss": 0.12116194516420364, + "routers_loss": 0.12196464836597443, "skip_count": 4.0, "step": 932, "text_loss": 0.26123273372650146 @@ -8871,13 +8871,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1708984375, "learning_rate": 0.0009955163333608408, - "loss": 0.053, + "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 1507178.0, "repeat_count": 0.0, - "routers_loss": 0.011879723519086838, + "routers_loss": 0.012947078794240952, "skip_count": 0.0, "step": 934, "text_loss": 0.32552677392959595 @@ -8890,13 +8890,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.154296875, "learning_rate": 0.0009954748808839674, - "loss": 0.0373, + "loss": 0.0379, "macro_f1": 0.3333333432674408, "num_tokens": 1509910.0, "repeat_count": 0.0, - "routers_loss": 0.009245929308235645, + "routers_loss": 0.008946365676820278, "skip_count": 0.0, "step": 936, "text_loss": 0.533141016960144 @@ -8909,13 +8909,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.140625, "learning_rate": 0.000995433238540185, - "loss": 0.0461, + "loss": 0.0466, "macro_f1": 0.6538461446762085, "num_tokens": 1512826.0, "repeat_count": 1.0, - "routers_loss": 0.032464127987623215, + "routers_loss": 0.029975678771734238, "skip_count": 1.0, "step": 938, "text_loss": 0.2953577935695648 @@ -8928,13 +8928,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.10888671875, "learning_rate": 0.0009953914063454512, - "loss": 0.0515, + "loss": 0.0497, "macro_f1": 0.3144654333591461, "num_tokens": 1517230.0, "repeat_count": 1.0, - "routers_loss": 0.08835392445325851, + "routers_loss": 0.0889134630560875, "skip_count": 2.0, "step": 940, "text_loss": 0.5368834733963013 @@ -8947,13 +8947,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.193359375, "learning_rate": 0.000995349384315796, - "loss": 0.0405, + "loss": 0.0413, "macro_f1": 0.3333333432674408, "num_tokens": 1519876.0, "repeat_count": 0.0, - "routers_loss": 0.014307246543467045, + "routers_loss": 0.013458753935992718, "skip_count": 0.0, "step": 942, "text_loss": 0.2005518227815628 @@ -8966,13 +8966,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1357421875, "learning_rate": 0.000995307172467322, - "loss": 0.0449, + "loss": 0.0444, "macro_f1": 0.31446540355682373, "num_tokens": 1522998.0, "repeat_count": 1.0, - "routers_loss": 0.10261563211679459, + "routers_loss": 0.08850377053022385, "skip_count": 1.0, "step": 944, "text_loss": 0.227926567196846 @@ -8985,13 +8985,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.1435546875, "learning_rate": 0.0009952647708162054, - "loss": 0.0507, + "loss": 0.0503, "macro_f1": 0.3272727429866791, "num_tokens": 1527100.0, "repeat_count": 0.0, - "routers_loss": 0.03316422924399376, + "routers_loss": 0.03199794515967369, "skip_count": 1.0, "step": 946, "text_loss": 0.4859686493873596 @@ -9004,13 +9004,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1748046875, "learning_rate": 0.0009952221793786942, - "loss": 0.0352, + "loss": 0.0354, "macro_f1": 0.3333333432674408, "num_tokens": 1530028.0, "repeat_count": 0.0, - "routers_loss": 0.00902469176799059, + "routers_loss": 0.006507779937237501, "skip_count": 0.0, "step": 948, "text_loss": 0.6855354905128479 @@ -9023,13 +9023,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.10986328125, "learning_rate": 0.0009951793981711097, - "loss": 0.0581, + "loss": 0.0584, "macro_f1": 0.6538461446762085, "num_tokens": 1533254.0, "repeat_count": 1.0, - "routers_loss": 0.06710167229175568, + "routers_loss": 0.06175103038549423, "skip_count": 1.0, "step": 950, "text_loss": 0.7590400576591492 @@ -9042,13 +9042,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1015625, + "grad_norm": 0.1025390625, "learning_rate": 0.0009951364272098458, - "loss": 0.0294, + "loss": 0.0295, "macro_f1": 0.5492662787437439, "num_tokens": 1536239.0, "repeat_count": 0.0, - "routers_loss": 0.04208769276738167, + "routers_loss": 0.03773383051156998, "skip_count": 2.0, "step": 952, "text_loss": 0.669784665107727 @@ -9061,13 +9061,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.1748046875, "learning_rate": 0.0009950932665113688, - "loss": 0.0505, + "loss": 0.0507, "macro_f1": 0.32098764181137085, "num_tokens": 1539682.0, "repeat_count": 0.0, - "routers_loss": 0.06530380249023438, + "routers_loss": 0.07280613481998444, "skip_count": 2.0, "step": 954, "text_loss": 0.3365570902824402 @@ -9080,13 +9080,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.12255859375, "learning_rate": 0.0009950499160922184, - "loss": 0.0545, + "loss": 0.0541, "macro_f1": 0.3333333432674408, "num_tokens": 1542875.0, "repeat_count": 0.0, - "routers_loss": 0.01803453080356121, + "routers_loss": 0.01770266517996788, "skip_count": 0.0, "step": 956, "text_loss": 0.0921545997262001 @@ -9099,13 +9099,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.09375, "learning_rate": 0.000995006375969006, - "loss": 0.0481, + "loss": 0.0473, "macro_f1": 0.3272727429866791, "num_tokens": 1547135.0, "repeat_count": 1.0, - "routers_loss": 0.08461762219667435, + "routers_loss": 0.07672002166509628, "skip_count": 0.0, "step": 958, "text_loss": 0.5887606739997864 @@ -9120,11 +9120,11 @@ "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0009949626461584165, - "loss": 0.0441, + "loss": 0.043, "macro_f1": 0.3333333432674408, "num_tokens": 1550100.0, "repeat_count": 0.0, - "routers_loss": 0.007111486047506332, + "routers_loss": 0.006247182376682758, "skip_count": 0.0, "step": 960, "text_loss": 0.5777931213378906 @@ -9137,13 +9137,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.11181640625, + "grad_norm": 0.119140625, "learning_rate": 0.0009949187266772076, - "loss": 0.0361, + "loss": 0.0366, "macro_f1": 0.5492662787437439, "num_tokens": 1553192.0, "repeat_count": 0.0, - "routers_loss": 0.029776185750961304, + "routers_loss": 0.030319908633828163, "skip_count": 2.0, "step": 962, "text_loss": 0.2370252162218094 @@ -9156,13 +9156,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.126953125, + "grad_norm": 0.1474609375, "learning_rate": 0.0009948746175422088, - "loss": 0.0506, + "loss": 0.0511, "macro_f1": 0.3333333432674408, "num_tokens": 1556318.0, "repeat_count": 0.0, - "routers_loss": 0.007108999416232109, + "routers_loss": 0.006004320923238993, "skip_count": 0.0, "step": 964, "text_loss": 0.6271032094955444 @@ -9175,13 +9175,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.15234375, "learning_rate": 0.000994830318770323, - "loss": 0.0498, + "loss": 0.0514, "macro_f1": 0.3333333432674408, "num_tokens": 1559195.0, "repeat_count": 0.0, - "routers_loss": 0.01126947533339262, + "routers_loss": 0.011544366367161274, "skip_count": 0.0, "step": 966, "text_loss": 0.47256720066070557 @@ -9194,13 +9194,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.162109375, + "grad_norm": 0.171875, "learning_rate": 0.0009947858303785255, - "loss": 0.0366, + "loss": 0.0374, "macro_f1": 0.6603773832321167, "num_tokens": 1561813.0, "repeat_count": 1.0, - "routers_loss": 0.05142999067902565, + "routers_loss": 0.05258861929178238, "skip_count": 1.0, "step": 968, "text_loss": 0.7703132629394531 @@ -9213,13 +9213,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.1142578125, "learning_rate": 0.0009947411523838648, - "loss": 0.0461, + "loss": 0.0453, "macro_f1": 0.3333333432674408, "num_tokens": 1564634.0, "repeat_count": 0.0, - "routers_loss": 0.010770819149911404, + "routers_loss": 0.011216280050575733, "skip_count": 0.0, "step": 970, "text_loss": 0.4666804075241089 @@ -9232,13 +9232,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.1533203125, "learning_rate": 0.0009946962848034608, - "loss": 0.0692, + "loss": 0.0696, "macro_f1": 0.3333333432674408, "num_tokens": 1567959.0, "repeat_count": 0.0, - "routers_loss": 0.008775795809924603, + "routers_loss": 0.009387624450027943, "skip_count": 0.0, "step": 972, "text_loss": 0.4067264199256897 @@ -9251,13 +9251,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.203125, "learning_rate": 0.0009946512276545075, - "loss": 0.0403, + "loss": 0.0397, "macro_f1": 0.3272727429866791, "num_tokens": 1571221.0, "repeat_count": 1.0, - "routers_loss": 0.05100395902991295, + "routers_loss": 0.041713520884513855, "skip_count": 0.0, "step": 974, "text_loss": 0.5242366194725037 @@ -9270,13 +9270,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, - "grad_norm": 0.25390625, + "grad_norm": 0.228515625, "learning_rate": 0.0009946059809542705, - "loss": 0.0503, + "loss": 0.0487, "macro_f1": 0.7644445300102234, "num_tokens": 1575033.0, "repeat_count": 2.0, - "routers_loss": 0.06653711199760437, + "routers_loss": 0.05748331546783447, "skip_count": 2.0, "step": 976, "text_loss": 0.5704690217971802 @@ -9284,18 +9284,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 4.591722923393014, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1396484375, "learning_rate": 0.0009945605447200887, - "loss": 0.0435, - "macro_f1": 0.3333333432674408, + "loss": 0.0445, + "macro_f1": 0.3272727429866791, "num_tokens": 1579050.0, "repeat_count": 0.0, - "routers_loss": 0.009865665808320045, + "routers_loss": 0.016765203326940536, "skip_count": 0.0, "step": 978, "text_loss": 0.4804173707962036 @@ -9308,13 +9308,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.130859375, + "grad_norm": 0.1337890625, "learning_rate": 0.0009945149189693732, - "loss": 0.0399, + "loss": 0.0406, "macro_f1": 0.5492662787437439, "num_tokens": 1582967.0, "repeat_count": 0.0, - "routers_loss": 0.021175632253289223, + "routers_loss": 0.021518222987651825, "skip_count": 2.0, "step": 980, "text_loss": 0.4138598144054413 @@ -9327,32 +9327,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11181640625, + "grad_norm": 0.11474609375, "learning_rate": 0.0009944691037196078, - "loss": 0.0472, + "loss": 0.0456, "macro_f1": 0.3333333432674408, "num_tokens": 1586282.0, "repeat_count": 0.0, - "routers_loss": 0.011803832836449146, + "routers_loss": 0.012246460653841496, "skip_count": 0.0, "step": 982, "text_loss": 0.22561736404895782 }, { - "acc_repeat": 0.0, + "acc_repeat": 0.5, "acc_skip": 0.800000011920929, - "avg_layers": 23.0, + "avg_layers": 24.0, "epoch": 4.6199002054593485, - "f1_execute": 0.9090908765792847, - "f1_repeat": 0.0, + "f1_execute": 0.930232584476471, + "f1_repeat": 0.6666666865348816, "f1_skip": 0.8000000715255737, - "grad_norm": 0.142578125, + "grad_norm": 0.1455078125, "learning_rate": 0.0009944230989883491, - "loss": 0.0467, - "macro_f1": 0.5696970224380493, + "loss": 0.0456, + "macro_f1": 0.7989664077758789, "num_tokens": 1589279.0, "repeat_count": 2.0, - "routers_loss": 0.08856551349163055, + "routers_loss": 0.09344895929098129, "skip_count": 5.0, "step": 984, "text_loss": 0.4416656494140625 @@ -9365,13 +9365,13 @@ "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1240234375, + "grad_norm": 0.111328125, "learning_rate": 0.0009943769047932264, - "loss": 0.0413, + "loss": 0.0404, "macro_f1": 0.5359477400779724, "num_tokens": 1592398.0, "repeat_count": 2.0, - "routers_loss": 0.08593414723873138, + "routers_loss": 0.08916857838630676, "skip_count": 2.0, "step": 986, "text_loss": 0.5536438822746277 @@ -9384,13 +9384,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.154296875, + "grad_norm": 0.15234375, "learning_rate": 0.000994330521151941, - "loss": 0.0399, + "loss": 0.039, "macro_f1": 0.32098764181137085, "num_tokens": 1596213.0, "repeat_count": 1.0, - "routers_loss": 0.07049509882926941, + "routers_loss": 0.06114347651600838, "skip_count": 1.0, "step": 988, "text_loss": 0.5835405588150024 @@ -9403,13 +9403,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.1953125, "learning_rate": 0.000994283948082267, - "loss": 0.0595, + "loss": 0.0573, "macro_f1": 0.3333333432674408, "num_tokens": 1598827.0, "repeat_count": 0.0, - "routers_loss": 0.0019258069805800915, + "routers_loss": 0.0017335431184619665, "skip_count": 0.0, "step": 990, "text_loss": 0.5857380032539368 @@ -9422,13 +9422,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.10693359375, "learning_rate": 0.0009942371856020522, - "loss": 0.0335, + "loss": 0.0341, "macro_f1": 0.3333333432674408, "num_tokens": 1602915.0, "repeat_count": 0.0, - "routers_loss": 0.014094089157879353, + "routers_loss": 0.014606470242142677, "skip_count": 0.0, "step": 992, "text_loss": 0.6939892768859863 @@ -9436,18 +9436,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 31.0, "epoch": 4.666862342236572, - "f1_execute": 0.9583333134651184, + "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.140625, "learning_rate": 0.0009941902337292155, - "loss": 0.0603, - "macro_f1": 0.6527777910232544, + "loss": 0.06, + "macro_f1": 0.6598639488220215, "num_tokens": 1605776.0, "repeat_count": 3.0, - "routers_loss": 0.06360147893428802, + "routers_loss": 0.06297315657138824, "skip_count": 1.0, "step": 994, "text_loss": 0.37616831064224243 @@ -9460,13 +9460,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.10546875, + "grad_norm": 0.1083984375, "learning_rate": 0.0009941430924817487, - "loss": 0.0573, + "loss": 0.0572, "macro_f1": 0.5492662787437439, "num_tokens": 1609856.0, "repeat_count": 0.0, - "routers_loss": 0.0326208658516407, + "routers_loss": 0.03297794610261917, "skip_count": 2.0, "step": 996, "text_loss": 0.2098303586244583 @@ -9479,13 +9479,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09912109375, + "grad_norm": 0.10107421875, "learning_rate": 0.000994095761877717, - "loss": 0.0502, + "loss": 0.0499, "macro_f1": 0.3333333432674408, "num_tokens": 1612904.0, "repeat_count": 0.0, - "routers_loss": 0.012660752050578594, + "routers_loss": 0.012901155278086662, "skip_count": 0.0, "step": 998, "text_loss": 0.20103533565998077 @@ -9498,13 +9498,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.265625, + "grad_norm": 0.259765625, "learning_rate": 0.000994048241935257, - "loss": 0.0537, + "loss": 0.0535, "macro_f1": 0.3272727429866791, "num_tokens": 1615540.0, "repeat_count": 0.0, - "routers_loss": 0.021756287664175034, + "routers_loss": 0.020434845238924026, "skip_count": 0.0, "step": 1000, "text_loss": 0.32709044218063354 diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin index deeea733277b4031781a5b299881dd8e675e7606..a3d3ae372faf14539639f54454aa52b6ee730c4a 100644 --- a/checkpoint-1000/training_args.bin +++ b/checkpoint-1000/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b3f5975f57762b552c7ee29776bf32a4dbb125781a0658488d3884fb25c5296 +oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8 size 5880 diff --git a/checkpoint-10000/chat_template.jinja b/checkpoint-10000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/checkpoint-10000/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-10000/config.json b/checkpoint-10000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3552bd1c531626bd125241ad5dfcd7fb677462cd --- /dev/null +++ b/checkpoint-10000/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 3072, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.55.2", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-10000/generation_config.json b/checkpoint-10000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b513e54e3195b917260c9a8a04c9f3683f19de35 --- /dev/null +++ b/checkpoint-10000/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.55.2" +} diff --git a/checkpoint-10000/model-00001-of-00002.safetensors b/checkpoint-10000/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..08a01e1ba553cdcb2222f034a209861d7b54e284 --- /dev/null +++ b/checkpoint-10000/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13cbd6d16e927a0c5bad54102514e6e18b4a47b3a6eb911e39d678d328d19f55 +size 4965799096 diff --git a/checkpoint-10000/model-00002-of-00002.safetensors b/checkpoint-10000/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..22312c108c4857773753d52c1f1a230315388e35 --- /dev/null +++ b/checkpoint-10000/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b58dd61aa65becc607555e6f23c5942f6e74879d50af451a1fa3137e6aca6ea +size 1481790520 diff --git a/checkpoint-10000/model.safetensors.index.json b/checkpoint-10000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..21bb567761d75ade0c0eef6495c450697dd3ff18 --- /dev/null +++ b/checkpoint-10000/model.safetensors.index.json @@ -0,0 +1,374 @@ +{ + "metadata": { + "total_parameters": 3223774292, + "total_size": 6447548584 + }, + "weight_map": { + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.routers.0.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.0.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.0.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.0.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.1.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.1.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.1.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.1.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.10.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.10.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.10.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.10.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.11.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.11.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.11.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.11.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.12.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.12.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.12.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.12.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.13.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.13.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.13.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.13.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.14.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.14.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.14.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.14.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.15.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.15.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.15.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.15.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.16.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.16.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.16.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.16.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.17.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.17.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.17.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.17.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.18.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.18.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.18.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.18.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.19.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.19.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.19.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.19.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.2.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.2.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.2.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.2.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.20.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.20.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.20.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.20.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.21.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.21.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.21.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.21.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.22.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.22.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.22.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.22.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.23.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.23.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.23.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.23.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.24.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.24.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.24.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.24.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.25.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.25.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.25.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.25.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.26.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.26.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.26.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.26.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.27.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.27.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.27.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.27.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.3.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.3.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.3.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.3.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.4.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.4.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.4.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.4.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.5.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.5.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.5.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.5.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.6.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.6.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.6.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.6.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.7.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.7.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.7.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.7.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.8.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.8.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.8.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.8.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.9.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.9.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.9.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.9.linear2.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-10000/optimizer.pt b/checkpoint-10000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ec455b471f1a42016d92c425d6a270ff218ceea --- /dev/null +++ b/checkpoint-10000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:840250979ca530333499796a7bf10f85f6e6db757e225d1fa8ebc9adffb26459 +size 44191162 diff --git a/checkpoint-10000/rng_state.pth b/checkpoint-10000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ec62bcc28f4b60e6182ac81ef0e159a2cf3e7183 --- /dev/null +++ b/checkpoint-10000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dec9f38518272c3edcef2f4b76e7cf3ba41857ee958849b9cec81d28afbeefdc +size 14244 diff --git a/checkpoint-10000/scheduler.pt b/checkpoint-10000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7b2696f98af8e73f5f371279422f32926e8d228 --- /dev/null +++ b/checkpoint-10000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f553e44a7bafda5633bddec9889354deb82b5be5f750b2e4c9bae414b2b61fd3 +size 1064 diff --git a/checkpoint-10000/special_tokens_map.json b/checkpoint-10000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..165b36bc2293dda9a2fb3c0daf6577d9eba9df7a --- /dev/null +++ b/checkpoint-10000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|finetune_right_pad_id|>" +} diff --git a/checkpoint-10000/tokenizer.json b/checkpoint-10000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-10000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-10000/tokenizer_config.json b/checkpoint-10000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c68051fe3c4d23234a59316bc52d21f6e3a4182c --- /dev/null +++ b/checkpoint-10000/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-10000/trainer_state.json b/checkpoint-10000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ceed09df682bd89d0877b30a2e15e27091dbc1d8 --- /dev/null +++ b/checkpoint-10000/trainer_state.json @@ -0,0 +1,95034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 46.94863516289991, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.009392427355444672, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.25, + "learning_rate": 2e-06, + "loss": 0.4974, + "macro_f1": 0.23255813121795654, + "num_tokens": 3175.0, + "repeat_count": 0.0, + "routers_loss": 0.4339469373226166, + "skip_count": 0.0, + "step": 2, + "text_loss": 0.3330848515033722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 23.0, + "epoch": 0.018784854710889344, + "f1_execute": 0.7272726893424988, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.8359375, + "learning_rate": 6e-06, + "loss": 0.4988, + "macro_f1": 0.24242423474788666, + "num_tokens": 5816.0, + "repeat_count": 0.0, + "routers_loss": 0.4511934816837311, + "skip_count": 1.0, + "step": 4, + "text_loss": 0.4571273922920227 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.02817728206633402, + "f1_execute": 0.6666666865348816, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.234375, + "learning_rate": 1e-05, + "loss": 0.5113, + "macro_f1": 0.222222238779068, + "num_tokens": 9739.0, + "repeat_count": 0.0, + "routers_loss": 0.49306994676589966, + "skip_count": 0.0, + "step": 6, + "text_loss": 0.41060560941696167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.03756970942177869, + "f1_execute": 0.5641025900840759, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7265625, + "learning_rate": 1.4e-05, + "loss": 0.4766, + "macro_f1": 0.18803420662879944, + "num_tokens": 12869.0, + "repeat_count": 1.0, + "routers_loss": 0.48872503638267517, + "skip_count": 2.0, + "step": 8, + "text_loss": 0.36678561568260193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.046962136777223364, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.78125, + "learning_rate": 1.8e-05, + "loss": 0.4806, + "macro_f1": 0.23255813121795654, + "num_tokens": 15845.0, + "repeat_count": 0.0, + "routers_loss": 0.45077216625213623, + "skip_count": 0.0, + "step": 10, + "text_loss": 0.5597779154777527 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 0.05635456413266804, + "f1_execute": 0.7179487347602844, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.20000000298023224, + "grad_norm": 1.5390625, + "learning_rate": 2.2e-05, + "loss": 0.4557, + "macro_f1": 0.40122103691101074, + "num_tokens": 19353.0, + "repeat_count": 2.0, + "routers_loss": 0.4130440056324005, + "skip_count": 3.0, + "step": 12, + "text_loss": 0.2056603729724884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.06574699148811271, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.4375, + "learning_rate": 2.6e-05, + "loss": 0.5129, + "macro_f1": 0.23255813121795654, + "num_tokens": 22675.0, + "repeat_count": 0.0, + "routers_loss": 0.4582902193069458, + "skip_count": 0.0, + "step": 14, + "text_loss": 0.32989829778671265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 0.07513941884355738, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.0, + "f1_skip": 0.2222222238779068, + "grad_norm": 1.7421875, + "learning_rate": 3e-05, + "loss": 0.4729, + "macro_f1": 0.3017163574695587, + "num_tokens": 26022.0, + "repeat_count": 0.0, + "routers_loss": 0.42910993099212646, + "skip_count": 1.0, + "step": 16, + "text_loss": 0.1353905349969864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.08453184619900206, + "f1_execute": 0.7555555105209351, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.4765625, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.4274, + "macro_f1": 0.2518518567085266, + "num_tokens": 29251.0, + "repeat_count": 0.0, + "routers_loss": 0.3990713059902191, + "skip_count": 0.0, + "step": 18, + "text_loss": 0.3806765377521515 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.09392427355444673, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.0, + "grad_norm": 1.3125, + "learning_rate": 3.8e-05, + "loss": 0.4261, + "macro_f1": 0.3228803873062134, + "num_tokens": 32545.0, + "repeat_count": 1.0, + "routers_loss": 0.40146592259407043, + "skip_count": 0.0, + "step": 20, + "text_loss": 0.25648367404937744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.1033167009098914, + "f1_execute": 0.7272727489471436, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.625, + "learning_rate": 4.2000000000000004e-05, + "loss": 0.404, + "macro_f1": 0.24242424964904785, + "num_tokens": 36560.0, + "repeat_count": 0.0, + "routers_loss": 0.372715026140213, + "skip_count": 0.0, + "step": 22, + "text_loss": 0.2799522578716278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.11270912826533608, + "f1_execute": 0.7555555105209351, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.6328125, + "learning_rate": 4.6e-05, + "loss": 0.4218, + "macro_f1": 0.2518518567085266, + "num_tokens": 39597.0, + "repeat_count": 0.0, + "routers_loss": 0.4504941403865814, + "skip_count": 0.0, + "step": 24, + "text_loss": 0.6635695695877075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.12210155562078075, + "f1_execute": 0.8085106015205383, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7109375, + "learning_rate": 5e-05, + "loss": 0.3886, + "macro_f1": 0.26950353384017944, + "num_tokens": 43080.0, + "repeat_count": 0.0, + "routers_loss": 0.3498791456222534, + "skip_count": 0.0, + "step": 26, + "text_loss": 0.7035041451454163 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.13149398297622542, + "f1_execute": 0.8085106015205383, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.34375, + "learning_rate": 5.4e-05, + "loss": 0.3724, + "macro_f1": 0.26950353384017944, + "num_tokens": 46406.0, + "repeat_count": 0.0, + "routers_loss": 0.31265875697135925, + "skip_count": 0.0, + "step": 28, + "text_loss": 0.6388277411460876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.1408864103316701, + "f1_execute": 0.8571428060531616, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.2578125, + "learning_rate": 5.800000000000001e-05, + "loss": 0.341, + "macro_f1": 0.2857142686843872, + "num_tokens": 49966.0, + "repeat_count": 0.0, + "routers_loss": 0.3200918138027191, + "skip_count": 2.0, + "step": 30, + "text_loss": 0.17372547090053558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.15027883768711475, + "f1_execute": 0.8571428060531616, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.4140625, + "learning_rate": 6.2e-05, + "loss": 0.3207, + "macro_f1": 0.2857142686843872, + "num_tokens": 53378.0, + "repeat_count": 1.0, + "routers_loss": 0.32304447889328003, + "skip_count": 1.0, + "step": 32, + "text_loss": 0.18196581304073334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.15967126504255943, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.46875, + "learning_rate": 6.6e-05, + "loss": 0.3304, + "macro_f1": 0.3006536364555359, + "num_tokens": 56933.0, + "repeat_count": 0.0, + "routers_loss": 0.24814388155937195, + "skip_count": 0.0, + "step": 34, + "text_loss": 0.28823015093803406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.16906369239800412, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.1171875, + "learning_rate": 7.000000000000001e-05, + "loss": 0.2778, + "macro_f1": 0.3006536066532135, + "num_tokens": 60744.0, + "repeat_count": 1.0, + "routers_loss": 0.22411039471626282, + "skip_count": 0.0, + "step": 36, + "text_loss": 0.5260357856750488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.17845611975344877, + "f1_execute": 0.8571428656578064, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.484375, + "learning_rate": 7.4e-05, + "loss": 0.2738, + "macro_f1": 0.2857142984867096, + "num_tokens": 64900.0, + "repeat_count": 0.0, + "routers_loss": 0.44355395436286926, + "skip_count": 0.0, + "step": 38, + "text_loss": 0.5382097363471985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.18784854710889345, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.3828125, + "learning_rate": 7.8e-05, + "loss": 0.2137, + "macro_f1": 0.3076923191547394, + "num_tokens": 68000.0, + "repeat_count": 0.0, + "routers_loss": 0.202330082654953, + "skip_count": 0.0, + "step": 40, + "text_loss": 0.5946118831634521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.19724097446433814, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.78125, + "learning_rate": 8.2e-05, + "loss": 0.21, + "macro_f1": 0.3144654333591461, + "num_tokens": 70529.0, + "repeat_count": 0.0, + "routers_loss": 0.18023855984210968, + "skip_count": 0.0, + "step": 42, + "text_loss": 0.5550904273986816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2066334018197828, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.609375, + "learning_rate": 8.599999999999999e-05, + "loss": 0.1918, + "macro_f1": 0.32098764181137085, + "num_tokens": 73427.0, + "repeat_count": 2.0, + "routers_loss": 0.2101590931415558, + "skip_count": 0.0, + "step": 44, + "text_loss": 0.4636923372745514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.21602582917522747, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.53125, + "learning_rate": 8.999999999999999e-05, + "loss": 0.1881, + "macro_f1": 0.3333333432674408, + "num_tokens": 76472.0, + "repeat_count": 0.0, + "routers_loss": 0.11800424009561539, + "skip_count": 0.0, + "step": 46, + "text_loss": 0.4187001883983612 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.22541825653067216, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.953125, + "learning_rate": 9.400000000000001e-05, + "loss": 0.1446, + "macro_f1": 0.3272727429866791, + "num_tokens": 79124.0, + "repeat_count": 1.0, + "routers_loss": 0.11632519960403442, + "skip_count": 0.0, + "step": 48, + "text_loss": 0.2253919243812561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.2348106838861168, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.58984375, + "learning_rate": 9.800000000000001e-05, + "loss": 0.1543, + "macro_f1": 0.32098767161369324, + "num_tokens": 81980.0, + "repeat_count": 1.0, + "routers_loss": 0.09669367223978043, + "skip_count": 0.0, + "step": 50, + "text_loss": 0.6053179502487183 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.2442031112415615, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.8515625, + "learning_rate": 0.000102, + "loss": 0.1393, + "macro_f1": 0.32098764181137085, + "num_tokens": 85236.0, + "repeat_count": 0.0, + "routers_loss": 0.12471720576286316, + "skip_count": 0.0, + "step": 52, + "text_loss": 0.6027331948280334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2535955385970062, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.421875, + "learning_rate": 0.000106, + "loss": 0.1473, + "macro_f1": 0.32098764181137085, + "num_tokens": 88238.0, + "repeat_count": 0.0, + "routers_loss": 0.1376056969165802, + "skip_count": 2.0, + "step": 54, + "text_loss": 0.2861751616001129 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.26298796595245083, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.35546875, + "learning_rate": 0.00011, + "loss": 0.1082, + "macro_f1": 0.3333333432674408, + "num_tokens": 91056.0, + "repeat_count": 0.0, + "routers_loss": 0.07449393719434738, + "skip_count": 0.0, + "step": 56, + "text_loss": 0.48106974363327026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.2723803933078955, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000114, + "loss": 0.1123, + "macro_f1": 0.32098764181137085, + "num_tokens": 94987.0, + "repeat_count": 0.0, + "routers_loss": 0.07064720243215561, + "skip_count": 0.0, + "step": 58, + "text_loss": 0.3554874658584595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2817728206633402, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.5390625, + "learning_rate": 0.000118, + "loss": 0.1234, + "macro_f1": 0.32098764181137085, + "num_tokens": 97909.0, + "repeat_count": 0.0, + "routers_loss": 0.16835889220237732, + "skip_count": 2.0, + "step": 60, + "text_loss": 0.5475804805755615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.29116524801878485, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000122, + "loss": 0.1224, + "macro_f1": 0.3333333432674408, + "num_tokens": 101043.0, + "repeat_count": 0.0, + "routers_loss": 0.06127442046999931, + "skip_count": 0.0, + "step": 62, + "text_loss": 0.5966938734054565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3005576753742295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000126, + "loss": 0.0931, + "macro_f1": 0.3333333432674408, + "num_tokens": 104103.0, + "repeat_count": 0.0, + "routers_loss": 0.047825805842876434, + "skip_count": 0.0, + "step": 64, + "text_loss": 0.5480486750602722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3099501027296742, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.00013000000000000002, + "loss": 0.1088, + "macro_f1": 0.3006536364555359, + "num_tokens": 107009.0, + "repeat_count": 1.0, + "routers_loss": 0.275174081325531, + "skip_count": 4.0, + "step": 66, + "text_loss": 0.41714492440223694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.31934253008511887, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.000134, + "loss": 0.1123, + "macro_f1": 0.3333333432674408, + "num_tokens": 110486.0, + "repeat_count": 0.0, + "routers_loss": 0.029025178402662277, + "skip_count": 0.0, + "step": 68, + "text_loss": 0.6775627732276917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3287349574405635, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.314453125, + "learning_rate": 0.00013800000000000002, + "loss": 0.1049, + "macro_f1": 0.3272727429866791, + "num_tokens": 113878.0, + "repeat_count": 0.0, + "routers_loss": 0.10141710191965103, + "skip_count": 1.0, + "step": 70, + "text_loss": 0.6678873896598816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.33812738479600823, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.248046875, + "learning_rate": 0.00014199999999999998, + "loss": 0.1119, + "macro_f1": 0.3272727429866791, + "num_tokens": 116989.0, + "repeat_count": 0.0, + "routers_loss": 0.08002066612243652, + "skip_count": 1.0, + "step": 72, + "text_loss": 0.405692994594574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3475198121514529, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1787109375, + "learning_rate": 0.000146, + "loss": 0.0944, + "macro_f1": 0.3144654333591461, + "num_tokens": 119883.0, + "repeat_count": 0.0, + "routers_loss": 0.1867009848356247, + "skip_count": 3.0, + "step": 74, + "text_loss": 0.44616150856018066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.35691223950689754, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.333984375, + "learning_rate": 0.00015, + "loss": 0.1003, + "macro_f1": 0.32098764181137085, + "num_tokens": 123325.0, + "repeat_count": 0.0, + "routers_loss": 0.07042168825864792, + "skip_count": 2.0, + "step": 76, + "text_loss": 0.11340200901031494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.36630466686234225, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26171875, + "learning_rate": 0.000154, + "loss": 0.1066, + "macro_f1": 0.32098764181137085, + "num_tokens": 126131.0, + "repeat_count": 0.0, + "routers_loss": 0.11535373330116272, + "skip_count": 2.0, + "step": 78, + "text_loss": 0.3269135355949402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3756970942177869, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.000158, + "loss": 0.0891, + "macro_f1": 0.3272727429866791, + "num_tokens": 130349.0, + "repeat_count": 0.0, + "routers_loss": 0.09497501701116562, + "skip_count": 1.0, + "step": 80, + "text_loss": 0.15273472666740417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.38508952157323156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000162, + "loss": 0.0929, + "macro_f1": 0.3333333432674408, + "num_tokens": 133607.0, + "repeat_count": 0.0, + "routers_loss": 0.030639523640275, + "skip_count": 0.0, + "step": 82, + "text_loss": 0.282884806394577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3944819489286763, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.00016600000000000002, + "loss": 0.1254, + "macro_f1": 0.3272727429866791, + "num_tokens": 136694.0, + "repeat_count": 0.0, + "routers_loss": 0.07906441390514374, + "skip_count": 1.0, + "step": 84, + "text_loss": 0.459094375371933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.40387437628412093, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.00017, + "loss": 0.1071, + "macro_f1": 0.3144654333591461, + "num_tokens": 139966.0, + "repeat_count": 1.0, + "routers_loss": 0.1124570444226265, + "skip_count": 2.0, + "step": 86, + "text_loss": 0.29985448718070984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4132668036395656, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.000174, + "loss": 0.1031, + "macro_f1": 0.32098764181137085, + "num_tokens": 142788.0, + "repeat_count": 2.0, + "routers_loss": 0.1966402679681778, + "skip_count": 0.0, + "step": 88, + "text_loss": 0.6435291767120361 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4226592309950103, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.349609375, + "learning_rate": 0.000178, + "loss": 0.0963, + "macro_f1": 0.3333333432674408, + "num_tokens": 146192.0, + "repeat_count": 0.0, + "routers_loss": 0.0325632207095623, + "skip_count": 0.0, + "step": 90, + "text_loss": 0.35170626640319824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.43205165835045495, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2265625, + "learning_rate": 0.000182, + "loss": 0.1073, + "macro_f1": 0.32098764181137085, + "num_tokens": 149792.0, + "repeat_count": 1.0, + "routers_loss": 0.15115146338939667, + "skip_count": 1.0, + "step": 92, + "text_loss": 0.83159339427948 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4414440857058996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.000186, + "loss": 0.1073, + "macro_f1": 0.3333333432674408, + "num_tokens": 152766.0, + "repeat_count": 0.0, + "routers_loss": 0.043313540518283844, + "skip_count": 0.0, + "step": 94, + "text_loss": 0.49707934260368347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4508365130613443, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019, + "loss": 0.0947, + "macro_f1": 0.3333333432674408, + "num_tokens": 156112.0, + "repeat_count": 0.0, + "routers_loss": 0.032021280378103256, + "skip_count": 0.0, + "step": 96, + "text_loss": 0.27608928084373474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.46022894041678897, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2099609375, + "learning_rate": 0.000194, + "loss": 0.0846, + "macro_f1": 0.3076923191547394, + "num_tokens": 159454.0, + "repeat_count": 2.0, + "routers_loss": 0.24473154544830322, + "skip_count": 2.0, + "step": 98, + "text_loss": 0.6026689410209656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4696213677722336, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.00019800000000000002, + "loss": 0.1028, + "macro_f1": 0.32098764181137085, + "num_tokens": 163661.0, + "repeat_count": 0.0, + "routers_loss": 0.11468276381492615, + "skip_count": 2.0, + "step": 100, + "text_loss": 0.46733155846595764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.47901379512767833, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000202, + "loss": 0.1089, + "macro_f1": 0.3333333432674408, + "num_tokens": 167134.0, + "repeat_count": 0.0, + "routers_loss": 0.021144939586520195, + "skip_count": 0.0, + "step": 102, + "text_loss": 0.6362994909286499 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.488406222483123, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000206, + "loss": 0.0621, + "macro_f1": 0.3272727429866791, + "num_tokens": 170433.0, + "repeat_count": 0.0, + "routers_loss": 0.06594710797071457, + "skip_count": 1.0, + "step": 104, + "text_loss": 0.4515477120876312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.49779864983856764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.00021, + "loss": 0.0929, + "macro_f1": 0.3333333432674408, + "num_tokens": 173387.0, + "repeat_count": 0.0, + "routers_loss": 0.032923027873039246, + "skip_count": 0.0, + "step": 106, + "text_loss": 0.6638453006744385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5071910771940124, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.240234375, + "learning_rate": 0.000214, + "loss": 0.0883, + "macro_f1": 0.3272727429866791, + "num_tokens": 176170.0, + "repeat_count": 1.0, + "routers_loss": 0.08034781366586685, + "skip_count": 0.0, + "step": 108, + "text_loss": 1.186936855316162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.516583504549457, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000218, + "loss": 0.0794, + "macro_f1": 0.3272727429866791, + "num_tokens": 179877.0, + "repeat_count": 0.0, + "routers_loss": 0.07814185321331024, + "skip_count": 1.0, + "step": 110, + "text_loss": 0.5488709211349487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5259759319049017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000222, + "loss": 0.0946, + "macro_f1": 0.3333333432674408, + "num_tokens": 182726.0, + "repeat_count": 0.0, + "routers_loss": 0.01884695515036583, + "skip_count": 0.0, + "step": 112, + "text_loss": 0.5195863842964172 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5353683592603463, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19921875, + "learning_rate": 0.00022600000000000002, + "loss": 0.0974, + "macro_f1": 0.32098764181137085, + "num_tokens": 185624.0, + "repeat_count": 0.0, + "routers_loss": 0.09657823294401169, + "skip_count": 2.0, + "step": 114, + "text_loss": 0.43858134746551514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3046875, + "learning_rate": 0.00023, + "loss": 0.0753, + "macro_f1": 0.3333333432674408, + "num_tokens": 188155.0, + "repeat_count": 0.0, + "routers_loss": 0.01463601179420948, + "skip_count": 0.0, + "step": 116, + "text_loss": 0.392981618642807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5541532139712357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.439453125, + "learning_rate": 0.00023400000000000002, + "loss": 0.0843, + "macro_f1": 0.3333333432674408, + "num_tokens": 190970.0, + "repeat_count": 0.0, + "routers_loss": 0.03859659656882286, + "skip_count": 0.0, + "step": 118, + "text_loss": 0.309179425239563 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5635456413266804, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2255859375, + "learning_rate": 0.00023799999999999998, + "loss": 0.053, + "macro_f1": 0.3333333432674408, + "num_tokens": 193988.0, + "repeat_count": 0.0, + "routers_loss": 0.019092386588454247, + "skip_count": 0.0, + "step": 120, + "text_loss": 0.48543134331703186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.572938068682125, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.35546875, + "learning_rate": 0.000242, + "loss": 0.1203, + "macro_f1": 0.3272727429866791, + "num_tokens": 196475.0, + "repeat_count": 0.0, + "routers_loss": 0.0619138665497303, + "skip_count": 1.0, + "step": 122, + "text_loss": 0.4615364074707031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5823304960375697, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1875, + "learning_rate": 0.000246, + "loss": 0.1002, + "macro_f1": 0.3272727429866791, + "num_tokens": 200045.0, + "repeat_count": 1.0, + "routers_loss": 0.09752107411623001, + "skip_count": 0.0, + "step": 124, + "text_loss": 0.15802054107189178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5917229233930144, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.00025, + "loss": 0.0773, + "macro_f1": 0.3333333432674408, + "num_tokens": 203214.0, + "repeat_count": 0.0, + "routers_loss": 0.02896115928888321, + "skip_count": 0.0, + "step": 126, + "text_loss": 0.4543360471725464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.601115350748459, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.4296875, + "learning_rate": 0.000254, + "loss": 0.0973, + "macro_f1": 0.3333333432674408, + "num_tokens": 206168.0, + "repeat_count": 0.0, + "routers_loss": 0.011423567309975624, + "skip_count": 0.0, + "step": 128, + "text_loss": 0.4730179011821747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6105077781039038, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.365234375, + "learning_rate": 0.00025800000000000004, + "loss": 0.099, + "macro_f1": 0.3333333432674408, + "num_tokens": 209907.0, + "repeat_count": 0.0, + "routers_loss": 0.01957600563764572, + "skip_count": 0.0, + "step": 130, + "text_loss": 0.45122358202934265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6199002054593484, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.000262, + "loss": 0.0868, + "macro_f1": 0.3272727429866791, + "num_tokens": 213521.0, + "repeat_count": 0.0, + "routers_loss": 0.04882373288273811, + "skip_count": 1.0, + "step": 132, + "text_loss": 0.4341491758823395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6292926328147931, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.000266, + "loss": 0.0834, + "macro_f1": 0.3333333432674408, + "num_tokens": 216484.0, + "repeat_count": 0.0, + "routers_loss": 0.016083380207419395, + "skip_count": 0.0, + "step": 134, + "text_loss": 0.46990111470222473 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6386850601702377, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.00027, + "loss": 0.0863, + "macro_f1": 0.3333333432674408, + "num_tokens": 219398.0, + "repeat_count": 0.0, + "routers_loss": 0.01733536459505558, + "skip_count": 0.0, + "step": 136, + "text_loss": 0.4455361068248749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6480774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.00027400000000000005, + "loss": 0.0997, + "macro_f1": 0.3333333432674408, + "num_tokens": 222430.0, + "repeat_count": 0.0, + "routers_loss": 0.01332803163677454, + "skip_count": 0.0, + "step": 138, + "text_loss": 0.47699397802352905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.657469914881127, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.333984375, + "learning_rate": 0.00027800000000000004, + "loss": 0.0922, + "macro_f1": 0.3144654333591461, + "num_tokens": 225458.0, + "repeat_count": 1.0, + "routers_loss": 0.14924728870391846, + "skip_count": 2.0, + "step": 140, + "text_loss": 0.5858222842216492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6668623422365718, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.00028199999999999997, + "loss": 0.0798, + "macro_f1": 0.3144654333591461, + "num_tokens": 229365.0, + "repeat_count": 1.0, + "routers_loss": 0.1860177218914032, + "skip_count": 2.0, + "step": 142, + "text_loss": 0.5003137588500977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6762547695920165, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.00028599999999999996, + "loss": 0.054, + "macro_f1": 0.32098764181137085, + "num_tokens": 231787.0, + "repeat_count": 1.0, + "routers_loss": 0.16498211026191711, + "skip_count": 1.0, + "step": 144, + "text_loss": 0.5026470422744751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6856471969474611, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.00029, + "loss": 0.0936, + "macro_f1": 0.32098764181137085, + "num_tokens": 235014.0, + "repeat_count": 1.0, + "routers_loss": 0.11801310628652573, + "skip_count": 1.0, + "step": 146, + "text_loss": 0.611888587474823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6950396243029058, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000294, + "loss": 0.0878, + "macro_f1": 0.3333333432674408, + "num_tokens": 238210.0, + "repeat_count": 0.0, + "routers_loss": 0.02422776259481907, + "skip_count": 0.0, + "step": 148, + "text_loss": 0.2876914143562317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7044320516583504, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.000298, + "loss": 0.0858, + "macro_f1": 0.32098764181137085, + "num_tokens": 241582.0, + "repeat_count": 0.0, + "routers_loss": 0.07282499223947525, + "skip_count": 2.0, + "step": 150, + "text_loss": 0.3919292390346527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7138244790137951, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.37890625, + "learning_rate": 0.000302, + "loss": 0.0797, + "macro_f1": 0.32098764181137085, + "num_tokens": 244621.0, + "repeat_count": 1.0, + "routers_loss": 0.20659038424491882, + "skip_count": 1.0, + "step": 152, + "text_loss": 0.4294498860836029 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7232169063692399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1787109375, + "learning_rate": 0.000306, + "loss": 0.072, + "macro_f1": 0.3333333432674408, + "num_tokens": 247833.0, + "repeat_count": 0.0, + "routers_loss": 0.02428400330245495, + "skip_count": 0.0, + "step": 154, + "text_loss": 0.5930765867233276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7326093337246845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.00031, + "loss": 0.0772, + "macro_f1": 0.3333333432674408, + "num_tokens": 251349.0, + "repeat_count": 0.0, + "routers_loss": 0.0167869683355093, + "skip_count": 0.0, + "step": 156, + "text_loss": 0.41063904762268066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7420017610801292, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.000314, + "loss": 0.0821, + "macro_f1": 0.3333333432674408, + "num_tokens": 254886.0, + "repeat_count": 0.0, + "routers_loss": 0.02531604655086994, + "skip_count": 0.0, + "step": 158, + "text_loss": 0.6739020347595215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7513941884355738, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.201171875, + "learning_rate": 0.00031800000000000003, + "loss": 0.09, + "macro_f1": 0.3333333432674408, + "num_tokens": 258260.0, + "repeat_count": 0.0, + "routers_loss": 0.017772775143384933, + "skip_count": 0.0, + "step": 160, + "text_loss": 0.46873849630355835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7607866157910185, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.224609375, + "learning_rate": 0.000322, + "loss": 0.0893, + "macro_f1": 0.3272727429866791, + "num_tokens": 261846.0, + "repeat_count": 0.0, + "routers_loss": 0.034902360290288925, + "skip_count": 1.0, + "step": 162, + "text_loss": 0.3727971017360687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7701790431464631, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000326, + "loss": 0.076, + "macro_f1": 0.3333333432674408, + "num_tokens": 264348.0, + "repeat_count": 0.0, + "routers_loss": 0.013553355820477009, + "skip_count": 0.0, + "step": 164, + "text_loss": 0.5798237323760986 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7795714705019078, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.408203125, + "learning_rate": 0.00033, + "loss": 0.0926, + "macro_f1": 0.32098764181137085, + "num_tokens": 267479.0, + "repeat_count": 1.0, + "routers_loss": 0.13571743667125702, + "skip_count": 1.0, + "step": 166, + "text_loss": 0.8084776997566223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7889638978573525, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2431640625, + "learning_rate": 0.00033400000000000004, + "loss": 0.0817, + "macro_f1": 0.32098764181137085, + "num_tokens": 270268.0, + "repeat_count": 2.0, + "routers_loss": 0.19884146749973297, + "skip_count": 0.0, + "step": 168, + "text_loss": 0.7366134524345398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7983563252127972, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.00033800000000000003, + "loss": 0.1022, + "macro_f1": 0.32098764181137085, + "num_tokens": 273518.0, + "repeat_count": 1.0, + "routers_loss": 0.15469175577163696, + "skip_count": 1.0, + "step": 170, + "text_loss": 0.27204006910324097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8077487525682419, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000342, + "loss": 0.0865, + "macro_f1": 0.32098764181137085, + "num_tokens": 277210.0, + "repeat_count": 0.0, + "routers_loss": 0.08603330701589584, + "skip_count": 2.0, + "step": 172, + "text_loss": 0.7137667536735535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8171411799236865, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000346, + "loss": 0.0902, + "macro_f1": 0.3076923191547394, + "num_tokens": 280389.0, + "repeat_count": 0.0, + "routers_loss": 0.17851492762565613, + "skip_count": 4.0, + "step": 174, + "text_loss": 0.5148105621337891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8265336072791312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.00035, + "loss": 0.0853, + "macro_f1": 0.3333333432674408, + "num_tokens": 283501.0, + "repeat_count": 0.0, + "routers_loss": 0.021331604570150375, + "skip_count": 0.0, + "step": 176, + "text_loss": 0.301013320684433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8359260346345758, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000354, + "loss": 0.0911, + "macro_f1": 0.32098764181137085, + "num_tokens": 287154.0, + "repeat_count": 0.0, + "routers_loss": 0.057273946702480316, + "skip_count": 2.0, + "step": 178, + "text_loss": 0.4740981459617615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8453184619900206, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.240234375, + "learning_rate": 0.000358, + "loss": 0.0904, + "macro_f1": 0.3272727429866791, + "num_tokens": 289929.0, + "repeat_count": 0.0, + "routers_loss": 0.04116598889231682, + "skip_count": 1.0, + "step": 180, + "text_loss": 0.4838573932647705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8547108893454652, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.14453125, + "learning_rate": 0.000362, + "loss": 0.0991, + "macro_f1": 0.3333333432674408, + "num_tokens": 294293.0, + "repeat_count": 0.0, + "routers_loss": 0.027111956849694252, + "skip_count": 0.0, + "step": 182, + "text_loss": 0.7495553493499756 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8641033167009099, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.158203125, + "learning_rate": 0.000366, + "loss": 0.1038, + "macro_f1": 0.3333333432674408, + "num_tokens": 297730.0, + "repeat_count": 0.0, + "routers_loss": 0.019166452810168266, + "skip_count": 0.0, + "step": 184, + "text_loss": 0.534831166267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 0.8734957440563546, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2236328125, + "learning_rate": 0.00037, + "loss": 0.0784, + "macro_f1": 0.5427350401878357, + "num_tokens": 300593.0, + "repeat_count": 1.0, + "routers_loss": 0.2349659502506256, + "skip_count": 2.0, + "step": 186, + "text_loss": 0.3549048602581024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8828881714117992, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2041015625, + "learning_rate": 0.000374, + "loss": 0.0827, + "macro_f1": 0.3076923191547394, + "num_tokens": 303456.0, + "repeat_count": 2.0, + "routers_loss": 0.22502389550209045, + "skip_count": 2.0, + "step": 188, + "text_loss": 0.8837642073631287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8922805987672439, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000378, + "loss": 0.1085, + "macro_f1": 0.3272727429866791, + "num_tokens": 306241.0, + "repeat_count": 1.0, + "routers_loss": 0.12291611731052399, + "skip_count": 0.0, + "step": 190, + "text_loss": 0.73353511095047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9016730261226886, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000382, + "loss": 0.0969, + "macro_f1": 0.3272727429866791, + "num_tokens": 310606.0, + "repeat_count": 0.0, + "routers_loss": 0.055988848209381104, + "skip_count": 1.0, + "step": 192, + "text_loss": 0.6261917352676392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9110654534781333, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.34375, + "learning_rate": 0.000386, + "loss": 0.1055, + "macro_f1": 0.3144654333591461, + "num_tokens": 313564.0, + "repeat_count": 0.0, + "routers_loss": 0.12363404780626297, + "skip_count": 3.0, + "step": 194, + "text_loss": 0.2790874242782593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9204578808335779, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.00039000000000000005, + "loss": 0.0964, + "macro_f1": 0.3076923191547394, + "num_tokens": 316958.0, + "repeat_count": 2.0, + "routers_loss": 0.2718356251716614, + "skip_count": 2.0, + "step": 196, + "text_loss": 0.14428086578845978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9298503081890226, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2021484375, + "learning_rate": 0.00039400000000000004, + "loss": 0.0917, + "macro_f1": 0.32098764181137085, + "num_tokens": 320103.0, + "repeat_count": 0.0, + "routers_loss": 0.07188102602958679, + "skip_count": 2.0, + "step": 198, + "text_loss": 0.27155816555023193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9392427355444672, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.201171875, + "learning_rate": 0.000398, + "loss": 0.0809, + "macro_f1": 0.32098764181137085, + "num_tokens": 323566.0, + "repeat_count": 1.0, + "routers_loss": 0.18038256466388702, + "skip_count": 1.0, + "step": 200, + "text_loss": 0.8453494310379028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9486351628999119, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.000402, + "loss": 0.0801, + "macro_f1": 0.3333333432674408, + "num_tokens": 326385.0, + "repeat_count": 0.0, + "routers_loss": 0.014639763161540031, + "skip_count": 0.0, + "step": 202, + "text_loss": 0.5733131766319275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9580275902553567, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21875, + "learning_rate": 0.00040600000000000006, + "loss": 0.104, + "macro_f1": 0.3333333432674408, + "num_tokens": 329266.0, + "repeat_count": 0.0, + "routers_loss": 0.015269627794623375, + "skip_count": 0.0, + "step": 204, + "text_loss": 0.7355639934539795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9674200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.00041, + "loss": 0.0833, + "macro_f1": 0.3333333432674408, + "num_tokens": 332984.0, + "repeat_count": 0.0, + "routers_loss": 0.018046971410512924, + "skip_count": 0.0, + "step": 206, + "text_loss": 0.587641179561615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.000414, + "loss": 0.0588, + "macro_f1": 0.3272727429866791, + "num_tokens": 335739.0, + "repeat_count": 1.0, + "routers_loss": 0.12791286408901215, + "skip_count": 0.0, + "step": 208, + "text_loss": 0.6538406610488892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9862048723216906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.24609375, + "learning_rate": 0.00041799999999999997, + "loss": 0.0732, + "macro_f1": 0.3272727429866791, + "num_tokens": 338966.0, + "repeat_count": 0.0, + "routers_loss": 0.050490595400333405, + "skip_count": 1.0, + "step": 210, + "text_loss": 0.4188295602798462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9955972996771353, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000422, + "loss": 0.0588, + "macro_f1": 0.3144654333591461, + "num_tokens": 342063.0, + "repeat_count": 0.0, + "routers_loss": 0.11652113497257233, + "skip_count": 3.0, + "step": 212, + "text_loss": 0.21822240948677063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0046962136777224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.000426, + "loss": 0.0621, + "macro_f1": 0.3333333432674408, + "num_tokens": 344887.0, + "repeat_count": 0.0, + "routers_loss": 0.023898238316178322, + "skip_count": 0.0, + "step": 214, + "text_loss": 0.24692800641059875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.014088641033167, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3671875, + "learning_rate": 0.00043, + "loss": 0.1005, + "macro_f1": 0.3272727429866791, + "num_tokens": 348700.0, + "repeat_count": 1.0, + "routers_loss": 0.06414655596017838, + "skip_count": 0.0, + "step": 216, + "text_loss": 0.4744548797607422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0234810683886117, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.00043400000000000003, + "loss": 0.0753, + "macro_f1": 0.32098764181137085, + "num_tokens": 351507.0, + "repeat_count": 1.0, + "routers_loss": 0.11702914535999298, + "skip_count": 1.0, + "step": 218, + "text_loss": 0.5614864826202393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0328734957440564, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000438, + "loss": 0.0792, + "macro_f1": 0.3333333432674408, + "num_tokens": 354484.0, + "repeat_count": 0.0, + "routers_loss": 0.014991643838584423, + "skip_count": 0.0, + "step": 220, + "text_loss": 0.47209832072257996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.042265923099501, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.251953125, + "learning_rate": 0.000442, + "loss": 0.106, + "macro_f1": 0.3272727429866791, + "num_tokens": 357954.0, + "repeat_count": 0.0, + "routers_loss": 0.04747112840414047, + "skip_count": 1.0, + "step": 222, + "text_loss": 0.2968728244304657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0516583504549457, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.40234375, + "learning_rate": 0.000446, + "loss": 0.0853, + "macro_f1": 0.32098764181137085, + "num_tokens": 360547.0, + "repeat_count": 0.0, + "routers_loss": 0.06754162162542343, + "skip_count": 2.0, + "step": 224, + "text_loss": 0.2364148646593094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0610507778103904, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.00045000000000000004, + "loss": 0.1016, + "macro_f1": 0.3272727429866791, + "num_tokens": 364529.0, + "repeat_count": 0.0, + "routers_loss": 0.07830183953046799, + "skip_count": 1.0, + "step": 226, + "text_loss": 0.4787476360797882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.00045400000000000003, + "loss": 0.0792, + "macro_f1": 0.3333333432674408, + "num_tokens": 367683.0, + "repeat_count": 0.0, + "routers_loss": 0.015735948458313942, + "skip_count": 0.0, + "step": 228, + "text_loss": 0.37148505449295044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.000458, + "loss": 0.0995, + "macro_f1": 0.3333333432674408, + "num_tokens": 371402.0, + "repeat_count": 0.0, + "routers_loss": 0.013354359194636345, + "skip_count": 0.0, + "step": 230, + "text_loss": 0.7464763522148132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.000462, + "loss": 0.0731, + "macro_f1": 0.3333333432674408, + "num_tokens": 374587.0, + "repeat_count": 0.0, + "routers_loss": 0.013763721100986004, + "skip_count": 0.0, + "step": 232, + "text_loss": 0.8754443526268005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.098620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3984375, + "learning_rate": 0.00046600000000000005, + "loss": 0.0861, + "macro_f1": 0.3333333432674408, + "num_tokens": 377513.0, + "repeat_count": 0.0, + "routers_loss": 0.010075435042381287, + "skip_count": 0.0, + "step": 234, + "text_loss": 0.31534913182258606 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1080129145876136, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.00047, + "loss": 0.0791, + "macro_f1": 0.3272727429866791, + "num_tokens": 380736.0, + "repeat_count": 0.0, + "routers_loss": 0.059825167059898376, + "skip_count": 1.0, + "step": 236, + "text_loss": 0.5936337113380432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1174053419430585, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000474, + "loss": 0.0514, + "macro_f1": 0.32098764181137085, + "num_tokens": 383236.0, + "repeat_count": 0.0, + "routers_loss": 0.09134846180677414, + "skip_count": 2.0, + "step": 238, + "text_loss": 0.5976157784461975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1267977692985032, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.208984375, + "learning_rate": 0.00047799999999999996, + "loss": 0.0858, + "macro_f1": 0.32098764181137085, + "num_tokens": 385778.0, + "repeat_count": 1.0, + "routers_loss": 0.11989791691303253, + "skip_count": 1.0, + "step": 240, + "text_loss": 0.3554210960865021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1361901966539478, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.000482, + "loss": 0.0734, + "macro_f1": 0.3333333432674408, + "num_tokens": 388777.0, + "repeat_count": 0.0, + "routers_loss": 0.013591105118393898, + "skip_count": 0.0, + "step": 242, + "text_loss": 0.4829460382461548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1455826240093925, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12060546875, + "learning_rate": 0.000486, + "loss": 0.0625, + "macro_f1": 0.32098764181137085, + "num_tokens": 391797.0, + "repeat_count": 0.0, + "routers_loss": 0.0920003354549408, + "skip_count": 2.0, + "step": 244, + "text_loss": 0.3085818886756897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1549750513648371, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.00049, + "loss": 0.0501, + "macro_f1": 0.3333333432674408, + "num_tokens": 396485.0, + "repeat_count": 0.0, + "routers_loss": 0.0129330949857831, + "skip_count": 0.0, + "step": 246, + "text_loss": 0.42803969979286194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1643674787202818, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.296875, + "learning_rate": 0.000494, + "loss": 0.0945, + "macro_f1": 0.3144654333591461, + "num_tokens": 399923.0, + "repeat_count": 0.0, + "routers_loss": 0.10677755624055862, + "skip_count": 3.0, + "step": 248, + "text_loss": 0.2908555567264557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1737599060757264, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.000498, + "loss": 0.0812, + "macro_f1": 0.3144654333591461, + "num_tokens": 403647.0, + "repeat_count": 0.0, + "routers_loss": 0.1504337340593338, + "skip_count": 3.0, + "step": 250, + "text_loss": 0.333095908164978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.183152333431171, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.22265625, + "learning_rate": 0.0005020000000000001, + "loss": 0.0828, + "macro_f1": 0.32098764181137085, + "num_tokens": 409147.0, + "repeat_count": 0.0, + "routers_loss": 0.06503184884786606, + "skip_count": 2.0, + "step": 252, + "text_loss": 0.16117942333221436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1925447607866158, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.287109375, + "learning_rate": 0.000506, + "loss": 0.0995, + "macro_f1": 0.3333333432674408, + "num_tokens": 412072.0, + "repeat_count": 0.0, + "routers_loss": 0.016280122101306915, + "skip_count": 0.0, + "step": 254, + "text_loss": 0.4217492640018463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2019371881420604, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.00051, + "loss": 0.0803, + "macro_f1": 0.3144654333591461, + "num_tokens": 415052.0, + "repeat_count": 2.0, + "routers_loss": 0.2117508500814438, + "skip_count": 1.0, + "step": 256, + "text_loss": 0.5795308947563171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.211329615497505, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2421875, + "learning_rate": 0.000514, + "loss": 0.0668, + "macro_f1": 0.3272727429866791, + "num_tokens": 418099.0, + "repeat_count": 1.0, + "routers_loss": 0.15002092719078064, + "skip_count": 0.0, + "step": 258, + "text_loss": 0.4840938448905945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2207220428529497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.000518, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 422526.0, + "repeat_count": 0.0, + "routers_loss": 0.012834074907004833, + "skip_count": 0.0, + "step": 260, + "text_loss": 0.36141225695610046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2301144702083944, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.000522, + "loss": 0.085, + "macro_f1": 0.3076923191547394, + "num_tokens": 425765.0, + "repeat_count": 2.0, + "routers_loss": 0.23808011412620544, + "skip_count": 2.0, + "step": 262, + "text_loss": 0.27572691440582275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2395068975638392, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000526, + "loss": 0.0708, + "macro_f1": 0.3272727429866791, + "num_tokens": 429048.0, + "repeat_count": 0.0, + "routers_loss": 0.055687375366687775, + "skip_count": 1.0, + "step": 264, + "text_loss": 0.37020301818847656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.248899324919284, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005300000000000001, + "loss": 0.0839, + "macro_f1": 0.3272727429866791, + "num_tokens": 431784.0, + "repeat_count": 0.0, + "routers_loss": 0.0872957780957222, + "skip_count": 1.0, + "step": 266, + "text_loss": 0.5937283039093018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2582917522747286, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.0005340000000000001, + "loss": 0.0733, + "macro_f1": 0.32098764181137085, + "num_tokens": 434297.0, + "repeat_count": 2.0, + "routers_loss": 0.23507654666900635, + "skip_count": 0.0, + "step": 268, + "text_loss": 0.3367372453212738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2676841796301732, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005380000000000001, + "loss": 0.0708, + "macro_f1": 0.32098764181137085, + "num_tokens": 437586.0, + "repeat_count": 0.0, + "routers_loss": 0.12860390543937683, + "skip_count": 2.0, + "step": 270, + "text_loss": 0.7149854302406311 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2770766069856179, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005420000000000001, + "loss": 0.1072, + "macro_f1": 0.3272727429866791, + "num_tokens": 440649.0, + "repeat_count": 0.0, + "routers_loss": 0.044308312237262726, + "skip_count": 1.0, + "step": 272, + "text_loss": 0.26778292655944824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2864690343410625, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.44921875, + "learning_rate": 0.000546, + "loss": 0.0938, + "macro_f1": 0.3144654333591461, + "num_tokens": 443907.0, + "repeat_count": 0.0, + "routers_loss": 0.11514109373092651, + "skip_count": 3.0, + "step": 274, + "text_loss": 0.23578761518001556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 1.2958614616965072, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2578125, + "learning_rate": 0.00055, + "loss": 0.0932, + "macro_f1": 0.5492662787437439, + "num_tokens": 447147.0, + "repeat_count": 0.0, + "routers_loss": 0.055705297738313675, + "skip_count": 2.0, + "step": 276, + "text_loss": 0.2513524889945984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3052538890519518, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.29296875, + "learning_rate": 0.000554, + "loss": 0.0667, + "macro_f1": 0.32098764181137085, + "num_tokens": 450032.0, + "repeat_count": 0.0, + "routers_loss": 0.13778971135616302, + "skip_count": 2.0, + "step": 278, + "text_loss": 0.4857243597507477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3146463164073965, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.000558, + "loss": 0.0672, + "macro_f1": 0.3272727429866791, + "num_tokens": 453195.0, + "repeat_count": 1.0, + "routers_loss": 0.0700262188911438, + "skip_count": 0.0, + "step": 280, + "text_loss": 0.7589789628982544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3240387437628411, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.0005620000000000001, + "loss": 0.0603, + "macro_f1": 0.3144654333591461, + "num_tokens": 455942.0, + "repeat_count": 1.0, + "routers_loss": 0.11706235259771347, + "skip_count": 2.0, + "step": 282, + "text_loss": 0.4783432185649872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3334311711182858, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.265625, + "learning_rate": 0.000566, + "loss": 0.0793, + "macro_f1": 0.3272727429866791, + "num_tokens": 458932.0, + "repeat_count": 0.0, + "routers_loss": 0.07073967158794403, + "skip_count": 1.0, + "step": 284, + "text_loss": 0.7117193937301636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3428235984737307, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.00057, + "loss": 0.0915, + "macro_f1": 0.3272727429866791, + "num_tokens": 462650.0, + "repeat_count": 0.0, + "routers_loss": 0.05301115661859512, + "skip_count": 1.0, + "step": 286, + "text_loss": 0.4175460636615753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.352216025829175, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000574, + "loss": 0.0675, + "macro_f1": 0.3272727429866791, + "num_tokens": 466290.0, + "repeat_count": 0.0, + "routers_loss": 0.06356479972600937, + "skip_count": 1.0, + "step": 288, + "text_loss": 0.5832946300506592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.36160845318462, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28515625, + "learning_rate": 0.000578, + "loss": 0.0805, + "macro_f1": 0.3006536066532135, + "num_tokens": 469296.0, + "repeat_count": 1.0, + "routers_loss": 0.21032999455928802, + "skip_count": 3.0, + "step": 290, + "text_loss": 0.36023473739624023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3710008805400646, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.0005819999999999999, + "loss": 0.0685, + "macro_f1": 0.32098764181137085, + "num_tokens": 472272.0, + "repeat_count": 1.0, + "routers_loss": 0.08062280714511871, + "skip_count": 1.0, + "step": 292, + "text_loss": 0.37197956442832947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3803933078955093, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0005859999999999999, + "loss": 0.0878, + "macro_f1": 0.32098764181137085, + "num_tokens": 475864.0, + "repeat_count": 0.0, + "routers_loss": 0.05023600533604622, + "skip_count": 2.0, + "step": 294, + "text_loss": 0.4765273630619049 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.389785735250954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2177734375, + "learning_rate": 0.00059, + "loss": 0.0728, + "macro_f1": 0.3333333432674408, + "num_tokens": 478916.0, + "repeat_count": 0.0, + "routers_loss": 0.011689410544931889, + "skip_count": 0.0, + "step": 296, + "text_loss": 0.5878773927688599 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3991781626063986, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000594, + "loss": 0.0727, + "macro_f1": 0.3333333432674408, + "num_tokens": 482369.0, + "repeat_count": 0.0, + "routers_loss": 0.010772093199193478, + "skip_count": 0.0, + "step": 298, + "text_loss": 0.4424116313457489 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4085705899618433, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.181640625, + "learning_rate": 0.000598, + "loss": 0.0787, + "macro_f1": 0.3076923191547394, + "num_tokens": 486049.0, + "repeat_count": 2.0, + "routers_loss": 0.23482851684093475, + "skip_count": 2.0, + "step": 300, + "text_loss": 0.21217775344848633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.417963017317288, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.000602, + "loss": 0.073, + "macro_f1": 0.3076923191547394, + "num_tokens": 488683.0, + "repeat_count": 1.0, + "routers_loss": 0.18843084573745728, + "skip_count": 3.0, + "step": 302, + "text_loss": 0.2109498232603073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4273554446727326, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.279296875, + "learning_rate": 0.000606, + "loss": 0.0945, + "macro_f1": 0.3144654333591461, + "num_tokens": 492010.0, + "repeat_count": 0.0, + "routers_loss": 0.17861786484718323, + "skip_count": 3.0, + "step": 304, + "text_loss": 0.8446305394172668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4367478720281772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.00061, + "loss": 0.0827, + "macro_f1": 0.3333333432674408, + "num_tokens": 494764.0, + "repeat_count": 0.0, + "routers_loss": 0.014124520123004913, + "skip_count": 0.0, + "step": 306, + "text_loss": 0.742735743522644 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4461402993836219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.000614, + "loss": 0.1071, + "macro_f1": 0.3333333432674408, + "num_tokens": 497820.0, + "repeat_count": 0.0, + "routers_loss": 0.017968112602829933, + "skip_count": 0.0, + "step": 308, + "text_loss": 0.28305482864379883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4555327267390665, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1689453125, + "learning_rate": 0.0006180000000000001, + "loss": 0.0775, + "macro_f1": 0.32098764181137085, + "num_tokens": 500694.0, + "repeat_count": 0.0, + "routers_loss": 0.08593655377626419, + "skip_count": 2.0, + "step": 310, + "text_loss": 0.3496848940849304 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4649251540945114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19140625, + "learning_rate": 0.000622, + "loss": 0.061, + "macro_f1": 0.3333333432674408, + "num_tokens": 503871.0, + "repeat_count": 0.0, + "routers_loss": 0.016449492424726486, + "skip_count": 0.0, + "step": 312, + "text_loss": 0.6691372990608215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4743175814499558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.000626, + "loss": 0.0815, + "macro_f1": 0.3333333432674408, + "num_tokens": 506730.0, + "repeat_count": 0.0, + "routers_loss": 0.014532964676618576, + "skip_count": 0.0, + "step": 314, + "text_loss": 0.6118118166923523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4837100088054007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2216796875, + "learning_rate": 0.00063, + "loss": 0.0742, + "macro_f1": 0.3333333432674408, + "num_tokens": 510323.0, + "repeat_count": 0.0, + "routers_loss": 0.013093139044940472, + "skip_count": 0.0, + "step": 316, + "text_loss": 0.38126271963119507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4931024361608454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.400390625, + "learning_rate": 0.000634, + "loss": 0.0915, + "macro_f1": 0.3333333432674408, + "num_tokens": 514075.0, + "repeat_count": 0.0, + "routers_loss": 0.008627045899629593, + "skip_count": 0.0, + "step": 318, + "text_loss": 0.5983037948608398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000638, + "loss": 0.1008, + "macro_f1": 0.3272727429866791, + "num_tokens": 517418.0, + "repeat_count": 0.0, + "routers_loss": 0.04561378434300423, + "skip_count": 1.0, + "step": 320, + "text_loss": 0.767257034778595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.5118872908717347, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.259765625, + "learning_rate": 0.000642, + "loss": 0.0926, + "macro_f1": 0.3272727429866791, + "num_tokens": 520443.0, + "repeat_count": 0.0, + "routers_loss": 0.024372953921556473, + "skip_count": 0.0, + "step": 322, + "text_loss": 0.6572105884552002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5212797182271793, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30078125, + "learning_rate": 0.000646, + "loss": 0.0822, + "macro_f1": 0.3272727429866791, + "num_tokens": 523317.0, + "repeat_count": 1.0, + "routers_loss": 0.08099937438964844, + "skip_count": 0.0, + "step": 324, + "text_loss": 0.205499529838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.530672145582624, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.0006500000000000001, + "loss": 0.0809, + "macro_f1": 0.32098767161369324, + "num_tokens": 526355.0, + "repeat_count": 0.0, + "routers_loss": 0.0657225176692009, + "skip_count": 1.0, + "step": 326, + "text_loss": 0.2587239742279053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5400645729380686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.111328125, + "learning_rate": 0.0006540000000000001, + "loss": 0.0779, + "macro_f1": 0.3333333432674408, + "num_tokens": 529689.0, + "repeat_count": 0.0, + "routers_loss": 0.01849208027124405, + "skip_count": 0.0, + "step": 328, + "text_loss": 0.2172023057937622 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5494570002935135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1845703125, + "learning_rate": 0.0006580000000000001, + "loss": 0.0758, + "macro_f1": 0.3333333432674408, + "num_tokens": 532603.0, + "repeat_count": 0.0, + "routers_loss": 0.016184113919734955, + "skip_count": 0.0, + "step": 330, + "text_loss": 0.5980568528175354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.000662, + "loss": 0.0439, + "macro_f1": 0.3333333432674408, + "num_tokens": 536056.0, + "repeat_count": 0.0, + "routers_loss": 0.01303898449987173, + "skip_count": 0.0, + "step": 332, + "text_loss": 0.5421966314315796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 1.5682418550044028, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.296875, + "learning_rate": 0.000666, + "loss": 0.0963, + "macro_f1": 0.465986430644989, + "num_tokens": 539231.0, + "repeat_count": 3.0, + "routers_loss": 0.3075675964355469, + "skip_count": 3.0, + "step": 334, + "text_loss": 0.19719554483890533 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5776342823598473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.00067, + "loss": 0.0706, + "macro_f1": 0.3333333432674408, + "num_tokens": 542038.0, + "repeat_count": 0.0, + "routers_loss": 0.009116224013268948, + "skip_count": 0.0, + "step": 336, + "text_loss": 0.3407036066055298 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5870267097152921, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2421875, + "learning_rate": 0.000674, + "loss": 0.0768, + "macro_f1": 0.3333333432674408, + "num_tokens": 545019.0, + "repeat_count": 0.0, + "routers_loss": 0.021463042125105858, + "skip_count": 0.0, + "step": 338, + "text_loss": 0.24486012756824493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5964191370707366, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.0006780000000000001, + "loss": 0.0889, + "macro_f1": 0.3333333432674408, + "num_tokens": 548036.0, + "repeat_count": 0.0, + "routers_loss": 0.01857556402683258, + "skip_count": 0.0, + "step": 340, + "text_loss": 0.28140124678611755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6058115644261814, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0006820000000000001, + "loss": 0.0617, + "macro_f1": 0.3006536364555359, + "num_tokens": 551419.0, + "repeat_count": 2.0, + "routers_loss": 0.27090007066726685, + "skip_count": 3.0, + "step": 342, + "text_loss": 0.20690307021141052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.615203991781626, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3046875, + "learning_rate": 0.0006860000000000001, + "loss": 0.1047, + "macro_f1": 0.32098764181137085, + "num_tokens": 554037.0, + "repeat_count": 0.0, + "routers_loss": 0.09231195598840714, + "skip_count": 2.0, + "step": 344, + "text_loss": 0.4479128420352936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6245964191370708, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.00069, + "loss": 0.0883, + "macro_f1": 0.3333333432674408, + "num_tokens": 556672.0, + "repeat_count": 0.0, + "routers_loss": 0.00935924518853426, + "skip_count": 0.0, + "step": 346, + "text_loss": 0.6377320289611816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6339888464925154, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.000694, + "loss": 0.0781, + "macro_f1": 0.32098764181137085, + "num_tokens": 559756.0, + "repeat_count": 0.0, + "routers_loss": 0.17641772329807281, + "skip_count": 2.0, + "step": 348, + "text_loss": 0.6097636222839355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 1.64338127384796, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.30078125, + "learning_rate": 0.0006979999999999999, + "loss": 0.0616, + "macro_f1": 0.5492662787437439, + "num_tokens": 563415.0, + "repeat_count": 0.0, + "routers_loss": 0.06240406632423401, + "skip_count": 2.0, + "step": 350, + "text_loss": 0.5291631817817688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6527737012034047, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.296875, + "learning_rate": 0.0007019999999999999, + "loss": 0.1026, + "macro_f1": 0.3333333432674408, + "num_tokens": 566357.0, + "repeat_count": 0.0, + "routers_loss": 0.012269247323274612, + "skip_count": 0.0, + "step": 352, + "text_loss": 0.5170195698738098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6621661285588494, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0007059999999999999, + "loss": 0.0815, + "macro_f1": 0.32098764181137085, + "num_tokens": 569449.0, + "repeat_count": 0.0, + "routers_loss": 0.07515309751033783, + "skip_count": 2.0, + "step": 354, + "text_loss": 0.34507250785827637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6715585559142943, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.00071, + "loss": 0.0791, + "macro_f1": 0.3144654333591461, + "num_tokens": 572761.0, + "repeat_count": 1.0, + "routers_loss": 0.20768006145954132, + "skip_count": 2.0, + "step": 356, + "text_loss": 0.3158532381057739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6809509832697387, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.000714, + "loss": 0.0682, + "macro_f1": 0.3333333432674408, + "num_tokens": 575909.0, + "repeat_count": 0.0, + "routers_loss": 0.025329967960715294, + "skip_count": 0.0, + "step": 358, + "text_loss": 0.21455390751361847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.6903434106251836, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.000718, + "loss": 0.0775, + "macro_f1": 0.32098767161369324, + "num_tokens": 579186.0, + "repeat_count": 1.0, + "routers_loss": 0.07676175981760025, + "skip_count": 0.0, + "step": 360, + "text_loss": 0.61895352602005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.699735837980628, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000722, + "loss": 0.0781, + "macro_f1": 0.32098767161369324, + "num_tokens": 582437.0, + "repeat_count": 0.0, + "routers_loss": 0.08070661872625351, + "skip_count": 1.0, + "step": 362, + "text_loss": 0.20557661354541779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7091282653360729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2216796875, + "learning_rate": 0.000726, + "loss": 0.11, + "macro_f1": 0.3333333432674408, + "num_tokens": 586096.0, + "repeat_count": 0.0, + "routers_loss": 0.015891313552856445, + "skip_count": 0.0, + "step": 364, + "text_loss": 0.597991943359375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7185206926915173, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.00073, + "loss": 0.0573, + "macro_f1": 0.3076923191547394, + "num_tokens": 589520.0, + "repeat_count": 1.0, + "routers_loss": 0.12844261527061462, + "skip_count": 3.0, + "step": 366, + "text_loss": 0.2944789230823517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7279131200469622, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.000734, + "loss": 0.1005, + "macro_f1": 0.3333333432674408, + "num_tokens": 592691.0, + "repeat_count": 0.0, + "routers_loss": 0.02382199838757515, + "skip_count": 0.0, + "step": 368, + "text_loss": 0.23989969491958618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7373055474024068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.000738, + "loss": 0.0661, + "macro_f1": 0.3333333432674408, + "num_tokens": 596004.0, + "repeat_count": 0.0, + "routers_loss": 0.018812084570527077, + "skip_count": 0.0, + "step": 370, + "text_loss": 0.22111408412456512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7466979747578515, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.000742, + "loss": 0.0666, + "macro_f1": 0.3272727429866791, + "num_tokens": 599087.0, + "repeat_count": 0.0, + "routers_loss": 0.08290331065654755, + "skip_count": 1.0, + "step": 372, + "text_loss": 0.2567356526851654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7560904021132961, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.000746, + "loss": 0.0941, + "macro_f1": 0.32098764181137085, + "num_tokens": 602330.0, + "repeat_count": 1.0, + "routers_loss": 0.11482042074203491, + "skip_count": 1.0, + "step": 374, + "text_loss": 0.7217292785644531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7654828294687408, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2265625, + "learning_rate": 0.00075, + "loss": 0.0728, + "macro_f1": 0.3272727429866791, + "num_tokens": 605503.0, + "repeat_count": 1.0, + "routers_loss": 0.11849870532751083, + "skip_count": 0.0, + "step": 376, + "text_loss": 0.5122153759002686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.7748752568241855, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2333984375, + "learning_rate": 0.000754, + "loss": 0.0835, + "macro_f1": 0.32098767161369324, + "num_tokens": 608505.0, + "repeat_count": 0.0, + "routers_loss": 0.07090992480516434, + "skip_count": 1.0, + "step": 378, + "text_loss": 0.2204965502023697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.78426768417963, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.000758, + "loss": 0.0794, + "macro_f1": 0.3272727429866791, + "num_tokens": 611193.0, + "repeat_count": 0.0, + "routers_loss": 0.03812089189887047, + "skip_count": 1.0, + "step": 380, + "text_loss": 0.44909021258354187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.793660111535075, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1689453125, + "learning_rate": 0.000762, + "loss": 0.0882, + "macro_f1": 0.3272727429866791, + "num_tokens": 614231.0, + "repeat_count": 1.0, + "routers_loss": 0.10270529240369797, + "skip_count": 0.0, + "step": 382, + "text_loss": 0.13624964654445648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8030525388905194, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.330078125, + "learning_rate": 0.0007660000000000001, + "loss": 0.1107, + "macro_f1": 0.32098764181137085, + "num_tokens": 617090.0, + "repeat_count": 1.0, + "routers_loss": 0.11624004691839218, + "skip_count": 1.0, + "step": 384, + "text_loss": 0.7314052581787109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8124449662459643, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0007700000000000001, + "loss": 0.0628, + "macro_f1": 0.32098764181137085, + "num_tokens": 620596.0, + "repeat_count": 0.0, + "routers_loss": 0.07114322483539581, + "skip_count": 2.0, + "step": 386, + "text_loss": 0.503322958946228 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8218373936014087, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.0007740000000000001, + "loss": 0.0829, + "macro_f1": 0.32098764181137085, + "num_tokens": 624108.0, + "repeat_count": 0.0, + "routers_loss": 0.06061873584985733, + "skip_count": 2.0, + "step": 388, + "text_loss": 0.11481904983520508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8312298209568536, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2099609375, + "learning_rate": 0.000778, + "loss": 0.0791, + "macro_f1": 0.3006536364555359, + "num_tokens": 626895.0, + "repeat_count": 1.0, + "routers_loss": 0.2921771705150604, + "skip_count": 4.0, + "step": 390, + "text_loss": 0.3069624602794647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8406222483122983, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30859375, + "learning_rate": 0.000782, + "loss": 0.0605, + "macro_f1": 0.3076923191547394, + "num_tokens": 630204.0, + "repeat_count": 0.0, + "routers_loss": 0.202707901597023, + "skip_count": 4.0, + "step": 392, + "text_loss": 0.6022785305976868 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.29296875, + "learning_rate": 0.000786, + "loss": 0.0877, + "macro_f1": 0.3333333432674408, + "num_tokens": 634373.0, + "repeat_count": 0.0, + "routers_loss": 0.0221510399132967, + "skip_count": 0.0, + "step": 394, + "text_loss": 0.26787394285202026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8594071030231876, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.37890625, + "learning_rate": 0.00079, + "loss": 0.0805, + "macro_f1": 0.32098764181137085, + "num_tokens": 637442.0, + "repeat_count": 2.0, + "routers_loss": 0.12636390328407288, + "skip_count": 0.0, + "step": 396, + "text_loss": 0.2799781560897827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8687995303786322, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.0007940000000000001, + "loss": 0.0724, + "macro_f1": 0.32098764181137085, + "num_tokens": 641231.0, + "repeat_count": 0.0, + "routers_loss": 0.07933453470468521, + "skip_count": 2.0, + "step": 398, + "text_loss": 0.2507784366607666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8781919577340769, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.0007980000000000001, + "loss": 0.0909, + "macro_f1": 0.3272727429866791, + "num_tokens": 644560.0, + "repeat_count": 1.0, + "routers_loss": 0.10324911028146744, + "skip_count": 0.0, + "step": 400, + "text_loss": 0.7756280303001404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8875843850895215, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0008020000000000001, + "loss": 0.0783, + "macro_f1": 0.3144654333591461, + "num_tokens": 647393.0, + "repeat_count": 1.0, + "routers_loss": 0.18546262383460999, + "skip_count": 2.0, + "step": 402, + "text_loss": 0.5013328194618225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8969768124449664, + "f1_execute": 0.8571428656578064, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0008060000000000001, + "loss": 0.0787, + "macro_f1": 0.2857142984867096, + "num_tokens": 650355.0, + "repeat_count": 3.0, + "routers_loss": 0.3280293643474579, + "skip_count": 4.0, + "step": 404, + "text_loss": 0.2842077314853668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9063692398004108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.0008100000000000001, + "loss": 0.0901, + "macro_f1": 0.3333333432674408, + "num_tokens": 654280.0, + "repeat_count": 0.0, + "routers_loss": 0.02623247355222702, + "skip_count": 0.0, + "step": 406, + "text_loss": 0.46742817759513855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9157616671558557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.0008139999999999999, + "loss": 0.0945, + "macro_f1": 0.3333333432674408, + "num_tokens": 657568.0, + "repeat_count": 0.0, + "routers_loss": 0.009744114242494106, + "skip_count": 0.0, + "step": 408, + "text_loss": 0.7168047428131104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9251540945113002, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.0008179999999999999, + "loss": 0.1065, + "macro_f1": 0.32098764181137085, + "num_tokens": 660593.0, + "repeat_count": 0.0, + "routers_loss": 0.07591600716114044, + "skip_count": 2.0, + "step": 410, + "text_loss": 0.449823260307312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0008219999999999999, + "loss": 0.0795, + "macro_f1": 0.3333333432674408, + "num_tokens": 663916.0, + "repeat_count": 0.0, + "routers_loss": 0.02076602540910244, + "skip_count": 0.0, + "step": 412, + "text_loss": 0.4764713943004608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9439389492221895, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.000826, + "loss": 0.0836, + "macro_f1": 0.3272727429866791, + "num_tokens": 667502.0, + "repeat_count": 0.0, + "routers_loss": 0.049170155078172684, + "skip_count": 1.0, + "step": 414, + "text_loss": 0.30333325266838074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9533313765776343, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.00083, + "loss": 0.1021, + "macro_f1": 0.3272727429866791, + "num_tokens": 670510.0, + "repeat_count": 1.0, + "routers_loss": 0.15554003417491913, + "skip_count": 0.0, + "step": 416, + "text_loss": 0.3691870868206024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000834, + "loss": 0.1013, + "macro_f1": 0.3333333432674408, + "num_tokens": 674761.0, + "repeat_count": 0.0, + "routers_loss": 0.024516675621271133, + "skip_count": 0.0, + "step": 418, + "text_loss": 0.32850381731987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9721162312885236, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.000838, + "loss": 0.0649, + "macro_f1": 0.3333333432674408, + "num_tokens": 678055.0, + "repeat_count": 0.0, + "routers_loss": 0.011026890948414803, + "skip_count": 0.0, + "step": 420, + "text_loss": 0.6637290716171265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9815086586439683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000842, + "loss": 0.0771, + "macro_f1": 0.3272727429866791, + "num_tokens": 680979.0, + "repeat_count": 0.0, + "routers_loss": 0.07451887428760529, + "skip_count": 1.0, + "step": 422, + "text_loss": 0.27131685614585876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.990901085999413, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000846, + "loss": 0.0714, + "macro_f1": 0.32098764181137085, + "num_tokens": 684144.0, + "repeat_count": 1.0, + "routers_loss": 0.11341800540685654, + "skip_count": 1.0, + "step": 424, + "text_loss": 0.652126669883728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.00085, + "loss": 0.0754, + "macro_f1": 0.3272727429866791, + "num_tokens": 687004.0, + "repeat_count": 1.0, + "routers_loss": 0.08985847979784012, + "skip_count": 0.0, + "step": 426, + "text_loss": 0.2589428424835205 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23828125, + "learning_rate": 0.000854, + "loss": 0.0866, + "macro_f1": 0.3333333432674408, + "num_tokens": 689702.0, + "repeat_count": 0.0, + "routers_loss": 0.011355436407029629, + "skip_count": 0.0, + "step": 428, + "text_loss": 0.8909716010093689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0187848547108893, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.000858, + "loss": 0.0623, + "macro_f1": 0.3333333432674408, + "num_tokens": 692698.0, + "repeat_count": 0.0, + "routers_loss": 0.013788948766887188, + "skip_count": 0.0, + "step": 430, + "text_loss": 0.19141142070293427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.028177282066334, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.000862, + "loss": 0.0499, + "macro_f1": 0.32098764181137085, + "num_tokens": 696007.0, + "repeat_count": 0.0, + "routers_loss": 0.07998392730951309, + "skip_count": 2.0, + "step": 432, + "text_loss": 0.1611809879541397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0375697094217786, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.000866, + "loss": 0.0541, + "macro_f1": 0.32098764181137085, + "num_tokens": 700271.0, + "repeat_count": 0.0, + "routers_loss": 0.06988382339477539, + "skip_count": 2.0, + "step": 434, + "text_loss": 0.37254223227500916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0469621367772235, + "f1_execute": 0.8333333730697632, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.00087, + "loss": 0.0834, + "macro_f1": 0.2777777910232544, + "num_tokens": 703519.0, + "repeat_count": 3.0, + "routers_loss": 0.28240787982940674, + "skip_count": 5.0, + "step": 436, + "text_loss": 0.29636648297309875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.056354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.423828125, + "learning_rate": 0.000874, + "loss": 0.0657, + "macro_f1": 0.3333333432674408, + "num_tokens": 706826.0, + "repeat_count": 0.0, + "routers_loss": 0.013924967497587204, + "skip_count": 0.0, + "step": 438, + "text_loss": 0.20867908000946045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.065746991488113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000878, + "loss": 0.0657, + "macro_f1": 0.3333333432674408, + "num_tokens": 710530.0, + "repeat_count": 0.0, + "routers_loss": 0.01170142088085413, + "skip_count": 0.0, + "step": 440, + "text_loss": 0.7273373007774353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.000882, + "loss": 0.076, + "macro_f1": 0.3333333432674408, + "num_tokens": 713503.0, + "repeat_count": 0.0, + "routers_loss": 0.011930872686207294, + "skip_count": 0.0, + "step": 442, + "text_loss": 0.39314430952072144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.084531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0008860000000000001, + "loss": 0.0592, + "macro_f1": 0.3333333432674408, + "num_tokens": 716582.0, + "repeat_count": 0.0, + "routers_loss": 0.008630385622382164, + "skip_count": 0.0, + "step": 444, + "text_loss": 0.5925271511077881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.0939242735544465, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0008900000000000001, + "loss": 0.0811, + "macro_f1": 0.3006536066532135, + "num_tokens": 719941.0, + "repeat_count": 3.0, + "routers_loss": 0.3015584945678711, + "skip_count": 1.0, + "step": 446, + "text_loss": 0.5059905052185059 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.1033167009098914, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.000894, + "loss": 0.0822, + "macro_f1": 0.31446540355682373, + "num_tokens": 723113.0, + "repeat_count": 1.0, + "routers_loss": 0.10897493362426758, + "skip_count": 1.0, + "step": 448, + "text_loss": 0.19616436958312988 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.112709128265336, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.33984375, + "learning_rate": 0.000898, + "loss": 0.0782, + "macro_f1": 0.32098764181137085, + "num_tokens": 726193.0, + "repeat_count": 0.0, + "routers_loss": 0.07236456125974655, + "skip_count": 2.0, + "step": 450, + "text_loss": 0.1773054152727127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1221015556207807, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3203125, + "learning_rate": 0.000902, + "loss": 0.058, + "macro_f1": 0.3272727429866791, + "num_tokens": 729275.0, + "repeat_count": 1.0, + "routers_loss": 0.08184371143579483, + "skip_count": 0.0, + "step": 452, + "text_loss": 0.4927310049533844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1314939829762256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.000906, + "loss": 0.0607, + "macro_f1": 0.3333333432674408, + "num_tokens": 731948.0, + "repeat_count": 0.0, + "routers_loss": 0.014033539220690727, + "skip_count": 0.0, + "step": 454, + "text_loss": 0.4745742678642273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.00091, + "loss": 0.0651, + "macro_f1": 0.3333333432674408, + "num_tokens": 735351.0, + "repeat_count": 0.0, + "routers_loss": 0.0071774693205952644, + "skip_count": 0.0, + "step": 456, + "text_loss": 0.18523462116718292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.150278837687115, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.400390625, + "learning_rate": 0.0009140000000000001, + "loss": 0.0738, + "macro_f1": 0.5492662787437439, + "num_tokens": 738587.0, + "repeat_count": 0.0, + "routers_loss": 0.07781517505645752, + "skip_count": 2.0, + "step": 458, + "text_loss": 0.3459635376930237 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 2.1596712650425594, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0009180000000000001, + "loss": 0.0723, + "macro_f1": 0.3076923191547394, + "num_tokens": 741779.0, + "repeat_count": 0.0, + "routers_loss": 0.09529037028551102, + "skip_count": 2.0, + "step": 460, + "text_loss": 0.20197433233261108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1690636923980042, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.0009220000000000001, + "loss": 0.0519, + "macro_f1": 0.3333333432674408, + "num_tokens": 745355.0, + "repeat_count": 0.0, + "routers_loss": 0.009765669703483582, + "skip_count": 0.0, + "step": 462, + "text_loss": 0.7031404376029968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1784561197534487, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009260000000000001, + "loss": 0.0527, + "macro_f1": 0.3272727429866791, + "num_tokens": 748628.0, + "repeat_count": 0.0, + "routers_loss": 0.03344850242137909, + "skip_count": 1.0, + "step": 464, + "text_loss": 0.21274663507938385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1878485471088935, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.00093, + "loss": 0.0534, + "macro_f1": 0.3076923191547394, + "num_tokens": 751472.0, + "repeat_count": 2.0, + "routers_loss": 0.1354292333126068, + "skip_count": 2.0, + "step": 466, + "text_loss": 0.5350717306137085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.197240974464338, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.000934, + "loss": 0.0598, + "macro_f1": 0.3272727429866791, + "num_tokens": 754479.0, + "repeat_count": 0.0, + "routers_loss": 0.056420840322971344, + "skip_count": 1.0, + "step": 468, + "text_loss": 0.28153330087661743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.206633401819783, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.234375, + "learning_rate": 0.0009379999999999999, + "loss": 0.0597, + "macro_f1": 0.31446540355682373, + "num_tokens": 757872.0, + "repeat_count": 1.0, + "routers_loss": 0.1622387170791626, + "skip_count": 1.0, + "step": 470, + "text_loss": 0.22956843674182892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2160258291752273, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.5, + "learning_rate": 0.000942, + "loss": 0.0953, + "macro_f1": 0.32098764181137085, + "num_tokens": 760468.0, + "repeat_count": 0.0, + "routers_loss": 0.05146972835063934, + "skip_count": 2.0, + "step": 472, + "text_loss": 0.4513966739177704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.225418256530672, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000946, + "loss": 0.0592, + "macro_f1": 0.3272727429866791, + "num_tokens": 763519.0, + "repeat_count": 1.0, + "routers_loss": 0.09022669494152069, + "skip_count": 0.0, + "step": 474, + "text_loss": 0.25758957862854004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.234810683886117, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.00095, + "loss": 0.0498, + "macro_f1": 0.3272727429866791, + "num_tokens": 767391.0, + "repeat_count": 0.0, + "routers_loss": 0.03044828027486801, + "skip_count": 1.0, + "step": 476, + "text_loss": 0.21366681158542633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2442031112415615, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.291015625, + "learning_rate": 0.000954, + "loss": 0.0802, + "macro_f1": 0.3272727429866791, + "num_tokens": 770338.0, + "repeat_count": 0.0, + "routers_loss": 0.10397060960531235, + "skip_count": 1.0, + "step": 478, + "text_loss": 1.0396177768707275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.2535955385970063, + "f1_execute": 0.8571429252624512, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000958, + "loss": 0.1099, + "macro_f1": 0.285714328289032, + "num_tokens": 773699.0, + "repeat_count": 2.0, + "routers_loss": 0.22604143619537354, + "skip_count": 4.0, + "step": 480, + "text_loss": 0.2570283114910126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.2629879659524508, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.146484375, + "learning_rate": 0.000962, + "loss": 0.0667, + "macro_f1": 0.32098767161369324, + "num_tokens": 777473.0, + "repeat_count": 0.0, + "routers_loss": 0.048258859664201736, + "skip_count": 1.0, + "step": 482, + "text_loss": 0.2540103495121002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2723803933078957, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000966, + "loss": 0.0592, + "macro_f1": 0.3333333432674408, + "num_tokens": 780833.0, + "repeat_count": 0.0, + "routers_loss": 0.023018671199679375, + "skip_count": 0.0, + "step": 484, + "text_loss": 0.38524550199508667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.28177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.314453125, + "learning_rate": 0.0009699999999999999, + "loss": 0.0709, + "macro_f1": 0.3272727429866791, + "num_tokens": 783656.0, + "repeat_count": 0.0, + "routers_loss": 0.044845327734947205, + "skip_count": 1.0, + "step": 486, + "text_loss": 0.5859048366546631 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000974, + "loss": 0.0615, + "macro_f1": 0.3333333432674408, + "num_tokens": 787173.0, + "repeat_count": 0.0, + "routers_loss": 0.010898692533373833, + "skip_count": 0.0, + "step": 488, + "text_loss": 0.3456067442893982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3005576753742294, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000978, + "loss": 0.0796, + "macro_f1": 0.32098764181137085, + "num_tokens": 790395.0, + "repeat_count": 0.0, + "routers_loss": 0.06497956812381744, + "skip_count": 2.0, + "step": 490, + "text_loss": 0.3751123249530792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3099501027296743, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000982, + "loss": 0.0772, + "macro_f1": 0.3272727429866791, + "num_tokens": 793137.0, + "repeat_count": 0.0, + "routers_loss": 0.07763728499412537, + "skip_count": 1.0, + "step": 492, + "text_loss": 0.43296709656715393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3193425300851187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009860000000000001, + "loss": 0.0819, + "macro_f1": 0.3333333432674408, + "num_tokens": 796497.0, + "repeat_count": 0.0, + "routers_loss": 0.02127906307578087, + "skip_count": 0.0, + "step": 494, + "text_loss": 0.4841311275959015 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3287349574405636, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.00099, + "loss": 0.073, + "macro_f1": 0.3272727429866791, + "num_tokens": 799361.0, + "repeat_count": 1.0, + "routers_loss": 0.09518691152334213, + "skip_count": 0.0, + "step": 496, + "text_loss": 0.5094487071037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.3381273847960085, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.130859375, + "learning_rate": 0.000994, + "loss": 0.0789, + "macro_f1": 0.5492662787437439, + "num_tokens": 802629.0, + "repeat_count": 0.0, + "routers_loss": 0.0563947930932045, + "skip_count": 2.0, + "step": 498, + "text_loss": 0.42783617973327637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.347519812151453, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.000998, + "loss": 0.0476, + "macro_f1": 0.3272727429866791, + "num_tokens": 805881.0, + "repeat_count": 1.0, + "routers_loss": 0.10570426285266876, + "skip_count": 0.0, + "step": 500, + "text_loss": 0.28395503759384155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.3569122395068973, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009999999760498814, + "loss": 0.0849, + "macro_f1": 0.5492662787437439, + "num_tokens": 809283.0, + "repeat_count": 0.0, + "routers_loss": 0.031202208250761032, + "skip_count": 2.0, + "step": 502, + "text_loss": 0.32970911264419556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.366304666862342, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009999997844489475, + "loss": 0.0574, + "macro_f1": 0.3272727429866791, + "num_tokens": 812440.0, + "repeat_count": 0.0, + "routers_loss": 0.07647835463285446, + "skip_count": 1.0, + "step": 504, + "text_loss": 0.4901447296142578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.375697094217787, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.000999999401247153, + "loss": 0.0668, + "macro_f1": 0.32098764181137085, + "num_tokens": 815716.0, + "repeat_count": 0.0, + "routers_loss": 0.08515176922082901, + "skip_count": 2.0, + "step": 506, + "text_loss": 0.6157599687576294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.0009999988264446445, + "loss": 0.0686, + "macro_f1": 0.3333333432674408, + "num_tokens": 819086.0, + "repeat_count": 0.0, + "routers_loss": 0.00946938619017601, + "skip_count": 0.0, + "step": 508, + "text_loss": 0.5053519010543823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3944819489286764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009999980600416424, + "loss": 0.0574, + "macro_f1": 0.3333333432674408, + "num_tokens": 822268.0, + "repeat_count": 0.0, + "routers_loss": 0.01058756373822689, + "skip_count": 0.0, + "step": 510, + "text_loss": 0.5570021867752075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.403874376284121, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.000999997102038441, + "loss": 0.0678, + "macro_f1": 0.3333333432674408, + "num_tokens": 825728.0, + "repeat_count": 0.0, + "routers_loss": 0.008705209009349346, + "skip_count": 0.0, + "step": 512, + "text_loss": 0.6519040465354919 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4132668036395657, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.0009999959524354064, + "loss": 0.083, + "macro_f1": 0.3272727429866791, + "num_tokens": 829459.0, + "repeat_count": 0.0, + "routers_loss": 0.04024193435907364, + "skip_count": 1.0, + "step": 514, + "text_loss": 0.5290043950080872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.00099999461123298, + "loss": 0.0727, + "macro_f1": 0.3333333432674408, + "num_tokens": 832291.0, + "repeat_count": 0.0, + "routers_loss": 0.015742862597107887, + "skip_count": 0.0, + "step": 516, + "text_loss": 0.7910057902336121 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.432051658350455, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.000999993078431675, + "loss": 0.0759, + "macro_f1": 0.3076923191547394, + "num_tokens": 835399.0, + "repeat_count": 1.0, + "routers_loss": 0.16753782331943512, + "skip_count": 3.0, + "step": 518, + "text_loss": 0.45196083188056946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.4414440857058994, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.236328125, + "learning_rate": 0.0009999913540320792, + "loss": 0.0968, + "macro_f1": 0.31446540355682373, + "num_tokens": 838993.0, + "repeat_count": 0.0, + "routers_loss": 0.09357143193483353, + "skip_count": 2.0, + "step": 520, + "text_loss": 0.5499435663223267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.4508365130613443, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2451171875, + "learning_rate": 0.0009999894380348536, + "loss": 0.0821, + "macro_f1": 0.5492662787437439, + "num_tokens": 842652.0, + "repeat_count": 0.0, + "routers_loss": 0.056803856045007706, + "skip_count": 2.0, + "step": 522, + "text_loss": 0.197520449757576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.4602289404167887, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.2333984375, + "learning_rate": 0.000999987330440732, + "loss": 0.0725, + "macro_f1": 0.4871794879436493, + "num_tokens": 847061.0, + "repeat_count": 0.0, + "routers_loss": 0.08962195366621017, + "skip_count": 3.0, + "step": 524, + "text_loss": 0.27509039640426636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4696213677722336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000999985031250522, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 850780.0, + "repeat_count": 0.0, + "routers_loss": 0.022930558770895004, + "skip_count": 0.0, + "step": 526, + "text_loss": 0.13291706144809723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4790137951276785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.0009999825404651053, + "loss": 0.0614, + "macro_f1": 0.3333333432674408, + "num_tokens": 853886.0, + "repeat_count": 0.0, + "routers_loss": 0.017097990959882736, + "skip_count": 0.0, + "step": 528, + "text_loss": 0.21706295013427734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.0009999798580854356, + "loss": 0.0724, + "macro_f1": 0.3333333432674408, + "num_tokens": 857364.0, + "repeat_count": 0.0, + "routers_loss": 0.02831801027059555, + "skip_count": 0.0, + "step": 530, + "text_loss": 0.9035662412643433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.000999976984112541, + "loss": 0.0674, + "macro_f1": 0.3333333432674408, + "num_tokens": 860661.0, + "repeat_count": 0.0, + "routers_loss": 0.019671892747282982, + "skip_count": 0.0, + "step": 532, + "text_loss": 0.8354863524436951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.5071910771940122, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.2890625, + "learning_rate": 0.0009999739185475231, + "loss": 0.0963, + "macro_f1": 0.47333335876464844, + "num_tokens": 864124.0, + "repeat_count": 2.0, + "routers_loss": 0.21383361518383026, + "skip_count": 3.0, + "step": 534, + "text_loss": 0.23422949016094208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.516583504549457, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0009999706613915565, + "loss": 0.0598, + "macro_f1": 0.32098767161369324, + "num_tokens": 866976.0, + "repeat_count": 0.0, + "routers_loss": 0.07158871740102768, + "skip_count": 1.0, + "step": 536, + "text_loss": 0.11800774186849594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5259759319049016, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.0009999672126458894, + "loss": 0.0822, + "macro_f1": 0.3272727429866791, + "num_tokens": 870549.0, + "repeat_count": 0.0, + "routers_loss": 0.08185924589633942, + "skip_count": 1.0, + "step": 538, + "text_loss": 0.19232480227947235 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5353683592603464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.000999963572311843, + "loss": 0.0604, + "macro_f1": 0.3333333432674408, + "num_tokens": 873733.0, + "repeat_count": 0.0, + "routers_loss": 0.01633382774889469, + "skip_count": 0.0, + "step": 540, + "text_loss": 0.3725031912326813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.544760786615791, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009999597403908128, + "loss": 0.0761, + "macro_f1": 0.3272727429866791, + "num_tokens": 877099.0, + "repeat_count": 0.0, + "routers_loss": 0.0782657191157341, + "skip_count": 1.0, + "step": 542, + "text_loss": 0.17589199542999268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.5541532139712357, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, + "learning_rate": 0.0009999557168842669, + "loss": 0.0716, + "macro_f1": 0.5492662787437439, + "num_tokens": 879883.0, + "repeat_count": 0.0, + "routers_loss": 0.05275818333029747, + "skip_count": 2.0, + "step": 544, + "text_loss": 0.26448264718055725 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.56354564132668, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0009999515017937468, + "loss": 0.071, + "macro_f1": 0.32098764181137085, + "num_tokens": 882223.0, + "repeat_count": 0.0, + "routers_loss": 0.09335892647504807, + "skip_count": 2.0, + "step": 546, + "text_loss": 0.208544060587883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.572938068682125, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.376953125, + "learning_rate": 0.0009999470951208684, + "loss": 0.0855, + "macro_f1": 0.32098764181137085, + "num_tokens": 885241.0, + "repeat_count": 2.0, + "routers_loss": 0.22983254492282867, + "skip_count": 0.0, + "step": 548, + "text_loss": 0.6612338423728943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.58233049603757, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.00099994249686732, + "loss": 0.0786, + "macro_f1": 0.3272727429866791, + "num_tokens": 887897.0, + "repeat_count": 1.0, + "routers_loss": 0.12858282029628754, + "skip_count": 0.0, + "step": 550, + "text_loss": 0.4673548936843872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5917229233930144, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009999377070348638, + "loss": 0.0944, + "macro_f1": 0.3333333432674408, + "num_tokens": 891224.0, + "repeat_count": 0.0, + "routers_loss": 0.017421770840883255, + "skip_count": 0.0, + "step": 552, + "text_loss": 0.6419258117675781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.601115350748459, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000999932725625335, + "loss": 0.0791, + "macro_f1": 0.32098764181137085, + "num_tokens": 894578.0, + "repeat_count": 0.0, + "routers_loss": 0.07890026271343231, + "skip_count": 2.0, + "step": 554, + "text_loss": 0.5970752239227295 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.6105077781039037, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.0009999275526406427, + "loss": 0.0796, + "macro_f1": 0.31446540355682373, + "num_tokens": 897145.0, + "repeat_count": 1.0, + "routers_loss": 0.09836960583925247, + "skip_count": 1.0, + "step": 556, + "text_loss": 0.752425491809845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6199002054593485, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1875, + "learning_rate": 0.0009999221880827693, + "loss": 0.0882, + "macro_f1": 0.3333333432674408, + "num_tokens": 900565.0, + "repeat_count": 0.0, + "routers_loss": 0.017694659531116486, + "skip_count": 0.0, + "step": 558, + "text_loss": 0.195619136095047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.629292632814793, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2021484375, + "learning_rate": 0.0009999166319537703, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 903506.0, + "repeat_count": 0.0, + "routers_loss": 0.019375264644622803, + "skip_count": 0.0, + "step": 560, + "text_loss": 0.4603337347507477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.638685060170238, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.146484375, + "learning_rate": 0.0009999108842557748, + "loss": 0.0953, + "macro_f1": 0.4871794879436493, + "num_tokens": 906380.0, + "repeat_count": 0.0, + "routers_loss": 0.12013207376003265, + "skip_count": 3.0, + "step": 562, + "text_loss": 0.6279402375221252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6480774875256823, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009999049449909854, + "loss": 0.0799, + "macro_f1": 0.3272727429866791, + "num_tokens": 909116.0, + "repeat_count": 0.0, + "routers_loss": 0.06441342830657959, + "skip_count": 1.0, + "step": 564, + "text_loss": 0.23741699755191803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.657469914881127, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009998988141616781, + "loss": 0.064, + "macro_f1": 0.32098767161369324, + "num_tokens": 912189.0, + "repeat_count": 0.0, + "routers_loss": 0.08309414982795715, + "skip_count": 1.0, + "step": 566, + "text_loss": 0.27780941128730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6668623422365716, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009998924917702023, + "loss": 0.0876, + "macro_f1": 0.3272727429866791, + "num_tokens": 916279.0, + "repeat_count": 1.0, + "routers_loss": 0.07197169959545135, + "skip_count": 0.0, + "step": 568, + "text_loss": 0.6371755599975586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6762547695920165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2255859375, + "learning_rate": 0.0009998859778189806, + "loss": 0.0706, + "macro_f1": 0.3333333432674408, + "num_tokens": 919490.0, + "repeat_count": 0.0, + "routers_loss": 0.008022273890674114, + "skip_count": 0.0, + "step": 570, + "text_loss": 0.6028938889503479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6856471969474613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.000999879272310509, + "loss": 0.084, + "macro_f1": 0.3333333432674408, + "num_tokens": 923694.0, + "repeat_count": 0.0, + "routers_loss": 0.01634674146771431, + "skip_count": 0.0, + "step": 572, + "text_loss": 0.7177054286003113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.0009998723752473574, + "loss": 0.0716, + "macro_f1": 0.3272727429866791, + "num_tokens": 926933.0, + "repeat_count": 0.0, + "routers_loss": 0.060559045523405075, + "skip_count": 1.0, + "step": 574, + "text_loss": 0.5203254818916321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.0009998652866321687, + "loss": 0.0801, + "macro_f1": 0.3333333432674408, + "num_tokens": 929832.0, + "repeat_count": 0.0, + "routers_loss": 0.011485611088573933, + "skip_count": 0.0, + "step": 576, + "text_loss": 0.6147452592849731 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.713824479013795, + "f1_execute": 0.8799999952316284, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.000999858006467659, + "loss": 0.0649, + "macro_f1": 0.29333335161209106, + "num_tokens": 933266.0, + "repeat_count": 2.0, + "routers_loss": 0.2929030954837799, + "skip_count": 4.0, + "step": 578, + "text_loss": 0.1720666140317917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.72321690636924, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.24609375, + "learning_rate": 0.0009998505347566186, + "loss": 0.0782, + "macro_f1": 0.32098764181137085, + "num_tokens": 937545.0, + "repeat_count": 0.0, + "routers_loss": 0.053780000656843185, + "skip_count": 2.0, + "step": 580, + "text_loss": 0.3258405327796936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7326093337246844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.00099984287150191, + "loss": 0.0582, + "macro_f1": 0.3333333432674408, + "num_tokens": 941001.0, + "repeat_count": 0.0, + "routers_loss": 0.02637636847794056, + "skip_count": 0.0, + "step": 582, + "text_loss": 0.23762771487236023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7420017610801293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009998350167064705, + "loss": 0.0672, + "macro_f1": 0.3333333432674408, + "num_tokens": 943989.0, + "repeat_count": 0.0, + "routers_loss": 0.01637580618262291, + "skip_count": 0.0, + "step": 584, + "text_loss": 0.7460582852363586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7513941884355737, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009998269703733096, + "loss": 0.0686, + "macro_f1": 0.3272727429866791, + "num_tokens": 947245.0, + "repeat_count": 1.0, + "routers_loss": 0.13934117555618286, + "skip_count": 0.0, + "step": 586, + "text_loss": 0.5284690260887146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7607866157910186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.13671875, + "learning_rate": 0.0009998187325055106, + "loss": 0.0667, + "macro_f1": 0.3333333432674408, + "num_tokens": 950116.0, + "repeat_count": 0.0, + "routers_loss": 0.02138397842645645, + "skip_count": 0.0, + "step": 588, + "text_loss": 0.3920256197452545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009998103031062305, + "loss": 0.0778, + "macro_f1": 0.3333333432674408, + "num_tokens": 953277.0, + "repeat_count": 0.0, + "routers_loss": 0.007098200265318155, + "skip_count": 0.0, + "step": 590, + "text_loss": 0.7472905516624451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.779571470501908, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.318359375, + "learning_rate": 0.0009998016821786994, + "loss": 0.0872, + "macro_f1": 0.32098764181137085, + "num_tokens": 958229.0, + "repeat_count": 1.0, + "routers_loss": 0.07946522533893585, + "skip_count": 1.0, + "step": 592, + "text_loss": 0.5506448745727539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7889638978573528, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.000999792869726221, + "loss": 0.0523, + "macro_f1": 0.3272727429866791, + "num_tokens": 961016.0, + "repeat_count": 0.0, + "routers_loss": 0.0850791186094284, + "skip_count": 1.0, + "step": 594, + "text_loss": 0.3824431002140045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009997838657521717, + "loss": 0.0632, + "macro_f1": 0.3333333432674408, + "num_tokens": 963847.0, + "repeat_count": 0.0, + "routers_loss": 0.016370445489883423, + "skip_count": 0.0, + "step": 596, + "text_loss": 0.2139475792646408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.8077487525682416, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009997746702600026, + "loss": 0.0702, + "macro_f1": 0.307692289352417, + "num_tokens": 966619.0, + "repeat_count": 0.0, + "routers_loss": 0.1310746818780899, + "skip_count": 3.0, + "step": 598, + "text_loss": 0.3651018440723419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8171411799236865, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23828125, + "learning_rate": 0.0009997652832532372, + "loss": 0.0792, + "macro_f1": 0.3272727429866791, + "num_tokens": 970418.0, + "repeat_count": 1.0, + "routers_loss": 0.14303378760814667, + "skip_count": 0.0, + "step": 600, + "text_loss": 0.7094736099243164 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8265336072791314, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009997557047354722, + "loss": 0.0531, + "macro_f1": 0.3272727429866791, + "num_tokens": 973491.0, + "repeat_count": 0.0, + "routers_loss": 0.03334212675690651, + "skip_count": 1.0, + "step": 602, + "text_loss": 0.4812237024307251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.835926034634576, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.0009997459347103783, + "loss": 0.0956, + "macro_f1": 0.3272727429866791, + "num_tokens": 976672.0, + "repeat_count": 0.0, + "routers_loss": 0.02831871062517166, + "skip_count": 0.0, + "step": 604, + "text_loss": 0.21737146377563477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8453184619900207, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009997359731816998, + "loss": 0.0646, + "macro_f1": 0.3333333432674408, + "num_tokens": 979898.0, + "repeat_count": 0.0, + "routers_loss": 0.017968013882637024, + "skip_count": 0.0, + "step": 606, + "text_loss": 0.5458008050918579 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.854710889345465, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.224609375, + "learning_rate": 0.0009997258201532536, + "loss": 0.0751, + "macro_f1": 0.3333333432674408, + "num_tokens": 982811.0, + "repeat_count": 0.0, + "routers_loss": 0.016256732866168022, + "skip_count": 0.0, + "step": 608, + "text_loss": 0.8643257021903992 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009997154756289303, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 985245.0, + "repeat_count": 0.0, + "routers_loss": 0.021214161068201065, + "skip_count": 0.0, + "step": 610, + "text_loss": 0.2204967886209488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8734957440563544, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.000999704939612694, + "loss": 0.0636, + "macro_f1": 0.3006536364555359, + "num_tokens": 988539.0, + "repeat_count": 3.0, + "routers_loss": 0.23249399662017822, + "skip_count": 2.0, + "step": 612, + "text_loss": 0.32489025592803955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8828881714117993, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009996942121085824, + "loss": 0.0445, + "macro_f1": 0.3333333432674408, + "num_tokens": 991660.0, + "repeat_count": 0.0, + "routers_loss": 0.010706410743296146, + "skip_count": 0.0, + "step": 614, + "text_loss": 0.4551754891872406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8922805987672437, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3671875, + "learning_rate": 0.000999683293120706, + "loss": 0.1016, + "macro_f1": 0.3333333432674408, + "num_tokens": 994828.0, + "repeat_count": 0.0, + "routers_loss": 0.006676184479147196, + "skip_count": 0.0, + "step": 616, + "text_loss": 0.6212068200111389 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9016730261226886, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.408203125, + "learning_rate": 0.0009996721826532491, + "loss": 0.0976, + "macro_f1": 0.3076923191547394, + "num_tokens": 997951.0, + "repeat_count": 2.0, + "routers_loss": 0.2148125320672989, + "skip_count": 2.0, + "step": 618, + "text_loss": 0.26514527201652527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.911065453478133, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1904296875, + "learning_rate": 0.000999660880710469, + "loss": 0.0909, + "macro_f1": 0.3333333432674408, + "num_tokens": 1001139.0, + "repeat_count": 0.0, + "routers_loss": 0.022332455962896347, + "skip_count": 0.0, + "step": 620, + "text_loss": 0.26131340861320496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.920457880833578, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009996493872966971, + "loss": 0.0732, + "macro_f1": 0.3272727429866791, + "num_tokens": 1003678.0, + "repeat_count": 1.0, + "routers_loss": 0.08348730951547623, + "skip_count": 0.0, + "step": 622, + "text_loss": 0.19151706993579865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009996377024163374, + "loss": 0.0822, + "macro_f1": 0.3333333432674408, + "num_tokens": 1007082.0, + "repeat_count": 0.0, + "routers_loss": 0.028577150776982307, + "skip_count": 0.0, + "step": 624, + "text_loss": 0.305387407541275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9392427355444672, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0009996258260738676, + "loss": 0.0892, + "macro_f1": 0.3272727429866791, + "num_tokens": 1010064.0, + "repeat_count": 1.0, + "routers_loss": 0.08312026411294937, + "skip_count": 0.0, + "step": 626, + "text_loss": 0.49436143040657043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9486351628999117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009996137582738388, + "loss": 0.0591, + "macro_f1": 0.3333333432674408, + "num_tokens": 1013462.0, + "repeat_count": 0.0, + "routers_loss": 0.013337327167391777, + "skip_count": 0.0, + "step": 628, + "text_loss": 0.6515294313430786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9580275902553566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000999601499020875, + "loss": 0.0537, + "macro_f1": 0.3333333432674408, + "num_tokens": 1016246.0, + "repeat_count": 0.0, + "routers_loss": 0.029126765206456184, + "skip_count": 0.0, + "step": 630, + "text_loss": 0.18834827840328217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9674200176108014, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009995890483196746, + "loss": 0.0602, + "macro_f1": 0.3272727429866791, + "num_tokens": 1019286.0, + "repeat_count": 0.0, + "routers_loss": 0.054844800382852554, + "skip_count": 1.0, + "step": 632, + "text_loss": 0.6988179087638855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.322265625, + "learning_rate": 0.0009995764061750086, + "loss": 0.0767, + "macro_f1": 0.3333333432674408, + "num_tokens": 1022207.0, + "repeat_count": 0.0, + "routers_loss": 0.010095693171024323, + "skip_count": 0.0, + "step": 634, + "text_loss": 0.558451771736145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9862048723216907, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.000999563572591721, + "loss": 0.0521, + "macro_f1": 0.32098764181137085, + "num_tokens": 1025319.0, + "repeat_count": 1.0, + "routers_loss": 0.0698433518409729, + "skip_count": 1.0, + "step": 636, + "text_loss": 0.5961872935295105 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.995597299677135, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009995505475747302, + "loss": 0.0849, + "macro_f1": 0.3272727429866791, + "num_tokens": 1028362.0, + "repeat_count": 0.0, + "routers_loss": 0.040211405605077744, + "skip_count": 1.0, + "step": 638, + "text_loss": 0.546863317489624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.004696213677722, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.0009995373311290272, + "loss": 0.0709, + "macro_f1": 0.3144654333591461, + "num_tokens": 1032199.0, + "repeat_count": 2.0, + "routers_loss": 0.1457643061876297, + "skip_count": 1.0, + "step": 640, + "text_loss": 0.2137298285961151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009995239232596764, + "loss": 0.0545, + "macro_f1": 0.3333333432674408, + "num_tokens": 1035801.0, + "repeat_count": 0.0, + "routers_loss": 0.011394930072128773, + "skip_count": 0.0, + "step": 642, + "text_loss": 0.43054503202438354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0234810683886115, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009995103239718163, + "loss": 0.0665, + "macro_f1": 0.3333333432674408, + "num_tokens": 1039223.0, + "repeat_count": 0.0, + "routers_loss": 0.00997432041913271, + "skip_count": 0.0, + "step": 644, + "text_loss": 0.7749615907669067 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0328734957440564, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009994965332706573, + "loss": 0.0755, + "macro_f1": 0.3144654333591461, + "num_tokens": 1042154.0, + "repeat_count": 3.0, + "routers_loss": 0.10589150339365005, + "skip_count": 0.0, + "step": 646, + "text_loss": 0.7812211513519287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.042265923099501, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.0009994825511614846, + "loss": 0.0383, + "macro_f1": 0.3272727429866791, + "num_tokens": 1045250.0, + "repeat_count": 0.0, + "routers_loss": 0.0748734176158905, + "skip_count": 1.0, + "step": 648, + "text_loss": 0.844803512096405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0516583504549457, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1220703125, + "learning_rate": 0.0009994683776496562, + "loss": 0.0433, + "macro_f1": 0.3272727429866791, + "num_tokens": 1048446.0, + "repeat_count": 0.0, + "routers_loss": 0.03742415830492973, + "skip_count": 1.0, + "step": 650, + "text_loss": 0.2098839282989502 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0610507778103906, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009994540127406034, + "loss": 0.0591, + "macro_f1": 0.32098764181137085, + "num_tokens": 1051840.0, + "repeat_count": 0.0, + "routers_loss": 0.06025516986846924, + "skip_count": 2.0, + "step": 652, + "text_loss": 0.27727583050727844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.070443205165835, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.181640625, + "learning_rate": 0.0009994394564398306, + "loss": 0.0519, + "macro_f1": 0.521541953086853, + "num_tokens": 1055142.0, + "repeat_count": 4.0, + "routers_loss": 0.22807340323925018, + "skip_count": 2.0, + "step": 654, + "text_loss": 0.9672397971153259 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009994247087529158, + "loss": 0.0618, + "macro_f1": 0.3333333432674408, + "num_tokens": 1057698.0, + "repeat_count": 0.0, + "routers_loss": 0.01348950993269682, + "skip_count": 0.0, + "step": 656, + "text_loss": 0.6375506520271301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.0009994097696855106, + "loss": 0.0412, + "macro_f1": 0.3333333432674408, + "num_tokens": 1060624.0, + "repeat_count": 0.0, + "routers_loss": 0.009649243205785751, + "skip_count": 0.0, + "step": 658, + "text_loss": 0.5315385460853577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.098620487232169, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2041015625, + "learning_rate": 0.0009993946392433395, + "loss": 0.0609, + "macro_f1": 0.307692289352417, + "num_tokens": 1065076.0, + "repeat_count": 0.0, + "routers_loss": 0.1250980943441391, + "skip_count": 3.0, + "step": 660, + "text_loss": 0.25780341029167175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1080129145876136, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009993793174322006, + "loss": 0.0471, + "macro_f1": 0.3333333432674408, + "num_tokens": 1068365.0, + "repeat_count": 0.0, + "routers_loss": 0.011544390581548214, + "skip_count": 0.0, + "step": 662, + "text_loss": 0.34876301884651184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1174053419430585, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009993638042579654, + "loss": 0.0473, + "macro_f1": 0.3272727429866791, + "num_tokens": 1071693.0, + "repeat_count": 0.0, + "routers_loss": 0.03777370601892471, + "skip_count": 1.0, + "step": 664, + "text_loss": 0.21811571717262268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.126797769298503, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.203125, + "learning_rate": 0.0009993480997265783, + "loss": 0.0475, + "macro_f1": 0.5492662787437439, + "num_tokens": 1074733.0, + "repeat_count": 0.0, + "routers_loss": 0.049949806183576584, + "skip_count": 2.0, + "step": 666, + "text_loss": 0.38410288095474243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.136190196653948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10302734375, + "learning_rate": 0.0009993322038440572, + "loss": 0.0605, + "macro_f1": 0.3333333432674408, + "num_tokens": 1077993.0, + "repeat_count": 0.0, + "routers_loss": 0.0247171800583601, + "skip_count": 0.0, + "step": 668, + "text_loss": 0.25576895475387573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1455826240093923, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.000999316116616494, + "loss": 0.0619, + "macro_f1": 0.3333333432674408, + "num_tokens": 1080491.0, + "repeat_count": 0.0, + "routers_loss": 0.008118715137243271, + "skip_count": 0.0, + "step": 670, + "text_loss": 0.6269792914390564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.154975051364837, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009992998380500527, + "loss": 0.0462, + "macro_f1": 0.3272727429866791, + "num_tokens": 1083817.0, + "repeat_count": 0.0, + "routers_loss": 0.03366057574748993, + "skip_count": 1.0, + "step": 672, + "text_loss": 0.26891493797302246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1643674787202816, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009992833681509716, + "loss": 0.0529, + "macro_f1": 0.3333333432674408, + "num_tokens": 1087368.0, + "repeat_count": 0.0, + "routers_loss": 0.020552074536681175, + "skip_count": 0.0, + "step": 674, + "text_loss": 0.14421936869621277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.1737599060757264, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.18359375, + "learning_rate": 0.0009992667069255619, + "loss": 0.0696, + "macro_f1": 0.31446540355682373, + "num_tokens": 1090452.0, + "repeat_count": 0.0, + "routers_loss": 0.06937336176633835, + "skip_count": 2.0, + "step": 676, + "text_loss": 0.24999259412288666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1831523334311713, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08740234375, + "learning_rate": 0.0009992498543802085, + "loss": 0.0588, + "macro_f1": 0.3272727429866791, + "num_tokens": 1093996.0, + "repeat_count": 1.0, + "routers_loss": 0.0380021296441555, + "skip_count": 0.0, + "step": 678, + "text_loss": 0.42473849654197693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 3.1925447607866158, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.2119140625, + "learning_rate": 0.0009992328105213688, + "loss": 0.0411, + "macro_f1": 0.4400000274181366, + "num_tokens": 1096837.0, + "repeat_count": 1.0, + "routers_loss": 0.20885063707828522, + "skip_count": 4.0, + "step": 680, + "text_loss": 0.3829527199268341 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.2019371881420606, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009992155753555747, + "loss": 0.0722, + "macro_f1": 0.5492662787437439, + "num_tokens": 1100320.0, + "repeat_count": 0.0, + "routers_loss": 0.018230699002742767, + "skip_count": 2.0, + "step": 682, + "text_loss": 0.6190969944000244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.211329615497505, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30859375, + "learning_rate": 0.0009991981488894303, + "loss": 0.0681, + "macro_f1": 0.32098767161369324, + "num_tokens": 1103682.0, + "repeat_count": 0.0, + "routers_loss": 0.05550144240260124, + "skip_count": 1.0, + "step": 684, + "text_loss": 0.44418027997016907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.22072204285295, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.0009991805311296133, + "loss": 0.0507, + "macro_f1": 0.32098764181137085, + "num_tokens": 1106427.0, + "repeat_count": 0.0, + "routers_loss": 0.07990608364343643, + "skip_count": 2.0, + "step": 686, + "text_loss": 0.5577231645584106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2301144702083944, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009991627220828753, + "loss": 0.0568, + "macro_f1": 0.32098764181137085, + "num_tokens": 1109314.0, + "repeat_count": 0.0, + "routers_loss": 0.05167485028505325, + "skip_count": 2.0, + "step": 688, + "text_loss": 0.27325430512428284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.2395068975638392, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009991447217560408, + "loss": 0.0521, + "macro_f1": 0.5492662787437439, + "num_tokens": 1112748.0, + "repeat_count": 0.0, + "routers_loss": 0.04621964320540428, + "skip_count": 2.0, + "step": 690, + "text_loss": 0.5288321375846863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.2488993249192837, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.000999126530156007, + "loss": 0.0499, + "macro_f1": 0.307692289352417, + "num_tokens": 1116965.0, + "repeat_count": 1.0, + "routers_loss": 0.11950276792049408, + "skip_count": 2.0, + "step": 692, + "text_loss": 0.14215624332427979 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2582917522747286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.0009991081472897454, + "loss": 0.0722, + "macro_f1": 0.3333333432674408, + "num_tokens": 1120570.0, + "repeat_count": 0.0, + "routers_loss": 0.01905500330030918, + "skip_count": 0.0, + "step": 694, + "text_loss": 0.41862696409225464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.267684179630173, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009990895731643002, + "loss": 0.0464, + "macro_f1": 0.3272727429866791, + "num_tokens": 1124009.0, + "repeat_count": 1.0, + "routers_loss": 0.06974572688341141, + "skip_count": 0.0, + "step": 696, + "text_loss": 0.41160130500793457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.277076606985618, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.000999070807786789, + "loss": 0.0531, + "macro_f1": 0.3272727429866791, + "num_tokens": 1127370.0, + "repeat_count": 1.0, + "routers_loss": 0.07055293023586273, + "skip_count": 0.0, + "step": 698, + "text_loss": 0.48068273067474365 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2864690343410627, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000999051851164403, + "loss": 0.0619, + "macro_f1": 0.32098764181137085, + "num_tokens": 1130234.0, + "repeat_count": 1.0, + "routers_loss": 0.12506946921348572, + "skip_count": 1.0, + "step": 700, + "text_loss": 0.47925490140914917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000999032703304406, + "loss": 0.0674, + "macro_f1": 0.3333333432674408, + "num_tokens": 1132874.0, + "repeat_count": 0.0, + "routers_loss": 0.00809287466108799, + "skip_count": 0.0, + "step": 702, + "text_loss": 0.47433632612228394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.305253889051952, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009990133642141358, + "loss": 0.0497, + "macro_f1": 0.5492662787437439, + "num_tokens": 1136011.0, + "repeat_count": 0.0, + "routers_loss": 0.0319170281291008, + "skip_count": 2.0, + "step": 704, + "text_loss": 0.6574832201004028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3146463164073965, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.33984375, + "learning_rate": 0.000998993833901003, + "loss": 0.0619, + "macro_f1": 0.32098764181137085, + "num_tokens": 1139674.0, + "repeat_count": 0.0, + "routers_loss": 0.09850362688302994, + "skip_count": 2.0, + "step": 706, + "text_loss": 0.7660127282142639 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3240387437628414, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009989741123724919, + "loss": 0.0574, + "macro_f1": 0.3333333432674408, + "num_tokens": 1143558.0, + "repeat_count": 0.0, + "routers_loss": 0.006673311349004507, + "skip_count": 0.0, + "step": 708, + "text_loss": 0.5976111888885498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009989541996361594, + "loss": 0.045, + "macro_f1": 0.3333333432674408, + "num_tokens": 1146122.0, + "repeat_count": 0.0, + "routers_loss": 0.004988791421055794, + "skip_count": 0.0, + "step": 710, + "text_loss": 0.5256119966506958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3428235984737307, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009989340956996367, + "loss": 0.0528, + "macro_f1": 0.3333333432674408, + "num_tokens": 1149546.0, + "repeat_count": 0.0, + "routers_loss": 0.0067769973538815975, + "skip_count": 0.0, + "step": 712, + "text_loss": 0.5040497779846191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.352216025829175, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.0009989138005706273, + "loss": 0.0735, + "macro_f1": 0.32098764181137085, + "num_tokens": 1153195.0, + "repeat_count": 0.0, + "routers_loss": 0.09899546951055527, + "skip_count": 2.0, + "step": 714, + "text_loss": 0.20803412795066833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.000998893314256908, + "loss": 0.064, + "macro_f1": 0.3333333432674408, + "num_tokens": 1157081.0, + "repeat_count": 0.0, + "routers_loss": 0.010492355562746525, + "skip_count": 0.0, + "step": 716, + "text_loss": 0.23077639937400818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3710008805400644, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009988726367663298, + "loss": 0.0539, + "macro_f1": 0.3333333432674408, + "num_tokens": 1160079.0, + "repeat_count": 0.0, + "routers_loss": 0.01063773687928915, + "skip_count": 0.0, + "step": 718, + "text_loss": 0.6085864901542664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3803933078955093, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009988517681068163, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1163249.0, + "repeat_count": 1.0, + "routers_loss": 0.05981874838471413, + "skip_count": 0.0, + "step": 720, + "text_loss": 0.4047050476074219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3897857352509537, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009988307082863638, + "loss": 0.0361, + "macro_f1": 0.3333333432674408, + "num_tokens": 1166259.0, + "repeat_count": 0.0, + "routers_loss": 0.009750043973326683, + "skip_count": 0.0, + "step": 722, + "text_loss": 0.5306474566459656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.3991781626063986, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.240234375, + "learning_rate": 0.0009988094573130434, + "loss": 0.063, + "macro_f1": 0.5359477400779724, + "num_tokens": 1168887.0, + "repeat_count": 2.0, + "routers_loss": 0.18601104617118835, + "skip_count": 2.0, + "step": 724, + "text_loss": 0.53528892993927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.408570589961843, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009987880151949974, + "loss": 0.0496, + "macro_f1": 0.3272727429866791, + "num_tokens": 1172625.0, + "repeat_count": 0.0, + "routers_loss": 0.02845010720193386, + "skip_count": 1.0, + "step": 726, + "text_loss": 0.4760453701019287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.417963017317288, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, + "learning_rate": 0.0009987663819404434, + "loss": 0.06, + "macro_f1": 0.5492662787437439, + "num_tokens": 1176580.0, + "repeat_count": 0.0, + "routers_loss": 0.017596980556845665, + "skip_count": 2.0, + "step": 728, + "text_loss": 0.5146099328994751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.427355444672733, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000998744557557671, + "loss": 0.0484, + "macro_f1": 0.3272727429866791, + "num_tokens": 1179804.0, + "repeat_count": 0.0, + "routers_loss": 0.0625474750995636, + "skip_count": 1.0, + "step": 730, + "text_loss": 0.27738022804260254 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.436747872028177, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.0009987225420550433, + "loss": 0.0796, + "macro_f1": 0.307692289352417, + "num_tokens": 1182658.0, + "repeat_count": 1.0, + "routers_loss": 0.16188351809978485, + "skip_count": 2.0, + "step": 732, + "text_loss": 0.23231445252895355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2001953125, + "learning_rate": 0.0009987003354409965, + "loss": 0.0626, + "macro_f1": 0.3272727429866791, + "num_tokens": 1185451.0, + "repeat_count": 0.0, + "routers_loss": 0.02391529455780983, + "skip_count": 0.0, + "step": 734, + "text_loss": 0.4496627151966095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.4555327267390665, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.234375, + "learning_rate": 0.0009986779377240405, + "loss": 0.0513, + "macro_f1": 0.32098767161369324, + "num_tokens": 1188666.0, + "repeat_count": 0.0, + "routers_loss": 0.08435963839292526, + "skip_count": 1.0, + "step": 736, + "text_loss": 0.4950787127017975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.4649251540945114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1220703125, + "learning_rate": 0.000998655348912758, + "loss": 0.0515, + "macro_f1": 0.3333333432674408, + "num_tokens": 1193035.0, + "repeat_count": 0.0, + "routers_loss": 0.01648722216486931, + "skip_count": 0.0, + "step": 738, + "text_loss": 0.24761848151683807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.0009986325690158051, + "loss": 0.0435, + "macro_f1": 0.3333333432674408, + "num_tokens": 1196840.0, + "repeat_count": 0.0, + "routers_loss": 0.013143910095095634, + "skip_count": 0.0, + "step": 740, + "text_loss": 0.15662719309329987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.4837100088054007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009986095980419113, + "loss": 0.0757, + "macro_f1": 0.3333333432674408, + "num_tokens": 1200573.0, + "repeat_count": 0.0, + "routers_loss": 0.026706280186772346, + "skip_count": 0.0, + "step": 742, + "text_loss": 0.16725164651870728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.493102436160845, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1982421875, + "learning_rate": 0.0009985864359998787, + "loss": 0.0795, + "macro_f1": 0.3006536364555359, + "num_tokens": 1203589.0, + "repeat_count": 2.0, + "routers_loss": 0.28607678413391113, + "skip_count": 3.0, + "step": 744, + "text_loss": 0.6350882053375244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009985630828985835, + "loss": 0.0572, + "macro_f1": 0.3272727429866791, + "num_tokens": 1206422.0, + "repeat_count": 0.0, + "routers_loss": 0.05685260891914368, + "skip_count": 1.0, + "step": 746, + "text_loss": 0.33779552578926086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.5118872908717345, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009985395387469742, + "loss": 0.0458, + "macro_f1": 0.5492662787437439, + "num_tokens": 1211588.0, + "repeat_count": 0.0, + "routers_loss": 0.0437830351293087, + "skip_count": 2.0, + "step": 748, + "text_loss": 0.28664472699165344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5212797182271793, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009985158035540735, + "loss": 0.0714, + "macro_f1": 0.32098764181137085, + "num_tokens": 1214580.0, + "repeat_count": 2.0, + "routers_loss": 0.07074898481369019, + "skip_count": 0.0, + "step": 750, + "text_loss": 0.3939313292503357 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.0009984918773289762, + "loss": 0.0699, + "macro_f1": 0.3333333432674408, + "num_tokens": 1217388.0, + "repeat_count": 0.0, + "routers_loss": 0.009757856838405132, + "skip_count": 0.0, + "step": 752, + "text_loss": 0.37641215324401855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5400645729380686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009984677600808512, + "loss": 0.054, + "macro_f1": 0.3333333432674408, + "num_tokens": 1219960.0, + "repeat_count": 0.0, + "routers_loss": 0.02515069581568241, + "skip_count": 0.0, + "step": 754, + "text_loss": 0.155938982963562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5494570002935135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30078125, + "learning_rate": 0.0009984434518189405, + "loss": 0.0764, + "macro_f1": 0.3333333432674408, + "num_tokens": 1223234.0, + "repeat_count": 0.0, + "routers_loss": 0.025766927748918533, + "skip_count": 0.0, + "step": 756, + "text_loss": 0.691118061542511 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 3.558849427648958, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009984189525525584, + "loss": 0.0451, + "macro_f1": 0.5359477400779724, + "num_tokens": 1225764.0, + "repeat_count": 2.0, + "routers_loss": 0.1782722771167755, + "skip_count": 2.0, + "step": 758, + "text_loss": 0.3592209219932556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.568241855004403, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.0009983942622910935, + "loss": 0.0659, + "macro_f1": 0.3333333432674408, + "num_tokens": 1230097.0, + "repeat_count": 0.0, + "routers_loss": 0.00825568474829197, + "skip_count": 0.0, + "step": 760, + "text_loss": 0.4646475315093994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5776342823598473, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009983693810440074, + "loss": 0.0477, + "macro_f1": 0.32098764181137085, + "num_tokens": 1233140.0, + "repeat_count": 0.0, + "routers_loss": 0.04156976938247681, + "skip_count": 2.0, + "step": 762, + "text_loss": 0.298682302236557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.587026709715292, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3515625, + "learning_rate": 0.000998344308820834, + "loss": 0.0666, + "macro_f1": 0.3272727429866791, + "num_tokens": 1236305.0, + "repeat_count": 0.0, + "routers_loss": 0.05697929114103317, + "skip_count": 1.0, + "step": 764, + "text_loss": 0.5249121189117432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5964191370707366, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.18359375, + "learning_rate": 0.0009983190456311817, + "loss": 0.0592, + "macro_f1": 0.3144654333591461, + "num_tokens": 1239673.0, + "repeat_count": 0.0, + "routers_loss": 0.09547408670186996, + "skip_count": 3.0, + "step": 766, + "text_loss": 0.41277334094047546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.6058115644261814, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.185546875, + "learning_rate": 0.000998293591484731, + "loss": 0.0484, + "macro_f1": 0.5492662787437439, + "num_tokens": 1242292.0, + "repeat_count": 0.0, + "routers_loss": 0.030693158507347107, + "skip_count": 2.0, + "step": 768, + "text_loss": 0.1583656519651413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000998267946391236, + "loss": 0.051, + "macro_f1": 0.3333333432674408, + "num_tokens": 1244661.0, + "repeat_count": 0.0, + "routers_loss": 0.01211300864815712, + "skip_count": 0.0, + "step": 770, + "text_loss": 0.4629349112510681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6245964191370708, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009982421103605238, + "loss": 0.0441, + "macro_f1": 0.32098764181137085, + "num_tokens": 1248688.0, + "repeat_count": 0.0, + "routers_loss": 0.0665968507528305, + "skip_count": 2.0, + "step": 772, + "text_loss": 0.4019293785095215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6339888464925156, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.000998216083402495, + "loss": 0.0613, + "macro_f1": 0.32098764181137085, + "num_tokens": 1251395.0, + "repeat_count": 0.0, + "routers_loss": 0.07186859846115112, + "skip_count": 2.0, + "step": 774, + "text_loss": 0.4659276604652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.302734375, + "learning_rate": 0.0009981898655271235, + "loss": 0.0488, + "macro_f1": 0.3333333432674408, + "num_tokens": 1254888.0, + "repeat_count": 0.0, + "routers_loss": 0.007823926396667957, + "skip_count": 0.0, + "step": 776, + "text_loss": 0.5160359740257263 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 3.6527737012034045, + "f1_execute": 0.9130434989929199, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009981634567444557, + "loss": 0.0775, + "macro_f1": 0.590062141418457, + "num_tokens": 1258250.0, + "repeat_count": 3.0, + "routers_loss": 0.24624499678611755, + "skip_count": 4.0, + "step": 778, + "text_loss": 0.29319918155670166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6621661285588494, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.0009981368570646115, + "loss": 0.0885, + "macro_f1": 0.3272727429866791, + "num_tokens": 1260916.0, + "repeat_count": 0.0, + "routers_loss": 0.030730176717042923, + "skip_count": 1.0, + "step": 780, + "text_loss": 0.624981164932251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6715585559142943, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009981100664977838, + "loss": 0.0699, + "macro_f1": 0.3333333432674408, + "num_tokens": 1264004.0, + "repeat_count": 0.0, + "routers_loss": 0.006829176563769579, + "skip_count": 0.0, + "step": 782, + "text_loss": 0.6137266159057617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6809509832697387, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009980830850542391, + "loss": 0.058, + "macro_f1": 0.3333333432674408, + "num_tokens": 1267130.0, + "repeat_count": 0.0, + "routers_loss": 0.018471000716090202, + "skip_count": 0.0, + "step": 784, + "text_loss": 0.15213175117969513 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6903434106251836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.0009980559127443166, + "loss": 0.052, + "macro_f1": 0.3333333432674408, + "num_tokens": 1271129.0, + "repeat_count": 0.0, + "routers_loss": 0.007903140969574451, + "skip_count": 0.0, + "step": 786, + "text_loss": 0.5768613219261169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.699735837980628, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.000998028549578429, + "loss": 0.0719, + "macro_f1": 0.307692289352417, + "num_tokens": 1274232.0, + "repeat_count": 0.0, + "routers_loss": 0.06737866252660751, + "skip_count": 3.0, + "step": 788, + "text_loss": 0.2877073585987091 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.709128265336073, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009980009955670615, + "loss": 0.0698, + "macro_f1": 0.3144654333591461, + "num_tokens": 1277193.0, + "repeat_count": 0.0, + "routers_loss": 0.10194934904575348, + "skip_count": 3.0, + "step": 790, + "text_loss": 0.11860492825508118 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7185206926915173, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.000997973250720773, + "loss": 0.0552, + "macro_f1": 0.32098764181137085, + "num_tokens": 1280960.0, + "repeat_count": 0.0, + "routers_loss": 0.10297708213329315, + "skip_count": 2.0, + "step": 792, + "text_loss": 0.13477706909179688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.727913120046962, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009979453150501954, + "loss": 0.0663, + "macro_f1": 0.32098764181137085, + "num_tokens": 1284611.0, + "repeat_count": 1.0, + "routers_loss": 0.06122037023305893, + "skip_count": 1.0, + "step": 794, + "text_loss": 0.40569379925727844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.737305547402407, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000997917188566034, + "loss": 0.062, + "macro_f1": 0.32098764181137085, + "num_tokens": 1287834.0, + "repeat_count": 0.0, + "routers_loss": 0.061135001480579376, + "skip_count": 2.0, + "step": 796, + "text_loss": 0.2829287648200989 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7466979747578515, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.109375, + "learning_rate": 0.0009978888712790664, + "loss": 0.0654, + "macro_f1": 0.3272727429866791, + "num_tokens": 1291666.0, + "repeat_count": 0.0, + "routers_loss": 0.04841872677206993, + "skip_count": 1.0, + "step": 798, + "text_loss": 1.011757254600525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.20000000298023224, + "avg_layers": 27.0, + "epoch": 3.756090402113296, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.3333333134651184, + "grad_norm": 0.14453125, + "learning_rate": 0.0009978603632001444, + "loss": 0.0636, + "macro_f1": 0.4104308485984802, + "num_tokens": 1294627.0, + "repeat_count": 1.0, + "routers_loss": 0.15698759257793427, + "skip_count": 5.0, + "step": 800, + "text_loss": 0.4457623362541199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0009978316643401916, + "loss": 0.0688, + "macro_f1": 0.3333333432674408, + "num_tokens": 1297711.0, + "repeat_count": 0.0, + "routers_loss": 0.018952010199427605, + "skip_count": 0.0, + "step": 802, + "text_loss": 0.2069481462240219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7748752568241857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.14453125, + "learning_rate": 0.0009978027747102062, + "loss": 0.0479, + "macro_f1": 0.3333333432674408, + "num_tokens": 1300569.0, + "repeat_count": 0.0, + "routers_loss": 0.014538386836647987, + "skip_count": 0.0, + "step": 804, + "text_loss": 0.4983852505683899 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.78426768417963, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2109375, + "learning_rate": 0.0009977736943212584, + "loss": 0.0721, + "macro_f1": 0.32098764181137085, + "num_tokens": 1303969.0, + "repeat_count": 0.0, + "routers_loss": 0.11164087057113647, + "skip_count": 2.0, + "step": 806, + "text_loss": 0.2910642921924591 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.793660111535075, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.000997744423184492, + "loss": 0.0424, + "macro_f1": 0.3272727429866791, + "num_tokens": 1307263.0, + "repeat_count": 0.0, + "routers_loss": 0.06073406711220741, + "skip_count": 1.0, + "step": 808, + "text_loss": 0.18831779062747955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 3.8030525388905194, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.26171875, + "learning_rate": 0.0009977149613111236, + "loss": 0.0486, + "macro_f1": 0.4400000274181366, + "num_tokens": 1309953.0, + "repeat_count": 1.0, + "routers_loss": 0.11035524308681488, + "skip_count": 4.0, + "step": 810, + "text_loss": 0.7872759699821472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8124449662459643, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009976853087124433, + "loss": 0.0536, + "macro_f1": 0.3333333432674408, + "num_tokens": 1313243.0, + "repeat_count": 0.0, + "routers_loss": 0.021804286167025566, + "skip_count": 0.0, + "step": 812, + "text_loss": 0.22349292039871216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.8218373936014087, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0009976554653998138, + "loss": 0.0612, + "macro_f1": 0.31446540355682373, + "num_tokens": 1316165.0, + "repeat_count": 0.0, + "routers_loss": 0.10715524107217789, + "skip_count": 2.0, + "step": 814, + "text_loss": 0.18035532534122467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8312298209568536, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000997625431384671, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1319206.0, + "repeat_count": 0.0, + "routers_loss": 0.007173649035394192, + "skip_count": 0.0, + "step": 816, + "text_loss": 0.48928648233413696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8406222483122985, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009975952066785243, + "loss": 0.0655, + "macro_f1": 0.3006536364555359, + "num_tokens": 1322549.0, + "repeat_count": 1.0, + "routers_loss": 0.22308112680912018, + "skip_count": 4.0, + "step": 818, + "text_loss": 0.5211259722709656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009975647912929557, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1325213.0, + "repeat_count": 0.0, + "routers_loss": 0.00998698640614748, + "skip_count": 0.0, + "step": 820, + "text_loss": 0.7117052674293518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8594071030231873, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009975341852396205, + "loss": 0.0723, + "macro_f1": 0.32098764181137085, + "num_tokens": 1328383.0, + "repeat_count": 0.0, + "routers_loss": 0.07454588264226913, + "skip_count": 2.0, + "step": 822, + "text_loss": 0.34539610147476196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8687995303786322, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009975033885302469, + "loss": 0.0604, + "macro_f1": 0.3333333432674408, + "num_tokens": 1331406.0, + "repeat_count": 0.0, + "routers_loss": 0.009157589636743069, + "skip_count": 0.0, + "step": 824, + "text_loss": 0.7484824657440186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.878191957734077, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009974724011766363, + "loss": 0.0474, + "macro_f1": 0.3272727429866791, + "num_tokens": 1334410.0, + "repeat_count": 1.0, + "routers_loss": 0.17149391770362854, + "skip_count": 0.0, + "step": 826, + "text_loss": 0.5913820266723633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8875843850895215, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009974412231906632, + "loss": 0.058, + "macro_f1": 0.32098764181137085, + "num_tokens": 1337653.0, + "repeat_count": 1.0, + "routers_loss": 0.09743282198905945, + "skip_count": 1.0, + "step": 828, + "text_loss": 0.2505693733692169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8969768124449664, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009974098545842748, + "loss": 0.0638, + "macro_f1": 0.3272727429866791, + "num_tokens": 1340860.0, + "repeat_count": 0.0, + "routers_loss": 0.041490405797958374, + "skip_count": 1.0, + "step": 830, + "text_loss": 0.5585370063781738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.906369239800411, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009973782953694918, + "loss": 0.0746, + "macro_f1": 0.3006536066532135, + "num_tokens": 1344232.0, + "repeat_count": 1.0, + "routers_loss": 0.16080693900585175, + "skip_count": 3.0, + "step": 832, + "text_loss": 0.4782734513282776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9157616671558557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.000997346545558408, + "loss": 0.0522, + "macro_f1": 0.3333333432674408, + "num_tokens": 1347667.0, + "repeat_count": 0.0, + "routers_loss": 0.01173500344157219, + "skip_count": 0.0, + "step": 834, + "text_loss": 0.25036177039146423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009973146051631895, + "loss": 0.0522, + "macro_f1": 0.3333333432674408, + "num_tokens": 1350707.0, + "repeat_count": 0.0, + "routers_loss": 0.011477196589112282, + "skip_count": 0.0, + "step": 836, + "text_loss": 0.5482863187789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009972824741960764, + "loss": 0.0536, + "macro_f1": 0.3333333432674408, + "num_tokens": 1353704.0, + "repeat_count": 0.0, + "routers_loss": 0.010528896935284138, + "skip_count": 0.0, + "step": 838, + "text_loss": 0.6732596158981323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9439389492221895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1181640625, + "learning_rate": 0.000997250152669381, + "loss": 0.0573, + "macro_f1": 0.3333333432674408, + "num_tokens": 1356608.0, + "repeat_count": 0.0, + "routers_loss": 0.010678744874894619, + "skip_count": 0.0, + "step": 840, + "text_loss": 0.5479338765144348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9533313765776343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.181640625, + "learning_rate": 0.000997217640595489, + "loss": 0.0631, + "macro_f1": 0.3333333432674408, + "num_tokens": 1359809.0, + "repeat_count": 0.0, + "routers_loss": 0.00835978239774704, + "skip_count": 0.0, + "step": 842, + "text_loss": 0.42543259263038635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9627238039330788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009971849379868593, + "loss": 0.0653, + "macro_f1": 0.3333333432674408, + "num_tokens": 1362201.0, + "repeat_count": 0.0, + "routers_loss": 0.009930923581123352, + "skip_count": 0.0, + "step": 844, + "text_loss": 0.720462441444397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9721162312885236, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009971520448560235, + "loss": 0.0615, + "macro_f1": 0.3272727429866791, + "num_tokens": 1365790.0, + "repeat_count": 0.0, + "routers_loss": 0.06344373524188995, + "skip_count": 1.0, + "step": 846, + "text_loss": 0.8423607349395752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 3.9815086586439685, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.16796875, + "learning_rate": 0.000997118961215586, + "loss": 0.0674, + "macro_f1": 0.4533333480358124, + "num_tokens": 1368387.0, + "repeat_count": 1.0, + "routers_loss": 0.14688406884670258, + "skip_count": 3.0, + "step": 848, + "text_loss": 0.3933577537536621 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000997085687078225, + "loss": 0.0518, + "macro_f1": 0.3333333432674408, + "num_tokens": 1371189.0, + "repeat_count": 0.0, + "routers_loss": 0.009953443892300129, + "skip_count": 0.0, + "step": 850, + "text_loss": 0.41469162702560425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.0, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009970522224566909, + "loss": 0.0555, + "macro_f1": 0.32098767161369324, + "num_tokens": 1374008.0, + "repeat_count": 0.0, + "routers_loss": 0.048870690166950226, + "skip_count": 1.0, + "step": 852, + "text_loss": 0.613615870475769 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.009392427355444, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0009970185673638075, + "loss": 0.0629, + "macro_f1": 0.32098764181137085, + "num_tokens": 1376662.0, + "repeat_count": 1.0, + "routers_loss": 0.06865929812192917, + "skip_count": 1.0, + "step": 854, + "text_loss": 0.4392736256122589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 4.01878485471089, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.162109375, + "learning_rate": 0.0009969847218124716, + "loss": 0.0506, + "macro_f1": 0.5492662787437439, + "num_tokens": 1380049.0, + "repeat_count": 0.0, + "routers_loss": 0.02382219396531582, + "skip_count": 1.0, + "step": 856, + "text_loss": 0.19115346670150757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.028177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009969506858156527, + "loss": 0.0344, + "macro_f1": 0.3272727429866791, + "num_tokens": 1383008.0, + "repeat_count": 0.0, + "routers_loss": 0.03907281160354614, + "skip_count": 1.0, + "step": 858, + "text_loss": 0.34842637181282043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.037569709421779, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12060546875, + "learning_rate": 0.0009969164593863935, + "loss": 0.0365, + "macro_f1": 0.3333333432674408, + "num_tokens": 1387051.0, + "repeat_count": 0.0, + "routers_loss": 0.007645803038030863, + "skip_count": 0.0, + "step": 860, + "text_loss": 0.3810436725616455 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.046962136777223, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009968820425378098, + "loss": 0.0463, + "macro_f1": 0.3272727429866791, + "num_tokens": 1390244.0, + "repeat_count": 1.0, + "routers_loss": 0.04435238987207413, + "skip_count": 0.0, + "step": 862, + "text_loss": 0.34853485226631165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.056354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28515625, + "learning_rate": 0.00099684743528309, + "loss": 0.0424, + "macro_f1": 0.3333333432674408, + "num_tokens": 1392976.0, + "repeat_count": 0.0, + "routers_loss": 0.006071661598980427, + "skip_count": 0.0, + "step": 864, + "text_loss": 0.6395178437232971 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.065746991488113, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009968126376354958, + "loss": 0.0477, + "macro_f1": 0.5492662787437439, + "num_tokens": 1396061.0, + "repeat_count": 0.0, + "routers_loss": 0.05011235550045967, + "skip_count": 2.0, + "step": 866, + "text_loss": 0.09103966504335403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.075139418843557, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009967776496083616, + "loss": 0.0509, + "macro_f1": 0.3272727429866791, + "num_tokens": 1398993.0, + "repeat_count": 1.0, + "routers_loss": 0.03979124873876572, + "skip_count": 0.0, + "step": 868, + "text_loss": 0.27257058024406433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.084531846199002, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.14453125, + "learning_rate": 0.000996742471215095, + "loss": 0.0516, + "macro_f1": 0.5492662787437439, + "num_tokens": 1402080.0, + "repeat_count": 0.0, + "routers_loss": 0.030823837965726852, + "skip_count": 2.0, + "step": 870, + "text_loss": 0.7047103047370911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.093924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009967071024691763, + "loss": 0.0461, + "macro_f1": 0.3333333432674408, + "num_tokens": 1404890.0, + "repeat_count": 0.0, + "routers_loss": 0.009721715934574604, + "skip_count": 0.0, + "step": 872, + "text_loss": 0.959106981754303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.000996671543384159, + "loss": 0.05, + "macro_f1": 0.3333333432674408, + "num_tokens": 1407853.0, + "repeat_count": 0.0, + "routers_loss": 0.006025883834809065, + "skip_count": 0.0, + "step": 874, + "text_loss": 0.47571972012519836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.112709128265336, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.0009966357939736692, + "loss": 0.0416, + "macro_f1": 0.3272727429866791, + "num_tokens": 1410723.0, + "repeat_count": 0.0, + "routers_loss": 0.025964925065636635, + "skip_count": 0.0, + "step": 876, + "text_loss": 0.4964611530303955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.122101555620781, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009965998542514065, + "loss": 0.0415, + "macro_f1": 0.32098764181137085, + "num_tokens": 1414008.0, + "repeat_count": 0.0, + "routers_loss": 0.09509637206792831, + "skip_count": 2.0, + "step": 878, + "text_loss": 0.621494710445404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 4.131493982976226, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009965637242311427, + "loss": 0.0472, + "macro_f1": 0.542222261428833, + "num_tokens": 1417447.0, + "repeat_count": 0.0, + "routers_loss": 0.02520318515598774, + "skip_count": 4.0, + "step": 880, + "text_loss": 0.40209758281707764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 25.0, + "epoch": 4.14088641033167, + "f1_execute": 0.936170220375061, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.263671875, + "learning_rate": 0.000996527403926723, + "loss": 0.0495, + "macro_f1": 0.5342789888381958, + "num_tokens": 1419905.0, + "repeat_count": 0.0, + "routers_loss": 0.13183781504631042, + "skip_count": 6.0, + "step": 882, + "text_loss": 0.642185389995575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.1502788376871145, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009964908933520655, + "loss": 0.0375, + "macro_f1": 0.3333333432674408, + "num_tokens": 1423436.0, + "repeat_count": 0.0, + "routers_loss": 0.009429510682821274, + "skip_count": 0.0, + "step": 884, + "text_loss": 0.48232755064964294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.15967126504256, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1669921875, + "learning_rate": 0.0009964541925211613, + "loss": 0.0349, + "macro_f1": 0.32098764181137085, + "num_tokens": 1426842.0, + "repeat_count": 0.0, + "routers_loss": 0.07629609107971191, + "skip_count": 2.0, + "step": 886, + "text_loss": 0.16620934009552002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.169063692398004, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009964173014480738, + "loss": 0.0348, + "macro_f1": 0.5492662787437439, + "num_tokens": 1430430.0, + "repeat_count": 0.0, + "routers_loss": 0.036814019083976746, + "skip_count": 2.0, + "step": 888, + "text_loss": 0.4866008758544922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009963802201469398, + "loss": 0.0476, + "macro_f1": 0.3333333432674408, + "num_tokens": 1433821.0, + "repeat_count": 0.0, + "routers_loss": 0.0041250260546803474, + "skip_count": 0.0, + "step": 890, + "text_loss": 0.578216552734375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.187848547108893, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2373046875, + "learning_rate": 0.0009963429486319693, + "loss": 0.0463, + "macro_f1": 0.32098764181137085, + "num_tokens": 1436976.0, + "repeat_count": 0.0, + "routers_loss": 0.06213559955358505, + "skip_count": 2.0, + "step": 892, + "text_loss": 0.221701517701149 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 4.197240974464338, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.361328125, + "learning_rate": 0.0009963054869174446, + "loss": 0.0313, + "macro_f1": 0.4871794879436493, + "num_tokens": 1440397.0, + "repeat_count": 0.0, + "routers_loss": 0.07532428950071335, + "skip_count": 2.0, + "step": 894, + "text_loss": 0.6922838091850281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.206633401819783, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009962678350177209, + "loss": 0.0472, + "macro_f1": 0.3272727429866791, + "num_tokens": 1443604.0, + "repeat_count": 0.0, + "routers_loss": 0.0419243648648262, + "skip_count": 1.0, + "step": 896, + "text_loss": 0.22092342376708984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.216025829175227, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009962299929472268, + "loss": 0.034, + "macro_f1": 0.32098764181137085, + "num_tokens": 1446257.0, + "repeat_count": 2.0, + "routers_loss": 0.10849297791719437, + "skip_count": 0.0, + "step": 898, + "text_loss": 0.26394811272621155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.000996191960720463, + "loss": 0.0394, + "macro_f1": 0.3333333432674408, + "num_tokens": 1449669.0, + "repeat_count": 0.0, + "routers_loss": 0.0092767970636487, + "skip_count": 0.0, + "step": 900, + "text_loss": 0.5338577628135681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.234810683886117, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009961537383520042, + "loss": 0.0354, + "macro_f1": 0.3272727429866791, + "num_tokens": 1452450.0, + "repeat_count": 1.0, + "routers_loss": 0.02985367365181446, + "skip_count": 0.0, + "step": 902, + "text_loss": 0.5875228047370911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.2442031112415615, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009961153258564966, + "loss": 0.0378, + "macro_f1": 0.3144654333591461, + "num_tokens": 1456909.0, + "repeat_count": 0.0, + "routers_loss": 0.06794842332601547, + "skip_count": 3.0, + "step": 904, + "text_loss": 0.40959444642066956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.253595538597006, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009960767232486604, + "loss": 0.0476, + "macro_f1": 0.3333333432674408, + "num_tokens": 1461712.0, + "repeat_count": 0.0, + "routers_loss": 0.0023562447167932987, + "skip_count": 0.0, + "step": 906, + "text_loss": 0.3932875096797943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.000996037930543288, + "loss": 0.0505, + "macro_f1": 0.3272727429866791, + "num_tokens": 1464817.0, + "repeat_count": 0.0, + "routers_loss": 0.03880339860916138, + "skip_count": 1.0, + "step": 908, + "text_loss": 0.17482402920722961 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.272380393307896, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2119140625, + "learning_rate": 0.000995998947755245, + "loss": 0.0479, + "macro_f1": 0.3272727429866791, + "num_tokens": 1467810.0, + "repeat_count": 0.0, + "routers_loss": 0.01736828312277794, + "skip_count": 1.0, + "step": 910, + "text_loss": 0.4140470325946808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009959597748994695, + "loss": 0.0752, + "macro_f1": 0.3333333432674408, + "num_tokens": 1470802.0, + "repeat_count": 0.0, + "routers_loss": 0.011824851855635643, + "skip_count": 0.0, + "step": 912, + "text_loss": 0.7153383493423462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.2911652480187845, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009959204119909726, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1474539.0, + "repeat_count": 0.0, + "routers_loss": 0.025456594303250313, + "skip_count": 0.0, + "step": 914, + "text_loss": 0.42812058329582214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009958808590448385, + "loss": 0.0489, + "macro_f1": 0.3333333432674408, + "num_tokens": 1477552.0, + "repeat_count": 0.0, + "routers_loss": 0.006795851048082113, + "skip_count": 0.0, + "step": 916, + "text_loss": 0.5402814149856567 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009958411160762234, + "loss": 0.039, + "macro_f1": 0.3333333432674408, + "num_tokens": 1482547.0, + "repeat_count": 0.0, + "routers_loss": 0.015615932643413544, + "skip_count": 0.0, + "step": 918, + "text_loss": 0.3836168050765991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.319342530085119, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009958011831003577, + "loss": 0.0448, + "macro_f1": 0.3272727429866791, + "num_tokens": 1485807.0, + "repeat_count": 0.0, + "routers_loss": 0.043541423976421356, + "skip_count": 1.0, + "step": 920, + "text_loss": 0.4333936274051666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 4.328734957440563, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1337890625, + "learning_rate": 0.000995761060132543, + "loss": 0.0418, + "macro_f1": 0.6538461446762085, + "num_tokens": 1488941.0, + "repeat_count": 1.0, + "routers_loss": 0.05866432189941406, + "skip_count": 2.0, + "step": 922, + "text_loss": 0.4106994867324829 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.3381273847960085, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009957207471881552, + "loss": 0.0531, + "macro_f1": 0.5492662787437439, + "num_tokens": 1492026.0, + "repeat_count": 0.0, + "routers_loss": 0.02714901603758335, + "skip_count": 2.0, + "step": 924, + "text_loss": 0.542091429233551 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.347519812151453, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.0009956802442826415, + "loss": 0.0386, + "macro_f1": 0.3272727429866791, + "num_tokens": 1494543.0, + "repeat_count": 1.0, + "routers_loss": 0.0563737191259861, + "skip_count": 0.0, + "step": 926, + "text_loss": 0.47209203243255615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.356912239506897, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009956395514315235, + "loss": 0.0496, + "macro_f1": 0.3272727429866791, + "num_tokens": 1497831.0, + "repeat_count": 1.0, + "routers_loss": 0.03285066783428192, + "skip_count": 0.0, + "step": 928, + "text_loss": 0.6628931164741516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.366304666862343, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009955986686503943, + "loss": 0.0466, + "macro_f1": 0.3272727429866791, + "num_tokens": 1501375.0, + "repeat_count": 0.0, + "routers_loss": 0.024297121912240982, + "skip_count": 1.0, + "step": 930, + "text_loss": 0.495676189661026 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 4.375697094217787, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009955575959549202, + "loss": 0.0424, + "macro_f1": 0.7795917987823486, + "num_tokens": 1504363.0, + "repeat_count": 1.0, + "routers_loss": 0.12196464836597443, + "skip_count": 4.0, + "step": 932, + "text_loss": 0.26123273372650146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.0009955163333608408, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 1507178.0, + "repeat_count": 0.0, + "routers_loss": 0.012947078794240952, + "skip_count": 0.0, + "step": 934, + "text_loss": 0.32552677392959595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.394481948928676, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009954748808839674, + "loss": 0.0379, + "macro_f1": 0.3333333432674408, + "num_tokens": 1509910.0, + "repeat_count": 0.0, + "routers_loss": 0.008946365676820278, + "skip_count": 0.0, + "step": 936, + "text_loss": 0.533141016960144 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.403874376284121, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000995433238540185, + "loss": 0.0466, + "macro_f1": 0.6538461446762085, + "num_tokens": 1512826.0, + "repeat_count": 1.0, + "routers_loss": 0.029975678771734238, + "skip_count": 1.0, + "step": 938, + "text_loss": 0.2953577935695648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.413266803639566, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009953914063454512, + "loss": 0.0497, + "macro_f1": 0.3144654333591461, + "num_tokens": 1517230.0, + "repeat_count": 1.0, + "routers_loss": 0.0889134630560875, + "skip_count": 2.0, + "step": 940, + "text_loss": 0.5368834733963013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.000995349384315796, + "loss": 0.0413, + "macro_f1": 0.3333333432674408, + "num_tokens": 1519876.0, + "repeat_count": 0.0, + "routers_loss": 0.013458753935992718, + "skip_count": 0.0, + "step": 942, + "text_loss": 0.2005518227815628 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.432051658350455, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.000995307172467322, + "loss": 0.0444, + "macro_f1": 0.31446540355682373, + "num_tokens": 1522998.0, + "repeat_count": 1.0, + "routers_loss": 0.08850377053022385, + "skip_count": 1.0, + "step": 944, + "text_loss": 0.227926567196846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.4414440857059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009952647708162054, + "loss": 0.0503, + "macro_f1": 0.3272727429866791, + "num_tokens": 1527100.0, + "repeat_count": 0.0, + "routers_loss": 0.03199794515967369, + "skip_count": 1.0, + "step": 946, + "text_loss": 0.4859686493873596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.450836513061344, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009952221793786942, + "loss": 0.0354, + "macro_f1": 0.3333333432674408, + "num_tokens": 1530028.0, + "repeat_count": 0.0, + "routers_loss": 0.006507779937237501, + "skip_count": 0.0, + "step": 948, + "text_loss": 0.6855354905128479 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.460228940416789, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009951793981711097, + "loss": 0.0584, + "macro_f1": 0.6538461446762085, + "num_tokens": 1533254.0, + "repeat_count": 1.0, + "routers_loss": 0.06175103038549423, + "skip_count": 1.0, + "step": 950, + "text_loss": 0.7590400576591492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.469621367772234, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009951364272098458, + "loss": 0.0295, + "macro_f1": 0.5492662787437439, + "num_tokens": 1536239.0, + "repeat_count": 0.0, + "routers_loss": 0.03773383051156998, + "skip_count": 2.0, + "step": 952, + "text_loss": 0.669784665107727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.4790137951276785, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009950932665113688, + "loss": 0.0507, + "macro_f1": 0.32098764181137085, + "num_tokens": 1539682.0, + "repeat_count": 0.0, + "routers_loss": 0.07280613481998444, + "skip_count": 2.0, + "step": 954, + "text_loss": 0.3365570902824402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009950499160922184, + "loss": 0.0541, + "macro_f1": 0.3333333432674408, + "num_tokens": 1542875.0, + "repeat_count": 0.0, + "routers_loss": 0.01770266517996788, + "skip_count": 0.0, + "step": 956, + "text_loss": 0.0921545997262001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.497798649838567, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09375, + "learning_rate": 0.000995006375969006, + "loss": 0.0473, + "macro_f1": 0.3272727429866791, + "num_tokens": 1547135.0, + "repeat_count": 1.0, + "routers_loss": 0.07672002166509628, + "skip_count": 0.0, + "step": 958, + "text_loss": 0.5887606739997864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.507191077194013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009949626461584165, + "loss": 0.043, + "macro_f1": 0.3333333432674408, + "num_tokens": 1550100.0, + "repeat_count": 0.0, + "routers_loss": 0.006247182376682758, + "skip_count": 0.0, + "step": 960, + "text_loss": 0.5777931213378906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.119140625, + "learning_rate": 0.0009949187266772076, + "loss": 0.0366, + "macro_f1": 0.5492662787437439, + "num_tokens": 1553192.0, + "repeat_count": 0.0, + "routers_loss": 0.030319908633828163, + "skip_count": 2.0, + "step": 962, + "text_loss": 0.2370252162218094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.5259759319049016, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009948746175422088, + "loss": 0.0511, + "macro_f1": 0.3333333432674408, + "num_tokens": 1556318.0, + "repeat_count": 0.0, + "routers_loss": 0.006004320923238993, + "skip_count": 0.0, + "step": 964, + "text_loss": 0.6271032094955444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000994830318770323, + "loss": 0.0514, + "macro_f1": 0.3333333432674408, + "num_tokens": 1559195.0, + "repeat_count": 0.0, + "routers_loss": 0.011544366367161274, + "skip_count": 0.0, + "step": 966, + "text_loss": 0.47256720066070557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 4.544760786615791, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009947858303785255, + "loss": 0.0374, + "macro_f1": 0.6603773832321167, + "num_tokens": 1561813.0, + "repeat_count": 1.0, + "routers_loss": 0.05258861929178238, + "skip_count": 1.0, + "step": 968, + "text_loss": 0.7703132629394531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.554153213971236, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.0009947411523838648, + "loss": 0.0453, + "macro_f1": 0.3333333432674408, + "num_tokens": 1564634.0, + "repeat_count": 0.0, + "routers_loss": 0.011216280050575733, + "skip_count": 0.0, + "step": 970, + "text_loss": 0.4666804075241089 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009946962848034608, + "loss": 0.0696, + "macro_f1": 0.3333333432674408, + "num_tokens": 1567959.0, + "repeat_count": 0.0, + "routers_loss": 0.009387624450027943, + "skip_count": 0.0, + "step": 972, + "text_loss": 0.4067264199256897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.5729380686821255, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.0009946512276545075, + "loss": 0.0397, + "macro_f1": 0.3272727429866791, + "num_tokens": 1571221.0, + "repeat_count": 1.0, + "routers_loss": 0.041713520884513855, + "skip_count": 0.0, + "step": 974, + "text_loss": 0.5242366194725037 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 4.58233049603757, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.228515625, + "learning_rate": 0.0009946059809542705, + "loss": 0.0487, + "macro_f1": 0.7644445300102234, + "num_tokens": 1575033.0, + "repeat_count": 2.0, + "routers_loss": 0.05748331546783447, + "skip_count": 2.0, + "step": 976, + "text_loss": 0.5704690217971802 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 4.591722923393014, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009945605447200887, + "loss": 0.0445, + "macro_f1": 0.3272727429866791, + "num_tokens": 1579050.0, + "repeat_count": 0.0, + "routers_loss": 0.016765203326940536, + "skip_count": 0.0, + "step": 978, + "text_loss": 0.4804173707962036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.601115350748459, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009945149189693732, + "loss": 0.0406, + "macro_f1": 0.5492662787437439, + "num_tokens": 1582967.0, + "repeat_count": 0.0, + "routers_loss": 0.021518222987651825, + "skip_count": 2.0, + "step": 980, + "text_loss": 0.4138598144054413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.610507778103904, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009944691037196078, + "loss": 0.0456, + "macro_f1": 0.3333333432674408, + "num_tokens": 1586282.0, + "repeat_count": 0.0, + "routers_loss": 0.012246460653841496, + "skip_count": 0.0, + "step": 982, + "text_loss": 0.22561736404895782 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 4.6199002054593485, + "f1_execute": 0.930232584476471, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.8000000715255737, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009944230989883491, + "loss": 0.0456, + "macro_f1": 0.7989664077758789, + "num_tokens": 1589279.0, + "repeat_count": 2.0, + "routers_loss": 0.09344895929098129, + "skip_count": 5.0, + "step": 984, + "text_loss": 0.4416656494140625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.629292632814793, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.111328125, + "learning_rate": 0.0009943769047932264, + "loss": 0.0404, + "macro_f1": 0.5359477400779724, + "num_tokens": 1592398.0, + "repeat_count": 2.0, + "routers_loss": 0.08916857838630676, + "skip_count": 2.0, + "step": 986, + "text_loss": 0.5536438822746277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.638685060170237, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000994330521151941, + "loss": 0.039, + "macro_f1": 0.32098764181137085, + "num_tokens": 1596213.0, + "repeat_count": 1.0, + "routers_loss": 0.06114347651600838, + "skip_count": 1.0, + "step": 988, + "text_loss": 0.5835405588150024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.000994283948082267, + "loss": 0.0573, + "macro_f1": 0.3333333432674408, + "num_tokens": 1598827.0, + "repeat_count": 0.0, + "routers_loss": 0.0017335431184619665, + "skip_count": 0.0, + "step": 990, + "text_loss": 0.5857380032539368 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.657469914881127, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009942371856020522, + "loss": 0.0341, + "macro_f1": 0.3333333432674408, + "num_tokens": 1602915.0, + "repeat_count": 0.0, + "routers_loss": 0.014606470242142677, + "skip_count": 0.0, + "step": 992, + "text_loss": 0.6939892768859863 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 4.666862342236572, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009941902337292155, + "loss": 0.06, + "macro_f1": 0.6598639488220215, + "num_tokens": 1605776.0, + "repeat_count": 3.0, + "routers_loss": 0.06297315657138824, + "skip_count": 1.0, + "step": 994, + "text_loss": 0.37616831064224243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.676254769592017, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009941430924817487, + "loss": 0.0572, + "macro_f1": 0.5492662787437439, + "num_tokens": 1609856.0, + "repeat_count": 0.0, + "routers_loss": 0.03297794610261917, + "skip_count": 2.0, + "step": 996, + "text_loss": 0.2098303586244583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.685647196947461, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.000994095761877717, + "loss": 0.0499, + "macro_f1": 0.3333333432674408, + "num_tokens": 1612904.0, + "repeat_count": 0.0, + "routers_loss": 0.012901155278086662, + "skip_count": 0.0, + "step": 998, + "text_loss": 0.20103533565998077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.259765625, + "learning_rate": 0.000994048241935257, + "loss": 0.0535, + "macro_f1": 0.3272727429866791, + "num_tokens": 1615540.0, + "repeat_count": 0.0, + "routers_loss": 0.020434845238924026, + "skip_count": 0.0, + "step": 1000, + "text_loss": 0.32709044218063354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.70443205165835, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1669921875, + "learning_rate": 0.0009940005326725789, + "loss": 0.0453, + "macro_f1": 0.32098764181137085, + "num_tokens": 1618786.0, + "repeat_count": 0.0, + "routers_loss": 0.07831378281116486, + "skip_count": 2.0, + "step": 1002, + "text_loss": 0.5789632797241211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21875, + "learning_rate": 0.0009939526341079647, + "loss": 0.0511, + "macro_f1": 0.32098764181137085, + "num_tokens": 1621736.0, + "repeat_count": 2.0, + "routers_loss": 0.04863874986767769, + "skip_count": 0.0, + "step": 1004, + "text_loss": 0.6128849387168884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009939045462597693, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 1624649.0, + "repeat_count": 0.0, + "routers_loss": 0.00677989237010479, + "skip_count": 0.0, + "step": 1006, + "text_loss": 0.6168264150619507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.732609333724684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009938562691464202, + "loss": 0.0524, + "macro_f1": 0.3333333432674408, + "num_tokens": 1627700.0, + "repeat_count": 0.0, + "routers_loss": 0.019490402191877365, + "skip_count": 0.0, + "step": 1008, + "text_loss": 0.17463822662830353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.000993807802786417, + "loss": 0.0475, + "macro_f1": 0.3333333432674408, + "num_tokens": 1630714.0, + "repeat_count": 0.0, + "routers_loss": 0.0019022391643375158, + "skip_count": 0.0, + "step": 1010, + "text_loss": 0.5675593018531799 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 4.751394188435574, + "f1_execute": 0.9599999785423279, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1640625, + "learning_rate": 0.0009937591471983322, + "loss": 0.0501, + "macro_f1": 0.7644444704055786, + "num_tokens": 1633770.0, + "repeat_count": 1.0, + "routers_loss": 0.042485643178224564, + "skip_count": 2.0, + "step": 1012, + "text_loss": 0.42387229204177856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.760786615791019, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009937103024008109, + "loss": 0.0545, + "macro_f1": 0.3272727429866791, + "num_tokens": 1637120.0, + "repeat_count": 0.0, + "routers_loss": 0.09427817165851593, + "skip_count": 1.0, + "step": 1014, + "text_loss": 0.49511051177978516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009936612684125702, + "loss": 0.0503, + "macro_f1": 0.3333333432674408, + "num_tokens": 1640165.0, + "repeat_count": 0.0, + "routers_loss": 0.005106127820909023, + "skip_count": 0.0, + "step": 1016, + "text_loss": 0.5398799180984497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.7795714705019074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2734375, + "learning_rate": 0.0009936120452524004, + "loss": 0.0506, + "macro_f1": 0.3333333432674408, + "num_tokens": 1643251.0, + "repeat_count": 0.0, + "routers_loss": 0.016914300620555878, + "skip_count": 0.0, + "step": 1018, + "text_loss": 0.20882178843021393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.788963897857353, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009935626329391637, + "loss": 0.0537, + "macro_f1": 0.32098764181137085, + "num_tokens": 1646560.0, + "repeat_count": 0.0, + "routers_loss": 0.13481520116329193, + "skip_count": 2.0, + "step": 1020, + "text_loss": 0.5719883441925049 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.798356325212797, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009935130314917948, + "loss": 0.0602, + "macro_f1": 0.5492662787437439, + "num_tokens": 1649538.0, + "repeat_count": 0.0, + "routers_loss": 0.07700438797473907, + "skip_count": 2.0, + "step": 1022, + "text_loss": 0.1303367167711258 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.807748752568242, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009934632409293015, + "loss": 0.0611, + "macro_f1": 0.32098764181137085, + "num_tokens": 1652397.0, + "repeat_count": 1.0, + "routers_loss": 0.11416907608509064, + "skip_count": 1.0, + "step": 1024, + "text_loss": 0.24076920747756958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.817141179923686, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.0009934132612707631, + "loss": 0.0507, + "macro_f1": 0.31446540355682373, + "num_tokens": 1654938.0, + "repeat_count": 0.0, + "routers_loss": 0.09484589844942093, + "skip_count": 2.0, + "step": 1026, + "text_loss": 0.1652517318725586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009933630925353324, + "loss": 0.0395, + "macro_f1": 0.3333333432674408, + "num_tokens": 1658536.0, + "repeat_count": 0.0, + "routers_loss": 0.00741987070068717, + "skip_count": 0.0, + "step": 1028, + "text_loss": 0.49296700954437256 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.835926034634576, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1845703125, + "learning_rate": 0.0009933127347422337, + "loss": 0.0602, + "macro_f1": 0.32098764181137085, + "num_tokens": 1661446.0, + "repeat_count": 0.0, + "routers_loss": 0.08399344235658646, + "skip_count": 2.0, + "step": 1030, + "text_loss": 0.22363591194152832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.158203125, + "learning_rate": 0.0009932621879107648, + "loss": 0.0475, + "macro_f1": 0.3333333432674408, + "num_tokens": 1664612.0, + "repeat_count": 0.0, + "routers_loss": 0.0031781597062945366, + "skip_count": 0.0, + "step": 1032, + "text_loss": 0.36083245277404785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.854710889345466, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.000993211452060295, + "loss": 0.042, + "macro_f1": 0.3272727429866791, + "num_tokens": 1667467.0, + "repeat_count": 0.0, + "routers_loss": 0.03595469892024994, + "skip_count": 1.0, + "step": 1034, + "text_loss": 0.16372856497764587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.86410331670091, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000993160527210266, + "loss": 0.061, + "macro_f1": 0.3144654333591461, + "num_tokens": 1670675.0, + "repeat_count": 3.0, + "routers_loss": 0.1597205102443695, + "skip_count": 0.0, + "step": 1036, + "text_loss": 0.6049913763999939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2197265625, + "learning_rate": 0.000993109413380193, + "loss": 0.0562, + "macro_f1": 0.3333333432674408, + "num_tokens": 1673477.0, + "repeat_count": 0.0, + "routers_loss": 0.009756010957062244, + "skip_count": 0.0, + "step": 1038, + "text_loss": 0.7034620642662048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.882888171411799, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.0009930581105896624, + "loss": 0.0559, + "macro_f1": 0.3272727429866791, + "num_tokens": 1676809.0, + "repeat_count": 0.0, + "routers_loss": 0.020718922838568687, + "skip_count": 0.0, + "step": 1040, + "text_loss": 0.2814720571041107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.892280598767244, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009930066188583338, + "loss": 0.0445, + "macro_f1": 0.32098764181137085, + "num_tokens": 1679398.0, + "repeat_count": 1.0, + "routers_loss": 0.04755603149533272, + "skip_count": 1.0, + "step": 1042, + "text_loss": 0.5445759296417236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.0009929549382059388, + "loss": 0.0509, + "macro_f1": 0.3333333432674408, + "num_tokens": 1682269.0, + "repeat_count": 0.0, + "routers_loss": 0.01040949858725071, + "skip_count": 0.0, + "step": 1044, + "text_loss": 0.2876914143562317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.911065453478133, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009929030686522816, + "loss": 0.0363, + "macro_f1": 0.3333333432674408, + "num_tokens": 1685428.0, + "repeat_count": 0.0, + "routers_loss": 0.008158888667821884, + "skip_count": 0.0, + "step": 1046, + "text_loss": 0.49053525924682617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.9204578808335775, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009928510102172386, + "loss": 0.0498, + "macro_f1": 0.3333333432674408, + "num_tokens": 1688252.0, + "repeat_count": 0.0, + "routers_loss": 0.005102572031319141, + "skip_count": 0.0, + "step": 1048, + "text_loss": 0.5274341106414795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009927987629207587, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1691289.0, + "repeat_count": 0.0, + "routers_loss": 0.016768503934144974, + "skip_count": 0.0, + "step": 1050, + "text_loss": 0.9935035109519958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.939242735544467, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009927463267828634, + "loss": 0.0488, + "macro_f1": 0.3333333432674408, + "num_tokens": 1694148.0, + "repeat_count": 0.0, + "routers_loss": 0.010905829258263111, + "skip_count": 0.0, + "step": 1052, + "text_loss": 0.20895758271217346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.948635162899912, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.000992693701823646, + "loss": 0.0624, + "macro_f1": 0.3272727429866791, + "num_tokens": 1698543.0, + "repeat_count": 1.0, + "routers_loss": 0.10533971339464188, + "skip_count": 0.0, + "step": 1054, + "text_loss": 0.5776236653327942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.958027590255357, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009926408880632726, + "loss": 0.0556, + "macro_f1": 0.3272727429866791, + "num_tokens": 1702460.0, + "repeat_count": 0.0, + "routers_loss": 0.026313411071896553, + "skip_count": 1.0, + "step": 1056, + "text_loss": 0.34990596771240234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.967420017610801, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009925878855219818, + "loss": 0.0391, + "macro_f1": 0.3333333432674408, + "num_tokens": 1705686.0, + "repeat_count": 0.0, + "routers_loss": 0.007763393223285675, + "skip_count": 0.0, + "step": 1058, + "text_loss": 0.4980163276195526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.177734375, + "learning_rate": 0.000992534694220084, + "loss": 0.0613, + "macro_f1": 0.3272727429866791, + "num_tokens": 1708739.0, + "repeat_count": 0.0, + "routers_loss": 0.03998444974422455, + "skip_count": 1.0, + "step": 1060, + "text_loss": 0.29092350602149963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.98620487232169, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.000992481314177962, + "loss": 0.0312, + "macro_f1": 0.32098764181137085, + "num_tokens": 1711903.0, + "repeat_count": 1.0, + "routers_loss": 0.06966045498847961, + "skip_count": 1.0, + "step": 1062, + "text_loss": 0.6267179250717163 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.995597299677136, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.244140625, + "learning_rate": 0.0009924277454160717, + "loss": 0.0548, + "macro_f1": 0.3272727429866791, + "num_tokens": 1715974.0, + "repeat_count": 0.0, + "routers_loss": 0.05536063387989998, + "skip_count": 1.0, + "step": 1064, + "text_loss": 0.5813798904418945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.004696213677723, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009923739879549402, + "loss": 0.0423, + "macro_f1": 0.3333333432674408, + "num_tokens": 1718828.0, + "repeat_count": 0.0, + "routers_loss": 0.020993782207369804, + "skip_count": 0.0, + "step": 1066, + "text_loss": 0.22665327787399292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009923200418151677, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 1722419.0, + "repeat_count": 0.0, + "routers_loss": 0.007351701147854328, + "skip_count": 0.0, + "step": 1068, + "text_loss": 0.5796169638633728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.0234810683886115, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009922659070174264, + "loss": 0.0452, + "macro_f1": 0.3272727429866791, + "num_tokens": 1725663.0, + "repeat_count": 1.0, + "routers_loss": 0.026033315807580948, + "skip_count": 0.0, + "step": 1070, + "text_loss": 0.25742828845977783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009922115835824612, + "loss": 0.041, + "macro_f1": 0.3333333432674408, + "num_tokens": 1729239.0, + "repeat_count": 0.0, + "routers_loss": 0.0118600158020854, + "skip_count": 0.0, + "step": 1072, + "text_loss": 0.21630282700061798 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009921570715310884, + "loss": 0.0364, + "macro_f1": 0.6666666865348816, + "num_tokens": 1732507.0, + "repeat_count": 1.0, + "routers_loss": 0.016118815168738365, + "skip_count": 0.0, + "step": 1074, + "text_loss": 0.5639925003051758 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.051658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009921023708841974, + "loss": 0.0407, + "macro_f1": 0.3333333432674408, + "num_tokens": 1736182.0, + "repeat_count": 0.0, + "routers_loss": 0.004275390412658453, + "skip_count": 0.0, + "step": 1076, + "text_loss": 0.5758615136146545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009920474816627496, + "loss": 0.037, + "macro_f1": 0.3333333432674408, + "num_tokens": 1739559.0, + "repeat_count": 0.0, + "routers_loss": 0.01299292128533125, + "skip_count": 0.0, + "step": 1078, + "text_loss": 0.18221625685691833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.0704432051658355, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009919924038877788, + "loss": 0.0343, + "macro_f1": 0.32098764181137085, + "num_tokens": 1742890.0, + "repeat_count": 0.0, + "routers_loss": 0.038295745849609375, + "skip_count": 2.0, + "step": 1080, + "text_loss": 0.17354349792003632 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 5.07983563252128, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009919371375803905, + "loss": 0.0455, + "macro_f1": 0.8194444179534912, + "num_tokens": 1746433.0, + "repeat_count": 2.0, + "routers_loss": 0.04052971675992012, + "skip_count": 3.0, + "step": 1082, + "text_loss": 0.2250112146139145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009918816827617632, + "loss": 0.0353, + "macro_f1": 0.3333333432674408, + "num_tokens": 1750802.0, + "repeat_count": 0.0, + "routers_loss": 0.009114136919379234, + "skip_count": 0.0, + "step": 1084, + "text_loss": 0.2526719272136688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.098620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000991826039453147, + "loss": 0.0392, + "macro_f1": 0.3333333432674408, + "num_tokens": 1754272.0, + "repeat_count": 0.0, + "routers_loss": 0.004904678091406822, + "skip_count": 0.0, + "step": 1086, + "text_loss": 0.7308789491653442 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 5.108012914587614, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.138671875, + "learning_rate": 0.000991770207675865, + "loss": 0.0327, + "macro_f1": 0.6666666865348816, + "num_tokens": 1757231.0, + "repeat_count": 0.0, + "routers_loss": 0.02129189297556877, + "skip_count": 2.0, + "step": 1088, + "text_loss": 0.21764220297336578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.1174053419430585, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009917141874513113, + "loss": 0.0315, + "macro_f1": 0.3333333432674408, + "num_tokens": 1760003.0, + "repeat_count": 0.0, + "routers_loss": 0.01310618408024311, + "skip_count": 0.0, + "step": 1090, + "text_loss": 0.33892181515693665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.126797769298503, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.171875, + "learning_rate": 0.0009916579788009537, + "loss": 0.0457, + "macro_f1": 0.5492662787437439, + "num_tokens": 1763052.0, + "repeat_count": 0.0, + "routers_loss": 0.02059309557080269, + "skip_count": 2.0, + "step": 1092, + "text_loss": 0.6551769375801086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.136190196653947, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10546875, + "learning_rate": 0.0009916015817463312, + "loss": 0.0385, + "macro_f1": 0.5492662787437439, + "num_tokens": 1766655.0, + "repeat_count": 0.0, + "routers_loss": 0.0274797435849905, + "skip_count": 2.0, + "step": 1094, + "text_loss": 0.3984372019767761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.000991544996309055, + "loss": 0.0271, + "macro_f1": 0.3333333432674408, + "num_tokens": 1769997.0, + "repeat_count": 0.0, + "routers_loss": 0.01437368243932724, + "skip_count": 0.0, + "step": 1096, + "text_loss": 0.4203338921070099 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.154975051364837, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.000991488222510809, + "loss": 0.0292, + "macro_f1": 0.3333333432674408, + "num_tokens": 1773130.0, + "repeat_count": 0.0, + "routers_loss": 0.001382062560878694, + "skip_count": 0.0, + "step": 1098, + "text_loss": 0.43132516741752625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.164367478720282, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.123046875, + "learning_rate": 0.000991431260373349, + "loss": 0.0329, + "macro_f1": 0.3144654333591461, + "num_tokens": 1775682.0, + "repeat_count": 1.0, + "routers_loss": 0.1115434318780899, + "skip_count": 2.0, + "step": 1100, + "text_loss": 0.3218227028846741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.111328125, + "learning_rate": 0.000991374109918503, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 1778407.0, + "repeat_count": 0.0, + "routers_loss": 0.009529678151011467, + "skip_count": 0.0, + "step": 1102, + "text_loss": 0.17183731496334076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.183152333431171, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1142578125, + "learning_rate": 0.000991316771168171, + "loss": 0.044, + "macro_f1": 0.5492662787437439, + "num_tokens": 1781518.0, + "repeat_count": 0.0, + "routers_loss": 0.018668074160814285, + "skip_count": 2.0, + "step": 1104, + "text_loss": 1.1324785947799683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.192544760786616, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.125, + "learning_rate": 0.0009912592441443258, + "loss": 0.0411, + "macro_f1": 0.3272727429866791, + "num_tokens": 1784878.0, + "repeat_count": 0.0, + "routers_loss": 0.04145100712776184, + "skip_count": 1.0, + "step": 1106, + "text_loss": 0.6082063317298889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.20193718814206, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009912015288690112, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1788978.0, + "repeat_count": 0.0, + "routers_loss": 0.021450644358992577, + "skip_count": 1.0, + "step": 1108, + "text_loss": 0.5597621202468872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.2113296154975055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009911436253643444, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 1792321.0, + "repeat_count": 0.0, + "routers_loss": 0.017405325546860695, + "skip_count": 0.0, + "step": 1110, + "text_loss": 0.2560598850250244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.0009910855336525137, + "loss": 0.0383, + "macro_f1": 0.3333333432674408, + "num_tokens": 1795182.0, + "repeat_count": 0.0, + "routers_loss": 0.007162237539887428, + "skip_count": 0.0, + "step": 1112, + "text_loss": 0.3438240587711334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.230114470208394, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.115234375, + "learning_rate": 0.00099102725375578, + "loss": 0.0326, + "macro_f1": 0.480392187833786, + "num_tokens": 1798987.0, + "repeat_count": 1.0, + "routers_loss": 0.11149197816848755, + "skip_count": 3.0, + "step": 1114, + "text_loss": 0.20455503463745117 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.239506897563839, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009909687856964767, + "loss": 0.035, + "macro_f1": 0.3006536364555359, + "num_tokens": 1802064.0, + "repeat_count": 2.0, + "routers_loss": 0.12679415941238403, + "skip_count": 3.0, + "step": 1116, + "text_loss": 0.11996729671955109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.248899324919284, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009909101294970082, + "loss": 0.0365, + "macro_f1": 0.5492662787437439, + "num_tokens": 1805412.0, + "repeat_count": 0.0, + "routers_loss": 0.05108053982257843, + "skip_count": 2.0, + "step": 1118, + "text_loss": 0.13224145770072937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.258291752274729, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.123046875, + "learning_rate": 0.0009908512851798522, + "loss": 0.0455, + "macro_f1": 0.6603773832321167, + "num_tokens": 1808196.0, + "repeat_count": 1.0, + "routers_loss": 0.02131766639649868, + "skip_count": 1.0, + "step": 1120, + "text_loss": 0.7824069261550903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.0009907922527675576, + "loss": 0.0405, + "macro_f1": 0.3333333432674408, + "num_tokens": 1811622.0, + "repeat_count": 0.0, + "routers_loss": 0.006226244382560253, + "skip_count": 0.0, + "step": 1122, + "text_loss": 0.5419743061065674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.277076606985618, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12890625, + "learning_rate": 0.000990733032282746, + "loss": 0.0535, + "macro_f1": 0.5492662787437439, + "num_tokens": 1814628.0, + "repeat_count": 0.0, + "routers_loss": 0.03088250942528248, + "skip_count": 2.0, + "step": 1124, + "text_loss": 0.37100958824157715 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.286469034341063, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.000990673623748111, + "loss": 0.0348, + "macro_f1": 0.32098767161369324, + "num_tokens": 1817205.0, + "repeat_count": 0.0, + "routers_loss": 0.05495348572731018, + "skip_count": 1.0, + "step": 1126, + "text_loss": 0.20241330564022064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.295861461696507, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009906140271864173, + "loss": 0.0433, + "macro_f1": 0.4871794879436493, + "num_tokens": 1820141.0, + "repeat_count": 0.0, + "routers_loss": 0.037809282541275024, + "skip_count": 2.0, + "step": 1128, + "text_loss": 0.32965806126594543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.305253889051952, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009905542426205032, + "loss": 0.0348, + "macro_f1": 0.32098767161369324, + "num_tokens": 1824011.0, + "repeat_count": 0.0, + "routers_loss": 0.03320181369781494, + "skip_count": 1.0, + "step": 1130, + "text_loss": 0.36329755187034607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.314646316407397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009904942700732777, + "loss": 0.0335, + "macro_f1": 0.3333333432674408, + "num_tokens": 1826873.0, + "repeat_count": 0.0, + "routers_loss": 0.004102326463907957, + "skip_count": 0.0, + "step": 1132, + "text_loss": 0.6692602038383484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.324038743762841, + "f1_execute": 0.8799999952316284, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.0009904341095677226, + "loss": 0.03, + "macro_f1": 0.29333335161209106, + "num_tokens": 1830103.0, + "repeat_count": 2.0, + "routers_loss": 0.2376193106174469, + "skip_count": 4.0, + "step": 1134, + "text_loss": 0.19212862849235535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.0009903737611268919, + "loss": 0.0445, + "macro_f1": 0.3333333432674408, + "num_tokens": 1833201.0, + "repeat_count": 0.0, + "routers_loss": 0.005253395065665245, + "skip_count": 0.0, + "step": 1136, + "text_loss": 0.6773360371589661 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.34282359847373, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009903132247739107, + "loss": 0.0305, + "macro_f1": 0.3076923191547394, + "num_tokens": 1836045.0, + "repeat_count": 1.0, + "routers_loss": 0.14382585883140564, + "skip_count": 3.0, + "step": 1138, + "text_loss": 0.2882297933101654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.3522160258291755, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.150390625, + "learning_rate": 0.0009902525005319766, + "loss": 0.04, + "macro_f1": 0.5427350401878357, + "num_tokens": 1839721.0, + "repeat_count": 1.0, + "routers_loss": 0.04033960774540901, + "skip_count": 2.0, + "step": 1140, + "text_loss": 0.7172559499740601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12109375, + "learning_rate": 0.0009901915884243597, + "loss": 0.0351, + "macro_f1": 0.6666666865348816, + "num_tokens": 1842614.0, + "repeat_count": 1.0, + "routers_loss": 0.005162308923900127, + "skip_count": 0.0, + "step": 1142, + "text_loss": 0.42892804741859436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.371000880540064, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009901304884744014, + "loss": 0.0386, + "macro_f1": 0.3144654333591461, + "num_tokens": 1845444.0, + "repeat_count": 1.0, + "routers_loss": 0.10117656737565994, + "skip_count": 2.0, + "step": 1144, + "text_loss": 0.20806430280208588 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.380393307895509, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009900692007055152, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 1848558.0, + "repeat_count": 0.0, + "routers_loss": 0.014107038266956806, + "skip_count": 0.0, + "step": 1146, + "text_loss": 0.5355974435806274 + }, + { + "acc_repeat": 0.25, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 5.389785735250954, + "f1_execute": 0.9166666865348816, + "f1_repeat": 0.4000000059604645, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.16015625, + "learning_rate": 0.000990007725141187, + "loss": 0.0449, + "macro_f1": 0.6611111164093018, + "num_tokens": 1852723.0, + "repeat_count": 4.0, + "routers_loss": 0.15537866950035095, + "skip_count": 2.0, + "step": 1148, + "text_loss": 0.6388513445854187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1181640625, + "learning_rate": 0.0009899460618049741, + "loss": 0.0397, + "macro_f1": 0.3333333432674408, + "num_tokens": 1856181.0, + "repeat_count": 0.0, + "routers_loss": 0.011800912208855152, + "skip_count": 0.0, + "step": 1150, + "text_loss": 0.6113069653511047 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 5.408570589961843, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.000989884210720506, + "loss": 0.0331, + "macro_f1": 0.6666666865348816, + "num_tokens": 1859685.0, + "repeat_count": 2.0, + "routers_loss": 0.022900646552443504, + "skip_count": 0.0, + "step": 1152, + "text_loss": 0.25718021392822266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.4179630173172875, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009898221719114844, + "loss": 0.0354, + "macro_f1": 0.3272727429866791, + "num_tokens": 1862505.0, + "repeat_count": 0.0, + "routers_loss": 0.026814989745616913, + "skip_count": 1.0, + "step": 1154, + "text_loss": 0.5426549911499023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009897599454016823, + "loss": 0.0401, + "macro_f1": 0.3333333432674408, + "num_tokens": 1866266.0, + "repeat_count": 0.0, + "routers_loss": 0.0032623792067170143, + "skip_count": 0.0, + "step": 1156, + "text_loss": 0.37752896547317505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.436747872028177, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07080078125, + "learning_rate": 0.0009896975312149454, + "loss": 0.0369, + "macro_f1": 0.3333333432674408, + "num_tokens": 1870216.0, + "repeat_count": 0.0, + "routers_loss": 0.015617577359080315, + "skip_count": 0.0, + "step": 1158, + "text_loss": 0.18207129836082458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009896349293751906, + "loss": 0.0423, + "macro_f1": 0.3272727429866791, + "num_tokens": 1873338.0, + "repeat_count": 0.0, + "routers_loss": 0.02250153198838234, + "skip_count": 1.0, + "step": 1160, + "text_loss": 0.548884391784668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.455532726739067, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009895721399064072, + "loss": 0.0388, + "macro_f1": 0.32098764181137085, + "num_tokens": 1876470.0, + "repeat_count": 1.0, + "routers_loss": 0.055204521864652634, + "skip_count": 1.0, + "step": 1162, + "text_loss": 0.48052409291267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.464925154094511, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009895091628326564, + "loss": 0.0293, + "macro_f1": 0.3333333432674408, + "num_tokens": 1879354.0, + "repeat_count": 0.0, + "routers_loss": 0.009093789383769035, + "skip_count": 0.0, + "step": 1164, + "text_loss": 0.3908069431781769 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.474317581449956, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000989445998178071, + "loss": 0.0323, + "macro_f1": 0.3272727429866791, + "num_tokens": 1881941.0, + "repeat_count": 0.0, + "routers_loss": 0.015086972154676914, + "skip_count": 1.0, + "step": 1166, + "text_loss": 0.4884725511074066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.4837100088054, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009893826459668558, + "loss": 0.0386, + "macro_f1": 0.3144654333591461, + "num_tokens": 1885374.0, + "repeat_count": 0.0, + "routers_loss": 0.06587666273117065, + "skip_count": 3.0, + "step": 1168, + "text_loss": 0.12760137021541595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009893191062232873, + "loss": 0.0322, + "macro_f1": 0.3333333432674408, + "num_tokens": 1888612.0, + "repeat_count": 0.0, + "routers_loss": 0.006088624242693186, + "skip_count": 0.0, + "step": 1170, + "text_loss": 0.4821319580078125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009892553789717143, + "loss": 0.0389, + "macro_f1": 0.3333333432674408, + "num_tokens": 1891463.0, + "repeat_count": 0.0, + "routers_loss": 0.010113578289747238, + "skip_count": 0.0, + "step": 1172, + "text_loss": 0.3613642454147339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.5118872908717345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009891914642365573, + "loss": 0.0404, + "macro_f1": 0.3333333432674408, + "num_tokens": 1894230.0, + "repeat_count": 0.0, + "routers_loss": 0.004947459790855646, + "skip_count": 0.0, + "step": 1174, + "text_loss": 0.5037549138069153 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.521279718227179, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009891273620423083, + "loss": 0.0428, + "macro_f1": 0.3272727429866791, + "num_tokens": 1897294.0, + "repeat_count": 1.0, + "routers_loss": 0.026075217872858047, + "skip_count": 0.0, + "step": 1176, + "text_loss": 0.32558977603912354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.530672145582624, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009890630724135314, + "loss": 0.0351, + "macro_f1": 0.3272727429866791, + "num_tokens": 1901553.0, + "repeat_count": 0.0, + "routers_loss": 0.06650999188423157, + "skip_count": 1.0, + "step": 1178, + "text_loss": 0.23473620414733887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.540064572938069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009889985953748625, + "loss": 0.0268, + "macro_f1": 0.6666666865348816, + "num_tokens": 1904556.0, + "repeat_count": 0.0, + "routers_loss": 0.010361116379499435, + "skip_count": 1.0, + "step": 1180, + "text_loss": 0.6927042007446289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.0009889339309510094, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 1908053.0, + "repeat_count": 0.0, + "routers_loss": 0.013286533765494823, + "skip_count": 0.0, + "step": 1182, + "text_loss": 0.19977325201034546 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 5.558849427648958, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009888690791667518, + "loss": 0.0204, + "macro_f1": 0.7018141150474548, + "num_tokens": 1911754.0, + "repeat_count": 2.0, + "routers_loss": 0.11920545995235443, + "skip_count": 3.0, + "step": 1184, + "text_loss": 0.4072858691215515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.568241855004403, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009888040400469408, + "loss": 0.0391, + "macro_f1": 0.3272727429866791, + "num_tokens": 1914862.0, + "repeat_count": 0.0, + "routers_loss": 0.03652849420905113, + "skip_count": 1.0, + "step": 1186, + "text_loss": 0.2654043138027191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.577634282359847, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1689453125, + "learning_rate": 0.0009887388136164996, + "loss": 0.0336, + "macro_f1": 0.5492662787437439, + "num_tokens": 1918542.0, + "repeat_count": 0.0, + "routers_loss": 0.03991910070180893, + "skip_count": 2.0, + "step": 1188, + "text_loss": 0.21130657196044922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.587026709715292, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.000988673399900423, + "loss": 0.0429, + "macro_f1": 0.3272727429866791, + "num_tokens": 1921589.0, + "repeat_count": 0.0, + "routers_loss": 0.014900135807693005, + "skip_count": 0.0, + "step": 1190, + "text_loss": 0.5519335865974426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.596419137070737, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009886077989237777, + "loss": 0.0405, + "macro_f1": 0.3272727429866791, + "num_tokens": 1924320.0, + "repeat_count": 0.0, + "routers_loss": 0.06271552294492722, + "skip_count": 1.0, + "step": 1192, + "text_loss": 0.213813915848732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 5.6058115644261814, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.1875, + "learning_rate": 0.000988542010711702, + "loss": 0.0342, + "macro_f1": 0.6225374937057495, + "num_tokens": 1927178.0, + "repeat_count": 0.0, + "routers_loss": 0.03081391751766205, + "skip_count": 5.0, + "step": 1194, + "text_loss": 0.7524349093437195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009884760352894064, + "loss": 0.0518, + "macro_f1": 0.3333333432674408, + "num_tokens": 1930216.0, + "repeat_count": 0.0, + "routers_loss": 0.008556773886084557, + "skip_count": 0.0, + "step": 1196, + "text_loss": 0.28230375051498413 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.62459641913707, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009884098726821726, + "loss": 0.0472, + "macro_f1": 0.4871794879436493, + "num_tokens": 1933312.0, + "repeat_count": 3.0, + "routers_loss": 0.05344727262854576, + "skip_count": 0.0, + "step": 1198, + "text_loss": 0.5509607195854187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.633988846492516, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1298828125, + "learning_rate": 0.000988343522915354, + "loss": 0.0441, + "macro_f1": 0.480392187833786, + "num_tokens": 1936160.0, + "repeat_count": 1.0, + "routers_loss": 0.07324771583080292, + "skip_count": 3.0, + "step": 1200, + "text_loss": 0.30565372109413147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 25.0, + "epoch": 5.64338127384796, + "f1_execute": 0.8936169743537903, + "f1_repeat": 0.0, + "f1_skip": 0.444444477558136, + "grad_norm": 0.2470703125, + "learning_rate": 0.0009882769860143764, + "loss": 0.0317, + "macro_f1": 0.4460204839706421, + "num_tokens": 1939266.0, + "repeat_count": 0.0, + "routers_loss": 0.18620699644088745, + "skip_count": 6.0, + "step": 1202, + "text_loss": 0.976121723651886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.6527737012034045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000988210262004737, + "loss": 0.0474, + "macro_f1": 0.6666666865348816, + "num_tokens": 1942173.0, + "repeat_count": 0.0, + "routers_loss": 0.007703613489866257, + "skip_count": 1.0, + "step": 1204, + "text_loss": 0.5647401809692383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.66216612855885, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1484375, + "learning_rate": 0.0009881433509120036, + "loss": 0.0376, + "macro_f1": 0.5492662787437439, + "num_tokens": 1945071.0, + "repeat_count": 0.0, + "routers_loss": 0.02162683941423893, + "skip_count": 2.0, + "step": 1206, + "text_loss": 0.24229218065738678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.671558555914294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009880762527618176, + "loss": 0.0383, + "macro_f1": 0.3333333432674408, + "num_tokens": 1949060.0, + "repeat_count": 0.0, + "routers_loss": 0.017667081207036972, + "skip_count": 0.0, + "step": 1208, + "text_loss": 0.4035970866680145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009880089675798908, + "loss": 0.0367, + "macro_f1": 0.3333333432674408, + "num_tokens": 1951698.0, + "repeat_count": 0.0, + "routers_loss": 0.006405784282833338, + "skip_count": 0.0, + "step": 1210, + "text_loss": 0.5319879055023193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.690343410625183, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009879414953920071, + "loss": 0.0294, + "macro_f1": 0.3333333432674408, + "num_tokens": 1955266.0, + "repeat_count": 0.0, + "routers_loss": 0.009859707206487656, + "skip_count": 0.0, + "step": 1212, + "text_loss": 0.6687407493591309 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.699735837980628, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.130859375, + "learning_rate": 0.0009878738362240219, + "loss": 0.045, + "macro_f1": 0.5492662787437439, + "num_tokens": 1958538.0, + "repeat_count": 0.0, + "routers_loss": 0.030890554189682007, + "skip_count": 2.0, + "step": 1214, + "text_loss": 0.20820017158985138 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 5.709128265336073, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000987805990101862, + "loss": 0.0317, + "macro_f1": 0.47333335876464844, + "num_tokens": 1961419.0, + "repeat_count": 2.0, + "routers_loss": 0.10383198410272598, + "skip_count": 2.0, + "step": 1216, + "text_loss": 0.8664976358413696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.718520692691517, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009877379570515268, + "loss": 0.0366, + "macro_f1": 0.3333333432674408, + "num_tokens": 1964836.0, + "repeat_count": 0.0, + "routers_loss": 0.013376163318753242, + "skip_count": 0.0, + "step": 1218, + "text_loss": 0.4223395884037018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0859375, + "learning_rate": 0.0009876697370990865, + "loss": 0.0343, + "macro_f1": 0.3333333432674408, + "num_tokens": 1967620.0, + "repeat_count": 0.0, + "routers_loss": 0.008577900938689709, + "skip_count": 0.0, + "step": 1220, + "text_loss": 0.4789901375770569 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009876013302706828, + "loss": 0.049, + "macro_f1": 0.3333333432674408, + "num_tokens": 1971100.0, + "repeat_count": 0.0, + "routers_loss": 0.004730266984552145, + "skip_count": 0.0, + "step": 1222, + "text_loss": 0.6799837946891785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.7466979747578515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009875327365925295, + "loss": 0.0341, + "macro_f1": 0.3333333432674408, + "num_tokens": 1974408.0, + "repeat_count": 0.0, + "routers_loss": 0.010849526152014732, + "skip_count": 0.0, + "step": 1224, + "text_loss": 0.18967926502227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.756090402113296, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009874639560909118, + "loss": 0.0498, + "macro_f1": 0.32098767161369324, + "num_tokens": 1977046.0, + "repeat_count": 0.0, + "routers_loss": 0.04841252416372299, + "skip_count": 1.0, + "step": 1226, + "text_loss": 0.6133310198783875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.765482829468741, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.0009873949887921867, + "loss": 0.0402, + "macro_f1": 0.3272727429866791, + "num_tokens": 1980330.0, + "repeat_count": 0.0, + "routers_loss": 0.029638588428497314, + "skip_count": 1.0, + "step": 1228, + "text_loss": 0.15649555623531342 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.774875256824186, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009873258347227823, + "loss": 0.0331, + "macro_f1": 0.3272727429866791, + "num_tokens": 1983173.0, + "repeat_count": 0.0, + "routers_loss": 0.009955910965800285, + "skip_count": 0.0, + "step": 1230, + "text_loss": 0.4741005599498749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009872564939091989, + "loss": 0.0342, + "macro_f1": 0.3333333432674408, + "num_tokens": 1986825.0, + "repeat_count": 0.0, + "routers_loss": 0.010205300524830818, + "skip_count": 0.0, + "step": 1232, + "text_loss": 0.5315462350845337 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5714285969734192, + "avg_layers": 25.0, + "epoch": 5.7936601115350745, + "f1_execute": 0.9302325248718262, + "f1_repeat": 1.0, + "f1_skip": 0.7272727489471436, + "grad_norm": 0.11865234375, + "learning_rate": 0.0009871869663780077, + "loss": 0.0336, + "macro_f1": 0.8858351111412048, + "num_tokens": 1990448.0, + "repeat_count": 1.0, + "routers_loss": 0.09120134264230728, + "skip_count": 7.0, + "step": 1234, + "text_loss": 0.6187508702278137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.125, + "learning_rate": 0.0009871172521558522, + "loss": 0.0475, + "macro_f1": 0.6666666865348816, + "num_tokens": 1993474.0, + "repeat_count": 0.0, + "routers_loss": 0.016188839450478554, + "skip_count": 1.0, + "step": 1236, + "text_loss": 0.20783066749572754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 5.812444966245964, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.216796875, + "learning_rate": 0.0009870473512694465, + "loss": 0.0373, + "macro_f1": 0.5934640765190125, + "num_tokens": 1996536.0, + "repeat_count": 0.0, + "routers_loss": 0.05046704784035683, + "skip_count": 3.0, + "step": 1238, + "text_loss": 0.247748002409935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.821837393601409, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.09033203125, + "learning_rate": 0.0009869772637455772, + "loss": 0.0251, + "macro_f1": 0.4871794879436493, + "num_tokens": 1999530.0, + "repeat_count": 0.0, + "routers_loss": 0.044926248490810394, + "skip_count": 2.0, + "step": 1240, + "text_loss": 0.26001980900764465 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.831229820956853, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.000986906989611102, + "loss": 0.0446, + "macro_f1": 0.3272727429866791, + "num_tokens": 2002782.0, + "repeat_count": 0.0, + "routers_loss": 0.025911526754498482, + "skip_count": 0.0, + "step": 1242, + "text_loss": 0.9009982943534851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.8406222483122985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009868365288929492, + "loss": 0.0371, + "macro_f1": 0.3333333432674408, + "num_tokens": 2005331.0, + "repeat_count": 0.0, + "routers_loss": 0.0043760035187006, + "skip_count": 0.0, + "step": 1244, + "text_loss": 0.5547386407852173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009867658816181206, + "loss": 0.0374, + "macro_f1": 0.3333333432674408, + "num_tokens": 2008115.0, + "repeat_count": 0.0, + "routers_loss": 0.009227181784808636, + "skip_count": 0.0, + "step": 1246, + "text_loss": 1.0067731142044067 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.000986695047813688, + "loss": 0.0261, + "macro_f1": 0.3272727429866791, + "num_tokens": 2011137.0, + "repeat_count": 1.0, + "routers_loss": 0.023822437971830368, + "skip_count": 0.0, + "step": 1248, + "text_loss": 0.30058956146240234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.868799530378633, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009866240275067948, + "loss": 0.044, + "macro_f1": 0.47333335876464844, + "num_tokens": 2014159.0, + "repeat_count": 2.0, + "routers_loss": 0.21523773670196533, + "skip_count": 3.0, + "step": 1250, + "text_loss": 0.39072203636169434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.878191957734077, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009865528207246563, + "loss": 0.0351, + "macro_f1": 0.5492662787437439, + "num_tokens": 2017731.0, + "repeat_count": 0.0, + "routers_loss": 0.06184682995080948, + "skip_count": 2.0, + "step": 1252, + "text_loss": 0.35751575231552124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.8875843850895215, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.166015625, + "learning_rate": 0.000986481427494559, + "loss": 0.0336, + "macro_f1": 0.3333333432674408, + "num_tokens": 2020485.0, + "repeat_count": 0.0, + "routers_loss": 0.007573372684419155, + "skip_count": 0.0, + "step": 1254, + "text_loss": 0.4061077833175659 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.896976812444966, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.000986409847843861, + "loss": 0.0382, + "macro_f1": 0.3272727429866791, + "num_tokens": 2024149.0, + "repeat_count": 1.0, + "routers_loss": 0.07447971403598785, + "skip_count": 0.0, + "step": 1256, + "text_loss": 0.41876497864723206 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.906369239800411, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000986338081799992, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 2026545.0, + "repeat_count": 0.0, + "routers_loss": 0.006609147880226374, + "skip_count": 0.0, + "step": 1258, + "text_loss": 0.4673794209957123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.915761667155856, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009862661293904523, + "loss": 0.0498, + "macro_f1": 0.32098764181137085, + "num_tokens": 2029581.0, + "repeat_count": 0.0, + "routers_loss": 0.10624702274799347, + "skip_count": 2.0, + "step": 1260, + "text_loss": 0.3483233153820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009861939906428145, + "loss": 0.0525, + "macro_f1": 0.3333333432674408, + "num_tokens": 2033936.0, + "repeat_count": 0.0, + "routers_loss": 0.007944886572659016, + "skip_count": 0.0, + "step": 1262, + "text_loss": 0.16362667083740234 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009861216655847225, + "loss": 0.0376, + "macro_f1": 0.6666666865348816, + "num_tokens": 2037876.0, + "repeat_count": 1.0, + "routers_loss": 0.007004092447459698, + "skip_count": 0.0, + "step": 1264, + "text_loss": 0.43228110671043396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.94393894922219, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009860491542438912, + "loss": 0.047, + "macro_f1": 0.3272727429866791, + "num_tokens": 2040842.0, + "repeat_count": 0.0, + "routers_loss": 0.026916226372122765, + "skip_count": 1.0, + "step": 1266, + "text_loss": 0.5901188850402832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.953331376577634, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.000985976456648107, + "loss": 0.0353, + "macro_f1": 0.3333333432674408, + "num_tokens": 2043890.0, + "repeat_count": 0.0, + "routers_loss": 0.007325216196477413, + "skip_count": 0.0, + "step": 1268, + "text_loss": 0.8780109882354736 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.962723803933079, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.10205078125, + "learning_rate": 0.000985903572825228, + "loss": 0.0306, + "macro_f1": 0.4871794879436493, + "num_tokens": 2048848.0, + "repeat_count": 0.0, + "routers_loss": 0.05007527023553848, + "skip_count": 2.0, + "step": 1270, + "text_loss": 0.5863722562789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.972116231288524, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.000985830502803183, + "loss": 0.0396, + "macro_f1": 0.3272727429866791, + "num_tokens": 2051561.0, + "repeat_count": 0.0, + "routers_loss": 0.023995524272322655, + "skip_count": 0.0, + "step": 1272, + "text_loss": 0.7460709810256958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.9815086586439685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009857572466099732, + "loss": 0.0431, + "macro_f1": 0.3333333432674408, + "num_tokens": 2054752.0, + "repeat_count": 0.0, + "routers_loss": 0.006928362417966127, + "skip_count": 0.0, + "step": 1274, + "text_loss": 0.5130293369293213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.162109375, + "learning_rate": 0.0009856838042736698, + "loss": 0.0501, + "macro_f1": 0.3333333432674408, + "num_tokens": 2058151.0, + "repeat_count": 0.0, + "routers_loss": 0.006969396956264973, + "skip_count": 0.0, + "step": 1276, + "text_loss": 0.5911393761634827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009856101758224166, + "loss": 0.0441, + "macro_f1": 0.3333333432674408, + "num_tokens": 2061012.0, + "repeat_count": 0.0, + "routers_loss": 0.003499418031424284, + "skip_count": 0.0, + "step": 1278, + "text_loss": 0.25347545742988586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.000985536361284428, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2064597.0, + "repeat_count": 0.0, + "routers_loss": 0.007856054231524467, + "skip_count": 0.0, + "step": 1280, + "text_loss": 0.7476963400840759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.01878485471089, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009854623606879898, + "loss": 0.0245, + "macro_f1": 0.3272727429866791, + "num_tokens": 2067972.0, + "repeat_count": 0.0, + "routers_loss": 0.02617792971432209, + "skip_count": 1.0, + "step": 1282, + "text_loss": 0.5775872468948364 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.028177282066334, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.000985388174061459, + "loss": 0.0356, + "macro_f1": 0.32098767161369324, + "num_tokens": 2071812.0, + "repeat_count": 0.0, + "routers_loss": 0.035979997366666794, + "skip_count": 1.0, + "step": 1284, + "text_loss": 0.2933400869369507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.037569709421779, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0009853138014332646, + "loss": 0.0273, + "macro_f1": 0.3333333432674408, + "num_tokens": 2074868.0, + "repeat_count": 0.0, + "routers_loss": 0.005142854526638985, + "skip_count": 0.0, + "step": 1286, + "text_loss": 0.29085102677345276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.0009852392428319058, + "loss": 0.0306, + "macro_f1": 0.3333333432674408, + "num_tokens": 2078225.0, + "repeat_count": 0.0, + "routers_loss": 0.0032799106556922197, + "skip_count": 0.0, + "step": 1288, + "text_loss": 0.7293626070022583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 6.056354564132668, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009851644982859537, + "loss": 0.0273, + "macro_f1": 0.480392187833786, + "num_tokens": 2081495.0, + "repeat_count": 1.0, + "routers_loss": 0.12224318832159042, + "skip_count": 3.0, + "step": 1290, + "text_loss": 0.26125892996788025 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.065746991488113, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009850895678240508, + "loss": 0.0283, + "macro_f1": 0.6666666865348816, + "num_tokens": 2084390.0, + "repeat_count": 1.0, + "routers_loss": 0.010662888176739216, + "skip_count": 0.0, + "step": 1292, + "text_loss": 0.3510764539241791 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.075139418843557, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1689453125, + "learning_rate": 0.0009850144514749104, + "loss": 0.0332, + "macro_f1": 0.5492662787437439, + "num_tokens": 2087210.0, + "repeat_count": 0.0, + "routers_loss": 0.01979079470038414, + "skip_count": 2.0, + "step": 1294, + "text_loss": 0.40202176570892334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.084531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.000984939149267317, + "loss": 0.0253, + "macro_f1": 0.6666666865348816, + "num_tokens": 2090777.0, + "repeat_count": 0.0, + "routers_loss": 0.005172552540898323, + "skip_count": 1.0, + "step": 1296, + "text_loss": 0.5275651216506958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.093924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009848636612301272, + "loss": 0.0299, + "macro_f1": 0.3333333432674408, + "num_tokens": 2094248.0, + "repeat_count": 0.0, + "routers_loss": 0.0029599082190543413, + "skip_count": 0.0, + "step": 1298, + "text_loss": 0.4517653286457062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0009847879873922675, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 2097139.0, + "repeat_count": 0.0, + "routers_loss": 0.011455860920250416, + "skip_count": 0.0, + "step": 1300, + "text_loss": 0.16888445615768433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.112709128265336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.0009847121277827366, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 2100415.0, + "repeat_count": 0.0, + "routers_loss": 0.008091195486485958, + "skip_count": 0.0, + "step": 1302, + "text_loss": 0.40061676502227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.122101555620781, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.000984636082430604, + "loss": 0.0285, + "macro_f1": 0.3333333432674408, + "num_tokens": 2103285.0, + "repeat_count": 0.0, + "routers_loss": 0.009593960829079151, + "skip_count": 0.0, + "step": 1304, + "text_loss": 0.7211073637008667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.107421875, + "learning_rate": 0.0009845598513650103, + "loss": 0.0231, + "macro_f1": 0.3333333432674408, + "num_tokens": 2106255.0, + "repeat_count": 0.0, + "routers_loss": 0.0023068038281053305, + "skip_count": 0.0, + "step": 1306, + "text_loss": 0.7077119946479797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009844834346151674, + "loss": 0.043, + "macro_f1": 0.3333333432674408, + "num_tokens": 2109305.0, + "repeat_count": 0.0, + "routers_loss": 0.007703019306063652, + "skip_count": 0.0, + "step": 1308, + "text_loss": 0.3534316122531891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.1502788376871145, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009844068322103585, + "loss": 0.0287, + "macro_f1": 0.3272727429866791, + "num_tokens": 2112216.0, + "repeat_count": 0.0, + "routers_loss": 0.023549847304821014, + "skip_count": 1.0, + "step": 1310, + "text_loss": 0.6792599558830261 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009843300441799378, + "loss": 0.0211, + "macro_f1": 0.3333333432674408, + "num_tokens": 2114925.0, + "repeat_count": 0.0, + "routers_loss": 0.007605871185660362, + "skip_count": 0.0, + "step": 1312, + "text_loss": 0.1571389138698578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.169063692398004, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009842530705533304, + "loss": 0.0253, + "macro_f1": 0.3272727429866791, + "num_tokens": 2117744.0, + "repeat_count": 0.0, + "routers_loss": 0.014964760281145573, + "skip_count": 0.0, + "step": 1314, + "text_loss": 0.7840361595153809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.000984175911360033, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2120848.0, + "repeat_count": 0.0, + "routers_loss": 0.004663798492401838, + "skip_count": 0.0, + "step": 1316, + "text_loss": 0.536246120929718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.187848547108893, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1201171875, + "learning_rate": 0.000984098566629613, + "loss": 0.0288, + "macro_f1": 0.5492662787437439, + "num_tokens": 2123651.0, + "repeat_count": 0.0, + "routers_loss": 0.022852955386042595, + "skip_count": 2.0, + "step": 1318, + "text_loss": 0.43372172117233276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.197240974464338, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009840210363917087, + "loss": 0.0216, + "macro_f1": 0.3333333432674408, + "num_tokens": 2128011.0, + "repeat_count": 0.0, + "routers_loss": 0.012578422203660011, + "skip_count": 0.0, + "step": 1320, + "text_loss": 0.28190380334854126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009839433206760306, + "loss": 0.0204, + "macro_f1": 0.3333333432674408, + "num_tokens": 2131035.0, + "repeat_count": 0.0, + "routers_loss": 0.006863643880933523, + "skip_count": 0.0, + "step": 1322, + "text_loss": 0.6340444087982178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.216025829175227, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.0009838654195123589, + "loss": 0.0243, + "macro_f1": 0.3333333432674408, + "num_tokens": 2133856.0, + "repeat_count": 0.0, + "routers_loss": 0.00468854233622551, + "skip_count": 0.0, + "step": 1324, + "text_loss": 0.5138425827026367 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009837873329305458, + "loss": 0.0396, + "macro_f1": 0.6666666865348816, + "num_tokens": 2136451.0, + "repeat_count": 1.0, + "routers_loss": 0.005731126759201288, + "skip_count": 0.0, + "step": 1326, + "text_loss": 0.742124617099762 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000983709060960514, + "loss": 0.0416, + "macro_f1": 0.3333333432674408, + "num_tokens": 2139496.0, + "repeat_count": 0.0, + "routers_loss": 0.0056343949399888515, + "skip_count": 0.0, + "step": 1328, + "text_loss": 0.7317464351654053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.2442031112415615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009836306036322576, + "loss": 0.0312, + "macro_f1": 0.3333333432674408, + "num_tokens": 2143120.0, + "repeat_count": 0.0, + "routers_loss": 0.005127966403961182, + "skip_count": 0.0, + "step": 1330, + "text_loss": 0.538652241230011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 6.253595538597006, + "f1_execute": 0.9130434989929199, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009835519609758415, + "loss": 0.0301, + "macro_f1": 0.590062141418457, + "num_tokens": 2145807.0, + "repeat_count": 3.0, + "routers_loss": 0.1673707216978073, + "skip_count": 4.0, + "step": 1332, + "text_loss": 0.3498198091983795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009834731330214017, + "loss": 0.0293, + "macro_f1": 0.3272727429866791, + "num_tokens": 2148397.0, + "repeat_count": 1.0, + "routers_loss": 0.04026653990149498, + "skip_count": 0.0, + "step": 1334, + "text_loss": 0.8153424859046936 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 27.0, + "epoch": 6.272380393307896, + "f1_execute": 0.8999999761581421, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.8000000715255737, + "grad_norm": 0.16015625, + "learning_rate": 0.0009833941197991455, + "loss": 0.0329, + "macro_f1": 0.7888889312744141, + "num_tokens": 2152226.0, + "repeat_count": 2.0, + "routers_loss": 0.05481519177556038, + "skip_count": 5.0, + "step": 1336, + "text_loss": 0.7802760004997253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.28177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009833149213393506, + "loss": 0.0304, + "macro_f1": 0.3272727429866791, + "num_tokens": 2156023.0, + "repeat_count": 0.0, + "routers_loss": 0.01760484278202057, + "skip_count": 0.0, + "step": 1338, + "text_loss": 0.19721226394176483 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.2911652480187845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.000983235537672366, + "loss": 0.0256, + "macro_f1": 0.3333333432674408, + "num_tokens": 2160037.0, + "repeat_count": 0.0, + "routers_loss": 0.013206037692725658, + "skip_count": 0.0, + "step": 1340, + "text_loss": 0.5003817081451416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.000983155968828612, + "loss": 0.0315, + "macro_f1": 0.6666666865348816, + "num_tokens": 2163910.0, + "repeat_count": 1.0, + "routers_loss": 0.01256406120955944, + "skip_count": 0.0, + "step": 1342, + "text_loss": 0.5996923446655273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.309950102729674, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009830762148385793, + "loss": 0.0313, + "macro_f1": 0.3272727429866791, + "num_tokens": 2166921.0, + "repeat_count": 0.0, + "routers_loss": 0.015086234547197819, + "skip_count": 1.0, + "step": 1344, + "text_loss": 0.45356282591819763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.319342530085119, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0009829962757328297, + "loss": 0.0223, + "macro_f1": 0.32098764181137085, + "num_tokens": 2170135.0, + "repeat_count": 0.0, + "routers_loss": 0.07909081131219864, + "skip_count": 2.0, + "step": 1346, + "text_loss": 0.2874644994735718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 6.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009829161515419959, + "loss": 0.0246, + "macro_f1": 0.6666666865348816, + "num_tokens": 2173029.0, + "repeat_count": 0.0, + "routers_loss": 0.013569854199886322, + "skip_count": 2.0, + "step": 1348, + "text_loss": 0.25533875823020935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.3381273847960085, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009828358422967823, + "loss": 0.0226, + "macro_f1": 0.32098764181137085, + "num_tokens": 2176605.0, + "repeat_count": 1.0, + "routers_loss": 0.08111091703176498, + "skip_count": 1.0, + "step": 1350, + "text_loss": 0.32827726006507874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 6.347519812151453, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.091796875, + "learning_rate": 0.0009827553480279627, + "loss": 0.03, + "macro_f1": 0.5427350401878357, + "num_tokens": 2179406.0, + "repeat_count": 0.0, + "routers_loss": 0.026550088077783585, + "skip_count": 2.0, + "step": 1352, + "text_loss": 0.2966301143169403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009826746687663832, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 2182353.0, + "repeat_count": 0.0, + "routers_loss": 0.003914554137736559, + "skip_count": 0.0, + "step": 1354, + "text_loss": 0.7596251964569092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 6.366304666862343, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0859375, + "learning_rate": 0.0009825938045429602, + "loss": 0.0324, + "macro_f1": 0.5866667032241821, + "num_tokens": 2185786.0, + "repeat_count": 1.0, + "routers_loss": 0.059612665325403214, + "skip_count": 3.0, + "step": 1356, + "text_loss": 0.12325898557901382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.375697094217787, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10009765625, + "learning_rate": 0.0009825127553886807, + "loss": 0.0375, + "macro_f1": 0.3333333432674408, + "num_tokens": 2190157.0, + "repeat_count": 0.0, + "routers_loss": 0.0071132429875433445, + "skip_count": 0.0, + "step": 1358, + "text_loss": 0.9287898540496826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009824315213346033, + "loss": 0.0348, + "macro_f1": 0.3333333432674408, + "num_tokens": 2193077.0, + "repeat_count": 0.0, + "routers_loss": 0.009611099027097225, + "skip_count": 0.0, + "step": 1360, + "text_loss": 0.20427259802818298 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.394481948928676, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009823501024118569, + "loss": 0.0285, + "macro_f1": 0.3333333432674408, + "num_tokens": 2196494.0, + "repeat_count": 0.0, + "routers_loss": 0.006913455203175545, + "skip_count": 0.0, + "step": 1362, + "text_loss": 0.574759840965271 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.403874376284121, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009822684986516411, + "loss": 0.0245, + "macro_f1": 0.3333333432674408, + "num_tokens": 2199839.0, + "repeat_count": 0.0, + "routers_loss": 0.009208920411765575, + "skip_count": 0.0, + "step": 1364, + "text_loss": 0.42422571778297424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.413266803639566, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.000982186710085227, + "loss": 0.0208, + "macro_f1": 0.32098764181137085, + "num_tokens": 2203212.0, + "repeat_count": 1.0, + "routers_loss": 0.059975091367959976, + "skip_count": 1.0, + "step": 1366, + "text_loss": 0.29213017225265503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 6.42265923099501, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.181640625, + "learning_rate": 0.0009821047367439561, + "loss": 0.0358, + "macro_f1": 0.44705885648727417, + "num_tokens": 2206240.0, + "repeat_count": 0.0, + "routers_loss": 0.048244867473840714, + "skip_count": 4.0, + "step": 1368, + "text_loss": 0.3072395324707031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.432051658350455, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009820225786592405, + "loss": 0.0375, + "macro_f1": 0.3272727429866791, + "num_tokens": 2209903.0, + "repeat_count": 1.0, + "routers_loss": 0.026068156585097313, + "skip_count": 0.0, + "step": 1370, + "text_loss": 0.5961400270462036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.4414440857059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.109375, + "learning_rate": 0.0009819402358625634, + "loss": 0.0366, + "macro_f1": 0.3272727429866791, + "num_tokens": 2213439.0, + "repeat_count": 0.0, + "routers_loss": 0.022615568712353706, + "skip_count": 1.0, + "step": 1372, + "text_loss": 0.19375644624233246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.450836513061344, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.000981857708385479, + "loss": 0.0346, + "macro_f1": 0.3333333432674408, + "num_tokens": 2216457.0, + "repeat_count": 0.0, + "routers_loss": 0.005855285096913576, + "skip_count": 0.0, + "step": 1374, + "text_loss": 0.5123368501663208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.460228940416789, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009817749962596114, + "loss": 0.0249, + "macro_f1": 0.3272727429866791, + "num_tokens": 2219975.0, + "repeat_count": 1.0, + "routers_loss": 0.0651634931564331, + "skip_count": 0.0, + "step": 1376, + "text_loss": 0.5999220609664917 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009816920995166568, + "loss": 0.0371, + "macro_f1": 0.6666666865348816, + "num_tokens": 2222833.0, + "repeat_count": 1.0, + "routers_loss": 0.011408994905650616, + "skip_count": 0.0, + "step": 1378, + "text_loss": 0.5323230624198914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.4790137951276785, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.0009816090181883807, + "loss": 0.0313, + "macro_f1": 0.32098764181137085, + "num_tokens": 2225842.0, + "repeat_count": 0.0, + "routers_loss": 0.039720915257930756, + "skip_count": 2.0, + "step": 1380, + "text_loss": 0.23363439738750458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009815257523066204, + "loss": 0.0249, + "macro_f1": 0.3333333432674408, + "num_tokens": 2229430.0, + "repeat_count": 0.0, + "routers_loss": 0.002765297656878829, + "skip_count": 0.0, + "step": 1382, + "text_loss": 0.718977689743042 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.497798649838567, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009814423019032835, + "loss": 0.0396, + "macro_f1": 0.5492662787437439, + "num_tokens": 2232594.0, + "repeat_count": 2.0, + "routers_loss": 0.05362323671579361, + "skip_count": 0.0, + "step": 1384, + "text_loss": 0.6392166614532471 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.507191077194013, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009813586670103483, + "loss": 0.0426, + "macro_f1": 0.6603773832321167, + "num_tokens": 2236327.0, + "repeat_count": 1.0, + "routers_loss": 0.031728316098451614, + "skip_count": 1.0, + "step": 1386, + "text_loss": 0.5951619148254395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.126953125, + "learning_rate": 0.0009812748476598638, + "loss": 0.031, + "macro_f1": 0.5492662787437439, + "num_tokens": 2239746.0, + "repeat_count": 0.0, + "routers_loss": 0.03981253132224083, + "skip_count": 2.0, + "step": 1388, + "text_loss": 0.22756551206111908 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.5259759319049016, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009811908438839498, + "loss": 0.0331, + "macro_f1": 0.5492662787437439, + "num_tokens": 2242786.0, + "repeat_count": 0.0, + "routers_loss": 0.04617162421345711, + "skip_count": 2.0, + "step": 1390, + "text_loss": 0.3233799934387207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.535368359260346, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.000981106655714797, + "loss": 0.0358, + "macro_f1": 0.3272727429866791, + "num_tokens": 2245696.0, + "repeat_count": 0.0, + "routers_loss": 0.046828847378492355, + "skip_count": 1.0, + "step": 1392, + "text_loss": 0.24273279309272766 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.544760786615791, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009810222831846656, + "loss": 0.0307, + "macro_f1": 0.5492662787437439, + "num_tokens": 2249326.0, + "repeat_count": 0.0, + "routers_loss": 0.010921589098870754, + "skip_count": 2.0, + "step": 1394, + "text_loss": 0.3921460807323456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.554153213971236, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009809377263258882, + "loss": 0.0315, + "macro_f1": 0.32098767161369324, + "num_tokens": 2253393.0, + "repeat_count": 0.0, + "routers_loss": 0.04564022272825241, + "skip_count": 1.0, + "step": 1396, + "text_loss": 0.582602858543396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.000980852985170867, + "loss": 0.0328, + "macro_f1": 0.3272727429866791, + "num_tokens": 2256626.0, + "repeat_count": 0.0, + "routers_loss": 0.013289985246956348, + "skip_count": 0.0, + "step": 1398, + "text_loss": 0.41031694412231445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.5729380686821255, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009807680597520745, + "loss": 0.0264, + "macro_f1": 0.3333333432674408, + "num_tokens": 2259326.0, + "repeat_count": 0.0, + "routers_loss": 0.0065213534981012344, + "skip_count": 0.0, + "step": 1400, + "text_loss": 0.2888098657131195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.58233049603757, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0009806829501020546, + "loss": 0.0358, + "macro_f1": 0.3272727429866791, + "num_tokens": 2262344.0, + "repeat_count": 0.0, + "routers_loss": 0.04199840500950813, + "skip_count": 1.0, + "step": 1402, + "text_loss": 0.31973034143447876 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.591722923393014, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009805976562534215, + "loss": 0.0317, + "macro_f1": 0.6603773832321167, + "num_tokens": 2266354.0, + "repeat_count": 1.0, + "routers_loss": 0.015434930101037025, + "skip_count": 1.0, + "step": 1404, + "text_loss": 0.508630633354187 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 6.601115350748459, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009805121782388599, + "loss": 0.0339, + "macro_f1": 0.6533333659172058, + "num_tokens": 2269660.0, + "repeat_count": 2.0, + "routers_loss": 0.0720924660563469, + "skip_count": 2.0, + "step": 1406, + "text_loss": 0.40927737951278687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.610507778103904, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009804265160911253, + "loss": 0.0266, + "macro_f1": 0.5492662787437439, + "num_tokens": 2273335.0, + "repeat_count": 0.0, + "routers_loss": 0.02400495670735836, + "skip_count": 2.0, + "step": 1408, + "text_loss": 0.1777762621641159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.6199002054593485, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2314453125, + "learning_rate": 0.0009803406698430433, + "loss": 0.0371, + "macro_f1": 0.3272727429866791, + "num_tokens": 2277107.0, + "repeat_count": 0.0, + "routers_loss": 0.02560107782483101, + "skip_count": 1.0, + "step": 1410, + "text_loss": 0.17955881357192993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.629292632814793, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009802546395275104, + "loss": 0.0349, + "macro_f1": 0.3333333432674408, + "num_tokens": 2281638.0, + "repeat_count": 0.0, + "routers_loss": 0.006655813194811344, + "skip_count": 0.0, + "step": 1412, + "text_loss": 0.20882295072078705 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 6.638685060170237, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.08740234375, + "learning_rate": 0.000980168425177494, + "loss": 0.0342, + "macro_f1": 0.8200000524520874, + "num_tokens": 2284876.0, + "repeat_count": 1.0, + "routers_loss": 0.06325097382068634, + "skip_count": 3.0, + "step": 1414, + "text_loss": 0.26035264134407043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.000980082026826031, + "loss": 0.0315, + "macro_f1": 0.3272727429866791, + "num_tokens": 2288938.0, + "repeat_count": 1.0, + "routers_loss": 0.013436575420200825, + "skip_count": 0.0, + "step": 1416, + "text_loss": 0.5502325892448425 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.657469914881127, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0009799954445062296, + "loss": 0.0193, + "macro_f1": 0.6603773832321167, + "num_tokens": 2292317.0, + "repeat_count": 1.0, + "routers_loss": 0.011264479719102383, + "skip_count": 1.0, + "step": 1418, + "text_loss": 0.48075684905052185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.666862342236572, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009799086782512686, + "loss": 0.0292, + "macro_f1": 0.5492662787437439, + "num_tokens": 2295935.0, + "repeat_count": 0.0, + "routers_loss": 0.02833271212875843, + "skip_count": 2.0, + "step": 1420, + "text_loss": 0.18221206963062286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09375, + "learning_rate": 0.0009798217280943967, + "loss": 0.0356, + "macro_f1": 0.6666666865348816, + "num_tokens": 2298927.0, + "repeat_count": 0.0, + "routers_loss": 0.009208574891090393, + "skip_count": 1.0, + "step": 1422, + "text_loss": 0.48686322569847107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.685647196947461, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009797345940689335, + "loss": 0.0267, + "macro_f1": 0.3272727429866791, + "num_tokens": 2301541.0, + "repeat_count": 0.0, + "routers_loss": 0.015011847950518131, + "skip_count": 0.0, + "step": 1424, + "text_loss": 0.49446266889572144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.4000000059604645, + "avg_layers": 26.0, + "epoch": 6.695039624302906, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.0, + "f1_skip": 0.5714285969734192, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009796472762082687, + "loss": 0.0338, + "macro_f1": 0.5034013986587524, + "num_tokens": 2304589.0, + "repeat_count": 0.0, + "routers_loss": 0.05912091210484505, + "skip_count": 5.0, + "step": 1426, + "text_loss": 0.23945684731006622 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.70443205165835, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.000979559774545863, + "loss": 0.0405, + "macro_f1": 0.3272727429866791, + "num_tokens": 2307860.0, + "repeat_count": 0.0, + "routers_loss": 0.021242303773760796, + "skip_count": 1.0, + "step": 1428, + "text_loss": 0.531273365020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.000979472089115247, + "loss": 0.0276, + "macro_f1": 0.32098764181137085, + "num_tokens": 2311581.0, + "repeat_count": 0.0, + "routers_loss": 0.02768544852733612, + "skip_count": 2.0, + "step": 1430, + "text_loss": 0.2497459501028061 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.000979384219950022, + "loss": 0.0346, + "macro_f1": 0.3333333432674408, + "num_tokens": 2314639.0, + "repeat_count": 0.0, + "routers_loss": 0.008678150363266468, + "skip_count": 0.0, + "step": 1432, + "text_loss": 0.6579355001449585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.732609333724684, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08056640625, + "learning_rate": 0.0009792961670838595, + "loss": 0.0362, + "macro_f1": 0.3272727429866791, + "num_tokens": 2317927.0, + "repeat_count": 1.0, + "routers_loss": 0.03325597569346428, + "skip_count": 0.0, + "step": 1434, + "text_loss": 0.5209436416625977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.742001761080129, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009792079305505016, + "loss": 0.0306, + "macro_f1": 0.3272727429866791, + "num_tokens": 2321065.0, + "repeat_count": 1.0, + "routers_loss": 0.019228918477892876, + "skip_count": 0.0, + "step": 1436, + "text_loss": 0.41087067127227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.000979119510383761, + "loss": 0.0371, + "macro_f1": 0.3333333432674408, + "num_tokens": 2323714.0, + "repeat_count": 0.0, + "routers_loss": 0.017071325331926346, + "skip_count": 0.0, + "step": 1438, + "text_loss": 0.21490029990673065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.760786615791019, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.00097903090661752, + "loss": 0.0309, + "macro_f1": 0.3333333432674408, + "num_tokens": 2326454.0, + "repeat_count": 0.0, + "routers_loss": 0.00991755723953247, + "skip_count": 0.0, + "step": 1440, + "text_loss": 0.23847346007823944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.232421875, + "learning_rate": 0.000978942119285732, + "loss": 0.0404, + "macro_f1": 0.3272727429866791, + "num_tokens": 2329462.0, + "repeat_count": 0.0, + "routers_loss": 0.04908733069896698, + "skip_count": 1.0, + "step": 1442, + "text_loss": 0.23343028128147125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.7795714705019074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009788531484224204, + "loss": 0.0264, + "macro_f1": 0.3333333432674408, + "num_tokens": 2332146.0, + "repeat_count": 0.0, + "routers_loss": 0.0032628148328512907, + "skip_count": 0.0, + "step": 1444, + "text_loss": 0.47423800826072693 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 6.788963897857353, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009787639940616788, + "loss": 0.0405, + "macro_f1": 0.7018141150474548, + "num_tokens": 2335738.0, + "repeat_count": 1.0, + "routers_loss": 0.14336998760700226, + "skip_count": 3.0, + "step": 1446, + "text_loss": 0.21837592124938965 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.189453125, + "learning_rate": 0.0009786746562376717, + "loss": 0.0241, + "macro_f1": 0.6666666865348816, + "num_tokens": 2338488.0, + "repeat_count": 0.0, + "routers_loss": 0.010542908683419228, + "skip_count": 1.0, + "step": 1448, + "text_loss": 1.0614757537841797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.807748752568242, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009785851349846334, + "loss": 0.0268, + "macro_f1": 0.3333333432674408, + "num_tokens": 2342074.0, + "repeat_count": 0.0, + "routers_loss": 0.005998016335070133, + "skip_count": 0.0, + "step": 1450, + "text_loss": 0.4269719421863556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 6.817141179923686, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009784954303368686, + "loss": 0.0384, + "macro_f1": 0.44705885648727417, + "num_tokens": 2345838.0, + "repeat_count": 0.0, + "routers_loss": 0.0959126204252243, + "skip_count": 3.0, + "step": 1452, + "text_loss": 0.3315916955471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009784055423287521, + "loss": 0.0218, + "macro_f1": 0.3333333432674408, + "num_tokens": 2348939.0, + "repeat_count": 0.0, + "routers_loss": 0.0025467623490840197, + "skip_count": 0.0, + "step": 1454, + "text_loss": 0.6162732839584351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.835926034634576, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009783154709947293, + "loss": 0.0256, + "macro_f1": 0.3272727429866791, + "num_tokens": 2352232.0, + "repeat_count": 0.0, + "routers_loss": 0.01860538125038147, + "skip_count": 1.0, + "step": 1456, + "text_loss": 0.23928768932819366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.84531846199002, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009782252163693158, + "loss": 0.0201, + "macro_f1": 0.3272727429866791, + "num_tokens": 2355159.0, + "repeat_count": 0.0, + "routers_loss": 0.04412713274359703, + "skip_count": 1.0, + "step": 1458, + "text_loss": 0.3371323347091675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.0009781347784870973, + "loss": 0.0379, + "macro_f1": 0.3333333432674408, + "num_tokens": 2358175.0, + "repeat_count": 0.0, + "routers_loss": 0.006809141952544451, + "skip_count": 0.0, + "step": 1460, + "text_loss": 0.547267735004425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.86410331670091, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009780441573827296, + "loss": 0.03, + "macro_f1": 0.3076923191547394, + "num_tokens": 2360991.0, + "repeat_count": 0.0, + "routers_loss": 0.08924390375614166, + "skip_count": 4.0, + "step": 1462, + "text_loss": 0.7026563882827759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.000977953353090939, + "loss": 0.0272, + "macro_f1": 0.3333333432674408, + "num_tokens": 2363894.0, + "repeat_count": 0.0, + "routers_loss": 0.021858472377061844, + "skip_count": 0.0, + "step": 1464, + "text_loss": 0.2718065083026886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.882888171411799, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009778623656465219, + "loss": 0.0338, + "macro_f1": 0.32098764181137085, + "num_tokens": 2367265.0, + "repeat_count": 0.0, + "routers_loss": 0.044781096279621124, + "skip_count": 0.0, + "step": 1466, + "text_loss": 0.5008095502853394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.892280598767244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009777711950843448, + "loss": 0.0212, + "macro_f1": 0.3333333432674408, + "num_tokens": 2370186.0, + "repeat_count": 0.0, + "routers_loss": 0.0040459707379341125, + "skip_count": 0.0, + "step": 1468, + "text_loss": 0.5242461562156677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 6.901673026122689, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009776798414393446, + "loss": 0.0279, + "macro_f1": 0.6598639488220215, + "num_tokens": 2373314.0, + "repeat_count": 1.0, + "routers_loss": 0.0708528608083725, + "skip_count": 3.0, + "step": 1470, + "text_loss": 0.2821732461452484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.911065453478133, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1328125, + "learning_rate": 0.0009775883047465279, + "loss": 0.0414, + "macro_f1": 0.31446540355682373, + "num_tokens": 2376435.0, + "repeat_count": 1.0, + "routers_loss": 0.0290578193962574, + "skip_count": 1.0, + "step": 1472, + "text_loss": 0.8438440561294556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.9204578808335775, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10546875, + "learning_rate": 0.000977496585040972, + "loss": 0.0373, + "macro_f1": 0.3333333432674408, + "num_tokens": 2380244.0, + "repeat_count": 0.0, + "routers_loss": 0.010360375046730042, + "skip_count": 0.0, + "step": 1474, + "text_loss": 0.4356135427951813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.929850308189023, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.000977404682357824, + "loss": 0.0294, + "macro_f1": 0.3272727429866791, + "num_tokens": 2383498.0, + "repeat_count": 0.0, + "routers_loss": 0.023518972098827362, + "skip_count": 0.0, + "step": 1476, + "text_loss": 0.25195425748825073 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 6.939242735544467, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.000977312596732301, + "loss": 0.0375, + "macro_f1": 0.9544159770011902, + "num_tokens": 2386414.0, + "repeat_count": 5.0, + "routers_loss": 0.08190606534481049, + "skip_count": 4.0, + "step": 1478, + "text_loss": 0.6586798429489136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10546875, + "learning_rate": 0.0009772203281996905, + "loss": 0.0336, + "macro_f1": 1.0, + "num_tokens": 2389399.0, + "repeat_count": 1.0, + "routers_loss": 0.016441475600004196, + "skip_count": 2.0, + "step": 1480, + "text_loss": 0.3671986758708954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009771278767953502, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 2392400.0, + "repeat_count": 0.0, + "routers_loss": 0.019211363047361374, + "skip_count": 0.0, + "step": 1482, + "text_loss": 0.27418580651283264 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.967420017610801, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009770352425547072, + "loss": 0.0292, + "macro_f1": 0.3333333432674408, + "num_tokens": 2395123.0, + "repeat_count": 0.0, + "routers_loss": 0.015800386667251587, + "skip_count": 0.0, + "step": 1484, + "text_loss": 0.19896622002124786 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.976812444966246, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009769424255132596, + "loss": 0.0256, + "macro_f1": 0.4871794879436493, + "num_tokens": 2397359.0, + "repeat_count": 3.0, + "routers_loss": 0.06670158356428146, + "skip_count": 0.0, + "step": 1486, + "text_loss": 0.4229799509048462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.98620487232169, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1162109375, + "learning_rate": 0.0009768494257065747, + "loss": 0.0218, + "macro_f1": 0.3272727429866791, + "num_tokens": 2400387.0, + "repeat_count": 0.0, + "routers_loss": 0.011144762858748436, + "skip_count": 1.0, + "step": 1488, + "text_loss": 0.4264226257801056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.995597299677136, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0009767562431702904, + "loss": 0.0387, + "macro_f1": 0.3006536364555359, + "num_tokens": 2403241.0, + "repeat_count": 2.0, + "routers_loss": 0.12339717149734497, + "skip_count": 3.0, + "step": 1490, + "text_loss": 0.2850193977355957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.004696213677723, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0009766628779401142, + "loss": 0.0215, + "macro_f1": 0.6666666865348816, + "num_tokens": 2406087.0, + "repeat_count": 0.0, + "routers_loss": 0.008174685761332512, + "skip_count": 1.0, + "step": 1492, + "text_loss": 0.6756544709205627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.000976569330051824, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 2409312.0, + "repeat_count": 0.0, + "routers_loss": 0.0021256296895444393, + "skip_count": 0.0, + "step": 1494, + "text_loss": 0.4789894223213196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.0234810683886115, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009764755995412677, + "loss": 0.0193, + "macro_f1": 0.3333333432674408, + "num_tokens": 2412758.0, + "repeat_count": 0.0, + "routers_loss": 0.003944927826523781, + "skip_count": 0.0, + "step": 1496, + "text_loss": 0.5157490968704224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.032873495744056, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009763816864443627, + "loss": 0.0239, + "macro_f1": 0.3272727429866791, + "num_tokens": 2416079.0, + "repeat_count": 1.0, + "routers_loss": 0.03893325850367546, + "skip_count": 0.0, + "step": 1498, + "text_loss": 0.28045418858528137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009762875907970968, + "loss": 0.0199, + "macro_f1": 0.3333333432674408, + "num_tokens": 2420340.0, + "repeat_count": 0.0, + "routers_loss": 0.0017725443467497826, + "skip_count": 0.0, + "step": 1500, + "text_loss": 0.35550856590270996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.051658350454946, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009761933126355277, + "loss": 0.0245, + "macro_f1": 0.3272727429866791, + "num_tokens": 2424735.0, + "repeat_count": 0.0, + "routers_loss": 0.01393749937415123, + "skip_count": 1.0, + "step": 1502, + "text_loss": 0.38840189576148987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009760988519957828, + "loss": 0.0249, + "macro_f1": 0.6666666865348816, + "num_tokens": 2428132.0, + "repeat_count": 0.0, + "routers_loss": 0.01687910407781601, + "skip_count": 2.0, + "step": 1504, + "text_loss": 0.3031681478023529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.0704432051658355, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009760042089140598, + "loss": 0.0193, + "macro_f1": 0.3144654333591461, + "num_tokens": 2431592.0, + "repeat_count": 1.0, + "routers_loss": 0.04704280197620392, + "skip_count": 2.0, + "step": 1506, + "text_loss": 0.16355200111865997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009759093834266259, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 2434236.0, + "repeat_count": 0.0, + "routers_loss": 0.0016075772000476718, + "skip_count": 0.0, + "step": 1508, + "text_loss": 0.6080073118209839 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009758143755698186, + "loss": 0.015, + "macro_f1": 0.3333333432674408, + "num_tokens": 2437170.0, + "repeat_count": 0.0, + "routers_loss": 0.008451299741864204, + "skip_count": 0.0, + "step": 1510, + "text_loss": 0.22100484371185303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 7.098620487232169, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009757191853800449, + "loss": 0.0227, + "macro_f1": 0.5866667032241821, + "num_tokens": 2441187.0, + "repeat_count": 1.0, + "routers_loss": 0.046565692871809006, + "skip_count": 3.0, + "step": 1512, + "text_loss": 0.25098952651023865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.108012914587614, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.000975623812893782, + "loss": 0.0276, + "macro_f1": 0.3272727429866791, + "num_tokens": 2444664.0, + "repeat_count": 0.0, + "routers_loss": 0.02872578240931034, + "skip_count": 1.0, + "step": 1514, + "text_loss": 0.4952253997325897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.1174053419430585, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.0009755282581475768, + "loss": 0.0233, + "macro_f1": 0.3333333432674408, + "num_tokens": 2447748.0, + "repeat_count": 0.0, + "routers_loss": 0.002055214950814843, + "skip_count": 0.0, + "step": 1516, + "text_loss": 0.7465500831604004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.126797769298503, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10302734375, + "learning_rate": 0.000975432521178046, + "loss": 0.0216, + "macro_f1": 0.3272727429866791, + "num_tokens": 2450834.0, + "repeat_count": 1.0, + "routers_loss": 0.04498551785945892, + "skip_count": 0.0, + "step": 1518, + "text_loss": 0.28144413232803345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009753366020218763, + "loss": 0.0234, + "macro_f1": 0.3333333432674408, + "num_tokens": 2454233.0, + "repeat_count": 0.0, + "routers_loss": 0.003669742727652192, + "skip_count": 0.0, + "step": 1520, + "text_loss": 0.5667551755905151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009752405007158238, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2457331.0, + "repeat_count": 0.0, + "routers_loss": 0.010455607436597347, + "skip_count": 0.0, + "step": 1522, + "text_loss": 0.19575810432434082 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 7.154975051364837, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009751442172967151, + "loss": 0.0193, + "macro_f1": 0.8823530077934265, + "num_tokens": 2459935.0, + "repeat_count": 2.0, + "routers_loss": 0.025189083069562912, + "skip_count": 1.0, + "step": 1524, + "text_loss": 0.45453405380249023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.164367478720282, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.000975047751801446, + "loss": 0.0187, + "macro_f1": 0.3272727429866791, + "num_tokens": 2463008.0, + "repeat_count": 0.0, + "routers_loss": 0.012297490611672401, + "skip_count": 0.0, + "step": 1526, + "text_loss": 0.31437572836875916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009749511042669823, + "loss": 0.0233, + "macro_f1": 0.3333333432674408, + "num_tokens": 2466475.0, + "repeat_count": 0.0, + "routers_loss": 0.011026266030967236, + "skip_count": 0.0, + "step": 1528, + "text_loss": 0.46604859828948975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.183152333431171, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009748542747303595, + "loss": 0.0182, + "macro_f1": 0.3272727429866791, + "num_tokens": 2469320.0, + "repeat_count": 0.0, + "routers_loss": 0.011934996582567692, + "skip_count": 1.0, + "step": 1530, + "text_loss": 0.7764923572540283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009747572632286827, + "loss": 0.0203, + "macro_f1": 0.3333333432674408, + "num_tokens": 2472468.0, + "repeat_count": 0.0, + "routers_loss": 0.005786920432001352, + "skip_count": 0.0, + "step": 1532, + "text_loss": 0.3555782437324524 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009746600697991271, + "loss": 0.02, + "macro_f1": 0.6666666865348816, + "num_tokens": 2475736.0, + "repeat_count": 1.0, + "routers_loss": 0.0026990731712430716, + "skip_count": 0.0, + "step": 1534, + "text_loss": 0.49561792612075806 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 7.2113296154975055, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0556640625, + "learning_rate": 0.0009745626944789375, + "loss": 0.0204, + "macro_f1": 0.8823530077934265, + "num_tokens": 2478887.0, + "repeat_count": 1.0, + "routers_loss": 0.020221207290887833, + "skip_count": 2.0, + "step": 1536, + "text_loss": 0.5375416278839111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.22072204285295, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009744651373054279, + "loss": 0.0286, + "macro_f1": 0.3272727429866791, + "num_tokens": 2481293.0, + "repeat_count": 0.0, + "routers_loss": 0.03131086751818657, + "skip_count": 1.0, + "step": 1538, + "text_loss": 0.5241039395332336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 7.230114470208394, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.08984375, + "learning_rate": 0.0009743673983159828, + "loss": 0.0241, + "macro_f1": 0.6122449040412903, + "num_tokens": 2484403.0, + "repeat_count": 0.0, + "routers_loss": 0.04448170214891434, + "skip_count": 4.0, + "step": 1540, + "text_loss": 0.7465724349021912 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009742694775480557, + "loss": 0.0265, + "macro_f1": 0.6666666865348816, + "num_tokens": 2487952.0, + "repeat_count": 0.0, + "routers_loss": 0.007171491626650095, + "skip_count": 1.0, + "step": 1542, + "text_loss": 0.2877117097377777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009741713750391703, + "loss": 0.0171, + "macro_f1": 0.6666666865348816, + "num_tokens": 2490815.0, + "repeat_count": 1.0, + "routers_loss": 0.004559285007417202, + "skip_count": 0.0, + "step": 1544, + "text_loss": 0.6097800135612488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.258291752274729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009740730908269193, + "loss": 0.0174, + "macro_f1": 0.3333333432674408, + "num_tokens": 2494727.0, + "repeat_count": 0.0, + "routers_loss": 0.005271553061902523, + "skip_count": 0.0, + "step": 1546, + "text_loss": 0.5431114435195923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009739746249489658, + "loss": 0.0239, + "macro_f1": 0.3333333432674408, + "num_tokens": 2499266.0, + "repeat_count": 0.0, + "routers_loss": 0.0015409323386847973, + "skip_count": 0.0, + "step": 1548, + "text_loss": 0.4702678322792053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.277076606985618, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1171875, + "learning_rate": 0.0009738759774430417, + "loss": 0.0216, + "macro_f1": 0.32098764181137085, + "num_tokens": 2502273.0, + "repeat_count": 1.0, + "routers_loss": 0.030183158814907074, + "skip_count": 1.0, + "step": 1550, + "text_loss": 0.3239189088344574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.286469034341063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009737771483469493, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 2507624.0, + "repeat_count": 0.0, + "routers_loss": 0.005410848651081324, + "skip_count": 0.0, + "step": 1552, + "text_loss": 0.4014642834663391 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009736781376985598, + "loss": 0.0168, + "macro_f1": 0.6666666865348816, + "num_tokens": 2510366.0, + "repeat_count": 0.0, + "routers_loss": 0.0066976165398955345, + "skip_count": 1.0, + "step": 1554, + "text_loss": 0.5924848914146423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.13671875, + "learning_rate": 0.0009735789455358144, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 2513317.0, + "repeat_count": 0.0, + "routers_loss": 0.002763477386906743, + "skip_count": 0.0, + "step": 1556, + "text_loss": 0.3222943842411041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.314646316407397, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009734795718967237, + "loss": 0.0283, + "macro_f1": 0.32098764181137085, + "num_tokens": 2516628.0, + "repeat_count": 0.0, + "routers_loss": 0.061566028743982315, + "skip_count": 2.0, + "step": 1558, + "text_loss": 0.3249334692955017 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009733800168193679, + "loss": 0.0228, + "macro_f1": 1.0, + "num_tokens": 2519424.0, + "repeat_count": 2.0, + "routers_loss": 0.017976421862840652, + "skip_count": 4.0, + "step": 1560, + "text_loss": 0.3341919481754303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.0009732802803418966, + "loss": 0.023, + "macro_f1": 0.3333333432674408, + "num_tokens": 2522922.0, + "repeat_count": 0.0, + "routers_loss": 0.002525332849472761, + "skip_count": 0.0, + "step": 1562, + "text_loss": 0.3176332712173462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.34282359847373, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07861328125, + "learning_rate": 0.0009731803625025292, + "loss": 0.0196, + "macro_f1": 0.3272727429866791, + "num_tokens": 2525811.0, + "repeat_count": 0.0, + "routers_loss": 0.015524424612522125, + "skip_count": 1.0, + "step": 1564, + "text_loss": 0.532774031162262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.3522160258291755, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009730802633395541, + "loss": 0.0257, + "macro_f1": 0.6603773832321167, + "num_tokens": 2529157.0, + "repeat_count": 1.0, + "routers_loss": 0.08138631284236908, + "skip_count": 1.0, + "step": 1566, + "text_loss": 0.529487133026123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009729799828913298, + "loss": 0.0223, + "macro_f1": 0.3333333432674408, + "num_tokens": 2532249.0, + "repeat_count": 0.0, + "routers_loss": 0.0035867292899638414, + "skip_count": 0.0, + "step": 1568, + "text_loss": 0.503160297870636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.371000880540064, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009728795211962838, + "loss": 0.0259, + "macro_f1": 0.5492662787437439, + "num_tokens": 2535904.0, + "repeat_count": 0.0, + "routers_loss": 0.02987455204129219, + "skip_count": 2.0, + "step": 1570, + "text_loss": 0.9170270562171936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.380393307895509, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11865234375, + "learning_rate": 0.0009727788782929131, + "loss": 0.0273, + "macro_f1": 0.3272727429866791, + "num_tokens": 2538943.0, + "repeat_count": 1.0, + "routers_loss": 0.04676021635532379, + "skip_count": 0.0, + "step": 1572, + "text_loss": 0.29146310687065125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.389785735250954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009726780542197844, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 2541805.0, + "repeat_count": 0.0, + "routers_loss": 0.002127803163602948, + "skip_count": 0.0, + "step": 1574, + "text_loss": 1.0126502513885498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009725770490155338, + "loss": 0.0262, + "macro_f1": 0.3333333432674408, + "num_tokens": 2546213.0, + "repeat_count": 0.0, + "routers_loss": 0.007609677035361528, + "skip_count": 0.0, + "step": 1576, + "text_loss": 0.190168559551239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.408570589961843, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009724758627188665, + "loss": 0.0356, + "macro_f1": 0.3272727429866791, + "num_tokens": 2549554.0, + "repeat_count": 0.0, + "routers_loss": 0.033554721623659134, + "skip_count": 1.0, + "step": 1578, + "text_loss": 0.2977406084537506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.4179630173172875, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009723744953685572, + "loss": 0.028, + "macro_f1": 0.3272727429866791, + "num_tokens": 2552785.0, + "repeat_count": 1.0, + "routers_loss": 0.027864238247275352, + "skip_count": 0.0, + "step": 1580, + "text_loss": 0.2700682580471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19921875, + "learning_rate": 0.0009722729470034503, + "loss": 0.0224, + "macro_f1": 0.3333333432674408, + "num_tokens": 2556550.0, + "repeat_count": 0.0, + "routers_loss": 0.004798175301402807, + "skip_count": 0.0, + "step": 1582, + "text_loss": 0.6559903025627136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.436747872028177, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.078125, + "learning_rate": 0.0009721712176624591, + "loss": 0.0242, + "macro_f1": 0.3333333432674408, + "num_tokens": 2559862.0, + "repeat_count": 0.0, + "routers_loss": 0.013764148578047752, + "skip_count": 0.0, + "step": 1584, + "text_loss": 0.2257535308599472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.446140299383622, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009720693073845667, + "loss": 0.032, + "macro_f1": 0.5492662787437439, + "num_tokens": 2562766.0, + "repeat_count": 0.0, + "routers_loss": 0.01937069371342659, + "skip_count": 2.0, + "step": 1586, + "text_loss": 0.178413525223732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.455532726739067, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009719672162088252, + "loss": 0.0306, + "macro_f1": 0.32098767161369324, + "num_tokens": 2566583.0, + "repeat_count": 1.0, + "routers_loss": 0.06224144622683525, + "skip_count": 0.0, + "step": 1588, + "text_loss": 0.3992367684841156 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 7.464925154094511, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.185546875, + "learning_rate": 0.0009718649441743559, + "loss": 0.0239, + "macro_f1": 0.9449735879898071, + "num_tokens": 2569516.0, + "repeat_count": 2.0, + "routers_loss": 0.06937911361455917, + "skip_count": 4.0, + "step": 1590, + "text_loss": 0.1945122629404068 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.00097176249132035, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2572418.0, + "repeat_count": 0.0, + "routers_loss": 0.0034326619934290648, + "skip_count": 0.0, + "step": 1592, + "text_loss": 0.6259906888008118 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009716598576860676, + "loss": 0.0278, + "macro_f1": 0.6666666865348816, + "num_tokens": 2575235.0, + "repeat_count": 1.0, + "routers_loss": 0.004557516425848007, + "skip_count": 0.0, + "step": 1594, + "text_loss": 0.6638736724853516 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 7.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009715570433108378, + "loss": 0.0198, + "macro_f1": 1.0, + "num_tokens": 2578157.0, + "repeat_count": 1.0, + "routers_loss": 0.015363055281341076, + "skip_count": 1.0, + "step": 1596, + "text_loss": 0.6530464887619019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009714540482340595, + "loss": 0.0268, + "macro_f1": 0.6666666865348816, + "num_tokens": 2581801.0, + "repeat_count": 1.0, + "routers_loss": 0.01257144846022129, + "skip_count": 0.0, + "step": 1598, + "text_loss": 0.5916110277175903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.5118872908717345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009713508724952006, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 2585204.0, + "repeat_count": 0.0, + "routers_loss": 0.003175645601004362, + "skip_count": 0.0, + "step": 1600, + "text_loss": 0.27901601791381836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0009712475161337981, + "loss": 0.0261, + "macro_f1": 0.3333333432674408, + "num_tokens": 2588286.0, + "repeat_count": 0.0, + "routers_loss": 0.004122321493923664, + "skip_count": 0.0, + "step": 1602, + "text_loss": 0.42420244216918945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009711439791894585, + "loss": 0.0341, + "macro_f1": 0.6666666865348816, + "num_tokens": 2591476.0, + "repeat_count": 0.0, + "routers_loss": 0.011215819045901299, + "skip_count": 1.0, + "step": 1604, + "text_loss": 0.5549933910369873 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.540064572938069, + "f1_execute": 0.9599999785423279, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.0703125, + "learning_rate": 0.0009710402617018574, + "loss": 0.0172, + "macro_f1": 0.8200000524520874, + "num_tokens": 2594336.0, + "repeat_count": 1.0, + "routers_loss": 0.02916567400097847, + "skip_count": 2.0, + "step": 1606, + "text_loss": 0.3263779282569885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009709363637107393, + "loss": 0.0209, + "macro_f1": 0.6666666865348816, + "num_tokens": 2597462.0, + "repeat_count": 0.0, + "routers_loss": 0.015897957608103752, + "skip_count": 1.0, + "step": 1608, + "text_loss": 0.20917139947414398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009708322852559184, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2601543.0, + "repeat_count": 0.0, + "routers_loss": 0.002211357234045863, + "skip_count": 0.0, + "step": 1610, + "text_loss": 0.450550377368927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.568241855004403, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009707280263772776, + "loss": 0.0277, + "macro_f1": 0.6666666865348816, + "num_tokens": 2604462.0, + "repeat_count": 0.0, + "routers_loss": 0.01615734025835991, + "skip_count": 2.0, + "step": 1612, + "text_loss": 0.6908381581306458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.577634282359847, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009706235871147688, + "loss": 0.0241, + "macro_f1": 0.5492662787437439, + "num_tokens": 2607484.0, + "repeat_count": 0.0, + "routers_loss": 0.022048067301511765, + "skip_count": 2.0, + "step": 1614, + "text_loss": 0.36691340804100037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.587026709715292, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10546875, + "learning_rate": 0.0009705189675084138, + "loss": 0.0176, + "macro_f1": 0.6666666865348816, + "num_tokens": 2610204.0, + "repeat_count": 0.0, + "routers_loss": 0.008503952994942665, + "skip_count": 1.0, + "step": 1616, + "text_loss": 0.5226598381996155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.596419137070737, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009704141675983029, + "loss": 0.0248, + "macro_f1": 0.3333333432674408, + "num_tokens": 2613128.0, + "repeat_count": 0.0, + "routers_loss": 0.0019020626787096262, + "skip_count": 0.0, + "step": 1618, + "text_loss": 0.6465088725090027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5714285969734192, + "avg_layers": 24.0, + "epoch": 7.6058115644261814, + "f1_execute": 0.9333333373069763, + "f1_repeat": 0.0, + "f1_skip": 0.7272727489471436, + "grad_norm": 0.107421875, + "learning_rate": 0.0009703091874245956, + "loss": 0.032, + "macro_f1": 0.5535354018211365, + "num_tokens": 2616360.0, + "repeat_count": 0.0, + "routers_loss": 0.11837691068649292, + "skip_count": 7.0, + "step": 1620, + "text_loss": 0.2987039089202881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009702040270275204, + "loss": 0.0181, + "macro_f1": 0.3333333432674408, + "num_tokens": 2619606.0, + "repeat_count": 0.0, + "routers_loss": 0.0065958453342318535, + "skip_count": 0.0, + "step": 1622, + "text_loss": 0.6262096166610718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.103515625, + "learning_rate": 0.000970098686447375, + "loss": 0.0257, + "macro_f1": 0.6666666865348816, + "num_tokens": 2622499.0, + "repeat_count": 0.0, + "routers_loss": 0.013632026500999928, + "skip_count": 1.0, + "step": 1624, + "text_loss": 0.2392602562904358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.633988846492516, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.125, + "learning_rate": 0.0009699931657245264, + "loss": 0.0245, + "macro_f1": 0.5492662787437439, + "num_tokens": 2626002.0, + "repeat_count": 0.0, + "routers_loss": 0.012147823348641396, + "skip_count": 2.0, + "step": 1626, + "text_loss": 0.4742976129055023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009698874648994098, + "loss": 0.0285, + "macro_f1": 1.0, + "num_tokens": 2629847.0, + "repeat_count": 1.0, + "routers_loss": 0.010692884214222431, + "skip_count": 3.0, + "step": 1628, + "text_loss": 0.5090685486793518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.6527737012034045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009697815840125304, + "loss": 0.0265, + "macro_f1": 0.3333333432674408, + "num_tokens": 2633529.0, + "repeat_count": 0.0, + "routers_loss": 0.011442207731306553, + "skip_count": 0.0, + "step": 1630, + "text_loss": 0.1874329298734665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2119140625, + "learning_rate": 0.0009696755231044618, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 2636321.0, + "repeat_count": 0.0, + "routers_loss": 0.0026681360322982073, + "skip_count": 0.0, + "step": 1632, + "text_loss": 0.7650400400161743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.671558555914294, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10498046875, + "learning_rate": 0.0009695692822158466, + "loss": 0.0242, + "macro_f1": 0.3272727429866791, + "num_tokens": 2638840.0, + "repeat_count": 1.0, + "routers_loss": 0.033965807408094406, + "skip_count": 0.0, + "step": 1634, + "text_loss": 0.6175784468650818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009694628613873968, + "loss": 0.018, + "macro_f1": 0.3333333432674408, + "num_tokens": 2641886.0, + "repeat_count": 0.0, + "routers_loss": 0.007568214554339647, + "skip_count": 0.0, + "step": 1636, + "text_loss": 0.43139931559562683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.690343410625183, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009693562606598929, + "loss": 0.025, + "macro_f1": 0.3333333432674408, + "num_tokens": 2645028.0, + "repeat_count": 0.0, + "routers_loss": 0.004973865579813719, + "skip_count": 0.0, + "step": 1638, + "text_loss": 0.6430339217185974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.699735837980628, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009692494800741844, + "loss": 0.0313, + "macro_f1": 0.3272727429866791, + "num_tokens": 2648209.0, + "repeat_count": 1.0, + "routers_loss": 0.049863800406455994, + "skip_count": 0.0, + "step": 1640, + "text_loss": 0.28138160705566406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.709128265336073, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.0009691425196711901, + "loss": 0.0398, + "macro_f1": 0.3272727429866791, + "num_tokens": 2651171.0, + "repeat_count": 0.0, + "routers_loss": 0.02112230286002159, + "skip_count": 0.0, + "step": 1642, + "text_loss": 0.3745322525501251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.718520692691517, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009690353794918971, + "loss": 0.0275, + "macro_f1": 0.3333333432674408, + "num_tokens": 2654093.0, + "repeat_count": 0.0, + "routers_loss": 0.0024304776452481747, + "skip_count": 0.0, + "step": 1644, + "text_loss": 0.4275154173374176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.000968928059577362, + "loss": 0.0244, + "macro_f1": 0.6666666865348816, + "num_tokens": 2657079.0, + "repeat_count": 0.0, + "routers_loss": 0.009320619516074657, + "skip_count": 1.0, + "step": 1646, + "text_loss": 0.46650025248527527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.737305547402407, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009688205599687099, + "loss": 0.0209, + "macro_f1": 0.3272727429866791, + "num_tokens": 2660951.0, + "repeat_count": 0.0, + "routers_loss": 0.011913162656128407, + "skip_count": 0.0, + "step": 1648, + "text_loss": 0.46644100546836853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.7466979747578515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009687128807071347, + "loss": 0.0284, + "macro_f1": 0.3333333432674408, + "num_tokens": 2663823.0, + "repeat_count": 0.0, + "routers_loss": 0.013754756189882755, + "skip_count": 0.0, + "step": 1650, + "text_loss": 0.40808847546577454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.0009686050218338996, + "loss": 0.0286, + "macro_f1": 0.3333333432674408, + "num_tokens": 2667079.0, + "repeat_count": 0.0, + "routers_loss": 0.009099726565182209, + "skip_count": 0.0, + "step": 1652, + "text_loss": 0.2389989197254181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009684969833903359, + "loss": 0.0283, + "macro_f1": 0.6666666865348816, + "num_tokens": 2670162.0, + "repeat_count": 0.0, + "routers_loss": 0.0034928603563457727, + "skip_count": 1.0, + "step": 1654, + "text_loss": 0.6930749416351318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.774875256824186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009683887654178445, + "loss": 0.0261, + "macro_f1": 0.6666666865348816, + "num_tokens": 2673031.0, + "repeat_count": 0.0, + "routers_loss": 0.008340462110936642, + "skip_count": 1.0, + "step": 1656, + "text_loss": 0.277752548456192 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009682803679578947, + "loss": 0.0259, + "macro_f1": 0.3333333432674408, + "num_tokens": 2676092.0, + "repeat_count": 0.0, + "routers_loss": 0.004337446764111519, + "skip_count": 0.0, + "step": 1658, + "text_loss": 0.5176776051521301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.7936601115350745, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009681717910520244, + "loss": 0.0242, + "macro_f1": 0.32098764181137085, + "num_tokens": 2679479.0, + "repeat_count": 0.0, + "routers_loss": 0.034611742943525314, + "skip_count": 2.0, + "step": 1660, + "text_loss": 0.21485982835292816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.80305253889052, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009680630347418406, + "loss": 0.022, + "macro_f1": 0.5492662787437439, + "num_tokens": 2683289.0, + "repeat_count": 0.0, + "routers_loss": 0.03297121450304985, + "skip_count": 2.0, + "step": 1662, + "text_loss": 0.33801013231277466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.812444966245964, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.000967954099069019, + "loss": 0.0411, + "macro_f1": 0.32098764181137085, + "num_tokens": 2685879.0, + "repeat_count": 1.0, + "routers_loss": 0.04551183059811592, + "skip_count": 1.0, + "step": 1664, + "text_loss": 0.41123488545417786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.821837393601409, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009678449840753038, + "loss": 0.0324, + "macro_f1": 0.32098764181137085, + "num_tokens": 2688910.0, + "repeat_count": 0.0, + "routers_loss": 0.05866450071334839, + "skip_count": 2.0, + "step": 1666, + "text_loss": 0.1740892380475998 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009677356898025082, + "loss": 0.023, + "macro_f1": 0.3333333432674408, + "num_tokens": 2691680.0, + "repeat_count": 0.0, + "routers_loss": 0.009243223816156387, + "skip_count": 0.0, + "step": 1668, + "text_loss": 0.2512350380420685 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.8406222483122985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.000967626216292514, + "loss": 0.0195, + "macro_f1": 0.3333333432674408, + "num_tokens": 2694895.0, + "repeat_count": 0.0, + "routers_loss": 0.005576452240347862, + "skip_count": 0.0, + "step": 1670, + "text_loss": 0.43294376134872437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 7.850014675667743, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.09130859375, + "learning_rate": 0.0009675165635872715, + "loss": 0.0306, + "macro_f1": 0.44705885648727417, + "num_tokens": 2697806.0, + "repeat_count": 0.0, + "routers_loss": 0.05372785031795502, + "skip_count": 3.0, + "step": 1672, + "text_loss": 0.1614082306623459 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009674067317288, + "loss": 0.0296, + "macro_f1": 0.6666666865348816, + "num_tokens": 2700529.0, + "repeat_count": 1.0, + "routers_loss": 0.018131591379642487, + "skip_count": 0.0, + "step": 1674, + "text_loss": 0.2093173861503601 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009672967207591869, + "loss": 0.0257, + "macro_f1": 0.3272727429866791, + "num_tokens": 2703650.0, + "repeat_count": 0.0, + "routers_loss": 0.0673515796661377, + "skip_count": 1.0, + "step": 1676, + "text_loss": 0.3029400110244751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.878191957734077, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009671865307205892, + "loss": 0.021, + "macro_f1": 0.32098767161369324, + "num_tokens": 2707615.0, + "repeat_count": 0.0, + "routers_loss": 0.03821169584989548, + "skip_count": 1.0, + "step": 1678, + "text_loss": 0.2262786477804184 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 7.8875843850895215, + "f1_execute": 0.9756097793579102, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009670761616552315, + "loss": 0.0465, + "macro_f1": 0.9615669250488281, + "num_tokens": 2710894.0, + "repeat_count": 2.0, + "routers_loss": 0.042625464498996735, + "skip_count": 6.0, + "step": 1680, + "text_loss": 0.29623574018478394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.896976812444966, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009669656136054074, + "loss": 0.0289, + "macro_f1": 0.3333333432674408, + "num_tokens": 2714330.0, + "repeat_count": 0.0, + "routers_loss": 0.0037571541033685207, + "skip_count": 0.0, + "step": 1682, + "text_loss": 0.7510389089584351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.906369239800411, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0009668548866134795, + "loss": 0.0256, + "macro_f1": 0.3333333432674408, + "num_tokens": 2717176.0, + "repeat_count": 0.0, + "routers_loss": 0.004142968449741602, + "skip_count": 0.0, + "step": 1684, + "text_loss": 0.3273485600948334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009667439807218783, + "loss": 0.0233, + "macro_f1": 0.6666666865348816, + "num_tokens": 2720628.0, + "repeat_count": 0.0, + "routers_loss": 0.008753842674195766, + "skip_count": 2.0, + "step": 1686, + "text_loss": 0.4314708709716797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.9251540945113, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009666328959731033, + "loss": 0.0211, + "macro_f1": 0.6603773832321167, + "num_tokens": 2723739.0, + "repeat_count": 1.0, + "routers_loss": 0.022674910724163055, + "skip_count": 1.0, + "step": 1688, + "text_loss": 0.25734150409698486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 7.934546521866745, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009665216324097222, + "loss": 0.0324, + "macro_f1": 0.5934640765190125, + "num_tokens": 2726644.0, + "repeat_count": 0.0, + "routers_loss": 0.03932750225067139, + "skip_count": 3.0, + "step": 1690, + "text_loss": 0.24511034786701202 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.94393894922219, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.0009664101900743714, + "loss": 0.0255, + "macro_f1": 0.3272727429866791, + "num_tokens": 2729662.0, + "repeat_count": 0.0, + "routers_loss": 0.012672754004597664, + "skip_count": 1.0, + "step": 1692, + "text_loss": 0.39431414008140564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.953331376577634, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.076171875, + "learning_rate": 0.000966298569009756, + "loss": 0.0231, + "macro_f1": 0.5492662787437439, + "num_tokens": 2732578.0, + "repeat_count": 0.0, + "routers_loss": 0.01548632513731718, + "skip_count": 2.0, + "step": 1694, + "text_loss": 0.12439999729394913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.962723803933079, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009661867692586494, + "loss": 0.0153, + "macro_f1": 0.32098764181137085, + "num_tokens": 2735887.0, + "repeat_count": 0.0, + "routers_loss": 0.05622401833534241, + "skip_count": 2.0, + "step": 1696, + "text_loss": 0.29024389386177063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.972116231288524, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.087890625, + "learning_rate": 0.0009660747908638933, + "loss": 0.0205, + "macro_f1": 0.3272727429866791, + "num_tokens": 2739293.0, + "repeat_count": 0.0, + "routers_loss": 0.041060201823711395, + "skip_count": 1.0, + "step": 1698, + "text_loss": 0.39461007714271545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.9815086586439685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1767578125, + "learning_rate": 0.0009659626338683981, + "loss": 0.0369, + "macro_f1": 0.3333333432674408, + "num_tokens": 2742468.0, + "repeat_count": 0.0, + "routers_loss": 0.007251353468745947, + "skip_count": 0.0, + "step": 1700, + "text_loss": 0.2751767635345459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.990901085999413, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009658502983151427, + "loss": 0.0186, + "macro_f1": 0.3272727429866791, + "num_tokens": 2745123.0, + "repeat_count": 0.0, + "routers_loss": 0.012847424484789371, + "skip_count": 1.0, + "step": 1702, + "text_loss": 0.4756404757499695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009657377842471742, + "loss": 0.0313, + "macro_f1": 0.6666666865348816, + "num_tokens": 2748016.0, + "repeat_count": 0.0, + "routers_loss": 0.007060411386191845, + "skip_count": 1.0, + "step": 1704, + "text_loss": 0.9571210145950317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.009392427355445, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10009765625, + "learning_rate": 0.0009656250917076081, + "loss": 0.0188, + "macro_f1": 0.5492662787437439, + "num_tokens": 2750717.0, + "repeat_count": 0.0, + "routers_loss": 0.016748681664466858, + "skip_count": 2.0, + "step": 1706, + "text_loss": 0.14542843401432037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0009655122207396285, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 2753635.0, + "repeat_count": 0.0, + "routers_loss": 0.013607042841613293, + "skip_count": 0.0, + "step": 1708, + "text_loss": 0.21836471557617188 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009653991713864878, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2756643.0, + "repeat_count": 0.0, + "routers_loss": 0.0012097888393327594, + "skip_count": 0.0, + "step": 1710, + "text_loss": 0.635187029838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1171875, + "learning_rate": 0.0009652859436915066, + "loss": 0.0231, + "macro_f1": 0.3333333432674408, + "num_tokens": 2759432.0, + "repeat_count": 0.0, + "routers_loss": 0.006196760106831789, + "skip_count": 0.0, + "step": 1712, + "text_loss": 0.5629420876502991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009651725376980743, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 2762538.0, + "repeat_count": 0.0, + "routers_loss": 0.0042513771913945675, + "skip_count": 0.0, + "step": 1714, + "text_loss": 0.39522525668144226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 8.056354564132668, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009650589534496479, + "loss": 0.0194, + "macro_f1": 0.8194444179534912, + "num_tokens": 2765571.0, + "repeat_count": 2.0, + "routers_loss": 0.03596706688404083, + "skip_count": 3.0, + "step": 1716, + "text_loss": 0.6252416968345642 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009649451909897532, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 2769206.0, + "repeat_count": 0.0, + "routers_loss": 0.0025788163766264915, + "skip_count": 0.0, + "step": 1718, + "text_loss": 0.8851634860038757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009648312503619843, + "loss": 0.0265, + "macro_f1": 0.3333333432674408, + "num_tokens": 2772488.0, + "repeat_count": 0.0, + "routers_loss": 0.004443451762199402, + "skip_count": 0.0, + "step": 1720, + "text_loss": 0.8568580746650696 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 8.084531846199003, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009647171316100034, + "loss": 0.0265, + "macro_f1": 0.9265305995941162, + "num_tokens": 2776482.0, + "repeat_count": 1.0, + "routers_loss": 0.022948263213038445, + "skip_count": 3.0, + "step": 1722, + "text_loss": 0.13431036472320557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009646028347775409, + "loss": 0.0204, + "macro_f1": 0.6666666865348816, + "num_tokens": 2778966.0, + "repeat_count": 0.0, + "routers_loss": 0.011328035034239292, + "skip_count": 1.0, + "step": 1724, + "text_loss": 0.2085491120815277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009644883599083958, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2781968.0, + "repeat_count": 0.0, + "routers_loss": 0.002208018908277154, + "skip_count": 0.0, + "step": 1726, + "text_loss": 0.4948323965072632 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.112709128265337, + "f1_execute": 0.9411764740943909, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009643737070464349, + "loss": 0.0158, + "macro_f1": 0.6470588445663452, + "num_tokens": 2784666.0, + "repeat_count": 1.0, + "routers_loss": 0.04391832649707794, + "skip_count": 2.0, + "step": 1728, + "text_loss": 0.39060094952583313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0009642588762355935, + "loss": 0.0212, + "macro_f1": 0.6666666865348816, + "num_tokens": 2787558.0, + "repeat_count": 0.0, + "routers_loss": 0.004497280344367027, + "skip_count": 1.0, + "step": 1730, + "text_loss": 0.34908708930015564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009641438675198748, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 2790474.0, + "repeat_count": 0.0, + "routers_loss": 0.00583475548774004, + "skip_count": 0.0, + "step": 1732, + "text_loss": 0.5720033049583435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0009640286809433508, + "loss": 0.0235, + "macro_f1": 0.3333333432674408, + "num_tokens": 2793272.0, + "repeat_count": 0.0, + "routers_loss": 0.007826375775039196, + "skip_count": 0.0, + "step": 1734, + "text_loss": 0.32181721925735474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009639133165501606, + "loss": 0.0192, + "macro_f1": 0.3333333432674408, + "num_tokens": 2797726.0, + "repeat_count": 0.0, + "routers_loss": 0.0019055595621466637, + "skip_count": 0.0, + "step": 1736, + "text_loss": 0.620936393737793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009637977743845124, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2800706.0, + "repeat_count": 0.0, + "routers_loss": 0.0028302327264100313, + "skip_count": 0.0, + "step": 1738, + "text_loss": 0.6473138332366943 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009636820544906823, + "loss": 0.0146, + "macro_f1": 1.0, + "num_tokens": 2803847.0, + "repeat_count": 1.0, + "routers_loss": 0.01105099730193615, + "skip_count": 2.0, + "step": 1740, + "text_loss": 0.4401201903820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.178456119753449, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009635661569130141, + "loss": 0.0195, + "macro_f1": 0.5934640765190125, + "num_tokens": 2807235.0, + "repeat_count": 0.0, + "routers_loss": 0.02619045600295067, + "skip_count": 3.0, + "step": 1742, + "text_loss": 0.459264874458313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.187848547108894, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009634500816959202, + "loss": 0.0162, + "macro_f1": 0.6666666865348816, + "num_tokens": 2810396.0, + "repeat_count": 0.0, + "routers_loss": 0.007915694266557693, + "skip_count": 2.0, + "step": 1744, + "text_loss": 0.5084020495414734 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009633338288838805, + "loss": 0.0271, + "macro_f1": 0.5492662787437439, + "num_tokens": 2813215.0, + "repeat_count": 2.0, + "routers_loss": 0.08364596217870712, + "skip_count": 0.0, + "step": 1746, + "text_loss": 0.27681824564933777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 30.0, + "epoch": 8.206633401819783, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009632173985214438, + "loss": 0.0156, + "macro_f1": 0.8817967176437378, + "num_tokens": 2816452.0, + "repeat_count": 3.0, + "routers_loss": 0.028805451467633247, + "skip_count": 2.0, + "step": 1748, + "text_loss": 0.4678419530391693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.216025829175228, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.000963100790653226, + "loss": 0.0188, + "macro_f1": 0.3272727429866791, + "num_tokens": 2819364.0, + "repeat_count": 0.0, + "routers_loss": 0.03056817688047886, + "skip_count": 1.0, + "step": 1750, + "text_loss": 0.3078109920024872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009629840053239116, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2823469.0, + "repeat_count": 0.0, + "routers_loss": 0.0019477814203128219, + "skip_count": 0.0, + "step": 1752, + "text_loss": 0.45501336455345154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.000962867042578253, + "loss": 0.0173, + "macro_f1": 0.3333333432674408, + "num_tokens": 2826716.0, + "repeat_count": 0.0, + "routers_loss": 0.0032963966950774193, + "skip_count": 0.0, + "step": 1754, + "text_loss": 0.49234694242477417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.244203111241562, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009627499024610707, + "loss": 0.0239, + "macro_f1": 0.3272727429866791, + "num_tokens": 2829733.0, + "repeat_count": 0.0, + "routers_loss": 0.010289114899933338, + "skip_count": 1.0, + "step": 1756, + "text_loss": 0.22335539758205414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.253595538597006, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009626325850172527, + "loss": 0.0174, + "macro_f1": 0.3272727429866791, + "num_tokens": 2833350.0, + "repeat_count": 0.0, + "routers_loss": 0.03249066323041916, + "skip_count": 1.0, + "step": 1758, + "text_loss": 0.6581931114196777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009625150902917555, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 2836558.0, + "repeat_count": 0.0, + "routers_loss": 0.00870000571012497, + "skip_count": 0.0, + "step": 1760, + "text_loss": 0.22938725352287292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009623974183296031, + "loss": 0.0192, + "macro_f1": 0.3333333432674408, + "num_tokens": 2840560.0, + "repeat_count": 0.0, + "routers_loss": 0.007767196744680405, + "skip_count": 0.0, + "step": 1762, + "text_loss": 0.24473799765110016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009622795691758876, + "loss": 0.0244, + "macro_f1": 0.3333333432674408, + "num_tokens": 2843548.0, + "repeat_count": 0.0, + "routers_loss": 0.0021693643648177385, + "skip_count": 0.0, + "step": 1764, + "text_loss": 0.3084608018398285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009621615428757693, + "loss": 0.0149, + "macro_f1": 0.3333333432674408, + "num_tokens": 2847076.0, + "repeat_count": 0.0, + "routers_loss": 0.0024727333802729845, + "skip_count": 0.0, + "step": 1766, + "text_loss": 0.5251734852790833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.000962043339474476, + "loss": 0.0194, + "macro_f1": 0.3333333432674408, + "num_tokens": 2849751.0, + "repeat_count": 0.0, + "routers_loss": 0.005174890160560608, + "skip_count": 0.0, + "step": 1768, + "text_loss": 0.4410129189491272 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0009619249590173032, + "loss": 0.016, + "macro_f1": 0.6666666865348816, + "num_tokens": 2853916.0, + "repeat_count": 0.0, + "routers_loss": 0.006785830482840538, + "skip_count": 2.0, + "step": 1770, + "text_loss": 0.550076425075531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.31934253008512, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06591796875, + "learning_rate": 0.0009618064015496149, + "loss": 0.0192, + "macro_f1": 0.5934640765190125, + "num_tokens": 2857372.0, + "repeat_count": 0.0, + "routers_loss": 0.021370256319642067, + "skip_count": 3.0, + "step": 1772, + "text_loss": 0.1988629847764969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0009616876671168423, + "loss": 0.0162, + "macro_f1": 0.6666666865348816, + "num_tokens": 2861028.0, + "repeat_count": 0.0, + "routers_loss": 0.004313841462135315, + "skip_count": 1.0, + "step": 1774, + "text_loss": 0.42581331729888916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009615687557644847, + "loss": 0.0268, + "macro_f1": 0.3333333432674408, + "num_tokens": 2864847.0, + "repeat_count": 0.0, + "routers_loss": 0.0025742491707205772, + "skip_count": 0.0, + "step": 1776, + "text_loss": 0.46510905027389526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009614496675381093, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 2867392.0, + "repeat_count": 0.0, + "routers_loss": 0.0016813480760902166, + "skip_count": 0.0, + "step": 1778, + "text_loss": 0.5922174453735352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009613304024833507, + "loss": 0.0166, + "macro_f1": 0.3333333432674408, + "num_tokens": 2871273.0, + "repeat_count": 0.0, + "routers_loss": 0.004948933608829975, + "skip_count": 0.0, + "step": 1780, + "text_loss": 0.6776977777481079 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009612109606459117, + "loss": 0.0186, + "macro_f1": 1.0, + "num_tokens": 2874172.0, + "repeat_count": 1.0, + "routers_loss": 0.016950147226452827, + "skip_count": 2.0, + "step": 1782, + "text_loss": 0.48758944869041443 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.375697094217786, + "f1_execute": 0.9599999785423279, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08251953125, + "learning_rate": 0.0009610913420715623, + "loss": 0.0237, + "macro_f1": 0.7644444704055786, + "num_tokens": 2877528.0, + "repeat_count": 2.0, + "routers_loss": 0.04880943149328232, + "skip_count": 1.0, + "step": 1784, + "text_loss": 0.4404778480529785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0009609715468061411, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2880627.0, + "repeat_count": 0.0, + "routers_loss": 0.004678630735725164, + "skip_count": 0.0, + "step": 1786, + "text_loss": 0.7295402884483337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009608515748955535, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2883333.0, + "repeat_count": 0.0, + "routers_loss": 0.0026695074047893286, + "skip_count": 0.0, + "step": 1788, + "text_loss": 0.9697831273078918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 8.40387437628412, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.107421875, + "learning_rate": 0.000960731426385773, + "loss": 0.0157, + "macro_f1": 0.4871794879436493, + "num_tokens": 2887444.0, + "repeat_count": 0.0, + "routers_loss": 0.029743613675236702, + "skip_count": 2.0, + "step": 1790, + "text_loss": 0.4737568199634552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.0009606111013228407, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 2890221.0, + "repeat_count": 0.0, + "routers_loss": 0.0016153788892552257, + "skip_count": 0.0, + "step": 1792, + "text_loss": 0.6693558096885681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.422659230995011, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009604905997528655, + "loss": 0.02, + "macro_f1": 0.3272727429866791, + "num_tokens": 2893262.0, + "repeat_count": 0.0, + "routers_loss": 0.01965433731675148, + "skip_count": 1.0, + "step": 1794, + "text_loss": 0.45227760076522827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.432051658350455, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009603699217220239, + "loss": 0.0117, + "macro_f1": 0.6601307392120361, + "num_tokens": 2896823.0, + "repeat_count": 1.0, + "routers_loss": 0.024017298594117165, + "skip_count": 2.0, + "step": 1796, + "text_loss": 0.48865509033203125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009602490672765597, + "loss": 0.0182, + "macro_f1": 0.3333333432674408, + "num_tokens": 2899707.0, + "repeat_count": 0.0, + "routers_loss": 0.0012420224957168102, + "skip_count": 0.0, + "step": 1798, + "text_loss": 0.43292415142059326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07861328125, + "learning_rate": 0.0009601280364627848, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 2902795.0, + "repeat_count": 0.0, + "routers_loss": 0.0020389219280332327, + "skip_count": 0.0, + "step": 1800, + "text_loss": 0.41021591424942017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009600068293270783, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 2905769.0, + "repeat_count": 0.0, + "routers_loss": 0.002006303984671831, + "skip_count": 0.0, + "step": 1802, + "text_loss": 0.46892106533050537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08740234375, + "learning_rate": 0.000959885445915887, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 2909475.0, + "repeat_count": 0.0, + "routers_loss": 0.003734810510650277, + "skip_count": 0.0, + "step": 1804, + "text_loss": 0.45364710688591003 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 8.479013795127678, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009597638862757254, + "loss": 0.0182, + "macro_f1": 0.8823530077934265, + "num_tokens": 2914348.0, + "repeat_count": 1.0, + "routers_loss": 0.038971323519945145, + "skip_count": 2.0, + "step": 1806, + "text_loss": 0.42913779616355896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.488406222483123, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009596421504531751, + "loss": 0.0249, + "macro_f1": 0.3272727429866791, + "num_tokens": 2917467.0, + "repeat_count": 1.0, + "routers_loss": 0.04800829663872719, + "skip_count": 0.0, + "step": 1808, + "text_loss": 0.17332297563552856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009595202384948858, + "loss": 0.0227, + "macro_f1": 0.6666666865348816, + "num_tokens": 2920223.0, + "repeat_count": 1.0, + "routers_loss": 0.009164143353700638, + "skip_count": 0.0, + "step": 1810, + "text_loss": 0.33740702271461487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009593981504475742, + "loss": 0.0275, + "macro_f1": 0.6666666865348816, + "num_tokens": 2923780.0, + "repeat_count": 0.0, + "routers_loss": 0.011236993595957756, + "skip_count": 2.0, + "step": 1812, + "text_loss": 0.1609916388988495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009592758863580248, + "loss": 0.0259, + "macro_f1": 0.5492662787437439, + "num_tokens": 2926259.0, + "repeat_count": 0.0, + "routers_loss": 0.019026532769203186, + "skip_count": 2.0, + "step": 1814, + "text_loss": 0.6460903882980347 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.525975931904902, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009591534462730894, + "loss": 0.0206, + "macro_f1": 0.5492662787437439, + "num_tokens": 2929173.0, + "repeat_count": 2.0, + "routers_loss": 0.0608333982527256, + "skip_count": 0.0, + "step": 1816, + "text_loss": 0.476126492023468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.000959030830239687, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 2932703.0, + "repeat_count": 0.0, + "routers_loss": 0.0093300249427557, + "skip_count": 0.0, + "step": 1818, + "text_loss": 0.5471875667572021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2001953125, + "learning_rate": 0.0009589080383048048, + "loss": 0.0235, + "macro_f1": 0.3333333432674408, + "num_tokens": 2936195.0, + "repeat_count": 0.0, + "routers_loss": 0.010434109717607498, + "skip_count": 0.0, + "step": 1820, + "text_loss": 0.5068115592002869 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009587850705154964, + "loss": 0.0291, + "macro_f1": 0.3333333432674408, + "num_tokens": 2939412.0, + "repeat_count": 0.0, + "routers_loss": 0.004347751382738352, + "skip_count": 0.0, + "step": 1822, + "text_loss": 0.4241984784603119 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.56354564132668, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0859375, + "learning_rate": 0.0009586619269188836, + "loss": 0.0224, + "macro_f1": 0.32098767161369324, + "num_tokens": 2942318.0, + "repeat_count": 0.0, + "routers_loss": 0.034238871186971664, + "skip_count": 1.0, + "step": 1824, + "text_loss": 0.2328975349664688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009585386075621553, + "loss": 0.027, + "macro_f1": 0.3333333432674408, + "num_tokens": 2945731.0, + "repeat_count": 0.0, + "routers_loss": 0.006097695790231228, + "skip_count": 0.0, + "step": 1826, + "text_loss": 0.22816994786262512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.582330496037569, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009584151124925676, + "loss": 0.0208, + "macro_f1": 0.3272727429866791, + "num_tokens": 2948944.0, + "repeat_count": 0.0, + "routers_loss": 0.007790776435285807, + "skip_count": 1.0, + "step": 1828, + "text_loss": 0.5009413361549377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009582914417574438, + "loss": 0.0145, + "macro_f1": 0.6666666865348816, + "num_tokens": 2951723.0, + "repeat_count": 0.0, + "routers_loss": 0.009144559502601624, + "skip_count": 2.0, + "step": 1830, + "text_loss": 0.1402502954006195 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0009581675954041751, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 2954726.0, + "repeat_count": 1.0, + "routers_loss": 0.006593191530555487, + "skip_count": 0.0, + "step": 1832, + "text_loss": 0.4871736466884613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009580435734802196, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 2957853.0, + "repeat_count": 0.0, + "routers_loss": 0.01241068821400404, + "skip_count": 0.0, + "step": 1834, + "text_loss": 0.30100154876708984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009579193760331027, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 2960783.0, + "repeat_count": 0.0, + "routers_loss": 0.002219218760728836, + "skip_count": 0.0, + "step": 1836, + "text_loss": 0.4961516559123993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.629292632814794, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009577950031104169, + "loss": 0.0166, + "macro_f1": 0.6601307392120361, + "num_tokens": 2963328.0, + "repeat_count": 1.0, + "routers_loss": 0.029363535344600677, + "skip_count": 2.0, + "step": 1838, + "text_loss": 0.42814353108406067 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 8.638685060170237, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009576704547598226, + "loss": 0.0257, + "macro_f1": 0.7795917987823486, + "num_tokens": 2966108.0, + "repeat_count": 1.0, + "routers_loss": 0.0579402856528759, + "skip_count": 4.0, + "step": 1840, + "text_loss": 0.20523512363433838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009575457310290463, + "loss": 0.0121, + "macro_f1": 0.3272727429866791, + "num_tokens": 2969137.0, + "repeat_count": 0.0, + "routers_loss": 0.008810589089989662, + "skip_count": 0.0, + "step": 1842, + "text_loss": 0.6199528574943542 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009574208319658831, + "loss": 0.0208, + "macro_f1": 0.6666666865348816, + "num_tokens": 2972407.0, + "repeat_count": 0.0, + "routers_loss": 0.0012295129708945751, + "skip_count": 1.0, + "step": 1844, + "text_loss": 0.66938316822052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 8.666862342236572, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1474609375, + "learning_rate": 0.000957295757618194, + "loss": 0.0152, + "macro_f1": 0.4871794879436493, + "num_tokens": 2976045.0, + "repeat_count": 0.0, + "routers_loss": 0.06162935495376587, + "skip_count": 2.0, + "step": 1846, + "text_loss": 0.5381782650947571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009571705080339079, + "loss": 0.0144, + "macro_f1": 0.3333333432674408, + "num_tokens": 2979025.0, + "repeat_count": 0.0, + "routers_loss": 0.003950524143874645, + "skip_count": 0.0, + "step": 1848, + "text_loss": 0.5831671357154846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.0009570450832610208, + "loss": 0.0209, + "macro_f1": 0.3333333432674408, + "num_tokens": 2982276.0, + "repeat_count": 0.0, + "routers_loss": 0.010354886762797832, + "skip_count": 0.0, + "step": 1850, + "text_loss": 0.27448201179504395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009569194833475956, + "loss": 0.0199, + "macro_f1": 0.3272727429866791, + "num_tokens": 2985691.0, + "repeat_count": 0.0, + "routers_loss": 0.010167439468204975, + "skip_count": 0.0, + "step": 1852, + "text_loss": 0.5264663696289062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.704432051658351, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1328125, + "learning_rate": 0.0009567937083417624, + "loss": 0.0194, + "macro_f1": 0.3272727429866791, + "num_tokens": 2989126.0, + "repeat_count": 0.0, + "routers_loss": 0.0371871180832386, + "skip_count": 1.0, + "step": 1854, + "text_loss": 0.2008018046617508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009566677582917185, + "loss": 0.0184, + "macro_f1": 0.3333333432674408, + "num_tokens": 2992814.0, + "repeat_count": 0.0, + "routers_loss": 0.010190588422119617, + "skip_count": 0.0, + "step": 1856, + "text_loss": 0.749717116355896 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.72321690636924, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009565416332457282, + "loss": 0.0132, + "macro_f1": 0.6538461446762085, + "num_tokens": 2995729.0, + "repeat_count": 1.0, + "routers_loss": 0.022285036742687225, + "skip_count": 1.0, + "step": 1858, + "text_loss": 0.5870219469070435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.732609333724685, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009564153332521228, + "loss": 0.0224, + "macro_f1": 0.3272727429866791, + "num_tokens": 2998812.0, + "repeat_count": 0.0, + "routers_loss": 0.011050296947360039, + "skip_count": 1.0, + "step": 1860, + "text_loss": 0.8444408774375916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0009562888583593005, + "loss": 0.0163, + "macro_f1": 0.3333333432674408, + "num_tokens": 3001799.0, + "repeat_count": 0.0, + "routers_loss": 0.007125461008399725, + "skip_count": 0.0, + "step": 1862, + "text_loss": 0.41510361433029175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009561622086157272, + "loss": 0.0236, + "macro_f1": 0.3333333432674408, + "num_tokens": 3005088.0, + "repeat_count": 0.0, + "routers_loss": 0.0049054501578211784, + "skip_count": 0.0, + "step": 1864, + "text_loss": 0.3801248073577881 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.000956035384069935, + "loss": 0.0238, + "macro_f1": 1.0, + "num_tokens": 3008178.0, + "repeat_count": 1.0, + "routers_loss": 0.005162427201867104, + "skip_count": 1.0, + "step": 1866, + "text_loss": 0.2687684893608093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0009559083847705233, + "loss": 0.0214, + "macro_f1": 0.3272727429866791, + "num_tokens": 3010923.0, + "repeat_count": 0.0, + "routers_loss": 0.028984658420085907, + "skip_count": 1.0, + "step": 1868, + "text_loss": 0.6277349591255188 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.779571470501908, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009557812107661584, + "loss": 0.0208, + "macro_f1": 1.0, + "num_tokens": 3015030.0, + "repeat_count": 1.0, + "routers_loss": 0.012200530618429184, + "skip_count": 1.0, + "step": 1870, + "text_loss": 0.6293368339538574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.788963897857352, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009556538621055739, + "loss": 0.0268, + "macro_f1": 0.3272727429866791, + "num_tokens": 3019067.0, + "repeat_count": 0.0, + "routers_loss": 0.06365182995796204, + "skip_count": 1.0, + "step": 1872, + "text_loss": 0.39046618342399597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009555263388375699, + "loss": 0.014, + "macro_f1": 0.6666666865348816, + "num_tokens": 3022166.0, + "repeat_count": 0.0, + "routers_loss": 0.0041703456081449986, + "skip_count": 1.0, + "step": 1874, + "text_loss": 0.42232340574264526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11572265625, + "learning_rate": 0.0009553986410110134, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3025865.0, + "repeat_count": 0.0, + "routers_loss": 0.005841755773872137, + "skip_count": 0.0, + "step": 1876, + "text_loss": 0.37600573897361755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.817141179923686, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009552707686748388, + "loss": 0.0219, + "macro_f1": 0.3272727429866791, + "num_tokens": 3029950.0, + "repeat_count": 0.0, + "routers_loss": 0.05165952071547508, + "skip_count": 1.0, + "step": 1878, + "text_loss": 0.33717799186706543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009551427218780467, + "loss": 0.0219, + "macro_f1": 0.6666666865348816, + "num_tokens": 3033649.0, + "repeat_count": 0.0, + "routers_loss": 0.020680008456110954, + "skip_count": 2.0, + "step": 1880, + "text_loss": 0.5011783838272095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.835926034634575, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009550145006697048, + "loss": 0.0217, + "macro_f1": 0.32098764181137085, + "num_tokens": 3036847.0, + "repeat_count": 0.0, + "routers_loss": 0.07626450061798096, + "skip_count": 2.0, + "step": 1882, + "text_loss": 0.3066408336162567 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009548861050989482, + "loss": 0.0136, + "macro_f1": 1.0, + "num_tokens": 3040353.0, + "repeat_count": 1.0, + "routers_loss": 0.010884666815400124, + "skip_count": 1.0, + "step": 1884, + "text_loss": 0.49779415130615234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009547575352149778, + "loss": 0.0213, + "macro_f1": 0.6666666865348816, + "num_tokens": 3043504.0, + "repeat_count": 0.0, + "routers_loss": 0.006704333238303661, + "skip_count": 2.0, + "step": 1886, + "text_loss": 0.12284614145755768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.86410331670091, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009546287910670621, + "loss": 0.0211, + "macro_f1": 0.5427350401878357, + "num_tokens": 3046422.0, + "repeat_count": 1.0, + "routers_loss": 0.04799000173807144, + "skip_count": 2.0, + "step": 1888, + "text_loss": 0.1824081838130951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009544998727045361, + "loss": 0.0306, + "macro_f1": 0.3333333432674408, + "num_tokens": 3049819.0, + "repeat_count": 0.0, + "routers_loss": 0.008139612153172493, + "skip_count": 0.0, + "step": 1890, + "text_loss": 0.18929053843021393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.8828881714118, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.09375, + "learning_rate": 0.0009543707801768015, + "loss": 0.0175, + "macro_f1": 0.5934640765190125, + "num_tokens": 3052766.0, + "repeat_count": 0.0, + "routers_loss": 0.02966771461069584, + "skip_count": 3.0, + "step": 1892, + "text_loss": 0.247748002409935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 25.0, + "epoch": 8.892280598767243, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009542415135333267, + "loss": 0.0193, + "macro_f1": 0.44705885648727417, + "num_tokens": 3056427.0, + "repeat_count": 0.0, + "routers_loss": 0.03637036308646202, + "skip_count": 2.0, + "step": 1894, + "text_loss": 0.2583999037742615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009541120728236472, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3059497.0, + "repeat_count": 0.0, + "routers_loss": 0.007026574574410915, + "skip_count": 0.0, + "step": 1896, + "text_loss": 0.5222375988960266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.076171875, + "learning_rate": 0.0009539824580973646, + "loss": 0.0219, + "macro_f1": 0.3333333432674408, + "num_tokens": 3062187.0, + "repeat_count": 0.0, + "routers_loss": 0.003449335927143693, + "skip_count": 0.0, + "step": 1898, + "text_loss": 0.5736427307128906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009538526694041477, + "loss": 0.0163, + "macro_f1": 0.3333333432674408, + "num_tokens": 3066100.0, + "repeat_count": 0.0, + "routers_loss": 0.0035463871899992228, + "skip_count": 0.0, + "step": 1900, + "text_loss": 0.5471583604812622 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009537227067937318, + "loss": 0.0233, + "macro_f1": 1.0, + "num_tokens": 3068737.0, + "repeat_count": 3.0, + "routers_loss": 0.00597514258697629, + "skip_count": 3.0, + "step": 1902, + "text_loss": 0.36644190549850464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.939242735544468, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.166015625, + "learning_rate": 0.0009535925703159186, + "loss": 0.0301, + "macro_f1": 0.32098764181137085, + "num_tokens": 3071686.0, + "repeat_count": 0.0, + "routers_loss": 0.025420479476451874, + "skip_count": 2.0, + "step": 1904, + "text_loss": 0.535789966583252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009534622600205769, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 3074954.0, + "repeat_count": 0.0, + "routers_loss": 0.014377486892044544, + "skip_count": 0.0, + "step": 1906, + "text_loss": 0.19009549915790558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009533317759576416, + "loss": 0.0197, + "macro_f1": 0.3333333432674408, + "num_tokens": 3077540.0, + "repeat_count": 0.0, + "routers_loss": 0.004848944488912821, + "skip_count": 0.0, + "step": 1908, + "text_loss": 0.5022001266479492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009532011181771148, + "loss": 0.0217, + "macro_f1": 0.6666666865348816, + "num_tokens": 3080445.0, + "repeat_count": 0.0, + "routers_loss": 0.009480170905590057, + "skip_count": 2.0, + "step": 1910, + "text_loss": 0.35135936737060547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0009530702867290644, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 3083657.0, + "repeat_count": 0.0, + "routers_loss": 0.0019353039097040892, + "skip_count": 0.0, + "step": 1912, + "text_loss": 0.5123994946479797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.986204872321691, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009529392816636256, + "loss": 0.0249, + "macro_f1": 0.3333333432674408, + "num_tokens": 3086837.0, + "repeat_count": 0.0, + "routers_loss": 0.0010921972570940852, + "skip_count": 0.0, + "step": 1914, + "text_loss": 0.44477662444114685 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19140625, + "learning_rate": 0.0009528081030309995, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 3089892.0, + "repeat_count": 0.0, + "routers_loss": 0.0018027103506028652, + "skip_count": 0.0, + "step": 1916, + "text_loss": 0.7356183528900146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009526767508814542, + "loss": 0.0236, + "macro_f1": 0.3333333432674408, + "num_tokens": 3093058.0, + "repeat_count": 0.0, + "routers_loss": 0.003243023296818137, + "skip_count": 0.0, + "step": 1918, + "text_loss": 0.48823556303977966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009525452252653239, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 3096404.0, + "repeat_count": 0.0, + "routers_loss": 0.009360014460980892, + "skip_count": 0.0, + "step": 1920, + "text_loss": 0.21498437225818634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.023481068388612, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.140625, + "learning_rate": 0.0009524135262330098, + "loss": 0.0224, + "macro_f1": 0.9265305995941162, + "num_tokens": 3099520.0, + "repeat_count": 1.0, + "routers_loss": 0.017444295808672905, + "skip_count": 3.0, + "step": 1922, + "text_loss": 0.27608850598335266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.032873495744056, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009522816538349789, + "loss": 0.0162, + "macro_f1": 0.5492662787437439, + "num_tokens": 3102956.0, + "repeat_count": 0.0, + "routers_loss": 0.06424452364444733, + "skip_count": 2.0, + "step": 1924, + "text_loss": 0.21558666229248047 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009521496081217651, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 3106565.0, + "repeat_count": 1.0, + "routers_loss": 0.002270506462082267, + "skip_count": 0.0, + "step": 1926, + "text_loss": 0.5641813278198242 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009520173891439684, + "loss": 0.0216, + "macro_f1": 0.6666666865348816, + "num_tokens": 3109314.0, + "repeat_count": 0.0, + "routers_loss": 0.011512448079884052, + "skip_count": 1.0, + "step": 1928, + "text_loss": 0.6351624727249146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009518849969522556, + "loss": 0.0198, + "macro_f1": 0.3333333432674408, + "num_tokens": 3112956.0, + "repeat_count": 0.0, + "routers_loss": 0.003883908037096262, + "skip_count": 0.0, + "step": 1930, + "text_loss": 0.35160085558891296 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009517524315973595, + "loss": 0.019, + "macro_f1": 1.0, + "num_tokens": 3115593.0, + "repeat_count": 1.0, + "routers_loss": 0.009479222819209099, + "skip_count": 3.0, + "step": 1932, + "text_loss": 0.2900560200214386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0009516196931300794, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3118516.0, + "repeat_count": 0.0, + "routers_loss": 0.017834696918725967, + "skip_count": 2.0, + "step": 1934, + "text_loss": 0.20094378292560577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009514867816012809, + "loss": 0.02, + "macro_f1": 0.3333333432674408, + "num_tokens": 3122242.0, + "repeat_count": 0.0, + "routers_loss": 0.0017964740982279181, + "skip_count": 0.0, + "step": 1936, + "text_loss": 0.6498590707778931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0009513536970618961, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 3125645.0, + "repeat_count": 0.0, + "routers_loss": 0.007437168620526791, + "skip_count": 2.0, + "step": 1938, + "text_loss": 0.25863033533096313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009512204395629232, + "loss": 0.0184, + "macro_f1": 0.6666666865348816, + "num_tokens": 3128740.0, + "repeat_count": 0.0, + "routers_loss": 0.0008759932243265212, + "skip_count": 1.0, + "step": 1940, + "text_loss": 0.5638351440429688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.117405341943059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009510870091554264, + "loss": 0.0153, + "macro_f1": 0.3272727429866791, + "num_tokens": 3131742.0, + "repeat_count": 1.0, + "routers_loss": 0.019906625151634216, + "skip_count": 0.0, + "step": 1942, + "text_loss": 0.8410717844963074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009509534058905369, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3134407.0, + "repeat_count": 0.0, + "routers_loss": 0.0009229081333614886, + "skip_count": 0.0, + "step": 1944, + "text_loss": 0.47506049275398254 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009508196298194517, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3137053.0, + "repeat_count": 0.0, + "routers_loss": 0.003630586201325059, + "skip_count": 0.0, + "step": 1946, + "text_loss": 0.32225799560546875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009506856809934338, + "loss": 0.0119, + "macro_f1": 0.3333333432674408, + "num_tokens": 3140943.0, + "repeat_count": 0.0, + "routers_loss": 0.007580445148050785, + "skip_count": 0.0, + "step": 1948, + "text_loss": 0.3120577931404114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009505515594638127, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3144298.0, + "repeat_count": 0.0, + "routers_loss": 0.004471861757338047, + "skip_count": 0.0, + "step": 1950, + "text_loss": 0.22052447497844696 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 9.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.09130859375, + "learning_rate": 0.0009504172652819843, + "loss": 0.023, + "macro_f1": 1.0, + "num_tokens": 3147069.0, + "repeat_count": 1.0, + "routers_loss": 0.009606664068996906, + "skip_count": 1.0, + "step": 1952, + "text_loss": 0.34773921966552734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009502827984994099, + "loss": 0.0148, + "macro_f1": 0.6666666865348816, + "num_tokens": 3149992.0, + "repeat_count": 0.0, + "routers_loss": 0.006443799939006567, + "skip_count": 1.0, + "step": 1954, + "text_loss": 0.6442171335220337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009501481591676177, + "loss": 0.0188, + "macro_f1": 0.3333333432674408, + "num_tokens": 3153167.0, + "repeat_count": 0.0, + "routers_loss": 0.003219039412215352, + "skip_count": 0.0, + "step": 1956, + "text_loss": 0.43369221687316895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.192544760786616, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.000950013347338202, + "loss": 0.0152, + "macro_f1": 0.3272727429866791, + "num_tokens": 3156590.0, + "repeat_count": 0.0, + "routers_loss": 0.025551019236445427, + "skip_count": 1.0, + "step": 1958, + "text_loss": 0.294479101896286 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009498783630628225, + "loss": 0.0158, + "macro_f1": 1.0, + "num_tokens": 3159451.0, + "repeat_count": 1.0, + "routers_loss": 0.013802438974380493, + "skip_count": 2.0, + "step": 1960, + "text_loss": 0.20888492465019226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.211329615497505, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009497432063932057, + "loss": 0.0137, + "macro_f1": 0.6601307392120361, + "num_tokens": 3162889.0, + "repeat_count": 1.0, + "routers_loss": 0.02852988988161087, + "skip_count": 2.0, + "step": 1962, + "text_loss": 0.5027125477790833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009496078773811437, + "loss": 0.0136, + "macro_f1": 0.6666666865348816, + "num_tokens": 3165979.0, + "repeat_count": 0.0, + "routers_loss": 0.01784522272646427, + "skip_count": 2.0, + "step": 1964, + "text_loss": 0.1696339100599289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.000949472376078495, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3168683.0, + "repeat_count": 0.0, + "routers_loss": 0.0017019887454807758, + "skip_count": 0.0, + "step": 1966, + "text_loss": 0.48905447125434875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.000949336702537184, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 3171968.0, + "repeat_count": 0.0, + "routers_loss": 0.004817947279661894, + "skip_count": 2.0, + "step": 1968, + "text_loss": 0.20984773337841034 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009492008568092007, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 3175947.0, + "repeat_count": 0.0, + "routers_loss": 0.0012963006738573313, + "skip_count": 0.0, + "step": 1970, + "text_loss": 0.5215106010437012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 9.258291752274728, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.044921875, + "learning_rate": 0.0009490648389466019, + "loss": 0.0135, + "macro_f1": 0.4871794879436493, + "num_tokens": 3179348.0, + "repeat_count": 0.0, + "routers_loss": 0.03950481489300728, + "skip_count": 2.0, + "step": 1972, + "text_loss": 0.24640929698944092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09326171875, + "learning_rate": 0.0009489286490015097, + "loss": 0.0183, + "macro_f1": 0.6666666865348816, + "num_tokens": 3182640.0, + "repeat_count": 0.0, + "routers_loss": 0.0043345349840819836, + "skip_count": 2.0, + "step": 1974, + "text_loss": 0.6362852454185486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009487922870261122, + "loss": 0.0155, + "macro_f1": 0.3333333432674408, + "num_tokens": 3185657.0, + "repeat_count": 0.0, + "routers_loss": 0.0015687479171901941, + "skip_count": 0.0, + "step": 1976, + "text_loss": 0.8977144360542297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009486557530726638, + "loss": 0.0139, + "macro_f1": 0.3333333432674408, + "num_tokens": 3188772.0, + "repeat_count": 0.0, + "routers_loss": 0.0010977238416671753, + "skip_count": 0.0, + "step": 1978, + "text_loss": 0.38512736558914185 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0009485190471934844, + "loss": 0.0196, + "macro_f1": 0.6666666865348816, + "num_tokens": 3193131.0, + "repeat_count": 2.0, + "routers_loss": 0.002264744369313121, + "skip_count": 0.0, + "step": 1980, + "text_loss": 0.4171289801597595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.305253889051952, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.00094838216944096, + "loss": 0.0219, + "macro_f1": 0.3272727429866791, + "num_tokens": 3196668.0, + "repeat_count": 0.0, + "routers_loss": 0.042320676147937775, + "skip_count": 1.0, + "step": 1982, + "text_loss": 0.19008000195026398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 9.314646316407396, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009482451198675424, + "loss": 0.0151, + "macro_f1": 0.32098767161369324, + "num_tokens": 3200282.0, + "repeat_count": 0.0, + "routers_loss": 0.01796630397439003, + "skip_count": 1.0, + "step": 1984, + "text_loss": 0.5009249448776245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0009481078985257494, + "loss": 0.0147, + "macro_f1": 0.6666666865348816, + "num_tokens": 3204439.0, + "repeat_count": 0.0, + "routers_loss": 0.01052347756922245, + "skip_count": 1.0, + "step": 1986, + "text_loss": 0.15319275856018066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.333431171118287, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009479705054681644, + "loss": 0.015, + "macro_f1": 0.3076923191547394, + "num_tokens": 3207590.0, + "repeat_count": 1.0, + "routers_loss": 0.09640293568372726, + "skip_count": 3.0, + "step": 1988, + "text_loss": 0.3654652535915375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.34282359847373, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009478329407474366, + "loss": 0.0183, + "macro_f1": 0.5492662787437439, + "num_tokens": 3211172.0, + "repeat_count": 0.0, + "routers_loss": 0.012670112773776054, + "skip_count": 1.0, + "step": 1990, + "text_loss": 0.5817596316337585 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.000947695204416281, + "loss": 0.0121, + "macro_f1": 0.6666666865348816, + "num_tokens": 3214050.0, + "repeat_count": 1.0, + "routers_loss": 0.005263707600533962, + "skip_count": 0.0, + "step": 1992, + "text_loss": 0.5985888242721558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.361608453184619, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009475572965274787, + "loss": 0.0144, + "macro_f1": 0.3272727429866791, + "num_tokens": 3217318.0, + "repeat_count": 1.0, + "routers_loss": 0.0682850033044815, + "skip_count": 0.0, + "step": 1994, + "text_loss": 0.316506564617157 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.000947419217133876, + "loss": 0.019, + "macro_f1": 0.6666666865348816, + "num_tokens": 3220012.0, + "repeat_count": 0.0, + "routers_loss": 0.008508823812007904, + "skip_count": 2.0, + "step": 1996, + "text_loss": 0.09665893763303757 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0009472809662883852, + "loss": 0.0155, + "macro_f1": 1.0, + "num_tokens": 3223019.0, + "repeat_count": 1.0, + "routers_loss": 0.01100847590714693, + "skip_count": 2.0, + "step": 1998, + "text_loss": 0.4938808083534241 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.389785735250953, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009471425440439844, + "loss": 0.0135, + "macro_f1": 0.8817967176437378, + "num_tokens": 3226013.0, + "repeat_count": 2.0, + "routers_loss": 0.04953207075595856, + "skip_count": 3.0, + "step": 2000, + "text_loss": 0.22258254885673523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 9.399178162606399, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009470039504537173, + "loss": 0.0186, + "macro_f1": 0.31446540355682373, + "num_tokens": 3230031.0, + "repeat_count": 0.0, + "routers_loss": 0.052884332835674286, + "skip_count": 2.0, + "step": 2002, + "text_loss": 0.1741616576910019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009468651855706931, + "loss": 0.0204, + "macro_f1": 0.6666666865348816, + "num_tokens": 3232991.0, + "repeat_count": 1.0, + "routers_loss": 0.008056716993451118, + "skip_count": 0.0, + "step": 2004, + "text_loss": 0.3173636198043823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009467262494480868, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3236390.0, + "repeat_count": 0.0, + "routers_loss": 0.0053409393876791, + "skip_count": 0.0, + "step": 2006, + "text_loss": 0.5806330442428589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.000946587142139139, + "loss": 0.0147, + "macro_f1": 0.3333333432674408, + "num_tokens": 3239267.0, + "repeat_count": 0.0, + "routers_loss": 0.0015652200672775507, + "skip_count": 0.0, + "step": 2008, + "text_loss": 0.6214317679405212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.436747872028178, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.000946447863697156, + "loss": 0.0151, + "macro_f1": 0.6601307392120361, + "num_tokens": 3242569.0, + "repeat_count": 1.0, + "routers_loss": 0.011673987843096256, + "skip_count": 2.0, + "step": 2010, + "text_loss": 0.532565712928772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0009463084141755093, + "loss": 0.0159, + "macro_f1": 0.3272727429866791, + "num_tokens": 3245669.0, + "repeat_count": 0.0, + "routers_loss": 0.028480790555477142, + "skip_count": 1.0, + "step": 2012, + "text_loss": 0.25210800766944885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009461687936276364, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3248751.0, + "repeat_count": 0.0, + "routers_loss": 0.007234727032482624, + "skip_count": 0.0, + "step": 2014, + "text_loss": 0.35922971367836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009460290021070402, + "loss": 0.0195, + "macro_f1": 0.6666666865348816, + "num_tokens": 3252614.0, + "repeat_count": 1.0, + "routers_loss": 0.014691276475787163, + "skip_count": 0.0, + "step": 2016, + "text_loss": 0.2747853398323059 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009458890396672888, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 3256374.0, + "repeat_count": 0.0, + "routers_loss": 0.002385235857218504, + "skip_count": 0.0, + "step": 2018, + "text_loss": 0.5268719792366028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 9.483710008805401, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009457489063620164, + "loss": 0.0133, + "macro_f1": 0.8823530077934265, + "num_tokens": 3259792.0, + "repeat_count": 1.0, + "routers_loss": 0.047268565744161606, + "skip_count": 2.0, + "step": 2020, + "text_loss": 0.7785539627075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.493102436160845, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009456086022449221, + "loss": 0.0218, + "macro_f1": 0.3272727429866791, + "num_tokens": 3262833.0, + "repeat_count": 0.0, + "routers_loss": 0.015878718346357346, + "skip_count": 1.0, + "step": 2022, + "text_loss": 0.42270028591156006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009454681273697711, + "loss": 0.0117, + "macro_f1": 0.3272727429866791, + "num_tokens": 3265718.0, + "repeat_count": 1.0, + "routers_loss": 0.030749641358852386, + "skip_count": 0.0, + "step": 2024, + "text_loss": 0.18668225407600403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0009453274817903931, + "loss": 0.012, + "macro_f1": 0.6666666865348816, + "num_tokens": 3268158.0, + "repeat_count": 0.0, + "routers_loss": 0.011538166552782059, + "skip_count": 1.0, + "step": 2026, + "text_loss": 0.34090787172317505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.000945186665560684, + "loss": 0.0218, + "macro_f1": 0.3333333432674408, + "num_tokens": 3271082.0, + "repeat_count": 0.0, + "routers_loss": 0.009527760557830334, + "skip_count": 0.0, + "step": 2028, + "text_loss": 0.2110334187746048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.530672145582624, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.000945045678734605, + "loss": 0.0175, + "macro_f1": 0.3144654333591461, + "num_tokens": 3273488.0, + "repeat_count": 0.0, + "routers_loss": 0.03317151218652725, + "skip_count": 3.0, + "step": 2030, + "text_loss": 0.2233227640390396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.540064572938068, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009449045213661822, + "loss": 0.0201, + "macro_f1": 0.3272727429866791, + "num_tokens": 3276646.0, + "repeat_count": 0.0, + "routers_loss": 0.018510591238737106, + "skip_count": 1.0, + "step": 2032, + "text_loss": 0.16100332140922546 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 9.549457000293513, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.1318359375, + "learning_rate": 0.0009447631935095077, + "loss": 0.0185, + "macro_f1": 0.9452888369560242, + "num_tokens": 3279441.0, + "repeat_count": 1.0, + "routers_loss": 0.028113311156630516, + "skip_count": 4.0, + "step": 2034, + "text_loss": 0.29208317399024963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009446216952187384, + "loss": 0.0164, + "macro_f1": 0.3333333432674408, + "num_tokens": 3282697.0, + "repeat_count": 0.0, + "routers_loss": 0.008379172533750534, + "skip_count": 0.0, + "step": 2036, + "text_loss": 0.16026398539543152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009444800265480967, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 3285574.0, + "repeat_count": 0.0, + "routers_loss": 0.00941354501992464, + "skip_count": 0.0, + "step": 2038, + "text_loss": 0.29523080587387085 + }, + { + "acc_repeat": 0.75, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.577634282359847, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.8571428656578064, + "f1_skip": 0.800000011920929, + "grad_norm": 0.076171875, + "learning_rate": 0.0009443381875518703, + "loss": 0.0197, + "macro_f1": 0.8600732684135437, + "num_tokens": 3289159.0, + "repeat_count": 4.0, + "routers_loss": 0.04974055662751198, + "skip_count": 6.0, + "step": 2040, + "text_loss": 0.23033179342746735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.587026709715293, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0009441961782844123, + "loss": 0.0146, + "macro_f1": 0.3272727429866791, + "num_tokens": 3293598.0, + "repeat_count": 0.0, + "routers_loss": 0.022241825237870216, + "skip_count": 1.0, + "step": 2042, + "text_loss": 0.8299165368080139 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009440539988001408, + "loss": 0.0159, + "macro_f1": 0.3333333432674408, + "num_tokens": 3296648.0, + "repeat_count": 0.0, + "routers_loss": 0.011019332334399223, + "skip_count": 0.0, + "step": 2044, + "text_loss": 0.18207129836082458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0009439116491535394, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 3300058.0, + "repeat_count": 0.0, + "routers_loss": 0.002889640862122178, + "skip_count": 0.0, + "step": 2046, + "text_loss": 0.7051978707313538 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 9.615203991781627, + "f1_execute": 0.9333333373069763, + "f1_repeat": 0.5, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.078125, + "learning_rate": 0.0009437691293991563, + "loss": 0.0192, + "macro_f1": 0.7634921073913574, + "num_tokens": 3303296.0, + "repeat_count": 3.0, + "routers_loss": 0.07741832733154297, + "skip_count": 4.0, + "step": 2048, + "text_loss": 0.15563532710075378 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.0009436264395916061, + "loss": 0.0209, + "macro_f1": 0.6666666865348816, + "num_tokens": 3306204.0, + "repeat_count": 0.0, + "routers_loss": 0.014225383289158344, + "skip_count": 2.0, + "step": 2050, + "text_loss": 0.18117287755012512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009434835797855672, + "loss": 0.0165, + "macro_f1": 0.3333333432674408, + "num_tokens": 3309444.0, + "repeat_count": 0.0, + "routers_loss": 0.0023932650219649076, + "skip_count": 0.0, + "step": 2052, + "text_loss": 0.4645874798297882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.643381273847961, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009433405500357839, + "loss": 0.0153, + "macro_f1": 0.3272727429866791, + "num_tokens": 3312488.0, + "repeat_count": 0.0, + "routers_loss": 0.03193361684679985, + "skip_count": 1.0, + "step": 2054, + "text_loss": 0.5291082859039307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009431973503970655, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 3315765.0, + "repeat_count": 0.0, + "routers_loss": 0.0020529816392809153, + "skip_count": 0.0, + "step": 2056, + "text_loss": 0.5877931118011475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.66216612855885, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009430539809242864, + "loss": 0.0185, + "macro_f1": 0.32098764181137085, + "num_tokens": 3318877.0, + "repeat_count": 2.0, + "routers_loss": 0.07907948642969131, + "skip_count": 0.0, + "step": 2058, + "text_loss": 0.3836737871170044 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009429104416723862, + "loss": 0.0163, + "macro_f1": 0.6666666865348816, + "num_tokens": 3322576.0, + "repeat_count": 2.0, + "routers_loss": 0.003006070153787732, + "skip_count": 0.0, + "step": 2060, + "text_loss": 0.3480920195579529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009427667326963689, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 3325974.0, + "repeat_count": 0.0, + "routers_loss": 0.005013179033994675, + "skip_count": 0.0, + "step": 2062, + "text_loss": 0.931358814239502 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009426228540513047, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 3329398.0, + "repeat_count": 0.0, + "routers_loss": 0.0059848143719136715, + "skip_count": 0.0, + "step": 2064, + "text_loss": 0.47568953037261963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009424788057923277, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3332029.0, + "repeat_count": 0.0, + "routers_loss": 0.00783882662653923, + "skip_count": 0.0, + "step": 2066, + "text_loss": 0.22887596487998962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.709128265336073, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009423345879746376, + "loss": 0.0128, + "macro_f1": 0.5492662787437439, + "num_tokens": 3334858.0, + "repeat_count": 0.0, + "routers_loss": 0.01866884157061577, + "skip_count": 2.0, + "step": 2068, + "text_loss": 0.17724967002868652 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.718520692691518, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.000942190200653499, + "loss": 0.0162, + "macro_f1": 0.32098764181137085, + "num_tokens": 3338094.0, + "repeat_count": 0.0, + "routers_loss": 0.028636593371629715, + "skip_count": 2.0, + "step": 2070, + "text_loss": 0.34344956278800964 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.727913120046962, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009420456438842413, + "loss": 0.0165, + "macro_f1": 0.5492662787437439, + "num_tokens": 3340526.0, + "repeat_count": 0.0, + "routers_loss": 0.023245645686984062, + "skip_count": 2.0, + "step": 2072, + "text_loss": 0.7276164293289185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.737305547402407, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11328125, + "learning_rate": 0.000941900917722259, + "loss": 0.0143, + "macro_f1": 0.3272727429866791, + "num_tokens": 3343303.0, + "repeat_count": 1.0, + "routers_loss": 0.01565689593553543, + "skip_count": 0.0, + "step": 2074, + "text_loss": 0.5665070414543152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009417560222230115, + "loss": 0.0245, + "macro_f1": 0.3333333432674408, + "num_tokens": 3346409.0, + "repeat_count": 0.0, + "routers_loss": 0.0035056080669164658, + "skip_count": 0.0, + "step": 2076, + "text_loss": 0.5112795233726501 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009416109574420229, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3349220.0, + "repeat_count": 0.0, + "routers_loss": 0.0027565446216613054, + "skip_count": 0.0, + "step": 2078, + "text_loss": 0.5240910053253174 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 9.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009414657234348823, + "loss": 0.0186, + "macro_f1": 1.0, + "num_tokens": 3352627.0, + "repeat_count": 3.0, + "routers_loss": 0.01652451977133751, + "skip_count": 2.0, + "step": 2080, + "text_loss": 1.0217112302780151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.774875256824185, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009413203202572438, + "loss": 0.0179, + "macro_f1": 0.32098764181137085, + "num_tokens": 3355392.0, + "repeat_count": 0.0, + "routers_loss": 0.1012420505285263, + "skip_count": 2.0, + "step": 2082, + "text_loss": 0.4085482358932495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08251953125, + "learning_rate": 0.000941174747964826, + "loss": 0.0154, + "macro_f1": 0.3333333432674408, + "num_tokens": 3358425.0, + "repeat_count": 0.0, + "routers_loss": 0.004962718114256859, + "skip_count": 0.0, + "step": 2084, + "text_loss": 0.5833504796028137 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.793660111535075, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.800000011920929, + "grad_norm": 0.11376953125, + "learning_rate": 0.0009410290066134124, + "loss": 0.0211, + "macro_f1": 0.8083333373069763, + "num_tokens": 3361925.0, + "repeat_count": 2.0, + "routers_loss": 0.07889176905155182, + "skip_count": 3.0, + "step": 2086, + "text_loss": 0.38126569986343384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.803052538890519, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009408830962588517, + "loss": 0.0195, + "macro_f1": 0.6601307392120361, + "num_tokens": 3365963.0, + "repeat_count": 1.0, + "routers_loss": 0.033715736120939255, + "skip_count": 2.0, + "step": 2088, + "text_loss": 0.23213914036750793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009407370169570567, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3369422.0, + "repeat_count": 0.0, + "routers_loss": 0.0014188943896442652, + "skip_count": 0.0, + "step": 2090, + "text_loss": 0.4648318886756897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.82183739360141, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009405907687640054, + "loss": 0.013, + "macro_f1": 0.3272727429866791, + "num_tokens": 3372506.0, + "repeat_count": 0.0, + "routers_loss": 0.015339684672653675, + "skip_count": 1.0, + "step": 2092, + "text_loss": 0.2563800811767578 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.831229820956853, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.0009404443517357404, + "loss": 0.0146, + "macro_f1": 0.542222261428833, + "num_tokens": 3375653.0, + "repeat_count": 4.0, + "routers_loss": 0.06562861055135727, + "skip_count": 0.0, + "step": 2094, + "text_loss": 0.797835111618042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.000940297765928369, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3379018.0, + "repeat_count": 0.0, + "routers_loss": 0.005745889153331518, + "skip_count": 0.0, + "step": 2096, + "text_loss": 0.4238114655017853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009401510113980631, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 3382855.0, + "repeat_count": 0.0, + "routers_loss": 0.0026634482201188803, + "skip_count": 0.0, + "step": 2098, + "text_loss": 0.4967166483402252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009400040882010592, + "loss": 0.0166, + "macro_f1": 0.3333333432674408, + "num_tokens": 3386386.0, + "repeat_count": 0.0, + "routers_loss": 0.0020642587915062904, + "skip_count": 0.0, + "step": 2100, + "text_loss": 0.44390562176704407 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0009398569963936589, + "loss": 0.017, + "macro_f1": 0.3272727429866791, + "num_tokens": 3389958.0, + "repeat_count": 0.0, + "routers_loss": 0.013722737319767475, + "skip_count": 1.0, + "step": 2102, + "text_loss": 0.7207565903663635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009397097360322276, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 3392892.0, + "repeat_count": 0.0, + "routers_loss": 0.002051608171314001, + "skip_count": 0.0, + "step": 2104, + "text_loss": 0.3196398913860321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.000939562307173196, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 3396636.0, + "repeat_count": 0.0, + "routers_loss": 0.007085663266479969, + "skip_count": 0.0, + "step": 2106, + "text_loss": 0.5663776397705078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.896976812444967, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11328125, + "learning_rate": 0.0009394147098730592, + "loss": 0.02, + "macro_f1": 0.5492662787437439, + "num_tokens": 3399475.0, + "repeat_count": 0.0, + "routers_loss": 0.019473131746053696, + "skip_count": 2.0, + "step": 2108, + "text_loss": 0.7708223462104797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0009392669441883767, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 3402350.0, + "repeat_count": 0.0, + "routers_loss": 0.0028328890912234783, + "skip_count": 0.0, + "step": 2110, + "text_loss": 0.5888006091117859 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009391190101757724, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3405561.0, + "repeat_count": 0.0, + "routers_loss": 0.023098422214388847, + "skip_count": 2.0, + "step": 2112, + "text_loss": 0.09865197539329529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.000938970907891935, + "loss": 0.0247, + "macro_f1": 0.3333333432674408, + "num_tokens": 3408513.0, + "repeat_count": 0.0, + "routers_loss": 0.002896632067859173, + "skip_count": 0.0, + "step": 2114, + "text_loss": 0.6613234281539917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009388226373936179, + "loss": 0.0211, + "macro_f1": 0.3333333432674408, + "num_tokens": 3411195.0, + "repeat_count": 0.0, + "routers_loss": 0.015814457088708878, + "skip_count": 0.0, + "step": 2116, + "text_loss": 0.17363053560256958 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.94393894922219, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009386741987376381, + "loss": 0.015, + "macro_f1": 0.6603773832321167, + "num_tokens": 3414875.0, + "repeat_count": 1.0, + "routers_loss": 0.02676783688366413, + "skip_count": 0.0, + "step": 2118, + "text_loss": 0.674056887626648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009385255919808778, + "loss": 0.0203, + "macro_f1": 0.6666666865348816, + "num_tokens": 3418410.0, + "repeat_count": 0.0, + "routers_loss": 0.01022857241332531, + "skip_count": 1.0, + "step": 2120, + "text_loss": 0.235092431306839 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.962723803933079, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009383768171802836, + "loss": 0.0244, + "macro_f1": 0.5492662787437439, + "num_tokens": 3421289.0, + "repeat_count": 0.0, + "routers_loss": 0.013572212308645248, + "skip_count": 2.0, + "step": 2122, + "text_loss": 0.5992844104766846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009382278743928659, + "loss": 0.0201, + "macro_f1": 0.6666666865348816, + "num_tokens": 3424781.0, + "repeat_count": 0.0, + "routers_loss": 0.0051873656921088696, + "skip_count": 2.0, + "step": 2124, + "text_loss": 0.29915499687194824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 9.981508658643968, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.07421875, + "learning_rate": 0.0009380787636757001, + "loss": 0.0155, + "macro_f1": 0.6122449040412903, + "num_tokens": 3427942.0, + "repeat_count": 0.0, + "routers_loss": 0.030079292133450508, + "skip_count": 4.0, + "step": 2126, + "text_loss": 0.24181491136550903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009379294850859256, + "loss": 0.0141, + "macro_f1": 0.3333333432674408, + "num_tokens": 3431314.0, + "repeat_count": 0.0, + "routers_loss": 0.002675612922757864, + "skip_count": 0.0, + "step": 2128, + "text_loss": 0.4669873118400574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009377800386807465, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 3435020.0, + "repeat_count": 0.0, + "routers_loss": 0.009334275498986244, + "skip_count": 0.0, + "step": 2130, + "text_loss": 0.6478219628334045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.134765625, + "learning_rate": 0.0009376304245174306, + "loss": 0.0137, + "macro_f1": 0.6000000238418579, + "num_tokens": 3438276.0, + "repeat_count": 1.0, + "routers_loss": 0.038227908313274384, + "skip_count": 2.0, + "step": 2132, + "text_loss": 0.4401201903820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0009374806426533104, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 3440938.0, + "repeat_count": 0.0, + "routers_loss": 0.006901399698108435, + "skip_count": 0.0, + "step": 2134, + "text_loss": 0.5948942303657532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009373306931457827, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3444028.0, + "repeat_count": 0.0, + "routers_loss": 0.0037061909679323435, + "skip_count": 0.0, + "step": 2136, + "text_loss": 0.5349751114845276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0009371805760523086, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 3448331.0, + "repeat_count": 0.0, + "routers_loss": 0.0025877030566334724, + "skip_count": 0.0, + "step": 2138, + "text_loss": 0.4591051936149597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.046962136777223, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009370302914304129, + "loss": 0.0144, + "macro_f1": 0.5934640765190125, + "num_tokens": 3451434.0, + "repeat_count": 0.0, + "routers_loss": 0.018742674961686134, + "skip_count": 3.0, + "step": 2140, + "text_loss": 0.23470863699913025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.056354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009368798393376851, + "loss": 0.0122, + "macro_f1": 0.3272727429866791, + "num_tokens": 3454375.0, + "repeat_count": 0.0, + "routers_loss": 0.02382594160735607, + "skip_count": 1.0, + "step": 2142, + "text_loss": 0.6077954769134521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.065746991488112, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05517578125, + "learning_rate": 0.0009367292198317787, + "loss": 0.0164, + "macro_f1": 0.5492662787437439, + "num_tokens": 3457591.0, + "repeat_count": 0.0, + "routers_loss": 0.03331060707569122, + "skip_count": 2.0, + "step": 2144, + "text_loss": 0.3691073954105377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009365784329704115, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 3460895.0, + "repeat_count": 0.0, + "routers_loss": 0.0016955457394942641, + "skip_count": 0.0, + "step": 2146, + "text_loss": 0.3947436511516571 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009364274788113651, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 3464101.0, + "repeat_count": 1.0, + "routers_loss": 0.006169239990413189, + "skip_count": 0.0, + "step": 2148, + "text_loss": 0.3348555266857147 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 10.093924273554446, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009362763574124858, + "loss": 0.019, + "macro_f1": 0.9265305995941162, + "num_tokens": 3467417.0, + "repeat_count": 3.0, + "routers_loss": 0.024033790454268456, + "skip_count": 1.0, + "step": 2150, + "text_loss": 0.496633380651474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0009361250688316829, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 3470917.0, + "repeat_count": 0.0, + "routers_loss": 0.0024986129719763994, + "skip_count": 0.0, + "step": 2152, + "text_loss": 0.6857671737670898 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0009359736131269312, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3473624.0, + "repeat_count": 0.0, + "routers_loss": 0.008183322846889496, + "skip_count": 1.0, + "step": 2154, + "text_loss": 0.13883116841316223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009358219903562684, + "loss": 0.0106, + "macro_f1": 0.6666666865348816, + "num_tokens": 3476472.0, + "repeat_count": 0.0, + "routers_loss": 0.011198793537914753, + "skip_count": 3.0, + "step": 2156, + "text_loss": 0.24243666231632233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009356702005777969, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 3479688.0, + "repeat_count": 0.0, + "routers_loss": 0.002520184963941574, + "skip_count": 0.0, + "step": 2158, + "text_loss": 0.6407818794250488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009355182438496825, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 3482598.0, + "repeat_count": 0.0, + "routers_loss": 0.0011065017897635698, + "skip_count": 0.0, + "step": 2160, + "text_loss": 0.7214245796203613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009353661202301557, + "loss": 0.0144, + "macro_f1": 0.3333333432674408, + "num_tokens": 3486271.0, + "repeat_count": 0.0, + "routers_loss": 0.0017824085662141442, + "skip_count": 0.0, + "step": 2162, + "text_loss": 0.5140969157218933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0009352138297775101, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 3489206.0, + "repeat_count": 0.0, + "routers_loss": 0.001542879967018962, + "skip_count": 0.0, + "step": 2164, + "text_loss": 0.7956416606903076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.000935061372550104, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 3492003.0, + "repeat_count": 0.0, + "routers_loss": 0.01420794241130352, + "skip_count": 3.0, + "step": 2166, + "text_loss": 0.27489882707595825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009349087486063594, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3494784.0, + "repeat_count": 0.0, + "routers_loss": 0.003614309709519148, + "skip_count": 1.0, + "step": 2168, + "text_loss": 0.2962227761745453 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.187848547108894, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009347559580047618, + "loss": 0.0175, + "macro_f1": 0.8814815282821655, + "num_tokens": 3497886.0, + "repeat_count": 2.0, + "routers_loss": 0.02122853323817253, + "skip_count": 4.0, + "step": 2170, + "text_loss": 0.5919580459594727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06396484375, + "learning_rate": 0.000934603000803861, + "loss": 0.0135, + "macro_f1": 0.5492662787437439, + "num_tokens": 3500939.0, + "repeat_count": 0.0, + "routers_loss": 0.02042219042778015, + "skip_count": 1.0, + "step": 2172, + "text_loss": 0.28722381591796875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009344498770622704, + "loss": 0.013, + "macro_f1": 0.3333333432674408, + "num_tokens": 3504852.0, + "repeat_count": 0.0, + "routers_loss": 0.004345106892287731, + "skip_count": 0.0, + "step": 2174, + "text_loss": 0.603236734867096 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009342965868386673, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 3508320.0, + "repeat_count": 0.0, + "routers_loss": 0.00368050136603415, + "skip_count": 0.0, + "step": 2176, + "text_loss": 0.6020491719245911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.000934143130191793, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 3511278.0, + "repeat_count": 0.0, + "routers_loss": 0.013425769284367561, + "skip_count": 0.0, + "step": 2178, + "text_loss": 0.5954724550247192 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060546875, + "learning_rate": 0.000933989507180452, + "loss": 0.0149, + "macro_f1": 0.3333333432674408, + "num_tokens": 3514361.0, + "repeat_count": 0.0, + "routers_loss": 0.002896249992772937, + "skip_count": 0.0, + "step": 2180, + "text_loss": 0.39175131916999817 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.244203111241562, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0009338357178635135, + "loss": 0.0147, + "macro_f1": 0.6603773832321167, + "num_tokens": 3517962.0, + "repeat_count": 1.0, + "routers_loss": 0.011538350023329258, + "skip_count": 1.0, + "step": 2182, + "text_loss": 0.4482830762863159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.253595538597006, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009336817622999093, + "loss": 0.011, + "macro_f1": 0.3272727429866791, + "num_tokens": 3521299.0, + "repeat_count": 1.0, + "routers_loss": 0.022787930443882942, + "skip_count": 0.0, + "step": 2184, + "text_loss": 0.35177817940711975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009335276405486357, + "loss": 0.0139, + "macro_f1": 0.3272727429866791, + "num_tokens": 3524611.0, + "repeat_count": 0.0, + "routers_loss": 0.011597735807299614, + "skip_count": 1.0, + "step": 2186, + "text_loss": 0.24868851900100708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009333733526687524, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 3528012.0, + "repeat_count": 0.0, + "routers_loss": 0.014253967441618443, + "skip_count": 0.0, + "step": 2188, + "text_loss": 0.3970910310745239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.000933218898719383, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 3530908.0, + "repeat_count": 0.0, + "routers_loss": 0.001659149187617004, + "skip_count": 0.0, + "step": 2190, + "text_loss": 0.7618573307991028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009330642787597141, + "loss": 0.0159, + "macro_f1": 0.3333333432674408, + "num_tokens": 3533993.0, + "repeat_count": 0.0, + "routers_loss": 0.005574346985667944, + "skip_count": 0.0, + "step": 2192, + "text_loss": 0.16470147669315338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009329094928489969, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3537310.0, + "repeat_count": 0.0, + "routers_loss": 0.0026400673668831587, + "skip_count": 0.0, + "step": 2194, + "text_loss": 0.3400416374206543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009327545410465452, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3540045.0, + "repeat_count": 0.0, + "routers_loss": 0.008448398672044277, + "skip_count": 3.0, + "step": 2196, + "text_loss": 0.3110542297363281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.31934253008512, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009325994234117372, + "loss": 0.0122, + "macro_f1": 0.32098764181137085, + "num_tokens": 3544097.0, + "repeat_count": 0.0, + "routers_loss": 0.037553198635578156, + "skip_count": 2.0, + "step": 2198, + "text_loss": 0.36126700043678284 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.09716796875, + "learning_rate": 0.000932444140004014, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3547054.0, + "repeat_count": 1.0, + "routers_loss": 0.006464479025453329, + "skip_count": 0.0, + "step": 2200, + "text_loss": 0.4947047233581543 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009322886908828805, + "loss": 0.0138, + "macro_f1": 0.6666666865348816, + "num_tokens": 3549903.0, + "repeat_count": 1.0, + "routers_loss": 0.005384812597185373, + "skip_count": 0.0, + "step": 2202, + "text_loss": 0.5923738479614258 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009321330761079052, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 3553745.0, + "repeat_count": 0.0, + "routers_loss": 0.015346619300544262, + "skip_count": 2.0, + "step": 2204, + "text_loss": 0.1904175877571106 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.356912239506897, + "f1_execute": 0.9268292784690857, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06494140625, + "learning_rate": 0.00093197729573872, + "loss": 0.0203, + "macro_f1": 0.8422764539718628, + "num_tokens": 3557235.0, + "repeat_count": 3.0, + "routers_loss": 0.1207597479224205, + "skip_count": 6.0, + "step": 2206, + "text_loss": 0.3904837667942047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0009318213498350202, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3560795.0, + "repeat_count": 0.0, + "routers_loss": 0.003334777895361185, + "skip_count": 0.0, + "step": 2208, + "text_loss": 0.4268290102481842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0009316652384565645, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3563754.0, + "repeat_count": 0.0, + "routers_loss": 0.004230072256177664, + "skip_count": 0.0, + "step": 2210, + "text_loss": 0.40049710869789124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046875, + "learning_rate": 0.0009315089616631751, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 3567173.0, + "repeat_count": 0.0, + "routers_loss": 0.0006645230459980667, + "skip_count": 0.0, + "step": 2212, + "text_loss": 0.42568323016166687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009313525195147376, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3570831.0, + "repeat_count": 0.0, + "routers_loss": 0.0097877848893404, + "skip_count": 0.0, + "step": 2214, + "text_loss": 0.45808279514312744 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 10.40387437628412, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.076171875, + "learning_rate": 0.000931195912071201, + "loss": 0.0187, + "macro_f1": 0.7018141150474548, + "num_tokens": 3573745.0, + "repeat_count": 2.0, + "routers_loss": 0.07351134717464447, + "skip_count": 3.0, + "step": 2216, + "text_loss": 0.285696804523468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009310391393925775, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 3576785.0, + "repeat_count": 0.0, + "routers_loss": 0.0033160944003611803, + "skip_count": 0.0, + "step": 2218, + "text_loss": 0.17516443133354187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.422659230995011, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.047119140625, + "learning_rate": 0.0009308822015389424, + "loss": 0.0241, + "macro_f1": 0.5427350401878357, + "num_tokens": 3580695.0, + "repeat_count": 1.0, + "routers_loss": 0.052930232137441635, + "skip_count": 1.0, + "step": 2220, + "text_loss": 0.5918155908584595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 10.432051658350455, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.072265625, + "learning_rate": 0.0009307250985704352, + "loss": 0.0128, + "macro_f1": 0.6122449040412903, + "num_tokens": 3583729.0, + "repeat_count": 0.0, + "routers_loss": 0.025454653427004814, + "skip_count": 4.0, + "step": 2222, + "text_loss": 0.2652169466018677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0009305678305472575, + "loss": 0.0158, + "macro_f1": 0.3333333432674408, + "num_tokens": 3586775.0, + "repeat_count": 0.0, + "routers_loss": 0.011279845610260963, + "skip_count": 0.0, + "step": 2224, + "text_loss": 0.3511691987514496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.000930410397529675, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 3589676.0, + "repeat_count": 0.0, + "routers_loss": 0.002700264798477292, + "skip_count": 0.0, + "step": 2226, + "text_loss": 0.24045433104038239 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.000930252799578016, + "loss": 0.0146, + "macro_f1": 1.0, + "num_tokens": 3593242.0, + "repeat_count": 1.0, + "routers_loss": 0.00826631672680378, + "skip_count": 2.0, + "step": 2228, + "text_loss": 0.3777645528316498 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.469621367772234, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009300950367526728, + "loss": 0.0131, + "macro_f1": 0.8820862174034119, + "num_tokens": 3596807.0, + "repeat_count": 2.0, + "routers_loss": 0.036221496760845184, + "skip_count": 2.0, + "step": 2230, + "text_loss": 0.502962589263916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009299371091141001, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3600150.0, + "repeat_count": 0.0, + "routers_loss": 0.006449893582612276, + "skip_count": 0.0, + "step": 2232, + "text_loss": 0.20256924629211426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009297790167228161, + "loss": 0.012, + "macro_f1": 0.6666666865348816, + "num_tokens": 3602988.0, + "repeat_count": 0.0, + "routers_loss": 0.007872486487030983, + "skip_count": 2.0, + "step": 2234, + "text_loss": 0.42476826906204224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.497798649838568, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009296207596394022, + "loss": 0.0101, + "macro_f1": 0.32098764181137085, + "num_tokens": 3606071.0, + "repeat_count": 0.0, + "routers_loss": 0.027397040277719498, + "skip_count": 2.0, + "step": 2236, + "text_loss": 0.23432791233062744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009294623379245028, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 3609389.0, + "repeat_count": 0.0, + "routers_loss": 0.01042645052075386, + "skip_count": 0.0, + "step": 2238, + "text_loss": 0.16665785014629364 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009293037516388252, + "loss": 0.0161, + "macro_f1": 0.3333333432674408, + "num_tokens": 3612105.0, + "repeat_count": 0.0, + "routers_loss": 0.0012458425480872393, + "skip_count": 0.0, + "step": 2240, + "text_loss": 0.59421306848526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 10.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009291450008431404, + "loss": 0.0185, + "macro_f1": 1.0, + "num_tokens": 3615439.0, + "repeat_count": 1.0, + "routers_loss": 0.005781981628388166, + "skip_count": 1.0, + "step": 2242, + "text_loss": 0.510798454284668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 10.535368359260346, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009289860855982814, + "loss": 0.0166, + "macro_f1": 0.4871794879436493, + "num_tokens": 3618842.0, + "repeat_count": 0.0, + "routers_loss": 0.031195320188999176, + "skip_count": 3.0, + "step": 2244, + "text_loss": 0.7574363350868225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0009288270059651454, + "loss": 0.0133, + "macro_f1": 0.3333333432674408, + "num_tokens": 3621823.0, + "repeat_count": 0.0, + "routers_loss": 0.001746491645462811, + "skip_count": 0.0, + "step": 2246, + "text_loss": 0.5125683546066284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.554153213971237, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.220703125, + "learning_rate": 0.0009286677620046918, + "loss": 0.0159, + "macro_f1": 0.5492662787437439, + "num_tokens": 3624502.0, + "repeat_count": 0.0, + "routers_loss": 0.03792348504066467, + "skip_count": 2.0, + "step": 2248, + "text_loss": 0.7533677220344543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009285083537779429, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3627057.0, + "repeat_count": 0.0, + "routers_loss": 0.0009684451506473124, + "skip_count": 0.0, + "step": 2250, + "text_loss": 0.2219279706478119 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.572938068682125, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009283487813459845, + "loss": 0.0148, + "macro_f1": 0.5492662787437439, + "num_tokens": 3629720.0, + "repeat_count": 0.0, + "routers_loss": 0.022757573053240776, + "skip_count": 2.0, + "step": 2252, + "text_loss": 0.6903313994407654 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009281890447699652, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 3633234.0, + "repeat_count": 1.0, + "routers_loss": 0.003613058477640152, + "skip_count": 0.0, + "step": 2254, + "text_loss": 0.6278893351554871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0009280291441110961, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3636289.0, + "repeat_count": 0.0, + "routers_loss": 0.006214062683284283, + "skip_count": 0.0, + "step": 2256, + "text_loss": 0.3011114001274109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.60111535074846, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.041015625, + "learning_rate": 0.0009278690794306517, + "loss": 0.014, + "macro_f1": 0.5492662787437439, + "num_tokens": 3640251.0, + "repeat_count": 0.0, + "routers_loss": 0.052556321024894714, + "skip_count": 2.0, + "step": 2258, + "text_loss": 0.19894185662269592 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 10.610507778103903, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.08251953125, + "learning_rate": 0.0009277088507899689, + "loss": 0.0163, + "macro_f1": 0.9452888369560242, + "num_tokens": 3643527.0, + "repeat_count": 4.0, + "routers_loss": 0.0572301521897316, + "skip_count": 1.0, + "step": 2260, + "text_loss": 0.5593410134315491 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009275484582504475, + "loss": 0.0104, + "macro_f1": 0.3333333432674408, + "num_tokens": 3646959.0, + "repeat_count": 0.0, + "routers_loss": 0.008010074496269226, + "skip_count": 0.0, + "step": 2262, + "text_loss": 0.2128177285194397 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 10.629292632814794, + "f1_execute": 0.95652174949646, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.800000011920929, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009273879018735505, + "loss": 0.0138, + "macro_f1": 0.8521739840507507, + "num_tokens": 3651298.0, + "repeat_count": 3.0, + "routers_loss": 0.035729870200157166, + "skip_count": 3.0, + "step": 2264, + "text_loss": 0.2987811267375946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009272271817208031, + "loss": 0.0182, + "macro_f1": 0.3333333432674408, + "num_tokens": 3655609.0, + "repeat_count": 0.0, + "routers_loss": 0.002379779238253832, + "skip_count": 0.0, + "step": 2266, + "text_loss": 0.6024088263511658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009270662978537939, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 3658444.0, + "repeat_count": 0.0, + "routers_loss": 0.008943650871515274, + "skip_count": 0.0, + "step": 2268, + "text_loss": 0.1741207242012024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 10.657469914881126, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009269052503341736, + "loss": 0.0161, + "macro_f1": 0.6595745086669922, + "num_tokens": 3662282.0, + "repeat_count": 1.0, + "routers_loss": 0.030201267451047897, + "skip_count": 4.0, + "step": 2270, + "text_loss": 0.7300035953521729 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0009267440392236562, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 3665531.0, + "repeat_count": 0.0, + "routers_loss": 0.0026635683607310057, + "skip_count": 0.0, + "step": 2272, + "text_loss": 0.31535038352012634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009265826645840178, + "loss": 0.0151, + "macro_f1": 0.3333333432674408, + "num_tokens": 3668407.0, + "repeat_count": 0.0, + "routers_loss": 0.004258926957845688, + "skip_count": 0.0, + "step": 2274, + "text_loss": 0.7272579073905945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 10.68564719694746, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.125, + "learning_rate": 0.0009264211264770976, + "loss": 0.0154, + "macro_f1": 0.6122449040412903, + "num_tokens": 3671503.0, + "repeat_count": 0.0, + "routers_loss": 0.038987524807453156, + "skip_count": 4.0, + "step": 2276, + "text_loss": 0.7488982677459717 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009262594249647975, + "loss": 0.0164, + "macro_f1": 0.6666666865348816, + "num_tokens": 3674107.0, + "repeat_count": 0.0, + "routers_loss": 0.007211760152131319, + "skip_count": 1.0, + "step": 2278, + "text_loss": 0.1992369294166565 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 10.704432051658351, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0009260975601090815, + "loss": 0.0112, + "macro_f1": 0.9446290731430054, + "num_tokens": 3677184.0, + "repeat_count": 4.0, + "routers_loss": 0.02538592554628849, + "skip_count": 3.0, + "step": 2280, + "text_loss": 0.46402135491371155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009259355319719768, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 3680683.0, + "repeat_count": 0.0, + "routers_loss": 0.0038464947137981653, + "skip_count": 0.0, + "step": 2282, + "text_loss": 0.5804527401924133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009257733406155726, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3683928.0, + "repeat_count": 0.0, + "routers_loss": 0.004841136280447245, + "skip_count": 0.0, + "step": 2284, + "text_loss": 0.4834538400173187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009256109861020212, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 3687101.0, + "repeat_count": 0.0, + "routers_loss": 0.002191900508478284, + "skip_count": 0.0, + "step": 2286, + "text_loss": 0.8199604749679565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.742001761080129, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0927734375, + "learning_rate": 0.000925448468493537, + "loss": 0.0162, + "macro_f1": 0.5427350401878357, + "num_tokens": 3690490.0, + "repeat_count": 1.0, + "routers_loss": 0.03488675877451897, + "skip_count": 2.0, + "step": 2288, + "text_loss": 0.33263635635375977 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009252857878523971, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 3694109.0, + "repeat_count": 1.0, + "routers_loss": 0.002897309372201562, + "skip_count": 0.0, + "step": 2290, + "text_loss": 0.47494807839393616 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.000925122944240941, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3697233.0, + "repeat_count": 0.0, + "routers_loss": 0.01842675730586052, + "skip_count": 2.0, + "step": 2292, + "text_loss": 0.14693495631217957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.770179043146463, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.045654296875, + "learning_rate": 0.0009249599377215707, + "loss": 0.0146, + "macro_f1": 0.5866667032241821, + "num_tokens": 3700376.0, + "repeat_count": 1.0, + "routers_loss": 0.04169808700680733, + "skip_count": 3.0, + "step": 2294, + "text_loss": 0.38051268458366394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.779571470501908, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0009247967683567507, + "loss": 0.0112, + "macro_f1": 0.3272727429866791, + "num_tokens": 3703212.0, + "repeat_count": 0.0, + "routers_loss": 0.012183113023638725, + "skip_count": 1.0, + "step": 2296, + "text_loss": 0.23789077997207642 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 10.788963897857352, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05712890625, + "learning_rate": 0.0009246334362090077, + "loss": 0.0137, + "macro_f1": 0.8823530077934265, + "num_tokens": 3706490.0, + "repeat_count": 1.0, + "routers_loss": 0.01880069635808468, + "skip_count": 2.0, + "step": 2298, + "text_loss": 0.29067978262901306 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.798356325212797, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.000924469941340931, + "loss": 0.0173, + "macro_f1": 0.3272727429866791, + "num_tokens": 3709804.0, + "repeat_count": 1.0, + "routers_loss": 0.027359159663319588, + "skip_count": 0.0, + "step": 2300, + "text_loss": 0.67828369140625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.000924306283815172, + "loss": 0.0153, + "macro_f1": 0.3333333432674408, + "num_tokens": 3712824.0, + "repeat_count": 0.0, + "routers_loss": 0.003152279881760478, + "skip_count": 0.0, + "step": 2302, + "text_loss": 0.8333184719085693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.817141179923686, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0703125, + "learning_rate": 0.0009241424636944445, + "loss": 0.0159, + "macro_f1": 0.5492662787437439, + "num_tokens": 3715385.0, + "repeat_count": 0.0, + "routers_loss": 0.0442950464785099, + "skip_count": 2.0, + "step": 2304, + "text_loss": 0.41893699765205383 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 10.826533607279131, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009239784810415249, + "loss": 0.0137, + "macro_f1": 0.8823530077934265, + "num_tokens": 3719080.0, + "repeat_count": 1.0, + "routers_loss": 0.015729321166872978, + "skip_count": 2.0, + "step": 2306, + "text_loss": 0.13360483944416046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.835926034634575, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009238143359192514, + "loss": 0.0136, + "macro_f1": 0.5934640765190125, + "num_tokens": 3722439.0, + "repeat_count": 0.0, + "routers_loss": 0.028816604986786842, + "skip_count": 3.0, + "step": 2308, + "text_loss": 0.39594101905822754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.000923650028390525, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3725092.0, + "repeat_count": 0.0, + "routers_loss": 0.0036455015651881695, + "skip_count": 2.0, + "step": 2310, + "text_loss": 0.6169708371162415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009234855585183086, + "loss": 0.014, + "macro_f1": 0.6666666865348816, + "num_tokens": 3728412.0, + "repeat_count": 0.0, + "routers_loss": 0.007565604057163, + "skip_count": 1.0, + "step": 2312, + "text_loss": 0.21257059276103973 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 10.86410331670091, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0517578125, + "learning_rate": 0.0009233209263656273, + "loss": 0.0184, + "macro_f1": 0.9262410998344421, + "num_tokens": 3731467.0, + "repeat_count": 2.0, + "routers_loss": 0.02510629966855049, + "skip_count": 3.0, + "step": 2314, + "text_loss": 0.21639840304851532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0009231561319955684, + "loss": 0.0154, + "macro_f1": 0.3333333432674408, + "num_tokens": 3734906.0, + "repeat_count": 0.0, + "routers_loss": 0.00872227642685175, + "skip_count": 0.0, + "step": 2316, + "text_loss": 0.35639774799346924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009229911754712815, + "loss": 0.0176, + "macro_f1": 0.3333333432674408, + "num_tokens": 3737943.0, + "repeat_count": 0.0, + "routers_loss": 0.004695790819823742, + "skip_count": 0.0, + "step": 2318, + "text_loss": 0.5269573330879211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.892280598767243, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0009228260568559781, + "loss": 0.0115, + "macro_f1": 0.3272727429866791, + "num_tokens": 3741833.0, + "repeat_count": 1.0, + "routers_loss": 0.0217357836663723, + "skip_count": 0.0, + "step": 2320, + "text_loss": 0.5110208988189697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.901673026122689, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.0009226607762129322, + "loss": 0.0201, + "macro_f1": 0.32098764181137085, + "num_tokens": 3744642.0, + "repeat_count": 1.0, + "routers_loss": 0.05595960095524788, + "skip_count": 1.0, + "step": 2322, + "text_loss": 0.6291998624801636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0009224953336054796, + "loss": 0.0161, + "macro_f1": 0.3333333432674408, + "num_tokens": 3748127.0, + "repeat_count": 0.0, + "routers_loss": 0.0071634589694440365, + "skip_count": 0.0, + "step": 2324, + "text_loss": 0.7404762506484985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.000922329729097018, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3751373.0, + "repeat_count": 0.0, + "routers_loss": 0.0011676300782710314, + "skip_count": 0.0, + "step": 2326, + "text_loss": 0.2915459871292114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009221639627510075, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3754518.0, + "repeat_count": 0.0, + "routers_loss": 0.01039792038500309, + "skip_count": 0.0, + "step": 2328, + "text_loss": 0.22066321969032288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009219980346309702, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 3757621.0, + "repeat_count": 0.0, + "routers_loss": 0.0032070958986878395, + "skip_count": 0.0, + "step": 2330, + "text_loss": 0.5558560490608215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.076171875, + "learning_rate": 0.0009218319448004899, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 3760885.0, + "repeat_count": 0.0, + "routers_loss": 0.007085457909852266, + "skip_count": 0.0, + "step": 2332, + "text_loss": 0.4348253607749939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009216656933232129, + "loss": 0.016, + "macro_f1": 0.6666666865348816, + "num_tokens": 3764462.0, + "repeat_count": 0.0, + "routers_loss": 0.005504854489117861, + "skip_count": 1.0, + "step": 2334, + "text_loss": 0.35828644037246704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0009214992802628463, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3767159.0, + "repeat_count": 0.0, + "routers_loss": 0.0013970810687169433, + "skip_count": 0.0, + "step": 2336, + "text_loss": 0.2956557869911194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009213327056831607, + "loss": 0.0181, + "macro_f1": 0.3272727429866791, + "num_tokens": 3770408.0, + "repeat_count": 0.0, + "routers_loss": 0.0427570566534996, + "skip_count": 1.0, + "step": 2338, + "text_loss": 0.14883014559745789 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.986204872321691, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0009211659696479875, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 3773474.0, + "repeat_count": 0.0, + "routers_loss": 0.0011273405980318785, + "skip_count": 0.0, + "step": 2340, + "text_loss": 0.26011669635772705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.00092099907222122, + "loss": 0.0148, + "macro_f1": 0.3333333432674408, + "num_tokens": 3776909.0, + "repeat_count": 0.0, + "routers_loss": 0.0016178421210497618, + "skip_count": 0.0, + "step": 2342, + "text_loss": 0.49078530073165894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.000920832013466814, + "loss": 0.0129, + "macro_f1": 0.3333333432674408, + "num_tokens": 3780741.0, + "repeat_count": 0.0, + "routers_loss": 0.005510095041245222, + "skip_count": 0.0, + "step": 2344, + "text_loss": 0.4870249927043915 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0009206647934487866, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3784673.0, + "repeat_count": 1.0, + "routers_loss": 0.0047357892617583275, + "skip_count": 0.0, + "step": 2346, + "text_loss": 0.3251725733280182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0009204974122312167, + "loss": 0.0142, + "macro_f1": 0.6666666865348816, + "num_tokens": 3787503.0, + "repeat_count": 0.0, + "routers_loss": 0.00795028731226921, + "skip_count": 1.0, + "step": 2348, + "text_loss": 0.18282145261764526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0009203298698782452, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 3790528.0, + "repeat_count": 1.0, + "routers_loss": 0.0009506374481134117, + "skip_count": 0.0, + "step": 2350, + "text_loss": 0.4093080461025238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009201621664540747, + "loss": 0.0155, + "macro_f1": 0.6666666865348816, + "num_tokens": 3794134.0, + "repeat_count": 1.0, + "routers_loss": 0.005159572698175907, + "skip_count": 0.0, + "step": 2352, + "text_loss": 0.5451981425285339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009199943020229694, + "loss": 0.0148, + "macro_f1": 0.3333333432674408, + "num_tokens": 3797414.0, + "repeat_count": 0.0, + "routers_loss": 0.002356168581172824, + "skip_count": 0.0, + "step": 2354, + "text_loss": 0.3070453405380249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009198262766492554, + "loss": 0.0141, + "macro_f1": 0.6666666865348816, + "num_tokens": 3800094.0, + "repeat_count": 0.0, + "routers_loss": 0.0051761893555521965, + "skip_count": 1.0, + "step": 2356, + "text_loss": 0.5880904197692871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.00091965809039732, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3803280.0, + "repeat_count": 0.0, + "routers_loss": 0.0025952060241252184, + "skip_count": 0.0, + "step": 2358, + "text_loss": 0.5210731625556946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009194897433316127, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 3805866.0, + "repeat_count": 0.0, + "routers_loss": 0.0042560105212032795, + "skip_count": 2.0, + "step": 2360, + "text_loss": 0.6472984552383423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009193212355166446, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3808952.0, + "repeat_count": 0.0, + "routers_loss": 0.0026232977397739887, + "skip_count": 0.0, + "step": 2362, + "text_loss": 0.450063556432724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009191525670169881, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3812080.0, + "repeat_count": 0.0, + "routers_loss": 0.0034355956595391035, + "skip_count": 0.0, + "step": 2364, + "text_loss": 0.49727216362953186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.000918983737897277, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 3815282.0, + "repeat_count": 0.0, + "routers_loss": 0.0055653867311775684, + "skip_count": 1.0, + "step": 2366, + "text_loss": 0.6336377859115601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0009188147482222071, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 3818106.0, + "repeat_count": 2.0, + "routers_loss": 0.011016021482646465, + "skip_count": 2.0, + "step": 2368, + "text_loss": 0.22513329982757568 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009186455980565358, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 3821228.0, + "repeat_count": 1.0, + "routers_loss": 0.014039464294910431, + "skip_count": 0.0, + "step": 2370, + "text_loss": 0.21331638097763062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009184762874650816, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 3825048.0, + "repeat_count": 0.0, + "routers_loss": 0.001088051125407219, + "skip_count": 0.0, + "step": 2372, + "text_loss": 0.6031543612480164 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009183068165127245, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 3828781.0, + "repeat_count": 0.0, + "routers_loss": 0.006263940595090389, + "skip_count": 1.0, + "step": 2374, + "text_loss": 0.6249601244926453 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009181371852644062, + "loss": 0.0133, + "macro_f1": 0.6666666865348816, + "num_tokens": 3832507.0, + "repeat_count": 1.0, + "routers_loss": 0.001987969037145376, + "skip_count": 0.0, + "step": 2376, + "text_loss": 0.37972065806388855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009179673937851299, + "loss": 0.0158, + "macro_f1": 0.6666666865348816, + "num_tokens": 3835644.0, + "repeat_count": 0.0, + "routers_loss": 0.007635094691067934, + "skip_count": 1.0, + "step": 2378, + "text_loss": 0.46319663524627686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009177974421399598, + "loss": 0.0137, + "macro_f1": 0.6666666865348816, + "num_tokens": 3838700.0, + "repeat_count": 0.0, + "routers_loss": 0.01617279462516308, + "skip_count": 2.0, + "step": 2380, + "text_loss": 0.32141056656837463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009176273303940217, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 3841953.0, + "repeat_count": 0.0, + "routers_loss": 0.0022273799404501915, + "skip_count": 2.0, + "step": 2382, + "text_loss": 0.5908139944076538 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.192544760786616, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009174570586125026, + "loss": 0.0122, + "macro_f1": 0.32098767161369324, + "num_tokens": 3845763.0, + "repeat_count": 1.0, + "routers_loss": 0.030915161594748497, + "skip_count": 0.0, + "step": 2384, + "text_loss": 0.41400137543678284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0009172866268606513, + "loss": 0.0122, + "macro_f1": 0.6666666865348816, + "num_tokens": 3848984.0, + "repeat_count": 0.0, + "routers_loss": 0.010480951517820358, + "skip_count": 2.0, + "step": 2386, + "text_loss": 0.2560874819755554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009171160352037775, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3852118.0, + "repeat_count": 0.0, + "routers_loss": 0.00809961836785078, + "skip_count": 1.0, + "step": 2388, + "text_loss": 0.28236693143844604 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009169452837072521, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 3855314.0, + "repeat_count": 1.0, + "routers_loss": 0.005569872446358204, + "skip_count": 1.0, + "step": 2390, + "text_loss": 0.4578137695789337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009167743724365073, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 3858301.0, + "repeat_count": 0.0, + "routers_loss": 0.0038610948249697685, + "skip_count": 1.0, + "step": 2392, + "text_loss": 0.14082716405391693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009166033014570368, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3861296.0, + "repeat_count": 0.0, + "routers_loss": 0.0017607157351449132, + "skip_count": 0.0, + "step": 2394, + "text_loss": 0.384442001581192 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 11.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009164320708343954, + "loss": 0.0131, + "macro_f1": 0.6666666865348816, + "num_tokens": 3863985.0, + "repeat_count": 2.0, + "routers_loss": 0.009627950377762318, + "skip_count": 0.0, + "step": 2396, + "text_loss": 0.6969521045684814 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009162606806341989, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 3866636.0, + "repeat_count": 0.0, + "routers_loss": 0.006915586534887552, + "skip_count": 0.0, + "step": 2398, + "text_loss": 0.48069697618484497 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0009160891309221242, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 3870867.0, + "repeat_count": 1.0, + "routers_loss": 0.0013031222624704242, + "skip_count": 0.0, + "step": 2400, + "text_loss": 0.3882075846195221 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.277076606985618, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009159174217639096, + "loss": 0.0112, + "macro_f1": 0.5427350401878357, + "num_tokens": 3873663.0, + "repeat_count": 2.0, + "routers_loss": 0.06621067970991135, + "skip_count": 1.0, + "step": 2402, + "text_loss": 0.5740041136741638 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0009157455532253547, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 3876788.0, + "repeat_count": 1.0, + "routers_loss": 0.005957918707281351, + "skip_count": 0.0, + "step": 2404, + "text_loss": 0.26025933027267456 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 11.295861461696507, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009155735253723191, + "loss": 0.0126, + "macro_f1": 0.9452888369560242, + "num_tokens": 3879942.0, + "repeat_count": 1.0, + "routers_loss": 0.039429809898138046, + "skip_count": 4.0, + "step": 2406, + "text_loss": 1.1349908113479614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009154013382707251, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 3882682.0, + "repeat_count": 0.0, + "routers_loss": 0.0012570557883009315, + "skip_count": 0.0, + "step": 2408, + "text_loss": 0.5611135363578796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0009152289919865543, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3886425.0, + "repeat_count": 0.0, + "routers_loss": 0.0017455556662753224, + "skip_count": 0.0, + "step": 2410, + "text_loss": 0.7523751854896545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0009150564865858506, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3889273.0, + "repeat_count": 0.0, + "routers_loss": 0.011178011074662209, + "skip_count": 1.0, + "step": 2412, + "text_loss": 0.26942551136016846 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 11.333431171118287, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009148838221347182, + "loss": 0.0107, + "macro_f1": 0.5934640765190125, + "num_tokens": 3892199.0, + "repeat_count": 3.0, + "routers_loss": 0.019628092646598816, + "skip_count": 0.0, + "step": 2414, + "text_loss": 0.5492315888404846 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0009147109986993225, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 3895362.0, + "repeat_count": 1.0, + "routers_loss": 0.012255983427166939, + "skip_count": 0.0, + "step": 2416, + "text_loss": 0.23798216879367828 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009145380163458899, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 3898476.0, + "repeat_count": 0.0, + "routers_loss": 0.007018954027444124, + "skip_count": 0.0, + "step": 2418, + "text_loss": 0.1923145055770874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0009143648751407074, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 3901817.0, + "repeat_count": 0.0, + "routers_loss": 0.0008574824314564466, + "skip_count": 0.0, + "step": 2420, + "text_loss": 0.4001806974411011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.371000880540064, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11328125, + "learning_rate": 0.0009141915751501231, + "loss": 0.0102, + "macro_f1": 0.5492662787437439, + "num_tokens": 3905461.0, + "repeat_count": 0.0, + "routers_loss": 0.01572350226342678, + "skip_count": 2.0, + "step": 2422, + "text_loss": 0.19519129395484924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0009140181164405458, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3908878.0, + "repeat_count": 0.0, + "routers_loss": 0.0005503420252352953, + "skip_count": 0.0, + "step": 2424, + "text_loss": 0.6937088370323181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009138444990784454, + "loss": 0.013, + "macro_f1": 0.3333333432674408, + "num_tokens": 3912053.0, + "repeat_count": 0.0, + "routers_loss": 0.007556677330285311, + "skip_count": 0.0, + "step": 2426, + "text_loss": 0.35431069135665894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.000913670723130352, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 3915192.0, + "repeat_count": 0.0, + "routers_loss": 0.0013609991874545813, + "skip_count": 0.0, + "step": 2428, + "text_loss": 0.5171207189559937 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009134967886628573, + "loss": 0.0115, + "macro_f1": 1.0, + "num_tokens": 3917927.0, + "repeat_count": 2.0, + "routers_loss": 0.010895746760070324, + "skip_count": 2.0, + "step": 2430, + "text_loss": 0.2852934002876282 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.417963017317287, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009133226957426133, + "loss": 0.0132, + "macro_f1": 0.5492662787437439, + "num_tokens": 3921460.0, + "repeat_count": 2.0, + "routers_loss": 0.04196908697485924, + "skip_count": 0.0, + "step": 2432, + "text_loss": 0.4864770770072937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009131484444363324, + "loss": 0.0155, + "macro_f1": 0.3333333432674408, + "num_tokens": 3924662.0, + "repeat_count": 0.0, + "routers_loss": 0.004484197124838829, + "skip_count": 0.0, + "step": 2434, + "text_loss": 0.7568684220314026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0009129740348107882, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3927337.0, + "repeat_count": 0.0, + "routers_loss": 0.004351360257714987, + "skip_count": 2.0, + "step": 2436, + "text_loss": 0.5953161716461182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 11.446140299383622, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.04736328125, + "learning_rate": 0.0009127994669328151, + "loss": 0.0085, + "macro_f1": 0.6122449040412903, + "num_tokens": 3930407.0, + "repeat_count": 0.0, + "routers_loss": 0.01664198748767376, + "skip_count": 4.0, + "step": 2438, + "text_loss": 0.5320524573326111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009126247408693071, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 3933184.0, + "repeat_count": 0.0, + "routers_loss": 0.0017819046042859554, + "skip_count": 1.0, + "step": 2440, + "text_loss": 0.6051273345947266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009124498566872204, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 3936620.0, + "repeat_count": 0.0, + "routers_loss": 0.005519696045666933, + "skip_count": 0.0, + "step": 2442, + "text_loss": 0.12987950444221497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.474317581449956, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009122748144535704, + "loss": 0.0111, + "macro_f1": 0.32098764181137085, + "num_tokens": 3940010.0, + "repeat_count": 0.0, + "routers_loss": 0.04543351009488106, + "skip_count": 2.0, + "step": 2444, + "text_loss": 0.4642033576965332 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009120996142354338, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3943135.0, + "repeat_count": 0.0, + "routers_loss": 0.00550565542653203, + "skip_count": 0.0, + "step": 2446, + "text_loss": 0.5697627067565918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009119242560999477, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3946650.0, + "repeat_count": 0.0, + "routers_loss": 0.008842485956847668, + "skip_count": 0.0, + "step": 2448, + "text_loss": 0.17046524584293365 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0009117487401143095, + "loss": 0.0154, + "macro_f1": 0.6666666865348816, + "num_tokens": 3949470.0, + "repeat_count": 1.0, + "routers_loss": 0.005900127813220024, + "skip_count": 0.0, + "step": 2450, + "text_loss": 0.37260866165161133 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0009115730663457773, + "loss": 0.0137, + "macro_f1": 1.0, + "num_tokens": 3952546.0, + "repeat_count": 1.0, + "routers_loss": 0.003409258322790265, + "skip_count": 1.0, + "step": 2452, + "text_loss": 0.5308008193969727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009113972348616698, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 3955817.0, + "repeat_count": 0.0, + "routers_loss": 0.010098597034811974, + "skip_count": 1.0, + "step": 2454, + "text_loss": 0.39226648211479187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 11.530672145582624, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009112212457293658, + "loss": 0.0102, + "macro_f1": 0.3272727429866791, + "num_tokens": 3958911.0, + "repeat_count": 0.0, + "routers_loss": 0.08184818178415298, + "skip_count": 0.0, + "step": 2456, + "text_loss": 0.45411455631256104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0009110450990163047, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 3962584.0, + "repeat_count": 0.0, + "routers_loss": 0.0009352223132736981, + "skip_count": 0.0, + "step": 2458, + "text_loss": 0.47292324900627136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0009108687947899863, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 3965597.0, + "repeat_count": 1.0, + "routers_loss": 0.008150188252329826, + "skip_count": 2.0, + "step": 2460, + "text_loss": 0.33208340406417847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.558849427648958, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009106923331179707, + "loss": 0.0125, + "macro_f1": 0.5492662787437439, + "num_tokens": 3968664.0, + "repeat_count": 0.0, + "routers_loss": 0.050999004393815994, + "skip_count": 2.0, + "step": 2462, + "text_loss": 0.2459995150566101 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009105157140678782, + "loss": 0.0126, + "macro_f1": 0.6666666865348816, + "num_tokens": 3971772.0, + "repeat_count": 0.0, + "routers_loss": 0.006196586415171623, + "skip_count": 1.0, + "step": 2464, + "text_loss": 0.23956991732120514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.577634282359847, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009103389377073896, + "loss": 0.01, + "macro_f1": 0.3333333432674408, + "num_tokens": 3976224.0, + "repeat_count": 0.0, + "routers_loss": 0.008181816898286343, + "skip_count": 0.0, + "step": 2466, + "text_loss": 0.3235875070095062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.0009101620041042462, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3978876.0, + "repeat_count": 0.0, + "routers_loss": 0.0015451472718268633, + "skip_count": 0.0, + "step": 2468, + "text_loss": 0.4038759469985962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.596419137070736, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09130859375, + "learning_rate": 0.000909984913326249, + "loss": 0.0131, + "macro_f1": 0.3272727429866791, + "num_tokens": 3981992.0, + "repeat_count": 0.0, + "routers_loss": 0.021785033866763115, + "skip_count": 1.0, + "step": 2470, + "text_loss": 0.6346460580825806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009098076654412595, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 3984560.0, + "repeat_count": 0.0, + "routers_loss": 0.0011462471447885036, + "skip_count": 0.0, + "step": 2472, + "text_loss": 0.3449646532535553 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009096302605171996, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 3987548.0, + "repeat_count": 0.0, + "routers_loss": 0.0014367027906700969, + "skip_count": 0.0, + "step": 2474, + "text_loss": 0.5918350219726562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0009094526986220513, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 3990727.0, + "repeat_count": 0.0, + "routers_loss": 0.0008977655088528991, + "skip_count": 0.0, + "step": 2476, + "text_loss": 0.463350385427475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.633988846492516, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0009092749798238563, + "loss": 0.015, + "macro_f1": 0.3272727429866791, + "num_tokens": 3993757.0, + "repeat_count": 1.0, + "routers_loss": 0.016712551936507225, + "skip_count": 0.0, + "step": 2478, + "text_loss": 0.5621229410171509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.643381273847961, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.000909097104190717, + "loss": 0.0172, + "macro_f1": 0.32098764181137085, + "num_tokens": 3997259.0, + "repeat_count": 0.0, + "routers_loss": 0.04134179651737213, + "skip_count": 2.0, + "step": 2480, + "text_loss": 0.375476598739624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0009089190717907956, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4000563.0, + "repeat_count": 0.0, + "routers_loss": 0.003462378401309252, + "skip_count": 0.0, + "step": 2482, + "text_loss": 0.5553798675537109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009087408826923146, + "loss": 0.0182, + "macro_f1": 0.6666666865348816, + "num_tokens": 4004065.0, + "repeat_count": 0.0, + "routers_loss": 0.008057428523898125, + "skip_count": 2.0, + "step": 2484, + "text_loss": 0.4329465329647064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009085625369635564, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4007119.0, + "repeat_count": 0.0, + "routers_loss": 0.005759050603955984, + "skip_count": 0.0, + "step": 2486, + "text_loss": 0.501268744468689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.680950983269739, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009083840346728631, + "loss": 0.0122, + "macro_f1": 0.3272727429866791, + "num_tokens": 4010547.0, + "repeat_count": 1.0, + "routers_loss": 0.020763102918863297, + "skip_count": 0.0, + "step": 2488, + "text_loss": 0.480196475982666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0009082053758886374, + "loss": 0.0117, + "macro_f1": 0.6666666865348816, + "num_tokens": 4014600.0, + "repeat_count": 0.0, + "routers_loss": 0.005801836494356394, + "skip_count": 1.0, + "step": 2490, + "text_loss": 0.18249782919883728 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009080265606793416, + "loss": 0.0128, + "macro_f1": 1.0, + "num_tokens": 4017964.0, + "repeat_count": 1.0, + "routers_loss": 0.004226063843816519, + "skip_count": 1.0, + "step": 2492, + "text_loss": 0.6573076248168945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.000907847589113498, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 4020694.0, + "repeat_count": 0.0, + "routers_loss": 0.004281101748347282, + "skip_count": 2.0, + "step": 2494, + "text_loss": 0.3944586217403412 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.000907668461259689, + "loss": 0.0152, + "macro_f1": 0.6666666865348816, + "num_tokens": 4023757.0, + "repeat_count": 0.0, + "routers_loss": 0.008786370046436787, + "skip_count": 1.0, + "step": 2496, + "text_loss": 0.6452898979187012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009074891771865566, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 4026601.0, + "repeat_count": 0.0, + "routers_loss": 0.005209595896303654, + "skip_count": 0.0, + "step": 2498, + "text_loss": 0.9633619785308838 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 11.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0009073097369628028, + "loss": 0.013, + "macro_f1": 1.0, + "num_tokens": 4030321.0, + "repeat_count": 3.0, + "routers_loss": 0.00860709697008133, + "skip_count": 1.0, + "step": 2500, + "text_loss": 0.48566827178001404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009071301406571893, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 4033234.0, + "repeat_count": 0.0, + "routers_loss": 0.0035277456045150757, + "skip_count": 0.0, + "step": 2502, + "text_loss": 0.3771554231643677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000906950388338538, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 4036417.0, + "repeat_count": 0.0, + "routers_loss": 0.0013424850767478347, + "skip_count": 0.0, + "step": 2504, + "text_loss": 0.8962806463241577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009067704800757301, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4039564.0, + "repeat_count": 0.0, + "routers_loss": 0.0010423909407109022, + "skip_count": 0.0, + "step": 2506, + "text_loss": 0.43170279264450073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.774875256824185, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.000906590415937707, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 4043212.0, + "repeat_count": 0.0, + "routers_loss": 0.021780289709568024, + "skip_count": 1.0, + "step": 2508, + "text_loss": 0.41495826840400696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0009064101959934696, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4046687.0, + "repeat_count": 0.0, + "routers_loss": 0.007261929102241993, + "skip_count": 1.0, + "step": 2510, + "text_loss": 0.21821187436580658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0009062298203120783, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 4050735.0, + "repeat_count": 0.0, + "routers_loss": 0.007447180338203907, + "skip_count": 2.0, + "step": 2512, + "text_loss": 0.1818767935037613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.803052538890519, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0009060492889626535, + "loss": 0.0142, + "macro_f1": 0.3272727429866791, + "num_tokens": 4054426.0, + "repeat_count": 1.0, + "routers_loss": 0.0718490406870842, + "skip_count": 0.0, + "step": 2514, + "text_loss": 0.22798970341682434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009058686020143753, + "loss": 0.0183, + "macro_f1": 0.3333333432674408, + "num_tokens": 4057615.0, + "repeat_count": 0.0, + "routers_loss": 0.0052676633931696415, + "skip_count": 0.0, + "step": 2516, + "text_loss": 0.1712338626384735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0009056877595364832, + "loss": 0.0137, + "macro_f1": 0.3333333432674408, + "num_tokens": 4060338.0, + "repeat_count": 0.0, + "routers_loss": 0.0018052728846669197, + "skip_count": 0.0, + "step": 2518, + "text_loss": 0.6811438798904419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009055067615982761, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4062887.0, + "repeat_count": 0.0, + "routers_loss": 0.0009029926732182503, + "skip_count": 0.0, + "step": 2520, + "text_loss": 0.5480356812477112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009053256082691133, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 4065357.0, + "repeat_count": 0.0, + "routers_loss": 0.0027515271212905645, + "skip_count": 0.0, + "step": 2522, + "text_loss": 0.5234101414680481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009051442996184127, + "loss": 0.0174, + "macro_f1": 0.3333333432674408, + "num_tokens": 4068111.0, + "repeat_count": 0.0, + "routers_loss": 0.002199822571128607, + "skip_count": 0.0, + "step": 2524, + "text_loss": 0.2418575882911682 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009049628357156521, + "loss": 0.0143, + "macro_f1": 0.6666666865348816, + "num_tokens": 4071284.0, + "repeat_count": 0.0, + "routers_loss": 0.006303096655756235, + "skip_count": 2.0, + "step": 2526, + "text_loss": 0.7948065996170044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.868799530378633, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000904781216630369, + "loss": 0.0068, + "macro_f1": 0.6601307392120361, + "num_tokens": 4074750.0, + "repeat_count": 1.0, + "routers_loss": 0.01791904680430889, + "skip_count": 2.0, + "step": 2528, + "text_loss": 0.809726357460022 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009045994424321602, + "loss": 0.0102, + "macro_f1": 1.0, + "num_tokens": 4078617.0, + "repeat_count": 2.0, + "routers_loss": 0.016553178429603577, + "skip_count": 2.0, + "step": 2530, + "text_loss": 0.8755000829696655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0009044175131906817, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 4080936.0, + "repeat_count": 0.0, + "routers_loss": 0.00884837657213211, + "skip_count": 0.0, + "step": 2532, + "text_loss": 0.795871913433075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.896976812444967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009042354289756491, + "loss": 0.0122, + "macro_f1": 0.3333333432674408, + "num_tokens": 4084459.0, + "repeat_count": 0.0, + "routers_loss": 0.0024387789890170097, + "skip_count": 0.0, + "step": 2534, + "text_loss": 0.18875400722026825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009040531898568379, + "loss": 0.0171, + "macro_f1": 0.3333333432674408, + "num_tokens": 4088464.0, + "repeat_count": 0.0, + "routers_loss": 0.00491489190608263, + "skip_count": 0.0, + "step": 2536, + "text_loss": 0.334369033575058 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.091796875, + "learning_rate": 0.000903870795904082, + "loss": 0.0145, + "macro_f1": 0.6666666865348816, + "num_tokens": 4091659.0, + "repeat_count": 0.0, + "routers_loss": 0.004592662677168846, + "skip_count": 2.0, + "step": 2538, + "text_loss": 0.21298295259475708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.925154094511301, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0458984375, + "learning_rate": 0.000903688247187275, + "loss": 0.0137, + "macro_f1": 0.5492662787437439, + "num_tokens": 4095496.0, + "repeat_count": 0.0, + "routers_loss": 0.011647242121398449, + "skip_count": 2.0, + "step": 2540, + "text_loss": 0.2985081672668457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009035055437763704, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4098663.0, + "repeat_count": 0.0, + "routers_loss": 0.0021238960325717926, + "skip_count": 0.0, + "step": 2542, + "text_loss": 0.35359489917755127 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0009033226857413803, + "loss": 0.0163, + "macro_f1": 0.6666666865348816, + "num_tokens": 4101588.0, + "repeat_count": 1.0, + "routers_loss": 0.0024701557122170925, + "skip_count": 0.0, + "step": 2544, + "text_loss": 1.1577601432800293 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.000903139673152376, + "loss": 0.012, + "macro_f1": 0.3333333432674408, + "num_tokens": 4104643.0, + "repeat_count": 0.0, + "routers_loss": 0.002499542199075222, + "skip_count": 0.0, + "step": 2546, + "text_loss": 1.0173401832580566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0009029565060794885, + "loss": 0.0165, + "macro_f1": 0.3333333432674408, + "num_tokens": 4109247.0, + "repeat_count": 0.0, + "routers_loss": 0.0034200598020106554, + "skip_count": 0.0, + "step": 2548, + "text_loss": 0.5690504312515259 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.972116231288524, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009027731845929079, + "loss": 0.0155, + "macro_f1": 0.8823530077934265, + "num_tokens": 4112597.0, + "repeat_count": 1.0, + "routers_loss": 0.015981333330273628, + "skip_count": 1.0, + "step": 2550, + "text_loss": 0.294549822807312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.981508658643968, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06103515625, + "learning_rate": 0.0009025897087628829, + "loss": 0.0064, + "macro_f1": 0.5492662787437439, + "num_tokens": 4115844.0, + "repeat_count": 0.0, + "routers_loss": 0.02606951631605625, + "skip_count": 2.0, + "step": 2552, + "text_loss": 0.22692419588565826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009024060786597222, + "loss": 0.0202, + "macro_f1": 0.3333333432674408, + "num_tokens": 4118634.0, + "repeat_count": 0.0, + "routers_loss": 0.001026194542646408, + "skip_count": 0.0, + "step": 2554, + "text_loss": 0.6807059645652771 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.000902222294353793, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4122024.0, + "repeat_count": 0.0, + "routers_loss": 0.001974924933165312, + "skip_count": 0.0, + "step": 2556, + "text_loss": 0.7373668551445007 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009020383559155219, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 4124803.0, + "repeat_count": 1.0, + "routers_loss": 0.004662613850086927, + "skip_count": 2.0, + "step": 2558, + "text_loss": 0.21808166801929474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0009018542634153943, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 4127680.0, + "repeat_count": 0.0, + "routers_loss": 0.006881687790155411, + "skip_count": 0.0, + "step": 2560, + "text_loss": 0.25192978978157043 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009016700169239551, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 4130431.0, + "repeat_count": 1.0, + "routers_loss": 0.005977808032184839, + "skip_count": 1.0, + "step": 2562, + "text_loss": 0.4700816869735718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009014856165118075, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 4133535.0, + "repeat_count": 0.0, + "routers_loss": 0.007005698047578335, + "skip_count": 1.0, + "step": 2564, + "text_loss": 0.6558199524879456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0009013010622496144, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 4136534.0, + "repeat_count": 0.0, + "routers_loss": 0.007262171246111393, + "skip_count": 0.0, + "step": 2566, + "text_loss": 0.2565421462059021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 12.056354564132668, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009011163542080971, + "loss": 0.0088, + "macro_f1": 0.5934640765190125, + "num_tokens": 4139762.0, + "repeat_count": 0.0, + "routers_loss": 0.05431923270225525, + "skip_count": 3.0, + "step": 2568, + "text_loss": 0.19896510243415833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0009009314924580363, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4143398.0, + "repeat_count": 0.0, + "routers_loss": 0.003667369019240141, + "skip_count": 0.0, + "step": 2570, + "text_loss": 0.6581419110298157 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0009007464770702712, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4146248.0, + "repeat_count": 0.0, + "routers_loss": 0.00132099783513695, + "skip_count": 0.0, + "step": 2572, + "text_loss": 0.5316711068153381 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0009005613081157002, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 4149455.0, + "repeat_count": 0.0, + "routers_loss": 0.0020061524119228125, + "skip_count": 0.0, + "step": 2574, + "text_loss": 0.5400773882865906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0009003759856652802, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 4152774.0, + "repeat_count": 0.0, + "routers_loss": 0.002621434163302183, + "skip_count": 1.0, + "step": 2576, + "text_loss": 0.3672606945037842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009001905097900273, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 4155835.0, + "repeat_count": 0.0, + "routers_loss": 0.005290219560265541, + "skip_count": 0.0, + "step": 2578, + "text_loss": 0.8159038424491882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0009000048805610161, + "loss": 0.0119, + "macro_f1": 0.3333333432674408, + "num_tokens": 4158874.0, + "repeat_count": 0.0, + "routers_loss": 0.0013576085912063718, + "skip_count": 0.0, + "step": 2580, + "text_loss": 0.5518951416015625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.00089981909804938, + "loss": 0.0143, + "macro_f1": 0.3333333432674408, + "num_tokens": 4162076.0, + "repeat_count": 0.0, + "routers_loss": 0.0021483441814780235, + "skip_count": 0.0, + "step": 2582, + "text_loss": 0.43552228808403015 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 12.131493982976226, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.068359375, + "learning_rate": 0.0008996331623263114, + "loss": 0.0117, + "macro_f1": 0.7795917987823486, + "num_tokens": 4165041.0, + "repeat_count": 1.0, + "routers_loss": 0.0544300302863121, + "skip_count": 4.0, + "step": 2584, + "text_loss": 0.24812501668930054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0008994470734630611, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 4168290.0, + "repeat_count": 0.0, + "routers_loss": 0.0017150711501017213, + "skip_count": 0.0, + "step": 2586, + "text_loss": 0.6392097473144531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0008992608315309388, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4171310.0, + "repeat_count": 0.0, + "routers_loss": 0.0046473173424601555, + "skip_count": 2.0, + "step": 2588, + "text_loss": 0.6534156799316406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.15967126504256, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0008990744366013125, + "loss": 0.0105, + "macro_f1": 0.3144654333591461, + "num_tokens": 4174042.0, + "repeat_count": 2.0, + "routers_loss": 0.060913100838661194, + "skip_count": 1.0, + "step": 2590, + "text_loss": 0.5365690588951111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 12.169063692398003, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008988878887456093, + "loss": 0.0118, + "macro_f1": 0.6051587462425232, + "num_tokens": 4177666.0, + "repeat_count": 1.0, + "routers_loss": 0.06268956512212753, + "skip_count": 4.0, + "step": 2592, + "text_loss": 0.226226806640625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.178456119753449, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008987011880353149, + "loss": 0.0089, + "macro_f1": 0.32098764181137085, + "num_tokens": 4180490.0, + "repeat_count": 0.0, + "routers_loss": 0.030141465365886688, + "skip_count": 2.0, + "step": 2594, + "text_loss": 0.2581401765346527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.187848547108894, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008985143345419729, + "loss": 0.0082, + "macro_f1": 0.5492662787437439, + "num_tokens": 4183300.0, + "repeat_count": 0.0, + "routers_loss": 0.018745863810181618, + "skip_count": 2.0, + "step": 2596, + "text_loss": 0.7778542637825012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.064453125, + "learning_rate": 0.0008983273283371862, + "loss": 0.0096, + "macro_f1": 0.5492662787437439, + "num_tokens": 4186535.0, + "repeat_count": 0.0, + "routers_loss": 0.026792079210281372, + "skip_count": 2.0, + "step": 2598, + "text_loss": 0.34700271487236023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008981401694926159, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4189082.0, + "repeat_count": 0.0, + "routers_loss": 0.001914160675369203, + "skip_count": 0.0, + "step": 2600, + "text_loss": 0.6879339218139648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0008979528580799815, + "loss": 0.0136, + "macro_f1": 0.6666666865348816, + "num_tokens": 4192330.0, + "repeat_count": 0.0, + "routers_loss": 0.007978348061442375, + "skip_count": 2.0, + "step": 2602, + "text_loss": 0.3524550497531891 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 12.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008977653941710613, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 4196117.0, + "repeat_count": 2.0, + "routers_loss": 0.0035376469604671, + "skip_count": 0.0, + "step": 2604, + "text_loss": 0.42356348037719727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.0008975777778376916, + "loss": 0.0156, + "macro_f1": 0.6666666865348816, + "num_tokens": 4200423.0, + "repeat_count": 0.0, + "routers_loss": 0.008262477815151215, + "skip_count": 1.0, + "step": 2606, + "text_loss": 0.5272893905639648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.244203111241562, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0008973900091517675, + "loss": 0.0114, + "macro_f1": 0.3272727429866791, + "num_tokens": 4203257.0, + "repeat_count": 0.0, + "routers_loss": 0.022957922890782356, + "skip_count": 1.0, + "step": 2608, + "text_loss": 0.2713734805583954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.253595538597006, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.000897202088185242, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 4206243.0, + "repeat_count": 0.0, + "routers_loss": 0.006623407825827599, + "skip_count": 2.0, + "step": 2610, + "text_loss": 0.5920525789260864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008970140150101274, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4209264.0, + "repeat_count": 0.0, + "routers_loss": 0.0008602747693657875, + "skip_count": 0.0, + "step": 2612, + "text_loss": 0.33421996235847473 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0008968257896984932, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 4212058.0, + "repeat_count": 0.0, + "routers_loss": 0.0024653903674334288, + "skip_count": 1.0, + "step": 2614, + "text_loss": 0.37923356890678406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008966374123224677, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4214929.0, + "repeat_count": 0.0, + "routers_loss": 0.010878405533730984, + "skip_count": 0.0, + "step": 2616, + "text_loss": 0.4350503981113434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.291165248018785, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0008964488829542376, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 4219170.0, + "repeat_count": 0.0, + "routers_loss": 0.02864212542772293, + "skip_count": 1.0, + "step": 2618, + "text_loss": 0.26250728964805603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008962602016660478, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4222077.0, + "repeat_count": 0.0, + "routers_loss": 0.010444172658026218, + "skip_count": 2.0, + "step": 2620, + "text_loss": 0.4718937575817108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008960713685302011, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4225383.0, + "repeat_count": 0.0, + "routers_loss": 0.006409442983567715, + "skip_count": 1.0, + "step": 2622, + "text_loss": 0.30420538783073425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.31934253008512, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0008958823836190588, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 4228349.0, + "repeat_count": 0.0, + "routers_loss": 0.009996986016631126, + "skip_count": 1.0, + "step": 2624, + "text_loss": 0.5392362475395203 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0008956932470050404, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 4232007.0, + "repeat_count": 0.0, + "routers_loss": 0.0014383369125425816, + "skip_count": 0.0, + "step": 2626, + "text_loss": 0.7112401127815247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0008955039587606233, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 4235122.0, + "repeat_count": 0.0, + "routers_loss": 0.00781513936817646, + "skip_count": 3.0, + "step": 2628, + "text_loss": 0.17802883684635162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 12.347519812151454, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008953145189583429, + "loss": 0.0126, + "macro_f1": 0.542222261428833, + "num_tokens": 4238248.0, + "repeat_count": 0.0, + "routers_loss": 0.062252625823020935, + "skip_count": 4.0, + "step": 2630, + "text_loss": 0.5551572442054749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008951249276707933, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4241042.0, + "repeat_count": 0.0, + "routers_loss": 0.0011421777307987213, + "skip_count": 0.0, + "step": 2632, + "text_loss": 0.7092233896255493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0008949351849706261, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4243939.0, + "repeat_count": 0.0, + "routers_loss": 0.0032689040526747704, + "skip_count": 0.0, + "step": 2634, + "text_loss": 0.19925718009471893 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008947452909305509, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 4247535.0, + "repeat_count": 1.0, + "routers_loss": 0.002066014800220728, + "skip_count": 0.0, + "step": 2636, + "text_loss": 0.5249715447425842 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 12.385089521573232, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.09326171875, + "learning_rate": 0.0008945552456233356, + "loss": 0.0169, + "macro_f1": 0.8820862174034119, + "num_tokens": 4251441.0, + "repeat_count": 2.0, + "routers_loss": 0.029332537204027176, + "skip_count": 2.0, + "step": 2638, + "text_loss": 0.19229578971862793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.078125, + "learning_rate": 0.0008943650491218058, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4254314.0, + "repeat_count": 0.0, + "routers_loss": 0.0075911120511591434, + "skip_count": 0.0, + "step": 2640, + "text_loss": 0.27059751749038696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008941747014988453, + "loss": 0.0156, + "macro_f1": 0.3333333432674408, + "num_tokens": 4257442.0, + "repeat_count": 0.0, + "routers_loss": 0.009030844084918499, + "skip_count": 0.0, + "step": 2642, + "text_loss": 0.36747801303863525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.123046875, + "learning_rate": 0.0008939842028273956, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 4260386.0, + "repeat_count": 0.0, + "routers_loss": 0.007844001986086369, + "skip_count": 1.0, + "step": 2644, + "text_loss": 0.6397647857666016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.422659230995011, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008937935531804562, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 4263516.0, + "repeat_count": 0.0, + "routers_loss": 0.0018789108144119382, + "skip_count": 0.0, + "step": 2646, + "text_loss": 0.4795534908771515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.432051658350455, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0008936027526310844, + "loss": 0.0098, + "macro_f1": 0.3272727429866791, + "num_tokens": 4266744.0, + "repeat_count": 0.0, + "routers_loss": 0.0348590686917305, + "skip_count": 1.0, + "step": 2648, + "text_loss": 0.27691999077796936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.000893411801252395, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4269766.0, + "repeat_count": 0.0, + "routers_loss": 0.004543309565633535, + "skip_count": 1.0, + "step": 2650, + "text_loss": 0.18867231905460358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008932206991175615, + "loss": 0.0141, + "macro_f1": 0.6666666865348816, + "num_tokens": 4273513.0, + "repeat_count": 0.0, + "routers_loss": 0.0035277456045150757, + "skip_count": 1.0, + "step": 2652, + "text_loss": 0.45613357424736023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008930294462998143, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4276878.0, + "repeat_count": 1.0, + "routers_loss": 0.011337592266499996, + "skip_count": 0.0, + "step": 2654, + "text_loss": 0.24733254313468933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0008928380428724419, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4279915.0, + "repeat_count": 0.0, + "routers_loss": 0.0010295971296727657, + "skip_count": 1.0, + "step": 2656, + "text_loss": 0.41722849011421204 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008926464889087903, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 4282888.0, + "repeat_count": 0.0, + "routers_loss": 0.0017198545392602682, + "skip_count": 2.0, + "step": 2658, + "text_loss": 0.738322377204895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0008924547844822634, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4285805.0, + "repeat_count": 0.0, + "routers_loss": 0.001339946174994111, + "skip_count": 0.0, + "step": 2660, + "text_loss": 0.4802379906177521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.000892262929666323, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4290282.0, + "repeat_count": 0.0, + "routers_loss": 0.0022340165451169014, + "skip_count": 0.0, + "step": 2662, + "text_loss": 0.6503544449806213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008920709245344878, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4294106.0, + "repeat_count": 0.0, + "routers_loss": 0.005288850050419569, + "skip_count": 1.0, + "step": 2664, + "text_loss": 0.12312037497758865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0008918787691603347, + "loss": 0.0121, + "macro_f1": 0.6666666865348816, + "num_tokens": 4298013.0, + "repeat_count": 0.0, + "routers_loss": 0.004259659443050623, + "skip_count": 1.0, + "step": 2666, + "text_loss": 0.3070000112056732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.000891686463617498, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 4300799.0, + "repeat_count": 0.0, + "routers_loss": 0.009489355608820915, + "skip_count": 1.0, + "step": 2668, + "text_loss": 0.18535588681697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008914940079796696, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4304641.0, + "repeat_count": 0.0, + "routers_loss": 0.0025417013093829155, + "skip_count": 0.0, + "step": 2670, + "text_loss": 0.482585072517395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008913014023205988, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4307462.0, + "repeat_count": 0.0, + "routers_loss": 0.006371749565005302, + "skip_count": 0.0, + "step": 2672, + "text_loss": 0.7064456939697266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008911086467140925, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4310396.0, + "repeat_count": 0.0, + "routers_loss": 0.0027512952219694853, + "skip_count": 0.0, + "step": 2674, + "text_loss": 0.23532851040363312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05712890625, + "learning_rate": 0.000890915741234015, + "loss": 0.0133, + "macro_f1": 0.6666666865348816, + "num_tokens": 4314781.0, + "repeat_count": 0.0, + "routers_loss": 0.008253013715147972, + "skip_count": 1.0, + "step": 2676, + "text_loss": 0.30950358510017395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008907226859542879, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4317988.0, + "repeat_count": 0.0, + "routers_loss": 0.005409995559602976, + "skip_count": 2.0, + "step": 2678, + "text_loss": 0.4930732846260071 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0008905294809488907, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 4321014.0, + "repeat_count": 1.0, + "routers_loss": 0.0029942214023321867, + "skip_count": 1.0, + "step": 2680, + "text_loss": 0.6224040389060974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0008903361262918595, + "loss": 0.0115, + "macro_f1": 0.6666666865348816, + "num_tokens": 4324268.0, + "repeat_count": 0.0, + "routers_loss": 0.008411120623350143, + "skip_count": 1.0, + "step": 2682, + "text_loss": 0.16296671330928802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05126953125, + "learning_rate": 0.0008901426220572884, + "loss": 0.0138, + "macro_f1": 1.0, + "num_tokens": 4327494.0, + "repeat_count": 2.0, + "routers_loss": 0.01039006095379591, + "skip_count": 4.0, + "step": 2684, + "text_loss": 0.43866512179374695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0008899489683193286, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 4330936.0, + "repeat_count": 0.0, + "routers_loss": 0.0009329111780971289, + "skip_count": 0.0, + "step": 2686, + "text_loss": 0.44250962138175964 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0008897551651521885, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 4334123.0, + "repeat_count": 0.0, + "routers_loss": 0.003197216661646962, + "skip_count": 0.0, + "step": 2688, + "text_loss": 0.48313501477241516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.629292632814794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09716796875, + "learning_rate": 0.0008895612126301339, + "loss": 0.0157, + "macro_f1": 0.3333333432674408, + "num_tokens": 4337610.0, + "repeat_count": 0.0, + "routers_loss": 0.0033548236824572086, + "skip_count": 0.0, + "step": 2690, + "text_loss": 0.4715327322483063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0008893671108274877, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 4341026.0, + "repeat_count": 0.0, + "routers_loss": 0.0024757643695920706, + "skip_count": 0.0, + "step": 2692, + "text_loss": 0.43402785062789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008891728598186302, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 4344422.0, + "repeat_count": 0.0, + "routers_loss": 0.003317243419587612, + "skip_count": 0.0, + "step": 2694, + "text_loss": 0.8498559594154358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 12.657469914881126, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008889784596779986, + "loss": 0.009, + "macro_f1": 0.5934640765190125, + "num_tokens": 4347507.0, + "repeat_count": 0.0, + "routers_loss": 0.01577926240861416, + "skip_count": 3.0, + "step": 2696, + "text_loss": 0.5646669864654541 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11328125, + "learning_rate": 0.0008887839104800876, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4350414.0, + "repeat_count": 0.0, + "routers_loss": 0.002953822258859873, + "skip_count": 0.0, + "step": 2698, + "text_loss": 0.5145012140274048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008885892122994486, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 4354110.0, + "repeat_count": 0.0, + "routers_loss": 0.005849295295774937, + "skip_count": 0.0, + "step": 2700, + "text_loss": 0.580982506275177 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008883943652106903, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 4357323.0, + "repeat_count": 1.0, + "routers_loss": 0.012347398325800896, + "skip_count": 2.0, + "step": 2702, + "text_loss": 0.2234988808631897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0008881993692884787, + "loss": 0.0128, + "macro_f1": 0.6666666865348816, + "num_tokens": 4360228.0, + "repeat_count": 0.0, + "routers_loss": 0.003574999049305916, + "skip_count": 1.0, + "step": 2704, + "text_loss": 0.4261806607246399 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.704432051658351, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008880042246075365, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4363905.0, + "repeat_count": 0.0, + "routers_loss": 0.0031574300955981016, + "skip_count": 0.0, + "step": 2706, + "text_loss": 0.691118061542511 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008878089312426433, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 4366736.0, + "repeat_count": 0.0, + "routers_loss": 0.003195564029738307, + "skip_count": 0.0, + "step": 2708, + "text_loss": 0.613926112651825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6000000238418579, + "avg_layers": 25.0, + "epoch": 12.72321690636924, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.0, + "f1_skip": 0.75, + "grad_norm": 0.054443359375, + "learning_rate": 0.0008876134892686363, + "loss": 0.011, + "macro_f1": 0.5694444179534912, + "num_tokens": 4370146.0, + "repeat_count": 0.0, + "routers_loss": 0.038784291595220566, + "skip_count": 5.0, + "step": 2710, + "text_loss": 0.2723451852798462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.000887417898760409, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 4373653.0, + "repeat_count": 0.0, + "routers_loss": 0.0006457131239585578, + "skip_count": 0.0, + "step": 2712, + "text_loss": 0.31667640805244446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.742001761080129, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10498046875, + "learning_rate": 0.000887222159792912, + "loss": 0.0155, + "macro_f1": 0.6603773832321167, + "num_tokens": 4376993.0, + "repeat_count": 1.0, + "routers_loss": 0.045078590512275696, + "skip_count": 1.0, + "step": 2714, + "text_loss": 0.5872798562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0008870262724411528, + "loss": 0.012, + "macro_f1": 0.3333333432674408, + "num_tokens": 4380160.0, + "repeat_count": 0.0, + "routers_loss": 0.003628545207902789, + "skip_count": 0.0, + "step": 2716, + "text_loss": 0.7468157410621643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.760786615791018, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0008868302367801962, + "loss": 0.0118, + "macro_f1": 0.6598639488220215, + "num_tokens": 4383100.0, + "repeat_count": 1.0, + "routers_loss": 0.05404464527964592, + "skip_count": 3.0, + "step": 2718, + "text_loss": 0.2970244884490967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008866340528851629, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4386700.0, + "repeat_count": 0.0, + "routers_loss": 0.007000274024903774, + "skip_count": 0.0, + "step": 2720, + "text_loss": 0.34521186351776123 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 12.779571470501908, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052978515625, + "learning_rate": 0.0008864377208312313, + "loss": 0.0082, + "macro_f1": 0.8823530077934265, + "num_tokens": 4390299.0, + "repeat_count": 1.0, + "routers_loss": 0.02025366574525833, + "skip_count": 2.0, + "step": 2722, + "text_loss": 1.0536936521530151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.788963897857352, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.000886241240693636, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 4393353.0, + "repeat_count": 0.0, + "routers_loss": 0.00251673418097198, + "skip_count": 0.0, + "step": 2724, + "text_loss": 0.5678093433380127 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008860446125476686, + "loss": 0.0135, + "macro_f1": 0.6666666865348816, + "num_tokens": 4396446.0, + "repeat_count": 1.0, + "routers_loss": 0.009532532654702663, + "skip_count": 0.0, + "step": 2726, + "text_loss": 0.23775041103363037 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.091796875, + "learning_rate": 0.0008858478364686776, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 4399977.0, + "repeat_count": 1.0, + "routers_loss": 0.008062181062996387, + "skip_count": 0.0, + "step": 2728, + "text_loss": 0.18888695538043976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.817141179923686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008856509125320678, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 4404406.0, + "repeat_count": 0.0, + "routers_loss": 0.0007731119985692203, + "skip_count": 0.0, + "step": 2730, + "text_loss": 0.47331541776657104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008854538408133006, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 4407165.0, + "repeat_count": 0.0, + "routers_loss": 0.003115242812782526, + "skip_count": 1.0, + "step": 2732, + "text_loss": 0.491370290517807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008852566213878947, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4410101.0, + "repeat_count": 0.0, + "routers_loss": 0.0008958528051152825, + "skip_count": 0.0, + "step": 2734, + "text_loss": 0.42188262939453125 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0008850592543314246, + "loss": 0.0118, + "macro_f1": 1.0, + "num_tokens": 4413015.0, + "repeat_count": 1.0, + "routers_loss": 0.01139112375676632, + "skip_count": 1.0, + "step": 2736, + "text_loss": 0.4716498553752899 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.854710889345466, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0008848617397195218, + "loss": 0.0084, + "macro_f1": 0.6603773832321167, + "num_tokens": 4416404.0, + "repeat_count": 1.0, + "routers_loss": 0.01609630137681961, + "skip_count": 1.0, + "step": 2738, + "text_loss": 0.19490821659564972 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008846640776278745, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 4419408.0, + "repeat_count": 0.0, + "routers_loss": 0.001489170710556209, + "skip_count": 0.0, + "step": 2740, + "text_loss": 0.6443108320236206 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0008844662681322269, + "loss": 0.0144, + "macro_f1": 0.6666666865348816, + "num_tokens": 4422067.0, + "repeat_count": 1.0, + "routers_loss": 0.0014755792217329144, + "skip_count": 0.0, + "step": 2742, + "text_loss": 0.9150356650352478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0008842683113083801, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 4425647.0, + "repeat_count": 0.0, + "routers_loss": 0.008962674997746944, + "skip_count": 1.0, + "step": 2744, + "text_loss": 0.7103227972984314 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.892280598767243, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0008840702072321915, + "loss": 0.0104, + "macro_f1": 0.6598639488220215, + "num_tokens": 4428855.0, + "repeat_count": 1.0, + "routers_loss": 0.02554207295179367, + "skip_count": 3.0, + "step": 2746, + "text_loss": 0.27141591906547546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0008838719559795751, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4432838.0, + "repeat_count": 0.0, + "routers_loss": 0.0011747616808861494, + "skip_count": 0.0, + "step": 2748, + "text_loss": 0.4007738530635834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.911065453478134, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008836735576265009, + "loss": 0.0073, + "macro_f1": 0.5492662787437439, + "num_tokens": 4435793.0, + "repeat_count": 0.0, + "routers_loss": 0.017564335837960243, + "skip_count": 2.0, + "step": 2750, + "text_loss": 0.5972410440444946 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0008834750122489956, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 4438871.0, + "repeat_count": 1.0, + "routers_loss": 0.007004009559750557, + "skip_count": 0.0, + "step": 2752, + "text_loss": 0.2294853925704956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008832763199231423, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 4441846.0, + "repeat_count": 0.0, + "routers_loss": 0.0014562139986082911, + "skip_count": 0.0, + "step": 2754, + "text_loss": 0.722432017326355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.939242735544468, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0008830774807250802, + "loss": 0.013, + "macro_f1": 0.3272727429866791, + "num_tokens": 4444786.0, + "repeat_count": 1.0, + "routers_loss": 0.024773593991994858, + "skip_count": 0.0, + "step": 2756, + "text_loss": 0.507905125617981 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 12.948635162899912, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008828784947310049, + "loss": 0.0129, + "macro_f1": 0.8823530077934265, + "num_tokens": 4448442.0, + "repeat_count": 1.0, + "routers_loss": 0.04959975928068161, + "skip_count": 2.0, + "step": 2758, + "text_loss": 0.3617522418498993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.000882679362017168, + "loss": 0.0149, + "macro_f1": 1.0, + "num_tokens": 4451401.0, + "repeat_count": 1.0, + "routers_loss": 0.005783245898783207, + "skip_count": 2.0, + "step": 2760, + "text_loss": 0.49187400937080383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0008824800826598778, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 4454537.0, + "repeat_count": 0.0, + "routers_loss": 0.00656260596588254, + "skip_count": 0.0, + "step": 2762, + "text_loss": 0.6823583245277405 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0008822806567354983, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 4457706.0, + "repeat_count": 1.0, + "routers_loss": 0.005298966076225042, + "skip_count": 0.0, + "step": 2764, + "text_loss": 0.554322361946106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.986204872321691, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008820810843204501, + "loss": 0.0096, + "macro_f1": 0.3272727429866791, + "num_tokens": 4460710.0, + "repeat_count": 0.0, + "routers_loss": 0.03164982795715332, + "skip_count": 1.0, + "step": 2766, + "text_loss": 0.1656961441040039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0008818813654912095, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 4464001.0, + "repeat_count": 0.0, + "routers_loss": 0.000715116853825748, + "skip_count": 0.0, + "step": 2768, + "text_loss": 0.5818144083023071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008816815003243093, + "loss": 0.0133, + "macro_f1": 0.3333333432674408, + "num_tokens": 4467364.0, + "repeat_count": 0.0, + "routers_loss": 0.002851625671610236, + "skip_count": 0.0, + "step": 2770, + "text_loss": 0.6068631410598755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008814814888963383, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 4470681.0, + "repeat_count": 0.0, + "routers_loss": 0.004729873035103083, + "skip_count": 1.0, + "step": 2772, + "text_loss": 0.5386646389961243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.000881281331283941, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 4473734.0, + "repeat_count": 0.0, + "routers_loss": 0.0031853127293288708, + "skip_count": 1.0, + "step": 2774, + "text_loss": 0.5695263147354126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008810810275638182, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4478404.0, + "repeat_count": 0.0, + "routers_loss": 0.0008977465913631022, + "skip_count": 0.0, + "step": 2776, + "text_loss": 0.4750773310661316 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008808805778127269, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4481287.0, + "repeat_count": 0.0, + "routers_loss": 0.00469845999032259, + "skip_count": 0.0, + "step": 2778, + "text_loss": 0.14078612625598907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 13.051658350454945, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008806799821074796, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 4483929.0, + "repeat_count": 0.0, + "routers_loss": 0.01789761893451214, + "skip_count": 2.0, + "step": 2780, + "text_loss": 0.2167191207408905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008804792405249451, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 4487468.0, + "repeat_count": 0.0, + "routers_loss": 0.001018838956952095, + "skip_count": 0.0, + "step": 2782, + "text_loss": 0.5424665212631226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 13.070443205165835, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.07373046875, + "learning_rate": 0.000880278353142048, + "loss": 0.0077, + "macro_f1": 0.8200000524520874, + "num_tokens": 4490942.0, + "repeat_count": 1.0, + "routers_loss": 0.03260354697704315, + "skip_count": 3.0, + "step": 2784, + "text_loss": 0.20994654297828674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008800773200357683, + "loss": 0.0122, + "macro_f1": 0.3333333432674408, + "num_tokens": 4493986.0, + "repeat_count": 0.0, + "routers_loss": 0.003019835101440549, + "skip_count": 0.0, + "step": 2786, + "text_loss": 0.5709528923034668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008798761412831429, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 4498232.0, + "repeat_count": 0.0, + "routers_loss": 0.00285192858427763, + "skip_count": 0.0, + "step": 2788, + "text_loss": 0.5103896260261536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0008796748169612634, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4501231.0, + "repeat_count": 0.0, + "routers_loss": 0.0012469831854104996, + "skip_count": 0.0, + "step": 2790, + "text_loss": 0.43669697642326355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0008794733471472778, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4504208.0, + "repeat_count": 0.0, + "routers_loss": 0.011512776836752892, + "skip_count": 1.0, + "step": 2792, + "text_loss": 0.2299770563840866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008792717319183899, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 4507013.0, + "repeat_count": 0.0, + "routers_loss": 0.00834917277097702, + "skip_count": 0.0, + "step": 2794, + "text_loss": 0.2130603939294815 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008790699713518587, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 4510286.0, + "repeat_count": 0.0, + "routers_loss": 0.008616939187049866, + "skip_count": 2.0, + "step": 2796, + "text_loss": 0.4377101957798004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0008788680655249994, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4513762.0, + "repeat_count": 0.0, + "routers_loss": 0.003408568911254406, + "skip_count": 0.0, + "step": 2798, + "text_loss": 0.435138463973999 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008786660145151826, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4516696.0, + "repeat_count": 1.0, + "routers_loss": 0.0029398901388049126, + "skip_count": 0.0, + "step": 2800, + "text_loss": 0.3195655047893524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008784638183998348, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4519760.0, + "repeat_count": 0.0, + "routers_loss": 0.0013777425047010183, + "skip_count": 0.0, + "step": 2802, + "text_loss": 0.8129430413246155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0008782614772564379, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4522106.0, + "repeat_count": 0.0, + "routers_loss": 0.0031694830395281315, + "skip_count": 0.0, + "step": 2804, + "text_loss": 0.18083660304546356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0008780589911625293, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4525743.0, + "repeat_count": 0.0, + "routers_loss": 0.002161208540201187, + "skip_count": 0.0, + "step": 2806, + "text_loss": 0.8228182792663574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0008778563601957021, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 4529573.0, + "repeat_count": 0.0, + "routers_loss": 0.0028444856870919466, + "skip_count": 1.0, + "step": 2808, + "text_loss": 0.3715563118457794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008776535844336049, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4532452.0, + "repeat_count": 0.0, + "routers_loss": 0.003807213855907321, + "skip_count": 0.0, + "step": 2810, + "text_loss": 0.6012523174285889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0008774506639539417, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 4536077.0, + "repeat_count": 0.0, + "routers_loss": 0.006698979996144772, + "skip_count": 0.0, + "step": 2812, + "text_loss": 0.27097949385643005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0008772475988344722, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 4539057.0, + "repeat_count": 0.0, + "routers_loss": 0.004849409218877554, + "skip_count": 1.0, + "step": 2814, + "text_loss": 1.026973843574524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 13.22072204285295, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008770443891530109, + "loss": 0.0115, + "macro_f1": 0.5934640765190125, + "num_tokens": 4542253.0, + "repeat_count": 0.0, + "routers_loss": 0.019148651510477066, + "skip_count": 3.0, + "step": 2816, + "text_loss": 0.2717585563659668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.230114470208395, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008768410349874286, + "loss": 0.0098, + "macro_f1": 0.6601307392120361, + "num_tokens": 4545047.0, + "repeat_count": 1.0, + "routers_loss": 0.02231316640973091, + "skip_count": 2.0, + "step": 2818, + "text_loss": 0.274346262216568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008766375364156508, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 4548371.0, + "repeat_count": 0.0, + "routers_loss": 0.008014129474759102, + "skip_count": 2.0, + "step": 2820, + "text_loss": 0.22850871086120605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008764338935156586, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4551276.0, + "repeat_count": 0.0, + "routers_loss": 0.0014544493751600385, + "skip_count": 0.0, + "step": 2822, + "text_loss": 0.6308462023735046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000876230106365488, + "loss": 0.0123, + "macro_f1": 0.6666666865348816, + "num_tokens": 4554143.0, + "repeat_count": 0.0, + "routers_loss": 0.00818584579974413, + "skip_count": 3.0, + "step": 2824, + "text_loss": 0.3484207093715668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0008760261750432312, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 4557256.0, + "repeat_count": 0.0, + "routers_loss": 0.006275608204305172, + "skip_count": 3.0, + "step": 2826, + "text_loss": 0.1927330046892166 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008758220996270348, + "loss": 0.0103, + "macro_f1": 1.0, + "num_tokens": 4560202.0, + "repeat_count": 2.0, + "routers_loss": 0.0055974251590669155, + "skip_count": 2.0, + "step": 2828, + "text_loss": 0.7796496748924255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008756178801951007, + "loss": 0.0129, + "macro_f1": 0.3333333432674408, + "num_tokens": 4563508.0, + "repeat_count": 0.0, + "routers_loss": 0.0019799957517534494, + "skip_count": 0.0, + "step": 2830, + "text_loss": 0.49633297324180603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008754135168256865, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4566776.0, + "repeat_count": 0.0, + "routers_loss": 0.004538947716355324, + "skip_count": 0.0, + "step": 2832, + "text_loss": 0.5346745252609253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008752090095971044, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 4569787.0, + "repeat_count": 0.0, + "routers_loss": 0.001663343166001141, + "skip_count": 0.0, + "step": 2834, + "text_loss": 0.5524004697799683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.000875004358587722, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 4572813.0, + "repeat_count": 0.0, + "routers_loss": 0.0022988212294876575, + "skip_count": 0.0, + "step": 2836, + "text_loss": 0.4232870042324066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.000874799563875962, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 4575563.0, + "repeat_count": 0.0, + "routers_loss": 0.007781553082168102, + "skip_count": 1.0, + "step": 2838, + "text_loss": 0.19239822030067444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 13.333431171118287, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03515625, + "learning_rate": 0.0008745946255403021, + "loss": 0.0072, + "macro_f1": 0.5492662787437439, + "num_tokens": 4578117.0, + "repeat_count": 0.0, + "routers_loss": 0.01872488670051098, + "skip_count": 2.0, + "step": 2840, + "text_loss": 0.2148810178041458 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0008743895436592749, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 4582330.0, + "repeat_count": 1.0, + "routers_loss": 0.005634195636957884, + "skip_count": 1.0, + "step": 2842, + "text_loss": 0.4929640591144562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0008741843183114685, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4585765.0, + "repeat_count": 0.0, + "routers_loss": 0.0008928569150157273, + "skip_count": 0.0, + "step": 2844, + "text_loss": 0.32702967524528503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008739789495755253, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4589000.0, + "repeat_count": 0.0, + "routers_loss": 0.014715569093823433, + "skip_count": 4.0, + "step": 2846, + "text_loss": 0.25125816464424133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008737734375301433, + "loss": 0.0135, + "macro_f1": 0.3333333432674408, + "num_tokens": 4592391.0, + "repeat_count": 0.0, + "routers_loss": 0.0017551190685480833, + "skip_count": 0.0, + "step": 2848, + "text_loss": 0.6595172882080078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0008735677822540749, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 4596662.0, + "repeat_count": 0.0, + "routers_loss": 0.0006456313421949744, + "skip_count": 0.0, + "step": 2850, + "text_loss": 0.6290773153305054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0008733619838261276, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 4599682.0, + "repeat_count": 0.0, + "routers_loss": 0.00765060493722558, + "skip_count": 2.0, + "step": 2852, + "text_loss": 0.3268161416053772 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008731560423251637, + "loss": 0.01, + "macro_f1": 1.0, + "num_tokens": 4603324.0, + "repeat_count": 1.0, + "routers_loss": 0.01161442045122385, + "skip_count": 2.0, + "step": 2854, + "text_loss": 0.3029932975769043 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 13.408570589961844, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008729499578301005, + "loss": 0.0098, + "macro_f1": 0.9555556178092957, + "num_tokens": 4606975.0, + "repeat_count": 1.0, + "routers_loss": 0.02055389992892742, + "skip_count": 5.0, + "step": 2856, + "text_loss": 0.6268532872200012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.00087274373041991, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4609629.0, + "repeat_count": 0.0, + "routers_loss": 0.0013911726418882608, + "skip_count": 0.0, + "step": 2858, + "text_loss": 0.534355640411377 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008725373601736188, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 4612913.0, + "repeat_count": 2.0, + "routers_loss": 0.01010701060295105, + "skip_count": 0.0, + "step": 2860, + "text_loss": 0.3391380310058594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0008723308471703085, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4616718.0, + "repeat_count": 0.0, + "routers_loss": 0.005969462916254997, + "skip_count": 1.0, + "step": 2862, + "text_loss": 0.47250816226005554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.446140299383622, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008721241914891152, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4619680.0, + "repeat_count": 0.0, + "routers_loss": 0.0027780034579336643, + "skip_count": 0.0, + "step": 2864, + "text_loss": 0.3249278664588928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008719173932092295, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 4622700.0, + "repeat_count": 0.0, + "routers_loss": 0.0015912104863673449, + "skip_count": 0.0, + "step": 2866, + "text_loss": 0.7789985537528992 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05126953125, + "learning_rate": 0.0008717104524098973, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4626637.0, + "repeat_count": 0.0, + "routers_loss": 0.0036539011634886265, + "skip_count": 0.0, + "step": 2868, + "text_loss": 0.619088351726532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0008715033691704187, + "loss": 0.0118, + "macro_f1": 0.6666666865348816, + "num_tokens": 4629863.0, + "repeat_count": 0.0, + "routers_loss": 0.008402476087212563, + "skip_count": 1.0, + "step": 2870, + "text_loss": 0.5550018548965454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008712961435701479, + "loss": 0.0161, + "macro_f1": 0.6666666865348816, + "num_tokens": 4632657.0, + "repeat_count": 0.0, + "routers_loss": 0.01400839351117611, + "skip_count": 1.0, + "step": 2872, + "text_loss": 0.17368625104427338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008710887756884947, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 4635885.0, + "repeat_count": 0.0, + "routers_loss": 0.0014573842054232955, + "skip_count": 0.0, + "step": 2874, + "text_loss": 0.5138643383979797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008708812656049225, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 4639341.0, + "repeat_count": 0.0, + "routers_loss": 0.002810224425047636, + "skip_count": 1.0, + "step": 2876, + "text_loss": 0.70310378074646 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 13.511887290871735, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008706736133989497, + "loss": 0.0105, + "macro_f1": 0.9449735879898071, + "num_tokens": 4642163.0, + "repeat_count": 2.0, + "routers_loss": 0.029783209785819054, + "skip_count": 4.0, + "step": 2878, + "text_loss": 0.26898008584976196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008704658191501491, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4645858.0, + "repeat_count": 0.0, + "routers_loss": 0.0009193966398015618, + "skip_count": 0.0, + "step": 2880, + "text_loss": 0.6047570705413818 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.530672145582624, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008702578829381475, + "loss": 0.0131, + "macro_f1": 0.8814815282821655, + "num_tokens": 4649237.0, + "repeat_count": 2.0, + "routers_loss": 0.05698608607053757, + "skip_count": 4.0, + "step": 2882, + "text_loss": 0.10695219784975052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0008700498048426269, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4652362.0, + "repeat_count": 0.0, + "routers_loss": 0.0011786938412114978, + "skip_count": 0.0, + "step": 2884, + "text_loss": 0.4442957937717438 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.549457000293513, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008698415849433229, + "loss": 0.0092, + "macro_f1": 0.5492662787437439, + "num_tokens": 4655616.0, + "repeat_count": 2.0, + "routers_loss": 0.02142646163702011, + "skip_count": 0.0, + "step": 2886, + "text_loss": 0.5820964574813843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008696332233200262, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 4659294.0, + "repeat_count": 0.0, + "routers_loss": 0.004038636106997728, + "skip_count": 0.0, + "step": 2888, + "text_loss": 0.11847645789384842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008694247200525806, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4662512.0, + "repeat_count": 0.0, + "routers_loss": 0.0013256469974294305, + "skip_count": 0.0, + "step": 2890, + "text_loss": 0.4873582720756531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.577634282359847, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008692160752208856, + "loss": 0.0129, + "macro_f1": 0.3272727429866791, + "num_tokens": 4666190.0, + "repeat_count": 0.0, + "routers_loss": 0.04477972164750099, + "skip_count": 1.0, + "step": 2892, + "text_loss": 0.44243401288986206 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.0008690072889048941, + "loss": 0.0127, + "macro_f1": 1.0, + "num_tokens": 4668884.0, + "repeat_count": 1.0, + "routers_loss": 0.004407547414302826, + "skip_count": 2.0, + "step": 2894, + "text_loss": 0.6847127079963684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008687983611846133, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4672093.0, + "repeat_count": 0.0, + "routers_loss": 0.005245382897555828, + "skip_count": 1.0, + "step": 2896, + "text_loss": 0.25583332777023315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008685892921401049, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4674917.0, + "repeat_count": 0.0, + "routers_loss": 0.0010470855049788952, + "skip_count": 0.0, + "step": 2898, + "text_loss": 0.41998377442359924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008683800818514844, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4677739.0, + "repeat_count": 0.0, + "routers_loss": 0.009026622399687767, + "skip_count": 2.0, + "step": 2900, + "text_loss": 0.303053081035614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.0008681707303989215, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4680721.0, + "repeat_count": 0.0, + "routers_loss": 0.004500916693359613, + "skip_count": 0.0, + "step": 2902, + "text_loss": 0.5573288798332214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0008679612378626404, + "loss": 0.0098, + "macro_f1": 0.6666666865348816, + "num_tokens": 4683339.0, + "repeat_count": 0.0, + "routers_loss": 0.005047840531915426, + "skip_count": 1.0, + "step": 2904, + "text_loss": 0.321353554725647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.643381273847961, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008677516043229187, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 4686453.0, + "repeat_count": 0.0, + "routers_loss": 0.010256914421916008, + "skip_count": 1.0, + "step": 2906, + "text_loss": 0.4300784468650818 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008675418298600883, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 4689645.0, + "repeat_count": 1.0, + "routers_loss": 0.0022669637110084295, + "skip_count": 0.0, + "step": 2908, + "text_loss": 0.5064885020256042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008673319145545358, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4692320.0, + "repeat_count": 0.0, + "routers_loss": 0.0011188550852239132, + "skip_count": 0.0, + "step": 2910, + "text_loss": 0.7114819884300232 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008671218584867003, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 4695116.0, + "repeat_count": 0.0, + "routers_loss": 0.002966561820358038, + "skip_count": 2.0, + "step": 2912, + "text_loss": 0.5662392973899841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0008669116617370762, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4698040.0, + "repeat_count": 0.0, + "routers_loss": 0.0012894890969619155, + "skip_count": 0.0, + "step": 2914, + "text_loss": 0.718977689743042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0008667013243862111, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 4700963.0, + "repeat_count": 0.0, + "routers_loss": 0.0007232456118799746, + "skip_count": 0.0, + "step": 2916, + "text_loss": 0.3447718024253845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.699735837980628, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.000866490846514707, + "loss": 0.0075, + "macro_f1": 0.3272727429866791, + "num_tokens": 4704471.0, + "repeat_count": 1.0, + "routers_loss": 0.015166680328547955, + "skip_count": 0.0, + "step": 2918, + "text_loss": 0.454946368932724 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.000866280228203219, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 4707238.0, + "repeat_count": 1.0, + "routers_loss": 0.0061312485486269, + "skip_count": 1.0, + "step": 2920, + "text_loss": 0.721788227558136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008660694695324564, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 4711323.0, + "repeat_count": 0.0, + "routers_loss": 0.00169933564029634, + "skip_count": 0.0, + "step": 2922, + "text_loss": 0.7562121748924255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008658585705831829, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 4714417.0, + "repeat_count": 0.0, + "routers_loss": 0.0022731393110007048, + "skip_count": 0.0, + "step": 2924, + "text_loss": 0.5726147890090942 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.737305547402407, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0008656475314362148, + "loss": 0.0131, + "macro_f1": 0.8817967176437378, + "num_tokens": 4717445.0, + "repeat_count": 2.0, + "routers_loss": 0.06477782875299454, + "skip_count": 3.0, + "step": 2926, + "text_loss": 0.4505867660045624 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 13.74669797475785, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.06396484375, + "learning_rate": 0.0008654363521724229, + "loss": 0.0129, + "macro_f1": 0.9449735879898071, + "num_tokens": 4722253.0, + "repeat_count": 2.0, + "routers_loss": 0.027405790984630585, + "skip_count": 4.0, + "step": 2928, + "text_loss": 0.24767601490020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0008652250328727315, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 4725465.0, + "repeat_count": 0.0, + "routers_loss": 0.006544729229062796, + "skip_count": 2.0, + "step": 2930, + "text_loss": 0.4478724002838135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008650135736181184, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 4729213.0, + "repeat_count": 1.0, + "routers_loss": 0.0055119614116847515, + "skip_count": 0.0, + "step": 2932, + "text_loss": 0.6749323010444641 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008648019744896154, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 4732280.0, + "repeat_count": 0.0, + "routers_loss": 0.008374541997909546, + "skip_count": 0.0, + "step": 2934, + "text_loss": 0.4647359251976013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.78426768417963, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0008645902355683077, + "loss": 0.0091, + "macro_f1": 0.6595745086669922, + "num_tokens": 4736244.0, + "repeat_count": 1.0, + "routers_loss": 0.068686343729496, + "skip_count": 4.0, + "step": 2936, + "text_loss": 0.5356017351150513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008643783569353339, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4739810.0, + "repeat_count": 2.0, + "routers_loss": 0.017954571172595024, + "skip_count": 0.0, + "step": 2938, + "text_loss": 0.3145926296710968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.803052538890519, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.0008641663386718863, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4742720.0, + "repeat_count": 0.0, + "routers_loss": 0.006261351052671671, + "skip_count": 1.0, + "step": 2940, + "text_loss": 0.3200613856315613 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008639541808592109, + "loss": 0.0093, + "macro_f1": 1.0, + "num_tokens": 4745870.0, + "repeat_count": 1.0, + "routers_loss": 0.0025341357104480267, + "skip_count": 1.0, + "step": 2942, + "text_loss": 0.5020416378974915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0008637418835786067, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4748943.0, + "repeat_count": 0.0, + "routers_loss": 0.008970048278570175, + "skip_count": 2.0, + "step": 2944, + "text_loss": 0.14517110586166382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008635294469114265, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 4751360.0, + "repeat_count": 0.0, + "routers_loss": 0.002133632078766823, + "skip_count": 0.0, + "step": 2946, + "text_loss": 0.5367856025695801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0008633168709390766, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4754403.0, + "repeat_count": 0.0, + "routers_loss": 0.0011866620043292642, + "skip_count": 0.0, + "step": 2948, + "text_loss": 0.38302522897720337 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0008631041557430163, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 4757867.0, + "repeat_count": 2.0, + "routers_loss": 0.0026854004245251417, + "skip_count": 0.0, + "step": 2950, + "text_loss": 0.43433454632759094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0008628913014047585, + "loss": 0.0102, + "macro_f1": 0.3333333432674408, + "num_tokens": 4761171.0, + "repeat_count": 0.0, + "routers_loss": 0.002433479530736804, + "skip_count": 0.0, + "step": 2952, + "text_loss": 0.4725971519947052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008626783080058696, + "loss": 0.0066, + "macro_f1": 0.3272727429866791, + "num_tokens": 4764752.0, + "repeat_count": 1.0, + "routers_loss": 0.017182493582367897, + "skip_count": 0.0, + "step": 2954, + "text_loss": 0.460641473531723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0008624651756279687, + "loss": 0.0198, + "macro_f1": 0.3333333432674408, + "num_tokens": 4767453.0, + "repeat_count": 0.0, + "routers_loss": 0.0018134774873033166, + "skip_count": 0.0, + "step": 2956, + "text_loss": 0.4091459810733795 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.887584385089522, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.000862251904352729, + "loss": 0.0108, + "macro_f1": 0.9259259104728699, + "num_tokens": 4771110.0, + "repeat_count": 3.0, + "routers_loss": 0.0365753099322319, + "skip_count": 3.0, + "step": 2958, + "text_loss": 0.22408585250377655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.896976812444967, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.000862038494261876, + "loss": 0.0109, + "macro_f1": 0.3272727429866791, + "num_tokens": 4774464.0, + "repeat_count": 0.0, + "routers_loss": 0.024343067780137062, + "skip_count": 1.0, + "step": 2960, + "text_loss": 0.16483014822006226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008618249454371891, + "loss": 0.01, + "macro_f1": 0.3333333432674408, + "num_tokens": 4777894.0, + "repeat_count": 0.0, + "routers_loss": 0.0008310087723657489, + "skip_count": 0.0, + "step": 2962, + "text_loss": 0.5573428869247437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008616112579605006, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4781116.0, + "repeat_count": 0.0, + "routers_loss": 0.0065494864247739315, + "skip_count": 0.0, + "step": 2964, + "text_loss": 0.18816794455051422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0008613974319136957, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 4784886.0, + "repeat_count": 0.0, + "routers_loss": 0.0019726944155991077, + "skip_count": 0.0, + "step": 2966, + "text_loss": 0.5097305774688721 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0008611834673787134, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 4787563.0, + "repeat_count": 0.0, + "routers_loss": 0.006327496841549873, + "skip_count": 0.0, + "step": 2968, + "text_loss": 0.6953814029693604 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.94393894922219, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.5, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0008609693644375449, + "loss": 0.0086, + "macro_f1": 0.8200000524520874, + "num_tokens": 4790421.0, + "repeat_count": 3.0, + "routers_loss": 0.042896661907434464, + "skip_count": 1.0, + "step": 2970, + "text_loss": 0.2573051154613495 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 13.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.14453125, + "learning_rate": 0.000860755123172235, + "loss": 0.0096, + "macro_f1": 1.0, + "num_tokens": 4793786.0, + "repeat_count": 2.0, + "routers_loss": 0.013228793628513813, + "skip_count": 1.0, + "step": 2972, + "text_loss": 0.46614497900009155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008605407436648815, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4796864.0, + "repeat_count": 0.0, + "routers_loss": 0.007294759154319763, + "skip_count": 2.0, + "step": 2974, + "text_loss": 0.21555091440677643 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008603262259976348, + "loss": 0.0129, + "macro_f1": 1.0, + "num_tokens": 4800080.0, + "repeat_count": 1.0, + "routers_loss": 0.0024024227168411016, + "skip_count": 5.0, + "step": 2976, + "text_loss": 0.7855485081672668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0008601115702526987, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4802899.0, + "repeat_count": 0.0, + "routers_loss": 0.001433031284250319, + "skip_count": 0.0, + "step": 2978, + "text_loss": 0.6777765154838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008598967765123293, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 4805835.0, + "repeat_count": 0.0, + "routers_loss": 0.003073975909501314, + "skip_count": 0.0, + "step": 2980, + "text_loss": 0.5926910638809204 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 14.0, + "f1_execute": 0.9333333373069763, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008596818448588364, + "loss": 0.0139, + "macro_f1": 0.8666667342185974, + "num_tokens": 4809028.0, + "repeat_count": 1.0, + "routers_loss": 0.06438573449850082, + "skip_count": 6.0, + "step": 2982, + "text_loss": 0.23975612223148346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.009392427355445, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0008594667753745821, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 4812831.0, + "repeat_count": 0.0, + "routers_loss": 0.014817612245678902, + "skip_count": 1.0, + "step": 2984, + "text_loss": 0.17292268574237823 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.018784854710889, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0008592515681419813, + "loss": 0.0078, + "macro_f1": 0.5492662787437439, + "num_tokens": 4816005.0, + "repeat_count": 2.0, + "routers_loss": 0.025407327339053154, + "skip_count": 0.0, + "step": 2986, + "text_loss": 0.6403061151504517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0008590362232435018, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4818901.0, + "repeat_count": 0.0, + "routers_loss": 0.006826757453382015, + "skip_count": 0.0, + "step": 2988, + "text_loss": 0.2572069466114044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008588207407616644, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 4823120.0, + "repeat_count": 0.0, + "routers_loss": 0.0009054148104041815, + "skip_count": 0.0, + "step": 2990, + "text_loss": 0.4827076196670532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0008586051207790422, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 4825774.0, + "repeat_count": 0.0, + "routers_loss": 0.0012294676853343844, + "skip_count": 0.0, + "step": 2992, + "text_loss": 0.40157821774482727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 14.056354564132668, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052734375, + "learning_rate": 0.0008583893633782612, + "loss": 0.0084, + "macro_f1": 0.5492662787437439, + "num_tokens": 4828841.0, + "repeat_count": 0.0, + "routers_loss": 0.011474622413516045, + "skip_count": 2.0, + "step": 2994, + "text_loss": 0.14842072129249573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.058837890625, + "learning_rate": 0.0008581734686419999, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4831458.0, + "repeat_count": 0.0, + "routers_loss": 0.009154081344604492, + "skip_count": 2.0, + "step": 2996, + "text_loss": 0.365400105714798 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00085795743665299, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4834609.0, + "repeat_count": 0.0, + "routers_loss": 0.002899336162954569, + "skip_count": 0.0, + "step": 2998, + "text_loss": 0.5574684143066406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008577412674940152, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4838324.0, + "repeat_count": 0.0, + "routers_loss": 0.0034664268605411053, + "skip_count": 0.0, + "step": 3000, + "text_loss": 0.6752855777740479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008575249612479117, + "loss": 0.0127, + "macro_f1": 0.6666666865348816, + "num_tokens": 4841877.0, + "repeat_count": 0.0, + "routers_loss": 0.0036425739526748657, + "skip_count": 2.0, + "step": 3002, + "text_loss": 0.6332980394363403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.0008573085179975685, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4845840.0, + "repeat_count": 0.0, + "routers_loss": 0.0013783496106043458, + "skip_count": 0.0, + "step": 3004, + "text_loss": 0.4219617545604706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008570919378259274, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4848766.0, + "repeat_count": 0.0, + "routers_loss": 0.004823608323931694, + "skip_count": 1.0, + "step": 3006, + "text_loss": 0.7987180948257446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000856875220815982, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4852310.0, + "repeat_count": 0.0, + "routers_loss": 0.0014760984340682626, + "skip_count": 0.0, + "step": 3008, + "text_loss": 0.35592713952064514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008566583670507788, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4856146.0, + "repeat_count": 0.0, + "routers_loss": 0.0031717263627797365, + "skip_count": 1.0, + "step": 3010, + "text_loss": 0.19379083812236786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008564413766134164, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 4859386.0, + "repeat_count": 0.0, + "routers_loss": 0.003361492184922099, + "skip_count": 0.0, + "step": 3012, + "text_loss": 0.39129266142845154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0008562242495870463, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4862661.0, + "repeat_count": 0.0, + "routers_loss": 0.0010563990799710155, + "skip_count": 0.0, + "step": 3014, + "text_loss": 0.5966938734054565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0008560069860548716, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 4865410.0, + "repeat_count": 0.0, + "routers_loss": 0.001233913702890277, + "skip_count": 0.0, + "step": 3016, + "text_loss": 0.3386077880859375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008557895861001484, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 4868931.0, + "repeat_count": 0.0, + "routers_loss": 0.0018066301709041, + "skip_count": 0.0, + "step": 3018, + "text_loss": 0.5222050547599792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008555720498061845, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4873492.0, + "repeat_count": 0.0, + "routers_loss": 0.0050385501235723495, + "skip_count": 1.0, + "step": 3020, + "text_loss": 0.4558849334716797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.187848547108894, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008553543772563403, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 4877026.0, + "repeat_count": 0.0, + "routers_loss": 0.004828717093914747, + "skip_count": 0.0, + "step": 3022, + "text_loss": 0.36598992347717285 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 14.197240974464338, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.06103515625, + "learning_rate": 0.0008551365685340285, + "loss": 0.0084, + "macro_f1": 0.9555556178092957, + "num_tokens": 4879655.0, + "repeat_count": 1.0, + "routers_loss": 0.02049369551241398, + "skip_count": 5.0, + "step": 3024, + "text_loss": 0.5069093704223633 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 14.206633401819783, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008549186237227138, + "loss": 0.0088, + "macro_f1": 0.8823530077934265, + "num_tokens": 4882606.0, + "repeat_count": 1.0, + "routers_loss": 0.03947242721915245, + "skip_count": 2.0, + "step": 3026, + "text_loss": 0.2600715458393097 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 14.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0008547005429059128, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 4885246.0, + "repeat_count": 2.0, + "routers_loss": 0.0026363315992057323, + "skip_count": 0.0, + "step": 3028, + "text_loss": 0.37642326951026917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008544823261671948, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 4888109.0, + "repeat_count": 0.0, + "routers_loss": 0.003858231008052826, + "skip_count": 0.0, + "step": 3030, + "text_loss": 0.5875385999679565 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 14.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0008542639735901804, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 4891168.0, + "repeat_count": 1.0, + "routers_loss": 0.004789089784026146, + "skip_count": 1.0, + "step": 3032, + "text_loss": 0.6417325139045715 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.244203111241562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008540454852585434, + "loss": 0.0115, + "macro_f1": 0.6666666865348816, + "num_tokens": 4894355.0, + "repeat_count": 0.0, + "routers_loss": 0.007334680762141943, + "skip_count": 2.0, + "step": 3034, + "text_loss": 0.23697198927402496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 14.253595538597006, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008538268612560084, + "loss": 0.0058, + "macro_f1": 0.4871794879436493, + "num_tokens": 4897543.0, + "repeat_count": 0.0, + "routers_loss": 0.022096361964941025, + "skip_count": 3.0, + "step": 3036, + "text_loss": 0.1989550143480301 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0008536081016663527, + "loss": 0.0101, + "macro_f1": 1.0, + "num_tokens": 4900752.0, + "repeat_count": 1.0, + "routers_loss": 0.0037680594250559807, + "skip_count": 2.0, + "step": 3038, + "text_loss": 0.5001366138458252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008533892065734055, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4903581.0, + "repeat_count": 0.0, + "routers_loss": 0.0032373068388551474, + "skip_count": 1.0, + "step": 3040, + "text_loss": 0.5019411444664001 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008531701760610476, + "loss": 0.0121, + "macro_f1": 1.0, + "num_tokens": 4907108.0, + "repeat_count": 1.0, + "routers_loss": 0.0078013185411691666, + "skip_count": 2.0, + "step": 3042, + "text_loss": 0.3460627794265747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 14.291165248018785, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.04833984375, + "learning_rate": 0.000852951010213212, + "loss": 0.0089, + "macro_f1": 0.8200000524520874, + "num_tokens": 4911269.0, + "repeat_count": 1.0, + "routers_loss": 0.03576689213514328, + "skip_count": 3.0, + "step": 3044, + "text_loss": 0.268994003534317 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 14.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0008527317091138835, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 4914203.0, + "repeat_count": 1.0, + "routers_loss": 0.0032140621915459633, + "skip_count": 1.0, + "step": 3046, + "text_loss": 0.9998719692230225 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008525122728470987, + "loss": 0.0102, + "macro_f1": 1.0, + "num_tokens": 4918562.0, + "repeat_count": 1.0, + "routers_loss": 0.008559177629649639, + "skip_count": 3.0, + "step": 3048, + "text_loss": 0.3062439560890198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0008522927014969459, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 4921940.0, + "repeat_count": 0.0, + "routers_loss": 0.008735597133636475, + "skip_count": 2.0, + "step": 3050, + "text_loss": 0.3637430965900421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0008520729951475652, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 4925416.0, + "repeat_count": 0.0, + "routers_loss": 0.0012709591537714005, + "skip_count": 0.0, + "step": 3052, + "text_loss": 0.542036235332489 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008518531538831488, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4928695.0, + "repeat_count": 0.0, + "routers_loss": 0.0010660928674042225, + "skip_count": 1.0, + "step": 3054, + "text_loss": 0.43144503235816956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.059326171875, + "learning_rate": 0.00085163317778794, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4931504.0, + "repeat_count": 0.0, + "routers_loss": 0.004558971151709557, + "skip_count": 2.0, + "step": 3056, + "text_loss": 0.5257010459899902 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008514130669462341, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4934935.0, + "repeat_count": 0.0, + "routers_loss": 0.010774781927466393, + "skip_count": 2.0, + "step": 3058, + "text_loss": 0.26061776280403137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.366304666862343, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008511928214423782, + "loss": 0.0103, + "macro_f1": 0.6601307392120361, + "num_tokens": 4938047.0, + "repeat_count": 1.0, + "routers_loss": 0.014763157814741135, + "skip_count": 2.0, + "step": 3060, + "text_loss": 0.2856905460357666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0008509724413607705, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 4941041.0, + "repeat_count": 1.0, + "routers_loss": 0.004613345488905907, + "skip_count": 0.0, + "step": 3062, + "text_loss": 0.2870287001132965 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008507519267858612, + "loss": 0.015, + "macro_f1": 1.0, + "num_tokens": 4944708.0, + "repeat_count": 1.0, + "routers_loss": 0.008584189228713512, + "skip_count": 2.0, + "step": 3064, + "text_loss": 0.15828095376491547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0008505312778021519, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 4948295.0, + "repeat_count": 0.0, + "routers_loss": 0.0014670816017314792, + "skip_count": 0.0, + "step": 3066, + "text_loss": 0.36697930097579956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0008503104944941958, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 4951983.0, + "repeat_count": 0.0, + "routers_loss": 0.005348859820514917, + "skip_count": 2.0, + "step": 3068, + "text_loss": 0.21612997353076935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008500895769465972, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 4955023.0, + "repeat_count": 0.0, + "routers_loss": 0.0013203793205320835, + "skip_count": 0.0, + "step": 3070, + "text_loss": 0.9757798314094543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.422659230995011, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008498685252440124, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 4957600.0, + "repeat_count": 0.0, + "routers_loss": 0.006907356437295675, + "skip_count": 0.0, + "step": 3072, + "text_loss": 0.356107234954834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.432051658350455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0008496473394711487, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 4960746.0, + "repeat_count": 0.0, + "routers_loss": 0.0027704904787242413, + "skip_count": 1.0, + "step": 3074, + "text_loss": 0.6812908053398132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0008494260197127649, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 4963845.0, + "repeat_count": 0.0, + "routers_loss": 0.0036796489730477333, + "skip_count": 2.0, + "step": 3076, + "text_loss": 0.7215370535850525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0008492045660536712, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 4966887.0, + "repeat_count": 0.0, + "routers_loss": 0.0037137691397219896, + "skip_count": 1.0, + "step": 3078, + "text_loss": 0.8700299859046936 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 14.460228940416789, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008489829785787291, + "loss": 0.0078, + "macro_f1": 0.8823530077934265, + "num_tokens": 4969859.0, + "repeat_count": 1.0, + "routers_loss": 0.016492314636707306, + "skip_count": 2.0, + "step": 3080, + "text_loss": 0.6520360112190247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008487612573728513, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4972628.0, + "repeat_count": 0.0, + "routers_loss": 0.004022917244583368, + "skip_count": 2.0, + "step": 3082, + "text_loss": 0.17498187720775604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008485394025210016, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 4975475.0, + "repeat_count": 0.0, + "routers_loss": 0.009141159243881702, + "skip_count": 1.0, + "step": 3084, + "text_loss": 0.5975366234779358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008483174141081956, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4978858.0, + "repeat_count": 0.0, + "routers_loss": 0.0031561285723000765, + "skip_count": 0.0, + "step": 3086, + "text_loss": 0.18748866021633148 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008480952922194991, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4982142.0, + "repeat_count": 0.0, + "routers_loss": 0.0007894713780842721, + "skip_count": 0.0, + "step": 3088, + "text_loss": 0.42083197832107544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008478730369400302, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4984872.0, + "repeat_count": 0.0, + "routers_loss": 0.0005908289458602667, + "skip_count": 0.0, + "step": 3090, + "text_loss": 0.45337188243865967 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0008476506483549573, + "loss": 0.0101, + "macro_f1": 1.0, + "num_tokens": 4988137.0, + "repeat_count": 1.0, + "routers_loss": 0.0016509373672306538, + "skip_count": 2.0, + "step": 3092, + "text_loss": 0.6397262811660767 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0008474281265495002, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 4991164.0, + "repeat_count": 0.0, + "routers_loss": 0.004088304936885834, + "skip_count": 1.0, + "step": 3094, + "text_loss": 0.18352322280406952 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008472054716089295, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 4993876.0, + "repeat_count": 0.0, + "routers_loss": 0.005200014915317297, + "skip_count": 0.0, + "step": 3096, + "text_loss": 0.2776511013507843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.544760786615791, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0008469826836185673, + "loss": 0.01, + "macro_f1": 0.6601307392120361, + "num_tokens": 4997068.0, + "repeat_count": 1.0, + "routers_loss": 0.012686059810221195, + "skip_count": 2.0, + "step": 3098, + "text_loss": 0.23209233582019806 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008467597626637858, + "loss": 0.0074, + "macro_f1": 1.0, + "num_tokens": 5000038.0, + "repeat_count": 1.0, + "routers_loss": 0.006401528604328632, + "skip_count": 2.0, + "step": 3100, + "text_loss": 0.45936745405197144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008465367088300093, + "loss": 0.0075, + "macro_f1": 0.3272727429866791, + "num_tokens": 5002870.0, + "repeat_count": 0.0, + "routers_loss": 0.016640547662973404, + "skip_count": 1.0, + "step": 3102, + "text_loss": 0.44502779841423035 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0008463135222027124, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 5006357.0, + "repeat_count": 0.0, + "routers_loss": 0.008411331102252007, + "skip_count": 2.0, + "step": 3104, + "text_loss": 0.3414570391178131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008460902028674204, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5009059.0, + "repeat_count": 0.0, + "routers_loss": 0.0010406570509076118, + "skip_count": 0.0, + "step": 3106, + "text_loss": 0.5931221842765808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0008458667509097098, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 5012327.0, + "repeat_count": 0.0, + "routers_loss": 0.001959054498001933, + "skip_count": 0.0, + "step": 3108, + "text_loss": 0.5191171169281006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008456431664152078, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 5015472.0, + "repeat_count": 0.0, + "routers_loss": 0.000994380097836256, + "skip_count": 0.0, + "step": 3110, + "text_loss": 0.4455361068248749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0008454194494695923, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 5018901.0, + "repeat_count": 0.0, + "routers_loss": 0.0037662344984710217, + "skip_count": 0.0, + "step": 3112, + "text_loss": 0.5335362553596497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 14.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0008451956001585923, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5022520.0, + "repeat_count": 0.0, + "routers_loss": 0.008664715103805065, + "skip_count": 3.0, + "step": 3114, + "text_loss": 0.16230148077011108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.629292632814794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.000844971618567987, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 5025505.0, + "repeat_count": 0.0, + "routers_loss": 0.0015904927859082818, + "skip_count": 0.0, + "step": 3116, + "text_loss": 0.6989432573318481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008447475047836068, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 5028767.0, + "repeat_count": 0.0, + "routers_loss": 0.005853322334587574, + "skip_count": 1.0, + "step": 3118, + "text_loss": 0.31420737504959106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 14.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008445232588913325, + "loss": 0.0115, + "macro_f1": 0.3272727429866791, + "num_tokens": 5032577.0, + "repeat_count": 0.0, + "routers_loss": 0.012760105542838573, + "skip_count": 0.0, + "step": 3120, + "text_loss": 0.5534627437591553 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008442988809770953, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 5035381.0, + "repeat_count": 0.0, + "routers_loss": 0.0022257440723478794, + "skip_count": 0.0, + "step": 3122, + "text_loss": 0.42492759227752686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008440743711268775, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 5038743.0, + "repeat_count": 0.0, + "routers_loss": 0.004648433532565832, + "skip_count": 0.0, + "step": 3124, + "text_loss": 0.16404685378074646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008438497294267117, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5041492.0, + "repeat_count": 0.0, + "routers_loss": 0.006313877180218697, + "skip_count": 0.0, + "step": 3126, + "text_loss": 0.23191484808921814 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0008436249559626807, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 5043955.0, + "repeat_count": 1.0, + "routers_loss": 0.0036270488053560257, + "skip_count": 0.0, + "step": 3128, + "text_loss": 0.5782018303871155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008434000508209187, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5047571.0, + "repeat_count": 0.0, + "routers_loss": 0.003809858812019229, + "skip_count": 1.0, + "step": 3130, + "text_loss": 0.7129825949668884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.704432051658351, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008431750140876092, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 5051608.0, + "repeat_count": 0.0, + "routers_loss": 0.0022369057405740023, + "skip_count": 0.0, + "step": 3132, + "text_loss": 0.4433445930480957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.000842949845848987, + "loss": 0.0135, + "macro_f1": 0.32098764181137085, + "num_tokens": 5054656.0, + "repeat_count": 0.0, + "routers_loss": 0.0425117202103138, + "skip_count": 2.0, + "step": 3134, + "text_loss": 0.38721024990081787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0008427245461913368, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 5059108.0, + "repeat_count": 0.0, + "routers_loss": 0.0018077283166348934, + "skip_count": 0.0, + "step": 3136, + "text_loss": 0.7496368885040283 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.12109375, + "learning_rate": 0.0008424991152009941, + "loss": 0.0111, + "macro_f1": 1.0, + "num_tokens": 5062371.0, + "repeat_count": 1.0, + "routers_loss": 0.008801834657788277, + "skip_count": 2.0, + "step": 3138, + "text_loss": 0.5337086319923401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 14.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0008422735529643444, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5065593.0, + "repeat_count": 0.0, + "routers_loss": 0.00548676960170269, + "skip_count": 3.0, + "step": 3140, + "text_loss": 0.2561623156070709 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0008420478595678233, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5068271.0, + "repeat_count": 0.0, + "routers_loss": 0.006389956455677748, + "skip_count": 0.0, + "step": 3142, + "text_loss": 0.15605193376541138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0008418220350979175, + "loss": 0.0128, + "macro_f1": 1.0, + "num_tokens": 5071358.0, + "repeat_count": 1.0, + "routers_loss": 0.012387622147798538, + "skip_count": 2.0, + "step": 3144, + "text_loss": 0.3085838258266449 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008415960796411628, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5075584.0, + "repeat_count": 0.0, + "routers_loss": 0.00311864772811532, + "skip_count": 1.0, + "step": 3146, + "text_loss": 0.4786977469921112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.779571470501908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0008413699932841461, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5078388.0, + "repeat_count": 0.0, + "routers_loss": 0.0030679800547659397, + "skip_count": 0.0, + "step": 3148, + "text_loss": 0.5222916603088379 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.788963897857352, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008411437761135039, + "loss": 0.011, + "macro_f1": 1.0, + "num_tokens": 5081584.0, + "repeat_count": 1.0, + "routers_loss": 0.012907958589494228, + "skip_count": 2.0, + "step": 3150, + "text_loss": 0.5369884371757507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0008409174282159232, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 5084450.0, + "repeat_count": 0.0, + "routers_loss": 0.012314042076468468, + "skip_count": 2.0, + "step": 3152, + "text_loss": 0.25685277581214905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.000840690949678141, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 5087865.0, + "repeat_count": 1.0, + "routers_loss": 0.00899206381291151, + "skip_count": 0.0, + "step": 3154, + "text_loss": 0.1717093288898468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.817141179923686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0008404643405869441, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 5090857.0, + "repeat_count": 0.0, + "routers_loss": 0.0013312003575265408, + "skip_count": 0.0, + "step": 3156, + "text_loss": 0.27446436882019043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0008402376010291695, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 5093917.0, + "repeat_count": 0.0, + "routers_loss": 0.002653320087119937, + "skip_count": 0.0, + "step": 3158, + "text_loss": 0.4237489402294159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008400107310917045, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5096656.0, + "repeat_count": 0.0, + "routers_loss": 0.012976993806660175, + "skip_count": 2.0, + "step": 3160, + "text_loss": 0.42361980676651 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.000839783730861486, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5099582.0, + "repeat_count": 0.0, + "routers_loss": 0.006936746649444103, + "skip_count": 2.0, + "step": 3162, + "text_loss": 0.26656073331832886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008395566004255008, + "loss": 0.0127, + "macro_f1": 0.6666666865348816, + "num_tokens": 5102908.0, + "repeat_count": 0.0, + "routers_loss": 0.006619359832257032, + "skip_count": 1.0, + "step": 3164, + "text_loss": 0.590774416923523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0008393293398707858, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 5105829.0, + "repeat_count": 0.0, + "routers_loss": 0.010120268911123276, + "skip_count": 2.0, + "step": 3166, + "text_loss": 0.605930507183075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008391019492844275, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 5109850.0, + "repeat_count": 0.0, + "routers_loss": 0.004940980114042759, + "skip_count": 2.0, + "step": 3168, + "text_loss": 0.12973152101039886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0008388744287535627, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 5113353.0, + "repeat_count": 0.0, + "routers_loss": 0.0031777634285390377, + "skip_count": 1.0, + "step": 3170, + "text_loss": 0.18577200174331665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0008386467783653775, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 5116421.0, + "repeat_count": 0.0, + "routers_loss": 0.005431659985333681, + "skip_count": 0.0, + "step": 3172, + "text_loss": 0.2302747517824173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 14.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.000838418998207108, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5119457.0, + "repeat_count": 0.0, + "routers_loss": 0.0077286697924137115, + "skip_count": 4.0, + "step": 3174, + "text_loss": 0.19606637954711914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0008381910883660399, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5123201.0, + "repeat_count": 0.0, + "routers_loss": 0.003982985392212868, + "skip_count": 0.0, + "step": 3176, + "text_loss": 0.716376006603241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0008379630489295089, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 5126035.0, + "repeat_count": 0.0, + "routers_loss": 0.005626026075333357, + "skip_count": 1.0, + "step": 3178, + "text_loss": 0.5144625902175903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008377348799849, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 5129179.0, + "repeat_count": 0.0, + "routers_loss": 0.015458245761692524, + "skip_count": 2.0, + "step": 3180, + "text_loss": 0.29887503385543823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 14.939242735544468, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008375065816196479, + "loss": 0.0086, + "macro_f1": 0.5492662787437439, + "num_tokens": 5132149.0, + "repeat_count": 0.0, + "routers_loss": 0.012210468761622906, + "skip_count": 2.0, + "step": 3182, + "text_loss": 0.8981851935386658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008372781539212371, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5135287.0, + "repeat_count": 0.0, + "routers_loss": 0.0052537876181304455, + "skip_count": 0.0, + "step": 3184, + "text_loss": 0.4245666563510895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0008370495969772014, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5138589.0, + "repeat_count": 0.0, + "routers_loss": 0.012873421423137188, + "skip_count": 2.0, + "step": 3186, + "text_loss": 0.40581050515174866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 14.9674200176108, + "f1_execute": 0.95652174949646, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0008368209108751244, + "loss": 0.0127, + "macro_f1": 0.6521739363670349, + "num_tokens": 5141635.0, + "repeat_count": 2.0, + "routers_loss": 0.07720445841550827, + "skip_count": 4.0, + "step": 3188, + "text_loss": 0.3755173981189728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0008365920957026389, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5144728.0, + "repeat_count": 0.0, + "routers_loss": 0.001440995605662465, + "skip_count": 0.0, + "step": 3190, + "text_loss": 0.5067034363746643 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.986204872321691, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008363631515474275, + "loss": 0.0089, + "macro_f1": 0.6538461446762085, + "num_tokens": 5147963.0, + "repeat_count": 1.0, + "routers_loss": 0.018752984702587128, + "skip_count": 2.0, + "step": 3192, + "text_loss": 0.20224551856517792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0008361340784972217, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 5151184.0, + "repeat_count": 0.0, + "routers_loss": 0.0005360354552976787, + "skip_count": 0.0, + "step": 3194, + "text_loss": 0.4588058292865753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008359048766398031, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5153889.0, + "repeat_count": 0.0, + "routers_loss": 0.0009184491937048733, + "skip_count": 1.0, + "step": 3196, + "text_loss": 0.2980220317840576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.000835675546063002, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5156758.0, + "repeat_count": 0.0, + "routers_loss": 0.001252970308996737, + "skip_count": 0.0, + "step": 3198, + "text_loss": 0.6775755882263184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008354460868546985, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 5160247.0, + "repeat_count": 0.0, + "routers_loss": 0.0037315806839615107, + "skip_count": 0.0, + "step": 3200, + "text_loss": 0.35867011547088623 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0008352164991028217, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 5163456.0, + "repeat_count": 1.0, + "routers_loss": 0.001497485558502376, + "skip_count": 0.0, + "step": 3202, + "text_loss": 0.690290093421936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008349867828953501, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 5166139.0, + "repeat_count": 0.0, + "routers_loss": 0.001051135826855898, + "skip_count": 0.0, + "step": 3204, + "text_loss": 0.3340415954589844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008347569383203113, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 5169009.0, + "repeat_count": 0.0, + "routers_loss": 0.0010544003453105688, + "skip_count": 0.0, + "step": 3206, + "text_loss": 0.8584878444671631 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008345269654657823, + "loss": 0.0085, + "macro_f1": 1.0, + "num_tokens": 5172618.0, + "repeat_count": 1.0, + "routers_loss": 0.007312417030334473, + "skip_count": 1.0, + "step": 3208, + "text_loss": 0.19500218331813812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008342968644198892, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 5175857.0, + "repeat_count": 0.0, + "routers_loss": 0.00276504410430789, + "skip_count": 0.0, + "step": 3210, + "text_loss": 0.5446314215660095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0008340666352708068, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 5178585.0, + "repeat_count": 0.0, + "routers_loss": 0.002669303445145488, + "skip_count": 0.0, + "step": 3212, + "text_loss": 0.3687484860420227 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008338362781067596, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5181777.0, + "repeat_count": 0.0, + "routers_loss": 0.0031585274264216423, + "skip_count": 0.0, + "step": 3214, + "text_loss": 0.27325859665870667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.000833605793016021, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 5184312.0, + "repeat_count": 0.0, + "routers_loss": 0.008807534351944923, + "skip_count": 2.0, + "step": 3216, + "text_loss": 0.4466548562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008333751800869133, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5187497.0, + "repeat_count": 0.0, + "routers_loss": 0.003171310294419527, + "skip_count": 0.0, + "step": 3218, + "text_loss": 0.5423526763916016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0008331444394078076, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 5190982.0, + "repeat_count": 0.0, + "routers_loss": 0.0016481258207932115, + "skip_count": 2.0, + "step": 3220, + "text_loss": 0.48984917998313904 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000832913571067124, + "loss": 0.0107, + "macro_f1": 1.0, + "num_tokens": 5194044.0, + "repeat_count": 1.0, + "routers_loss": 0.003957313951104879, + "skip_count": 1.0, + "step": 3222, + "text_loss": 0.4533331096172333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008326825751533322, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5197092.0, + "repeat_count": 0.0, + "routers_loss": 0.0016904744552448392, + "skip_count": 0.0, + "step": 3224, + "text_loss": 0.5538802742958069 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0008324514517549501, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5199941.0, + "repeat_count": 0.0, + "routers_loss": 0.005608258303254843, + "skip_count": 1.0, + "step": 3226, + "text_loss": 0.416242778301239 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 15.154975051364836, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008322202009605444, + "loss": 0.0072, + "macro_f1": 0.8823530077934265, + "num_tokens": 5202618.0, + "repeat_count": 1.0, + "routers_loss": 0.020965175703167915, + "skip_count": 2.0, + "step": 3228, + "text_loss": 0.17496295273303986 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 15.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008319888228587311, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 5206414.0, + "repeat_count": 1.0, + "routers_loss": 0.021259209141135216, + "skip_count": 5.0, + "step": 3230, + "text_loss": 0.22471418976783752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0008317573175381745, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 5209768.0, + "repeat_count": 0.0, + "routers_loss": 0.0018647604156285524, + "skip_count": 0.0, + "step": 3232, + "text_loss": 0.4415269196033478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008315256850875881, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5213257.0, + "repeat_count": 0.0, + "routers_loss": 0.002345515415072441, + "skip_count": 0.0, + "step": 3234, + "text_loss": 0.347247838973999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 15.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008312939255957336, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 5215800.0, + "repeat_count": 0.0, + "routers_loss": 0.007112892810255289, + "skip_count": 3.0, + "step": 3236, + "text_loss": 0.31091734766960144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008310620391514219, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5219205.0, + "repeat_count": 0.0, + "routers_loss": 0.00432228296995163, + "skip_count": 0.0, + "step": 3238, + "text_loss": 0.3421775996685028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0008308300258435124, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 5222422.0, + "repeat_count": 0.0, + "routers_loss": 0.0076514314860105515, + "skip_count": 2.0, + "step": 3240, + "text_loss": 0.22378318011760712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008305978857609128, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 5225625.0, + "repeat_count": 0.0, + "routers_loss": 0.0007617069641128182, + "skip_count": 0.0, + "step": 3242, + "text_loss": 0.5880323648452759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0008303656189925799, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5229113.0, + "repeat_count": 0.0, + "routers_loss": 0.0017418119823560119, + "skip_count": 0.0, + "step": 3244, + "text_loss": 0.3302813768386841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008301332256275183, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5232061.0, + "repeat_count": 0.0, + "routers_loss": 0.0026667986530810595, + "skip_count": 0.0, + "step": 3246, + "text_loss": 0.5679706335067749 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0008299007057547821, + "loss": 0.0106, + "macro_f1": 1.0, + "num_tokens": 5235279.0, + "repeat_count": 1.0, + "routers_loss": 0.011016624979674816, + "skip_count": 2.0, + "step": 3248, + "text_loss": 0.5081504583358765 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008296680594634731, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 5239655.0, + "repeat_count": 1.0, + "routers_loss": 0.005492044147104025, + "skip_count": 0.0, + "step": 3250, + "text_loss": 0.14675180613994598 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0008294352868427418, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5243579.0, + "repeat_count": 0.0, + "routers_loss": 0.00404445780441165, + "skip_count": 1.0, + "step": 3252, + "text_loss": 0.4201085865497589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0008292023879817871, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 5247059.0, + "repeat_count": 0.0, + "routers_loss": 0.006886140909045935, + "skip_count": 1.0, + "step": 3254, + "text_loss": 0.2289208322763443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008289693629698564, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 5249940.0, + "repeat_count": 0.0, + "routers_loss": 0.0005736657767556608, + "skip_count": 0.0, + "step": 3256, + "text_loss": 0.5670450925827026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.295861461696507, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0008287362118962452, + "loss": 0.006, + "macro_f1": 0.3272727429866791, + "num_tokens": 5253580.0, + "repeat_count": 0.0, + "routers_loss": 0.011349895037710667, + "skip_count": 1.0, + "step": 3258, + "text_loss": 0.5042323470115662 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0008285029348502973, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5257080.0, + "repeat_count": 0.0, + "routers_loss": 0.0013626761501654983, + "skip_count": 0.0, + "step": 3260, + "text_loss": 0.3227672874927521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0008282695319214053, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5259951.0, + "repeat_count": 0.0, + "routers_loss": 0.00471635302528739, + "skip_count": 0.0, + "step": 3262, + "text_loss": 0.20773714780807495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008280360031990093, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 5263314.0, + "repeat_count": 0.0, + "routers_loss": 0.010472415015101433, + "skip_count": 2.0, + "step": 3264, + "text_loss": 0.34397366642951965 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.333431171118287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.000827802348772598, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 5267358.0, + "repeat_count": 0.0, + "routers_loss": 0.0007814752752892673, + "skip_count": 0.0, + "step": 3266, + "text_loss": 0.747342586517334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008275685687317084, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5270400.0, + "repeat_count": 0.0, + "routers_loss": 0.000902949133887887, + "skip_count": 0.0, + "step": 3268, + "text_loss": 0.43782034516334534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008273346631659252, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5273147.0, + "repeat_count": 0.0, + "routers_loss": 0.00043462219764478505, + "skip_count": 0.0, + "step": 3270, + "text_loss": 0.6358205080032349 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008271006321648816, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 5277638.0, + "repeat_count": 0.0, + "routers_loss": 0.002211218234151602, + "skip_count": 0.0, + "step": 3272, + "text_loss": 0.20220105350017548 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008268664758182589, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5280638.0, + "repeat_count": 1.0, + "routers_loss": 0.010536720044910908, + "skip_count": 0.0, + "step": 3274, + "text_loss": 0.7579061388969421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008266321942157859, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5283847.0, + "repeat_count": 0.0, + "routers_loss": 0.0017158017726615071, + "skip_count": 0.0, + "step": 3276, + "text_loss": 0.669302761554718 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.389785735250953, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008263977874472399, + "loss": 0.0088, + "macro_f1": 0.9544159770011902, + "num_tokens": 5286627.0, + "repeat_count": 5.0, + "routers_loss": 0.011220700107514858, + "skip_count": 4.0, + "step": 3278, + "text_loss": 0.8703984022140503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008261632556024461, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5289766.0, + "repeat_count": 0.0, + "routers_loss": 0.0020442772656679153, + "skip_count": 0.0, + "step": 3280, + "text_loss": 0.5009346008300781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.0008259285987712774, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5293010.0, + "repeat_count": 0.0, + "routers_loss": 0.005645765457302332, + "skip_count": 0.0, + "step": 3282, + "text_loss": 0.2546011209487915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008256938170436549, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 5296732.0, + "repeat_count": 0.0, + "routers_loss": 0.0027385836001485586, + "skip_count": 2.0, + "step": 3284, + "text_loss": 0.5244000554084778 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008254589105095473, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 5299926.0, + "repeat_count": 1.0, + "routers_loss": 0.007451715879142284, + "skip_count": 1.0, + "step": 3286, + "text_loss": 0.28979742527008057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0008252238792589711, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5303006.0, + "repeat_count": 0.0, + "routers_loss": 0.004805843345820904, + "skip_count": 2.0, + "step": 3288, + "text_loss": 0.5131978392601013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000824988723381991, + "loss": 0.0091, + "macro_f1": 0.3272727429866791, + "num_tokens": 5306953.0, + "repeat_count": 0.0, + "routers_loss": 0.010639613494277, + "skip_count": 1.0, + "step": 3290, + "text_loss": 0.4901447296142578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 15.455532726739067, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008247534429687191, + "loss": 0.007, + "macro_f1": 0.5492662787437439, + "num_tokens": 5310516.0, + "repeat_count": 0.0, + "routers_loss": 0.013625577092170715, + "skip_count": 2.0, + "step": 3292, + "text_loss": 0.2124534696340561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008245180381093152, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 5313959.0, + "repeat_count": 0.0, + "routers_loss": 0.004958513658493757, + "skip_count": 1.0, + "step": 3294, + "text_loss": 0.46682238578796387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008242825088939867, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5316609.0, + "repeat_count": 0.0, + "routers_loss": 0.003962756600230932, + "skip_count": 0.0, + "step": 3296, + "text_loss": 0.7010108232498169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008240468554129892, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5319638.0, + "repeat_count": 0.0, + "routers_loss": 0.0006996620795689523, + "skip_count": 0.0, + "step": 3298, + "text_loss": 0.4966355860233307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0008238110777566255, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 5323019.0, + "repeat_count": 0.0, + "routers_loss": 0.0016031896229833364, + "skip_count": 0.0, + "step": 3300, + "text_loss": 0.38668957352638245 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0008235751760152459, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 5326099.0, + "repeat_count": 2.0, + "routers_loss": 0.00344281829893589, + "skip_count": 2.0, + "step": 3302, + "text_loss": 0.5330720543861389 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008233391502792484, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5328993.0, + "repeat_count": 0.0, + "routers_loss": 0.007886730134487152, + "skip_count": 1.0, + "step": 3304, + "text_loss": 0.5470269322395325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008231030006390786, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 5331554.0, + "repeat_count": 0.0, + "routers_loss": 0.008180000819265842, + "skip_count": 1.0, + "step": 3306, + "text_loss": 0.4023340344429016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0008228667271852294, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 5335712.0, + "repeat_count": 0.0, + "routers_loss": 0.0002942821884062141, + "skip_count": 0.0, + "step": 3308, + "text_loss": 0.5306711792945862 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008226303300082414, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5338701.0, + "repeat_count": 0.0, + "routers_loss": 0.0006134595023468137, + "skip_count": 0.0, + "step": 3310, + "text_loss": 0.5906263589859009 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0008223938091987022, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5342274.0, + "repeat_count": 0.0, + "routers_loss": 0.0016656654188409448, + "skip_count": 0.0, + "step": 3312, + "text_loss": 0.5201764106750488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008221571648472472, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5345185.0, + "repeat_count": 0.0, + "routers_loss": 0.0038612703792750835, + "skip_count": 0.0, + "step": 3314, + "text_loss": 0.36633720993995667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.568241855004402, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008219203970445589, + "loss": 0.011, + "macro_f1": 0.3272727429866791, + "num_tokens": 5348804.0, + "repeat_count": 0.0, + "routers_loss": 0.009782899171113968, + "skip_count": 1.0, + "step": 3316, + "text_loss": 0.3117460012435913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.577634282359847, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008216835058813672, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 5351896.0, + "repeat_count": 0.0, + "routers_loss": 0.007713229861110449, + "skip_count": 0.0, + "step": 3318, + "text_loss": 0.253496378660202 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008214464914484492, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 5355058.0, + "repeat_count": 0.0, + "routers_loss": 0.006227815989404917, + "skip_count": 2.0, + "step": 3320, + "text_loss": 0.32693132758140564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008212093538366292, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 5358365.0, + "repeat_count": 0.0, + "routers_loss": 0.002601418411359191, + "skip_count": 0.0, + "step": 3322, + "text_loss": 0.40394455194473267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 15.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000820972093136779, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5360981.0, + "repeat_count": 0.0, + "routers_loss": 0.005545300897210836, + "skip_count": 3.0, + "step": 3324, + "text_loss": 0.6758295893669128 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0008207347094398172, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 5364018.0, + "repeat_count": 1.0, + "routers_loss": 0.001924700103700161, + "skip_count": 0.0, + "step": 3326, + "text_loss": 0.5196860432624817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0008204972028367097, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5366986.0, + "repeat_count": 0.0, + "routers_loss": 0.012254828587174416, + "skip_count": 1.0, + "step": 3328, + "text_loss": 0.24661913514137268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0008202595734184694, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5371463.0, + "repeat_count": 0.0, + "routers_loss": 0.005094083491712809, + "skip_count": 0.0, + "step": 3330, + "text_loss": 0.2525769770145416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.643381273847961, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008200218212761566, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 5374823.0, + "repeat_count": 1.0, + "routers_loss": 0.0025883198250085115, + "skip_count": 0.0, + "step": 3332, + "text_loss": 0.21849912405014038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.000819783946500878, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5377640.0, + "repeat_count": 0.0, + "routers_loss": 0.008240507915616035, + "skip_count": 0.0, + "step": 3334, + "text_loss": 0.2662734091281891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 15.66216612855885, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.050537109375, + "learning_rate": 0.000819545949183788, + "loss": 0.01, + "macro_f1": 0.5934640765190125, + "num_tokens": 5380593.0, + "repeat_count": 0.0, + "routers_loss": 0.038378193974494934, + "skip_count": 3.0, + "step": 3336, + "text_loss": 0.2431795746088028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008193078294160874, + "loss": 0.0097, + "macro_f1": 1.0, + "num_tokens": 5384487.0, + "repeat_count": 1.0, + "routers_loss": 0.005926199723035097, + "skip_count": 1.0, + "step": 3338, + "text_loss": 0.5663705468177795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0008190695872890242, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5387511.0, + "repeat_count": 0.0, + "routers_loss": 0.010842559859156609, + "skip_count": 2.0, + "step": 3340, + "text_loss": 0.11517292261123657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008188312228938933, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 5390698.0, + "repeat_count": 0.0, + "routers_loss": 0.001304097007960081, + "skip_count": 0.0, + "step": 3342, + "text_loss": 0.4827076196670532 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008185927363220363, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5393778.0, + "repeat_count": 1.0, + "routers_loss": 0.005354117136448622, + "skip_count": 0.0, + "step": 3344, + "text_loss": 0.44467049837112427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008183541276648418, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5396925.0, + "repeat_count": 0.0, + "routers_loss": 0.004800073802471161, + "skip_count": 2.0, + "step": 3346, + "text_loss": 0.2032834142446518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0008181153970137449, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 5400522.0, + "repeat_count": 0.0, + "routers_loss": 0.0021674633026123047, + "skip_count": 0.0, + "step": 3348, + "text_loss": 0.4507528841495514 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.727913120046962, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0008178765444602278, + "loss": 0.0117, + "macro_f1": 0.8820862174034119, + "num_tokens": 5403526.0, + "repeat_count": 2.0, + "routers_loss": 0.04263930395245552, + "skip_count": 2.0, + "step": 3350, + "text_loss": 0.3606615960597992 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008176375700958194, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5407127.0, + "repeat_count": 1.0, + "routers_loss": 0.006953123956918716, + "skip_count": 0.0, + "step": 3352, + "text_loss": 0.2290353775024414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008173984740120948, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 5410829.0, + "repeat_count": 0.0, + "routers_loss": 0.0014363783411681652, + "skip_count": 0.0, + "step": 3354, + "text_loss": 0.4220392405986786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0008171592563006762, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5414152.0, + "repeat_count": 0.0, + "routers_loss": 0.00202389364130795, + "skip_count": 1.0, + "step": 3356, + "text_loss": 0.37729766964912415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008169199170532323, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 5417312.0, + "repeat_count": 0.0, + "routers_loss": 0.006253739818930626, + "skip_count": 2.0, + "step": 3358, + "text_loss": 0.1304289996623993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0008166804563614785, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 5421227.0, + "repeat_count": 2.0, + "routers_loss": 0.01622140221297741, + "skip_count": 2.0, + "step": 3360, + "text_loss": 0.298664391040802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0008164408743171763, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 5424646.0, + "repeat_count": 1.0, + "routers_loss": 0.0037176944315433502, + "skip_count": 2.0, + "step": 3362, + "text_loss": 0.12147632241249084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008162011710121339, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 5427897.0, + "repeat_count": 0.0, + "routers_loss": 0.0020403533708304167, + "skip_count": 1.0, + "step": 3364, + "text_loss": 0.2656533420085907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.803052538890519, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008159613465382066, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5430474.0, + "repeat_count": 0.0, + "routers_loss": 0.0018634048756211996, + "skip_count": 0.0, + "step": 3366, + "text_loss": 0.9133086204528809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0008157214009872951, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5433113.0, + "repeat_count": 0.0, + "routers_loss": 0.012944488786160946, + "skip_count": 2.0, + "step": 3368, + "text_loss": 0.24352453649044037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05712890625, + "learning_rate": 0.0008154813344513472, + "loss": 0.0143, + "macro_f1": 0.6666666865348816, + "num_tokens": 5436259.0, + "repeat_count": 0.0, + "routers_loss": 0.002347963862121105, + "skip_count": 2.0, + "step": 3370, + "text_loss": 0.7601244449615479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0008152411470223568, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 5439126.0, + "repeat_count": 0.0, + "routers_loss": 0.0016609140438959002, + "skip_count": 0.0, + "step": 3372, + "text_loss": 0.5551947355270386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008150008387923643, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 5442739.0, + "repeat_count": 0.0, + "routers_loss": 0.008321396075189114, + "skip_count": 0.0, + "step": 3374, + "text_loss": 0.25028282403945923 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 15.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.000814760409853456, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 5445247.0, + "repeat_count": 2.0, + "routers_loss": 0.009738070890307426, + "skip_count": 1.0, + "step": 3376, + "text_loss": 0.37271201610565186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008145198602977651, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5449044.0, + "repeat_count": 0.0, + "routers_loss": 0.0028421466704458, + "skip_count": 0.0, + "step": 3378, + "text_loss": 0.1458655595779419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.868799530378633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0008142791902174701, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 5453063.0, + "repeat_count": 0.0, + "routers_loss": 0.0015170135302469134, + "skip_count": 0.0, + "step": 3380, + "text_loss": 0.5548722743988037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0008140383997047966, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 5455814.0, + "repeat_count": 0.0, + "routers_loss": 0.0022444510832428932, + "skip_count": 1.0, + "step": 3382, + "text_loss": 0.8034513592720032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000813797488852016, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5459392.0, + "repeat_count": 0.0, + "routers_loss": 0.00038578867679461837, + "skip_count": 0.0, + "step": 3384, + "text_loss": 0.6940088868141174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.896976812444967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008135564577514458, + "loss": 0.011, + "macro_f1": 0.3333333432674408, + "num_tokens": 5462413.0, + "repeat_count": 0.0, + "routers_loss": 0.0019727381877601147, + "skip_count": 0.0, + "step": 3386, + "text_loss": 0.5124650597572327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0008133153064954495, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 5465552.0, + "repeat_count": 0.0, + "routers_loss": 0.0019896167796105146, + "skip_count": 0.0, + "step": 3388, + "text_loss": 0.4292517900466919 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008130740351764367, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 5468573.0, + "repeat_count": 1.0, + "routers_loss": 0.0030118159484118223, + "skip_count": 1.0, + "step": 3390, + "text_loss": 0.48903173208236694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.000812832643886863, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5471547.0, + "repeat_count": 0.0, + "routers_loss": 0.005084246397018433, + "skip_count": 2.0, + "step": 3392, + "text_loss": 0.35789889097213745 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008125911327192299, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5474331.0, + "repeat_count": 0.0, + "routers_loss": 0.0008874498889781535, + "skip_count": 0.0, + "step": 3394, + "text_loss": 0.6267408728599548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008123495017660851, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5477633.0, + "repeat_count": 0.0, + "routers_loss": 0.001794386887922883, + "skip_count": 0.0, + "step": 3396, + "text_loss": 0.3701885938644409 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008121077511200221, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5481277.0, + "repeat_count": 0.0, + "routers_loss": 0.002140481723472476, + "skip_count": 0.0, + "step": 3398, + "text_loss": 0.6362857818603516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.00081186588087368, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 5484237.0, + "repeat_count": 0.0, + "routers_loss": 0.000867189432028681, + "skip_count": 0.0, + "step": 3400, + "text_loss": 1.0847382545471191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008116238911197442, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5487423.0, + "repeat_count": 0.0, + "routers_loss": 0.0029817656613886356, + "skip_count": 0.0, + "step": 3402, + "text_loss": 0.3813740313053131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008113817819509454, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5490155.0, + "repeat_count": 0.0, + "routers_loss": 0.0035141287371516228, + "skip_count": 0.0, + "step": 3404, + "text_loss": 0.2113083451986313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0008111395534600603, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5493415.0, + "repeat_count": 0.0, + "routers_loss": 0.003317659953609109, + "skip_count": 0.0, + "step": 3406, + "text_loss": 0.5869330167770386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008108972057399114, + "loss": 0.0123, + "macro_f1": 0.6666666865348816, + "num_tokens": 5496032.0, + "repeat_count": 0.0, + "routers_loss": 0.003833734430372715, + "skip_count": 2.0, + "step": 3408, + "text_loss": 0.2938928008079529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11328125, + "learning_rate": 0.0008106547388833669, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 5498890.0, + "repeat_count": 0.0, + "routers_loss": 0.002622978063300252, + "skip_count": 1.0, + "step": 3410, + "text_loss": 0.3130980432033539 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008104121529833402, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 5502010.0, + "repeat_count": 1.0, + "routers_loss": 0.007447598036378622, + "skip_count": 0.0, + "step": 3412, + "text_loss": 0.4413072466850281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.000810169448132791, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5505212.0, + "repeat_count": 0.0, + "routers_loss": 0.0031087708193808794, + "skip_count": 1.0, + "step": 3414, + "text_loss": 0.2910428047180176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.037569709421778, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008099266244247243, + "loss": 0.0082, + "macro_f1": 0.3272727429866791, + "num_tokens": 5508755.0, + "repeat_count": 0.0, + "routers_loss": 0.02510393038392067, + "skip_count": 1.0, + "step": 3416, + "text_loss": 0.33022749423980713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008096836819521903, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5512034.0, + "repeat_count": 0.0, + "routers_loss": 0.0020537273958325386, + "skip_count": 1.0, + "step": 3418, + "text_loss": 0.4731218218803406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0008094406208082853, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5515707.0, + "repeat_count": 0.0, + "routers_loss": 0.004218162503093481, + "skip_count": 2.0, + "step": 3420, + "text_loss": 0.23429590463638306 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 16.065746991488112, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0869140625, + "learning_rate": 0.0008091974410861507, + "loss": 0.0069, + "macro_f1": 0.9265305995941162, + "num_tokens": 5518436.0, + "repeat_count": 1.0, + "routers_loss": 0.013488355092704296, + "skip_count": 3.0, + "step": 3422, + "text_loss": 0.45768749713897705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008089541428789733, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5522368.0, + "repeat_count": 0.0, + "routers_loss": 0.0010335417464375496, + "skip_count": 1.0, + "step": 3424, + "text_loss": 0.43423423171043396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0008087107262799855, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 5526061.0, + "repeat_count": 0.0, + "routers_loss": 0.002134323585778475, + "skip_count": 0.0, + "step": 3426, + "text_loss": 0.4031757414340973 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.0008084671913824651, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5529284.0, + "repeat_count": 0.0, + "routers_loss": 0.0097216060385108, + "skip_count": 2.0, + "step": 3428, + "text_loss": 0.2836039960384369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.000808223538279735, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 5532159.0, + "repeat_count": 0.0, + "routers_loss": 0.001684269867837429, + "skip_count": 0.0, + "step": 3430, + "text_loss": 0.5804527401924133 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008079797670651637, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 5536050.0, + "repeat_count": 1.0, + "routers_loss": 0.013918434269726276, + "skip_count": 1.0, + "step": 3432, + "text_loss": 0.31325826048851013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008077358778321647, + "loss": 0.011, + "macro_f1": 0.3333333432674408, + "num_tokens": 5538885.0, + "repeat_count": 0.0, + "routers_loss": 0.0007751787197776139, + "skip_count": 0.0, + "step": 3434, + "text_loss": 0.783108115196228 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.131493982976224, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008074918706741966, + "loss": 0.0063, + "macro_f1": 0.9262410998344421, + "num_tokens": 5541909.0, + "repeat_count": 3.0, + "routers_loss": 0.021819550544023514, + "skip_count": 2.0, + "step": 3436, + "text_loss": 0.6558083295822144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.14088641033167, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0008072477456847638, + "loss": 0.0057, + "macro_f1": 0.3272727429866791, + "num_tokens": 5545101.0, + "repeat_count": 1.0, + "routers_loss": 0.03309348225593567, + "skip_count": 0.0, + "step": 3438, + "text_loss": 0.9877075552940369 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008070035029574151, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 5548971.0, + "repeat_count": 1.0, + "routers_loss": 0.008696741424500942, + "skip_count": 1.0, + "step": 3440, + "text_loss": 0.24766330420970917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.000806759142585745, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 5552174.0, + "repeat_count": 0.0, + "routers_loss": 0.004240929149091244, + "skip_count": 3.0, + "step": 3442, + "text_loss": 0.37255001068115234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008065146646633927, + "loss": 0.0088, + "macro_f1": 0.6666666865348816, + "num_tokens": 5555005.0, + "repeat_count": 0.0, + "routers_loss": 0.014345484785735607, + "skip_count": 1.0, + "step": 3444, + "text_loss": 0.26157206296920776 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008062700692840428, + "loss": 0.0083, + "macro_f1": 1.0, + "num_tokens": 5559127.0, + "repeat_count": 1.0, + "routers_loss": 0.008315163664519787, + "skip_count": 2.0, + "step": 3446, + "text_loss": 0.21971040964126587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 16.187848547108892, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008060253565414246, + "loss": 0.009, + "macro_f1": 0.5934640765190125, + "num_tokens": 5562254.0, + "repeat_count": 0.0, + "routers_loss": 0.009582413360476494, + "skip_count": 3.0, + "step": 3448, + "text_loss": 0.6758295893669128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0008057805265293124, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 5565515.0, + "repeat_count": 0.0, + "routers_loss": 0.002429503947496414, + "skip_count": 0.0, + "step": 3450, + "text_loss": 0.696592390537262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008055355793415257, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5568392.0, + "repeat_count": 0.0, + "routers_loss": 0.0007724192109890282, + "skip_count": 0.0, + "step": 3452, + "text_loss": 0.7092870473861694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008052905150719285, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 5571090.0, + "repeat_count": 0.0, + "routers_loss": 0.0010859938338398933, + "skip_count": 0.0, + "step": 3454, + "text_loss": 0.6593860387802124 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008050453338144301, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 5574552.0, + "repeat_count": 1.0, + "routers_loss": 0.0030258705373853445, + "skip_count": 1.0, + "step": 3456, + "text_loss": 0.3479384481906891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008048000356629844, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 5577484.0, + "repeat_count": 0.0, + "routers_loss": 0.005052885971963406, + "skip_count": 2.0, + "step": 3458, + "text_loss": 0.21858671307563782 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0008045546207115901, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 5581605.0, + "repeat_count": 1.0, + "routers_loss": 0.009976249188184738, + "skip_count": 3.0, + "step": 3460, + "text_loss": 0.16868001222610474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0008043090890542904, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5584994.0, + "repeat_count": 0.0, + "routers_loss": 0.00270817126147449, + "skip_count": 0.0, + "step": 3462, + "text_loss": 0.785690426826477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008040634407851739, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 5588067.0, + "repeat_count": 0.0, + "routers_loss": 0.0018436965765431523, + "skip_count": 0.0, + "step": 3464, + "text_loss": 0.5006644129753113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0008038176759983731, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5590789.0, + "repeat_count": 0.0, + "routers_loss": 0.008516279980540276, + "skip_count": 2.0, + "step": 3466, + "text_loss": 0.20963478088378906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0008035717947880659, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 5593472.0, + "repeat_count": 0.0, + "routers_loss": 0.0016293043736368418, + "skip_count": 0.0, + "step": 3468, + "text_loss": 0.7376078963279724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0008033257972484742, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5596108.0, + "repeat_count": 0.0, + "routers_loss": 0.002364142332226038, + "skip_count": 0.0, + "step": 3470, + "text_loss": 0.5156455039978027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008030796834738649, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5599103.0, + "repeat_count": 0.0, + "routers_loss": 0.008872323669493198, + "skip_count": 0.0, + "step": 3472, + "text_loss": 0.2996419668197632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008028334535585491, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5602410.0, + "repeat_count": 0.0, + "routers_loss": 0.011508257128298283, + "skip_count": 3.0, + "step": 3474, + "text_loss": 0.25438693165779114 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0008025871075968827, + "loss": 0.0106, + "macro_f1": 1.0, + "num_tokens": 5605424.0, + "repeat_count": 2.0, + "routers_loss": 0.017225435003638268, + "skip_count": 2.0, + "step": 3476, + "text_loss": 0.2549574077129364 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.328734957440563, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008023406456832657, + "loss": 0.0111, + "macro_f1": 0.9262410998344421, + "num_tokens": 5608266.0, + "repeat_count": 3.0, + "routers_loss": 0.039165645837783813, + "skip_count": 2.0, + "step": 3478, + "text_loss": 0.1797947734594345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0008020940679121429, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5611471.0, + "repeat_count": 0.0, + "routers_loss": 0.0009718866203911602, + "skip_count": 0.0, + "step": 3480, + "text_loss": 0.8267702460289001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008018473743780036, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5615046.0, + "repeat_count": 0.0, + "routers_loss": 0.006087122485041618, + "skip_count": 2.0, + "step": 3482, + "text_loss": 0.7267677187919617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000801600565175381, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5618350.0, + "repeat_count": 0.0, + "routers_loss": 0.0007539413054473698, + "skip_count": 0.0, + "step": 3484, + "text_loss": 0.5910211801528931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008013536403988529, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 5621381.0, + "repeat_count": 0.0, + "routers_loss": 0.0008076327503658831, + "skip_count": 0.0, + "step": 3486, + "text_loss": 0.30616798996925354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 16.375697094217788, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008011066001430412, + "loss": 0.0086, + "macro_f1": 0.6122449040412903, + "num_tokens": 5624617.0, + "repeat_count": 0.0, + "routers_loss": 0.023835813626646996, + "skip_count": 4.0, + "step": 3488, + "text_loss": 0.3376443088054657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008008594445026122, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 5627989.0, + "repeat_count": 0.0, + "routers_loss": 0.004226419143378735, + "skip_count": 2.0, + "step": 3490, + "text_loss": 0.8185343146324158 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.394481948928675, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008006121735722767, + "loss": 0.0084, + "macro_f1": 0.32098764181137085, + "num_tokens": 5632286.0, + "repeat_count": 0.0, + "routers_loss": 0.0366671048104763, + "skip_count": 2.0, + "step": 3492, + "text_loss": 0.2209547609090805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008003647874467892, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 5635368.0, + "repeat_count": 1.0, + "routers_loss": 0.012956378981471062, + "skip_count": 0.0, + "step": 3494, + "text_loss": 0.20468664169311523 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0008001172862209485, + "loss": 0.0103, + "macro_f1": 0.6666666865348816, + "num_tokens": 5638440.0, + "repeat_count": 1.0, + "routers_loss": 0.0017375422175973654, + "skip_count": 0.0, + "step": 3496, + "text_loss": 0.6647221446037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 16.42265923099501, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0007998696699895976, + "loss": 0.0091, + "macro_f1": 0.6592592597007751, + "num_tokens": 5641996.0, + "repeat_count": 1.0, + "routers_loss": 0.025240756571292877, + "skip_count": 5.0, + "step": 3498, + "text_loss": 0.23892143368721008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0007996219388476236, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5645071.0, + "repeat_count": 0.0, + "routers_loss": 0.007436830550432205, + "skip_count": 1.0, + "step": 3500, + "text_loss": 0.7580804228782654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0007993740928899571, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 5648175.0, + "repeat_count": 0.0, + "routers_loss": 0.001126602990552783, + "skip_count": 0.0, + "step": 3502, + "text_loss": 0.5281378626823425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0007991261322115737, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 5650973.0, + "repeat_count": 0.0, + "routers_loss": 0.0007907263352535665, + "skip_count": 0.0, + "step": 3504, + "text_loss": 0.25220927596092224 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.000798878056907492, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 5654252.0, + "repeat_count": 2.0, + "routers_loss": 0.006263538729399443, + "skip_count": 2.0, + "step": 3506, + "text_loss": 0.46569153666496277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0007986298670727752, + "loss": 0.0098, + "macro_f1": 0.6666666865348816, + "num_tokens": 5657229.0, + "repeat_count": 0.0, + "routers_loss": 0.004049144219607115, + "skip_count": 3.0, + "step": 3508, + "text_loss": 0.15174436569213867 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 16.479013795127678, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0791015625, + "learning_rate": 0.0007983815628025301, + "loss": 0.0074, + "macro_f1": 0.9262410998344421, + "num_tokens": 5659974.0, + "repeat_count": 2.0, + "routers_loss": 0.0471976138651371, + "skip_count": 3.0, + "step": 3510, + "text_loss": 0.39072203636169434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.488406222483125, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000798133144191907, + "loss": 0.0082, + "macro_f1": 0.3272727429866791, + "num_tokens": 5662893.0, + "repeat_count": 0.0, + "routers_loss": 0.04030488431453705, + "skip_count": 1.0, + "step": 3512, + "text_loss": 0.3562147617340088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0007978846113361009, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 5666476.0, + "repeat_count": 0.0, + "routers_loss": 0.007475079502910376, + "skip_count": 1.0, + "step": 3514, + "text_loss": 0.26518192887306213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0007976359643303497, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 5669647.0, + "repeat_count": 0.0, + "routers_loss": 0.00558585487306118, + "skip_count": 2.0, + "step": 3516, + "text_loss": 0.29284560680389404 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007973872032699354, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 5673491.0, + "repeat_count": 1.0, + "routers_loss": 0.0026981087867170572, + "skip_count": 1.0, + "step": 3518, + "text_loss": 0.35089045763015747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.000797138328250184, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 5676529.0, + "repeat_count": 1.0, + "routers_loss": 0.0027328627184033394, + "skip_count": 0.0, + "step": 3520, + "text_loss": 0.41077399253845215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 16.535368359260346, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007968893393664646, + "loss": 0.01, + "macro_f1": 0.6592592597007751, + "num_tokens": 5679987.0, + "repeat_count": 1.0, + "routers_loss": 0.02695014327764511, + "skip_count": 5.0, + "step": 3522, + "text_loss": 0.44942837953567505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007966402367141903, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 5683185.0, + "repeat_count": 0.0, + "routers_loss": 0.00817026849836111, + "skip_count": 2.0, + "step": 3524, + "text_loss": 0.14528048038482666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0007963910203888176, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 5686544.0, + "repeat_count": 0.0, + "routers_loss": 0.0021973433904349804, + "skip_count": 0.0, + "step": 3526, + "text_loss": 0.22358648478984833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0007961416904858469, + "loss": 0.0078, + "macro_f1": 0.3272727429866791, + "num_tokens": 5689579.0, + "repeat_count": 0.0, + "routers_loss": 0.033712416887283325, + "skip_count": 1.0, + "step": 3528, + "text_loss": 0.3083649277687073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007958922471008217, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5692869.0, + "repeat_count": 0.0, + "routers_loss": 0.011182719841599464, + "skip_count": 2.0, + "step": 3530, + "text_loss": 0.21288011968135834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0007956426903293292, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5696007.0, + "repeat_count": 0.0, + "routers_loss": 0.0015808293828740716, + "skip_count": 0.0, + "step": 3532, + "text_loss": 0.6068631410598755 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.591722923393014, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007953930202670001, + "loss": 0.0062, + "macro_f1": 0.5492662787437439, + "num_tokens": 5699474.0, + "repeat_count": 2.0, + "routers_loss": 0.03205178305506706, + "skip_count": 0.0, + "step": 3534, + "text_loss": 0.4317135512828827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007951432370095084, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 5703483.0, + "repeat_count": 0.0, + "routers_loss": 0.003518853336572647, + "skip_count": 0.0, + "step": 3536, + "text_loss": 0.5432273149490356 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007948933406525715, + "loss": 0.01, + "macro_f1": 1.0, + "num_tokens": 5707301.0, + "repeat_count": 1.0, + "routers_loss": 0.004982157610356808, + "skip_count": 1.0, + "step": 3538, + "text_loss": 0.40061065554618835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0007946433312919502, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5710847.0, + "repeat_count": 0.0, + "routers_loss": 0.003067734418436885, + "skip_count": 0.0, + "step": 3540, + "text_loss": 0.5396234393119812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 16.629292632814792, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007943932090234486, + "loss": 0.0097, + "macro_f1": 0.5492662787437439, + "num_tokens": 5713683.0, + "repeat_count": 0.0, + "routers_loss": 0.03728383034467697, + "skip_count": 2.0, + "step": 3542, + "text_loss": 0.18310914933681488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007941429739429138, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 5716397.0, + "repeat_count": 0.0, + "routers_loss": 0.0025092530995607376, + "skip_count": 3.0, + "step": 3544, + "text_loss": 0.5806207060813904 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007938926261462366, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5719984.0, + "repeat_count": 0.0, + "routers_loss": 0.002493767999112606, + "skip_count": 0.0, + "step": 3546, + "text_loss": 0.38606807589530945 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 16.657469914881126, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05078125, + "learning_rate": 0.0007936421657293507, + "loss": 0.0094, + "macro_f1": 0.8823530077934265, + "num_tokens": 5723571.0, + "repeat_count": 1.0, + "routers_loss": 0.014810923486948013, + "skip_count": 2.0, + "step": 3548, + "text_loss": 0.49558472633361816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0007933915927882327, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5726405.0, + "repeat_count": 0.0, + "routers_loss": 0.00152928801253438, + "skip_count": 0.0, + "step": 3550, + "text_loss": 0.8674797415733337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000793140907418903, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5729955.0, + "repeat_count": 0.0, + "routers_loss": 0.005522782914340496, + "skip_count": 2.0, + "step": 3552, + "text_loss": 0.3274473249912262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007928901097174248, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5733030.0, + "repeat_count": 0.0, + "routers_loss": 0.009207013063132763, + "skip_count": 2.0, + "step": 3554, + "text_loss": 0.18237128853797913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0007926391997799039, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5735978.0, + "repeat_count": 0.0, + "routers_loss": 0.00695531303063035, + "skip_count": 0.0, + "step": 3556, + "text_loss": 0.3266434967517853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007923881777024898, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 5738901.0, + "repeat_count": 0.0, + "routers_loss": 0.002743212040513754, + "skip_count": 1.0, + "step": 3558, + "text_loss": 0.4971913695335388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0007921370435813741, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5741946.0, + "repeat_count": 1.0, + "routers_loss": 0.007037297356873751, + "skip_count": 0.0, + "step": 3560, + "text_loss": 0.5645473599433899 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007918857975127924, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5744987.0, + "repeat_count": 0.0, + "routers_loss": 0.0030746585689485073, + "skip_count": 0.0, + "step": 3562, + "text_loss": 0.17717665433883667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0007916344395930224, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 5747837.0, + "repeat_count": 0.0, + "routers_loss": 0.004522138275206089, + "skip_count": 0.0, + "step": 3564, + "text_loss": 0.7676118612289429 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.000791382969918385, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5750716.0, + "repeat_count": 0.0, + "routers_loss": 0.0026240211445838213, + "skip_count": 0.0, + "step": 3566, + "text_loss": 0.4975173771381378 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.751394188435572, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.000791131388585244, + "loss": 0.011, + "macro_f1": 0.8820862174034119, + "num_tokens": 5754368.0, + "repeat_count": 2.0, + "routers_loss": 0.021831991150975227, + "skip_count": 2.0, + "step": 3568, + "text_loss": 0.9670342206954956 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0007908796956900055, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5757076.0, + "repeat_count": 1.0, + "routers_loss": 0.0017586691537871957, + "skip_count": 0.0, + "step": 3570, + "text_loss": 0.3057977259159088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.000790627891329119, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5760613.0, + "repeat_count": 0.0, + "routers_loss": 0.005515786819159985, + "skip_count": 0.0, + "step": 3572, + "text_loss": 0.5860086679458618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007903759755990763, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 5763557.0, + "repeat_count": 0.0, + "routers_loss": 0.004096484277397394, + "skip_count": 0.0, + "step": 3574, + "text_loss": 0.17175781726837158 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.000790123948596412, + "loss": 0.0119, + "macro_f1": 0.6666666865348816, + "num_tokens": 5767430.0, + "repeat_count": 1.0, + "routers_loss": 0.005216122139245272, + "skip_count": 0.0, + "step": 3576, + "text_loss": 0.7520374059677124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0007898718104177031, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 5770175.0, + "repeat_count": 0.0, + "routers_loss": 0.0037980107590556145, + "skip_count": 0.0, + "step": 3578, + "text_loss": 0.18117885291576385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007896195611595699, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5773032.0, + "repeat_count": 0.0, + "routers_loss": 0.003672175807878375, + "skip_count": 2.0, + "step": 3580, + "text_loss": 0.7241058349609375 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0007893672009186744, + "loss": 0.0083, + "macro_f1": 1.0, + "num_tokens": 5776077.0, + "repeat_count": 1.0, + "routers_loss": 0.01229850109666586, + "skip_count": 3.0, + "step": 3582, + "text_loss": 0.29140418767929077 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007891147297917216, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5779088.0, + "repeat_count": 1.0, + "routers_loss": 0.0035251814406365156, + "skip_count": 0.0, + "step": 3584, + "text_loss": 0.1727485954761505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.000788862147875459, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 5782201.0, + "repeat_count": 0.0, + "routers_loss": 0.004725661128759384, + "skip_count": 2.0, + "step": 3586, + "text_loss": 0.43512848019599915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0007886094552666765, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5785039.0, + "repeat_count": 0.0, + "routers_loss": 0.005632172804325819, + "skip_count": 0.0, + "step": 3588, + "text_loss": 0.3534786105155945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0007883566520622062, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 5788017.0, + "repeat_count": 0.0, + "routers_loss": 0.006249965168535709, + "skip_count": 1.0, + "step": 3590, + "text_loss": 0.2089710384607315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0007881037383589229, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 5791168.0, + "repeat_count": 0.0, + "routers_loss": 0.0013797614956274629, + "skip_count": 0.0, + "step": 3592, + "text_loss": 0.4349329471588135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0007878507142537436, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 5793927.0, + "repeat_count": 0.0, + "routers_loss": 0.0019719740375876427, + "skip_count": 1.0, + "step": 3594, + "text_loss": 0.6087368726730347 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007875975798436274, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 5797214.0, + "repeat_count": 1.0, + "routers_loss": 0.0037070370744913816, + "skip_count": 0.0, + "step": 3596, + "text_loss": 0.4258122444152832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0007873443352255764, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5800691.0, + "repeat_count": 0.0, + "routers_loss": 0.008431311696767807, + "skip_count": 0.0, + "step": 3598, + "text_loss": 0.6006711721420288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0007870909804966337, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5804712.0, + "repeat_count": 0.0, + "routers_loss": 0.0017720256000757217, + "skip_count": 0.0, + "step": 3600, + "text_loss": 0.6055042743682861 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.911065453478134, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0007868375157538861, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 5807670.0, + "repeat_count": 1.0, + "routers_loss": 0.010697763413190842, + "skip_count": 0.0, + "step": 3602, + "text_loss": 0.8039056658744812 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0007865839410944611, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5810880.0, + "repeat_count": 1.0, + "routers_loss": 0.0030022128485143185, + "skip_count": 0.0, + "step": 3604, + "text_loss": 0.596110463142395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0007863302566155295, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5814171.0, + "repeat_count": 0.0, + "routers_loss": 0.006257854867726564, + "skip_count": 2.0, + "step": 3606, + "text_loss": 0.5700319409370422 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0007860764624143031, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 5817607.0, + "repeat_count": 1.0, + "routers_loss": 0.004838473163545132, + "skip_count": 0.0, + "step": 3608, + "text_loss": 0.8319530487060547 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 16.94863516289991, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08154296875, + "learning_rate": 0.0007858225585880369, + "loss": 0.0067, + "macro_f1": 0.8823530077934265, + "num_tokens": 5821452.0, + "repeat_count": 1.0, + "routers_loss": 0.02173662930727005, + "skip_count": 2.0, + "step": 3610, + "text_loss": 0.3738477826118469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007855685452340269, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5824683.0, + "repeat_count": 0.0, + "routers_loss": 0.0032719180453568697, + "skip_count": 0.0, + "step": 3612, + "text_loss": 0.4054839015007019 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.967420017610802, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007853144224496118, + "loss": 0.0093, + "macro_f1": 0.3272727429866791, + "num_tokens": 5827860.0, + "repeat_count": 1.0, + "routers_loss": 0.032171256840229034, + "skip_count": 0.0, + "step": 3614, + "text_loss": 0.18112395703792572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0007850601903321716, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5831651.0, + "repeat_count": 0.0, + "routers_loss": 0.013230946846306324, + "skip_count": 1.0, + "step": 3616, + "text_loss": 0.2698844075202942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.000784805848979129, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5834369.0, + "repeat_count": 0.0, + "routers_loss": 0.00162619655020535, + "skip_count": 0.0, + "step": 3618, + "text_loss": 0.2430931180715561 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0007845513984879477, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 5838102.0, + "repeat_count": 1.0, + "routers_loss": 0.002781603019684553, + "skip_count": 0.0, + "step": 3620, + "text_loss": 0.4968300759792328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007842968389561337, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 5841029.0, + "repeat_count": 0.0, + "routers_loss": 0.0023873315658420324, + "skip_count": 0.0, + "step": 3622, + "text_loss": 0.5842974781990051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0007840421704812346, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 5845158.0, + "repeat_count": 0.0, + "routers_loss": 0.00400173757225275, + "skip_count": 1.0, + "step": 3624, + "text_loss": 0.8312450647354126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00078378739316084, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 5849175.0, + "repeat_count": 0.0, + "routers_loss": 0.0004974664188921452, + "skip_count": 0.0, + "step": 3626, + "text_loss": 0.48637253046035767 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 17.032873495744056, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.10693359375, + "learning_rate": 0.000783532507092581, + "loss": 0.0079, + "macro_f1": 0.9555556178092957, + "num_tokens": 5852020.0, + "repeat_count": 1.0, + "routers_loss": 0.02555239573121071, + "skip_count": 5.0, + "step": 3628, + "text_loss": 0.5407033562660217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007832775123741306, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5854873.0, + "repeat_count": 0.0, + "routers_loss": 0.0025962977670133114, + "skip_count": 0.0, + "step": 3630, + "text_loss": 0.618230938911438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.000783022409103203, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5858086.0, + "repeat_count": 0.0, + "routers_loss": 0.0029271875973790884, + "skip_count": 0.0, + "step": 3632, + "text_loss": 0.21259798109531403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007827671973775542, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 5860886.0, + "repeat_count": 0.0, + "routers_loss": 0.004102068953216076, + "skip_count": 0.0, + "step": 3634, + "text_loss": 0.4991208016872406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0007825118772949819, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5864291.0, + "repeat_count": 0.0, + "routers_loss": 0.0023497689981013536, + "skip_count": 1.0, + "step": 3636, + "text_loss": 0.3878401517868042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0007822564489533255, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 5867155.0, + "repeat_count": 0.0, + "routers_loss": 0.007680345326662064, + "skip_count": 2.0, + "step": 3638, + "text_loss": 0.6132124066352844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0007820009124504653, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5870325.0, + "repeat_count": 0.0, + "routers_loss": 0.0008242831099778414, + "skip_count": 0.0, + "step": 3640, + "text_loss": 0.3552473187446594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.098620487232168, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007817452678843236, + "loss": 0.0073, + "macro_f1": 0.6601307392120361, + "num_tokens": 5873301.0, + "repeat_count": 1.0, + "routers_loss": 0.023831043392419815, + "skip_count": 2.0, + "step": 3642, + "text_loss": 0.18363867700099945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0007814895153528635, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5876225.0, + "repeat_count": 0.0, + "routers_loss": 0.001999989850446582, + "skip_count": 0.0, + "step": 3644, + "text_loss": 0.17581747472286224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0007812336549540903, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5879501.0, + "repeat_count": 0.0, + "routers_loss": 0.001098626758903265, + "skip_count": 0.0, + "step": 3646, + "text_loss": 0.5040884613990784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.126797769298502, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007809776867860499, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 5882608.0, + "repeat_count": 0.0, + "routers_loss": 0.012210183776915073, + "skip_count": 1.0, + "step": 3648, + "text_loss": 0.27114811539649963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00078072161094683, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 5886106.0, + "repeat_count": 0.0, + "routers_loss": 0.005191771313548088, + "skip_count": 2.0, + "step": 3650, + "text_loss": 0.5167917609214783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0007804654275345591, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5889122.0, + "repeat_count": 0.0, + "routers_loss": 0.0016411367105320096, + "skip_count": 1.0, + "step": 3652, + "text_loss": 0.7691274285316467 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.154975051364836, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0007802091366474074, + "loss": 0.005, + "macro_f1": 0.8823530077934265, + "num_tokens": 5892313.0, + "repeat_count": 2.0, + "routers_loss": 0.015627093613147736, + "skip_count": 1.0, + "step": 3654, + "text_loss": 0.4646325409412384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007799527383835858, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5895577.0, + "repeat_count": 0.0, + "routers_loss": 0.0009879748104140162, + "skip_count": 0.0, + "step": 3656, + "text_loss": 0.5587969422340393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007796962328413469, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5898546.0, + "repeat_count": 0.0, + "routers_loss": 0.004864919930696487, + "skip_count": 0.0, + "step": 3658, + "text_loss": 0.6981375813484192 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007794396201189839, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 5901618.0, + "repeat_count": 1.0, + "routers_loss": 0.006617432460188866, + "skip_count": 2.0, + "step": 3660, + "text_loss": 0.22521957755088806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.192544760786618, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007791829003148312, + "loss": 0.0098, + "macro_f1": 0.6601307392120361, + "num_tokens": 5904540.0, + "repeat_count": 1.0, + "routers_loss": 0.0782252699136734, + "skip_count": 2.0, + "step": 3662, + "text_loss": 0.2649642825126648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0007789260735272647, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 5907827.0, + "repeat_count": 0.0, + "routers_loss": 0.0012057392159476876, + "skip_count": 0.0, + "step": 3664, + "text_loss": 0.6943771243095398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.0007786691398547005, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 5911163.0, + "repeat_count": 0.0, + "routers_loss": 0.007476957980543375, + "skip_count": 2.0, + "step": 3666, + "text_loss": 0.1502683162689209 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007784120993955962, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5913948.0, + "repeat_count": 1.0, + "routers_loss": 0.004082011990249157, + "skip_count": 0.0, + "step": 3668, + "text_loss": 0.4127517640590668 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 17.230114470208395, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007781549522484503, + "loss": 0.0066, + "macro_f1": 0.9265305995941162, + "num_tokens": 5917360.0, + "repeat_count": 3.0, + "routers_loss": 0.027505695819854736, + "skip_count": 1.0, + "step": 3670, + "text_loss": 0.23892618715763092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007778976985118018, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 5920524.0, + "repeat_count": 0.0, + "routers_loss": 0.0024977331049740314, + "skip_count": 2.0, + "step": 3672, + "text_loss": 0.5076471567153931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0007776403382842312, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5923632.0, + "repeat_count": 0.0, + "routers_loss": 0.0015700991498306394, + "skip_count": 0.0, + "step": 3674, + "text_loss": 0.6287924647331238 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.25829175227473, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.0007773828716643591, + "loss": 0.0085, + "macro_f1": 0.3272727429866791, + "num_tokens": 5926438.0, + "repeat_count": 1.0, + "routers_loss": 0.05108916014432907, + "skip_count": 0.0, + "step": 3676, + "text_loss": 0.26517006754875183 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007771252987508474, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5930081.0, + "repeat_count": 0.0, + "routers_loss": 0.003439917229115963, + "skip_count": 0.0, + "step": 3678, + "text_loss": 0.5189079642295837 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0007768676196423984, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 5933463.0, + "repeat_count": 1.0, + "routers_loss": 0.001935846172273159, + "skip_count": 1.0, + "step": 3680, + "text_loss": 0.6703575849533081 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 17.286469034341064, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007766098344377553, + "loss": 0.0082, + "macro_f1": 0.31446540355682373, + "num_tokens": 5937098.0, + "repeat_count": 0.0, + "routers_loss": 0.0384826585650444, + "skip_count": 2.0, + "step": 3682, + "text_loss": 0.6424444913864136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0007763519432357018, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 5940436.0, + "repeat_count": 0.0, + "routers_loss": 0.0008654671837575734, + "skip_count": 0.0, + "step": 3684, + "text_loss": 0.4189988672733307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0007760939461350623, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 5943731.0, + "repeat_count": 0.0, + "routers_loss": 0.007468715775758028, + "skip_count": 2.0, + "step": 3686, + "text_loss": 0.2875453233718872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007758358432347019, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5946707.0, + "repeat_count": 0.0, + "routers_loss": 0.001252831774763763, + "skip_count": 0.0, + "step": 3688, + "text_loss": 0.5093055367469788 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007755776346335259, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 5949833.0, + "repeat_count": 0.0, + "routers_loss": 0.001680848654359579, + "skip_count": 0.0, + "step": 3690, + "text_loss": 0.4031114876270294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0007753193204304807, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 5953095.0, + "repeat_count": 0.0, + "routers_loss": 0.0047258250415325165, + "skip_count": 2.0, + "step": 3692, + "text_loss": 0.17632785439491272 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007750609007245524, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 5955971.0, + "repeat_count": 2.0, + "routers_loss": 0.001980359200388193, + "skip_count": 4.0, + "step": 3694, + "text_loss": 0.3423727750778198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0007748023756147679, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 5958948.0, + "repeat_count": 0.0, + "routers_loss": 0.00511702848598361, + "skip_count": 0.0, + "step": 3696, + "text_loss": 0.28279972076416016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007745437452001949, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 5961819.0, + "repeat_count": 0.0, + "routers_loss": 0.0005220443126745522, + "skip_count": 0.0, + "step": 3698, + "text_loss": 0.4793325662612915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.371000880540066, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007742850095799408, + "loss": 0.0084, + "macro_f1": 0.3272727429866791, + "num_tokens": 5964625.0, + "repeat_count": 1.0, + "routers_loss": 0.06411020457744598, + "skip_count": 0.0, + "step": 3700, + "text_loss": 0.2825184464454651 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0007740261688531536, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 5967134.0, + "repeat_count": 0.0, + "routers_loss": 0.004408109001815319, + "skip_count": 3.0, + "step": 3702, + "text_loss": 0.690429151058197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0007737672231190215, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 5969831.0, + "repeat_count": 0.0, + "routers_loss": 0.0006747521692886949, + "skip_count": 0.0, + "step": 3704, + "text_loss": 0.32556024193763733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007735081724767732, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 5973015.0, + "repeat_count": 0.0, + "routers_loss": 0.0020414739847183228, + "skip_count": 0.0, + "step": 3706, + "text_loss": 0.5876469612121582 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0007732490170256769, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 5975778.0, + "repeat_count": 1.0, + "routers_loss": 0.005610425490885973, + "skip_count": 0.0, + "step": 3708, + "text_loss": 0.2968577444553375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007729897568650422, + "loss": 0.0097, + "macro_f1": 0.3333333432674408, + "num_tokens": 5979115.0, + "repeat_count": 0.0, + "routers_loss": 0.001248046406544745, + "skip_count": 0.0, + "step": 3710, + "text_loss": 0.626361608505249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0007727303920942176, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 5982213.0, + "repeat_count": 0.0, + "routers_loss": 0.005791695322841406, + "skip_count": 2.0, + "step": 3712, + "text_loss": 0.4133484661579132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 17.436747872028178, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08740234375, + "learning_rate": 0.0007724709228125922, + "loss": 0.0105, + "macro_f1": 0.5492662787437439, + "num_tokens": 5984930.0, + "repeat_count": 0.0, + "routers_loss": 0.02114664763212204, + "skip_count": 2.0, + "step": 3714, + "text_loss": 0.4646461308002472 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0007722113491195952, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 5988017.0, + "repeat_count": 2.0, + "routers_loss": 0.005913930479437113, + "skip_count": 5.0, + "step": 3716, + "text_loss": 0.15474505722522736 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0007719516711146957, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 5991562.0, + "repeat_count": 0.0, + "routers_loss": 0.0075925313867628574, + "skip_count": 2.0, + "step": 3718, + "text_loss": 0.5293686985969543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.000771691888897403, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 5994675.0, + "repeat_count": 0.0, + "routers_loss": 0.0012335237115621567, + "skip_count": 0.0, + "step": 3720, + "text_loss": 0.5210637450218201 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0007714320025672657, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 5999070.0, + "repeat_count": 0.0, + "routers_loss": 0.010582062415778637, + "skip_count": 2.0, + "step": 3722, + "text_loss": 0.2783571779727936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.4837100088054, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.000771172012223873, + "loss": 0.0078, + "macro_f1": 0.6598639488220215, + "num_tokens": 6002702.0, + "repeat_count": 1.0, + "routers_loss": 0.015008784830570221, + "skip_count": 3.0, + "step": 3724, + "text_loss": 0.358705073595047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007709119179668538, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6005517.0, + "repeat_count": 0.0, + "routers_loss": 0.00111615180503577, + "skip_count": 0.0, + "step": 3726, + "text_loss": 0.45202162861824036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 17.50249486351629, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007706517198958764, + "loss": 0.0096, + "macro_f1": 0.6595745086669922, + "num_tokens": 6009111.0, + "repeat_count": 1.0, + "routers_loss": 0.05215252563357353, + "skip_count": 4.0, + "step": 3728, + "text_loss": 0.20360413193702698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007703914181106497, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6012989.0, + "repeat_count": 0.0, + "routers_loss": 0.010039499960839748, + "skip_count": 3.0, + "step": 3730, + "text_loss": 0.20334361493587494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.52127971822718, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0007701310127109211, + "loss": 0.0062, + "macro_f1": 0.3272727429866791, + "num_tokens": 6016420.0, + "repeat_count": 0.0, + "routers_loss": 0.01090205181390047, + "skip_count": 1.0, + "step": 3732, + "text_loss": 0.47959551215171814 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 17.530672145582624, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007698705037964791, + "loss": 0.0076, + "macro_f1": 0.6225374937057495, + "num_tokens": 6019551.0, + "repeat_count": 0.0, + "routers_loss": 0.02677762135863304, + "skip_count": 5.0, + "step": 3734, + "text_loss": 0.2621438801288605 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.000769609891467151, + "loss": 0.0119, + "macro_f1": 0.6666666865348816, + "num_tokens": 6022262.0, + "repeat_count": 1.0, + "routers_loss": 0.00460716662928462, + "skip_count": 0.0, + "step": 3736, + "text_loss": 0.3433022201061249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0007693491758228037, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6025723.0, + "repeat_count": 0.0, + "routers_loss": 0.0036111194640398026, + "skip_count": 2.0, + "step": 3738, + "text_loss": 0.38703784346580505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007690883569633442, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6028652.0, + "repeat_count": 0.0, + "routers_loss": 0.003299296135082841, + "skip_count": 0.0, + "step": 3740, + "text_loss": 0.24203069508075714 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0007688274349887188, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 6032280.0, + "repeat_count": 0.0, + "routers_loss": 0.003173880511894822, + "skip_count": 0.0, + "step": 3742, + "text_loss": 0.2827291488647461 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0007685664099989131, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6035111.0, + "repeat_count": 0.0, + "routers_loss": 0.0008576177642680705, + "skip_count": 0.0, + "step": 3744, + "text_loss": 0.43613526225090027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007683052820939524, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6038428.0, + "repeat_count": 0.0, + "routers_loss": 0.004335585981607437, + "skip_count": 2.0, + "step": 3746, + "text_loss": 1.0385624170303345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007680440513739015, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6041185.0, + "repeat_count": 0.0, + "routers_loss": 0.0008210531086660922, + "skip_count": 0.0, + "step": 3748, + "text_loss": 0.7070431709289551 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0007677827179388646, + "loss": 0.0089, + "macro_f1": 1.0, + "num_tokens": 6046333.0, + "repeat_count": 1.0, + "routers_loss": 0.003778942162171006, + "skip_count": 1.0, + "step": 3750, + "text_loss": 0.3682238757610321 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08984375, + "learning_rate": 0.000767521281888985, + "loss": 0.009, + "macro_f1": 1.0, + "num_tokens": 6049528.0, + "repeat_count": 1.0, + "routers_loss": 0.002767334459349513, + "skip_count": 1.0, + "step": 3752, + "text_loss": 0.7619418501853943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0007672597433244455, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 6053202.0, + "repeat_count": 0.0, + "routers_loss": 0.004796457476913929, + "skip_count": 2.0, + "step": 3754, + "text_loss": 0.4157083034515381 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0007669981023454682, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 6056609.0, + "repeat_count": 0.0, + "routers_loss": 0.0013067846884950995, + "skip_count": 0.0, + "step": 3756, + "text_loss": 0.4529118537902832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007667363590523142, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 6060504.0, + "repeat_count": 0.0, + "routers_loss": 0.0010285493917763233, + "skip_count": 0.0, + "step": 3758, + "text_loss": 0.8363246321678162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0007664745135452844, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 6063526.0, + "repeat_count": 0.0, + "routers_loss": 0.006289863493293524, + "skip_count": 3.0, + "step": 3760, + "text_loss": 0.5313657522201538 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0007662125659247183, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6067147.0, + "repeat_count": 0.0, + "routers_loss": 0.0028537956532090902, + "skip_count": 0.0, + "step": 3762, + "text_loss": 0.5668109059333801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007659505162909949, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6070350.0, + "repeat_count": 0.0, + "routers_loss": 0.0026814753655344248, + "skip_count": 0.0, + "step": 3764, + "text_loss": 0.4983512759208679 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0007656883647445318, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 6073091.0, + "repeat_count": 0.0, + "routers_loss": 0.005981382913887501, + "skip_count": 1.0, + "step": 3766, + "text_loss": 0.30372318625450134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0007654261113857863, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6076244.0, + "repeat_count": 0.0, + "routers_loss": 0.000803640519734472, + "skip_count": 0.0, + "step": 3768, + "text_loss": 0.6100738048553467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0007651637563152539, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 6078936.0, + "repeat_count": 0.0, + "routers_loss": 0.0013324898900464177, + "skip_count": 0.0, + "step": 3770, + "text_loss": 0.4733821153640747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0007649012996334701, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 6081951.0, + "repeat_count": 1.0, + "routers_loss": 0.0021543330512940884, + "skip_count": 0.0, + "step": 3772, + "text_loss": 0.6794875860214233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007646387414410085, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 6085165.0, + "repeat_count": 0.0, + "routers_loss": 0.0005426189745776355, + "skip_count": 0.0, + "step": 3774, + "text_loss": 0.5886107683181763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007643760818384819, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6088370.0, + "repeat_count": 0.0, + "routers_loss": 0.002537576947361231, + "skip_count": 0.0, + "step": 3776, + "text_loss": 0.23591920733451843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007641133209265423, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6092319.0, + "repeat_count": 0.0, + "routers_loss": 0.002613696036860347, + "skip_count": 0.0, + "step": 3778, + "text_loss": 0.3217754662036896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0007638504588058796, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 6095799.0, + "repeat_count": 0.0, + "routers_loss": 0.0007219464750960469, + "skip_count": 0.0, + "step": 3780, + "text_loss": 0.4276983141899109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0007635874955772234, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6098789.0, + "repeat_count": 0.0, + "routers_loss": 0.005965052172541618, + "skip_count": 3.0, + "step": 3782, + "text_loss": 0.30936646461486816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0007633244313413417, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6101631.0, + "repeat_count": 0.0, + "routers_loss": 0.0007469559786841273, + "skip_count": 0.0, + "step": 3784, + "text_loss": 0.44460123777389526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007630612661990412, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 6105097.0, + "repeat_count": 0.0, + "routers_loss": 0.004300760570913553, + "skip_count": 1.0, + "step": 3786, + "text_loss": 0.41950157284736633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007627980002511672, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6107847.0, + "repeat_count": 0.0, + "routers_loss": 0.0023050960153341293, + "skip_count": 1.0, + "step": 3788, + "text_loss": 0.48561373353004456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007625346335986039, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6110546.0, + "repeat_count": 0.0, + "routers_loss": 0.0018124044872820377, + "skip_count": 0.0, + "step": 3790, + "text_loss": 0.20882295072078705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007622711663422735, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6113600.0, + "repeat_count": 0.0, + "routers_loss": 0.0007613401976414025, + "skip_count": 0.0, + "step": 3792, + "text_loss": 0.31751760840415955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007620075985831375, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 6116916.0, + "repeat_count": 0.0, + "routers_loss": 0.005452962126582861, + "skip_count": 2.0, + "step": 3794, + "text_loss": 0.3246645927429199 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 17.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007617439304221956, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 6120056.0, + "repeat_count": 2.0, + "routers_loss": 0.0043787881731987, + "skip_count": 0.0, + "step": 3796, + "text_loss": 0.4859195947647095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0007614801619604856, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6122668.0, + "repeat_count": 0.0, + "routers_loss": 0.0033891722559928894, + "skip_count": 0.0, + "step": 3798, + "text_loss": 0.48194369673728943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0007612162932990845, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6126792.0, + "repeat_count": 0.0, + "routers_loss": 0.001883238204754889, + "skip_count": 0.0, + "step": 3800, + "text_loss": 0.3740062117576599 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007609523245391068, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 6129801.0, + "repeat_count": 0.0, + "routers_loss": 0.00882677361369133, + "skip_count": 2.0, + "step": 3802, + "text_loss": 0.5759486556053162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007606882557817062, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6133613.0, + "repeat_count": 0.0, + "routers_loss": 0.009537030011415482, + "skip_count": 2.0, + "step": 3804, + "text_loss": 0.3217554986476898 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0007604240871280742, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6137784.0, + "repeat_count": 0.0, + "routers_loss": 0.0023913346230983734, + "skip_count": 0.0, + "step": 3806, + "text_loss": 0.3718445599079132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.878191957734078, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007601598186794407, + "loss": 0.0081, + "macro_f1": 0.6603773832321167, + "num_tokens": 6141356.0, + "repeat_count": 1.0, + "routers_loss": 0.033796411007642746, + "skip_count": 1.0, + "step": 3808, + "text_loss": 0.2717749774456024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000759895450537074, + "loss": 0.01, + "macro_f1": 0.6666666865348816, + "num_tokens": 6144448.0, + "repeat_count": 0.0, + "routers_loss": 0.0037919918540865183, + "skip_count": 2.0, + "step": 3810, + "text_loss": 0.5935076475143433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007596309828022803, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6147526.0, + "repeat_count": 0.0, + "routers_loss": 0.0008182782912626863, + "skip_count": 0.0, + "step": 3812, + "text_loss": 0.449336439371109 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0007593664155764044, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6150620.0, + "repeat_count": 1.0, + "routers_loss": 0.001734903547912836, + "skip_count": 0.0, + "step": 3814, + "text_loss": 0.6647221446037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.915761667155856, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0007591017489608286, + "loss": 0.0088, + "macro_f1": 0.3272727429866791, + "num_tokens": 6153714.0, + "repeat_count": 1.0, + "routers_loss": 0.04721754416823387, + "skip_count": 0.0, + "step": 3816, + "text_loss": 0.25481200218200684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007588369830569738, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6156974.0, + "repeat_count": 0.0, + "routers_loss": 0.0002484306460246444, + "skip_count": 0.0, + "step": 3818, + "text_loss": 0.7195295691490173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007585721179662988, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6159660.0, + "repeat_count": 0.0, + "routers_loss": 0.0051363613456487656, + "skip_count": 2.0, + "step": 3820, + "text_loss": 0.5073586702346802 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007583071537903005, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6163146.0, + "repeat_count": 0.0, + "routers_loss": 0.006719176657497883, + "skip_count": 0.0, + "step": 3822, + "text_loss": 0.6950558423995972 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0007580420906305136, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 6166257.0, + "repeat_count": 1.0, + "routers_loss": 0.00871267355978489, + "skip_count": 3.0, + "step": 3824, + "text_loss": 0.2549148201942444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0007577769285885109, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 6169624.0, + "repeat_count": 0.0, + "routers_loss": 0.0015642556827515364, + "skip_count": 0.0, + "step": 3826, + "text_loss": 0.3720305860042572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0007575116677659029, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6172673.0, + "repeat_count": 0.0, + "routers_loss": 0.0011551049537956715, + "skip_count": 0.0, + "step": 3828, + "text_loss": 0.6819429397583008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0007572463082643377, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 6175414.0, + "repeat_count": 0.0, + "routers_loss": 0.0008922060951590538, + "skip_count": 0.0, + "step": 3830, + "text_loss": 0.5424665212631226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007569808501855023, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 6178701.0, + "repeat_count": 0.0, + "routers_loss": 0.004167596809566021, + "skip_count": 1.0, + "step": 3832, + "text_loss": 0.4429764151573181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.00075671529363112, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 6183036.0, + "repeat_count": 0.0, + "routers_loss": 0.0008732969872653484, + "skip_count": 0.0, + "step": 3834, + "text_loss": 0.8015334010124207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007564496387029531, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 6186325.0, + "repeat_count": 0.0, + "routers_loss": 0.0021374202333390713, + "skip_count": 1.0, + "step": 3836, + "text_loss": 0.4233771562576294 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000756183885502801, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6189919.0, + "repeat_count": 1.0, + "routers_loss": 0.004017227329313755, + "skip_count": 0.0, + "step": 3838, + "text_loss": 0.33691394329071045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.0007559180341325005, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6193412.0, + "repeat_count": 0.0, + "routers_loss": 0.0013120946241542697, + "skip_count": 0.0, + "step": 3840, + "text_loss": 0.14970099925994873 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 18.037569709421778, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007556520846939265, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 6196588.0, + "repeat_count": 0.0, + "routers_loss": 0.011793316341936588, + "skip_count": 2.0, + "step": 3842, + "text_loss": 0.2714047133922577 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0007553860372889914, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 6200841.0, + "repeat_count": 1.0, + "routers_loss": 0.019968654960393906, + "skip_count": 4.0, + "step": 3844, + "text_loss": 0.23680976033210754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 18.05635456413267, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052490234375, + "learning_rate": 0.0007551198920196452, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 6203797.0, + "repeat_count": 0.0, + "routers_loss": 0.013615630567073822, + "skip_count": 2.0, + "step": 3846, + "text_loss": 0.25839608907699585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.000754853648987875, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6206790.0, + "repeat_count": 0.0, + "routers_loss": 0.002420815173536539, + "skip_count": 1.0, + "step": 3848, + "text_loss": 0.5358025431632996 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 18.07513941884356, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007545873082957057, + "loss": 0.0072, + "macro_f1": 0.9265305995941162, + "num_tokens": 6209791.0, + "repeat_count": 1.0, + "routers_loss": 0.018236197531223297, + "skip_count": 3.0, + "step": 3850, + "text_loss": 0.1463700383901596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0007543208700451998, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6212792.0, + "repeat_count": 0.0, + "routers_loss": 0.006242573726922274, + "skip_count": 3.0, + "step": 3852, + "text_loss": 0.9441591501235962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.093924273554446, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007540543343384565, + "loss": 0.0062, + "macro_f1": 0.3272727429866791, + "num_tokens": 6215747.0, + "repeat_count": 0.0, + "routers_loss": 0.01451140083372593, + "skip_count": 1.0, + "step": 3854, + "text_loss": 0.41610902547836304 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007537877012776132, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6218593.0, + "repeat_count": 0.0, + "routers_loss": 0.00037674361374229193, + "skip_count": 0.0, + "step": 3856, + "text_loss": 0.6048852205276489 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0007535209709648439, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 6221315.0, + "repeat_count": 1.0, + "routers_loss": 0.005776284262537956, + "skip_count": 3.0, + "step": 3858, + "text_loss": 0.35627537965774536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0007532541435023605, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 6225012.0, + "repeat_count": 0.0, + "routers_loss": 0.0009280376834794879, + "skip_count": 0.0, + "step": 3860, + "text_loss": 0.6440183520317078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0007529872189924114, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6227650.0, + "repeat_count": 0.0, + "routers_loss": 0.0009876530384644866, + "skip_count": 0.0, + "step": 3862, + "text_loss": 0.35507893562316895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.14088641033167, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0007527201975372827, + "loss": 0.0045, + "macro_f1": 0.6603773832321167, + "num_tokens": 6230557.0, + "repeat_count": 1.0, + "routers_loss": 0.013780162669718266, + "skip_count": 1.0, + "step": 3864, + "text_loss": 0.38958442211151123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007524530792392977, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 6233371.0, + "repeat_count": 0.0, + "routers_loss": 0.004849869292229414, + "skip_count": 3.0, + "step": 3866, + "text_loss": 0.3826720714569092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0007521858642008163, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6236770.0, + "repeat_count": 0.0, + "routers_loss": 0.008618295192718506, + "skip_count": 1.0, + "step": 3868, + "text_loss": 0.3596078157424927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0007519185525242363, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6239661.0, + "repeat_count": 0.0, + "routers_loss": 0.0013421972980722785, + "skip_count": 0.0, + "step": 3870, + "text_loss": 0.5585550665855408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0007516511443119916, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 6242459.0, + "repeat_count": 0.0, + "routers_loss": 0.0038009448908269405, + "skip_count": 1.0, + "step": 3872, + "text_loss": 0.4418395757675171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007513836396665534, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 6245489.0, + "repeat_count": 1.0, + "routers_loss": 0.002785376040264964, + "skip_count": 2.0, + "step": 3874, + "text_loss": 0.551510751247406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0007511160386904305, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6249014.0, + "repeat_count": 0.0, + "routers_loss": 0.0021424589212983847, + "skip_count": 1.0, + "step": 3876, + "text_loss": 1.0502676963806152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0007508483414861679, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6252357.0, + "repeat_count": 0.0, + "routers_loss": 0.0085759861394763, + "skip_count": 1.0, + "step": 3878, + "text_loss": 0.49212515354156494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007505805481563477, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6254975.0, + "repeat_count": 0.0, + "routers_loss": 0.0010723904706537724, + "skip_count": 0.0, + "step": 3880, + "text_loss": 0.7022985816001892 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0007503126588035887, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 6258001.0, + "repeat_count": 1.0, + "routers_loss": 0.012809890322387218, + "skip_count": 2.0, + "step": 3882, + "text_loss": 0.1829151213169098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007500446735305466, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 6261795.0, + "repeat_count": 0.0, + "routers_loss": 0.0026790346018970013, + "skip_count": 1.0, + "step": 3884, + "text_loss": 0.20436066389083862 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.000749776592439914, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 6265585.0, + "repeat_count": 1.0, + "routers_loss": 0.005243788007646799, + "skip_count": 2.0, + "step": 3886, + "text_loss": 0.4479229748249054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00074950841563442, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 6269039.0, + "repeat_count": 0.0, + "routers_loss": 0.007998534478247166, + "skip_count": 1.0, + "step": 3888, + "text_loss": 0.2154676914215088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0007492401432168303, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6272315.0, + "repeat_count": 0.0, + "routers_loss": 0.004648822825402021, + "skip_count": 1.0, + "step": 3890, + "text_loss": 0.3375042676925659 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.272380393307895, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007489717752899477, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 6275342.0, + "repeat_count": 0.0, + "routers_loss": 0.012154200114309788, + "skip_count": 1.0, + "step": 3892, + "text_loss": 0.1964082419872284 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.000748703311956611, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6278700.0, + "repeat_count": 1.0, + "routers_loss": 0.004610476549714804, + "skip_count": 2.0, + "step": 3894, + "text_loss": 0.26545581221580505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0007484347533196961, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 6281864.0, + "repeat_count": 0.0, + "routers_loss": 0.0075586591847240925, + "skip_count": 2.0, + "step": 3896, + "text_loss": 0.3106999397277832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0007481660994821151, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6284676.0, + "repeat_count": 0.0, + "routers_loss": 0.007845268584787846, + "skip_count": 1.0, + "step": 3898, + "text_loss": 0.4094304144382477 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007478973505468165, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 6287470.0, + "repeat_count": 1.0, + "routers_loss": 0.011116391979157925, + "skip_count": 2.0, + "step": 3900, + "text_loss": 0.1838909536600113 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007476285066167857, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 6290432.0, + "repeat_count": 1.0, + "routers_loss": 0.004599364474415779, + "skip_count": 0.0, + "step": 3902, + "text_loss": 0.25872838497161865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0007473595677950439, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 6293557.0, + "repeat_count": 0.0, + "routers_loss": 0.0016367282951250672, + "skip_count": 1.0, + "step": 3904, + "text_loss": 0.5272360444068909 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007470905341846492, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 6295979.0, + "repeat_count": 0.0, + "routers_loss": 0.0004760588926728815, + "skip_count": 0.0, + "step": 3906, + "text_loss": 0.666959822177887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007468214058886956, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6299215.0, + "repeat_count": 0.0, + "routers_loss": 0.000524883100297302, + "skip_count": 0.0, + "step": 3908, + "text_loss": 0.5144801139831543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007465521830103137, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6302320.0, + "repeat_count": 0.0, + "routers_loss": 0.0016085522947832942, + "skip_count": 0.0, + "step": 3910, + "text_loss": 0.14342890679836273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007462828656526702, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6305212.0, + "repeat_count": 0.0, + "routers_loss": 0.002720315707847476, + "skip_count": 2.0, + "step": 3912, + "text_loss": 0.31109121441841125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0007460134539189681, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 6308964.0, + "repeat_count": 0.0, + "routers_loss": 0.0010418406454846263, + "skip_count": 1.0, + "step": 3914, + "text_loss": 0.5662030577659607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0007457439479124459, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 6313195.0, + "repeat_count": 0.0, + "routers_loss": 0.0020303844939917326, + "skip_count": 0.0, + "step": 3916, + "text_loss": 0.6358339190483093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007454743477363797, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 6315949.0, + "repeat_count": 0.0, + "routers_loss": 0.0006592223653569818, + "skip_count": 0.0, + "step": 3918, + "text_loss": 0.35648423433303833 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.403874376284122, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007452046534940803, + "loss": 0.0075, + "macro_f1": 0.6603773832321167, + "num_tokens": 6319024.0, + "repeat_count": 1.0, + "routers_loss": 0.024555351585149765, + "skip_count": 1.0, + "step": 3920, + "text_loss": 0.21955153346061707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0007449348652888952, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6321633.0, + "repeat_count": 0.0, + "routers_loss": 0.003606822807341814, + "skip_count": 1.0, + "step": 3922, + "text_loss": 0.6079489588737488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007446649832242075, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 6325209.0, + "repeat_count": 0.0, + "routers_loss": 0.0035831446293741465, + "skip_count": 1.0, + "step": 3924, + "text_loss": 0.2774808406829834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0007443950074034368, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6327822.0, + "repeat_count": 0.0, + "routers_loss": 0.006809544749557972, + "skip_count": 2.0, + "step": 3926, + "text_loss": 0.48236769437789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.4414440857059, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0007441249379300381, + "loss": 0.007, + "macro_f1": 0.6601307392120361, + "num_tokens": 6331662.0, + "repeat_count": 1.0, + "routers_loss": 0.023832591250538826, + "skip_count": 2.0, + "step": 3928, + "text_loss": 0.7287537455558777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007438547749075028, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 6335801.0, + "repeat_count": 1.0, + "routers_loss": 0.011755098588764668, + "skip_count": 3.0, + "step": 3930, + "text_loss": 0.17253030836582184 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0007435845184393577, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6338747.0, + "repeat_count": 1.0, + "routers_loss": 0.005972472485154867, + "skip_count": 0.0, + "step": 3932, + "text_loss": 0.6400216817855835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007433141686291657, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 6342772.0, + "repeat_count": 0.0, + "routers_loss": 0.0030393085908144712, + "skip_count": 1.0, + "step": 3934, + "text_loss": 0.6865074038505554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0007430437255805252, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6345957.0, + "repeat_count": 0.0, + "routers_loss": 0.0006984061910770833, + "skip_count": 0.0, + "step": 3936, + "text_loss": 0.40398702025413513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0007427731893970706, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 6349162.0, + "repeat_count": 1.0, + "routers_loss": 0.005219762213528156, + "skip_count": 0.0, + "step": 3938, + "text_loss": 0.5951031446456909 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007425025601824717, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 6352655.0, + "repeat_count": 0.0, + "routers_loss": 0.015575960278511047, + "skip_count": 3.0, + "step": 3940, + "text_loss": 0.26689088344573975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007422318380404346, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6355890.0, + "repeat_count": 0.0, + "routers_loss": 0.0012208883417770267, + "skip_count": 0.0, + "step": 3942, + "text_loss": 0.570725679397583 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0007419610230746999, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6358891.0, + "repeat_count": 1.0, + "routers_loss": 0.0029412026051431894, + "skip_count": 0.0, + "step": 3944, + "text_loss": 0.5521301031112671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007416901153890448, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6361586.0, + "repeat_count": 0.0, + "routers_loss": 0.0010283910669386387, + "skip_count": 0.0, + "step": 3946, + "text_loss": 0.4046417772769928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0007414191150872818, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6364954.0, + "repeat_count": 0.0, + "routers_loss": 0.008222512900829315, + "skip_count": 2.0, + "step": 3948, + "text_loss": 0.2803446352481842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007411480222732583, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6367660.0, + "repeat_count": 0.0, + "routers_loss": 0.001304348581470549, + "skip_count": 0.0, + "step": 3950, + "text_loss": 0.45553359389305115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007408768370508576, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6371585.0, + "repeat_count": 0.0, + "routers_loss": 0.0016345062758773565, + "skip_count": 0.0, + "step": 3952, + "text_loss": 0.25424402952194214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007406055595239986, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6374365.0, + "repeat_count": 0.0, + "routers_loss": 0.0005097290268167853, + "skip_count": 0.0, + "step": 3954, + "text_loss": 0.5856026411056519 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0007403341897966356, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6377335.0, + "repeat_count": 0.0, + "routers_loss": 0.002482263371348381, + "skip_count": 1.0, + "step": 3956, + "text_loss": 0.5145615339279175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0007400627279727574, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 6380799.0, + "repeat_count": 0.0, + "routers_loss": 0.0011743451468646526, + "skip_count": 0.0, + "step": 3958, + "text_loss": 0.31868961453437805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007397911741563892, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6383963.0, + "repeat_count": 1.0, + "routers_loss": 0.009861881844699383, + "skip_count": 0.0, + "step": 3960, + "text_loss": 0.21192194521427155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007395195284515905, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 6387410.0, + "repeat_count": 1.0, + "routers_loss": 0.004189098719507456, + "skip_count": 0.0, + "step": 3962, + "text_loss": 0.5809708833694458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007392477909624567, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6390670.0, + "repeat_count": 0.0, + "routers_loss": 0.001853612600825727, + "skip_count": 0.0, + "step": 3964, + "text_loss": 0.48985618352890015 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0007389759617931182, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6393609.0, + "repeat_count": 1.0, + "routers_loss": 0.003303771372884512, + "skip_count": 0.0, + "step": 3966, + "text_loss": 0.28729453682899475 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.629292632814792, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007387040410477404, + "loss": 0.0058, + "macro_f1": 0.9452888369560242, + "num_tokens": 6396608.0, + "repeat_count": 1.0, + "routers_loss": 0.01791577786207199, + "skip_count": 4.0, + "step": 3968, + "text_loss": 0.30386820435523987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0007384320288305235, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6399793.0, + "repeat_count": 0.0, + "routers_loss": 0.0005771282012574375, + "skip_count": 0.0, + "step": 3970, + "text_loss": 0.47285011410713196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0007381599252457037, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6403365.0, + "repeat_count": 0.0, + "routers_loss": 0.003010645741596818, + "skip_count": 0.0, + "step": 3972, + "text_loss": 0.5313063859939575 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000737887730397551, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6406205.0, + "repeat_count": 1.0, + "routers_loss": 0.006457438692450523, + "skip_count": 0.0, + "step": 3974, + "text_loss": 0.2323843240737915 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007376154443903713, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6409552.0, + "repeat_count": 1.0, + "routers_loss": 0.010693981312215328, + "skip_count": 0.0, + "step": 3976, + "text_loss": 0.6304101943969727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.676254769592017, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007373430673285051, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 6412386.0, + "repeat_count": 1.0, + "routers_loss": 0.03116440214216709, + "skip_count": 0.0, + "step": 3978, + "text_loss": 0.23448467254638672 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.68564719694746, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007370705993163278, + "loss": 0.0111, + "macro_f1": 0.3272727429866791, + "num_tokens": 6416054.0, + "repeat_count": 1.0, + "routers_loss": 0.011973714455962181, + "skip_count": 0.0, + "step": 3980, + "text_loss": 0.6371755599975586 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007367980404582497, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 6419238.0, + "repeat_count": 1.0, + "routers_loss": 0.005117347463965416, + "skip_count": 2.0, + "step": 3982, + "text_loss": 0.19822923839092255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0007365253908587158, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 6422122.0, + "repeat_count": 0.0, + "routers_loss": 0.0010648667812347412, + "skip_count": 0.0, + "step": 3984, + "text_loss": 0.566700279712677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0007362526506222058, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6425313.0, + "repeat_count": 0.0, + "routers_loss": 0.005726494826376438, + "skip_count": 0.0, + "step": 3986, + "text_loss": 0.6568437814712524 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007359798198532343, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 6428422.0, + "repeat_count": 1.0, + "routers_loss": 0.004504100419580936, + "skip_count": 0.0, + "step": 3988, + "text_loss": 0.598754346370697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007357068986563509, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 6431512.0, + "repeat_count": 0.0, + "routers_loss": 0.0019837068393826485, + "skip_count": 1.0, + "step": 3990, + "text_loss": 0.7152895927429199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007354338871361393, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6434358.0, + "repeat_count": 0.0, + "routers_loss": 0.0026031541638076305, + "skip_count": 1.0, + "step": 3992, + "text_loss": 0.4986513555049896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.000735160785397218, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6438175.0, + "repeat_count": 0.0, + "routers_loss": 0.0024831905029714108, + "skip_count": 2.0, + "step": 3994, + "text_loss": 0.4406205713748932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007348875935442401, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6441228.0, + "repeat_count": 0.0, + "routers_loss": 0.0008635876583866775, + "skip_count": 0.0, + "step": 3996, + "text_loss": 0.48884135484695435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007346143116818932, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6444318.0, + "repeat_count": 0.0, + "routers_loss": 0.004007008858025074, + "skip_count": 0.0, + "step": 3998, + "text_loss": 0.6669428944587708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0007343409399148994, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6448317.0, + "repeat_count": 0.0, + "routers_loss": 0.0031380734872072935, + "skip_count": 0.0, + "step": 4000, + "text_loss": 0.6468493938446045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0007340674783480154, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 6451673.0, + "repeat_count": 0.0, + "routers_loss": 0.004996029660105705, + "skip_count": 0.0, + "step": 4002, + "text_loss": 0.28135430812835693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.798356325212797, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007337939270860323, + "loss": 0.009, + "macro_f1": 0.3272727429866791, + "num_tokens": 6456372.0, + "repeat_count": 1.0, + "routers_loss": 0.03784399852156639, + "skip_count": 0.0, + "step": 4004, + "text_loss": 0.41668644547462463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007335202862337753, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6459047.0, + "repeat_count": 0.0, + "routers_loss": 0.0011750755365937948, + "skip_count": 0.0, + "step": 4006, + "text_loss": 0.6853910684585571 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.817141179923688, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.05908203125, + "learning_rate": 0.000733246555896104, + "loss": 0.0062, + "macro_f1": 0.9452888369560242, + "num_tokens": 6462390.0, + "repeat_count": 1.0, + "routers_loss": 0.01630394533276558, + "skip_count": 4.0, + "step": 4008, + "text_loss": 0.7110592126846313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0007329727361779124, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6466057.0, + "repeat_count": 0.0, + "routers_loss": 0.0052404399029910564, + "skip_count": 2.0, + "step": 4010, + "text_loss": 0.13856995105743408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.000732698827184129, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6468878.0, + "repeat_count": 0.0, + "routers_loss": 0.002138581359758973, + "skip_count": 0.0, + "step": 4012, + "text_loss": 0.3999565839767456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000732424829019716, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6472364.0, + "repeat_count": 0.0, + "routers_loss": 0.0037466560024768114, + "skip_count": 0.0, + "step": 4014, + "text_loss": 0.28161346912384033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007321507417896699, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 6475379.0, + "repeat_count": 0.0, + "routers_loss": 0.0010469373082742095, + "skip_count": 0.0, + "step": 4016, + "text_loss": 1.0490952730178833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0007318765655990218, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 6478585.0, + "repeat_count": 0.0, + "routers_loss": 0.009968385100364685, + "skip_count": 2.0, + "step": 4018, + "text_loss": 0.31696680188179016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0007316023005528362, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 6484153.0, + "repeat_count": 0.0, + "routers_loss": 0.002349073765799403, + "skip_count": 1.0, + "step": 4020, + "text_loss": 0.30981555581092834 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.8828881714118, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0007313279467562124, + "loss": 0.0053, + "macro_f1": 0.9452888369560242, + "num_tokens": 6487029.0, + "repeat_count": 1.0, + "routers_loss": 0.011854278855025768, + "skip_count": 4.0, + "step": 4022, + "text_loss": 0.9689550399780273 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007310535043142829, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 6490315.0, + "repeat_count": 1.0, + "routers_loss": 0.00908346101641655, + "skip_count": 3.0, + "step": 4024, + "text_loss": 0.1705625057220459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0007307789733322146, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 6493921.0, + "repeat_count": 0.0, + "routers_loss": 0.0007360641611739993, + "skip_count": 0.0, + "step": 4026, + "text_loss": 0.6252996325492859 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.087890625, + "learning_rate": 0.0007305043539152083, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6496689.0, + "repeat_count": 0.0, + "routers_loss": 0.0017757206223905087, + "skip_count": 0.0, + "step": 4028, + "text_loss": 0.40533265471458435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000730229646168499, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6500090.0, + "repeat_count": 0.0, + "routers_loss": 0.0022657213266938925, + "skip_count": 0.0, + "step": 4030, + "text_loss": 0.25954708456993103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007299548501973548, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6503023.0, + "repeat_count": 0.0, + "routers_loss": 0.0021747269202023745, + "skip_count": 0.0, + "step": 4032, + "text_loss": 0.6223418712615967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 18.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0007296799661070782, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6506382.0, + "repeat_count": 0.0, + "routers_loss": 0.006400502752512693, + "skip_count": 4.0, + "step": 4034, + "text_loss": 0.6873653531074524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.94863516289991, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0007294049940030055, + "loss": 0.0065, + "macro_f1": 0.3272727429866791, + "num_tokens": 6509194.0, + "repeat_count": 0.0, + "routers_loss": 0.0197185929864645, + "skip_count": 1.0, + "step": 4036, + "text_loss": 0.16156800091266632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007291299339905059, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6512271.0, + "repeat_count": 0.0, + "routers_loss": 0.0009541353792883456, + "skip_count": 0.0, + "step": 4038, + "text_loss": 0.5038442015647888 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007288547861749838, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 6516403.0, + "repeat_count": 0.0, + "routers_loss": 0.008226391859352589, + "skip_count": 2.0, + "step": 4040, + "text_loss": 0.3706657588481903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007285795506618758, + "loss": 0.0063, + "macro_f1": 0.3272727429866791, + "num_tokens": 6519310.0, + "repeat_count": 0.0, + "routers_loss": 0.017001887783408165, + "skip_count": 1.0, + "step": 4042, + "text_loss": 0.24296723306179047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0007283042275566528, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 6521979.0, + "repeat_count": 0.0, + "routers_loss": 0.01666323095560074, + "skip_count": 2.0, + "step": 4044, + "text_loss": 0.36904850602149963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0007280288169648192, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 6524976.0, + "repeat_count": 0.0, + "routers_loss": 0.0007593175978399813, + "skip_count": 0.0, + "step": 4046, + "text_loss": 0.7312731146812439 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 19.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0007277533189919127, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 6528638.0, + "repeat_count": 1.0, + "routers_loss": 0.005652119871228933, + "skip_count": 1.0, + "step": 4048, + "text_loss": 0.23326151072978973 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007274777337435046, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6532193.0, + "repeat_count": 0.0, + "routers_loss": 0.010509157553315163, + "skip_count": 2.0, + "step": 4050, + "text_loss": 0.23918013274669647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007272020613251999, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 6534994.0, + "repeat_count": 0.0, + "routers_loss": 0.002153293928131461, + "skip_count": 0.0, + "step": 4052, + "text_loss": 0.5890526175498962 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0007269263018426367, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 6537469.0, + "repeat_count": 1.0, + "routers_loss": 0.0018494052346795797, + "skip_count": 2.0, + "step": 4054, + "text_loss": 0.36058738827705383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0007266504554014866, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6541271.0, + "repeat_count": 0.0, + "routers_loss": 0.0007579320226795971, + "skip_count": 0.0, + "step": 4056, + "text_loss": 0.4089007079601288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.051658350454947, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007263745221074545, + "loss": 0.0086, + "macro_f1": 0.6601307392120361, + "num_tokens": 6544293.0, + "repeat_count": 1.0, + "routers_loss": 0.06202420964837074, + "skip_count": 2.0, + "step": 4058, + "text_loss": 0.2226305454969406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 19.06105077781039, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007260985020662784, + "loss": 0.0049, + "macro_f1": 0.5934640765190125, + "num_tokens": 6547640.0, + "repeat_count": 0.0, + "routers_loss": 0.044639844447374344, + "skip_count": 3.0, + "step": 4060, + "text_loss": 0.23004353046417236 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.0007258223953837298, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 6550840.0, + "repeat_count": 1.0, + "routers_loss": 0.004215611144900322, + "skip_count": 0.0, + "step": 4062, + "text_loss": 0.2891770601272583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0007255462021656132, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6554122.0, + "repeat_count": 0.0, + "routers_loss": 0.0011056234361603856, + "skip_count": 0.0, + "step": 4064, + "text_loss": 0.7485370635986328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007252699225177666, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6557138.0, + "repeat_count": 0.0, + "routers_loss": 0.008258933201432228, + "skip_count": 2.0, + "step": 4066, + "text_loss": 0.25219282507896423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007249935565460606, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6560654.0, + "repeat_count": 0.0, + "routers_loss": 0.005102175287902355, + "skip_count": 0.0, + "step": 4068, + "text_loss": 0.5553314089775085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007247171043563994, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6563814.0, + "repeat_count": 0.0, + "routers_loss": 0.01283820066601038, + "skip_count": 2.0, + "step": 4070, + "text_loss": 0.15729956328868866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0007244405660547199, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6567060.0, + "repeat_count": 0.0, + "routers_loss": 0.0009684927063062787, + "skip_count": 0.0, + "step": 4072, + "text_loss": 0.3725031912326813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01953125, + "learning_rate": 0.000724163941746992, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6571608.0, + "repeat_count": 0.0, + "routers_loss": 0.0007890827837400138, + "skip_count": 0.0, + "step": 4074, + "text_loss": 0.8438301682472229 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 19.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0007238872315392189, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 6575214.0, + "repeat_count": 1.0, + "routers_loss": 0.0040600355714559555, + "skip_count": 1.0, + "step": 4076, + "text_loss": 0.5923112034797668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0007236104355374363, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 6578383.0, + "repeat_count": 0.0, + "routers_loss": 0.0024899677373468876, + "skip_count": 2.0, + "step": 4078, + "text_loss": 0.20302526652812958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.000723333553847713, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6582175.0, + "repeat_count": 0.0, + "routers_loss": 0.006120906211435795, + "skip_count": 2.0, + "step": 4080, + "text_loss": 0.5400223731994629 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0007230565865761504, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 6585516.0, + "repeat_count": 0.0, + "routers_loss": 0.0029941233806312084, + "skip_count": 0.0, + "step": 4082, + "text_loss": 0.19460804760456085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0007227795338288831, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 6588266.0, + "repeat_count": 0.0, + "routers_loss": 0.009357884526252747, + "skip_count": 2.0, + "step": 4084, + "text_loss": 0.35237613320350647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007225023957120782, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 6591009.0, + "repeat_count": 0.0, + "routers_loss": 0.0023083325941115618, + "skip_count": 2.0, + "step": 4086, + "text_loss": 0.4336731433868408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0007222251723319356, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 6594472.0, + "repeat_count": 0.0, + "routers_loss": 0.0008416616474278271, + "skip_count": 0.0, + "step": 4088, + "text_loss": 0.6390535831451416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0007219478637946877, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6597477.0, + "repeat_count": 0.0, + "routers_loss": 0.004390760324895382, + "skip_count": 1.0, + "step": 4090, + "text_loss": 0.525839626789093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0007216704702065997, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 6600431.0, + "repeat_count": 0.0, + "routers_loss": 0.0010311100631952286, + "skip_count": 0.0, + "step": 4092, + "text_loss": 0.5310423374176025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0007213929916739695, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 6603899.0, + "repeat_count": 0.0, + "routers_loss": 0.0032497600186616182, + "skip_count": 1.0, + "step": 4094, + "text_loss": 0.2775326073169708 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000721115428303127, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 6606544.0, + "repeat_count": 1.0, + "routers_loss": 0.004692315589636564, + "skip_count": 3.0, + "step": 4096, + "text_loss": 0.6667124032974243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007208377802004353, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6610097.0, + "repeat_count": 0.0, + "routers_loss": 0.0007263485458679497, + "skip_count": 0.0, + "step": 4098, + "text_loss": 0.6916406750679016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007205600474722897, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6613836.0, + "repeat_count": 0.0, + "routers_loss": 0.0017989488551393151, + "skip_count": 0.0, + "step": 4100, + "text_loss": 0.5257929563522339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000720282230225118, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6616780.0, + "repeat_count": 0.0, + "routers_loss": 0.0011308686807751656, + "skip_count": 1.0, + "step": 4102, + "text_loss": 0.4410906732082367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0007200043285653799, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6620110.0, + "repeat_count": 0.0, + "routers_loss": 0.002058265497907996, + "skip_count": 2.0, + "step": 4104, + "text_loss": 0.8581191897392273 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007197263425995681, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 6622585.0, + "repeat_count": 1.0, + "routers_loss": 0.0017528717871755362, + "skip_count": 0.0, + "step": 4106, + "text_loss": 0.5000449419021606 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0007194482724342075, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6626356.0, + "repeat_count": 0.0, + "routers_loss": 0.0021995846182107925, + "skip_count": 0.0, + "step": 4108, + "text_loss": 0.401346892118454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007191701181758547, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6629738.0, + "repeat_count": 0.0, + "routers_loss": 0.0014869922306388617, + "skip_count": 0.0, + "step": 4110, + "text_loss": 0.9598422050476074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0007188918799310993, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 6632807.0, + "repeat_count": 0.0, + "routers_loss": 0.0012853415682911873, + "skip_count": 0.0, + "step": 4112, + "text_loss": 0.3996548354625702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0007186135578065627, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6636227.0, + "repeat_count": 0.0, + "routers_loss": 0.0009887361666187644, + "skip_count": 0.0, + "step": 4114, + "text_loss": 0.4127283990383148 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007183351519088982, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6639443.0, + "repeat_count": 0.0, + "routers_loss": 0.006282114889472723, + "skip_count": 1.0, + "step": 4116, + "text_loss": 0.20028606057167053 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.333431171118285, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0007180566623447917, + "loss": 0.0114, + "macro_f1": 0.6603773832321167, + "num_tokens": 6642127.0, + "repeat_count": 1.0, + "routers_loss": 0.008101986721158028, + "skip_count": 0.0, + "step": 4118, + "text_loss": 0.763931155204773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0007177780892209607, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6645376.0, + "repeat_count": 0.0, + "routers_loss": 0.001953610684722662, + "skip_count": 0.0, + "step": 4120, + "text_loss": 0.42317715287208557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007174994326441551, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6648150.0, + "repeat_count": 0.0, + "routers_loss": 0.003279355587437749, + "skip_count": 0.0, + "step": 4122, + "text_loss": 0.19656142592430115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007172206927211567, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6650935.0, + "repeat_count": 0.0, + "routers_loss": 0.0032076311763375998, + "skip_count": 0.0, + "step": 4124, + "text_loss": 0.13608409464359283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0007169418695587791, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6654464.0, + "repeat_count": 0.0, + "routers_loss": 0.004065621178597212, + "skip_count": 2.0, + "step": 4126, + "text_loss": 0.4882086217403412 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007166629632638678, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6657749.0, + "repeat_count": 0.0, + "routers_loss": 0.0009243001695722342, + "skip_count": 0.0, + "step": 4128, + "text_loss": 0.31632331013679504 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0007163839739433003, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6660997.0, + "repeat_count": 0.0, + "routers_loss": 0.0018459554994478822, + "skip_count": 0.0, + "step": 4130, + "text_loss": 0.6123947501182556 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.399178162606397, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0007161049017039857, + "loss": 0.0073, + "macro_f1": 0.8820862174034119, + "num_tokens": 6663542.0, + "repeat_count": 2.0, + "routers_loss": 0.030032536014914513, + "skip_count": 2.0, + "step": 4132, + "text_loss": 0.6985659003257751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0007158257466528652, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6666178.0, + "repeat_count": 0.0, + "routers_loss": 0.0013813833938911557, + "skip_count": 0.0, + "step": 4134, + "text_loss": 0.38380664587020874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0007155465088969114, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 6668852.0, + "repeat_count": 0.0, + "routers_loss": 0.00513424864038825, + "skip_count": 3.0, + "step": 4136, + "text_loss": 0.49724283814430237 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0007152671885431288, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 6671430.0, + "repeat_count": 0.0, + "routers_loss": 0.0005165594047866762, + "skip_count": 0.0, + "step": 4138, + "text_loss": 0.666959822177887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0007149877856985535, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6675215.0, + "repeat_count": 0.0, + "routers_loss": 0.001685218419879675, + "skip_count": 0.0, + "step": 4140, + "text_loss": 0.3127259612083435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.000714708300470253, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6678505.0, + "repeat_count": 0.0, + "routers_loss": 0.004025314934551716, + "skip_count": 0.0, + "step": 4142, + "text_loss": 0.3179470896720886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007144287329653269, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 6681127.0, + "repeat_count": 1.0, + "routers_loss": 0.005965690594166517, + "skip_count": 0.0, + "step": 4144, + "text_loss": 0.3862907886505127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.464925154094512, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007141490832909058, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 6683968.0, + "repeat_count": 0.0, + "routers_loss": 0.012896374799311161, + "skip_count": 1.0, + "step": 4146, + "text_loss": 0.48156118392944336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007138693515541519, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6687196.0, + "repeat_count": 0.0, + "routers_loss": 0.0006367767928168178, + "skip_count": 1.0, + "step": 4148, + "text_loss": 0.676702082157135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0007135895378622592, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 6689972.0, + "repeat_count": 0.0, + "routers_loss": 0.004532640799880028, + "skip_count": 3.0, + "step": 4150, + "text_loss": 0.5865558981895447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.493102436160846, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007133096423224526, + "loss": 0.0081, + "macro_f1": 0.3272727429866791, + "num_tokens": 6693568.0, + "repeat_count": 1.0, + "routers_loss": 0.0377078577876091, + "skip_count": 0.0, + "step": 4152, + "text_loss": 0.2790502607822418 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0007130296650419885, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6696468.0, + "repeat_count": 0.0, + "routers_loss": 0.004455826710909605, + "skip_count": 1.0, + "step": 4154, + "text_loss": 0.5869500041007996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0007127496061281551, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6699307.0, + "repeat_count": 0.0, + "routers_loss": 0.001998464809730649, + "skip_count": 0.0, + "step": 4156, + "text_loss": 0.6931945085525513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 19.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007124694656882713, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 6702647.0, + "repeat_count": 3.0, + "routers_loss": 0.004117495380342007, + "skip_count": 0.0, + "step": 4158, + "text_loss": 0.4325876832008362 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0007121892438296874, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6705964.0, + "repeat_count": 0.0, + "routers_loss": 0.0014713290147483349, + "skip_count": 0.0, + "step": 4160, + "text_loss": 0.3672060966491699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007119089406597849, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6710182.0, + "repeat_count": 0.0, + "routers_loss": 0.0037311650812625885, + "skip_count": 1.0, + "step": 4162, + "text_loss": 0.6643805503845215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007116285562859767, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6713410.0, + "repeat_count": 0.0, + "routers_loss": 0.006017287727445364, + "skip_count": 0.0, + "step": 4164, + "text_loss": 0.4606415927410126 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.55884942764896, + "f1_execute": 0.9545454382896423, + "f1_repeat": 0.5, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007113480908157065, + "loss": 0.0108, + "macro_f1": 0.8181818723678589, + "num_tokens": 6716056.0, + "repeat_count": 3.0, + "routers_loss": 0.08640352636575699, + "skip_count": 4.0, + "step": 4166, + "text_loss": 0.3139408528804779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0007110675443564491, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6719497.0, + "repeat_count": 0.0, + "routers_loss": 0.0012731150491163135, + "skip_count": 0.0, + "step": 4168, + "text_loss": 0.7283861637115479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007107869170157108, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 6722297.0, + "repeat_count": 0.0, + "routers_loss": 0.0021509863436222076, + "skip_count": 2.0, + "step": 4170, + "text_loss": 0.5767703056335449 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.000710506208901028, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6725762.0, + "repeat_count": 0.0, + "routers_loss": 0.00257494836114347, + "skip_count": 1.0, + "step": 4172, + "text_loss": 0.33571913838386536 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.000710225420119969, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 6728436.0, + "repeat_count": 1.0, + "routers_loss": 0.00943201594054699, + "skip_count": 3.0, + "step": 4174, + "text_loss": 0.6849368810653687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0007099445507801323, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6731427.0, + "repeat_count": 0.0, + "routers_loss": 0.01046718005090952, + "skip_count": 2.0, + "step": 4176, + "text_loss": 0.3346157670021057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007096636009891477, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6734800.0, + "repeat_count": 0.0, + "routers_loss": 0.0007813365664333105, + "skip_count": 0.0, + "step": 4178, + "text_loss": 0.49989959597587585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.000709382570854676, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6738244.0, + "repeat_count": 0.0, + "routers_loss": 0.002825600327923894, + "skip_count": 0.0, + "step": 4180, + "text_loss": 0.15744923055171967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007091014604844078, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6741695.0, + "repeat_count": 0.0, + "routers_loss": 0.0017124463338404894, + "skip_count": 0.0, + "step": 4182, + "text_loss": 0.3752405643463135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0007088202699860655, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 6744882.0, + "repeat_count": 1.0, + "routers_loss": 0.005134924780577421, + "skip_count": 3.0, + "step": 4184, + "text_loss": 0.18534569442272186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.000708538999467402, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6747811.0, + "repeat_count": 0.0, + "routers_loss": 0.002371585462242365, + "skip_count": 1.0, + "step": 4186, + "text_loss": 0.6251029968261719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007082576490362004, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6750765.0, + "repeat_count": 0.0, + "routers_loss": 0.002088436856865883, + "skip_count": 0.0, + "step": 4188, + "text_loss": 0.35471436381340027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.000707976218800275, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6754021.0, + "repeat_count": 0.0, + "routers_loss": 0.0012272283202037215, + "skip_count": 0.0, + "step": 4190, + "text_loss": 0.5737302899360657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0007076947088674701, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6756793.0, + "repeat_count": 0.0, + "routers_loss": 0.0026050808373838663, + "skip_count": 0.0, + "step": 4192, + "text_loss": 0.526336669921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.000707413119345661, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 6760221.0, + "repeat_count": 0.0, + "routers_loss": 0.0013151296880096197, + "skip_count": 0.0, + "step": 4194, + "text_loss": 0.5678895711898804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0007071314503427532, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6763721.0, + "repeat_count": 0.0, + "routers_loss": 0.001528652966953814, + "skip_count": 0.0, + "step": 4196, + "text_loss": 0.7640175223350525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0007068497019666829, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6768581.0, + "repeat_count": 0.0, + "routers_loss": 0.0019202446565032005, + "skip_count": 0.0, + "step": 4198, + "text_loss": 0.41878414154052734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0007065678743254167, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6772758.0, + "repeat_count": 0.0, + "routers_loss": 0.004667408298701048, + "skip_count": 1.0, + "step": 4200, + "text_loss": 0.3550313413143158 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 19.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0007062859675269513, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6776671.0, + "repeat_count": 3.0, + "routers_loss": 0.00568761583417654, + "skip_count": 0.0, + "step": 4202, + "text_loss": 0.1707649976015091 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007060039816793141, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6780284.0, + "repeat_count": 0.0, + "routers_loss": 0.0030401297844946384, + "skip_count": 0.0, + "step": 4204, + "text_loss": 0.2686377167701721 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 19.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007057219168905625, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 6783525.0, + "repeat_count": 1.0, + "routers_loss": 0.003353122156113386, + "skip_count": 5.0, + "step": 4206, + "text_loss": 0.5235374569892883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.000705439773268784, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6787691.0, + "repeat_count": 0.0, + "routers_loss": 0.0016532237641513348, + "skip_count": 1.0, + "step": 4208, + "text_loss": 0.5002681612968445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007051575509220972, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 6790833.0, + "repeat_count": 0.0, + "routers_loss": 0.0011808308772742748, + "skip_count": 0.0, + "step": 4210, + "text_loss": 0.7251001596450806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0007048752499586497, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 6794260.0, + "repeat_count": 0.0, + "routers_loss": 0.006246297620236874, + "skip_count": 2.0, + "step": 4212, + "text_loss": 0.2430499643087387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.00070459287048662, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6797413.0, + "repeat_count": 0.0, + "routers_loss": 0.0012964420020580292, + "skip_count": 0.0, + "step": 4214, + "text_loss": 0.48889362812042236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0007043104126142163, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6800815.0, + "repeat_count": 0.0, + "routers_loss": 0.0018109704833477736, + "skip_count": 0.0, + "step": 4216, + "text_loss": 0.5617026686668396 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 19.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0007040278764496771, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 6803937.0, + "repeat_count": 2.0, + "routers_loss": 0.0028699536342173815, + "skip_count": 1.0, + "step": 4218, + "text_loss": 0.548405647277832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007037452621012708, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6806946.0, + "repeat_count": 0.0, + "routers_loss": 0.0007951617590151727, + "skip_count": 0.0, + "step": 4220, + "text_loss": 0.5702725648880005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0007034625696772958, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6810083.0, + "repeat_count": 0.0, + "routers_loss": 0.003436052706092596, + "skip_count": 2.0, + "step": 4222, + "text_loss": 0.3898725211620331 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00070317979928608, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6812845.0, + "repeat_count": 0.0, + "routers_loss": 0.0005070401239208877, + "skip_count": 0.0, + "step": 4224, + "text_loss": 0.5244157910346985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.840622248312297, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000702896951035982, + "loss": 0.0101, + "macro_f1": 0.3272727429866791, + "num_tokens": 6815801.0, + "repeat_count": 0.0, + "routers_loss": 0.01560303382575512, + "skip_count": 1.0, + "step": 4226, + "text_loss": 0.26503118872642517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007026140250353896, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 6819464.0, + "repeat_count": 0.0, + "routers_loss": 0.009310240857303143, + "skip_count": 2.0, + "step": 4228, + "text_loss": 0.15597499907016754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0007023310213927208, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6822657.0, + "repeat_count": 0.0, + "routers_loss": 0.005309136584401131, + "skip_count": 0.0, + "step": 4230, + "text_loss": 0.5271651148796082 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046875, + "learning_rate": 0.0007020479402164226, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 6825661.0, + "repeat_count": 0.0, + "routers_loss": 0.005936166271567345, + "skip_count": 2.0, + "step": 4232, + "text_loss": 0.6105108857154846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007017647816149727, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6828688.0, + "repeat_count": 0.0, + "routers_loss": 0.001653556595556438, + "skip_count": 0.0, + "step": 4234, + "text_loss": 0.6966437101364136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.000701481545696878, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 6831850.0, + "repeat_count": 0.0, + "routers_loss": 0.0013501866487786174, + "skip_count": 0.0, + "step": 4236, + "text_loss": 1.259678840637207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0007011982325706747, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6834862.0, + "repeat_count": 0.0, + "routers_loss": 0.008970130234956741, + "skip_count": 1.0, + "step": 4238, + "text_loss": 0.24906545877456665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007009148423449292, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6838148.0, + "repeat_count": 0.0, + "routers_loss": 0.0026013399474322796, + "skip_count": 0.0, + "step": 4240, + "text_loss": 0.291467547416687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.915761667155856, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0007006313751282371, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 6841142.0, + "repeat_count": 0.0, + "routers_loss": 0.021415632218122482, + "skip_count": 1.0, + "step": 4242, + "text_loss": 0.507606029510498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007003478310292236, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6844042.0, + "repeat_count": 0.0, + "routers_loss": 0.0023636550176888704, + "skip_count": 0.0, + "step": 4244, + "text_loss": 0.11626995354890823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.934546521866746, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0007000642101565433, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 6847359.0, + "repeat_count": 1.0, + "routers_loss": 0.025154776871204376, + "skip_count": 0.0, + "step": 4246, + "text_loss": 0.42898693680763245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0006997805126188803, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6850443.0, + "repeat_count": 0.0, + "routers_loss": 0.00540317315608263, + "skip_count": 0.0, + "step": 4248, + "text_loss": 0.18085283041000366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000699496738524948, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 6853495.0, + "repeat_count": 0.0, + "routers_loss": 0.0014433214673772454, + "skip_count": 0.0, + "step": 4250, + "text_loss": 0.5524004697799683 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006992128879834891, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 6856774.0, + "repeat_count": 1.0, + "routers_loss": 0.013381492346525192, + "skip_count": 3.0, + "step": 4252, + "text_loss": 0.19605717062950134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006989289611032758, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 6860313.0, + "repeat_count": 0.0, + "routers_loss": 0.007140172645449638, + "skip_count": 1.0, + "step": 4254, + "text_loss": 0.3182447552680969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006986449579931091, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6863683.0, + "repeat_count": 0.0, + "routers_loss": 0.006486213766038418, + "skip_count": 1.0, + "step": 4256, + "text_loss": 0.19250160455703735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006983608787618201, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6867609.0, + "repeat_count": 0.0, + "routers_loss": 0.001465818495489657, + "skip_count": 0.0, + "step": 4258, + "text_loss": 0.5912898182868958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.000698076723518268, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6870040.0, + "repeat_count": 0.0, + "routers_loss": 0.0031106441747397184, + "skip_count": 0.0, + "step": 4260, + "text_loss": 0.13542121648788452 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006977924923713418, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6873441.0, + "repeat_count": 0.0, + "routers_loss": 0.0005377951893024147, + "skip_count": 0.0, + "step": 4262, + "text_loss": 0.352464497089386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006975081854299594, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 6876637.0, + "repeat_count": 0.0, + "routers_loss": 0.007052485831081867, + "skip_count": 0.0, + "step": 4264, + "text_loss": 0.5023844242095947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0006972238028030678, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6879928.0, + "repeat_count": 0.0, + "routers_loss": 0.0013608322478830814, + "skip_count": 0.0, + "step": 4266, + "text_loss": 0.8664718270301819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0006969393445996429, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6883425.0, + "repeat_count": 0.0, + "routers_loss": 0.0007607188890688121, + "skip_count": 0.0, + "step": 4268, + "text_loss": 0.5131992101669312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006966548109286897, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6886790.0, + "repeat_count": 0.0, + "routers_loss": 0.00035804163780994713, + "skip_count": 0.0, + "step": 4270, + "text_loss": 0.5352054834365845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.000696370201899242, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6889747.0, + "repeat_count": 0.0, + "routers_loss": 0.004451376851648092, + "skip_count": 1.0, + "step": 4272, + "text_loss": 0.47865036129951477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006960855176203623, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6892604.0, + "repeat_count": 0.0, + "routers_loss": 0.0015342880506068468, + "skip_count": 0.0, + "step": 4274, + "text_loss": 0.36278650164604187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0006958007582011425, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6895563.0, + "repeat_count": 0.0, + "routers_loss": 0.0022974940948188305, + "skip_count": 2.0, + "step": 4276, + "text_loss": 0.6695618629455566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006955159237507027, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6898591.0, + "repeat_count": 0.0, + "routers_loss": 0.00859096460044384, + "skip_count": 1.0, + "step": 4278, + "text_loss": 0.44284722208976746 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0006952310143781921, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6903119.0, + "repeat_count": 1.0, + "routers_loss": 0.007919861935079098, + "skip_count": 3.0, + "step": 4280, + "text_loss": 0.5006136298179626 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0006949460301927886, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6906394.0, + "repeat_count": 0.0, + "routers_loss": 0.0008476210059598088, + "skip_count": 0.0, + "step": 4282, + "text_loss": 0.8153555989265442 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.0006946609713036985, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 6909136.0, + "repeat_count": 0.0, + "routers_loss": 0.006711610127240419, + "skip_count": 2.0, + "step": 4284, + "text_loss": 0.43136683106422424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0006943758378201571, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 6912734.0, + "repeat_count": 0.0, + "routers_loss": 0.0038677838165313005, + "skip_count": 0.0, + "step": 4286, + "text_loss": 0.2693749964237213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0006940906298514278, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6915838.0, + "repeat_count": 0.0, + "routers_loss": 0.0012188015971332788, + "skip_count": 0.0, + "step": 4288, + "text_loss": 0.5809219479560852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0006938053475068031, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6919225.0, + "repeat_count": 0.0, + "routers_loss": 0.001955829095095396, + "skip_count": 0.0, + "step": 4290, + "text_loss": 0.5116089582443237 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006935199908956037, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 6922495.0, + "repeat_count": 1.0, + "routers_loss": 0.0035709093790501356, + "skip_count": 0.0, + "step": 4292, + "text_loss": 0.2745901644229889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0006932345601271786, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 6925317.0, + "repeat_count": 0.0, + "routers_loss": 0.0005745319649577141, + "skip_count": 0.0, + "step": 4294, + "text_loss": 0.6039219498634338 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 20.169063692398005, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0006929490553109056, + "loss": 0.0107, + "macro_f1": 0.9247862696647644, + "num_tokens": 6928054.0, + "repeat_count": 3.0, + "routers_loss": 0.061689916998147964, + "skip_count": 6.0, + "step": 4296, + "text_loss": 0.3904837667942047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006926634765561907, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 6931348.0, + "repeat_count": 0.0, + "routers_loss": 0.002007248578593135, + "skip_count": 0.0, + "step": 4298, + "text_loss": 0.5170742273330688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000692377823972468, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6934411.0, + "repeat_count": 0.0, + "routers_loss": 0.0005786226247437298, + "skip_count": 0.0, + "step": 4300, + "text_loss": 0.8032443523406982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.19724097446434, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006920920976692004, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 6938153.0, + "repeat_count": 1.0, + "routers_loss": 0.024602646008133888, + "skip_count": 0.0, + "step": 4302, + "text_loss": 0.446534663438797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006918062977558784, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6940731.0, + "repeat_count": 0.0, + "routers_loss": 0.005759815219789743, + "skip_count": 2.0, + "step": 4304, + "text_loss": 0.15479247272014618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006915204243420214, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6943246.0, + "repeat_count": 0.0, + "routers_loss": 0.005315347574651241, + "skip_count": 1.0, + "step": 4306, + "text_loss": 0.22127842903137207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006912344775371765, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6947197.0, + "repeat_count": 0.0, + "routers_loss": 0.0012061651796102524, + "skip_count": 0.0, + "step": 4308, + "text_loss": 0.7058854103088379 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006909484574509191, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6951817.0, + "repeat_count": 0.0, + "routers_loss": 0.0029203309677541256, + "skip_count": 0.0, + "step": 4310, + "text_loss": 0.6014000773429871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0006906623641928525, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6955094.0, + "repeat_count": 0.0, + "routers_loss": 0.005703397560864687, + "skip_count": 2.0, + "step": 4312, + "text_loss": 0.5923848152160645 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0006903761978726084, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 6958127.0, + "repeat_count": 1.0, + "routers_loss": 0.004489895887672901, + "skip_count": 2.0, + "step": 4314, + "text_loss": 0.36911651492118835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.000690089958599846, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 6960871.0, + "repeat_count": 0.0, + "routers_loss": 0.003871412482112646, + "skip_count": 2.0, + "step": 4316, + "text_loss": 0.442545086145401 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.000689803646484253, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6963980.0, + "repeat_count": 1.0, + "routers_loss": 0.008667866699397564, + "skip_count": 2.0, + "step": 4318, + "text_loss": 0.1987489014863968 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0006895172616355446, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6967132.0, + "repeat_count": 1.0, + "routers_loss": 0.00843339879065752, + "skip_count": 0.0, + "step": 4320, + "text_loss": 0.48267918825149536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006892308041634639, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6969971.0, + "repeat_count": 0.0, + "routers_loss": 0.0004312851815484464, + "skip_count": 0.0, + "step": 4322, + "text_loss": 0.3662732243537903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006889442741777822, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6973114.0, + "repeat_count": 0.0, + "routers_loss": 0.004588035400956869, + "skip_count": 3.0, + "step": 4324, + "text_loss": 0.6707104444503784 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.309950102729672, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0006886576717882982, + "loss": 0.0057, + "macro_f1": 0.8817967176437378, + "num_tokens": 6976013.0, + "repeat_count": 2.0, + "routers_loss": 0.0687296912074089, + "skip_count": 3.0, + "step": 4326, + "text_loss": 0.1662217676639557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006883709971048384, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6979200.0, + "repeat_count": 0.0, + "routers_loss": 0.002950174268335104, + "skip_count": 0.0, + "step": 4328, + "text_loss": 0.21168152987957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006880842502372572, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6982640.0, + "repeat_count": 0.0, + "routers_loss": 0.0032158740796148777, + "skip_count": 0.0, + "step": 4330, + "text_loss": 0.26790961623191833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0006877974312954365, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6985917.0, + "repeat_count": 0.0, + "routers_loss": 0.0005083635332994163, + "skip_count": 0.0, + "step": 4332, + "text_loss": 0.9736502170562744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.347519812151454, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.000687510540389286, + "loss": 0.0053, + "macro_f1": 0.32098764181137085, + "num_tokens": 6988388.0, + "repeat_count": 0.0, + "routers_loss": 0.03473830223083496, + "skip_count": 2.0, + "step": 4334, + "text_loss": 0.21662230789661407 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006872235776287425, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6991360.0, + "repeat_count": 0.0, + "routers_loss": 0.002206524135544896, + "skip_count": 0.0, + "step": 4336, + "text_loss": 0.6026972532272339 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0006869365431237711, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6995080.0, + "repeat_count": 1.0, + "routers_loss": 0.000969731598161161, + "skip_count": 0.0, + "step": 4338, + "text_loss": 0.5833017230033875 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.375697094217788, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006866494369843635, + "loss": 0.0054, + "macro_f1": 0.8820862174034119, + "num_tokens": 6998526.0, + "repeat_count": 2.0, + "routers_loss": 0.013962293043732643, + "skip_count": 2.0, + "step": 4340, + "text_loss": 0.41465985774993896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0006863622593205397, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 7001494.0, + "repeat_count": 0.0, + "routers_loss": 0.0064964210614562035, + "skip_count": 3.0, + "step": 4342, + "text_loss": 0.3774271011352539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 20.394481948928675, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006860750102423464, + "loss": 0.0062, + "macro_f1": 0.6589147448539734, + "num_tokens": 7005544.0, + "repeat_count": 1.0, + "routers_loss": 0.023250726982951164, + "skip_count": 6.0, + "step": 4344, + "text_loss": 0.2732464373111725 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0006857876898598582, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 7008847.0, + "repeat_count": 0.0, + "routers_loss": 0.0038170060142874718, + "skip_count": 2.0, + "step": 4346, + "text_loss": 0.29610875248908997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0006855002982831769, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7012577.0, + "repeat_count": 0.0, + "routers_loss": 0.0012856025714427233, + "skip_count": 0.0, + "step": 4348, + "text_loss": 0.6098502278327942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0006852128356224314, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7015650.0, + "repeat_count": 0.0, + "routers_loss": 0.008162742480635643, + "skip_count": 1.0, + "step": 4350, + "text_loss": 0.20868146419525146 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.432051658350456, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.0006849253019877778, + "loss": 0.0074, + "macro_f1": 0.8817967176437378, + "num_tokens": 7019925.0, + "repeat_count": 2.0, + "routers_loss": 0.023544032126665115, + "skip_count": 3.0, + "step": 4352, + "text_loss": 0.628226101398468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0006846376974893996, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 7023130.0, + "repeat_count": 0.0, + "routers_loss": 0.004982319660484791, + "skip_count": 2.0, + "step": 4354, + "text_loss": 0.7037544250488281 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0006843500222375074, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7026422.0, + "repeat_count": 1.0, + "routers_loss": 0.004015266429632902, + "skip_count": 0.0, + "step": 4356, + "text_loss": 0.22352729737758636 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 20.46022894041679, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006840622763423391, + "loss": 0.0071, + "macro_f1": 0.9449735879898071, + "num_tokens": 7029077.0, + "repeat_count": 2.0, + "routers_loss": 0.021162014454603195, + "skip_count": 4.0, + "step": 4358, + "text_loss": 0.2431403249502182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006837744599141591, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7032582.0, + "repeat_count": 0.0, + "routers_loss": 0.0007044129306450486, + "skip_count": 0.0, + "step": 4360, + "text_loss": 0.26667487621307373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0006834865730632594, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7035642.0, + "repeat_count": 0.0, + "routers_loss": 0.0067853196524083614, + "skip_count": 1.0, + "step": 4362, + "text_loss": 0.20965275168418884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006831986158999588, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7038601.0, + "repeat_count": 0.0, + "routers_loss": 0.00899333506822586, + "skip_count": 2.0, + "step": 4364, + "text_loss": 0.26860126852989197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.000682910588534603, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7042274.0, + "repeat_count": 0.0, + "routers_loss": 0.0019194348715245724, + "skip_count": 0.0, + "step": 4366, + "text_loss": 0.14046810567378998 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0006826224910775647, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 7045268.0, + "repeat_count": 1.0, + "routers_loss": 0.006915684789419174, + "skip_count": 3.0, + "step": 4368, + "text_loss": 0.5900366306304932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0006823343236392432, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7049407.0, + "repeat_count": 0.0, + "routers_loss": 0.001678116386756301, + "skip_count": 0.0, + "step": 4370, + "text_loss": 0.7868026494979858 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.000682046086330065, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7052783.0, + "repeat_count": 0.0, + "routers_loss": 0.0003459530707914382, + "skip_count": 0.0, + "step": 4372, + "text_loss": 0.6349637508392334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0006817577792604831, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7055757.0, + "repeat_count": 0.0, + "routers_loss": 0.0011729507241398096, + "skip_count": 0.0, + "step": 4374, + "text_loss": 0.43258991837501526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0006814694025409773, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 7058684.0, + "repeat_count": 0.0, + "routers_loss": 0.0006664610700681806, + "skip_count": 0.0, + "step": 4376, + "text_loss": 0.5307940244674683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.091796875, + "learning_rate": 0.0006811809562820542, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 7061902.0, + "repeat_count": 0.0, + "routers_loss": 0.004595907870680094, + "skip_count": 2.0, + "step": 4378, + "text_loss": 0.5830042362213135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0006808924405942467, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7065100.0, + "repeat_count": 0.0, + "routers_loss": 0.0032026609405875206, + "skip_count": 0.0, + "step": 4380, + "text_loss": 0.20797798037528992 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0006806038555881148, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 7068556.0, + "repeat_count": 1.0, + "routers_loss": 0.0024626904632896185, + "skip_count": 0.0, + "step": 4382, + "text_loss": 0.5791074633598328 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006803152013742448, + "loss": 0.0075, + "macro_f1": 1.0, + "num_tokens": 7071284.0, + "repeat_count": 1.0, + "routers_loss": 0.010723610408604145, + "skip_count": 2.0, + "step": 4384, + "text_loss": 0.13227243721485138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006800264780632495, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7074428.0, + "repeat_count": 1.0, + "routers_loss": 0.0011231007520109415, + "skip_count": 0.0, + "step": 4386, + "text_loss": 0.4360627233982086 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 20.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0006797376857657681, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 7078313.0, + "repeat_count": 2.0, + "routers_loss": 0.008419238030910492, + "skip_count": 1.0, + "step": 4388, + "text_loss": 0.5183924436569214 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006794488245924664, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 7081258.0, + "repeat_count": 1.0, + "routers_loss": 0.006582668516784906, + "skip_count": 3.0, + "step": 4390, + "text_loss": 0.2797473669052124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006791598946540368, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 7084527.0, + "repeat_count": 0.0, + "routers_loss": 0.00557357631623745, + "skip_count": 2.0, + "step": 4392, + "text_loss": 0.39495575428009033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0006788708960611975, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 7087675.0, + "repeat_count": 0.0, + "routers_loss": 0.007155992556363344, + "skip_count": 0.0, + "step": 4394, + "text_loss": 0.3785299062728882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.0006785818289246934, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7090171.0, + "repeat_count": 0.0, + "routers_loss": 0.0009265039698220789, + "skip_count": 0.0, + "step": 4396, + "text_loss": 0.42634522914886475 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 20.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006782926933552955, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 7092529.0, + "repeat_count": 1.0, + "routers_loss": 0.008679097518324852, + "skip_count": 7.0, + "step": 4398, + "text_loss": 0.4283660054206848 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006780034894638014, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7095141.0, + "repeat_count": 0.0, + "routers_loss": 0.002363949315622449, + "skip_count": 0.0, + "step": 4400, + "text_loss": 0.481539249420166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.000677714217361034, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7098208.0, + "repeat_count": 0.0, + "routers_loss": 0.004005146212875843, + "skip_count": 3.0, + "step": 4402, + "text_loss": 0.6443291902542114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006774248771578435, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7101681.0, + "repeat_count": 0.0, + "routers_loss": 0.0026864963583648205, + "skip_count": 0.0, + "step": 4404, + "text_loss": 0.16315312683582306 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 20.68564719694746, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006771354689651054, + "loss": 0.005, + "macro_f1": 0.9449735879898071, + "num_tokens": 7104719.0, + "repeat_count": 2.0, + "routers_loss": 0.02719845622777939, + "skip_count": 4.0, + "step": 4406, + "text_loss": 0.37855592370033264 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0006768459928937213, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7108697.0, + "repeat_count": 0.0, + "routers_loss": 0.010488593950867653, + "skip_count": 0.0, + "step": 4408, + "text_loss": 0.23133711516857147 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0006765564490546193, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7111426.0, + "repeat_count": 1.0, + "routers_loss": 0.0013637891970574856, + "skip_count": 0.0, + "step": 4410, + "text_loss": 0.41399383544921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0006762668375587528, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7114241.0, + "repeat_count": 0.0, + "routers_loss": 0.000900395680218935, + "skip_count": 0.0, + "step": 4412, + "text_loss": 0.6460412740707397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0006759771585171016, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7117031.0, + "repeat_count": 0.0, + "routers_loss": 0.0024001260753721, + "skip_count": 0.0, + "step": 4414, + "text_loss": 0.7645824551582336 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006756874120406714, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 7120766.0, + "repeat_count": 3.0, + "routers_loss": 0.005034091416746378, + "skip_count": 4.0, + "step": 4416, + "text_loss": 0.31753066182136536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0006753975982404934, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7125243.0, + "repeat_count": 0.0, + "routers_loss": 0.002483269665390253, + "skip_count": 0.0, + "step": 4418, + "text_loss": 0.5304268002510071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.751394188435572, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0006751077172276249, + "loss": 0.0052, + "macro_f1": 0.3272727429866791, + "num_tokens": 7127795.0, + "repeat_count": 0.0, + "routers_loss": 0.02676006779074669, + "skip_count": 1.0, + "step": 4420, + "text_loss": 0.22011354565620422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.000674817769113149, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7130837.0, + "repeat_count": 0.0, + "routers_loss": 0.003267093561589718, + "skip_count": 2.0, + "step": 4422, + "text_loss": 0.2906076908111572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 20.770179043146463, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.027099609375, + "learning_rate": 0.000674527754008174, + "loss": 0.0045, + "macro_f1": 0.5934640765190125, + "num_tokens": 7135090.0, + "repeat_count": 0.0, + "routers_loss": 0.022510390728712082, + "skip_count": 3.0, + "step": 4424, + "text_loss": 0.2544902563095093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006742376720238345, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 7138751.0, + "repeat_count": 0.0, + "routers_loss": 0.0011178571730852127, + "skip_count": 0.0, + "step": 4426, + "text_loss": 0.6811438798904419 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 20.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006739475232712904, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 7141762.0, + "repeat_count": 2.0, + "routers_loss": 0.005595206283032894, + "skip_count": 1.0, + "step": 4428, + "text_loss": 0.38743990659713745 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0006736573078617272, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7145235.0, + "repeat_count": 0.0, + "routers_loss": 0.002793942578136921, + "skip_count": 2.0, + "step": 4430, + "text_loss": 0.21894219517707825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0006733670259063561, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 7149042.0, + "repeat_count": 0.0, + "routers_loss": 0.006146818865090609, + "skip_count": 3.0, + "step": 4432, + "text_loss": 0.17822015285491943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 20.817141179923688, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006730766775164136, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 7152166.0, + "repeat_count": 0.0, + "routers_loss": 0.026045087724924088, + "skip_count": 2.0, + "step": 4434, + "text_loss": 0.2910420000553131 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 20.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0006727862628031618, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7155506.0, + "repeat_count": 2.0, + "routers_loss": 0.0022973387967795134, + "skip_count": 0.0, + "step": 4436, + "text_loss": 0.3502544164657593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0006724957818778882, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7158739.0, + "repeat_count": 0.0, + "routers_loss": 0.002357073128223419, + "skip_count": 1.0, + "step": 4438, + "text_loss": 0.26200664043426514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0006722052348519054, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 7161776.0, + "repeat_count": 0.0, + "routers_loss": 0.0005521026905626059, + "skip_count": 0.0, + "step": 4440, + "text_loss": 0.3922915458679199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000671914621836552, + "loss": 0.0106, + "macro_f1": 0.6666666865348816, + "num_tokens": 7164763.0, + "repeat_count": 0.0, + "routers_loss": 0.007691344246268272, + "skip_count": 2.0, + "step": 4442, + "text_loss": 0.6021351218223572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000671623942943191, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7167924.0, + "repeat_count": 0.0, + "routers_loss": 0.0032181134447455406, + "skip_count": 0.0, + "step": 4444, + "text_loss": 0.23639555275440216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.873495744056356, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0006713331982832113, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 7170743.0, + "repeat_count": 1.0, + "routers_loss": 0.024979131296277046, + "skip_count": 0.0, + "step": 4446, + "text_loss": 0.4957772493362427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006710423879680271, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7174660.0, + "repeat_count": 0.0, + "routers_loss": 0.002571308286860585, + "skip_count": 0.0, + "step": 4448, + "text_loss": 0.47968071699142456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000670751512109077, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7177965.0, + "repeat_count": 0.0, + "routers_loss": 0.00212799571454525, + "skip_count": 0.0, + "step": 4450, + "text_loss": 0.6550716161727905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006704605708178252, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 7181512.0, + "repeat_count": 0.0, + "routers_loss": 0.004176430404186249, + "skip_count": 1.0, + "step": 4452, + "text_loss": 0.36959558725357056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0006701695642057613, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7184555.0, + "repeat_count": 0.0, + "routers_loss": 0.0010968588758260012, + "skip_count": 0.0, + "step": 4454, + "text_loss": 0.6686749458312988 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0006698784923843993, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7187474.0, + "repeat_count": 0.0, + "routers_loss": 0.0014241471653804183, + "skip_count": 0.0, + "step": 4456, + "text_loss": 0.6147221922874451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006695873554652784, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7190649.0, + "repeat_count": 0.0, + "routers_loss": 0.008801907300949097, + "skip_count": 0.0, + "step": 4458, + "text_loss": 0.26381927728652954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006692961535599634, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 7193961.0, + "repeat_count": 0.0, + "routers_loss": 0.009027508087456226, + "skip_count": 1.0, + "step": 4460, + "text_loss": 0.1926470547914505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006690048867800427, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7197456.0, + "repeat_count": 0.0, + "routers_loss": 0.0022697453387081623, + "skip_count": 0.0, + "step": 4462, + "text_loss": 0.6736721992492676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006687135552371305, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7200290.0, + "repeat_count": 0.0, + "routers_loss": 0.006747903767973185, + "skip_count": 1.0, + "step": 4464, + "text_loss": 0.2026437371969223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006684221590428657, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7203320.0, + "repeat_count": 0.0, + "routers_loss": 0.0011565096210688353, + "skip_count": 0.0, + "step": 4466, + "text_loss": 0.7587730288505554 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.976812444966246, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0006681306983089121, + "loss": 0.0083, + "macro_f1": 0.8820862174034119, + "num_tokens": 7206411.0, + "repeat_count": 2.0, + "routers_loss": 0.023645581677556038, + "skip_count": 2.0, + "step": 4468, + "text_loss": 0.8981561660766602 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006678391731469575, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 7209421.0, + "repeat_count": 0.0, + "routers_loss": 0.0035848666448146105, + "skip_count": 0.0, + "step": 4470, + "text_loss": 0.1522839516401291 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 20.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006675475836687152, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 7212267.0, + "repeat_count": 1.0, + "routers_loss": 0.005046425387263298, + "skip_count": 1.0, + "step": 4472, + "text_loss": 0.46007999777793884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006672559299859228, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7215195.0, + "repeat_count": 0.0, + "routers_loss": 0.0019333874806761742, + "skip_count": 0.0, + "step": 4474, + "text_loss": 1.0859547853469849 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006669642122103423, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7217941.0, + "repeat_count": 0.0, + "routers_loss": 0.0005401032394729555, + "skip_count": 0.0, + "step": 4476, + "text_loss": 0.9754356145858765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.023481068388612, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006666724304537611, + "loss": 0.0053, + "macro_f1": 0.3272727429866791, + "num_tokens": 7222494.0, + "repeat_count": 1.0, + "routers_loss": 0.015569722279906273, + "skip_count": 0.0, + "step": 4478, + "text_loss": 0.2896423637866974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006663805848279898, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7225292.0, + "repeat_count": 0.0, + "routers_loss": 0.0020135147497057915, + "skip_count": 0.0, + "step": 4480, + "text_loss": 0.8492724299430847 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006660886754448648, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 7229184.0, + "repeat_count": 1.0, + "routers_loss": 0.002355351345613599, + "skip_count": 0.0, + "step": 4482, + "text_loss": 0.189764603972435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0006657967024162459, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7232906.0, + "repeat_count": 0.0, + "routers_loss": 0.003044391982257366, + "skip_count": 0.0, + "step": 4484, + "text_loss": 0.4239847660064697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006655046658540179, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 7235996.0, + "repeat_count": 0.0, + "routers_loss": 0.00602696230635047, + "skip_count": 2.0, + "step": 4486, + "text_loss": 0.217103973031044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0169677734375, + "learning_rate": 0.0006652125658700896, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 7238882.0, + "repeat_count": 0.0, + "routers_loss": 0.001470155781134963, + "skip_count": 1.0, + "step": 4488, + "text_loss": 0.6090770363807678 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006649204025763945, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 7241815.0, + "repeat_count": 1.0, + "routers_loss": 0.008737480267882347, + "skip_count": 2.0, + "step": 4490, + "text_loss": 0.48314425349235535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.0006646281760848902, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 7244848.0, + "repeat_count": 0.0, + "routers_loss": 0.0008257135050371289, + "skip_count": 0.0, + "step": 4492, + "text_loss": 0.5884748101234436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006643358865075581, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7247930.0, + "repeat_count": 0.0, + "routers_loss": 0.0016262239078059793, + "skip_count": 0.0, + "step": 4494, + "text_loss": 0.21444730460643768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0006640435339564042, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7251776.0, + "repeat_count": 0.0, + "routers_loss": 0.001315156347118318, + "skip_count": 0.0, + "step": 4496, + "text_loss": 0.6890370845794678 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006637511185434588, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 7255070.0, + "repeat_count": 1.0, + "routers_loss": 0.007614497095346451, + "skip_count": 3.0, + "step": 4498, + "text_loss": 0.516417920589447 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 21.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006634586403807758, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 7258115.0, + "repeat_count": 3.0, + "routers_loss": 0.004906686954200268, + "skip_count": 2.0, + "step": 4500, + "text_loss": 0.577463686466217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.13619019665395, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006631660995804334, + "loss": 0.0067, + "macro_f1": 0.6601307392120361, + "num_tokens": 7260769.0, + "repeat_count": 1.0, + "routers_loss": 0.013337121345102787, + "skip_count": 2.0, + "step": 4502, + "text_loss": 0.37124839425086975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006628734962545339, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7263908.0, + "repeat_count": 0.0, + "routers_loss": 0.0023418180644512177, + "skip_count": 0.0, + "step": 4504, + "text_loss": 0.17937727272510529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0006625808305152033, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7267391.0, + "repeat_count": 0.0, + "routers_loss": 0.0006556165171787143, + "skip_count": 0.0, + "step": 4506, + "text_loss": 0.45344987511634827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006622881024745919, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 7271402.0, + "repeat_count": 0.0, + "routers_loss": 0.0021988123189657927, + "skip_count": 0.0, + "step": 4508, + "text_loss": 0.5842905640602112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006619953122448734, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 7274354.0, + "repeat_count": 0.0, + "routers_loss": 0.00774174090474844, + "skip_count": 2.0, + "step": 4510, + "text_loss": 0.27159228920936584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0006617024599382456, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7277378.0, + "repeat_count": 0.0, + "routers_loss": 0.0006942499312572181, + "skip_count": 0.0, + "step": 4512, + "text_loss": 0.4464176297187805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0006614095456669302, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7280526.0, + "repeat_count": 0.0, + "routers_loss": 0.003003394464030862, + "skip_count": 0.0, + "step": 4514, + "text_loss": 0.31188079714775085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006611165695431725, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7283916.0, + "repeat_count": 0.0, + "routers_loss": 0.0006948060472495854, + "skip_count": 0.0, + "step": 4516, + "text_loss": 0.5266574025154114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006608235316792413, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7286843.0, + "repeat_count": 0.0, + "routers_loss": 0.0014080886030569673, + "skip_count": 0.0, + "step": 4518, + "text_loss": 0.5880120396614075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006605304321874295, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7289940.0, + "repeat_count": 0.0, + "routers_loss": 0.0016894340515136719, + "skip_count": 0.0, + "step": 4520, + "text_loss": 0.6623797416687012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006602372711800531, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7292869.0, + "repeat_count": 0.0, + "routers_loss": 0.003522444050759077, + "skip_count": 0.0, + "step": 4522, + "text_loss": 0.5488807559013367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006599440487694521, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7296618.0, + "repeat_count": 0.0, + "routers_loss": 0.0011981099378317595, + "skip_count": 0.0, + "step": 4524, + "text_loss": 0.4128517210483551 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.248899324919282, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00065965076506799, + "loss": 0.0047, + "macro_f1": 0.9262410998344421, + "num_tokens": 7300481.0, + "repeat_count": 3.0, + "routers_loss": 0.010548194870352745, + "skip_count": 2.0, + "step": 4526, + "text_loss": 0.26450902223587036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006593574201880536, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7303272.0, + "repeat_count": 0.0, + "routers_loss": 0.005642973352223635, + "skip_count": 1.0, + "step": 4528, + "text_loss": 0.35269856452941895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000659064014242053, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 7306615.0, + "repeat_count": 0.0, + "routers_loss": 0.004171932581812143, + "skip_count": 1.0, + "step": 4530, + "text_loss": 0.18814080953598022 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006587705473424223, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 7310368.0, + "repeat_count": 0.0, + "routers_loss": 0.002289367141202092, + "skip_count": 2.0, + "step": 4532, + "text_loss": 0.7363705635070801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000658477019601618, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 7313788.0, + "repeat_count": 0.0, + "routers_loss": 0.004440625663846731, + "skip_count": 1.0, + "step": 4534, + "text_loss": 0.8126176595687866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006581834311321211, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 7317864.0, + "repeat_count": 0.0, + "routers_loss": 0.0013160990783944726, + "skip_count": 2.0, + "step": 4536, + "text_loss": 0.7015916109085083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.000657889782046435, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7320693.0, + "repeat_count": 0.0, + "routers_loss": 0.0032275544945150614, + "skip_count": 2.0, + "step": 4538, + "text_loss": 0.6481677293777466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.314646316407398, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0006575960724570865, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 7324335.0, + "repeat_count": 0.0, + "routers_loss": 0.009769129566848278, + "skip_count": 1.0, + "step": 4540, + "text_loss": 0.22194676101207733 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006573023024766258, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 7327431.0, + "repeat_count": 2.0, + "routers_loss": 0.0036973082460463047, + "skip_count": 4.0, + "step": 4542, + "text_loss": 0.475127637386322 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.000657008472217626, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7330262.0, + "repeat_count": 0.0, + "routers_loss": 0.0007046440150588751, + "skip_count": 0.0, + "step": 4544, + "text_loss": 0.2649917006492615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0006567145817926836, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7333110.0, + "repeat_count": 0.0, + "routers_loss": 0.0026714997366070747, + "skip_count": 0.0, + "step": 4546, + "text_loss": 0.5490524768829346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0006564206313144175, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7336101.0, + "repeat_count": 0.0, + "routers_loss": 0.006552211008965969, + "skip_count": 0.0, + "step": 4548, + "text_loss": 0.14098678529262543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006561266208954707, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 7339435.0, + "repeat_count": 0.0, + "routers_loss": 0.0035560601390898228, + "skip_count": 2.0, + "step": 4550, + "text_loss": 0.20412275195121765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006558325506485081, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7342609.0, + "repeat_count": 0.0, + "routers_loss": 0.0020106974989175797, + "skip_count": 1.0, + "step": 4552, + "text_loss": 0.6184256076812744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0006555384206862183, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 7345614.0, + "repeat_count": 0.0, + "routers_loss": 0.0014235252747312188, + "skip_count": 0.0, + "step": 4554, + "text_loss": 1.0108838081359863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.389785735250953, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0006552442311213121, + "loss": 0.0041, + "macro_f1": 0.3272727429866791, + "num_tokens": 7348957.0, + "repeat_count": 1.0, + "routers_loss": 0.01703745685517788, + "skip_count": 0.0, + "step": 4556, + "text_loss": 0.21315747499465942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 21.399178162606397, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006549499820665237, + "loss": 0.0077, + "macro_f1": 0.5934640765190125, + "num_tokens": 7352724.0, + "repeat_count": 0.0, + "routers_loss": 0.013315381482243538, + "skip_count": 3.0, + "step": 4558, + "text_loss": 0.34369465708732605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00065465567363461, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7356592.0, + "repeat_count": 0.0, + "routers_loss": 0.0017354936571791768, + "skip_count": 0.0, + "step": 4560, + "text_loss": 0.6267461180686951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0006543613059383503, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7359774.0, + "repeat_count": 0.0, + "routers_loss": 0.011646085418760777, + "skip_count": 2.0, + "step": 4562, + "text_loss": 0.4400193989276886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006540668790905471, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7362765.0, + "repeat_count": 0.0, + "routers_loss": 0.0019345436012372375, + "skip_count": 0.0, + "step": 4564, + "text_loss": 0.49204275012016296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006537723932040251, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7366337.0, + "repeat_count": 0.0, + "routers_loss": 0.00562885170802474, + "skip_count": 1.0, + "step": 4566, + "text_loss": 0.22566382586956024 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006534778483916319, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 7369851.0, + "repeat_count": 2.0, + "routers_loss": 0.005508176051080227, + "skip_count": 2.0, + "step": 4568, + "text_loss": 0.8057850003242493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006531832447662377, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7373918.0, + "repeat_count": 0.0, + "routers_loss": 0.006460923235863447, + "skip_count": 2.0, + "step": 4570, + "text_loss": 0.5141497254371643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006528885824407351, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7376674.0, + "repeat_count": 0.0, + "routers_loss": 0.0032120654359459877, + "skip_count": 0.0, + "step": 4572, + "text_loss": 0.1281338930130005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0006525938615280394, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 7379791.0, + "repeat_count": 0.0, + "routers_loss": 0.00443810923025012, + "skip_count": 0.0, + "step": 4574, + "text_loss": 0.268352210521698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.000652299082141088, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 7382886.0, + "repeat_count": 0.0, + "routers_loss": 0.008284369483590126, + "skip_count": 2.0, + "step": 4576, + "text_loss": 0.30193832516670227 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.493102436160846, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006520042443928411, + "loss": 0.0068, + "macro_f1": 0.8823530077934265, + "num_tokens": 7386036.0, + "repeat_count": 2.0, + "routers_loss": 0.03383317217230797, + "skip_count": 1.0, + "step": 4578, + "text_loss": 0.23106542229652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.000651709348396281, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7388908.0, + "repeat_count": 0.0, + "routers_loss": 0.0017075951909646392, + "skip_count": 1.0, + "step": 4580, + "text_loss": 0.386099249124527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006514143942644124, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7392004.0, + "repeat_count": 0.0, + "routers_loss": 0.009516917169094086, + "skip_count": 1.0, + "step": 4582, + "text_loss": 0.3162059485912323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0006511193821102623, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 7395538.0, + "repeat_count": 0.0, + "routers_loss": 0.0031392278615385294, + "skip_count": 0.0, + "step": 4584, + "text_loss": 0.5536221861839294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006508243120468799, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7398461.0, + "repeat_count": 0.0, + "routers_loss": 0.0014138511614874005, + "skip_count": 0.0, + "step": 4586, + "text_loss": 0.7934318780899048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006505291841873367, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7401611.0, + "repeat_count": 0.0, + "routers_loss": 0.0005265916115604341, + "skip_count": 0.0, + "step": 4588, + "text_loss": 0.4569905698299408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.000650233998644726, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7404641.0, + "repeat_count": 0.0, + "routers_loss": 0.0024988956283777952, + "skip_count": 0.0, + "step": 4590, + "text_loss": 0.49998772144317627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0006499387555321636, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7407574.0, + "repeat_count": 0.0, + "routers_loss": 0.004110113717615604, + "skip_count": 1.0, + "step": 4592, + "text_loss": 0.5679413676261902 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006496434549627874, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7410806.0, + "repeat_count": 0.0, + "routers_loss": 0.0032845588866621256, + "skip_count": 0.0, + "step": 4594, + "text_loss": 0.35515281558036804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006493480970497568, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7413402.0, + "repeat_count": 0.0, + "routers_loss": 0.010577172972261906, + "skip_count": 1.0, + "step": 4596, + "text_loss": 0.26111698150634766 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006490526819062537, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 7417236.0, + "repeat_count": 1.0, + "routers_loss": 0.002054794691503048, + "skip_count": 2.0, + "step": 4598, + "text_loss": 0.6480993628501892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0006487572096454818, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7420278.0, + "repeat_count": 0.0, + "routers_loss": 0.0017989084590226412, + "skip_count": 0.0, + "step": 4600, + "text_loss": 0.4935401678085327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006484616803806665, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7423866.0, + "repeat_count": 0.0, + "routers_loss": 0.006671485956758261, + "skip_count": 1.0, + "step": 4602, + "text_loss": 0.15030258893966675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0006481660942250552, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7426884.0, + "repeat_count": 0.0, + "routers_loss": 0.008334980346262455, + "skip_count": 3.0, + "step": 4604, + "text_loss": 0.29933279752731323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006478704512919173, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 7431017.0, + "repeat_count": 0.0, + "routers_loss": 0.011923984624445438, + "skip_count": 3.0, + "step": 4606, + "text_loss": 0.35141825675964355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0006475747516945432, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7434406.0, + "repeat_count": 0.0, + "routers_loss": 0.0031092462595552206, + "skip_count": 3.0, + "step": 4608, + "text_loss": 0.21021464467048645 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.000647278995546246, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7437204.0, + "repeat_count": 1.0, + "routers_loss": 0.0006713552866131067, + "skip_count": 0.0, + "step": 4610, + "text_loss": 0.4052635431289673 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006469831829603598, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7439741.0, + "repeat_count": 0.0, + "routers_loss": 0.0022583482787013054, + "skip_count": 2.0, + "step": 4612, + "text_loss": 0.5443860292434692 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006466873140502407, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7443619.0, + "repeat_count": 0.0, + "routers_loss": 0.004187075886875391, + "skip_count": 2.0, + "step": 4614, + "text_loss": 0.30709847807884216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006463913889292661, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7446696.0, + "repeat_count": 0.0, + "routers_loss": 0.008314833045005798, + "skip_count": 0.0, + "step": 4616, + "text_loss": 0.22949637472629547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006460954077108353, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7450377.0, + "repeat_count": 0.0, + "routers_loss": 0.001277514616958797, + "skip_count": 0.0, + "step": 4618, + "text_loss": 0.37715134024620056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006457993705083684, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 7453271.0, + "repeat_count": 0.0, + "routers_loss": 0.0022756033577024937, + "skip_count": 2.0, + "step": 4620, + "text_loss": 0.7373883128166199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0006455032774353078, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7456492.0, + "repeat_count": 0.0, + "routers_loss": 0.0039057908579707146, + "skip_count": 2.0, + "step": 4622, + "text_loss": 0.5058769583702087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0006452071286051169, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 7459619.0, + "repeat_count": 0.0, + "routers_loss": 0.0019458672031760216, + "skip_count": 0.0, + "step": 4624, + "text_loss": 0.5110082030296326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0006449109241312802, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 7462552.0, + "repeat_count": 0.0, + "routers_loss": 0.0002716891176532954, + "skip_count": 1.0, + "step": 4626, + "text_loss": 0.6197522878646851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006446146641273042, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7466769.0, + "repeat_count": 0.0, + "routers_loss": 0.0037578947376459837, + "skip_count": 2.0, + "step": 4628, + "text_loss": 0.1653924286365509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.000644318348706716, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7470216.0, + "repeat_count": 0.0, + "routers_loss": 0.0012791058979928493, + "skip_count": 0.0, + "step": 4630, + "text_loss": 0.7114694118499756 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0006440219779830643, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 7472975.0, + "repeat_count": 0.0, + "routers_loss": 0.00736592011526227, + "skip_count": 2.0, + "step": 4632, + "text_loss": 0.26601463556289673 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000643725552069919, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7475672.0, + "repeat_count": 0.0, + "routers_loss": 0.00045455715735442936, + "skip_count": 0.0, + "step": 4634, + "text_loss": 0.5028402805328369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0006434290710808711, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7478850.0, + "repeat_count": 0.0, + "routers_loss": 0.004247233271598816, + "skip_count": 2.0, + "step": 4636, + "text_loss": 0.12746070325374603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 21.774875256824185, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04052734375, + "learning_rate": 0.0006431325351295324, + "loss": 0.0083, + "macro_f1": 0.5427350401878357, + "num_tokens": 7481747.0, + "repeat_count": 1.0, + "routers_loss": 0.047564394772052765, + "skip_count": 2.0, + "step": 4638, + "text_loss": 0.24056802690029144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0006428359443295362, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7484885.0, + "repeat_count": 0.0, + "routers_loss": 0.0011175100225955248, + "skip_count": 0.0, + "step": 4640, + "text_loss": 0.6265338063240051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 21.793660111535075, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006425392987945369, + "loss": 0.0086, + "macro_f1": 0.5492662787437439, + "num_tokens": 7487973.0, + "repeat_count": 0.0, + "routers_loss": 0.016879938542842865, + "skip_count": 2.0, + "step": 4642, + "text_loss": 0.2523447275161743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 21.80305253889052, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.032958984375, + "learning_rate": 0.0006422425986382093, + "loss": 0.0055, + "macro_f1": 0.5934640765190125, + "num_tokens": 7491024.0, + "repeat_count": 0.0, + "routers_loss": 0.018616504967212677, + "skip_count": 3.0, + "step": 4644, + "text_loss": 0.38890624046325684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.812444966245963, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0006419458439742496, + "loss": 0.0056, + "macro_f1": 0.3272727429866791, + "num_tokens": 7494199.0, + "repeat_count": 0.0, + "routers_loss": 0.023129139095544815, + "skip_count": 1.0, + "step": 4646, + "text_loss": 0.4060848355293274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006416490349163747, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 7497287.0, + "repeat_count": 0.0, + "routers_loss": 0.0018601802876219153, + "skip_count": 0.0, + "step": 4648, + "text_loss": 0.3387545943260193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006413521715783225, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 7500598.0, + "repeat_count": 0.0, + "routers_loss": 0.0017482215771451592, + "skip_count": 0.0, + "step": 4650, + "text_loss": 0.4290996193885803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.840622248312297, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0006410552540738514, + "loss": 0.007, + "macro_f1": 0.3272727429866791, + "num_tokens": 7503252.0, + "repeat_count": 1.0, + "routers_loss": 0.0420118011534214, + "skip_count": 0.0, + "step": 4652, + "text_loss": 0.439496248960495 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.000640758282516741, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 7506382.0, + "repeat_count": 1.0, + "routers_loss": 0.0017782216891646385, + "skip_count": 1.0, + "step": 4654, + "text_loss": 0.8513308167457581 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006404612570207911, + "loss": 0.0102, + "macro_f1": 0.3272727429866791, + "num_tokens": 7510423.0, + "repeat_count": 0.0, + "routers_loss": 0.010385853238403797, + "skip_count": 0.0, + "step": 4656, + "text_loss": 0.7159742712974548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006401641776998223, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7513394.0, + "repeat_count": 0.0, + "routers_loss": 0.0011917101219296455, + "skip_count": 0.0, + "step": 4658, + "text_loss": 0.6165401339530945 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006398670446676766, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7516828.0, + "repeat_count": 3.0, + "routers_loss": 0.008860073052346706, + "skip_count": 4.0, + "step": 4660, + "text_loss": 0.923275887966156 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0006395698580382153, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7519764.0, + "repeat_count": 0.0, + "routers_loss": 0.000505418807733804, + "skip_count": 0.0, + "step": 4662, + "text_loss": 0.6143050789833069 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006392726179253212, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7522390.0, + "repeat_count": 0.0, + "routers_loss": 0.004020806401968002, + "skip_count": 1.0, + "step": 4664, + "text_loss": 0.6935067176818848 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0006389753244428972, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 7525821.0, + "repeat_count": 1.0, + "routers_loss": 0.00957963801920414, + "skip_count": 2.0, + "step": 4666, + "text_loss": 0.3350338637828827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.915761667155856, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0006386779777048666, + "loss": 0.0063, + "macro_f1": 0.6601307392120361, + "num_tokens": 7529513.0, + "repeat_count": 1.0, + "routers_loss": 0.020673364400863647, + "skip_count": 2.0, + "step": 4668, + "text_loss": 0.47800472378730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006383805778251735, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7533450.0, + "repeat_count": 0.0, + "routers_loss": 0.007217096630483866, + "skip_count": 1.0, + "step": 4670, + "text_loss": 0.4506106972694397 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006380831249177817, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 7536287.0, + "repeat_count": 1.0, + "routers_loss": 0.007001714315265417, + "skip_count": 0.0, + "step": 4672, + "text_loss": 0.4081715941429138 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0006377856190966762, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 7539442.0, + "repeat_count": 0.0, + "routers_loss": 0.0015112817054614425, + "skip_count": 0.0, + "step": 4674, + "text_loss": 0.21451139450073242 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0006374880604758615, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 7542594.0, + "repeat_count": 0.0, + "routers_loss": 0.007311929017305374, + "skip_count": 2.0, + "step": 4676, + "text_loss": 0.14785248041152954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006371904491693626, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7545780.0, + "repeat_count": 0.0, + "routers_loss": 0.007489737123250961, + "skip_count": 1.0, + "step": 4678, + "text_loss": 0.2248108983039856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006368927852912247, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 7548287.0, + "repeat_count": 1.0, + "routers_loss": 0.009772555902600288, + "skip_count": 1.0, + "step": 4680, + "text_loss": 0.1566995233297348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006365950689555133, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7551424.0, + "repeat_count": 0.0, + "routers_loss": 0.002134992741048336, + "skip_count": 0.0, + "step": 4682, + "text_loss": 0.7322417497634888 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006362973002763139, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7554182.0, + "repeat_count": 1.0, + "routers_loss": 0.008511497639119625, + "skip_count": 4.0, + "step": 4684, + "text_loss": 0.24387991428375244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0006359994793677319, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 7557044.0, + "repeat_count": 0.0, + "routers_loss": 0.004151526838541031, + "skip_count": 2.0, + "step": 4686, + "text_loss": 0.6139411330223083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006357016063438928, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7560231.0, + "repeat_count": 0.0, + "routers_loss": 0.0009724601986818016, + "skip_count": 0.0, + "step": 4688, + "text_loss": 0.7875718474388123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0006354036813189421, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7562953.0, + "repeat_count": 0.0, + "routers_loss": 0.0008926765876822174, + "skip_count": 0.0, + "step": 4690, + "text_loss": 0.5195512771606445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006351057044070455, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 7566137.0, + "repeat_count": 0.0, + "routers_loss": 0.0031294538639485836, + "skip_count": 0.0, + "step": 4692, + "text_loss": 0.7288873195648193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0006348076757223877, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 7569073.0, + "repeat_count": 0.0, + "routers_loss": 0.0015065820189192891, + "skip_count": 2.0, + "step": 4694, + "text_loss": 0.7242236137390137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0006345095953791746, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7573025.0, + "repeat_count": 0.0, + "routers_loss": 0.0005603441968560219, + "skip_count": 0.0, + "step": 4696, + "text_loss": 0.34443899989128113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0006342114634916307, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7576546.0, + "repeat_count": 0.0, + "routers_loss": 0.0011047758162021637, + "skip_count": 0.0, + "step": 4698, + "text_loss": 0.4892682731151581 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0006339132801740008, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 7580711.0, + "repeat_count": 0.0, + "routers_loss": 0.0019803126342594624, + "skip_count": 2.0, + "step": 4700, + "text_loss": 0.4479489028453827 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0006336150455405494, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 7583385.0, + "repeat_count": 1.0, + "routers_loss": 0.0005326359532773495, + "skip_count": 0.0, + "step": 4702, + "text_loss": 0.627504825592041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006333167597055604, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 7586584.0, + "repeat_count": 0.0, + "routers_loss": 0.0005587987834587693, + "skip_count": 0.0, + "step": 4704, + "text_loss": 0.43891432881355286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0006330184227833376, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 7590408.0, + "repeat_count": 0.0, + "routers_loss": 0.007053783163428307, + "skip_count": 2.0, + "step": 4706, + "text_loss": 0.19946859776973724 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006327200348882043, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7593857.0, + "repeat_count": 1.0, + "routers_loss": 0.0009479080326855183, + "skip_count": 0.0, + "step": 4708, + "text_loss": 0.7973214387893677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006324215961345032, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7596429.0, + "repeat_count": 0.0, + "routers_loss": 0.0012403312139213085, + "skip_count": 0.0, + "step": 4710, + "text_loss": 0.48477989435195923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006321231066365966, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7599618.0, + "repeat_count": 0.0, + "routers_loss": 0.0005520360427908599, + "skip_count": 0.0, + "step": 4712, + "text_loss": 0.44222453236579895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006318245665088665, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 7603180.0, + "repeat_count": 0.0, + "routers_loss": 0.0015553623670712113, + "skip_count": 0.0, + "step": 4714, + "text_loss": 0.5132410526275635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0006315259758657138, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 7606457.0, + "repeat_count": 0.0, + "routers_loss": 0.004210884217172861, + "skip_count": 1.0, + "step": 4716, + "text_loss": 0.39850690960884094 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0006312273348215589, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 7609317.0, + "repeat_count": 1.0, + "routers_loss": 0.001220117206685245, + "skip_count": 0.0, + "step": 4718, + "text_loss": 0.3509018123149872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006309286434908419, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 7613076.0, + "repeat_count": 0.0, + "routers_loss": 0.007768960203975439, + "skip_count": 2.0, + "step": 4720, + "text_loss": 0.33361560106277466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006306299019880217, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7616242.0, + "repeat_count": 0.0, + "routers_loss": 0.006226699333637953, + "skip_count": 0.0, + "step": 4722, + "text_loss": 0.23661087453365326 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.17845611975345, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006303311104275766, + "loss": 0.0073, + "macro_f1": 0.6603773832321167, + "num_tokens": 7619069.0, + "repeat_count": 1.0, + "routers_loss": 0.015590761788189411, + "skip_count": 1.0, + "step": 4724, + "text_loss": 0.23373056948184967 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006300322689240041, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 7622581.0, + "repeat_count": 1.0, + "routers_loss": 0.006862971931695938, + "skip_count": 2.0, + "step": 4726, + "text_loss": 0.8301828503608704 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0006297333775918209, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 7625566.0, + "repeat_count": 1.0, + "routers_loss": 0.006256614346057177, + "skip_count": 1.0, + "step": 4728, + "text_loss": 0.3756707012653351 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0006294344365455626, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 7629047.0, + "repeat_count": 1.0, + "routers_loss": 0.009151885285973549, + "skip_count": 2.0, + "step": 4730, + "text_loss": 0.33362850546836853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006291354458997841, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7631847.0, + "repeat_count": 0.0, + "routers_loss": 0.0009307434665970504, + "skip_count": 0.0, + "step": 4732, + "text_loss": 0.4572524130344391 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0006288364057690591, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7635181.0, + "repeat_count": 0.0, + "routers_loss": 0.00041220212006010115, + "skip_count": 0.0, + "step": 4734, + "text_loss": 0.40211325883865356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0006285373162679804, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7637752.0, + "repeat_count": 0.0, + "routers_loss": 0.0006696670898236334, + "skip_count": 2.0, + "step": 4736, + "text_loss": 0.7588053345680237 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 22.24420311124156, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006282381775111597, + "loss": 0.0081, + "macro_f1": 0.9449735879898071, + "num_tokens": 7640719.0, + "repeat_count": 4.0, + "routers_loss": 0.016283133998513222, + "skip_count": 2.0, + "step": 4738, + "text_loss": 0.5697863101959229 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0006279389896132274, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7643524.0, + "repeat_count": 0.0, + "routers_loss": 0.00763951288536191, + "skip_count": 3.0, + "step": 4740, + "text_loss": 0.548592209815979 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.26298796595245, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006276397526888329, + "loss": 0.0094, + "macro_f1": 0.925203263759613, + "num_tokens": 7646919.0, + "repeat_count": 3.0, + "routers_loss": 0.038590483367443085, + "skip_count": 5.0, + "step": 4742, + "text_loss": 0.27226054668426514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0006273404668526443, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7650404.0, + "repeat_count": 0.0, + "routers_loss": 0.0012555639259517193, + "skip_count": 0.0, + "step": 4744, + "text_loss": 0.47892290353775024 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0006270411322193488, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7652942.0, + "repeat_count": 1.0, + "routers_loss": 0.0015356402145698667, + "skip_count": 0.0, + "step": 4746, + "text_loss": 0.5515767931938171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0006267417489036517, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7656269.0, + "repeat_count": 0.0, + "routers_loss": 0.005182140972465277, + "skip_count": 0.0, + "step": 4748, + "text_loss": 0.3496028184890747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0006264423170202773, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7658664.0, + "repeat_count": 0.0, + "routers_loss": 0.004144361708313227, + "skip_count": 0.0, + "step": 4750, + "text_loss": 0.2786032557487488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0006261428366839685, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7661471.0, + "repeat_count": 0.0, + "routers_loss": 0.00035335420398041606, + "skip_count": 0.0, + "step": 4752, + "text_loss": 0.4838487505912781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0006258433080094868, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7664593.0, + "repeat_count": 0.0, + "routers_loss": 0.0103341368958354, + "skip_count": 2.0, + "step": 4754, + "text_loss": 0.24325360357761383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0006255437311116119, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 7667573.0, + "repeat_count": 0.0, + "routers_loss": 0.014633853919804096, + "skip_count": 2.0, + "step": 4756, + "text_loss": 0.21569855511188507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0006252441061051426, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7671171.0, + "repeat_count": 0.0, + "routers_loss": 0.004900569561868906, + "skip_count": 0.0, + "step": 4758, + "text_loss": 0.12832018733024597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006249444331048955, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 7673932.0, + "repeat_count": 0.0, + "routers_loss": 0.0020371589343994856, + "skip_count": 0.0, + "step": 4760, + "text_loss": 0.38652482628822327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.000624644712225706, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7677396.0, + "repeat_count": 0.0, + "routers_loss": 0.0028059002943336964, + "skip_count": 2.0, + "step": 4762, + "text_loss": 0.7937633395195007 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0006243449435824276, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7680392.0, + "repeat_count": 0.0, + "routers_loss": 0.0007225095760077238, + "skip_count": 0.0, + "step": 4764, + "text_loss": 0.5690395832061768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006240451272899321, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7684121.0, + "repeat_count": 0.0, + "routers_loss": 0.002052050782367587, + "skip_count": 1.0, + "step": 4766, + "text_loss": 0.5321336984634399 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006237452634631099, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 7687236.0, + "repeat_count": 1.0, + "routers_loss": 0.0039039517287164927, + "skip_count": 0.0, + "step": 4768, + "text_loss": 0.30823320150375366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 22.394481948928675, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0006234453522168694, + "loss": 0.0084, + "macro_f1": 0.5492662787437439, + "num_tokens": 7690355.0, + "repeat_count": 0.0, + "routers_loss": 0.014570238068699837, + "skip_count": 2.0, + "step": 4770, + "text_loss": 0.21501587331295013 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 22.403874376284122, + "f1_execute": 0.949999988079071, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.04541015625, + "learning_rate": 0.000623145393666137, + "loss": 0.0069, + "macro_f1": 0.886363685131073, + "num_tokens": 7693559.0, + "repeat_count": 3.0, + "routers_loss": 0.061707716435194016, + "skip_count": 6.0, + "step": 4772, + "text_loss": 0.24371100962162018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006228453879258576, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 7696422.0, + "repeat_count": 0.0, + "routers_loss": 0.005053870379924774, + "skip_count": 2.0, + "step": 4774, + "text_loss": 0.237778440117836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0006225453351109934, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 7700460.0, + "repeat_count": 0.0, + "routers_loss": 0.0017990898340940475, + "skip_count": 0.0, + "step": 4776, + "text_loss": 0.612456738948822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.000622245235336526, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7703330.0, + "repeat_count": 0.0, + "routers_loss": 0.004507021512836218, + "skip_count": 2.0, + "step": 4778, + "text_loss": 0.36898812651634216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006219450887174537, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7707243.0, + "repeat_count": 0.0, + "routers_loss": 0.006295828148722649, + "skip_count": 1.0, + "step": 4780, + "text_loss": 0.14474599063396454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006216448953687932, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7711121.0, + "repeat_count": 0.0, + "routers_loss": 0.005049831233918667, + "skip_count": 0.0, + "step": 4782, + "text_loss": 0.4696790277957916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006213446554055795, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7714889.0, + "repeat_count": 0.0, + "routers_loss": 0.0006010758224874735, + "skip_count": 0.0, + "step": 4784, + "text_loss": 0.46253830194473267 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 22.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006210443689428649, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 7718420.0, + "repeat_count": 3.0, + "routers_loss": 0.006691234186291695, + "skip_count": 1.0, + "step": 4786, + "text_loss": 0.579987645149231 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00062074403609572, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7721720.0, + "repeat_count": 0.0, + "routers_loss": 0.001864895923063159, + "skip_count": 0.0, + "step": 4788, + "text_loss": 0.325242817401886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0006204436569792324, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 7724916.0, + "repeat_count": 0.0, + "routers_loss": 0.00202955212444067, + "skip_count": 0.0, + "step": 4790, + "text_loss": 0.49637556076049805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006201432317085083, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 7728081.0, + "repeat_count": 1.0, + "routers_loss": 0.0037843603640794754, + "skip_count": 0.0, + "step": 4792, + "text_loss": 0.38812628388404846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0006198427603986711, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7731457.0, + "repeat_count": 0.0, + "routers_loss": 0.012036679312586784, + "skip_count": 3.0, + "step": 4794, + "text_loss": 0.2996312379837036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0006195422431648623, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 7734595.0, + "repeat_count": 0.0, + "routers_loss": 0.0008874868508428335, + "skip_count": 1.0, + "step": 4796, + "text_loss": 0.3203189969062805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0006192416801222403, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 7737565.0, + "repeat_count": 1.0, + "routers_loss": 0.0032894534524530172, + "skip_count": 1.0, + "step": 4798, + "text_loss": 0.3283322751522064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0006189410713859815, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 7740439.0, + "repeat_count": 0.0, + "routers_loss": 0.009667043574154377, + "skip_count": 2.0, + "step": 4800, + "text_loss": 0.25219282507896423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 22.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006186404170712797, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 7743813.0, + "repeat_count": 0.0, + "routers_loss": 0.012643060646951199, + "skip_count": 4.0, + "step": 4802, + "text_loss": 0.22567439079284668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006183397172933462, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7747182.0, + "repeat_count": 0.0, + "routers_loss": 0.002678517485037446, + "skip_count": 0.0, + "step": 4804, + "text_loss": 0.19188879430294037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0006180389721674101, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 7750735.0, + "repeat_count": 0.0, + "routers_loss": 0.0013385121710598469, + "skip_count": 0.0, + "step": 4806, + "text_loss": 0.5860441327095032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000617738181808717, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7753843.0, + "repeat_count": 0.0, + "routers_loss": 0.0034869094379246235, + "skip_count": 1.0, + "step": 4808, + "text_loss": 0.4366260766983032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0006174373463325306, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7757039.0, + "repeat_count": 0.0, + "routers_loss": 0.0013648992171511054, + "skip_count": 0.0, + "step": 4810, + "text_loss": 0.5217258334159851 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0006171364658541314, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 7760016.0, + "repeat_count": 1.0, + "routers_loss": 0.0038017008919268847, + "skip_count": 2.0, + "step": 4812, + "text_loss": 0.8130963444709778 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0006168355404888177, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 7762961.0, + "repeat_count": 0.0, + "routers_loss": 0.006867518648505211, + "skip_count": 2.0, + "step": 4814, + "text_loss": 0.17822521924972534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006165345703519043, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7766399.0, + "repeat_count": 0.0, + "routers_loss": 0.0004653502255678177, + "skip_count": 0.0, + "step": 4816, + "text_loss": 0.5316070914268494 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006162335555587238, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 7769039.0, + "repeat_count": 1.0, + "routers_loss": 0.0016906452365219593, + "skip_count": 1.0, + "step": 4818, + "text_loss": 0.5680997967720032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0006159324962246257, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7772768.0, + "repeat_count": 0.0, + "routers_loss": 0.002541248919442296, + "skip_count": 0.0, + "step": 4820, + "text_loss": 0.6169226169586182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006156313924649762, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7775545.0, + "repeat_count": 0.0, + "routers_loss": 0.008644679561257362, + "skip_count": 2.0, + "step": 4822, + "text_loss": 0.2211475968360901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0006153302443951589, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7778837.0, + "repeat_count": 0.0, + "routers_loss": 0.0041346061043441296, + "skip_count": 2.0, + "step": 4824, + "text_loss": 0.5369775891304016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0006150290521305746, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 7782309.0, + "repeat_count": 0.0, + "routers_loss": 0.0012756052892655134, + "skip_count": 0.0, + "step": 4826, + "text_loss": 0.5294989943504333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.666862342236573, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006147278157866403, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 7785565.0, + "repeat_count": 0.0, + "routers_loss": 0.029718991369009018, + "skip_count": 1.0, + "step": 4828, + "text_loss": 0.6920449733734131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006144265354787906, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7788218.0, + "repeat_count": 0.0, + "routers_loss": 0.004829924553632736, + "skip_count": 0.0, + "step": 4830, + "text_loss": 0.17072243988513947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0006141252113224767, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7790788.0, + "repeat_count": 0.0, + "routers_loss": 0.00254037044942379, + "skip_count": 0.0, + "step": 4832, + "text_loss": 0.20075996220111847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01519775390625, + "learning_rate": 0.0006138238434331666, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7793913.0, + "repeat_count": 0.0, + "routers_loss": 0.0004426188243087381, + "skip_count": 0.0, + "step": 4834, + "text_loss": 0.695742130279541 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.000613522431926345, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 7796932.0, + "repeat_count": 1.0, + "routers_loss": 0.005176798906177282, + "skip_count": 3.0, + "step": 4836, + "text_loss": 0.4910822808742523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0006132209769175132, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7800686.0, + "repeat_count": 0.0, + "routers_loss": 0.004120545461773872, + "skip_count": 0.0, + "step": 4838, + "text_loss": 0.3701378405094147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0006129194785221894, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7804765.0, + "repeat_count": 0.0, + "routers_loss": 0.0043835826218128204, + "skip_count": 0.0, + "step": 4840, + "text_loss": 0.343635618686676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006126179368559086, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 7807498.0, + "repeat_count": 0.0, + "routers_loss": 0.001394893741235137, + "skip_count": 1.0, + "step": 4842, + "text_loss": 0.47756674885749817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.000612316352034222, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7810784.0, + "repeat_count": 0.0, + "routers_loss": 0.0031262130942195654, + "skip_count": 2.0, + "step": 4844, + "text_loss": 0.13077901303768158 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.751394188435572, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0006120147241726972, + "loss": 0.0081, + "macro_f1": 0.8823530077934265, + "num_tokens": 7814754.0, + "repeat_count": 2.0, + "routers_loss": 0.016139274463057518, + "skip_count": 1.0, + "step": 4846, + "text_loss": 0.18850074708461761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0006117130533869189, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7818245.0, + "repeat_count": 0.0, + "routers_loss": 0.0009124451316893101, + "skip_count": 0.0, + "step": 4848, + "text_loss": 0.42503559589385986 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006114113397924878, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7822214.0, + "repeat_count": 0.0, + "routers_loss": 0.0015132242115214467, + "skip_count": 0.0, + "step": 4850, + "text_loss": 0.16767354309558868 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006111095835050212, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 7825019.0, + "repeat_count": 2.0, + "routers_loss": 0.006253300234675407, + "skip_count": 2.0, + "step": 4852, + "text_loss": 0.44826745986938477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0006108077846401524, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 7828113.0, + "repeat_count": 0.0, + "routers_loss": 0.0024391328915953636, + "skip_count": 0.0, + "step": 4854, + "text_loss": 0.2009880244731903 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006105059433135317, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 7831177.0, + "repeat_count": 1.0, + "routers_loss": 0.0020866121631115675, + "skip_count": 1.0, + "step": 4856, + "text_loss": 0.7082528471946716 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0006102040596408251, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 7834485.0, + "repeat_count": 0.0, + "routers_loss": 0.004373365081846714, + "skip_count": 1.0, + "step": 4858, + "text_loss": 0.2541539669036865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006099021337377148, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7837749.0, + "repeat_count": 0.0, + "routers_loss": 0.004309024661779404, + "skip_count": 0.0, + "step": 4860, + "text_loss": 0.3163885176181793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 22.82653360727913, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.049072265625, + "learning_rate": 0.0006096001657198995, + "loss": 0.0065, + "macro_f1": 0.6122449040412903, + "num_tokens": 7840979.0, + "repeat_count": 0.0, + "routers_loss": 0.023044804111123085, + "skip_count": 4.0, + "step": 4862, + "text_loss": 0.49609798192977905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0006092981557030941, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 7844905.0, + "repeat_count": 1.0, + "routers_loss": 0.010683654807507992, + "skip_count": 3.0, + "step": 4864, + "text_loss": 0.16866883635520935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006089961038030291, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 7847800.0, + "repeat_count": 0.0, + "routers_loss": 0.0011224723421037197, + "skip_count": 0.0, + "step": 4866, + "text_loss": 0.5093055367469788 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0006086940101354515, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7850983.0, + "repeat_count": 0.0, + "routers_loss": 0.003944621421396732, + "skip_count": 1.0, + "step": 4868, + "text_loss": 0.5753747224807739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 22.86410331670091, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0006083918748161244, + "loss": 0.0069, + "macro_f1": 0.5492662787437439, + "num_tokens": 7855041.0, + "repeat_count": 0.0, + "routers_loss": 0.02532145567238331, + "skip_count": 2.0, + "step": 4870, + "text_loss": 0.8082366585731506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006080896979608262, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7858058.0, + "repeat_count": 0.0, + "routers_loss": 0.0007558314246125519, + "skip_count": 0.0, + "step": 4872, + "text_loss": 0.6476574540138245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.000607787479685352, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7861223.0, + "repeat_count": 0.0, + "routers_loss": 0.0009224560926668346, + "skip_count": 0.0, + "step": 4874, + "text_loss": 0.5012133717536926 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006074852201055121, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7864180.0, + "repeat_count": 0.0, + "routers_loss": 0.0028308273758739233, + "skip_count": 0.0, + "step": 4876, + "text_loss": 0.7447214722633362 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0006071829193371331, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7866726.0, + "repeat_count": 0.0, + "routers_loss": 0.0021505290642380714, + "skip_count": 0.0, + "step": 4878, + "text_loss": 0.5444929599761963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006068805774960573, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7870166.0, + "repeat_count": 0.0, + "routers_loss": 0.0021109723020344973, + "skip_count": 0.0, + "step": 4880, + "text_loss": 0.3577263355255127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0006065781946981425, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7873028.0, + "repeat_count": 0.0, + "routers_loss": 0.0027144821360707283, + "skip_count": 0.0, + "step": 4882, + "text_loss": 0.28464797139167786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006062757710592624, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7876747.0, + "repeat_count": 0.0, + "routers_loss": 0.0004638207610696554, + "skip_count": 0.0, + "step": 4884, + "text_loss": 0.381534606218338 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006059733066953066, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 7879524.0, + "repeat_count": 1.0, + "routers_loss": 0.002225410658866167, + "skip_count": 2.0, + "step": 4886, + "text_loss": 0.5167883634567261 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006056708017221796, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 7882809.0, + "repeat_count": 0.0, + "routers_loss": 0.00419368501752615, + "skip_count": 1.0, + "step": 4888, + "text_loss": 0.22688335180282593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000605368256255802, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7886310.0, + "repeat_count": 0.0, + "routers_loss": 0.0017340193735435605, + "skip_count": 1.0, + "step": 4890, + "text_loss": 1.0128135681152344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0006050656704121098, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 7889483.0, + "repeat_count": 0.0, + "routers_loss": 0.0016647159354761243, + "skip_count": 0.0, + "step": 4892, + "text_loss": 0.2213262915611267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006047630443070547, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7892615.0, + "repeat_count": 0.0, + "routers_loss": 0.0038971947506070137, + "skip_count": 3.0, + "step": 4894, + "text_loss": 0.45751357078552246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0006044603780566032, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 7895747.0, + "repeat_count": 1.0, + "routers_loss": 0.0036852145567536354, + "skip_count": 1.0, + "step": 4896, + "text_loss": 0.13489919900894165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0006041576717767379, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7899155.0, + "repeat_count": 0.0, + "routers_loss": 0.007661987561732531, + "skip_count": 1.0, + "step": 4898, + "text_loss": 0.281853586435318 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006038549255834563, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7901667.0, + "repeat_count": 2.0, + "routers_loss": 0.01836695335805416, + "skip_count": 5.0, + "step": 4900, + "text_loss": 0.24879895150661469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.000603552139592771, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7904506.0, + "repeat_count": 0.0, + "routers_loss": 0.0011829182039946318, + "skip_count": 0.0, + "step": 4902, + "text_loss": 0.7550268769264221 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 23.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006032493139207106, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7907316.0, + "repeat_count": 1.0, + "routers_loss": 0.0022891140542924404, + "skip_count": 0.0, + "step": 4904, + "text_loss": 0.37596020102500916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0006029464486833186, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 7911283.0, + "repeat_count": 0.0, + "routers_loss": 0.001990227960050106, + "skip_count": 0.0, + "step": 4906, + "text_loss": 0.5879577994346619 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0006026435439966531, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 7913907.0, + "repeat_count": 0.0, + "routers_loss": 0.0026039890944957733, + "skip_count": 1.0, + "step": 4908, + "text_loss": 0.41484713554382324 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0006023405999767879, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7916772.0, + "repeat_count": 0.0, + "routers_loss": 0.009183229878544807, + "skip_count": 1.0, + "step": 4910, + "text_loss": 0.20732562243938446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0006020376167398116, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7919346.0, + "repeat_count": 0.0, + "routers_loss": 0.005508727394044399, + "skip_count": 1.0, + "step": 4912, + "text_loss": 0.41416165232658386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 23.070443205165834, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0006017345944018284, + "loss": 0.0051, + "macro_f1": 0.3272727429866791, + "num_tokens": 7922404.0, + "repeat_count": 0.0, + "routers_loss": 0.008651934564113617, + "skip_count": 0.0, + "step": 4914, + "text_loss": 0.4290519952774048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0006014315330789563, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 7925165.0, + "repeat_count": 0.0, + "routers_loss": 0.003601635340601206, + "skip_count": 1.0, + "step": 4916, + "text_loss": 0.8447931408882141 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006011284328873296, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 7928146.0, + "repeat_count": 1.0, + "routers_loss": 0.0049415635876357555, + "skip_count": 2.0, + "step": 4918, + "text_loss": 0.32237401604652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0006008252939430967, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7931163.0, + "repeat_count": 0.0, + "routers_loss": 0.0024150956887751818, + "skip_count": 0.0, + "step": 4920, + "text_loss": 0.2251713126897812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.108012914587615, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006005221163624209, + "loss": 0.0057, + "macro_f1": 0.3272727429866791, + "num_tokens": 7934084.0, + "repeat_count": 1.0, + "routers_loss": 0.03181030973792076, + "skip_count": 0.0, + "step": 4922, + "text_loss": 0.4962928593158722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.0006002189002614806, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 7937021.0, + "repeat_count": 0.0, + "routers_loss": 0.00227518193423748, + "skip_count": 2.0, + "step": 4924, + "text_loss": 0.34440335631370544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005999156457564685, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 7940205.0, + "repeat_count": 0.0, + "routers_loss": 0.004331593867391348, + "skip_count": 1.0, + "step": 4926, + "text_loss": 0.14114083349704742 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005996123529635925, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7945174.0, + "repeat_count": 0.0, + "routers_loss": 0.000612895586527884, + "skip_count": 0.0, + "step": 4928, + "text_loss": 0.3895469009876251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.145582624009393, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000599309021999075, + "loss": 0.006, + "macro_f1": 0.3272727429866791, + "num_tokens": 7948716.0, + "repeat_count": 0.0, + "routers_loss": 0.02319233864545822, + "skip_count": 1.0, + "step": 4930, + "text_loss": 0.38103172183036804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005990056529791528, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7952497.0, + "repeat_count": 0.0, + "routers_loss": 0.003423231653869152, + "skip_count": 0.0, + "step": 4932, + "text_loss": 0.30447322130203247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017822265625, + "learning_rate": 0.0005987022460200778, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7955578.0, + "repeat_count": 0.0, + "routers_loss": 0.0007005351362749934, + "skip_count": 0.0, + "step": 4934, + "text_loss": 0.49621838331222534 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.173759906075727, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0005983988012381159, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 7958741.0, + "repeat_count": 2.0, + "routers_loss": 0.03962617367506027, + "skip_count": 1.0, + "step": 4936, + "text_loss": 0.1920493096113205 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.0005980953187495476, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 7962236.0, + "repeat_count": 0.0, + "routers_loss": 0.0026006060652434826, + "skip_count": 3.0, + "step": 4938, + "text_loss": 0.5286803841590881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0005977917986706681, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7965631.0, + "repeat_count": 0.0, + "routers_loss": 0.005010952707380056, + "skip_count": 0.0, + "step": 4940, + "text_loss": 0.3507745563983917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005974882411177871, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7968516.0, + "repeat_count": 0.0, + "routers_loss": 0.0023964287247508764, + "skip_count": 0.0, + "step": 4942, + "text_loss": 0.9110504388809204 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000597184646207228, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7971310.0, + "repeat_count": 0.0, + "routers_loss": 0.0026230409275740385, + "skip_count": 1.0, + "step": 4944, + "text_loss": 0.4131232798099518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005968810140553292, + "loss": 0.0102, + "macro_f1": 0.3333333432674408, + "num_tokens": 7974809.0, + "repeat_count": 0.0, + "routers_loss": 0.0007397596491500735, + "skip_count": 0.0, + "step": 4946, + "text_loss": 0.5130466222763062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0005965773447784431, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7977800.0, + "repeat_count": 0.0, + "routers_loss": 0.0009955473942682147, + "skip_count": 0.0, + "step": 4948, + "text_loss": 0.5366153717041016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01373291015625, + "learning_rate": 0.0005962736384929362, + "loss": 0.0026, + "macro_f1": 0.3333333432674408, + "num_tokens": 7981027.0, + "repeat_count": 0.0, + "routers_loss": 0.0049227322451770306, + "skip_count": 0.0, + "step": 4950, + "text_loss": 0.17266370356082916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0005959698953151895, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7983580.0, + "repeat_count": 0.0, + "routers_loss": 0.0009975163266062737, + "skip_count": 0.0, + "step": 4952, + "text_loss": 0.2474549114704132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0005956661153615979, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7986711.0, + "repeat_count": 0.0, + "routers_loss": 0.0006475782720372081, + "skip_count": 0.0, + "step": 4954, + "text_loss": 0.5748327970504761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0005953622987485703, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7990194.0, + "repeat_count": 0.0, + "routers_loss": 0.001449751085601747, + "skip_count": 0.0, + "step": 4956, + "text_loss": 0.5163559317588806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0005950584455925301, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7993050.0, + "repeat_count": 0.0, + "routers_loss": 0.0017087773885577917, + "skip_count": 0.0, + "step": 4958, + "text_loss": 0.15892620384693146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0005947545560099142, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 7996383.0, + "repeat_count": 0.0, + "routers_loss": 0.0044417232275009155, + "skip_count": 0.0, + "step": 4960, + "text_loss": 0.48022928833961487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 23.295861461696507, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031982421875, + "learning_rate": 0.0005944506301171734, + "loss": 0.0066, + "macro_f1": 0.5492662787437439, + "num_tokens": 7999843.0, + "repeat_count": 0.0, + "routers_loss": 0.010093312710523605, + "skip_count": 2.0, + "step": 4962, + "text_loss": 0.5050316452980042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005941466680307732, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8003504.0, + "repeat_count": 0.0, + "routers_loss": 0.009699694812297821, + "skip_count": 0.0, + "step": 4964, + "text_loss": 0.30474427342414856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 23.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0005938426698671922, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 8007427.0, + "repeat_count": 1.0, + "routers_loss": 0.0016759657301008701, + "skip_count": 0.0, + "step": 4966, + "text_loss": 0.25060293078422546 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005935386357429232, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8010265.0, + "repeat_count": 2.0, + "routers_loss": 0.006916914135217667, + "skip_count": 3.0, + "step": 4968, + "text_loss": 0.49084481596946716 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0005932345657744723, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 8013733.0, + "repeat_count": 1.0, + "routers_loss": 0.017182426527142525, + "skip_count": 5.0, + "step": 4970, + "text_loss": 0.2705717980861664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00059293046007836, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8017068.0, + "repeat_count": 0.0, + "routers_loss": 0.008485594764351845, + "skip_count": 2.0, + "step": 4972, + "text_loss": 0.18570218980312347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0005926263187711201, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8020185.0, + "repeat_count": 0.0, + "routers_loss": 0.0021750847809016705, + "skip_count": 2.0, + "step": 4974, + "text_loss": 0.4457069933414459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0005923221419693001, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 8023038.0, + "repeat_count": 0.0, + "routers_loss": 0.0020193420350551605, + "skip_count": 0.0, + "step": 4976, + "text_loss": 0.7394505143165588 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.0005920179297894613, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8026236.0, + "repeat_count": 0.0, + "routers_loss": 0.001450369250960648, + "skip_count": 1.0, + "step": 4978, + "text_loss": 0.5914503335952759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.000591713682348178, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8028765.0, + "repeat_count": 0.0, + "routers_loss": 0.0017808573320508003, + "skip_count": 0.0, + "step": 4980, + "text_loss": 0.19231407344341278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005914093997620388, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8032043.0, + "repeat_count": 0.0, + "routers_loss": 0.0018225493840873241, + "skip_count": 0.0, + "step": 4982, + "text_loss": 0.3567875325679779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005911050821476449, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8035086.0, + "repeat_count": 0.0, + "routers_loss": 0.0016285666497424245, + "skip_count": 0.0, + "step": 4984, + "text_loss": 0.34609633684158325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0005908007296216119, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8038193.0, + "repeat_count": 0.0, + "routers_loss": 0.0014699801104143262, + "skip_count": 0.0, + "step": 4986, + "text_loss": 0.4492359757423401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.000590496342300568, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8041099.0, + "repeat_count": 0.0, + "routers_loss": 0.002442725468426943, + "skip_count": 0.0, + "step": 4988, + "text_loss": 0.5162975788116455 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005901919203011548, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8044350.0, + "repeat_count": 0.0, + "routers_loss": 0.008624207228422165, + "skip_count": 2.0, + "step": 4990, + "text_loss": 0.2533033490180969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005898874637400279, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8047467.0, + "repeat_count": 0.0, + "routers_loss": 0.0015421364223584533, + "skip_count": 0.0, + "step": 4992, + "text_loss": 0.4890289306640625 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0005895829727338552, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 8050626.0, + "repeat_count": 1.0, + "routers_loss": 0.0024516626726835966, + "skip_count": 2.0, + "step": 4994, + "text_loss": 0.50797039270401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005892784473993184, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 8053386.0, + "repeat_count": 0.0, + "routers_loss": 0.0018553845584392548, + "skip_count": 2.0, + "step": 4996, + "text_loss": 0.628828763961792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.000588973887853112, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8055941.0, + "repeat_count": 0.0, + "routers_loss": 0.004258487373590469, + "skip_count": 0.0, + "step": 4998, + "text_loss": 0.2643229067325592 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.474317581449956, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005886692942119441, + "loss": 0.0062, + "macro_f1": 0.8820862174034119, + "num_tokens": 8058638.0, + "repeat_count": 2.0, + "routers_loss": 0.019064312800765038, + "skip_count": 2.0, + "step": 5000, + "text_loss": 0.4925006031990051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005883646665925353, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 8062097.0, + "repeat_count": 0.0, + "routers_loss": 0.0007969749276526272, + "skip_count": 0.0, + "step": 5002, + "text_loss": 0.49412909150123596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005880600051116196, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8065202.0, + "repeat_count": 0.0, + "routers_loss": 0.005813780706375837, + "skip_count": 2.0, + "step": 5004, + "text_loss": 0.5681346654891968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0005877553098859439, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8068574.0, + "repeat_count": 0.0, + "routers_loss": 0.005012941546738148, + "skip_count": 0.0, + "step": 5006, + "text_loss": 0.2682424485683441 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005874505810322678, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 8071834.0, + "repeat_count": 0.0, + "routers_loss": 0.005859757773578167, + "skip_count": 3.0, + "step": 5008, + "text_loss": 0.6460036039352417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.000587145818667364, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 8074687.0, + "repeat_count": 0.0, + "routers_loss": 0.002868571551516652, + "skip_count": 2.0, + "step": 5010, + "text_loss": 0.2405751347541809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005868410229080181, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8077617.0, + "repeat_count": 0.0, + "routers_loss": 0.0021759893279522657, + "skip_count": 1.0, + "step": 5012, + "text_loss": 0.7455595135688782 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005865361938710286, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8080734.0, + "repeat_count": 0.0, + "routers_loss": 0.0008311949786730111, + "skip_count": 0.0, + "step": 5014, + "text_loss": 0.44876906275749207 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 23.549457000293515, + "f1_execute": 0.9756097793579102, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.0390625, + "learning_rate": 0.0005862313316732063, + "loss": 0.0054, + "macro_f1": 0.9615669250488281, + "num_tokens": 8085092.0, + "repeat_count": 2.0, + "routers_loss": 0.012511664070189, + "skip_count": 6.0, + "step": 5016, + "text_loss": 0.26010942459106445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.000585926436431375, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 8088333.0, + "repeat_count": 0.0, + "routers_loss": 0.0035441694781184196, + "skip_count": 0.0, + "step": 5018, + "text_loss": 0.28225192427635193 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 23.568241855004402, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005856215082623711, + "loss": 0.0093, + "macro_f1": 0.8823530077934265, + "num_tokens": 8091298.0, + "repeat_count": 1.0, + "routers_loss": 0.023543989285826683, + "skip_count": 2.0, + "step": 5020, + "text_loss": 0.5757577419281006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0005853165472830439, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8094361.0, + "repeat_count": 0.0, + "routers_loss": 0.003124240320175886, + "skip_count": 0.0, + "step": 5022, + "text_loss": 0.4021305739879608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0005850115536102546, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8097514.0, + "repeat_count": 0.0, + "routers_loss": 0.008170558139681816, + "skip_count": 1.0, + "step": 5024, + "text_loss": 0.18926584720611572 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0005847065273608777, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 8100525.0, + "repeat_count": 1.0, + "routers_loss": 0.02127663604915142, + "skip_count": 5.0, + "step": 5026, + "text_loss": 0.18827557563781738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0005844014686517998, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8104016.0, + "repeat_count": 0.0, + "routers_loss": 0.00272122910246253, + "skip_count": 0.0, + "step": 5028, + "text_loss": 0.15534701943397522 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 23.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005840963775999199, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8106697.0, + "repeat_count": 5.0, + "routers_loss": 0.008979840204119682, + "skip_count": 4.0, + "step": 5030, + "text_loss": 0.8123718500137329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005837912543221493, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 8110986.0, + "repeat_count": 0.0, + "routers_loss": 0.005006929859519005, + "skip_count": 0.0, + "step": 5032, + "text_loss": 0.26128846406936646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0005834860989354121, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 8114010.0, + "repeat_count": 0.0, + "routers_loss": 0.0005531277856789529, + "skip_count": 0.0, + "step": 5034, + "text_loss": 0.5100266933441162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.64338127384796, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0005831809115566442, + "loss": 0.0073, + "macro_f1": 0.6538461446762085, + "num_tokens": 8117168.0, + "repeat_count": 2.0, + "routers_loss": 0.04978533461689949, + "skip_count": 1.0, + "step": 5036, + "text_loss": 0.41049885749816895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0005828756923027941, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8119900.0, + "repeat_count": 0.0, + "routers_loss": 0.0006322385743260384, + "skip_count": 0.0, + "step": 5038, + "text_loss": 0.5584380626678467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0005825704412908225, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8123928.0, + "repeat_count": 0.0, + "routers_loss": 0.001000594231300056, + "skip_count": 0.0, + "step": 5040, + "text_loss": 0.6460791230201721 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005822651586377019, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 8127926.0, + "repeat_count": 0.0, + "routers_loss": 0.011595834977924824, + "skip_count": 2.0, + "step": 5042, + "text_loss": 0.3131820261478424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0005819598444604173, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 8131092.0, + "repeat_count": 0.0, + "routers_loss": 0.004449303261935711, + "skip_count": 3.0, + "step": 5044, + "text_loss": 0.2774372696876526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0005816544988759658, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8134051.0, + "repeat_count": 0.0, + "routers_loss": 0.0007877505850046873, + "skip_count": 0.0, + "step": 5046, + "text_loss": 0.39496293663978577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0005813491220013563, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 8138725.0, + "repeat_count": 0.0, + "routers_loss": 0.002868623472750187, + "skip_count": 0.0, + "step": 5048, + "text_loss": 0.3779948651790619 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0005810437139536098, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 8141913.0, + "repeat_count": 2.0, + "routers_loss": 0.006244937423616648, + "skip_count": 4.0, + "step": 5050, + "text_loss": 0.4512978494167328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0005807382748497592, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 8146193.0, + "repeat_count": 0.0, + "routers_loss": 0.0011013929033651948, + "skip_count": 0.0, + "step": 5052, + "text_loss": 0.6194499731063843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0005804328048068493, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8149701.0, + "repeat_count": 0.0, + "routers_loss": 0.005505079869180918, + "skip_count": 1.0, + "step": 5054, + "text_loss": 0.2932305335998535 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005801273039419368, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 8152861.0, + "repeat_count": 1.0, + "routers_loss": 0.0057641929015517235, + "skip_count": 1.0, + "step": 5056, + "text_loss": 0.2631317973136902 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005798217723720904, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 8155843.0, + "repeat_count": 1.0, + "routers_loss": 0.0021671492140740156, + "skip_count": 5.0, + "step": 5058, + "text_loss": 0.2889988422393799 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0005795162102143902, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8158812.0, + "repeat_count": 0.0, + "routers_loss": 0.004476628266274929, + "skip_count": 1.0, + "step": 5060, + "text_loss": 0.48028868436813354 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005792106175859283, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 8162719.0, + "repeat_count": 1.0, + "routers_loss": 0.0038497636560350657, + "skip_count": 3.0, + "step": 5062, + "text_loss": 0.4559471607208252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0005789049946038083, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8165692.0, + "repeat_count": 0.0, + "routers_loss": 0.004451582673937082, + "skip_count": 0.0, + "step": 5064, + "text_loss": 0.3782602548599243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005785993413851456, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8168900.0, + "repeat_count": 0.0, + "routers_loss": 0.002951978938654065, + "skip_count": 0.0, + "step": 5066, + "text_loss": 0.32392629981040955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.000578293658047067, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8171661.0, + "repeat_count": 0.0, + "routers_loss": 0.011171254329383373, + "skip_count": 2.0, + "step": 5068, + "text_loss": 0.24492619931697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0005779879447067109, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 8175075.0, + "repeat_count": 0.0, + "routers_loss": 0.0016067599644884467, + "skip_count": 0.0, + "step": 5070, + "text_loss": 0.7738823294639587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.000577682201481227, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8178515.0, + "repeat_count": 0.0, + "routers_loss": 0.009113503620028496, + "skip_count": 1.0, + "step": 5072, + "text_loss": 0.2082248032093048 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0005773764284877774, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8181790.0, + "repeat_count": 1.0, + "routers_loss": 0.007332196459174156, + "skip_count": 1.0, + "step": 5074, + "text_loss": 0.4557662904262543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0005770706258435342, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8184854.0, + "repeat_count": 0.0, + "routers_loss": 0.0016252279747277498, + "skip_count": 0.0, + "step": 5076, + "text_loss": 0.2888098657131195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0005767647936656818, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8187860.0, + "repeat_count": 0.0, + "routers_loss": 0.003406575648114085, + "skip_count": 0.0, + "step": 5078, + "text_loss": 0.6533790230751038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005764589320714158, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 8191683.0, + "repeat_count": 0.0, + "routers_loss": 0.0006520140450447798, + "skip_count": 0.0, + "step": 5080, + "text_loss": 0.6903796195983887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0005761530411779426, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8195109.0, + "repeat_count": 0.0, + "routers_loss": 0.01188349537551403, + "skip_count": 1.0, + "step": 5082, + "text_loss": 0.20460398495197296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0005758471211024804, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 8198340.0, + "repeat_count": 0.0, + "routers_loss": 0.004826809279620647, + "skip_count": 3.0, + "step": 5084, + "text_loss": 0.2203969657421112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.0005755411719622584, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8200882.0, + "repeat_count": 0.0, + "routers_loss": 0.0019170823507010937, + "skip_count": 0.0, + "step": 5086, + "text_loss": 0.6744595170021057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005752351938745167, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 8203777.0, + "repeat_count": 0.0, + "routers_loss": 0.002110893838107586, + "skip_count": 1.0, + "step": 5088, + "text_loss": 0.4137859046459198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.000574929186956507, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 8207627.0, + "repeat_count": 0.0, + "routers_loss": 0.0018580821342766285, + "skip_count": 1.0, + "step": 5090, + "text_loss": 0.4830456078052521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.906369239800412, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0005746231513254912, + "loss": 0.0066, + "macro_f1": 0.3272727429866791, + "num_tokens": 8210263.0, + "repeat_count": 1.0, + "routers_loss": 0.0194723978638649, + "skip_count": 0.0, + "step": 5092, + "text_loss": 0.17383277416229248 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005743170870987433, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 8214166.0, + "repeat_count": 0.0, + "routers_loss": 0.006944256369024515, + "skip_count": 2.0, + "step": 5094, + "text_loss": 0.20003484189510345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0005740109943935472, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8217545.0, + "repeat_count": 0.0, + "routers_loss": 0.002044794149696827, + "skip_count": 1.0, + "step": 5096, + "text_loss": 0.5117167830467224 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0005737048733271986, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 8220673.0, + "repeat_count": 1.0, + "routers_loss": 0.009966124780476093, + "skip_count": 2.0, + "step": 5098, + "text_loss": 0.2705996036529541 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005733987240170035, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8223796.0, + "repeat_count": 0.0, + "routers_loss": 0.0009675708715803921, + "skip_count": 0.0, + "step": 5100, + "text_loss": 0.7016357183456421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0005730925465802788, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8227048.0, + "repeat_count": 0.0, + "routers_loss": 0.0009548200177960098, + "skip_count": 0.0, + "step": 5102, + "text_loss": 0.30823078751564026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005727863411343526, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8229971.0, + "repeat_count": 0.0, + "routers_loss": 0.0005767418188042939, + "skip_count": 0.0, + "step": 5104, + "text_loss": 0.6897505521774292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0005724801077965629, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8232758.0, + "repeat_count": 0.0, + "routers_loss": 0.009297889657318592, + "skip_count": 3.0, + "step": 5106, + "text_loss": 0.21293514966964722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.981508658643968, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005721738466842592, + "loss": 0.0079, + "macro_f1": 0.3272727429866791, + "num_tokens": 8238154.0, + "repeat_count": 1.0, + "routers_loss": 0.013964693062007427, + "skip_count": 0.0, + "step": 5108, + "text_loss": 0.7273620367050171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 23.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005718675579148014, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8240818.0, + "repeat_count": 3.0, + "routers_loss": 0.007218098267912865, + "skip_count": 1.0, + "step": 5110, + "text_loss": 0.5607150793075562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005715612416055598, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 8244048.0, + "repeat_count": 0.0, + "routers_loss": 0.007558444049209356, + "skip_count": 2.0, + "step": 5112, + "text_loss": 0.23694385588169098 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.009392427355444, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005712548978739154, + "loss": 0.0072, + "macro_f1": 0.6603773832321167, + "num_tokens": 8247240.0, + "repeat_count": 1.0, + "routers_loss": 0.015726923942565918, + "skip_count": 1.0, + "step": 5114, + "text_loss": 0.6032099723815918 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.01878485471089, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0005709485268372598, + "loss": 0.0046, + "macro_f1": 0.9262410998344421, + "num_tokens": 8250585.0, + "repeat_count": 3.0, + "routers_loss": 0.011148860678076744, + "skip_count": 2.0, + "step": 5116, + "text_loss": 0.6825997233390808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005706421286129948, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 8254240.0, + "repeat_count": 0.0, + "routers_loss": 0.006977916229516268, + "skip_count": 0.0, + "step": 5118, + "text_loss": 0.2532844543457031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0005703357033185328, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 8257133.0, + "repeat_count": 0.0, + "routers_loss": 0.006415650714188814, + "skip_count": 2.0, + "step": 5120, + "text_loss": 0.6132124066352844 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005700292510712967, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 8261076.0, + "repeat_count": 1.0, + "routers_loss": 0.0044475216418504715, + "skip_count": 1.0, + "step": 5122, + "text_loss": 0.4277699887752533 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005697227719887194, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8264607.0, + "repeat_count": 0.0, + "routers_loss": 0.005743155721575022, + "skip_count": 2.0, + "step": 5124, + "text_loss": 0.2570968270301819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005694162661882444, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8267992.0, + "repeat_count": 0.0, + "routers_loss": 0.0007581565878354013, + "skip_count": 0.0, + "step": 5126, + "text_loss": 0.5850184559822083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005691097337873252, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 8271010.0, + "repeat_count": 0.0, + "routers_loss": 0.0036611228715628386, + "skip_count": 0.0, + "step": 5128, + "text_loss": 0.660999059677124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0005688031749034258, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 8273638.0, + "repeat_count": 0.0, + "routers_loss": 0.0039906189776957035, + "skip_count": 0.0, + "step": 5130, + "text_loss": 0.5839648246765137 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0005684965896540198, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8276504.0, + "repeat_count": 1.0, + "routers_loss": 0.007539632264524698, + "skip_count": 3.0, + "step": 5132, + "text_loss": 0.27675092220306396 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 24.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005681899781565915, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 8279977.0, + "repeat_count": 2.0, + "routers_loss": 0.0026953567285090685, + "skip_count": 0.0, + "step": 5134, + "text_loss": 0.532974123954773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.000567883340528635, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 8282781.0, + "repeat_count": 0.0, + "routers_loss": 0.005754240322858095, + "skip_count": 1.0, + "step": 5136, + "text_loss": 0.31100207567214966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005675766768876542, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8286533.0, + "repeat_count": 0.0, + "routers_loss": 0.0051517849788069725, + "skip_count": 0.0, + "step": 5138, + "text_loss": 0.5734741687774658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005672699873511635, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 8289858.0, + "repeat_count": 0.0, + "routers_loss": 0.0025852699764072895, + "skip_count": 2.0, + "step": 5140, + "text_loss": 0.37045374512672424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005669632720366868, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8293038.0, + "repeat_count": 0.0, + "routers_loss": 0.0038520018570125103, + "skip_count": 0.0, + "step": 5142, + "text_loss": 0.25952374935150146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005666565310617577, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8295717.0, + "repeat_count": 0.0, + "routers_loss": 0.00026914477348327637, + "skip_count": 0.0, + "step": 5144, + "text_loss": 0.32531213760375977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0005663497645439203, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8299750.0, + "repeat_count": 0.0, + "routers_loss": 0.0055860537104308605, + "skip_count": 2.0, + "step": 5146, + "text_loss": 0.2520618438720703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005660429726007279, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8303075.0, + "repeat_count": 0.0, + "routers_loss": 0.004446739796549082, + "skip_count": 1.0, + "step": 5148, + "text_loss": 0.43672287464141846 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.17845611975345, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.07080078125, + "learning_rate": 0.000565736155349744, + "loss": 0.0076, + "macro_f1": 0.8814815282821655, + "num_tokens": 8306268.0, + "repeat_count": 2.0, + "routers_loss": 0.046915046870708466, + "skip_count": 4.0, + "step": 5150, + "text_loss": 0.35405927896499634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 24.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005654293129085412, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8310480.0, + "repeat_count": 0.0, + "routers_loss": 0.010549088008701801, + "skip_count": 4.0, + "step": 5152, + "text_loss": 0.3523249626159668 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005651224453947023, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8313367.0, + "repeat_count": 1.0, + "routers_loss": 0.002893900265917182, + "skip_count": 0.0, + "step": 5154, + "text_loss": 0.4503810703754425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0005648155529258195, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8318006.0, + "repeat_count": 0.0, + "routers_loss": 0.0018450213829055429, + "skip_count": 0.0, + "step": 5156, + "text_loss": 0.5687127113342285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0005645086356194943, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8320646.0, + "repeat_count": 0.0, + "routers_loss": 0.0026727779768407345, + "skip_count": 0.0, + "step": 5158, + "text_loss": 0.38920050859451294 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0005642016935933385, + "loss": 0.0035, + "macro_f1": 1.0, + "num_tokens": 8323915.0, + "repeat_count": 1.0, + "routers_loss": 0.00611621281132102, + "skip_count": 2.0, + "step": 5160, + "text_loss": 0.3003547787666321 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 24.234810683886117, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0005638947269649726, + "loss": 0.0063, + "macro_f1": 0.9619450569152832, + "num_tokens": 8327073.0, + "repeat_count": 1.0, + "routers_loss": 0.028447439894080162, + "skip_count": 6.0, + "step": 5162, + "text_loss": 0.24053414165973663 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0005635877358520268, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8330388.0, + "repeat_count": 0.0, + "routers_loss": 0.0013072624569758773, + "skip_count": 0.0, + "step": 5164, + "text_loss": 0.43772217631340027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005632807203721406, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 8333241.0, + "repeat_count": 0.0, + "routers_loss": 0.0009456822881475091, + "skip_count": 0.0, + "step": 5166, + "text_loss": 0.5217573046684265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.000562973680642963, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8337257.0, + "repeat_count": 0.0, + "routers_loss": 0.0023840824142098427, + "skip_count": 0.0, + "step": 5168, + "text_loss": 0.31814974546432495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0005626666167821521, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 8340143.0, + "repeat_count": 0.0, + "routers_loss": 0.0020231492817401886, + "skip_count": 3.0, + "step": 5170, + "text_loss": 0.5478505492210388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0162353515625, + "learning_rate": 0.0005623595289073755, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 8343566.0, + "repeat_count": 1.0, + "routers_loss": 0.01070715207606554, + "skip_count": 2.0, + "step": 5172, + "text_loss": 0.23213914036750793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005620524171363099, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8346836.0, + "repeat_count": 0.0, + "routers_loss": 0.003720001084730029, + "skip_count": 3.0, + "step": 5174, + "text_loss": 0.5114789009094238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005617452815866409, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 8349726.0, + "repeat_count": 1.0, + "routers_loss": 0.003322509117424488, + "skip_count": 1.0, + "step": 5176, + "text_loss": 0.4894506335258484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005614381223760635, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 8352478.0, + "repeat_count": 0.0, + "routers_loss": 0.00028752797516062856, + "skip_count": 0.0, + "step": 5178, + "text_loss": 0.6418307423591614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005611309396222817, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 8355766.0, + "repeat_count": 0.0, + "routers_loss": 0.0028724796138703823, + "skip_count": 0.0, + "step": 5180, + "text_loss": 0.23635952174663544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.328734957440563, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005608237334430085, + "loss": 0.0068, + "macro_f1": 0.6601307392120361, + "num_tokens": 8358888.0, + "repeat_count": 1.0, + "routers_loss": 0.058520980179309845, + "skip_count": 2.0, + "step": 5182, + "text_loss": 0.23434793949127197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1015625, + "learning_rate": 0.000560516503955966, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8361761.0, + "repeat_count": 0.0, + "routers_loss": 0.0021356395445764065, + "skip_count": 1.0, + "step": 5184, + "text_loss": 0.40855672955513 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000560209251278885, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 8364376.0, + "repeat_count": 0.0, + "routers_loss": 0.0016185789136216044, + "skip_count": 0.0, + "step": 5186, + "text_loss": 0.6265131831169128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005599019755295053, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8367769.0, + "repeat_count": 0.0, + "routers_loss": 0.0031490204855799675, + "skip_count": 2.0, + "step": 5188, + "text_loss": 0.4716353118419647 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0005595946768255756, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8370705.0, + "repeat_count": 1.0, + "routers_loss": 0.003500689286738634, + "skip_count": 0.0, + "step": 5190, + "text_loss": 0.5467679500579834 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0005592873552848532, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 8374217.0, + "repeat_count": 2.0, + "routers_loss": 0.010764475911855698, + "skip_count": 3.0, + "step": 5192, + "text_loss": 0.4345340132713318 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 24.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005589800110251045, + "loss": 0.0087, + "macro_f1": 1.0, + "num_tokens": 8378182.0, + "repeat_count": 2.0, + "routers_loss": 0.0010365343187004328, + "skip_count": 1.0, + "step": 5194, + "text_loss": 0.46722909808158875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005586726441641044, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8381227.0, + "repeat_count": 0.0, + "routers_loss": 0.006349093746393919, + "skip_count": 2.0, + "step": 5196, + "text_loss": 0.35410359501838684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0005583652548196362, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8384886.0, + "repeat_count": 0.0, + "routers_loss": 0.00038166221929714084, + "skip_count": 0.0, + "step": 5198, + "text_loss": 0.5950250625610352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005580578431094924, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8388939.0, + "repeat_count": 0.0, + "routers_loss": 0.0023578559048473835, + "skip_count": 2.0, + "step": 5200, + "text_loss": 0.6553771495819092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005577504091514735, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 8391629.0, + "repeat_count": 0.0, + "routers_loss": 0.0010771085508167744, + "skip_count": 0.0, + "step": 5202, + "text_loss": 0.4441985785961151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.000557442953063389, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8394440.0, + "repeat_count": 0.0, + "routers_loss": 0.005844325292855501, + "skip_count": 3.0, + "step": 5204, + "text_loss": 0.5807011723518372 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0005571354749630564, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8397731.0, + "repeat_count": 0.0, + "routers_loss": 0.006837233901023865, + "skip_count": 1.0, + "step": 5206, + "text_loss": 0.27780941128730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.000556827974968302, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8400859.0, + "repeat_count": 0.0, + "routers_loss": 0.007656649220734835, + "skip_count": 3.0, + "step": 5208, + "text_loss": 0.4746324121952057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0005565204531969606, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8404164.0, + "repeat_count": 0.0, + "routers_loss": 0.0028129038400948048, + "skip_count": 1.0, + "step": 5210, + "text_loss": 0.8513513803482056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0005562129097668746, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8407196.0, + "repeat_count": 0.0, + "routers_loss": 0.00492360582575202, + "skip_count": 1.0, + "step": 5212, + "text_loss": 0.12255420535802841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005559053447958958, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8410633.0, + "repeat_count": 0.0, + "routers_loss": 0.0020713545382022858, + "skip_count": 0.0, + "step": 5214, + "text_loss": 0.6878522634506226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0005555977584018833, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8413414.0, + "repeat_count": 0.0, + "routers_loss": 0.0007216963567771018, + "skip_count": 0.0, + "step": 5216, + "text_loss": 0.845878541469574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0005552901507027048, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8416817.0, + "repeat_count": 0.0, + "routers_loss": 0.002400130731984973, + "skip_count": 1.0, + "step": 5218, + "text_loss": 0.16753672063350677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0005549825218162365, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 8419617.0, + "repeat_count": 0.0, + "routers_loss": 0.004563181661069393, + "skip_count": 0.0, + "step": 5220, + "text_loss": 0.26107168197631836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.000554674871860362, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 8422686.0, + "repeat_count": 1.0, + "routers_loss": 0.006413881666958332, + "skip_count": 1.0, + "step": 5222, + "text_loss": 0.6333847045898438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005543672009529734, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 8425571.0, + "repeat_count": 0.0, + "routers_loss": 0.0057656955905258656, + "skip_count": 3.0, + "step": 5224, + "text_loss": 0.4552212357521057 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0005540595092119709, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 8429038.0, + "repeat_count": 2.0, + "routers_loss": 0.011755156330764294, + "skip_count": 2.0, + "step": 5226, + "text_loss": 0.16597330570220947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0005537517967552626, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8432117.0, + "repeat_count": 0.0, + "routers_loss": 0.0007519085193052888, + "skip_count": 0.0, + "step": 5228, + "text_loss": 0.6283590197563171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.000553444063700764, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 8435176.0, + "repeat_count": 0.0, + "routers_loss": 0.003066456411033869, + "skip_count": 0.0, + "step": 5230, + "text_loss": 0.2360922247171402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.0005531363101663998, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8438515.0, + "repeat_count": 0.0, + "routers_loss": 0.002865589689463377, + "skip_count": 0.0, + "step": 5232, + "text_loss": 0.8075396418571472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005528285362701011, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 8441731.0, + "repeat_count": 0.0, + "routers_loss": 0.0012521179160103202, + "skip_count": 0.0, + "step": 5234, + "text_loss": 0.584335446357727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0005525207421298077, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8444535.0, + "repeat_count": 0.0, + "routers_loss": 0.005398475099354982, + "skip_count": 3.0, + "step": 5236, + "text_loss": 0.22711622714996338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005522129278634669, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 8448337.0, + "repeat_count": 0.0, + "routers_loss": 0.002957914723083377, + "skip_count": 1.0, + "step": 5238, + "text_loss": 0.3157515823841095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0005519050935890335, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8451530.0, + "repeat_count": 0.0, + "routers_loss": 0.007757039275020361, + "skip_count": 3.0, + "step": 5240, + "text_loss": 0.2815830111503601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.610507778103905, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005515972394244704, + "loss": 0.0063, + "macro_f1": 0.6603773832321167, + "num_tokens": 8454171.0, + "repeat_count": 1.0, + "routers_loss": 0.021602008491754532, + "skip_count": 1.0, + "step": 5242, + "text_loss": 0.6024490594863892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005512893654877478, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8457544.0, + "repeat_count": 0.0, + "routers_loss": 0.006062488537281752, + "skip_count": 0.0, + "step": 5244, + "text_loss": 0.550110936164856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0005509814718968435, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 8460135.0, + "repeat_count": 0.0, + "routers_loss": 0.002793943975120783, + "skip_count": 0.0, + "step": 5246, + "text_loss": 0.4361286163330078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0005506735587697433, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8463516.0, + "repeat_count": 0.0, + "routers_loss": 0.0016669550677761436, + "skip_count": 0.0, + "step": 5248, + "text_loss": 0.4642958641052246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0005503656262244395, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8466406.0, + "repeat_count": 0.0, + "routers_loss": 0.0006051387754268944, + "skip_count": 0.0, + "step": 5250, + "text_loss": 0.3445641100406647 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 24.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005500576743789329, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 8468838.0, + "repeat_count": 2.0, + "routers_loss": 0.00654293829575181, + "skip_count": 1.0, + "step": 5252, + "text_loss": 0.2842808663845062 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.666862342236573, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005497497033512309, + "loss": 0.0077, + "macro_f1": 0.8817967176437378, + "num_tokens": 8471815.0, + "repeat_count": 2.0, + "routers_loss": 0.03845973685383797, + "skip_count": 3.0, + "step": 5254, + "text_loss": 0.2597215175628662 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 24.676254769592017, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0005494417132593487, + "loss": 0.0047, + "macro_f1": 0.9452888369560242, + "num_tokens": 8475202.0, + "repeat_count": 1.0, + "routers_loss": 0.02252381667494774, + "skip_count": 4.0, + "step": 5256, + "text_loss": 0.32269927859306335 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0005491337042213088, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8478650.0, + "repeat_count": 0.0, + "routers_loss": 0.01232751365751028, + "skip_count": 2.0, + "step": 5258, + "text_loss": 0.6523372530937195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005488256763551408, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8481724.0, + "repeat_count": 0.0, + "routers_loss": 0.0028322834987193346, + "skip_count": 0.0, + "step": 5260, + "text_loss": 0.4212580621242523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.0005485176297788814, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 8485833.0, + "repeat_count": 0.0, + "routers_loss": 0.002623105887323618, + "skip_count": 2.0, + "step": 5262, + "text_loss": 0.16906329989433289 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005482095646105748, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 8489089.0, + "repeat_count": 1.0, + "routers_loss": 0.0007179114618338645, + "skip_count": 0.0, + "step": 5264, + "text_loss": 0.4523872137069702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005479014809682721, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 8492905.0, + "repeat_count": 0.0, + "routers_loss": 0.005234059412032366, + "skip_count": 0.0, + "step": 5266, + "text_loss": 0.207139790058136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0005475933789700314, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 8495480.0, + "repeat_count": 0.0, + "routers_loss": 0.0023258263245224953, + "skip_count": 0.0, + "step": 5268, + "text_loss": 0.18060965836048126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005472852587339183, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8499070.0, + "repeat_count": 0.0, + "routers_loss": 0.0013497259933501482, + "skip_count": 0.0, + "step": 5270, + "text_loss": 0.7460769414901733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0005469771203780048, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 8502886.0, + "repeat_count": 0.0, + "routers_loss": 0.0003589815751183778, + "skip_count": 0.0, + "step": 5272, + "text_loss": 0.48119160532951355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005466689640203701, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8506646.0, + "repeat_count": 0.0, + "routers_loss": 0.006619705818593502, + "skip_count": 1.0, + "step": 5274, + "text_loss": 0.15656520426273346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005463607897791005, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 8509450.0, + "repeat_count": 0.0, + "routers_loss": 0.002992175053805113, + "skip_count": 1.0, + "step": 5276, + "text_loss": 0.486930251121521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0005460525977722886, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8512851.0, + "repeat_count": 0.0, + "routers_loss": 0.0027784097474068403, + "skip_count": 0.0, + "step": 5278, + "text_loss": 0.19654682278633118 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0005457443881180345, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8516858.0, + "repeat_count": 0.0, + "routers_loss": 0.0017648129723966122, + "skip_count": 0.0, + "step": 5280, + "text_loss": 0.580982506275177 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005454361609344444, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 8519912.0, + "repeat_count": 2.0, + "routers_loss": 0.010817649774253368, + "skip_count": 3.0, + "step": 5282, + "text_loss": 0.2644204795360565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000545127916339632, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8522396.0, + "repeat_count": 0.0, + "routers_loss": 0.001453282660804689, + "skip_count": 0.0, + "step": 5284, + "text_loss": 0.5014839172363281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005448196544517168, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8525326.0, + "repeat_count": 0.0, + "routers_loss": 0.006645771209150553, + "skip_count": 2.0, + "step": 5286, + "text_loss": 0.2983154058456421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005445113753888254, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8528611.0, + "repeat_count": 0.0, + "routers_loss": 0.0005447337171062827, + "skip_count": 0.0, + "step": 5288, + "text_loss": 0.43598243594169617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.000544203079269091, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8531571.0, + "repeat_count": 0.0, + "routers_loss": 0.0026976624503731728, + "skip_count": 0.0, + "step": 5290, + "text_loss": 0.6454944610595703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005438947662106533, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 8534565.0, + "repeat_count": 0.0, + "routers_loss": 0.002217630622908473, + "skip_count": 0.0, + "step": 5292, + "text_loss": 0.742935836315155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 24.854710889345466, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.029052734375, + "learning_rate": 0.0005435864363316584, + "loss": 0.0073, + "macro_f1": 0.8820862174034119, + "num_tokens": 8537581.0, + "repeat_count": 2.0, + "routers_loss": 0.030740609392523766, + "skip_count": 2.0, + "step": 5294, + "text_loss": 0.48913639783859253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005432780897502588, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8541271.0, + "repeat_count": 0.0, + "routers_loss": 0.005306888837367296, + "skip_count": 1.0, + "step": 5296, + "text_loss": 0.5820846557617188 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0005429697265846137, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8545052.0, + "repeat_count": 1.0, + "routers_loss": 0.002255369909107685, + "skip_count": 0.0, + "step": 5298, + "text_loss": 0.565483808517456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005426613469528881, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8548605.0, + "repeat_count": 0.0, + "routers_loss": 0.0010787079809233546, + "skip_count": 0.0, + "step": 5300, + "text_loss": 0.40154510736465454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000542352950973254, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8552581.0, + "repeat_count": 0.0, + "routers_loss": 0.0017972089117392898, + "skip_count": 0.0, + "step": 5302, + "text_loss": 0.5430748462677002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.0005420445387638891, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 8556360.0, + "repeat_count": 0.0, + "routers_loss": 0.0016180560924112797, + "skip_count": 2.0, + "step": 5304, + "text_loss": 0.544040322303772 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0005417361104429777, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 8559264.0, + "repeat_count": 1.0, + "routers_loss": 0.012688961811363697, + "skip_count": 2.0, + "step": 5306, + "text_loss": 0.2018517404794693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0005414276661287101, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8562169.0, + "repeat_count": 0.0, + "routers_loss": 0.0012141643092036247, + "skip_count": 0.0, + "step": 5308, + "text_loss": 0.5685747265815735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059326171875, + "learning_rate": 0.0005411192059392826, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 8565231.0, + "repeat_count": 0.0, + "routers_loss": 0.0015626107342541218, + "skip_count": 0.0, + "step": 5310, + "text_loss": 0.8073471784591675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0005408107299928979, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8568122.0, + "repeat_count": 0.0, + "routers_loss": 0.004773529712110758, + "skip_count": 0.0, + "step": 5312, + "text_loss": 0.22583355009555817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0005405022384077644, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8571056.0, + "repeat_count": 0.0, + "routers_loss": 0.0025621228851377964, + "skip_count": 1.0, + "step": 5314, + "text_loss": 0.25274428725242615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005401937313020967, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 8574300.0, + "repeat_count": 0.0, + "routers_loss": 0.009726752527058125, + "skip_count": 2.0, + "step": 5316, + "text_loss": 0.3283393979072571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 24.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005398852087941155, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8577424.0, + "repeat_count": 0.0, + "routers_loss": 0.012483839876949787, + "skip_count": 4.0, + "step": 5318, + "text_loss": 0.1876130849123001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.000539576671002047, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 8580309.0, + "repeat_count": 0.0, + "routers_loss": 0.0009830677881836891, + "skip_count": 0.0, + "step": 5320, + "text_loss": 0.6955490708351135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046875, + "learning_rate": 0.0005392681180441235, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8583399.0, + "repeat_count": 0.0, + "routers_loss": 0.0010819481685757637, + "skip_count": 0.0, + "step": 5322, + "text_loss": 0.4708341956138611 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.000538959550038583, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8586259.0, + "repeat_count": 0.0, + "routers_loss": 0.005763369146734476, + "skip_count": 0.0, + "step": 5324, + "text_loss": 0.20463642477989197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005386509671036695, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8589067.0, + "repeat_count": 0.0, + "routers_loss": 0.0006229027640074492, + "skip_count": 0.0, + "step": 5326, + "text_loss": 0.6819888353347778 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 25.014088641033165, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.03466796875, + "learning_rate": 0.0005383423693576325, + "loss": 0.0087, + "macro_f1": 0.9619450569152832, + "num_tokens": 8592837.0, + "repeat_count": 1.0, + "routers_loss": 0.030066559091210365, + "skip_count": 6.0, + "step": 5328, + "text_loss": 0.24606549739837646 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0005380337569187272, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8596293.0, + "repeat_count": 1.0, + "routers_loss": 0.007445990107953548, + "skip_count": 0.0, + "step": 5330, + "text_loss": 0.16730253398418427 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 25.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0005377251299052145, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8599360.0, + "repeat_count": 1.0, + "routers_loss": 0.004563331138342619, + "skip_count": 1.0, + "step": 5332, + "text_loss": 0.6856988668441772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0005374164884353608, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8602376.0, + "repeat_count": 0.0, + "routers_loss": 0.0015491938684135675, + "skip_count": 0.0, + "step": 5334, + "text_loss": 1.3248854875564575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005371078326274382, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8605400.0, + "repeat_count": 0.0, + "routers_loss": 0.0016098044579848647, + "skip_count": 0.0, + "step": 5336, + "text_loss": 0.747150182723999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 25.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0005367991625997243, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 8608100.0, + "repeat_count": 0.0, + "routers_loss": 0.0034471298567950726, + "skip_count": 3.0, + "step": 5338, + "text_loss": 0.6443291902542114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005364904784705015, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8611768.0, + "repeat_count": 0.0, + "routers_loss": 0.007947597652673721, + "skip_count": 1.0, + "step": 5340, + "text_loss": 0.7768037915229797 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 25.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0005361817803580588, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 8614424.0, + "repeat_count": 2.0, + "routers_loss": 0.009964234195649624, + "skip_count": 2.0, + "step": 5342, + "text_loss": 0.22826914489269257 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005358730683806896, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8617826.0, + "repeat_count": 0.0, + "routers_loss": 0.0014116480015218258, + "skip_count": 0.0, + "step": 5344, + "text_loss": 0.49022090435028076 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 25.098620487232168, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005355643426566929, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 8621220.0, + "repeat_count": 1.0, + "routers_loss": 0.013940622098743916, + "skip_count": 2.0, + "step": 5346, + "text_loss": 0.26819515228271484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000535255603304373, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8623957.0, + "repeat_count": 0.0, + "routers_loss": 0.0032230091746896505, + "skip_count": 2.0, + "step": 5348, + "text_loss": 0.46905452013015747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005349468504420395, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8626760.0, + "repeat_count": 0.0, + "routers_loss": 0.002631337149068713, + "skip_count": 1.0, + "step": 5350, + "text_loss": 0.5312309861183167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005346380841880068, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8630207.0, + "repeat_count": 0.0, + "routers_loss": 0.004526057746261358, + "skip_count": 2.0, + "step": 5352, + "text_loss": 0.5810666084289551 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0005343293046605949, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8633241.0, + "repeat_count": 0.0, + "routers_loss": 0.0023941127583384514, + "skip_count": 0.0, + "step": 5354, + "text_loss": 0.18468725681304932 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0005340205119781288, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8636215.0, + "repeat_count": 1.0, + "routers_loss": 0.0017020340310409665, + "skip_count": 0.0, + "step": 5356, + "text_loss": 0.6665788888931274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005337117062589383, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 8639326.0, + "repeat_count": 0.0, + "routers_loss": 0.004964717663824558, + "skip_count": 2.0, + "step": 5358, + "text_loss": 0.19770404696464539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005334028876213585, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8642157.0, + "repeat_count": 0.0, + "routers_loss": 0.006587155628949404, + "skip_count": 0.0, + "step": 5360, + "text_loss": 0.2295130044221878 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0005330940561837291, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8645355.0, + "repeat_count": 0.0, + "routers_loss": 0.0006586945964954793, + "skip_count": 0.0, + "step": 5362, + "text_loss": 0.2701159417629242 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005327852120643947, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8648911.0, + "repeat_count": 1.0, + "routers_loss": 0.0014281768817454576, + "skip_count": 0.0, + "step": 5364, + "text_loss": 0.8957229852676392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0005324763553817053, + "loss": 0.0027, + "macro_f1": 0.3333333432674408, + "num_tokens": 8652037.0, + "repeat_count": 0.0, + "routers_loss": 0.0005899337120354176, + "skip_count": 0.0, + "step": 5366, + "text_loss": 0.38642236590385437 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 25.20193718814206, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005321674862540154, + "loss": 0.0058, + "macro_f1": 0.9265305995941162, + "num_tokens": 8655381.0, + "repeat_count": 3.0, + "routers_loss": 0.024511313065886497, + "skip_count": 1.0, + "step": 5368, + "text_loss": 0.6439879536628723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000531858604799684, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 8658476.0, + "repeat_count": 0.0, + "routers_loss": 0.0012558114249259233, + "skip_count": 0.0, + "step": 5370, + "text_loss": 0.3227672874927521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0005315497111370752, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8661982.0, + "repeat_count": 0.0, + "routers_loss": 0.0013541636290028691, + "skip_count": 0.0, + "step": 5372, + "text_loss": 0.6375321745872498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.230114470208395, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.051513671875, + "learning_rate": 0.0005312408053845575, + "loss": 0.0052, + "macro_f1": 0.5492662787437439, + "num_tokens": 8665071.0, + "repeat_count": 0.0, + "routers_loss": 0.010432626120746136, + "skip_count": 2.0, + "step": 5374, + "text_loss": 0.536924421787262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005309318876605042, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8668411.0, + "repeat_count": 0.0, + "routers_loss": 0.004450209904462099, + "skip_count": 1.0, + "step": 5376, + "text_loss": 0.2643466889858246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005306229580832933, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 8672088.0, + "repeat_count": 1.0, + "routers_loss": 0.011189920827746391, + "skip_count": 3.0, + "step": 5378, + "text_loss": 0.8259533047676086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.000530314016771307, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8675206.0, + "repeat_count": 0.0, + "routers_loss": 0.0020095291547477245, + "skip_count": 0.0, + "step": 5380, + "text_loss": 0.31364113092422485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.267684179630173, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0005300050638429324, + "loss": 0.0078, + "macro_f1": 0.3272727429866791, + "num_tokens": 8678289.0, + "repeat_count": 0.0, + "routers_loss": 0.010738557204604149, + "skip_count": 1.0, + "step": 5382, + "text_loss": 0.19013966619968414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0005296960994165607, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8681555.0, + "repeat_count": 0.0, + "routers_loss": 0.0018534278497099876, + "skip_count": 1.0, + "step": 5384, + "text_loss": 0.762248694896698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0005293871236105877, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 8684413.0, + "repeat_count": 0.0, + "routers_loss": 0.009143726900219917, + "skip_count": 2.0, + "step": 5386, + "text_loss": 0.19994212687015533 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 25.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005290781365434134, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8687450.0, + "repeat_count": 2.0, + "routers_loss": 0.002034468576312065, + "skip_count": 0.0, + "step": 5388, + "text_loss": 0.5519160628318787 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0005287691383334425, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8690651.0, + "repeat_count": 1.0, + "routers_loss": 0.006834167055785656, + "skip_count": 0.0, + "step": 5390, + "text_loss": 0.5439304709434509 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0005284601290990832, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8693929.0, + "repeat_count": 1.0, + "routers_loss": 0.0022327799815684557, + "skip_count": 0.0, + "step": 5392, + "text_loss": 0.24108269810676575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0005281511089587491, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 8696727.0, + "repeat_count": 0.0, + "routers_loss": 0.002669565612450242, + "skip_count": 0.0, + "step": 5394, + "text_loss": 0.8659077286720276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005278420780308568, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8700934.0, + "repeat_count": 0.0, + "routers_loss": 0.007252473384141922, + "skip_count": 0.0, + "step": 5396, + "text_loss": 0.5592793226242065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005275330364338276, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 8704449.0, + "repeat_count": 0.0, + "routers_loss": 0.001793015981093049, + "skip_count": 0.0, + "step": 5398, + "text_loss": 0.5211784243583679 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 25.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0005272239842860868, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 8707384.0, + "repeat_count": 5.0, + "routers_loss": 0.00963665172457695, + "skip_count": 4.0, + "step": 5400, + "text_loss": 0.6092788577079773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.36160845318462, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03515625, + "learning_rate": 0.0005269149217060642, + "loss": 0.0059, + "macro_f1": 0.5492662787437439, + "num_tokens": 8710453.0, + "repeat_count": 0.0, + "routers_loss": 0.01758105307817459, + "skip_count": 2.0, + "step": 5402, + "text_loss": 0.3423936069011688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0005266058488121926, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8713514.0, + "repeat_count": 0.0, + "routers_loss": 0.0025636721402406693, + "skip_count": 1.0, + "step": 5404, + "text_loss": 0.484171986579895 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.38039330789551, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0005262967657229095, + "loss": 0.0064, + "macro_f1": 0.9255813956260681, + "num_tokens": 8717051.0, + "repeat_count": 3.0, + "routers_loss": 0.022406045347452164, + "skip_count": 4.0, + "step": 5406, + "text_loss": 0.23368191719055176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005259876725566563, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8719987.0, + "repeat_count": 0.0, + "routers_loss": 0.004114408977329731, + "skip_count": 2.0, + "step": 5408, + "text_loss": 0.20237496495246887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.000525678569431878, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 8723258.0, + "repeat_count": 0.0, + "routers_loss": 0.006741158664226532, + "skip_count": 2.0, + "step": 5410, + "text_loss": 0.7969435453414917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.0005253694564670233, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 8726294.0, + "repeat_count": 0.0, + "routers_loss": 0.0034468702506273985, + "skip_count": 0.0, + "step": 5412, + "text_loss": 0.5533816814422607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000525060333780545, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8729603.0, + "repeat_count": 0.0, + "routers_loss": 0.01086533535271883, + "skip_count": 2.0, + "step": 5414, + "text_loss": 0.31856611371040344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005247512014908998, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 8733423.0, + "repeat_count": 0.0, + "routers_loss": 0.00512756546959281, + "skip_count": 6.0, + "step": 5416, + "text_loss": 0.6710903644561768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0005244420597165472, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 8736457.0, + "repeat_count": 0.0, + "routers_loss": 0.0026201079599559307, + "skip_count": 0.0, + "step": 5418, + "text_loss": 0.6469964981079102 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0005241329085759514, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 8739617.0, + "repeat_count": 0.0, + "routers_loss": 0.004130818881094456, + "skip_count": 0.0, + "step": 5420, + "text_loss": 0.4868837296962738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005238237481875795, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8742653.0, + "repeat_count": 0.0, + "routers_loss": 0.003171122632920742, + "skip_count": 0.0, + "step": 5422, + "text_loss": 0.12026242166757584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0005235145786699021, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 8745835.0, + "repeat_count": 0.0, + "routers_loss": 0.0008553664083592594, + "skip_count": 0.0, + "step": 5424, + "text_loss": 0.601640522480011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005232054001413941, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8749006.0, + "repeat_count": 0.0, + "routers_loss": 0.0006958908052183688, + "skip_count": 0.0, + "step": 5426, + "text_loss": 0.7083519101142883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0005228962127205329, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 8752493.0, + "repeat_count": 0.0, + "routers_loss": 0.0012221037177368999, + "skip_count": 1.0, + "step": 5428, + "text_loss": 0.3949109613895416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005225870165257997, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 8755294.0, + "repeat_count": 1.0, + "routers_loss": 0.003924673888832331, + "skip_count": 2.0, + "step": 5430, + "text_loss": 0.7487186789512634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005222778116756793, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8758043.0, + "repeat_count": 0.0, + "routers_loss": 0.002388258930295706, + "skip_count": 0.0, + "step": 5432, + "text_loss": 0.4092858135700226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0005219685982886594, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 8760618.0, + "repeat_count": 1.0, + "routers_loss": 0.0045886957086622715, + "skip_count": 0.0, + "step": 5434, + "text_loss": 0.5889580249786377 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0005216593764832311, + "loss": 0.0074, + "macro_f1": 1.0, + "num_tokens": 8764269.0, + "repeat_count": 1.0, + "routers_loss": 0.00704155582934618, + "skip_count": 2.0, + "step": 5436, + "text_loss": 0.2634117007255554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005213501463778889, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8767142.0, + "repeat_count": 0.0, + "routers_loss": 0.00368728069588542, + "skip_count": 2.0, + "step": 5438, + "text_loss": 0.3512301445007324 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0005210409080911304, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8770239.0, + "repeat_count": 0.0, + "routers_loss": 0.0012925115879625082, + "skip_count": 0.0, + "step": 5440, + "text_loss": 0.9330073595046997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005207316617414561, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8772927.0, + "repeat_count": 0.0, + "routers_loss": 0.005604506935924292, + "skip_count": 0.0, + "step": 5442, + "text_loss": 0.23477613925933838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.55884942764896, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0005204224074473701, + "loss": 0.0049, + "macro_f1": 0.6601307392120361, + "num_tokens": 8776451.0, + "repeat_count": 1.0, + "routers_loss": 0.010945434682071209, + "skip_count": 2.0, + "step": 5444, + "text_loss": 0.6184295415878296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0005201131453273789, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 8779481.0, + "repeat_count": 0.0, + "routers_loss": 0.0024414353538304567, + "skip_count": 0.0, + "step": 5446, + "text_loss": 0.16186967492103577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.57763428235985, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005198038754999926, + "loss": 0.0052, + "macro_f1": 0.3272727429866791, + "num_tokens": 8782425.0, + "repeat_count": 1.0, + "routers_loss": 0.013872416689991951, + "skip_count": 0.0, + "step": 5448, + "text_loss": 0.42294546961784363 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0005194945980837237, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8785466.0, + "repeat_count": 0.0, + "routers_loss": 0.0006147907115519047, + "skip_count": 0.0, + "step": 5450, + "text_loss": 0.6285432577133179 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005191853131970881, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8788461.0, + "repeat_count": 0.0, + "routers_loss": 0.0010585964191704988, + "skip_count": 0.0, + "step": 5452, + "text_loss": 0.6032317876815796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005188760209586044, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8791572.0, + "repeat_count": 0.0, + "routers_loss": 0.005267909727990627, + "skip_count": 1.0, + "step": 5454, + "text_loss": 0.3015609681606293 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005185667214867937, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8794697.0, + "repeat_count": 0.0, + "routers_loss": 0.000532392121385783, + "skip_count": 0.0, + "step": 5456, + "text_loss": 0.9596265554428101 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0005182574149001805, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8797880.0, + "repeat_count": 0.0, + "routers_loss": 0.0007176774088293314, + "skip_count": 0.0, + "step": 5458, + "text_loss": 0.5599364638328552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0005179481013172912, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8801995.0, + "repeat_count": 0.0, + "routers_loss": 0.0022756673861294985, + "skip_count": 0.0, + "step": 5460, + "text_loss": 0.47327280044555664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005176387808566558, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8805138.0, + "repeat_count": 0.0, + "routers_loss": 0.0025084633380174637, + "skip_count": 0.0, + "step": 5462, + "text_loss": 0.26674970984458923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0005173294536368061, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 8808102.0, + "repeat_count": 0.0, + "routers_loss": 0.0008814680040813982, + "skip_count": 0.0, + "step": 5464, + "text_loss": 0.5981299877166748 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005170201197762773, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8811431.0, + "repeat_count": 0.0, + "routers_loss": 0.0005443177651613951, + "skip_count": 0.0, + "step": 5466, + "text_loss": 1.037438988685608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0005167107793936065, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8814256.0, + "repeat_count": 0.0, + "routers_loss": 0.000494555220939219, + "skip_count": 0.0, + "step": 5468, + "text_loss": 0.5005733966827393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005164014326073333, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 8817024.0, + "repeat_count": 0.0, + "routers_loss": 0.004793747793883085, + "skip_count": 2.0, + "step": 5470, + "text_loss": 0.6999614834785461 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005160920795360002, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 8819892.0, + "repeat_count": 0.0, + "routers_loss": 0.0020966180600225925, + "skip_count": 0.0, + "step": 5472, + "text_loss": 0.5536707043647766 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0005157827202981521, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8822928.0, + "repeat_count": 0.0, + "routers_loss": 0.0020367507822811604, + "skip_count": 0.0, + "step": 5474, + "text_loss": 0.43655988574028015 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005154733550123356, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8825842.0, + "repeat_count": 0.0, + "routers_loss": 0.0020070383325219154, + "skip_count": 0.0, + "step": 5476, + "text_loss": 0.48149657249450684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0005151639837971004, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8829534.0, + "repeat_count": 0.0, + "routers_loss": 0.0016327418852597475, + "skip_count": 0.0, + "step": 5478, + "text_loss": 0.6693689227104187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000514854606770998, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8833177.0, + "repeat_count": 0.0, + "routers_loss": 0.0012691980227828026, + "skip_count": 0.0, + "step": 5480, + "text_loss": 0.44926801323890686 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005145452240525822, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8836933.0, + "repeat_count": 1.0, + "routers_loss": 0.0007724820752628148, + "skip_count": 0.0, + "step": 5482, + "text_loss": 0.5759884119033813 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005142358357604092, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 8840093.0, + "repeat_count": 1.0, + "routers_loss": 0.008331702090799809, + "skip_count": 7.0, + "step": 5484, + "text_loss": 0.47393685579299927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0005139264420130368, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8843918.0, + "repeat_count": 0.0, + "routers_loss": 0.003124477108940482, + "skip_count": 2.0, + "step": 5486, + "text_loss": 0.5298711061477661 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005136170429290259, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8846558.0, + "repeat_count": 0.0, + "routers_loss": 0.0034127775579690933, + "skip_count": 2.0, + "step": 5488, + "text_loss": 0.43582668900489807 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005133076386269383, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8849724.0, + "repeat_count": 1.0, + "routers_loss": 0.0018056259723380208, + "skip_count": 0.0, + "step": 5490, + "text_loss": 0.8116800785064697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.784267684179632, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0005129982292253384, + "loss": 0.0063, + "macro_f1": 0.6589147448539734, + "num_tokens": 8852447.0, + "repeat_count": 1.0, + "routers_loss": 0.021452350541949272, + "skip_count": 6.0, + "step": 5492, + "text_loss": 0.31878748536109924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0005126888148427927, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8855886.0, + "repeat_count": 0.0, + "routers_loss": 0.0026911941822618246, + "skip_count": 0.0, + "step": 5494, + "text_loss": 0.4021807909011841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.80305253889052, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.025634765625, + "learning_rate": 0.0005123793955978693, + "loss": 0.007, + "macro_f1": 0.5492662787437439, + "num_tokens": 8859378.0, + "repeat_count": 0.0, + "routers_loss": 0.019764510914683342, + "skip_count": 2.0, + "step": 5496, + "text_loss": 0.21608132123947144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0005120699716091379, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 8862310.0, + "repeat_count": 0.0, + "routers_loss": 0.0008988190093077719, + "skip_count": 0.0, + "step": 5498, + "text_loss": 0.34666743874549866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005117605429951707, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8865166.0, + "repeat_count": 0.0, + "routers_loss": 0.011137975379824638, + "skip_count": 2.0, + "step": 5500, + "text_loss": 0.25385144352912903 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 25.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005114511098745412, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8869923.0, + "repeat_count": 1.0, + "routers_loss": 0.006476947572082281, + "skip_count": 4.0, + "step": 5502, + "text_loss": 0.4503856301307678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.000511141672365825, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8872451.0, + "repeat_count": 0.0, + "routers_loss": 0.0022727579344063997, + "skip_count": 0.0, + "step": 5504, + "text_loss": 0.7522464990615845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005108322305875987, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8875968.0, + "repeat_count": 0.0, + "routers_loss": 0.0020014268811792135, + "skip_count": 0.0, + "step": 5506, + "text_loss": 0.30184176564216614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0005105227846584414, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8879705.0, + "repeat_count": 0.0, + "routers_loss": 0.001179999322630465, + "skip_count": 0.0, + "step": 5508, + "text_loss": 0.6187804937362671 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0005102133346969329, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8883535.0, + "repeat_count": 1.0, + "routers_loss": 0.002946492750197649, + "skip_count": 0.0, + "step": 5510, + "text_loss": 0.5961501002311707 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005099038808216555, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 8886683.0, + "repeat_count": 1.0, + "routers_loss": 0.004532935563474894, + "skip_count": 3.0, + "step": 5512, + "text_loss": 0.38462957739830017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0005095944231511922, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 8891049.0, + "repeat_count": 0.0, + "routers_loss": 0.00917842984199524, + "skip_count": 2.0, + "step": 5514, + "text_loss": 0.27541956305503845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0005092849618041279, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 8893604.0, + "repeat_count": 0.0, + "routers_loss": 0.0008756510796956718, + "skip_count": 0.0, + "step": 5516, + "text_loss": 0.681315541267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005089754968990487, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8898072.0, + "repeat_count": 0.0, + "routers_loss": 0.0008704439387656748, + "skip_count": 1.0, + "step": 5518, + "text_loss": 0.5060005187988281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005086660285545422, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8901539.0, + "repeat_count": 0.0, + "routers_loss": 0.004750201944261789, + "skip_count": 1.0, + "step": 5520, + "text_loss": 0.6008047461509705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.000508356556889197, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8904525.0, + "repeat_count": 0.0, + "routers_loss": 0.0026552649214863777, + "skip_count": 0.0, + "step": 5522, + "text_loss": 0.4539012908935547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005080470820216037, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8907624.0, + "repeat_count": 0.0, + "routers_loss": 0.002621029270812869, + "skip_count": 1.0, + "step": 5524, + "text_loss": 0.20088370144367218 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 25.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005077376040703533, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 8910515.0, + "repeat_count": 3.0, + "routers_loss": 0.0028921898920089006, + "skip_count": 0.0, + "step": 5526, + "text_loss": 0.6575983166694641 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8888888955116272, + "avg_layers": 21.0, + "epoch": 25.953331376577633, + "f1_execute": 0.9729729890823364, + "f1_repeat": 1.0, + "f1_skip": 0.9411765336990356, + "grad_norm": 0.02734375, + "learning_rate": 0.0005074281231540384, + "loss": 0.0076, + "macro_f1": 0.9713832139968872, + "num_tokens": 8914419.0, + "repeat_count": 1.0, + "routers_loss": 0.024232301861047745, + "skip_count": 9.0, + "step": 5528, + "text_loss": 0.5435594916343689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005071186393912527, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8917543.0, + "repeat_count": 0.0, + "routers_loss": 0.003731841454282403, + "skip_count": 2.0, + "step": 5530, + "text_loss": 0.5152071118354797 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005068091529005909, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 8920728.0, + "repeat_count": 1.0, + "routers_loss": 0.005905418191105127, + "skip_count": 0.0, + "step": 5532, + "text_loss": 0.29741042852401733 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000506499663800649, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 8924112.0, + "repeat_count": 1.0, + "routers_loss": 0.0021933517418801785, + "skip_count": 0.0, + "step": 5534, + "text_loss": 0.45704230666160583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 25.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005061901722100235, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 8927323.0, + "repeat_count": 0.0, + "routers_loss": 0.009227502159774303, + "skip_count": 4.0, + "step": 5536, + "text_loss": 0.1968434453010559 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0005058806782473125, + "loss": 0.0053, + "macro_f1": 0.6601307392120361, + "num_tokens": 8931052.0, + "repeat_count": 1.0, + "routers_loss": 0.02054760232567787, + "skip_count": 2.0, + "step": 5538, + "text_loss": 0.23851273953914642 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0005055711820311144, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8934215.0, + "repeat_count": 0.0, + "routers_loss": 0.0008434011251665652, + "skip_count": 0.0, + "step": 5540, + "text_loss": 0.85942542552948 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005052616836800288, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8937173.0, + "repeat_count": 0.0, + "routers_loss": 0.011105241253972054, + "skip_count": 4.0, + "step": 5542, + "text_loss": 0.2614556849002838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005049521833126561, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8940553.0, + "repeat_count": 0.0, + "routers_loss": 0.0006273435428738594, + "skip_count": 0.0, + "step": 5544, + "text_loss": 0.6430498957633972 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005046426810475976, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8943753.0, + "repeat_count": 0.0, + "routers_loss": 0.0023464353289455175, + "skip_count": 1.0, + "step": 5546, + "text_loss": 0.7015808820724487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0005043331770034547, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 8947149.0, + "repeat_count": 0.0, + "routers_loss": 0.0016024730866774917, + "skip_count": 1.0, + "step": 5548, + "text_loss": 0.5875257253646851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005040236712988304, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8950374.0, + "repeat_count": 0.0, + "routers_loss": 0.004096277989447117, + "skip_count": 0.0, + "step": 5550, + "text_loss": 0.1712338626384735 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0005037141640523275, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8953256.0, + "repeat_count": 1.0, + "routers_loss": 0.00441550649702549, + "skip_count": 0.0, + "step": 5552, + "text_loss": 0.16560404002666473 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005034046553825501, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 8956845.0, + "repeat_count": 4.0, + "routers_loss": 0.011712636798620224, + "skip_count": 6.0, + "step": 5554, + "text_loss": 0.24278216063976288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005030951454081023, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8961165.0, + "repeat_count": 0.0, + "routers_loss": 0.00235542468726635, + "skip_count": 1.0, + "step": 5556, + "text_loss": 0.17214511334896088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.093924273554446, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0005027856342475888, + "loss": 0.0037, + "macro_f1": 0.3272727429866791, + "num_tokens": 8965262.0, + "repeat_count": 0.0, + "routers_loss": 0.0160827673971653, + "skip_count": 1.0, + "step": 5558, + "text_loss": 0.40229740738868713 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0005024761220196151, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 8968278.0, + "repeat_count": 1.0, + "routers_loss": 0.004786997567862272, + "skip_count": 0.0, + "step": 5560, + "text_loss": 0.24828575551509857 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0005021666088427868, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8971443.0, + "repeat_count": 1.0, + "routers_loss": 0.0015378865646198392, + "skip_count": 0.0, + "step": 5562, + "text_loss": 0.7269657254219055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.0005018570948357099, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8975312.0, + "repeat_count": 0.0, + "routers_loss": 0.0015218508196994662, + "skip_count": 0.0, + "step": 5564, + "text_loss": 0.5198811292648315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0005015475801169908, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 8977951.0, + "repeat_count": 0.0, + "routers_loss": 0.008865317329764366, + "skip_count": 1.0, + "step": 5566, + "text_loss": 0.1541406810283661 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005012380648052359, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8981325.0, + "repeat_count": 1.0, + "routers_loss": 0.0055318837985396385, + "skip_count": 0.0, + "step": 5568, + "text_loss": 0.510314404964447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005009285490190523, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8984661.0, + "repeat_count": 0.0, + "routers_loss": 0.0035060355439782143, + "skip_count": 0.0, + "step": 5570, + "text_loss": 0.29421761631965637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000500619032877047, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8987573.0, + "repeat_count": 0.0, + "routers_loss": 0.0050126477144658566, + "skip_count": 2.0, + "step": 5572, + "text_loss": 0.1984361708164215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005003095164978271, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 8991136.0, + "repeat_count": 0.0, + "routers_loss": 0.0019407360814511776, + "skip_count": 0.0, + "step": 5574, + "text_loss": 0.42751404643058777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8994198.0, + "repeat_count": 0.0, + "routers_loss": 0.0029819176997989416, + "skip_count": 2.0, + "step": 5576, + "text_loss": 0.20589640736579895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004996904835021729, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 8997907.0, + "repeat_count": 0.0, + "routers_loss": 0.000878945691511035, + "skip_count": 1.0, + "step": 5578, + "text_loss": 0.2801406979560852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000499380967122953, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9001141.0, + "repeat_count": 0.0, + "routers_loss": 0.005223734769970179, + "skip_count": 1.0, + "step": 5580, + "text_loss": 0.20542480051517487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004990714509809478, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9004794.0, + "repeat_count": 0.0, + "routers_loss": 0.0015868612099438906, + "skip_count": 0.0, + "step": 5582, + "text_loss": 0.32094934582710266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 26.216025829175226, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004987619351947643, + "loss": 0.0064, + "macro_f1": 0.6122449040412903, + "num_tokens": 9009250.0, + "repeat_count": 0.0, + "routers_loss": 0.031923454254865646, + "skip_count": 4.0, + "step": 5584, + "text_loss": 0.609201967716217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0004984524198830095, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9013254.0, + "repeat_count": 0.0, + "routers_loss": 0.0033124545589089394, + "skip_count": 0.0, + "step": 5586, + "text_loss": 0.3698650300502777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0004981429051642903, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9016598.0, + "repeat_count": 0.0, + "routers_loss": 0.0017190382350236177, + "skip_count": 1.0, + "step": 5588, + "text_loss": 0.5306026935577393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.24420311124156, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004978333911572132, + "loss": 0.0059, + "macro_f1": 0.3272727429866791, + "num_tokens": 9019558.0, + "repeat_count": 0.0, + "routers_loss": 0.02051064372062683, + "skip_count": 1.0, + "step": 5590, + "text_loss": 0.23494470119476318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0004975238779803849, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 9023024.0, + "repeat_count": 0.0, + "routers_loss": 0.0010489600244909525, + "skip_count": 0.0, + "step": 5592, + "text_loss": 0.579275906085968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0004972143657524112, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9026161.0, + "repeat_count": 0.0, + "routers_loss": 0.0012039231369271874, + "skip_count": 0.0, + "step": 5594, + "text_loss": 0.5776295065879822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0004969048545918978, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9028814.0, + "repeat_count": 0.0, + "routers_loss": 0.0010212450288236141, + "skip_count": 1.0, + "step": 5596, + "text_loss": 0.6816855669021606 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 26.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00049659534461745, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9032243.0, + "repeat_count": 2.0, + "routers_loss": 0.0024297661148011684, + "skip_count": 0.0, + "step": 5598, + "text_loss": 0.743188202381134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0004962858359476726, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 9035493.0, + "repeat_count": 0.0, + "routers_loss": 0.002151754219084978, + "skip_count": 0.0, + "step": 5600, + "text_loss": 0.5213983654975891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004959763287011698, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 9038213.0, + "repeat_count": 0.0, + "routers_loss": 0.0028108188416808844, + "skip_count": 2.0, + "step": 5602, + "text_loss": 0.5128397345542908 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004956668229965454, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9041152.0, + "repeat_count": 0.0, + "routers_loss": 0.004022551700472832, + "skip_count": 2.0, + "step": 5604, + "text_loss": 0.15361636877059937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0004953573189524026, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9044503.0, + "repeat_count": 0.0, + "routers_loss": 0.0010689410846680403, + "skip_count": 1.0, + "step": 5606, + "text_loss": 0.6454885005950928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0004950478166873439, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 9047742.0, + "repeat_count": 0.0, + "routers_loss": 0.0025760293938219547, + "skip_count": 0.0, + "step": 5608, + "text_loss": 0.7654000520706177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0004947383163199713, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 9050349.0, + "repeat_count": 0.0, + "routers_loss": 0.0009846165776252747, + "skip_count": 0.0, + "step": 5610, + "text_loss": 0.41533342003822327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0004944288179688858, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 9053667.0, + "repeat_count": 0.0, + "routers_loss": 0.0017193946987390518, + "skip_count": 1.0, + "step": 5612, + "text_loss": 1.0172475576400757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004941193217526875, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9056777.0, + "repeat_count": 0.0, + "routers_loss": 0.0026750199031084776, + "skip_count": 0.0, + "step": 5614, + "text_loss": 0.17584927380084991 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 26.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004938098277899765, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 9060609.0, + "repeat_count": 1.0, + "routers_loss": 0.005259076599031687, + "skip_count": 1.0, + "step": 5616, + "text_loss": 0.5522297024726868 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004935003361993511, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9063633.0, + "repeat_count": 0.0, + "routers_loss": 0.0006837095716036856, + "skip_count": 0.0, + "step": 5618, + "text_loss": 0.5212588310241699 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.38508952157323, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0004931908470994091, + "loss": 0.0059, + "macro_f1": 0.6603773832321167, + "num_tokens": 9067777.0, + "repeat_count": 1.0, + "routers_loss": 0.01067375484853983, + "skip_count": 1.0, + "step": 5620, + "text_loss": 0.5515062808990479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 26.394481948928675, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.019775390625, + "learning_rate": 0.0004928813606087474, + "loss": 0.0043, + "macro_f1": 0.5934640765190125, + "num_tokens": 9070938.0, + "repeat_count": 0.0, + "routers_loss": 0.016635602340102196, + "skip_count": 3.0, + "step": 5622, + "text_loss": 0.3225076198577881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004925718768459617, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9074050.0, + "repeat_count": 0.0, + "routers_loss": 0.002216119086369872, + "skip_count": 0.0, + "step": 5624, + "text_loss": 0.32438889145851135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0004922623959296469, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 9076785.0, + "repeat_count": 1.0, + "routers_loss": 0.012125075794756413, + "skip_count": 5.0, + "step": 5626, + "text_loss": 0.39563658833503723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0004919529179783965, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9080239.0, + "repeat_count": 0.0, + "routers_loss": 0.0026486809365451336, + "skip_count": 0.0, + "step": 5628, + "text_loss": 0.5401569604873657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0004916434431108031, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9083935.0, + "repeat_count": 0.0, + "routers_loss": 0.0011849761940538883, + "skip_count": 0.0, + "step": 5630, + "text_loss": 0.4798774719238281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.000491333971445458, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9087174.0, + "repeat_count": 0.0, + "routers_loss": 0.002799210138618946, + "skip_count": 0.0, + "step": 5632, + "text_loss": 0.22488386929035187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004910245031009515, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 9089803.0, + "repeat_count": 0.0, + "routers_loss": 0.00139117450453341, + "skip_count": 0.0, + "step": 5634, + "text_loss": 0.6237335205078125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0004907150381958723, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9093075.0, + "repeat_count": 0.0, + "routers_loss": 0.006503603886812925, + "skip_count": 1.0, + "step": 5636, + "text_loss": 0.18781614303588867 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0004904055768488077, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9096355.0, + "repeat_count": 0.0, + "routers_loss": 0.0009764843271113932, + "skip_count": 0.0, + "step": 5638, + "text_loss": 0.6821450591087341 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004900961191783445, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9098994.0, + "repeat_count": 1.0, + "routers_loss": 0.00693159457296133, + "skip_count": 3.0, + "step": 5640, + "text_loss": 0.214790940284729 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0004897866653030671, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9102048.0, + "repeat_count": 0.0, + "routers_loss": 0.002469591563567519, + "skip_count": 0.0, + "step": 5642, + "text_loss": 0.1556607335805893 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004894772153415588, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9105379.0, + "repeat_count": 0.0, + "routers_loss": 0.0004824921488761902, + "skip_count": 0.0, + "step": 5644, + "text_loss": 0.499972403049469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004891677694124013, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9108240.0, + "repeat_count": 0.0, + "routers_loss": 0.0029356612358242273, + "skip_count": 1.0, + "step": 5646, + "text_loss": 0.5169754028320312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0174560546875, + "learning_rate": 0.0004888583276341751, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 9111381.0, + "repeat_count": 0.0, + "routers_loss": 0.009489183314144611, + "skip_count": 1.0, + "step": 5648, + "text_loss": 0.23630797863006592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.017822265625, + "learning_rate": 0.0004885488901254588, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9114015.0, + "repeat_count": 0.0, + "routers_loss": 0.004154495894908905, + "skip_count": 1.0, + "step": 5650, + "text_loss": 0.3345947563648224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0004882394570048294, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9117044.0, + "repeat_count": 0.0, + "routers_loss": 0.0018865863094106317, + "skip_count": 0.0, + "step": 5652, + "text_loss": 0.32814112305641174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0004879300283908623, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9120035.0, + "repeat_count": 0.0, + "routers_loss": 0.0035278978757560253, + "skip_count": 1.0, + "step": 5654, + "text_loss": 0.4081386625766754 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00048762060440213096, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 9122955.0, + "repeat_count": 1.0, + "routers_loss": 0.0053498269990086555, + "skip_count": 0.0, + "step": 5656, + "text_loss": 0.31027838587760925 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004873111851572075, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9125635.0, + "repeat_count": 0.0, + "routers_loss": 0.004556098487228155, + "skip_count": 0.0, + "step": 5658, + "text_loss": 0.25703540444374084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004870017707746617, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 9128906.0, + "repeat_count": 0.0, + "routers_loss": 0.0031165245454758406, + "skip_count": 2.0, + "step": 5660, + "text_loss": 0.20663656294345856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004866923613730617, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 9132030.0, + "repeat_count": 1.0, + "routers_loss": 0.004887583665549755, + "skip_count": 2.0, + "step": 5662, + "text_loss": 0.6062649488449097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004863829570709741, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 9135274.0, + "repeat_count": 0.0, + "routers_loss": 0.0021857863757759333, + "skip_count": 0.0, + "step": 5664, + "text_loss": 0.49644309282302856 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.601115350748458, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0004860735579869631, + "loss": 0.0088, + "macro_f1": 0.925203263759613, + "num_tokens": 9139735.0, + "repeat_count": 3.0, + "routers_loss": 0.05413912236690521, + "skip_count": 5.0, + "step": 5666, + "text_loss": 0.25161290168762207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00048576416423959097, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9142419.0, + "repeat_count": 0.0, + "routers_loss": 0.002229376696050167, + "skip_count": 0.0, + "step": 5668, + "text_loss": 0.5332949161529541 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0004854547759474179, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 9145443.0, + "repeat_count": 1.0, + "routers_loss": 0.005968933925032616, + "skip_count": 4.0, + "step": 5670, + "text_loss": 0.5282154083251953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.629292632814792, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0004851453932290021, + "loss": 0.0085, + "macro_f1": 0.3272727429866791, + "num_tokens": 9147754.0, + "repeat_count": 0.0, + "routers_loss": 0.04015754163265228, + "skip_count": 1.0, + "step": 5672, + "text_loss": 0.8564629554748535 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.63868506017024, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00048483601620289974, + "loss": 0.0058, + "macro_f1": 0.8820862174034119, + "num_tokens": 9151714.0, + "repeat_count": 2.0, + "routers_loss": 0.019172413274645805, + "skip_count": 2.0, + "step": 5674, + "text_loss": 0.4149441123008728 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0004845266449876645, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9154524.0, + "repeat_count": 1.0, + "routers_loss": 0.005025535821914673, + "skip_count": 0.0, + "step": 5676, + "text_loss": 0.26525792479515076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.000484217279701848, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9158546.0, + "repeat_count": 0.0, + "routers_loss": 0.0012200147612020373, + "skip_count": 0.0, + "step": 5678, + "text_loss": 0.5532271862030029 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0004839079204639998, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9161003.0, + "repeat_count": 0.0, + "routers_loss": 0.0013485675444826484, + "skip_count": 1.0, + "step": 5680, + "text_loss": 0.36826151609420776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0004835985673926668, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9164741.0, + "repeat_count": 0.0, + "routers_loss": 0.00532014574855566, + "skip_count": 2.0, + "step": 5682, + "text_loss": 0.16154609620571136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0004832892206063938, + "loss": 0.0075, + "macro_f1": 1.0, + "num_tokens": 9168079.0, + "repeat_count": 2.0, + "routers_loss": 0.007782323285937309, + "skip_count": 3.0, + "step": 5684, + "text_loss": 0.4323575496673584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.0004829798802237228, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9171352.0, + "repeat_count": 0.0, + "routers_loss": 0.0024159469176083803, + "skip_count": 2.0, + "step": 5686, + "text_loss": 0.3163119852542877 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.000482670546363194, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 9175197.0, + "repeat_count": 0.0, + "routers_loss": 0.002455134643241763, + "skip_count": 0.0, + "step": 5688, + "text_loss": 0.59735506772995 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.713824479013795, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0004823612191433443, + "loss": 0.0042, + "macro_f1": 0.8820862174034119, + "num_tokens": 9177648.0, + "repeat_count": 2.0, + "routers_loss": 0.015524548478424549, + "skip_count": 2.0, + "step": 5690, + "text_loss": 0.759812593460083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00048205189868270887, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 9180694.0, + "repeat_count": 0.0, + "routers_loss": 0.002112736226990819, + "skip_count": 2.0, + "step": 5692, + "text_loss": 0.3516882061958313 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 26.732609333724685, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.025146484375, + "learning_rate": 0.00048174258509981973, + "loss": 0.0063, + "macro_f1": 0.9262410998344421, + "num_tokens": 9183502.0, + "repeat_count": 2.0, + "routers_loss": 0.03100527822971344, + "skip_count": 3.0, + "step": 5694, + "text_loss": 0.3722715973854065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004814332785132064, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9186417.0, + "repeat_count": 0.0, + "routers_loss": 0.009176591411232948, + "skip_count": 2.0, + "step": 5696, + "text_loss": 0.33363673090934753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.751394188435572, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004811239790413958, + "loss": 0.0076, + "macro_f1": 0.3272727429866791, + "num_tokens": 9189478.0, + "repeat_count": 0.0, + "routers_loss": 0.023586507886648178, + "skip_count": 1.0, + "step": 5698, + "text_loss": 0.19698107242584229 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00048081468680291194, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9192115.0, + "repeat_count": 0.0, + "routers_loss": 0.005083440337330103, + "skip_count": 1.0, + "step": 5700, + "text_loss": 0.3476336896419525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004805054019162764, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9195176.0, + "repeat_count": 0.0, + "routers_loss": 0.007766073569655418, + "skip_count": 1.0, + "step": 5702, + "text_loss": 0.27114811539649963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0004801961245000076, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9199091.0, + "repeat_count": 0.0, + "routers_loss": 0.0009058842551894486, + "skip_count": 0.0, + "step": 5704, + "text_loss": 0.6249846816062927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0004798868546726212, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9202003.0, + "repeat_count": 0.0, + "routers_loss": 0.005479823332279921, + "skip_count": 0.0, + "step": 5706, + "text_loss": 0.47223609685897827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0166015625, + "learning_rate": 0.00047957759255263014, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9205277.0, + "repeat_count": 0.0, + "routers_loss": 0.001055705244652927, + "skip_count": 0.0, + "step": 5708, + "text_loss": 0.677215576171875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00047926833825854377, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9208844.0, + "repeat_count": 0.0, + "routers_loss": 0.003291431115940213, + "skip_count": 2.0, + "step": 5710, + "text_loss": 0.12439999729394913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0004789590919088696, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 9211619.0, + "repeat_count": 0.0, + "routers_loss": 0.005120242480188608, + "skip_count": 2.0, + "step": 5712, + "text_loss": 0.5771954655647278 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0004786498536221111, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 9214914.0, + "repeat_count": 1.0, + "routers_loss": 0.004877795465290546, + "skip_count": 2.0, + "step": 5714, + "text_loss": 0.6432198882102966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.00047834062351676893, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 9218186.0, + "repeat_count": 0.0, + "routers_loss": 0.0026507999282330275, + "skip_count": 0.0, + "step": 5716, + "text_loss": 0.23814935982227325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00047803140171134075, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9221754.0, + "repeat_count": 0.0, + "routers_loss": 0.002605629386380315, + "skip_count": 1.0, + "step": 5718, + "text_loss": 0.2910388708114624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0004777221883243208, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9224502.0, + "repeat_count": 0.0, + "routers_loss": 0.0048494706861674786, + "skip_count": 3.0, + "step": 5720, + "text_loss": 0.6195104122161865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004774129834742004, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 9227350.0, + "repeat_count": 0.0, + "routers_loss": 0.003092368133366108, + "skip_count": 0.0, + "step": 5722, + "text_loss": 0.35447990894317627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00047710378727946725, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 9230166.0, + "repeat_count": 0.0, + "routers_loss": 0.012780336663126945, + "skip_count": 2.0, + "step": 5724, + "text_loss": 0.27581867575645447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00047679459985860604, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9233029.0, + "repeat_count": 0.0, + "routers_loss": 0.005429140292108059, + "skip_count": 1.0, + "step": 5726, + "text_loss": 0.2636827826499939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00047648542133009794, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9236317.0, + "repeat_count": 0.0, + "routers_loss": 0.0023909916635602713, + "skip_count": 0.0, + "step": 5728, + "text_loss": 0.4801979064941406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00047617625181242077, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9239796.0, + "repeat_count": 0.0, + "routers_loss": 0.003603481687605381, + "skip_count": 0.0, + "step": 5730, + "text_loss": 0.8374754786491394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0004758670914240488, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9243489.0, + "repeat_count": 0.0, + "routers_loss": 0.004478964954614639, + "skip_count": 2.0, + "step": 5732, + "text_loss": 0.3870154917240143 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000475557940283453, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9246758.0, + "repeat_count": 0.0, + "routers_loss": 0.00312575395219028, + "skip_count": 1.0, + "step": 5734, + "text_loss": 0.42341071367263794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00047524879850910026, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 9250053.0, + "repeat_count": 0.0, + "routers_loss": 0.010855631902813911, + "skip_count": 4.0, + "step": 5736, + "text_loss": 0.25729796290397644 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0004749396662194549, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 9253691.0, + "repeat_count": 0.0, + "routers_loss": 0.0009250419097952545, + "skip_count": 0.0, + "step": 5738, + "text_loss": 0.6151770949363708 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0004746305435329767, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 9256866.0, + "repeat_count": 1.0, + "routers_loss": 0.007521102204918861, + "skip_count": 3.0, + "step": 5740, + "text_loss": 0.3094986379146576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004743214305681221, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9259790.0, + "repeat_count": 0.0, + "routers_loss": 0.0022241887636482716, + "skip_count": 1.0, + "step": 5742, + "text_loss": 0.5418204069137573 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00047401232744334376, + "loss": 0.0071, + "macro_f1": 1.0, + "num_tokens": 9263205.0, + "repeat_count": 1.0, + "routers_loss": 0.008611299097537994, + "skip_count": 2.0, + "step": 5744, + "text_loss": 0.35824623703956604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 26.976812444966246, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004737032342770906, + "loss": 0.0062, + "macro_f1": 0.5492662787437439, + "num_tokens": 9266126.0, + "repeat_count": 0.0, + "routers_loss": 0.010788857005536556, + "skip_count": 2.0, + "step": 5746, + "text_loss": 0.2172674983739853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0004733941511878074, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9269308.0, + "repeat_count": 0.0, + "routers_loss": 0.005309196189045906, + "skip_count": 2.0, + "step": 5748, + "text_loss": 0.1696814000606537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.00047308507829393594, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 9272801.0, + "repeat_count": 0.0, + "routers_loss": 0.009940510615706444, + "skip_count": 2.0, + "step": 5750, + "text_loss": 0.24295592308044434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00047277601571391314, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9276197.0, + "repeat_count": 0.0, + "routers_loss": 0.000687236781232059, + "skip_count": 0.0, + "step": 5752, + "text_loss": 0.8511804342269897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.014088641033165, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00047246696356617254, + "loss": 0.0059, + "macro_f1": 0.6603773832321167, + "num_tokens": 9278965.0, + "repeat_count": 1.0, + "routers_loss": 0.009816894307732582, + "skip_count": 1.0, + "step": 5754, + "text_loss": 0.45420053601264954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0004721579219691434, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9282076.0, + "repeat_count": 0.0, + "routers_loss": 0.0015747188590466976, + "skip_count": 0.0, + "step": 5756, + "text_loss": 0.21671754121780396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0004718488910412511, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9285465.0, + "repeat_count": 0.0, + "routers_loss": 0.008654040284454823, + "skip_count": 2.0, + "step": 5758, + "text_loss": 0.25920194387435913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00047153987090091674, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9288156.0, + "repeat_count": 0.0, + "routers_loss": 0.0011430777376517653, + "skip_count": 0.0, + "step": 5760, + "text_loss": 0.7655444741249084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004712308616665576, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 9291529.0, + "repeat_count": 0.0, + "routers_loss": 0.003674200503155589, + "skip_count": 2.0, + "step": 5762, + "text_loss": 0.269486665725708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0004709218634565866, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9294699.0, + "repeat_count": 0.0, + "routers_loss": 0.003249827306717634, + "skip_count": 1.0, + "step": 5764, + "text_loss": 0.5073734521865845 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00047061287638941235, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 9297863.0, + "repeat_count": 1.0, + "routers_loss": 0.002763139782473445, + "skip_count": 2.0, + "step": 5766, + "text_loss": 0.2572014033794403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00047030390058343935, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9301124.0, + "repeat_count": 0.0, + "routers_loss": 0.007100266870111227, + "skip_count": 3.0, + "step": 5768, + "text_loss": 0.4147387742996216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0004699949361570676, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 9304330.0, + "repeat_count": 0.0, + "routers_loss": 0.005467240232974291, + "skip_count": 1.0, + "step": 5770, + "text_loss": 0.21510964632034302 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.000469685983228693, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9306882.0, + "repeat_count": 0.0, + "routers_loss": 0.003167890477925539, + "skip_count": 0.0, + "step": 5772, + "text_loss": 0.45717427134513855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.108012914587615, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00046937704191670675, + "loss": 0.0057, + "macro_f1": 0.6601307392120361, + "num_tokens": 9309767.0, + "repeat_count": 1.0, + "routers_loss": 0.014881107024848461, + "skip_count": 2.0, + "step": 5774, + "text_loss": 0.3464985191822052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004690681123394959, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9313045.0, + "repeat_count": 0.0, + "routers_loss": 0.00379011663608253, + "skip_count": 2.0, + "step": 5776, + "text_loss": 0.33194616436958313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00046875919461544265, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 9315736.0, + "repeat_count": 0.0, + "routers_loss": 0.0016733441734686494, + "skip_count": 0.0, + "step": 5778, + "text_loss": 0.5009998679161072 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00046845028886292493, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9318456.0, + "repeat_count": 0.0, + "routers_loss": 0.005318894516676664, + "skip_count": 1.0, + "step": 5780, + "text_loss": 0.17702752351760864 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.145582624009393, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.044921875, + "learning_rate": 0.00046814139520031615, + "loss": 0.006, + "macro_f1": 0.8820862174034119, + "num_tokens": 9323152.0, + "repeat_count": 2.0, + "routers_loss": 0.01133672520518303, + "skip_count": 2.0, + "step": 5782, + "text_loss": 0.2886650860309601 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0004678325137459845, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9326318.0, + "repeat_count": 0.0, + "routers_loss": 0.002458433620631695, + "skip_count": 0.0, + "step": 5784, + "text_loss": 0.5832745432853699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0004675236446182946, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9329779.0, + "repeat_count": 0.0, + "routers_loss": 0.0005402310052886605, + "skip_count": 0.0, + "step": 5786, + "text_loss": 0.5699237585067749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00046721478793560525, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 9333360.0, + "repeat_count": 0.0, + "routers_loss": 0.0002638917067088187, + "skip_count": 0.0, + "step": 5788, + "text_loss": 0.6555714011192322 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00046690594381627106, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9336498.0, + "repeat_count": 0.0, + "routers_loss": 0.003998351749032736, + "skip_count": 2.0, + "step": 5790, + "text_loss": 0.2076750248670578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00046659711237864157, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9339724.0, + "repeat_count": 0.0, + "routers_loss": 0.0045847659930586815, + "skip_count": 1.0, + "step": 5792, + "text_loss": 0.22027169167995453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.00046628829374106167, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 9342835.0, + "repeat_count": 0.0, + "routers_loss": 0.0014064523857086897, + "skip_count": 1.0, + "step": 5794, + "text_loss": 0.5120179057121277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004659794880218712, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9346757.0, + "repeat_count": 0.0, + "routers_loss": 0.0011155207175761461, + "skip_count": 1.0, + "step": 5796, + "text_loss": 0.6415372490882874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004656706953394051, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 9349652.0, + "repeat_count": 0.0, + "routers_loss": 0.0020385095849633217, + "skip_count": 0.0, + "step": 5798, + "text_loss": 0.5410398840904236 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0004653619158119933, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 9354286.0, + "repeat_count": 1.0, + "routers_loss": 0.0012847178149968386, + "skip_count": 0.0, + "step": 5800, + "text_loss": 0.4386860728263855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.00046505314955796074, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9357682.0, + "repeat_count": 0.0, + "routers_loss": 0.0035008061677217484, + "skip_count": 2.0, + "step": 5802, + "text_loss": 0.13655950129032135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00046474439669562715, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9361058.0, + "repeat_count": 0.0, + "routers_loss": 0.0020033426117151976, + "skip_count": 1.0, + "step": 5804, + "text_loss": 0.6293444037437439 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00046443565734330714, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9364173.0, + "repeat_count": 0.0, + "routers_loss": 0.0004935986362397671, + "skip_count": 0.0, + "step": 5806, + "text_loss": 0.2923166751861572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004641269316193104, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9366980.0, + "repeat_count": 0.0, + "routers_loss": 0.001654456602409482, + "skip_count": 0.0, + "step": 5808, + "text_loss": 0.7273373007774353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0004638182196419411, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 9370581.0, + "repeat_count": 0.0, + "routers_loss": 0.0017011919990181923, + "skip_count": 0.0, + "step": 5810, + "text_loss": 0.6029995083808899 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 27.286469034341064, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.038330078125, + "learning_rate": 0.0004635095215294984, + "loss": 0.0072, + "macro_f1": 0.9265305995941162, + "num_tokens": 9374233.0, + "repeat_count": 1.0, + "routers_loss": 0.01361197978258133, + "skip_count": 3.0, + "step": 5812, + "text_loss": 0.14051523804664612 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00046320083740027584, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 9377217.0, + "repeat_count": 0.0, + "routers_loss": 0.004597014281898737, + "skip_count": 0.0, + "step": 5814, + "text_loss": 0.2766880691051483 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 27.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00046289216737256184, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 9380336.0, + "repeat_count": 3.0, + "routers_loss": 0.006628422066569328, + "skip_count": 1.0, + "step": 5816, + "text_loss": 0.8092381954193115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0004625835115646393, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9382968.0, + "repeat_count": 0.0, + "routers_loss": 0.002737772185355425, + "skip_count": 0.0, + "step": 5818, + "text_loss": 0.22090643644332886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 27.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0004622748700947856, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 9386203.0, + "repeat_count": 1.0, + "routers_loss": 0.004552177153527737, + "skip_count": 1.0, + "step": 5820, + "text_loss": 0.42869850993156433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0004619662430812729, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9388968.0, + "repeat_count": 0.0, + "routers_loss": 0.003149240743368864, + "skip_count": 2.0, + "step": 5822, + "text_loss": 0.45137661695480347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0004616576306423677, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 9392487.0, + "repeat_count": 0.0, + "routers_loss": 0.0008133690571412444, + "skip_count": 0.0, + "step": 5824, + "text_loss": 0.638685941696167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0004613490328963307, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 9395665.0, + "repeat_count": 0.0, + "routers_loss": 0.00042717234464362264, + "skip_count": 0.0, + "step": 5826, + "text_loss": 0.8134317398071289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00046104044996141716, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9398831.0, + "repeat_count": 0.0, + "routers_loss": 0.0084775285795331, + "skip_count": 2.0, + "step": 5828, + "text_loss": 0.19263958930969238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0004607318819558768, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 9403118.0, + "repeat_count": 1.0, + "routers_loss": 0.0030239911284297705, + "skip_count": 0.0, + "step": 5830, + "text_loss": 0.45556432008743286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 27.38039330789551, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0458984375, + "learning_rate": 0.00046042332899795313, + "loss": 0.0075, + "macro_f1": 0.5492662787437439, + "num_tokens": 9406206.0, + "repeat_count": 0.0, + "routers_loss": 0.026389889419078827, + "skip_count": 2.0, + "step": 5832, + "text_loss": 0.26458361744880676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0004601147912058845, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 9409806.0, + "repeat_count": 0.0, + "routers_loss": 0.0013476534513756633, + "skip_count": 0.0, + "step": 5834, + "text_loss": 0.7443689107894897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004598062686979033, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9412737.0, + "repeat_count": 0.0, + "routers_loss": 0.004275512881577015, + "skip_count": 1.0, + "step": 5836, + "text_loss": 0.2808683514595032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00045949776159223563, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9415818.0, + "repeat_count": 0.0, + "routers_loss": 0.0027225434314459562, + "skip_count": 0.0, + "step": 5838, + "text_loss": 0.6283587217330933 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0004591892700071022, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 9419119.0, + "repeat_count": 1.0, + "routers_loss": 0.01574302278459072, + "skip_count": 2.0, + "step": 5840, + "text_loss": 0.33239027857780457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00045888079406071746, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 9422257.0, + "repeat_count": 0.0, + "routers_loss": 0.0007227854221127927, + "skip_count": 0.0, + "step": 5842, + "text_loss": 0.6658740043640137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00045857233387129, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9425071.0, + "repeat_count": 0.0, + "routers_loss": 0.0020696306601166725, + "skip_count": 2.0, + "step": 5844, + "text_loss": 0.5773820877075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0004582638895570224, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9427980.0, + "repeat_count": 0.0, + "routers_loss": 0.0019764541648328304, + "skip_count": 0.0, + "step": 5846, + "text_loss": 0.3388919532299042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.455532726739065, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.000457955461236111, + "loss": 0.0058, + "macro_f1": 0.3272727429866791, + "num_tokens": 9430733.0, + "repeat_count": 1.0, + "routers_loss": 0.04235004261136055, + "skip_count": 0.0, + "step": 5848, + "text_loss": 0.44346582889556885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004576470490267462, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 9433347.0, + "repeat_count": 0.0, + "routers_loss": 0.000801609072368592, + "skip_count": 0.0, + "step": 5850, + "text_loss": 0.5825944542884827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0004573386530471121, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9436172.0, + "repeat_count": 0.0, + "routers_loss": 0.0018224078230559826, + "skip_count": 2.0, + "step": 5852, + "text_loss": 0.8111652135848999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004570302734153866, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9439040.0, + "repeat_count": 0.0, + "routers_loss": 0.006614950485527515, + "skip_count": 2.0, + "step": 5854, + "text_loss": 0.31270334124565125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0004567219102497412, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9442138.0, + "repeat_count": 0.0, + "routers_loss": 0.0012984242057427764, + "skip_count": 0.0, + "step": 5856, + "text_loss": 0.6126856803894043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004564135636683416, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9445600.0, + "repeat_count": 0.0, + "routers_loss": 0.0008388847345486283, + "skip_count": 0.0, + "step": 5858, + "text_loss": 0.8526380658149719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046875, + "learning_rate": 0.0004561052337893467, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 9449609.0, + "repeat_count": 0.0, + "routers_loss": 0.008125773631036282, + "skip_count": 2.0, + "step": 5860, + "text_loss": 0.2843833863735199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.000455796920730909, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9452756.0, + "repeat_count": 0.0, + "routers_loss": 0.0019371749367564917, + "skip_count": 0.0, + "step": 5862, + "text_loss": 0.5293750166893005 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0004554886246111746, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9455467.0, + "repeat_count": 1.0, + "routers_loss": 0.005594742484390736, + "skip_count": 2.0, + "step": 5864, + "text_loss": 0.572329044342041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004551803455482833, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9458953.0, + "repeat_count": 0.0, + "routers_loss": 0.005960086826235056, + "skip_count": 3.0, + "step": 5866, + "text_loss": 0.19459208846092224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00045487208366036807, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 9462130.0, + "repeat_count": 0.0, + "routers_loss": 0.0034781871363520622, + "skip_count": 1.0, + "step": 5868, + "text_loss": 0.20467053353786469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00045456383906555554, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9465590.0, + "repeat_count": 0.0, + "routers_loss": 0.0012246103724464774, + "skip_count": 0.0, + "step": 5870, + "text_loss": 0.6086251735687256 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00045425561188196565, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9468092.0, + "repeat_count": 0.0, + "routers_loss": 0.002874316181987524, + "skip_count": 1.0, + "step": 5872, + "text_loss": 0.3430633544921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004539474022277115, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9471433.0, + "repeat_count": 0.0, + "routers_loss": 0.004340244457125664, + "skip_count": 2.0, + "step": 5874, + "text_loss": 0.28219133615493774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0004536392102208997, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9474363.0, + "repeat_count": 0.0, + "routers_loss": 0.0007322742021642625, + "skip_count": 0.0, + "step": 5876, + "text_loss": 0.7305856943130493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0004533310359796299, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9478469.0, + "repeat_count": 0.0, + "routers_loss": 0.0018631393322721124, + "skip_count": 0.0, + "step": 5878, + "text_loss": 0.5821442604064941 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 27.60581156442618, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0004530228796219952, + "loss": 0.0088, + "macro_f1": 0.9262410998344421, + "num_tokens": 9481200.0, + "repeat_count": 2.0, + "routers_loss": 0.026109615340828896, + "skip_count": 3.0, + "step": 5880, + "text_loss": 0.3962891101837158 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00045271474126608167, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9484200.0, + "repeat_count": 0.0, + "routers_loss": 0.0004716445691883564, + "skip_count": 0.0, + "step": 5882, + "text_loss": 0.31901776790618896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004524066210299685, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 9488939.0, + "repeat_count": 0.0, + "routers_loss": 0.0003797562967520207, + "skip_count": 0.0, + "step": 5884, + "text_loss": 0.3992912471294403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004520985190317279, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 9492010.0, + "repeat_count": 0.0, + "routers_loss": 0.005681614391505718, + "skip_count": 1.0, + "step": 5886, + "text_loss": 0.5318995118141174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0004517904353894253, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9494770.0, + "repeat_count": 0.0, + "routers_loss": 0.0021422000136226416, + "skip_count": 0.0, + "step": 5888, + "text_loss": 0.435088187456131 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.652773701203404, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0004514823702211187, + "loss": 0.0052, + "macro_f1": 0.8820862174034119, + "num_tokens": 9497327.0, + "repeat_count": 2.0, + "routers_loss": 0.01593884639441967, + "skip_count": 2.0, + "step": 5890, + "text_loss": 0.5068450570106506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.662166128558848, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00045117432364485927, + "loss": 0.0075, + "macro_f1": 0.6601307392120361, + "num_tokens": 9500488.0, + "repeat_count": 1.0, + "routers_loss": 0.0729660913348198, + "skip_count": 2.0, + "step": 5892, + "text_loss": 0.42718732357025146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.00045086629577869127, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9503593.0, + "repeat_count": 0.0, + "routers_loss": 0.007092897780239582, + "skip_count": 2.0, + "step": 5894, + "text_loss": 0.4264345169067383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.00045055828674065134, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9507188.0, + "repeat_count": 0.0, + "routers_loss": 0.004088073968887329, + "skip_count": 2.0, + "step": 5896, + "text_loss": 0.20932413637638092 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00045025029664876926, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9510126.0, + "repeat_count": 1.0, + "routers_loss": 0.0026970503386110067, + "skip_count": 0.0, + "step": 5898, + "text_loss": 0.47661110758781433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0164794921875, + "learning_rate": 0.0004499423256210673, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9513891.0, + "repeat_count": 0.0, + "routers_loss": 0.003428407246246934, + "skip_count": 0.0, + "step": 5900, + "text_loss": 0.18232668936252594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00044963437377556066, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9516718.0, + "repeat_count": 0.0, + "routers_loss": 0.0020270352251827717, + "skip_count": 0.0, + "step": 5902, + "text_loss": 0.16833586990833282 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.000449326441230257, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9520248.0, + "repeat_count": 0.0, + "routers_loss": 0.0019144838443025947, + "skip_count": 0.0, + "step": 5904, + "text_loss": 0.44434574246406555 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00044901852810315634, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9523651.0, + "repeat_count": 0.0, + "routers_loss": 0.0044578867964446545, + "skip_count": 2.0, + "step": 5906, + "text_loss": 0.1248839721083641 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0004487106345122522, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9527235.0, + "repeat_count": 0.0, + "routers_loss": 0.000827222247608006, + "skip_count": 0.0, + "step": 5908, + "text_loss": 0.6052893996238708 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.74669797475785, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004484027605755296, + "loss": 0.0065, + "macro_f1": 0.5492662787437439, + "num_tokens": 9530407.0, + "repeat_count": 2.0, + "routers_loss": 0.029739778488874435, + "skip_count": 0.0, + "step": 5910, + "text_loss": 0.7625715732574463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.00044809490641096653, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 9533229.0, + "repeat_count": 0.0, + "routers_loss": 0.0025658784434199333, + "skip_count": 0.0, + "step": 5912, + "text_loss": 0.27842655777931213 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 27.76548282946874, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.042724609375, + "learning_rate": 0.00044778707213653324, + "loss": 0.0069, + "macro_f1": 0.9265305995941162, + "num_tokens": 9537397.0, + "repeat_count": 1.0, + "routers_loss": 0.010157953947782516, + "skip_count": 3.0, + "step": 5914, + "text_loss": 0.45196083188056946 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004474792578701924, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9540564.0, + "repeat_count": 3.0, + "routers_loss": 0.011994685977697372, + "skip_count": 5.0, + "step": 5916, + "text_loss": 0.22617442905902863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000447171463729899, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9543602.0, + "repeat_count": 0.0, + "routers_loss": 0.0022214490454643965, + "skip_count": 0.0, + "step": 5918, + "text_loss": 0.5089073777198792 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004468636898336003, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 9546829.0, + "repeat_count": 1.0, + "routers_loss": 0.009353389963507652, + "skip_count": 2.0, + "step": 5920, + "text_loss": 0.7560386657714844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.00044655593629923596, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 9550259.0, + "repeat_count": 0.0, + "routers_loss": 0.005637963302433491, + "skip_count": 0.0, + "step": 5922, + "text_loss": 0.17084793746471405 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00044624820324473766, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9554376.0, + "repeat_count": 1.0, + "routers_loss": 0.008556432090699673, + "skip_count": 2.0, + "step": 5924, + "text_loss": 0.5906872749328613 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004459404907880292, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9558348.0, + "repeat_count": 1.0, + "routers_loss": 0.0016659445827826858, + "skip_count": 0.0, + "step": 5926, + "text_loss": 0.8197194933891296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.00044563279904702674, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9561139.0, + "repeat_count": 0.0, + "routers_loss": 0.01341368816792965, + "skip_count": 3.0, + "step": 5928, + "text_loss": 0.3264874815940857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.000445325128139638, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9564387.0, + "repeat_count": 0.0, + "routers_loss": 0.005023977253586054, + "skip_count": 2.0, + "step": 5930, + "text_loss": 0.9055862426757812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004450174781837635, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9567053.0, + "repeat_count": 0.0, + "routers_loss": 0.0006051476229913533, + "skip_count": 0.0, + "step": 5932, + "text_loss": 0.6908539533615112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0004447098492972951, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9570036.0, + "repeat_count": 0.0, + "routers_loss": 0.003152312943711877, + "skip_count": 0.0, + "step": 5934, + "text_loss": 0.6321061849594116 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0004444022415981167, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 9574146.0, + "repeat_count": 0.0, + "routers_loss": 0.004859412554651499, + "skip_count": 1.0, + "step": 5936, + "text_loss": 0.5905604958534241 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 27.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.00044409465520410426, + "loss": 0.0071, + "macro_f1": 1.0, + "num_tokens": 9577071.0, + "repeat_count": 1.0, + "routers_loss": 0.004376287572085857, + "skip_count": 1.0, + "step": 5938, + "text_loss": 0.6928377747535706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00044378709023312535, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9580537.0, + "repeat_count": 0.0, + "routers_loss": 0.004038849379867315, + "skip_count": 1.0, + "step": 5940, + "text_loss": 0.2686770558357239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0004434795468030396, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9583225.0, + "repeat_count": 0.0, + "routers_loss": 0.005459951236844063, + "skip_count": 2.0, + "step": 5942, + "text_loss": 0.16855180263519287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.000443172025031698, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 9586018.0, + "repeat_count": 0.0, + "routers_loss": 0.0032985717989504337, + "skip_count": 2.0, + "step": 5944, + "text_loss": 0.20335732400417328 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004428645250369437, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 9589321.0, + "repeat_count": 1.0, + "routers_loss": 0.003573323367163539, + "skip_count": 0.0, + "step": 5946, + "text_loss": 0.6318653225898743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00044255704693661117, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9592518.0, + "repeat_count": 0.0, + "routers_loss": 0.002226749900728464, + "skip_count": 0.0, + "step": 5948, + "text_loss": 0.5320658683776855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004422495908485265, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9595664.0, + "repeat_count": 0.0, + "routers_loss": 0.0007805621717125177, + "skip_count": 0.0, + "step": 5950, + "text_loss": 0.6330106258392334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0004419421568905077, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9598885.0, + "repeat_count": 0.0, + "routers_loss": 0.0017050127498805523, + "skip_count": 0.0, + "step": 5952, + "text_loss": 0.6098045706748962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00044163474518036375, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9603021.0, + "repeat_count": 0.0, + "routers_loss": 0.0025974081363528967, + "skip_count": 0.0, + "step": 5954, + "text_loss": 0.2655932903289795 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.00044132735583589567, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 9605841.0, + "repeat_count": 1.0, + "routers_loss": 0.010364850051701069, + "skip_count": 2.0, + "step": 5956, + "text_loss": 0.3028552532196045 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.015869140625, + "learning_rate": 0.00044101998897489553, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 9608810.0, + "repeat_count": 1.0, + "routers_loss": 0.0015063622267916799, + "skip_count": 0.0, + "step": 5958, + "text_loss": 0.5602094531059265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 27.981508658643968, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.02880859375, + "learning_rate": 0.00044071264471514683, + "loss": 0.0051, + "macro_f1": 0.5934640765190125, + "num_tokens": 9611995.0, + "repeat_count": 0.0, + "routers_loss": 0.011538165621459484, + "skip_count": 3.0, + "step": 5960, + "text_loss": 0.14332173764705658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00044040532317442455, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 9615434.0, + "repeat_count": 0.0, + "routers_loss": 0.004693889059126377, + "skip_count": 0.0, + "step": 5962, + "text_loss": 0.334369033575058 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00044009802447049474, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9618056.0, + "repeat_count": 1.0, + "routers_loss": 0.0045085870660841465, + "skip_count": 1.0, + "step": 5964, + "text_loss": 0.8163170218467712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00043979074872111507, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9621428.0, + "repeat_count": 0.0, + "routers_loss": 0.0018220023484900594, + "skip_count": 0.0, + "step": 5966, + "text_loss": 0.2513850927352905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0004394834960440341, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 9625433.0, + "repeat_count": 4.0, + "routers_loss": 0.007051277905702591, + "skip_count": 5.0, + "step": 5968, + "text_loss": 0.6263421177864075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.00043917626655699154, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 9629508.0, + "repeat_count": 0.0, + "routers_loss": 0.0006454752874560654, + "skip_count": 0.0, + "step": 5970, + "text_loss": 0.645618736743927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0004388690603777184, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9632504.0, + "repeat_count": 0.0, + "routers_loss": 0.004847112577408552, + "skip_count": 1.0, + "step": 5972, + "text_loss": 0.47306978702545166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00043856187762393665, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 9636685.0, + "repeat_count": 0.0, + "routers_loss": 0.0006580828921869397, + "skip_count": 0.0, + "step": 5974, + "text_loss": 0.42226532101631165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0004382547184133593, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 9639958.0, + "repeat_count": 0.0, + "routers_loss": 0.002188180573284626, + "skip_count": 0.0, + "step": 5976, + "text_loss": 0.4456600248813629 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004379475828636901, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 9643228.0, + "repeat_count": 1.0, + "routers_loss": 0.0017135308589786291, + "skip_count": 2.0, + "step": 5978, + "text_loss": 0.6295822262763977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004376404710926244, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 9646746.0, + "repeat_count": 0.0, + "routers_loss": 0.0008841048111207783, + "skip_count": 0.0, + "step": 5980, + "text_loss": 0.5102712512016296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00043733338321784784, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9649452.0, + "repeat_count": 0.0, + "routers_loss": 0.0006229099817574024, + "skip_count": 0.0, + "step": 5982, + "text_loss": 0.6944046020507812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000437026319357037, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9652700.0, + "repeat_count": 0.0, + "routers_loss": 0.005293759983032942, + "skip_count": 2.0, + "step": 5984, + "text_loss": 0.6748214960098267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00043671927962785946, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 9655825.0, + "repeat_count": 0.0, + "routers_loss": 0.0013537590857595205, + "skip_count": 0.0, + "step": 5986, + "text_loss": 1.000306248664856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004364122641479733, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9658713.0, + "repeat_count": 0.0, + "routers_loss": 0.004548195283859968, + "skip_count": 0.0, + "step": 5988, + "text_loss": 0.24580086767673492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 28.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004361052730350275, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9661535.0, + "repeat_count": 0.0, + "routers_loss": 0.011149964295327663, + "skip_count": 4.0, + "step": 5990, + "text_loss": 0.5737863779067993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00043579830640666154, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 9664406.0, + "repeat_count": 1.0, + "routers_loss": 0.003783488878980279, + "skip_count": 1.0, + "step": 5992, + "text_loss": 0.7836558222770691 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.00043549136438050573, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 9669050.0, + "repeat_count": 0.0, + "routers_loss": 0.0050374288111925125, + "skip_count": 1.0, + "step": 5994, + "text_loss": 0.13072487711906433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.00043518444707418076, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9672698.0, + "repeat_count": 0.0, + "routers_loss": 0.004047670867294073, + "skip_count": 2.0, + "step": 5996, + "text_loss": 0.4748993217945099 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00043487755460529796, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9676159.0, + "repeat_count": 0.0, + "routers_loss": 0.008628991432487965, + "skip_count": 2.0, + "step": 5998, + "text_loss": 0.1921990066766739 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00043457068709145904, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 9679528.0, + "repeat_count": 3.0, + "routers_loss": 0.01094671618193388, + "skip_count": 3.0, + "step": 6000, + "text_loss": 0.3651769459247589 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 28.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00043426384465025604, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 9682677.0, + "repeat_count": 2.0, + "routers_loss": 0.0011284075444564223, + "skip_count": 0.0, + "step": 6002, + "text_loss": 0.28305181860923767 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.000433957027399272, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9685310.0, + "repeat_count": 0.0, + "routers_loss": 0.0030473743099719286, + "skip_count": 1.0, + "step": 6004, + "text_loss": 0.3650054931640625 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00043365023545607965, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9687944.0, + "repeat_count": 1.0, + "routers_loss": 0.011621905490756035, + "skip_count": 2.0, + "step": 6006, + "text_loss": 0.5409000515937805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004333434689382423, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 9690932.0, + "repeat_count": 0.0, + "routers_loss": 0.0005297541501931846, + "skip_count": 0.0, + "step": 6008, + "text_loss": 0.4311029314994812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.216025829175226, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.00043303672796331336, + "loss": 0.0058, + "macro_f1": 0.3272727429866791, + "num_tokens": 9693972.0, + "repeat_count": 1.0, + "routers_loss": 0.06166421249508858, + "skip_count": 0.0, + "step": 6010, + "text_loss": 0.2658997178077698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00043273001264883655, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 9697712.0, + "repeat_count": 0.0, + "routers_loss": 0.0018419031985104084, + "skip_count": 0.0, + "step": 6012, + "text_loss": 0.5813497304916382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004324233231123458, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 9700746.0, + "repeat_count": 0.0, + "routers_loss": 0.003635555040091276, + "skip_count": 0.0, + "step": 6014, + "text_loss": 0.24211904406547546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 28.24420311124156, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.038330078125, + "learning_rate": 0.0004321166594713651, + "loss": 0.0048, + "macro_f1": 0.5492662787437439, + "num_tokens": 9704087.0, + "repeat_count": 0.0, + "routers_loss": 0.021067705005407333, + "skip_count": 2.0, + "step": 6016, + "text_loss": 0.5908042788505554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00043181002184340857, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 9708695.0, + "repeat_count": 0.0, + "routers_loss": 0.0008712753187865019, + "skip_count": 0.0, + "step": 6018, + "text_loss": 0.7788549661636353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.26298796595245, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0004315034103459803, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 9711631.0, + "repeat_count": 1.0, + "routers_loss": 0.03231092542409897, + "skip_count": 0.0, + "step": 6020, + "text_loss": 0.6127741932868958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0004311968250965743, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9715526.0, + "repeat_count": 0.0, + "routers_loss": 0.0020149527117609978, + "skip_count": 2.0, + "step": 6022, + "text_loss": 0.49970078468322754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0004308902662126748, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9718475.0, + "repeat_count": 0.0, + "routers_loss": 0.0031795913819223642, + "skip_count": 0.0, + "step": 6024, + "text_loss": 0.3254713714122772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.291165248018785, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00043058373381175567, + "loss": 0.004, + "macro_f1": 0.3272727429866791, + "num_tokens": 9722194.0, + "repeat_count": 0.0, + "routers_loss": 0.0148378387093544, + "skip_count": 1.0, + "step": 6026, + "text_loss": 0.17670343816280365 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0004302772280112806, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 9725489.0, + "repeat_count": 1.0, + "routers_loss": 0.005742347799241543, + "skip_count": 2.0, + "step": 6028, + "text_loss": 0.26184776425361633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00042997074892870335, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9729416.0, + "repeat_count": 0.0, + "routers_loss": 0.0023561837151646614, + "skip_count": 0.0, + "step": 6030, + "text_loss": 0.3026008605957031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0004296642966814673, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9732559.0, + "repeat_count": 0.0, + "routers_loss": 0.0010108393616974354, + "skip_count": 1.0, + "step": 6032, + "text_loss": 0.43198078870773315 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00042935787138700525, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 9736324.0, + "repeat_count": 2.0, + "routers_loss": 0.005443581845611334, + "skip_count": 2.0, + "step": 6034, + "text_loss": 0.24883155524730682 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0004290514731627403, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 9739630.0, + "repeat_count": 1.0, + "routers_loss": 0.010645060800015926, + "skip_count": 2.0, + "step": 6036, + "text_loss": 0.24207182228565216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018798828125, + "learning_rate": 0.0004287451021260846, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9742221.0, + "repeat_count": 0.0, + "routers_loss": 0.0008162845042534173, + "skip_count": 0.0, + "step": 6038, + "text_loss": 0.33018553256988525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0004284387583944403, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9744925.0, + "repeat_count": 0.0, + "routers_loss": 0.003782407147809863, + "skip_count": 1.0, + "step": 6040, + "text_loss": 0.6600399613380432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0004281324420851987, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9748103.0, + "repeat_count": 0.0, + "routers_loss": 0.0009834285592660308, + "skip_count": 0.0, + "step": 6042, + "text_loss": 0.6402350664138794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0004278261533157409, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 9751128.0, + "repeat_count": 0.0, + "routers_loss": 0.004100334830582142, + "skip_count": 2.0, + "step": 6044, + "text_loss": 0.1545136719942093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0004275198922034372, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 9754140.0, + "repeat_count": 0.0, + "routers_loss": 0.0017166603356599808, + "skip_count": 1.0, + "step": 6046, + "text_loss": 0.5875935554504395 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00042721365886564766, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 9756945.0, + "repeat_count": 1.0, + "routers_loss": 0.00915827602148056, + "skip_count": 2.0, + "step": 6048, + "text_loss": 0.3885214328765869 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00042690745341972134, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9759738.0, + "repeat_count": 0.0, + "routers_loss": 0.0057020667009055614, + "skip_count": 2.0, + "step": 6050, + "text_loss": 0.3107164204120636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.00042660127598299647, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9762987.0, + "repeat_count": 0.0, + "routers_loss": 0.004196313209831715, + "skip_count": 2.0, + "step": 6052, + "text_loss": 0.3073577582836151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00042629512667280135, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 9765828.0, + "repeat_count": 0.0, + "routers_loss": 0.0023119752295315266, + "skip_count": 1.0, + "step": 6054, + "text_loss": 0.8228643536567688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004259890056064527, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 9769129.0, + "repeat_count": 0.0, + "routers_loss": 0.0021007524337619543, + "skip_count": 1.0, + "step": 6056, + "text_loss": 0.8334706425666809 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0004256829129012568, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 9771821.0, + "repeat_count": 1.0, + "routers_loss": 0.00671970471739769, + "skip_count": 2.0, + "step": 6058, + "text_loss": 0.17845536768436432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00042537684867450875, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 9774566.0, + "repeat_count": 0.0, + "routers_loss": 0.0014770646812394261, + "skip_count": 0.0, + "step": 6060, + "text_loss": 0.4445459246635437 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.46022894041679, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00042507081304349315, + "loss": 0.0067, + "macro_f1": 0.5492662787437439, + "num_tokens": 9777909.0, + "repeat_count": 2.0, + "routers_loss": 0.014822427183389664, + "skip_count": 0.0, + "step": 6062, + "text_loss": 0.45526158809661865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004247648061254833, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9781159.0, + "repeat_count": 0.0, + "routers_loss": 0.00568385748192668, + "skip_count": 1.0, + "step": 6064, + "text_loss": 0.18535588681697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.479013795127678, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00042445882803774173, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 9784960.0, + "repeat_count": 1.0, + "routers_loss": 0.0179694052785635, + "skip_count": 0.0, + "step": 6066, + "text_loss": 0.23591181635856628 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00042415287889751966, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9787941.0, + "repeat_count": 0.0, + "routers_loss": 0.0019039154285565019, + "skip_count": 0.0, + "step": 6068, + "text_loss": 0.9447930455207825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0004238469588220575, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9791096.0, + "repeat_count": 0.0, + "routers_loss": 0.004039563238620758, + "skip_count": 0.0, + "step": 6070, + "text_loss": 0.3134256601333618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00042354106792858446, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 9794082.0, + "repeat_count": 0.0, + "routers_loss": 0.0018352365586906672, + "skip_count": 0.0, + "step": 6072, + "text_loss": 0.5681536197662354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.00042323520633431833, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 9797303.0, + "repeat_count": 0.0, + "routers_loss": 0.0019325513858348131, + "skip_count": 0.0, + "step": 6074, + "text_loss": 0.2835809290409088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00042292937415646574, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 9800435.0, + "repeat_count": 0.0, + "routers_loss": 0.002513401210308075, + "skip_count": 0.0, + "step": 6076, + "text_loss": 0.1931663602590561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00042262357151222265, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9803873.0, + "repeat_count": 0.0, + "routers_loss": 0.004864581860601902, + "skip_count": 0.0, + "step": 6078, + "text_loss": 0.25809767842292786 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004223177985187728, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9806438.0, + "repeat_count": 1.0, + "routers_loss": 0.004932792857289314, + "skip_count": 0.0, + "step": 6080, + "text_loss": 0.6409249305725098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00042201205529328925, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9809400.0, + "repeat_count": 0.0, + "routers_loss": 0.00590938376262784, + "skip_count": 1.0, + "step": 6082, + "text_loss": 0.31158050894737244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00042170634195293314, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9813246.0, + "repeat_count": 0.0, + "routers_loss": 0.006805860437452793, + "skip_count": 0.0, + "step": 6084, + "text_loss": 0.32945963740348816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004214006586148545, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 9816513.0, + "repeat_count": 0.0, + "routers_loss": 0.0010186503641307354, + "skip_count": 0.0, + "step": 6086, + "text_loss": 0.48659923672676086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0004210950053961917, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9819908.0, + "repeat_count": 0.0, + "routers_loss": 0.00402973173186183, + "skip_count": 1.0, + "step": 6088, + "text_loss": 0.6249601244926453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00042078938241407174, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9822950.0, + "repeat_count": 0.0, + "routers_loss": 0.00236532068811357, + "skip_count": 1.0, + "step": 6090, + "text_loss": 0.26589256525039673 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0004204837897856098, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 9826493.0, + "repeat_count": 1.0, + "routers_loss": 0.003072192659601569, + "skip_count": 2.0, + "step": 6092, + "text_loss": 0.5216912627220154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004201782276279096, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9829698.0, + "repeat_count": 0.0, + "routers_loss": 0.0027553171385079622, + "skip_count": 1.0, + "step": 6094, + "text_loss": 0.40127676725387573 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.61990020545935, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00041987269605806325, + "loss": 0.0045, + "macro_f1": 0.9442509412765503, + "num_tokens": 9833719.0, + "repeat_count": 4.0, + "routers_loss": 0.013845407404005527, + "skip_count": 4.0, + "step": 6096, + "text_loss": 0.23114071786403656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0004195671951931509, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 9838235.0, + "repeat_count": 0.0, + "routers_loss": 0.0019887303933501244, + "skip_count": 2.0, + "step": 6098, + "text_loss": 0.7467341423034668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004192617251502409, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 9840867.0, + "repeat_count": 0.0, + "routers_loss": 0.0007213905337266624, + "skip_count": 0.0, + "step": 6100, + "text_loss": 0.6283472180366516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00041895628604639036, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 9843827.0, + "repeat_count": 0.0, + "routers_loss": 0.003863139310851693, + "skip_count": 1.0, + "step": 6102, + "text_loss": 0.3602744936943054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00041865087799864374, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 9846939.0, + "repeat_count": 0.0, + "routers_loss": 0.0013336286647245288, + "skip_count": 0.0, + "step": 6104, + "text_loss": 0.4182434678077698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0004183455011240341, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 9849827.0, + "repeat_count": 0.0, + "routers_loss": 0.00038455065805464983, + "skip_count": 0.0, + "step": 6106, + "text_loss": 0.7122722864151001 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 28.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004180401555395826, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 9853487.0, + "repeat_count": 3.0, + "routers_loss": 0.0038226440083235502, + "skip_count": 1.0, + "step": 6108, + "text_loss": 0.2521185576915741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004177348413622981, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 9856321.0, + "repeat_count": 0.0, + "routers_loss": 0.0015809801407158375, + "skip_count": 0.0, + "step": 6110, + "text_loss": 0.423979252576828 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004174295587091776, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 9859238.0, + "repeat_count": 0.0, + "routers_loss": 0.0007586454739794135, + "skip_count": 0.0, + "step": 6112, + "text_loss": 0.4720100462436676 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00041712430769720593, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 9862282.0, + "repeat_count": 1.0, + "routers_loss": 0.0045816488564014435, + "skip_count": 1.0, + "step": 6114, + "text_loss": 0.279577374458313 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0004168190884433559, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 9865394.0, + "repeat_count": 1.0, + "routers_loss": 0.004728195257484913, + "skip_count": 1.0, + "step": 6116, + "text_loss": 0.3826395571231842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 28.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.0004165139010645881, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9869165.0, + "repeat_count": 0.0, + "routers_loss": 0.006160226184874773, + "skip_count": 3.0, + "step": 6118, + "text_loss": 0.4668935537338257 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 28.732609333724685, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.04736328125, + "learning_rate": 0.0004162087456778509, + "loss": 0.0074, + "macro_f1": 0.9619450569152832, + "num_tokens": 9872381.0, + "repeat_count": 1.0, + "routers_loss": 0.027831824496388435, + "skip_count": 6.0, + "step": 6120, + "text_loss": 0.28708913922309875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004159036224000804, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9875668.0, + "repeat_count": 0.0, + "routers_loss": 0.0030764432158321142, + "skip_count": 1.0, + "step": 6122, + "text_loss": 0.37078607082366943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004155985313482002, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9878533.0, + "repeat_count": 0.0, + "routers_loss": 0.00043521137558855116, + "skip_count": 0.0, + "step": 6124, + "text_loss": 0.34975379705429077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00041529347263912224, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 9881478.0, + "repeat_count": 0.0, + "routers_loss": 0.0016251741908490658, + "skip_count": 0.0, + "step": 6126, + "text_loss": 0.39166271686553955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00041498844638974535, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 9884252.0, + "repeat_count": 1.0, + "routers_loss": 0.019553523510694504, + "skip_count": 0.0, + "step": 6128, + "text_loss": 0.2309480905532837 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0004146834527169562, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 9887485.0, + "repeat_count": 1.0, + "routers_loss": 0.0036251386627554893, + "skip_count": 0.0, + "step": 6130, + "text_loss": 0.4464457631111145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00041437849173762894, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9890711.0, + "repeat_count": 0.0, + "routers_loss": 0.0008515548543073237, + "skip_count": 0.0, + "step": 6132, + "text_loss": 0.5012133717536926 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0004140735635686251, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9894458.0, + "repeat_count": 1.0, + "routers_loss": 0.001084602321498096, + "skip_count": 0.0, + "step": 6134, + "text_loss": 0.32015663385391235 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004137686683267938, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 9897634.0, + "repeat_count": 0.0, + "routers_loss": 0.0025203595869243145, + "skip_count": 0.0, + "step": 6136, + "text_loss": 0.15804508328437805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0004134638061289715, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9901157.0, + "repeat_count": 0.0, + "routers_loss": 0.0029381231870502234, + "skip_count": 0.0, + "step": 6138, + "text_loss": 0.14375236630439758 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0004131589770919819, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 9903958.0, + "repeat_count": 0.0, + "routers_loss": 0.002789110178127885, + "skip_count": 0.0, + "step": 6140, + "text_loss": 0.2474033683538437 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004128541813326361, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 9906799.0, + "repeat_count": 2.0, + "routers_loss": 0.010770512744784355, + "skip_count": 3.0, + "step": 6142, + "text_loss": 0.2304249256849289 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0004125494189677325, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 9909286.0, + "repeat_count": 1.0, + "routers_loss": 0.003122122259810567, + "skip_count": 0.0, + "step": 6144, + "text_loss": 0.3781827688217163 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.00041224469011405643, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 9912416.0, + "repeat_count": 1.0, + "routers_loss": 0.008443298749625683, + "skip_count": 1.0, + "step": 6146, + "text_loss": 0.3004767596721649 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0004119399948883806, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9915290.0, + "repeat_count": 0.0, + "routers_loss": 0.0033219947945326567, + "skip_count": 1.0, + "step": 6148, + "text_loss": 0.748744547367096 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0004116353334074647, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 9918493.0, + "repeat_count": 1.0, + "routers_loss": 0.005501769948750734, + "skip_count": 0.0, + "step": 6150, + "text_loss": 0.330759733915329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.000411330705788056, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 9921027.0, + "repeat_count": 0.0, + "routers_loss": 0.0013694261433556676, + "skip_count": 0.0, + "step": 6152, + "text_loss": 0.43070924282073975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0164794921875, + "learning_rate": 0.000411026112146888, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9924303.0, + "repeat_count": 0.0, + "routers_loss": 0.00046192589798010886, + "skip_count": 0.0, + "step": 6154, + "text_loss": 0.5674887895584106 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004107215526006817, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9927065.0, + "repeat_count": 1.0, + "routers_loss": 0.004311304073780775, + "skip_count": 0.0, + "step": 6156, + "text_loss": 0.16138267517089844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004104170272661449, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9930713.0, + "repeat_count": 0.0, + "routers_loss": 0.0035845425445586443, + "skip_count": 0.0, + "step": 6158, + "text_loss": 0.18728356063365936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00041011253625997227, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 9934393.0, + "repeat_count": 0.0, + "routers_loss": 0.00247366214171052, + "skip_count": 0.0, + "step": 6160, + "text_loss": 0.3624019920825958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0004098080796988452, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 9937457.0, + "repeat_count": 0.0, + "routers_loss": 0.003240241203457117, + "skip_count": 0.0, + "step": 6162, + "text_loss": 0.12348521500825882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.0004095036576994321, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 9940523.0, + "repeat_count": 0.0, + "routers_loss": 0.001985874492675066, + "skip_count": 1.0, + "step": 6164, + "text_loss": 0.2688066363334656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 28.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.00040919927037838815, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9943802.0, + "repeat_count": 0.0, + "routers_loss": 0.004264154937118292, + "skip_count": 3.0, + "step": 6166, + "text_loss": 0.49316367506980896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.00040889491785235513, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 9946649.0, + "repeat_count": 0.0, + "routers_loss": 0.002545441733673215, + "skip_count": 0.0, + "step": 6168, + "text_loss": 0.4079313576221466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004085906002379614, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9949800.0, + "repeat_count": 0.0, + "routers_loss": 0.0009590961271896958, + "skip_count": 0.0, + "step": 6170, + "text_loss": 0.6166561245918274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004082863176518221, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9954008.0, + "repeat_count": 0.0, + "routers_loss": 0.003795337164774537, + "skip_count": 2.0, + "step": 6172, + "text_loss": 0.4791361689567566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0004079820702105388, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 9957153.0, + "repeat_count": 0.0, + "routers_loss": 0.0015634822193533182, + "skip_count": 0.0, + "step": 6174, + "text_loss": 0.7208777666091919 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.995597299677137, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004076778580306999, + "loss": 0.0056, + "macro_f1": 0.8820862174034119, + "num_tokens": 9960060.0, + "repeat_count": 2.0, + "routers_loss": 0.03223998099565506, + "skip_count": 2.0, + "step": 6176, + "text_loss": 0.6617992520332336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00040737368122887983, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9963396.0, + "repeat_count": 0.0, + "routers_loss": 0.0033978577703237534, + "skip_count": 0.0, + "step": 6178, + "text_loss": 0.7339215278625488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00040706953992164, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9966364.0, + "repeat_count": 0.0, + "routers_loss": 0.0005358994239941239, + "skip_count": 0.0, + "step": 6180, + "text_loss": 0.44187214970588684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00040676543422552767, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9969813.0, + "repeat_count": 0.0, + "routers_loss": 0.0018544091144576669, + "skip_count": 1.0, + "step": 6182, + "text_loss": 0.6244927048683167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004064613642570769, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9973015.0, + "repeat_count": 0.0, + "routers_loss": 0.005692692007869482, + "skip_count": 0.0, + "step": 6184, + "text_loss": 0.18860043585300446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00040615733013280784, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 9976201.0, + "repeat_count": 0.0, + "routers_loss": 0.0018737476784735918, + "skip_count": 0.0, + "step": 6186, + "text_loss": 0.21189232170581818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00040585333196922687, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9979711.0, + "repeat_count": 0.0, + "routers_loss": 0.011945146135985851, + "skip_count": 2.0, + "step": 6188, + "text_loss": 0.2628154456615448 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00040554936988282663, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9983003.0, + "repeat_count": 0.0, + "routers_loss": 0.0036045778542757034, + "skip_count": 1.0, + "step": 6190, + "text_loss": 0.5926038026809692 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0004052454439900861, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9986841.0, + "repeat_count": 0.0, + "routers_loss": 0.004170368425548077, + "skip_count": 0.0, + "step": 6192, + "text_loss": 0.3088737726211548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00040494155440747015, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9989596.0, + "repeat_count": 0.0, + "routers_loss": 0.002254750579595566, + "skip_count": 2.0, + "step": 6194, + "text_loss": 0.6309700012207031 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.089228059876724, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00040463770125142987, + "loss": 0.0087, + "macro_f1": 0.8814815282821655, + "num_tokens": 9992789.0, + "repeat_count": 2.0, + "routers_loss": 0.04092822223901749, + "skip_count": 4.0, + "step": 6196, + "text_loss": 0.09625697880983353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00040433388463840213, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 9995782.0, + "repeat_count": 0.0, + "routers_loss": 0.00029065192211419344, + "skip_count": 0.0, + "step": 6198, + "text_loss": 0.5600258111953735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0004040301046848105, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 9998712.0, + "repeat_count": 0.0, + "routers_loss": 0.0005865268758498132, + "skip_count": 0.0, + "step": 6200, + "text_loss": 0.6426429748535156 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 29.11740534194306, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0283203125, + "learning_rate": 0.0004037263615070638, + "loss": 0.0078, + "macro_f1": 0.9265305995941162, + "num_tokens": 10002020.0, + "repeat_count": 1.0, + "routers_loss": 0.025357060134410858, + "skip_count": 3.0, + "step": 6202, + "text_loss": 0.25125735998153687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000403422655221557, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10005381.0, + "repeat_count": 0.0, + "routers_loss": 0.003139561740681529, + "skip_count": 1.0, + "step": 6204, + "text_loss": 0.3639419376850128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00040311898594467085, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 10008348.0, + "repeat_count": 0.0, + "routers_loss": 0.004091196693480015, + "skip_count": 2.0, + "step": 6206, + "text_loss": 0.1602363884449005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00040281535379277204, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10011171.0, + "repeat_count": 0.0, + "routers_loss": 0.005771483760327101, + "skip_count": 0.0, + "step": 6208, + "text_loss": 0.5593504905700684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.000402511758882213, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10014374.0, + "repeat_count": 0.0, + "routers_loss": 0.005212264601141214, + "skip_count": 1.0, + "step": 6210, + "text_loss": 0.15668229758739471 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0004022082013293319, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 10017327.0, + "repeat_count": 0.0, + "routers_loss": 0.0027585842180997133, + "skip_count": 1.0, + "step": 6212, + "text_loss": 0.21188466250896454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.173759906075727, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00040190468125045255, + "loss": 0.0061, + "macro_f1": 0.3272727429866791, + "num_tokens": 10020518.0, + "repeat_count": 0.0, + "routers_loss": 0.013210589066147804, + "skip_count": 1.0, + "step": 6214, + "text_loss": 0.2551073729991913 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 29.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.00040160119876188436, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10023799.0, + "repeat_count": 1.0, + "routers_loss": 0.001590219559147954, + "skip_count": 0.0, + "step": 6216, + "text_loss": 0.5634782314300537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0004012977539799224, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 10027107.0, + "repeat_count": 0.0, + "routers_loss": 0.003917343448847532, + "skip_count": 0.0, + "step": 6218, + "text_loss": 0.6412819027900696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0004009943470208473, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 10030460.0, + "repeat_count": 0.0, + "routers_loss": 0.00874288845807314, + "skip_count": 2.0, + "step": 6220, + "text_loss": 0.13269923627376556 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.211329615497505, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.000400690978000925, + "loss": 0.0075, + "macro_f1": 0.8817967176437378, + "num_tokens": 10034086.0, + "repeat_count": 2.0, + "routers_loss": 0.03736349940299988, + "skip_count": 3.0, + "step": 6222, + "text_loss": 0.4956454336643219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004003876470364075, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10037312.0, + "repeat_count": 0.0, + "routers_loss": 0.008481289260089397, + "skip_count": 2.0, + "step": 6224, + "text_loss": 0.2148810178041458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0152587890625, + "learning_rate": 0.0004000843542435315, + "loss": 0.0028, + "macro_f1": 0.3333333432674408, + "num_tokens": 10040393.0, + "repeat_count": 0.0, + "routers_loss": 0.002235144842416048, + "skip_count": 0.0, + "step": 6226, + "text_loss": 0.17645306885242462 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 29.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0003997810997385195, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 10044386.0, + "repeat_count": 1.0, + "routers_loss": 0.004541373811662197, + "skip_count": 0.0, + "step": 6228, + "text_loss": 0.5098661184310913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00039947788363757915, + "loss": 0.0088, + "macro_f1": 0.6666666865348816, + "num_tokens": 10049046.0, + "repeat_count": 0.0, + "routers_loss": 0.0019183673430234194, + "skip_count": 1.0, + "step": 6230, + "text_loss": 0.6953724026679993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00039917470605690334, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 10051787.0, + "repeat_count": 2.0, + "routers_loss": 0.0032311067916452885, + "skip_count": 4.0, + "step": 6232, + "text_loss": 0.475127637386322 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 29.267684179630173, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00039887156711267043, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 10055396.0, + "repeat_count": 2.0, + "routers_loss": 0.03247373178601265, + "skip_count": 0.0, + "step": 6234, + "text_loss": 0.4239100515842438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.00039856846692104363, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10058395.0, + "repeat_count": 0.0, + "routers_loss": 0.006287421099841595, + "skip_count": 3.0, + "step": 6236, + "text_loss": 0.24084535241127014 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.016357421875, + "learning_rate": 0.0003982654055981718, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10061302.0, + "repeat_count": 1.0, + "routers_loss": 0.0008686117362231016, + "skip_count": 1.0, + "step": 6238, + "text_loss": 0.4740419089794159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0003979623832601884, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10065318.0, + "repeat_count": 0.0, + "routers_loss": 0.0037686119321733713, + "skip_count": 2.0, + "step": 6240, + "text_loss": 0.43965795636177063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0003976594000232123, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10068291.0, + "repeat_count": 0.0, + "routers_loss": 0.005804901942610741, + "skip_count": 0.0, + "step": 6242, + "text_loss": 0.24424348771572113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.00039735645600334714, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 10071645.0, + "repeat_count": 0.0, + "routers_loss": 0.002001055981963873, + "skip_count": 1.0, + "step": 6244, + "text_loss": 0.6524377465248108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0003970535513166815, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 10075136.0, + "repeat_count": 0.0, + "routers_loss": 0.001252001617103815, + "skip_count": 0.0, + "step": 6246, + "text_loss": 0.22803714871406555 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0003967506860792893, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 10078230.0, + "repeat_count": 0.0, + "routers_loss": 0.004913780372589827, + "skip_count": 1.0, + "step": 6248, + "text_loss": 0.9835516214370728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.000396447860407229, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 10080852.0, + "repeat_count": 0.0, + "routers_loss": 0.0037437966093420982, + "skip_count": 2.0, + "step": 6250, + "text_loss": 0.4021640121936798 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.00039614507441654393, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10084139.0, + "repeat_count": 0.0, + "routers_loss": 0.005433002021163702, + "skip_count": 2.0, + "step": 6252, + "text_loss": 0.23060470819473267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00039584232822326224, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10088501.0, + "repeat_count": 0.0, + "routers_loss": 0.0007705377647653222, + "skip_count": 0.0, + "step": 6254, + "text_loss": 0.5994830131530762 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0003955396219433969, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10091506.0, + "repeat_count": 0.0, + "routers_loss": 0.0012310115853324533, + "skip_count": 0.0, + "step": 6256, + "text_loss": 0.4639038145542145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0003952369556929455, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10096236.0, + "repeat_count": 0.0, + "routers_loss": 0.008964627049863338, + "skip_count": 2.0, + "step": 6258, + "text_loss": 0.24845287203788757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003949343295878903, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 10099213.0, + "repeat_count": 0.0, + "routers_loss": 0.0033088945783674717, + "skip_count": 0.0, + "step": 6260, + "text_loss": 0.6527073979377747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 29.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00039463174374419817, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10103160.0, + "repeat_count": 2.0, + "routers_loss": 0.003462672932073474, + "skip_count": 1.0, + "step": 6262, + "text_loss": 0.4209299683570862 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00039432919827782066, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 10105881.0, + "repeat_count": 2.0, + "routers_loss": 0.0027124532498419285, + "skip_count": 2.0, + "step": 6264, + "text_loss": 0.4442266821861267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00039402669330469367, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 10108596.0, + "repeat_count": 0.0, + "routers_loss": 0.005055282264947891, + "skip_count": 2.0, + "step": 6266, + "text_loss": 0.3331456780433655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00039372422894073765, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10111673.0, + "repeat_count": 0.0, + "routers_loss": 0.0009340311517007649, + "skip_count": 0.0, + "step": 6268, + "text_loss": 0.7664456367492676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.00039342180530185745, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10116141.0, + "repeat_count": 0.0, + "routers_loss": 0.00032052272581495345, + "skip_count": 0.0, + "step": 6270, + "text_loss": 0.47610244154930115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00039311942250394274, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 10119151.0, + "repeat_count": 0.0, + "routers_loss": 0.0015820999396964908, + "skip_count": 0.0, + "step": 6272, + "text_loss": 0.3815282881259918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0003928170806628669, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10122684.0, + "repeat_count": 0.0, + "routers_loss": 0.0007423736387863755, + "skip_count": 0.0, + "step": 6274, + "text_loss": 0.4630914628505707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00039251477989448797, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10126751.0, + "repeat_count": 0.0, + "routers_loss": 0.0006216703332029283, + "skip_count": 0.0, + "step": 6276, + "text_loss": 0.4342454671859741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.00039221252031464816, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10129784.0, + "repeat_count": 0.0, + "routers_loss": 0.004239698871970177, + "skip_count": 3.0, + "step": 6278, + "text_loss": 0.24661089479923248 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 29.4837100088054, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0003919103020391738, + "loss": 0.006, + "macro_f1": 0.8803418874740601, + "num_tokens": 10133066.0, + "repeat_count": 2.0, + "routers_loss": 0.027879100292921066, + "skip_count": 7.0, + "step": 6280, + "text_loss": 0.4705188274383545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00039160812518387574, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 10136860.0, + "repeat_count": 0.0, + "routers_loss": 0.002533538034185767, + "skip_count": 0.0, + "step": 6282, + "text_loss": 0.1953880786895752 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00039130598986454845, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 10140066.0, + "repeat_count": 1.0, + "routers_loss": 0.002462630858644843, + "skip_count": 2.0, + "step": 6284, + "text_loss": 0.378487765789032 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.000391003896196971, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 10143646.0, + "repeat_count": 1.0, + "routers_loss": 0.011922914534807205, + "skip_count": 1.0, + "step": 6286, + "text_loss": 0.2467316836118698 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00039070184429690607, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 10146507.0, + "repeat_count": 1.0, + "routers_loss": 0.0059767309576272964, + "skip_count": 1.0, + "step": 6288, + "text_loss": 0.9603674411773682 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0003903998342801006, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10149301.0, + "repeat_count": 1.0, + "routers_loss": 0.0030056277755647898, + "skip_count": 2.0, + "step": 6290, + "text_loss": 0.36631715297698975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00039009786626228543, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 10152158.0, + "repeat_count": 0.0, + "routers_loss": 0.005298118572682142, + "skip_count": 3.0, + "step": 6292, + "text_loss": 0.2876455783843994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003897959403591751, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 10155852.0, + "repeat_count": 0.0, + "routers_loss": 0.004937763791531324, + "skip_count": 2.0, + "step": 6294, + "text_loss": 0.14649681746959686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0003894940566864683, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 10159164.0, + "repeat_count": 0.0, + "routers_loss": 0.0021474575623869896, + "skip_count": 0.0, + "step": 6296, + "text_loss": 0.5694304704666138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 29.568241855004402, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08251953125, + "learning_rate": 0.00038919221535984753, + "loss": 0.0073, + "macro_f1": 0.875, + "num_tokens": 10161806.0, + "repeat_count": 1.0, + "routers_loss": 0.040340203791856766, + "skip_count": 3.0, + "step": 6298, + "text_loss": 0.1574537754058838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.00038889041649497894, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10165669.0, + "repeat_count": 0.0, + "routers_loss": 0.0028486931696534157, + "skip_count": 0.0, + "step": 6300, + "text_loss": 0.9158071279525757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0003885886602075123, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10168945.0, + "repeat_count": 0.0, + "routers_loss": 0.006565484683960676, + "skip_count": 2.0, + "step": 6302, + "text_loss": 0.3530846834182739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.00038828694661308116, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10171914.0, + "repeat_count": 0.0, + "routers_loss": 0.0009084723424166441, + "skip_count": 0.0, + "step": 6304, + "text_loss": 0.4603337347507477 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0003879852758273029, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 10175737.0, + "repeat_count": 1.0, + "routers_loss": 0.004121702630072832, + "skip_count": 2.0, + "step": 6306, + "text_loss": 0.5294032096862793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00038768364796577814, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10178543.0, + "repeat_count": 0.0, + "routers_loss": 0.0013208909658715129, + "skip_count": 0.0, + "step": 6308, + "text_loss": 0.41084006428718567 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 29.62459641913707, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00038738206314409144, + "loss": 0.0079, + "macro_f1": 0.9247862696647644, + "num_tokens": 10181880.0, + "repeat_count": 3.0, + "routers_loss": 0.03674180060625076, + "skip_count": 6.0, + "step": 6310, + "text_loss": 0.6920746564865112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0003870805214778106, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10185173.0, + "repeat_count": 0.0, + "routers_loss": 0.00221974472515285, + "skip_count": 2.0, + "step": 6312, + "text_loss": 0.1376657634973526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0003867790230824869, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10188642.0, + "repeat_count": 0.0, + "routers_loss": 0.001809283159673214, + "skip_count": 0.0, + "step": 6314, + "text_loss": 0.5220870971679688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0003864775680736552, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 10191750.0, + "repeat_count": 0.0, + "routers_loss": 0.0013956360053271055, + "skip_count": 0.0, + "step": 6316, + "text_loss": 0.4109838902950287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00038617615656683356, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 10194578.0, + "repeat_count": 0.0, + "routers_loss": 0.002947692759335041, + "skip_count": 2.0, + "step": 6318, + "text_loss": 0.4818590581417084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0003858747886775232, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10197131.0, + "repeat_count": 0.0, + "routers_loss": 0.0008140999125316739, + "skip_count": 2.0, + "step": 6320, + "text_loss": 0.4004709720611572 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.68095098326974, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0003855734645212093, + "loss": 0.0089, + "macro_f1": 0.8820862174034119, + "num_tokens": 10199965.0, + "repeat_count": 2.0, + "routers_loss": 0.013056626543402672, + "skip_count": 2.0, + "step": 6322, + "text_loss": 0.3367139995098114 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00038527218421335977, + "loss": 0.0087, + "macro_f1": 1.0, + "num_tokens": 10203184.0, + "repeat_count": 1.0, + "routers_loss": 0.0038112467154860497, + "skip_count": 2.0, + "step": 6324, + "text_loss": 0.5747989416122437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0003849709478694255, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 10206436.0, + "repeat_count": 0.0, + "routers_loss": 0.001232540002092719, + "skip_count": 0.0, + "step": 6326, + "text_loss": 0.4981732964515686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00038466975560484115, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 10209889.0, + "repeat_count": 0.0, + "routers_loss": 0.004343799781054258, + "skip_count": 0.0, + "step": 6328, + "text_loss": 0.2160186469554901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.000384368607535024, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10212520.0, + "repeat_count": 0.0, + "routers_loss": 0.0014161963481456041, + "skip_count": 1.0, + "step": 6330, + "text_loss": 0.3556232154369354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0003840675037753745, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10215456.0, + "repeat_count": 0.0, + "routers_loss": 0.0014989010524004698, + "skip_count": 0.0, + "step": 6332, + "text_loss": 0.8510926961898804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003837664444412762, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10218558.0, + "repeat_count": 0.0, + "routers_loss": 0.006702739745378494, + "skip_count": 0.0, + "step": 6334, + "text_loss": 0.3995226323604584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0003834654296480958, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10221862.0, + "repeat_count": 0.0, + "routers_loss": 0.00826781615614891, + "skip_count": 2.0, + "step": 6336, + "text_loss": 0.3534671664237976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0003831644595111825, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10224820.0, + "repeat_count": 0.0, + "routers_loss": 0.002143894787877798, + "skip_count": 0.0, + "step": 6338, + "text_loss": 0.20216144621372223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 29.76548282946874, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04736328125, + "learning_rate": 0.0003828635341458687, + "loss": 0.0064, + "macro_f1": 0.5492662787437439, + "num_tokens": 10227479.0, + "repeat_count": 0.0, + "routers_loss": 0.012319118715822697, + "skip_count": 2.0, + "step": 6340, + "text_loss": 0.26248639822006226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0003825626536674697, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10231347.0, + "repeat_count": 0.0, + "routers_loss": 0.00334449321962893, + "skip_count": 0.0, + "step": 6342, + "text_loss": 0.6357201337814331 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.000382261818191283, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10234347.0, + "repeat_count": 0.0, + "routers_loss": 0.0027788348961621523, + "skip_count": 0.0, + "step": 6344, + "text_loss": 0.2813846468925476 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00038196102783258996, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 10237105.0, + "repeat_count": 0.0, + "routers_loss": 0.001545077539049089, + "skip_count": 0.0, + "step": 6346, + "text_loss": 0.47612661123275757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0003816602827066537, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 10240249.0, + "repeat_count": 0.0, + "routers_loss": 0.005602670833468437, + "skip_count": 2.0, + "step": 6348, + "text_loss": 0.18197228014469147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0003813595829287204, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 10243417.0, + "repeat_count": 0.0, + "routers_loss": 0.0004317959537729621, + "skip_count": 0.0, + "step": 6350, + "text_loss": 0.3818575143814087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.0003810589286140186, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 10246824.0, + "repeat_count": 0.0, + "routers_loss": 0.002225276781246066, + "skip_count": 0.0, + "step": 6352, + "text_loss": 0.14129821956157684 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 29.831229820956853, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0003807583198777599, + "loss": 0.0062, + "macro_f1": 0.9265305995941162, + "num_tokens": 10249836.0, + "repeat_count": 3.0, + "routers_loss": 0.02445496805012226, + "skip_count": 1.0, + "step": 6354, + "text_loss": 0.3237064480781555 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00038045775683513786, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10252900.0, + "repeat_count": 0.0, + "routers_loss": 0.0009264222462661564, + "skip_count": 0.0, + "step": 6356, + "text_loss": 0.6777551174163818 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 29.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0003801572396013289, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 10255526.0, + "repeat_count": 1.0, + "routers_loss": 0.007189550437033176, + "skip_count": 5.0, + "step": 6358, + "text_loss": 0.25438982248306274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00037985676829149187, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10258865.0, + "repeat_count": 0.0, + "routers_loss": 0.0014201018493622541, + "skip_count": 0.0, + "step": 6360, + "text_loss": 0.5063154101371765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0003795563430207678, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 10261677.0, + "repeat_count": 0.0, + "routers_loss": 0.0035477925557643175, + "skip_count": 3.0, + "step": 6362, + "text_loss": 0.4815357029438019 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.878191957734078, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0003792559639042803, + "loss": 0.0049, + "macro_f1": 0.3272727429866791, + "num_tokens": 10264805.0, + "repeat_count": 0.0, + "routers_loss": 0.013723359443247318, + "skip_count": 1.0, + "step": 6364, + "text_loss": 0.5563676357269287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0003789556310571351, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 10267885.0, + "repeat_count": 0.0, + "routers_loss": 0.0028159532230347395, + "skip_count": 0.0, + "step": 6366, + "text_loss": 0.7284183502197266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0003786553445944204, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10270934.0, + "repeat_count": 0.0, + "routers_loss": 0.0005918835522606969, + "skip_count": 0.0, + "step": 6368, + "text_loss": 0.7387746572494507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0003783551046312067, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 10273818.0, + "repeat_count": 0.0, + "routers_loss": 0.0011416864581406116, + "skip_count": 0.0, + "step": 6370, + "text_loss": 0.5360285043716431 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 29.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.00037805491128254645, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 10276494.0, + "repeat_count": 2.0, + "routers_loss": 0.002382483799010515, + "skip_count": 1.0, + "step": 6372, + "text_loss": 0.7536854147911072 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.00037775476466347414, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 10279719.0, + "repeat_count": 0.0, + "routers_loss": 0.0021104486659169197, + "skip_count": 1.0, + "step": 6374, + "text_loss": 0.6807253956794739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0003774546648890066, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 10283000.0, + "repeat_count": 0.0, + "routers_loss": 0.003148776013404131, + "skip_count": 2.0, + "step": 6376, + "text_loss": 0.30774110555648804 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0003771546120741426, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 10285666.0, + "repeat_count": 1.0, + "routers_loss": 0.007700880523771048, + "skip_count": 1.0, + "step": 6378, + "text_loss": 0.4476076364517212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003768546063338631, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10289127.0, + "repeat_count": 0.0, + "routers_loss": 0.0023625255562365055, + "skip_count": 1.0, + "step": 6380, + "text_loss": 0.4350969195365906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.0003765546477831307, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10292485.0, + "repeat_count": 0.0, + "routers_loss": 0.001428726245649159, + "skip_count": 0.0, + "step": 6382, + "text_loss": 0.49078530073165894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0003762547365368902, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 10295361.0, + "repeat_count": 0.0, + "routers_loss": 0.0027160397730767727, + "skip_count": 2.0, + "step": 6384, + "text_loss": 0.3476370573043823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00037595487271006807, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10298717.0, + "repeat_count": 0.0, + "routers_loss": 0.002456068294122815, + "skip_count": 0.0, + "step": 6386, + "text_loss": 0.3634916841983795 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 29.99090108599941, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.021240234375, + "learning_rate": 0.0003756550564175727, + "loss": 0.0049, + "macro_f1": 0.9265305995941162, + "num_tokens": 10302102.0, + "repeat_count": 1.0, + "routers_loss": 0.02546076290309429, + "skip_count": 3.0, + "step": 6388, + "text_loss": 0.2422582060098648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00037535528777429426, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10305060.0, + "repeat_count": 0.0, + "routers_loss": 0.001045907847583294, + "skip_count": 0.0, + "step": 6390, + "text_loss": 0.5563194155693054 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0003750555668951045, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 10307903.0, + "repeat_count": 1.0, + "routers_loss": 0.007391332648694515, + "skip_count": 2.0, + "step": 6392, + "text_loss": 0.3423991799354553 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00037475589389485744, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 10311396.0, + "repeat_count": 1.0, + "routers_loss": 0.0029360291082412004, + "skip_count": 1.0, + "step": 6394, + "text_loss": 0.9877024292945862 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00037445626888838807, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10314250.0, + "repeat_count": 0.0, + "routers_loss": 0.0014932662015780807, + "skip_count": 0.0, + "step": 6396, + "text_loss": 0.3978523313999176 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 30.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003741566919905133, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 10316894.0, + "repeat_count": 1.0, + "routers_loss": 0.007003722712397575, + "skip_count": 5.0, + "step": 6398, + "text_loss": 0.2945566475391388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00037385716331603155, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 10319603.0, + "repeat_count": 1.0, + "routers_loss": 0.006710570305585861, + "skip_count": 1.0, + "step": 6400, + "text_loss": 0.2984389662742615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.00037355768297972275, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 10322670.0, + "repeat_count": 0.0, + "routers_loss": 0.00048738415353000164, + "skip_count": 0.0, + "step": 6402, + "text_loss": 0.483262300491333 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00037325825109634837, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 10326280.0, + "repeat_count": 1.0, + "routers_loss": 0.001625525183044374, + "skip_count": 1.0, + "step": 6404, + "text_loss": 0.42678722739219666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0003729588677806513, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10329008.0, + "repeat_count": 0.0, + "routers_loss": 0.004408636130392551, + "skip_count": 0.0, + "step": 6406, + "text_loss": 0.2264070063829422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0003726595331473557, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 10332533.0, + "repeat_count": 0.0, + "routers_loss": 0.0038099216762930155, + "skip_count": 2.0, + "step": 6408, + "text_loss": 0.6670092940330505 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0003723602473111672, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10335643.0, + "repeat_count": 1.0, + "routers_loss": 0.003097689710557461, + "skip_count": 0.0, + "step": 6410, + "text_loss": 0.45228812098503113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.00037206101038677274, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 10338522.0, + "repeat_count": 0.0, + "routers_loss": 0.005268602631986141, + "skip_count": 1.0, + "step": 6412, + "text_loss": 0.7288079857826233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0003717618224888405, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 10341516.0, + "repeat_count": 0.0, + "routers_loss": 0.004640138708055019, + "skip_count": 2.0, + "step": 6414, + "text_loss": 0.22850871086120605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00037146268373201954, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10344831.0, + "repeat_count": 0.0, + "routers_loss": 0.0006379318656399846, + "skip_count": 0.0, + "step": 6416, + "text_loss": 0.7864460945129395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0003711635942309408, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 10348499.0, + "repeat_count": 0.0, + "routers_loss": 0.0004005273221991956, + "skip_count": 0.0, + "step": 6418, + "text_loss": 0.605839192867279 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0157470703125, + "learning_rate": 0.0003708645541002159, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10351722.0, + "repeat_count": 0.0, + "routers_loss": 0.001061634044162929, + "skip_count": 0.0, + "step": 6420, + "text_loss": 0.8226510286331177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 30.150278837687114, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0003705655634544374, + "loss": 0.0052, + "macro_f1": 0.5492662787437439, + "num_tokens": 10355275.0, + "repeat_count": 0.0, + "routers_loss": 0.013980664312839508, + "skip_count": 2.0, + "step": 6422, + "text_loss": 0.2709597647190094 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0003702666224081792, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10359702.0, + "repeat_count": 1.0, + "routers_loss": 0.0013196271611377597, + "skip_count": 0.0, + "step": 6424, + "text_loss": 0.6451483368873596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00036996773107599604, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10363364.0, + "repeat_count": 0.0, + "routers_loss": 0.0028023163322359324, + "skip_count": 1.0, + "step": 6426, + "text_loss": 0.2770799398422241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01373291015625, + "learning_rate": 0.0003696688895724235, + "loss": 0.0029, + "macro_f1": 0.3333333432674408, + "num_tokens": 10366554.0, + "repeat_count": 0.0, + "routers_loss": 0.0011023655533790588, + "skip_count": 0.0, + "step": 6428, + "text_loss": 0.5466503500938416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0003693700980119784, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10369733.0, + "repeat_count": 0.0, + "routers_loss": 0.00230707717128098, + "skip_count": 0.0, + "step": 6430, + "text_loss": 0.45667049288749695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00036907135650915824, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 10373382.0, + "repeat_count": 0.0, + "routers_loss": 0.0036784098483622074, + "skip_count": 2.0, + "step": 6432, + "text_loss": 0.13856995105743408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00036877266517844115, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 10376202.0, + "repeat_count": 0.0, + "routers_loss": 0.0008461157558485866, + "skip_count": 0.0, + "step": 6434, + "text_loss": 0.27238601446151733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.0003684740241342863, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 10380748.0, + "repeat_count": 0.0, + "routers_loss": 0.0052765593864023685, + "skip_count": 0.0, + "step": 6436, + "text_loss": 0.6182295083999634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.00036817543349113355, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 10386148.0, + "repeat_count": 1.0, + "routers_loss": 0.005562922917306423, + "skip_count": 2.0, + "step": 6438, + "text_loss": 0.5591027140617371 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0003678768933634033, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10389385.0, + "repeat_count": 0.0, + "routers_loss": 0.0008686366491019726, + "skip_count": 0.0, + "step": 6440, + "text_loss": 0.5158660411834717 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0003675784038654968, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 10391893.0, + "repeat_count": 0.0, + "routers_loss": 0.0022222092375159264, + "skip_count": 1.0, + "step": 6442, + "text_loss": 0.2865697741508484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0003672799651117958, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 10395082.0, + "repeat_count": 0.0, + "routers_loss": 0.0030799773521721363, + "skip_count": 2.0, + "step": 6444, + "text_loss": 0.21298295259475708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003669815772166625, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10398015.0, + "repeat_count": 0.0, + "routers_loss": 0.0035721305757761, + "skip_count": 3.0, + "step": 6446, + "text_loss": 0.5286803841590881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 30.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.00036668324029443975, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 10400749.0, + "repeat_count": 0.0, + "routers_loss": 0.00741040613502264, + "skip_count": 4.0, + "step": 6448, + "text_loss": 0.3922366201877594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0003663849544594507, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 10404439.0, + "repeat_count": 0.0, + "routers_loss": 0.002974750241264701, + "skip_count": 2.0, + "step": 6450, + "text_loss": 0.21894219517707825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.00036608671982599927, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10408476.0, + "repeat_count": 0.0, + "routers_loss": 0.004810616374015808, + "skip_count": 0.0, + "step": 6452, + "text_loss": 0.3928622305393219 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0003657885365083694, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 10411533.0, + "repeat_count": 1.0, + "routers_loss": 0.005527745466679335, + "skip_count": 0.0, + "step": 6454, + "text_loss": 0.22816279530525208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.00036549040462082556, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 10414501.0, + "repeat_count": 0.0, + "routers_loss": 0.0021297158673405647, + "skip_count": 0.0, + "step": 6456, + "text_loss": 0.20487719774246216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 30.31934253008512, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003651923242776124, + "loss": 0.0082, + "macro_f1": 0.6592592597007751, + "num_tokens": 10418296.0, + "repeat_count": 1.0, + "routers_loss": 0.046412210911512375, + "skip_count": 5.0, + "step": 6458, + "text_loss": 0.2890419065952301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00036489429559295484, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10421211.0, + "repeat_count": 0.0, + "routers_loss": 0.004002603702247143, + "skip_count": 0.0, + "step": 6460, + "text_loss": 0.23165544867515564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003645963186810581, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 10424231.0, + "repeat_count": 0.0, + "routers_loss": 0.003480088198557496, + "skip_count": 1.0, + "step": 6462, + "text_loss": 0.6286683082580566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003642983936561075, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 10427387.0, + "repeat_count": 0.0, + "routers_loss": 0.009358933195471764, + "skip_count": 2.0, + "step": 6464, + "text_loss": 0.3258316218852997 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.356912239506897, + "f1_execute": 0.9729729890823364, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00036400052063226816, + "loss": 0.0048, + "macro_f1": 0.9539539813995361, + "num_tokens": 10430813.0, + "repeat_count": 5.0, + "routers_loss": 0.03567950055003166, + "skip_count": 5.0, + "step": 6466, + "text_loss": 0.7278715968132019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00036370269972368615, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 10434175.0, + "repeat_count": 1.0, + "routers_loss": 0.00226925453171134, + "skip_count": 2.0, + "step": 6468, + "text_loss": 0.5652450919151306 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0174560546875, + "learning_rate": 0.0003634049310444867, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10437393.0, + "repeat_count": 0.0, + "routers_loss": 0.0013644809368997812, + "skip_count": 0.0, + "step": 6470, + "text_loss": 0.5985191464424133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0003631072147087753, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 10440412.0, + "repeat_count": 0.0, + "routers_loss": 0.0003114990540780127, + "skip_count": 0.0, + "step": 6472, + "text_loss": 0.5588209629058838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00036280955083063747, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 10443471.0, + "repeat_count": 0.0, + "routers_loss": 0.0005486322334036231, + "skip_count": 0.0, + "step": 6474, + "text_loss": 0.6969016194343567 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00036251193952413865, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 10446548.0, + "repeat_count": 1.0, + "routers_loss": 0.008256378583610058, + "skip_count": 2.0, + "step": 6476, + "text_loss": 0.27083566784858704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0003622143809033239, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10449478.0, + "repeat_count": 0.0, + "routers_loss": 0.001008771825581789, + "skip_count": 0.0, + "step": 6478, + "text_loss": 0.1689433604478836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00036191687508221827, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 10453017.0, + "repeat_count": 1.0, + "routers_loss": 0.0014678959269076586, + "skip_count": 0.0, + "step": 6480, + "text_loss": 0.9571998715400696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0003616194221748267, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10456061.0, + "repeat_count": 0.0, + "routers_loss": 0.001516164978966117, + "skip_count": 0.0, + "step": 6482, + "text_loss": 0.5750429034233093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0003613220222951335, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10459130.0, + "repeat_count": 0.0, + "routers_loss": 0.0031315975356847048, + "skip_count": 0.0, + "step": 6484, + "text_loss": 0.47120073437690735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0003610246755571029, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10462190.0, + "repeat_count": 0.0, + "routers_loss": 0.0006079549202695489, + "skip_count": 0.0, + "step": 6486, + "text_loss": 0.8426173329353333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000360727382074679, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10465233.0, + "repeat_count": 0.0, + "routers_loss": 0.00596054969355464, + "skip_count": 0.0, + "step": 6488, + "text_loss": 0.18435880541801453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.469621367772234, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00036043014196178463, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 10468135.0, + "repeat_count": 0.0, + "routers_loss": 0.008584967814385891, + "skip_count": 1.0, + "step": 6490, + "text_loss": 0.3827758729457855 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.00036013295533232344, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10471032.0, + "repeat_count": 2.0, + "routers_loss": 0.005076571833342314, + "skip_count": 5.0, + "step": 6492, + "text_loss": 0.1215854063630104 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 30.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0003598358223001776, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 10474779.0, + "repeat_count": 3.0, + "routers_loss": 0.005972118582576513, + "skip_count": 0.0, + "step": 6494, + "text_loss": 0.22768665850162506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0003595387429792091, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 10478015.0, + "repeat_count": 0.0, + "routers_loss": 0.004733685404062271, + "skip_count": 1.0, + "step": 6496, + "text_loss": 0.5013535618782043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00035924171748325916, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10481113.0, + "repeat_count": 0.0, + "routers_loss": 0.01148980576545, + "skip_count": 2.0, + "step": 6498, + "text_loss": 0.3281762897968292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0003589447459261487, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10484049.0, + "repeat_count": 0.0, + "routers_loss": 0.007726775947958231, + "skip_count": 2.0, + "step": 6500, + "text_loss": 0.46294569969177246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00035864782842167763, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 10487443.0, + "repeat_count": 1.0, + "routers_loss": 0.0013331319205462933, + "skip_count": 0.0, + "step": 6502, + "text_loss": 0.5122153759002686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.00035835096508362544, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10490535.0, + "repeat_count": 0.0, + "routers_loss": 0.0011629529763013124, + "skip_count": 0.0, + "step": 6504, + "text_loss": 0.40683525800704956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00035805415602575054, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10493575.0, + "repeat_count": 0.0, + "routers_loss": 0.004780632443726063, + "skip_count": 0.0, + "step": 6506, + "text_loss": 0.37263134121894836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00035775740136179075, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10496193.0, + "repeat_count": 0.0, + "routers_loss": 0.0018355643842369318, + "skip_count": 0.0, + "step": 6508, + "text_loss": 0.2074306458234787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00035746070120546314, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10500135.0, + "repeat_count": 0.0, + "routers_loss": 0.004067617934197187, + "skip_count": 1.0, + "step": 6510, + "text_loss": 0.26313406229019165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00035716405567046383, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10503533.0, + "repeat_count": 0.0, + "routers_loss": 0.005438363179564476, + "skip_count": 0.0, + "step": 6512, + "text_loss": 0.3448122441768646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.00035686746487046767, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 10506207.0, + "repeat_count": 0.0, + "routers_loss": 0.0012895528925582767, + "skip_count": 0.0, + "step": 6514, + "text_loss": 0.43096476793289185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0003565709289191291, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10509257.0, + "repeat_count": 0.0, + "routers_loss": 0.003141741268336773, + "skip_count": 0.0, + "step": 6516, + "text_loss": 0.22349724173545837 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0003562744479300811, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10512554.0, + "repeat_count": 0.0, + "routers_loss": 0.0005669888923875988, + "skip_count": 0.0, + "step": 6518, + "text_loss": 0.5319190621376038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00035597802201693587, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10515720.0, + "repeat_count": 0.0, + "routers_loss": 0.0020814717281609774, + "skip_count": 0.0, + "step": 6520, + "text_loss": 0.20216144621372223 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0003556816512932841, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 10518517.0, + "repeat_count": 2.0, + "routers_loss": 0.010716461576521397, + "skip_count": 3.0, + "step": 6522, + "text_loss": 0.15843836963176727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.0003553853358726959, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 10521414.0, + "repeat_count": 0.0, + "routers_loss": 0.0014748790999874473, + "skip_count": 0.0, + "step": 6524, + "text_loss": 0.393892377614975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00035508907586871984, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10524210.0, + "repeat_count": 0.0, + "routers_loss": 0.0004757299611810595, + "skip_count": 0.0, + "step": 6526, + "text_loss": 0.2557907700538635 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00035479287139488327, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 10527327.0, + "repeat_count": 1.0, + "routers_loss": 0.002445317106321454, + "skip_count": 0.0, + "step": 6528, + "text_loss": 0.48338422179222107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0003544967225646922, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 10530363.0, + "repeat_count": 0.0, + "routers_loss": 0.0015845977468416095, + "skip_count": 0.0, + "step": 6530, + "text_loss": 0.6474354267120361 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.00035420062949163166, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 10533444.0, + "repeat_count": 0.0, + "routers_loss": 0.002190655330196023, + "skip_count": 0.0, + "step": 6532, + "text_loss": 0.3789777457714081 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0003539045922891649, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10536711.0, + "repeat_count": 0.0, + "routers_loss": 0.00317079434171319, + "skip_count": 0.0, + "step": 6534, + "text_loss": 0.25758084654808044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00035360861107073394, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 10539849.0, + "repeat_count": 0.0, + "routers_loss": 0.0010938458144664764, + "skip_count": 0.0, + "step": 6536, + "text_loss": 0.9821014404296875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0003533126859497592, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10543004.0, + "repeat_count": 0.0, + "routers_loss": 0.003071998478844762, + "skip_count": 2.0, + "step": 6538, + "text_loss": 0.6314182281494141 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0003530168170396401, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 10545965.0, + "repeat_count": 0.0, + "routers_loss": 0.006067665759474039, + "skip_count": 2.0, + "step": 6540, + "text_loss": 0.5021927356719971 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0167236328125, + "learning_rate": 0.000352721004453754, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 10549188.0, + "repeat_count": 0.0, + "routers_loss": 0.0019109295681118965, + "skip_count": 0.0, + "step": 6542, + "text_loss": 0.3008780777454376 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00035242524830545683, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10552298.0, + "repeat_count": 0.0, + "routers_loss": 0.007457790896296501, + "skip_count": 3.0, + "step": 6544, + "text_loss": 0.5675695538520813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0003521295487080829, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 10555123.0, + "repeat_count": 0.0, + "routers_loss": 0.007243642583489418, + "skip_count": 1.0, + "step": 6546, + "text_loss": 0.17955881357192993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00035183390577494476, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 10559653.0, + "repeat_count": 0.0, + "routers_loss": 0.004024330526590347, + "skip_count": 0.0, + "step": 6548, + "text_loss": 0.2634682357311249 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.017578125, + "learning_rate": 0.0003515383196193336, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10563770.0, + "repeat_count": 1.0, + "routers_loss": 0.010837121866643429, + "skip_count": 0.0, + "step": 6550, + "text_loss": 0.1608252227306366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0003512427903545183, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10567117.0, + "repeat_count": 0.0, + "routers_loss": 0.003473864868283272, + "skip_count": 0.0, + "step": 6552, + "text_loss": 0.231611430644989 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0003509473180937464, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10570622.0, + "repeat_count": 0.0, + "routers_loss": 0.004441239405423403, + "skip_count": 1.0, + "step": 6554, + "text_loss": 0.3193909227848053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0003506519029502433, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10573411.0, + "repeat_count": 0.0, + "routers_loss": 0.0008821079391054809, + "skip_count": 0.0, + "step": 6556, + "text_loss": 0.4478783905506134 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0003503565450372128, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 10576422.0, + "repeat_count": 1.0, + "routers_loss": 0.0014448441797867417, + "skip_count": 0.0, + "step": 6558, + "text_loss": 0.46065983176231384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0003500612444678365, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 10579879.0, + "repeat_count": 0.0, + "routers_loss": 0.007939066737890244, + "skip_count": 1.0, + "step": 6560, + "text_loss": 0.3299395740032196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000349766001355274, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 10583067.0, + "repeat_count": 0.0, + "routers_loss": 0.010073966346681118, + "skip_count": 2.0, + "step": 6562, + "text_loss": 0.278255820274353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.00034947081581266335, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 10586276.0, + "repeat_count": 0.0, + "routers_loss": 0.0062315030954778194, + "skip_count": 1.0, + "step": 6564, + "text_loss": 0.22706018388271332 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003491756879531201, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10589257.0, + "repeat_count": 3.0, + "routers_loss": 0.0023778853937983513, + "skip_count": 4.0, + "step": 6566, + "text_loss": 0.5567800998687744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0003488806178897377, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 10592163.0, + "repeat_count": 0.0, + "routers_loss": 0.0004184350254945457, + "skip_count": 0.0, + "step": 6568, + "text_loss": 0.4027897119522095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003485856057355876, + "loss": 0.0027, + "macro_f1": 0.6666666865348816, + "num_tokens": 10595326.0, + "repeat_count": 0.0, + "routers_loss": 0.0035254736430943012, + "skip_count": 1.0, + "step": 6570, + "text_loss": 0.3044572174549103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000348290651603719, + "loss": 0.0029, + "macro_f1": 0.3333333432674408, + "num_tokens": 10598236.0, + "repeat_count": 0.0, + "routers_loss": 0.0030894684605300426, + "skip_count": 0.0, + "step": 6572, + "text_loss": 0.23021161556243896 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.00034799575560715896, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 10601653.0, + "repeat_count": 1.0, + "routers_loss": 0.0036557347048074007, + "skip_count": 0.0, + "step": 6574, + "text_loss": 0.5437754392623901 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0003477009178589121, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10604581.0, + "repeat_count": 2.0, + "routers_loss": 0.021344119682908058, + "skip_count": 4.0, + "step": 6576, + "text_loss": 0.29078927636146545 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0003474061384719608, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10607676.0, + "repeat_count": 1.0, + "routers_loss": 0.0037169242277741432, + "skip_count": 1.0, + "step": 6578, + "text_loss": 1.1790896654129028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0003471114175592649, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 10611269.0, + "repeat_count": 2.0, + "routers_loss": 0.005873420741409063, + "skip_count": 4.0, + "step": 6580, + "text_loss": 0.36204129457473755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0003468167552337624, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 10614335.0, + "repeat_count": 1.0, + "routers_loss": 0.01030842587351799, + "skip_count": 2.0, + "step": 6582, + "text_loss": 0.20400437712669373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.00034652215160836826, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 10617565.0, + "repeat_count": 0.0, + "routers_loss": 0.0025721401907503605, + "skip_count": 0.0, + "step": 6584, + "text_loss": 0.44676345586776733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00034622760679597507, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10620706.0, + "repeat_count": 0.0, + "routers_loss": 0.005751762073487043, + "skip_count": 1.0, + "step": 6586, + "text_loss": 0.4733653664588928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00034593312090945306, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 10623916.0, + "repeat_count": 0.0, + "routers_loss": 0.0029759553726762533, + "skip_count": 3.0, + "step": 6588, + "text_loss": 0.49876922369003296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0003456386940616498, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10628093.0, + "repeat_count": 0.0, + "routers_loss": 0.0010031822603195906, + "skip_count": 0.0, + "step": 6590, + "text_loss": 0.42708611488342285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00034534432636539004, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10631739.0, + "repeat_count": 0.0, + "routers_loss": 0.0014793311711400747, + "skip_count": 0.0, + "step": 6592, + "text_loss": 0.18193726241588593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0003450500179334762, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10634862.0, + "repeat_count": 0.0, + "routers_loss": 0.0059733521193265915, + "skip_count": 2.0, + "step": 6594, + "text_loss": 0.28596529364585876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.967420017610802, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0003447557688786879, + "loss": 0.0043, + "macro_f1": 0.3272727429866791, + "num_tokens": 10637758.0, + "repeat_count": 0.0, + "routers_loss": 0.0076768649742007256, + "skip_count": 1.0, + "step": 6596, + "text_loss": 0.39428210258483887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00034446157931378185, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10640440.0, + "repeat_count": 0.0, + "routers_loss": 0.0015128811355680227, + "skip_count": 0.0, + "step": 6598, + "text_loss": 0.45584383606910706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.00034416744935149193, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 10643600.0, + "repeat_count": 0.0, + "routers_loss": 0.000757391273509711, + "skip_count": 0.0, + "step": 6600, + "text_loss": 0.503209114074707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0003438733791045294, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10646907.0, + "repeat_count": 0.0, + "routers_loss": 0.0025944956578314304, + "skip_count": 2.0, + "step": 6602, + "text_loss": 0.4370735287666321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00034357936868558255, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10649995.0, + "repeat_count": 0.0, + "routers_loss": 0.0006543452036567032, + "skip_count": 0.0, + "step": 6604, + "text_loss": 0.4125586748123169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00034328541820731663, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10653251.0, + "repeat_count": 0.0, + "routers_loss": 0.00027016724925488234, + "skip_count": 1.0, + "step": 6606, + "text_loss": 0.7309898734092712 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 31.023481068388612, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.020751953125, + "learning_rate": 0.00034299152778237413, + "loss": 0.0062, + "macro_f1": 0.8823530077934265, + "num_tokens": 10657229.0, + "repeat_count": 1.0, + "routers_loss": 0.01905548945069313, + "skip_count": 2.0, + "step": 6608, + "text_loss": 0.42367079854011536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0003426976975233744, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10660524.0, + "repeat_count": 0.0, + "routers_loss": 0.0004718089767266065, + "skip_count": 0.0, + "step": 6610, + "text_loss": 0.6613664627075195 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00034240392754291343, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 10663908.0, + "repeat_count": 1.0, + "routers_loss": 0.0027069442439824343, + "skip_count": 0.0, + "step": 6612, + "text_loss": 0.859471321105957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.000342110217953565, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10667814.0, + "repeat_count": 0.0, + "routers_loss": 0.0015497280983254313, + "skip_count": 0.0, + "step": 6614, + "text_loss": 0.18337638676166534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0003418165688678788, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10671630.0, + "repeat_count": 0.0, + "routers_loss": 0.0013396464055404067, + "skip_count": 0.0, + "step": 6616, + "text_loss": 0.860016405582428 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 31.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0003415229803983819, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 10675308.0, + "repeat_count": 0.0, + "routers_loss": 0.007542039267718792, + "skip_count": 3.0, + "step": 6618, + "text_loss": 0.15481022000312805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0003412294526575779, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 10678092.0, + "repeat_count": 0.0, + "routers_loss": 0.002029839437454939, + "skip_count": 2.0, + "step": 6620, + "text_loss": 0.5121933221817017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00034093598575794706, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10681382.0, + "repeat_count": 0.0, + "routers_loss": 0.0013001341139897704, + "skip_count": 0.0, + "step": 6622, + "text_loss": 0.4555061161518097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00034064257981194655, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 10684255.0, + "repeat_count": 0.0, + "routers_loss": 0.0007926415419206023, + "skip_count": 0.0, + "step": 6624, + "text_loss": 0.7298227548599243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003403492349320101, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 10686904.0, + "repeat_count": 0.0, + "routers_loss": 0.0021080176811665297, + "skip_count": 1.0, + "step": 6626, + "text_loss": 0.45434215664863586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.000340055951230548, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10690311.0, + "repeat_count": 0.0, + "routers_loss": 0.004011874087154865, + "skip_count": 0.0, + "step": 6628, + "text_loss": 0.15496443212032318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00033976272881994707, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10693395.0, + "repeat_count": 0.0, + "routers_loss": 0.0031893099658191204, + "skip_count": 2.0, + "step": 6630, + "text_loss": 0.5291517972946167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003394695678125708, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 10697046.0, + "repeat_count": 0.0, + "routers_loss": 0.0033124347683042288, + "skip_count": 1.0, + "step": 6632, + "text_loss": 0.2893230617046356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00033917646832075886, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10700111.0, + "repeat_count": 0.0, + "routers_loss": 0.002547801472246647, + "skip_count": 0.0, + "step": 6634, + "text_loss": 0.10363512486219406 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 31.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0003388834304568275, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 10703939.0, + "repeat_count": 2.0, + "routers_loss": 0.0019040531478822231, + "skip_count": 0.0, + "step": 6636, + "text_loss": 0.5185034275054932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00033859045433306975, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 10707187.0, + "repeat_count": 0.0, + "routers_loss": 0.0074104927480220795, + "skip_count": 2.0, + "step": 6638, + "text_loss": 0.1618153154850006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0003382975400617543, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 10710029.0, + "repeat_count": 0.0, + "routers_loss": 0.0013861875049769878, + "skip_count": 1.0, + "step": 6640, + "text_loss": 0.6674485206604004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0003380046877551266, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10713318.0, + "repeat_count": 0.0, + "routers_loss": 0.0034452753607183695, + "skip_count": 0.0, + "step": 6642, + "text_loss": 0.39299124479293823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0003377118975254082, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 10716130.0, + "repeat_count": 0.0, + "routers_loss": 0.006802885327488184, + "skip_count": 2.0, + "step": 6644, + "text_loss": 0.12942606210708618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.20193718814206, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003374191694847968, + "loss": 0.0052, + "macro_f1": 0.6601307392120361, + "num_tokens": 10719400.0, + "repeat_count": 1.0, + "routers_loss": 0.03718209266662598, + "skip_count": 2.0, + "step": 6646, + "text_loss": 0.34327754378318787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0003371265037454663, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 10722108.0, + "repeat_count": 0.0, + "routers_loss": 0.006016947794705629, + "skip_count": 2.0, + "step": 6648, + "text_loss": 0.15644726157188416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.220722042852948, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00033683390041956663, + "loss": 0.0075, + "macro_f1": 0.6601307392120361, + "num_tokens": 10725709.0, + "repeat_count": 1.0, + "routers_loss": 0.04308273270726204, + "skip_count": 2.0, + "step": 6650, + "text_loss": 0.1875772923231125 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 31.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003365413596192243, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 10728717.0, + "repeat_count": 2.0, + "routers_loss": 0.006372809875756502, + "skip_count": 1.0, + "step": 6652, + "text_loss": 0.4948291778564453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00033624888145654137, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10732082.0, + "repeat_count": 0.0, + "routers_loss": 0.0014530479675158858, + "skip_count": 0.0, + "step": 6654, + "text_loss": 0.44932305812835693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00033595646604359585, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10734663.0, + "repeat_count": 0.0, + "routers_loss": 0.001924810465425253, + "skip_count": 0.0, + "step": 6656, + "text_loss": 0.45626893639564514 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00033566411349244206, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10737470.0, + "repeat_count": 1.0, + "routers_loss": 0.0040014320984482765, + "skip_count": 0.0, + "step": 6658, + "text_loss": 0.2700682580471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00033537182391510996, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10740228.0, + "repeat_count": 0.0, + "routers_loss": 0.0008573737577535212, + "skip_count": 0.0, + "step": 6660, + "text_loss": 0.5626822113990784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003350795974236055, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 10742883.0, + "repeat_count": 0.0, + "routers_loss": 0.011166860349476337, + "skip_count": 1.0, + "step": 6662, + "text_loss": 0.23357805609703064 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 31.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00033478743412991037, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 10746459.0, + "repeat_count": 1.0, + "routers_loss": 0.01719980500638485, + "skip_count": 6.0, + "step": 6664, + "text_loss": 0.150017648935318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00033449533414598223, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 10749984.0, + "repeat_count": 0.0, + "routers_loss": 0.0038280142471194267, + "skip_count": 2.0, + "step": 6666, + "text_loss": 0.6312657594680786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00033420329758375423, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 10752792.0, + "repeat_count": 0.0, + "routers_loss": 0.0007688060286454856, + "skip_count": 1.0, + "step": 6668, + "text_loss": 0.6794863939285278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00033391132455513537, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10756125.0, + "repeat_count": 0.0, + "routers_loss": 0.003196930279955268, + "skip_count": 2.0, + "step": 6670, + "text_loss": 0.22897565364837646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0003336194151720102, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 10759296.0, + "repeat_count": 0.0, + "routers_loss": 0.0026212623342871666, + "skip_count": 0.0, + "step": 6672, + "text_loss": 0.5236268639564514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0003333275695462391, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10762574.0, + "repeat_count": 0.0, + "routers_loss": 0.007855101488530636, + "skip_count": 2.0, + "step": 6674, + "text_loss": 0.2971038818359375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0003330357877896577, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10765758.0, + "repeat_count": 0.0, + "routers_loss": 0.004191791173070669, + "skip_count": 2.0, + "step": 6676, + "text_loss": 0.17358586192131042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0003327440700140774, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 10769396.0, + "repeat_count": 0.0, + "routers_loss": 0.004101858474314213, + "skip_count": 1.0, + "step": 6678, + "text_loss": 0.28932204842567444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.000332452416331285, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 10772605.0, + "repeat_count": 0.0, + "routers_loss": 0.0008305918308906257, + "skip_count": 0.0, + "step": 6680, + "text_loss": 0.47090092301368713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0003321608268530427, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 10776576.0, + "repeat_count": 0.0, + "routers_loss": 0.003022305201739073, + "skip_count": 1.0, + "step": 6682, + "text_loss": 0.4467788338661194 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00033186930169108795, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10779648.0, + "repeat_count": 1.0, + "routers_loss": 0.0021474999375641346, + "skip_count": 0.0, + "step": 6684, + "text_loss": 0.6249470710754395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.00033157784095713417, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 10782665.0, + "repeat_count": 0.0, + "routers_loss": 0.0025120675563812256, + "skip_count": 1.0, + "step": 6686, + "text_loss": 0.6763803958892822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0003312864447628695, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 10785789.0, + "repeat_count": 0.0, + "routers_loss": 0.0013111691223457456, + "skip_count": 1.0, + "step": 6688, + "text_loss": 0.6609058380126953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00033099511321995744, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 10788846.0, + "repeat_count": 0.0, + "routers_loss": 0.0012354454956948757, + "skip_count": 0.0, + "step": 6690, + "text_loss": 0.4421829283237457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0003307038464400368, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10791611.0, + "repeat_count": 0.0, + "routers_loss": 0.0035219944547861814, + "skip_count": 2.0, + "step": 6692, + "text_loss": 0.16222824156284332 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00033041264453472153, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10794868.0, + "repeat_count": 1.0, + "routers_loss": 0.0007216202793642879, + "skip_count": 0.0, + "step": 6694, + "text_loss": 0.37388721108436584 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 31.436747872028178, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0003301215076156008, + "loss": 0.0063, + "macro_f1": 0.8803418874740601, + "num_tokens": 10797737.0, + "repeat_count": 2.0, + "routers_loss": 0.025403080508112907, + "skip_count": 7.0, + "step": 6696, + "text_loss": 0.5086690187454224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0003298304357942389, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 10800972.0, + "repeat_count": 0.0, + "routers_loss": 0.010532539337873459, + "skip_count": 2.0, + "step": 6698, + "text_loss": 0.22500646114349365 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00032953942918217494, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 10803654.0, + "repeat_count": 0.0, + "routers_loss": 0.0009591903653927147, + "skip_count": 0.0, + "step": 6700, + "text_loss": 0.6256277561187744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0003292484878909232, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10807506.0, + "repeat_count": 0.0, + "routers_loss": 0.003801517654210329, + "skip_count": 2.0, + "step": 6702, + "text_loss": 0.522081196308136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00032895761203197317, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 10810163.0, + "repeat_count": 0.0, + "routers_loss": 0.002608039416372776, + "skip_count": 2.0, + "step": 6704, + "text_loss": 0.3600201904773712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00032866680171678874, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10813202.0, + "repeat_count": 0.0, + "routers_loss": 0.0026464913971722126, + "skip_count": 0.0, + "step": 6706, + "text_loss": 0.2513798773288727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00032837605705680895, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 10816484.0, + "repeat_count": 0.0, + "routers_loss": 0.0027157769072800875, + "skip_count": 0.0, + "step": 6708, + "text_loss": 0.34391456842422485 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0003280853781634481, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 10819794.0, + "repeat_count": 1.0, + "routers_loss": 0.0016086180694401264, + "skip_count": 1.0, + "step": 6710, + "text_loss": 0.6535179615020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003277947651480946, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10823033.0, + "repeat_count": 0.0, + "routers_loss": 0.002368347719311714, + "skip_count": 0.0, + "step": 6712, + "text_loss": 0.5596423745155334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0003275042181221119, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 10826276.0, + "repeat_count": 0.0, + "routers_loss": 0.003124286886304617, + "skip_count": 0.0, + "step": 6714, + "text_loss": 0.6584402322769165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0003272137371968382, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10828846.0, + "repeat_count": 0.0, + "routers_loss": 0.0006088328082114458, + "skip_count": 0.0, + "step": 6716, + "text_loss": 0.4602710008621216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.00032692332248358645, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10832025.0, + "repeat_count": 0.0, + "routers_loss": 0.002511275466531515, + "skip_count": 2.0, + "step": 6718, + "text_loss": 0.42790886759757996 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.000326632974093644, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 10835110.0, + "repeat_count": 1.0, + "routers_loss": 0.01076667383313179, + "skip_count": 0.0, + "step": 6720, + "text_loss": 0.5659847855567932 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0003263426921382728, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 10838279.0, + "repeat_count": 2.0, + "routers_loss": 0.004973042290657759, + "skip_count": 2.0, + "step": 6722, + "text_loss": 0.675341010093689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.00032605247672870964, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 10841381.0, + "repeat_count": 0.0, + "routers_loss": 0.0013990222942084074, + "skip_count": 0.0, + "step": 6724, + "text_loss": 0.5389315485954285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00032576232797616554, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 10844583.0, + "repeat_count": 0.0, + "routers_loss": 0.003186358604580164, + "skip_count": 1.0, + "step": 6726, + "text_loss": 0.5603348016738892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003254722459918261, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 10847670.0, + "repeat_count": 0.0, + "routers_loss": 0.001443870598450303, + "skip_count": 0.0, + "step": 6728, + "text_loss": 0.6922405362129211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0003251822308868512, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 10851479.0, + "repeat_count": 0.0, + "routers_loss": 0.004294445738196373, + "skip_count": 0.0, + "step": 6730, + "text_loss": 0.7145437002182007 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.00032489228277237514, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10854489.0, + "repeat_count": 0.0, + "routers_loss": 0.0032078945077955723, + "skip_count": 0.0, + "step": 6732, + "text_loss": 0.4077773094177246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.00032460240175950664, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 10856954.0, + "repeat_count": 1.0, + "routers_loss": 0.0038214854430407286, + "skip_count": 2.0, + "step": 6734, + "text_loss": 0.32071781158447266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0003243125879593286, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10860016.0, + "repeat_count": 0.0, + "routers_loss": 0.0013407845981419086, + "skip_count": 0.0, + "step": 6736, + "text_loss": 0.45335495471954346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0003240228414828984, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 10863021.0, + "repeat_count": 0.0, + "routers_loss": 0.0010989385191351175, + "skip_count": 0.0, + "step": 6738, + "text_loss": 0.562619149684906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0003237331624412473, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 10866548.0, + "repeat_count": 0.0, + "routers_loss": 0.006139552686363459, + "skip_count": 0.0, + "step": 6740, + "text_loss": 0.14510060846805573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00032344355094538087, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10869402.0, + "repeat_count": 0.0, + "routers_loss": 0.004785746335983276, + "skip_count": 0.0, + "step": 6742, + "text_loss": 0.5655979514122009 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00032315400710627876, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 10874165.0, + "repeat_count": 0.0, + "routers_loss": 0.0052397786639630795, + "skip_count": 0.0, + "step": 6744, + "text_loss": 0.4785873591899872 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 31.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0003228645310348948, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 10876919.0, + "repeat_count": 3.0, + "routers_loss": 0.00460197776556015, + "skip_count": 1.0, + "step": 6746, + "text_loss": 0.5683879256248474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0003225751228421566, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10880179.0, + "repeat_count": 0.0, + "routers_loss": 0.0032690472435206175, + "skip_count": 0.0, + "step": 6748, + "text_loss": 0.5268497467041016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.00032228578263896607, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 10883711.0, + "repeat_count": 0.0, + "routers_loss": 0.0036305058747529984, + "skip_count": 0.0, + "step": 6750, + "text_loss": 0.16675594449043274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0003219965105361989, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10887041.0, + "repeat_count": 0.0, + "routers_loss": 0.002453352091833949, + "skip_count": 1.0, + "step": 6752, + "text_loss": 0.7010246515274048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.00032170730664470465, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10890053.0, + "repeat_count": 0.0, + "routers_loss": 0.0020381701178848743, + "skip_count": 0.0, + "step": 6754, + "text_loss": 0.46637895703315735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003214181710753069, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10893501.0, + "repeat_count": 0.0, + "routers_loss": 0.004525696858763695, + "skip_count": 0.0, + "step": 6756, + "text_loss": 0.1768684983253479 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003211291039388026, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10896480.0, + "repeat_count": 1.0, + "routers_loss": 0.0038154330104589462, + "skip_count": 0.0, + "step": 6758, + "text_loss": 0.7908347845077515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.00032084010534596326, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 10899158.0, + "repeat_count": 0.0, + "routers_loss": 0.004711449146270752, + "skip_count": 2.0, + "step": 6760, + "text_loss": 0.37209007143974304 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0003205511754075335, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 10901791.0, + "repeat_count": 1.0, + "routers_loss": 0.0025003373157233, + "skip_count": 1.0, + "step": 6762, + "text_loss": 0.8081201314926147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 31.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00032026231423423204, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10904817.0, + "repeat_count": 0.0, + "routers_loss": 0.007387075573205948, + "skip_count": 3.0, + "step": 6764, + "text_loss": 0.30355480313301086 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.76548282946874, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0003199735219367507, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 10908018.0, + "repeat_count": 2.0, + "routers_loss": 0.04275592789053917, + "skip_count": 0.0, + "step": 6766, + "text_loss": 0.26562029123306274 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.774875256824185, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0003196847986257553, + "loss": 0.008, + "macro_f1": 0.9255813956260681, + "num_tokens": 10911264.0, + "repeat_count": 3.0, + "routers_loss": 0.034824032336473465, + "skip_count": 4.0, + "step": 6768, + "text_loss": 0.2761698067188263 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00031939614441188523, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10915964.0, + "repeat_count": 0.0, + "routers_loss": 0.0011179742868989706, + "skip_count": 0.0, + "step": 6770, + "text_loss": 0.4107927083969116 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00031910755940575344, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 10918678.0, + "repeat_count": 0.0, + "routers_loss": 0.0011521469568833709, + "skip_count": 0.0, + "step": 6772, + "text_loss": 0.43064895272254944 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.000318819043717946, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10921757.0, + "repeat_count": 1.0, + "routers_loss": 0.002861087443307042, + "skip_count": 1.0, + "step": 6774, + "text_loss": 0.5945150852203369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0003185305974590229, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 10924767.0, + "repeat_count": 0.0, + "routers_loss": 0.0011365334503352642, + "skip_count": 0.0, + "step": 6776, + "text_loss": 0.36615172028541565 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0003182422207395171, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10927750.0, + "repeat_count": 1.0, + "routers_loss": 0.0034391419030725956, + "skip_count": 0.0, + "step": 6778, + "text_loss": 0.17081251740455627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003179539136699351, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10930817.0, + "repeat_count": 0.0, + "routers_loss": 0.004941808991134167, + "skip_count": 2.0, + "step": 6780, + "text_loss": 0.7683762311935425 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 31.840622248312297, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.038330078125, + "learning_rate": 0.00031766567636075675, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 10933882.0, + "repeat_count": 1.0, + "routers_loss": 0.017502857372164726, + "skip_count": 2.0, + "step": 6782, + "text_loss": 0.38010457158088684 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003173775089224353, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 10936909.0, + "repeat_count": 1.0, + "routers_loss": 0.0035372809506952763, + "skip_count": 2.0, + "step": 6784, + "text_loss": 0.5760656595230103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00031708941146539707, + "loss": 0.0061, + "macro_f1": 0.3272727429866791, + "num_tokens": 10940032.0, + "repeat_count": 1.0, + "routers_loss": 0.02229934185743332, + "skip_count": 0.0, + "step": 6786, + "text_loss": 0.5767728090286255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00031680138410004123, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 10943217.0, + "repeat_count": 0.0, + "routers_loss": 0.0028649091254919767, + "skip_count": 1.0, + "step": 6788, + "text_loss": 0.9756367802619934 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00031651342693674066, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 10947847.0, + "repeat_count": 0.0, + "routers_loss": 0.0039158593863248825, + "skip_count": 2.0, + "step": 6790, + "text_loss": 0.2504335045814514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.000316225540085841, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 10950879.0, + "repeat_count": 0.0, + "routers_loss": 0.0022091215942054987, + "skip_count": 0.0, + "step": 6792, + "text_loss": 0.525842547416687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00031593772365766105, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 10954960.0, + "repeat_count": 0.0, + "routers_loss": 0.0006841494468972087, + "skip_count": 0.0, + "step": 6794, + "text_loss": 0.6383582353591919 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.906369239800412, + "f1_execute": 0.9729729890823364, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003156499777624926, + "loss": 0.006, + "macro_f1": 0.9539539813995361, + "num_tokens": 10958278.0, + "repeat_count": 5.0, + "routers_loss": 0.03810702636837959, + "skip_count": 5.0, + "step": 6796, + "text_loss": 0.5901661515235901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.0003153623025106005, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 10962412.0, + "repeat_count": 0.0, + "routers_loss": 0.00046833412488922477, + "skip_count": 0.0, + "step": 6798, + "text_loss": 0.42693984508514404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00031507469801222233, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 10966037.0, + "repeat_count": 0.0, + "routers_loss": 0.006818041671067476, + "skip_count": 2.0, + "step": 6800, + "text_loss": 0.5326262712478638 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00031478716437756876, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10969369.0, + "repeat_count": 0.0, + "routers_loss": 0.0029889161232858896, + "skip_count": 0.0, + "step": 6802, + "text_loss": 0.49028220772743225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0003144997017168232, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10972016.0, + "repeat_count": 0.0, + "routers_loss": 0.0038266500923782587, + "skip_count": 2.0, + "step": 6804, + "text_loss": 0.43391722440719604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0003142123101401417, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10975153.0, + "repeat_count": 0.0, + "routers_loss": 0.0005866789724677801, + "skip_count": 0.0, + "step": 6806, + "text_loss": 0.5888382196426392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00031392498975765353, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 10977881.0, + "repeat_count": 0.0, + "routers_loss": 0.002122384263202548, + "skip_count": 0.0, + "step": 6808, + "text_loss": 0.30313390493392944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0003136377406794604, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 10982025.0, + "repeat_count": 0.0, + "routers_loss": 0.0005535652744583786, + "skip_count": 0.0, + "step": 6810, + "text_loss": 0.5788959264755249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003133505630156365, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 10985419.0, + "repeat_count": 0.0, + "routers_loss": 0.010623604990541935, + "skip_count": 2.0, + "step": 6812, + "text_loss": 0.18577243387699127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.00031306345687622905, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10989116.0, + "repeat_count": 0.0, + "routers_loss": 0.0004721239674836397, + "skip_count": 0.0, + "step": 6814, + "text_loss": 0.4818301200866699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0167236328125, + "learning_rate": 0.0003127764223712575, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10992064.0, + "repeat_count": 0.0, + "routers_loss": 0.0004238430701661855, + "skip_count": 0.0, + "step": 6816, + "text_loss": 0.7482771277427673 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003124894596107141, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10994903.0, + "repeat_count": 1.0, + "routers_loss": 0.005224394146353006, + "skip_count": 2.0, + "step": 6818, + "text_loss": 0.186603844165802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.00031220256870456356, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 10998692.0, + "repeat_count": 1.0, + "routers_loss": 0.0021751862950623035, + "skip_count": 2.0, + "step": 6820, + "text_loss": 0.45633986592292786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 32.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00031191574976274284, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11001284.0, + "repeat_count": 0.0, + "routers_loss": 0.004747046157717705, + "skip_count": 4.0, + "step": 6822, + "text_loss": 0.5651670694351196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003116290028951617, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 11004293.0, + "repeat_count": 0.0, + "routers_loss": 0.0008316585444845259, + "skip_count": 0.0, + "step": 6824, + "text_loss": 0.3167279362678528 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.000311342328211702, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11007080.0, + "repeat_count": 0.0, + "routers_loss": 0.0004732926026917994, + "skip_count": 0.0, + "step": 6826, + "text_loss": 0.49171411991119385 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000311055725822218, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 11010078.0, + "repeat_count": 1.0, + "routers_loss": 0.004238729365170002, + "skip_count": 0.0, + "step": 6828, + "text_loss": 0.21484950184822083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0003107691958365361, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 11013368.0, + "repeat_count": 0.0, + "routers_loss": 0.0029175232630223036, + "skip_count": 2.0, + "step": 6830, + "text_loss": 0.3718266189098358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003104827383644555, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11016704.0, + "repeat_count": 0.0, + "routers_loss": 0.00191891985014081, + "skip_count": 0.0, + "step": 6832, + "text_loss": 0.28772637248039246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00031019635351574705, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 11019651.0, + "repeat_count": 0.0, + "routers_loss": 0.004300855100154877, + "skip_count": 2.0, + "step": 6834, + "text_loss": 0.6583508849143982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.000309910041400154, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11023847.0, + "repeat_count": 0.0, + "routers_loss": 0.00037701442488469183, + "skip_count": 0.0, + "step": 6836, + "text_loss": 0.36090534925460815 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 32.10331670090989, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0003096238021273917, + "loss": 0.0077, + "macro_f1": 0.9265305995941162, + "num_tokens": 11027804.0, + "repeat_count": 1.0, + "routers_loss": 0.03601725772023201, + "skip_count": 3.0, + "step": 6838, + "text_loss": 0.24180401861667633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.11270912826534, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00030933763580714757, + "loss": 0.0052, + "macro_f1": 0.6601307392120361, + "num_tokens": 11030778.0, + "repeat_count": 1.0, + "routers_loss": 0.023780640214681625, + "skip_count": 2.0, + "step": 6840, + "text_loss": 0.4978102743625641 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00030905154254908104, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 11034863.0, + "repeat_count": 1.0, + "routers_loss": 0.00565778324380517, + "skip_count": 0.0, + "step": 6842, + "text_loss": 0.558772623538971 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00030876552246282356, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11038488.0, + "repeat_count": 0.0, + "routers_loss": 0.010575232096016407, + "skip_count": 0.0, + "step": 6844, + "text_loss": 0.2955974340438843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0003084795756579787, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11041796.0, + "repeat_count": 0.0, + "routers_loss": 0.0015910190995782614, + "skip_count": 0.0, + "step": 6846, + "text_loss": 0.5009704828262329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0003081937022441217, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 11045141.0, + "repeat_count": 0.0, + "routers_loss": 0.0008034126949496567, + "skip_count": 0.0, + "step": 6848, + "text_loss": 0.3965311646461487 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 32.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0003079079023307999, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11047814.0, + "repeat_count": 2.0, + "routers_loss": 0.00810160581022501, + "skip_count": 0.0, + "step": 6850, + "text_loss": 0.24341927468776703 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0003076221760275321, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 11051330.0, + "repeat_count": 1.0, + "routers_loss": 0.006590691395103931, + "skip_count": 0.0, + "step": 6852, + "text_loss": 0.5887606739997864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00030733652344380936, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11055006.0, + "repeat_count": 0.0, + "routers_loss": 0.0005845054984092712, + "skip_count": 0.0, + "step": 6854, + "text_loss": 0.6621366739273071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003070509446890944, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 11058470.0, + "repeat_count": 0.0, + "routers_loss": 0.0041051446460187435, + "skip_count": 1.0, + "step": 6856, + "text_loss": 0.31603100895881653 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0003067654398728214, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 11061620.0, + "repeat_count": 1.0, + "routers_loss": 0.001603201380930841, + "skip_count": 0.0, + "step": 6858, + "text_loss": 0.5167516469955444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.00030648000910439636, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11064727.0, + "repeat_count": 0.0, + "routers_loss": 0.0024816282093524933, + "skip_count": 0.0, + "step": 6860, + "text_loss": 0.5869330167770386 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00030619465249319693, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11068208.0, + "repeat_count": 1.0, + "routers_loss": 0.003121294779703021, + "skip_count": 0.0, + "step": 6862, + "text_loss": 0.3920222818851471 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0003059093701485722, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11071315.0, + "repeat_count": 0.0, + "routers_loss": 0.0033239589538425207, + "skip_count": 1.0, + "step": 6864, + "text_loss": 0.4201887845993042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00030562416217984296, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 11074144.0, + "repeat_count": 0.0, + "routers_loss": 0.0016117560444399714, + "skip_count": 0.0, + "step": 6866, + "text_loss": 0.5283045172691345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0003053390286963015, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11077152.0, + "repeat_count": 0.0, + "routers_loss": 0.003879208816215396, + "skip_count": 0.0, + "step": 6868, + "text_loss": 0.16188788414001465 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00030505396980721143, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 11080200.0, + "repeat_count": 0.0, + "routers_loss": 0.007632353343069553, + "skip_count": 1.0, + "step": 6870, + "text_loss": 0.25986847281455994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00030476898562180793, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 11083356.0, + "repeat_count": 0.0, + "routers_loss": 0.004322016146034002, + "skip_count": 2.0, + "step": 6872, + "text_loss": 0.49556297063827515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0003044840762492974, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 11086354.0, + "repeat_count": 0.0, + "routers_loss": 0.0031272871419787407, + "skip_count": 2.0, + "step": 6874, + "text_loss": 0.1658666580915451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003041992417988577, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11088850.0, + "repeat_count": 0.0, + "routers_loss": 0.005371398758143187, + "skip_count": 2.0, + "step": 6876, + "text_loss": 0.22437214851379395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003039144823796378, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 11091784.0, + "repeat_count": 0.0, + "routers_loss": 0.0025086402893066406, + "skip_count": 0.0, + "step": 6878, + "text_loss": 0.7293354868888855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0003036297981007581, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11095204.0, + "repeat_count": 0.0, + "routers_loss": 0.015590827912092209, + "skip_count": 1.0, + "step": 6880, + "text_loss": 0.6406328678131104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0003033451890713103, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11098367.0, + "repeat_count": 0.0, + "routers_loss": 0.0013142531970515847, + "skip_count": 0.0, + "step": 6882, + "text_loss": 0.5209086537361145 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 32.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003030606554003571, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 11101047.0, + "repeat_count": 2.0, + "routers_loss": 0.0018484699539840221, + "skip_count": 0.0, + "step": 6884, + "text_loss": 0.743188202381134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00030277619719693217, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11104269.0, + "repeat_count": 0.0, + "routers_loss": 0.0016667681047692895, + "skip_count": 0.0, + "step": 6886, + "text_loss": 0.7918420433998108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0003024918145700406, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 11107248.0, + "repeat_count": 0.0, + "routers_loss": 0.0008098077378235757, + "skip_count": 0.0, + "step": 6888, + "text_loss": 0.3871288299560547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0003022075076286582, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 11111204.0, + "repeat_count": 0.0, + "routers_loss": 0.002324736909940839, + "skip_count": 0.0, + "step": 6890, + "text_loss": 0.3722921907901764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003019232764817321, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 11114363.0, + "repeat_count": 0.0, + "routers_loss": 0.00254769716411829, + "skip_count": 0.0, + "step": 6892, + "text_loss": 0.418519526720047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00030163912123818006, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11117718.0, + "repeat_count": 0.0, + "routers_loss": 0.000547234492842108, + "skip_count": 0.0, + "step": 6894, + "text_loss": 0.6087009310722351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0003013550420068909, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 11120437.0, + "repeat_count": 0.0, + "routers_loss": 0.00015221568173728883, + "skip_count": 0.0, + "step": 6896, + "text_loss": 0.6013991832733154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 32.385089521573235, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.046142578125, + "learning_rate": 0.00030107103889672436, + "loss": 0.0085, + "macro_f1": 0.5492662787437439, + "num_tokens": 11123708.0, + "repeat_count": 0.0, + "routers_loss": 0.024048971012234688, + "skip_count": 2.0, + "step": 6898, + "text_loss": 0.3612423837184906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0003007871120165111, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 11127294.0, + "repeat_count": 0.0, + "routers_loss": 0.0013236473314464092, + "skip_count": 0.0, + "step": 6900, + "text_loss": 0.5277031064033508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00030050326147505226, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11130270.0, + "repeat_count": 0.0, + "routers_loss": 0.0028277861420065165, + "skip_count": 0.0, + "step": 6902, + "text_loss": 0.5726971626281738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0003002194873811197, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11132955.0, + "repeat_count": 0.0, + "routers_loss": 0.0022369837388396263, + "skip_count": 0.0, + "step": 6904, + "text_loss": 0.18510448932647705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00029993578984345673, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 11136387.0, + "repeat_count": 0.0, + "routers_loss": 0.0038351211696863174, + "skip_count": 0.0, + "step": 6906, + "text_loss": 0.28313153982162476 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0002996521689707764, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 11139740.0, + "repeat_count": 0.0, + "routers_loss": 0.00032925375853665173, + "skip_count": 0.0, + "step": 6908, + "text_loss": 0.7315025329589844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0002993686248717629, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 11142587.0, + "repeat_count": 0.0, + "routers_loss": 0.002886304398998618, + "skip_count": 0.0, + "step": 6910, + "text_loss": 0.677378237247467 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00029908515765507084, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 11145415.0, + "repeat_count": 1.0, + "routers_loss": 0.0038471966981887817, + "skip_count": 0.0, + "step": 6912, + "text_loss": 0.5207083225250244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0002988017674293254, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 11148524.0, + "repeat_count": 0.0, + "routers_loss": 0.0023522782139480114, + "skip_count": 0.0, + "step": 6914, + "text_loss": 0.42507871985435486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0002985184543031222, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11152069.0, + "repeat_count": 0.0, + "routers_loss": 0.0012464249739423394, + "skip_count": 0.0, + "step": 6916, + "text_loss": 0.5694169998168945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.0002982352183850274, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 11155675.0, + "repeat_count": 0.0, + "routers_loss": 0.00828156154602766, + "skip_count": 2.0, + "step": 6918, + "text_loss": 0.22304373979568481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.00029795205978357754, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11158555.0, + "repeat_count": 0.0, + "routers_loss": 0.0019234733190387487, + "skip_count": 0.0, + "step": 6920, + "text_loss": 0.5519064664840698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0002976689786072795, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11161407.0, + "repeat_count": 0.0, + "routers_loss": 0.0003542431222740561, + "skip_count": 0.0, + "step": 6922, + "text_loss": 0.6748810410499573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0002973859749646104, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11166007.0, + "repeat_count": 0.0, + "routers_loss": 0.0004024899681098759, + "skip_count": 0.0, + "step": 6924, + "text_loss": 0.6613664627075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 32.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.000297103048964018, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 11169007.0, + "repeat_count": 0.0, + "routers_loss": 0.005519595462828875, + "skip_count": 3.0, + "step": 6926, + "text_loss": 0.3815552592277527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00029682020071392, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11172939.0, + "repeat_count": 0.0, + "routers_loss": 0.0016999440267682076, + "skip_count": 0.0, + "step": 6928, + "text_loss": 0.6727893352508545 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.535368359260346, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0002965374303227044, + "loss": 0.0055, + "macro_f1": 0.5492662787437439, + "num_tokens": 11176232.0, + "repeat_count": 2.0, + "routers_loss": 0.030950307846069336, + "skip_count": 0.0, + "step": 6930, + "text_loss": 0.5577763915061951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00029625473789872923, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11179775.0, + "repeat_count": 0.0, + "routers_loss": 0.00525702815502882, + "skip_count": 1.0, + "step": 6932, + "text_loss": 0.5860039591789246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.000295972123550323, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 11183262.0, + "repeat_count": 1.0, + "routers_loss": 0.0048187971115112305, + "skip_count": 2.0, + "step": 6934, + "text_loss": 0.7328732013702393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.016357421875, + "learning_rate": 0.00029568958738578364, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11186591.0, + "repeat_count": 0.0, + "routers_loss": 0.0015159632312133908, + "skip_count": 0.0, + "step": 6936, + "text_loss": 0.40563541650772095 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 32.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.017333984375, + "learning_rate": 0.0002954071295133801, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 11190056.0, + "repeat_count": 1.0, + "routers_loss": 0.011282073333859444, + "skip_count": 1.0, + "step": 6938, + "text_loss": 0.15986496210098267 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0002951247500413504, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 11193504.0, + "repeat_count": 3.0, + "routers_loss": 0.010220487602055073, + "skip_count": 5.0, + "step": 6940, + "text_loss": 0.2604432702064514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0002948424490779029, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 11196725.0, + "repeat_count": 0.0, + "routers_loss": 0.002620660001412034, + "skip_count": 1.0, + "step": 6942, + "text_loss": 0.48028868436813354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00029456022673121597, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11199303.0, + "repeat_count": 0.0, + "routers_loss": 0.00042651945841498673, + "skip_count": 0.0, + "step": 6944, + "text_loss": 0.5135554671287537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0002942780831094377, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11202319.0, + "repeat_count": 0.0, + "routers_loss": 0.005366047378629446, + "skip_count": 2.0, + "step": 6946, + "text_loss": 0.2809196710586548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0002939960183206861, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 11205622.0, + "repeat_count": 0.0, + "routers_loss": 0.0033479216508567333, + "skip_count": 0.0, + "step": 6948, + "text_loss": 0.2013140618801117 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00029371403247304887, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 11208637.0, + "repeat_count": 1.0, + "routers_loss": 0.0013508419506251812, + "skip_count": 0.0, + "step": 6950, + "text_loss": 0.4427332580089569 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0002934321256745833, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11211618.0, + "repeat_count": 0.0, + "routers_loss": 0.0020944071002304554, + "skip_count": 0.0, + "step": 6952, + "text_loss": 0.5406652688980103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00029315029803331704, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11214432.0, + "repeat_count": 0.0, + "routers_loss": 0.0012655078899115324, + "skip_count": 0.0, + "step": 6954, + "text_loss": 0.7720552086830139 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.00029286854965724686, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 11218127.0, + "repeat_count": 0.0, + "routers_loss": 0.009041395038366318, + "skip_count": 0.0, + "step": 6956, + "text_loss": 0.258109986782074 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 32.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0002925868806543391, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 11221440.0, + "repeat_count": 1.0, + "routers_loss": 0.0034558263141661882, + "skip_count": 1.0, + "step": 6958, + "text_loss": 0.5378029942512512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00029230529113253, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 11225391.0, + "repeat_count": 0.0, + "routers_loss": 0.005263930186629295, + "skip_count": 2.0, + "step": 6960, + "text_loss": 0.3616539537906647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0002920237811997251, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 11228648.0, + "repeat_count": 0.0, + "routers_loss": 0.003730480559170246, + "skip_count": 1.0, + "step": 6962, + "text_loss": 0.46682238578796387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.00029174235096379963, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11231828.0, + "repeat_count": 0.0, + "routers_loss": 0.004831735976040363, + "skip_count": 1.0, + "step": 6964, + "text_loss": 0.5718355178833008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 32.70443205165835, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.046875, + "learning_rate": 0.0002914610005325981, + "loss": 0.0102, + "macro_f1": 0.5492662787437439, + "num_tokens": 11234984.0, + "repeat_count": 0.0, + "routers_loss": 0.03880132734775543, + "skip_count": 2.0, + "step": 6966, + "text_loss": 0.3139013946056366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0002911797300139345, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 11239153.0, + "repeat_count": 0.0, + "routers_loss": 0.0006673726020380855, + "skip_count": 0.0, + "step": 6968, + "text_loss": 0.6040399074554443 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00029089853951559235, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11242178.0, + "repeat_count": 1.0, + "routers_loss": 0.0028971200808882713, + "skip_count": 0.0, + "step": 6970, + "text_loss": 0.304967999458313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00029061742914532427, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11245865.0, + "repeat_count": 0.0, + "routers_loss": 0.0010410466929897666, + "skip_count": 0.0, + "step": 6972, + "text_loss": 0.47892290353775024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0002903363990108524, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11248806.0, + "repeat_count": 0.0, + "routers_loss": 0.002133697969838977, + "skip_count": 0.0, + "step": 6974, + "text_loss": 0.2561415433883667 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 32.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0002900554492198677, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 11251807.0, + "repeat_count": 2.0, + "routers_loss": 0.002402493730187416, + "skip_count": 0.0, + "step": 6976, + "text_loss": 0.652428388595581 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0002897745798800311, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 11254615.0, + "repeat_count": 1.0, + "routers_loss": 0.006423915736377239, + "skip_count": 0.0, + "step": 6978, + "text_loss": 0.22414511442184448 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.000289493791098972, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 11257721.0, + "repeat_count": 0.0, + "routers_loss": 0.002536606043577194, + "skip_count": 0.0, + "step": 6980, + "text_loss": 0.1328018754720688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00028921308298428933, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11260840.0, + "repeat_count": 0.0, + "routers_loss": 0.000745086173992604, + "skip_count": 0.0, + "step": 6982, + "text_loss": 0.61724853515625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0002889324556435509, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 11264279.0, + "repeat_count": 0.0, + "routers_loss": 0.005258981604129076, + "skip_count": 0.0, + "step": 6984, + "text_loss": 0.1664455235004425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00028865190918429356, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 11268096.0, + "repeat_count": 0.0, + "routers_loss": 0.0008756023598834872, + "skip_count": 0.0, + "step": 6986, + "text_loss": 0.45111921429634094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.00028837144371402336, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11270611.0, + "repeat_count": 0.0, + "routers_loss": 0.0008175788098014891, + "skip_count": 0.0, + "step": 6988, + "text_loss": 0.5332239270210266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00028809105934021517, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11273826.0, + "repeat_count": 0.0, + "routers_loss": 0.003494064789265394, + "skip_count": 0.0, + "step": 6990, + "text_loss": 0.20264241099357605 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.82653360727913, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0002878107561703127, + "loss": 0.0056, + "macro_f1": 0.8817967176437378, + "num_tokens": 11276917.0, + "repeat_count": 2.0, + "routers_loss": 0.025257345288991928, + "skip_count": 3.0, + "step": 6992, + "text_loss": 0.18000070750713348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.835926034634575, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0002875305343117289, + "loss": 0.0044, + "macro_f1": 0.6603773832321167, + "num_tokens": 11279637.0, + "repeat_count": 1.0, + "routers_loss": 0.019206687808036804, + "skip_count": 1.0, + "step": 6994, + "text_loss": 0.5872798562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00028725039387184504, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11282717.0, + "repeat_count": 0.0, + "routers_loss": 0.009358765557408333, + "skip_count": 1.0, + "step": 6996, + "text_loss": 0.3412095904350281 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 32.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00028697033495801163, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 11285433.0, + "repeat_count": 1.0, + "routers_loss": 0.0038775671273469925, + "skip_count": 1.0, + "step": 6998, + "text_loss": 0.4316727817058563 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0002866903576775475, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11288414.0, + "repeat_count": 1.0, + "routers_loss": 0.004292591474950314, + "skip_count": 0.0, + "step": 7000, + "text_loss": 0.45106515288352966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.873495744056356, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046875, + "learning_rate": 0.0002864104621377409, + "loss": 0.007, + "macro_f1": 0.6601307392120361, + "num_tokens": 11291811.0, + "repeat_count": 1.0, + "routers_loss": 0.02195967361330986, + "skip_count": 2.0, + "step": 7002, + "text_loss": 0.29841285943984985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0002861306484458481, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11295179.0, + "repeat_count": 0.0, + "routers_loss": 0.0010119527578353882, + "skip_count": 0.0, + "step": 7004, + "text_loss": 0.5218569040298462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00028585091670909436, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11298182.0, + "repeat_count": 0.0, + "routers_loss": 0.002615996403619647, + "skip_count": 0.0, + "step": 7006, + "text_loss": 0.20382621884346008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00028557126703467316, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 11301262.0, + "repeat_count": 0.0, + "routers_loss": 0.002726050792261958, + "skip_count": 0.0, + "step": 7008, + "text_loss": 0.26718559861183167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0002852916995297471, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 11304590.0, + "repeat_count": 0.0, + "routers_loss": 0.0005590448854491115, + "skip_count": 0.0, + "step": 7010, + "text_loss": 0.5392091274261475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00028501221430144667, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11307690.0, + "repeat_count": 0.0, + "routers_loss": 0.004541353322565556, + "skip_count": 2.0, + "step": 7012, + "text_loss": 0.16159705817699432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.00028473281145687137, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11310866.0, + "repeat_count": 0.0, + "routers_loss": 0.0029630991630256176, + "skip_count": 1.0, + "step": 7014, + "text_loss": 0.9148072600364685 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 32.93924273554447, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0302734375, + "learning_rate": 0.0002844534911030888, + "loss": 0.0067, + "macro_f1": 0.9262410998344421, + "num_tokens": 11314517.0, + "repeat_count": 2.0, + "routers_loss": 0.023258809000253677, + "skip_count": 3.0, + "step": 7016, + "text_loss": 0.3853590488433838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.94863516289991, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060546875, + "learning_rate": 0.000284174253347135, + "loss": 0.0064, + "macro_f1": 0.3272727429866791, + "num_tokens": 11317526.0, + "repeat_count": 0.0, + "routers_loss": 0.010060093365609646, + "skip_count": 1.0, + "step": 7018, + "text_loss": 0.3412325382232666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00028389509829601444, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 11321684.0, + "repeat_count": 0.0, + "routers_loss": 0.0016713893273845315, + "skip_count": 0.0, + "step": 7020, + "text_loss": 0.9049796462059021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00028361602605670003, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 11324709.0, + "repeat_count": 0.0, + "routers_loss": 0.004167001228779554, + "skip_count": 2.0, + "step": 7022, + "text_loss": 0.24364058673381805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 32.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00028333703673613224, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 11327449.0, + "repeat_count": 0.0, + "routers_loss": 0.0027954576071351767, + "skip_count": 4.0, + "step": 7024, + "text_loss": 0.2872125506401062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00028305813044122096, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11330846.0, + "repeat_count": 0.0, + "routers_loss": 0.004644687287509441, + "skip_count": 0.0, + "step": 7026, + "text_loss": 0.1717570424079895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.00028277930727884336, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11333575.0, + "repeat_count": 0.0, + "routers_loss": 0.00557848671451211, + "skip_count": 2.0, + "step": 7028, + "text_loss": 0.3501792550086975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00028250056735584496, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11336899.0, + "repeat_count": 0.0, + "routers_loss": 0.0005694970604963601, + "skip_count": 0.0, + "step": 7030, + "text_loss": 0.5541794300079346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00028222191077903946, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11340163.0, + "repeat_count": 0.0, + "routers_loss": 0.0032896639313548803, + "skip_count": 0.0, + "step": 7032, + "text_loss": 0.5618721842765808 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00028194333765520853, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11343494.0, + "repeat_count": 1.0, + "routers_loss": 0.005377276800572872, + "skip_count": 0.0, + "step": 7034, + "text_loss": 0.325153648853302 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.00028166484809110206, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11346126.0, + "repeat_count": 0.0, + "routers_loss": 0.001204605447128415, + "skip_count": 0.0, + "step": 7036, + "text_loss": 0.5016651749610901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.00028138644219343736, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 11348879.0, + "repeat_count": 0.0, + "routers_loss": 0.005026837810873985, + "skip_count": 2.0, + "step": 7038, + "text_loss": 0.2430499643087387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00028110812006890064, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 11352457.0, + "repeat_count": 0.0, + "routers_loss": 0.0019850607495754957, + "skip_count": 0.0, + "step": 7040, + "text_loss": 0.42376917600631714 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.00028082988182414524, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 11356602.0, + "repeat_count": 1.0, + "routers_loss": 0.003362950636073947, + "skip_count": 2.0, + "step": 7042, + "text_loss": 0.4165397882461548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0002805517275657926, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 11359451.0, + "repeat_count": 0.0, + "routers_loss": 0.0019725612364709377, + "skip_count": 1.0, + "step": 7044, + "text_loss": 0.5597621202468872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0002802736574004319, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 11363614.0, + "repeat_count": 0.0, + "routers_loss": 0.0013963640667498112, + "skip_count": 0.0, + "step": 7046, + "text_loss": 0.6112356185913086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00027999567143462015, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11367015.0, + "repeat_count": 0.0, + "routers_loss": 0.0005658161826431751, + "skip_count": 0.0, + "step": 7048, + "text_loss": 0.4920886754989624 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 33.09862048723217, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00027971776977488193, + "loss": 0.0064, + "macro_f1": 0.925203263759613, + "num_tokens": 11370489.0, + "repeat_count": 3.0, + "routers_loss": 0.03657131269574165, + "skip_count": 5.0, + "step": 7050, + "text_loss": 0.28003939986228943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.00027943995252771017, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 11373614.0, + "repeat_count": 0.0, + "routers_loss": 0.004096088465303183, + "skip_count": 2.0, + "step": 7052, + "text_loss": 0.3145081400871277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00027916221979956457, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 11377631.0, + "repeat_count": 0.0, + "routers_loss": 0.0009888096246868372, + "skip_count": 0.0, + "step": 7054, + "text_loss": 0.4898056983947754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.126797769298506, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00027888457169687297, + "loss": 0.0065, + "macro_f1": 0.6603773832321167, + "num_tokens": 11380620.0, + "repeat_count": 1.0, + "routers_loss": 0.013347696512937546, + "skip_count": 1.0, + "step": 7056, + "text_loss": 0.7011964917182922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.00027860700832603056, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11383297.0, + "repeat_count": 0.0, + "routers_loss": 0.000849733711220324, + "skip_count": 1.0, + "step": 7058, + "text_loss": 0.4007014334201813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0002783295297934003, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 11386460.0, + "repeat_count": 0.0, + "routers_loss": 0.001546313869766891, + "skip_count": 1.0, + "step": 7060, + "text_loss": 0.3992713689804077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0002780521362053123, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11389605.0, + "repeat_count": 0.0, + "routers_loss": 0.001045585609972477, + "skip_count": 0.0, + "step": 7062, + "text_loss": 0.4440680146217346 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.00027777482766806446, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 11392105.0, + "repeat_count": 1.0, + "routers_loss": 0.00752411549910903, + "skip_count": 0.0, + "step": 7064, + "text_loss": 0.20152349770069122 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 33.17375990607572, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.031982421875, + "learning_rate": 0.0002774976042879218, + "loss": 0.0088, + "macro_f1": 0.5934640765190125, + "num_tokens": 11396142.0, + "repeat_count": 0.0, + "routers_loss": 0.019917849451303482, + "skip_count": 3.0, + "step": 7066, + "text_loss": 0.24365149438381195 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.00027722046617111696, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 11398827.0, + "repeat_count": 1.0, + "routers_loss": 0.0015933843096718192, + "skip_count": 0.0, + "step": 7068, + "text_loss": 0.31948477029800415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00027694341342384977, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11402623.0, + "repeat_count": 0.0, + "routers_loss": 0.0018986845389008522, + "skip_count": 2.0, + "step": 7070, + "text_loss": 0.47721394896507263 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00027666644615228727, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 11405628.0, + "repeat_count": 0.0, + "routers_loss": 0.002975719515234232, + "skip_count": 1.0, + "step": 7072, + "text_loss": 0.3972358703613281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0002763895644625637, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 11409468.0, + "repeat_count": 0.0, + "routers_loss": 0.005657708737999201, + "skip_count": 1.0, + "step": 7074, + "text_loss": 0.6004229187965393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0002761127684607811, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 11412572.0, + "repeat_count": 0.0, + "routers_loss": 0.0038351903203874826, + "skip_count": 2.0, + "step": 7076, + "text_loss": 1.0837591886520386 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.00027583605825300795, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 11416831.0, + "repeat_count": 2.0, + "routers_loss": 0.005529445596039295, + "skip_count": 2.0, + "step": 7078, + "text_loss": 0.575986921787262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00027555943394528014, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11420557.0, + "repeat_count": 0.0, + "routers_loss": 0.006243749521672726, + "skip_count": 0.0, + "step": 7080, + "text_loss": 0.606263279914856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.248899324919286, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00027528289564360064, + "loss": 0.0058, + "macro_f1": 0.6603773832321167, + "num_tokens": 11423471.0, + "repeat_count": 1.0, + "routers_loss": 0.031515009701251984, + "skip_count": 1.0, + "step": 7082, + "text_loss": 0.19393208622932434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0002750064434539394, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11426732.0, + "repeat_count": 0.0, + "routers_loss": 0.0005052287015132606, + "skip_count": 0.0, + "step": 7084, + "text_loss": 0.7202399969100952 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00027473007748223357, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11429391.0, + "repeat_count": 0.0, + "routers_loss": 0.005099403206259012, + "skip_count": 1.0, + "step": 7086, + "text_loss": 0.20651355385780334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00027445379783438685, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11432161.0, + "repeat_count": 0.0, + "routers_loss": 0.001447655027732253, + "skip_count": 0.0, + "step": 7088, + "text_loss": 0.34758952260017395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00027417760461627037, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11435417.0, + "repeat_count": 0.0, + "routers_loss": 0.000808655982837081, + "skip_count": 0.0, + "step": 7090, + "text_loss": 0.7414838671684265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00027390149793372177, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 11438313.0, + "repeat_count": 0.0, + "routers_loss": 0.005151710007339716, + "skip_count": 0.0, + "step": 7092, + "text_loss": 0.17792417109012604 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.00027362547789254574, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 11441681.0, + "repeat_count": 1.0, + "routers_loss": 0.0037353152874857187, + "skip_count": 3.0, + "step": 7094, + "text_loss": 0.5577781796455383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0002733495445985135, + "loss": 0.0026, + "macro_f1": 0.3333333432674408, + "num_tokens": 11444521.0, + "repeat_count": 0.0, + "routers_loss": 0.00038075417978689075, + "skip_count": 0.0, + "step": 7096, + "text_loss": 0.5052862167358398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.32403874376284, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0002730736981573632, + "loss": 0.0033, + "macro_f1": 0.3272727429866791, + "num_tokens": 11448481.0, + "repeat_count": 0.0, + "routers_loss": 0.007313522044569254, + "skip_count": 1.0, + "step": 7098, + "text_loss": 0.5869139432907104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0002727979386748001, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11452164.0, + "repeat_count": 0.0, + "routers_loss": 0.0020673887338489294, + "skip_count": 0.0, + "step": 7100, + "text_loss": 0.4354212284088135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0002725222662564954, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11455995.0, + "repeat_count": 0.0, + "routers_loss": 0.0008315460290759802, + "skip_count": 0.0, + "step": 7102, + "text_loss": 0.8714128732681274 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.35221602582917, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0002722466810080874, + "loss": 0.0053, + "macro_f1": 0.6603773832321167, + "num_tokens": 11458828.0, + "repeat_count": 1.0, + "routers_loss": 0.010913078673183918, + "skip_count": 1.0, + "step": 7104, + "text_loss": 0.6226683855056763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.36160845318462, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0002719711830351809, + "loss": 0.0076, + "macro_f1": 0.6603773832321167, + "num_tokens": 11462448.0, + "repeat_count": 1.0, + "routers_loss": 0.040428292006254196, + "skip_count": 1.0, + "step": 7106, + "text_loss": 0.2543688118457794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00027169577244334726, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 11465796.0, + "repeat_count": 0.0, + "routers_loss": 0.004473939072340727, + "skip_count": 1.0, + "step": 7108, + "text_loss": 0.12356872111558914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.00027142044933812424, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11469176.0, + "repeat_count": 0.0, + "routers_loss": 0.0017961655976250768, + "skip_count": 0.0, + "step": 7110, + "text_loss": 0.6800211668014526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0002711452138250162, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 11471983.0, + "repeat_count": 2.0, + "routers_loss": 0.003279087832197547, + "skip_count": 2.0, + "step": 7112, + "text_loss": 0.340279757976532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.3991781626064, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00027087006600949403, + "loss": 0.0065, + "macro_f1": 0.6603773832321167, + "num_tokens": 11475656.0, + "repeat_count": 1.0, + "routers_loss": 0.017024178057909012, + "skip_count": 1.0, + "step": 7114, + "text_loss": 0.3556337058544159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0002705950059969948, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 11479410.0, + "repeat_count": 0.0, + "routers_loss": 0.015487123280763626, + "skip_count": 1.0, + "step": 7116, + "text_loss": 0.4404350817203522 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.00027032003389292194, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 11483302.0, + "repeat_count": 0.0, + "routers_loss": 0.0011217560386285186, + "skip_count": 0.0, + "step": 7118, + "text_loss": 0.46771445870399475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0002700451498026454, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11486212.0, + "repeat_count": 0.0, + "routers_loss": 0.0010832607513293624, + "skip_count": 0.0, + "step": 7120, + "text_loss": 0.6795281767845154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00026977035383150106, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 11489320.0, + "repeat_count": 0.0, + "routers_loss": 0.002290027216076851, + "skip_count": 1.0, + "step": 7122, + "text_loss": 0.5304523706436157 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 33.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00026949564608479164, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 11492056.0, + "repeat_count": 2.0, + "routers_loss": 0.009950211271643639, + "skip_count": 6.0, + "step": 7124, + "text_loss": 0.21328973770141602 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 33.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0002692210266677855, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 11495165.0, + "repeat_count": 0.0, + "routers_loss": 0.0079165268689394, + "skip_count": 3.0, + "step": 7126, + "text_loss": 0.19840657711029053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00026894649568571724, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11497636.0, + "repeat_count": 0.0, + "routers_loss": 0.0013852717820554972, + "skip_count": 0.0, + "step": 7128, + "text_loss": 0.3360055088996887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00026867205324378776, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 11500806.0, + "repeat_count": 0.0, + "routers_loss": 0.0010151927126571536, + "skip_count": 0.0, + "step": 7130, + "text_loss": 0.6827390193939209 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00026839769944716373, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11504187.0, + "repeat_count": 0.0, + "routers_loss": 0.001110393786802888, + "skip_count": 0.0, + "step": 7132, + "text_loss": 0.5081584453582764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.49310243616085, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0002681234344009783, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 11507900.0, + "repeat_count": 0.0, + "routers_loss": 0.010587670840322971, + "skip_count": 1.0, + "step": 7134, + "text_loss": 0.28684356808662415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00026784925821033014, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 11510627.0, + "repeat_count": 0.0, + "routers_loss": 0.006658690981566906, + "skip_count": 0.0, + "step": 7136, + "text_loss": 0.24232104420661926 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.00026757517098028417, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 11513304.0, + "repeat_count": 0.0, + "routers_loss": 0.0014556109672412276, + "skip_count": 0.0, + "step": 7138, + "text_loss": 0.4718358516693115 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 33.52127971822718, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00026730117281587116, + "loss": 0.0062, + "macro_f1": 0.9265305995941162, + "num_tokens": 11516593.0, + "repeat_count": 1.0, + "routers_loss": 0.01590067707002163, + "skip_count": 3.0, + "step": 7140, + "text_loss": 0.2810344696044922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00026702726382208774, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 11519776.0, + "repeat_count": 0.0, + "routers_loss": 0.0014479428064078093, + "skip_count": 0.0, + "step": 7142, + "text_loss": 0.48876339197158813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00026675344410389623, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11522499.0, + "repeat_count": 0.0, + "routers_loss": 0.003729258431121707, + "skip_count": 2.0, + "step": 7144, + "text_loss": 0.5350890755653381 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0002664797137662248, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 11525220.0, + "repeat_count": 1.0, + "routers_loss": 0.0015156447188928723, + "skip_count": 1.0, + "step": 7146, + "text_loss": 0.5742373466491699 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00026620607291396773, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 11527926.0, + "repeat_count": 2.0, + "routers_loss": 0.004842780064791441, + "skip_count": 2.0, + "step": 7148, + "text_loss": 0.4994547665119171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.00026593252165198455, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 11531622.0, + "repeat_count": 0.0, + "routers_loss": 0.0026556351222097874, + "skip_count": 0.0, + "step": 7150, + "text_loss": 0.1567893922328949 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00026565906008510064, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11535191.0, + "repeat_count": 0.0, + "routers_loss": 0.008135059848427773, + "skip_count": 1.0, + "step": 7152, + "text_loss": 0.289173424243927 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.000265385688318107, + "loss": 0.0083, + "macro_f1": 1.0, + "num_tokens": 11539060.0, + "repeat_count": 1.0, + "routers_loss": 0.0020754633005708456, + "skip_count": 1.0, + "step": 7154, + "text_loss": 0.35089045763015747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0002651124064557602, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11541662.0, + "repeat_count": 1.0, + "routers_loss": 0.0023738413583487272, + "skip_count": 0.0, + "step": 7156, + "text_loss": 0.5026801228523254 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00026483921460278227, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 11544763.0, + "repeat_count": 0.0, + "routers_loss": 0.003311366541311145, + "skip_count": 1.0, + "step": 7158, + "text_loss": 0.22975654900074005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0002645661128638609, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 11547649.0, + "repeat_count": 0.0, + "routers_loss": 0.0008209354127757251, + "skip_count": 0.0, + "step": 7160, + "text_loss": 0.32840636372566223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00026429310134364926, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 11550648.0, + "repeat_count": 0.0, + "routers_loss": 0.0028574815951287746, + "skip_count": 0.0, + "step": 7162, + "text_loss": 0.23239612579345703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.00026402018014676584, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 11553790.0, + "repeat_count": 0.0, + "routers_loss": 0.005469404626637697, + "skip_count": 1.0, + "step": 7164, + "text_loss": 0.22877025604248047 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0002637473493777943, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 11556802.0, + "repeat_count": 1.0, + "routers_loss": 0.0032242932356894016, + "skip_count": 2.0, + "step": 7166, + "text_loss": 0.6376226544380188 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.00026347460914128443, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 11559607.0, + "repeat_count": 1.0, + "routers_loss": 0.0040627880953252316, + "skip_count": 2.0, + "step": 7168, + "text_loss": 0.6879657506942749 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.00026320195954175043, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 11562677.0, + "repeat_count": 2.0, + "routers_loss": 0.020494163036346436, + "skip_count": 4.0, + "step": 7170, + "text_loss": 0.3710069954395294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.00026292940068367224, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11565948.0, + "repeat_count": 0.0, + "routers_loss": 0.002662271959707141, + "skip_count": 0.0, + "step": 7172, + "text_loss": 0.15041157603263855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00026265693267149494, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 11568836.0, + "repeat_count": 0.0, + "routers_loss": 0.0039914860390126705, + "skip_count": 1.0, + "step": 7174, + "text_loss": 0.5372130870819092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.00026238455560962884, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11572542.0, + "repeat_count": 0.0, + "routers_loss": 0.0034708199091255665, + "skip_count": 0.0, + "step": 7176, + "text_loss": 0.2956286072731018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.00026211226960244914, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11575352.0, + "repeat_count": 0.0, + "routers_loss": 0.007794995326548815, + "skip_count": 2.0, + "step": 7178, + "text_loss": 0.3691073954105377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0002618400747542964, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 11579110.0, + "repeat_count": 0.0, + "routers_loss": 0.0009694626205600798, + "skip_count": 0.0, + "step": 7180, + "text_loss": 0.6523211598396301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0002615679711694764, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 11582476.0, + "repeat_count": 0.0, + "routers_loss": 0.004227840341627598, + "skip_count": 1.0, + "step": 7182, + "text_loss": 0.1997286081314087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.00026129595895225965, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 11585685.0, + "repeat_count": 0.0, + "routers_loss": 0.00126146269030869, + "skip_count": 0.0, + "step": 7184, + "text_loss": 0.486299604177475 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 33.73730554740241, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0002610240382068818, + "loss": 0.006, + "macro_f1": 0.8814815282821655, + "num_tokens": 11588804.0, + "repeat_count": 2.0, + "routers_loss": 0.04553814232349396, + "skip_count": 4.0, + "step": 7186, + "text_loss": 0.1622236669063568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.00026075220903754324, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11591822.0, + "repeat_count": 0.0, + "routers_loss": 0.002460496500134468, + "skip_count": 2.0, + "step": 7188, + "text_loss": 0.5573232173919678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0002604804715484095, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 11594899.0, + "repeat_count": 0.0, + "routers_loss": 0.006854622159153223, + "skip_count": 1.0, + "step": 7190, + "text_loss": 0.4753095507621765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00026020882584361094, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 11598333.0, + "repeat_count": 0.0, + "routers_loss": 0.001945660449564457, + "skip_count": 1.0, + "step": 7192, + "text_loss": 0.8912903666496277 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 31.0, + "epoch": 33.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0002599372720272426, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 11601814.0, + "repeat_count": 4.0, + "routers_loss": 0.005749753676354885, + "skip_count": 1.0, + "step": 7194, + "text_loss": 0.6041871905326843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0002596658102033643, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 11604661.0, + "repeat_count": 0.0, + "routers_loss": 0.0025942171923816204, + "skip_count": 1.0, + "step": 7196, + "text_loss": 0.4760607182979584 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 33.793660111535075, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00025939444047600114, + "loss": 0.0075, + "macro_f1": 0.8807588815689087, + "num_tokens": 11608459.0, + "repeat_count": 2.0, + "routers_loss": 0.020141327753663063, + "skip_count": 6.0, + "step": 7198, + "text_loss": 0.6670252084732056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0002591231629491423, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 11611489.0, + "repeat_count": 0.0, + "routers_loss": 0.005721202120184898, + "skip_count": 1.0, + "step": 7200, + "text_loss": 0.31318753957748413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00025885197772674174, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11615234.0, + "repeat_count": 0.0, + "routers_loss": 0.0027279339265078306, + "skip_count": 1.0, + "step": 7202, + "text_loss": 0.25728851556777954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00025858088491271825, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11618892.0, + "repeat_count": 0.0, + "routers_loss": 0.0006987092201597989, + "skip_count": 0.0, + "step": 7204, + "text_loss": 0.5504243969917297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00025830988461095504, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11622237.0, + "repeat_count": 0.0, + "routers_loss": 0.0029056845232844353, + "skip_count": 0.0, + "step": 7206, + "text_loss": 0.5319080948829651 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.0002580389769253001, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 11624713.0, + "repeat_count": 4.0, + "routers_loss": 0.007346974220126867, + "skip_count": 5.0, + "step": 7208, + "text_loss": 0.8925374746322632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0002577681619595655, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11628689.0, + "repeat_count": 0.0, + "routers_loss": 0.0004166684520896524, + "skip_count": 0.0, + "step": 7210, + "text_loss": 0.37282413244247437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.00025749743981752824, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 11631581.0, + "repeat_count": 0.0, + "routers_loss": 0.013194780796766281, + "skip_count": 2.0, + "step": 7212, + "text_loss": 0.220115065574646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0002572268106029295, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 11634503.0, + "repeat_count": 0.0, + "routers_loss": 0.0009112557163462043, + "skip_count": 0.0, + "step": 7214, + "text_loss": 0.5631879568099976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00025695627441947496, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 11637790.0, + "repeat_count": 0.0, + "routers_loss": 0.011178883723914623, + "skip_count": 2.0, + "step": 7216, + "text_loss": 0.24482154846191406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.887584385089525, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00025668583137083447, + "loss": 0.0047, + "macro_f1": 0.32098764181137085, + "num_tokens": 11640806.0, + "repeat_count": 0.0, + "routers_loss": 0.01877705194056034, + "skip_count": 2.0, + "step": 7218, + "text_loss": 0.2229214459657669 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0002564154815606422, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11644479.0, + "repeat_count": 0.0, + "routers_loss": 0.0030277224723249674, + "skip_count": 0.0, + "step": 7220, + "text_loss": 0.6025711894035339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00025614522509249715, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11647340.0, + "repeat_count": 0.0, + "routers_loss": 0.002354414900764823, + "skip_count": 1.0, + "step": 7222, + "text_loss": 0.6497155427932739 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0002558750620699618, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 11650433.0, + "repeat_count": 1.0, + "routers_loss": 0.009801039472222328, + "skip_count": 2.0, + "step": 7224, + "text_loss": 0.32049307227134705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0002556049925965632, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11654451.0, + "repeat_count": 0.0, + "routers_loss": 0.002949854824692011, + "skip_count": 0.0, + "step": 7226, + "text_loss": 0.17923395335674286 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00025533501677579254, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 11657440.0, + "repeat_count": 1.0, + "routers_loss": 0.0032915703486651182, + "skip_count": 1.0, + "step": 7228, + "text_loss": 0.60064297914505 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0002550651347111049, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 11660599.0, + "repeat_count": 1.0, + "routers_loss": 0.00594533933326602, + "skip_count": 1.0, + "step": 7230, + "text_loss": 0.32829397916793823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00025479534650591976, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 11663387.0, + "repeat_count": 0.0, + "routers_loss": 0.0014214308466762304, + "skip_count": 0.0, + "step": 7232, + "text_loss": 0.7317177653312683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00025452565226362036, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11666729.0, + "repeat_count": 0.0, + "routers_loss": 0.0056374757550656796, + "skip_count": 2.0, + "step": 7234, + "text_loss": 0.3394623398780823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 33.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0152587890625, + "learning_rate": 0.00025425605208755406, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11669871.0, + "repeat_count": 0.0, + "routers_loss": 0.006422565318644047, + "skip_count": 3.0, + "step": 7236, + "text_loss": 0.1725512444972992 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0002539865460810322, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 11673008.0, + "repeat_count": 1.0, + "routers_loss": 0.0023537934757769108, + "skip_count": 0.0, + "step": 7238, + "text_loss": 0.8873519897460938 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00025371713434733, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 11675988.0, + "repeat_count": 0.0, + "routers_loss": 0.0026300614699721336, + "skip_count": 1.0, + "step": 7240, + "text_loss": 0.4877084195613861 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 34.0, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.028076171875, + "learning_rate": 0.0002534478169896864, + "loss": 0.0052, + "macro_f1": 0.9265305995941162, + "num_tokens": 11679068.0, + "repeat_count": 1.0, + "routers_loss": 0.019549336284399033, + "skip_count": 3.0, + "step": 7242, + "text_loss": 0.15101417899131775 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0002531785941113044, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11682205.0, + "repeat_count": 0.0, + "routers_loss": 0.007769173942506313, + "skip_count": 1.0, + "step": 7244, + "text_loss": 0.4035153090953827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0002529094658153508, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 11685162.0, + "repeat_count": 0.0, + "routers_loss": 0.003636054927483201, + "skip_count": 0.0, + "step": 7246, + "text_loss": 0.21048080921173096 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.00025264043220495606, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 11688512.0, + "repeat_count": 0.0, + "routers_loss": 0.0013363865436986089, + "skip_count": 0.0, + "step": 7248, + "text_loss": 0.6582038402557373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00025237149338321437, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 11691753.0, + "repeat_count": 0.0, + "routers_loss": 0.0005587349878624082, + "skip_count": 0.0, + "step": 7250, + "text_loss": 0.6899203658103943 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0002521026494531835, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11694689.0, + "repeat_count": 1.0, + "routers_loss": 0.006221035961061716, + "skip_count": 0.0, + "step": 7252, + "text_loss": 0.17377600073814392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.000251833900517885, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 11697950.0, + "repeat_count": 0.0, + "routers_loss": 0.004368607886135578, + "skip_count": 1.0, + "step": 7254, + "text_loss": 0.4147649109363556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.000251565246680304, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11701214.0, + "repeat_count": 0.0, + "routers_loss": 0.0038269520737230778, + "skip_count": 2.0, + "step": 7256, + "text_loss": 0.42076823115348816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00025129668804338906, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11703935.0, + "repeat_count": 0.0, + "routers_loss": 0.0011755652958527207, + "skip_count": 0.0, + "step": 7258, + "text_loss": 0.5484340190887451 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.084531846199, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00025102822471005247, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 11706818.0, + "repeat_count": 1.0, + "routers_loss": 0.00735129788517952, + "skip_count": 2.0, + "step": 7260, + "text_loss": 0.29214802384376526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00025075985678316983, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 11709979.0, + "repeat_count": 1.0, + "routers_loss": 0.0011552777141332626, + "skip_count": 0.0, + "step": 7262, + "text_loss": 0.6514551639556885 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 34.10331670090989, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0002504915843655802, + "loss": 0.0067, + "macro_f1": 0.8814815282821655, + "num_tokens": 11714075.0, + "repeat_count": 2.0, + "routers_loss": 0.01438678614795208, + "skip_count": 4.0, + "step": 7264, + "text_loss": 0.5144859552383423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0002502234075600862, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11717610.0, + "repeat_count": 0.0, + "routers_loss": 0.0027831171173602343, + "skip_count": 0.0, + "step": 7266, + "text_loss": 0.6494308114051819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00024995532646945336, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 11721415.0, + "repeat_count": 0.0, + "routers_loss": 0.0012327058939263225, + "skip_count": 0.0, + "step": 7268, + "text_loss": 0.5111991763114929 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 34.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0002496873411964113, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 11724488.0, + "repeat_count": 2.0, + "routers_loss": 0.003060065908357501, + "skip_count": 1.0, + "step": 7270, + "text_loss": 0.5780492424964905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0002494194518436523, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 11727708.0, + "repeat_count": 0.0, + "routers_loss": 0.001369593315757811, + "skip_count": 0.0, + "step": 7272, + "text_loss": 0.3151950240135193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00024915165851383203, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 11730897.0, + "repeat_count": 0.0, + "routers_loss": 0.005724756047129631, + "skip_count": 0.0, + "step": 7274, + "text_loss": 0.5267965197563171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00024888396130956947, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 11733870.0, + "repeat_count": 1.0, + "routers_loss": 0.010036137886345387, + "skip_count": 0.0, + "step": 7276, + "text_loss": 0.5330777168273926 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00024861636033344657, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11737413.0, + "repeat_count": 0.0, + "routers_loss": 0.008341848850250244, + "skip_count": 2.0, + "step": 7278, + "text_loss": 0.25949522852897644 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0002483488556880087, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 11740691.0, + "repeat_count": 1.0, + "routers_loss": 0.008208763785660267, + "skip_count": 2.0, + "step": 7280, + "text_loss": 0.1867891401052475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.000248081447475764, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 11743715.0, + "repeat_count": 0.0, + "routers_loss": 0.0038434381131082773, + "skip_count": 0.0, + "step": 7282, + "text_loss": 0.4835410416126251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0002478141357991838, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11746818.0, + "repeat_count": 0.0, + "routers_loss": 0.0019067893736064434, + "skip_count": 0.0, + "step": 7284, + "text_loss": 0.5959038734436035 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00024754692076070256, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 11750160.0, + "repeat_count": 0.0, + "routers_loss": 0.007199060171842575, + "skip_count": 0.0, + "step": 7286, + "text_loss": 0.5068115592002869 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0002472798024627175, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11752836.0, + "repeat_count": 0.0, + "routers_loss": 0.0014214382972568274, + "skip_count": 0.0, + "step": 7288, + "text_loss": 0.5742631554603577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0002470127810075889, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11756276.0, + "repeat_count": 0.0, + "routers_loss": 0.0018025166355073452, + "skip_count": 0.0, + "step": 7290, + "text_loss": 0.6616888642311096 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00024674585649763983, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 11760235.0, + "repeat_count": 1.0, + "routers_loss": 0.0024077212437987328, + "skip_count": 0.0, + "step": 7292, + "text_loss": 0.7984768748283386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.00024647902903515614, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 11763430.0, + "repeat_count": 0.0, + "routers_loss": 0.007843999192118645, + "skip_count": 1.0, + "step": 7294, + "text_loss": 0.1943647861480713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0002462122987223869, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 11766583.0, + "repeat_count": 0.0, + "routers_loss": 0.0019727738108485937, + "skip_count": 0.0, + "step": 7296, + "text_loss": 0.43924200534820557 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6000000238418579, + "avg_layers": 27.0, + "epoch": 34.26298796595245, + "f1_execute": 0.9545454382896423, + "f1_repeat": 1.0, + "f1_skip": 0.75, + "grad_norm": 0.041015625, + "learning_rate": 0.0002459456656615436, + "loss": 0.0069, + "macro_f1": 0.9015151858329773, + "num_tokens": 11770360.0, + "repeat_count": 2.0, + "routers_loss": 0.04594529792666435, + "skip_count": 5.0, + "step": 7298, + "text_loss": 0.32582250237464905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0002456791299548004, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 11773239.0, + "repeat_count": 1.0, + "routers_loss": 0.0011880286037921906, + "skip_count": 0.0, + "step": 7300, + "text_loss": 0.7723727226257324 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00024541269170429435, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11776945.0, + "repeat_count": 0.0, + "routers_loss": 0.0010577787179499865, + "skip_count": 0.0, + "step": 7302, + "text_loss": 0.8173839449882507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0002451463510121252, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11780121.0, + "repeat_count": 0.0, + "routers_loss": 0.0019757342524826527, + "skip_count": 0.0, + "step": 7304, + "text_loss": 0.4015064239501953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000244880107980355, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 11783172.0, + "repeat_count": 0.0, + "routers_loss": 0.002577328821644187, + "skip_count": 0.0, + "step": 7306, + "text_loss": 0.5465171933174133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.00024461396271100876, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 11788608.0, + "repeat_count": 0.0, + "routers_loss": 0.004162502940744162, + "skip_count": 0.0, + "step": 7308, + "text_loss": 0.2419646978378296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0002443479153060735, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 11791912.0, + "repeat_count": 0.0, + "routers_loss": 0.003301614662632346, + "skip_count": 0.0, + "step": 7310, + "text_loss": 0.2568489909172058 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00024408196586749964, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11794849.0, + "repeat_count": 0.0, + "routers_loss": 0.0019893983844667673, + "skip_count": 0.0, + "step": 7312, + "text_loss": 0.7044196128845215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0002438161144971992, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11797587.0, + "repeat_count": 0.0, + "routers_loss": 0.006637922488152981, + "skip_count": 1.0, + "step": 7314, + "text_loss": 0.6863232254981995 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.000243550361297047, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11800173.0, + "repeat_count": 0.0, + "routers_loss": 0.003078785724937916, + "skip_count": 2.0, + "step": 7316, + "text_loss": 0.2868897616863251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00024328470636888005, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11802889.0, + "repeat_count": 0.0, + "routers_loss": 0.0011882453691214323, + "skip_count": 0.0, + "step": 7318, + "text_loss": 0.5522798299789429 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0002430191498144979, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11805607.0, + "repeat_count": 0.0, + "routers_loss": 0.0008720619371160865, + "skip_count": 0.0, + "step": 7320, + "text_loss": 0.5531370639801025 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00024275369173566236, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11808838.0, + "repeat_count": 1.0, + "routers_loss": 0.003213440766558051, + "skip_count": 0.0, + "step": 7322, + "text_loss": 0.5252627730369568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.00024248833223409715, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 11811965.0, + "repeat_count": 0.0, + "routers_loss": 0.004736232105642557, + "skip_count": 1.0, + "step": 7324, + "text_loss": 0.6033701300621033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.00024222307141148907, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11814832.0, + "repeat_count": 0.0, + "routers_loss": 0.0007559265359304845, + "skip_count": 0.0, + "step": 7326, + "text_loss": 0.5607737302780151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.00024195790936948626, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11818802.0, + "repeat_count": 0.0, + "routers_loss": 0.005338212475180626, + "skip_count": 2.0, + "step": 7328, + "text_loss": 0.20618735253810883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 34.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0002416928462096994, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 11821998.0, + "repeat_count": 0.0, + "routers_loss": 0.001919696107506752, + "skip_count": 3.0, + "step": 7330, + "text_loss": 0.42486369609832764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00024142788203370107, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 11824505.0, + "repeat_count": 0.0, + "routers_loss": 0.0013797834981232882, + "skip_count": 0.0, + "step": 7332, + "text_loss": 0.48403388261795044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.43205165835045, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00024116301694302621, + "loss": 0.0053, + "macro_f1": 0.3272727429866791, + "num_tokens": 11828504.0, + "repeat_count": 0.0, + "routers_loss": 0.008978237397968769, + "skip_count": 1.0, + "step": 7334, + "text_loss": 0.43872755765914917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00024089825103917152, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 11831171.0, + "repeat_count": 0.0, + "routers_loss": 0.004589964635670185, + "skip_count": 1.0, + "step": 7336, + "text_loss": 0.5126842260360718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00024063358442359572, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11834387.0, + "repeat_count": 0.0, + "routers_loss": 0.002857893006876111, + "skip_count": 0.0, + "step": 7338, + "text_loss": 0.7521272301673889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0002403690171977197, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 11838693.0, + "repeat_count": 0.0, + "routers_loss": 0.0009023012826219201, + "skip_count": 0.0, + "step": 7340, + "text_loss": 0.6335242390632629 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00024010454946292586, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11841882.0, + "repeat_count": 1.0, + "routers_loss": 0.010992717929184437, + "skip_count": 0.0, + "step": 7342, + "text_loss": 0.64045649766922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0002398401813205592, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11845181.0, + "repeat_count": 0.0, + "routers_loss": 0.002247930970042944, + "skip_count": 2.0, + "step": 7344, + "text_loss": 0.31022098660469055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00023957591287192577, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 11848537.0, + "repeat_count": 0.0, + "routers_loss": 0.003184020286425948, + "skip_count": 2.0, + "step": 7346, + "text_loss": 0.5709269642829895 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.00023931174421829376, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 11851437.0, + "repeat_count": 2.0, + "routers_loss": 0.006582654081285, + "skip_count": 4.0, + "step": 7348, + "text_loss": 0.3547070026397705 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.00023904767546089318, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 11854161.0, + "repeat_count": 1.0, + "routers_loss": 0.0022124287206679583, + "skip_count": 0.0, + "step": 7350, + "text_loss": 0.6984702348709106 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00023878370670091565, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 11856811.0, + "repeat_count": 1.0, + "routers_loss": 0.0029868825804442167, + "skip_count": 0.0, + "step": 7352, + "text_loss": 0.25389090180397034 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.00023851983803951444, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 11860110.0, + "repeat_count": 0.0, + "routers_loss": 0.0028468978125602007, + "skip_count": 1.0, + "step": 7354, + "text_loss": 0.5729252099990845 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00023825606957780454, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 11863058.0, + "repeat_count": 1.0, + "routers_loss": 0.003115740604698658, + "skip_count": 2.0, + "step": 7356, + "text_loss": 0.60753333568573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00023799240141686258, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 11865865.0, + "repeat_count": 0.0, + "routers_loss": 0.0022254586219787598, + "skip_count": 0.0, + "step": 7358, + "text_loss": 0.2568866014480591 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00023772883365772658, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11869133.0, + "repeat_count": 0.0, + "routers_loss": 0.0017388637643307447, + "skip_count": 0.0, + "step": 7360, + "text_loss": 0.7657097578048706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00023746536640139633, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11872988.0, + "repeat_count": 0.0, + "routers_loss": 0.002158832037821412, + "skip_count": 0.0, + "step": 7362, + "text_loss": 0.19717472791671753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00023720199974883294, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11875810.0, + "repeat_count": 0.0, + "routers_loss": 0.001037398586049676, + "skip_count": 0.0, + "step": 7364, + "text_loss": 0.47334593534469604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 34.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00023693873380095876, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 11878558.0, + "repeat_count": 0.0, + "routers_loss": 0.011853457428514957, + "skip_count": 5.0, + "step": 7366, + "text_loss": 0.2567826211452484 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.00023667556865865824, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 11881473.0, + "repeat_count": 1.0, + "routers_loss": 0.0015339091187343001, + "skip_count": 0.0, + "step": 7368, + "text_loss": 0.40981143712997437 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00023641250442277655, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 11885033.0, + "repeat_count": 1.0, + "routers_loss": 0.010062574408948421, + "skip_count": 0.0, + "step": 7370, + "text_loss": 0.3183043301105499 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.00023614954119412042, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11889136.0, + "repeat_count": 0.0, + "routers_loss": 0.0010769609361886978, + "skip_count": 0.0, + "step": 7372, + "text_loss": 0.5279555916786194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 34.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.00023588667907345785, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11893102.0, + "repeat_count": 0.0, + "routers_loss": 0.0032862431835383177, + "skip_count": 3.0, + "step": 7374, + "text_loss": 0.5425930023193359 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 34.629292632814796, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0341796875, + "learning_rate": 0.00023562391816151808, + "loss": 0.0057, + "macro_f1": 0.5934640765190125, + "num_tokens": 11895841.0, + "repeat_count": 0.0, + "routers_loss": 0.02405562624335289, + "skip_count": 3.0, + "step": 7376, + "text_loss": 0.26054954528808594 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00023536125855899153, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 11899594.0, + "repeat_count": 1.0, + "routers_loss": 0.008315852843225002, + "skip_count": 3.0, + "step": 7378, + "text_loss": 0.19068174064159393 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 34.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00023509870036652998, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 11902843.0, + "repeat_count": 1.0, + "routers_loss": 0.006180883850902319, + "skip_count": 4.0, + "step": 7380, + "text_loss": 0.18461982905864716 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00023483624368474614, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 11905786.0, + "repeat_count": 0.0, + "routers_loss": 0.0008856299100443721, + "skip_count": 0.0, + "step": 7382, + "text_loss": 0.5216618180274963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.66686234223657, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00023457388861421397, + "loss": 0.0059, + "macro_f1": 0.32098764181137085, + "num_tokens": 11908706.0, + "repeat_count": 1.0, + "routers_loss": 0.04762765392661095, + "skip_count": 1.0, + "step": 7384, + "text_loss": 0.25329193472862244 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 34.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.00023431163525546833, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 11911862.0, + "repeat_count": 1.0, + "routers_loss": 0.000989250373095274, + "skip_count": 1.0, + "step": 7386, + "text_loss": 0.2657507658004761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.0002340494837090053, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 11915483.0, + "repeat_count": 0.0, + "routers_loss": 0.0008857969660311937, + "skip_count": 0.0, + "step": 7388, + "text_loss": 0.5136669874191284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00023378743407528164, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 11918778.0, + "repeat_count": 0.0, + "routers_loss": 0.0041572838090360165, + "skip_count": 1.0, + "step": 7390, + "text_loss": 0.5212553143501282 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00023352548645471556, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11921916.0, + "repeat_count": 0.0, + "routers_loss": 0.0010537431808188558, + "skip_count": 0.0, + "step": 7392, + "text_loss": 0.48122525215148926 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00023326364094768576, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11924273.0, + "repeat_count": 1.0, + "routers_loss": 0.004077036865055561, + "skip_count": 0.0, + "step": 7394, + "text_loss": 0.2128690630197525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00023300189765453194, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11927424.0, + "repeat_count": 0.0, + "routers_loss": 0.005371362902224064, + "skip_count": 2.0, + "step": 7396, + "text_loss": 0.19448284804821014 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00023274025667555464, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11930919.0, + "repeat_count": 0.0, + "routers_loss": 0.002137752715498209, + "skip_count": 0.0, + "step": 7398, + "text_loss": 0.7537064552307129 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.00023247871811101512, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11933680.0, + "repeat_count": 0.0, + "routers_loss": 0.0002398790093138814, + "skip_count": 0.0, + "step": 7400, + "text_loss": 0.5589297413825989 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.751394188435576, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00023221728206113546, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 11937090.0, + "repeat_count": 0.0, + "routers_loss": 0.019718777388334274, + "skip_count": 1.0, + "step": 7402, + "text_loss": 0.8014751672744751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0002319559486260985, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11940581.0, + "repeat_count": 0.0, + "routers_loss": 0.001230534864589572, + "skip_count": 0.0, + "step": 7404, + "text_loss": 0.5218383073806763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0002316947179060477, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11943832.0, + "repeat_count": 0.0, + "routers_loss": 0.0016393321566283703, + "skip_count": 0.0, + "step": 7406, + "text_loss": 0.17122556269168854 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.00023143359000108704, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11947025.0, + "repeat_count": 0.0, + "routers_loss": 0.005269679240882397, + "skip_count": 2.0, + "step": 7408, + "text_loss": 0.2015499323606491 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00023117256501128136, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 11950077.0, + "repeat_count": 1.0, + "routers_loss": 0.005140089895576239, + "skip_count": 2.0, + "step": 7410, + "text_loss": 0.39068636298179626 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00023091164303665592, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11953800.0, + "repeat_count": 0.0, + "routers_loss": 0.005578748416155577, + "skip_count": 0.0, + "step": 7412, + "text_loss": 0.18851874768733978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.00023065082417719624, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 11956383.0, + "repeat_count": 0.0, + "routers_loss": 0.0006410991190932691, + "skip_count": 0.0, + "step": 7414, + "text_loss": 0.5663703083992004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 34.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0002303901085328491, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11959554.0, + "repeat_count": 0.0, + "routers_loss": 0.0005902954144403338, + "skip_count": 5.0, + "step": 7416, + "text_loss": 0.5225661993026733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0002301294962035209, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 11962582.0, + "repeat_count": 0.0, + "routers_loss": 0.00045644037891179323, + "skip_count": 0.0, + "step": 7418, + "text_loss": 0.40572360157966614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0002298689872890789, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 11965649.0, + "repeat_count": 0.0, + "routers_loss": 0.01017778366804123, + "skip_count": 2.0, + "step": 7420, + "text_loss": 0.12190715968608856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00022960858188935052, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 11968850.0, + "repeat_count": 0.0, + "routers_loss": 0.0008010792662389576, + "skip_count": 0.0, + "step": 7422, + "text_loss": 0.5606820583343506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0002293482801041236, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 11972064.0, + "repeat_count": 0.0, + "routers_loss": 0.001889281440526247, + "skip_count": 0.0, + "step": 7424, + "text_loss": 0.44142210483551025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00022908808203314635, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 11975466.0, + "repeat_count": 0.0, + "routers_loss": 0.00647713290527463, + "skip_count": 2.0, + "step": 7426, + "text_loss": 0.23273423314094543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0002288279877761271, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 11979875.0, + "repeat_count": 0.0, + "routers_loss": 0.004027119372040033, + "skip_count": 0.0, + "step": 7428, + "text_loss": 0.5608086585998535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0002285679974327345, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11982808.0, + "repeat_count": 0.0, + "routers_loss": 0.0009015435934998095, + "skip_count": 0.0, + "step": 7430, + "text_loss": 0.3976539373397827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0002283081111025973, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 11985978.0, + "repeat_count": 0.0, + "routers_loss": 0.00047143330448307097, + "skip_count": 0.0, + "step": 7432, + "text_loss": 0.4280148446559906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00022804832888530447, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11988925.0, + "repeat_count": 0.0, + "routers_loss": 0.0004895820748060942, + "skip_count": 0.0, + "step": 7434, + "text_loss": 0.5137463808059692 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000227788650880405, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11991631.0, + "repeat_count": 0.0, + "routers_loss": 0.0008349024574272335, + "skip_count": 0.0, + "step": 7436, + "text_loss": 0.4306720197200775 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00022752907718740807, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 11995476.0, + "repeat_count": 0.0, + "routers_loss": 0.0038723985198885202, + "skip_count": 0.0, + "step": 7438, + "text_loss": 0.6413722038269043 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.00022726960790578248, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 11998846.0, + "repeat_count": 1.0, + "routers_loss": 0.004433541093021631, + "skip_count": 0.0, + "step": 7440, + "text_loss": 0.6424159407615662 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 23.0, + "epoch": 34.93924273554447, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.036376953125, + "learning_rate": 0.0002270102431349579, + "loss": 0.0062, + "macro_f1": 0.6289562582969666, + "num_tokens": 12002228.0, + "repeat_count": 0.0, + "routers_loss": 0.023979803547263145, + "skip_count": 6.0, + "step": 7442, + "text_loss": 0.16657918691635132 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 34.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00022675098297432307, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 12005003.0, + "repeat_count": 3.0, + "routers_loss": 0.005645833443850279, + "skip_count": 1.0, + "step": 7444, + "text_loss": 0.6388722658157349 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00022649182752322705, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 12007657.0, + "repeat_count": 0.0, + "routers_loss": 0.001629356062039733, + "skip_count": 2.0, + "step": 7446, + "text_loss": 0.35670006275177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00022623277688097864, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 12010652.0, + "repeat_count": 0.0, + "routers_loss": 0.006375396624207497, + "skip_count": 2.0, + "step": 7448, + "text_loss": 0.24273613095283508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0002259738311468466, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 12014042.0, + "repeat_count": 0.0, + "routers_loss": 0.003734540194272995, + "skip_count": 0.0, + "step": 7450, + "text_loss": 0.4262580871582031 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 34.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0002257149904200592, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 12016987.0, + "repeat_count": 1.0, + "routers_loss": 0.0027926203329116106, + "skip_count": 1.0, + "step": 7452, + "text_loss": 0.366216778755188 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.00022545625479980508, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 12021584.0, + "repeat_count": 0.0, + "routers_loss": 0.0008985420572571456, + "skip_count": 0.0, + "step": 7454, + "text_loss": 0.533937394618988 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00022519762438523205, + "loss": 0.0029, + "macro_f1": 0.6666666865348816, + "num_tokens": 12024142.0, + "repeat_count": 0.0, + "routers_loss": 0.005394646432250738, + "skip_count": 1.0, + "step": 7456, + "text_loss": 0.2401239275932312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0002249390992754477, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 12027262.0, + "repeat_count": 0.0, + "routers_loss": 0.00275063537992537, + "skip_count": 0.0, + "step": 7458, + "text_loss": 0.21824975311756134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00022468067956951944, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 12030528.0, + "repeat_count": 0.0, + "routers_loss": 0.0008951274212449789, + "skip_count": 1.0, + "step": 7460, + "text_loss": 0.610903263092041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00022442236536647408, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 12033699.0, + "repeat_count": 0.0, + "routers_loss": 0.004062872380018234, + "skip_count": 2.0, + "step": 7462, + "text_loss": 0.26921433210372925 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00022416415676529823, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 12037402.0, + "repeat_count": 0.0, + "routers_loss": 0.0023089025635272264, + "skip_count": 1.0, + "step": 7464, + "text_loss": 0.4746153950691223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00022390605386493756, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12041129.0, + "repeat_count": 0.0, + "routers_loss": 0.0021355501376092434, + "skip_count": 2.0, + "step": 7466, + "text_loss": 0.4265538454055786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00022364805676429816, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 12044356.0, + "repeat_count": 0.0, + "routers_loss": 0.0061582159250974655, + "skip_count": 1.0, + "step": 7468, + "text_loss": 0.12020833045244217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00022339016556224467, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 12047158.0, + "repeat_count": 0.0, + "routers_loss": 0.003753372235223651, + "skip_count": 1.0, + "step": 7470, + "text_loss": 0.6406939625740051 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 35.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00022313238035760158, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 12050149.0, + "repeat_count": 1.0, + "routers_loss": 0.005371729377657175, + "skip_count": 5.0, + "step": 7472, + "text_loss": 0.5184400677680969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0002228747012491526, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12053560.0, + "repeat_count": 0.0, + "routers_loss": 0.000824139395263046, + "skip_count": 0.0, + "step": 7474, + "text_loss": 0.32644152641296387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0002226171283356409, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12056309.0, + "repeat_count": 0.0, + "routers_loss": 0.0044801668263971806, + "skip_count": 1.0, + "step": 7476, + "text_loss": 0.7027081847190857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.00022235966171576887, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 12059191.0, + "repeat_count": 0.0, + "routers_loss": 0.007496353704482317, + "skip_count": 2.0, + "step": 7478, + "text_loss": 0.28705671429634094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0002221023014881982, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 12062365.0, + "repeat_count": 0.0, + "routers_loss": 0.0018641395727172494, + "skip_count": 1.0, + "step": 7480, + "text_loss": 0.715477466583252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.00022184504775154984, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12065508.0, + "repeat_count": 0.0, + "routers_loss": 0.0005825075786560774, + "skip_count": 0.0, + "step": 7482, + "text_loss": 0.7481293678283691 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00022158790060440394, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 12068043.0, + "repeat_count": 0.0, + "routers_loss": 0.0028906071092933416, + "skip_count": 0.0, + "step": 7484, + "text_loss": 0.6151962876319885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00022133086014529968, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 12070897.0, + "repeat_count": 0.0, + "routers_loss": 0.0030862605199217796, + "skip_count": 1.0, + "step": 7486, + "text_loss": 0.4923575222492218 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00022107392647273527, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 12074644.0, + "repeat_count": 0.0, + "routers_loss": 0.0011101154377683997, + "skip_count": 0.0, + "step": 7488, + "text_loss": 0.5217859148979187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00022081709968516867, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 12077718.0, + "repeat_count": 0.0, + "routers_loss": 0.004303969442844391, + "skip_count": 0.0, + "step": 7490, + "text_loss": 0.18933317065238953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00022056037988101612, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12080509.0, + "repeat_count": 0.0, + "routers_loss": 0.0019941304344683886, + "skip_count": 1.0, + "step": 7492, + "text_loss": 0.6760565042495728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.00022030376715865313, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 12083580.0, + "repeat_count": 0.0, + "routers_loss": 0.0017090907786041498, + "skip_count": 0.0, + "step": 7494, + "text_loss": 0.4140956401824951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0002200472616164142, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12086923.0, + "repeat_count": 0.0, + "routers_loss": 0.005131757352501154, + "skip_count": 1.0, + "step": 7496, + "text_loss": 0.43287888169288635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00021979086335259269, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 12090003.0, + "repeat_count": 0.0, + "routers_loss": 0.0007472267607226968, + "skip_count": 0.0, + "step": 7498, + "text_loss": 0.6692602038383484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00021953457246544095, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 12092936.0, + "repeat_count": 0.0, + "routers_loss": 0.0012374494690448046, + "skip_count": 0.0, + "step": 7500, + "text_loss": 0.5170100331306458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00021927838905317016, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 12096395.0, + "repeat_count": 0.0, + "routers_loss": 0.006784295197576284, + "skip_count": 2.0, + "step": 7502, + "text_loss": 0.340880811214447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00021902231321395017, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 12099743.0, + "repeat_count": 0.0, + "routers_loss": 0.0058755455538630486, + "skip_count": 1.0, + "step": 7504, + "text_loss": 0.5299809575080872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00021876634504590985, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12103121.0, + "repeat_count": 0.0, + "routers_loss": 0.010622406378388405, + "skip_count": 2.0, + "step": 7506, + "text_loss": 0.1817338913679123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 35.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00021851048464713662, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 12105883.0, + "repeat_count": 0.0, + "routers_loss": 0.004382388666272163, + "skip_count": 3.0, + "step": 7508, + "text_loss": 0.5718557834625244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00021825473211567665, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 12108936.0, + "repeat_count": 0.0, + "routers_loss": 0.001638208981603384, + "skip_count": 0.0, + "step": 7510, + "text_loss": 0.4684678316116333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00021799908754953468, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 12112060.0, + "repeat_count": 0.0, + "routers_loss": 0.0007894381997175515, + "skip_count": 2.0, + "step": 7512, + "text_loss": 0.5146099328994751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00021774355104667455, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 12115636.0, + "repeat_count": 0.0, + "routers_loss": 0.01400370616465807, + "skip_count": 2.0, + "step": 7514, + "text_loss": 0.19512294232845306 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 35.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00021748812270501805, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 12119116.0, + "repeat_count": 0.0, + "routers_loss": 0.005261222366243601, + "skip_count": 3.0, + "step": 7516, + "text_loss": 0.17316904664039612 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.0002172328026224459, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12122070.0, + "repeat_count": 0.0, + "routers_loss": 0.01021486520767212, + "skip_count": 2.0, + "step": 7518, + "text_loss": 0.2777172029018402 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00021697759089679713, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 12125386.0, + "repeat_count": 2.0, + "routers_loss": 0.005217147525399923, + "skip_count": 2.0, + "step": 7520, + "text_loss": 0.49744322896003723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00021672248762586948, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 12128753.0, + "repeat_count": 0.0, + "routers_loss": 0.003868246916681528, + "skip_count": 0.0, + "step": 7522, + "text_loss": 0.4209211468696594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 35.32403874376284, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00021646749290741895, + "loss": 0.009, + "macro_f1": 0.6598639488220215, + "num_tokens": 12132425.0, + "repeat_count": 1.0, + "routers_loss": 0.044205982238054276, + "skip_count": 3.0, + "step": 7524, + "text_loss": 0.4180344343185425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00021621260683916005, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 12135740.0, + "repeat_count": 0.0, + "routers_loss": 0.0032584366854280233, + "skip_count": 2.0, + "step": 7526, + "text_loss": 0.21219655871391296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.00021595782951876552, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 12139239.0, + "repeat_count": 0.0, + "routers_loss": 0.002418758114799857, + "skip_count": 2.0, + "step": 7528, + "text_loss": 0.40800613164901733 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0002157031610438665, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 12142572.0, + "repeat_count": 1.0, + "routers_loss": 0.005265383515506983, + "skip_count": 1.0, + "step": 7530, + "text_loss": 0.7539705634117126 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0002154486015120525, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 12145737.0, + "repeat_count": 1.0, + "routers_loss": 0.006648020353168249, + "skip_count": 2.0, + "step": 7532, + "text_loss": 0.7824432253837585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.371000880540066, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0002151941510208712, + "loss": 0.0049, + "macro_f1": 0.3272727429866791, + "num_tokens": 12149376.0, + "repeat_count": 1.0, + "routers_loss": 0.01692759431898594, + "skip_count": 0.0, + "step": 7534, + "text_loss": 0.4476291239261627 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0002149398096678283, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12152191.0, + "repeat_count": 1.0, + "routers_loss": 0.013883143663406372, + "skip_count": 0.0, + "step": 7536, + "text_loss": 0.14996720850467682 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.058837890625, + "learning_rate": 0.00021468557755038826, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 12155084.0, + "repeat_count": 2.0, + "routers_loss": 0.009390740655362606, + "skip_count": 2.0, + "step": 7538, + "text_loss": 0.23685340583324432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0002144314547659731, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 12159366.0, + "repeat_count": 0.0, + "routers_loss": 0.0025363171007484198, + "skip_count": 0.0, + "step": 7540, + "text_loss": 0.6687407493591309 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.00021417744141196315, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12162545.0, + "repeat_count": 0.0, + "routers_loss": 0.004230613354593515, + "skip_count": 1.0, + "step": 7542, + "text_loss": 0.24885894358158112 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.01953125, + "learning_rate": 0.00021392353758569694, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 12165381.0, + "repeat_count": 1.0, + "routers_loss": 0.008058524690568447, + "skip_count": 0.0, + "step": 7544, + "text_loss": 0.15833988785743713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0002136697433844707, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 12168304.0, + "repeat_count": 0.0, + "routers_loss": 0.0018041770672425628, + "skip_count": 0.0, + "step": 7546, + "text_loss": 0.6046217083930969 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.00021341605890553894, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 12171040.0, + "repeat_count": 1.0, + "routers_loss": 0.008584463968873024, + "skip_count": 2.0, + "step": 7548, + "text_loss": 0.3001522719860077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.00021316248424611408, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 12174702.0, + "repeat_count": 0.0, + "routers_loss": 0.0010506469989195466, + "skip_count": 0.0, + "step": 7550, + "text_loss": 0.2998376488685608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0162353515625, + "learning_rate": 0.00021290901950336627, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 12178388.0, + "repeat_count": 0.0, + "routers_loss": 0.0012753128539770842, + "skip_count": 0.0, + "step": 7552, + "text_loss": 0.8125656843185425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.00021265566477442384, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 12181863.0, + "repeat_count": 0.0, + "routers_loss": 0.004343052394688129, + "skip_count": 2.0, + "step": 7554, + "text_loss": 0.14004671573638916 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00021240242015637268, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12185485.0, + "repeat_count": 1.0, + "routers_loss": 0.0005794052849523723, + "skip_count": 0.0, + "step": 7556, + "text_loss": 0.7116519808769226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.4837100088054, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.00021214928574625664, + "loss": 0.0063, + "macro_f1": 0.3272727429866791, + "num_tokens": 12188914.0, + "repeat_count": 1.0, + "routers_loss": 0.01066325418651104, + "skip_count": 0.0, + "step": 7558, + "text_loss": 0.4664429724216461 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.49310243616085, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00021189626164107718, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 12193042.0, + "repeat_count": 0.0, + "routers_loss": 0.0011769415577873588, + "skip_count": 0.0, + "step": 7560, + "text_loss": 0.672637403011322 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.00021164334793779388, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 12195675.0, + "repeat_count": 1.0, + "routers_loss": 0.008653911761939526, + "skip_count": 1.0, + "step": 7562, + "text_loss": 0.5301182866096497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00021139054473332357, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 12198638.0, + "repeat_count": 0.0, + "routers_loss": 0.0058176578022539616, + "skip_count": 0.0, + "step": 7564, + "text_loss": 0.1889677792787552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.000211137852124541, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 12202312.0, + "repeat_count": 0.0, + "routers_loss": 0.0004154018242843449, + "skip_count": 0.0, + "step": 7566, + "text_loss": 0.3610386848449707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00021088527020827848, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 12205112.0, + "repeat_count": 0.0, + "routers_loss": 0.0014722816413268447, + "skip_count": 0.0, + "step": 7568, + "text_loss": 0.15214823186397552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.0002106327990813257, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 12208103.0, + "repeat_count": 0.0, + "routers_loss": 0.0015596678713336587, + "skip_count": 0.0, + "step": 7570, + "text_loss": 0.5034125447273254 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.00021038043884043022, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12211208.0, + "repeat_count": 1.0, + "routers_loss": 0.007482443004846573, + "skip_count": 0.0, + "step": 7572, + "text_loss": 0.6760116219520569 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00021012818958229696, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 12214463.0, + "repeat_count": 0.0, + "routers_loss": 0.003875598544254899, + "skip_count": 2.0, + "step": 7574, + "text_loss": 0.3278147876262665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.00020987605140358824, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12218199.0, + "repeat_count": 0.0, + "routers_loss": 0.007918627932667732, + "skip_count": 2.0, + "step": 7576, + "text_loss": 0.23850615322589874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.00020962402440092388, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12221151.0, + "repeat_count": 0.0, + "routers_loss": 0.005424308590590954, + "skip_count": 1.0, + "step": 7578, + "text_loss": 0.5670642256736755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0002093721086708812, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 12224789.0, + "repeat_count": 1.0, + "routers_loss": 0.0066504343412816525, + "skip_count": 1.0, + "step": 7580, + "text_loss": 0.30404478311538696 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00020912030430999452, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 12228134.0, + "repeat_count": 1.0, + "routers_loss": 0.008815597742795944, + "skip_count": 0.0, + "step": 7582, + "text_loss": 0.32522889971733093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 35.60581156442618, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05126953125, + "learning_rate": 0.0002088686114147561, + "loss": 0.0098, + "macro_f1": 0.5492662787437439, + "num_tokens": 12231335.0, + "repeat_count": 0.0, + "routers_loss": 0.03785836696624756, + "skip_count": 2.0, + "step": 7584, + "text_loss": 0.6277920603752136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00020861703008161504, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 12234619.0, + "repeat_count": 0.0, + "routers_loss": 0.0016183801926672459, + "skip_count": 0.0, + "step": 7586, + "text_loss": 0.38319316506385803 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.00020836556040697767, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 12237296.0, + "repeat_count": 1.0, + "routers_loss": 0.013077575713396072, + "skip_count": 1.0, + "step": 7588, + "text_loss": 0.297571063041687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00020811420248720769, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 12240633.0, + "repeat_count": 0.0, + "routers_loss": 0.002858756808564067, + "skip_count": 0.0, + "step": 7590, + "text_loss": 0.2506035268306732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.000207862956418626, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12244118.0, + "repeat_count": 0.0, + "routers_loss": 0.0032624071463942528, + "skip_count": 1.0, + "step": 7592, + "text_loss": 0.19843827188014984 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056640625, + "learning_rate": 0.00020761182229751045, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 12247367.0, + "repeat_count": 1.0, + "routers_loss": 0.005885142367333174, + "skip_count": 3.0, + "step": 7594, + "text_loss": 0.3347153067588806 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 35.66216612855885, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00020736080022009602, + "loss": 0.0088, + "macro_f1": 0.9452888369560242, + "num_tokens": 12250487.0, + "repeat_count": 1.0, + "routers_loss": 0.021491389721632004, + "skip_count": 4.0, + "step": 7596, + "text_loss": 0.6777212619781494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 35.671558555914295, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.00020710989028257514, + "loss": 0.0061, + "macro_f1": 0.6595745086669922, + "num_tokens": 12253834.0, + "repeat_count": 1.0, + "routers_loss": 0.014164486899971962, + "skip_count": 4.0, + "step": 7598, + "text_loss": 0.741127610206604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0002068590925810968, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 12257289.0, + "repeat_count": 0.0, + "routers_loss": 0.0012773120542988181, + "skip_count": 0.0, + "step": 7600, + "text_loss": 0.5336982607841492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0002066084072117672, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 12260825.0, + "repeat_count": 0.0, + "routers_loss": 0.013102042488753796, + "skip_count": 2.0, + "step": 7602, + "text_loss": 0.30410775542259216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00020635783427064942, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12264609.0, + "repeat_count": 0.0, + "routers_loss": 0.002602101070806384, + "skip_count": 0.0, + "step": 7604, + "text_loss": 0.29835572838783264 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00020610737385376348, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12267537.0, + "repeat_count": 0.0, + "routers_loss": 0.0053265830501914024, + "skip_count": 0.0, + "step": 7606, + "text_loss": 0.2095658779144287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00020585702605708628, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 12271175.0, + "repeat_count": 0.0, + "routers_loss": 0.000614096992649138, + "skip_count": 0.0, + "step": 7608, + "text_loss": 0.8146751523017883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00020560679097655137, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12274067.0, + "repeat_count": 0.0, + "routers_loss": 0.0013201923575252295, + "skip_count": 0.0, + "step": 7610, + "text_loss": 0.40818271040916443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0002053566687080497, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 12276946.0, + "repeat_count": 0.0, + "routers_loss": 0.004304401110857725, + "skip_count": 1.0, + "step": 7612, + "text_loss": 0.7063660025596619 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0002051066593474284, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 12279760.0, + "repeat_count": 0.0, + "routers_loss": 0.0032060579396784306, + "skip_count": 1.0, + "step": 7614, + "text_loss": 0.23671887814998627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00020485676299049154, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 12282737.0, + "repeat_count": 0.0, + "routers_loss": 0.005103024188429117, + "skip_count": 2.0, + "step": 7616, + "text_loss": 0.17571020126342773 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00020460697973299986, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 12286290.0, + "repeat_count": 1.0, + "routers_loss": 0.007189507596194744, + "skip_count": 1.0, + "step": 7618, + "text_loss": 0.30872994661331177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0002043573096706708, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 12289458.0, + "repeat_count": 0.0, + "routers_loss": 0.0010217712260782719, + "skip_count": 0.0, + "step": 7620, + "text_loss": 0.5155487060546875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0002041077528991784, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 12292846.0, + "repeat_count": 0.0, + "routers_loss": 0.0022399788722395897, + "skip_count": 1.0, + "step": 7622, + "text_loss": 0.717949390411377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0002038583095141532, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 12295673.0, + "repeat_count": 0.0, + "routers_loss": 0.0018168877577409148, + "skip_count": 0.0, + "step": 7624, + "text_loss": 0.560361385345459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.00020360897961118246, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 12298624.0, + "repeat_count": 0.0, + "routers_loss": 0.0008487844606861472, + "skip_count": 0.0, + "step": 7626, + "text_loss": 0.6391524076461792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00020335976328580984, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 12302136.0, + "repeat_count": 0.0, + "routers_loss": 0.0006127831293269992, + "skip_count": 0.0, + "step": 7628, + "text_loss": 0.5932226777076721 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.00020311066063353556, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 12305152.0, + "repeat_count": 0.0, + "routers_loss": 0.0018765819258987904, + "skip_count": 0.0, + "step": 7630, + "text_loss": 0.37831631302833557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00020286167174981618, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 12307771.0, + "repeat_count": 0.0, + "routers_loss": 0.0025384656619280577, + "skip_count": 0.0, + "step": 7632, + "text_loss": 0.34806445240974426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0002026127967300645, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12310921.0, + "repeat_count": 0.0, + "routers_loss": 0.008239032700657845, + "skip_count": 2.0, + "step": 7634, + "text_loss": 0.34859901666641235 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00020236403566965027, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12314200.0, + "repeat_count": 0.0, + "routers_loss": 0.0029505928978323936, + "skip_count": 2.0, + "step": 7636, + "text_loss": 0.2647531032562256 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0002021153886638991, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 12319221.0, + "repeat_count": 1.0, + "routers_loss": 0.0014016951899975538, + "skip_count": 0.0, + "step": 7638, + "text_loss": 0.42428603768348694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 35.86879953037863, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04248046875, + "learning_rate": 0.00020186685580809288, + "loss": 0.0059, + "macro_f1": 0.5492662787437439, + "num_tokens": 12322204.0, + "repeat_count": 0.0, + "routers_loss": 0.01761031709611416, + "skip_count": 2.0, + "step": 7640, + "text_loss": 0.25929757952690125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.00020161843719746997, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 12324750.0, + "repeat_count": 0.0, + "routers_loss": 0.0023674629628658295, + "skip_count": 0.0, + "step": 7642, + "text_loss": 0.567159116268158 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0002013701329272248, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 12327933.0, + "repeat_count": 0.0, + "routers_loss": 0.004534341394901276, + "skip_count": 0.0, + "step": 7644, + "text_loss": 0.4765215516090393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00020112194309250797, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12330847.0, + "repeat_count": 0.0, + "routers_loss": 0.003144246758893132, + "skip_count": 2.0, + "step": 7646, + "text_loss": 0.39837369322776794 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.00020087386778842642, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 12333782.0, + "repeat_count": 1.0, + "routers_loss": 0.008137194439768791, + "skip_count": 1.0, + "step": 7648, + "text_loss": 0.42175763845443726 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00020062590711004296, + "loss": 0.0034, + "macro_f1": 1.0, + "num_tokens": 12336837.0, + "repeat_count": 1.0, + "routers_loss": 0.006499455776065588, + "skip_count": 1.0, + "step": 7650, + "text_loss": 0.18695278465747833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.00020037806115237667, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12340414.0, + "repeat_count": 0.0, + "routers_loss": 0.001548365456983447, + "skip_count": 0.0, + "step": 7652, + "text_loss": 0.1981094628572464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00020013033001040255, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 12343209.0, + "repeat_count": 0.0, + "routers_loss": 0.008136926218867302, + "skip_count": 2.0, + "step": 7654, + "text_loss": 0.2231602668762207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00019988271377905165, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12346158.0, + "repeat_count": 0.0, + "routers_loss": 0.00370375020429492, + "skip_count": 1.0, + "step": 7656, + "text_loss": 0.4809921383857727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 35.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00019963521255321077, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 12349279.0, + "repeat_count": 0.0, + "routers_loss": 0.00690054427832365, + "skip_count": 3.0, + "step": 7658, + "text_loss": 0.40473970770835876 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0001993878264277233, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 12352848.0, + "repeat_count": 1.0, + "routers_loss": 0.004367961548268795, + "skip_count": 1.0, + "step": 7660, + "text_loss": 0.3646799921989441 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.00019914055549738775, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 12356737.0, + "repeat_count": 0.0, + "routers_loss": 0.000662159756757319, + "skip_count": 0.0, + "step": 7662, + "text_loss": 0.3703214228153229 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0001988933998569589, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 12360085.0, + "repeat_count": 0.0, + "routers_loss": 0.0023262565955519676, + "skip_count": 0.0, + "step": 7664, + "text_loss": 0.12910836935043335 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0001986463596011473, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12363296.0, + "repeat_count": 0.0, + "routers_loss": 0.002686078194528818, + "skip_count": 1.0, + "step": 7666, + "text_loss": 0.39628392457962036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00019839943482461914, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 12366072.0, + "repeat_count": 0.0, + "routers_loss": 0.007100159768015146, + "skip_count": 1.0, + "step": 7668, + "text_loss": 0.6588287949562073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00019815262562199648, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12368940.0, + "repeat_count": 0.0, + "routers_loss": 0.004194926470518112, + "skip_count": 0.0, + "step": 7670, + "text_loss": 0.36411619186401367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00019790593208785713, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12372031.0, + "repeat_count": 0.0, + "routers_loss": 0.0041313013061881065, + "skip_count": 0.0, + "step": 7672, + "text_loss": 0.23270413279533386 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019765935431673444, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12375115.0, + "repeat_count": 1.0, + "routers_loss": 0.003343774238601327, + "skip_count": 0.0, + "step": 7674, + "text_loss": 0.1686355322599411 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 36.03756970942178, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.038330078125, + "learning_rate": 0.00019741289240311755, + "loss": 0.0058, + "macro_f1": 0.6122449040412903, + "num_tokens": 12379089.0, + "repeat_count": 0.0, + "routers_loss": 0.021328814327716827, + "skip_count": 4.0, + "step": 7676, + "text_loss": 0.9312577247619629 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00019716654644145104, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12383115.0, + "repeat_count": 0.0, + "routers_loss": 0.0004511173174250871, + "skip_count": 0.0, + "step": 7678, + "text_loss": 0.3305695056915283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.00019692031652613522, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 12386064.0, + "repeat_count": 0.0, + "routers_loss": 0.006190002430230379, + "skip_count": 0.0, + "step": 7680, + "text_loss": 0.4829687178134918 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 36.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00019667420275152575, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 12389743.0, + "repeat_count": 2.0, + "routers_loss": 0.004575030412524939, + "skip_count": 1.0, + "step": 7682, + "text_loss": 0.5751548409461975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0001964282052119341, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12392481.0, + "repeat_count": 0.0, + "routers_loss": 0.002718796720728278, + "skip_count": 0.0, + "step": 7684, + "text_loss": 0.5349925756454468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0001961823240016269, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 12395207.0, + "repeat_count": 0.0, + "routers_loss": 0.0027528523933142424, + "skip_count": 0.0, + "step": 7686, + "text_loss": 0.5322592258453369 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00019593655921482624, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12398232.0, + "repeat_count": 1.0, + "routers_loss": 0.008105970919132233, + "skip_count": 0.0, + "step": 7688, + "text_loss": 0.3192061185836792 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.10331670090989, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00019569091094570967, + "loss": 0.0069, + "macro_f1": 0.6603773832321167, + "num_tokens": 12400862.0, + "repeat_count": 1.0, + "routers_loss": 0.024075545370578766, + "skip_count": 1.0, + "step": 7690, + "text_loss": 0.3189752697944641 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 36.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0001954453792884101, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 12404039.0, + "repeat_count": 0.0, + "routers_loss": 0.007513802964240313, + "skip_count": 3.0, + "step": 7692, + "text_loss": 0.5985093712806702 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0001951999643370157, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 12407085.0, + "repeat_count": 1.0, + "routers_loss": 0.009606506675481796, + "skip_count": 2.0, + "step": 7694, + "text_loss": 0.2050790935754776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00019495466618556996, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 12411377.0, + "repeat_count": 0.0, + "routers_loss": 0.0007978329667821527, + "skip_count": 0.0, + "step": 7696, + "text_loss": 0.4705570638179779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00019470948492807154, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 12414427.0, + "repeat_count": 0.0, + "routers_loss": 0.0010737364646047354, + "skip_count": 0.0, + "step": 7698, + "text_loss": 0.6105324029922485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00019446442065847448, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12417442.0, + "repeat_count": 0.0, + "routers_loss": 0.001762967323884368, + "skip_count": 0.0, + "step": 7700, + "text_loss": 0.5638618469238281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00019421947347068774, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12420862.0, + "repeat_count": 0.0, + "routers_loss": 0.0015798417152836919, + "skip_count": 0.0, + "step": 7702, + "text_loss": 0.1939864307641983 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00019397464345857562, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 12423876.0, + "repeat_count": 0.0, + "routers_loss": 0.005659835878759623, + "skip_count": 1.0, + "step": 7704, + "text_loss": 0.20829300582408905 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 36.17845611975345, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.00019372993071595723, + "loss": 0.0072, + "macro_f1": 0.9449735879898071, + "num_tokens": 12427639.0, + "repeat_count": 4.0, + "routers_loss": 0.018665846437215805, + "skip_count": 2.0, + "step": 7706, + "text_loss": 0.47913849353790283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00019348533533660727, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 12431520.0, + "repeat_count": 0.0, + "routers_loss": 0.0006690093432553113, + "skip_count": 0.0, + "step": 7708, + "text_loss": 0.494870662689209 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00019324085741425511, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 12434213.0, + "repeat_count": 0.0, + "routers_loss": 0.004067352041602135, + "skip_count": 1.0, + "step": 7710, + "text_loss": 0.7631711959838867 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.00019299649704258504, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 12437437.0, + "repeat_count": 2.0, + "routers_loss": 0.01157623715698719, + "skip_count": 0.0, + "step": 7712, + "text_loss": 0.3145926296710968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0001927522543152364, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 12440507.0, + "repeat_count": 0.0, + "routers_loss": 0.001888492377474904, + "skip_count": 0.0, + "step": 7714, + "text_loss": 0.576301097869873 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019250812932580352, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 12443484.0, + "repeat_count": 0.0, + "routers_loss": 0.00042988534551113844, + "skip_count": 0.0, + "step": 7716, + "text_loss": 0.5716445446014404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00019226412216783557, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 12446460.0, + "repeat_count": 0.0, + "routers_loss": 0.005063199903815985, + "skip_count": 1.0, + "step": 7718, + "text_loss": 0.2700924873352051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0001920202329348365, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 12449346.0, + "repeat_count": 0.0, + "routers_loss": 0.0010775640839710832, + "skip_count": 0.0, + "step": 7720, + "text_loss": 0.5162558555603027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00019177646172026513, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12452680.0, + "repeat_count": 0.0, + "routers_loss": 0.0014514096546918154, + "skip_count": 0.0, + "step": 7722, + "text_loss": 0.5753642916679382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.00019153280861753497, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12455348.0, + "repeat_count": 0.0, + "routers_loss": 0.002202774863690138, + "skip_count": 1.0, + "step": 7724, + "text_loss": 0.5751997232437134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00019128927372001454, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 12458098.0, + "repeat_count": 0.0, + "routers_loss": 0.005171069409698248, + "skip_count": 0.0, + "step": 7726, + "text_loss": 0.22252975404262543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00019104585712102678, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 12460958.0, + "repeat_count": 0.0, + "routers_loss": 0.0041033923625946045, + "skip_count": 0.0, + "step": 7728, + "text_loss": 0.18611937761306763 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00019080255891384945, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 12463596.0, + "repeat_count": 1.0, + "routers_loss": 0.0012201941572129726, + "skip_count": 0.0, + "step": 7730, + "text_loss": 0.47347909212112427 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 36.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0001905593791917148, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 12467021.0, + "repeat_count": 2.0, + "routers_loss": 0.005837214644998312, + "skip_count": 2.0, + "step": 7732, + "text_loss": 0.2055564969778061 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.00019031631804780974, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 12469743.0, + "repeat_count": 0.0, + "routers_loss": 0.0010269953636452556, + "skip_count": 0.0, + "step": 7734, + "text_loss": 0.45995602011680603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00019007337557527582, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 12473082.0, + "repeat_count": 0.0, + "routers_loss": 0.00436213007196784, + "skip_count": 1.0, + "step": 7736, + "text_loss": 0.4515823721885681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00018983055186720888, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 12476100.0, + "repeat_count": 0.0, + "routers_loss": 0.003051829058676958, + "skip_count": 2.0, + "step": 7738, + "text_loss": 0.12298467755317688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0001895878470166597, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 12480231.0, + "repeat_count": 0.0, + "routers_loss": 0.008164191618561745, + "skip_count": 2.0, + "step": 7740, + "text_loss": 0.17456457018852234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.347519812151454, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.00018934526111663314, + "loss": 0.0069, + "macro_f1": 0.3272727429866791, + "num_tokens": 12483894.0, + "repeat_count": 0.0, + "routers_loss": 0.008653721772134304, + "skip_count": 1.0, + "step": 7742, + "text_loss": 0.7125775814056396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 36.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.00018910279426008857, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 12488077.0, + "repeat_count": 0.0, + "routers_loss": 0.005024447571486235, + "skip_count": 6.0, + "step": 7744, + "text_loss": 0.833778977394104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.00018886044653993966, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 12490999.0, + "repeat_count": 0.0, + "routers_loss": 0.002690888475626707, + "skip_count": 0.0, + "step": 7746, + "text_loss": 0.15594039857387543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00018861821804905466, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 12494765.0, + "repeat_count": 0.0, + "routers_loss": 0.006087568122893572, + "skip_count": 0.0, + "step": 7748, + "text_loss": 0.2696777880191803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00018837610888025586, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12497741.0, + "repeat_count": 0.0, + "routers_loss": 0.0014629303477704525, + "skip_count": 0.0, + "step": 7750, + "text_loss": 0.6801294684410095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11865234375, + "learning_rate": 0.00018813411912631996, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 12500585.0, + "repeat_count": 0.0, + "routers_loss": 0.001163579523563385, + "skip_count": 0.0, + "step": 7752, + "text_loss": 0.41069695353507996 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00018789224887997796, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12503579.0, + "repeat_count": 2.0, + "routers_loss": 0.009436148218810558, + "skip_count": 0.0, + "step": 7754, + "text_loss": 0.6993107795715332 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00018765049823391472, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 12506698.0, + "repeat_count": 1.0, + "routers_loss": 0.002098206663504243, + "skip_count": 2.0, + "step": 7756, + "text_loss": 0.5704247951507568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018740886728077, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 12509869.0, + "repeat_count": 0.0, + "routers_loss": 0.002066673245280981, + "skip_count": 1.0, + "step": 7758, + "text_loss": 0.7605635523796082 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.00018716735611313707, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 12513433.0, + "repeat_count": 0.0, + "routers_loss": 0.0023439819924533367, + "skip_count": 1.0, + "step": 7760, + "text_loss": 0.4746153950691223 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.441444085705896, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.00018692596482356333, + "loss": 0.0057, + "macro_f1": 0.9255813956260681, + "num_tokens": 12516817.0, + "repeat_count": 3.0, + "routers_loss": 0.039019811898469925, + "skip_count": 4.0, + "step": 7762, + "text_loss": 0.3105330467224121 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00018668469350455048, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12519357.0, + "repeat_count": 0.0, + "routers_loss": 0.002269966993480921, + "skip_count": 0.0, + "step": 7764, + "text_loss": 0.3700210452079773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00018644354224855414, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 12522072.0, + "repeat_count": 0.0, + "routers_loss": 0.001265842467546463, + "skip_count": 0.0, + "step": 7766, + "text_loss": 0.6737633943557739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00018620251114798386, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 12524999.0, + "repeat_count": 0.0, + "routers_loss": 0.006547329016029835, + "skip_count": 1.0, + "step": 7768, + "text_loss": 0.24906545877456665 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0001859616002952033, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 12527785.0, + "repeat_count": 2.0, + "routers_loss": 0.010791841894388199, + "skip_count": 3.0, + "step": 7770, + "text_loss": 0.3069820702075958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0001857208097825299, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12530801.0, + "repeat_count": 0.0, + "routers_loss": 0.00492103723809123, + "skip_count": 2.0, + "step": 7772, + "text_loss": 0.2524295151233673 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0001854801397022351, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12533919.0, + "repeat_count": 0.0, + "routers_loss": 0.001942967064678669, + "skip_count": 0.0, + "step": 7774, + "text_loss": 0.7855241894721985 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 36.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00018523959014654407, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 12537265.0, + "repeat_count": 2.0, + "routers_loss": 0.00987488217651844, + "skip_count": 2.0, + "step": 7776, + "text_loss": 0.2767317593097687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.00018499916120763582, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12539695.0, + "repeat_count": 0.0, + "routers_loss": 0.0054283770732581615, + "skip_count": 1.0, + "step": 7778, + "text_loss": 0.43287888169288635 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00018475885297764306, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 12542881.0, + "repeat_count": 2.0, + "routers_loss": 0.00797359924763441, + "skip_count": 0.0, + "step": 7780, + "text_loss": 0.3738224506378174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0001845186655486527, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 12546530.0, + "repeat_count": 0.0, + "routers_loss": 0.0045951665379107, + "skip_count": 0.0, + "step": 7782, + "text_loss": 0.2511517107486725 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 36.54476078661579, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00018427859901270482, + "loss": 0.0055, + "macro_f1": 0.9452888369560242, + "num_tokens": 12549439.0, + "repeat_count": 1.0, + "routers_loss": 0.02312052994966507, + "skip_count": 4.0, + "step": 7784, + "text_loss": 0.3837030827999115 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 36.55415321397123, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.059814453125, + "learning_rate": 0.00018403865346179344, + "loss": 0.0066, + "macro_f1": 0.9265305995941162, + "num_tokens": 12553211.0, + "repeat_count": 1.0, + "routers_loss": 0.014698561280965805, + "skip_count": 3.0, + "step": 7786, + "text_loss": 0.510159432888031 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 36.563545641326684, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.00018379882898786603, + "loss": 0.0075, + "macro_f1": 0.8803418874740601, + "num_tokens": 12556497.0, + "repeat_count": 2.0, + "routers_loss": 0.023926246911287308, + "skip_count": 7.0, + "step": 7788, + "text_loss": 0.44811317324638367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00018355912568282384, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 12559778.0, + "repeat_count": 0.0, + "routers_loss": 0.0011187797645106912, + "skip_count": 0.0, + "step": 7790, + "text_loss": 0.32099616527557373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00018331954363852166, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 12562610.0, + "repeat_count": 0.0, + "routers_loss": 0.0005356677575036883, + "skip_count": 0.0, + "step": 7792, + "text_loss": 0.9754356145858765 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0001830800829467677, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12565886.0, + "repeat_count": 2.0, + "routers_loss": 0.0017101728590205312, + "skip_count": 0.0, + "step": 7794, + "text_loss": 0.4234761595726013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.00018284074369932386, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12568728.0, + "repeat_count": 0.0, + "routers_loss": 0.0012841494753956795, + "skip_count": 0.0, + "step": 7796, + "text_loss": 0.41109147667884827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0001826015259879053, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 12572231.0, + "repeat_count": 0.0, + "routers_loss": 0.0022388407960534096, + "skip_count": 0.0, + "step": 7798, + "text_loss": 0.5459926128387451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00018236242990418074, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 12574968.0, + "repeat_count": 0.0, + "routers_loss": 0.0019992550369352102, + "skip_count": 0.0, + "step": 7800, + "text_loss": 0.5028481483459473 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0001821234555397722, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12579074.0, + "repeat_count": 0.0, + "routers_loss": 0.002936388598755002, + "skip_count": 2.0, + "step": 7802, + "text_loss": 0.2377086579799652 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00018188460298625503, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12581912.0, + "repeat_count": 1.0, + "routers_loss": 0.0026762608904391527, + "skip_count": 0.0, + "step": 7804, + "text_loss": 0.13887254893779755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 36.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00018164587233515824, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 12585020.0, + "repeat_count": 3.0, + "routers_loss": 0.003901638789102435, + "skip_count": 1.0, + "step": 7806, + "text_loss": 0.35454171895980835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00018140726367796373, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 12588310.0, + "repeat_count": 0.0, + "routers_loss": 0.0031358697451651096, + "skip_count": 2.0, + "step": 7808, + "text_loss": 0.3567306697368622 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00018116877710610673, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12591735.0, + "repeat_count": 0.0, + "routers_loss": 0.002310588024556637, + "skip_count": 1.0, + "step": 7810, + "text_loss": 0.45357072353363037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00018093041271097582, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12595232.0, + "repeat_count": 0.0, + "routers_loss": 0.005600228440016508, + "skip_count": 2.0, + "step": 7812, + "text_loss": 0.4179847836494446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.685647196947464, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00018069217058391267, + "loss": 0.006, + "macro_f1": 0.6603773832321167, + "num_tokens": 12598367.0, + "repeat_count": 1.0, + "routers_loss": 0.04015933722257614, + "skip_count": 1.0, + "step": 7814, + "text_loss": 0.17874565720558167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.00018045405081621214, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 12601864.0, + "repeat_count": 0.0, + "routers_loss": 0.005119446665048599, + "skip_count": 1.0, + "step": 7816, + "text_loss": 0.6867854595184326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00018021605349912207, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 12605268.0, + "repeat_count": 0.0, + "routers_loss": 0.0005990012432448566, + "skip_count": 0.0, + "step": 7818, + "text_loss": 0.9084970355033875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00017997817872384358, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 12608093.0, + "repeat_count": 0.0, + "routers_loss": 0.008712377399206161, + "skip_count": 1.0, + "step": 7820, + "text_loss": 0.19413328170776367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00017974042658153066, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 12611001.0, + "repeat_count": 0.0, + "routers_loss": 0.007535711396485567, + "skip_count": 1.0, + "step": 7822, + "text_loss": 0.2672932744026184 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0001795027971632905, + "loss": 0.0042, + "macro_f1": 1.0, + "num_tokens": 12614584.0, + "repeat_count": 1.0, + "routers_loss": 0.006770546548068523, + "skip_count": 3.0, + "step": 7824, + "text_loss": 0.22805163264274597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00017926529056018297, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 12617519.0, + "repeat_count": 0.0, + "routers_loss": 0.0010458873584866524, + "skip_count": 0.0, + "step": 7826, + "text_loss": 0.385499507188797 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00017902790686322102, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12621566.0, + "repeat_count": 1.0, + "routers_loss": 0.00634258147329092, + "skip_count": 0.0, + "step": 7828, + "text_loss": 0.8044118285179138 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 36.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00017879064616337076, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 12624751.0, + "repeat_count": 0.0, + "routers_loss": 0.0053052278235554695, + "skip_count": 3.0, + "step": 7830, + "text_loss": 0.264322966337204 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.00017855350855155088, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 12628478.0, + "repeat_count": 0.0, + "routers_loss": 0.0028291696216911077, + "skip_count": 0.0, + "step": 7832, + "text_loss": 0.20611460506916046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00017831649411863287, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 12632027.0, + "repeat_count": 0.0, + "routers_loss": 0.0009586421074345708, + "skip_count": 1.0, + "step": 7834, + "text_loss": 0.4119716286659241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00017807960295544118, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 12635144.0, + "repeat_count": 0.0, + "routers_loss": 0.012304541654884815, + "skip_count": 2.0, + "step": 7836, + "text_loss": 0.28647977113723755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0001778428351527529, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 12638719.0, + "repeat_count": 0.0, + "routers_loss": 0.005212076939642429, + "skip_count": 2.0, + "step": 7838, + "text_loss": 0.630459189414978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0001776061908012979, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 12642119.0, + "repeat_count": 0.0, + "routers_loss": 0.00183707510586828, + "skip_count": 0.0, + "step": 7840, + "text_loss": 0.5905961990356445 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0001773696699917588, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 12645077.0, + "repeat_count": 1.0, + "routers_loss": 0.0058263009414076805, + "skip_count": 0.0, + "step": 7842, + "text_loss": 0.41949576139450073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00017713327281477077, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12648964.0, + "repeat_count": 0.0, + "routers_loss": 0.001586507773026824, + "skip_count": 0.0, + "step": 7844, + "text_loss": 0.5048848390579224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00017689699936092163, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 12651934.0, + "repeat_count": 0.0, + "routers_loss": 0.002397194504737854, + "skip_count": 0.0, + "step": 7846, + "text_loss": 0.23879878222942352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 36.84531846199002, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0001766608497207518, + "loss": 0.0054, + "macro_f1": 0.5492662787437439, + "num_tokens": 12654907.0, + "repeat_count": 0.0, + "routers_loss": 0.016742069274187088, + "skip_count": 2.0, + "step": 7848, + "text_loss": 0.23400072753429413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0001764248239847544, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 12658765.0, + "repeat_count": 0.0, + "routers_loss": 0.007037387229502201, + "skip_count": 2.0, + "step": 7850, + "text_loss": 0.26165497303009033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 36.86410331670091, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.017822265625, + "learning_rate": 0.00017618892224337463, + "loss": 0.0044, + "macro_f1": 0.5492662787437439, + "num_tokens": 12662024.0, + "repeat_count": 0.0, + "routers_loss": 0.017352160066366196, + "skip_count": 2.0, + "step": 7852, + "text_loss": 0.23813043534755707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 36.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017595314458701084, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 12665751.0, + "repeat_count": 0.0, + "routers_loss": 0.005349365528672934, + "skip_count": 3.0, + "step": 7854, + "text_loss": 0.14920757710933685 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00017571749110601337, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12668823.0, + "repeat_count": 0.0, + "routers_loss": 0.0037689812015742064, + "skip_count": 2.0, + "step": 7856, + "text_loss": 0.2198697030544281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00017548196189068506, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12672367.0, + "repeat_count": 0.0, + "routers_loss": 0.0006363615393638611, + "skip_count": 0.0, + "step": 7858, + "text_loss": 0.5338839888572693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00017524655703128112, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 12675217.0, + "repeat_count": 0.0, + "routers_loss": 0.002691479865461588, + "skip_count": 0.0, + "step": 7860, + "text_loss": 0.17463763058185577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00017501127661800908, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12678796.0, + "repeat_count": 0.0, + "routers_loss": 0.002262329449877143, + "skip_count": 0.0, + "step": 7862, + "text_loss": 0.4637797474861145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.00017477612074102899, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12681631.0, + "repeat_count": 0.0, + "routers_loss": 0.00115531450137496, + "skip_count": 0.0, + "step": 7864, + "text_loss": 0.6089238524436951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00017454108949045295, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 12685647.0, + "repeat_count": 0.0, + "routers_loss": 0.00260268640704453, + "skip_count": 0.0, + "step": 7866, + "text_loss": 0.5876018404960632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.93924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.00017430618295634514, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 12688995.0, + "repeat_count": 0.0, + "routers_loss": 0.002731681102886796, + "skip_count": 0.0, + "step": 7868, + "text_loss": 0.35076001286506653 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 36.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00017407140122872262, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 12692100.0, + "repeat_count": 1.0, + "routers_loss": 0.003314645728096366, + "skip_count": 1.0, + "step": 7870, + "text_loss": 0.5313478112220764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.958027590255355, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017383674439755393, + "loss": 0.0069, + "macro_f1": 0.3272727429866791, + "num_tokens": 12695117.0, + "repeat_count": 0.0, + "routers_loss": 0.010385016910731792, + "skip_count": 1.0, + "step": 7872, + "text_loss": 0.5092368125915527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00017360221255276016, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 12697678.0, + "repeat_count": 0.0, + "routers_loss": 0.001273582922294736, + "skip_count": 0.0, + "step": 7874, + "text_loss": 0.5282881855964661 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.00017336780578421418, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 12702132.0, + "repeat_count": 0.0, + "routers_loss": 0.0007510313298553228, + "skip_count": 0.0, + "step": 7876, + "text_loss": 0.49093571305274963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001731335241817412, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 12705413.0, + "repeat_count": 0.0, + "routers_loss": 0.005138787440955639, + "skip_count": 2.0, + "step": 7878, + "text_loss": 0.7503541111946106 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001728993678351184, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 12708310.0, + "repeat_count": 2.0, + "routers_loss": 0.004379773512482643, + "skip_count": 0.0, + "step": 7880, + "text_loss": 0.5942456126213074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0001726653368340747, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 12711043.0, + "repeat_count": 0.0, + "routers_loss": 0.005271450616419315, + "skip_count": 2.0, + "step": 7882, + "text_loss": 0.348360538482666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00017243143126829163, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 12714473.0, + "repeat_count": 1.0, + "routers_loss": 0.0015764752170071006, + "skip_count": 1.0, + "step": 7884, + "text_loss": 0.45971861481666565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.000172197651227402, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12717832.0, + "repeat_count": 0.0, + "routers_loss": 0.00040649910806678236, + "skip_count": 0.0, + "step": 7886, + "text_loss": 0.5996841788291931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00017196399680099078, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 12720479.0, + "repeat_count": 0.0, + "routers_loss": 0.00473182974383235, + "skip_count": 2.0, + "step": 7888, + "text_loss": 0.40346208214759827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00017173046807859483, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 12723104.0, + "repeat_count": 0.0, + "routers_loss": 0.0020138369873166084, + "skip_count": 0.0, + "step": 7890, + "text_loss": 0.6878634095191956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.05165835045494, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0001714970651497027, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 12725967.0, + "repeat_count": 0.0, + "routers_loss": 0.008381367661058903, + "skip_count": 1.0, + "step": 7892, + "text_loss": 0.9161711931228638 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00017126378810375498, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 12728819.0, + "repeat_count": 1.0, + "routers_loss": 0.0037658829241991043, + "skip_count": 0.0, + "step": 7894, + "text_loss": 0.4447716772556305 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00017103063703014372, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 12731806.0, + "repeat_count": 0.0, + "routers_loss": 0.0022742559667676687, + "skip_count": 0.0, + "step": 7896, + "text_loss": 0.9140825867652893 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00017079761201821298, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 12734649.0, + "repeat_count": 0.0, + "routers_loss": 0.002157264854758978, + "skip_count": 0.0, + "step": 7898, + "text_loss": 0.268303781747818 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.0001705647131572583, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 12737889.0, + "repeat_count": 1.0, + "routers_loss": 0.01064873393625021, + "skip_count": 1.0, + "step": 7900, + "text_loss": 0.36009490489959717 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00017033194053652685, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 12740821.0, + "repeat_count": 1.0, + "routers_loss": 0.0062920586206018925, + "skip_count": 0.0, + "step": 7902, + "text_loss": 0.5301805138587952 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.00017009929424521782, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 12743876.0, + "repeat_count": 1.0, + "routers_loss": 0.0033694824669510126, + "skip_count": 1.0, + "step": 7904, + "text_loss": 1.026949167251587 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.117405341943055, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00016986677437248155, + "loss": 0.0071, + "macro_f1": 0.8817967176437378, + "num_tokens": 12747623.0, + "repeat_count": 2.0, + "routers_loss": 0.05076088383793831, + "skip_count": 3.0, + "step": 7906, + "text_loss": 0.33465588092803955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00016963438100742014, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 12751255.0, + "repeat_count": 0.0, + "routers_loss": 0.0005921403644606471, + "skip_count": 0.0, + "step": 7908, + "text_loss": 0.3498881757259369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.00016940211423908713, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 12754297.0, + "repeat_count": 0.0, + "routers_loss": 0.004132566973567009, + "skip_count": 0.0, + "step": 7910, + "text_loss": 0.2874198853969574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0001691699741564876, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12756969.0, + "repeat_count": 0.0, + "routers_loss": 0.0024724705144762993, + "skip_count": 1.0, + "step": 7912, + "text_loss": 0.10593545436859131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016893796084857806, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 12760261.0, + "repeat_count": 0.0, + "routers_loss": 0.002991671208292246, + "skip_count": 0.0, + "step": 7914, + "text_loss": 0.1331545114517212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00016870607440426643, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12762971.0, + "repeat_count": 0.0, + "routers_loss": 0.0018167285015806556, + "skip_count": 0.0, + "step": 7916, + "text_loss": 0.496826171875 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00016847431491241207, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 12765949.0, + "repeat_count": 1.0, + "routers_loss": 0.0033364067785441875, + "skip_count": 0.0, + "step": 7918, + "text_loss": 0.43522849678993225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0001682426824618256, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 12769201.0, + "repeat_count": 0.0, + "routers_loss": 0.001313596498221159, + "skip_count": 0.0, + "step": 7920, + "text_loss": 0.8691539168357849 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.19254476078662, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.00016801117714126908, + "loss": 0.0108, + "macro_f1": 0.6603773832321167, + "num_tokens": 12773308.0, + "repeat_count": 1.0, + "routers_loss": 0.02579287625849247, + "skip_count": 1.0, + "step": 7922, + "text_loss": 0.275301069021225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00016777979903945568, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 12776166.0, + "repeat_count": 0.0, + "routers_loss": 0.010501758195459843, + "skip_count": 1.0, + "step": 7924, + "text_loss": 0.32124993205070496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0001675485482450499, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 12779965.0, + "repeat_count": 0.0, + "routers_loss": 0.0063389060087502, + "skip_count": 2.0, + "step": 7926, + "text_loss": 0.2527695894241333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00016731742484666774, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 12783019.0, + "repeat_count": 0.0, + "routers_loss": 0.002796935848891735, + "skip_count": 0.0, + "step": 7928, + "text_loss": 0.18767669796943665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0001670864289328759, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 12786291.0, + "repeat_count": 0.0, + "routers_loss": 0.007973561994731426, + "skip_count": 2.0, + "step": 7930, + "text_loss": 0.29628485441207886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00016685556059219253, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 12789566.0, + "repeat_count": 4.0, + "routers_loss": 0.011405733413994312, + "skip_count": 6.0, + "step": 7932, + "text_loss": 0.16635073721408844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00016662481991308682, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 12792533.0, + "repeat_count": 0.0, + "routers_loss": 0.0012368770549073815, + "skip_count": 1.0, + "step": 7934, + "text_loss": 0.4196353852748871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000166394206983979, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 12795619.0, + "repeat_count": 0.0, + "routers_loss": 0.0036002211272716522, + "skip_count": 1.0, + "step": 7936, + "text_loss": 0.17559808492660522 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00016616372189324035, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 12799702.0, + "repeat_count": 1.0, + "routers_loss": 0.0039332108572125435, + "skip_count": 0.0, + "step": 7938, + "text_loss": 0.603410542011261 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00016593336472919324, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 12802704.0, + "repeat_count": 0.0, + "routers_loss": 0.0008303318754769862, + "skip_count": 0.0, + "step": 7940, + "text_loss": 0.5331749320030212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.28646903434106, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00016570313558011098, + "loss": 0.0058, + "macro_f1": 0.6601307392120361, + "num_tokens": 12805630.0, + "repeat_count": 1.0, + "routers_loss": 0.05092398822307587, + "skip_count": 2.0, + "step": 7942, + "text_loss": 0.17398510873317719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00016547303453421774, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 12809065.0, + "repeat_count": 0.0, + "routers_loss": 0.0006886976188980043, + "skip_count": 0.0, + "step": 7944, + "text_loss": 0.3419797718524933 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.00016524306167968878, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 12812641.0, + "repeat_count": 1.0, + "routers_loss": 0.005634502973407507, + "skip_count": 3.0, + "step": 7946, + "text_loss": 0.5877651572227478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00016501321710465005, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 12815527.0, + "repeat_count": 0.0, + "routers_loss": 0.0020598487462848425, + "skip_count": 0.0, + "step": 7948, + "text_loss": 0.3558528423309326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0001647835008971783, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12819103.0, + "repeat_count": 0.0, + "routers_loss": 0.005946476943790913, + "skip_count": 2.0, + "step": 7950, + "text_loss": 0.5800213813781738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00016455391314530154, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12822423.0, + "repeat_count": 0.0, + "routers_loss": 0.010360358282923698, + "skip_count": 2.0, + "step": 7952, + "text_loss": 0.278255820274353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00016432445393699802, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 12826180.0, + "repeat_count": 0.0, + "routers_loss": 0.003017681185156107, + "skip_count": 0.0, + "step": 7954, + "text_loss": 0.1571389138698578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00016409512336019698, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12829196.0, + "repeat_count": 0.0, + "routers_loss": 0.0008854938205331564, + "skip_count": 0.0, + "step": 7956, + "text_loss": 0.2776578366756439 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.00016386592150277834, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 12831983.0, + "repeat_count": 0.0, + "routers_loss": 0.0023990103509277105, + "skip_count": 0.0, + "step": 7958, + "text_loss": 0.46686989068984985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 37.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0001636368484525727, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 12834889.0, + "repeat_count": 0.0, + "routers_loss": 0.009835032746195793, + "skip_count": 5.0, + "step": 7960, + "text_loss": 0.22224856913089752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00016340790429736118, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 12837950.0, + "repeat_count": 0.0, + "routers_loss": 0.0018618656322360039, + "skip_count": 0.0, + "step": 7962, + "text_loss": 0.5101882815361023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.00016317908912487578, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 12840981.0, + "repeat_count": 1.0, + "routers_loss": 0.001275144051760435, + "skip_count": 1.0, + "step": 7964, + "text_loss": 0.40567103028297424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00016295040302279873, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12844044.0, + "repeat_count": 0.0, + "routers_loss": 0.003117429558187723, + "skip_count": 2.0, + "step": 7966, + "text_loss": 0.6888198852539062 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00016272184607876312, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 12847350.0, + "repeat_count": 2.0, + "routers_loss": 0.006585797294974327, + "skip_count": 4.0, + "step": 7968, + "text_loss": 0.19813506305217743 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0001624934183803523, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 12850285.0, + "repeat_count": 1.0, + "routers_loss": 0.0043576788157224655, + "skip_count": 1.0, + "step": 7970, + "text_loss": 0.6108269691467285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 37.427355444672735, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00016226512001510024, + "loss": 0.0039, + "macro_f1": 0.5492662787437439, + "num_tokens": 12853993.0, + "repeat_count": 0.0, + "routers_loss": 0.011879517696797848, + "skip_count": 2.0, + "step": 7972, + "text_loss": 0.42478689551353455 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00016203695107049117, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 12857022.0, + "repeat_count": 0.0, + "routers_loss": 0.0016375730047002435, + "skip_count": 0.0, + "step": 7974, + "text_loss": 0.5130020976066589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0001618089116339601, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 12860764.0, + "repeat_count": 0.0, + "routers_loss": 0.0006649247952736914, + "skip_count": 0.0, + "step": 7976, + "text_loss": 1.0629136562347412 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.455532726739065, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00016158100179289208, + "loss": 0.0062, + "macro_f1": 0.6603773832321167, + "num_tokens": 12864066.0, + "repeat_count": 1.0, + "routers_loss": 0.03140667825937271, + "skip_count": 1.0, + "step": 7978, + "text_loss": 0.4241345226764679 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 37.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0001613532216346226, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 12867555.0, + "repeat_count": 0.0, + "routers_loss": 0.010257012210786343, + "skip_count": 4.0, + "step": 7980, + "text_loss": 0.6085613369941711 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0001611255712464374, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 12871415.0, + "repeat_count": 0.0, + "routers_loss": 0.00783725269138813, + "skip_count": 1.0, + "step": 7982, + "text_loss": 0.15661844611167908 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.017578125, + "learning_rate": 0.00016089805071557256, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 12874195.0, + "repeat_count": 1.0, + "routers_loss": 0.0027650597039610147, + "skip_count": 2.0, + "step": 7984, + "text_loss": 0.4938865005970001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.49310243616085, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016067066012921439, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 12878084.0, + "repeat_count": 1.0, + "routers_loss": 0.04647083953022957, + "skip_count": 0.0, + "step": 7986, + "text_loss": 0.2973119020462036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.00016044339957449938, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 12881182.0, + "repeat_count": 0.0, + "routers_loss": 0.002192265819758177, + "skip_count": 0.0, + "step": 7988, + "text_loss": 0.2623208165168762 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00016021626913851418, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 12884028.0, + "repeat_count": 0.0, + "routers_loss": 0.0023096329532563686, + "skip_count": 0.0, + "step": 7990, + "text_loss": 0.3752247989177704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.52127971822718, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00015998926890829562, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 12887759.0, + "repeat_count": 0.0, + "routers_loss": 0.03038526326417923, + "skip_count": 1.0, + "step": 7992, + "text_loss": 0.2609226405620575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0001597623989708306, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12890976.0, + "repeat_count": 0.0, + "routers_loss": 0.0015199477784335613, + "skip_count": 0.0, + "step": 7994, + "text_loss": 0.6512867212295532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00015953565941305615, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12894112.0, + "repeat_count": 0.0, + "routers_loss": 0.0024166766088455915, + "skip_count": 0.0, + "step": 7996, + "text_loss": 0.5539866089820862 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0001593090503218591, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 12896857.0, + "repeat_count": 1.0, + "routers_loss": 0.005081235896795988, + "skip_count": 2.0, + "step": 7998, + "text_loss": 0.6631022691726685 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00015908257178407682, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12900075.0, + "repeat_count": 1.0, + "routers_loss": 0.0024711282458156347, + "skip_count": 0.0, + "step": 8000, + "text_loss": 0.3309785723686218 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.5682418550044, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.00015885622388649617, + "loss": 0.0059, + "macro_f1": 0.6601307392120361, + "num_tokens": 12903845.0, + "repeat_count": 1.0, + "routers_loss": 0.04024988412857056, + "skip_count": 2.0, + "step": 8002, + "text_loss": 0.2384071946144104 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.00015863000671585405, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 12907694.0, + "repeat_count": 1.0, + "routers_loss": 0.001953886589035392, + "skip_count": 2.0, + "step": 8004, + "text_loss": 0.5001366138458252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00015840392035883726, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 12910871.0, + "repeat_count": 0.0, + "routers_loss": 0.002982128644362092, + "skip_count": 2.0, + "step": 8006, + "text_loss": 0.2589346170425415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0001581779649020827, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12914484.0, + "repeat_count": 0.0, + "routers_loss": 0.0009384988807141781, + "skip_count": 0.0, + "step": 8008, + "text_loss": 0.5727795362472534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.00015795214043217654, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12917480.0, + "repeat_count": 0.0, + "routers_loss": 0.008854437619447708, + "skip_count": 2.0, + "step": 8010, + "text_loss": 0.24354904890060425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.00015772644703565563, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 12920383.0, + "repeat_count": 0.0, + "routers_loss": 0.001689503900706768, + "skip_count": 0.0, + "step": 8012, + "text_loss": 0.5372336506843567 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.00015750088479900588, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 12923886.0, + "repeat_count": 0.0, + "routers_loss": 0.002284591319039464, + "skip_count": 0.0, + "step": 8014, + "text_loss": 0.1708722710609436 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 37.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00015727545380866316, + "loss": 0.0042, + "macro_f1": 1.0, + "num_tokens": 12926998.0, + "repeat_count": 1.0, + "routers_loss": 0.004594483878463507, + "skip_count": 4.0, + "step": 8016, + "text_loss": 0.26784324645996094 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0001570501541510131, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 12929726.0, + "repeat_count": 1.0, + "routers_loss": 0.0021998141892254353, + "skip_count": 0.0, + "step": 8018, + "text_loss": 0.8051869869232178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00015682498591239086, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12932182.0, + "repeat_count": 0.0, + "routers_loss": 0.0032623414881527424, + "skip_count": 1.0, + "step": 8020, + "text_loss": 0.8431181907653809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00015659994917908144, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 12935338.0, + "repeat_count": 0.0, + "routers_loss": 0.0014909361489117146, + "skip_count": 1.0, + "step": 8022, + "text_loss": 0.6168642640113831 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0001563750440373191, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 12938484.0, + "repeat_count": 0.0, + "routers_loss": 0.0010295510292053223, + "skip_count": 0.0, + "step": 8024, + "text_loss": 0.2694014608860016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 37.68095098326974, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.029296875, + "learning_rate": 0.00015615027057328828, + "loss": 0.0066, + "macro_f1": 0.5492662787437439, + "num_tokens": 12942045.0, + "repeat_count": 0.0, + "routers_loss": 0.018341995775699615, + "skip_count": 2.0, + "step": 8026, + "text_loss": 0.8151478171348572 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 37.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0001559256288731224, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 12945547.0, + "repeat_count": 2.0, + "routers_loss": 0.0023289949167519808, + "skip_count": 1.0, + "step": 8028, + "text_loss": 0.613464891910553 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.00015570111902290463, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 12949544.0, + "repeat_count": 0.0, + "routers_loss": 0.006635872647166252, + "skip_count": 2.0, + "step": 8030, + "text_loss": 0.17417465150356293 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.00015547674110866756, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 12952838.0, + "repeat_count": 1.0, + "routers_loss": 0.006023989990353584, + "skip_count": 1.0, + "step": 8032, + "text_loss": 0.4801837205886841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.00015525249521639319, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 12956329.0, + "repeat_count": 0.0, + "routers_loss": 0.005706884432584047, + "skip_count": 0.0, + "step": 8034, + "text_loss": 0.2028084248304367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.000155028381432013, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 12959122.0, + "repeat_count": 0.0, + "routers_loss": 0.003527123713865876, + "skip_count": 2.0, + "step": 8036, + "text_loss": 0.39474430680274963 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.00015480439984140776, + "loss": 0.0029, + "macro_f1": 1.0, + "num_tokens": 12962546.0, + "repeat_count": 1.0, + "routers_loss": 0.010415437631309032, + "skip_count": 2.0, + "step": 8038, + "text_loss": 0.20412345230579376 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0001545805505304077, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12965861.0, + "repeat_count": 0.0, + "routers_loss": 0.001566931139677763, + "skip_count": 0.0, + "step": 8040, + "text_loss": 0.5129821300506592 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 37.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0001543568335847923, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 12968677.0, + "repeat_count": 3.0, + "routers_loss": 0.0037196793127804995, + "skip_count": 0.0, + "step": 8042, + "text_loss": 0.755020260810852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.00015413324909029031, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 12972001.0, + "repeat_count": 0.0, + "routers_loss": 0.0010940275387838483, + "skip_count": 0.0, + "step": 8044, + "text_loss": 0.48672133684158325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00015390979713257968, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 12974765.0, + "repeat_count": 0.0, + "routers_loss": 0.011106903664767742, + "skip_count": 1.0, + "step": 8046, + "text_loss": 0.1727766990661621 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 37.78426768417963, + "f1_execute": 0.949999988079071, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.048828125, + "learning_rate": 0.00015368647779728757, + "loss": 0.006, + "macro_f1": 0.886363685131073, + "num_tokens": 12979127.0, + "repeat_count": 3.0, + "routers_loss": 0.05134248360991478, + "skip_count": 6.0, + "step": 8048, + "text_loss": 0.33233317732810974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00015346329116999057, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 12982812.0, + "repeat_count": 0.0, + "routers_loss": 0.0027500339783728123, + "skip_count": 0.0, + "step": 8050, + "text_loss": 0.8176849484443665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.80305253889052, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00015324023733621412, + "loss": 0.005, + "macro_f1": 0.32098764181137085, + "num_tokens": 12985740.0, + "repeat_count": 0.0, + "routers_loss": 0.030734945088624954, + "skip_count": 2.0, + "step": 8052, + "text_loss": 0.38721024990081787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.00015301731638143285, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 12988646.0, + "repeat_count": 0.0, + "routers_loss": 0.002358534839004278, + "skip_count": 2.0, + "step": 8054, + "text_loss": 0.5656245946884155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0001527945283910705, + "loss": 0.0074, + "macro_f1": 1.0, + "num_tokens": 12991518.0, + "repeat_count": 2.0, + "routers_loss": 0.007991814985871315, + "skip_count": 3.0, + "step": 8056, + "text_loss": 0.26438817381858826 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.00015257187345049983, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 12994847.0, + "repeat_count": 1.0, + "routers_loss": 0.011761264875531197, + "skip_count": 1.0, + "step": 8058, + "text_loss": 0.1801673173904419 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 37.8406222483123, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0001523493516450427, + "loss": 0.004, + "macro_f1": 0.8823530077934265, + "num_tokens": 12997874.0, + "repeat_count": 1.0, + "routers_loss": 0.021669765934348106, + "skip_count": 2.0, + "step": 8060, + "text_loss": 0.3278379738330841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0001521269630599698, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 13000504.0, + "repeat_count": 0.0, + "routers_loss": 0.002388916676864028, + "skip_count": 0.0, + "step": 8062, + "text_loss": 0.5396623611450195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00015190470778050086, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 13003620.0, + "repeat_count": 0.0, + "routers_loss": 0.007719808723777533, + "skip_count": 1.0, + "step": 8064, + "text_loss": 0.1989232450723648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00015168258589180462, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 13007410.0, + "repeat_count": 0.0, + "routers_loss": 0.0007461659261025488, + "skip_count": 0.0, + "step": 8066, + "text_loss": 0.5293997526168823 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.00015146059747899848, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 13010240.0, + "repeat_count": 1.0, + "routers_loss": 0.005515575874596834, + "skip_count": 0.0, + "step": 8068, + "text_loss": 0.2776186466217041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00015123874262714892, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13012728.0, + "repeat_count": 0.0, + "routers_loss": 0.0026730166282504797, + "skip_count": 0.0, + "step": 8070, + "text_loss": 0.5902766585350037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.00015101702142127088, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 13015616.0, + "repeat_count": 0.0, + "routers_loss": 0.002244985429570079, + "skip_count": 0.0, + "step": 8072, + "text_loss": 0.21447396278381348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.00015079543394632878, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 13019846.0, + "repeat_count": 0.0, + "routers_loss": 0.001963787479326129, + "skip_count": 0.0, + "step": 8074, + "text_loss": 0.22974267601966858 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 37.915761667155856, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.053955078125, + "learning_rate": 0.00015057398028723513, + "loss": 0.0064, + "macro_f1": 0.5492662787437439, + "num_tokens": 13023036.0, + "repeat_count": 0.0, + "routers_loss": 0.02271878905594349, + "skip_count": 2.0, + "step": 8076, + "text_loss": 0.26458361744880676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.00015035266052885137, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 13025840.0, + "repeat_count": 0.0, + "routers_loss": 0.0011732397833839059, + "skip_count": 0.0, + "step": 8078, + "text_loss": 0.44129177927970886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0001501314747559877, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 13030031.0, + "repeat_count": 1.0, + "routers_loss": 0.015655985102057457, + "skip_count": 2.0, + "step": 8080, + "text_loss": 0.28889161348342896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00014991042305340286, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 13033603.0, + "repeat_count": 0.0, + "routers_loss": 0.0012988687958568335, + "skip_count": 0.0, + "step": 8082, + "text_loss": 0.16362667083740234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00014968950550580434, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 13036931.0, + "repeat_count": 0.0, + "routers_loss": 0.002425852930173278, + "skip_count": 0.0, + "step": 8084, + "text_loss": 0.35900676250457764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0001494687221978482, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 13040637.0, + "repeat_count": 0.0, + "routers_loss": 0.004092676565051079, + "skip_count": 1.0, + "step": 8086, + "text_loss": 0.20662656426429749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00014924807321413893, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 13043855.0, + "repeat_count": 0.0, + "routers_loss": 0.0009040542645379901, + "skip_count": 0.0, + "step": 8088, + "text_loss": 0.30341213941574097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0001490275586392296, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13046903.0, + "repeat_count": 0.0, + "routers_loss": 0.0019248841563239694, + "skip_count": 0.0, + "step": 8090, + "text_loss": 0.4299648702144623 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000148807178557622, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 13050219.0, + "repeat_count": 0.0, + "routers_loss": 0.0008314658771269023, + "skip_count": 0.0, + "step": 8092, + "text_loss": 0.4521652162075043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00014858693305376598, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 13053076.0, + "repeat_count": 0.0, + "routers_loss": 0.0007470731507055461, + "skip_count": 0.0, + "step": 8094, + "text_loss": 0.46265852451324463 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00014836682221206, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 13056170.0, + "repeat_count": 1.0, + "routers_loss": 0.003292408073320985, + "skip_count": 0.0, + "step": 8096, + "text_loss": 0.6483868956565857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00014814684611685124, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13059181.0, + "repeat_count": 0.0, + "routers_loss": 0.001357200788334012, + "skip_count": 0.0, + "step": 8098, + "text_loss": 0.43141183257102966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.00014792700485243476, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 13062124.0, + "repeat_count": 0.0, + "routers_loss": 0.0030062920413911343, + "skip_count": 0.0, + "step": 8100, + "text_loss": 0.26022693514823914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0001477072985030542, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 13065273.0, + "repeat_count": 0.0, + "routers_loss": 0.0006919128354638815, + "skip_count": 0.0, + "step": 8102, + "text_loss": 0.5927232503890991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00014748772715290144, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 13068346.0, + "repeat_count": 0.0, + "routers_loss": 0.005062389187514782, + "skip_count": 0.0, + "step": 8104, + "text_loss": 0.1255214959383011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.00014726829088611664, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13071384.0, + "repeat_count": 0.0, + "routers_loss": 0.0005492564523592591, + "skip_count": 0.0, + "step": 8106, + "text_loss": 0.6445038914680481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.00014704898978678817, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13074667.0, + "repeat_count": 0.0, + "routers_loss": 0.002470226027071476, + "skip_count": 0.0, + "step": 8108, + "text_loss": 0.5019628405570984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.00014682982393895256, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 13077566.0, + "repeat_count": 0.0, + "routers_loss": 0.0008262090268544853, + "skip_count": 0.0, + "step": 8110, + "text_loss": 0.6075460314750671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00014661079342659467, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 13081042.0, + "repeat_count": 0.0, + "routers_loss": 0.00034181721275672317, + "skip_count": 0.0, + "step": 8112, + "text_loss": 0.7349393963813782 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0001463918983336474, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 13084151.0, + "repeat_count": 1.0, + "routers_loss": 0.01406828872859478, + "skip_count": 2.0, + "step": 8114, + "text_loss": 0.3122454285621643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017822265625, + "learning_rate": 0.00014617313874399173, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13086998.0, + "repeat_count": 0.0, + "routers_loss": 0.002714085392653942, + "skip_count": 0.0, + "step": 8116, + "text_loss": 0.6545852422714233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00014595451474145677, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 13090017.0, + "repeat_count": 0.0, + "routers_loss": 0.0073202489875257015, + "skip_count": 0.0, + "step": 8118, + "text_loss": 0.5487201809883118 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.00014573602640981947, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 13093651.0, + "repeat_count": 0.0, + "routers_loss": 0.000667977670673281, + "skip_count": 0.0, + "step": 8120, + "text_loss": 0.672166109085083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00014551767383280535, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 13097139.0, + "repeat_count": 0.0, + "routers_loss": 0.0020584615413099527, + "skip_count": 0.0, + "step": 8122, + "text_loss": 0.1996239423751831 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.14088641033167, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.00014529945709408726, + "loss": 0.0069, + "macro_f1": 0.6598639488220215, + "num_tokens": 13100493.0, + "repeat_count": 1.0, + "routers_loss": 0.013855135068297386, + "skip_count": 3.0, + "step": 8124, + "text_loss": 0.4099486768245697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0001450813762772863, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13103488.0, + "repeat_count": 0.0, + "routers_loss": 0.0014984552981331944, + "skip_count": 0.0, + "step": 8126, + "text_loss": 0.6307108402252197 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.00014486343146597152, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 13106445.0, + "repeat_count": 1.0, + "routers_loss": 0.00430954247713089, + "skip_count": 0.0, + "step": 8128, + "text_loss": 0.6226127743721008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.00014464562274365972, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 13109258.0, + "repeat_count": 0.0, + "routers_loss": 0.003711461555212736, + "skip_count": 1.0, + "step": 8130, + "text_loss": 0.17819052934646606 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.17845611975345, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00014442795019381567, + "loss": 0.0064, + "macro_f1": 0.6603773832321167, + "num_tokens": 13114206.0, + "repeat_count": 1.0, + "routers_loss": 0.015719098970294, + "skip_count": 1.0, + "step": 8132, + "text_loss": 0.28450697660446167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00014421041389985184, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13117351.0, + "repeat_count": 0.0, + "routers_loss": 0.0013113922905176878, + "skip_count": 0.0, + "step": 8134, + "text_loss": 0.310830682516098 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.00014399301394512858, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 13120228.0, + "repeat_count": 1.0, + "routers_loss": 0.001965439412742853, + "skip_count": 1.0, + "step": 8136, + "text_loss": 0.8635116815567017 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00014377575041295393, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 13123380.0, + "repeat_count": 1.0, + "routers_loss": 0.004898902028799057, + "skip_count": 2.0, + "step": 8138, + "text_loss": 0.5302467346191406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.0001435586233865836, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13126875.0, + "repeat_count": 0.0, + "routers_loss": 0.00031845085322856903, + "skip_count": 0.0, + "step": 8140, + "text_loss": 0.5913560390472412 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0001433416329492213, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 13129563.0, + "repeat_count": 1.0, + "routers_loss": 0.00298812473192811, + "skip_count": 1.0, + "step": 8142, + "text_loss": 0.5153398513793945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00014312477918401807, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 13132608.0, + "repeat_count": 0.0, + "routers_loss": 0.0026608197949826717, + "skip_count": 1.0, + "step": 8144, + "text_loss": 0.4554155766963959 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00014290806217407272, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 13136204.0, + "repeat_count": 1.0, + "routers_loss": 0.0027651884593069553, + "skip_count": 1.0, + "step": 8146, + "text_loss": 0.6349515318870544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00014269148200243148, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 13138895.0, + "repeat_count": 0.0, + "routers_loss": 0.0006579195614904165, + "skip_count": 0.0, + "step": 8148, + "text_loss": 0.4629364013671875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.26298796595245, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.00014247503875208846, + "loss": 0.0059, + "macro_f1": 0.3272727429866791, + "num_tokens": 13142500.0, + "repeat_count": 1.0, + "routers_loss": 0.023065708577632904, + "skip_count": 0.0, + "step": 8150, + "text_loss": 0.4962928593158722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.00014225873250598496, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 13146203.0, + "repeat_count": 0.0, + "routers_loss": 0.007397830951958895, + "skip_count": 1.0, + "step": 8152, + "text_loss": 0.3225953280925751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00014204256334700988, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 13149517.0, + "repeat_count": 0.0, + "routers_loss": 0.004839105997234583, + "skip_count": 1.0, + "step": 8154, + "text_loss": 0.18435558676719666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 38.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00014182653135799995, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 13152643.0, + "repeat_count": 0.0, + "routers_loss": 0.0028303388971835375, + "skip_count": 4.0, + "step": 8156, + "text_loss": 0.5836900472640991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0001416106366217389, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 13155213.0, + "repeat_count": 0.0, + "routers_loss": 0.0004012314020656049, + "skip_count": 0.0, + "step": 8158, + "text_loss": 0.3723861575126648 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 20.0, + "epoch": 38.30995010272967, + "f1_execute": 0.9714285731315613, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0001413948792209579, + "loss": 0.0065, + "macro_f1": 0.8793651461601257, + "num_tokens": 13158440.0, + "repeat_count": 2.0, + "routers_loss": 0.04377155378460884, + "skip_count": 9.0, + "step": 8160, + "text_loss": 0.32476910948753357 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0001411792592383357, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13162651.0, + "repeat_count": 0.0, + "routers_loss": 0.0011163362069055438, + "skip_count": 0.0, + "step": 8162, + "text_loss": 0.4890389144420624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.32873495744057, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00014096377675649823, + "loss": 0.0055, + "macro_f1": 0.6603773832321167, + "num_tokens": 13165406.0, + "repeat_count": 1.0, + "routers_loss": 0.012117774225771427, + "skip_count": 1.0, + "step": 8164, + "text_loss": 0.7763246893882751 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.33812738479601, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00014074843185801883, + "loss": 0.004, + "macro_f1": 0.9262410998344421, + "num_tokens": 13168402.0, + "repeat_count": 3.0, + "routers_loss": 0.009951545856893063, + "skip_count": 2.0, + "step": 8166, + "text_loss": 0.5038266777992249 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00014053322462541802, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 13171423.0, + "repeat_count": 1.0, + "routers_loss": 0.0021372761111706495, + "skip_count": 1.0, + "step": 8168, + "text_loss": 0.5634724497795105 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00014031815514116354, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 13174713.0, + "repeat_count": 0.0, + "routers_loss": 0.0007417177548632026, + "skip_count": 0.0, + "step": 8170, + "text_loss": 0.4009707272052765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 38.36630466686234, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.035888671875, + "learning_rate": 0.00014010322348767057, + "loss": 0.0077, + "macro_f1": 0.5934640765190125, + "num_tokens": 13178012.0, + "repeat_count": 0.0, + "routers_loss": 0.01619168184697628, + "skip_count": 3.0, + "step": 8172, + "text_loss": 0.29182371497154236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00013988842974730137, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13181096.0, + "repeat_count": 0.0, + "routers_loss": 0.0037969043478369713, + "skip_count": 0.0, + "step": 8174, + "text_loss": 0.275851845741272 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00013967377400236515, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 13184116.0, + "repeat_count": 0.0, + "routers_loss": 0.0007759644067846239, + "skip_count": 0.0, + "step": 8176, + "text_loss": 0.7569663524627686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00013945925633511848, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 13187319.0, + "repeat_count": 0.0, + "routers_loss": 0.002708743792027235, + "skip_count": 0.0, + "step": 8178, + "text_loss": 0.4733831286430359 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.00013924487682776492, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 13190796.0, + "repeat_count": 0.0, + "routers_loss": 0.0005060714902356267, + "skip_count": 0.0, + "step": 8180, + "text_loss": 0.5663171410560608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.413266803639566, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0001390306355624551, + "loss": 0.0049, + "macro_f1": 0.3272727429866791, + "num_tokens": 13193705.0, + "repeat_count": 0.0, + "routers_loss": 0.02932601235806942, + "skip_count": 1.0, + "step": 8182, + "text_loss": 0.30700045824050903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0001388165326212867, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13196393.0, + "repeat_count": 0.0, + "routers_loss": 0.0011637522839009762, + "skip_count": 0.0, + "step": 8184, + "text_loss": 0.6897354125976562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00013860256808630427, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 13199526.0, + "repeat_count": 0.0, + "routers_loss": 0.0017184355529025197, + "skip_count": 0.0, + "step": 8186, + "text_loss": 0.6246579885482788 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.00013838874203949954, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 13202963.0, + "repeat_count": 0.0, + "routers_loss": 0.0026622721925377846, + "skip_count": 0.0, + "step": 8188, + "text_loss": 0.506066083908081 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00013817505456281099, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13207408.0, + "repeat_count": 0.0, + "routers_loss": 0.000543750764336437, + "skip_count": 0.0, + "step": 8190, + "text_loss": 0.5192428231239319 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0001379615057381241, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 13211073.0, + "repeat_count": 0.0, + "routers_loss": 0.0010060713393613696, + "skip_count": 0.0, + "step": 8192, + "text_loss": 0.5640166401863098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.00013774809564727104, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 13214203.0, + "repeat_count": 0.0, + "routers_loss": 0.005152868572622538, + "skip_count": 2.0, + "step": 8194, + "text_loss": 0.8643819689750671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.0001375348243720312, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 13217748.0, + "repeat_count": 0.0, + "routers_loss": 0.0017722113989293575, + "skip_count": 2.0, + "step": 8196, + "text_loss": 0.40500834584236145 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0001373216919941304, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 13221341.0, + "repeat_count": 1.0, + "routers_loss": 0.00999271310865879, + "skip_count": 3.0, + "step": 8198, + "text_loss": 0.2317391037940979 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00013710869859524143, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13224288.0, + "repeat_count": 0.0, + "routers_loss": 0.0016836341237649322, + "skip_count": 0.0, + "step": 8200, + "text_loss": 0.31873467564582825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00013689584425698376, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13227342.0, + "repeat_count": 0.0, + "routers_loss": 0.002255793660879135, + "skip_count": 0.0, + "step": 8202, + "text_loss": 0.13513202965259552 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0001366831290609235, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 13230912.0, + "repeat_count": 1.0, + "routers_loss": 0.0062925987876951694, + "skip_count": 4.0, + "step": 8204, + "text_loss": 0.3692396581172943 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00013647055308857353, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 13233961.0, + "repeat_count": 1.0, + "routers_loss": 0.0020471401512622833, + "skip_count": 0.0, + "step": 8206, + "text_loss": 0.5655510425567627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0001362581164213934, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 13237170.0, + "repeat_count": 0.0, + "routers_loss": 0.0009666495025157928, + "skip_count": 0.0, + "step": 8208, + "text_loss": 0.720582902431488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.00013604581914078922, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 13241020.0, + "repeat_count": 0.0, + "routers_loss": 0.0006306356517598033, + "skip_count": 0.0, + "step": 8210, + "text_loss": 0.5686481595039368 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.55415321397123, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.00013583366132811374, + "loss": 0.0058, + "macro_f1": 0.5492662787437439, + "num_tokens": 13244491.0, + "repeat_count": 2.0, + "routers_loss": 0.016230134293437004, + "skip_count": 0.0, + "step": 8212, + "text_loss": 0.55678790807724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00013562164306466624, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 13247551.0, + "repeat_count": 0.0, + "routers_loss": 0.003904943587258458, + "skip_count": 2.0, + "step": 8214, + "text_loss": 0.6521575450897217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00013540976443169244, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 13250863.0, + "repeat_count": 0.0, + "routers_loss": 0.002239734400063753, + "skip_count": 1.0, + "step": 8216, + "text_loss": 0.29757481813430786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.00013519802551038452, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 13254215.0, + "repeat_count": 0.0, + "routers_loss": 0.004978829529136419, + "skip_count": 2.0, + "step": 8218, + "text_loss": 0.30598193407058716 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00013498642638188157, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13257269.0, + "repeat_count": 0.0, + "routers_loss": 0.0040260558016598225, + "skip_count": 0.0, + "step": 8220, + "text_loss": 0.39327144622802734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00013477496712726862, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 13260573.0, + "repeat_count": 0.0, + "routers_loss": 0.002124674618244171, + "skip_count": 0.0, + "step": 8222, + "text_loss": 0.38342708349227905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00013456364782757718, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 13263684.0, + "repeat_count": 0.0, + "routers_loss": 0.00087209593039006, + "skip_count": 0.0, + "step": 8224, + "text_loss": 0.6338301301002502 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00013435246856378526, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 13266879.0, + "repeat_count": 1.0, + "routers_loss": 0.003183641703799367, + "skip_count": 0.0, + "step": 8226, + "text_loss": 0.6073583364486694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0169677734375, + "learning_rate": 0.00013414142941681718, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 13270679.0, + "repeat_count": 0.0, + "routers_loss": 0.001859338372014463, + "skip_count": 0.0, + "step": 8228, + "text_loss": 0.5427029132843018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0001339305304675435, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13273275.0, + "repeat_count": 0.0, + "routers_loss": 0.000655558833386749, + "skip_count": 0.0, + "step": 8230, + "text_loss": 0.29442915320396423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.00013371977179678113, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 13276205.0, + "repeat_count": 0.0, + "routers_loss": 0.0011499621905386448, + "skip_count": 0.0, + "step": 8232, + "text_loss": 0.5601125359535217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00013350915348529313, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 13279242.0, + "repeat_count": 0.0, + "routers_loss": 0.0019823790062218904, + "skip_count": 0.0, + "step": 8234, + "text_loss": 0.43674135208129883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.00013329867561378888, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 13282531.0, + "repeat_count": 0.0, + "routers_loss": 0.005772443953901529, + "skip_count": 3.0, + "step": 8236, + "text_loss": 0.4838809072971344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00013308833826292395, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 13286219.0, + "repeat_count": 0.0, + "routers_loss": 0.0038314659614115953, + "skip_count": 2.0, + "step": 8238, + "text_loss": 0.5002569556236267 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 38.685647196947464, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.031005859375, + "learning_rate": 0.00013287814151329987, + "loss": 0.0075, + "macro_f1": 0.9452888369560242, + "num_tokens": 13290348.0, + "repeat_count": 1.0, + "routers_loss": 0.04819172993302345, + "skip_count": 4.0, + "step": 8240, + "text_loss": 0.3099883198738098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00013266808544546438, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 13293644.0, + "repeat_count": 0.0, + "routers_loss": 0.010334883816540241, + "skip_count": 2.0, + "step": 8242, + "text_loss": 0.17672912776470184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.00013245817013991164, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 13296721.0, + "repeat_count": 0.0, + "routers_loss": 0.00162201386410743, + "skip_count": 0.0, + "step": 8244, + "text_loss": 0.7664286494255066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00013224839567708142, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 13299704.0, + "repeat_count": 0.0, + "routers_loss": 0.0039452011696994305, + "skip_count": 0.0, + "step": 8246, + "text_loss": 0.1827820986509323 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 38.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.00013203876213735972, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 13302553.0, + "repeat_count": 1.0, + "routers_loss": 0.006701917387545109, + "skip_count": 7.0, + "step": 8248, + "text_loss": 0.6020278930664062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0001318292696010785, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13305875.0, + "repeat_count": 0.0, + "routers_loss": 0.00968079548329115, + "skip_count": 2.0, + "step": 8250, + "text_loss": 0.2693248987197876 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00013161991814851571, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 13309115.0, + "repeat_count": 2.0, + "routers_loss": 0.008890608325600624, + "skip_count": 2.0, + "step": 8252, + "text_loss": 0.6325297355651855 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.00013141070785989517, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 13312219.0, + "repeat_count": 1.0, + "routers_loss": 0.00825794693082571, + "skip_count": 4.0, + "step": 8254, + "text_loss": 0.284396767616272 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00013120163881538677, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 13315214.0, + "repeat_count": 0.0, + "routers_loss": 0.003378969384357333, + "skip_count": 1.0, + "step": 8256, + "text_loss": 0.20296992361545563 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.77017904314646, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00013099271109510603, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 13319117.0, + "repeat_count": 1.0, + "routers_loss": 0.0164186954498291, + "skip_count": 0.0, + "step": 8258, + "text_loss": 0.21940068900585175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0001307839247791145, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 13321631.0, + "repeat_count": 0.0, + "routers_loss": 0.0053979759104549885, + "skip_count": 3.0, + "step": 8260, + "text_loss": 0.19442199170589447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00013057527994741946, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 13324759.0, + "repeat_count": 0.0, + "routers_loss": 0.0024567479267716408, + "skip_count": 0.0, + "step": 8262, + "text_loss": 0.5528824925422668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0001303667766799741, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13327554.0, + "repeat_count": 0.0, + "routers_loss": 0.002819873159751296, + "skip_count": 1.0, + "step": 8264, + "text_loss": 0.4418395757675171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.00013015841505667703, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 13331838.0, + "repeat_count": 0.0, + "routers_loss": 0.0030280952341854572, + "skip_count": 1.0, + "step": 8266, + "text_loss": 0.5263079404830933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 38.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0001299501951573731, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 13334968.0, + "repeat_count": 0.0, + "routers_loss": 0.001774887670762837, + "skip_count": 4.0, + "step": 8268, + "text_loss": 0.47985130548477173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00012974211706185247, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 13338052.0, + "repeat_count": 0.0, + "routers_loss": 0.007027842104434967, + "skip_count": 1.0, + "step": 8270, + "text_loss": 0.6588287949562073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00012953418084985107, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13341653.0, + "repeat_count": 0.0, + "routers_loss": 0.0026854060124605894, + "skip_count": 1.0, + "step": 8272, + "text_loss": 0.43156498670578003 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.00012932638660105038, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 13345173.0, + "repeat_count": 0.0, + "routers_loss": 0.0033325920812785625, + "skip_count": 0.0, + "step": 8274, + "text_loss": 0.1679086685180664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00012911873439507766, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 13348635.0, + "repeat_count": 0.0, + "routers_loss": 0.0016183287370949984, + "skip_count": 0.0, + "step": 8276, + "text_loss": 0.5907418131828308 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00012891122431150549, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 13351120.0, + "repeat_count": 0.0, + "routers_loss": 0.0049970983527600765, + "skip_count": 1.0, + "step": 8278, + "text_loss": 0.5437678694725037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.00012870385642985222, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 13353774.0, + "repeat_count": 0.0, + "routers_loss": 0.0027123154141008854, + "skip_count": 0.0, + "step": 8280, + "text_loss": 0.5742796659469604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00012849663082958158, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 13358236.0, + "repeat_count": 0.0, + "routers_loss": 0.0062842960469424725, + "skip_count": 0.0, + "step": 8282, + "text_loss": 0.2340863049030304 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.00012828954759010265, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13360994.0, + "repeat_count": 0.0, + "routers_loss": 0.0006564505747519433, + "skip_count": 0.0, + "step": 8284, + "text_loss": 0.45432794094085693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0001280826067907705, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13363665.0, + "repeat_count": 0.0, + "routers_loss": 0.001298630959354341, + "skip_count": 0.0, + "step": 8286, + "text_loss": 0.7439755201339722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00012787580851088493, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 13367412.0, + "repeat_count": 0.0, + "routers_loss": 0.00464112963527441, + "skip_count": 0.0, + "step": 8288, + "text_loss": 0.2854461669921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0001276691528296916, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 13370745.0, + "repeat_count": 0.0, + "routers_loss": 0.0006090773968026042, + "skip_count": 0.0, + "step": 8290, + "text_loss": 0.6663011312484741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.00012746263982638123, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13373396.0, + "repeat_count": 0.0, + "routers_loss": 0.0038922233507037163, + "skip_count": 0.0, + "step": 8292, + "text_loss": 0.3858443796634674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.93924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00012725626958009007, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 13376172.0, + "repeat_count": 0.0, + "routers_loss": 0.0016941255889832973, + "skip_count": 0.0, + "step": 8294, + "text_loss": 0.4758119285106659 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0001270500421698994, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 13379002.0, + "repeat_count": 1.0, + "routers_loss": 0.001703770598396659, + "skip_count": 0.0, + "step": 8296, + "text_loss": 0.7464606165885925 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00012684395767483626, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 13382221.0, + "repeat_count": 0.0, + "routers_loss": 0.001474690856412053, + "skip_count": 1.0, + "step": 8298, + "text_loss": 0.37309199571609497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00012663801617387245, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 13385276.0, + "repeat_count": 0.0, + "routers_loss": 0.004561704583466053, + "skip_count": 3.0, + "step": 8300, + "text_loss": 0.43284836411476135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 38.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00012643221774592518, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 13388321.0, + "repeat_count": 2.0, + "routers_loss": 0.005136100109666586, + "skip_count": 1.0, + "step": 8302, + "text_loss": 0.669730007648468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00012622656246985675, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 13391222.0, + "repeat_count": 0.0, + "routers_loss": 0.0028521555941551924, + "skip_count": 0.0, + "step": 8304, + "text_loss": 0.16773155331611633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00012602105042447471, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 13395297.0, + "repeat_count": 0.0, + "routers_loss": 0.0033424890134483576, + "skip_count": 2.0, + "step": 8306, + "text_loss": 0.1650846153497696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001258156816885316, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 13398482.0, + "repeat_count": 0.0, + "routers_loss": 0.0012481207959353924, + "skip_count": 0.0, + "step": 8308, + "text_loss": 0.37225499749183655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 39.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00012561045634072515, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 13402199.0, + "repeat_count": 0.0, + "routers_loss": 0.006243644282221794, + "skip_count": 3.0, + "step": 8310, + "text_loss": 0.16000206768512726 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00012540537445969807, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 13404950.0, + "repeat_count": 0.0, + "routers_loss": 0.004267443902790546, + "skip_count": 2.0, + "step": 8312, + "text_loss": 0.400174081325531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00012520043612403815, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 13407883.0, + "repeat_count": 0.0, + "routers_loss": 0.005013707559555769, + "skip_count": 2.0, + "step": 8314, + "text_loss": 0.1331731230020523 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 39.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00012499564141227798, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 13410563.0, + "repeat_count": 1.0, + "routers_loss": 0.00463570561259985, + "skip_count": 0.0, + "step": 8316, + "text_loss": 0.5098661184310913 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0001247909904028956, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 13413730.0, + "repeat_count": 1.0, + "routers_loss": 0.007066591177135706, + "skip_count": 1.0, + "step": 8318, + "text_loss": 0.8059925436973572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 39.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00012458648317431348, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 13416425.0, + "repeat_count": 0.0, + "routers_loss": 0.004210594110190868, + "skip_count": 3.0, + "step": 8320, + "text_loss": 0.6559522151947021 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0001243821198048992, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 13419851.0, + "repeat_count": 1.0, + "routers_loss": 0.005613257177174091, + "skip_count": 2.0, + "step": 8322, + "text_loss": 0.2783811688423157 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00012417790037296523, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 13422588.0, + "repeat_count": 0.0, + "routers_loss": 0.00233642989769578, + "skip_count": 1.0, + "step": 8324, + "text_loss": 0.7659147381782532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00012397382495676874, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 13425275.0, + "repeat_count": 0.0, + "routers_loss": 0.0013295465614646673, + "skip_count": 0.0, + "step": 8326, + "text_loss": 0.5693745017051697 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0001237698936345119, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 13428314.0, + "repeat_count": 1.0, + "routers_loss": 0.005712272133678198, + "skip_count": 1.0, + "step": 8328, + "text_loss": 0.8581340909004211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00012356610648434153, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 13431453.0, + "repeat_count": 0.0, + "routers_loss": 0.0015835616504773498, + "skip_count": 0.0, + "step": 8330, + "text_loss": 0.1395341008901596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.00012336246358434928, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 13434566.0, + "repeat_count": 0.0, + "routers_loss": 0.0012973316479474306, + "skip_count": 0.0, + "step": 8332, + "text_loss": 0.7125005125999451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.00012315896501257145, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 13438056.0, + "repeat_count": 0.0, + "routers_loss": 0.0005822008824907243, + "skip_count": 0.0, + "step": 8334, + "text_loss": 0.7730510234832764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00012295561084698915, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 13441390.0, + "repeat_count": 0.0, + "routers_loss": 0.00547185679897666, + "skip_count": 1.0, + "step": 8336, + "text_loss": 0.3927873373031616 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.000122752401165528, + "loss": 0.0022, + "macro_f1": 0.3333333432674408, + "num_tokens": 13443864.0, + "repeat_count": 0.0, + "routers_loss": 0.0011191967641934752, + "skip_count": 0.0, + "step": 8338, + "text_loss": 0.3996548354625702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00012254933604605828, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 13447070.0, + "repeat_count": 0.0, + "routers_loss": 0.0005196621641516685, + "skip_count": 0.0, + "step": 8340, + "text_loss": 0.5597847104072571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00012234641556639508, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 13450522.0, + "repeat_count": 0.0, + "routers_loss": 0.003857341594994068, + "skip_count": 2.0, + "step": 8342, + "text_loss": 0.14400488138198853 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00012214363980429793, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 13453578.0, + "repeat_count": 1.0, + "routers_loss": 0.006664265412837267, + "skip_count": 3.0, + "step": 8344, + "text_loss": 0.27675092220306396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.00012194100883747078, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 13456480.0, + "repeat_count": 0.0, + "routers_loss": 0.003549816319718957, + "skip_count": 0.0, + "step": 8346, + "text_loss": 0.21776801347732544 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00012173852274356217, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 13459859.0, + "repeat_count": 1.0, + "routers_loss": 0.00446992926299572, + "skip_count": 3.0, + "step": 8348, + "text_loss": 0.1828736811876297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00012153618160016527, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 13463104.0, + "repeat_count": 0.0, + "routers_loss": 0.0024826989974826574, + "skip_count": 1.0, + "step": 8350, + "text_loss": 0.15649555623531342 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0001213339854848175, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 13467051.0, + "repeat_count": 0.0, + "routers_loss": 0.0021385846193879843, + "skip_count": 1.0, + "step": 8352, + "text_loss": 0.49281737208366394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.00012113193447500081, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 13470411.0, + "repeat_count": 0.0, + "routers_loss": 0.0014382716035470366, + "skip_count": 1.0, + "step": 8354, + "text_loss": 0.5984349846839905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00012093002864814151, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 13474666.0, + "repeat_count": 0.0, + "routers_loss": 0.008536498062312603, + "skip_count": 1.0, + "step": 8356, + "text_loss": 0.2851131856441498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00012072826808161036, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13477754.0, + "repeat_count": 0.0, + "routers_loss": 0.0027286717668175697, + "skip_count": 0.0, + "step": 8358, + "text_loss": 0.5987376570701599 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0001205266528527223, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 13481151.0, + "repeat_count": 0.0, + "routers_loss": 0.002780565759167075, + "skip_count": 1.0, + "step": 8360, + "text_loss": 0.1847199648618698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00012032518303873674, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 13484050.0, + "repeat_count": 0.0, + "routers_loss": 0.0006186611135490239, + "skip_count": 0.0, + "step": 8362, + "text_loss": 0.6229772567749023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 39.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.00012012385871685716, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 13488551.0, + "repeat_count": 0.0, + "routers_loss": 0.00956071075052023, + "skip_count": 5.0, + "step": 8364, + "text_loss": 0.2810790538787842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00011992267996423162, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13491420.0, + "repeat_count": 0.0, + "routers_loss": 0.008410792797803879, + "skip_count": 2.0, + "step": 8366, + "text_loss": 0.20509617030620575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00011972164685795212, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 13494736.0, + "repeat_count": 0.0, + "routers_loss": 0.00762166129425168, + "skip_count": 1.0, + "step": 8368, + "text_loss": 0.24739402532577515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.295861461696504, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.00011952075947505486, + "loss": 0.0051, + "macro_f1": 0.3272727429866791, + "num_tokens": 13498363.0, + "repeat_count": 0.0, + "routers_loss": 0.010674391873180866, + "skip_count": 1.0, + "step": 8370, + "text_loss": 0.31931644678115845 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 39.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0001193200178925204, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 13501029.0, + "repeat_count": 2.0, + "routers_loss": 0.0041843741200864315, + "skip_count": 1.0, + "step": 8372, + "text_loss": 0.5103049278259277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00011911942218727312, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 13503854.0, + "repeat_count": 0.0, + "routers_loss": 0.0006344785797409713, + "skip_count": 0.0, + "step": 8374, + "text_loss": 0.4914432764053345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00011891897243618183, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 13508316.0, + "repeat_count": 0.0, + "routers_loss": 0.0003527739318087697, + "skip_count": 0.0, + "step": 8376, + "text_loss": 0.5317551493644714 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00011871866871605913, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 13512603.0, + "repeat_count": 0.0, + "routers_loss": 0.001071247854270041, + "skip_count": 0.0, + "step": 8378, + "text_loss": 0.6693558096885681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00011851851110366185, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 13515928.0, + "repeat_count": 0.0, + "routers_loss": 0.000924977008253336, + "skip_count": 1.0, + "step": 8380, + "text_loss": 0.8004939556121826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0001183184996756908, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13518548.0, + "repeat_count": 0.0, + "routers_loss": 0.0017637151759117842, + "skip_count": 0.0, + "step": 8382, + "text_loss": 0.5012105107307434 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 39.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00011811863450879063, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 13522155.0, + "repeat_count": 2.0, + "routers_loss": 0.0011129514314234257, + "skip_count": 0.0, + "step": 8384, + "text_loss": 0.3866073489189148 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 39.371000880540066, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.00011791891567955009, + "loss": 0.0046, + "macro_f1": 0.8814815282821655, + "num_tokens": 13525352.0, + "repeat_count": 2.0, + "routers_loss": 0.042801812291145325, + "skip_count": 4.0, + "step": 8386, + "text_loss": 0.18817944824695587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018798828125, + "learning_rate": 0.00011771934326450173, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13528537.0, + "repeat_count": 0.0, + "routers_loss": 0.0006869474309496582, + "skip_count": 0.0, + "step": 8388, + "text_loss": 0.6407818794250488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00011751991734012229, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 13531650.0, + "repeat_count": 0.0, + "routers_loss": 0.0008001072565093637, + "skip_count": 0.0, + "step": 8390, + "text_loss": 0.5149344205856323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00011732063798283204, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 13535071.0, + "repeat_count": 0.0, + "routers_loss": 0.0006921148742549121, + "skip_count": 0.0, + "step": 8392, + "text_loss": 0.5906356573104858 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00011712150526899523, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 13537741.0, + "repeat_count": 0.0, + "routers_loss": 0.005221226718276739, + "skip_count": 2.0, + "step": 8394, + "text_loss": 0.3381146192550659 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00011692251927491987, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 13541189.0, + "repeat_count": 1.0, + "routers_loss": 0.0023983579594641924, + "skip_count": 1.0, + "step": 8396, + "text_loss": 0.7345486283302307 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.00011672368007685774, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 13545210.0, + "repeat_count": 1.0, + "routers_loss": 0.005362956319004297, + "skip_count": 2.0, + "step": 8398, + "text_loss": 0.6522865295410156 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.00011652498775100445, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 13548260.0, + "repeat_count": 0.0, + "routers_loss": 0.002955642296001315, + "skip_count": 0.0, + "step": 8400, + "text_loss": 0.3200102150440216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00011632644237349927, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13551519.0, + "repeat_count": 0.0, + "routers_loss": 0.001079231034964323, + "skip_count": 0.0, + "step": 8402, + "text_loss": 0.7251807451248169 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 39.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00011612804402042509, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 13555241.0, + "repeat_count": 1.0, + "routers_loss": 0.013860360719263554, + "skip_count": 0.0, + "step": 8404, + "text_loss": 0.159539595246315 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 39.46492515409451, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.054931640625, + "learning_rate": 0.00011592979276780857, + "loss": 0.0055, + "macro_f1": 0.9555556178092957, + "num_tokens": 13558389.0, + "repeat_count": 1.0, + "routers_loss": 0.017025530338287354, + "skip_count": 5.0, + "step": 8406, + "text_loss": 0.5154430270195007 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.00011573168869162004, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 13561237.0, + "repeat_count": 1.0, + "routers_loss": 0.007349071092903614, + "skip_count": 2.0, + "step": 8408, + "text_loss": 0.20888492465019226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00011553373186777327, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 13564080.0, + "repeat_count": 1.0, + "routers_loss": 0.003303215140476823, + "skip_count": 2.0, + "step": 8410, + "text_loss": 0.21808166801929474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.49310243616085, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00011533592237212558, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 13566649.0, + "repeat_count": 0.0, + "routers_loss": 0.005856195464730263, + "skip_count": 1.0, + "step": 8412, + "text_loss": 0.28037169575691223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001151382602804782, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13570015.0, + "repeat_count": 0.0, + "routers_loss": 0.0007515792385675013, + "skip_count": 0.0, + "step": 8414, + "text_loss": 0.8517835736274719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00011494074566857549, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 13573262.0, + "repeat_count": 0.0, + "routers_loss": 0.0043421462178230286, + "skip_count": 0.0, + "step": 8416, + "text_loss": 0.27418580651283264 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00011474337861210544, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 13576104.0, + "repeat_count": 1.0, + "routers_loss": 0.0108594736084342, + "skip_count": 2.0, + "step": 8418, + "text_loss": 0.4724268317222595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.53067214558262, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00011454615918669948, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 13579138.0, + "repeat_count": 1.0, + "routers_loss": 0.04178442806005478, + "skip_count": 0.0, + "step": 8420, + "text_loss": 0.4065103530883789 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00011434908746793238, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 13582818.0, + "repeat_count": 0.0, + "routers_loss": 0.004756448790431023, + "skip_count": 2.0, + "step": 8422, + "text_loss": 0.2932167947292328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00011415216353132252, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 13586261.0, + "repeat_count": 0.0, + "routers_loss": 0.0033427432645112276, + "skip_count": 1.0, + "step": 8424, + "text_loss": 0.47670233249664307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0001139553874523313, + "loss": 0.003, + "macro_f1": 0.6666666865348816, + "num_tokens": 13589765.0, + "repeat_count": 0.0, + "routers_loss": 0.006597383879125118, + "skip_count": 1.0, + "step": 8426, + "text_loss": 0.31448885798454285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.5682418550044, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.00011375875930636403, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 13592741.0, + "repeat_count": 0.0, + "routers_loss": 0.011398134753108025, + "skip_count": 1.0, + "step": 8428, + "text_loss": 0.17429469525814056 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 39.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.00011356227916876877, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 13595763.0, + "repeat_count": 1.0, + "routers_loss": 0.0038021153304725885, + "skip_count": 0.0, + "step": 8430, + "text_loss": 0.6043882966041565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00011336594711483712, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 13598274.0, + "repeat_count": 0.0, + "routers_loss": 0.00044314167462289333, + "skip_count": 0.0, + "step": 8432, + "text_loss": 0.3818575143814087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00011316976321980388, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13601510.0, + "repeat_count": 0.0, + "routers_loss": 0.001956664025783539, + "skip_count": 0.0, + "step": 8434, + "text_loss": 0.48483794927597046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0001129737275588471, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 13604410.0, + "repeat_count": 0.0, + "routers_loss": 0.005170237272977829, + "skip_count": 0.0, + "step": 8436, + "text_loss": 0.21759741008281708 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00011277784020708803, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 13607207.0, + "repeat_count": 1.0, + "routers_loss": 0.002223948948085308, + "skip_count": 2.0, + "step": 8438, + "text_loss": 0.6877034306526184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.00011258210123959089, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13610981.0, + "repeat_count": 0.0, + "routers_loss": 0.0017733481945469975, + "skip_count": 1.0, + "step": 8440, + "text_loss": 0.7250658273696899 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00011238651073136358, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 13614194.0, + "repeat_count": 1.0, + "routers_loss": 0.00155889883171767, + "skip_count": 1.0, + "step": 8442, + "text_loss": 0.6742649078369141 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00011219106875735652, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 13618011.0, + "repeat_count": 0.0, + "routers_loss": 0.0011234934208914638, + "skip_count": 0.0, + "step": 8444, + "text_loss": 0.8105526566505432 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 39.65277370120341, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00011199577539246347, + "loss": 0.0055, + "macro_f1": 0.6603773832321167, + "num_tokens": 13621852.0, + "repeat_count": 1.0, + "routers_loss": 0.02346695400774479, + "skip_count": 1.0, + "step": 8446, + "text_loss": 0.22664032876491547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0001118006307115213, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 13624711.0, + "repeat_count": 0.0, + "routers_loss": 0.012819754891097546, + "skip_count": 2.0, + "step": 8448, + "text_loss": 0.31696105003356934 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00011160563478930969, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 13627561.0, + "repeat_count": 0.0, + "routers_loss": 0.0060531035996973515, + "skip_count": 2.0, + "step": 8450, + "text_loss": 0.2935826778411865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00011141078770055152, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13630445.0, + "repeat_count": 0.0, + "routers_loss": 0.004288572818040848, + "skip_count": 0.0, + "step": 8452, + "text_loss": 0.5720692873001099 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00011121608951991252, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 13633496.0, + "repeat_count": 0.0, + "routers_loss": 0.005682424642145634, + "skip_count": 1.0, + "step": 8454, + "text_loss": 0.28466710448265076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00011102154032200146, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13635938.0, + "repeat_count": 0.0, + "routers_loss": 0.0009555552969686687, + "skip_count": 0.0, + "step": 8456, + "text_loss": 0.47744694352149963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00011082714018136985, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 13638863.0, + "repeat_count": 0.0, + "routers_loss": 0.0023627313785254955, + "skip_count": 0.0, + "step": 8458, + "text_loss": 0.5212090611457825 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00011063288917251235, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 13641874.0, + "repeat_count": 1.0, + "routers_loss": 0.00791920255869627, + "skip_count": 2.0, + "step": 8460, + "text_loss": 0.31359919905662537 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00011043878736986607, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 13644970.0, + "repeat_count": 1.0, + "routers_loss": 0.0033252311404794455, + "skip_count": 1.0, + "step": 8462, + "text_loss": 0.33621230721473694 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00011024483484781144, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 13648103.0, + "repeat_count": 1.0, + "routers_loss": 0.005567418877035379, + "skip_count": 2.0, + "step": 8464, + "text_loss": 0.48708856105804443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00011005103168067143, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 13651085.0, + "repeat_count": 0.0, + "routers_loss": 0.00047958645154722035, + "skip_count": 0.0, + "step": 8466, + "text_loss": 0.4151248633861542 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00010985737794271161, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 13654175.0, + "repeat_count": 0.0, + "routers_loss": 0.0009806647431105375, + "skip_count": 0.0, + "step": 8468, + "text_loss": 0.7322396039962769 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.00010966387370814057, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13657058.0, + "repeat_count": 0.0, + "routers_loss": 0.0009820344857871532, + "skip_count": 0.0, + "step": 8470, + "text_loss": 0.6350769400596619 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 39.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00010947051905110945, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 13660203.0, + "repeat_count": 2.0, + "routers_loss": 0.002065197564661503, + "skip_count": 0.0, + "step": 8472, + "text_loss": 0.6025850176811218 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00010927731404571211, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 13664021.0, + "repeat_count": 0.0, + "routers_loss": 0.0009939799783751369, + "skip_count": 0.0, + "step": 8474, + "text_loss": 0.3040087819099426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0001090842587659851, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13667055.0, + "repeat_count": 0.0, + "routers_loss": 0.0008282510680146515, + "skip_count": 0.0, + "step": 8476, + "text_loss": 0.7306531667709351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0001088913532859076, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13669940.0, + "repeat_count": 0.0, + "routers_loss": 0.0008349589770659804, + "skip_count": 0.0, + "step": 8478, + "text_loss": 0.32041916251182556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00010869859767940133, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 13672955.0, + "repeat_count": 0.0, + "routers_loss": 0.0007435405277647078, + "skip_count": 0.0, + "step": 8480, + "text_loss": 0.5343614816665649 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.00010850599202033051, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 13676173.0, + "repeat_count": 0.0, + "routers_loss": 0.002763360273092985, + "skip_count": 0.0, + "step": 8482, + "text_loss": 0.6071668267250061 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00010831353638250213, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 13680121.0, + "repeat_count": 0.0, + "routers_loss": 0.00202178000472486, + "skip_count": 0.0, + "step": 8484, + "text_loss": 0.42487844824790955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00010812123083966535, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 13683504.0, + "repeat_count": 0.0, + "routers_loss": 0.0056348275393247604, + "skip_count": 1.0, + "step": 8486, + "text_loss": 0.17678795754909515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00010792907546551229, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 13686870.0, + "repeat_count": 0.0, + "routers_loss": 0.003331703832373023, + "skip_count": 0.0, + "step": 8488, + "text_loss": 0.32238465547561646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00010773707033367708, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 13690429.0, + "repeat_count": 0.0, + "routers_loss": 0.0011620528530329466, + "skip_count": 0.0, + "step": 8490, + "text_loss": 0.4141998291015625 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 39.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.00010754521551773655, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 13693747.0, + "repeat_count": 1.0, + "routers_loss": 0.005236583761870861, + "skip_count": 0.0, + "step": 8492, + "text_loss": 0.557283878326416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 39.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.00010735351109120972, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 13696837.0, + "repeat_count": 0.0, + "routers_loss": 0.005507425405085087, + "skip_count": 6.0, + "step": 8494, + "text_loss": 0.7394861578941345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00010716195712755821, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13700080.0, + "repeat_count": 0.0, + "routers_loss": 0.0008621517335996032, + "skip_count": 0.0, + "step": 8496, + "text_loss": 0.7079368233680725 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00010697055370018572, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13704088.0, + "repeat_count": 0.0, + "routers_loss": 0.0004489862476475537, + "skip_count": 0.0, + "step": 8498, + "text_loss": 0.5672308206558228 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.00010677930088243847, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 13707391.0, + "repeat_count": 1.0, + "routers_loss": 0.009171495214104652, + "skip_count": 2.0, + "step": 8500, + "text_loss": 0.6851600408554077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.00010658819874760495, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 13711238.0, + "repeat_count": 0.0, + "routers_loss": 0.0016714727971702814, + "skip_count": 1.0, + "step": 8502, + "text_loss": 0.7102733850479126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00010639724736891576, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 13714553.0, + "repeat_count": 0.0, + "routers_loss": 0.0012916292762383819, + "skip_count": 0.0, + "step": 8504, + "text_loss": 0.4234752953052521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0001062064468195439, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 13718046.0, + "repeat_count": 0.0, + "routers_loss": 0.0005265420186333358, + "skip_count": 0.0, + "step": 8506, + "text_loss": 0.5576326251029968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0001060157971726045, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13720687.0, + "repeat_count": 0.0, + "routers_loss": 0.0023503501433879137, + "skip_count": 1.0, + "step": 8508, + "text_loss": 0.5259605646133423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00010582529850115469, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 13723946.0, + "repeat_count": 0.0, + "routers_loss": 0.0007593657355755568, + "skip_count": 0.0, + "step": 8510, + "text_loss": 0.3795129954814911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.00010563495087819419, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 13727589.0, + "repeat_count": 0.0, + "routers_loss": 0.0005672222469002008, + "skip_count": 0.0, + "step": 8512, + "text_loss": 0.685897946357727 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.972116231288524, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00010544475437666445, + "loss": 0.0049, + "macro_f1": 0.9262410998344421, + "num_tokens": 13730579.0, + "repeat_count": 3.0, + "routers_loss": 0.01708158478140831, + "skip_count": 2.0, + "step": 8514, + "text_loss": 0.8044925332069397 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.00010525470906944917, + "loss": 0.0113, + "macro_f1": 1.0, + "num_tokens": 13733563.0, + "repeat_count": 1.0, + "routers_loss": 0.010253295302391052, + "skip_count": 2.0, + "step": 8516, + "text_loss": 0.3999447524547577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.00010506481502937398, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 13736645.0, + "repeat_count": 0.0, + "routers_loss": 0.004293019883334637, + "skip_count": 0.0, + "step": 8518, + "text_loss": 0.3128681778907776 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 40.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00010487507232920674, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 13740080.0, + "repeat_count": 1.0, + "routers_loss": 0.0030790462624281645, + "skip_count": 1.0, + "step": 8520, + "text_loss": 0.39142900705337524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00010468548104165709, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 13743085.0, + "repeat_count": 0.0, + "routers_loss": 0.0007342757890000939, + "skip_count": 0.0, + "step": 8522, + "text_loss": 0.7652465105056763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.00010449604123937689, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13746513.0, + "repeat_count": 0.0, + "routers_loss": 0.0030496022664010525, + "skip_count": 0.0, + "step": 8524, + "text_loss": 0.6259746551513672 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 40.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00010430675299495973, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 13749391.0, + "repeat_count": 1.0, + "routers_loss": 0.010060965083539486, + "skip_count": 1.0, + "step": 8526, + "text_loss": 0.2266668826341629 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0001041176163809413, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 13752449.0, + "repeat_count": 1.0, + "routers_loss": 0.002234962536022067, + "skip_count": 2.0, + "step": 8528, + "text_loss": 0.9742465019226074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00010392863146979903, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 13755572.0, + "repeat_count": 0.0, + "routers_loss": 0.0003572004789020866, + "skip_count": 0.0, + "step": 8530, + "text_loss": 0.5757357478141785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.00010373979833395242, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 13759198.0, + "repeat_count": 0.0, + "routers_loss": 0.011161680333316326, + "skip_count": 0.0, + "step": 8532, + "text_loss": 0.6268131136894226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00010355111704576236, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 13761914.0, + "repeat_count": 0.0, + "routers_loss": 0.002053353004157543, + "skip_count": 0.0, + "step": 8534, + "text_loss": 0.22388778626918793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00010336258767753232, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 13765371.0, + "repeat_count": 0.0, + "routers_loss": 0.003634720342233777, + "skip_count": 2.0, + "step": 8536, + "text_loss": 0.5802993178367615 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.084531846199, + "f1_execute": 0.9729729890823364, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00010317421030150692, + "loss": 0.0072, + "macro_f1": 0.9539539813995361, + "num_tokens": 13768276.0, + "repeat_count": 5.0, + "routers_loss": 0.053806692361831665, + "skip_count": 5.0, + "step": 8538, + "text_loss": 0.10888377577066422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.00010298598498987266, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 13772369.0, + "repeat_count": 0.0, + "routers_loss": 0.00501362606883049, + "skip_count": 1.0, + "step": 8540, + "text_loss": 0.5794995427131653 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00010279791181475795, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 13776595.0, + "repeat_count": 1.0, + "routers_loss": 0.002230882178992033, + "skip_count": 2.0, + "step": 8542, + "text_loss": 0.5503702163696289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00010260999084823264, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 13779993.0, + "repeat_count": 0.0, + "routers_loss": 0.0012205395614728332, + "skip_count": 0.0, + "step": 8544, + "text_loss": 0.7248672842979431 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00010242222216230856, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 13782683.0, + "repeat_count": 0.0, + "routers_loss": 0.0003966465883422643, + "skip_count": 0.0, + "step": 8546, + "text_loss": 0.7446619272232056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00010223460582893889, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 13785534.0, + "repeat_count": 0.0, + "routers_loss": 0.004968565888702869, + "skip_count": 1.0, + "step": 8548, + "text_loss": 0.22457796335220337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00010204714192001863, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13788608.0, + "repeat_count": 0.0, + "routers_loss": 0.0033054195810109377, + "skip_count": 2.0, + "step": 8550, + "text_loss": 0.418837308883667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.00010185983050738434, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 13791553.0, + "repeat_count": 0.0, + "routers_loss": 0.001166256028227508, + "skip_count": 0.0, + "step": 8552, + "text_loss": 0.4060337543487549 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00010167267166281402, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 13795304.0, + "repeat_count": 0.0, + "routers_loss": 0.003844029037281871, + "skip_count": 2.0, + "step": 8554, + "text_loss": 0.17412975430488586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00010148566545802718, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 13798445.0, + "repeat_count": 0.0, + "routers_loss": 0.0033507589250802994, + "skip_count": 0.0, + "step": 8556, + "text_loss": 0.24744336307048798 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00010129881196468527, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 13801338.0, + "repeat_count": 0.0, + "routers_loss": 0.004076482728123665, + "skip_count": 0.0, + "step": 8558, + "text_loss": 0.6542767882347107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.00010111211125439069, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 13804157.0, + "repeat_count": 0.0, + "routers_loss": 0.0005654391716234386, + "skip_count": 0.0, + "step": 8560, + "text_loss": 0.527079701423645 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00010092556339868758, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13807411.0, + "repeat_count": 0.0, + "routers_loss": 0.004915264435112476, + "skip_count": 1.0, + "step": 8562, + "text_loss": 0.721017599105835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.00010073916846906139, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 13810489.0, + "repeat_count": 0.0, + "routers_loss": 0.005571382585912943, + "skip_count": 1.0, + "step": 8564, + "text_loss": 0.5802517533302307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00010055292653693903, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 13813526.0, + "repeat_count": 0.0, + "routers_loss": 0.001321605988778174, + "skip_count": 0.0, + "step": 8566, + "text_loss": 0.5485247373580933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.00010036683767368859, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 13817225.0, + "repeat_count": 0.0, + "routers_loss": 0.001876185997389257, + "skip_count": 0.0, + "step": 8568, + "text_loss": 0.08957820385694504 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00010018090195061997, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13820667.0, + "repeat_count": 0.0, + "routers_loss": 0.004593426361680031, + "skip_count": 0.0, + "step": 8570, + "text_loss": 0.24580086767673492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 9.999511943898398e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 13824505.0, + "repeat_count": 0.0, + "routers_loss": 0.0022372701205313206, + "skip_count": 0.0, + "step": 8572, + "text_loss": 0.20976831018924713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 9.980949020997276e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 13827623.0, + "repeat_count": 0.0, + "routers_loss": 0.0030519715510308743, + "skip_count": 0.0, + "step": 8574, + "text_loss": 0.7638732194900513 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 9.962401433471985e-05, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 13831013.0, + "repeat_count": 0.0, + "routers_loss": 0.005036211106926203, + "skip_count": 1.0, + "step": 8576, + "text_loss": 0.3791790306568146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 9.943869188429989e-05, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 13833611.0, + "repeat_count": 0.0, + "routers_loss": 0.002071794355288148, + "skip_count": 2.0, + "step": 8578, + "text_loss": 0.5480846166610718 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 40.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 9.925352292972884e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 13836678.0, + "repeat_count": 1.0, + "routers_loss": 0.008119060657918453, + "skip_count": 0.0, + "step": 8580, + "text_loss": 0.21605457365512848 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 9.906850754196379e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 13839255.0, + "repeat_count": 0.0, + "routers_loss": 0.004017427563667297, + "skip_count": 2.0, + "step": 8582, + "text_loss": 0.4473285973072052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 9.888364579190285e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 13842034.0, + "repeat_count": 0.0, + "routers_loss": 0.005163116846233606, + "skip_count": 1.0, + "step": 8584, + "text_loss": 0.21627424657344818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 9.869893775038557e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 13844648.0, + "repeat_count": 0.0, + "routers_loss": 0.0044358340092003345, + "skip_count": 1.0, + "step": 8586, + "text_loss": 0.5660704970359802 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 9.851438348819247e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13847629.0, + "repeat_count": 0.0, + "routers_loss": 0.00038135924842208624, + "skip_count": 1.0, + "step": 8588, + "text_loss": 0.6401235461235046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 9.832998307604495e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13851409.0, + "repeat_count": 0.0, + "routers_loss": 0.004005341790616512, + "skip_count": 1.0, + "step": 8590, + "text_loss": 0.43975043296813965 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 9.814573658460562e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 13854031.0, + "repeat_count": 0.0, + "routers_loss": 0.006872966885566711, + "skip_count": 2.0, + "step": 8592, + "text_loss": 0.6000451445579529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 9.796164408447811e-05, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 13856813.0, + "repeat_count": 0.0, + "routers_loss": 0.0019872859120368958, + "skip_count": 0.0, + "step": 8594, + "text_loss": 0.6026073098182678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 9.777770564620698e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 13859805.0, + "repeat_count": 0.0, + "routers_loss": 0.013098123483359814, + "skip_count": 2.0, + "step": 8596, + "text_loss": 0.3294500708580017 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 40.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 9.759392134027783e-05, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 13863119.0, + "repeat_count": 1.0, + "routers_loss": 0.001011171261779964, + "skip_count": 1.0, + "step": 8598, + "text_loss": 0.4078965187072754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 9.741029123711708e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 13866239.0, + "repeat_count": 0.0, + "routers_loss": 0.003267963184043765, + "skip_count": 0.0, + "step": 8600, + "text_loss": 0.5064641833305359 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.385089521573235, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 9.722681540709228e-05, + "loss": 0.0045, + "macro_f1": 0.6601307392120361, + "num_tokens": 13869647.0, + "repeat_count": 1.0, + "routers_loss": 0.02431299351155758, + "skip_count": 2.0, + "step": 8602, + "text_loss": 0.2512950301170349 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 9.704349392051155e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13873128.0, + "repeat_count": 0.0, + "routers_loss": 0.0019577480852603912, + "skip_count": 1.0, + "step": 8604, + "text_loss": 0.425156831741333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 9.686032684762408e-05, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 13876603.0, + "repeat_count": 0.0, + "routers_loss": 0.001554530463181436, + "skip_count": 1.0, + "step": 8606, + "text_loss": 0.3596082329750061 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01519775390625, + "learning_rate": 9.667731425861975e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 13879602.0, + "repeat_count": 0.0, + "routers_loss": 0.0027400986291468143, + "skip_count": 0.0, + "step": 8608, + "text_loss": 0.12101534754037857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 9.649445622362957e-05, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 13882204.0, + "repeat_count": 0.0, + "routers_loss": 0.001957559958100319, + "skip_count": 2.0, + "step": 8610, + "text_loss": 0.382834255695343 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 9.631175281272491e-05, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 13886397.0, + "repeat_count": 1.0, + "routers_loss": 0.009613300673663616, + "skip_count": 3.0, + "step": 8612, + "text_loss": 0.24718235433101654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 9.612920409591813e-05, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 13889625.0, + "repeat_count": 0.0, + "routers_loss": 0.0015159029280766845, + "skip_count": 0.0, + "step": 8614, + "text_loss": 0.406452476978302 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 40.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 9.59468101431622e-05, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 13892518.0, + "repeat_count": 0.0, + "routers_loss": 0.008069832809269428, + "skip_count": 3.0, + "step": 8616, + "text_loss": 0.19740329682826996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0157470703125, + "learning_rate": 9.576457102435082e-05, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 13895822.0, + "repeat_count": 0.0, + "routers_loss": 0.0024340536911040545, + "skip_count": 0.0, + "step": 8618, + "text_loss": 0.44761306047439575 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 40.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02880859375, + "learning_rate": 9.558248680931841e-05, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 13898829.0, + "repeat_count": 2.0, + "routers_loss": 0.0053517078049480915, + "skip_count": 1.0, + "step": 8620, + "text_loss": 0.37335118651390076 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.47901379512768, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 9.540055756783994e-05, + "loss": 0.0061, + "macro_f1": 0.9255813956260681, + "num_tokens": 13902122.0, + "repeat_count": 3.0, + "routers_loss": 0.03885587304830551, + "skip_count": 4.0, + "step": 8622, + "text_loss": 0.21311092376708984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051025390625, + "learning_rate": 9.521878336963108e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 13904874.0, + "repeat_count": 0.0, + "routers_loss": 0.007965708151459694, + "skip_count": 1.0, + "step": 8624, + "text_loss": 0.27229398488998413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 9.5037164284348e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 13907755.0, + "repeat_count": 0.0, + "routers_loss": 0.0019825168419629335, + "skip_count": 0.0, + "step": 8626, + "text_loss": 0.6535577178001404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.507191077194015, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 9.485570038158747e-05, + "loss": 0.0085, + "macro_f1": 0.3272727429866791, + "num_tokens": 13910619.0, + "repeat_count": 1.0, + "routers_loss": 0.017803344875574112, + "skip_count": 0.0, + "step": 8628, + "text_loss": 0.26617178320884705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 9.467439173088687e-05, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 13914098.0, + "repeat_count": 0.0, + "routers_loss": 0.0025836096610873938, + "skip_count": 0.0, + "step": 8630, + "text_loss": 0.44465285539627075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 9.44932384017238e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 13917192.0, + "repeat_count": 0.0, + "routers_loss": 0.004438584204763174, + "skip_count": 2.0, + "step": 8632, + "text_loss": 0.33622798323631287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 9.431224046351688e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 13920067.0, + "repeat_count": 0.0, + "routers_loss": 0.017312567681074142, + "skip_count": 2.0, + "step": 8634, + "text_loss": 0.31870952248573303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 9.413139798562476e-05, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 13922887.0, + "repeat_count": 0.0, + "routers_loss": 0.0019389945082366467, + "skip_count": 0.0, + "step": 8636, + "text_loss": 0.18223261833190918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 9.395071103734648e-05, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 13926545.0, + "repeat_count": 0.0, + "routers_loss": 0.0011485094437375665, + "skip_count": 0.0, + "step": 8638, + "text_loss": 0.48031774163246155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 40.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 9.377017968792179e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 13931171.0, + "repeat_count": 1.0, + "routers_loss": 0.003448521951213479, + "skip_count": 0.0, + "step": 8640, + "text_loss": 0.7585139870643616 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 40.57293806868213, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0213623046875, + "learning_rate": 9.35898040065305e-05, + "loss": 0.0048, + "macro_f1": 0.5492662787437439, + "num_tokens": 13934369.0, + "repeat_count": 0.0, + "routers_loss": 0.017959754914045334, + "skip_count": 2.0, + "step": 8642, + "text_loss": 0.49708613753318787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.018310546875, + "learning_rate": 9.3409584062293e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13938166.0, + "repeat_count": 0.0, + "routers_loss": 0.004092653747648001, + "skip_count": 1.0, + "step": 8644, + "text_loss": 0.20662656426429749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 9.322951992426992e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13941922.0, + "repeat_count": 0.0, + "routers_loss": 0.0026206092443317175, + "skip_count": 0.0, + "step": 8646, + "text_loss": 0.4735889434814453 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 40.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 9.304961166146209e-05, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 13945569.0, + "repeat_count": 3.0, + "routers_loss": 0.005156307481229305, + "skip_count": 2.0, + "step": 8648, + "text_loss": 0.5630270838737488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 9.286985934281079e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13948357.0, + "repeat_count": 0.0, + "routers_loss": 0.004913610871881247, + "skip_count": 1.0, + "step": 8650, + "text_loss": 0.4053497016429901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 9.26902630371974e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 13952543.0, + "repeat_count": 0.0, + "routers_loss": 0.003946282435208559, + "skip_count": 2.0, + "step": 8652, + "text_loss": 0.40166863799095154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 9.251082281344358e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13955917.0, + "repeat_count": 0.0, + "routers_loss": 0.0009605551022104919, + "skip_count": 0.0, + "step": 8654, + "text_loss": 0.20477983355522156 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 40.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 9.233153874031102e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 13960071.0, + "repeat_count": 0.0, + "routers_loss": 0.004408199340105057, + "skip_count": 3.0, + "step": 8656, + "text_loss": 0.3349814713001251 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 9.215241088650194e-05, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 13963125.0, + "repeat_count": 1.0, + "routers_loss": 0.005541396792978048, + "skip_count": 2.0, + "step": 8658, + "text_loss": 0.6602919697761536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 9.197343932065843e-05, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13966130.0, + "repeat_count": 0.0, + "routers_loss": 0.001636760076507926, + "skip_count": 0.0, + "step": 8660, + "text_loss": 0.7704628109931946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 9.179462411136263e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13969791.0, + "repeat_count": 0.0, + "routers_loss": 0.0006453761598095298, + "skip_count": 0.0, + "step": 8662, + "text_loss": 0.3898075520992279 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 40.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 9.161596532713695e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 13972987.0, + "repeat_count": 0.0, + "routers_loss": 0.005081792362034321, + "skip_count": 4.0, + "step": 8664, + "text_loss": 0.8477506041526794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 9.143746303644374e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13976505.0, + "repeat_count": 0.0, + "routers_loss": 0.0032063762191683054, + "skip_count": 0.0, + "step": 8666, + "text_loss": 0.23729658126831055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 9.125911730768543e-05, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 13980061.0, + "repeat_count": 0.0, + "routers_loss": 0.00043821477447636425, + "skip_count": 0.0, + "step": 8668, + "text_loss": 0.4233637750148773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 9.108092820920438e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 13983407.0, + "repeat_count": 0.0, + "routers_loss": 0.007779054809361696, + "skip_count": 2.0, + "step": 8670, + "text_loss": 0.5050316452980042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 9.090289580928307e-05, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 13986725.0, + "repeat_count": 0.0, + "routers_loss": 0.0018697676714509726, + "skip_count": 1.0, + "step": 8672, + "text_loss": 1.0568488836288452 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 9.072502017614382e-05, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 13990765.0, + "repeat_count": 0.0, + "routers_loss": 0.002077789744362235, + "skip_count": 0.0, + "step": 8674, + "text_loss": 0.48911142349243164 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 40.73260933372468, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 9.054730137794887e-05, + "loss": 0.0081, + "macro_f1": 0.6598639488220215, + "num_tokens": 13994083.0, + "repeat_count": 1.0, + "routers_loss": 0.044373031705617905, + "skip_count": 3.0, + "step": 8676, + "text_loss": 0.3420281708240509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 9.036973948280048e-05, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 13997500.0, + "repeat_count": 0.0, + "routers_loss": 0.0015431724023073912, + "skip_count": 0.0, + "step": 8678, + "text_loss": 0.21514096856117249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 9.019233455874049e-05, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 14000460.0, + "repeat_count": 0.0, + "routers_loss": 0.006088062655180693, + "skip_count": 1.0, + "step": 8680, + "text_loss": 0.43932875990867615 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 9.001508667375107e-05, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 14003537.0, + "repeat_count": 2.0, + "routers_loss": 0.01006145216524601, + "skip_count": 3.0, + "step": 8682, + "text_loss": 0.2192728966474533 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 8.983799589575393e-05, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 14005943.0, + "repeat_count": 0.0, + "routers_loss": 0.001044525415636599, + "skip_count": 0.0, + "step": 8684, + "text_loss": 0.8686383962631226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 8.96610622926104e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14008954.0, + "repeat_count": 0.0, + "routers_loss": 0.004876079503446817, + "skip_count": 2.0, + "step": 8686, + "text_loss": 0.2513524889945984 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 8.948428593212193e-05, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 14012268.0, + "repeat_count": 1.0, + "routers_loss": 0.007909095846116543, + "skip_count": 2.0, + "step": 8688, + "text_loss": 0.17117907106876373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 8.930766688202946e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 14015192.0, + "repeat_count": 0.0, + "routers_loss": 0.0022194553166627884, + "skip_count": 0.0, + "step": 8690, + "text_loss": 0.637697160243988 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 40.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0162353515625, + "learning_rate": 8.913120521001383e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 14018055.0, + "repeat_count": 1.0, + "routers_loss": 0.0023777696769684553, + "skip_count": 0.0, + "step": 8692, + "text_loss": 0.39099860191345215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 8.895490098369535e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14021035.0, + "repeat_count": 0.0, + "routers_loss": 0.002676652278751135, + "skip_count": 1.0, + "step": 8694, + "text_loss": 0.6112156510353088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 8.877875427063431e-05, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 14023759.0, + "repeat_count": 0.0, + "routers_loss": 0.001040685223415494, + "skip_count": 0.0, + "step": 8696, + "text_loss": 0.3562681972980499 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 40.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 8.86027651383302e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 14026090.0, + "repeat_count": 1.0, + "routers_loss": 0.0011444527190178633, + "skip_count": 0.0, + "step": 8698, + "text_loss": 0.6152632236480713 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.84531846199002, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 8.842693365422266e-05, + "loss": 0.008, + "macro_f1": 0.8817967176437378, + "num_tokens": 14029570.0, + "repeat_count": 2.0, + "routers_loss": 0.024327632039785385, + "skip_count": 3.0, + "step": 8700, + "text_loss": 0.2170596867799759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 8.825125988569061e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 14032418.0, + "repeat_count": 0.0, + "routers_loss": 0.00048010432510636747, + "skip_count": 0.0, + "step": 8702, + "text_loss": 0.4421340525150299 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 8.807574390005241e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 14035610.0, + "repeat_count": 0.0, + "routers_loss": 0.0010498231276869774, + "skip_count": 0.0, + "step": 8704, + "text_loss": 0.3656717538833618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.873495744056356, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 8.790038576456627e-05, + "loss": 0.0045, + "macro_f1": 0.3272727429866791, + "num_tokens": 14039354.0, + "repeat_count": 0.0, + "routers_loss": 0.019302964210510254, + "skip_count": 1.0, + "step": 8706, + "text_loss": 0.6150856018066406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 8.772518554642972e-05, + "loss": 0.0029, + "macro_f1": 0.3333333432674408, + "num_tokens": 14042353.0, + "repeat_count": 0.0, + "routers_loss": 0.004211598541587591, + "skip_count": 0.0, + "step": 8708, + "text_loss": 0.17178772389888763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 8.755014331277972e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14045704.0, + "repeat_count": 0.0, + "routers_loss": 0.0007902922225184739, + "skip_count": 0.0, + "step": 8710, + "text_loss": 0.6289885640144348 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 8.737525913069277e-05, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 14048743.0, + "repeat_count": 1.0, + "routers_loss": 0.007915202528238297, + "skip_count": 2.0, + "step": 8712, + "text_loss": 0.2778690457344055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 40.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 8.720053306718506e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14052762.0, + "repeat_count": 0.0, + "routers_loss": 0.0027877227403223515, + "skip_count": 3.0, + "step": 8714, + "text_loss": 0.3615926504135132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.92045788083358, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0478515625, + "learning_rate": 8.702596518921175e-05, + "loss": 0.0086, + "macro_f1": 0.6603773832321167, + "num_tokens": 14056645.0, + "repeat_count": 1.0, + "routers_loss": 0.03460995852947235, + "skip_count": 1.0, + "step": 8716, + "text_loss": 0.19412031769752502 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 8.685155556366763e-05, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 14059604.0, + "repeat_count": 1.0, + "routers_loss": 0.0026834046002477407, + "skip_count": 2.0, + "step": 8718, + "text_loss": 0.4414670169353485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 40.93924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 8.667730425738679e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14062170.0, + "repeat_count": 0.0, + "routers_loss": 0.01547359861433506, + "skip_count": 4.0, + "step": 8720, + "text_loss": 0.2850716710090637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 8.650321133714267e-05, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 14065526.0, + "repeat_count": 0.0, + "routers_loss": 0.0020194994285702705, + "skip_count": 0.0, + "step": 8722, + "text_loss": 0.1776508241891861 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 8.632927686964798e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 14068525.0, + "repeat_count": 0.0, + "routers_loss": 0.0037195945624262094, + "skip_count": 0.0, + "step": 8724, + "text_loss": 0.2786005735397339 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 40.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 8.615550092155477e-05, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 14071830.0, + "repeat_count": 1.0, + "routers_loss": 0.008169961161911488, + "skip_count": 4.0, + "step": 8726, + "text_loss": 0.43228310346603394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 8.598188355945424e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14074977.0, + "repeat_count": 0.0, + "routers_loss": 0.006407112814486027, + "skip_count": 1.0, + "step": 8728, + "text_loss": 0.24443474411964417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0498046875, + "learning_rate": 8.580842484987689e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14078104.0, + "repeat_count": 0.0, + "routers_loss": 0.001878641895018518, + "skip_count": 1.0, + "step": 8730, + "text_loss": 0.4559098184108734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 8.563512485929253e-05, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 14081934.0, + "repeat_count": 0.0, + "routers_loss": 0.0056114462204277515, + "skip_count": 0.0, + "step": 8732, + "text_loss": 0.3063429594039917 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 8.546198365411007e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 14085097.0, + "repeat_count": 1.0, + "routers_loss": 0.001542840269394219, + "skip_count": 0.0, + "step": 8734, + "text_loss": 0.7624274492263794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 8.528900130067741e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 14088630.0, + "repeat_count": 0.0, + "routers_loss": 0.002677374053746462, + "skip_count": 0.0, + "step": 8736, + "text_loss": 0.18395234644412994 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 8.511617786528175e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 14091513.0, + "repeat_count": 1.0, + "routers_loss": 0.004059800878167152, + "skip_count": 0.0, + "step": 8738, + "text_loss": 0.4567817449569702 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 41.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 8.494351341414947e-05, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 14094500.0, + "repeat_count": 1.0, + "routers_loss": 0.0023724427446722984, + "skip_count": 1.0, + "step": 8740, + "text_loss": 0.6925744414329529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0155029296875, + "learning_rate": 8.477100801344573e-05, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 14097518.0, + "repeat_count": 0.0, + "routers_loss": 0.0013842503540217876, + "skip_count": 2.0, + "step": 8742, + "text_loss": 0.6574832201004028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 8.459866172927505e-05, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 14101219.0, + "repeat_count": 0.0, + "routers_loss": 0.003597316099330783, + "skip_count": 2.0, + "step": 8744, + "text_loss": 0.785912036895752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 41.061050777810394, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.027099609375, + "learning_rate": 8.442647462768082e-05, + "loss": 0.0066, + "macro_f1": 0.6225374937057495, + "num_tokens": 14104460.0, + "repeat_count": 0.0, + "routers_loss": 0.01929798349738121, + "skip_count": 5.0, + "step": 8746, + "text_loss": 0.2111714482307434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 8.425444677464545e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 14107404.0, + "repeat_count": 0.0, + "routers_loss": 0.00048497592797502875, + "skip_count": 0.0, + "step": 8748, + "text_loss": 0.4764930307865143 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 8.408257823609033e-05, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 14109917.0, + "repeat_count": 1.0, + "routers_loss": 0.007886217907071114, + "skip_count": 2.0, + "step": 8750, + "text_loss": 0.2771969735622406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 8.391086907787587e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 14112649.0, + "repeat_count": 0.0, + "routers_loss": 0.006535434629768133, + "skip_count": 0.0, + "step": 8752, + "text_loss": 0.1550854742527008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 8.373931936580114e-05, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 14116044.0, + "repeat_count": 0.0, + "routers_loss": 0.002130605047568679, + "skip_count": 0.0, + "step": 8754, + "text_loss": 0.4055478870868683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 8.356792916560457e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14119097.0, + "repeat_count": 0.0, + "routers_loss": 0.0005611231899820268, + "skip_count": 0.0, + "step": 8756, + "text_loss": 0.47804903984069824 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 41.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 8.339669854296316e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14122079.0, + "repeat_count": 2.0, + "routers_loss": 0.005650801584124565, + "skip_count": 0.0, + "step": 8758, + "text_loss": 0.1968296617269516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 8.322562756349273e-05, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 14124910.0, + "repeat_count": 0.0, + "routers_loss": 0.0035948604345321655, + "skip_count": 1.0, + "step": 8760, + "text_loss": 0.4988253712654114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 8.305471629274802e-05, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 14127767.0, + "repeat_count": 0.0, + "routers_loss": 0.0012090947711840272, + "skip_count": 0.0, + "step": 8762, + "text_loss": 0.6330704689025879 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 8.288396479622262e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 14130766.0, + "repeat_count": 0.0, + "routers_loss": 0.0010853242129087448, + "skip_count": 1.0, + "step": 8764, + "text_loss": 0.43057000637054443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 8.271337313934868e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14133804.0, + "repeat_count": 0.0, + "routers_loss": 0.0037055034190416336, + "skip_count": 2.0, + "step": 8766, + "text_loss": 0.31973564624786377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 8.254294138749741e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 14137164.0, + "repeat_count": 0.0, + "routers_loss": 0.005338407587260008, + "skip_count": 0.0, + "step": 8768, + "text_loss": 0.5066531896591187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 8.237266960597844e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 14140119.0, + "repeat_count": 0.0, + "routers_loss": 0.0014707009540870786, + "skip_count": 1.0, + "step": 8770, + "text_loss": 0.553493857383728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 8.220255786004033e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 14143223.0, + "repeat_count": 0.0, + "routers_loss": 0.002113121096044779, + "skip_count": 0.0, + "step": 8772, + "text_loss": 0.40016281604766846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0179443359375, + "learning_rate": 8.203260621487019e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 14146366.0, + "repeat_count": 0.0, + "routers_loss": 0.002210963051766157, + "skip_count": 1.0, + "step": 8774, + "text_loss": 0.44022905826568604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 8.186281473559382e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 14150009.0, + "repeat_count": 0.0, + "routers_loss": 0.0011857844656333327, + "skip_count": 0.0, + "step": 8776, + "text_loss": 0.572823703289032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 8.169318348727544e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 14153343.0, + "repeat_count": 0.0, + "routers_loss": 0.0020397785119712353, + "skip_count": 1.0, + "step": 8778, + "text_loss": 0.5724276900291443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 8.152371253491841e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 14156392.0, + "repeat_count": 0.0, + "routers_loss": 0.001745635992847383, + "skip_count": 0.0, + "step": 8780, + "text_loss": 0.14162923395633698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 8.135440194346416e-05, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 14159616.0, + "repeat_count": 0.0, + "routers_loss": 0.002799858106300235, + "skip_count": 0.0, + "step": 8782, + "text_loss": 0.18205340206623077 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 8.118525177779284e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14163531.0, + "repeat_count": 1.0, + "routers_loss": 0.0029223538003861904, + "skip_count": 0.0, + "step": 8784, + "text_loss": 0.4107058644294739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 8.101626210272311e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14166776.0, + "repeat_count": 0.0, + "routers_loss": 0.001209643087349832, + "skip_count": 0.0, + "step": 8786, + "text_loss": 0.6441596746444702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 8.084743298301211e-05, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 14169586.0, + "repeat_count": 0.0, + "routers_loss": 0.0015196573222056031, + "skip_count": 0.0, + "step": 8788, + "text_loss": 0.35585930943489075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 8.067876448335549e-05, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 14174180.0, + "repeat_count": 0.0, + "routers_loss": 0.0004388966190163046, + "skip_count": 0.0, + "step": 8790, + "text_loss": 0.31594613194465637 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 8.05102566683873e-05, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 14177950.0, + "repeat_count": 1.0, + "routers_loss": 0.0031201441306620836, + "skip_count": 0.0, + "step": 8792, + "text_loss": 0.3161006569862366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 8.034190960268012e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14180642.0, + "repeat_count": 0.0, + "routers_loss": 0.001848527928814292, + "skip_count": 0.0, + "step": 8794, + "text_loss": 0.47571417689323425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 8.017372335074486e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 14183743.0, + "repeat_count": 0.0, + "routers_loss": 0.0043064444325864315, + "skip_count": 1.0, + "step": 8796, + "text_loss": 0.5976942777633667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 8.000569797703072e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 14187742.0, + "repeat_count": 0.0, + "routers_loss": 0.005383181851357222, + "skip_count": 2.0, + "step": 8798, + "text_loss": 0.2692606449127197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 7.983783354592544e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 14191211.0, + "repeat_count": 0.0, + "routers_loss": 0.001401974936015904, + "skip_count": 0.0, + "step": 8800, + "text_loss": 0.38108205795288086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 7.967013012175478e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14194992.0, + "repeat_count": 0.0, + "routers_loss": 0.001168998540379107, + "skip_count": 0.0, + "step": 8802, + "text_loss": 0.5201764106750488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05322265625, + "learning_rate": 7.950258776878332e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 14198059.0, + "repeat_count": 0.0, + "routers_loss": 0.0032015808392316103, + "skip_count": 2.0, + "step": 8804, + "text_loss": 0.6014752984046936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 7.933520655121351e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14202313.0, + "repeat_count": 0.0, + "routers_loss": 0.0009403078584000468, + "skip_count": 0.0, + "step": 8806, + "text_loss": 0.54194176197052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 7.916798653318607e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14205534.0, + "repeat_count": 0.0, + "routers_loss": 0.0027781077660620213, + "skip_count": 1.0, + "step": 8808, + "text_loss": 0.7181227803230286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 7.900092777878004e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14209357.0, + "repeat_count": 0.0, + "routers_loss": 0.0034586815163493156, + "skip_count": 1.0, + "step": 8810, + "text_loss": 0.21651209890842438 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 41.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 7.883403035201265e-05, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 14212328.0, + "repeat_count": 1.0, + "routers_loss": 0.01194343063980341, + "skip_count": 4.0, + "step": 8812, + "text_loss": 0.20523512363433838 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 41.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0157470703125, + "learning_rate": 7.866729431683938e-05, + "loss": 0.0038, + "macro_f1": 1.0, + "num_tokens": 14214979.0, + "repeat_count": 1.0, + "routers_loss": 0.0045132869854569435, + "skip_count": 1.0, + "step": 8814, + "text_loss": 0.4066837728023529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0181884765625, + "learning_rate": 7.850071973715368e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 14219030.0, + "repeat_count": 0.0, + "routers_loss": 0.005109346006065607, + "skip_count": 2.0, + "step": 8816, + "text_loss": 0.12459450960159302 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 7.833430667678737e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 14222117.0, + "repeat_count": 0.0, + "routers_loss": 0.0036401136312633753, + "skip_count": 0.0, + "step": 8818, + "text_loss": 0.3759046494960785 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 41.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 7.816805519951008e-05, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 14225546.0, + "repeat_count": 2.0, + "routers_loss": 0.006177824921905994, + "skip_count": 1.0, + "step": 8820, + "text_loss": 0.4031941592693329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 41.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 7.800196536902987e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 14228731.0, + "repeat_count": 0.0, + "routers_loss": 0.009549650363624096, + "skip_count": 5.0, + "step": 8822, + "text_loss": 0.2895966172218323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 7.783603724899258e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14231796.0, + "repeat_count": 0.0, + "routers_loss": 0.005532847251743078, + "skip_count": 2.0, + "step": 8824, + "text_loss": 0.32433390617370605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 7.767027090298206e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 14235869.0, + "repeat_count": 0.0, + "routers_loss": 0.0011165215400978923, + "skip_count": 0.0, + "step": 8826, + "text_loss": 0.41239091753959656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 7.750466639452059e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 14238830.0, + "repeat_count": 0.0, + "routers_loss": 0.0007845646468922496, + "skip_count": 0.0, + "step": 8828, + "text_loss": 0.5113243460655212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 7.733922378706787e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 14241672.0, + "repeat_count": 0.0, + "routers_loss": 0.0029602700378745794, + "skip_count": 1.0, + "step": 8830, + "text_loss": 0.22004501521587372 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 41.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 7.717394314402199e-05, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 14244522.0, + "repeat_count": 2.0, + "routers_loss": 0.005297200754284859, + "skip_count": 1.0, + "step": 8832, + "text_loss": 0.6039504408836365 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 7.700882452871872e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 14246964.0, + "repeat_count": 0.0, + "routers_loss": 0.0018059068825095892, + "skip_count": 2.0, + "step": 8834, + "text_loss": 0.46563026309013367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 7.684386800443177e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 14249387.0, + "repeat_count": 0.0, + "routers_loss": 0.005659483838826418, + "skip_count": 2.0, + "step": 8836, + "text_loss": 0.31516948342323303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.49310243616085, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 7.667907363437288e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 14252438.0, + "repeat_count": 0.0, + "routers_loss": 0.011170750483870506, + "skip_count": 1.0, + "step": 8838, + "text_loss": 0.22867503762245178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 7.651444148169157e-05, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 14255490.0, + "repeat_count": 0.0, + "routers_loss": 0.004106760956346989, + "skip_count": 2.0, + "step": 8840, + "text_loss": 0.5757828950881958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 7.634997160947499e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 14258430.0, + "repeat_count": 0.0, + "routers_loss": 0.0008562540751881897, + "skip_count": 0.0, + "step": 8842, + "text_loss": 0.5166661143302917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 7.618566408074862e-05, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 14261275.0, + "repeat_count": 0.0, + "routers_loss": 0.0012901517329737544, + "skip_count": 0.0, + "step": 8844, + "text_loss": 0.7376981973648071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 7.602151895847526e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 14264698.0, + "repeat_count": 0.0, + "routers_loss": 0.00267209205776453, + "skip_count": 0.0, + "step": 8846, + "text_loss": 0.5249470472335815 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 41.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 7.585753630555565e-05, + "loss": 0.009, + "macro_f1": 1.0, + "num_tokens": 14267887.0, + "repeat_count": 1.0, + "routers_loss": 0.015334542840719223, + "skip_count": 7.0, + "step": 8848, + "text_loss": 1.1539889574050903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017578125, + "learning_rate": 7.569371618482818e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 14271392.0, + "repeat_count": 0.0, + "routers_loss": 0.0010222389828413725, + "skip_count": 0.0, + "step": 8850, + "text_loss": 0.33968010544776917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 7.553005865906914e-05, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 14274658.0, + "repeat_count": 0.0, + "routers_loss": 0.0006116362637840211, + "skip_count": 0.0, + "step": 8852, + "text_loss": 0.7514221668243408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 7.536656379099221e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14277763.0, + "repeat_count": 0.0, + "routers_loss": 0.0036474792286753654, + "skip_count": 0.0, + "step": 8854, + "text_loss": 0.3964846134185791 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 7.520323164324921e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14281165.0, + "repeat_count": 0.0, + "routers_loss": 0.005498840939253569, + "skip_count": 1.0, + "step": 8856, + "text_loss": 0.2235594391822815 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 41.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 7.504006227842919e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 14284761.0, + "repeat_count": 2.0, + "routers_loss": 0.006513409782201052, + "skip_count": 0.0, + "step": 8858, + "text_loss": 0.45196816325187683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 7.48770557590589e-05, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 14287844.0, + "repeat_count": 0.0, + "routers_loss": 0.0013065916718915105, + "skip_count": 0.0, + "step": 8860, + "text_loss": 0.2188033014535904 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 7.471421214760287e-05, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 14291280.0, + "repeat_count": 1.0, + "routers_loss": 0.0016644994029775262, + "skip_count": 0.0, + "step": 8862, + "text_loss": 0.7049906253814697 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 7.455153150646299e-05, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 14294330.0, + "repeat_count": 1.0, + "routers_loss": 0.002664943691343069, + "skip_count": 0.0, + "step": 8864, + "text_loss": 0.2160239815711975 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 7.43890138979788e-05, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 14298355.0, + "repeat_count": 1.0, + "routers_loss": 0.0035776710137724876, + "skip_count": 0.0, + "step": 8866, + "text_loss": 0.4922088384628296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 7.422665938442741e-05, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 14301452.0, + "repeat_count": 0.0, + "routers_loss": 0.0029914912302047014, + "skip_count": 2.0, + "step": 8868, + "text_loss": 0.5828475952148438 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 7.406446802802331e-05, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 14304667.0, + "repeat_count": 1.0, + "routers_loss": 0.0010031569981947541, + "skip_count": 2.0, + "step": 8870, + "text_loss": 0.657244861125946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 7.390243989091849e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 14307397.0, + "repeat_count": 0.0, + "routers_loss": 0.007960405200719833, + "skip_count": 1.0, + "step": 8872, + "text_loss": 0.3147352635860443 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 7.37405750352026e-05, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 14310687.0, + "repeat_count": 1.0, + "routers_loss": 0.007953251712024212, + "skip_count": 3.0, + "step": 8874, + "text_loss": 0.30315887928009033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 7.357887352290227e-05, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 14314007.0, + "repeat_count": 0.0, + "routers_loss": 0.0012103051412850618, + "skip_count": 0.0, + "step": 8876, + "text_loss": 0.6356115341186523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 7.341733541598217e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 14316696.0, + "repeat_count": 0.0, + "routers_loss": 0.0017898730002343655, + "skip_count": 1.0, + "step": 8878, + "text_loss": 0.35877764225006104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 7.325596077634383e-05, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 14320172.0, + "repeat_count": 0.0, + "routers_loss": 0.0007144945557229221, + "skip_count": 0.0, + "step": 8880, + "text_loss": 0.7939266562461853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 7.309474966582635e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 14323262.0, + "repeat_count": 0.0, + "routers_loss": 0.001255290349945426, + "skip_count": 0.0, + "step": 8882, + "text_loss": 0.7115976810455322 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 7.293370214620616e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14326826.0, + "repeat_count": 0.0, + "routers_loss": 0.0028131126891821623, + "skip_count": 2.0, + "step": 8884, + "text_loss": 0.24073036015033722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 7.277281827919691e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14329658.0, + "repeat_count": 0.0, + "routers_loss": 0.0024797592777758837, + "skip_count": 1.0, + "step": 8886, + "text_loss": 0.47276070713996887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 41.72791312004696, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 7.26120981264496e-05, + "loss": 0.0081, + "macro_f1": 0.6598639488220215, + "num_tokens": 14333584.0, + "repeat_count": 1.0, + "routers_loss": 0.023670634254813194, + "skip_count": 3.0, + "step": 8888, + "text_loss": 0.47537583112716675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 7.245154174955254e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 14336850.0, + "repeat_count": 0.0, + "routers_loss": 0.0009583478095009923, + "skip_count": 0.0, + "step": 8890, + "text_loss": 0.5258943438529968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 41.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 7.229114921003116e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 14339940.0, + "repeat_count": 0.0, + "routers_loss": 0.006664840504527092, + "skip_count": 3.0, + "step": 8892, + "text_loss": 0.20986922085285187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 7.213092056934833e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14342737.0, + "repeat_count": 0.0, + "routers_loss": 0.0005362578085623682, + "skip_count": 0.0, + "step": 8894, + "text_loss": 0.5174402594566345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 7.197085588890383e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 14345769.0, + "repeat_count": 0.0, + "routers_loss": 0.006428950000554323, + "skip_count": 1.0, + "step": 8896, + "text_loss": 0.657136857509613 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 7.181095523003478e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14348563.0, + "repeat_count": 1.0, + "routers_loss": 0.0015549053205177188, + "skip_count": 0.0, + "step": 8898, + "text_loss": 0.49799686670303345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.78426768417963, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 7.165121865401535e-05, + "loss": 0.0068, + "macro_f1": 0.32098764181137085, + "num_tokens": 14353134.0, + "repeat_count": 0.0, + "routers_loss": 0.030110027641057968, + "skip_count": 2.0, + "step": 8900, + "text_loss": 0.3644331693649292 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 41.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 7.149164622205712e-05, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 14356031.0, + "repeat_count": 1.0, + "routers_loss": 0.0014812488807365298, + "skip_count": 1.0, + "step": 8902, + "text_loss": 0.46983054280281067 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 7.133223799530836e-05, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 14358941.0, + "repeat_count": 0.0, + "routers_loss": 0.001170543720945716, + "skip_count": 0.0, + "step": 8904, + "text_loss": 0.7030026316642761 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 41.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 7.117299403485466e-05, + "loss": 0.0085, + "macro_f1": 1.0, + "num_tokens": 14361807.0, + "repeat_count": 1.0, + "routers_loss": 0.0011649372754618526, + "skip_count": 1.0, + "step": 8906, + "text_loss": 0.44989535212516785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 7.101391440171856e-05, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 14365464.0, + "repeat_count": 0.0, + "routers_loss": 0.0028165180701762438, + "skip_count": 0.0, + "step": 8908, + "text_loss": 0.487165629863739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 7.085499915685978e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 14368149.0, + "repeat_count": 0.0, + "routers_loss": 0.001956705003976822, + "skip_count": 2.0, + "step": 8910, + "text_loss": 0.3717629909515381 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 7.069624836117484e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 14371440.0, + "repeat_count": 0.0, + "routers_loss": 0.0027164234779775143, + "skip_count": 1.0, + "step": 8912, + "text_loss": 0.3683965802192688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 7.053766207549734e-05, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 14374965.0, + "repeat_count": 0.0, + "routers_loss": 0.005999395158141851, + "skip_count": 2.0, + "step": 8914, + "text_loss": 0.6271854639053345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 7.037924036059789e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 14378445.0, + "repeat_count": 0.0, + "routers_loss": 0.000978486379608512, + "skip_count": 0.0, + "step": 8916, + "text_loss": 0.5927628874778748 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 7.022098327718401e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14382851.0, + "repeat_count": 0.0, + "routers_loss": 0.012569266371428967, + "skip_count": 1.0, + "step": 8918, + "text_loss": 0.4092319905757904 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 41.878191957734074, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03564453125, + "learning_rate": 7.006289088590007e-05, + "loss": 0.0065, + "macro_f1": 0.5492662787437439, + "num_tokens": 14386959.0, + "repeat_count": 0.0, + "routers_loss": 0.011032132431864738, + "skip_count": 2.0, + "step": 8920, + "text_loss": 0.6553854942321777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048095703125, + "learning_rate": 6.990496324732737e-05, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 14390031.0, + "repeat_count": 0.0, + "routers_loss": 0.001376329455524683, + "skip_count": 0.0, + "step": 8922, + "text_loss": 0.7792862057685852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 6.974720042198396e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 14392966.0, + "repeat_count": 0.0, + "routers_loss": 0.005924372002482414, + "skip_count": 2.0, + "step": 8924, + "text_loss": 0.4466548562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 6.958960247032515e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 14395619.0, + "repeat_count": 0.0, + "routers_loss": 0.010054769925773144, + "skip_count": 2.0, + "step": 8926, + "text_loss": 0.24784758687019348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 6.943216945274255e-05, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 14398891.0, + "repeat_count": 0.0, + "routers_loss": 0.0006864808965474367, + "skip_count": 0.0, + "step": 8928, + "text_loss": 0.5154114961624146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 6.927490142956489e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 14402991.0, + "repeat_count": 0.0, + "routers_loss": 0.000996887218207121, + "skip_count": 0.0, + "step": 8930, + "text_loss": 0.5888006091117859 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 6.911779846105753e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 14406276.0, + "repeat_count": 1.0, + "routers_loss": 0.0007863475475460291, + "skip_count": 0.0, + "step": 8932, + "text_loss": 0.6862632632255554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 6.896086060742262e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 14409005.0, + "repeat_count": 0.0, + "routers_loss": 0.0020060581155121326, + "skip_count": 1.0, + "step": 8934, + "text_loss": 0.8998132348060608 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 6.880408792879905e-05, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 14411902.0, + "repeat_count": 2.0, + "routers_loss": 0.008094016462564468, + "skip_count": 3.0, + "step": 8936, + "text_loss": 0.3411460518836975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 6.864748048526237e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 14414683.0, + "repeat_count": 0.0, + "routers_loss": 0.004374993033707142, + "skip_count": 0.0, + "step": 8938, + "text_loss": 0.24222217500209808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 6.84910383368249e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 14417740.0, + "repeat_count": 0.0, + "routers_loss": 0.003004335332661867, + "skip_count": 2.0, + "step": 8940, + "text_loss": 0.5524137020111084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 6.83347615434356e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 14420678.0, + "repeat_count": 0.0, + "routers_loss": 0.007001105695962906, + "skip_count": 2.0, + "step": 8942, + "text_loss": 0.3124033212661743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 6.817865016497993e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 14424259.0, + "repeat_count": 0.0, + "routers_loss": 0.0038414683658629656, + "skip_count": 0.0, + "step": 8944, + "text_loss": 0.509667694568634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.060791015625, + "learning_rate": 6.80227042612801e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 14427084.0, + "repeat_count": 1.0, + "routers_loss": 0.008573584258556366, + "skip_count": 0.0, + "step": 8946, + "text_loss": 0.2533438205718994 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 6.786692389209482e-05, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 14429690.0, + "repeat_count": 1.0, + "routers_loss": 0.003758789971470833, + "skip_count": 2.0, + "step": 8948, + "text_loss": 0.14571085572242737 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06640625, + "learning_rate": 6.771130911711953e-05, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 14432983.0, + "repeat_count": 0.0, + "routers_loss": 0.005996126215904951, + "skip_count": 2.0, + "step": 8950, + "text_loss": 0.24994049966335297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 6.755585999598613e-05, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 14435772.0, + "repeat_count": 0.0, + "routers_loss": 0.0012271527666598558, + "skip_count": 0.0, + "step": 8952, + "text_loss": 0.3705698549747467 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 42.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 6.740057658826293e-05, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 14438912.0, + "repeat_count": 1.0, + "routers_loss": 0.0017618577694520354, + "skip_count": 1.0, + "step": 8954, + "text_loss": 0.6691124439239502 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 6.72454589534548e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 14441959.0, + "repeat_count": 0.0, + "routers_loss": 0.0016956349136307836, + "skip_count": 1.0, + "step": 8956, + "text_loss": 0.45412346720695496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 6.709050715100324e-05, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 14444804.0, + "repeat_count": 0.0, + "routers_loss": 0.017321301624178886, + "skip_count": 2.0, + "step": 8958, + "text_loss": 0.2668265998363495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 6.69357212402859e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 14447390.0, + "repeat_count": 0.0, + "routers_loss": 0.005267233122140169, + "skip_count": 2.0, + "step": 8960, + "text_loss": 0.35546016693115234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 42.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.017578125, + "learning_rate": 6.67811012806172e-05, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 14451286.0, + "repeat_count": 0.0, + "routers_loss": 0.0045175012201070786, + "skip_count": 3.0, + "step": 8962, + "text_loss": 0.14669834077358246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.084531846199, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 6.662664733124768e-05, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 14454335.0, + "repeat_count": 1.0, + "routers_loss": 0.004905698820948601, + "skip_count": 3.0, + "step": 8964, + "text_loss": 0.28777357935905457 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 42.09392427355445, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 6.647235945136442e-05, + "loss": 0.0074, + "macro_f1": 0.8823530077934265, + "num_tokens": 14457708.0, + "repeat_count": 2.0, + "routers_loss": 0.032136883586645126, + "skip_count": 1.0, + "step": 8966, + "text_loss": 0.2317836582660675 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 42.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 6.631823770009088e-05, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 14460721.0, + "repeat_count": 1.0, + "routers_loss": 0.0038611628115177155, + "skip_count": 1.0, + "step": 8968, + "text_loss": 0.28979742527008057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 6.616428213648656e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 14463467.0, + "repeat_count": 0.0, + "routers_loss": 0.0006560821202583611, + "skip_count": 0.0, + "step": 8970, + "text_loss": 0.3474387526512146 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 6.60104928195479e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 14466586.0, + "repeat_count": 1.0, + "routers_loss": 0.0016879125032573938, + "skip_count": 0.0, + "step": 8972, + "text_loss": 0.5454491972923279 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 6.58568698082071e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 14470125.0, + "repeat_count": 0.0, + "routers_loss": 0.0004945555119775236, + "skip_count": 0.0, + "step": 8974, + "text_loss": 0.4728975296020508 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 6.570341316133272e-05, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 14473887.0, + "repeat_count": 2.0, + "routers_loss": 0.010141569189727306, + "skip_count": 3.0, + "step": 8976, + "text_loss": 0.24756617844104767 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 6.555012293772967e-05, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 14477046.0, + "repeat_count": 1.0, + "routers_loss": 0.011950359679758549, + "skip_count": 2.0, + "step": 8978, + "text_loss": 0.25375646352767944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 6.539699919613911e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 14480638.0, + "repeat_count": 0.0, + "routers_loss": 0.0007824545609764755, + "skip_count": 0.0, + "step": 8980, + "text_loss": 0.6888379454612732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 6.524404199523826e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 14483723.0, + "repeat_count": 0.0, + "routers_loss": 0.004318726249039173, + "skip_count": 1.0, + "step": 8982, + "text_loss": 0.3603152334690094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.17845611975345, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 6.509125139364058e-05, + "loss": 0.0064, + "macro_f1": 0.3272727429866791, + "num_tokens": 14486876.0, + "repeat_count": 0.0, + "routers_loss": 0.010652635246515274, + "skip_count": 1.0, + "step": 8984, + "text_loss": 0.43394285440444946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 6.493862744989587e-05, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 14489944.0, + "repeat_count": 0.0, + "routers_loss": 0.0010475299786776304, + "skip_count": 0.0, + "step": 8986, + "text_loss": 0.5952020287513733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 6.478617022248984e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 14493094.0, + "repeat_count": 0.0, + "routers_loss": 0.004329503979533911, + "skip_count": 1.0, + "step": 8988, + "text_loss": 0.7284399271011353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 6.463387976984437e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14496944.0, + "repeat_count": 0.0, + "routers_loss": 0.0019588395953178406, + "skip_count": 1.0, + "step": 8990, + "text_loss": 0.8103306889533997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 6.448175615031749e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 14499997.0, + "repeat_count": 0.0, + "routers_loss": 0.008046228438615799, + "skip_count": 1.0, + "step": 8992, + "text_loss": 0.14758773148059845 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 6.432979942220319e-05, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 14503247.0, + "repeat_count": 1.0, + "routers_loss": 0.0028899910394102335, + "skip_count": 0.0, + "step": 8994, + "text_loss": 0.2568151652812958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 6.417800964373161e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 14506244.0, + "repeat_count": 0.0, + "routers_loss": 0.0042211092077195644, + "skip_count": 2.0, + "step": 8996, + "text_loss": 0.3506850600242615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 6.402638687306872e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 14510502.0, + "repeat_count": 0.0, + "routers_loss": 0.003309462917968631, + "skip_count": 0.0, + "step": 8998, + "text_loss": 0.5852319598197937 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 42.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 6.387493116831699e-05, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 14513679.0, + "repeat_count": 1.0, + "routers_loss": 0.015246274881064892, + "skip_count": 5.0, + "step": 9000, + "text_loss": 0.4266709089279175 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 42.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 6.372364258751434e-05, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 14516862.0, + "repeat_count": 2.0, + "routers_loss": 0.005648075137287378, + "skip_count": 2.0, + "step": 9002, + "text_loss": 0.34153711795806885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 42.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 6.357252118863482e-05, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 14519660.0, + "repeat_count": 0.0, + "routers_loss": 0.005153972655534744, + "skip_count": 3.0, + "step": 9004, + "text_loss": 0.3911980092525482 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 6.342156702958851e-05, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 14522261.0, + "repeat_count": 0.0, + "routers_loss": 0.001209715730510652, + "skip_count": 0.0, + "step": 9006, + "text_loss": 0.45400822162628174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023193359375, + "learning_rate": 6.327078016822124e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 14525368.0, + "repeat_count": 0.0, + "routers_loss": 0.00367624219506979, + "skip_count": 1.0, + "step": 9008, + "text_loss": 0.5327706336975098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 6.31201606623149e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14528253.0, + "repeat_count": 0.0, + "routers_loss": 0.0018971028039231896, + "skip_count": 0.0, + "step": 9010, + "text_loss": 0.19216643273830414 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 6.296970856958712e-05, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 14531214.0, + "repeat_count": 1.0, + "routers_loss": 0.003927265293896198, + "skip_count": 0.0, + "step": 9012, + "text_loss": 0.3931650221347809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 6.281942394769142e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14535063.0, + "repeat_count": 0.0, + "routers_loss": 0.00801338441669941, + "skip_count": 0.0, + "step": 9014, + "text_loss": 0.1605554074048996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 6.266930685421717e-05, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 14538690.0, + "repeat_count": 0.0, + "routers_loss": 0.0013267790200188756, + "skip_count": 0.0, + "step": 9016, + "text_loss": 0.4797641932964325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 6.251935734668957e-05, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 14542591.0, + "repeat_count": 0.0, + "routers_loss": 0.0013866537483409047, + "skip_count": 1.0, + "step": 9018, + "text_loss": 0.4539037346839905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 6.236957548256945e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 14545259.0, + "repeat_count": 0.0, + "routers_loss": 0.001481749233789742, + "skip_count": 0.0, + "step": 9020, + "text_loss": 0.6693689227104187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 6.22199613192535e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 14548362.0, + "repeat_count": 0.0, + "routers_loss": 0.005995423533022404, + "skip_count": 1.0, + "step": 9022, + "text_loss": 0.6533607244491577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 42.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 6.207051491407428e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 14551694.0, + "repeat_count": 0.0, + "routers_loss": 0.015427720732986927, + "skip_count": 4.0, + "step": 9024, + "text_loss": 0.33537840843200684 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 6.192123632429986e-05, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 14554614.0, + "repeat_count": 1.0, + "routers_loss": 0.0017432396998628974, + "skip_count": 0.0, + "step": 9026, + "text_loss": 0.9725127220153809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 6.177212560713413e-05, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 14559474.0, + "repeat_count": 0.0, + "routers_loss": 0.002909898292273283, + "skip_count": 2.0, + "step": 9028, + "text_loss": 0.16944198310375214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 6.162318281971652e-05, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 14563046.0, + "repeat_count": 0.0, + "routers_loss": 0.00274385092779994, + "skip_count": 0.0, + "step": 9030, + "text_loss": 0.43176764249801636 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.022216796875, + "learning_rate": 6.147440801912218e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 14565829.0, + "repeat_count": 1.0, + "routers_loss": 0.0024230771232396364, + "skip_count": 0.0, + "step": 9032, + "text_loss": 0.5683854818344116 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 6.132580126236197e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14569016.0, + "repeat_count": 0.0, + "routers_loss": 0.004686394706368446, + "skip_count": 1.0, + "step": 9034, + "text_loss": 0.5422781705856323 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 42.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 6.117736260638223e-05, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 14572558.0, + "repeat_count": 2.0, + "routers_loss": 0.0010892068967223167, + "skip_count": 1.0, + "step": 9036, + "text_loss": 0.5740243196487427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.43205165835045, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 6.102909210806495e-05, + "loss": 0.006, + "macro_f1": 0.3272727429866791, + "num_tokens": 14575969.0, + "repeat_count": 1.0, + "routers_loss": 0.0163960512727499, + "skip_count": 0.0, + "step": 9038, + "text_loss": 0.4803958535194397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 6.088098982422768e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14578746.0, + "repeat_count": 0.0, + "routers_loss": 0.0020733694545924664, + "skip_count": 0.0, + "step": 9040, + "text_loss": 0.30313390493392944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.45083651306135, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 6.073305581162342e-05, + "loss": 0.0066, + "macro_f1": 0.6601307392120361, + "num_tokens": 14581856.0, + "repeat_count": 1.0, + "routers_loss": 0.022739989683032036, + "skip_count": 2.0, + "step": 9042, + "text_loss": 0.5871608257293701 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 6.058529012694086e-05, + "loss": 0.0034, + "macro_f1": 1.0, + "num_tokens": 14584754.0, + "repeat_count": 1.0, + "routers_loss": 0.012138293124735355, + "skip_count": 2.0, + "step": 9044, + "text_loss": 0.18492890894412994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053466796875, + "learning_rate": 6.0437692826803893e-05, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 14587867.0, + "repeat_count": 0.0, + "routers_loss": 0.0009839123813435435, + "skip_count": 0.0, + "step": 9046, + "text_loss": 0.5532476902008057 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 42.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.11376953125, + "learning_rate": 6.029026396777237e-05, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 14591521.0, + "repeat_count": 2.0, + "routers_loss": 0.01392262615263462, + "skip_count": 5.0, + "step": 9048, + "text_loss": 0.20356278121471405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.48840622248312, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 6.0143003606341174e-05, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 14595358.0, + "repeat_count": 0.0, + "routers_loss": 0.018218200653791428, + "skip_count": 1.0, + "step": 9050, + "text_loss": 0.3070164620876312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.019775390625, + "learning_rate": 5.9995911798940764e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 14598696.0, + "repeat_count": 0.0, + "routers_loss": 0.0002688709646463394, + "skip_count": 1.0, + "step": 9052, + "text_loss": 0.5637917518615723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 5.984898860193694e-05, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 14602301.0, + "repeat_count": 0.0, + "routers_loss": 0.003135781968012452, + "skip_count": 0.0, + "step": 9054, + "text_loss": 0.345111608505249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 5.9702234071631e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 14606625.0, + "repeat_count": 0.0, + "routers_loss": 0.002299862913787365, + "skip_count": 0.0, + "step": 9056, + "text_loss": 0.30707255005836487 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 5.9555648264259576e-05, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 14610303.0, + "repeat_count": 1.0, + "routers_loss": 0.0007164468406699598, + "skip_count": 0.0, + "step": 9058, + "text_loss": 0.56083083152771 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 5.940923123599462e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 14613211.0, + "repeat_count": 0.0, + "routers_loss": 0.00136603566352278, + "skip_count": 0.0, + "step": 9060, + "text_loss": 0.4455239474773407 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 5.926298304294336e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 14615844.0, + "repeat_count": 0.0, + "routers_loss": 0.001727075781673193, + "skip_count": 0.0, + "step": 9062, + "text_loss": 0.5928102731704712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 5.911690374114842e-05, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 14619190.0, + "repeat_count": 0.0, + "routers_loss": 0.0022300337441265583, + "skip_count": 0.0, + "step": 9064, + "text_loss": 0.9456163048744202 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 5.8970993386587676e-05, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 14622304.0, + "repeat_count": 0.0, + "routers_loss": 0.006507525686174631, + "skip_count": 2.0, + "step": 9066, + "text_loss": 0.1809750199317932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 5.882525203517419e-05, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 14625386.0, + "repeat_count": 0.0, + "routers_loss": 0.0022866397630423307, + "skip_count": 0.0, + "step": 9068, + "text_loss": 0.1849939227104187 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.048095703125, + "learning_rate": 5.867967974275629e-05, + "loss": 0.0097, + "macro_f1": 1.0, + "num_tokens": 14628472.0, + "repeat_count": 1.0, + "routers_loss": 0.0058460538275539875, + "skip_count": 2.0, + "step": 9070, + "text_loss": 0.2627561688423157 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 5.853427656511773e-05, + "loss": 0.0071, + "macro_f1": 1.0, + "num_tokens": 14631187.0, + "repeat_count": 1.0, + "routers_loss": 0.0085217310115695, + "skip_count": 2.0, + "step": 9072, + "text_loss": 0.18039973080158234 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 42.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 5.838904255797717e-05, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 14633919.0, + "repeat_count": 1.0, + "routers_loss": 0.007423012051731348, + "skip_count": 4.0, + "step": 9074, + "text_loss": 0.23746201395988464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 5.8243977776988585e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 14636674.0, + "repeat_count": 0.0, + "routers_loss": 0.0011181328445672989, + "skip_count": 0.0, + "step": 9076, + "text_loss": 0.38140806555747986 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 42.619900205459345, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 5.8099082277741024e-05, + "loss": 0.0052, + "macro_f1": 0.9262410998344421, + "num_tokens": 14639506.0, + "repeat_count": 3.0, + "routers_loss": 0.03306882083415985, + "skip_count": 2.0, + "step": 9078, + "text_loss": 0.2627770006656647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 5.795435611575872e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 14642955.0, + "repeat_count": 0.0, + "routers_loss": 0.0014759303303435445, + "skip_count": 0.0, + "step": 9080, + "text_loss": 0.47112786769866943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 5.78097993465011e-05, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 14646018.0, + "repeat_count": 0.0, + "routers_loss": 0.003744201036170125, + "skip_count": 0.0, + "step": 9082, + "text_loss": 0.36873605847358704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 5.7665412025362516e-05, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 14649402.0, + "repeat_count": 0.0, + "routers_loss": 0.002992798574268818, + "skip_count": 2.0, + "step": 9084, + "text_loss": 0.6350628137588501 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 5.752119420767243e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 14652248.0, + "repeat_count": 0.0, + "routers_loss": 0.005798593629151583, + "skip_count": 2.0, + "step": 9086, + "text_loss": 0.2512637972831726 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 5.7377145948695474e-05, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 14655060.0, + "repeat_count": 0.0, + "routers_loss": 0.0024162146728485823, + "skip_count": 0.0, + "step": 9088, + "text_loss": 0.4233066439628601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 42.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 5.723326730363115e-05, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 14658873.0, + "repeat_count": 1.0, + "routers_loss": 0.004826475866138935, + "skip_count": 4.0, + "step": 9090, + "text_loss": 0.45946353673934937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 5.7089558327614036e-05, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 14661865.0, + "repeat_count": 0.0, + "routers_loss": 0.0020765739027410746, + "skip_count": 2.0, + "step": 9092, + "text_loss": 0.9425542950630188 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 5.694601907571356e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 14666085.0, + "repeat_count": 0.0, + "routers_loss": 0.0012533976696431637, + "skip_count": 0.0, + "step": 9094, + "text_loss": 0.6307007670402527 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 42.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 5.680264960293446e-05, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 14668992.0, + "repeat_count": 1.0, + "routers_loss": 0.013796845450997353, + "skip_count": 5.0, + "step": 9096, + "text_loss": 0.21720129251480103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 5.665944996421612e-05, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 14672365.0, + "repeat_count": 0.0, + "routers_loss": 0.004391494672745466, + "skip_count": 0.0, + "step": 9098, + "text_loss": 0.28794240951538086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 5.651642021443287e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 14676232.0, + "repeat_count": 0.0, + "routers_loss": 0.0006779583054594696, + "skip_count": 0.0, + "step": 9100, + "text_loss": 0.45190441608428955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 23.0, + "epoch": 42.73260933372468, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.0213623046875, + "learning_rate": 5.637356040839398e-05, + "loss": 0.0049, + "macro_f1": 0.6289562582969666, + "num_tokens": 14679582.0, + "repeat_count": 0.0, + "routers_loss": 0.02379363216459751, + "skip_count": 6.0, + "step": 9102, + "text_loss": 0.3395652770996094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 42.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 5.623087060084364e-05, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 14683438.0, + "repeat_count": 0.0, + "routers_loss": 0.00344930961728096, + "skip_count": 4.0, + "step": 9104, + "text_loss": 0.4345538914203644 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 42.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 5.60883508464608e-05, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 14686333.0, + "repeat_count": 0.0, + "routers_loss": 0.005554547533392906, + "skip_count": 3.0, + "step": 9106, + "text_loss": 0.5202528238296509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 5.594600119985932e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 14690754.0, + "repeat_count": 0.0, + "routers_loss": 0.004589532967656851, + "skip_count": 1.0, + "step": 9108, + "text_loss": 0.3040390610694885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.77017904314646, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 5.580382171558784e-05, + "loss": 0.0055, + "macro_f1": 0.32098764181137085, + "num_tokens": 14693793.0, + "repeat_count": 0.0, + "routers_loss": 0.029969461262226105, + "skip_count": 2.0, + "step": 9110, + "text_loss": 0.3644331693649292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 5.566181244812979e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14697290.0, + "repeat_count": 0.0, + "routers_loss": 0.003387648146599531, + "skip_count": 0.0, + "step": 9112, + "text_loss": 0.5177932977676392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 5.5519973451903404e-05, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 14700597.0, + "repeat_count": 0.0, + "routers_loss": 0.004790942650288343, + "skip_count": 1.0, + "step": 9114, + "text_loss": 0.2132686972618103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 5.5378304781261715e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 14703852.0, + "repeat_count": 0.0, + "routers_loss": 0.0007685191812925041, + "skip_count": 0.0, + "step": 9116, + "text_loss": 0.6690551042556763 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 5.523680649049234e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 14707218.0, + "repeat_count": 1.0, + "routers_loss": 0.0033531817607581615, + "skip_count": 0.0, + "step": 9118, + "text_loss": 0.26232191920280457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.81714117992369, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 5.509547863381781e-05, + "loss": 0.0084, + "macro_f1": 0.3272727429866791, + "num_tokens": 14710244.0, + "repeat_count": 1.0, + "routers_loss": 0.025616342201828957, + "skip_count": 0.0, + "step": 9120, + "text_loss": 0.2897983193397522 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 5.495432126539507e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14713495.0, + "repeat_count": 0.0, + "routers_loss": 0.0014400121290236712, + "skip_count": 0.0, + "step": 9122, + "text_loss": 0.4580271244049072 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 5.481333443931602e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 14716703.0, + "repeat_count": 0.0, + "routers_loss": 0.0008548611658625305, + "skip_count": 0.0, + "step": 9124, + "text_loss": 0.5140601992607117 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.84531846199002, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 5.4672518209607e-05, + "loss": 0.0075, + "macro_f1": 0.9255813956260681, + "num_tokens": 14719443.0, + "repeat_count": 3.0, + "routers_loss": 0.02092800848186016, + "skip_count": 4.0, + "step": 9126, + "text_loss": 0.2842077314853668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 5.4531872630228965e-05, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 14722711.0, + "repeat_count": 0.0, + "routers_loss": 0.0037711653858423233, + "skip_count": 0.0, + "step": 9128, + "text_loss": 0.3268158733844757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 5.4391397755077784e-05, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 14725635.0, + "repeat_count": 0.0, + "routers_loss": 0.005959369707852602, + "skip_count": 0.0, + "step": 9130, + "text_loss": 0.44725099205970764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0181884765625, + "learning_rate": 5.425109363798358e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 14728945.0, + "repeat_count": 0.0, + "routers_loss": 0.0011272960109636188, + "skip_count": 0.0, + "step": 9132, + "text_loss": 0.45580998063087463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0167236328125, + "learning_rate": 5.411096033271118e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 14732271.0, + "repeat_count": 0.0, + "routers_loss": 0.0015554855344817042, + "skip_count": 0.0, + "step": 9134, + "text_loss": 0.16767354309558868 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 5.3970997892959894e-05, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 14735462.0, + "repeat_count": 4.0, + "routers_loss": 0.007287262007594109, + "skip_count": 5.0, + "step": 9136, + "text_loss": 0.8925374746322632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 5.383120637236366e-05, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 14739288.0, + "repeat_count": 0.0, + "routers_loss": 0.004336730111390352, + "skip_count": 0.0, + "step": 9138, + "text_loss": 0.29503148794174194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 5.369158582449074e-05, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 14742058.0, + "repeat_count": 0.0, + "routers_loss": 0.004528806544840336, + "skip_count": 0.0, + "step": 9140, + "text_loss": 0.16937516629695892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 5.3552136302844e-05, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 14745628.0, + "repeat_count": 0.0, + "routers_loss": 0.0005676734144799411, + "skip_count": 0.0, + "step": 9142, + "text_loss": 0.48764488101005554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 5.3412857860860917e-05, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 14748482.0, + "repeat_count": 0.0, + "routers_loss": 0.0017468055011704564, + "skip_count": 0.0, + "step": 9144, + "text_loss": 0.46164339780807495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.93924273554447, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 5.327375055191314e-05, + "loss": 0.0051, + "macro_f1": 0.3272727429866791, + "num_tokens": 14751091.0, + "repeat_count": 0.0, + "routers_loss": 0.007167307659983635, + "skip_count": 1.0, + "step": 9146, + "text_loss": 0.37566086649894714 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 5.3134814429306896e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 14753850.0, + "repeat_count": 0.0, + "routers_loss": 0.003801940008997917, + "skip_count": 2.0, + "step": 9148, + "text_loss": 0.17589576542377472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 5.299604954628268e-05, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 14756779.0, + "repeat_count": 0.0, + "routers_loss": 0.00396628538146615, + "skip_count": 1.0, + "step": 9150, + "text_loss": 0.4118746817111969 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 42.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 5.2857455956015544e-05, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 14759574.0, + "repeat_count": 2.0, + "routers_loss": 0.003950111567974091, + "skip_count": 0.0, + "step": 9152, + "text_loss": 0.5839328169822693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 5.271903371161479e-05, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 14762802.0, + "repeat_count": 0.0, + "routers_loss": 0.0006622051005251706, + "skip_count": 1.0, + "step": 9154, + "text_loss": 0.40162989497184753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 5.2580782866124054e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 14766136.0, + "repeat_count": 0.0, + "routers_loss": 0.003140404587611556, + "skip_count": 0.0, + "step": 9156, + "text_loss": 0.2028028815984726 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 5.244270347252139e-05, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 14769306.0, + "repeat_count": 0.0, + "routers_loss": 0.0035792726557701826, + "skip_count": 1.0, + "step": 9158, + "text_loss": 0.5611430406570435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 5.2304795583719034e-05, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 14771928.0, + "repeat_count": 0.0, + "routers_loss": 0.007276696152985096, + "skip_count": 2.0, + "step": 9160, + "text_loss": 0.1382172554731369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 5.2167059252563485e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 14775047.0, + "repeat_count": 0.0, + "routers_loss": 0.003121814923360944, + "skip_count": 0.0, + "step": 9162, + "text_loss": 0.6130381226539612 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 31.0, + "epoch": 43.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 5.2029494531835695e-05, + "loss": 0.0071, + "macro_f1": 1.0, + "num_tokens": 14777746.0, + "repeat_count": 4.0, + "routers_loss": 0.006029475014656782, + "skip_count": 1.0, + "step": 9164, + "text_loss": 0.5901363492012024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 43.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 5.189210147425061e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 14780813.0, + "repeat_count": 0.0, + "routers_loss": 0.0034428017679601908, + "skip_count": 5.0, + "step": 9166, + "text_loss": 0.5909968018531799 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 5.1754880132457494e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 14785178.0, + "repeat_count": 0.0, + "routers_loss": 0.0025068193208426237, + "skip_count": 2.0, + "step": 9168, + "text_loss": 0.20257101953029633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 5.161783055904001e-05, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 14788307.0, + "repeat_count": 0.0, + "routers_loss": 0.003352245781570673, + "skip_count": 0.0, + "step": 9170, + "text_loss": 0.20024186372756958 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 43.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 5.1480952806515654e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 14791053.0, + "repeat_count": 1.0, + "routers_loss": 0.0009423785959370434, + "skip_count": 0.0, + "step": 9172, + "text_loss": 0.6944412589073181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 5.13442469273363e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14794259.0, + "repeat_count": 0.0, + "routers_loss": 0.0016676477389410138, + "skip_count": 0.0, + "step": 9174, + "text_loss": 0.10889370739459991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 5.1207712973887875e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 14797345.0, + "repeat_count": 0.0, + "routers_loss": 0.005842766724526882, + "skip_count": 2.0, + "step": 9176, + "text_loss": 0.17763052880764008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 5.107135099849042e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 14800819.0, + "repeat_count": 0.0, + "routers_loss": 0.0004951528972014785, + "skip_count": 0.0, + "step": 9178, + "text_loss": 0.43891432881355286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 5.093516105339818e-05, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 14803924.0, + "repeat_count": 0.0, + "routers_loss": 0.0031010014936327934, + "skip_count": 1.0, + "step": 9180, + "text_loss": 0.39177098870277405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 5.079914319079931e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14807083.0, + "repeat_count": 0.0, + "routers_loss": 0.00047361713950522244, + "skip_count": 0.0, + "step": 9182, + "text_loss": 0.39144888520240784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 5.066329746281617e-05, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 14810263.0, + "repeat_count": 0.0, + "routers_loss": 0.0018734827172011137, + "skip_count": 0.0, + "step": 9184, + "text_loss": 0.531446099281311 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 5.052762392150506e-05, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 14813761.0, + "repeat_count": 0.0, + "routers_loss": 0.00503428652882576, + "skip_count": 0.0, + "step": 9186, + "text_loss": 0.19398775696754456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 5.039212261885634e-05, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 14817708.0, + "repeat_count": 0.0, + "routers_loss": 0.0010842647170647979, + "skip_count": 0.0, + "step": 9188, + "text_loss": 0.5365647077560425 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 43.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0172119140625, + "learning_rate": 5.025679360679442e-05, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 14820912.0, + "repeat_count": 2.0, + "routers_loss": 0.004775309935212135, + "skip_count": 2.0, + "step": 9190, + "text_loss": 0.6473321318626404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 5.012163693717747e-05, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 14824115.0, + "repeat_count": 0.0, + "routers_loss": 0.004022061824798584, + "skip_count": 0.0, + "step": 9192, + "text_loss": 0.24432586133480072 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 4.9986652661798025e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14827404.0, + "repeat_count": 0.0, + "routers_loss": 0.00231996551156044, + "skip_count": 1.0, + "step": 9194, + "text_loss": 0.7459486722946167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 4.98518408323822e-05, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 14830077.0, + "repeat_count": 0.0, + "routers_loss": 0.000999651150777936, + "skip_count": 0.0, + "step": 9196, + "text_loss": 0.5136345624923706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 4.971720150059012e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 14833231.0, + "repeat_count": 0.0, + "routers_loss": 0.0033226648811250925, + "skip_count": 2.0, + "step": 9198, + "text_loss": 0.1597593128681183 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 4.958273471801583e-05, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 14836534.0, + "repeat_count": 0.0, + "routers_loss": 0.00400200579315424, + "skip_count": 0.0, + "step": 9200, + "text_loss": 0.16248664259910583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 4.94484405361873e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 14840301.0, + "repeat_count": 0.0, + "routers_loss": 0.0038636941462755203, + "skip_count": 0.0, + "step": 9202, + "text_loss": 0.20964740216732025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 4.9314319006566296e-05, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 14844094.0, + "repeat_count": 0.0, + "routers_loss": 0.00593461561948061, + "skip_count": 2.0, + "step": 9204, + "text_loss": 0.43311986327171326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0166015625, + "learning_rate": 4.918037018054844e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 14847148.0, + "repeat_count": 0.0, + "routers_loss": 0.0007939442875795066, + "skip_count": 0.0, + "step": 9206, + "text_loss": 0.8805840015411377 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 4.904659410946311e-05, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 14851556.0, + "repeat_count": 2.0, + "routers_loss": 0.0058822291903197765, + "skip_count": 4.0, + "step": 9208, + "text_loss": 0.2123873233795166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 4.891299084457362e-05, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 14855208.0, + "repeat_count": 0.0, + "routers_loss": 0.0024413811042904854, + "skip_count": 0.0, + "step": 9210, + "text_loss": 0.4408712685108185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 4.8779560437076983e-05, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 14858433.0, + "repeat_count": 0.0, + "routers_loss": 0.007487752009183168, + "skip_count": 1.0, + "step": 9212, + "text_loss": 0.7417129874229431 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 4.864630293810401e-05, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 14861739.0, + "repeat_count": 0.0, + "routers_loss": 0.007972145453095436, + "skip_count": 2.0, + "step": 9214, + "text_loss": 0.3347324728965759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 4.851321839871908e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 14865220.0, + "repeat_count": 0.0, + "routers_loss": 0.006238576490432024, + "skip_count": 1.0, + "step": 9216, + "text_loss": 0.49660998582839966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 4.838030686992062e-05, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 14868179.0, + "repeat_count": 0.0, + "routers_loss": 0.003592922119423747, + "skip_count": 0.0, + "step": 9218, + "text_loss": 0.316535621881485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 43.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 4.824756840264055e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 14870950.0, + "repeat_count": 0.0, + "routers_loss": 0.012321153655648232, + "skip_count": 3.0, + "step": 9220, + "text_loss": 0.270915150642395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 4.8115003047744466e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 14873749.0, + "repeat_count": 0.0, + "routers_loss": 0.0008396002231165767, + "skip_count": 0.0, + "step": 9222, + "text_loss": 0.4190096855163574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0169677734375, + "learning_rate": 4.798261085603162e-05, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 14877349.0, + "repeat_count": 0.0, + "routers_loss": 0.002983161248266697, + "skip_count": 1.0, + "step": 9224, + "text_loss": 0.8203139901161194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 4.785039187823503e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 14881192.0, + "repeat_count": 0.0, + "routers_loss": 0.003951616585254669, + "skip_count": 2.0, + "step": 9226, + "text_loss": 0.36447709798812866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 4.771834616502119e-05, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 14884608.0, + "repeat_count": 0.0, + "routers_loss": 0.001604852732270956, + "skip_count": 0.0, + "step": 9228, + "text_loss": 0.733951985836029 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.333431171118285, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 4.758647376699032e-05, + "loss": 0.0053, + "macro_f1": 0.8820862174034119, + "num_tokens": 14887963.0, + "repeat_count": 2.0, + "routers_loss": 0.041028670966625214, + "skip_count": 2.0, + "step": 9230, + "text_loss": 0.1800784021615982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 4.7454774734676074e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 14890769.0, + "repeat_count": 0.0, + "routers_loss": 0.0027380166575312614, + "skip_count": 0.0, + "step": 9232, + "text_loss": 0.6017972230911255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 4.732324911854591e-05, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 14894162.0, + "repeat_count": 0.0, + "routers_loss": 0.0018064725445583463, + "skip_count": 2.0, + "step": 9234, + "text_loss": 0.5853637456893921 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 43.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 4.7191896969000617e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 14897248.0, + "repeat_count": 1.0, + "routers_loss": 0.005479716695845127, + "skip_count": 0.0, + "step": 9236, + "text_loss": 0.6206526756286621 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 43.371000880540066, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 4.706071833637454e-05, + "loss": 0.0059, + "macro_f1": 0.9446290731430054, + "num_tokens": 14900186.0, + "repeat_count": 4.0, + "routers_loss": 0.013435420580208302, + "skip_count": 3.0, + "step": 9238, + "text_loss": 0.46402135491371155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 43.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 4.692971327093559e-05, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 14903080.0, + "repeat_count": 1.0, + "routers_loss": 0.007366253528743982, + "skip_count": 4.0, + "step": 9240, + "text_loss": 0.6870771646499634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 4.6798881822885276e-05, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 14906837.0, + "repeat_count": 1.0, + "routers_loss": 0.004979560151696205, + "skip_count": 2.0, + "step": 9242, + "text_loss": 0.46396589279174805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 4.666822404235838e-05, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 14909541.0, + "repeat_count": 0.0, + "routers_loss": 0.00023516178771387786, + "skip_count": 0.0, + "step": 9244, + "text_loss": 0.5960518717765808 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 43.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 4.6537739979423174e-05, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 14912820.0, + "repeat_count": 1.0, + "routers_loss": 0.0014796241885051131, + "skip_count": 1.0, + "step": 9246, + "text_loss": 0.48075684905052185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 4.640742968408146e-05, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 14916283.0, + "repeat_count": 0.0, + "routers_loss": 0.001386807532981038, + "skip_count": 0.0, + "step": 9248, + "text_loss": 0.3950015902519226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 43.427355444672735, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.037109375, + "learning_rate": 4.627729320626833e-05, + "loss": 0.0061, + "macro_f1": 0.9452888369560242, + "num_tokens": 14918958.0, + "repeat_count": 1.0, + "routers_loss": 0.020335515961050987, + "skip_count": 4.0, + "step": 9250, + "text_loss": 0.6995832324028015 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 4.6147330595852354e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14921888.0, + "repeat_count": 0.0, + "routers_loss": 0.005387732293456793, + "skip_count": 2.0, + "step": 9252, + "text_loss": 0.2771800756454468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 4.601754190263552e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14925135.0, + "repeat_count": 0.0, + "routers_loss": 0.001703745685517788, + "skip_count": 1.0, + "step": 9254, + "text_loss": 0.7100088596343994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 4.5887927176352875e-05, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 14929198.0, + "repeat_count": 0.0, + "routers_loss": 0.0058114733546972275, + "skip_count": 2.0, + "step": 9256, + "text_loss": 0.21729083359241486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 4.5758486466673244e-05, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 14932685.0, + "repeat_count": 0.0, + "routers_loss": 0.0026105218566954136, + "skip_count": 0.0, + "step": 9258, + "text_loss": 0.20695121586322784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 4.5629219823198564e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 14937901.0, + "repeat_count": 0.0, + "routers_loss": 0.006947176996618509, + "skip_count": 2.0, + "step": 9260, + "text_loss": 0.15886647999286652 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 4.550012729546393e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 14941406.0, + "repeat_count": 0.0, + "routers_loss": 0.0011366386897861958, + "skip_count": 0.0, + "step": 9262, + "text_loss": 0.49892309308052063 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 43.49310243616085, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 4.537120893293789e-05, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 14944200.0, + "repeat_count": 1.0, + "routers_loss": 0.002686526160687208, + "skip_count": 1.0, + "step": 9264, + "text_loss": 0.6201852560043335 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 4.5242464785022256e-05, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 14947592.0, + "repeat_count": 0.0, + "routers_loss": 0.0007816873257979751, + "skip_count": 0.0, + "step": 9266, + "text_loss": 0.49434536695480347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 4.5113894901051944e-05, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 14950382.0, + "repeat_count": 0.0, + "routers_loss": 0.0013167982688173652, + "skip_count": 0.0, + "step": 9268, + "text_loss": 0.696306586265564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 43.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 4.498549933029511e-05, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 14953424.0, + "repeat_count": 0.0, + "routers_loss": 0.006240467075258493, + "skip_count": 3.0, + "step": 9270, + "text_loss": 0.14193731546401978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 4.485727812195339e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14956937.0, + "repeat_count": 0.0, + "routers_loss": 0.006212725769728422, + "skip_count": 2.0, + "step": 9272, + "text_loss": 0.40858668088912964 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 4.472923132516132e-05, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 14960398.0, + "repeat_count": 0.0, + "routers_loss": 0.003120801877230406, + "skip_count": 2.0, + "step": 9274, + "text_loss": 0.4740981459617615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 4.46013589889866e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 14963037.0, + "repeat_count": 0.0, + "routers_loss": 0.0027343074325472116, + "skip_count": 0.0, + "step": 9276, + "text_loss": 0.1420614868402481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 4.4473661162430176e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14965604.0, + "repeat_count": 0.0, + "routers_loss": 0.0006372901843860745, + "skip_count": 0.0, + "step": 9278, + "text_loss": 0.4628531336784363 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 4.4346137894426155e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 14968803.0, + "repeat_count": 0.0, + "routers_loss": 0.0062922025099396706, + "skip_count": 2.0, + "step": 9280, + "text_loss": 0.29813849925994873 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 4.421878923384159e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 14972557.0, + "repeat_count": 0.0, + "routers_loss": 0.006071912590414286, + "skip_count": 2.0, + "step": 9282, + "text_loss": 0.19581027328968048 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 43.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 4.40916152294768e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 14975358.0, + "repeat_count": 1.0, + "routers_loss": 0.001606325968168676, + "skip_count": 0.0, + "step": 9284, + "text_loss": 0.6929896473884583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 4.3964615930065124e-05, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 14978045.0, + "repeat_count": 0.0, + "routers_loss": 0.002845643786713481, + "skip_count": 1.0, + "step": 9286, + "text_loss": 0.49997636675834656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 4.3837791384272744e-05, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 14981606.0, + "repeat_count": 0.0, + "routers_loss": 0.005257320590317249, + "skip_count": 1.0, + "step": 9288, + "text_loss": 0.3391074538230896 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.61520399178163, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 4.3711141640699395e-05, + "loss": 0.0045, + "macro_f1": 0.8820862174034119, + "num_tokens": 14984404.0, + "repeat_count": 2.0, + "routers_loss": 0.02914038859307766, + "skip_count": 2.0, + "step": 9290, + "text_loss": 0.29165980219841003 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 4.3584666747877254e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 14987280.0, + "repeat_count": 0.0, + "routers_loss": 0.005831835325807333, + "skip_count": 1.0, + "step": 9292, + "text_loss": 0.5312305688858032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 4.345836675427184e-05, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 14990071.0, + "repeat_count": 0.0, + "routers_loss": 0.0035566375590860844, + "skip_count": 0.0, + "step": 9294, + "text_loss": 0.25595441460609436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 4.333224170828149e-05, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 14993809.0, + "repeat_count": 0.0, + "routers_loss": 0.0026552488561719656, + "skip_count": 0.0, + "step": 9296, + "text_loss": 0.18538808822631836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 43.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 4.3206291658237586e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 14996794.0, + "repeat_count": 0.0, + "routers_loss": 0.010047328658401966, + "skip_count": 4.0, + "step": 9298, + "text_loss": 0.37891554832458496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 4.308051665240442e-05, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 15000911.0, + "repeat_count": 0.0, + "routers_loss": 0.0030308531131595373, + "skip_count": 0.0, + "step": 9300, + "text_loss": 0.20204831659793854 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 4.295491673897922e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 15004106.0, + "repeat_count": 0.0, + "routers_loss": 0.003695673542097211, + "skip_count": 1.0, + "step": 9302, + "text_loss": 0.84013831615448 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 4.282949196609215e-05, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 15007482.0, + "repeat_count": 0.0, + "routers_loss": 0.000820459274109453, + "skip_count": 0.0, + "step": 9304, + "text_loss": 0.4521652162075043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 4.2704242381806144e-05, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 15010579.0, + "repeat_count": 0.0, + "routers_loss": 0.006170184817165136, + "skip_count": 1.0, + "step": 9306, + "text_loss": 0.22438007593154907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 43.699735837980626, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.051025390625, + "learning_rate": 4.25791680341171e-05, + "loss": 0.0065, + "macro_f1": 0.6122449040412903, + "num_tokens": 15013835.0, + "repeat_count": 0.0, + "routers_loss": 0.021745599806308746, + "skip_count": 4.0, + "step": 9308, + "text_loss": 0.5847432613372803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 4.245426897095372e-05, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 15017268.0, + "repeat_count": 0.0, + "routers_loss": 0.0022570823784917593, + "skip_count": 1.0, + "step": 9310, + "text_loss": 0.345931738615036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 4.232954524017763e-05, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 15020095.0, + "repeat_count": 0.0, + "routers_loss": 0.0009895693510770798, + "skip_count": 0.0, + "step": 9312, + "text_loss": 0.5374923944473267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 4.220499688958307e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 15022763.0, + "repeat_count": 0.0, + "routers_loss": 0.005146807990968227, + "skip_count": 0.0, + "step": 9314, + "text_loss": 0.7208939790725708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 4.208062396689738e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 15025926.0, + "repeat_count": 0.0, + "routers_loss": 0.00369556387886405, + "skip_count": 1.0, + "step": 9316, + "text_loss": 0.36686572432518005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 4.1956426519780435e-05, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 15029120.0, + "repeat_count": 0.0, + "routers_loss": 0.00971714872866869, + "skip_count": 2.0, + "step": 9318, + "text_loss": 0.20697914063930511 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 43.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 4.183240459582488e-05, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 15032000.0, + "repeat_count": 1.0, + "routers_loss": 0.002361048012971878, + "skip_count": 1.0, + "step": 9320, + "text_loss": 0.6737313866615295 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 4.1708558242556207e-05, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 15034831.0, + "repeat_count": 0.0, + "routers_loss": 0.001238204538822174, + "skip_count": 0.0, + "step": 9322, + "text_loss": 0.823642373085022 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 4.1584887507432556e-05, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 15037487.0, + "repeat_count": 0.0, + "routers_loss": 0.005211949814110994, + "skip_count": 1.0, + "step": 9324, + "text_loss": 0.3821350634098053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 4.146139243784475e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 15040167.0, + "repeat_count": 0.0, + "routers_loss": 0.007513152435421944, + "skip_count": 0.0, + "step": 9326, + "text_loss": 0.18124167621135712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 4.133807308111637e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 15043777.0, + "repeat_count": 0.0, + "routers_loss": 0.0029832208529114723, + "skip_count": 0.0, + "step": 9328, + "text_loss": 0.47313618659973145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 4.1214929484503615e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 15046622.0, + "repeat_count": 0.0, + "routers_loss": 0.009155526757240295, + "skip_count": 1.0, + "step": 9330, + "text_loss": 0.20556017756462097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 4.1091961695195304e-05, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 15049543.0, + "repeat_count": 0.0, + "routers_loss": 0.003529169363901019, + "skip_count": 0.0, + "step": 9332, + "text_loss": 0.18752245604991913 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 4.0969169760313005e-05, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 15052924.0, + "repeat_count": 1.0, + "routers_loss": 0.002136822324246168, + "skip_count": 2.0, + "step": 9334, + "text_loss": 0.85563725233078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053466796875, + "learning_rate": 4.084655372691076e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 15056579.0, + "repeat_count": 0.0, + "routers_loss": 0.003167972667142749, + "skip_count": 2.0, + "step": 9336, + "text_loss": 0.45709627866744995 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 43.8406222483123, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0240478515625, + "learning_rate": 4.07241136419752e-05, + "loss": 0.0048, + "macro_f1": 0.5492662787437439, + "num_tokens": 15059739.0, + "repeat_count": 0.0, + "routers_loss": 0.03742539510130882, + "skip_count": 2.0, + "step": 9338, + "text_loss": 0.19531641900539398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 4.06018495524258e-05, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 15062795.0, + "repeat_count": 0.0, + "routers_loss": 0.002699678996577859, + "skip_count": 0.0, + "step": 9340, + "text_loss": 0.31032654643058777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 4.047976150511423e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 15066591.0, + "repeat_count": 0.0, + "routers_loss": 0.0026099481619894505, + "skip_count": 0.0, + "step": 9342, + "text_loss": 0.4676157832145691 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.052490234375, + "learning_rate": 4.035784954682486e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 15069509.0, + "repeat_count": 0.0, + "routers_loss": 0.006772278342396021, + "skip_count": 1.0, + "step": 9344, + "text_loss": 0.23385995626449585 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 43.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 4.0236113724274713e-05, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 15072898.0, + "repeat_count": 1.0, + "routers_loss": 0.0005968905170448124, + "skip_count": 0.0, + "step": 9346, + "text_loss": 0.6250094175338745 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 4.011455408411302e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 15075547.0, + "repeat_count": 0.0, + "routers_loss": 0.012884319759905338, + "skip_count": 2.0, + "step": 9348, + "text_loss": 0.23720405995845795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 3.9993170672921794e-05, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 15078902.0, + "repeat_count": 0.0, + "routers_loss": 0.0018171088304370642, + "skip_count": 0.0, + "step": 9350, + "text_loss": 0.23975110054016113 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 43.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0458984375, + "learning_rate": 3.9871963537215284e-05, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 15082292.0, + "repeat_count": 1.0, + "routers_loss": 0.001974726328626275, + "skip_count": 1.0, + "step": 9352, + "text_loss": 0.354034423828125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 3.975093272344038e-05, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 15085288.0, + "repeat_count": 0.0, + "routers_loss": 0.0014760299818590283, + "skip_count": 0.0, + "step": 9354, + "text_loss": 0.6398947834968567 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 43.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 3.963007827797627e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 15089089.0, + "repeat_count": 0.0, + "routers_loss": 0.004467889666557312, + "skip_count": 3.0, + "step": 9356, + "text_loss": 0.26422595977783203 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 3.950940024713462e-05, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 15092178.0, + "repeat_count": 0.0, + "routers_loss": 0.0048953029327094555, + "skip_count": 1.0, + "step": 9358, + "text_loss": 0.7519236207008362 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 43.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 3.9388898677159446e-05, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 15094825.0, + "repeat_count": 1.0, + "routers_loss": 0.004229324869811535, + "skip_count": 1.0, + "step": 9360, + "text_loss": 0.522379457950592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 43.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 3.9268573614227146e-05, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 15098119.0, + "repeat_count": 0.0, + "routers_loss": 0.0028480603359639645, + "skip_count": 3.0, + "step": 9362, + "text_loss": 0.47443902492523193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 3.914842510444666e-05, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 15101362.0, + "repeat_count": 0.0, + "routers_loss": 0.0024998984299600124, + "skip_count": 1.0, + "step": 9364, + "text_loss": 0.6255060434341431 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 3.9028453193859006e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 15104544.0, + "repeat_count": 0.0, + "routers_loss": 0.008692052215337753, + "skip_count": 1.0, + "step": 9366, + "text_loss": 0.26974618434906006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 3.890865792843768e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 15107619.0, + "repeat_count": 0.0, + "routers_loss": 0.002779777627438307, + "skip_count": 2.0, + "step": 9368, + "text_loss": 0.4157184064388275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 3.878903935408845e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 15111352.0, + "repeat_count": 0.0, + "routers_loss": 0.0010220289696007967, + "skip_count": 0.0, + "step": 9370, + "text_loss": 0.5674155950546265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 3.866959751664939e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 15114088.0, + "repeat_count": 0.0, + "routers_loss": 0.004387985449284315, + "skip_count": 1.0, + "step": 9372, + "text_loss": 0.3638002276420593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 3.8550332461890824e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 15117271.0, + "repeat_count": 0.0, + "routers_loss": 0.0005855522467754781, + "skip_count": 0.0, + "step": 9374, + "text_loss": 0.6257871389389038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 3.843124423551536e-05, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 15119936.0, + "repeat_count": 0.0, + "routers_loss": 0.0026496360078454018, + "skip_count": 0.0, + "step": 9376, + "text_loss": 0.7019506096839905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 3.8312332883157774e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 15123407.0, + "repeat_count": 0.0, + "routers_loss": 0.0024072150699794292, + "skip_count": 0.0, + "step": 9378, + "text_loss": 0.45380696654319763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 3.819359845038517e-05, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 15126742.0, + "repeat_count": 0.0, + "routers_loss": 0.00031929166289046407, + "skip_count": 0.0, + "step": 9380, + "text_loss": 0.5322204828262329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 3.807504098269682e-05, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 15130854.0, + "repeat_count": 0.0, + "routers_loss": 0.00177620945032686, + "skip_count": 0.0, + "step": 9382, + "text_loss": 0.5220870971679688 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 44.05635456413267, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.02783203125, + "learning_rate": 3.7956660525524156e-05, + "loss": 0.0071, + "macro_f1": 0.8823530077934265, + "num_tokens": 15135054.0, + "repeat_count": 1.0, + "routers_loss": 0.013358182273805141, + "skip_count": 2.0, + "step": 9384, + "text_loss": 0.39796701073646545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 3.783845712423067e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 15139179.0, + "repeat_count": 0.0, + "routers_loss": 0.0030253338627517223, + "skip_count": 0.0, + "step": 9386, + "text_loss": 0.13592341542243958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 3.772043082411236e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 15142436.0, + "repeat_count": 0.0, + "routers_loss": 0.0008311813580803573, + "skip_count": 0.0, + "step": 9388, + "text_loss": 0.7804215550422668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 3.760258167039704e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 15146071.0, + "repeat_count": 0.0, + "routers_loss": 0.012432600371539593, + "skip_count": 1.0, + "step": 9390, + "text_loss": 0.37692421674728394 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8571428656578064, + "avg_layers": 23.0, + "epoch": 44.09392427355445, + "f1_execute": 0.9756097793579102, + "f1_repeat": 1.0, + "f1_skip": 0.9230769276618958, + "grad_norm": 0.053955078125, + "learning_rate": 3.748490970824464e-05, + "loss": 0.0074, + "macro_f1": 0.9662289023399353, + "num_tokens": 15149020.0, + "repeat_count": 1.0, + "routers_loss": 0.03158312290906906, + "skip_count": 7.0, + "step": 9392, + "text_loss": 0.6111845374107361 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0166015625, + "learning_rate": 3.7367414982747374e-05, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 15151887.0, + "repeat_count": 0.0, + "routers_loss": 0.000898235070053488, + "skip_count": 0.0, + "step": 9394, + "text_loss": 0.42988476157188416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 3.7250097538929384e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 15155395.0, + "repeat_count": 0.0, + "routers_loss": 0.0024584042839705944, + "skip_count": 1.0, + "step": 9396, + "text_loss": 0.4083070456981659 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 3.713295742174694e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 15158275.0, + "repeat_count": 0.0, + "routers_loss": 0.0012269694125279784, + "skip_count": 0.0, + "step": 9398, + "text_loss": 0.529385507106781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 3.701599467608835e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 15161533.0, + "repeat_count": 0.0, + "routers_loss": 0.002610012423247099, + "skip_count": 1.0, + "step": 9400, + "text_loss": 0.1785552203655243 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 44.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 3.6899209346773986e-05, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 15164799.0, + "repeat_count": 1.0, + "routers_loss": 0.0012146600056439638, + "skip_count": 0.0, + "step": 9402, + "text_loss": 0.9209059476852417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 3.678260147855628e-05, + "loss": 0.0028, + "macro_f1": 0.6666666865348816, + "num_tokens": 15168111.0, + "repeat_count": 0.0, + "routers_loss": 0.001716976286843419, + "skip_count": 1.0, + "step": 9404, + "text_loss": 0.5762659907341003 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0213623046875, + "learning_rate": 3.6666171116119474e-05, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 15171285.0, + "repeat_count": 1.0, + "routers_loss": 0.005656248424202204, + "skip_count": 2.0, + "step": 9406, + "text_loss": 0.3065127432346344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 3.6549918304079946e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 15174838.0, + "repeat_count": 0.0, + "routers_loss": 0.002362997969612479, + "skip_count": 2.0, + "step": 9408, + "text_loss": 0.5256759524345398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 3.643384308698594e-05, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 15177713.0, + "repeat_count": 0.0, + "routers_loss": 0.002327109221369028, + "skip_count": 1.0, + "step": 9410, + "text_loss": 0.27613985538482666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 44.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 3.6317945509317716e-05, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 15180863.0, + "repeat_count": 1.0, + "routers_loss": 0.008501979522407055, + "skip_count": 0.0, + "step": 9412, + "text_loss": 0.3379829525947571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 3.6202225615487525e-05, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 15184531.0, + "repeat_count": 0.0, + "routers_loss": 0.004115676507353783, + "skip_count": 0.0, + "step": 9414, + "text_loss": 0.24313601851463318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 3.6086683449839454e-05, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 15187699.0, + "repeat_count": 0.0, + "routers_loss": 0.0017425924306735396, + "skip_count": 0.0, + "step": 9416, + "text_loss": 0.47485142946243286 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 44.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 3.597131905664935e-05, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 15190528.0, + "repeat_count": 1.0, + "routers_loss": 0.0031498887110501528, + "skip_count": 1.0, + "step": 9418, + "text_loss": 0.5356660485267639 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 3.585613248012515e-05, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 15194165.0, + "repeat_count": 0.0, + "routers_loss": 0.006833057850599289, + "skip_count": 1.0, + "step": 9420, + "text_loss": 0.21593274176120758 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 3.574112376440658e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 15197612.0, + "repeat_count": 0.0, + "routers_loss": 0.0013788710348308086, + "skip_count": 1.0, + "step": 9422, + "text_loss": 0.5275097489356995 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022216796875, + "learning_rate": 3.5626292953565175e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 15201103.0, + "repeat_count": 0.0, + "routers_loss": 0.0021296890918165445, + "skip_count": 0.0, + "step": 9424, + "text_loss": 0.3420610725879669 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 3.551164009160429e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 15204007.0, + "repeat_count": 0.0, + "routers_loss": 0.0025281559210270643, + "skip_count": 0.0, + "step": 9426, + "text_loss": 0.4756413996219635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 3.539716522245917e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 15208066.0, + "repeat_count": 0.0, + "routers_loss": 0.0008577071712352335, + "skip_count": 0.0, + "step": 9428, + "text_loss": 0.7672523260116577 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 44.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 3.528286838999672e-05, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 15211118.0, + "repeat_count": 1.0, + "routers_loss": 0.002977409167215228, + "skip_count": 0.0, + "step": 9430, + "text_loss": 0.5010796785354614 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 44.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 3.5168749638015806e-05, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 15214245.0, + "repeat_count": 1.0, + "routers_loss": 0.0009552660631015897, + "skip_count": 0.0, + "step": 9432, + "text_loss": 0.6633321642875671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 3.505480901024677e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 15217449.0, + "repeat_count": 0.0, + "routers_loss": 0.005598205607384443, + "skip_count": 2.0, + "step": 9434, + "text_loss": 0.545702338218689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 44.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 3.494104655035213e-05, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 15220391.0, + "repeat_count": 0.0, + "routers_loss": 0.0154950562864542, + "skip_count": 4.0, + "step": 9436, + "text_loss": 0.211164191365242 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 3.4827462301925735e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 15224061.0, + "repeat_count": 0.0, + "routers_loss": 0.001531782210804522, + "skip_count": 0.0, + "step": 9438, + "text_loss": 0.49369096755981445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 3.471405630849328e-05, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 15227586.0, + "repeat_count": 0.0, + "routers_loss": 0.004152537789195776, + "skip_count": 1.0, + "step": 9440, + "text_loss": 0.1624782234430313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046875, + "learning_rate": 3.4600828613512156e-05, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 15230713.0, + "repeat_count": 0.0, + "routers_loss": 0.0026113570202142, + "skip_count": 0.0, + "step": 9442, + "text_loss": 0.1921689808368683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 44.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 3.44877792603715e-05, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 15233925.0, + "repeat_count": 0.0, + "routers_loss": 0.008077848702669144, + "skip_count": 3.0, + "step": 9444, + "text_loss": 0.32417818903923035 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 3.437490829239193e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 15236684.0, + "repeat_count": 0.0, + "routers_loss": 0.0005273211863823235, + "skip_count": 0.0, + "step": 9446, + "text_loss": 0.3497772812843323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 3.4262215752825895e-05, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 15239866.0, + "repeat_count": 0.0, + "routers_loss": 0.0015295564662665129, + "skip_count": 0.0, + "step": 9448, + "text_loss": 0.7613807320594788 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 44.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 3.414970168485737e-05, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 15243615.0, + "repeat_count": 1.0, + "routers_loss": 0.0039047773461788893, + "skip_count": 0.0, + "step": 9450, + "text_loss": 0.3325706720352173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.375697094217784, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 3.403736613160191e-05, + "loss": 0.0049, + "macro_f1": 0.32098764181137085, + "num_tokens": 15246714.0, + "repeat_count": 0.0, + "routers_loss": 0.0300968699157238, + "skip_count": 2.0, + "step": 9452, + "text_loss": 0.3441869020462036 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 44.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 3.392520913610681e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 15249520.0, + "repeat_count": 1.0, + "routers_loss": 0.0037529836408793926, + "skip_count": 0.0, + "step": 9454, + "text_loss": 0.5083104968070984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 3.381323074135073e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 15252527.0, + "repeat_count": 0.0, + "routers_loss": 0.0019368440844118595, + "skip_count": 2.0, + "step": 9456, + "text_loss": 0.49744489789009094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 3.3701430990244085e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 15255330.0, + "repeat_count": 0.0, + "routers_loss": 0.0033424650318920612, + "skip_count": 1.0, + "step": 9458, + "text_loss": 0.5603348016738892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 3.35898099256286e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 15257961.0, + "repeat_count": 0.0, + "routers_loss": 0.0006928095244802535, + "skip_count": 0.0, + "step": 9460, + "text_loss": 0.5270714163780212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 3.347836759027789e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 15261137.0, + "repeat_count": 0.0, + "routers_loss": 0.0030718250200152397, + "skip_count": 2.0, + "step": 9462, + "text_loss": 0.11651179939508438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.43205165835045, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 3.33671040268968e-05, + "loss": 0.0064, + "macro_f1": 0.6601307392120361, + "num_tokens": 15264234.0, + "repeat_count": 1.0, + "routers_loss": 0.03508305177092552, + "skip_count": 2.0, + "step": 9464, + "text_loss": 0.14562347531318665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.441444085705896, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 3.3256019278121717e-05, + "loss": 0.0066, + "macro_f1": 0.3272727429866791, + "num_tokens": 15267047.0, + "repeat_count": 0.0, + "routers_loss": 0.008365205489099026, + "skip_count": 1.0, + "step": 9466, + "text_loss": 0.8550931215286255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 3.3145113386520485e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 15270442.0, + "repeat_count": 0.0, + "routers_loss": 0.0036910634953528643, + "skip_count": 0.0, + "step": 9468, + "text_loss": 0.24741731584072113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 3.30343863945925e-05, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 15273845.0, + "repeat_count": 0.0, + "routers_loss": 0.0014966290909796953, + "skip_count": 0.0, + "step": 9470, + "text_loss": 0.5137372612953186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 3.2923838344768534e-05, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 15277940.0, + "repeat_count": 0.0, + "routers_loss": 0.0028104602824896574, + "skip_count": 0.0, + "step": 9472, + "text_loss": 0.5737728476524353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 3.281346927941087e-05, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 15281640.0, + "repeat_count": 0.0, + "routers_loss": 0.007870957255363464, + "skip_count": 2.0, + "step": 9474, + "text_loss": 0.27684518694877625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 3.270327924081301e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 15284877.0, + "repeat_count": 0.0, + "routers_loss": 0.006224945653229952, + "skip_count": 0.0, + "step": 9476, + "text_loss": 0.35599255561828613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 3.259326827120013e-05, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 15287945.0, + "repeat_count": 0.0, + "routers_loss": 0.001179040758870542, + "skip_count": 0.0, + "step": 9478, + "text_loss": 0.26802319288253784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 3.2483436412728553e-05, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 15290754.0, + "repeat_count": 0.0, + "routers_loss": 0.001992281526327133, + "skip_count": 0.0, + "step": 9480, + "text_loss": 0.40124714374542236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 3.2373783707486057e-05, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 15294841.0, + "repeat_count": 0.0, + "routers_loss": 0.0012830843916162848, + "skip_count": 0.0, + "step": 9482, + "text_loss": 0.6739225387573242 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 3.226431019749171e-05, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 15298397.0, + "repeat_count": 0.0, + "routers_loss": 0.003624147269874811, + "skip_count": 2.0, + "step": 9484, + "text_loss": 0.5250326991081238 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.016357421875, + "learning_rate": 3.2155015924696105e-05, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 15301499.0, + "repeat_count": 0.0, + "routers_loss": 0.0019682408310472965, + "skip_count": 0.0, + "step": 9486, + "text_loss": 0.5574567317962646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 3.204590093098098e-05, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 15304531.0, + "repeat_count": 0.0, + "routers_loss": 0.002245094161480665, + "skip_count": 0.0, + "step": 9488, + "text_loss": 0.4065501093864441 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 3.1936965258159366e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 15307826.0, + "repeat_count": 0.0, + "routers_loss": 0.002919224789366126, + "skip_count": 1.0, + "step": 9490, + "text_loss": 0.5183609127998352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 3.1828208947975615e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 15311420.0, + "repeat_count": 0.0, + "routers_loss": 0.004961747210472822, + "skip_count": 1.0, + "step": 9492, + "text_loss": 0.1962234377861023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 3.171963204210537e-05, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 15314196.0, + "repeat_count": 0.0, + "routers_loss": 0.0026044815313071012, + "skip_count": 0.0, + "step": 9494, + "text_loss": 0.223251610994339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 3.161123458215553e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 15317174.0, + "repeat_count": 0.0, + "routers_loss": 0.0029661289881914854, + "skip_count": 0.0, + "step": 9496, + "text_loss": 0.32970958948135376 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 3.150301660966415e-05, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 15320343.0, + "repeat_count": 0.0, + "routers_loss": 0.0011696632718667388, + "skip_count": 0.0, + "step": 9498, + "text_loss": 0.8590811491012573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 3.13949781661006e-05, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 15324138.0, + "repeat_count": 0.0, + "routers_loss": 0.0015035583637654781, + "skip_count": 0.0, + "step": 9500, + "text_loss": 0.6658036708831787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 3.1287119292865375e-05, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 15328395.0, + "repeat_count": 0.0, + "routers_loss": 0.001930502592585981, + "skip_count": 0.0, + "step": 9502, + "text_loss": 0.4104210138320923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 3.117944003129025e-05, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 15332196.0, + "repeat_count": 0.0, + "routers_loss": 0.0010025398805737495, + "skip_count": 0.0, + "step": 9504, + "text_loss": 0.7272399663925171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 44.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 3.107194042263806e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 15335253.0, + "repeat_count": 1.0, + "routers_loss": 0.004520092159509659, + "skip_count": 0.0, + "step": 9506, + "text_loss": 0.29173022508621216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 3.096462050810284e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 15338129.0, + "repeat_count": 0.0, + "routers_loss": 0.0009707154240459204, + "skip_count": 0.0, + "step": 9508, + "text_loss": 0.6530287861824036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 3.0857480328809916e-05, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 15341487.0, + "repeat_count": 0.0, + "routers_loss": 0.0008689566748216748, + "skip_count": 0.0, + "step": 9510, + "text_loss": 0.36988505721092224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 3.0750519925815565e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 15344460.0, + "repeat_count": 0.0, + "routers_loss": 0.0022587007842957973, + "skip_count": 0.0, + "step": 9512, + "text_loss": 0.2447768598794937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 3.064373934010711e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 15348135.0, + "repeat_count": 0.0, + "routers_loss": 0.001986770424991846, + "skip_count": 0.0, + "step": 9514, + "text_loss": 0.43159469962120056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 3.053713861260321e-05, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 15351073.0, + "repeat_count": 0.0, + "routers_loss": 0.0003514432755764574, + "skip_count": 0.0, + "step": 9516, + "text_loss": 0.3638324737548828 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 3.043071778415335e-05, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 15353633.0, + "repeat_count": 0.0, + "routers_loss": 0.003395392093807459, + "skip_count": 0.0, + "step": 9518, + "text_loss": 0.5728140473365784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 3.03244768955383e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 15357322.0, + "repeat_count": 0.0, + "routers_loss": 0.0016641782131046057, + "skip_count": 0.0, + "step": 9520, + "text_loss": 0.666814386844635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0177001953125, + "learning_rate": 3.021841598746966e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 15360771.0, + "repeat_count": 0.0, + "routers_loss": 0.0024721708614379168, + "skip_count": 0.0, + "step": 9522, + "text_loss": 0.7148030400276184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 3.01125351005902e-05, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 15364281.0, + "repeat_count": 0.0, + "routers_loss": 0.004133665468543768, + "skip_count": 0.0, + "step": 9524, + "text_loss": 0.2985752820968628 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 3.0006834275473737e-05, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 15367354.0, + "repeat_count": 0.0, + "routers_loss": 0.003016186412423849, + "skip_count": 1.0, + "step": 9526, + "text_loss": 0.22689883410930634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 44.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01531982421875, + "learning_rate": 2.9901313552624932e-05, + "loss": 0.003, + "macro_f1": 1.0, + "num_tokens": 15371027.0, + "repeat_count": 1.0, + "routers_loss": 0.015333639457821846, + "skip_count": 7.0, + "step": 9528, + "text_loss": 0.8308720588684082 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 2.97959729724796e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 15373948.0, + "repeat_count": 0.0, + "routers_loss": 0.001420815708115697, + "skip_count": 0.0, + "step": 9530, + "text_loss": 0.5439777970314026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 2.9690812575404456e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 15377366.0, + "repeat_count": 0.0, + "routers_loss": 0.0007130459416657686, + "skip_count": 0.0, + "step": 9532, + "text_loss": 0.45405295491218567 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.76078661579102, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 2.95858324016971e-05, + "loss": 0.0067, + "macro_f1": 0.3272727429866791, + "num_tokens": 15380115.0, + "repeat_count": 1.0, + "routers_loss": 0.04256885498762131, + "skip_count": 0.0, + "step": 9534, + "text_loss": 0.39998912811279297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 44.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 2.9481032491586178e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 15383205.0, + "repeat_count": 0.0, + "routers_loss": 0.004944019019603729, + "skip_count": 4.0, + "step": 9536, + "text_loss": 0.1882237195968628 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 2.937641288523124e-05, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 15386619.0, + "repeat_count": 0.0, + "routers_loss": 0.007820523343980312, + "skip_count": 1.0, + "step": 9538, + "text_loss": 0.26401394605636597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 2.9271973622722603e-05, + "loss": 0.0026, + "macro_f1": 0.3333333432674408, + "num_tokens": 15389135.0, + "repeat_count": 0.0, + "routers_loss": 0.0010751578956842422, + "skip_count": 0.0, + "step": 9540, + "text_loss": 0.39813846349716187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 2.9167714744081643e-05, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 15392150.0, + "repeat_count": 0.0, + "routers_loss": 0.0031554463785141706, + "skip_count": 2.0, + "step": 9542, + "text_loss": 0.669784665107727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 2.9063636289260677e-05, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 15394974.0, + "repeat_count": 0.0, + "routers_loss": 0.00287301791831851, + "skip_count": 1.0, + "step": 9544, + "text_loss": 0.176493301987648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 2.8959738298142635e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 15398432.0, + "repeat_count": 0.0, + "routers_loss": 0.0011708475649356842, + "skip_count": 0.0, + "step": 9546, + "text_loss": 0.8762983083724976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 2.885602081054145e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 15401121.0, + "repeat_count": 0.0, + "routers_loss": 0.003167103510349989, + "skip_count": 1.0, + "step": 9548, + "text_loss": 0.2538717985153198 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 44.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 2.8752483866201885e-05, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 15404105.0, + "repeat_count": 1.0, + "routers_loss": 0.007552143186330795, + "skip_count": 5.0, + "step": 9550, + "text_loss": 0.37045153975486755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 2.8649127504799423e-05, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 15407232.0, + "repeat_count": 1.0, + "routers_loss": 0.007718692068010569, + "skip_count": 2.0, + "step": 9552, + "text_loss": 0.15780900418758392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 2.8545951765940547e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 15410425.0, + "repeat_count": 0.0, + "routers_loss": 0.0003527951193973422, + "skip_count": 0.0, + "step": 9554, + "text_loss": 0.5931823253631592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 2.8442956689162193e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 15413724.0, + "repeat_count": 0.0, + "routers_loss": 0.00146177364513278, + "skip_count": 0.0, + "step": 9556, + "text_loss": 0.691118061542511 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 2.8340142313932448e-05, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 15416776.0, + "repeat_count": 0.0, + "routers_loss": 0.0010256811510771513, + "skip_count": 0.0, + "step": 9558, + "text_loss": 0.40814271569252014 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 2.823750867964997e-05, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 15419815.0, + "repeat_count": 0.0, + "routers_loss": 0.0047921910881996155, + "skip_count": 0.0, + "step": 9560, + "text_loss": 0.28953713178634644 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 2.8135055825644072e-05, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 15422806.0, + "repeat_count": 0.0, + "routers_loss": 0.002010057680308819, + "skip_count": 1.0, + "step": 9562, + "text_loss": 0.8377944231033325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 2.803278379117491e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 15425405.0, + "repeat_count": 0.0, + "routers_loss": 0.005009239539504051, + "skip_count": 1.0, + "step": 9564, + "text_loss": 0.5936337113380432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 2.793069261543335e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 15428233.0, + "repeat_count": 0.0, + "routers_loss": 0.007967893034219742, + "skip_count": 2.0, + "step": 9566, + "text_loss": 0.49891290068626404 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 2.7828782337540882e-05, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 15431095.0, + "repeat_count": 2.0, + "routers_loss": 0.00638923142105341, + "skip_count": 4.0, + "step": 9568, + "text_loss": 0.30928006768226624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 44.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 2.7727052996549763e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 15434933.0, + "repeat_count": 0.0, + "routers_loss": 0.0060427505522966385, + "skip_count": 3.0, + "step": 9570, + "text_loss": 0.21274788677692413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.93924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 2.762550463144281e-05, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 15437655.0, + "repeat_count": 0.0, + "routers_loss": 0.0012480237055569887, + "skip_count": 0.0, + "step": 9572, + "text_loss": 0.31049492955207825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 2.7524137281133567e-05, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 15440643.0, + "repeat_count": 0.0, + "routers_loss": 0.005919245071709156, + "skip_count": 0.0, + "step": 9574, + "text_loss": 0.16459886729717255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 2.7422950984466233e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 15443532.0, + "repeat_count": 0.0, + "routers_loss": 0.0061412835493683815, + "skip_count": 2.0, + "step": 9576, + "text_loss": 0.7102797031402588 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 2.7321945780215573e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 15447027.0, + "repeat_count": 0.0, + "routers_loss": 0.001149018993601203, + "skip_count": 0.0, + "step": 9578, + "text_loss": 0.22778025269508362 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 2.722112170708696e-05, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 15450173.0, + "repeat_count": 0.0, + "routers_loss": 0.002216014079749584, + "skip_count": 0.0, + "step": 9580, + "text_loss": 0.21447396278381348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 2.7120478803716264e-05, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 15452838.0, + "repeat_count": 0.0, + "routers_loss": 0.00498749827966094, + "skip_count": 0.0, + "step": 9582, + "text_loss": 0.1664455235004425 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 2.7020017108670246e-05, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 15455928.0, + "repeat_count": 1.0, + "routers_loss": 0.005886784754693508, + "skip_count": 3.0, + "step": 9584, + "text_loss": 0.3929266631603241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 2.691973666044589e-05, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 15459447.0, + "repeat_count": 0.0, + "routers_loss": 0.0029895263724029064, + "skip_count": 1.0, + "step": 9586, + "text_loss": 0.27535343170166016 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 2.681963749747085e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 15462340.0, + "repeat_count": 1.0, + "routers_loss": 0.0038893253076821566, + "skip_count": 0.0, + "step": 9588, + "text_loss": 0.6950465440750122 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 2.671971965810338e-05, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 15465432.0, + "repeat_count": 1.0, + "routers_loss": 0.0016947018448263407, + "skip_count": 0.0, + "step": 9590, + "text_loss": 0.41451266407966614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 2.6619983180632134e-05, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 15468300.0, + "repeat_count": 0.0, + "routers_loss": 0.0011597154662013054, + "skip_count": 0.0, + "step": 9592, + "text_loss": 0.5846080780029297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 2.6520428103276316e-05, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 15471084.0, + "repeat_count": 0.0, + "routers_loss": 0.005555236246436834, + "skip_count": 2.0, + "step": 9594, + "text_loss": 0.4151473939418793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 2.6421054464185633e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 15474348.0, + "repeat_count": 0.0, + "routers_loss": 0.0015279205981642008, + "skip_count": 0.0, + "step": 9596, + "text_loss": 0.28742483258247375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 2.6321862301440234e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 15477493.0, + "repeat_count": 0.0, + "routers_loss": 0.0019169533625245094, + "skip_count": 0.0, + "step": 9598, + "text_loss": 0.338019460439682 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048095703125, + "learning_rate": 2.6222851653050773e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 15480257.0, + "repeat_count": 0.0, + "routers_loss": 0.0015131557593122125, + "skip_count": 1.0, + "step": 9600, + "text_loss": 0.5982558727264404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 2.612402255695828e-05, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 15482838.0, + "repeat_count": 0.0, + "routers_loss": 0.0026768618263304234, + "skip_count": 0.0, + "step": 9602, + "text_loss": 0.32012176513671875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 2.6025375051034306e-05, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 15485746.0, + "repeat_count": 0.0, + "routers_loss": 0.002152341417968273, + "skip_count": 0.0, + "step": 9604, + "text_loss": 0.16942192614078522 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 45.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 2.5926909173080658e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 15488669.0, + "repeat_count": 0.0, + "routers_loss": 0.003325721947476268, + "skip_count": 3.0, + "step": 9606, + "text_loss": 0.47950080037117004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 2.582862496082977e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 15491512.0, + "repeat_count": 0.0, + "routers_loss": 0.0023114588111639023, + "skip_count": 1.0, + "step": 9608, + "text_loss": 0.3907585144042969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 2.5730522451944292e-05, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 15494479.0, + "repeat_count": 0.0, + "routers_loss": 0.003140041371807456, + "skip_count": 2.0, + "step": 9610, + "text_loss": 0.198005810379982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 2.5632601684017264e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 15497900.0, + "repeat_count": 0.0, + "routers_loss": 0.0015117402654141188, + "skip_count": 0.0, + "step": 9612, + "text_loss": 0.874154269695282 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 2.5534862694572114e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 15501817.0, + "repeat_count": 0.0, + "routers_loss": 0.00551232136785984, + "skip_count": 2.0, + "step": 9614, + "text_loss": 0.1933375597000122 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 2.543730552106266e-05, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 15504872.0, + "repeat_count": 0.0, + "routers_loss": 0.001090583624318242, + "skip_count": 0.0, + "step": 9616, + "text_loss": 0.4030717611312866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 2.533993020087294e-05, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 15507727.0, + "repeat_count": 0.0, + "routers_loss": 0.007001800462603569, + "skip_count": 0.0, + "step": 9618, + "text_loss": 0.4812186062335968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 2.5242736771317333e-05, + "loss": 0.0025, + "macro_f1": 0.3333333432674408, + "num_tokens": 15510689.0, + "repeat_count": 0.0, + "routers_loss": 0.0016861478798091412, + "skip_count": 0.0, + "step": 9620, + "text_loss": 0.4578339457511902 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.17375990607572, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.05517578125, + "learning_rate": 2.514572526964065e-05, + "loss": 0.0068, + "macro_f1": 0.8817967176437378, + "num_tokens": 15513419.0, + "repeat_count": 2.0, + "routers_loss": 0.050852373242378235, + "skip_count": 3.0, + "step": 9622, + "text_loss": 0.4038950204849243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 2.5048895733017772e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 15516289.0, + "repeat_count": 0.0, + "routers_loss": 0.0015001936117187142, + "skip_count": 0.0, + "step": 9624, + "text_loss": 0.8331962823867798 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 2.4952248198554073e-05, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 15519476.0, + "repeat_count": 0.0, + "routers_loss": 0.0009114370332099497, + "skip_count": 1.0, + "step": 9626, + "text_loss": 0.4997985363006592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017822265625, + "learning_rate": 2.4855782703284925e-05, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 15523363.0, + "repeat_count": 0.0, + "routers_loss": 0.0011186953634023666, + "skip_count": 0.0, + "step": 9628, + "text_loss": 0.2572024464607239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 45.211329615497505, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0262451171875, + "learning_rate": 2.4759499284176145e-05, + "loss": 0.0059, + "macro_f1": 0.6122449040412903, + "num_tokens": 15526289.0, + "repeat_count": 0.0, + "routers_loss": 0.019600817933678627, + "skip_count": 4.0, + "step": 9630, + "text_loss": 0.6323924660682678 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 45.22072204285295, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 2.466339797812378e-05, + "loss": 0.0065, + "macro_f1": 0.9265305995941162, + "num_tokens": 15530260.0, + "repeat_count": 3.0, + "routers_loss": 0.02459629252552986, + "skip_count": 1.0, + "step": 9632, + "text_loss": 0.1824527233839035 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 45.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 2.4567478821954038e-05, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 15533916.0, + "repeat_count": 2.0, + "routers_loss": 0.009077859111130238, + "skip_count": 2.0, + "step": 9634, + "text_loss": 0.4518069326877594 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 2.4471741852423235e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 15536958.0, + "repeat_count": 1.0, + "routers_loss": 0.002355317585170269, + "skip_count": 0.0, + "step": 9636, + "text_loss": 0.8873519897460938 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 2.437618710621803e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 15540544.0, + "repeat_count": 0.0, + "routers_loss": 0.001198371173813939, + "skip_count": 0.0, + "step": 9638, + "text_loss": 0.4845949709415436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 2.4280814619955128e-05, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 15543355.0, + "repeat_count": 0.0, + "routers_loss": 0.0009287866414524615, + "skip_count": 0.0, + "step": 9640, + "text_loss": 0.5979563593864441 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 2.4185624430181464e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 15547215.0, + "repeat_count": 0.0, + "routers_loss": 0.0028763876762241125, + "skip_count": 0.0, + "step": 9642, + "text_loss": 0.16279318928718567 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0196533203125, + "learning_rate": 2.4090616573374135e-05, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 15550412.0, + "repeat_count": 0.0, + "routers_loss": 0.0013361044693738222, + "skip_count": 0.0, + "step": 9644, + "text_loss": 0.2864333987236023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 45.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 2.3995791085940244e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 15553660.0, + "repeat_count": 2.0, + "routers_loss": 0.0019316677935421467, + "skip_count": 0.0, + "step": 9646, + "text_loss": 0.6333117485046387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023193359375, + "learning_rate": 2.390114800421722e-05, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 15556287.0, + "repeat_count": 0.0, + "routers_loss": 0.0011288017267361283, + "skip_count": 1.0, + "step": 9648, + "text_loss": 0.6050677299499512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 2.380668736447239e-05, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 15559246.0, + "repeat_count": 0.0, + "routers_loss": 0.0014249378582462668, + "skip_count": 0.0, + "step": 9650, + "text_loss": 0.9484158754348755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 45.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 2.371240920290324e-05, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 15562251.0, + "repeat_count": 1.0, + "routers_loss": 0.00741320988163352, + "skip_count": 4.0, + "step": 9652, + "text_loss": 0.24387991428375244 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 2.361831355563726e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 15565704.0, + "repeat_count": 1.0, + "routers_loss": 0.000942508690059185, + "skip_count": 0.0, + "step": 9654, + "text_loss": 0.6523539423942566 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 45.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 2.352440045873233e-05, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 15568797.0, + "repeat_count": 1.0, + "routers_loss": 0.0064352210611104965, + "skip_count": 4.0, + "step": 9656, + "text_loss": 0.3206343650817871 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 2.3430669948175943e-05, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 15571855.0, + "repeat_count": 1.0, + "routers_loss": 0.0013390982057899237, + "skip_count": 0.0, + "step": 9658, + "text_loss": 0.8397402763366699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 2.3337122059885806e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 15575379.0, + "repeat_count": 0.0, + "routers_loss": 0.0012212366564199328, + "skip_count": 0.0, + "step": 9660, + "text_loss": 0.5116108655929565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 2.324375682970975e-05, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 15578108.0, + "repeat_count": 0.0, + "routers_loss": 0.003829900873824954, + "skip_count": 0.0, + "step": 9662, + "text_loss": 0.1423535794019699 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 45.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 2.3150574293425376e-05, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 15581830.0, + "repeat_count": 1.0, + "routers_loss": 0.012756838463246822, + "skip_count": 1.0, + "step": 9664, + "text_loss": 0.24676625430583954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 2.3057574486740507e-05, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 15584872.0, + "repeat_count": 0.0, + "routers_loss": 0.0020642473828047514, + "skip_count": 0.0, + "step": 9666, + "text_loss": 0.4851650893688202 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0184326171875, + "learning_rate": 2.2964757445292806e-05, + "loss": 0.0029, + "macro_f1": 1.0, + "num_tokens": 15588000.0, + "repeat_count": 2.0, + "routers_loss": 0.007441115565598011, + "skip_count": 3.0, + "step": 9668, + "text_loss": 0.6416954398155212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017333984375, + "learning_rate": 2.287212320464993e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 15591065.0, + "repeat_count": 0.0, + "routers_loss": 0.0015504831681028008, + "skip_count": 0.0, + "step": 9670, + "text_loss": 0.5852687358856201 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 45.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 2.2779671800309433e-05, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 15594631.0, + "repeat_count": 2.0, + "routers_loss": 0.005648284684866667, + "skip_count": 2.0, + "step": 9672, + "text_loss": 0.7172279357910156 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 2.2687403267699024e-05, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 15598664.0, + "repeat_count": 1.0, + "routers_loss": 0.003756999270990491, + "skip_count": 2.0, + "step": 9674, + "text_loss": 0.18986566364765167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 2.259531764217604e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 15601616.0, + "repeat_count": 0.0, + "routers_loss": 0.002155672525987029, + "skip_count": 0.0, + "step": 9676, + "text_loss": 0.4410690367221832 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 2.250341495902797e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 15604291.0, + "repeat_count": 1.0, + "routers_loss": 0.0020037787035107613, + "skip_count": 0.0, + "step": 9678, + "text_loss": 0.5565816164016724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 2.241169525347203e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 15607203.0, + "repeat_count": 0.0, + "routers_loss": 0.0014305647928267717, + "skip_count": 0.0, + "step": 9680, + "text_loss": 0.4879189729690552 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 2.2320158560655447e-05, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 15610475.0, + "repeat_count": 1.0, + "routers_loss": 0.016029199585318565, + "skip_count": 3.0, + "step": 9682, + "text_loss": 0.36342933773994446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 2.2228804915655153e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 15613810.0, + "repeat_count": 0.0, + "routers_loss": 0.0023584216833114624, + "skip_count": 0.0, + "step": 9684, + "text_loss": 0.18480375409126282 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 2.2137634353478043e-05, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 15617854.0, + "repeat_count": 0.0, + "routers_loss": 0.004325680434703827, + "skip_count": 1.0, + "step": 9686, + "text_loss": 0.5345974564552307 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 45.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 2.2046646909060996e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 15620874.0, + "repeat_count": 3.0, + "routers_loss": 0.006946994923055172, + "skip_count": 0.0, + "step": 9688, + "text_loss": 0.29016008973121643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.49310243616085, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 2.195584261727046e-05, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 15623875.0, + "repeat_count": 0.0, + "routers_loss": 0.0034732038620859385, + "skip_count": 1.0, + "step": 9690, + "text_loss": 0.2831312119960785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 2.1865221512902766e-05, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 15626371.0, + "repeat_count": 0.0, + "routers_loss": 0.002495788736268878, + "skip_count": 1.0, + "step": 9692, + "text_loss": 0.6090453267097473 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 45.511887290871734, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 2.1774783630684246e-05, + "loss": 0.0076, + "macro_f1": 0.6598639488220215, + "num_tokens": 15630129.0, + "repeat_count": 3.0, + "routers_loss": 0.017551302909851074, + "skip_count": 1.0, + "step": 9694, + "text_loss": 0.5127915740013123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 2.168452900527068e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 15633179.0, + "repeat_count": 0.0, + "routers_loss": 0.0004413482965901494, + "skip_count": 0.0, + "step": 9696, + "text_loss": 0.5901434421539307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 2.159445767124796e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 15636508.0, + "repeat_count": 0.0, + "routers_loss": 0.005992567166686058, + "skip_count": 1.0, + "step": 9698, + "text_loss": 0.8493689298629761 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 2.1504569663131523e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 15639371.0, + "repeat_count": 1.0, + "routers_loss": 0.0033268092665821314, + "skip_count": 0.0, + "step": 9700, + "text_loss": 0.2814267873764038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 2.1414865015366548e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 15643025.0, + "repeat_count": 0.0, + "routers_loss": 0.004418607335537672, + "skip_count": 0.0, + "step": 9702, + "text_loss": 0.2619725167751312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 45.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 2.1325343762328197e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 15646996.0, + "repeat_count": 0.0, + "routers_loss": 0.0050115580670535564, + "skip_count": 4.0, + "step": 9704, + "text_loss": 0.8204038143157959 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 2.123600593832109e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 15650194.0, + "repeat_count": 0.0, + "routers_loss": 0.0018730501178652048, + "skip_count": 1.0, + "step": 9706, + "text_loss": 0.694500744342804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 2.1146851577579673e-05, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 15653743.0, + "repeat_count": 0.0, + "routers_loss": 0.0016657712403684855, + "skip_count": 0.0, + "step": 9708, + "text_loss": 0.8211735486984253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 2.1057880714268064e-05, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 15657325.0, + "repeat_count": 0.0, + "routers_loss": 0.0029736643191426992, + "skip_count": 0.0, + "step": 9710, + "text_loss": 0.2846751809120178 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 45.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 2.0969093382479987e-05, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 15660522.0, + "repeat_count": 1.0, + "routers_loss": 0.01233653537929058, + "skip_count": 4.0, + "step": 9712, + "text_loss": 0.23991759121418 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 2.0880489616239062e-05, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 15663254.0, + "repeat_count": 0.0, + "routers_loss": 0.0012792183551937342, + "skip_count": 0.0, + "step": 9714, + "text_loss": 0.6943771243095398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 2.0792069449498297e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 15666283.0, + "repeat_count": 0.0, + "routers_loss": 0.0033134319819509983, + "skip_count": 0.0, + "step": 9716, + "text_loss": 0.4161235988140106 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 45.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 2.0703832916140476e-05, + "loss": 0.0034, + "macro_f1": 1.0, + "num_tokens": 15669774.0, + "repeat_count": 2.0, + "routers_loss": 0.006201022770255804, + "skip_count": 1.0, + "step": 9718, + "text_loss": 0.42691144347190857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 2.061578004997805e-05, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 15672943.0, + "repeat_count": 0.0, + "routers_loss": 0.0033355073537677526, + "skip_count": 1.0, + "step": 9720, + "text_loss": 0.9724727869033813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 2.0527910884753033e-05, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 15677847.0, + "repeat_count": 0.0, + "routers_loss": 0.0019593657925724983, + "skip_count": 0.0, + "step": 9722, + "text_loss": 0.417218416929245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 2.0440225454137097e-05, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 15681460.0, + "repeat_count": 0.0, + "routers_loss": 0.007862947881221771, + "skip_count": 2.0, + "step": 9724, + "text_loss": 0.24983589351177216 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 2.0352723791731364e-05, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 15685496.0, + "repeat_count": 1.0, + "routers_loss": 0.004811233840882778, + "skip_count": 0.0, + "step": 9726, + "text_loss": 0.32930606603622437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.8571428656578064, + "avg_layers": 22.0, + "epoch": 45.671558555914295, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.0, + "f1_skip": 0.9230769276618958, + "grad_norm": 0.045166015625, + "learning_rate": 2.0265405931066626e-05, + "loss": 0.0057, + "macro_f1": 0.633273720741272, + "num_tokens": 15688661.0, + "repeat_count": 0.0, + "routers_loss": 0.02648334763944149, + "skip_count": 7.0, + "step": 9728, + "text_loss": 0.42316386103630066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 45.68095098326974, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 2.0178271905603395e-05, + "loss": 0.0054, + "macro_f1": 0.6598639488220215, + "num_tokens": 15692778.0, + "repeat_count": 1.0, + "routers_loss": 0.04439396783709526, + "skip_count": 3.0, + "step": 9730, + "text_loss": 0.32248371839523315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 2.0091321748731517e-05, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 15695821.0, + "repeat_count": 0.0, + "routers_loss": 0.0020437403582036495, + "skip_count": 2.0, + "step": 9732, + "text_loss": 0.5959160923957825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 2.000455549377045e-05, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 15699324.0, + "repeat_count": 0.0, + "routers_loss": 0.0002844796108547598, + "skip_count": 0.0, + "step": 9734, + "text_loss": 0.45465928316116333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 1.9917973173969204e-05, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 15702044.0, + "repeat_count": 0.0, + "routers_loss": 0.003548701060935855, + "skip_count": 0.0, + "step": 9736, + "text_loss": 0.7129027843475342 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 23.0, + "epoch": 45.71852069269152, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.0279541015625, + "learning_rate": 1.9831574822506248e-05, + "loss": 0.0089, + "macro_f1": 0.6289562582969666, + "num_tokens": 15705474.0, + "repeat_count": 0.0, + "routers_loss": 0.023800918832421303, + "skip_count": 6.0, + "step": 9738, + "text_loss": 0.28479668498039246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 1.9745360472489648e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 15708323.0, + "repeat_count": 0.0, + "routers_loss": 0.01043168269097805, + "skip_count": 2.0, + "step": 9740, + "text_loss": 0.4760739803314209 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 1.9659330156956867e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 15711390.0, + "repeat_count": 0.0, + "routers_loss": 0.006430295296013355, + "skip_count": 2.0, + "step": 9742, + "text_loss": 0.13933971524238586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 45.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 1.957348390887487e-05, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 15714077.0, + "repeat_count": 0.0, + "routers_loss": 0.005738302133977413, + "skip_count": 3.0, + "step": 9744, + "text_loss": 0.49661460518836975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 1.948782176114017e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 15716818.0, + "repeat_count": 0.0, + "routers_loss": 0.0011776578612625599, + "skip_count": 0.0, + "step": 9746, + "text_loss": 0.36066678166389465 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 1.9402343746578567e-05, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 15720756.0, + "repeat_count": 0.0, + "routers_loss": 0.0005322427023202181, + "skip_count": 0.0, + "step": 9748, + "text_loss": 0.5549091696739197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 1.931704989794547e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 15724516.0, + "repeat_count": 0.0, + "routers_loss": 0.001399765140376985, + "skip_count": 0.0, + "step": 9750, + "text_loss": 0.21269696950912476 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 1.9231940247925572e-05, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 15727142.0, + "repeat_count": 0.0, + "routers_loss": 0.0018337799701839685, + "skip_count": 1.0, + "step": 9752, + "text_loss": 0.18105024099349976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 1.914701482913317e-05, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 15730023.0, + "repeat_count": 0.0, + "routers_loss": 0.0010057559702545404, + "skip_count": 0.0, + "step": 9754, + "text_loss": 0.477859228849411 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 45.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0198974609375, + "learning_rate": 1.906227367411173e-05, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 15733108.0, + "repeat_count": 0.0, + "routers_loss": 0.002486895304173231, + "skip_count": 3.0, + "step": 9756, + "text_loss": 0.4802452027797699 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 1.8977716815334335e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 15736130.0, + "repeat_count": 1.0, + "routers_loss": 0.004353616386651993, + "skip_count": 0.0, + "step": 9758, + "text_loss": 0.5479429960250854 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 1.8893344285203228e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 15738691.0, + "repeat_count": 0.0, + "routers_loss": 0.0031500225886702538, + "skip_count": 1.0, + "step": 9760, + "text_loss": 0.6871381402015686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 1.8809156116050164e-05, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 15741682.0, + "repeat_count": 0.0, + "routers_loss": 0.0023419202771037817, + "skip_count": 0.0, + "step": 9762, + "text_loss": 0.6725277900695801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 1.8725152340136163e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 15745314.0, + "repeat_count": 0.0, + "routers_loss": 0.0018769606249406934, + "skip_count": 0.0, + "step": 9764, + "text_loss": 0.4549144506454468 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 1.864133298965176e-05, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 15747982.0, + "repeat_count": 1.0, + "routers_loss": 0.0030958254355937243, + "skip_count": 2.0, + "step": 9766, + "text_loss": 0.4970727264881134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 1.8557698096716534e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 15750453.0, + "repeat_count": 0.0, + "routers_loss": 0.0020812496077269316, + "skip_count": 1.0, + "step": 9768, + "text_loss": 0.7540801167488098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 1.847424769337963e-05, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 15753857.0, + "repeat_count": 0.0, + "routers_loss": 0.0031040434259921312, + "skip_count": 0.0, + "step": 9770, + "text_loss": 0.5154248476028442 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 1.8390981811619356e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 15756742.0, + "repeat_count": 0.0, + "routers_loss": 0.002128311200067401, + "skip_count": 0.0, + "step": 9772, + "text_loss": 0.7327702045440674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 1.8307900483343354e-05, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 15759833.0, + "repeat_count": 0.0, + "routers_loss": 0.003279880853369832, + "skip_count": 1.0, + "step": 9774, + "text_loss": 0.2673797607421875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 1.8225003740388545e-05, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 15762768.0, + "repeat_count": 0.0, + "routers_loss": 0.004170822445303202, + "skip_count": 0.0, + "step": 9776, + "text_loss": 0.1820847988128662 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8888888955116272, + "avg_layers": 21.0, + "epoch": 45.90636923980041, + "f1_execute": 0.9729729890823364, + "f1_repeat": 1.0, + "f1_skip": 0.9411765336990356, + "grad_norm": 0.0194091796875, + "learning_rate": 1.8142291614521132e-05, + "loss": 0.0045, + "macro_f1": 0.9713832139968872, + "num_tokens": 15766965.0, + "repeat_count": 1.0, + "routers_loss": 0.022715313360095024, + "skip_count": 9.0, + "step": 9778, + "text_loss": 0.5590897798538208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 1.8059764137436596e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 15770199.0, + "repeat_count": 0.0, + "routers_loss": 0.007280370220541954, + "skip_count": 1.0, + "step": 9780, + "text_loss": 0.28117987513542175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 1.7977421340759582e-05, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 15773367.0, + "repeat_count": 0.0, + "routers_loss": 0.003529706271365285, + "skip_count": 0.0, + "step": 9782, + "text_loss": 0.18752245604991913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 1.7895263256044013e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 15776976.0, + "repeat_count": 0.0, + "routers_loss": 0.0025916248559951782, + "skip_count": 1.0, + "step": 9784, + "text_loss": 0.6330561637878418 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 1.781328991477299e-05, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 15780848.0, + "repeat_count": 0.0, + "routers_loss": 0.0049234069883823395, + "skip_count": 1.0, + "step": 9786, + "text_loss": 0.15685316920280457 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 45.95333137657764, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 1.7731501348358882e-05, + "loss": 0.0067, + "macro_f1": 0.8823530077934265, + "num_tokens": 15783808.0, + "repeat_count": 2.0, + "routers_loss": 0.011918511241674423, + "skip_count": 1.0, + "step": 9788, + "text_loss": 0.23963648080825806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 1.7649897588143226e-05, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 15787421.0, + "repeat_count": 0.0, + "routers_loss": 0.0018508053617551923, + "skip_count": 0.0, + "step": 9790, + "text_loss": 0.49311593174934387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 1.7568478665396736e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 15790274.0, + "repeat_count": 0.0, + "routers_loss": 0.0006157457246445119, + "skip_count": 0.0, + "step": 9792, + "text_loss": 0.4567435085773468 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 45.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 1.7487244611319285e-05, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 15794462.0, + "repeat_count": 3.0, + "routers_loss": 0.0031584864482283592, + "skip_count": 0.0, + "step": 9794, + "text_loss": 0.4325876832008362 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 1.740619545703992e-05, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 15797775.0, + "repeat_count": 0.0, + "routers_loss": 0.0028455168940126896, + "skip_count": 0.0, + "step": 9796, + "text_loss": 0.1487245261669159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 46.0, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.06201171875, + "learning_rate": 1.7325331233616847e-05, + "loss": 0.0078, + "macro_f1": 0.6122449040412903, + "num_tokens": 15801092.0, + "repeat_count": 0.0, + "routers_loss": 0.02560117095708847, + "skip_count": 4.0, + "step": 9798, + "text_loss": 0.5299228429794312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 46.00939242735544, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 1.7244651972037284e-05, + "loss": 0.0046, + "macro_f1": 0.6598639488220215, + "num_tokens": 15804049.0, + "repeat_count": 1.0, + "routers_loss": 0.010446238331496716, + "skip_count": 3.0, + "step": 9800, + "text_loss": 0.6591248512268066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 1.7164157703217886e-05, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 15807683.0, + "repeat_count": 0.0, + "routers_loss": 0.0017791346181184053, + "skip_count": 0.0, + "step": 9802, + "text_loss": 0.45421653985977173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 1.7083848458004035e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 15810743.0, + "repeat_count": 0.0, + "routers_loss": 0.0008831496234051883, + "skip_count": 0.0, + "step": 9804, + "text_loss": 0.5535439848899841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 1.7003724267170394e-05, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 15813880.0, + "repeat_count": 0.0, + "routers_loss": 0.002800740534439683, + "skip_count": 0.0, + "step": 9806, + "text_loss": 0.5228974223136902 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 46.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 1.6923785161420845e-05, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 15816808.0, + "repeat_count": 0.0, + "routers_loss": 0.006823428440839052, + "skip_count": 3.0, + "step": 9808, + "text_loss": 0.48018959164619446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 1.6844031171388052e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 15819803.0, + "repeat_count": 0.0, + "routers_loss": 0.004808149300515652, + "skip_count": 0.0, + "step": 9810, + "text_loss": 0.31094294786453247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 1.6764462327633955e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 15822861.0, + "repeat_count": 0.0, + "routers_loss": 0.0026099751703441143, + "skip_count": 0.0, + "step": 9812, + "text_loss": 0.5534207224845886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0478515625, + "learning_rate": 1.668507866064939e-05, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 15825960.0, + "repeat_count": 1.0, + "routers_loss": 0.008356450125575066, + "skip_count": 2.0, + "step": 9814, + "text_loss": 0.40162262320518494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0198974609375, + "learning_rate": 1.660588020085452e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 15828906.0, + "repeat_count": 0.0, + "routers_loss": 0.006548966746777296, + "skip_count": 2.0, + "step": 9816, + "text_loss": 0.2071811705827713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 1.652686697859823e-05, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 15831935.0, + "repeat_count": 0.0, + "routers_loss": 0.0007895465241745114, + "skip_count": 0.0, + "step": 9818, + "text_loss": 0.6879562735557556 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 1.6448039024158534e-05, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 15835745.0, + "repeat_count": 1.0, + "routers_loss": 0.00370208453387022, + "skip_count": 2.0, + "step": 9820, + "text_loss": 0.6139163970947266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 1.6369396367742483e-05, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 15838373.0, + "repeat_count": 0.0, + "routers_loss": 0.002627170644700527, + "skip_count": 0.0, + "step": 9822, + "text_loss": 0.3881947100162506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.018798828125, + "learning_rate": 1.6290939039486084e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 15841156.0, + "repeat_count": 0.0, + "routers_loss": 0.005191941745579243, + "skip_count": 2.0, + "step": 9824, + "text_loss": 0.6564247608184814 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 46.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 1.621266706945429e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 15843877.0, + "repeat_count": 1.0, + "routers_loss": 0.003889352548867464, + "skip_count": 0.0, + "step": 9826, + "text_loss": 0.7128682136535645 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 46.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 1.6134580487641047e-05, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 15846880.0, + "repeat_count": 0.0, + "routers_loss": 0.00674893194809556, + "skip_count": 4.0, + "step": 9828, + "text_loss": 0.30893367528915405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 1.6056679323969425e-05, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 15850130.0, + "repeat_count": 0.0, + "routers_loss": 0.0009898045100271702, + "skip_count": 0.0, + "step": 9830, + "text_loss": 0.6550688743591309 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 46.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 1.5978963608291154e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 15853578.0, + "repeat_count": 1.0, + "routers_loss": 0.0046016750857234, + "skip_count": 0.0, + "step": 9832, + "text_loss": 0.43872204422950745 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02880859375, + "learning_rate": 1.5901433370387132e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 15857939.0, + "repeat_count": 0.0, + "routers_loss": 0.004589201882481575, + "skip_count": 1.0, + "step": 9834, + "text_loss": 0.41940808296203613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 1.5824088639967094e-05, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 15860584.0, + "repeat_count": 0.0, + "routers_loss": 0.0018899316200986505, + "skip_count": 1.0, + "step": 9836, + "text_loss": 0.5105440616607666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 1.5746929446669556e-05, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 15864386.0, + "repeat_count": 0.0, + "routers_loss": 0.0006366848247125745, + "skip_count": 0.0, + "step": 9838, + "text_loss": 0.5686481595039368 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.017333984375, + "learning_rate": 1.5669955820062254e-05, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 15869103.0, + "repeat_count": 0.0, + "routers_loss": 0.0043256948702037334, + "skip_count": 1.0, + "step": 9840, + "text_loss": 0.16309607028961182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 1.5593167789641483e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 15872384.0, + "repeat_count": 0.0, + "routers_loss": 0.00406000716611743, + "skip_count": 1.0, + "step": 9842, + "text_loss": 0.21662485599517822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 46.21602582917523, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.029541015625, + "learning_rate": 1.551656538483259e-05, + "loss": 0.0076, + "macro_f1": 0.5492662787437439, + "num_tokens": 15875261.0, + "repeat_count": 0.0, + "routers_loss": 0.020087692886590958, + "skip_count": 2.0, + "step": 9844, + "text_loss": 0.6189377903938293 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 1.5440148634989826e-05, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 15878132.0, + "repeat_count": 0.0, + "routers_loss": 0.0005302145145833492, + "skip_count": 0.0, + "step": 9846, + "text_loss": 0.34496018290519714 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 1.536391756939609e-05, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 15881381.0, + "repeat_count": 0.0, + "routers_loss": 0.008405420929193497, + "skip_count": 2.0, + "step": 9848, + "text_loss": 0.2865080237388611 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 1.528787221726341e-05, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 15884621.0, + "repeat_count": 0.0, + "routers_loss": 0.0016017532907426357, + "skip_count": 0.0, + "step": 9850, + "text_loss": 0.6104921102523804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 1.5212012607732528e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 15888157.0, + "repeat_count": 0.0, + "routers_loss": 0.0015318389050662518, + "skip_count": 0.0, + "step": 9852, + "text_loss": 0.2622036933898926 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 1.5136338769872915e-05, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 15891080.0, + "repeat_count": 2.0, + "routers_loss": 0.006494096480309963, + "skip_count": 4.0, + "step": 9854, + "text_loss": 0.23415961861610413 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 1.5060850732682928e-05, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 15895486.0, + "repeat_count": 2.0, + "routers_loss": 0.007511078380048275, + "skip_count": 3.0, + "step": 9856, + "text_loss": 0.7389219999313354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 1.4985548525089709e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 15898747.0, + "repeat_count": 0.0, + "routers_loss": 0.004874013364315033, + "skip_count": 2.0, + "step": 9858, + "text_loss": 0.6853085160255432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 1.4910432175949285e-05, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 15902157.0, + "repeat_count": 0.0, + "routers_loss": 0.0009244410903193057, + "skip_count": 0.0, + "step": 9860, + "text_loss": 0.8172202110290527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 46.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 1.4835501714046296e-05, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 15905012.0, + "repeat_count": 0.0, + "routers_loss": 0.00456853536888957, + "skip_count": 3.0, + "step": 9862, + "text_loss": 0.7527797818183899 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 1.4760757168094275e-05, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 15908302.0, + "repeat_count": 0.0, + "routers_loss": 0.0009686833946034312, + "skip_count": 0.0, + "step": 9864, + "text_loss": 0.5548131465911865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 1.4686198566735531e-05, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 15911923.0, + "repeat_count": 0.0, + "routers_loss": 0.0008255072170868516, + "skip_count": 0.0, + "step": 9866, + "text_loss": 0.5995872020721436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 1.4611825938540935e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 15914858.0, + "repeat_count": 0.0, + "routers_loss": 0.002459712326526642, + "skip_count": 0.0, + "step": 9868, + "text_loss": 0.6777655482292175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017578125, + "learning_rate": 1.4537639312010298e-05, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 15918091.0, + "repeat_count": 0.0, + "routers_loss": 0.0014664786867797375, + "skip_count": 0.0, + "step": 9870, + "text_loss": 0.42750120162963867 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 46.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 1.4463638715572103e-05, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 15920943.0, + "repeat_count": 1.0, + "routers_loss": 0.005549794062972069, + "skip_count": 1.0, + "step": 9872, + "text_loss": 0.27477580308914185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 1.4389824177583388e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 15924212.0, + "repeat_count": 0.0, + "routers_loss": 0.007967505604028702, + "skip_count": 2.0, + "step": 9874, + "text_loss": 0.3174900412559509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 1.4316195726330139e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 15929143.0, + "repeat_count": 0.0, + "routers_loss": 0.0014913028571754694, + "skip_count": 2.0, + "step": 9876, + "text_loss": 0.40919792652130127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 1.4242753390026953e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 15931702.0, + "repeat_count": 0.0, + "routers_loss": 0.0003994424478150904, + "skip_count": 0.0, + "step": 9878, + "text_loss": 0.35346853733062744 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 1.4169497196816983e-05, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 15935225.0, + "repeat_count": 1.0, + "routers_loss": 0.008424114435911179, + "skip_count": 3.0, + "step": 9880, + "text_loss": 0.230825275182724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 1.4096427174772164e-05, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 15938630.0, + "repeat_count": 0.0, + "routers_loss": 0.004314251709729433, + "skip_count": 1.0, + "step": 9882, + "text_loss": 0.8749642968177795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 1.4023543351893043e-05, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 15941779.0, + "repeat_count": 0.0, + "routers_loss": 0.0008999531855806708, + "skip_count": 0.0, + "step": 9884, + "text_loss": 0.6549318432807922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 1.3950845756108943e-05, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 15944779.0, + "repeat_count": 0.0, + "routers_loss": 0.0010829231468960643, + "skip_count": 0.0, + "step": 9886, + "text_loss": 0.5681273341178894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 1.3878334415277583e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 15947757.0, + "repeat_count": 0.0, + "routers_loss": 0.0038863453082740307, + "skip_count": 1.0, + "step": 9888, + "text_loss": 0.4282133877277374 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 46.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.017822265625, + "learning_rate": 1.3806009357185512e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 15952223.0, + "repeat_count": 1.0, + "routers_loss": 0.0006428947090171278, + "skip_count": 0.0, + "step": 9890, + "text_loss": 0.4455379247665405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 1.3733870609547838e-05, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 15955968.0, + "repeat_count": 0.0, + "routers_loss": 0.00048406270798295736, + "skip_count": 0.0, + "step": 9892, + "text_loss": 0.37554407119750977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 1.3661918200008228e-05, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 15959376.0, + "repeat_count": 0.0, + "routers_loss": 0.004503594245761633, + "skip_count": 1.0, + "step": 9894, + "text_loss": 0.22027169167995453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 1.3590152156139012e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 15962882.0, + "repeat_count": 0.0, + "routers_loss": 0.0011738749453797936, + "skip_count": 0.0, + "step": 9896, + "text_loss": 0.4203954041004181 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 1.3518572505440973e-05, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 15965816.0, + "repeat_count": 1.0, + "routers_loss": 0.00806320272386074, + "skip_count": 2.0, + "step": 9898, + "text_loss": 0.18884631991386414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 1.3447179275343779e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 15968840.0, + "repeat_count": 0.0, + "routers_loss": 0.004962162580341101, + "skip_count": 1.0, + "step": 9900, + "text_loss": 0.22457796335220337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 1.3375972493205268e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 15972768.0, + "repeat_count": 0.0, + "routers_loss": 0.0025535912718623877, + "skip_count": 0.0, + "step": 9902, + "text_loss": 0.14859545230865479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 1.3304952186312114e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 15975380.0, + "repeat_count": 0.0, + "routers_loss": 0.002036662772297859, + "skip_count": 0.0, + "step": 9904, + "text_loss": 0.5820382833480835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0191650390625, + "learning_rate": 1.3234118381879378e-05, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 15978335.0, + "repeat_count": 0.0, + "routers_loss": 0.0055219330824911594, + "skip_count": 2.0, + "step": 9906, + "text_loss": 0.29671815037727356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 1.316347110705074e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 15982003.0, + "repeat_count": 0.0, + "routers_loss": 0.005196230486035347, + "skip_count": 0.0, + "step": 9908, + "text_loss": 0.5204919576644897 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.023193359375, + "learning_rate": 1.3093010388898319e-05, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 15984937.0, + "repeat_count": 1.0, + "routers_loss": 0.0032779101748019457, + "skip_count": 2.0, + "step": 9910, + "text_loss": 0.6803483366966248 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 1.3022736254422851e-05, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 15988992.0, + "repeat_count": 0.0, + "routers_loss": 0.002347869798541069, + "skip_count": 0.0, + "step": 9912, + "text_loss": 0.5335546731948853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 1.2952648730553462e-05, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 15992828.0, + "repeat_count": 0.0, + "routers_loss": 0.0011128517799079418, + "skip_count": 0.0, + "step": 9914, + "text_loss": 0.686739981174469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 1.288274784414789e-05, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 15995984.0, + "repeat_count": 0.0, + "routers_loss": 0.0031158174388110638, + "skip_count": 0.0, + "step": 9916, + "text_loss": 0.16102474927902222 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.059814453125, + "learning_rate": 1.2813033621992264e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 15999606.0, + "repeat_count": 0.0, + "routers_loss": 0.0029228583443909883, + "skip_count": 1.0, + "step": 9918, + "text_loss": 0.6022558212280273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 1.274350609080116e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 16002456.0, + "repeat_count": 0.0, + "routers_loss": 0.0031404250767081976, + "skip_count": 2.0, + "step": 9920, + "text_loss": 0.7529577016830444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 1.2674165277217653e-05, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 16005547.0, + "repeat_count": 0.0, + "routers_loss": 0.0038669302593916655, + "skip_count": 0.0, + "step": 9922, + "text_loss": 0.47488540410995483 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 1.2605011207813378e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 16009520.0, + "repeat_count": 0.0, + "routers_loss": 0.004838052671402693, + "skip_count": 0.0, + "step": 9924, + "text_loss": 0.5252779722213745 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 46.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 1.2536043909088191e-05, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 16012730.0, + "repeat_count": 1.0, + "routers_loss": 0.0017430823063477874, + "skip_count": 0.0, + "step": 9926, + "text_loss": 0.40845534205436707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0167236328125, + "learning_rate": 1.2467263407470619e-05, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 16015940.0, + "repeat_count": 0.0, + "routers_loss": 0.0010244545992463827, + "skip_count": 0.0, + "step": 9928, + "text_loss": 0.8465730547904968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 1.2398669729317357e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 16018851.0, + "repeat_count": 0.0, + "routers_loss": 0.0007380630704574287, + "skip_count": 0.0, + "step": 9930, + "text_loss": 0.37603214383125305 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.629292632814796, + "f1_execute": 0.9729729890823364, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 1.2330262900913657e-05, + "loss": 0.0087, + "macro_f1": 0.9539539813995361, + "num_tokens": 16022351.0, + "repeat_count": 5.0, + "routers_loss": 0.053848277777433395, + "skip_count": 5.0, + "step": 9932, + "text_loss": 0.2047014981508255 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 46.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 1.2262042948473163e-05, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 16024902.0, + "repeat_count": 1.0, + "routers_loss": 0.0020845322869718075, + "skip_count": 0.0, + "step": 9934, + "text_loss": 0.6269918084144592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 1.2194009898137903e-05, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 16028056.0, + "repeat_count": 0.0, + "routers_loss": 0.0008686805376783013, + "skip_count": 0.0, + "step": 9936, + "text_loss": 0.4100899398326874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 46.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 1.212616377597825e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 16032111.0, + "repeat_count": 0.0, + "routers_loss": 0.004883588291704655, + "skip_count": 3.0, + "step": 9938, + "text_loss": 0.3921346664428711 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 1.2058504607993015e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 16035872.0, + "repeat_count": 0.0, + "routers_loss": 0.0005067490856163204, + "skip_count": 0.0, + "step": 9940, + "text_loss": 0.44368258118629456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06787109375, + "learning_rate": 1.1991032420109238e-05, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 16038923.0, + "repeat_count": 0.0, + "routers_loss": 0.005819452460855246, + "skip_count": 2.0, + "step": 9942, + "text_loss": 0.27500197291374207 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.685647196947464, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 1.1923747238182403e-05, + "loss": 0.0059, + "macro_f1": 0.8817967176437378, + "num_tokens": 16041803.0, + "repeat_count": 2.0, + "routers_loss": 0.035794492810964584, + "skip_count": 3.0, + "step": 9944, + "text_loss": 0.5083543062210083 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 1.1856649087996384e-05, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 16045258.0, + "repeat_count": 1.0, + "routers_loss": 0.002845201175659895, + "skip_count": 2.0, + "step": 9946, + "text_loss": 0.6859534382820129 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 1.1789737995263228e-05, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 16048618.0, + "repeat_count": 0.0, + "routers_loss": 0.0007575460476800799, + "skip_count": 0.0, + "step": 9948, + "text_loss": 0.4512535333633423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 1.1723013985623477e-05, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 16051595.0, + "repeat_count": 0.0, + "routers_loss": 0.002697878750041127, + "skip_count": 1.0, + "step": 9950, + "text_loss": 0.3572070300579071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 1.16564770846459e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 16054494.0, + "repeat_count": 0.0, + "routers_loss": 0.0062429774552583694, + "skip_count": 1.0, + "step": 9952, + "text_loss": 0.5479834079742432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 1.1590127317827492e-05, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 16057555.0, + "repeat_count": 0.0, + "routers_loss": 0.0009302232647314668, + "skip_count": 0.0, + "step": 9954, + "text_loss": 0.44800761342048645 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 1.1523964710593637e-05, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 16061072.0, + "repeat_count": 0.0, + "routers_loss": 0.002112898975610733, + "skip_count": 0.0, + "step": 9956, + "text_loss": 0.3274081349372864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 1.1457989288297942e-05, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 16064165.0, + "repeat_count": 0.0, + "routers_loss": 0.00028447998920455575, + "skip_count": 0.0, + "step": 9958, + "text_loss": 0.5712385773658752 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 1.1392201076222352e-05, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 16067293.0, + "repeat_count": 1.0, + "routers_loss": 0.009599249809980392, + "skip_count": 2.0, + "step": 9960, + "text_loss": 0.26818037033081055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 1.132660009957709e-05, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 16069852.0, + "repeat_count": 0.0, + "routers_loss": 0.005338563583791256, + "skip_count": 0.0, + "step": 9962, + "text_loss": 0.6658869981765747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 1.1261186383500487e-05, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 16072633.0, + "repeat_count": 0.0, + "routers_loss": 0.001175224082544446, + "skip_count": 1.0, + "step": 9964, + "text_loss": 0.4461731016635895 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 46.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 1.1195959953059221e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 16076065.0, + "repeat_count": 1.0, + "routers_loss": 0.0036650802940130234, + "skip_count": 0.0, + "step": 9966, + "text_loss": 0.6107141971588135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 1.113092083324818e-05, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 16079309.0, + "repeat_count": 0.0, + "routers_loss": 0.005924097262322903, + "skip_count": 2.0, + "step": 9968, + "text_loss": 0.5104627013206482 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 46.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 1.1066069048990545e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 16082180.0, + "repeat_count": 3.0, + "routers_loss": 0.010777595452964306, + "skip_count": 0.0, + "step": 9970, + "text_loss": 0.5205907225608826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 1.100140462513749e-05, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 16084654.0, + "repeat_count": 0.0, + "routers_loss": 0.0019593914039433002, + "skip_count": 0.0, + "step": 9972, + "text_loss": 0.36411789059638977 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 46.82653360727913, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0255126953125, + "learning_rate": 1.0936927586468693e-05, + "loss": 0.0048, + "macro_f1": 0.9452888369560242, + "num_tokens": 16087736.0, + "repeat_count": 1.0, + "routers_loss": 0.0233579371124506, + "skip_count": 4.0, + "step": 9974, + "text_loss": 0.267604261636734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 1.0872637957691833e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 16090838.0, + "repeat_count": 0.0, + "routers_loss": 0.00034629934816621244, + "skip_count": 0.0, + "step": 9976, + "text_loss": 0.576068103313446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 1.0808535763442761e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 16094084.0, + "repeat_count": 0.0, + "routers_loss": 0.0004253332444932312, + "skip_count": 0.0, + "step": 9978, + "text_loss": 0.5883988738059998 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 1.0744621028285662e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 16097432.0, + "repeat_count": 0.0, + "routers_loss": 0.0005800648941658437, + "skip_count": 0.0, + "step": 9980, + "text_loss": 0.3358926475048065 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 46.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 1.068089377671272e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 16100711.0, + "repeat_count": 1.0, + "routers_loss": 0.0015245937975123525, + "skip_count": 0.0, + "step": 9982, + "text_loss": 0.6802405714988708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 1.061735403314429e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 16103952.0, + "repeat_count": 0.0, + "routers_loss": 0.002281307242810726, + "skip_count": 1.0, + "step": 9984, + "text_loss": 0.3086298406124115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 1.055400182192906e-05, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 16107101.0, + "repeat_count": 0.0, + "routers_loss": 0.0007910717977210879, + "skip_count": 0.0, + "step": 9986, + "text_loss": 0.7036139965057373 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 46.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 1.0490837167343559e-05, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 16110316.0, + "repeat_count": 1.0, + "routers_loss": 0.0030006880406290293, + "skip_count": 1.0, + "step": 9988, + "text_loss": 0.4638058841228485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 1.04278600935927e-05, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 16113206.0, + "repeat_count": 0.0, + "routers_loss": 0.0006434856331907213, + "skip_count": 0.0, + "step": 9990, + "text_loss": 0.6155068874359131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 1.0365070624809403e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 16116098.0, + "repeat_count": 0.0, + "routers_loss": 0.0007891099085099995, + "skip_count": 0.0, + "step": 9992, + "text_loss": 0.4537872076034546 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 46.92045788083358, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 1.0302468785054641e-05, + "loss": 0.0054, + "macro_f1": 0.8823530077934265, + "num_tokens": 16119344.0, + "repeat_count": 2.0, + "routers_loss": 0.011918486095964909, + "skip_count": 1.0, + "step": 9994, + "text_loss": 0.18828579783439636 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 1.0240054598317672e-05, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 16122615.0, + "repeat_count": 1.0, + "routers_loss": 0.016306765377521515, + "skip_count": 2.0, + "step": 9996, + "text_loss": 0.2876183092594147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.93924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 1.0177828088515694e-05, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 16125506.0, + "repeat_count": 0.0, + "routers_loss": 0.00393108231946826, + "skip_count": 1.0, + "step": 9998, + "text_loss": 0.6387818455696106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 1.011578927949397e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 16128499.0, + "repeat_count": 0.0, + "routers_loss": 0.001175055862404406, + "skip_count": 0.0, + "step": 10000, + "text_loss": 0.4085952639579773 + } + ], + "logging_steps": 2, + "max_steps": 10650, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.738398356854296e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-10000/training_args.bin b/checkpoint-10000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a3d3ae372faf14539639f54454aa52b6ee730c4a --- /dev/null +++ b/checkpoint-10000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8 +size 5880 diff --git a/checkpoint-10650/chat_template.jinja b/checkpoint-10650/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/checkpoint-10650/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-10650/config.json b/checkpoint-10650/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3552bd1c531626bd125241ad5dfcd7fb677462cd --- /dev/null +++ b/checkpoint-10650/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 3072, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.55.2", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-10650/generation_config.json b/checkpoint-10650/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b513e54e3195b917260c9a8a04c9f3683f19de35 --- /dev/null +++ b/checkpoint-10650/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.55.2" +} diff --git a/checkpoint-10650/model-00001-of-00002.safetensors b/checkpoint-10650/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..08a01e1ba553cdcb2222f034a209861d7b54e284 --- /dev/null +++ b/checkpoint-10650/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13cbd6d16e927a0c5bad54102514e6e18b4a47b3a6eb911e39d678d328d19f55 +size 4965799096 diff --git a/checkpoint-10650/model-00002-of-00002.safetensors b/checkpoint-10650/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ee28c0dc703eb09f36a601d56c971edb4d4406e3 --- /dev/null +++ b/checkpoint-10650/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb777b656b02eca5ed71a1eec21997465d360e38b5311d83b9de66d34fc2ff9 +size 1481790520 diff --git a/checkpoint-10650/model.safetensors.index.json b/checkpoint-10650/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..21bb567761d75ade0c0eef6495c450697dd3ff18 --- /dev/null +++ b/checkpoint-10650/model.safetensors.index.json @@ -0,0 +1,374 @@ +{ + "metadata": { + "total_parameters": 3223774292, + "total_size": 6447548584 + }, + "weight_map": { + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.routers.0.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.0.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.0.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.0.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.1.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.1.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.1.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.1.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.10.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.10.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.10.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.10.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.11.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.11.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.11.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.11.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.12.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.12.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.12.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.12.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.13.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.13.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.13.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.13.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.14.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.14.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.14.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.14.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.15.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.15.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.15.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.15.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.16.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.16.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.16.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.16.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.17.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.17.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.17.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.17.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.18.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.18.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.18.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.18.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.19.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.19.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.19.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.19.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.2.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.2.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.2.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.2.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.20.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.20.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.20.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.20.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.21.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.21.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.21.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.21.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.22.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.22.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.22.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.22.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.23.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.23.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.23.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.23.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.24.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.24.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.24.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.24.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.25.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.25.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.25.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.25.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.26.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.26.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.26.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.26.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.27.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.27.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.27.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.27.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.3.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.3.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.3.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.3.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.4.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.4.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.4.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.4.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.5.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.5.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.5.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.5.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.6.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.6.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.6.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.6.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.7.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.7.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.7.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.7.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.8.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.8.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.8.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.8.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.9.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.9.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.9.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.9.linear2.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-10650/optimizer.pt b/checkpoint-10650/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..03fc5bdc59b880ca8ba9833a5e2f8651d4e107f8 --- /dev/null +++ b/checkpoint-10650/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dd7a38e033ea8dacacb991c36d2d46a2a9f889893d9c26efeebbe35465e69be +size 44191162 diff --git a/checkpoint-10650/rng_state.pth b/checkpoint-10650/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc552cae08c3af1de204610a293370696f1faaaa --- /dev/null +++ b/checkpoint-10650/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ed7c9f18c0606d7eb2f3d6cfe5e71f033f5f69b5a6aa170ad2ff926625abd40 +size 14244 diff --git a/checkpoint-10650/scheduler.pt b/checkpoint-10650/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..83c64bebb094a313f6a10e493f9f55c8793e86c7 --- /dev/null +++ b/checkpoint-10650/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5faea1e19b92c75b4859c50baf8e943951af0ec0dc6f6201e9523b77f93deb7 +size 1064 diff --git a/checkpoint-10650/special_tokens_map.json b/checkpoint-10650/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..165b36bc2293dda9a2fb3c0daf6577d9eba9df7a --- /dev/null +++ b/checkpoint-10650/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|finetune_right_pad_id|>" +} diff --git a/checkpoint-10650/tokenizer.json b/checkpoint-10650/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-10650/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-10650/tokenizer_config.json b/checkpoint-10650/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c68051fe3c4d23234a59316bc52d21f6e3a4182c --- /dev/null +++ b/checkpoint-10650/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-10650/trainer_state.json b/checkpoint-10650/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3d8c0f461c7295ab49302e1afeca1ee6024cd459 --- /dev/null +++ b/checkpoint-10650/trainer_state.json @@ -0,0 +1,101209 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 50.0, + "eval_steps": 500, + "global_step": 10650, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.009392427355444672, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.25, + "learning_rate": 2e-06, + "loss": 0.4974, + "macro_f1": 0.23255813121795654, + "num_tokens": 3175.0, + "repeat_count": 0.0, + "routers_loss": 0.4339469373226166, + "skip_count": 0.0, + "step": 2, + "text_loss": 0.3330848515033722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 23.0, + "epoch": 0.018784854710889344, + "f1_execute": 0.7272726893424988, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.8359375, + "learning_rate": 6e-06, + "loss": 0.4988, + "macro_f1": 0.24242423474788666, + "num_tokens": 5816.0, + "repeat_count": 0.0, + "routers_loss": 0.4511934816837311, + "skip_count": 1.0, + "step": 4, + "text_loss": 0.4571273922920227 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.02817728206633402, + "f1_execute": 0.6666666865348816, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.234375, + "learning_rate": 1e-05, + "loss": 0.5113, + "macro_f1": 0.222222238779068, + "num_tokens": 9739.0, + "repeat_count": 0.0, + "routers_loss": 0.49306994676589966, + "skip_count": 0.0, + "step": 6, + "text_loss": 0.41060560941696167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.03756970942177869, + "f1_execute": 0.5641025900840759, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7265625, + "learning_rate": 1.4e-05, + "loss": 0.4766, + "macro_f1": 0.18803420662879944, + "num_tokens": 12869.0, + "repeat_count": 1.0, + "routers_loss": 0.48872503638267517, + "skip_count": 2.0, + "step": 8, + "text_loss": 0.36678561568260193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.046962136777223364, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.78125, + "learning_rate": 1.8e-05, + "loss": 0.4806, + "macro_f1": 0.23255813121795654, + "num_tokens": 15845.0, + "repeat_count": 0.0, + "routers_loss": 0.45077216625213623, + "skip_count": 0.0, + "step": 10, + "text_loss": 0.5597779154777527 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 0.05635456413266804, + "f1_execute": 0.7179487347602844, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.20000000298023224, + "grad_norm": 1.5390625, + "learning_rate": 2.2e-05, + "loss": 0.4557, + "macro_f1": 0.40122103691101074, + "num_tokens": 19353.0, + "repeat_count": 2.0, + "routers_loss": 0.4130440056324005, + "skip_count": 3.0, + "step": 12, + "text_loss": 0.2056603729724884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.06574699148811271, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.4375, + "learning_rate": 2.6e-05, + "loss": 0.5129, + "macro_f1": 0.23255813121795654, + "num_tokens": 22675.0, + "repeat_count": 0.0, + "routers_loss": 0.4582902193069458, + "skip_count": 0.0, + "step": 14, + "text_loss": 0.32989829778671265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 0.07513941884355738, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.0, + "f1_skip": 0.2222222238779068, + "grad_norm": 1.7421875, + "learning_rate": 3e-05, + "loss": 0.4729, + "macro_f1": 0.3017163574695587, + "num_tokens": 26022.0, + "repeat_count": 0.0, + "routers_loss": 0.42910993099212646, + "skip_count": 1.0, + "step": 16, + "text_loss": 0.1353905349969864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.08453184619900206, + "f1_execute": 0.7555555105209351, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.4765625, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.4274, + "macro_f1": 0.2518518567085266, + "num_tokens": 29251.0, + "repeat_count": 0.0, + "routers_loss": 0.3990713059902191, + "skip_count": 0.0, + "step": 18, + "text_loss": 0.3806765377521515 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.09392427355444673, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.0, + "grad_norm": 1.3125, + "learning_rate": 3.8e-05, + "loss": 0.4261, + "macro_f1": 0.3228803873062134, + "num_tokens": 32545.0, + "repeat_count": 1.0, + "routers_loss": 0.40146592259407043, + "skip_count": 0.0, + "step": 20, + "text_loss": 0.25648367404937744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.1033167009098914, + "f1_execute": 0.7272727489471436, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.625, + "learning_rate": 4.2000000000000004e-05, + "loss": 0.404, + "macro_f1": 0.24242424964904785, + "num_tokens": 36560.0, + "repeat_count": 0.0, + "routers_loss": 0.372715026140213, + "skip_count": 0.0, + "step": 22, + "text_loss": 0.2799522578716278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.11270912826533608, + "f1_execute": 0.7555555105209351, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.6328125, + "learning_rate": 4.6e-05, + "loss": 0.4218, + "macro_f1": 0.2518518567085266, + "num_tokens": 39597.0, + "repeat_count": 0.0, + "routers_loss": 0.4504941403865814, + "skip_count": 0.0, + "step": 24, + "text_loss": 0.6635695695877075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.12210155562078075, + "f1_execute": 0.8085106015205383, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7109375, + "learning_rate": 5e-05, + "loss": 0.3886, + "macro_f1": 0.26950353384017944, + "num_tokens": 43080.0, + "repeat_count": 0.0, + "routers_loss": 0.3498791456222534, + "skip_count": 0.0, + "step": 26, + "text_loss": 0.7035041451454163 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.13149398297622542, + "f1_execute": 0.8085106015205383, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.34375, + "learning_rate": 5.4e-05, + "loss": 0.3724, + "macro_f1": 0.26950353384017944, + "num_tokens": 46406.0, + "repeat_count": 0.0, + "routers_loss": 0.31265875697135925, + "skip_count": 0.0, + "step": 28, + "text_loss": 0.6388277411460876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.1408864103316701, + "f1_execute": 0.8571428060531616, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.2578125, + "learning_rate": 5.800000000000001e-05, + "loss": 0.341, + "macro_f1": 0.2857142686843872, + "num_tokens": 49966.0, + "repeat_count": 0.0, + "routers_loss": 0.3200918138027191, + "skip_count": 2.0, + "step": 30, + "text_loss": 0.17372547090053558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.15027883768711475, + "f1_execute": 0.8571428060531616, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.4140625, + "learning_rate": 6.2e-05, + "loss": 0.3207, + "macro_f1": 0.2857142686843872, + "num_tokens": 53378.0, + "repeat_count": 1.0, + "routers_loss": 0.32304447889328003, + "skip_count": 1.0, + "step": 32, + "text_loss": 0.18196581304073334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.15967126504255943, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.46875, + "learning_rate": 6.6e-05, + "loss": 0.3304, + "macro_f1": 0.3006536364555359, + "num_tokens": 56933.0, + "repeat_count": 0.0, + "routers_loss": 0.24814388155937195, + "skip_count": 0.0, + "step": 34, + "text_loss": 0.28823015093803406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.16906369239800412, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.1171875, + "learning_rate": 7.000000000000001e-05, + "loss": 0.2778, + "macro_f1": 0.3006536066532135, + "num_tokens": 60744.0, + "repeat_count": 1.0, + "routers_loss": 0.22411039471626282, + "skip_count": 0.0, + "step": 36, + "text_loss": 0.5260357856750488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.17845611975344877, + "f1_execute": 0.8571428656578064, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.484375, + "learning_rate": 7.4e-05, + "loss": 0.2738, + "macro_f1": 0.2857142984867096, + "num_tokens": 64900.0, + "repeat_count": 0.0, + "routers_loss": 0.44355395436286926, + "skip_count": 0.0, + "step": 38, + "text_loss": 0.5382097363471985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.18784854710889345, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.3828125, + "learning_rate": 7.8e-05, + "loss": 0.2137, + "macro_f1": 0.3076923191547394, + "num_tokens": 68000.0, + "repeat_count": 0.0, + "routers_loss": 0.202330082654953, + "skip_count": 0.0, + "step": 40, + "text_loss": 0.5946118831634521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.19724097446433814, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.78125, + "learning_rate": 8.2e-05, + "loss": 0.21, + "macro_f1": 0.3144654333591461, + "num_tokens": 70529.0, + "repeat_count": 0.0, + "routers_loss": 0.18023855984210968, + "skip_count": 0.0, + "step": 42, + "text_loss": 0.5550904273986816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2066334018197828, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.609375, + "learning_rate": 8.599999999999999e-05, + "loss": 0.1918, + "macro_f1": 0.32098764181137085, + "num_tokens": 73427.0, + "repeat_count": 2.0, + "routers_loss": 0.2101590931415558, + "skip_count": 0.0, + "step": 44, + "text_loss": 0.4636923372745514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.21602582917522747, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.53125, + "learning_rate": 8.999999999999999e-05, + "loss": 0.1881, + "macro_f1": 0.3333333432674408, + "num_tokens": 76472.0, + "repeat_count": 0.0, + "routers_loss": 0.11800424009561539, + "skip_count": 0.0, + "step": 46, + "text_loss": 0.4187001883983612 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.22541825653067216, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.953125, + "learning_rate": 9.400000000000001e-05, + "loss": 0.1446, + "macro_f1": 0.3272727429866791, + "num_tokens": 79124.0, + "repeat_count": 1.0, + "routers_loss": 0.11632519960403442, + "skip_count": 0.0, + "step": 48, + "text_loss": 0.2253919243812561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.2348106838861168, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.58984375, + "learning_rate": 9.800000000000001e-05, + "loss": 0.1543, + "macro_f1": 0.32098767161369324, + "num_tokens": 81980.0, + "repeat_count": 1.0, + "routers_loss": 0.09669367223978043, + "skip_count": 0.0, + "step": 50, + "text_loss": 0.6053179502487183 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.2442031112415615, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.8515625, + "learning_rate": 0.000102, + "loss": 0.1393, + "macro_f1": 0.32098764181137085, + "num_tokens": 85236.0, + "repeat_count": 0.0, + "routers_loss": 0.12471720576286316, + "skip_count": 0.0, + "step": 52, + "text_loss": 0.6027331948280334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2535955385970062, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.421875, + "learning_rate": 0.000106, + "loss": 0.1473, + "macro_f1": 0.32098764181137085, + "num_tokens": 88238.0, + "repeat_count": 0.0, + "routers_loss": 0.1376056969165802, + "skip_count": 2.0, + "step": 54, + "text_loss": 0.2861751616001129 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.26298796595245083, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.35546875, + "learning_rate": 0.00011, + "loss": 0.1082, + "macro_f1": 0.3333333432674408, + "num_tokens": 91056.0, + "repeat_count": 0.0, + "routers_loss": 0.07449393719434738, + "skip_count": 0.0, + "step": 56, + "text_loss": 0.48106974363327026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.2723803933078955, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000114, + "loss": 0.1123, + "macro_f1": 0.32098764181137085, + "num_tokens": 94987.0, + "repeat_count": 0.0, + "routers_loss": 0.07064720243215561, + "skip_count": 0.0, + "step": 58, + "text_loss": 0.3554874658584595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2817728206633402, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.5390625, + "learning_rate": 0.000118, + "loss": 0.1234, + "macro_f1": 0.32098764181137085, + "num_tokens": 97909.0, + "repeat_count": 0.0, + "routers_loss": 0.16835889220237732, + "skip_count": 2.0, + "step": 60, + "text_loss": 0.5475804805755615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.29116524801878485, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000122, + "loss": 0.1224, + "macro_f1": 0.3333333432674408, + "num_tokens": 101043.0, + "repeat_count": 0.0, + "routers_loss": 0.06127442046999931, + "skip_count": 0.0, + "step": 62, + "text_loss": 0.5966938734054565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3005576753742295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000126, + "loss": 0.0931, + "macro_f1": 0.3333333432674408, + "num_tokens": 104103.0, + "repeat_count": 0.0, + "routers_loss": 0.047825805842876434, + "skip_count": 0.0, + "step": 64, + "text_loss": 0.5480486750602722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3099501027296742, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.00013000000000000002, + "loss": 0.1088, + "macro_f1": 0.3006536364555359, + "num_tokens": 107009.0, + "repeat_count": 1.0, + "routers_loss": 0.275174081325531, + "skip_count": 4.0, + "step": 66, + "text_loss": 0.41714492440223694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.31934253008511887, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.000134, + "loss": 0.1123, + "macro_f1": 0.3333333432674408, + "num_tokens": 110486.0, + "repeat_count": 0.0, + "routers_loss": 0.029025178402662277, + "skip_count": 0.0, + "step": 68, + "text_loss": 0.6775627732276917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3287349574405635, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.314453125, + "learning_rate": 0.00013800000000000002, + "loss": 0.1049, + "macro_f1": 0.3272727429866791, + "num_tokens": 113878.0, + "repeat_count": 0.0, + "routers_loss": 0.10141710191965103, + "skip_count": 1.0, + "step": 70, + "text_loss": 0.6678873896598816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.33812738479600823, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.248046875, + "learning_rate": 0.00014199999999999998, + "loss": 0.1119, + "macro_f1": 0.3272727429866791, + "num_tokens": 116989.0, + "repeat_count": 0.0, + "routers_loss": 0.08002066612243652, + "skip_count": 1.0, + "step": 72, + "text_loss": 0.405692994594574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3475198121514529, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1787109375, + "learning_rate": 0.000146, + "loss": 0.0944, + "macro_f1": 0.3144654333591461, + "num_tokens": 119883.0, + "repeat_count": 0.0, + "routers_loss": 0.1867009848356247, + "skip_count": 3.0, + "step": 74, + "text_loss": 0.44616150856018066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.35691223950689754, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.333984375, + "learning_rate": 0.00015, + "loss": 0.1003, + "macro_f1": 0.32098764181137085, + "num_tokens": 123325.0, + "repeat_count": 0.0, + "routers_loss": 0.07042168825864792, + "skip_count": 2.0, + "step": 76, + "text_loss": 0.11340200901031494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.36630466686234225, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26171875, + "learning_rate": 0.000154, + "loss": 0.1066, + "macro_f1": 0.32098764181137085, + "num_tokens": 126131.0, + "repeat_count": 0.0, + "routers_loss": 0.11535373330116272, + "skip_count": 2.0, + "step": 78, + "text_loss": 0.3269135355949402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3756970942177869, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.000158, + "loss": 0.0891, + "macro_f1": 0.3272727429866791, + "num_tokens": 130349.0, + "repeat_count": 0.0, + "routers_loss": 0.09497501701116562, + "skip_count": 1.0, + "step": 80, + "text_loss": 0.15273472666740417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.38508952157323156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000162, + "loss": 0.0929, + "macro_f1": 0.3333333432674408, + "num_tokens": 133607.0, + "repeat_count": 0.0, + "routers_loss": 0.030639523640275, + "skip_count": 0.0, + "step": 82, + "text_loss": 0.282884806394577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3944819489286763, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.00016600000000000002, + "loss": 0.1254, + "macro_f1": 0.3272727429866791, + "num_tokens": 136694.0, + "repeat_count": 0.0, + "routers_loss": 0.07906441390514374, + "skip_count": 1.0, + "step": 84, + "text_loss": 0.459094375371933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.40387437628412093, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.00017, + "loss": 0.1071, + "macro_f1": 0.3144654333591461, + "num_tokens": 139966.0, + "repeat_count": 1.0, + "routers_loss": 0.1124570444226265, + "skip_count": 2.0, + "step": 86, + "text_loss": 0.29985448718070984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4132668036395656, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.000174, + "loss": 0.1031, + "macro_f1": 0.32098764181137085, + "num_tokens": 142788.0, + "repeat_count": 2.0, + "routers_loss": 0.1966402679681778, + "skip_count": 0.0, + "step": 88, + "text_loss": 0.6435291767120361 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4226592309950103, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.349609375, + "learning_rate": 0.000178, + "loss": 0.0963, + "macro_f1": 0.3333333432674408, + "num_tokens": 146192.0, + "repeat_count": 0.0, + "routers_loss": 0.0325632207095623, + "skip_count": 0.0, + "step": 90, + "text_loss": 0.35170626640319824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.43205165835045495, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2265625, + "learning_rate": 0.000182, + "loss": 0.1073, + "macro_f1": 0.32098764181137085, + "num_tokens": 149792.0, + "repeat_count": 1.0, + "routers_loss": 0.15115146338939667, + "skip_count": 1.0, + "step": 92, + "text_loss": 0.83159339427948 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4414440857058996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.000186, + "loss": 0.1073, + "macro_f1": 0.3333333432674408, + "num_tokens": 152766.0, + "repeat_count": 0.0, + "routers_loss": 0.043313540518283844, + "skip_count": 0.0, + "step": 94, + "text_loss": 0.49707934260368347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4508365130613443, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019, + "loss": 0.0947, + "macro_f1": 0.3333333432674408, + "num_tokens": 156112.0, + "repeat_count": 0.0, + "routers_loss": 0.032021280378103256, + "skip_count": 0.0, + "step": 96, + "text_loss": 0.27608928084373474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.46022894041678897, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2099609375, + "learning_rate": 0.000194, + "loss": 0.0846, + "macro_f1": 0.3076923191547394, + "num_tokens": 159454.0, + "repeat_count": 2.0, + "routers_loss": 0.24473154544830322, + "skip_count": 2.0, + "step": 98, + "text_loss": 0.6026689410209656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4696213677722336, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.00019800000000000002, + "loss": 0.1028, + "macro_f1": 0.32098764181137085, + "num_tokens": 163661.0, + "repeat_count": 0.0, + "routers_loss": 0.11468276381492615, + "skip_count": 2.0, + "step": 100, + "text_loss": 0.46733155846595764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.47901379512767833, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000202, + "loss": 0.1089, + "macro_f1": 0.3333333432674408, + "num_tokens": 167134.0, + "repeat_count": 0.0, + "routers_loss": 0.021144939586520195, + "skip_count": 0.0, + "step": 102, + "text_loss": 0.6362994909286499 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.488406222483123, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000206, + "loss": 0.0621, + "macro_f1": 0.3272727429866791, + "num_tokens": 170433.0, + "repeat_count": 0.0, + "routers_loss": 0.06594710797071457, + "skip_count": 1.0, + "step": 104, + "text_loss": 0.4515477120876312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.49779864983856764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.00021, + "loss": 0.0929, + "macro_f1": 0.3333333432674408, + "num_tokens": 173387.0, + "repeat_count": 0.0, + "routers_loss": 0.032923027873039246, + "skip_count": 0.0, + "step": 106, + "text_loss": 0.6638453006744385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5071910771940124, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.240234375, + "learning_rate": 0.000214, + "loss": 0.0883, + "macro_f1": 0.3272727429866791, + "num_tokens": 176170.0, + "repeat_count": 1.0, + "routers_loss": 0.08034781366586685, + "skip_count": 0.0, + "step": 108, + "text_loss": 1.186936855316162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.516583504549457, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000218, + "loss": 0.0794, + "macro_f1": 0.3272727429866791, + "num_tokens": 179877.0, + "repeat_count": 0.0, + "routers_loss": 0.07814185321331024, + "skip_count": 1.0, + "step": 110, + "text_loss": 0.5488709211349487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5259759319049017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000222, + "loss": 0.0946, + "macro_f1": 0.3333333432674408, + "num_tokens": 182726.0, + "repeat_count": 0.0, + "routers_loss": 0.01884695515036583, + "skip_count": 0.0, + "step": 112, + "text_loss": 0.5195863842964172 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5353683592603463, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19921875, + "learning_rate": 0.00022600000000000002, + "loss": 0.0974, + "macro_f1": 0.32098764181137085, + "num_tokens": 185624.0, + "repeat_count": 0.0, + "routers_loss": 0.09657823294401169, + "skip_count": 2.0, + "step": 114, + "text_loss": 0.43858134746551514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3046875, + "learning_rate": 0.00023, + "loss": 0.0753, + "macro_f1": 0.3333333432674408, + "num_tokens": 188155.0, + "repeat_count": 0.0, + "routers_loss": 0.01463601179420948, + "skip_count": 0.0, + "step": 116, + "text_loss": 0.392981618642807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5541532139712357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.439453125, + "learning_rate": 0.00023400000000000002, + "loss": 0.0843, + "macro_f1": 0.3333333432674408, + "num_tokens": 190970.0, + "repeat_count": 0.0, + "routers_loss": 0.03859659656882286, + "skip_count": 0.0, + "step": 118, + "text_loss": 0.309179425239563 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5635456413266804, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2255859375, + "learning_rate": 0.00023799999999999998, + "loss": 0.053, + "macro_f1": 0.3333333432674408, + "num_tokens": 193988.0, + "repeat_count": 0.0, + "routers_loss": 0.019092386588454247, + "skip_count": 0.0, + "step": 120, + "text_loss": 0.48543134331703186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.572938068682125, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.35546875, + "learning_rate": 0.000242, + "loss": 0.1203, + "macro_f1": 0.3272727429866791, + "num_tokens": 196475.0, + "repeat_count": 0.0, + "routers_loss": 0.0619138665497303, + "skip_count": 1.0, + "step": 122, + "text_loss": 0.4615364074707031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5823304960375697, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1875, + "learning_rate": 0.000246, + "loss": 0.1002, + "macro_f1": 0.3272727429866791, + "num_tokens": 200045.0, + "repeat_count": 1.0, + "routers_loss": 0.09752107411623001, + "skip_count": 0.0, + "step": 124, + "text_loss": 0.15802054107189178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5917229233930144, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.00025, + "loss": 0.0773, + "macro_f1": 0.3333333432674408, + "num_tokens": 203214.0, + "repeat_count": 0.0, + "routers_loss": 0.02896115928888321, + "skip_count": 0.0, + "step": 126, + "text_loss": 0.4543360471725464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.601115350748459, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.4296875, + "learning_rate": 0.000254, + "loss": 0.0973, + "macro_f1": 0.3333333432674408, + "num_tokens": 206168.0, + "repeat_count": 0.0, + "routers_loss": 0.011423567309975624, + "skip_count": 0.0, + "step": 128, + "text_loss": 0.4730179011821747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6105077781039038, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.365234375, + "learning_rate": 0.00025800000000000004, + "loss": 0.099, + "macro_f1": 0.3333333432674408, + "num_tokens": 209907.0, + "repeat_count": 0.0, + "routers_loss": 0.01957600563764572, + "skip_count": 0.0, + "step": 130, + "text_loss": 0.45122358202934265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6199002054593484, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.000262, + "loss": 0.0868, + "macro_f1": 0.3272727429866791, + "num_tokens": 213521.0, + "repeat_count": 0.0, + "routers_loss": 0.04882373288273811, + "skip_count": 1.0, + "step": 132, + "text_loss": 0.4341491758823395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6292926328147931, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.000266, + "loss": 0.0834, + "macro_f1": 0.3333333432674408, + "num_tokens": 216484.0, + "repeat_count": 0.0, + "routers_loss": 0.016083380207419395, + "skip_count": 0.0, + "step": 134, + "text_loss": 0.46990111470222473 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6386850601702377, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.00027, + "loss": 0.0863, + "macro_f1": 0.3333333432674408, + "num_tokens": 219398.0, + "repeat_count": 0.0, + "routers_loss": 0.01733536459505558, + "skip_count": 0.0, + "step": 136, + "text_loss": 0.4455361068248749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6480774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.00027400000000000005, + "loss": 0.0997, + "macro_f1": 0.3333333432674408, + "num_tokens": 222430.0, + "repeat_count": 0.0, + "routers_loss": 0.01332803163677454, + "skip_count": 0.0, + "step": 138, + "text_loss": 0.47699397802352905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.657469914881127, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.333984375, + "learning_rate": 0.00027800000000000004, + "loss": 0.0922, + "macro_f1": 0.3144654333591461, + "num_tokens": 225458.0, + "repeat_count": 1.0, + "routers_loss": 0.14924728870391846, + "skip_count": 2.0, + "step": 140, + "text_loss": 0.5858222842216492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6668623422365718, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.00028199999999999997, + "loss": 0.0798, + "macro_f1": 0.3144654333591461, + "num_tokens": 229365.0, + "repeat_count": 1.0, + "routers_loss": 0.1860177218914032, + "skip_count": 2.0, + "step": 142, + "text_loss": 0.5003137588500977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6762547695920165, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.00028599999999999996, + "loss": 0.054, + "macro_f1": 0.32098764181137085, + "num_tokens": 231787.0, + "repeat_count": 1.0, + "routers_loss": 0.16498211026191711, + "skip_count": 1.0, + "step": 144, + "text_loss": 0.5026470422744751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6856471969474611, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.00029, + "loss": 0.0936, + "macro_f1": 0.32098764181137085, + "num_tokens": 235014.0, + "repeat_count": 1.0, + "routers_loss": 0.11801310628652573, + "skip_count": 1.0, + "step": 146, + "text_loss": 0.611888587474823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6950396243029058, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000294, + "loss": 0.0878, + "macro_f1": 0.3333333432674408, + "num_tokens": 238210.0, + "repeat_count": 0.0, + "routers_loss": 0.02422776259481907, + "skip_count": 0.0, + "step": 148, + "text_loss": 0.2876914143562317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7044320516583504, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.000298, + "loss": 0.0858, + "macro_f1": 0.32098764181137085, + "num_tokens": 241582.0, + "repeat_count": 0.0, + "routers_loss": 0.07282499223947525, + "skip_count": 2.0, + "step": 150, + "text_loss": 0.3919292390346527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7138244790137951, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.37890625, + "learning_rate": 0.000302, + "loss": 0.0797, + "macro_f1": 0.32098764181137085, + "num_tokens": 244621.0, + "repeat_count": 1.0, + "routers_loss": 0.20659038424491882, + "skip_count": 1.0, + "step": 152, + "text_loss": 0.4294498860836029 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7232169063692399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1787109375, + "learning_rate": 0.000306, + "loss": 0.072, + "macro_f1": 0.3333333432674408, + "num_tokens": 247833.0, + "repeat_count": 0.0, + "routers_loss": 0.02428400330245495, + "skip_count": 0.0, + "step": 154, + "text_loss": 0.5930765867233276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7326093337246845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.00031, + "loss": 0.0772, + "macro_f1": 0.3333333432674408, + "num_tokens": 251349.0, + "repeat_count": 0.0, + "routers_loss": 0.0167869683355093, + "skip_count": 0.0, + "step": 156, + "text_loss": 0.41063904762268066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7420017610801292, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.000314, + "loss": 0.0821, + "macro_f1": 0.3333333432674408, + "num_tokens": 254886.0, + "repeat_count": 0.0, + "routers_loss": 0.02531604655086994, + "skip_count": 0.0, + "step": 158, + "text_loss": 0.6739020347595215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7513941884355738, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.201171875, + "learning_rate": 0.00031800000000000003, + "loss": 0.09, + "macro_f1": 0.3333333432674408, + "num_tokens": 258260.0, + "repeat_count": 0.0, + "routers_loss": 0.017772775143384933, + "skip_count": 0.0, + "step": 160, + "text_loss": 0.46873849630355835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7607866157910185, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.224609375, + "learning_rate": 0.000322, + "loss": 0.0893, + "macro_f1": 0.3272727429866791, + "num_tokens": 261846.0, + "repeat_count": 0.0, + "routers_loss": 0.034902360290288925, + "skip_count": 1.0, + "step": 162, + "text_loss": 0.3727971017360687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7701790431464631, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000326, + "loss": 0.076, + "macro_f1": 0.3333333432674408, + "num_tokens": 264348.0, + "repeat_count": 0.0, + "routers_loss": 0.013553355820477009, + "skip_count": 0.0, + "step": 164, + "text_loss": 0.5798237323760986 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7795714705019078, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.408203125, + "learning_rate": 0.00033, + "loss": 0.0926, + "macro_f1": 0.32098764181137085, + "num_tokens": 267479.0, + "repeat_count": 1.0, + "routers_loss": 0.13571743667125702, + "skip_count": 1.0, + "step": 166, + "text_loss": 0.8084776997566223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7889638978573525, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2431640625, + "learning_rate": 0.00033400000000000004, + "loss": 0.0817, + "macro_f1": 0.32098764181137085, + "num_tokens": 270268.0, + "repeat_count": 2.0, + "routers_loss": 0.19884146749973297, + "skip_count": 0.0, + "step": 168, + "text_loss": 0.7366134524345398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7983563252127972, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.00033800000000000003, + "loss": 0.1022, + "macro_f1": 0.32098764181137085, + "num_tokens": 273518.0, + "repeat_count": 1.0, + "routers_loss": 0.15469175577163696, + "skip_count": 1.0, + "step": 170, + "text_loss": 0.27204006910324097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8077487525682419, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000342, + "loss": 0.0865, + "macro_f1": 0.32098764181137085, + "num_tokens": 277210.0, + "repeat_count": 0.0, + "routers_loss": 0.08603330701589584, + "skip_count": 2.0, + "step": 172, + "text_loss": 0.7137667536735535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8171411799236865, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000346, + "loss": 0.0902, + "macro_f1": 0.3076923191547394, + "num_tokens": 280389.0, + "repeat_count": 0.0, + "routers_loss": 0.17851492762565613, + "skip_count": 4.0, + "step": 174, + "text_loss": 0.5148105621337891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8265336072791312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.00035, + "loss": 0.0853, + "macro_f1": 0.3333333432674408, + "num_tokens": 283501.0, + "repeat_count": 0.0, + "routers_loss": 0.021331604570150375, + "skip_count": 0.0, + "step": 176, + "text_loss": 0.301013320684433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8359260346345758, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000354, + "loss": 0.0911, + "macro_f1": 0.32098764181137085, + "num_tokens": 287154.0, + "repeat_count": 0.0, + "routers_loss": 0.057273946702480316, + "skip_count": 2.0, + "step": 178, + "text_loss": 0.4740981459617615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8453184619900206, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.240234375, + "learning_rate": 0.000358, + "loss": 0.0904, + "macro_f1": 0.3272727429866791, + "num_tokens": 289929.0, + "repeat_count": 0.0, + "routers_loss": 0.04116598889231682, + "skip_count": 1.0, + "step": 180, + "text_loss": 0.4838573932647705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8547108893454652, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.14453125, + "learning_rate": 0.000362, + "loss": 0.0991, + "macro_f1": 0.3333333432674408, + "num_tokens": 294293.0, + "repeat_count": 0.0, + "routers_loss": 0.027111956849694252, + "skip_count": 0.0, + "step": 182, + "text_loss": 0.7495553493499756 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8641033167009099, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.158203125, + "learning_rate": 0.000366, + "loss": 0.1038, + "macro_f1": 0.3333333432674408, + "num_tokens": 297730.0, + "repeat_count": 0.0, + "routers_loss": 0.019166452810168266, + "skip_count": 0.0, + "step": 184, + "text_loss": 0.534831166267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 0.8734957440563546, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2236328125, + "learning_rate": 0.00037, + "loss": 0.0784, + "macro_f1": 0.5427350401878357, + "num_tokens": 300593.0, + "repeat_count": 1.0, + "routers_loss": 0.2349659502506256, + "skip_count": 2.0, + "step": 186, + "text_loss": 0.3549048602581024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8828881714117992, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2041015625, + "learning_rate": 0.000374, + "loss": 0.0827, + "macro_f1": 0.3076923191547394, + "num_tokens": 303456.0, + "repeat_count": 2.0, + "routers_loss": 0.22502389550209045, + "skip_count": 2.0, + "step": 188, + "text_loss": 0.8837642073631287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8922805987672439, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000378, + "loss": 0.1085, + "macro_f1": 0.3272727429866791, + "num_tokens": 306241.0, + "repeat_count": 1.0, + "routers_loss": 0.12291611731052399, + "skip_count": 0.0, + "step": 190, + "text_loss": 0.73353511095047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9016730261226886, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000382, + "loss": 0.0969, + "macro_f1": 0.3272727429866791, + "num_tokens": 310606.0, + "repeat_count": 0.0, + "routers_loss": 0.055988848209381104, + "skip_count": 1.0, + "step": 192, + "text_loss": 0.6261917352676392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9110654534781333, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.34375, + "learning_rate": 0.000386, + "loss": 0.1055, + "macro_f1": 0.3144654333591461, + "num_tokens": 313564.0, + "repeat_count": 0.0, + "routers_loss": 0.12363404780626297, + "skip_count": 3.0, + "step": 194, + "text_loss": 0.2790874242782593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9204578808335779, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.00039000000000000005, + "loss": 0.0964, + "macro_f1": 0.3076923191547394, + "num_tokens": 316958.0, + "repeat_count": 2.0, + "routers_loss": 0.2718356251716614, + "skip_count": 2.0, + "step": 196, + "text_loss": 0.14428086578845978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9298503081890226, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2021484375, + "learning_rate": 0.00039400000000000004, + "loss": 0.0917, + "macro_f1": 0.32098764181137085, + "num_tokens": 320103.0, + "repeat_count": 0.0, + "routers_loss": 0.07188102602958679, + "skip_count": 2.0, + "step": 198, + "text_loss": 0.27155816555023193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9392427355444672, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.201171875, + "learning_rate": 0.000398, + "loss": 0.0809, + "macro_f1": 0.32098764181137085, + "num_tokens": 323566.0, + "repeat_count": 1.0, + "routers_loss": 0.18038256466388702, + "skip_count": 1.0, + "step": 200, + "text_loss": 0.8453494310379028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9486351628999119, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.000402, + "loss": 0.0801, + "macro_f1": 0.3333333432674408, + "num_tokens": 326385.0, + "repeat_count": 0.0, + "routers_loss": 0.014639763161540031, + "skip_count": 0.0, + "step": 202, + "text_loss": 0.5733131766319275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9580275902553567, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21875, + "learning_rate": 0.00040600000000000006, + "loss": 0.104, + "macro_f1": 0.3333333432674408, + "num_tokens": 329266.0, + "repeat_count": 0.0, + "routers_loss": 0.015269627794623375, + "skip_count": 0.0, + "step": 204, + "text_loss": 0.7355639934539795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9674200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.00041, + "loss": 0.0833, + "macro_f1": 0.3333333432674408, + "num_tokens": 332984.0, + "repeat_count": 0.0, + "routers_loss": 0.018046971410512924, + "skip_count": 0.0, + "step": 206, + "text_loss": 0.587641179561615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.000414, + "loss": 0.0588, + "macro_f1": 0.3272727429866791, + "num_tokens": 335739.0, + "repeat_count": 1.0, + "routers_loss": 0.12791286408901215, + "skip_count": 0.0, + "step": 208, + "text_loss": 0.6538406610488892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9862048723216906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.24609375, + "learning_rate": 0.00041799999999999997, + "loss": 0.0732, + "macro_f1": 0.3272727429866791, + "num_tokens": 338966.0, + "repeat_count": 0.0, + "routers_loss": 0.050490595400333405, + "skip_count": 1.0, + "step": 210, + "text_loss": 0.4188295602798462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9955972996771353, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000422, + "loss": 0.0588, + "macro_f1": 0.3144654333591461, + "num_tokens": 342063.0, + "repeat_count": 0.0, + "routers_loss": 0.11652113497257233, + "skip_count": 3.0, + "step": 212, + "text_loss": 0.21822240948677063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0046962136777224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.000426, + "loss": 0.0621, + "macro_f1": 0.3333333432674408, + "num_tokens": 344887.0, + "repeat_count": 0.0, + "routers_loss": 0.023898238316178322, + "skip_count": 0.0, + "step": 214, + "text_loss": 0.24692800641059875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.014088641033167, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3671875, + "learning_rate": 0.00043, + "loss": 0.1005, + "macro_f1": 0.3272727429866791, + "num_tokens": 348700.0, + "repeat_count": 1.0, + "routers_loss": 0.06414655596017838, + "skip_count": 0.0, + "step": 216, + "text_loss": 0.4744548797607422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0234810683886117, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.00043400000000000003, + "loss": 0.0753, + "macro_f1": 0.32098764181137085, + "num_tokens": 351507.0, + "repeat_count": 1.0, + "routers_loss": 0.11702914535999298, + "skip_count": 1.0, + "step": 218, + "text_loss": 0.5614864826202393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0328734957440564, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000438, + "loss": 0.0792, + "macro_f1": 0.3333333432674408, + "num_tokens": 354484.0, + "repeat_count": 0.0, + "routers_loss": 0.014991643838584423, + "skip_count": 0.0, + "step": 220, + "text_loss": 0.47209832072257996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.042265923099501, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.251953125, + "learning_rate": 0.000442, + "loss": 0.106, + "macro_f1": 0.3272727429866791, + "num_tokens": 357954.0, + "repeat_count": 0.0, + "routers_loss": 0.04747112840414047, + "skip_count": 1.0, + "step": 222, + "text_loss": 0.2968728244304657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0516583504549457, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.40234375, + "learning_rate": 0.000446, + "loss": 0.0853, + "macro_f1": 0.32098764181137085, + "num_tokens": 360547.0, + "repeat_count": 0.0, + "routers_loss": 0.06754162162542343, + "skip_count": 2.0, + "step": 224, + "text_loss": 0.2364148646593094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0610507778103904, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.00045000000000000004, + "loss": 0.1016, + "macro_f1": 0.3272727429866791, + "num_tokens": 364529.0, + "repeat_count": 0.0, + "routers_loss": 0.07830183953046799, + "skip_count": 1.0, + "step": 226, + "text_loss": 0.4787476360797882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.00045400000000000003, + "loss": 0.0792, + "macro_f1": 0.3333333432674408, + "num_tokens": 367683.0, + "repeat_count": 0.0, + "routers_loss": 0.015735948458313942, + "skip_count": 0.0, + "step": 228, + "text_loss": 0.37148505449295044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.000458, + "loss": 0.0995, + "macro_f1": 0.3333333432674408, + "num_tokens": 371402.0, + "repeat_count": 0.0, + "routers_loss": 0.013354359194636345, + "skip_count": 0.0, + "step": 230, + "text_loss": 0.7464763522148132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.000462, + "loss": 0.0731, + "macro_f1": 0.3333333432674408, + "num_tokens": 374587.0, + "repeat_count": 0.0, + "routers_loss": 0.013763721100986004, + "skip_count": 0.0, + "step": 232, + "text_loss": 0.8754443526268005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.098620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3984375, + "learning_rate": 0.00046600000000000005, + "loss": 0.0861, + "macro_f1": 0.3333333432674408, + "num_tokens": 377513.0, + "repeat_count": 0.0, + "routers_loss": 0.010075435042381287, + "skip_count": 0.0, + "step": 234, + "text_loss": 0.31534913182258606 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1080129145876136, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.00047, + "loss": 0.0791, + "macro_f1": 0.3272727429866791, + "num_tokens": 380736.0, + "repeat_count": 0.0, + "routers_loss": 0.059825167059898376, + "skip_count": 1.0, + "step": 236, + "text_loss": 0.5936337113380432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1174053419430585, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000474, + "loss": 0.0514, + "macro_f1": 0.32098764181137085, + "num_tokens": 383236.0, + "repeat_count": 0.0, + "routers_loss": 0.09134846180677414, + "skip_count": 2.0, + "step": 238, + "text_loss": 0.5976157784461975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1267977692985032, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.208984375, + "learning_rate": 0.00047799999999999996, + "loss": 0.0858, + "macro_f1": 0.32098764181137085, + "num_tokens": 385778.0, + "repeat_count": 1.0, + "routers_loss": 0.11989791691303253, + "skip_count": 1.0, + "step": 240, + "text_loss": 0.3554210960865021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1361901966539478, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.000482, + "loss": 0.0734, + "macro_f1": 0.3333333432674408, + "num_tokens": 388777.0, + "repeat_count": 0.0, + "routers_loss": 0.013591105118393898, + "skip_count": 0.0, + "step": 242, + "text_loss": 0.4829460382461548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1455826240093925, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12060546875, + "learning_rate": 0.000486, + "loss": 0.0625, + "macro_f1": 0.32098764181137085, + "num_tokens": 391797.0, + "repeat_count": 0.0, + "routers_loss": 0.0920003354549408, + "skip_count": 2.0, + "step": 244, + "text_loss": 0.3085818886756897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1549750513648371, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.00049, + "loss": 0.0501, + "macro_f1": 0.3333333432674408, + "num_tokens": 396485.0, + "repeat_count": 0.0, + "routers_loss": 0.0129330949857831, + "skip_count": 0.0, + "step": 246, + "text_loss": 0.42803969979286194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1643674787202818, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.296875, + "learning_rate": 0.000494, + "loss": 0.0945, + "macro_f1": 0.3144654333591461, + "num_tokens": 399923.0, + "repeat_count": 0.0, + "routers_loss": 0.10677755624055862, + "skip_count": 3.0, + "step": 248, + "text_loss": 0.2908555567264557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1737599060757264, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.000498, + "loss": 0.0812, + "macro_f1": 0.3144654333591461, + "num_tokens": 403647.0, + "repeat_count": 0.0, + "routers_loss": 0.1504337340593338, + "skip_count": 3.0, + "step": 250, + "text_loss": 0.333095908164978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.183152333431171, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.22265625, + "learning_rate": 0.0005020000000000001, + "loss": 0.0828, + "macro_f1": 0.32098764181137085, + "num_tokens": 409147.0, + "repeat_count": 0.0, + "routers_loss": 0.06503184884786606, + "skip_count": 2.0, + "step": 252, + "text_loss": 0.16117942333221436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1925447607866158, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.287109375, + "learning_rate": 0.000506, + "loss": 0.0995, + "macro_f1": 0.3333333432674408, + "num_tokens": 412072.0, + "repeat_count": 0.0, + "routers_loss": 0.016280122101306915, + "skip_count": 0.0, + "step": 254, + "text_loss": 0.4217492640018463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2019371881420604, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.00051, + "loss": 0.0803, + "macro_f1": 0.3144654333591461, + "num_tokens": 415052.0, + "repeat_count": 2.0, + "routers_loss": 0.2117508500814438, + "skip_count": 1.0, + "step": 256, + "text_loss": 0.5795308947563171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.211329615497505, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2421875, + "learning_rate": 0.000514, + "loss": 0.0668, + "macro_f1": 0.3272727429866791, + "num_tokens": 418099.0, + "repeat_count": 1.0, + "routers_loss": 0.15002092719078064, + "skip_count": 0.0, + "step": 258, + "text_loss": 0.4840938448905945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2207220428529497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.000518, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 422526.0, + "repeat_count": 0.0, + "routers_loss": 0.012834074907004833, + "skip_count": 0.0, + "step": 260, + "text_loss": 0.36141225695610046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2301144702083944, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.000522, + "loss": 0.085, + "macro_f1": 0.3076923191547394, + "num_tokens": 425765.0, + "repeat_count": 2.0, + "routers_loss": 0.23808011412620544, + "skip_count": 2.0, + "step": 262, + "text_loss": 0.27572691440582275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2395068975638392, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000526, + "loss": 0.0708, + "macro_f1": 0.3272727429866791, + "num_tokens": 429048.0, + "repeat_count": 0.0, + "routers_loss": 0.055687375366687775, + "skip_count": 1.0, + "step": 264, + "text_loss": 0.37020301818847656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.248899324919284, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005300000000000001, + "loss": 0.0839, + "macro_f1": 0.3272727429866791, + "num_tokens": 431784.0, + "repeat_count": 0.0, + "routers_loss": 0.0872957780957222, + "skip_count": 1.0, + "step": 266, + "text_loss": 0.5937283039093018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2582917522747286, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.0005340000000000001, + "loss": 0.0733, + "macro_f1": 0.32098764181137085, + "num_tokens": 434297.0, + "repeat_count": 2.0, + "routers_loss": 0.23507654666900635, + "skip_count": 0.0, + "step": 268, + "text_loss": 0.3367372453212738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2676841796301732, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005380000000000001, + "loss": 0.0708, + "macro_f1": 0.32098764181137085, + "num_tokens": 437586.0, + "repeat_count": 0.0, + "routers_loss": 0.12860390543937683, + "skip_count": 2.0, + "step": 270, + "text_loss": 0.7149854302406311 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2770766069856179, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005420000000000001, + "loss": 0.1072, + "macro_f1": 0.3272727429866791, + "num_tokens": 440649.0, + "repeat_count": 0.0, + "routers_loss": 0.044308312237262726, + "skip_count": 1.0, + "step": 272, + "text_loss": 0.26778292655944824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2864690343410625, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.44921875, + "learning_rate": 0.000546, + "loss": 0.0938, + "macro_f1": 0.3144654333591461, + "num_tokens": 443907.0, + "repeat_count": 0.0, + "routers_loss": 0.11514109373092651, + "skip_count": 3.0, + "step": 274, + "text_loss": 0.23578761518001556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 1.2958614616965072, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2578125, + "learning_rate": 0.00055, + "loss": 0.0932, + "macro_f1": 0.5492662787437439, + "num_tokens": 447147.0, + "repeat_count": 0.0, + "routers_loss": 0.055705297738313675, + "skip_count": 2.0, + "step": 276, + "text_loss": 0.2513524889945984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3052538890519518, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.29296875, + "learning_rate": 0.000554, + "loss": 0.0667, + "macro_f1": 0.32098764181137085, + "num_tokens": 450032.0, + "repeat_count": 0.0, + "routers_loss": 0.13778971135616302, + "skip_count": 2.0, + "step": 278, + "text_loss": 0.4857243597507477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3146463164073965, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.000558, + "loss": 0.0672, + "macro_f1": 0.3272727429866791, + "num_tokens": 453195.0, + "repeat_count": 1.0, + "routers_loss": 0.0700262188911438, + "skip_count": 0.0, + "step": 280, + "text_loss": 0.7589789628982544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3240387437628411, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.0005620000000000001, + "loss": 0.0603, + "macro_f1": 0.3144654333591461, + "num_tokens": 455942.0, + "repeat_count": 1.0, + "routers_loss": 0.11706235259771347, + "skip_count": 2.0, + "step": 282, + "text_loss": 0.4783432185649872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3334311711182858, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.265625, + "learning_rate": 0.000566, + "loss": 0.0793, + "macro_f1": 0.3272727429866791, + "num_tokens": 458932.0, + "repeat_count": 0.0, + "routers_loss": 0.07073967158794403, + "skip_count": 1.0, + "step": 284, + "text_loss": 0.7117193937301636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3428235984737307, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.00057, + "loss": 0.0915, + "macro_f1": 0.3272727429866791, + "num_tokens": 462650.0, + "repeat_count": 0.0, + "routers_loss": 0.05301115661859512, + "skip_count": 1.0, + "step": 286, + "text_loss": 0.4175460636615753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.352216025829175, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000574, + "loss": 0.0675, + "macro_f1": 0.3272727429866791, + "num_tokens": 466290.0, + "repeat_count": 0.0, + "routers_loss": 0.06356479972600937, + "skip_count": 1.0, + "step": 288, + "text_loss": 0.5832946300506592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.36160845318462, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28515625, + "learning_rate": 0.000578, + "loss": 0.0805, + "macro_f1": 0.3006536066532135, + "num_tokens": 469296.0, + "repeat_count": 1.0, + "routers_loss": 0.21032999455928802, + "skip_count": 3.0, + "step": 290, + "text_loss": 0.36023473739624023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3710008805400646, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.0005819999999999999, + "loss": 0.0685, + "macro_f1": 0.32098764181137085, + "num_tokens": 472272.0, + "repeat_count": 1.0, + "routers_loss": 0.08062280714511871, + "skip_count": 1.0, + "step": 292, + "text_loss": 0.37197956442832947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3803933078955093, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0005859999999999999, + "loss": 0.0878, + "macro_f1": 0.32098764181137085, + "num_tokens": 475864.0, + "repeat_count": 0.0, + "routers_loss": 0.05023600533604622, + "skip_count": 2.0, + "step": 294, + "text_loss": 0.4765273630619049 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.389785735250954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2177734375, + "learning_rate": 0.00059, + "loss": 0.0728, + "macro_f1": 0.3333333432674408, + "num_tokens": 478916.0, + "repeat_count": 0.0, + "routers_loss": 0.011689410544931889, + "skip_count": 0.0, + "step": 296, + "text_loss": 0.5878773927688599 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3991781626063986, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000594, + "loss": 0.0727, + "macro_f1": 0.3333333432674408, + "num_tokens": 482369.0, + "repeat_count": 0.0, + "routers_loss": 0.010772093199193478, + "skip_count": 0.0, + "step": 298, + "text_loss": 0.4424116313457489 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4085705899618433, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.181640625, + "learning_rate": 0.000598, + "loss": 0.0787, + "macro_f1": 0.3076923191547394, + "num_tokens": 486049.0, + "repeat_count": 2.0, + "routers_loss": 0.23482851684093475, + "skip_count": 2.0, + "step": 300, + "text_loss": 0.21217775344848633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.417963017317288, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.000602, + "loss": 0.073, + "macro_f1": 0.3076923191547394, + "num_tokens": 488683.0, + "repeat_count": 1.0, + "routers_loss": 0.18843084573745728, + "skip_count": 3.0, + "step": 302, + "text_loss": 0.2109498232603073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4273554446727326, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.279296875, + "learning_rate": 0.000606, + "loss": 0.0945, + "macro_f1": 0.3144654333591461, + "num_tokens": 492010.0, + "repeat_count": 0.0, + "routers_loss": 0.17861786484718323, + "skip_count": 3.0, + "step": 304, + "text_loss": 0.8446305394172668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4367478720281772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.00061, + "loss": 0.0827, + "macro_f1": 0.3333333432674408, + "num_tokens": 494764.0, + "repeat_count": 0.0, + "routers_loss": 0.014124520123004913, + "skip_count": 0.0, + "step": 306, + "text_loss": 0.742735743522644 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4461402993836219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.000614, + "loss": 0.1071, + "macro_f1": 0.3333333432674408, + "num_tokens": 497820.0, + "repeat_count": 0.0, + "routers_loss": 0.017968112602829933, + "skip_count": 0.0, + "step": 308, + "text_loss": 0.28305482864379883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4555327267390665, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1689453125, + "learning_rate": 0.0006180000000000001, + "loss": 0.0775, + "macro_f1": 0.32098764181137085, + "num_tokens": 500694.0, + "repeat_count": 0.0, + "routers_loss": 0.08593655377626419, + "skip_count": 2.0, + "step": 310, + "text_loss": 0.3496848940849304 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4649251540945114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19140625, + "learning_rate": 0.000622, + "loss": 0.061, + "macro_f1": 0.3333333432674408, + "num_tokens": 503871.0, + "repeat_count": 0.0, + "routers_loss": 0.016449492424726486, + "skip_count": 0.0, + "step": 312, + "text_loss": 0.6691372990608215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4743175814499558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.000626, + "loss": 0.0815, + "macro_f1": 0.3333333432674408, + "num_tokens": 506730.0, + "repeat_count": 0.0, + "routers_loss": 0.014532964676618576, + "skip_count": 0.0, + "step": 314, + "text_loss": 0.6118118166923523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4837100088054007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2216796875, + "learning_rate": 0.00063, + "loss": 0.0742, + "macro_f1": 0.3333333432674408, + "num_tokens": 510323.0, + "repeat_count": 0.0, + "routers_loss": 0.013093139044940472, + "skip_count": 0.0, + "step": 316, + "text_loss": 0.38126271963119507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4931024361608454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.400390625, + "learning_rate": 0.000634, + "loss": 0.0915, + "macro_f1": 0.3333333432674408, + "num_tokens": 514075.0, + "repeat_count": 0.0, + "routers_loss": 0.008627045899629593, + "skip_count": 0.0, + "step": 318, + "text_loss": 0.5983037948608398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000638, + "loss": 0.1008, + "macro_f1": 0.3272727429866791, + "num_tokens": 517418.0, + "repeat_count": 0.0, + "routers_loss": 0.04561378434300423, + "skip_count": 1.0, + "step": 320, + "text_loss": 0.767257034778595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.5118872908717347, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.259765625, + "learning_rate": 0.000642, + "loss": 0.0926, + "macro_f1": 0.3272727429866791, + "num_tokens": 520443.0, + "repeat_count": 0.0, + "routers_loss": 0.024372953921556473, + "skip_count": 0.0, + "step": 322, + "text_loss": 0.6572105884552002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5212797182271793, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30078125, + "learning_rate": 0.000646, + "loss": 0.0822, + "macro_f1": 0.3272727429866791, + "num_tokens": 523317.0, + "repeat_count": 1.0, + "routers_loss": 0.08099937438964844, + "skip_count": 0.0, + "step": 324, + "text_loss": 0.205499529838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.530672145582624, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.0006500000000000001, + "loss": 0.0809, + "macro_f1": 0.32098767161369324, + "num_tokens": 526355.0, + "repeat_count": 0.0, + "routers_loss": 0.0657225176692009, + "skip_count": 1.0, + "step": 326, + "text_loss": 0.2587239742279053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5400645729380686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.111328125, + "learning_rate": 0.0006540000000000001, + "loss": 0.0779, + "macro_f1": 0.3333333432674408, + "num_tokens": 529689.0, + "repeat_count": 0.0, + "routers_loss": 0.01849208027124405, + "skip_count": 0.0, + "step": 328, + "text_loss": 0.2172023057937622 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5494570002935135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1845703125, + "learning_rate": 0.0006580000000000001, + "loss": 0.0758, + "macro_f1": 0.3333333432674408, + "num_tokens": 532603.0, + "repeat_count": 0.0, + "routers_loss": 0.016184113919734955, + "skip_count": 0.0, + "step": 330, + "text_loss": 0.5980568528175354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.000662, + "loss": 0.0439, + "macro_f1": 0.3333333432674408, + "num_tokens": 536056.0, + "repeat_count": 0.0, + "routers_loss": 0.01303898449987173, + "skip_count": 0.0, + "step": 332, + "text_loss": 0.5421966314315796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 1.5682418550044028, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.296875, + "learning_rate": 0.000666, + "loss": 0.0963, + "macro_f1": 0.465986430644989, + "num_tokens": 539231.0, + "repeat_count": 3.0, + "routers_loss": 0.3075675964355469, + "skip_count": 3.0, + "step": 334, + "text_loss": 0.19719554483890533 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5776342823598473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.00067, + "loss": 0.0706, + "macro_f1": 0.3333333432674408, + "num_tokens": 542038.0, + "repeat_count": 0.0, + "routers_loss": 0.009116224013268948, + "skip_count": 0.0, + "step": 336, + "text_loss": 0.3407036066055298 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5870267097152921, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2421875, + "learning_rate": 0.000674, + "loss": 0.0768, + "macro_f1": 0.3333333432674408, + "num_tokens": 545019.0, + "repeat_count": 0.0, + "routers_loss": 0.021463042125105858, + "skip_count": 0.0, + "step": 338, + "text_loss": 0.24486012756824493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5964191370707366, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.0006780000000000001, + "loss": 0.0889, + "macro_f1": 0.3333333432674408, + "num_tokens": 548036.0, + "repeat_count": 0.0, + "routers_loss": 0.01857556402683258, + "skip_count": 0.0, + "step": 340, + "text_loss": 0.28140124678611755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6058115644261814, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0006820000000000001, + "loss": 0.0617, + "macro_f1": 0.3006536364555359, + "num_tokens": 551419.0, + "repeat_count": 2.0, + "routers_loss": 0.27090007066726685, + "skip_count": 3.0, + "step": 342, + "text_loss": 0.20690307021141052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.615203991781626, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3046875, + "learning_rate": 0.0006860000000000001, + "loss": 0.1047, + "macro_f1": 0.32098764181137085, + "num_tokens": 554037.0, + "repeat_count": 0.0, + "routers_loss": 0.09231195598840714, + "skip_count": 2.0, + "step": 344, + "text_loss": 0.4479128420352936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6245964191370708, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.00069, + "loss": 0.0883, + "macro_f1": 0.3333333432674408, + "num_tokens": 556672.0, + "repeat_count": 0.0, + "routers_loss": 0.00935924518853426, + "skip_count": 0.0, + "step": 346, + "text_loss": 0.6377320289611816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6339888464925154, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.000694, + "loss": 0.0781, + "macro_f1": 0.32098764181137085, + "num_tokens": 559756.0, + "repeat_count": 0.0, + "routers_loss": 0.17641772329807281, + "skip_count": 2.0, + "step": 348, + "text_loss": 0.6097636222839355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 1.64338127384796, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.30078125, + "learning_rate": 0.0006979999999999999, + "loss": 0.0616, + "macro_f1": 0.5492662787437439, + "num_tokens": 563415.0, + "repeat_count": 0.0, + "routers_loss": 0.06240406632423401, + "skip_count": 2.0, + "step": 350, + "text_loss": 0.5291631817817688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6527737012034047, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.296875, + "learning_rate": 0.0007019999999999999, + "loss": 0.1026, + "macro_f1": 0.3333333432674408, + "num_tokens": 566357.0, + "repeat_count": 0.0, + "routers_loss": 0.012269247323274612, + "skip_count": 0.0, + "step": 352, + "text_loss": 0.5170195698738098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6621661285588494, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0007059999999999999, + "loss": 0.0815, + "macro_f1": 0.32098764181137085, + "num_tokens": 569449.0, + "repeat_count": 0.0, + "routers_loss": 0.07515309751033783, + "skip_count": 2.0, + "step": 354, + "text_loss": 0.34507250785827637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6715585559142943, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.00071, + "loss": 0.0791, + "macro_f1": 0.3144654333591461, + "num_tokens": 572761.0, + "repeat_count": 1.0, + "routers_loss": 0.20768006145954132, + "skip_count": 2.0, + "step": 356, + "text_loss": 0.3158532381057739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6809509832697387, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.000714, + "loss": 0.0682, + "macro_f1": 0.3333333432674408, + "num_tokens": 575909.0, + "repeat_count": 0.0, + "routers_loss": 0.025329967960715294, + "skip_count": 0.0, + "step": 358, + "text_loss": 0.21455390751361847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.6903434106251836, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.000718, + "loss": 0.0775, + "macro_f1": 0.32098767161369324, + "num_tokens": 579186.0, + "repeat_count": 1.0, + "routers_loss": 0.07676175981760025, + "skip_count": 0.0, + "step": 360, + "text_loss": 0.61895352602005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.699735837980628, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000722, + "loss": 0.0781, + "macro_f1": 0.32098767161369324, + "num_tokens": 582437.0, + "repeat_count": 0.0, + "routers_loss": 0.08070661872625351, + "skip_count": 1.0, + "step": 362, + "text_loss": 0.20557661354541779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7091282653360729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2216796875, + "learning_rate": 0.000726, + "loss": 0.11, + "macro_f1": 0.3333333432674408, + "num_tokens": 586096.0, + "repeat_count": 0.0, + "routers_loss": 0.015891313552856445, + "skip_count": 0.0, + "step": 364, + "text_loss": 0.597991943359375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7185206926915173, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.00073, + "loss": 0.0573, + "macro_f1": 0.3076923191547394, + "num_tokens": 589520.0, + "repeat_count": 1.0, + "routers_loss": 0.12844261527061462, + "skip_count": 3.0, + "step": 366, + "text_loss": 0.2944789230823517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7279131200469622, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.000734, + "loss": 0.1005, + "macro_f1": 0.3333333432674408, + "num_tokens": 592691.0, + "repeat_count": 0.0, + "routers_loss": 0.02382199838757515, + "skip_count": 0.0, + "step": 368, + "text_loss": 0.23989969491958618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7373055474024068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.000738, + "loss": 0.0661, + "macro_f1": 0.3333333432674408, + "num_tokens": 596004.0, + "repeat_count": 0.0, + "routers_loss": 0.018812084570527077, + "skip_count": 0.0, + "step": 370, + "text_loss": 0.22111408412456512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7466979747578515, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.000742, + "loss": 0.0666, + "macro_f1": 0.3272727429866791, + "num_tokens": 599087.0, + "repeat_count": 0.0, + "routers_loss": 0.08290331065654755, + "skip_count": 1.0, + "step": 372, + "text_loss": 0.2567356526851654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7560904021132961, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.000746, + "loss": 0.0941, + "macro_f1": 0.32098764181137085, + "num_tokens": 602330.0, + "repeat_count": 1.0, + "routers_loss": 0.11482042074203491, + "skip_count": 1.0, + "step": 374, + "text_loss": 0.7217292785644531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7654828294687408, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2265625, + "learning_rate": 0.00075, + "loss": 0.0728, + "macro_f1": 0.3272727429866791, + "num_tokens": 605503.0, + "repeat_count": 1.0, + "routers_loss": 0.11849870532751083, + "skip_count": 0.0, + "step": 376, + "text_loss": 0.5122153759002686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.7748752568241855, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2333984375, + "learning_rate": 0.000754, + "loss": 0.0835, + "macro_f1": 0.32098767161369324, + "num_tokens": 608505.0, + "repeat_count": 0.0, + "routers_loss": 0.07090992480516434, + "skip_count": 1.0, + "step": 378, + "text_loss": 0.2204965502023697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.78426768417963, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.000758, + "loss": 0.0794, + "macro_f1": 0.3272727429866791, + "num_tokens": 611193.0, + "repeat_count": 0.0, + "routers_loss": 0.03812089189887047, + "skip_count": 1.0, + "step": 380, + "text_loss": 0.44909021258354187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.793660111535075, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1689453125, + "learning_rate": 0.000762, + "loss": 0.0882, + "macro_f1": 0.3272727429866791, + "num_tokens": 614231.0, + "repeat_count": 1.0, + "routers_loss": 0.10270529240369797, + "skip_count": 0.0, + "step": 382, + "text_loss": 0.13624964654445648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8030525388905194, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.330078125, + "learning_rate": 0.0007660000000000001, + "loss": 0.1107, + "macro_f1": 0.32098764181137085, + "num_tokens": 617090.0, + "repeat_count": 1.0, + "routers_loss": 0.11624004691839218, + "skip_count": 1.0, + "step": 384, + "text_loss": 0.7314052581787109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8124449662459643, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0007700000000000001, + "loss": 0.0628, + "macro_f1": 0.32098764181137085, + "num_tokens": 620596.0, + "repeat_count": 0.0, + "routers_loss": 0.07114322483539581, + "skip_count": 2.0, + "step": 386, + "text_loss": 0.503322958946228 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8218373936014087, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.0007740000000000001, + "loss": 0.0829, + "macro_f1": 0.32098764181137085, + "num_tokens": 624108.0, + "repeat_count": 0.0, + "routers_loss": 0.06061873584985733, + "skip_count": 2.0, + "step": 388, + "text_loss": 0.11481904983520508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8312298209568536, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2099609375, + "learning_rate": 0.000778, + "loss": 0.0791, + "macro_f1": 0.3006536364555359, + "num_tokens": 626895.0, + "repeat_count": 1.0, + "routers_loss": 0.2921771705150604, + "skip_count": 4.0, + "step": 390, + "text_loss": 0.3069624602794647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8406222483122983, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30859375, + "learning_rate": 0.000782, + "loss": 0.0605, + "macro_f1": 0.3076923191547394, + "num_tokens": 630204.0, + "repeat_count": 0.0, + "routers_loss": 0.202707901597023, + "skip_count": 4.0, + "step": 392, + "text_loss": 0.6022785305976868 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.29296875, + "learning_rate": 0.000786, + "loss": 0.0877, + "macro_f1": 0.3333333432674408, + "num_tokens": 634373.0, + "repeat_count": 0.0, + "routers_loss": 0.0221510399132967, + "skip_count": 0.0, + "step": 394, + "text_loss": 0.26787394285202026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8594071030231876, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.37890625, + "learning_rate": 0.00079, + "loss": 0.0805, + "macro_f1": 0.32098764181137085, + "num_tokens": 637442.0, + "repeat_count": 2.0, + "routers_loss": 0.12636390328407288, + "skip_count": 0.0, + "step": 396, + "text_loss": 0.2799781560897827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8687995303786322, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.0007940000000000001, + "loss": 0.0724, + "macro_f1": 0.32098764181137085, + "num_tokens": 641231.0, + "repeat_count": 0.0, + "routers_loss": 0.07933453470468521, + "skip_count": 2.0, + "step": 398, + "text_loss": 0.2507784366607666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8781919577340769, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.0007980000000000001, + "loss": 0.0909, + "macro_f1": 0.3272727429866791, + "num_tokens": 644560.0, + "repeat_count": 1.0, + "routers_loss": 0.10324911028146744, + "skip_count": 0.0, + "step": 400, + "text_loss": 0.7756280303001404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8875843850895215, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0008020000000000001, + "loss": 0.0783, + "macro_f1": 0.3144654333591461, + "num_tokens": 647393.0, + "repeat_count": 1.0, + "routers_loss": 0.18546262383460999, + "skip_count": 2.0, + "step": 402, + "text_loss": 0.5013328194618225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8969768124449664, + "f1_execute": 0.8571428656578064, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0008060000000000001, + "loss": 0.0787, + "macro_f1": 0.2857142984867096, + "num_tokens": 650355.0, + "repeat_count": 3.0, + "routers_loss": 0.3280293643474579, + "skip_count": 4.0, + "step": 404, + "text_loss": 0.2842077314853668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9063692398004108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.0008100000000000001, + "loss": 0.0901, + "macro_f1": 0.3333333432674408, + "num_tokens": 654280.0, + "repeat_count": 0.0, + "routers_loss": 0.02623247355222702, + "skip_count": 0.0, + "step": 406, + "text_loss": 0.46742817759513855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9157616671558557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.0008139999999999999, + "loss": 0.0945, + "macro_f1": 0.3333333432674408, + "num_tokens": 657568.0, + "repeat_count": 0.0, + "routers_loss": 0.009744114242494106, + "skip_count": 0.0, + "step": 408, + "text_loss": 0.7168047428131104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9251540945113002, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.0008179999999999999, + "loss": 0.1065, + "macro_f1": 0.32098764181137085, + "num_tokens": 660593.0, + "repeat_count": 0.0, + "routers_loss": 0.07591600716114044, + "skip_count": 2.0, + "step": 410, + "text_loss": 0.449823260307312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0008219999999999999, + "loss": 0.0795, + "macro_f1": 0.3333333432674408, + "num_tokens": 663916.0, + "repeat_count": 0.0, + "routers_loss": 0.02076602540910244, + "skip_count": 0.0, + "step": 412, + "text_loss": 0.4764713943004608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9439389492221895, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.000826, + "loss": 0.0836, + "macro_f1": 0.3272727429866791, + "num_tokens": 667502.0, + "repeat_count": 0.0, + "routers_loss": 0.049170155078172684, + "skip_count": 1.0, + "step": 414, + "text_loss": 0.30333325266838074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9533313765776343, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.00083, + "loss": 0.1021, + "macro_f1": 0.3272727429866791, + "num_tokens": 670510.0, + "repeat_count": 1.0, + "routers_loss": 0.15554003417491913, + "skip_count": 0.0, + "step": 416, + "text_loss": 0.3691870868206024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000834, + "loss": 0.1013, + "macro_f1": 0.3333333432674408, + "num_tokens": 674761.0, + "repeat_count": 0.0, + "routers_loss": 0.024516675621271133, + "skip_count": 0.0, + "step": 418, + "text_loss": 0.32850381731987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9721162312885236, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.000838, + "loss": 0.0649, + "macro_f1": 0.3333333432674408, + "num_tokens": 678055.0, + "repeat_count": 0.0, + "routers_loss": 0.011026890948414803, + "skip_count": 0.0, + "step": 420, + "text_loss": 0.6637290716171265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9815086586439683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000842, + "loss": 0.0771, + "macro_f1": 0.3272727429866791, + "num_tokens": 680979.0, + "repeat_count": 0.0, + "routers_loss": 0.07451887428760529, + "skip_count": 1.0, + "step": 422, + "text_loss": 0.27131685614585876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.990901085999413, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000846, + "loss": 0.0714, + "macro_f1": 0.32098764181137085, + "num_tokens": 684144.0, + "repeat_count": 1.0, + "routers_loss": 0.11341800540685654, + "skip_count": 1.0, + "step": 424, + "text_loss": 0.652126669883728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.00085, + "loss": 0.0754, + "macro_f1": 0.3272727429866791, + "num_tokens": 687004.0, + "repeat_count": 1.0, + "routers_loss": 0.08985847979784012, + "skip_count": 0.0, + "step": 426, + "text_loss": 0.2589428424835205 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23828125, + "learning_rate": 0.000854, + "loss": 0.0866, + "macro_f1": 0.3333333432674408, + "num_tokens": 689702.0, + "repeat_count": 0.0, + "routers_loss": 0.011355436407029629, + "skip_count": 0.0, + "step": 428, + "text_loss": 0.8909716010093689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0187848547108893, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.000858, + "loss": 0.0623, + "macro_f1": 0.3333333432674408, + "num_tokens": 692698.0, + "repeat_count": 0.0, + "routers_loss": 0.013788948766887188, + "skip_count": 0.0, + "step": 430, + "text_loss": 0.19141142070293427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.028177282066334, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.000862, + "loss": 0.0499, + "macro_f1": 0.32098764181137085, + "num_tokens": 696007.0, + "repeat_count": 0.0, + "routers_loss": 0.07998392730951309, + "skip_count": 2.0, + "step": 432, + "text_loss": 0.1611809879541397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0375697094217786, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.000866, + "loss": 0.0541, + "macro_f1": 0.32098764181137085, + "num_tokens": 700271.0, + "repeat_count": 0.0, + "routers_loss": 0.06988382339477539, + "skip_count": 2.0, + "step": 434, + "text_loss": 0.37254223227500916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0469621367772235, + "f1_execute": 0.8333333730697632, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.00087, + "loss": 0.0834, + "macro_f1": 0.2777777910232544, + "num_tokens": 703519.0, + "repeat_count": 3.0, + "routers_loss": 0.28240787982940674, + "skip_count": 5.0, + "step": 436, + "text_loss": 0.29636648297309875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.056354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.423828125, + "learning_rate": 0.000874, + "loss": 0.0657, + "macro_f1": 0.3333333432674408, + "num_tokens": 706826.0, + "repeat_count": 0.0, + "routers_loss": 0.013924967497587204, + "skip_count": 0.0, + "step": 438, + "text_loss": 0.20867908000946045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.065746991488113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000878, + "loss": 0.0657, + "macro_f1": 0.3333333432674408, + "num_tokens": 710530.0, + "repeat_count": 0.0, + "routers_loss": 0.01170142088085413, + "skip_count": 0.0, + "step": 440, + "text_loss": 0.7273373007774353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.000882, + "loss": 0.076, + "macro_f1": 0.3333333432674408, + "num_tokens": 713503.0, + "repeat_count": 0.0, + "routers_loss": 0.011930872686207294, + "skip_count": 0.0, + "step": 442, + "text_loss": 0.39314430952072144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.084531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0008860000000000001, + "loss": 0.0592, + "macro_f1": 0.3333333432674408, + "num_tokens": 716582.0, + "repeat_count": 0.0, + "routers_loss": 0.008630385622382164, + "skip_count": 0.0, + "step": 444, + "text_loss": 0.5925271511077881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.0939242735544465, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0008900000000000001, + "loss": 0.0811, + "macro_f1": 0.3006536066532135, + "num_tokens": 719941.0, + "repeat_count": 3.0, + "routers_loss": 0.3015584945678711, + "skip_count": 1.0, + "step": 446, + "text_loss": 0.5059905052185059 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.1033167009098914, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.000894, + "loss": 0.0822, + "macro_f1": 0.31446540355682373, + "num_tokens": 723113.0, + "repeat_count": 1.0, + "routers_loss": 0.10897493362426758, + "skip_count": 1.0, + "step": 448, + "text_loss": 0.19616436958312988 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.112709128265336, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.33984375, + "learning_rate": 0.000898, + "loss": 0.0782, + "macro_f1": 0.32098764181137085, + "num_tokens": 726193.0, + "repeat_count": 0.0, + "routers_loss": 0.07236456125974655, + "skip_count": 2.0, + "step": 450, + "text_loss": 0.1773054152727127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1221015556207807, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3203125, + "learning_rate": 0.000902, + "loss": 0.058, + "macro_f1": 0.3272727429866791, + "num_tokens": 729275.0, + "repeat_count": 1.0, + "routers_loss": 0.08184371143579483, + "skip_count": 0.0, + "step": 452, + "text_loss": 0.4927310049533844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1314939829762256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.000906, + "loss": 0.0607, + "macro_f1": 0.3333333432674408, + "num_tokens": 731948.0, + "repeat_count": 0.0, + "routers_loss": 0.014033539220690727, + "skip_count": 0.0, + "step": 454, + "text_loss": 0.4745742678642273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.00091, + "loss": 0.0651, + "macro_f1": 0.3333333432674408, + "num_tokens": 735351.0, + "repeat_count": 0.0, + "routers_loss": 0.0071774693205952644, + "skip_count": 0.0, + "step": 456, + "text_loss": 0.18523462116718292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.150278837687115, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.400390625, + "learning_rate": 0.0009140000000000001, + "loss": 0.0738, + "macro_f1": 0.5492662787437439, + "num_tokens": 738587.0, + "repeat_count": 0.0, + "routers_loss": 0.07781517505645752, + "skip_count": 2.0, + "step": 458, + "text_loss": 0.3459635376930237 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 2.1596712650425594, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0009180000000000001, + "loss": 0.0723, + "macro_f1": 0.3076923191547394, + "num_tokens": 741779.0, + "repeat_count": 0.0, + "routers_loss": 0.09529037028551102, + "skip_count": 2.0, + "step": 460, + "text_loss": 0.20197433233261108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1690636923980042, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.0009220000000000001, + "loss": 0.0519, + "macro_f1": 0.3333333432674408, + "num_tokens": 745355.0, + "repeat_count": 0.0, + "routers_loss": 0.009765669703483582, + "skip_count": 0.0, + "step": 462, + "text_loss": 0.7031404376029968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1784561197534487, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009260000000000001, + "loss": 0.0527, + "macro_f1": 0.3272727429866791, + "num_tokens": 748628.0, + "repeat_count": 0.0, + "routers_loss": 0.03344850242137909, + "skip_count": 1.0, + "step": 464, + "text_loss": 0.21274663507938385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1878485471088935, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.00093, + "loss": 0.0534, + "macro_f1": 0.3076923191547394, + "num_tokens": 751472.0, + "repeat_count": 2.0, + "routers_loss": 0.1354292333126068, + "skip_count": 2.0, + "step": 466, + "text_loss": 0.5350717306137085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.197240974464338, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.000934, + "loss": 0.0598, + "macro_f1": 0.3272727429866791, + "num_tokens": 754479.0, + "repeat_count": 0.0, + "routers_loss": 0.056420840322971344, + "skip_count": 1.0, + "step": 468, + "text_loss": 0.28153330087661743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.206633401819783, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.234375, + "learning_rate": 0.0009379999999999999, + "loss": 0.0597, + "macro_f1": 0.31446540355682373, + "num_tokens": 757872.0, + "repeat_count": 1.0, + "routers_loss": 0.1622387170791626, + "skip_count": 1.0, + "step": 470, + "text_loss": 0.22956843674182892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2160258291752273, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.5, + "learning_rate": 0.000942, + "loss": 0.0953, + "macro_f1": 0.32098764181137085, + "num_tokens": 760468.0, + "repeat_count": 0.0, + "routers_loss": 0.05146972835063934, + "skip_count": 2.0, + "step": 472, + "text_loss": 0.4513966739177704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.225418256530672, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000946, + "loss": 0.0592, + "macro_f1": 0.3272727429866791, + "num_tokens": 763519.0, + "repeat_count": 1.0, + "routers_loss": 0.09022669494152069, + "skip_count": 0.0, + "step": 474, + "text_loss": 0.25758957862854004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.234810683886117, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.00095, + "loss": 0.0498, + "macro_f1": 0.3272727429866791, + "num_tokens": 767391.0, + "repeat_count": 0.0, + "routers_loss": 0.03044828027486801, + "skip_count": 1.0, + "step": 476, + "text_loss": 0.21366681158542633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2442031112415615, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.291015625, + "learning_rate": 0.000954, + "loss": 0.0802, + "macro_f1": 0.3272727429866791, + "num_tokens": 770338.0, + "repeat_count": 0.0, + "routers_loss": 0.10397060960531235, + "skip_count": 1.0, + "step": 478, + "text_loss": 1.0396177768707275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.2535955385970063, + "f1_execute": 0.8571429252624512, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000958, + "loss": 0.1099, + "macro_f1": 0.285714328289032, + "num_tokens": 773699.0, + "repeat_count": 2.0, + "routers_loss": 0.22604143619537354, + "skip_count": 4.0, + "step": 480, + "text_loss": 0.2570283114910126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.2629879659524508, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.146484375, + "learning_rate": 0.000962, + "loss": 0.0667, + "macro_f1": 0.32098767161369324, + "num_tokens": 777473.0, + "repeat_count": 0.0, + "routers_loss": 0.048258859664201736, + "skip_count": 1.0, + "step": 482, + "text_loss": 0.2540103495121002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2723803933078957, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000966, + "loss": 0.0592, + "macro_f1": 0.3333333432674408, + "num_tokens": 780833.0, + "repeat_count": 0.0, + "routers_loss": 0.023018671199679375, + "skip_count": 0.0, + "step": 484, + "text_loss": 0.38524550199508667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.28177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.314453125, + "learning_rate": 0.0009699999999999999, + "loss": 0.0709, + "macro_f1": 0.3272727429866791, + "num_tokens": 783656.0, + "repeat_count": 0.0, + "routers_loss": 0.044845327734947205, + "skip_count": 1.0, + "step": 486, + "text_loss": 0.5859048366546631 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000974, + "loss": 0.0615, + "macro_f1": 0.3333333432674408, + "num_tokens": 787173.0, + "repeat_count": 0.0, + "routers_loss": 0.010898692533373833, + "skip_count": 0.0, + "step": 488, + "text_loss": 0.3456067442893982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3005576753742294, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000978, + "loss": 0.0796, + "macro_f1": 0.32098764181137085, + "num_tokens": 790395.0, + "repeat_count": 0.0, + "routers_loss": 0.06497956812381744, + "skip_count": 2.0, + "step": 490, + "text_loss": 0.3751123249530792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3099501027296743, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000982, + "loss": 0.0772, + "macro_f1": 0.3272727429866791, + "num_tokens": 793137.0, + "repeat_count": 0.0, + "routers_loss": 0.07763728499412537, + "skip_count": 1.0, + "step": 492, + "text_loss": 0.43296709656715393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3193425300851187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009860000000000001, + "loss": 0.0819, + "macro_f1": 0.3333333432674408, + "num_tokens": 796497.0, + "repeat_count": 0.0, + "routers_loss": 0.02127906307578087, + "skip_count": 0.0, + "step": 494, + "text_loss": 0.4841311275959015 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3287349574405636, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.00099, + "loss": 0.073, + "macro_f1": 0.3272727429866791, + "num_tokens": 799361.0, + "repeat_count": 1.0, + "routers_loss": 0.09518691152334213, + "skip_count": 0.0, + "step": 496, + "text_loss": 0.5094487071037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.3381273847960085, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.130859375, + "learning_rate": 0.000994, + "loss": 0.0789, + "macro_f1": 0.5492662787437439, + "num_tokens": 802629.0, + "repeat_count": 0.0, + "routers_loss": 0.0563947930932045, + "skip_count": 2.0, + "step": 498, + "text_loss": 0.42783617973327637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.347519812151453, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.000998, + "loss": 0.0476, + "macro_f1": 0.3272727429866791, + "num_tokens": 805881.0, + "repeat_count": 1.0, + "routers_loss": 0.10570426285266876, + "skip_count": 0.0, + "step": 500, + "text_loss": 0.28395503759384155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.3569122395068973, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009999999760498814, + "loss": 0.0849, + "macro_f1": 0.5492662787437439, + "num_tokens": 809283.0, + "repeat_count": 0.0, + "routers_loss": 0.031202208250761032, + "skip_count": 2.0, + "step": 502, + "text_loss": 0.32970911264419556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.366304666862342, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009999997844489475, + "loss": 0.0574, + "macro_f1": 0.3272727429866791, + "num_tokens": 812440.0, + "repeat_count": 0.0, + "routers_loss": 0.07647835463285446, + "skip_count": 1.0, + "step": 504, + "text_loss": 0.4901447296142578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.375697094217787, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.000999999401247153, + "loss": 0.0668, + "macro_f1": 0.32098764181137085, + "num_tokens": 815716.0, + "repeat_count": 0.0, + "routers_loss": 0.08515176922082901, + "skip_count": 2.0, + "step": 506, + "text_loss": 0.6157599687576294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.0009999988264446445, + "loss": 0.0686, + "macro_f1": 0.3333333432674408, + "num_tokens": 819086.0, + "repeat_count": 0.0, + "routers_loss": 0.00946938619017601, + "skip_count": 0.0, + "step": 508, + "text_loss": 0.5053519010543823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3944819489286764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009999980600416424, + "loss": 0.0574, + "macro_f1": 0.3333333432674408, + "num_tokens": 822268.0, + "repeat_count": 0.0, + "routers_loss": 0.01058756373822689, + "skip_count": 0.0, + "step": 510, + "text_loss": 0.5570021867752075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.403874376284121, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.000999997102038441, + "loss": 0.0678, + "macro_f1": 0.3333333432674408, + "num_tokens": 825728.0, + "repeat_count": 0.0, + "routers_loss": 0.008705209009349346, + "skip_count": 0.0, + "step": 512, + "text_loss": 0.6519040465354919 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4132668036395657, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.0009999959524354064, + "loss": 0.083, + "macro_f1": 0.3272727429866791, + "num_tokens": 829459.0, + "repeat_count": 0.0, + "routers_loss": 0.04024193435907364, + "skip_count": 1.0, + "step": 514, + "text_loss": 0.5290043950080872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.00099999461123298, + "loss": 0.0727, + "macro_f1": 0.3333333432674408, + "num_tokens": 832291.0, + "repeat_count": 0.0, + "routers_loss": 0.015742862597107887, + "skip_count": 0.0, + "step": 516, + "text_loss": 0.7910057902336121 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.432051658350455, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.000999993078431675, + "loss": 0.0759, + "macro_f1": 0.3076923191547394, + "num_tokens": 835399.0, + "repeat_count": 1.0, + "routers_loss": 0.16753782331943512, + "skip_count": 3.0, + "step": 518, + "text_loss": 0.45196083188056946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.4414440857058994, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.236328125, + "learning_rate": 0.0009999913540320792, + "loss": 0.0968, + "macro_f1": 0.31446540355682373, + "num_tokens": 838993.0, + "repeat_count": 0.0, + "routers_loss": 0.09357143193483353, + "skip_count": 2.0, + "step": 520, + "text_loss": 0.5499435663223267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.4508365130613443, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2451171875, + "learning_rate": 0.0009999894380348536, + "loss": 0.0821, + "macro_f1": 0.5492662787437439, + "num_tokens": 842652.0, + "repeat_count": 0.0, + "routers_loss": 0.056803856045007706, + "skip_count": 2.0, + "step": 522, + "text_loss": 0.197520449757576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.4602289404167887, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.2333984375, + "learning_rate": 0.000999987330440732, + "loss": 0.0725, + "macro_f1": 0.4871794879436493, + "num_tokens": 847061.0, + "repeat_count": 0.0, + "routers_loss": 0.08962195366621017, + "skip_count": 3.0, + "step": 524, + "text_loss": 0.27509039640426636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4696213677722336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000999985031250522, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 850780.0, + "repeat_count": 0.0, + "routers_loss": 0.022930558770895004, + "skip_count": 0.0, + "step": 526, + "text_loss": 0.13291706144809723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4790137951276785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.0009999825404651053, + "loss": 0.0614, + "macro_f1": 0.3333333432674408, + "num_tokens": 853886.0, + "repeat_count": 0.0, + "routers_loss": 0.017097990959882736, + "skip_count": 0.0, + "step": 528, + "text_loss": 0.21706295013427734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.0009999798580854356, + "loss": 0.0724, + "macro_f1": 0.3333333432674408, + "num_tokens": 857364.0, + "repeat_count": 0.0, + "routers_loss": 0.02831801027059555, + "skip_count": 0.0, + "step": 530, + "text_loss": 0.9035662412643433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.000999976984112541, + "loss": 0.0674, + "macro_f1": 0.3333333432674408, + "num_tokens": 860661.0, + "repeat_count": 0.0, + "routers_loss": 0.019671892747282982, + "skip_count": 0.0, + "step": 532, + "text_loss": 0.8354863524436951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.5071910771940122, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.2890625, + "learning_rate": 0.0009999739185475231, + "loss": 0.0963, + "macro_f1": 0.47333335876464844, + "num_tokens": 864124.0, + "repeat_count": 2.0, + "routers_loss": 0.21383361518383026, + "skip_count": 3.0, + "step": 534, + "text_loss": 0.23422949016094208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.516583504549457, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0009999706613915565, + "loss": 0.0598, + "macro_f1": 0.32098767161369324, + "num_tokens": 866976.0, + "repeat_count": 0.0, + "routers_loss": 0.07158871740102768, + "skip_count": 1.0, + "step": 536, + "text_loss": 0.11800774186849594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5259759319049016, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.0009999672126458894, + "loss": 0.0822, + "macro_f1": 0.3272727429866791, + "num_tokens": 870549.0, + "repeat_count": 0.0, + "routers_loss": 0.08185924589633942, + "skip_count": 1.0, + "step": 538, + "text_loss": 0.19232480227947235 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5353683592603464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.000999963572311843, + "loss": 0.0604, + "macro_f1": 0.3333333432674408, + "num_tokens": 873733.0, + "repeat_count": 0.0, + "routers_loss": 0.01633382774889469, + "skip_count": 0.0, + "step": 540, + "text_loss": 0.3725031912326813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.544760786615791, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009999597403908128, + "loss": 0.0761, + "macro_f1": 0.3272727429866791, + "num_tokens": 877099.0, + "repeat_count": 0.0, + "routers_loss": 0.0782657191157341, + "skip_count": 1.0, + "step": 542, + "text_loss": 0.17589199542999268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.5541532139712357, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, + "learning_rate": 0.0009999557168842669, + "loss": 0.0716, + "macro_f1": 0.5492662787437439, + "num_tokens": 879883.0, + "repeat_count": 0.0, + "routers_loss": 0.05275818333029747, + "skip_count": 2.0, + "step": 544, + "text_loss": 0.26448264718055725 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.56354564132668, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0009999515017937468, + "loss": 0.071, + "macro_f1": 0.32098764181137085, + "num_tokens": 882223.0, + "repeat_count": 0.0, + "routers_loss": 0.09335892647504807, + "skip_count": 2.0, + "step": 546, + "text_loss": 0.208544060587883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.572938068682125, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.376953125, + "learning_rate": 0.0009999470951208684, + "loss": 0.0855, + "macro_f1": 0.32098764181137085, + "num_tokens": 885241.0, + "repeat_count": 2.0, + "routers_loss": 0.22983254492282867, + "skip_count": 0.0, + "step": 548, + "text_loss": 0.6612338423728943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.58233049603757, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.00099994249686732, + "loss": 0.0786, + "macro_f1": 0.3272727429866791, + "num_tokens": 887897.0, + "repeat_count": 1.0, + "routers_loss": 0.12858282029628754, + "skip_count": 0.0, + "step": 550, + "text_loss": 0.4673548936843872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5917229233930144, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009999377070348638, + "loss": 0.0944, + "macro_f1": 0.3333333432674408, + "num_tokens": 891224.0, + "repeat_count": 0.0, + "routers_loss": 0.017421770840883255, + "skip_count": 0.0, + "step": 552, + "text_loss": 0.6419258117675781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.601115350748459, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000999932725625335, + "loss": 0.0791, + "macro_f1": 0.32098764181137085, + "num_tokens": 894578.0, + "repeat_count": 0.0, + "routers_loss": 0.07890026271343231, + "skip_count": 2.0, + "step": 554, + "text_loss": 0.5970752239227295 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.6105077781039037, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.0009999275526406427, + "loss": 0.0796, + "macro_f1": 0.31446540355682373, + "num_tokens": 897145.0, + "repeat_count": 1.0, + "routers_loss": 0.09836960583925247, + "skip_count": 1.0, + "step": 556, + "text_loss": 0.752425491809845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6199002054593485, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1875, + "learning_rate": 0.0009999221880827693, + "loss": 0.0882, + "macro_f1": 0.3333333432674408, + "num_tokens": 900565.0, + "repeat_count": 0.0, + "routers_loss": 0.017694659531116486, + "skip_count": 0.0, + "step": 558, + "text_loss": 0.195619136095047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.629292632814793, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2021484375, + "learning_rate": 0.0009999166319537703, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 903506.0, + "repeat_count": 0.0, + "routers_loss": 0.019375264644622803, + "skip_count": 0.0, + "step": 560, + "text_loss": 0.4603337347507477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.638685060170238, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.146484375, + "learning_rate": 0.0009999108842557748, + "loss": 0.0953, + "macro_f1": 0.4871794879436493, + "num_tokens": 906380.0, + "repeat_count": 0.0, + "routers_loss": 0.12013207376003265, + "skip_count": 3.0, + "step": 562, + "text_loss": 0.6279402375221252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6480774875256823, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009999049449909854, + "loss": 0.0799, + "macro_f1": 0.3272727429866791, + "num_tokens": 909116.0, + "repeat_count": 0.0, + "routers_loss": 0.06441342830657959, + "skip_count": 1.0, + "step": 564, + "text_loss": 0.23741699755191803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.657469914881127, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009998988141616781, + "loss": 0.064, + "macro_f1": 0.32098767161369324, + "num_tokens": 912189.0, + "repeat_count": 0.0, + "routers_loss": 0.08309414982795715, + "skip_count": 1.0, + "step": 566, + "text_loss": 0.27780941128730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6668623422365716, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009998924917702023, + "loss": 0.0876, + "macro_f1": 0.3272727429866791, + "num_tokens": 916279.0, + "repeat_count": 1.0, + "routers_loss": 0.07197169959545135, + "skip_count": 0.0, + "step": 568, + "text_loss": 0.6371755599975586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6762547695920165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2255859375, + "learning_rate": 0.0009998859778189806, + "loss": 0.0706, + "macro_f1": 0.3333333432674408, + "num_tokens": 919490.0, + "repeat_count": 0.0, + "routers_loss": 0.008022273890674114, + "skip_count": 0.0, + "step": 570, + "text_loss": 0.6028938889503479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6856471969474613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.000999879272310509, + "loss": 0.084, + "macro_f1": 0.3333333432674408, + "num_tokens": 923694.0, + "repeat_count": 0.0, + "routers_loss": 0.01634674146771431, + "skip_count": 0.0, + "step": 572, + "text_loss": 0.7177054286003113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.0009998723752473574, + "loss": 0.0716, + "macro_f1": 0.3272727429866791, + "num_tokens": 926933.0, + "repeat_count": 0.0, + "routers_loss": 0.060559045523405075, + "skip_count": 1.0, + "step": 574, + "text_loss": 0.5203254818916321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.0009998652866321687, + "loss": 0.0801, + "macro_f1": 0.3333333432674408, + "num_tokens": 929832.0, + "repeat_count": 0.0, + "routers_loss": 0.011485611088573933, + "skip_count": 0.0, + "step": 576, + "text_loss": 0.6147452592849731 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.713824479013795, + "f1_execute": 0.8799999952316284, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.000999858006467659, + "loss": 0.0649, + "macro_f1": 0.29333335161209106, + "num_tokens": 933266.0, + "repeat_count": 2.0, + "routers_loss": 0.2929030954837799, + "skip_count": 4.0, + "step": 578, + "text_loss": 0.1720666140317917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.72321690636924, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.24609375, + "learning_rate": 0.0009998505347566186, + "loss": 0.0782, + "macro_f1": 0.32098764181137085, + "num_tokens": 937545.0, + "repeat_count": 0.0, + "routers_loss": 0.053780000656843185, + "skip_count": 2.0, + "step": 580, + "text_loss": 0.3258405327796936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7326093337246844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.00099984287150191, + "loss": 0.0582, + "macro_f1": 0.3333333432674408, + "num_tokens": 941001.0, + "repeat_count": 0.0, + "routers_loss": 0.02637636847794056, + "skip_count": 0.0, + "step": 582, + "text_loss": 0.23762771487236023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7420017610801293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009998350167064705, + "loss": 0.0672, + "macro_f1": 0.3333333432674408, + "num_tokens": 943989.0, + "repeat_count": 0.0, + "routers_loss": 0.01637580618262291, + "skip_count": 0.0, + "step": 584, + "text_loss": 0.7460582852363586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7513941884355737, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009998269703733096, + "loss": 0.0686, + "macro_f1": 0.3272727429866791, + "num_tokens": 947245.0, + "repeat_count": 1.0, + "routers_loss": 0.13934117555618286, + "skip_count": 0.0, + "step": 586, + "text_loss": 0.5284690260887146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7607866157910186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.13671875, + "learning_rate": 0.0009998187325055106, + "loss": 0.0667, + "macro_f1": 0.3333333432674408, + "num_tokens": 950116.0, + "repeat_count": 0.0, + "routers_loss": 0.02138397842645645, + "skip_count": 0.0, + "step": 588, + "text_loss": 0.3920256197452545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009998103031062305, + "loss": 0.0778, + "macro_f1": 0.3333333432674408, + "num_tokens": 953277.0, + "repeat_count": 0.0, + "routers_loss": 0.007098200265318155, + "skip_count": 0.0, + "step": 590, + "text_loss": 0.7472905516624451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.779571470501908, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.318359375, + "learning_rate": 0.0009998016821786994, + "loss": 0.0872, + "macro_f1": 0.32098764181137085, + "num_tokens": 958229.0, + "repeat_count": 1.0, + "routers_loss": 0.07946522533893585, + "skip_count": 1.0, + "step": 592, + "text_loss": 0.5506448745727539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7889638978573528, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.000999792869726221, + "loss": 0.0523, + "macro_f1": 0.3272727429866791, + "num_tokens": 961016.0, + "repeat_count": 0.0, + "routers_loss": 0.0850791186094284, + "skip_count": 1.0, + "step": 594, + "text_loss": 0.3824431002140045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009997838657521717, + "loss": 0.0632, + "macro_f1": 0.3333333432674408, + "num_tokens": 963847.0, + "repeat_count": 0.0, + "routers_loss": 0.016370445489883423, + "skip_count": 0.0, + "step": 596, + "text_loss": 0.2139475792646408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.8077487525682416, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009997746702600026, + "loss": 0.0702, + "macro_f1": 0.307692289352417, + "num_tokens": 966619.0, + "repeat_count": 0.0, + "routers_loss": 0.1310746818780899, + "skip_count": 3.0, + "step": 598, + "text_loss": 0.3651018440723419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8171411799236865, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23828125, + "learning_rate": 0.0009997652832532372, + "loss": 0.0792, + "macro_f1": 0.3272727429866791, + "num_tokens": 970418.0, + "repeat_count": 1.0, + "routers_loss": 0.14303378760814667, + "skip_count": 0.0, + "step": 600, + "text_loss": 0.7094736099243164 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8265336072791314, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009997557047354722, + "loss": 0.0531, + "macro_f1": 0.3272727429866791, + "num_tokens": 973491.0, + "repeat_count": 0.0, + "routers_loss": 0.03334212675690651, + "skip_count": 1.0, + "step": 602, + "text_loss": 0.4812237024307251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.835926034634576, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.0009997459347103783, + "loss": 0.0956, + "macro_f1": 0.3272727429866791, + "num_tokens": 976672.0, + "repeat_count": 0.0, + "routers_loss": 0.02831871062517166, + "skip_count": 0.0, + "step": 604, + "text_loss": 0.21737146377563477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8453184619900207, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009997359731816998, + "loss": 0.0646, + "macro_f1": 0.3333333432674408, + "num_tokens": 979898.0, + "repeat_count": 0.0, + "routers_loss": 0.017968013882637024, + "skip_count": 0.0, + "step": 606, + "text_loss": 0.5458008050918579 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.854710889345465, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.224609375, + "learning_rate": 0.0009997258201532536, + "loss": 0.0751, + "macro_f1": 0.3333333432674408, + "num_tokens": 982811.0, + "repeat_count": 0.0, + "routers_loss": 0.016256732866168022, + "skip_count": 0.0, + "step": 608, + "text_loss": 0.8643257021903992 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009997154756289303, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 985245.0, + "repeat_count": 0.0, + "routers_loss": 0.021214161068201065, + "skip_count": 0.0, + "step": 610, + "text_loss": 0.2204967886209488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8734957440563544, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.000999704939612694, + "loss": 0.0636, + "macro_f1": 0.3006536364555359, + "num_tokens": 988539.0, + "repeat_count": 3.0, + "routers_loss": 0.23249399662017822, + "skip_count": 2.0, + "step": 612, + "text_loss": 0.32489025592803955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8828881714117993, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009996942121085824, + "loss": 0.0445, + "macro_f1": 0.3333333432674408, + "num_tokens": 991660.0, + "repeat_count": 0.0, + "routers_loss": 0.010706410743296146, + "skip_count": 0.0, + "step": 614, + "text_loss": 0.4551754891872406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8922805987672437, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3671875, + "learning_rate": 0.000999683293120706, + "loss": 0.1016, + "macro_f1": 0.3333333432674408, + "num_tokens": 994828.0, + "repeat_count": 0.0, + "routers_loss": 0.006676184479147196, + "skip_count": 0.0, + "step": 616, + "text_loss": 0.6212068200111389 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9016730261226886, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.408203125, + "learning_rate": 0.0009996721826532491, + "loss": 0.0976, + "macro_f1": 0.3076923191547394, + "num_tokens": 997951.0, + "repeat_count": 2.0, + "routers_loss": 0.2148125320672989, + "skip_count": 2.0, + "step": 618, + "text_loss": 0.26514527201652527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.911065453478133, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1904296875, + "learning_rate": 0.000999660880710469, + "loss": 0.0909, + "macro_f1": 0.3333333432674408, + "num_tokens": 1001139.0, + "repeat_count": 0.0, + "routers_loss": 0.022332455962896347, + "skip_count": 0.0, + "step": 620, + "text_loss": 0.26131340861320496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.920457880833578, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009996493872966971, + "loss": 0.0732, + "macro_f1": 0.3272727429866791, + "num_tokens": 1003678.0, + "repeat_count": 1.0, + "routers_loss": 0.08348730951547623, + "skip_count": 0.0, + "step": 622, + "text_loss": 0.19151706993579865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009996377024163374, + "loss": 0.0822, + "macro_f1": 0.3333333432674408, + "num_tokens": 1007082.0, + "repeat_count": 0.0, + "routers_loss": 0.028577150776982307, + "skip_count": 0.0, + "step": 624, + "text_loss": 0.305387407541275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9392427355444672, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0009996258260738676, + "loss": 0.0892, + "macro_f1": 0.3272727429866791, + "num_tokens": 1010064.0, + "repeat_count": 1.0, + "routers_loss": 0.08312026411294937, + "skip_count": 0.0, + "step": 626, + "text_loss": 0.49436143040657043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9486351628999117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009996137582738388, + "loss": 0.0591, + "macro_f1": 0.3333333432674408, + "num_tokens": 1013462.0, + "repeat_count": 0.0, + "routers_loss": 0.013337327167391777, + "skip_count": 0.0, + "step": 628, + "text_loss": 0.6515294313430786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9580275902553566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000999601499020875, + "loss": 0.0537, + "macro_f1": 0.3333333432674408, + "num_tokens": 1016246.0, + "repeat_count": 0.0, + "routers_loss": 0.029126765206456184, + "skip_count": 0.0, + "step": 630, + "text_loss": 0.18834827840328217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9674200176108014, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009995890483196746, + "loss": 0.0602, + "macro_f1": 0.3272727429866791, + "num_tokens": 1019286.0, + "repeat_count": 0.0, + "routers_loss": 0.054844800382852554, + "skip_count": 1.0, + "step": 632, + "text_loss": 0.6988179087638855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.322265625, + "learning_rate": 0.0009995764061750086, + "loss": 0.0767, + "macro_f1": 0.3333333432674408, + "num_tokens": 1022207.0, + "repeat_count": 0.0, + "routers_loss": 0.010095693171024323, + "skip_count": 0.0, + "step": 634, + "text_loss": 0.558451771736145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9862048723216907, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.000999563572591721, + "loss": 0.0521, + "macro_f1": 0.32098764181137085, + "num_tokens": 1025319.0, + "repeat_count": 1.0, + "routers_loss": 0.0698433518409729, + "skip_count": 1.0, + "step": 636, + "text_loss": 0.5961872935295105 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.995597299677135, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009995505475747302, + "loss": 0.0849, + "macro_f1": 0.3272727429866791, + "num_tokens": 1028362.0, + "repeat_count": 0.0, + "routers_loss": 0.040211405605077744, + "skip_count": 1.0, + "step": 638, + "text_loss": 0.546863317489624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.004696213677722, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.0009995373311290272, + "loss": 0.0709, + "macro_f1": 0.3144654333591461, + "num_tokens": 1032199.0, + "repeat_count": 2.0, + "routers_loss": 0.1457643061876297, + "skip_count": 1.0, + "step": 640, + "text_loss": 0.2137298285961151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009995239232596764, + "loss": 0.0545, + "macro_f1": 0.3333333432674408, + "num_tokens": 1035801.0, + "repeat_count": 0.0, + "routers_loss": 0.011394930072128773, + "skip_count": 0.0, + "step": 642, + "text_loss": 0.43054503202438354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0234810683886115, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009995103239718163, + "loss": 0.0665, + "macro_f1": 0.3333333432674408, + "num_tokens": 1039223.0, + "repeat_count": 0.0, + "routers_loss": 0.00997432041913271, + "skip_count": 0.0, + "step": 644, + "text_loss": 0.7749615907669067 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0328734957440564, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009994965332706573, + "loss": 0.0755, + "macro_f1": 0.3144654333591461, + "num_tokens": 1042154.0, + "repeat_count": 3.0, + "routers_loss": 0.10589150339365005, + "skip_count": 0.0, + "step": 646, + "text_loss": 0.7812211513519287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.042265923099501, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.0009994825511614846, + "loss": 0.0383, + "macro_f1": 0.3272727429866791, + "num_tokens": 1045250.0, + "repeat_count": 0.0, + "routers_loss": 0.0748734176158905, + "skip_count": 1.0, + "step": 648, + "text_loss": 0.844803512096405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0516583504549457, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1220703125, + "learning_rate": 0.0009994683776496562, + "loss": 0.0433, + "macro_f1": 0.3272727429866791, + "num_tokens": 1048446.0, + "repeat_count": 0.0, + "routers_loss": 0.03742415830492973, + "skip_count": 1.0, + "step": 650, + "text_loss": 0.2098839282989502 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0610507778103906, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009994540127406034, + "loss": 0.0591, + "macro_f1": 0.32098764181137085, + "num_tokens": 1051840.0, + "repeat_count": 0.0, + "routers_loss": 0.06025516986846924, + "skip_count": 2.0, + "step": 652, + "text_loss": 0.27727583050727844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.070443205165835, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.181640625, + "learning_rate": 0.0009994394564398306, + "loss": 0.0519, + "macro_f1": 0.521541953086853, + "num_tokens": 1055142.0, + "repeat_count": 4.0, + "routers_loss": 0.22807340323925018, + "skip_count": 2.0, + "step": 654, + "text_loss": 0.9672397971153259 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009994247087529158, + "loss": 0.0618, + "macro_f1": 0.3333333432674408, + "num_tokens": 1057698.0, + "repeat_count": 0.0, + "routers_loss": 0.01348950993269682, + "skip_count": 0.0, + "step": 656, + "text_loss": 0.6375506520271301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.0009994097696855106, + "loss": 0.0412, + "macro_f1": 0.3333333432674408, + "num_tokens": 1060624.0, + "repeat_count": 0.0, + "routers_loss": 0.009649243205785751, + "skip_count": 0.0, + "step": 658, + "text_loss": 0.5315385460853577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.098620487232169, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2041015625, + "learning_rate": 0.0009993946392433395, + "loss": 0.0609, + "macro_f1": 0.307692289352417, + "num_tokens": 1065076.0, + "repeat_count": 0.0, + "routers_loss": 0.1250980943441391, + "skip_count": 3.0, + "step": 660, + "text_loss": 0.25780341029167175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1080129145876136, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009993793174322006, + "loss": 0.0471, + "macro_f1": 0.3333333432674408, + "num_tokens": 1068365.0, + "repeat_count": 0.0, + "routers_loss": 0.011544390581548214, + "skip_count": 0.0, + "step": 662, + "text_loss": 0.34876301884651184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1174053419430585, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009993638042579654, + "loss": 0.0473, + "macro_f1": 0.3272727429866791, + "num_tokens": 1071693.0, + "repeat_count": 0.0, + "routers_loss": 0.03777370601892471, + "skip_count": 1.0, + "step": 664, + "text_loss": 0.21811571717262268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.126797769298503, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.203125, + "learning_rate": 0.0009993480997265783, + "loss": 0.0475, + "macro_f1": 0.5492662787437439, + "num_tokens": 1074733.0, + "repeat_count": 0.0, + "routers_loss": 0.049949806183576584, + "skip_count": 2.0, + "step": 666, + "text_loss": 0.38410288095474243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.136190196653948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10302734375, + "learning_rate": 0.0009993322038440572, + "loss": 0.0605, + "macro_f1": 0.3333333432674408, + "num_tokens": 1077993.0, + "repeat_count": 0.0, + "routers_loss": 0.0247171800583601, + "skip_count": 0.0, + "step": 668, + "text_loss": 0.25576895475387573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1455826240093923, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.000999316116616494, + "loss": 0.0619, + "macro_f1": 0.3333333432674408, + "num_tokens": 1080491.0, + "repeat_count": 0.0, + "routers_loss": 0.008118715137243271, + "skip_count": 0.0, + "step": 670, + "text_loss": 0.6269792914390564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.154975051364837, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009992998380500527, + "loss": 0.0462, + "macro_f1": 0.3272727429866791, + "num_tokens": 1083817.0, + "repeat_count": 0.0, + "routers_loss": 0.03366057574748993, + "skip_count": 1.0, + "step": 672, + "text_loss": 0.26891493797302246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1643674787202816, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009992833681509716, + "loss": 0.0529, + "macro_f1": 0.3333333432674408, + "num_tokens": 1087368.0, + "repeat_count": 0.0, + "routers_loss": 0.020552074536681175, + "skip_count": 0.0, + "step": 674, + "text_loss": 0.14421936869621277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.1737599060757264, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.18359375, + "learning_rate": 0.0009992667069255619, + "loss": 0.0696, + "macro_f1": 0.31446540355682373, + "num_tokens": 1090452.0, + "repeat_count": 0.0, + "routers_loss": 0.06937336176633835, + "skip_count": 2.0, + "step": 676, + "text_loss": 0.24999259412288666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1831523334311713, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08740234375, + "learning_rate": 0.0009992498543802085, + "loss": 0.0588, + "macro_f1": 0.3272727429866791, + "num_tokens": 1093996.0, + "repeat_count": 1.0, + "routers_loss": 0.0380021296441555, + "skip_count": 0.0, + "step": 678, + "text_loss": 0.42473849654197693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 3.1925447607866158, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.2119140625, + "learning_rate": 0.0009992328105213688, + "loss": 0.0411, + "macro_f1": 0.4400000274181366, + "num_tokens": 1096837.0, + "repeat_count": 1.0, + "routers_loss": 0.20885063707828522, + "skip_count": 4.0, + "step": 680, + "text_loss": 0.3829527199268341 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.2019371881420606, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009992155753555747, + "loss": 0.0722, + "macro_f1": 0.5492662787437439, + "num_tokens": 1100320.0, + "repeat_count": 0.0, + "routers_loss": 0.018230699002742767, + "skip_count": 2.0, + "step": 682, + "text_loss": 0.6190969944000244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.211329615497505, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30859375, + "learning_rate": 0.0009991981488894303, + "loss": 0.0681, + "macro_f1": 0.32098767161369324, + "num_tokens": 1103682.0, + "repeat_count": 0.0, + "routers_loss": 0.05550144240260124, + "skip_count": 1.0, + "step": 684, + "text_loss": 0.44418027997016907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.22072204285295, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.0009991805311296133, + "loss": 0.0507, + "macro_f1": 0.32098764181137085, + "num_tokens": 1106427.0, + "repeat_count": 0.0, + "routers_loss": 0.07990608364343643, + "skip_count": 2.0, + "step": 686, + "text_loss": 0.5577231645584106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2301144702083944, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009991627220828753, + "loss": 0.0568, + "macro_f1": 0.32098764181137085, + "num_tokens": 1109314.0, + "repeat_count": 0.0, + "routers_loss": 0.05167485028505325, + "skip_count": 2.0, + "step": 688, + "text_loss": 0.27325430512428284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.2395068975638392, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009991447217560408, + "loss": 0.0521, + "macro_f1": 0.5492662787437439, + "num_tokens": 1112748.0, + "repeat_count": 0.0, + "routers_loss": 0.04621964320540428, + "skip_count": 2.0, + "step": 690, + "text_loss": 0.5288321375846863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.2488993249192837, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.000999126530156007, + "loss": 0.0499, + "macro_f1": 0.307692289352417, + "num_tokens": 1116965.0, + "repeat_count": 1.0, + "routers_loss": 0.11950276792049408, + "skip_count": 2.0, + "step": 692, + "text_loss": 0.14215624332427979 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2582917522747286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.0009991081472897454, + "loss": 0.0722, + "macro_f1": 0.3333333432674408, + "num_tokens": 1120570.0, + "repeat_count": 0.0, + "routers_loss": 0.01905500330030918, + "skip_count": 0.0, + "step": 694, + "text_loss": 0.41862696409225464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.267684179630173, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009990895731643002, + "loss": 0.0464, + "macro_f1": 0.3272727429866791, + "num_tokens": 1124009.0, + "repeat_count": 1.0, + "routers_loss": 0.06974572688341141, + "skip_count": 0.0, + "step": 696, + "text_loss": 0.41160130500793457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.277076606985618, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.000999070807786789, + "loss": 0.0531, + "macro_f1": 0.3272727429866791, + "num_tokens": 1127370.0, + "repeat_count": 1.0, + "routers_loss": 0.07055293023586273, + "skip_count": 0.0, + "step": 698, + "text_loss": 0.48068273067474365 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2864690343410627, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000999051851164403, + "loss": 0.0619, + "macro_f1": 0.32098764181137085, + "num_tokens": 1130234.0, + "repeat_count": 1.0, + "routers_loss": 0.12506946921348572, + "skip_count": 1.0, + "step": 700, + "text_loss": 0.47925490140914917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000999032703304406, + "loss": 0.0674, + "macro_f1": 0.3333333432674408, + "num_tokens": 1132874.0, + "repeat_count": 0.0, + "routers_loss": 0.00809287466108799, + "skip_count": 0.0, + "step": 702, + "text_loss": 0.47433632612228394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.305253889051952, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009990133642141358, + "loss": 0.0497, + "macro_f1": 0.5492662787437439, + "num_tokens": 1136011.0, + "repeat_count": 0.0, + "routers_loss": 0.0319170281291008, + "skip_count": 2.0, + "step": 704, + "text_loss": 0.6574832201004028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3146463164073965, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.33984375, + "learning_rate": 0.000998993833901003, + "loss": 0.0619, + "macro_f1": 0.32098764181137085, + "num_tokens": 1139674.0, + "repeat_count": 0.0, + "routers_loss": 0.09850362688302994, + "skip_count": 2.0, + "step": 706, + "text_loss": 0.7660127282142639 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3240387437628414, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009989741123724919, + "loss": 0.0574, + "macro_f1": 0.3333333432674408, + "num_tokens": 1143558.0, + "repeat_count": 0.0, + "routers_loss": 0.006673311349004507, + "skip_count": 0.0, + "step": 708, + "text_loss": 0.5976111888885498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009989541996361594, + "loss": 0.045, + "macro_f1": 0.3333333432674408, + "num_tokens": 1146122.0, + "repeat_count": 0.0, + "routers_loss": 0.004988791421055794, + "skip_count": 0.0, + "step": 710, + "text_loss": 0.5256119966506958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3428235984737307, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009989340956996367, + "loss": 0.0528, + "macro_f1": 0.3333333432674408, + "num_tokens": 1149546.0, + "repeat_count": 0.0, + "routers_loss": 0.0067769973538815975, + "skip_count": 0.0, + "step": 712, + "text_loss": 0.5040497779846191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.352216025829175, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.0009989138005706273, + "loss": 0.0735, + "macro_f1": 0.32098764181137085, + "num_tokens": 1153195.0, + "repeat_count": 0.0, + "routers_loss": 0.09899546951055527, + "skip_count": 2.0, + "step": 714, + "text_loss": 0.20803412795066833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.000998893314256908, + "loss": 0.064, + "macro_f1": 0.3333333432674408, + "num_tokens": 1157081.0, + "repeat_count": 0.0, + "routers_loss": 0.010492355562746525, + "skip_count": 0.0, + "step": 716, + "text_loss": 0.23077639937400818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3710008805400644, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009988726367663298, + "loss": 0.0539, + "macro_f1": 0.3333333432674408, + "num_tokens": 1160079.0, + "repeat_count": 0.0, + "routers_loss": 0.01063773687928915, + "skip_count": 0.0, + "step": 718, + "text_loss": 0.6085864901542664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3803933078955093, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009988517681068163, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1163249.0, + "repeat_count": 1.0, + "routers_loss": 0.05981874838471413, + "skip_count": 0.0, + "step": 720, + "text_loss": 0.4047050476074219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3897857352509537, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009988307082863638, + "loss": 0.0361, + "macro_f1": 0.3333333432674408, + "num_tokens": 1166259.0, + "repeat_count": 0.0, + "routers_loss": 0.009750043973326683, + "skip_count": 0.0, + "step": 722, + "text_loss": 0.5306474566459656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.3991781626063986, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.240234375, + "learning_rate": 0.0009988094573130434, + "loss": 0.063, + "macro_f1": 0.5359477400779724, + "num_tokens": 1168887.0, + "repeat_count": 2.0, + "routers_loss": 0.18601104617118835, + "skip_count": 2.0, + "step": 724, + "text_loss": 0.53528892993927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.408570589961843, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009987880151949974, + "loss": 0.0496, + "macro_f1": 0.3272727429866791, + "num_tokens": 1172625.0, + "repeat_count": 0.0, + "routers_loss": 0.02845010720193386, + "skip_count": 1.0, + "step": 726, + "text_loss": 0.4760453701019287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.417963017317288, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, + "learning_rate": 0.0009987663819404434, + "loss": 0.06, + "macro_f1": 0.5492662787437439, + "num_tokens": 1176580.0, + "repeat_count": 0.0, + "routers_loss": 0.017596980556845665, + "skip_count": 2.0, + "step": 728, + "text_loss": 0.5146099328994751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.427355444672733, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000998744557557671, + "loss": 0.0484, + "macro_f1": 0.3272727429866791, + "num_tokens": 1179804.0, + "repeat_count": 0.0, + "routers_loss": 0.0625474750995636, + "skip_count": 1.0, + "step": 730, + "text_loss": 0.27738022804260254 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.436747872028177, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.0009987225420550433, + "loss": 0.0796, + "macro_f1": 0.307692289352417, + "num_tokens": 1182658.0, + "repeat_count": 1.0, + "routers_loss": 0.16188351809978485, + "skip_count": 2.0, + "step": 732, + "text_loss": 0.23231445252895355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2001953125, + "learning_rate": 0.0009987003354409965, + "loss": 0.0626, + "macro_f1": 0.3272727429866791, + "num_tokens": 1185451.0, + "repeat_count": 0.0, + "routers_loss": 0.02391529455780983, + "skip_count": 0.0, + "step": 734, + "text_loss": 0.4496627151966095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.4555327267390665, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.234375, + "learning_rate": 0.0009986779377240405, + "loss": 0.0513, + "macro_f1": 0.32098767161369324, + "num_tokens": 1188666.0, + "repeat_count": 0.0, + "routers_loss": 0.08435963839292526, + "skip_count": 1.0, + "step": 736, + "text_loss": 0.4950787127017975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.4649251540945114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1220703125, + "learning_rate": 0.000998655348912758, + "loss": 0.0515, + "macro_f1": 0.3333333432674408, + "num_tokens": 1193035.0, + "repeat_count": 0.0, + "routers_loss": 0.01648722216486931, + "skip_count": 0.0, + "step": 738, + "text_loss": 0.24761848151683807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.0009986325690158051, + "loss": 0.0435, + "macro_f1": 0.3333333432674408, + "num_tokens": 1196840.0, + "repeat_count": 0.0, + "routers_loss": 0.013143910095095634, + "skip_count": 0.0, + "step": 740, + "text_loss": 0.15662719309329987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.4837100088054007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009986095980419113, + "loss": 0.0757, + "macro_f1": 0.3333333432674408, + "num_tokens": 1200573.0, + "repeat_count": 0.0, + "routers_loss": 0.026706280186772346, + "skip_count": 0.0, + "step": 742, + "text_loss": 0.16725164651870728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.493102436160845, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1982421875, + "learning_rate": 0.0009985864359998787, + "loss": 0.0795, + "macro_f1": 0.3006536364555359, + "num_tokens": 1203589.0, + "repeat_count": 2.0, + "routers_loss": 0.28607678413391113, + "skip_count": 3.0, + "step": 744, + "text_loss": 0.6350882053375244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009985630828985835, + "loss": 0.0572, + "macro_f1": 0.3272727429866791, + "num_tokens": 1206422.0, + "repeat_count": 0.0, + "routers_loss": 0.05685260891914368, + "skip_count": 1.0, + "step": 746, + "text_loss": 0.33779552578926086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.5118872908717345, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009985395387469742, + "loss": 0.0458, + "macro_f1": 0.5492662787437439, + "num_tokens": 1211588.0, + "repeat_count": 0.0, + "routers_loss": 0.0437830351293087, + "skip_count": 2.0, + "step": 748, + "text_loss": 0.28664472699165344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5212797182271793, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009985158035540735, + "loss": 0.0714, + "macro_f1": 0.32098764181137085, + "num_tokens": 1214580.0, + "repeat_count": 2.0, + "routers_loss": 0.07074898481369019, + "skip_count": 0.0, + "step": 750, + "text_loss": 0.3939313292503357 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.0009984918773289762, + "loss": 0.0699, + "macro_f1": 0.3333333432674408, + "num_tokens": 1217388.0, + "repeat_count": 0.0, + "routers_loss": 0.009757856838405132, + "skip_count": 0.0, + "step": 752, + "text_loss": 0.37641215324401855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5400645729380686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009984677600808512, + "loss": 0.054, + "macro_f1": 0.3333333432674408, + "num_tokens": 1219960.0, + "repeat_count": 0.0, + "routers_loss": 0.02515069581568241, + "skip_count": 0.0, + "step": 754, + "text_loss": 0.155938982963562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5494570002935135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30078125, + "learning_rate": 0.0009984434518189405, + "loss": 0.0764, + "macro_f1": 0.3333333432674408, + "num_tokens": 1223234.0, + "repeat_count": 0.0, + "routers_loss": 0.025766927748918533, + "skip_count": 0.0, + "step": 756, + "text_loss": 0.691118061542511 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 3.558849427648958, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009984189525525584, + "loss": 0.0451, + "macro_f1": 0.5359477400779724, + "num_tokens": 1225764.0, + "repeat_count": 2.0, + "routers_loss": 0.1782722771167755, + "skip_count": 2.0, + "step": 758, + "text_loss": 0.3592209219932556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.568241855004403, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.0009983942622910935, + "loss": 0.0659, + "macro_f1": 0.3333333432674408, + "num_tokens": 1230097.0, + "repeat_count": 0.0, + "routers_loss": 0.00825568474829197, + "skip_count": 0.0, + "step": 760, + "text_loss": 0.4646475315093994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5776342823598473, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009983693810440074, + "loss": 0.0477, + "macro_f1": 0.32098764181137085, + "num_tokens": 1233140.0, + "repeat_count": 0.0, + "routers_loss": 0.04156976938247681, + "skip_count": 2.0, + "step": 762, + "text_loss": 0.298682302236557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.587026709715292, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3515625, + "learning_rate": 0.000998344308820834, + "loss": 0.0666, + "macro_f1": 0.3272727429866791, + "num_tokens": 1236305.0, + "repeat_count": 0.0, + "routers_loss": 0.05697929114103317, + "skip_count": 1.0, + "step": 764, + "text_loss": 0.5249121189117432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5964191370707366, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.18359375, + "learning_rate": 0.0009983190456311817, + "loss": 0.0592, + "macro_f1": 0.3144654333591461, + "num_tokens": 1239673.0, + "repeat_count": 0.0, + "routers_loss": 0.09547408670186996, + "skip_count": 3.0, + "step": 766, + "text_loss": 0.41277334094047546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.6058115644261814, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.185546875, + "learning_rate": 0.000998293591484731, + "loss": 0.0484, + "macro_f1": 0.5492662787437439, + "num_tokens": 1242292.0, + "repeat_count": 0.0, + "routers_loss": 0.030693158507347107, + "skip_count": 2.0, + "step": 768, + "text_loss": 0.1583656519651413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000998267946391236, + "loss": 0.051, + "macro_f1": 0.3333333432674408, + "num_tokens": 1244661.0, + "repeat_count": 0.0, + "routers_loss": 0.01211300864815712, + "skip_count": 0.0, + "step": 770, + "text_loss": 0.4629349112510681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6245964191370708, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009982421103605238, + "loss": 0.0441, + "macro_f1": 0.32098764181137085, + "num_tokens": 1248688.0, + "repeat_count": 0.0, + "routers_loss": 0.0665968507528305, + "skip_count": 2.0, + "step": 772, + "text_loss": 0.4019293785095215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6339888464925156, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.000998216083402495, + "loss": 0.0613, + "macro_f1": 0.32098764181137085, + "num_tokens": 1251395.0, + "repeat_count": 0.0, + "routers_loss": 0.07186859846115112, + "skip_count": 2.0, + "step": 774, + "text_loss": 0.4659276604652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.302734375, + "learning_rate": 0.0009981898655271235, + "loss": 0.0488, + "macro_f1": 0.3333333432674408, + "num_tokens": 1254888.0, + "repeat_count": 0.0, + "routers_loss": 0.007823926396667957, + "skip_count": 0.0, + "step": 776, + "text_loss": 0.5160359740257263 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 3.6527737012034045, + "f1_execute": 0.9130434989929199, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009981634567444557, + "loss": 0.0775, + "macro_f1": 0.590062141418457, + "num_tokens": 1258250.0, + "repeat_count": 3.0, + "routers_loss": 0.24624499678611755, + "skip_count": 4.0, + "step": 778, + "text_loss": 0.29319918155670166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6621661285588494, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.0009981368570646115, + "loss": 0.0885, + "macro_f1": 0.3272727429866791, + "num_tokens": 1260916.0, + "repeat_count": 0.0, + "routers_loss": 0.030730176717042923, + "skip_count": 1.0, + "step": 780, + "text_loss": 0.624981164932251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6715585559142943, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009981100664977838, + "loss": 0.0699, + "macro_f1": 0.3333333432674408, + "num_tokens": 1264004.0, + "repeat_count": 0.0, + "routers_loss": 0.006829176563769579, + "skip_count": 0.0, + "step": 782, + "text_loss": 0.6137266159057617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6809509832697387, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009980830850542391, + "loss": 0.058, + "macro_f1": 0.3333333432674408, + "num_tokens": 1267130.0, + "repeat_count": 0.0, + "routers_loss": 0.018471000716090202, + "skip_count": 0.0, + "step": 784, + "text_loss": 0.15213175117969513 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6903434106251836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.0009980559127443166, + "loss": 0.052, + "macro_f1": 0.3333333432674408, + "num_tokens": 1271129.0, + "repeat_count": 0.0, + "routers_loss": 0.007903140969574451, + "skip_count": 0.0, + "step": 786, + "text_loss": 0.5768613219261169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.699735837980628, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.000998028549578429, + "loss": 0.0719, + "macro_f1": 0.307692289352417, + "num_tokens": 1274232.0, + "repeat_count": 0.0, + "routers_loss": 0.06737866252660751, + "skip_count": 3.0, + "step": 788, + "text_loss": 0.2877073585987091 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.709128265336073, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009980009955670615, + "loss": 0.0698, + "macro_f1": 0.3144654333591461, + "num_tokens": 1277193.0, + "repeat_count": 0.0, + "routers_loss": 0.10194934904575348, + "skip_count": 3.0, + "step": 790, + "text_loss": 0.11860492825508118 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7185206926915173, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.000997973250720773, + "loss": 0.0552, + "macro_f1": 0.32098764181137085, + "num_tokens": 1280960.0, + "repeat_count": 0.0, + "routers_loss": 0.10297708213329315, + "skip_count": 2.0, + "step": 792, + "text_loss": 0.13477706909179688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.727913120046962, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009979453150501954, + "loss": 0.0663, + "macro_f1": 0.32098764181137085, + "num_tokens": 1284611.0, + "repeat_count": 1.0, + "routers_loss": 0.06122037023305893, + "skip_count": 1.0, + "step": 794, + "text_loss": 0.40569379925727844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.737305547402407, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000997917188566034, + "loss": 0.062, + "macro_f1": 0.32098764181137085, + "num_tokens": 1287834.0, + "repeat_count": 0.0, + "routers_loss": 0.061135001480579376, + "skip_count": 2.0, + "step": 796, + "text_loss": 0.2829287648200989 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7466979747578515, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.109375, + "learning_rate": 0.0009978888712790664, + "loss": 0.0654, + "macro_f1": 0.3272727429866791, + "num_tokens": 1291666.0, + "repeat_count": 0.0, + "routers_loss": 0.04841872677206993, + "skip_count": 1.0, + "step": 798, + "text_loss": 1.011757254600525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.20000000298023224, + "avg_layers": 27.0, + "epoch": 3.756090402113296, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.3333333134651184, + "grad_norm": 0.14453125, + "learning_rate": 0.0009978603632001444, + "loss": 0.0636, + "macro_f1": 0.4104308485984802, + "num_tokens": 1294627.0, + "repeat_count": 1.0, + "routers_loss": 0.15698759257793427, + "skip_count": 5.0, + "step": 800, + "text_loss": 0.4457623362541199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0009978316643401916, + "loss": 0.0688, + "macro_f1": 0.3333333432674408, + "num_tokens": 1297711.0, + "repeat_count": 0.0, + "routers_loss": 0.018952010199427605, + "skip_count": 0.0, + "step": 802, + "text_loss": 0.2069481462240219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7748752568241857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.14453125, + "learning_rate": 0.0009978027747102062, + "loss": 0.0479, + "macro_f1": 0.3333333432674408, + "num_tokens": 1300569.0, + "repeat_count": 0.0, + "routers_loss": 0.014538386836647987, + "skip_count": 0.0, + "step": 804, + "text_loss": 0.4983852505683899 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.78426768417963, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2109375, + "learning_rate": 0.0009977736943212584, + "loss": 0.0721, + "macro_f1": 0.32098764181137085, + "num_tokens": 1303969.0, + "repeat_count": 0.0, + "routers_loss": 0.11164087057113647, + "skip_count": 2.0, + "step": 806, + "text_loss": 0.2910642921924591 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.793660111535075, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.000997744423184492, + "loss": 0.0424, + "macro_f1": 0.3272727429866791, + "num_tokens": 1307263.0, + "repeat_count": 0.0, + "routers_loss": 0.06073406711220741, + "skip_count": 1.0, + "step": 808, + "text_loss": 0.18831779062747955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 3.8030525388905194, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.26171875, + "learning_rate": 0.0009977149613111236, + "loss": 0.0486, + "macro_f1": 0.4400000274181366, + "num_tokens": 1309953.0, + "repeat_count": 1.0, + "routers_loss": 0.11035524308681488, + "skip_count": 4.0, + "step": 810, + "text_loss": 0.7872759699821472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8124449662459643, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009976853087124433, + "loss": 0.0536, + "macro_f1": 0.3333333432674408, + "num_tokens": 1313243.0, + "repeat_count": 0.0, + "routers_loss": 0.021804286167025566, + "skip_count": 0.0, + "step": 812, + "text_loss": 0.22349292039871216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.8218373936014087, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0009976554653998138, + "loss": 0.0612, + "macro_f1": 0.31446540355682373, + "num_tokens": 1316165.0, + "repeat_count": 0.0, + "routers_loss": 0.10715524107217789, + "skip_count": 2.0, + "step": 814, + "text_loss": 0.18035532534122467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8312298209568536, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000997625431384671, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1319206.0, + "repeat_count": 0.0, + "routers_loss": 0.007173649035394192, + "skip_count": 0.0, + "step": 816, + "text_loss": 0.48928648233413696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8406222483122985, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009975952066785243, + "loss": 0.0655, + "macro_f1": 0.3006536364555359, + "num_tokens": 1322549.0, + "repeat_count": 1.0, + "routers_loss": 0.22308112680912018, + "skip_count": 4.0, + "step": 818, + "text_loss": 0.5211259722709656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009975647912929557, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1325213.0, + "repeat_count": 0.0, + "routers_loss": 0.00998698640614748, + "skip_count": 0.0, + "step": 820, + "text_loss": 0.7117052674293518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8594071030231873, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009975341852396205, + "loss": 0.0723, + "macro_f1": 0.32098764181137085, + "num_tokens": 1328383.0, + "repeat_count": 0.0, + "routers_loss": 0.07454588264226913, + "skip_count": 2.0, + "step": 822, + "text_loss": 0.34539610147476196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8687995303786322, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009975033885302469, + "loss": 0.0604, + "macro_f1": 0.3333333432674408, + "num_tokens": 1331406.0, + "repeat_count": 0.0, + "routers_loss": 0.009157589636743069, + "skip_count": 0.0, + "step": 824, + "text_loss": 0.7484824657440186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.878191957734077, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009974724011766363, + "loss": 0.0474, + "macro_f1": 0.3272727429866791, + "num_tokens": 1334410.0, + "repeat_count": 1.0, + "routers_loss": 0.17149391770362854, + "skip_count": 0.0, + "step": 826, + "text_loss": 0.5913820266723633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8875843850895215, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009974412231906632, + "loss": 0.058, + "macro_f1": 0.32098764181137085, + "num_tokens": 1337653.0, + "repeat_count": 1.0, + "routers_loss": 0.09743282198905945, + "skip_count": 1.0, + "step": 828, + "text_loss": 0.2505693733692169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8969768124449664, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009974098545842748, + "loss": 0.0638, + "macro_f1": 0.3272727429866791, + "num_tokens": 1340860.0, + "repeat_count": 0.0, + "routers_loss": 0.041490405797958374, + "skip_count": 1.0, + "step": 830, + "text_loss": 0.5585370063781738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.906369239800411, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009973782953694918, + "loss": 0.0746, + "macro_f1": 0.3006536066532135, + "num_tokens": 1344232.0, + "repeat_count": 1.0, + "routers_loss": 0.16080693900585175, + "skip_count": 3.0, + "step": 832, + "text_loss": 0.4782734513282776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9157616671558557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.000997346545558408, + "loss": 0.0522, + "macro_f1": 0.3333333432674408, + "num_tokens": 1347667.0, + "repeat_count": 0.0, + "routers_loss": 0.01173500344157219, + "skip_count": 0.0, + "step": 834, + "text_loss": 0.25036177039146423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009973146051631895, + "loss": 0.0522, + "macro_f1": 0.3333333432674408, + "num_tokens": 1350707.0, + "repeat_count": 0.0, + "routers_loss": 0.011477196589112282, + "skip_count": 0.0, + "step": 836, + "text_loss": 0.5482863187789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009972824741960764, + "loss": 0.0536, + "macro_f1": 0.3333333432674408, + "num_tokens": 1353704.0, + "repeat_count": 0.0, + "routers_loss": 0.010528896935284138, + "skip_count": 0.0, + "step": 838, + "text_loss": 0.6732596158981323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9439389492221895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1181640625, + "learning_rate": 0.000997250152669381, + "loss": 0.0573, + "macro_f1": 0.3333333432674408, + "num_tokens": 1356608.0, + "repeat_count": 0.0, + "routers_loss": 0.010678744874894619, + "skip_count": 0.0, + "step": 840, + "text_loss": 0.5479338765144348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9533313765776343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.181640625, + "learning_rate": 0.000997217640595489, + "loss": 0.0631, + "macro_f1": 0.3333333432674408, + "num_tokens": 1359809.0, + "repeat_count": 0.0, + "routers_loss": 0.00835978239774704, + "skip_count": 0.0, + "step": 842, + "text_loss": 0.42543259263038635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9627238039330788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009971849379868593, + "loss": 0.0653, + "macro_f1": 0.3333333432674408, + "num_tokens": 1362201.0, + "repeat_count": 0.0, + "routers_loss": 0.009930923581123352, + "skip_count": 0.0, + "step": 844, + "text_loss": 0.720462441444397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9721162312885236, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009971520448560235, + "loss": 0.0615, + "macro_f1": 0.3272727429866791, + "num_tokens": 1365790.0, + "repeat_count": 0.0, + "routers_loss": 0.06344373524188995, + "skip_count": 1.0, + "step": 846, + "text_loss": 0.8423607349395752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 3.9815086586439685, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.16796875, + "learning_rate": 0.000997118961215586, + "loss": 0.0674, + "macro_f1": 0.4533333480358124, + "num_tokens": 1368387.0, + "repeat_count": 1.0, + "routers_loss": 0.14688406884670258, + "skip_count": 3.0, + "step": 848, + "text_loss": 0.3933577537536621 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000997085687078225, + "loss": 0.0518, + "macro_f1": 0.3333333432674408, + "num_tokens": 1371189.0, + "repeat_count": 0.0, + "routers_loss": 0.009953443892300129, + "skip_count": 0.0, + "step": 850, + "text_loss": 0.41469162702560425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.0, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009970522224566909, + "loss": 0.0555, + "macro_f1": 0.32098767161369324, + "num_tokens": 1374008.0, + "repeat_count": 0.0, + "routers_loss": 0.048870690166950226, + "skip_count": 1.0, + "step": 852, + "text_loss": 0.613615870475769 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.009392427355444, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0009970185673638075, + "loss": 0.0629, + "macro_f1": 0.32098764181137085, + "num_tokens": 1376662.0, + "repeat_count": 1.0, + "routers_loss": 0.06865929812192917, + "skip_count": 1.0, + "step": 854, + "text_loss": 0.4392736256122589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 4.01878485471089, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.162109375, + "learning_rate": 0.0009969847218124716, + "loss": 0.0506, + "macro_f1": 0.5492662787437439, + "num_tokens": 1380049.0, + "repeat_count": 0.0, + "routers_loss": 0.02382219396531582, + "skip_count": 1.0, + "step": 856, + "text_loss": 0.19115346670150757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.028177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009969506858156527, + "loss": 0.0344, + "macro_f1": 0.3272727429866791, + "num_tokens": 1383008.0, + "repeat_count": 0.0, + "routers_loss": 0.03907281160354614, + "skip_count": 1.0, + "step": 858, + "text_loss": 0.34842637181282043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.037569709421779, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12060546875, + "learning_rate": 0.0009969164593863935, + "loss": 0.0365, + "macro_f1": 0.3333333432674408, + "num_tokens": 1387051.0, + "repeat_count": 0.0, + "routers_loss": 0.007645803038030863, + "skip_count": 0.0, + "step": 860, + "text_loss": 0.3810436725616455 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.046962136777223, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009968820425378098, + "loss": 0.0463, + "macro_f1": 0.3272727429866791, + "num_tokens": 1390244.0, + "repeat_count": 1.0, + "routers_loss": 0.04435238987207413, + "skip_count": 0.0, + "step": 862, + "text_loss": 0.34853485226631165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.056354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28515625, + "learning_rate": 0.00099684743528309, + "loss": 0.0424, + "macro_f1": 0.3333333432674408, + "num_tokens": 1392976.0, + "repeat_count": 0.0, + "routers_loss": 0.006071661598980427, + "skip_count": 0.0, + "step": 864, + "text_loss": 0.6395178437232971 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.065746991488113, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009968126376354958, + "loss": 0.0477, + "macro_f1": 0.5492662787437439, + "num_tokens": 1396061.0, + "repeat_count": 0.0, + "routers_loss": 0.05011235550045967, + "skip_count": 2.0, + "step": 866, + "text_loss": 0.09103966504335403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.075139418843557, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009967776496083616, + "loss": 0.0509, + "macro_f1": 0.3272727429866791, + "num_tokens": 1398993.0, + "repeat_count": 1.0, + "routers_loss": 0.03979124873876572, + "skip_count": 0.0, + "step": 868, + "text_loss": 0.27257058024406433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.084531846199002, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.14453125, + "learning_rate": 0.000996742471215095, + "loss": 0.0516, + "macro_f1": 0.5492662787437439, + "num_tokens": 1402080.0, + "repeat_count": 0.0, + "routers_loss": 0.030823837965726852, + "skip_count": 2.0, + "step": 870, + "text_loss": 0.7047103047370911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.093924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009967071024691763, + "loss": 0.0461, + "macro_f1": 0.3333333432674408, + "num_tokens": 1404890.0, + "repeat_count": 0.0, + "routers_loss": 0.009721715934574604, + "skip_count": 0.0, + "step": 872, + "text_loss": 0.959106981754303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.000996671543384159, + "loss": 0.05, + "macro_f1": 0.3333333432674408, + "num_tokens": 1407853.0, + "repeat_count": 0.0, + "routers_loss": 0.006025883834809065, + "skip_count": 0.0, + "step": 874, + "text_loss": 0.47571972012519836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.112709128265336, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.0009966357939736692, + "loss": 0.0416, + "macro_f1": 0.3272727429866791, + "num_tokens": 1410723.0, + "repeat_count": 0.0, + "routers_loss": 0.025964925065636635, + "skip_count": 0.0, + "step": 876, + "text_loss": 0.4964611530303955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.122101555620781, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009965998542514065, + "loss": 0.0415, + "macro_f1": 0.32098764181137085, + "num_tokens": 1414008.0, + "repeat_count": 0.0, + "routers_loss": 0.09509637206792831, + "skip_count": 2.0, + "step": 878, + "text_loss": 0.621494710445404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 4.131493982976226, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009965637242311427, + "loss": 0.0472, + "macro_f1": 0.542222261428833, + "num_tokens": 1417447.0, + "repeat_count": 0.0, + "routers_loss": 0.02520318515598774, + "skip_count": 4.0, + "step": 880, + "text_loss": 0.40209758281707764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 25.0, + "epoch": 4.14088641033167, + "f1_execute": 0.936170220375061, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.263671875, + "learning_rate": 0.000996527403926723, + "loss": 0.0495, + "macro_f1": 0.5342789888381958, + "num_tokens": 1419905.0, + "repeat_count": 0.0, + "routers_loss": 0.13183781504631042, + "skip_count": 6.0, + "step": 882, + "text_loss": 0.642185389995575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.1502788376871145, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009964908933520655, + "loss": 0.0375, + "macro_f1": 0.3333333432674408, + "num_tokens": 1423436.0, + "repeat_count": 0.0, + "routers_loss": 0.009429510682821274, + "skip_count": 0.0, + "step": 884, + "text_loss": 0.48232755064964294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.15967126504256, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1669921875, + "learning_rate": 0.0009964541925211613, + "loss": 0.0349, + "macro_f1": 0.32098764181137085, + "num_tokens": 1426842.0, + "repeat_count": 0.0, + "routers_loss": 0.07629609107971191, + "skip_count": 2.0, + "step": 886, + "text_loss": 0.16620934009552002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.169063692398004, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009964173014480738, + "loss": 0.0348, + "macro_f1": 0.5492662787437439, + "num_tokens": 1430430.0, + "repeat_count": 0.0, + "routers_loss": 0.036814019083976746, + "skip_count": 2.0, + "step": 888, + "text_loss": 0.4866008758544922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009963802201469398, + "loss": 0.0476, + "macro_f1": 0.3333333432674408, + "num_tokens": 1433821.0, + "repeat_count": 0.0, + "routers_loss": 0.0041250260546803474, + "skip_count": 0.0, + "step": 890, + "text_loss": 0.578216552734375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.187848547108893, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2373046875, + "learning_rate": 0.0009963429486319693, + "loss": 0.0463, + "macro_f1": 0.32098764181137085, + "num_tokens": 1436976.0, + "repeat_count": 0.0, + "routers_loss": 0.06213559955358505, + "skip_count": 2.0, + "step": 892, + "text_loss": 0.221701517701149 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 4.197240974464338, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.361328125, + "learning_rate": 0.0009963054869174446, + "loss": 0.0313, + "macro_f1": 0.4871794879436493, + "num_tokens": 1440397.0, + "repeat_count": 0.0, + "routers_loss": 0.07532428950071335, + "skip_count": 2.0, + "step": 894, + "text_loss": 0.6922838091850281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.206633401819783, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009962678350177209, + "loss": 0.0472, + "macro_f1": 0.3272727429866791, + "num_tokens": 1443604.0, + "repeat_count": 0.0, + "routers_loss": 0.0419243648648262, + "skip_count": 1.0, + "step": 896, + "text_loss": 0.22092342376708984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.216025829175227, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009962299929472268, + "loss": 0.034, + "macro_f1": 0.32098764181137085, + "num_tokens": 1446257.0, + "repeat_count": 2.0, + "routers_loss": 0.10849297791719437, + "skip_count": 0.0, + "step": 898, + "text_loss": 0.26394811272621155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.000996191960720463, + "loss": 0.0394, + "macro_f1": 0.3333333432674408, + "num_tokens": 1449669.0, + "repeat_count": 0.0, + "routers_loss": 0.0092767970636487, + "skip_count": 0.0, + "step": 900, + "text_loss": 0.5338577628135681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.234810683886117, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009961537383520042, + "loss": 0.0354, + "macro_f1": 0.3272727429866791, + "num_tokens": 1452450.0, + "repeat_count": 1.0, + "routers_loss": 0.02985367365181446, + "skip_count": 0.0, + "step": 902, + "text_loss": 0.5875228047370911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.2442031112415615, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009961153258564966, + "loss": 0.0378, + "macro_f1": 0.3144654333591461, + "num_tokens": 1456909.0, + "repeat_count": 0.0, + "routers_loss": 0.06794842332601547, + "skip_count": 3.0, + "step": 904, + "text_loss": 0.40959444642066956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.253595538597006, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009960767232486604, + "loss": 0.0476, + "macro_f1": 0.3333333432674408, + "num_tokens": 1461712.0, + "repeat_count": 0.0, + "routers_loss": 0.0023562447167932987, + "skip_count": 0.0, + "step": 906, + "text_loss": 0.3932875096797943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.000996037930543288, + "loss": 0.0505, + "macro_f1": 0.3272727429866791, + "num_tokens": 1464817.0, + "repeat_count": 0.0, + "routers_loss": 0.03880339860916138, + "skip_count": 1.0, + "step": 908, + "text_loss": 0.17482402920722961 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.272380393307896, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2119140625, + "learning_rate": 0.000995998947755245, + "loss": 0.0479, + "macro_f1": 0.3272727429866791, + "num_tokens": 1467810.0, + "repeat_count": 0.0, + "routers_loss": 0.01736828312277794, + "skip_count": 1.0, + "step": 910, + "text_loss": 0.4140470325946808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009959597748994695, + "loss": 0.0752, + "macro_f1": 0.3333333432674408, + "num_tokens": 1470802.0, + "repeat_count": 0.0, + "routers_loss": 0.011824851855635643, + "skip_count": 0.0, + "step": 912, + "text_loss": 0.7153383493423462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.2911652480187845, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009959204119909726, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1474539.0, + "repeat_count": 0.0, + "routers_loss": 0.025456594303250313, + "skip_count": 0.0, + "step": 914, + "text_loss": 0.42812058329582214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009958808590448385, + "loss": 0.0489, + "macro_f1": 0.3333333432674408, + "num_tokens": 1477552.0, + "repeat_count": 0.0, + "routers_loss": 0.006795851048082113, + "skip_count": 0.0, + "step": 916, + "text_loss": 0.5402814149856567 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009958411160762234, + "loss": 0.039, + "macro_f1": 0.3333333432674408, + "num_tokens": 1482547.0, + "repeat_count": 0.0, + "routers_loss": 0.015615932643413544, + "skip_count": 0.0, + "step": 918, + "text_loss": 0.3836168050765991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.319342530085119, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009958011831003577, + "loss": 0.0448, + "macro_f1": 0.3272727429866791, + "num_tokens": 1485807.0, + "repeat_count": 0.0, + "routers_loss": 0.043541423976421356, + "skip_count": 1.0, + "step": 920, + "text_loss": 0.4333936274051666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 4.328734957440563, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1337890625, + "learning_rate": 0.000995761060132543, + "loss": 0.0418, + "macro_f1": 0.6538461446762085, + "num_tokens": 1488941.0, + "repeat_count": 1.0, + "routers_loss": 0.05866432189941406, + "skip_count": 2.0, + "step": 922, + "text_loss": 0.4106994867324829 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.3381273847960085, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009957207471881552, + "loss": 0.0531, + "macro_f1": 0.5492662787437439, + "num_tokens": 1492026.0, + "repeat_count": 0.0, + "routers_loss": 0.02714901603758335, + "skip_count": 2.0, + "step": 924, + "text_loss": 0.542091429233551 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.347519812151453, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.0009956802442826415, + "loss": 0.0386, + "macro_f1": 0.3272727429866791, + "num_tokens": 1494543.0, + "repeat_count": 1.0, + "routers_loss": 0.0563737191259861, + "skip_count": 0.0, + "step": 926, + "text_loss": 0.47209203243255615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.356912239506897, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009956395514315235, + "loss": 0.0496, + "macro_f1": 0.3272727429866791, + "num_tokens": 1497831.0, + "repeat_count": 1.0, + "routers_loss": 0.03285066783428192, + "skip_count": 0.0, + "step": 928, + "text_loss": 0.6628931164741516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.366304666862343, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009955986686503943, + "loss": 0.0466, + "macro_f1": 0.3272727429866791, + "num_tokens": 1501375.0, + "repeat_count": 0.0, + "routers_loss": 0.024297121912240982, + "skip_count": 1.0, + "step": 930, + "text_loss": 0.495676189661026 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 4.375697094217787, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009955575959549202, + "loss": 0.0424, + "macro_f1": 0.7795917987823486, + "num_tokens": 1504363.0, + "repeat_count": 1.0, + "routers_loss": 0.12196464836597443, + "skip_count": 4.0, + "step": 932, + "text_loss": 0.26123273372650146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.0009955163333608408, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 1507178.0, + "repeat_count": 0.0, + "routers_loss": 0.012947078794240952, + "skip_count": 0.0, + "step": 934, + "text_loss": 0.32552677392959595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.394481948928676, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009954748808839674, + "loss": 0.0379, + "macro_f1": 0.3333333432674408, + "num_tokens": 1509910.0, + "repeat_count": 0.0, + "routers_loss": 0.008946365676820278, + "skip_count": 0.0, + "step": 936, + "text_loss": 0.533141016960144 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.403874376284121, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000995433238540185, + "loss": 0.0466, + "macro_f1": 0.6538461446762085, + "num_tokens": 1512826.0, + "repeat_count": 1.0, + "routers_loss": 0.029975678771734238, + "skip_count": 1.0, + "step": 938, + "text_loss": 0.2953577935695648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.413266803639566, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009953914063454512, + "loss": 0.0497, + "macro_f1": 0.3144654333591461, + "num_tokens": 1517230.0, + "repeat_count": 1.0, + "routers_loss": 0.0889134630560875, + "skip_count": 2.0, + "step": 940, + "text_loss": 0.5368834733963013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.000995349384315796, + "loss": 0.0413, + "macro_f1": 0.3333333432674408, + "num_tokens": 1519876.0, + "repeat_count": 0.0, + "routers_loss": 0.013458753935992718, + "skip_count": 0.0, + "step": 942, + "text_loss": 0.2005518227815628 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.432051658350455, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.000995307172467322, + "loss": 0.0444, + "macro_f1": 0.31446540355682373, + "num_tokens": 1522998.0, + "repeat_count": 1.0, + "routers_loss": 0.08850377053022385, + "skip_count": 1.0, + "step": 944, + "text_loss": 0.227926567196846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.4414440857059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009952647708162054, + "loss": 0.0503, + "macro_f1": 0.3272727429866791, + "num_tokens": 1527100.0, + "repeat_count": 0.0, + "routers_loss": 0.03199794515967369, + "skip_count": 1.0, + "step": 946, + "text_loss": 0.4859686493873596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.450836513061344, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009952221793786942, + "loss": 0.0354, + "macro_f1": 0.3333333432674408, + "num_tokens": 1530028.0, + "repeat_count": 0.0, + "routers_loss": 0.006507779937237501, + "skip_count": 0.0, + "step": 948, + "text_loss": 0.6855354905128479 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.460228940416789, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009951793981711097, + "loss": 0.0584, + "macro_f1": 0.6538461446762085, + "num_tokens": 1533254.0, + "repeat_count": 1.0, + "routers_loss": 0.06175103038549423, + "skip_count": 1.0, + "step": 950, + "text_loss": 0.7590400576591492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.469621367772234, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009951364272098458, + "loss": 0.0295, + "macro_f1": 0.5492662787437439, + "num_tokens": 1536239.0, + "repeat_count": 0.0, + "routers_loss": 0.03773383051156998, + "skip_count": 2.0, + "step": 952, + "text_loss": 0.669784665107727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.4790137951276785, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009950932665113688, + "loss": 0.0507, + "macro_f1": 0.32098764181137085, + "num_tokens": 1539682.0, + "repeat_count": 0.0, + "routers_loss": 0.07280613481998444, + "skip_count": 2.0, + "step": 954, + "text_loss": 0.3365570902824402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009950499160922184, + "loss": 0.0541, + "macro_f1": 0.3333333432674408, + "num_tokens": 1542875.0, + "repeat_count": 0.0, + "routers_loss": 0.01770266517996788, + "skip_count": 0.0, + "step": 956, + "text_loss": 0.0921545997262001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.497798649838567, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09375, + "learning_rate": 0.000995006375969006, + "loss": 0.0473, + "macro_f1": 0.3272727429866791, + "num_tokens": 1547135.0, + "repeat_count": 1.0, + "routers_loss": 0.07672002166509628, + "skip_count": 0.0, + "step": 958, + "text_loss": 0.5887606739997864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.507191077194013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009949626461584165, + "loss": 0.043, + "macro_f1": 0.3333333432674408, + "num_tokens": 1550100.0, + "repeat_count": 0.0, + "routers_loss": 0.006247182376682758, + "skip_count": 0.0, + "step": 960, + "text_loss": 0.5777931213378906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.119140625, + "learning_rate": 0.0009949187266772076, + "loss": 0.0366, + "macro_f1": 0.5492662787437439, + "num_tokens": 1553192.0, + "repeat_count": 0.0, + "routers_loss": 0.030319908633828163, + "skip_count": 2.0, + "step": 962, + "text_loss": 0.2370252162218094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.5259759319049016, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009948746175422088, + "loss": 0.0511, + "macro_f1": 0.3333333432674408, + "num_tokens": 1556318.0, + "repeat_count": 0.0, + "routers_loss": 0.006004320923238993, + "skip_count": 0.0, + "step": 964, + "text_loss": 0.6271032094955444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000994830318770323, + "loss": 0.0514, + "macro_f1": 0.3333333432674408, + "num_tokens": 1559195.0, + "repeat_count": 0.0, + "routers_loss": 0.011544366367161274, + "skip_count": 0.0, + "step": 966, + "text_loss": 0.47256720066070557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 4.544760786615791, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009947858303785255, + "loss": 0.0374, + "macro_f1": 0.6603773832321167, + "num_tokens": 1561813.0, + "repeat_count": 1.0, + "routers_loss": 0.05258861929178238, + "skip_count": 1.0, + "step": 968, + "text_loss": 0.7703132629394531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.554153213971236, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.0009947411523838648, + "loss": 0.0453, + "macro_f1": 0.3333333432674408, + "num_tokens": 1564634.0, + "repeat_count": 0.0, + "routers_loss": 0.011216280050575733, + "skip_count": 0.0, + "step": 970, + "text_loss": 0.4666804075241089 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009946962848034608, + "loss": 0.0696, + "macro_f1": 0.3333333432674408, + "num_tokens": 1567959.0, + "repeat_count": 0.0, + "routers_loss": 0.009387624450027943, + "skip_count": 0.0, + "step": 972, + "text_loss": 0.4067264199256897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.5729380686821255, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.0009946512276545075, + "loss": 0.0397, + "macro_f1": 0.3272727429866791, + "num_tokens": 1571221.0, + "repeat_count": 1.0, + "routers_loss": 0.041713520884513855, + "skip_count": 0.0, + "step": 974, + "text_loss": 0.5242366194725037 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 4.58233049603757, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.228515625, + "learning_rate": 0.0009946059809542705, + "loss": 0.0487, + "macro_f1": 0.7644445300102234, + "num_tokens": 1575033.0, + "repeat_count": 2.0, + "routers_loss": 0.05748331546783447, + "skip_count": 2.0, + "step": 976, + "text_loss": 0.5704690217971802 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 4.591722923393014, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009945605447200887, + "loss": 0.0445, + "macro_f1": 0.3272727429866791, + "num_tokens": 1579050.0, + "repeat_count": 0.0, + "routers_loss": 0.016765203326940536, + "skip_count": 0.0, + "step": 978, + "text_loss": 0.4804173707962036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.601115350748459, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009945149189693732, + "loss": 0.0406, + "macro_f1": 0.5492662787437439, + "num_tokens": 1582967.0, + "repeat_count": 0.0, + "routers_loss": 0.021518222987651825, + "skip_count": 2.0, + "step": 980, + "text_loss": 0.4138598144054413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.610507778103904, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009944691037196078, + "loss": 0.0456, + "macro_f1": 0.3333333432674408, + "num_tokens": 1586282.0, + "repeat_count": 0.0, + "routers_loss": 0.012246460653841496, + "skip_count": 0.0, + "step": 982, + "text_loss": 0.22561736404895782 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 4.6199002054593485, + "f1_execute": 0.930232584476471, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.8000000715255737, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009944230989883491, + "loss": 0.0456, + "macro_f1": 0.7989664077758789, + "num_tokens": 1589279.0, + "repeat_count": 2.0, + "routers_loss": 0.09344895929098129, + "skip_count": 5.0, + "step": 984, + "text_loss": 0.4416656494140625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.629292632814793, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.111328125, + "learning_rate": 0.0009943769047932264, + "loss": 0.0404, + "macro_f1": 0.5359477400779724, + "num_tokens": 1592398.0, + "repeat_count": 2.0, + "routers_loss": 0.08916857838630676, + "skip_count": 2.0, + "step": 986, + "text_loss": 0.5536438822746277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.638685060170237, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000994330521151941, + "loss": 0.039, + "macro_f1": 0.32098764181137085, + "num_tokens": 1596213.0, + "repeat_count": 1.0, + "routers_loss": 0.06114347651600838, + "skip_count": 1.0, + "step": 988, + "text_loss": 0.5835405588150024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.000994283948082267, + "loss": 0.0573, + "macro_f1": 0.3333333432674408, + "num_tokens": 1598827.0, + "repeat_count": 0.0, + "routers_loss": 0.0017335431184619665, + "skip_count": 0.0, + "step": 990, + "text_loss": 0.5857380032539368 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.657469914881127, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009942371856020522, + "loss": 0.0341, + "macro_f1": 0.3333333432674408, + "num_tokens": 1602915.0, + "repeat_count": 0.0, + "routers_loss": 0.014606470242142677, + "skip_count": 0.0, + "step": 992, + "text_loss": 0.6939892768859863 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 4.666862342236572, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009941902337292155, + "loss": 0.06, + "macro_f1": 0.6598639488220215, + "num_tokens": 1605776.0, + "repeat_count": 3.0, + "routers_loss": 0.06297315657138824, + "skip_count": 1.0, + "step": 994, + "text_loss": 0.37616831064224243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.676254769592017, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009941430924817487, + "loss": 0.0572, + "macro_f1": 0.5492662787437439, + "num_tokens": 1609856.0, + "repeat_count": 0.0, + "routers_loss": 0.03297794610261917, + "skip_count": 2.0, + "step": 996, + "text_loss": 0.2098303586244583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.685647196947461, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.000994095761877717, + "loss": 0.0499, + "macro_f1": 0.3333333432674408, + "num_tokens": 1612904.0, + "repeat_count": 0.0, + "routers_loss": 0.012901155278086662, + "skip_count": 0.0, + "step": 998, + "text_loss": 0.20103533565998077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.259765625, + "learning_rate": 0.000994048241935257, + "loss": 0.0535, + "macro_f1": 0.3272727429866791, + "num_tokens": 1615540.0, + "repeat_count": 0.0, + "routers_loss": 0.020434845238924026, + "skip_count": 0.0, + "step": 1000, + "text_loss": 0.32709044218063354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.70443205165835, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1669921875, + "learning_rate": 0.0009940005326725789, + "loss": 0.0453, + "macro_f1": 0.32098764181137085, + "num_tokens": 1618786.0, + "repeat_count": 0.0, + "routers_loss": 0.07831378281116486, + "skip_count": 2.0, + "step": 1002, + "text_loss": 0.5789632797241211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21875, + "learning_rate": 0.0009939526341079647, + "loss": 0.0511, + "macro_f1": 0.32098764181137085, + "num_tokens": 1621736.0, + "repeat_count": 2.0, + "routers_loss": 0.04863874986767769, + "skip_count": 0.0, + "step": 1004, + "text_loss": 0.6128849387168884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009939045462597693, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 1624649.0, + "repeat_count": 0.0, + "routers_loss": 0.00677989237010479, + "skip_count": 0.0, + "step": 1006, + "text_loss": 0.6168264150619507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.732609333724684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009938562691464202, + "loss": 0.0524, + "macro_f1": 0.3333333432674408, + "num_tokens": 1627700.0, + "repeat_count": 0.0, + "routers_loss": 0.019490402191877365, + "skip_count": 0.0, + "step": 1008, + "text_loss": 0.17463822662830353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.000993807802786417, + "loss": 0.0475, + "macro_f1": 0.3333333432674408, + "num_tokens": 1630714.0, + "repeat_count": 0.0, + "routers_loss": 0.0019022391643375158, + "skip_count": 0.0, + "step": 1010, + "text_loss": 0.5675593018531799 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 4.751394188435574, + "f1_execute": 0.9599999785423279, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1640625, + "learning_rate": 0.0009937591471983322, + "loss": 0.0501, + "macro_f1": 0.7644444704055786, + "num_tokens": 1633770.0, + "repeat_count": 1.0, + "routers_loss": 0.042485643178224564, + "skip_count": 2.0, + "step": 1012, + "text_loss": 0.42387229204177856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.760786615791019, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009937103024008109, + "loss": 0.0545, + "macro_f1": 0.3272727429866791, + "num_tokens": 1637120.0, + "repeat_count": 0.0, + "routers_loss": 0.09427817165851593, + "skip_count": 1.0, + "step": 1014, + "text_loss": 0.49511051177978516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009936612684125702, + "loss": 0.0503, + "macro_f1": 0.3333333432674408, + "num_tokens": 1640165.0, + "repeat_count": 0.0, + "routers_loss": 0.005106127820909023, + "skip_count": 0.0, + "step": 1016, + "text_loss": 0.5398799180984497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.7795714705019074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2734375, + "learning_rate": 0.0009936120452524004, + "loss": 0.0506, + "macro_f1": 0.3333333432674408, + "num_tokens": 1643251.0, + "repeat_count": 0.0, + "routers_loss": 0.016914300620555878, + "skip_count": 0.0, + "step": 1018, + "text_loss": 0.20882178843021393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.788963897857353, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009935626329391637, + "loss": 0.0537, + "macro_f1": 0.32098764181137085, + "num_tokens": 1646560.0, + "repeat_count": 0.0, + "routers_loss": 0.13481520116329193, + "skip_count": 2.0, + "step": 1020, + "text_loss": 0.5719883441925049 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.798356325212797, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009935130314917948, + "loss": 0.0602, + "macro_f1": 0.5492662787437439, + "num_tokens": 1649538.0, + "repeat_count": 0.0, + "routers_loss": 0.07700438797473907, + "skip_count": 2.0, + "step": 1022, + "text_loss": 0.1303367167711258 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.807748752568242, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009934632409293015, + "loss": 0.0611, + "macro_f1": 0.32098764181137085, + "num_tokens": 1652397.0, + "repeat_count": 1.0, + "routers_loss": 0.11416907608509064, + "skip_count": 1.0, + "step": 1024, + "text_loss": 0.24076920747756958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.817141179923686, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.0009934132612707631, + "loss": 0.0507, + "macro_f1": 0.31446540355682373, + "num_tokens": 1654938.0, + "repeat_count": 0.0, + "routers_loss": 0.09484589844942093, + "skip_count": 2.0, + "step": 1026, + "text_loss": 0.1652517318725586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009933630925353324, + "loss": 0.0395, + "macro_f1": 0.3333333432674408, + "num_tokens": 1658536.0, + "repeat_count": 0.0, + "routers_loss": 0.00741987070068717, + "skip_count": 0.0, + "step": 1028, + "text_loss": 0.49296700954437256 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.835926034634576, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1845703125, + "learning_rate": 0.0009933127347422337, + "loss": 0.0602, + "macro_f1": 0.32098764181137085, + "num_tokens": 1661446.0, + "repeat_count": 0.0, + "routers_loss": 0.08399344235658646, + "skip_count": 2.0, + "step": 1030, + "text_loss": 0.22363591194152832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.158203125, + "learning_rate": 0.0009932621879107648, + "loss": 0.0475, + "macro_f1": 0.3333333432674408, + "num_tokens": 1664612.0, + "repeat_count": 0.0, + "routers_loss": 0.0031781597062945366, + "skip_count": 0.0, + "step": 1032, + "text_loss": 0.36083245277404785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.854710889345466, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.000993211452060295, + "loss": 0.042, + "macro_f1": 0.3272727429866791, + "num_tokens": 1667467.0, + "repeat_count": 0.0, + "routers_loss": 0.03595469892024994, + "skip_count": 1.0, + "step": 1034, + "text_loss": 0.16372856497764587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.86410331670091, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000993160527210266, + "loss": 0.061, + "macro_f1": 0.3144654333591461, + "num_tokens": 1670675.0, + "repeat_count": 3.0, + "routers_loss": 0.1597205102443695, + "skip_count": 0.0, + "step": 1036, + "text_loss": 0.6049913763999939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2197265625, + "learning_rate": 0.000993109413380193, + "loss": 0.0562, + "macro_f1": 0.3333333432674408, + "num_tokens": 1673477.0, + "repeat_count": 0.0, + "routers_loss": 0.009756010957062244, + "skip_count": 0.0, + "step": 1038, + "text_loss": 0.7034620642662048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.882888171411799, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.0009930581105896624, + "loss": 0.0559, + "macro_f1": 0.3272727429866791, + "num_tokens": 1676809.0, + "repeat_count": 0.0, + "routers_loss": 0.020718922838568687, + "skip_count": 0.0, + "step": 1040, + "text_loss": 0.2814720571041107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.892280598767244, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009930066188583338, + "loss": 0.0445, + "macro_f1": 0.32098764181137085, + "num_tokens": 1679398.0, + "repeat_count": 1.0, + "routers_loss": 0.04755603149533272, + "skip_count": 1.0, + "step": 1042, + "text_loss": 0.5445759296417236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.0009929549382059388, + "loss": 0.0509, + "macro_f1": 0.3333333432674408, + "num_tokens": 1682269.0, + "repeat_count": 0.0, + "routers_loss": 0.01040949858725071, + "skip_count": 0.0, + "step": 1044, + "text_loss": 0.2876914143562317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.911065453478133, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009929030686522816, + "loss": 0.0363, + "macro_f1": 0.3333333432674408, + "num_tokens": 1685428.0, + "repeat_count": 0.0, + "routers_loss": 0.008158888667821884, + "skip_count": 0.0, + "step": 1046, + "text_loss": 0.49053525924682617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.9204578808335775, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009928510102172386, + "loss": 0.0498, + "macro_f1": 0.3333333432674408, + "num_tokens": 1688252.0, + "repeat_count": 0.0, + "routers_loss": 0.005102572031319141, + "skip_count": 0.0, + "step": 1048, + "text_loss": 0.5274341106414795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009927987629207587, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1691289.0, + "repeat_count": 0.0, + "routers_loss": 0.016768503934144974, + "skip_count": 0.0, + "step": 1050, + "text_loss": 0.9935035109519958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.939242735544467, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009927463267828634, + "loss": 0.0488, + "macro_f1": 0.3333333432674408, + "num_tokens": 1694148.0, + "repeat_count": 0.0, + "routers_loss": 0.010905829258263111, + "skip_count": 0.0, + "step": 1052, + "text_loss": 0.20895758271217346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.948635162899912, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.000992693701823646, + "loss": 0.0624, + "macro_f1": 0.3272727429866791, + "num_tokens": 1698543.0, + "repeat_count": 1.0, + "routers_loss": 0.10533971339464188, + "skip_count": 0.0, + "step": 1054, + "text_loss": 0.5776236653327942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.958027590255357, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009926408880632726, + "loss": 0.0556, + "macro_f1": 0.3272727429866791, + "num_tokens": 1702460.0, + "repeat_count": 0.0, + "routers_loss": 0.026313411071896553, + "skip_count": 1.0, + "step": 1056, + "text_loss": 0.34990596771240234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.967420017610801, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009925878855219818, + "loss": 0.0391, + "macro_f1": 0.3333333432674408, + "num_tokens": 1705686.0, + "repeat_count": 0.0, + "routers_loss": 0.007763393223285675, + "skip_count": 0.0, + "step": 1058, + "text_loss": 0.4980163276195526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.177734375, + "learning_rate": 0.000992534694220084, + "loss": 0.0613, + "macro_f1": 0.3272727429866791, + "num_tokens": 1708739.0, + "repeat_count": 0.0, + "routers_loss": 0.03998444974422455, + "skip_count": 1.0, + "step": 1060, + "text_loss": 0.29092350602149963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.98620487232169, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.000992481314177962, + "loss": 0.0312, + "macro_f1": 0.32098764181137085, + "num_tokens": 1711903.0, + "repeat_count": 1.0, + "routers_loss": 0.06966045498847961, + "skip_count": 1.0, + "step": 1062, + "text_loss": 0.6267179250717163 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.995597299677136, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.244140625, + "learning_rate": 0.0009924277454160717, + "loss": 0.0548, + "macro_f1": 0.3272727429866791, + "num_tokens": 1715974.0, + "repeat_count": 0.0, + "routers_loss": 0.05536063387989998, + "skip_count": 1.0, + "step": 1064, + "text_loss": 0.5813798904418945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.004696213677723, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009923739879549402, + "loss": 0.0423, + "macro_f1": 0.3333333432674408, + "num_tokens": 1718828.0, + "repeat_count": 0.0, + "routers_loss": 0.020993782207369804, + "skip_count": 0.0, + "step": 1066, + "text_loss": 0.22665327787399292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009923200418151677, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 1722419.0, + "repeat_count": 0.0, + "routers_loss": 0.007351701147854328, + "skip_count": 0.0, + "step": 1068, + "text_loss": 0.5796169638633728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.0234810683886115, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009922659070174264, + "loss": 0.0452, + "macro_f1": 0.3272727429866791, + "num_tokens": 1725663.0, + "repeat_count": 1.0, + "routers_loss": 0.026033315807580948, + "skip_count": 0.0, + "step": 1070, + "text_loss": 0.25742828845977783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009922115835824612, + "loss": 0.041, + "macro_f1": 0.3333333432674408, + "num_tokens": 1729239.0, + "repeat_count": 0.0, + "routers_loss": 0.0118600158020854, + "skip_count": 0.0, + "step": 1072, + "text_loss": 0.21630282700061798 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009921570715310884, + "loss": 0.0364, + "macro_f1": 0.6666666865348816, + "num_tokens": 1732507.0, + "repeat_count": 1.0, + "routers_loss": 0.016118815168738365, + "skip_count": 0.0, + "step": 1074, + "text_loss": 0.5639925003051758 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.051658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009921023708841974, + "loss": 0.0407, + "macro_f1": 0.3333333432674408, + "num_tokens": 1736182.0, + "repeat_count": 0.0, + "routers_loss": 0.004275390412658453, + "skip_count": 0.0, + "step": 1076, + "text_loss": 0.5758615136146545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009920474816627496, + "loss": 0.037, + "macro_f1": 0.3333333432674408, + "num_tokens": 1739559.0, + "repeat_count": 0.0, + "routers_loss": 0.01299292128533125, + "skip_count": 0.0, + "step": 1078, + "text_loss": 0.18221625685691833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.0704432051658355, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009919924038877788, + "loss": 0.0343, + "macro_f1": 0.32098764181137085, + "num_tokens": 1742890.0, + "repeat_count": 0.0, + "routers_loss": 0.038295745849609375, + "skip_count": 2.0, + "step": 1080, + "text_loss": 0.17354349792003632 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 5.07983563252128, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009919371375803905, + "loss": 0.0455, + "macro_f1": 0.8194444179534912, + "num_tokens": 1746433.0, + "repeat_count": 2.0, + "routers_loss": 0.04052971675992012, + "skip_count": 3.0, + "step": 1082, + "text_loss": 0.2250112146139145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009918816827617632, + "loss": 0.0353, + "macro_f1": 0.3333333432674408, + "num_tokens": 1750802.0, + "repeat_count": 0.0, + "routers_loss": 0.009114136919379234, + "skip_count": 0.0, + "step": 1084, + "text_loss": 0.2526719272136688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.098620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000991826039453147, + "loss": 0.0392, + "macro_f1": 0.3333333432674408, + "num_tokens": 1754272.0, + "repeat_count": 0.0, + "routers_loss": 0.004904678091406822, + "skip_count": 0.0, + "step": 1086, + "text_loss": 0.7308789491653442 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 5.108012914587614, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.138671875, + "learning_rate": 0.000991770207675865, + "loss": 0.0327, + "macro_f1": 0.6666666865348816, + "num_tokens": 1757231.0, + "repeat_count": 0.0, + "routers_loss": 0.02129189297556877, + "skip_count": 2.0, + "step": 1088, + "text_loss": 0.21764220297336578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.1174053419430585, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009917141874513113, + "loss": 0.0315, + "macro_f1": 0.3333333432674408, + "num_tokens": 1760003.0, + "repeat_count": 0.0, + "routers_loss": 0.01310618408024311, + "skip_count": 0.0, + "step": 1090, + "text_loss": 0.33892181515693665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.126797769298503, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.171875, + "learning_rate": 0.0009916579788009537, + "loss": 0.0457, + "macro_f1": 0.5492662787437439, + "num_tokens": 1763052.0, + "repeat_count": 0.0, + "routers_loss": 0.02059309557080269, + "skip_count": 2.0, + "step": 1092, + "text_loss": 0.6551769375801086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.136190196653947, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10546875, + "learning_rate": 0.0009916015817463312, + "loss": 0.0385, + "macro_f1": 0.5492662787437439, + "num_tokens": 1766655.0, + "repeat_count": 0.0, + "routers_loss": 0.0274797435849905, + "skip_count": 2.0, + "step": 1094, + "text_loss": 0.3984372019767761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.000991544996309055, + "loss": 0.0271, + "macro_f1": 0.3333333432674408, + "num_tokens": 1769997.0, + "repeat_count": 0.0, + "routers_loss": 0.01437368243932724, + "skip_count": 0.0, + "step": 1096, + "text_loss": 0.4203338921070099 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.154975051364837, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.000991488222510809, + "loss": 0.0292, + "macro_f1": 0.3333333432674408, + "num_tokens": 1773130.0, + "repeat_count": 0.0, + "routers_loss": 0.001382062560878694, + "skip_count": 0.0, + "step": 1098, + "text_loss": 0.43132516741752625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.164367478720282, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.123046875, + "learning_rate": 0.000991431260373349, + "loss": 0.0329, + "macro_f1": 0.3144654333591461, + "num_tokens": 1775682.0, + "repeat_count": 1.0, + "routers_loss": 0.1115434318780899, + "skip_count": 2.0, + "step": 1100, + "text_loss": 0.3218227028846741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.111328125, + "learning_rate": 0.000991374109918503, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 1778407.0, + "repeat_count": 0.0, + "routers_loss": 0.009529678151011467, + "skip_count": 0.0, + "step": 1102, + "text_loss": 0.17183731496334076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.183152333431171, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1142578125, + "learning_rate": 0.000991316771168171, + "loss": 0.044, + "macro_f1": 0.5492662787437439, + "num_tokens": 1781518.0, + "repeat_count": 0.0, + "routers_loss": 0.018668074160814285, + "skip_count": 2.0, + "step": 1104, + "text_loss": 1.1324785947799683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.192544760786616, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.125, + "learning_rate": 0.0009912592441443258, + "loss": 0.0411, + "macro_f1": 0.3272727429866791, + "num_tokens": 1784878.0, + "repeat_count": 0.0, + "routers_loss": 0.04145100712776184, + "skip_count": 1.0, + "step": 1106, + "text_loss": 0.6082063317298889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.20193718814206, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009912015288690112, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1788978.0, + "repeat_count": 0.0, + "routers_loss": 0.021450644358992577, + "skip_count": 1.0, + "step": 1108, + "text_loss": 0.5597621202468872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.2113296154975055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009911436253643444, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 1792321.0, + "repeat_count": 0.0, + "routers_loss": 0.017405325546860695, + "skip_count": 0.0, + "step": 1110, + "text_loss": 0.2560598850250244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.0009910855336525137, + "loss": 0.0383, + "macro_f1": 0.3333333432674408, + "num_tokens": 1795182.0, + "repeat_count": 0.0, + "routers_loss": 0.007162237539887428, + "skip_count": 0.0, + "step": 1112, + "text_loss": 0.3438240587711334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.230114470208394, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.115234375, + "learning_rate": 0.00099102725375578, + "loss": 0.0326, + "macro_f1": 0.480392187833786, + "num_tokens": 1798987.0, + "repeat_count": 1.0, + "routers_loss": 0.11149197816848755, + "skip_count": 3.0, + "step": 1114, + "text_loss": 0.20455503463745117 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.239506897563839, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009909687856964767, + "loss": 0.035, + "macro_f1": 0.3006536364555359, + "num_tokens": 1802064.0, + "repeat_count": 2.0, + "routers_loss": 0.12679415941238403, + "skip_count": 3.0, + "step": 1116, + "text_loss": 0.11996729671955109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.248899324919284, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009909101294970082, + "loss": 0.0365, + "macro_f1": 0.5492662787437439, + "num_tokens": 1805412.0, + "repeat_count": 0.0, + "routers_loss": 0.05108053982257843, + "skip_count": 2.0, + "step": 1118, + "text_loss": 0.13224145770072937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.258291752274729, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.123046875, + "learning_rate": 0.0009908512851798522, + "loss": 0.0455, + "macro_f1": 0.6603773832321167, + "num_tokens": 1808196.0, + "repeat_count": 1.0, + "routers_loss": 0.02131766639649868, + "skip_count": 1.0, + "step": 1120, + "text_loss": 0.7824069261550903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.0009907922527675576, + "loss": 0.0405, + "macro_f1": 0.3333333432674408, + "num_tokens": 1811622.0, + "repeat_count": 0.0, + "routers_loss": 0.006226244382560253, + "skip_count": 0.0, + "step": 1122, + "text_loss": 0.5419743061065674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.277076606985618, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12890625, + "learning_rate": 0.000990733032282746, + "loss": 0.0535, + "macro_f1": 0.5492662787437439, + "num_tokens": 1814628.0, + "repeat_count": 0.0, + "routers_loss": 0.03088250942528248, + "skip_count": 2.0, + "step": 1124, + "text_loss": 0.37100958824157715 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.286469034341063, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.000990673623748111, + "loss": 0.0348, + "macro_f1": 0.32098767161369324, + "num_tokens": 1817205.0, + "repeat_count": 0.0, + "routers_loss": 0.05495348572731018, + "skip_count": 1.0, + "step": 1126, + "text_loss": 0.20241330564022064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.295861461696507, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009906140271864173, + "loss": 0.0433, + "macro_f1": 0.4871794879436493, + "num_tokens": 1820141.0, + "repeat_count": 0.0, + "routers_loss": 0.037809282541275024, + "skip_count": 2.0, + "step": 1128, + "text_loss": 0.32965806126594543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.305253889051952, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009905542426205032, + "loss": 0.0348, + "macro_f1": 0.32098767161369324, + "num_tokens": 1824011.0, + "repeat_count": 0.0, + "routers_loss": 0.03320181369781494, + "skip_count": 1.0, + "step": 1130, + "text_loss": 0.36329755187034607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.314646316407397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009904942700732777, + "loss": 0.0335, + "macro_f1": 0.3333333432674408, + "num_tokens": 1826873.0, + "repeat_count": 0.0, + "routers_loss": 0.004102326463907957, + "skip_count": 0.0, + "step": 1132, + "text_loss": 0.6692602038383484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.324038743762841, + "f1_execute": 0.8799999952316284, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.0009904341095677226, + "loss": 0.03, + "macro_f1": 0.29333335161209106, + "num_tokens": 1830103.0, + "repeat_count": 2.0, + "routers_loss": 0.2376193106174469, + "skip_count": 4.0, + "step": 1134, + "text_loss": 0.19212862849235535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.0009903737611268919, + "loss": 0.0445, + "macro_f1": 0.3333333432674408, + "num_tokens": 1833201.0, + "repeat_count": 0.0, + "routers_loss": 0.005253395065665245, + "skip_count": 0.0, + "step": 1136, + "text_loss": 0.6773360371589661 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.34282359847373, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009903132247739107, + "loss": 0.0305, + "macro_f1": 0.3076923191547394, + "num_tokens": 1836045.0, + "repeat_count": 1.0, + "routers_loss": 0.14382585883140564, + "skip_count": 3.0, + "step": 1138, + "text_loss": 0.2882297933101654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.3522160258291755, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.150390625, + "learning_rate": 0.0009902525005319766, + "loss": 0.04, + "macro_f1": 0.5427350401878357, + "num_tokens": 1839721.0, + "repeat_count": 1.0, + "routers_loss": 0.04033960774540901, + "skip_count": 2.0, + "step": 1140, + "text_loss": 0.7172559499740601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12109375, + "learning_rate": 0.0009901915884243597, + "loss": 0.0351, + "macro_f1": 0.6666666865348816, + "num_tokens": 1842614.0, + "repeat_count": 1.0, + "routers_loss": 0.005162308923900127, + "skip_count": 0.0, + "step": 1142, + "text_loss": 0.42892804741859436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.371000880540064, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009901304884744014, + "loss": 0.0386, + "macro_f1": 0.3144654333591461, + "num_tokens": 1845444.0, + "repeat_count": 1.0, + "routers_loss": 0.10117656737565994, + "skip_count": 2.0, + "step": 1144, + "text_loss": 0.20806430280208588 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.380393307895509, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009900692007055152, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 1848558.0, + "repeat_count": 0.0, + "routers_loss": 0.014107038266956806, + "skip_count": 0.0, + "step": 1146, + "text_loss": 0.5355974435806274 + }, + { + "acc_repeat": 0.25, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 5.389785735250954, + "f1_execute": 0.9166666865348816, + "f1_repeat": 0.4000000059604645, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.16015625, + "learning_rate": 0.000990007725141187, + "loss": 0.0449, + "macro_f1": 0.6611111164093018, + "num_tokens": 1852723.0, + "repeat_count": 4.0, + "routers_loss": 0.15537866950035095, + "skip_count": 2.0, + "step": 1148, + "text_loss": 0.6388513445854187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1181640625, + "learning_rate": 0.0009899460618049741, + "loss": 0.0397, + "macro_f1": 0.3333333432674408, + "num_tokens": 1856181.0, + "repeat_count": 0.0, + "routers_loss": 0.011800912208855152, + "skip_count": 0.0, + "step": 1150, + "text_loss": 0.6113069653511047 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 5.408570589961843, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.000989884210720506, + "loss": 0.0331, + "macro_f1": 0.6666666865348816, + "num_tokens": 1859685.0, + "repeat_count": 2.0, + "routers_loss": 0.022900646552443504, + "skip_count": 0.0, + "step": 1152, + "text_loss": 0.25718021392822266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.4179630173172875, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009898221719114844, + "loss": 0.0354, + "macro_f1": 0.3272727429866791, + "num_tokens": 1862505.0, + "repeat_count": 0.0, + "routers_loss": 0.026814989745616913, + "skip_count": 1.0, + "step": 1154, + "text_loss": 0.5426549911499023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009897599454016823, + "loss": 0.0401, + "macro_f1": 0.3333333432674408, + "num_tokens": 1866266.0, + "repeat_count": 0.0, + "routers_loss": 0.0032623792067170143, + "skip_count": 0.0, + "step": 1156, + "text_loss": 0.37752896547317505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.436747872028177, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07080078125, + "learning_rate": 0.0009896975312149454, + "loss": 0.0369, + "macro_f1": 0.3333333432674408, + "num_tokens": 1870216.0, + "repeat_count": 0.0, + "routers_loss": 0.015617577359080315, + "skip_count": 0.0, + "step": 1158, + "text_loss": 0.18207129836082458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009896349293751906, + "loss": 0.0423, + "macro_f1": 0.3272727429866791, + "num_tokens": 1873338.0, + "repeat_count": 0.0, + "routers_loss": 0.02250153198838234, + "skip_count": 1.0, + "step": 1160, + "text_loss": 0.548884391784668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.455532726739067, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009895721399064072, + "loss": 0.0388, + "macro_f1": 0.32098764181137085, + "num_tokens": 1876470.0, + "repeat_count": 1.0, + "routers_loss": 0.055204521864652634, + "skip_count": 1.0, + "step": 1162, + "text_loss": 0.48052409291267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.464925154094511, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009895091628326564, + "loss": 0.0293, + "macro_f1": 0.3333333432674408, + "num_tokens": 1879354.0, + "repeat_count": 0.0, + "routers_loss": 0.009093789383769035, + "skip_count": 0.0, + "step": 1164, + "text_loss": 0.3908069431781769 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.474317581449956, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000989445998178071, + "loss": 0.0323, + "macro_f1": 0.3272727429866791, + "num_tokens": 1881941.0, + "repeat_count": 0.0, + "routers_loss": 0.015086972154676914, + "skip_count": 1.0, + "step": 1166, + "text_loss": 0.4884725511074066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.4837100088054, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009893826459668558, + "loss": 0.0386, + "macro_f1": 0.3144654333591461, + "num_tokens": 1885374.0, + "repeat_count": 0.0, + "routers_loss": 0.06587666273117065, + "skip_count": 3.0, + "step": 1168, + "text_loss": 0.12760137021541595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009893191062232873, + "loss": 0.0322, + "macro_f1": 0.3333333432674408, + "num_tokens": 1888612.0, + "repeat_count": 0.0, + "routers_loss": 0.006088624242693186, + "skip_count": 0.0, + "step": 1170, + "text_loss": 0.4821319580078125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009892553789717143, + "loss": 0.0389, + "macro_f1": 0.3333333432674408, + "num_tokens": 1891463.0, + "repeat_count": 0.0, + "routers_loss": 0.010113578289747238, + "skip_count": 0.0, + "step": 1172, + "text_loss": 0.3613642454147339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.5118872908717345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009891914642365573, + "loss": 0.0404, + "macro_f1": 0.3333333432674408, + "num_tokens": 1894230.0, + "repeat_count": 0.0, + "routers_loss": 0.004947459790855646, + "skip_count": 0.0, + "step": 1174, + "text_loss": 0.5037549138069153 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.521279718227179, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009891273620423083, + "loss": 0.0428, + "macro_f1": 0.3272727429866791, + "num_tokens": 1897294.0, + "repeat_count": 1.0, + "routers_loss": 0.026075217872858047, + "skip_count": 0.0, + "step": 1176, + "text_loss": 0.32558977603912354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.530672145582624, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009890630724135314, + "loss": 0.0351, + "macro_f1": 0.3272727429866791, + "num_tokens": 1901553.0, + "repeat_count": 0.0, + "routers_loss": 0.06650999188423157, + "skip_count": 1.0, + "step": 1178, + "text_loss": 0.23473620414733887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.540064572938069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009889985953748625, + "loss": 0.0268, + "macro_f1": 0.6666666865348816, + "num_tokens": 1904556.0, + "repeat_count": 0.0, + "routers_loss": 0.010361116379499435, + "skip_count": 1.0, + "step": 1180, + "text_loss": 0.6927042007446289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.0009889339309510094, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 1908053.0, + "repeat_count": 0.0, + "routers_loss": 0.013286533765494823, + "skip_count": 0.0, + "step": 1182, + "text_loss": 0.19977325201034546 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 5.558849427648958, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009888690791667518, + "loss": 0.0204, + "macro_f1": 0.7018141150474548, + "num_tokens": 1911754.0, + "repeat_count": 2.0, + "routers_loss": 0.11920545995235443, + "skip_count": 3.0, + "step": 1184, + "text_loss": 0.4072858691215515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.568241855004403, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009888040400469408, + "loss": 0.0391, + "macro_f1": 0.3272727429866791, + "num_tokens": 1914862.0, + "repeat_count": 0.0, + "routers_loss": 0.03652849420905113, + "skip_count": 1.0, + "step": 1186, + "text_loss": 0.2654043138027191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.577634282359847, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1689453125, + "learning_rate": 0.0009887388136164996, + "loss": 0.0336, + "macro_f1": 0.5492662787437439, + "num_tokens": 1918542.0, + "repeat_count": 0.0, + "routers_loss": 0.03991910070180893, + "skip_count": 2.0, + "step": 1188, + "text_loss": 0.21130657196044922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.587026709715292, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.000988673399900423, + "loss": 0.0429, + "macro_f1": 0.3272727429866791, + "num_tokens": 1921589.0, + "repeat_count": 0.0, + "routers_loss": 0.014900135807693005, + "skip_count": 0.0, + "step": 1190, + "text_loss": 0.5519335865974426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.596419137070737, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009886077989237777, + "loss": 0.0405, + "macro_f1": 0.3272727429866791, + "num_tokens": 1924320.0, + "repeat_count": 0.0, + "routers_loss": 0.06271552294492722, + "skip_count": 1.0, + "step": 1192, + "text_loss": 0.213813915848732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 5.6058115644261814, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.1875, + "learning_rate": 0.000988542010711702, + "loss": 0.0342, + "macro_f1": 0.6225374937057495, + "num_tokens": 1927178.0, + "repeat_count": 0.0, + "routers_loss": 0.03081391751766205, + "skip_count": 5.0, + "step": 1194, + "text_loss": 0.7524349093437195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009884760352894064, + "loss": 0.0518, + "macro_f1": 0.3333333432674408, + "num_tokens": 1930216.0, + "repeat_count": 0.0, + "routers_loss": 0.008556773886084557, + "skip_count": 0.0, + "step": 1196, + "text_loss": 0.28230375051498413 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.62459641913707, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009884098726821726, + "loss": 0.0472, + "macro_f1": 0.4871794879436493, + "num_tokens": 1933312.0, + "repeat_count": 3.0, + "routers_loss": 0.05344727262854576, + "skip_count": 0.0, + "step": 1198, + "text_loss": 0.5509607195854187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.633988846492516, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1298828125, + "learning_rate": 0.000988343522915354, + "loss": 0.0441, + "macro_f1": 0.480392187833786, + "num_tokens": 1936160.0, + "repeat_count": 1.0, + "routers_loss": 0.07324771583080292, + "skip_count": 3.0, + "step": 1200, + "text_loss": 0.30565372109413147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 25.0, + "epoch": 5.64338127384796, + "f1_execute": 0.8936169743537903, + "f1_repeat": 0.0, + "f1_skip": 0.444444477558136, + "grad_norm": 0.2470703125, + "learning_rate": 0.0009882769860143764, + "loss": 0.0317, + "macro_f1": 0.4460204839706421, + "num_tokens": 1939266.0, + "repeat_count": 0.0, + "routers_loss": 0.18620699644088745, + "skip_count": 6.0, + "step": 1202, + "text_loss": 0.976121723651886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.6527737012034045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000988210262004737, + "loss": 0.0474, + "macro_f1": 0.6666666865348816, + "num_tokens": 1942173.0, + "repeat_count": 0.0, + "routers_loss": 0.007703613489866257, + "skip_count": 1.0, + "step": 1204, + "text_loss": 0.5647401809692383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.66216612855885, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1484375, + "learning_rate": 0.0009881433509120036, + "loss": 0.0376, + "macro_f1": 0.5492662787437439, + "num_tokens": 1945071.0, + "repeat_count": 0.0, + "routers_loss": 0.02162683941423893, + "skip_count": 2.0, + "step": 1206, + "text_loss": 0.24229218065738678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.671558555914294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009880762527618176, + "loss": 0.0383, + "macro_f1": 0.3333333432674408, + "num_tokens": 1949060.0, + "repeat_count": 0.0, + "routers_loss": 0.017667081207036972, + "skip_count": 0.0, + "step": 1208, + "text_loss": 0.4035970866680145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009880089675798908, + "loss": 0.0367, + "macro_f1": 0.3333333432674408, + "num_tokens": 1951698.0, + "repeat_count": 0.0, + "routers_loss": 0.006405784282833338, + "skip_count": 0.0, + "step": 1210, + "text_loss": 0.5319879055023193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.690343410625183, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009879414953920071, + "loss": 0.0294, + "macro_f1": 0.3333333432674408, + "num_tokens": 1955266.0, + "repeat_count": 0.0, + "routers_loss": 0.009859707206487656, + "skip_count": 0.0, + "step": 1212, + "text_loss": 0.6687407493591309 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.699735837980628, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.130859375, + "learning_rate": 0.0009878738362240219, + "loss": 0.045, + "macro_f1": 0.5492662787437439, + "num_tokens": 1958538.0, + "repeat_count": 0.0, + "routers_loss": 0.030890554189682007, + "skip_count": 2.0, + "step": 1214, + "text_loss": 0.20820017158985138 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 5.709128265336073, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000987805990101862, + "loss": 0.0317, + "macro_f1": 0.47333335876464844, + "num_tokens": 1961419.0, + "repeat_count": 2.0, + "routers_loss": 0.10383198410272598, + "skip_count": 2.0, + "step": 1216, + "text_loss": 0.8664976358413696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.718520692691517, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009877379570515268, + "loss": 0.0366, + "macro_f1": 0.3333333432674408, + "num_tokens": 1964836.0, + "repeat_count": 0.0, + "routers_loss": 0.013376163318753242, + "skip_count": 0.0, + "step": 1218, + "text_loss": 0.4223395884037018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0859375, + "learning_rate": 0.0009876697370990865, + "loss": 0.0343, + "macro_f1": 0.3333333432674408, + "num_tokens": 1967620.0, + "repeat_count": 0.0, + "routers_loss": 0.008577900938689709, + "skip_count": 0.0, + "step": 1220, + "text_loss": 0.4789901375770569 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009876013302706828, + "loss": 0.049, + "macro_f1": 0.3333333432674408, + "num_tokens": 1971100.0, + "repeat_count": 0.0, + "routers_loss": 0.004730266984552145, + "skip_count": 0.0, + "step": 1222, + "text_loss": 0.6799837946891785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.7466979747578515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009875327365925295, + "loss": 0.0341, + "macro_f1": 0.3333333432674408, + "num_tokens": 1974408.0, + "repeat_count": 0.0, + "routers_loss": 0.010849526152014732, + "skip_count": 0.0, + "step": 1224, + "text_loss": 0.18967926502227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.756090402113296, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009874639560909118, + "loss": 0.0498, + "macro_f1": 0.32098767161369324, + "num_tokens": 1977046.0, + "repeat_count": 0.0, + "routers_loss": 0.04841252416372299, + "skip_count": 1.0, + "step": 1226, + "text_loss": 0.6133310198783875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.765482829468741, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.0009873949887921867, + "loss": 0.0402, + "macro_f1": 0.3272727429866791, + "num_tokens": 1980330.0, + "repeat_count": 0.0, + "routers_loss": 0.029638588428497314, + "skip_count": 1.0, + "step": 1228, + "text_loss": 0.15649555623531342 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.774875256824186, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009873258347227823, + "loss": 0.0331, + "macro_f1": 0.3272727429866791, + "num_tokens": 1983173.0, + "repeat_count": 0.0, + "routers_loss": 0.009955910965800285, + "skip_count": 0.0, + "step": 1230, + "text_loss": 0.4741005599498749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009872564939091989, + "loss": 0.0342, + "macro_f1": 0.3333333432674408, + "num_tokens": 1986825.0, + "repeat_count": 0.0, + "routers_loss": 0.010205300524830818, + "skip_count": 0.0, + "step": 1232, + "text_loss": 0.5315462350845337 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5714285969734192, + "avg_layers": 25.0, + "epoch": 5.7936601115350745, + "f1_execute": 0.9302325248718262, + "f1_repeat": 1.0, + "f1_skip": 0.7272727489471436, + "grad_norm": 0.11865234375, + "learning_rate": 0.0009871869663780077, + "loss": 0.0336, + "macro_f1": 0.8858351111412048, + "num_tokens": 1990448.0, + "repeat_count": 1.0, + "routers_loss": 0.09120134264230728, + "skip_count": 7.0, + "step": 1234, + "text_loss": 0.6187508702278137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.125, + "learning_rate": 0.0009871172521558522, + "loss": 0.0475, + "macro_f1": 0.6666666865348816, + "num_tokens": 1993474.0, + "repeat_count": 0.0, + "routers_loss": 0.016188839450478554, + "skip_count": 1.0, + "step": 1236, + "text_loss": 0.20783066749572754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 5.812444966245964, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.216796875, + "learning_rate": 0.0009870473512694465, + "loss": 0.0373, + "macro_f1": 0.5934640765190125, + "num_tokens": 1996536.0, + "repeat_count": 0.0, + "routers_loss": 0.05046704784035683, + "skip_count": 3.0, + "step": 1238, + "text_loss": 0.247748002409935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.821837393601409, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.09033203125, + "learning_rate": 0.0009869772637455772, + "loss": 0.0251, + "macro_f1": 0.4871794879436493, + "num_tokens": 1999530.0, + "repeat_count": 0.0, + "routers_loss": 0.044926248490810394, + "skip_count": 2.0, + "step": 1240, + "text_loss": 0.26001980900764465 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.831229820956853, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.000986906989611102, + "loss": 0.0446, + "macro_f1": 0.3272727429866791, + "num_tokens": 2002782.0, + "repeat_count": 0.0, + "routers_loss": 0.025911526754498482, + "skip_count": 0.0, + "step": 1242, + "text_loss": 0.9009982943534851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.8406222483122985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009868365288929492, + "loss": 0.0371, + "macro_f1": 0.3333333432674408, + "num_tokens": 2005331.0, + "repeat_count": 0.0, + "routers_loss": 0.0043760035187006, + "skip_count": 0.0, + "step": 1244, + "text_loss": 0.5547386407852173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009867658816181206, + "loss": 0.0374, + "macro_f1": 0.3333333432674408, + "num_tokens": 2008115.0, + "repeat_count": 0.0, + "routers_loss": 0.009227181784808636, + "skip_count": 0.0, + "step": 1246, + "text_loss": 1.0067731142044067 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.000986695047813688, + "loss": 0.0261, + "macro_f1": 0.3272727429866791, + "num_tokens": 2011137.0, + "repeat_count": 1.0, + "routers_loss": 0.023822437971830368, + "skip_count": 0.0, + "step": 1248, + "text_loss": 0.30058956146240234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.868799530378633, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009866240275067948, + "loss": 0.044, + "macro_f1": 0.47333335876464844, + "num_tokens": 2014159.0, + "repeat_count": 2.0, + "routers_loss": 0.21523773670196533, + "skip_count": 3.0, + "step": 1250, + "text_loss": 0.39072203636169434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.878191957734077, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009865528207246563, + "loss": 0.0351, + "macro_f1": 0.5492662787437439, + "num_tokens": 2017731.0, + "repeat_count": 0.0, + "routers_loss": 0.06184682995080948, + "skip_count": 2.0, + "step": 1252, + "text_loss": 0.35751575231552124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.8875843850895215, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.166015625, + "learning_rate": 0.000986481427494559, + "loss": 0.0336, + "macro_f1": 0.3333333432674408, + "num_tokens": 2020485.0, + "repeat_count": 0.0, + "routers_loss": 0.007573372684419155, + "skip_count": 0.0, + "step": 1254, + "text_loss": 0.4061077833175659 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.896976812444966, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.000986409847843861, + "loss": 0.0382, + "macro_f1": 0.3272727429866791, + "num_tokens": 2024149.0, + "repeat_count": 1.0, + "routers_loss": 0.07447971403598785, + "skip_count": 0.0, + "step": 1256, + "text_loss": 0.41876497864723206 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.906369239800411, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000986338081799992, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 2026545.0, + "repeat_count": 0.0, + "routers_loss": 0.006609147880226374, + "skip_count": 0.0, + "step": 1258, + "text_loss": 0.4673794209957123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.915761667155856, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009862661293904523, + "loss": 0.0498, + "macro_f1": 0.32098764181137085, + "num_tokens": 2029581.0, + "repeat_count": 0.0, + "routers_loss": 0.10624702274799347, + "skip_count": 2.0, + "step": 1260, + "text_loss": 0.3483233153820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009861939906428145, + "loss": 0.0525, + "macro_f1": 0.3333333432674408, + "num_tokens": 2033936.0, + "repeat_count": 0.0, + "routers_loss": 0.007944886572659016, + "skip_count": 0.0, + "step": 1262, + "text_loss": 0.16362667083740234 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009861216655847225, + "loss": 0.0376, + "macro_f1": 0.6666666865348816, + "num_tokens": 2037876.0, + "repeat_count": 1.0, + "routers_loss": 0.007004092447459698, + "skip_count": 0.0, + "step": 1264, + "text_loss": 0.43228110671043396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.94393894922219, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009860491542438912, + "loss": 0.047, + "macro_f1": 0.3272727429866791, + "num_tokens": 2040842.0, + "repeat_count": 0.0, + "routers_loss": 0.026916226372122765, + "skip_count": 1.0, + "step": 1266, + "text_loss": 0.5901188850402832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.953331376577634, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.000985976456648107, + "loss": 0.0353, + "macro_f1": 0.3333333432674408, + "num_tokens": 2043890.0, + "repeat_count": 0.0, + "routers_loss": 0.007325216196477413, + "skip_count": 0.0, + "step": 1268, + "text_loss": 0.8780109882354736 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.962723803933079, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.10205078125, + "learning_rate": 0.000985903572825228, + "loss": 0.0306, + "macro_f1": 0.4871794879436493, + "num_tokens": 2048848.0, + "repeat_count": 0.0, + "routers_loss": 0.05007527023553848, + "skip_count": 2.0, + "step": 1270, + "text_loss": 0.5863722562789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.972116231288524, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.000985830502803183, + "loss": 0.0396, + "macro_f1": 0.3272727429866791, + "num_tokens": 2051561.0, + "repeat_count": 0.0, + "routers_loss": 0.023995524272322655, + "skip_count": 0.0, + "step": 1272, + "text_loss": 0.7460709810256958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.9815086586439685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009857572466099732, + "loss": 0.0431, + "macro_f1": 0.3333333432674408, + "num_tokens": 2054752.0, + "repeat_count": 0.0, + "routers_loss": 0.006928362417966127, + "skip_count": 0.0, + "step": 1274, + "text_loss": 0.5130293369293213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.162109375, + "learning_rate": 0.0009856838042736698, + "loss": 0.0501, + "macro_f1": 0.3333333432674408, + "num_tokens": 2058151.0, + "repeat_count": 0.0, + "routers_loss": 0.006969396956264973, + "skip_count": 0.0, + "step": 1276, + "text_loss": 0.5911393761634827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009856101758224166, + "loss": 0.0441, + "macro_f1": 0.3333333432674408, + "num_tokens": 2061012.0, + "repeat_count": 0.0, + "routers_loss": 0.003499418031424284, + "skip_count": 0.0, + "step": 1278, + "text_loss": 0.25347545742988586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.000985536361284428, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2064597.0, + "repeat_count": 0.0, + "routers_loss": 0.007856054231524467, + "skip_count": 0.0, + "step": 1280, + "text_loss": 0.7476963400840759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.01878485471089, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009854623606879898, + "loss": 0.0245, + "macro_f1": 0.3272727429866791, + "num_tokens": 2067972.0, + "repeat_count": 0.0, + "routers_loss": 0.02617792971432209, + "skip_count": 1.0, + "step": 1282, + "text_loss": 0.5775872468948364 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.028177282066334, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.000985388174061459, + "loss": 0.0356, + "macro_f1": 0.32098767161369324, + "num_tokens": 2071812.0, + "repeat_count": 0.0, + "routers_loss": 0.035979997366666794, + "skip_count": 1.0, + "step": 1284, + "text_loss": 0.2933400869369507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.037569709421779, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0009853138014332646, + "loss": 0.0273, + "macro_f1": 0.3333333432674408, + "num_tokens": 2074868.0, + "repeat_count": 0.0, + "routers_loss": 0.005142854526638985, + "skip_count": 0.0, + "step": 1286, + "text_loss": 0.29085102677345276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.0009852392428319058, + "loss": 0.0306, + "macro_f1": 0.3333333432674408, + "num_tokens": 2078225.0, + "repeat_count": 0.0, + "routers_loss": 0.0032799106556922197, + "skip_count": 0.0, + "step": 1288, + "text_loss": 0.7293626070022583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 6.056354564132668, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009851644982859537, + "loss": 0.0273, + "macro_f1": 0.480392187833786, + "num_tokens": 2081495.0, + "repeat_count": 1.0, + "routers_loss": 0.12224318832159042, + "skip_count": 3.0, + "step": 1290, + "text_loss": 0.26125892996788025 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.065746991488113, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009850895678240508, + "loss": 0.0283, + "macro_f1": 0.6666666865348816, + "num_tokens": 2084390.0, + "repeat_count": 1.0, + "routers_loss": 0.010662888176739216, + "skip_count": 0.0, + "step": 1292, + "text_loss": 0.3510764539241791 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.075139418843557, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1689453125, + "learning_rate": 0.0009850144514749104, + "loss": 0.0332, + "macro_f1": 0.5492662787437439, + "num_tokens": 2087210.0, + "repeat_count": 0.0, + "routers_loss": 0.01979079470038414, + "skip_count": 2.0, + "step": 1294, + "text_loss": 0.40202176570892334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.084531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.000984939149267317, + "loss": 0.0253, + "macro_f1": 0.6666666865348816, + "num_tokens": 2090777.0, + "repeat_count": 0.0, + "routers_loss": 0.005172552540898323, + "skip_count": 1.0, + "step": 1296, + "text_loss": 0.5275651216506958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.093924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009848636612301272, + "loss": 0.0299, + "macro_f1": 0.3333333432674408, + "num_tokens": 2094248.0, + "repeat_count": 0.0, + "routers_loss": 0.0029599082190543413, + "skip_count": 0.0, + "step": 1298, + "text_loss": 0.4517653286457062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0009847879873922675, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 2097139.0, + "repeat_count": 0.0, + "routers_loss": 0.011455860920250416, + "skip_count": 0.0, + "step": 1300, + "text_loss": 0.16888445615768433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.112709128265336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.0009847121277827366, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 2100415.0, + "repeat_count": 0.0, + "routers_loss": 0.008091195486485958, + "skip_count": 0.0, + "step": 1302, + "text_loss": 0.40061676502227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.122101555620781, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.000984636082430604, + "loss": 0.0285, + "macro_f1": 0.3333333432674408, + "num_tokens": 2103285.0, + "repeat_count": 0.0, + "routers_loss": 0.009593960829079151, + "skip_count": 0.0, + "step": 1304, + "text_loss": 0.7211073637008667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.107421875, + "learning_rate": 0.0009845598513650103, + "loss": 0.0231, + "macro_f1": 0.3333333432674408, + "num_tokens": 2106255.0, + "repeat_count": 0.0, + "routers_loss": 0.0023068038281053305, + "skip_count": 0.0, + "step": 1306, + "text_loss": 0.7077119946479797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009844834346151674, + "loss": 0.043, + "macro_f1": 0.3333333432674408, + "num_tokens": 2109305.0, + "repeat_count": 0.0, + "routers_loss": 0.007703019306063652, + "skip_count": 0.0, + "step": 1308, + "text_loss": 0.3534316122531891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.1502788376871145, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009844068322103585, + "loss": 0.0287, + "macro_f1": 0.3272727429866791, + "num_tokens": 2112216.0, + "repeat_count": 0.0, + "routers_loss": 0.023549847304821014, + "skip_count": 1.0, + "step": 1310, + "text_loss": 0.6792599558830261 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009843300441799378, + "loss": 0.0211, + "macro_f1": 0.3333333432674408, + "num_tokens": 2114925.0, + "repeat_count": 0.0, + "routers_loss": 0.007605871185660362, + "skip_count": 0.0, + "step": 1312, + "text_loss": 0.1571389138698578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.169063692398004, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009842530705533304, + "loss": 0.0253, + "macro_f1": 0.3272727429866791, + "num_tokens": 2117744.0, + "repeat_count": 0.0, + "routers_loss": 0.014964760281145573, + "skip_count": 0.0, + "step": 1314, + "text_loss": 0.7840361595153809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.000984175911360033, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2120848.0, + "repeat_count": 0.0, + "routers_loss": 0.004663798492401838, + "skip_count": 0.0, + "step": 1316, + "text_loss": 0.536246120929718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.187848547108893, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1201171875, + "learning_rate": 0.000984098566629613, + "loss": 0.0288, + "macro_f1": 0.5492662787437439, + "num_tokens": 2123651.0, + "repeat_count": 0.0, + "routers_loss": 0.022852955386042595, + "skip_count": 2.0, + "step": 1318, + "text_loss": 0.43372172117233276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.197240974464338, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009840210363917087, + "loss": 0.0216, + "macro_f1": 0.3333333432674408, + "num_tokens": 2128011.0, + "repeat_count": 0.0, + "routers_loss": 0.012578422203660011, + "skip_count": 0.0, + "step": 1320, + "text_loss": 0.28190380334854126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009839433206760306, + "loss": 0.0204, + "macro_f1": 0.3333333432674408, + "num_tokens": 2131035.0, + "repeat_count": 0.0, + "routers_loss": 0.006863643880933523, + "skip_count": 0.0, + "step": 1322, + "text_loss": 0.6340444087982178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.216025829175227, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.0009838654195123589, + "loss": 0.0243, + "macro_f1": 0.3333333432674408, + "num_tokens": 2133856.0, + "repeat_count": 0.0, + "routers_loss": 0.00468854233622551, + "skip_count": 0.0, + "step": 1324, + "text_loss": 0.5138425827026367 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009837873329305458, + "loss": 0.0396, + "macro_f1": 0.6666666865348816, + "num_tokens": 2136451.0, + "repeat_count": 1.0, + "routers_loss": 0.005731126759201288, + "skip_count": 0.0, + "step": 1326, + "text_loss": 0.742124617099762 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000983709060960514, + "loss": 0.0416, + "macro_f1": 0.3333333432674408, + "num_tokens": 2139496.0, + "repeat_count": 0.0, + "routers_loss": 0.0056343949399888515, + "skip_count": 0.0, + "step": 1328, + "text_loss": 0.7317464351654053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.2442031112415615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009836306036322576, + "loss": 0.0312, + "macro_f1": 0.3333333432674408, + "num_tokens": 2143120.0, + "repeat_count": 0.0, + "routers_loss": 0.005127966403961182, + "skip_count": 0.0, + "step": 1330, + "text_loss": 0.538652241230011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 6.253595538597006, + "f1_execute": 0.9130434989929199, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009835519609758415, + "loss": 0.0301, + "macro_f1": 0.590062141418457, + "num_tokens": 2145807.0, + "repeat_count": 3.0, + "routers_loss": 0.1673707216978073, + "skip_count": 4.0, + "step": 1332, + "text_loss": 0.3498198091983795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009834731330214017, + "loss": 0.0293, + "macro_f1": 0.3272727429866791, + "num_tokens": 2148397.0, + "repeat_count": 1.0, + "routers_loss": 0.04026653990149498, + "skip_count": 0.0, + "step": 1334, + "text_loss": 0.8153424859046936 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 27.0, + "epoch": 6.272380393307896, + "f1_execute": 0.8999999761581421, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.8000000715255737, + "grad_norm": 0.16015625, + "learning_rate": 0.0009833941197991455, + "loss": 0.0329, + "macro_f1": 0.7888889312744141, + "num_tokens": 2152226.0, + "repeat_count": 2.0, + "routers_loss": 0.05481519177556038, + "skip_count": 5.0, + "step": 1336, + "text_loss": 0.7802760004997253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.28177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009833149213393506, + "loss": 0.0304, + "macro_f1": 0.3272727429866791, + "num_tokens": 2156023.0, + "repeat_count": 0.0, + "routers_loss": 0.01760484278202057, + "skip_count": 0.0, + "step": 1338, + "text_loss": 0.19721226394176483 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.2911652480187845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.000983235537672366, + "loss": 0.0256, + "macro_f1": 0.3333333432674408, + "num_tokens": 2160037.0, + "repeat_count": 0.0, + "routers_loss": 0.013206037692725658, + "skip_count": 0.0, + "step": 1340, + "text_loss": 0.5003817081451416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.000983155968828612, + "loss": 0.0315, + "macro_f1": 0.6666666865348816, + "num_tokens": 2163910.0, + "repeat_count": 1.0, + "routers_loss": 0.01256406120955944, + "skip_count": 0.0, + "step": 1342, + "text_loss": 0.5996923446655273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.309950102729674, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009830762148385793, + "loss": 0.0313, + "macro_f1": 0.3272727429866791, + "num_tokens": 2166921.0, + "repeat_count": 0.0, + "routers_loss": 0.015086234547197819, + "skip_count": 1.0, + "step": 1344, + "text_loss": 0.45356282591819763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.319342530085119, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0009829962757328297, + "loss": 0.0223, + "macro_f1": 0.32098764181137085, + "num_tokens": 2170135.0, + "repeat_count": 0.0, + "routers_loss": 0.07909081131219864, + "skip_count": 2.0, + "step": 1346, + "text_loss": 0.2874644994735718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 6.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009829161515419959, + "loss": 0.0246, + "macro_f1": 0.6666666865348816, + "num_tokens": 2173029.0, + "repeat_count": 0.0, + "routers_loss": 0.013569854199886322, + "skip_count": 2.0, + "step": 1348, + "text_loss": 0.25533875823020935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.3381273847960085, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009828358422967823, + "loss": 0.0226, + "macro_f1": 0.32098764181137085, + "num_tokens": 2176605.0, + "repeat_count": 1.0, + "routers_loss": 0.08111091703176498, + "skip_count": 1.0, + "step": 1350, + "text_loss": 0.32827726006507874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 6.347519812151453, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.091796875, + "learning_rate": 0.0009827553480279627, + "loss": 0.03, + "macro_f1": 0.5427350401878357, + "num_tokens": 2179406.0, + "repeat_count": 0.0, + "routers_loss": 0.026550088077783585, + "skip_count": 2.0, + "step": 1352, + "text_loss": 0.2966301143169403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009826746687663832, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 2182353.0, + "repeat_count": 0.0, + "routers_loss": 0.003914554137736559, + "skip_count": 0.0, + "step": 1354, + "text_loss": 0.7596251964569092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 6.366304666862343, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0859375, + "learning_rate": 0.0009825938045429602, + "loss": 0.0324, + "macro_f1": 0.5866667032241821, + "num_tokens": 2185786.0, + "repeat_count": 1.0, + "routers_loss": 0.059612665325403214, + "skip_count": 3.0, + "step": 1356, + "text_loss": 0.12325898557901382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.375697094217787, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10009765625, + "learning_rate": 0.0009825127553886807, + "loss": 0.0375, + "macro_f1": 0.3333333432674408, + "num_tokens": 2190157.0, + "repeat_count": 0.0, + "routers_loss": 0.0071132429875433445, + "skip_count": 0.0, + "step": 1358, + "text_loss": 0.9287898540496826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009824315213346033, + "loss": 0.0348, + "macro_f1": 0.3333333432674408, + "num_tokens": 2193077.0, + "repeat_count": 0.0, + "routers_loss": 0.009611099027097225, + "skip_count": 0.0, + "step": 1360, + "text_loss": 0.20427259802818298 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.394481948928676, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009823501024118569, + "loss": 0.0285, + "macro_f1": 0.3333333432674408, + "num_tokens": 2196494.0, + "repeat_count": 0.0, + "routers_loss": 0.006913455203175545, + "skip_count": 0.0, + "step": 1362, + "text_loss": 0.574759840965271 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.403874376284121, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009822684986516411, + "loss": 0.0245, + "macro_f1": 0.3333333432674408, + "num_tokens": 2199839.0, + "repeat_count": 0.0, + "routers_loss": 0.009208920411765575, + "skip_count": 0.0, + "step": 1364, + "text_loss": 0.42422571778297424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.413266803639566, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.000982186710085227, + "loss": 0.0208, + "macro_f1": 0.32098764181137085, + "num_tokens": 2203212.0, + "repeat_count": 1.0, + "routers_loss": 0.059975091367959976, + "skip_count": 1.0, + "step": 1366, + "text_loss": 0.29213017225265503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 6.42265923099501, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.181640625, + "learning_rate": 0.0009821047367439561, + "loss": 0.0358, + "macro_f1": 0.44705885648727417, + "num_tokens": 2206240.0, + "repeat_count": 0.0, + "routers_loss": 0.048244867473840714, + "skip_count": 4.0, + "step": 1368, + "text_loss": 0.3072395324707031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.432051658350455, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009820225786592405, + "loss": 0.0375, + "macro_f1": 0.3272727429866791, + "num_tokens": 2209903.0, + "repeat_count": 1.0, + "routers_loss": 0.026068156585097313, + "skip_count": 0.0, + "step": 1370, + "text_loss": 0.5961400270462036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.4414440857059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.109375, + "learning_rate": 0.0009819402358625634, + "loss": 0.0366, + "macro_f1": 0.3272727429866791, + "num_tokens": 2213439.0, + "repeat_count": 0.0, + "routers_loss": 0.022615568712353706, + "skip_count": 1.0, + "step": 1372, + "text_loss": 0.19375644624233246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.450836513061344, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.000981857708385479, + "loss": 0.0346, + "macro_f1": 0.3333333432674408, + "num_tokens": 2216457.0, + "repeat_count": 0.0, + "routers_loss": 0.005855285096913576, + "skip_count": 0.0, + "step": 1374, + "text_loss": 0.5123368501663208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.460228940416789, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009817749962596114, + "loss": 0.0249, + "macro_f1": 0.3272727429866791, + "num_tokens": 2219975.0, + "repeat_count": 1.0, + "routers_loss": 0.0651634931564331, + "skip_count": 0.0, + "step": 1376, + "text_loss": 0.5999220609664917 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009816920995166568, + "loss": 0.0371, + "macro_f1": 0.6666666865348816, + "num_tokens": 2222833.0, + "repeat_count": 1.0, + "routers_loss": 0.011408994905650616, + "skip_count": 0.0, + "step": 1378, + "text_loss": 0.5323230624198914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.4790137951276785, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.0009816090181883807, + "loss": 0.0313, + "macro_f1": 0.32098764181137085, + "num_tokens": 2225842.0, + "repeat_count": 0.0, + "routers_loss": 0.039720915257930756, + "skip_count": 2.0, + "step": 1380, + "text_loss": 0.23363439738750458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009815257523066204, + "loss": 0.0249, + "macro_f1": 0.3333333432674408, + "num_tokens": 2229430.0, + "repeat_count": 0.0, + "routers_loss": 0.002765297656878829, + "skip_count": 0.0, + "step": 1382, + "text_loss": 0.718977689743042 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.497798649838567, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009814423019032835, + "loss": 0.0396, + "macro_f1": 0.5492662787437439, + "num_tokens": 2232594.0, + "repeat_count": 2.0, + "routers_loss": 0.05362323671579361, + "skip_count": 0.0, + "step": 1384, + "text_loss": 0.6392166614532471 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.507191077194013, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009813586670103483, + "loss": 0.0426, + "macro_f1": 0.6603773832321167, + "num_tokens": 2236327.0, + "repeat_count": 1.0, + "routers_loss": 0.031728316098451614, + "skip_count": 1.0, + "step": 1386, + "text_loss": 0.5951619148254395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.126953125, + "learning_rate": 0.0009812748476598638, + "loss": 0.031, + "macro_f1": 0.5492662787437439, + "num_tokens": 2239746.0, + "repeat_count": 0.0, + "routers_loss": 0.03981253132224083, + "skip_count": 2.0, + "step": 1388, + "text_loss": 0.22756551206111908 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.5259759319049016, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009811908438839498, + "loss": 0.0331, + "macro_f1": 0.5492662787437439, + "num_tokens": 2242786.0, + "repeat_count": 0.0, + "routers_loss": 0.04617162421345711, + "skip_count": 2.0, + "step": 1390, + "text_loss": 0.3233799934387207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.535368359260346, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.000981106655714797, + "loss": 0.0358, + "macro_f1": 0.3272727429866791, + "num_tokens": 2245696.0, + "repeat_count": 0.0, + "routers_loss": 0.046828847378492355, + "skip_count": 1.0, + "step": 1392, + "text_loss": 0.24273279309272766 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.544760786615791, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009810222831846656, + "loss": 0.0307, + "macro_f1": 0.5492662787437439, + "num_tokens": 2249326.0, + "repeat_count": 0.0, + "routers_loss": 0.010921589098870754, + "skip_count": 2.0, + "step": 1394, + "text_loss": 0.3921460807323456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.554153213971236, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009809377263258882, + "loss": 0.0315, + "macro_f1": 0.32098767161369324, + "num_tokens": 2253393.0, + "repeat_count": 0.0, + "routers_loss": 0.04564022272825241, + "skip_count": 1.0, + "step": 1396, + "text_loss": 0.582602858543396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.000980852985170867, + "loss": 0.0328, + "macro_f1": 0.3272727429866791, + "num_tokens": 2256626.0, + "repeat_count": 0.0, + "routers_loss": 0.013289985246956348, + "skip_count": 0.0, + "step": 1398, + "text_loss": 0.41031694412231445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.5729380686821255, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009807680597520745, + "loss": 0.0264, + "macro_f1": 0.3333333432674408, + "num_tokens": 2259326.0, + "repeat_count": 0.0, + "routers_loss": 0.0065213534981012344, + "skip_count": 0.0, + "step": 1400, + "text_loss": 0.2888098657131195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.58233049603757, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0009806829501020546, + "loss": 0.0358, + "macro_f1": 0.3272727429866791, + "num_tokens": 2262344.0, + "repeat_count": 0.0, + "routers_loss": 0.04199840500950813, + "skip_count": 1.0, + "step": 1402, + "text_loss": 0.31973034143447876 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.591722923393014, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009805976562534215, + "loss": 0.0317, + "macro_f1": 0.6603773832321167, + "num_tokens": 2266354.0, + "repeat_count": 1.0, + "routers_loss": 0.015434930101037025, + "skip_count": 1.0, + "step": 1404, + "text_loss": 0.508630633354187 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 6.601115350748459, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009805121782388599, + "loss": 0.0339, + "macro_f1": 0.6533333659172058, + "num_tokens": 2269660.0, + "repeat_count": 2.0, + "routers_loss": 0.0720924660563469, + "skip_count": 2.0, + "step": 1406, + "text_loss": 0.40927737951278687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.610507778103904, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009804265160911253, + "loss": 0.0266, + "macro_f1": 0.5492662787437439, + "num_tokens": 2273335.0, + "repeat_count": 0.0, + "routers_loss": 0.02400495670735836, + "skip_count": 2.0, + "step": 1408, + "text_loss": 0.1777762621641159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.6199002054593485, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2314453125, + "learning_rate": 0.0009803406698430433, + "loss": 0.0371, + "macro_f1": 0.3272727429866791, + "num_tokens": 2277107.0, + "repeat_count": 0.0, + "routers_loss": 0.02560107782483101, + "skip_count": 1.0, + "step": 1410, + "text_loss": 0.17955881357192993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.629292632814793, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009802546395275104, + "loss": 0.0349, + "macro_f1": 0.3333333432674408, + "num_tokens": 2281638.0, + "repeat_count": 0.0, + "routers_loss": 0.006655813194811344, + "skip_count": 0.0, + "step": 1412, + "text_loss": 0.20882295072078705 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 6.638685060170237, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.08740234375, + "learning_rate": 0.000980168425177494, + "loss": 0.0342, + "macro_f1": 0.8200000524520874, + "num_tokens": 2284876.0, + "repeat_count": 1.0, + "routers_loss": 0.06325097382068634, + "skip_count": 3.0, + "step": 1414, + "text_loss": 0.26035264134407043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.000980082026826031, + "loss": 0.0315, + "macro_f1": 0.3272727429866791, + "num_tokens": 2288938.0, + "repeat_count": 1.0, + "routers_loss": 0.013436575420200825, + "skip_count": 0.0, + "step": 1416, + "text_loss": 0.5502325892448425 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.657469914881127, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0009799954445062296, + "loss": 0.0193, + "macro_f1": 0.6603773832321167, + "num_tokens": 2292317.0, + "repeat_count": 1.0, + "routers_loss": 0.011264479719102383, + "skip_count": 1.0, + "step": 1418, + "text_loss": 0.48075684905052185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.666862342236572, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009799086782512686, + "loss": 0.0292, + "macro_f1": 0.5492662787437439, + "num_tokens": 2295935.0, + "repeat_count": 0.0, + "routers_loss": 0.02833271212875843, + "skip_count": 2.0, + "step": 1420, + "text_loss": 0.18221206963062286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09375, + "learning_rate": 0.0009798217280943967, + "loss": 0.0356, + "macro_f1": 0.6666666865348816, + "num_tokens": 2298927.0, + "repeat_count": 0.0, + "routers_loss": 0.009208574891090393, + "skip_count": 1.0, + "step": 1422, + "text_loss": 0.48686322569847107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.685647196947461, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009797345940689335, + "loss": 0.0267, + "macro_f1": 0.3272727429866791, + "num_tokens": 2301541.0, + "repeat_count": 0.0, + "routers_loss": 0.015011847950518131, + "skip_count": 0.0, + "step": 1424, + "text_loss": 0.49446266889572144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.4000000059604645, + "avg_layers": 26.0, + "epoch": 6.695039624302906, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.0, + "f1_skip": 0.5714285969734192, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009796472762082687, + "loss": 0.0338, + "macro_f1": 0.5034013986587524, + "num_tokens": 2304589.0, + "repeat_count": 0.0, + "routers_loss": 0.05912091210484505, + "skip_count": 5.0, + "step": 1426, + "text_loss": 0.23945684731006622 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.70443205165835, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.000979559774545863, + "loss": 0.0405, + "macro_f1": 0.3272727429866791, + "num_tokens": 2307860.0, + "repeat_count": 0.0, + "routers_loss": 0.021242303773760796, + "skip_count": 1.0, + "step": 1428, + "text_loss": 0.531273365020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.000979472089115247, + "loss": 0.0276, + "macro_f1": 0.32098764181137085, + "num_tokens": 2311581.0, + "repeat_count": 0.0, + "routers_loss": 0.02768544852733612, + "skip_count": 2.0, + "step": 1430, + "text_loss": 0.2497459501028061 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.000979384219950022, + "loss": 0.0346, + "macro_f1": 0.3333333432674408, + "num_tokens": 2314639.0, + "repeat_count": 0.0, + "routers_loss": 0.008678150363266468, + "skip_count": 0.0, + "step": 1432, + "text_loss": 0.6579355001449585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.732609333724684, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08056640625, + "learning_rate": 0.0009792961670838595, + "loss": 0.0362, + "macro_f1": 0.3272727429866791, + "num_tokens": 2317927.0, + "repeat_count": 1.0, + "routers_loss": 0.03325597569346428, + "skip_count": 0.0, + "step": 1434, + "text_loss": 0.5209436416625977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.742001761080129, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009792079305505016, + "loss": 0.0306, + "macro_f1": 0.3272727429866791, + "num_tokens": 2321065.0, + "repeat_count": 1.0, + "routers_loss": 0.019228918477892876, + "skip_count": 0.0, + "step": 1436, + "text_loss": 0.41087067127227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.000979119510383761, + "loss": 0.0371, + "macro_f1": 0.3333333432674408, + "num_tokens": 2323714.0, + "repeat_count": 0.0, + "routers_loss": 0.017071325331926346, + "skip_count": 0.0, + "step": 1438, + "text_loss": 0.21490029990673065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.760786615791019, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.00097903090661752, + "loss": 0.0309, + "macro_f1": 0.3333333432674408, + "num_tokens": 2326454.0, + "repeat_count": 0.0, + "routers_loss": 0.00991755723953247, + "skip_count": 0.0, + "step": 1440, + "text_loss": 0.23847346007823944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.232421875, + "learning_rate": 0.000978942119285732, + "loss": 0.0404, + "macro_f1": 0.3272727429866791, + "num_tokens": 2329462.0, + "repeat_count": 0.0, + "routers_loss": 0.04908733069896698, + "skip_count": 1.0, + "step": 1442, + "text_loss": 0.23343028128147125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.7795714705019074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009788531484224204, + "loss": 0.0264, + "macro_f1": 0.3333333432674408, + "num_tokens": 2332146.0, + "repeat_count": 0.0, + "routers_loss": 0.0032628148328512907, + "skip_count": 0.0, + "step": 1444, + "text_loss": 0.47423800826072693 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 6.788963897857353, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009787639940616788, + "loss": 0.0405, + "macro_f1": 0.7018141150474548, + "num_tokens": 2335738.0, + "repeat_count": 1.0, + "routers_loss": 0.14336998760700226, + "skip_count": 3.0, + "step": 1446, + "text_loss": 0.21837592124938965 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.189453125, + "learning_rate": 0.0009786746562376717, + "loss": 0.0241, + "macro_f1": 0.6666666865348816, + "num_tokens": 2338488.0, + "repeat_count": 0.0, + "routers_loss": 0.010542908683419228, + "skip_count": 1.0, + "step": 1448, + "text_loss": 1.0614757537841797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.807748752568242, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009785851349846334, + "loss": 0.0268, + "macro_f1": 0.3333333432674408, + "num_tokens": 2342074.0, + "repeat_count": 0.0, + "routers_loss": 0.005998016335070133, + "skip_count": 0.0, + "step": 1450, + "text_loss": 0.4269719421863556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 6.817141179923686, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009784954303368686, + "loss": 0.0384, + "macro_f1": 0.44705885648727417, + "num_tokens": 2345838.0, + "repeat_count": 0.0, + "routers_loss": 0.0959126204252243, + "skip_count": 3.0, + "step": 1452, + "text_loss": 0.3315916955471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009784055423287521, + "loss": 0.0218, + "macro_f1": 0.3333333432674408, + "num_tokens": 2348939.0, + "repeat_count": 0.0, + "routers_loss": 0.0025467623490840197, + "skip_count": 0.0, + "step": 1454, + "text_loss": 0.6162732839584351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.835926034634576, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009783154709947293, + "loss": 0.0256, + "macro_f1": 0.3272727429866791, + "num_tokens": 2352232.0, + "repeat_count": 0.0, + "routers_loss": 0.01860538125038147, + "skip_count": 1.0, + "step": 1456, + "text_loss": 0.23928768932819366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.84531846199002, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009782252163693158, + "loss": 0.0201, + "macro_f1": 0.3272727429866791, + "num_tokens": 2355159.0, + "repeat_count": 0.0, + "routers_loss": 0.04412713274359703, + "skip_count": 1.0, + "step": 1458, + "text_loss": 0.3371323347091675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.0009781347784870973, + "loss": 0.0379, + "macro_f1": 0.3333333432674408, + "num_tokens": 2358175.0, + "repeat_count": 0.0, + "routers_loss": 0.006809141952544451, + "skip_count": 0.0, + "step": 1460, + "text_loss": 0.547267735004425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.86410331670091, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009780441573827296, + "loss": 0.03, + "macro_f1": 0.3076923191547394, + "num_tokens": 2360991.0, + "repeat_count": 0.0, + "routers_loss": 0.08924390375614166, + "skip_count": 4.0, + "step": 1462, + "text_loss": 0.7026563882827759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.000977953353090939, + "loss": 0.0272, + "macro_f1": 0.3333333432674408, + "num_tokens": 2363894.0, + "repeat_count": 0.0, + "routers_loss": 0.021858472377061844, + "skip_count": 0.0, + "step": 1464, + "text_loss": 0.2718065083026886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.882888171411799, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009778623656465219, + "loss": 0.0338, + "macro_f1": 0.32098764181137085, + "num_tokens": 2367265.0, + "repeat_count": 0.0, + "routers_loss": 0.044781096279621124, + "skip_count": 0.0, + "step": 1466, + "text_loss": 0.5008095502853394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.892280598767244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009777711950843448, + "loss": 0.0212, + "macro_f1": 0.3333333432674408, + "num_tokens": 2370186.0, + "repeat_count": 0.0, + "routers_loss": 0.0040459707379341125, + "skip_count": 0.0, + "step": 1468, + "text_loss": 0.5242461562156677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 6.901673026122689, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009776798414393446, + "loss": 0.0279, + "macro_f1": 0.6598639488220215, + "num_tokens": 2373314.0, + "repeat_count": 1.0, + "routers_loss": 0.0708528608083725, + "skip_count": 3.0, + "step": 1470, + "text_loss": 0.2821732461452484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.911065453478133, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1328125, + "learning_rate": 0.0009775883047465279, + "loss": 0.0414, + "macro_f1": 0.31446540355682373, + "num_tokens": 2376435.0, + "repeat_count": 1.0, + "routers_loss": 0.0290578193962574, + "skip_count": 1.0, + "step": 1472, + "text_loss": 0.8438440561294556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.9204578808335775, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10546875, + "learning_rate": 0.000977496585040972, + "loss": 0.0373, + "macro_f1": 0.3333333432674408, + "num_tokens": 2380244.0, + "repeat_count": 0.0, + "routers_loss": 0.010360375046730042, + "skip_count": 0.0, + "step": 1474, + "text_loss": 0.4356135427951813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.929850308189023, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.000977404682357824, + "loss": 0.0294, + "macro_f1": 0.3272727429866791, + "num_tokens": 2383498.0, + "repeat_count": 0.0, + "routers_loss": 0.023518972098827362, + "skip_count": 0.0, + "step": 1476, + "text_loss": 0.25195425748825073 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 6.939242735544467, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.000977312596732301, + "loss": 0.0375, + "macro_f1": 0.9544159770011902, + "num_tokens": 2386414.0, + "repeat_count": 5.0, + "routers_loss": 0.08190606534481049, + "skip_count": 4.0, + "step": 1478, + "text_loss": 0.6586798429489136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10546875, + "learning_rate": 0.0009772203281996905, + "loss": 0.0336, + "macro_f1": 1.0, + "num_tokens": 2389399.0, + "repeat_count": 1.0, + "routers_loss": 0.016441475600004196, + "skip_count": 2.0, + "step": 1480, + "text_loss": 0.3671986758708954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009771278767953502, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 2392400.0, + "repeat_count": 0.0, + "routers_loss": 0.019211363047361374, + "skip_count": 0.0, + "step": 1482, + "text_loss": 0.27418580651283264 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.967420017610801, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009770352425547072, + "loss": 0.0292, + "macro_f1": 0.3333333432674408, + "num_tokens": 2395123.0, + "repeat_count": 0.0, + "routers_loss": 0.015800386667251587, + "skip_count": 0.0, + "step": 1484, + "text_loss": 0.19896622002124786 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.976812444966246, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009769424255132596, + "loss": 0.0256, + "macro_f1": 0.4871794879436493, + "num_tokens": 2397359.0, + "repeat_count": 3.0, + "routers_loss": 0.06670158356428146, + "skip_count": 0.0, + "step": 1486, + "text_loss": 0.4229799509048462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.98620487232169, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1162109375, + "learning_rate": 0.0009768494257065747, + "loss": 0.0218, + "macro_f1": 0.3272727429866791, + "num_tokens": 2400387.0, + "repeat_count": 0.0, + "routers_loss": 0.011144762858748436, + "skip_count": 1.0, + "step": 1488, + "text_loss": 0.4264226257801056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.995597299677136, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0009767562431702904, + "loss": 0.0387, + "macro_f1": 0.3006536364555359, + "num_tokens": 2403241.0, + "repeat_count": 2.0, + "routers_loss": 0.12339717149734497, + "skip_count": 3.0, + "step": 1490, + "text_loss": 0.2850193977355957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.004696213677723, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0009766628779401142, + "loss": 0.0215, + "macro_f1": 0.6666666865348816, + "num_tokens": 2406087.0, + "repeat_count": 0.0, + "routers_loss": 0.008174685761332512, + "skip_count": 1.0, + "step": 1492, + "text_loss": 0.6756544709205627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.000976569330051824, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 2409312.0, + "repeat_count": 0.0, + "routers_loss": 0.0021256296895444393, + "skip_count": 0.0, + "step": 1494, + "text_loss": 0.4789894223213196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.0234810683886115, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009764755995412677, + "loss": 0.0193, + "macro_f1": 0.3333333432674408, + "num_tokens": 2412758.0, + "repeat_count": 0.0, + "routers_loss": 0.003944927826523781, + "skip_count": 0.0, + "step": 1496, + "text_loss": 0.5157490968704224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.032873495744056, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009763816864443627, + "loss": 0.0239, + "macro_f1": 0.3272727429866791, + "num_tokens": 2416079.0, + "repeat_count": 1.0, + "routers_loss": 0.03893325850367546, + "skip_count": 0.0, + "step": 1498, + "text_loss": 0.28045418858528137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009762875907970968, + "loss": 0.0199, + "macro_f1": 0.3333333432674408, + "num_tokens": 2420340.0, + "repeat_count": 0.0, + "routers_loss": 0.0017725443467497826, + "skip_count": 0.0, + "step": 1500, + "text_loss": 0.35550856590270996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.051658350454946, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009761933126355277, + "loss": 0.0245, + "macro_f1": 0.3272727429866791, + "num_tokens": 2424735.0, + "repeat_count": 0.0, + "routers_loss": 0.01393749937415123, + "skip_count": 1.0, + "step": 1502, + "text_loss": 0.38840189576148987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009760988519957828, + "loss": 0.0249, + "macro_f1": 0.6666666865348816, + "num_tokens": 2428132.0, + "repeat_count": 0.0, + "routers_loss": 0.01687910407781601, + "skip_count": 2.0, + "step": 1504, + "text_loss": 0.3031681478023529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.0704432051658355, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009760042089140598, + "loss": 0.0193, + "macro_f1": 0.3144654333591461, + "num_tokens": 2431592.0, + "repeat_count": 1.0, + "routers_loss": 0.04704280197620392, + "skip_count": 2.0, + "step": 1506, + "text_loss": 0.16355200111865997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009759093834266259, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 2434236.0, + "repeat_count": 0.0, + "routers_loss": 0.0016075772000476718, + "skip_count": 0.0, + "step": 1508, + "text_loss": 0.6080073118209839 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009758143755698186, + "loss": 0.015, + "macro_f1": 0.3333333432674408, + "num_tokens": 2437170.0, + "repeat_count": 0.0, + "routers_loss": 0.008451299741864204, + "skip_count": 0.0, + "step": 1510, + "text_loss": 0.22100484371185303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 7.098620487232169, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009757191853800449, + "loss": 0.0227, + "macro_f1": 0.5866667032241821, + "num_tokens": 2441187.0, + "repeat_count": 1.0, + "routers_loss": 0.046565692871809006, + "skip_count": 3.0, + "step": 1512, + "text_loss": 0.25098952651023865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.108012914587614, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.000975623812893782, + "loss": 0.0276, + "macro_f1": 0.3272727429866791, + "num_tokens": 2444664.0, + "repeat_count": 0.0, + "routers_loss": 0.02872578240931034, + "skip_count": 1.0, + "step": 1514, + "text_loss": 0.4952253997325897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.1174053419430585, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.0009755282581475768, + "loss": 0.0233, + "macro_f1": 0.3333333432674408, + "num_tokens": 2447748.0, + "repeat_count": 0.0, + "routers_loss": 0.002055214950814843, + "skip_count": 0.0, + "step": 1516, + "text_loss": 0.7465500831604004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.126797769298503, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10302734375, + "learning_rate": 0.000975432521178046, + "loss": 0.0216, + "macro_f1": 0.3272727429866791, + "num_tokens": 2450834.0, + "repeat_count": 1.0, + "routers_loss": 0.04498551785945892, + "skip_count": 0.0, + "step": 1518, + "text_loss": 0.28144413232803345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009753366020218763, + "loss": 0.0234, + "macro_f1": 0.3333333432674408, + "num_tokens": 2454233.0, + "repeat_count": 0.0, + "routers_loss": 0.003669742727652192, + "skip_count": 0.0, + "step": 1520, + "text_loss": 0.5667551755905151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009752405007158238, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2457331.0, + "repeat_count": 0.0, + "routers_loss": 0.010455607436597347, + "skip_count": 0.0, + "step": 1522, + "text_loss": 0.19575810432434082 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 7.154975051364837, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009751442172967151, + "loss": 0.0193, + "macro_f1": 0.8823530077934265, + "num_tokens": 2459935.0, + "repeat_count": 2.0, + "routers_loss": 0.025189083069562912, + "skip_count": 1.0, + "step": 1524, + "text_loss": 0.45453405380249023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.164367478720282, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.000975047751801446, + "loss": 0.0187, + "macro_f1": 0.3272727429866791, + "num_tokens": 2463008.0, + "repeat_count": 0.0, + "routers_loss": 0.012297490611672401, + "skip_count": 0.0, + "step": 1526, + "text_loss": 0.31437572836875916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009749511042669823, + "loss": 0.0233, + "macro_f1": 0.3333333432674408, + "num_tokens": 2466475.0, + "repeat_count": 0.0, + "routers_loss": 0.011026266030967236, + "skip_count": 0.0, + "step": 1528, + "text_loss": 0.46604859828948975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.183152333431171, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009748542747303595, + "loss": 0.0182, + "macro_f1": 0.3272727429866791, + "num_tokens": 2469320.0, + "repeat_count": 0.0, + "routers_loss": 0.011934996582567692, + "skip_count": 1.0, + "step": 1530, + "text_loss": 0.7764923572540283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009747572632286827, + "loss": 0.0203, + "macro_f1": 0.3333333432674408, + "num_tokens": 2472468.0, + "repeat_count": 0.0, + "routers_loss": 0.005786920432001352, + "skip_count": 0.0, + "step": 1532, + "text_loss": 0.3555782437324524 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009746600697991271, + "loss": 0.02, + "macro_f1": 0.6666666865348816, + "num_tokens": 2475736.0, + "repeat_count": 1.0, + "routers_loss": 0.0026990731712430716, + "skip_count": 0.0, + "step": 1534, + "text_loss": 0.49561792612075806 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 7.2113296154975055, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0556640625, + "learning_rate": 0.0009745626944789375, + "loss": 0.0204, + "macro_f1": 0.8823530077934265, + "num_tokens": 2478887.0, + "repeat_count": 1.0, + "routers_loss": 0.020221207290887833, + "skip_count": 2.0, + "step": 1536, + "text_loss": 0.5375416278839111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.22072204285295, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009744651373054279, + "loss": 0.0286, + "macro_f1": 0.3272727429866791, + "num_tokens": 2481293.0, + "repeat_count": 0.0, + "routers_loss": 0.03131086751818657, + "skip_count": 1.0, + "step": 1538, + "text_loss": 0.5241039395332336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 7.230114470208394, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.08984375, + "learning_rate": 0.0009743673983159828, + "loss": 0.0241, + "macro_f1": 0.6122449040412903, + "num_tokens": 2484403.0, + "repeat_count": 0.0, + "routers_loss": 0.04448170214891434, + "skip_count": 4.0, + "step": 1540, + "text_loss": 0.7465724349021912 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009742694775480557, + "loss": 0.0265, + "macro_f1": 0.6666666865348816, + "num_tokens": 2487952.0, + "repeat_count": 0.0, + "routers_loss": 0.007171491626650095, + "skip_count": 1.0, + "step": 1542, + "text_loss": 0.2877117097377777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009741713750391703, + "loss": 0.0171, + "macro_f1": 0.6666666865348816, + "num_tokens": 2490815.0, + "repeat_count": 1.0, + "routers_loss": 0.004559285007417202, + "skip_count": 0.0, + "step": 1544, + "text_loss": 0.6097800135612488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.258291752274729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009740730908269193, + "loss": 0.0174, + "macro_f1": 0.3333333432674408, + "num_tokens": 2494727.0, + "repeat_count": 0.0, + "routers_loss": 0.005271553061902523, + "skip_count": 0.0, + "step": 1546, + "text_loss": 0.5431114435195923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009739746249489658, + "loss": 0.0239, + "macro_f1": 0.3333333432674408, + "num_tokens": 2499266.0, + "repeat_count": 0.0, + "routers_loss": 0.0015409323386847973, + "skip_count": 0.0, + "step": 1548, + "text_loss": 0.4702678322792053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.277076606985618, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1171875, + "learning_rate": 0.0009738759774430417, + "loss": 0.0216, + "macro_f1": 0.32098764181137085, + "num_tokens": 2502273.0, + "repeat_count": 1.0, + "routers_loss": 0.030183158814907074, + "skip_count": 1.0, + "step": 1550, + "text_loss": 0.3239189088344574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.286469034341063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009737771483469493, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 2507624.0, + "repeat_count": 0.0, + "routers_loss": 0.005410848651081324, + "skip_count": 0.0, + "step": 1552, + "text_loss": 0.4014642834663391 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009736781376985598, + "loss": 0.0168, + "macro_f1": 0.6666666865348816, + "num_tokens": 2510366.0, + "repeat_count": 0.0, + "routers_loss": 0.0066976165398955345, + "skip_count": 1.0, + "step": 1554, + "text_loss": 0.5924848914146423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.13671875, + "learning_rate": 0.0009735789455358144, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 2513317.0, + "repeat_count": 0.0, + "routers_loss": 0.002763477386906743, + "skip_count": 0.0, + "step": 1556, + "text_loss": 0.3222943842411041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.314646316407397, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009734795718967237, + "loss": 0.0283, + "macro_f1": 0.32098764181137085, + "num_tokens": 2516628.0, + "repeat_count": 0.0, + "routers_loss": 0.061566028743982315, + "skip_count": 2.0, + "step": 1558, + "text_loss": 0.3249334692955017 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009733800168193679, + "loss": 0.0228, + "macro_f1": 1.0, + "num_tokens": 2519424.0, + "repeat_count": 2.0, + "routers_loss": 0.017976421862840652, + "skip_count": 4.0, + "step": 1560, + "text_loss": 0.3341919481754303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.0009732802803418966, + "loss": 0.023, + "macro_f1": 0.3333333432674408, + "num_tokens": 2522922.0, + "repeat_count": 0.0, + "routers_loss": 0.002525332849472761, + "skip_count": 0.0, + "step": 1562, + "text_loss": 0.3176332712173462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.34282359847373, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07861328125, + "learning_rate": 0.0009731803625025292, + "loss": 0.0196, + "macro_f1": 0.3272727429866791, + "num_tokens": 2525811.0, + "repeat_count": 0.0, + "routers_loss": 0.015524424612522125, + "skip_count": 1.0, + "step": 1564, + "text_loss": 0.532774031162262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.3522160258291755, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009730802633395541, + "loss": 0.0257, + "macro_f1": 0.6603773832321167, + "num_tokens": 2529157.0, + "repeat_count": 1.0, + "routers_loss": 0.08138631284236908, + "skip_count": 1.0, + "step": 1566, + "text_loss": 0.529487133026123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009729799828913298, + "loss": 0.0223, + "macro_f1": 0.3333333432674408, + "num_tokens": 2532249.0, + "repeat_count": 0.0, + "routers_loss": 0.0035867292899638414, + "skip_count": 0.0, + "step": 1568, + "text_loss": 0.503160297870636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.371000880540064, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009728795211962838, + "loss": 0.0259, + "macro_f1": 0.5492662787437439, + "num_tokens": 2535904.0, + "repeat_count": 0.0, + "routers_loss": 0.02987455204129219, + "skip_count": 2.0, + "step": 1570, + "text_loss": 0.9170270562171936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.380393307895509, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11865234375, + "learning_rate": 0.0009727788782929131, + "loss": 0.0273, + "macro_f1": 0.3272727429866791, + "num_tokens": 2538943.0, + "repeat_count": 1.0, + "routers_loss": 0.04676021635532379, + "skip_count": 0.0, + "step": 1572, + "text_loss": 0.29146310687065125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.389785735250954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009726780542197844, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 2541805.0, + "repeat_count": 0.0, + "routers_loss": 0.002127803163602948, + "skip_count": 0.0, + "step": 1574, + "text_loss": 1.0126502513885498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009725770490155338, + "loss": 0.0262, + "macro_f1": 0.3333333432674408, + "num_tokens": 2546213.0, + "repeat_count": 0.0, + "routers_loss": 0.007609677035361528, + "skip_count": 0.0, + "step": 1576, + "text_loss": 0.190168559551239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.408570589961843, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009724758627188665, + "loss": 0.0356, + "macro_f1": 0.3272727429866791, + "num_tokens": 2549554.0, + "repeat_count": 0.0, + "routers_loss": 0.033554721623659134, + "skip_count": 1.0, + "step": 1578, + "text_loss": 0.2977406084537506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.4179630173172875, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009723744953685572, + "loss": 0.028, + "macro_f1": 0.3272727429866791, + "num_tokens": 2552785.0, + "repeat_count": 1.0, + "routers_loss": 0.027864238247275352, + "skip_count": 0.0, + "step": 1580, + "text_loss": 0.2700682580471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19921875, + "learning_rate": 0.0009722729470034503, + "loss": 0.0224, + "macro_f1": 0.3333333432674408, + "num_tokens": 2556550.0, + "repeat_count": 0.0, + "routers_loss": 0.004798175301402807, + "skip_count": 0.0, + "step": 1582, + "text_loss": 0.6559903025627136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.436747872028177, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.078125, + "learning_rate": 0.0009721712176624591, + "loss": 0.0242, + "macro_f1": 0.3333333432674408, + "num_tokens": 2559862.0, + "repeat_count": 0.0, + "routers_loss": 0.013764148578047752, + "skip_count": 0.0, + "step": 1584, + "text_loss": 0.2257535308599472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.446140299383622, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009720693073845667, + "loss": 0.032, + "macro_f1": 0.5492662787437439, + "num_tokens": 2562766.0, + "repeat_count": 0.0, + "routers_loss": 0.01937069371342659, + "skip_count": 2.0, + "step": 1586, + "text_loss": 0.178413525223732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.455532726739067, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009719672162088252, + "loss": 0.0306, + "macro_f1": 0.32098767161369324, + "num_tokens": 2566583.0, + "repeat_count": 1.0, + "routers_loss": 0.06224144622683525, + "skip_count": 0.0, + "step": 1588, + "text_loss": 0.3992367684841156 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 7.464925154094511, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.185546875, + "learning_rate": 0.0009718649441743559, + "loss": 0.0239, + "macro_f1": 0.9449735879898071, + "num_tokens": 2569516.0, + "repeat_count": 2.0, + "routers_loss": 0.06937911361455917, + "skip_count": 4.0, + "step": 1590, + "text_loss": 0.1945122629404068 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.00097176249132035, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2572418.0, + "repeat_count": 0.0, + "routers_loss": 0.0034326619934290648, + "skip_count": 0.0, + "step": 1592, + "text_loss": 0.6259906888008118 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009716598576860676, + "loss": 0.0278, + "macro_f1": 0.6666666865348816, + "num_tokens": 2575235.0, + "repeat_count": 1.0, + "routers_loss": 0.004557516425848007, + "skip_count": 0.0, + "step": 1594, + "text_loss": 0.6638736724853516 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 7.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009715570433108378, + "loss": 0.0198, + "macro_f1": 1.0, + "num_tokens": 2578157.0, + "repeat_count": 1.0, + "routers_loss": 0.015363055281341076, + "skip_count": 1.0, + "step": 1596, + "text_loss": 0.6530464887619019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009714540482340595, + "loss": 0.0268, + "macro_f1": 0.6666666865348816, + "num_tokens": 2581801.0, + "repeat_count": 1.0, + "routers_loss": 0.01257144846022129, + "skip_count": 0.0, + "step": 1598, + "text_loss": 0.5916110277175903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.5118872908717345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009713508724952006, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 2585204.0, + "repeat_count": 0.0, + "routers_loss": 0.003175645601004362, + "skip_count": 0.0, + "step": 1600, + "text_loss": 0.27901601791381836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0009712475161337981, + "loss": 0.0261, + "macro_f1": 0.3333333432674408, + "num_tokens": 2588286.0, + "repeat_count": 0.0, + "routers_loss": 0.004122321493923664, + "skip_count": 0.0, + "step": 1602, + "text_loss": 0.42420244216918945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009711439791894585, + "loss": 0.0341, + "macro_f1": 0.6666666865348816, + "num_tokens": 2591476.0, + "repeat_count": 0.0, + "routers_loss": 0.011215819045901299, + "skip_count": 1.0, + "step": 1604, + "text_loss": 0.5549933910369873 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.540064572938069, + "f1_execute": 0.9599999785423279, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.0703125, + "learning_rate": 0.0009710402617018574, + "loss": 0.0172, + "macro_f1": 0.8200000524520874, + "num_tokens": 2594336.0, + "repeat_count": 1.0, + "routers_loss": 0.02916567400097847, + "skip_count": 2.0, + "step": 1606, + "text_loss": 0.3263779282569885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009709363637107393, + "loss": 0.0209, + "macro_f1": 0.6666666865348816, + "num_tokens": 2597462.0, + "repeat_count": 0.0, + "routers_loss": 0.015897957608103752, + "skip_count": 1.0, + "step": 1608, + "text_loss": 0.20917139947414398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009708322852559184, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2601543.0, + "repeat_count": 0.0, + "routers_loss": 0.002211357234045863, + "skip_count": 0.0, + "step": 1610, + "text_loss": 0.450550377368927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.568241855004403, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009707280263772776, + "loss": 0.0277, + "macro_f1": 0.6666666865348816, + "num_tokens": 2604462.0, + "repeat_count": 0.0, + "routers_loss": 0.01615734025835991, + "skip_count": 2.0, + "step": 1612, + "text_loss": 0.6908381581306458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.577634282359847, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009706235871147688, + "loss": 0.0241, + "macro_f1": 0.5492662787437439, + "num_tokens": 2607484.0, + "repeat_count": 0.0, + "routers_loss": 0.022048067301511765, + "skip_count": 2.0, + "step": 1614, + "text_loss": 0.36691340804100037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.587026709715292, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10546875, + "learning_rate": 0.0009705189675084138, + "loss": 0.0176, + "macro_f1": 0.6666666865348816, + "num_tokens": 2610204.0, + "repeat_count": 0.0, + "routers_loss": 0.008503952994942665, + "skip_count": 1.0, + "step": 1616, + "text_loss": 0.5226598381996155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.596419137070737, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009704141675983029, + "loss": 0.0248, + "macro_f1": 0.3333333432674408, + "num_tokens": 2613128.0, + "repeat_count": 0.0, + "routers_loss": 0.0019020626787096262, + "skip_count": 0.0, + "step": 1618, + "text_loss": 0.6465088725090027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5714285969734192, + "avg_layers": 24.0, + "epoch": 7.6058115644261814, + "f1_execute": 0.9333333373069763, + "f1_repeat": 0.0, + "f1_skip": 0.7272727489471436, + "grad_norm": 0.107421875, + "learning_rate": 0.0009703091874245956, + "loss": 0.032, + "macro_f1": 0.5535354018211365, + "num_tokens": 2616360.0, + "repeat_count": 0.0, + "routers_loss": 0.11837691068649292, + "skip_count": 7.0, + "step": 1620, + "text_loss": 0.2987039089202881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009702040270275204, + "loss": 0.0181, + "macro_f1": 0.3333333432674408, + "num_tokens": 2619606.0, + "repeat_count": 0.0, + "routers_loss": 0.0065958453342318535, + "skip_count": 0.0, + "step": 1622, + "text_loss": 0.6262096166610718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.103515625, + "learning_rate": 0.000970098686447375, + "loss": 0.0257, + "macro_f1": 0.6666666865348816, + "num_tokens": 2622499.0, + "repeat_count": 0.0, + "routers_loss": 0.013632026500999928, + "skip_count": 1.0, + "step": 1624, + "text_loss": 0.2392602562904358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.633988846492516, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.125, + "learning_rate": 0.0009699931657245264, + "loss": 0.0245, + "macro_f1": 0.5492662787437439, + "num_tokens": 2626002.0, + "repeat_count": 0.0, + "routers_loss": 0.012147823348641396, + "skip_count": 2.0, + "step": 1626, + "text_loss": 0.4742976129055023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009698874648994098, + "loss": 0.0285, + "macro_f1": 1.0, + "num_tokens": 2629847.0, + "repeat_count": 1.0, + "routers_loss": 0.010692884214222431, + "skip_count": 3.0, + "step": 1628, + "text_loss": 0.5090685486793518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.6527737012034045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009697815840125304, + "loss": 0.0265, + "macro_f1": 0.3333333432674408, + "num_tokens": 2633529.0, + "repeat_count": 0.0, + "routers_loss": 0.011442207731306553, + "skip_count": 0.0, + "step": 1630, + "text_loss": 0.1874329298734665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2119140625, + "learning_rate": 0.0009696755231044618, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 2636321.0, + "repeat_count": 0.0, + "routers_loss": 0.0026681360322982073, + "skip_count": 0.0, + "step": 1632, + "text_loss": 0.7650400400161743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.671558555914294, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10498046875, + "learning_rate": 0.0009695692822158466, + "loss": 0.0242, + "macro_f1": 0.3272727429866791, + "num_tokens": 2638840.0, + "repeat_count": 1.0, + "routers_loss": 0.033965807408094406, + "skip_count": 0.0, + "step": 1634, + "text_loss": 0.6175784468650818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009694628613873968, + "loss": 0.018, + "macro_f1": 0.3333333432674408, + "num_tokens": 2641886.0, + "repeat_count": 0.0, + "routers_loss": 0.007568214554339647, + "skip_count": 0.0, + "step": 1636, + "text_loss": 0.43139931559562683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.690343410625183, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009693562606598929, + "loss": 0.025, + "macro_f1": 0.3333333432674408, + "num_tokens": 2645028.0, + "repeat_count": 0.0, + "routers_loss": 0.004973865579813719, + "skip_count": 0.0, + "step": 1638, + "text_loss": 0.6430339217185974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.699735837980628, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009692494800741844, + "loss": 0.0313, + "macro_f1": 0.3272727429866791, + "num_tokens": 2648209.0, + "repeat_count": 1.0, + "routers_loss": 0.049863800406455994, + "skip_count": 0.0, + "step": 1640, + "text_loss": 0.28138160705566406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.709128265336073, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.0009691425196711901, + "loss": 0.0398, + "macro_f1": 0.3272727429866791, + "num_tokens": 2651171.0, + "repeat_count": 0.0, + "routers_loss": 0.02112230286002159, + "skip_count": 0.0, + "step": 1642, + "text_loss": 0.3745322525501251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.718520692691517, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009690353794918971, + "loss": 0.0275, + "macro_f1": 0.3333333432674408, + "num_tokens": 2654093.0, + "repeat_count": 0.0, + "routers_loss": 0.0024304776452481747, + "skip_count": 0.0, + "step": 1644, + "text_loss": 0.4275154173374176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.000968928059577362, + "loss": 0.0244, + "macro_f1": 0.6666666865348816, + "num_tokens": 2657079.0, + "repeat_count": 0.0, + "routers_loss": 0.009320619516074657, + "skip_count": 1.0, + "step": 1646, + "text_loss": 0.46650025248527527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.737305547402407, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009688205599687099, + "loss": 0.0209, + "macro_f1": 0.3272727429866791, + "num_tokens": 2660951.0, + "repeat_count": 0.0, + "routers_loss": 0.011913162656128407, + "skip_count": 0.0, + "step": 1648, + "text_loss": 0.46644100546836853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.7466979747578515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009687128807071347, + "loss": 0.0284, + "macro_f1": 0.3333333432674408, + "num_tokens": 2663823.0, + "repeat_count": 0.0, + "routers_loss": 0.013754756189882755, + "skip_count": 0.0, + "step": 1650, + "text_loss": 0.40808847546577454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.0009686050218338996, + "loss": 0.0286, + "macro_f1": 0.3333333432674408, + "num_tokens": 2667079.0, + "repeat_count": 0.0, + "routers_loss": 0.009099726565182209, + "skip_count": 0.0, + "step": 1652, + "text_loss": 0.2389989197254181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009684969833903359, + "loss": 0.0283, + "macro_f1": 0.6666666865348816, + "num_tokens": 2670162.0, + "repeat_count": 0.0, + "routers_loss": 0.0034928603563457727, + "skip_count": 1.0, + "step": 1654, + "text_loss": 0.6930749416351318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.774875256824186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009683887654178445, + "loss": 0.0261, + "macro_f1": 0.6666666865348816, + "num_tokens": 2673031.0, + "repeat_count": 0.0, + "routers_loss": 0.008340462110936642, + "skip_count": 1.0, + "step": 1656, + "text_loss": 0.277752548456192 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009682803679578947, + "loss": 0.0259, + "macro_f1": 0.3333333432674408, + "num_tokens": 2676092.0, + "repeat_count": 0.0, + "routers_loss": 0.004337446764111519, + "skip_count": 0.0, + "step": 1658, + "text_loss": 0.5176776051521301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.7936601115350745, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009681717910520244, + "loss": 0.0242, + "macro_f1": 0.32098764181137085, + "num_tokens": 2679479.0, + "repeat_count": 0.0, + "routers_loss": 0.034611742943525314, + "skip_count": 2.0, + "step": 1660, + "text_loss": 0.21485982835292816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.80305253889052, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009680630347418406, + "loss": 0.022, + "macro_f1": 0.5492662787437439, + "num_tokens": 2683289.0, + "repeat_count": 0.0, + "routers_loss": 0.03297121450304985, + "skip_count": 2.0, + "step": 1662, + "text_loss": 0.33801013231277466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.812444966245964, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.000967954099069019, + "loss": 0.0411, + "macro_f1": 0.32098764181137085, + "num_tokens": 2685879.0, + "repeat_count": 1.0, + "routers_loss": 0.04551183059811592, + "skip_count": 1.0, + "step": 1664, + "text_loss": 0.41123488545417786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.821837393601409, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009678449840753038, + "loss": 0.0324, + "macro_f1": 0.32098764181137085, + "num_tokens": 2688910.0, + "repeat_count": 0.0, + "routers_loss": 0.05866450071334839, + "skip_count": 2.0, + "step": 1666, + "text_loss": 0.1740892380475998 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009677356898025082, + "loss": 0.023, + "macro_f1": 0.3333333432674408, + "num_tokens": 2691680.0, + "repeat_count": 0.0, + "routers_loss": 0.009243223816156387, + "skip_count": 0.0, + "step": 1668, + "text_loss": 0.2512350380420685 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.8406222483122985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.000967626216292514, + "loss": 0.0195, + "macro_f1": 0.3333333432674408, + "num_tokens": 2694895.0, + "repeat_count": 0.0, + "routers_loss": 0.005576452240347862, + "skip_count": 0.0, + "step": 1670, + "text_loss": 0.43294376134872437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 7.850014675667743, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.09130859375, + "learning_rate": 0.0009675165635872715, + "loss": 0.0306, + "macro_f1": 0.44705885648727417, + "num_tokens": 2697806.0, + "repeat_count": 0.0, + "routers_loss": 0.05372785031795502, + "skip_count": 3.0, + "step": 1672, + "text_loss": 0.1614082306623459 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009674067317288, + "loss": 0.0296, + "macro_f1": 0.6666666865348816, + "num_tokens": 2700529.0, + "repeat_count": 1.0, + "routers_loss": 0.018131591379642487, + "skip_count": 0.0, + "step": 1674, + "text_loss": 0.2093173861503601 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009672967207591869, + "loss": 0.0257, + "macro_f1": 0.3272727429866791, + "num_tokens": 2703650.0, + "repeat_count": 0.0, + "routers_loss": 0.0673515796661377, + "skip_count": 1.0, + "step": 1676, + "text_loss": 0.3029400110244751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.878191957734077, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009671865307205892, + "loss": 0.021, + "macro_f1": 0.32098767161369324, + "num_tokens": 2707615.0, + "repeat_count": 0.0, + "routers_loss": 0.03821169584989548, + "skip_count": 1.0, + "step": 1678, + "text_loss": 0.2262786477804184 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 7.8875843850895215, + "f1_execute": 0.9756097793579102, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009670761616552315, + "loss": 0.0465, + "macro_f1": 0.9615669250488281, + "num_tokens": 2710894.0, + "repeat_count": 2.0, + "routers_loss": 0.042625464498996735, + "skip_count": 6.0, + "step": 1680, + "text_loss": 0.29623574018478394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.896976812444966, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009669656136054074, + "loss": 0.0289, + "macro_f1": 0.3333333432674408, + "num_tokens": 2714330.0, + "repeat_count": 0.0, + "routers_loss": 0.0037571541033685207, + "skip_count": 0.0, + "step": 1682, + "text_loss": 0.7510389089584351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.906369239800411, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0009668548866134795, + "loss": 0.0256, + "macro_f1": 0.3333333432674408, + "num_tokens": 2717176.0, + "repeat_count": 0.0, + "routers_loss": 0.004142968449741602, + "skip_count": 0.0, + "step": 1684, + "text_loss": 0.3273485600948334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009667439807218783, + "loss": 0.0233, + "macro_f1": 0.6666666865348816, + "num_tokens": 2720628.0, + "repeat_count": 0.0, + "routers_loss": 0.008753842674195766, + "skip_count": 2.0, + "step": 1686, + "text_loss": 0.4314708709716797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.9251540945113, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009666328959731033, + "loss": 0.0211, + "macro_f1": 0.6603773832321167, + "num_tokens": 2723739.0, + "repeat_count": 1.0, + "routers_loss": 0.022674910724163055, + "skip_count": 1.0, + "step": 1688, + "text_loss": 0.25734150409698486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 7.934546521866745, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009665216324097222, + "loss": 0.0324, + "macro_f1": 0.5934640765190125, + "num_tokens": 2726644.0, + "repeat_count": 0.0, + "routers_loss": 0.03932750225067139, + "skip_count": 3.0, + "step": 1690, + "text_loss": 0.24511034786701202 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.94393894922219, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.0009664101900743714, + "loss": 0.0255, + "macro_f1": 0.3272727429866791, + "num_tokens": 2729662.0, + "repeat_count": 0.0, + "routers_loss": 0.012672754004597664, + "skip_count": 1.0, + "step": 1692, + "text_loss": 0.39431414008140564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.953331376577634, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.076171875, + "learning_rate": 0.000966298569009756, + "loss": 0.0231, + "macro_f1": 0.5492662787437439, + "num_tokens": 2732578.0, + "repeat_count": 0.0, + "routers_loss": 0.01548632513731718, + "skip_count": 2.0, + "step": 1694, + "text_loss": 0.12439999729394913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.962723803933079, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009661867692586494, + "loss": 0.0153, + "macro_f1": 0.32098764181137085, + "num_tokens": 2735887.0, + "repeat_count": 0.0, + "routers_loss": 0.05622401833534241, + "skip_count": 2.0, + "step": 1696, + "text_loss": 0.29024389386177063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.972116231288524, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.087890625, + "learning_rate": 0.0009660747908638933, + "loss": 0.0205, + "macro_f1": 0.3272727429866791, + "num_tokens": 2739293.0, + "repeat_count": 0.0, + "routers_loss": 0.041060201823711395, + "skip_count": 1.0, + "step": 1698, + "text_loss": 0.39461007714271545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.9815086586439685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1767578125, + "learning_rate": 0.0009659626338683981, + "loss": 0.0369, + "macro_f1": 0.3333333432674408, + "num_tokens": 2742468.0, + "repeat_count": 0.0, + "routers_loss": 0.007251353468745947, + "skip_count": 0.0, + "step": 1700, + "text_loss": 0.2751767635345459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.990901085999413, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009658502983151427, + "loss": 0.0186, + "macro_f1": 0.3272727429866791, + "num_tokens": 2745123.0, + "repeat_count": 0.0, + "routers_loss": 0.012847424484789371, + "skip_count": 1.0, + "step": 1702, + "text_loss": 0.4756404757499695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009657377842471742, + "loss": 0.0313, + "macro_f1": 0.6666666865348816, + "num_tokens": 2748016.0, + "repeat_count": 0.0, + "routers_loss": 0.007060411386191845, + "skip_count": 1.0, + "step": 1704, + "text_loss": 0.9571210145950317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.009392427355445, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10009765625, + "learning_rate": 0.0009656250917076081, + "loss": 0.0188, + "macro_f1": 0.5492662787437439, + "num_tokens": 2750717.0, + "repeat_count": 0.0, + "routers_loss": 0.016748681664466858, + "skip_count": 2.0, + "step": 1706, + "text_loss": 0.14542843401432037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0009655122207396285, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 2753635.0, + "repeat_count": 0.0, + "routers_loss": 0.013607042841613293, + "skip_count": 0.0, + "step": 1708, + "text_loss": 0.21836471557617188 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009653991713864878, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2756643.0, + "repeat_count": 0.0, + "routers_loss": 0.0012097888393327594, + "skip_count": 0.0, + "step": 1710, + "text_loss": 0.635187029838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1171875, + "learning_rate": 0.0009652859436915066, + "loss": 0.0231, + "macro_f1": 0.3333333432674408, + "num_tokens": 2759432.0, + "repeat_count": 0.0, + "routers_loss": 0.006196760106831789, + "skip_count": 0.0, + "step": 1712, + "text_loss": 0.5629420876502991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009651725376980743, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 2762538.0, + "repeat_count": 0.0, + "routers_loss": 0.0042513771913945675, + "skip_count": 0.0, + "step": 1714, + "text_loss": 0.39522525668144226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 8.056354564132668, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009650589534496479, + "loss": 0.0194, + "macro_f1": 0.8194444179534912, + "num_tokens": 2765571.0, + "repeat_count": 2.0, + "routers_loss": 0.03596706688404083, + "skip_count": 3.0, + "step": 1716, + "text_loss": 0.6252416968345642 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009649451909897532, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 2769206.0, + "repeat_count": 0.0, + "routers_loss": 0.0025788163766264915, + "skip_count": 0.0, + "step": 1718, + "text_loss": 0.8851634860038757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009648312503619843, + "loss": 0.0265, + "macro_f1": 0.3333333432674408, + "num_tokens": 2772488.0, + "repeat_count": 0.0, + "routers_loss": 0.004443451762199402, + "skip_count": 0.0, + "step": 1720, + "text_loss": 0.8568580746650696 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 8.084531846199003, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009647171316100034, + "loss": 0.0265, + "macro_f1": 0.9265305995941162, + "num_tokens": 2776482.0, + "repeat_count": 1.0, + "routers_loss": 0.022948263213038445, + "skip_count": 3.0, + "step": 1722, + "text_loss": 0.13431036472320557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009646028347775409, + "loss": 0.0204, + "macro_f1": 0.6666666865348816, + "num_tokens": 2778966.0, + "repeat_count": 0.0, + "routers_loss": 0.011328035034239292, + "skip_count": 1.0, + "step": 1724, + "text_loss": 0.2085491120815277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009644883599083958, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2781968.0, + "repeat_count": 0.0, + "routers_loss": 0.002208018908277154, + "skip_count": 0.0, + "step": 1726, + "text_loss": 0.4948323965072632 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.112709128265337, + "f1_execute": 0.9411764740943909, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009643737070464349, + "loss": 0.0158, + "macro_f1": 0.6470588445663452, + "num_tokens": 2784666.0, + "repeat_count": 1.0, + "routers_loss": 0.04391832649707794, + "skip_count": 2.0, + "step": 1728, + "text_loss": 0.39060094952583313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0009642588762355935, + "loss": 0.0212, + "macro_f1": 0.6666666865348816, + "num_tokens": 2787558.0, + "repeat_count": 0.0, + "routers_loss": 0.004497280344367027, + "skip_count": 1.0, + "step": 1730, + "text_loss": 0.34908708930015564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009641438675198748, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 2790474.0, + "repeat_count": 0.0, + "routers_loss": 0.00583475548774004, + "skip_count": 0.0, + "step": 1732, + "text_loss": 0.5720033049583435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0009640286809433508, + "loss": 0.0235, + "macro_f1": 0.3333333432674408, + "num_tokens": 2793272.0, + "repeat_count": 0.0, + "routers_loss": 0.007826375775039196, + "skip_count": 0.0, + "step": 1734, + "text_loss": 0.32181721925735474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009639133165501606, + "loss": 0.0192, + "macro_f1": 0.3333333432674408, + "num_tokens": 2797726.0, + "repeat_count": 0.0, + "routers_loss": 0.0019055595621466637, + "skip_count": 0.0, + "step": 1736, + "text_loss": 0.620936393737793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009637977743845124, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2800706.0, + "repeat_count": 0.0, + "routers_loss": 0.0028302327264100313, + "skip_count": 0.0, + "step": 1738, + "text_loss": 0.6473138332366943 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009636820544906823, + "loss": 0.0146, + "macro_f1": 1.0, + "num_tokens": 2803847.0, + "repeat_count": 1.0, + "routers_loss": 0.01105099730193615, + "skip_count": 2.0, + "step": 1740, + "text_loss": 0.4401201903820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.178456119753449, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009635661569130141, + "loss": 0.0195, + "macro_f1": 0.5934640765190125, + "num_tokens": 2807235.0, + "repeat_count": 0.0, + "routers_loss": 0.02619045600295067, + "skip_count": 3.0, + "step": 1742, + "text_loss": 0.459264874458313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.187848547108894, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009634500816959202, + "loss": 0.0162, + "macro_f1": 0.6666666865348816, + "num_tokens": 2810396.0, + "repeat_count": 0.0, + "routers_loss": 0.007915694266557693, + "skip_count": 2.0, + "step": 1744, + "text_loss": 0.5084020495414734 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009633338288838805, + "loss": 0.0271, + "macro_f1": 0.5492662787437439, + "num_tokens": 2813215.0, + "repeat_count": 2.0, + "routers_loss": 0.08364596217870712, + "skip_count": 0.0, + "step": 1746, + "text_loss": 0.27681824564933777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 30.0, + "epoch": 8.206633401819783, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009632173985214438, + "loss": 0.0156, + "macro_f1": 0.8817967176437378, + "num_tokens": 2816452.0, + "repeat_count": 3.0, + "routers_loss": 0.028805451467633247, + "skip_count": 2.0, + "step": 1748, + "text_loss": 0.4678419530391693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.216025829175228, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.000963100790653226, + "loss": 0.0188, + "macro_f1": 0.3272727429866791, + "num_tokens": 2819364.0, + "repeat_count": 0.0, + "routers_loss": 0.03056817688047886, + "skip_count": 1.0, + "step": 1750, + "text_loss": 0.3078109920024872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009629840053239116, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2823469.0, + "repeat_count": 0.0, + "routers_loss": 0.0019477814203128219, + "skip_count": 0.0, + "step": 1752, + "text_loss": 0.45501336455345154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.000962867042578253, + "loss": 0.0173, + "macro_f1": 0.3333333432674408, + "num_tokens": 2826716.0, + "repeat_count": 0.0, + "routers_loss": 0.0032963966950774193, + "skip_count": 0.0, + "step": 1754, + "text_loss": 0.49234694242477417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.244203111241562, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009627499024610707, + "loss": 0.0239, + "macro_f1": 0.3272727429866791, + "num_tokens": 2829733.0, + "repeat_count": 0.0, + "routers_loss": 0.010289114899933338, + "skip_count": 1.0, + "step": 1756, + "text_loss": 0.22335539758205414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.253595538597006, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009626325850172527, + "loss": 0.0174, + "macro_f1": 0.3272727429866791, + "num_tokens": 2833350.0, + "repeat_count": 0.0, + "routers_loss": 0.03249066323041916, + "skip_count": 1.0, + "step": 1758, + "text_loss": 0.6581931114196777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009625150902917555, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 2836558.0, + "repeat_count": 0.0, + "routers_loss": 0.00870000571012497, + "skip_count": 0.0, + "step": 1760, + "text_loss": 0.22938725352287292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009623974183296031, + "loss": 0.0192, + "macro_f1": 0.3333333432674408, + "num_tokens": 2840560.0, + "repeat_count": 0.0, + "routers_loss": 0.007767196744680405, + "skip_count": 0.0, + "step": 1762, + "text_loss": 0.24473799765110016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009622795691758876, + "loss": 0.0244, + "macro_f1": 0.3333333432674408, + "num_tokens": 2843548.0, + "repeat_count": 0.0, + "routers_loss": 0.0021693643648177385, + "skip_count": 0.0, + "step": 1764, + "text_loss": 0.3084608018398285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009621615428757693, + "loss": 0.0149, + "macro_f1": 0.3333333432674408, + "num_tokens": 2847076.0, + "repeat_count": 0.0, + "routers_loss": 0.0024727333802729845, + "skip_count": 0.0, + "step": 1766, + "text_loss": 0.5251734852790833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.000962043339474476, + "loss": 0.0194, + "macro_f1": 0.3333333432674408, + "num_tokens": 2849751.0, + "repeat_count": 0.0, + "routers_loss": 0.005174890160560608, + "skip_count": 0.0, + "step": 1768, + "text_loss": 0.4410129189491272 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0009619249590173032, + "loss": 0.016, + "macro_f1": 0.6666666865348816, + "num_tokens": 2853916.0, + "repeat_count": 0.0, + "routers_loss": 0.006785830482840538, + "skip_count": 2.0, + "step": 1770, + "text_loss": 0.550076425075531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.31934253008512, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06591796875, + "learning_rate": 0.0009618064015496149, + "loss": 0.0192, + "macro_f1": 0.5934640765190125, + "num_tokens": 2857372.0, + "repeat_count": 0.0, + "routers_loss": 0.021370256319642067, + "skip_count": 3.0, + "step": 1772, + "text_loss": 0.1988629847764969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0009616876671168423, + "loss": 0.0162, + "macro_f1": 0.6666666865348816, + "num_tokens": 2861028.0, + "repeat_count": 0.0, + "routers_loss": 0.004313841462135315, + "skip_count": 1.0, + "step": 1774, + "text_loss": 0.42581331729888916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009615687557644847, + "loss": 0.0268, + "macro_f1": 0.3333333432674408, + "num_tokens": 2864847.0, + "repeat_count": 0.0, + "routers_loss": 0.0025742491707205772, + "skip_count": 0.0, + "step": 1776, + "text_loss": 0.46510905027389526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009614496675381093, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 2867392.0, + "repeat_count": 0.0, + "routers_loss": 0.0016813480760902166, + "skip_count": 0.0, + "step": 1778, + "text_loss": 0.5922174453735352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009613304024833507, + "loss": 0.0166, + "macro_f1": 0.3333333432674408, + "num_tokens": 2871273.0, + "repeat_count": 0.0, + "routers_loss": 0.004948933608829975, + "skip_count": 0.0, + "step": 1780, + "text_loss": 0.6776977777481079 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009612109606459117, + "loss": 0.0186, + "macro_f1": 1.0, + "num_tokens": 2874172.0, + "repeat_count": 1.0, + "routers_loss": 0.016950147226452827, + "skip_count": 2.0, + "step": 1782, + "text_loss": 0.48758944869041443 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.375697094217786, + "f1_execute": 0.9599999785423279, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08251953125, + "learning_rate": 0.0009610913420715623, + "loss": 0.0237, + "macro_f1": 0.7644444704055786, + "num_tokens": 2877528.0, + "repeat_count": 2.0, + "routers_loss": 0.04880943149328232, + "skip_count": 1.0, + "step": 1784, + "text_loss": 0.4404778480529785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0009609715468061411, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2880627.0, + "repeat_count": 0.0, + "routers_loss": 0.004678630735725164, + "skip_count": 0.0, + "step": 1786, + "text_loss": 0.7295402884483337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009608515748955535, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2883333.0, + "repeat_count": 0.0, + "routers_loss": 0.0026695074047893286, + "skip_count": 0.0, + "step": 1788, + "text_loss": 0.9697831273078918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 8.40387437628412, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.107421875, + "learning_rate": 0.000960731426385773, + "loss": 0.0157, + "macro_f1": 0.4871794879436493, + "num_tokens": 2887444.0, + "repeat_count": 0.0, + "routers_loss": 0.029743613675236702, + "skip_count": 2.0, + "step": 1790, + "text_loss": 0.4737568199634552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.0009606111013228407, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 2890221.0, + "repeat_count": 0.0, + "routers_loss": 0.0016153788892552257, + "skip_count": 0.0, + "step": 1792, + "text_loss": 0.6693558096885681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.422659230995011, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009604905997528655, + "loss": 0.02, + "macro_f1": 0.3272727429866791, + "num_tokens": 2893262.0, + "repeat_count": 0.0, + "routers_loss": 0.01965433731675148, + "skip_count": 1.0, + "step": 1794, + "text_loss": 0.45227760076522827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.432051658350455, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009603699217220239, + "loss": 0.0117, + "macro_f1": 0.6601307392120361, + "num_tokens": 2896823.0, + "repeat_count": 1.0, + "routers_loss": 0.024017298594117165, + "skip_count": 2.0, + "step": 1796, + "text_loss": 0.48865509033203125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009602490672765597, + "loss": 0.0182, + "macro_f1": 0.3333333432674408, + "num_tokens": 2899707.0, + "repeat_count": 0.0, + "routers_loss": 0.0012420224957168102, + "skip_count": 0.0, + "step": 1798, + "text_loss": 0.43292415142059326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07861328125, + "learning_rate": 0.0009601280364627848, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 2902795.0, + "repeat_count": 0.0, + "routers_loss": 0.0020389219280332327, + "skip_count": 0.0, + "step": 1800, + "text_loss": 0.41021591424942017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009600068293270783, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 2905769.0, + "repeat_count": 0.0, + "routers_loss": 0.002006303984671831, + "skip_count": 0.0, + "step": 1802, + "text_loss": 0.46892106533050537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08740234375, + "learning_rate": 0.000959885445915887, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 2909475.0, + "repeat_count": 0.0, + "routers_loss": 0.003734810510650277, + "skip_count": 0.0, + "step": 1804, + "text_loss": 0.45364710688591003 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 8.479013795127678, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009597638862757254, + "loss": 0.0182, + "macro_f1": 0.8823530077934265, + "num_tokens": 2914348.0, + "repeat_count": 1.0, + "routers_loss": 0.038971323519945145, + "skip_count": 2.0, + "step": 1806, + "text_loss": 0.42913779616355896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.488406222483123, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009596421504531751, + "loss": 0.0249, + "macro_f1": 0.3272727429866791, + "num_tokens": 2917467.0, + "repeat_count": 1.0, + "routers_loss": 0.04800829663872719, + "skip_count": 0.0, + "step": 1808, + "text_loss": 0.17332297563552856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009595202384948858, + "loss": 0.0227, + "macro_f1": 0.6666666865348816, + "num_tokens": 2920223.0, + "repeat_count": 1.0, + "routers_loss": 0.009164143353700638, + "skip_count": 0.0, + "step": 1810, + "text_loss": 0.33740702271461487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009593981504475742, + "loss": 0.0275, + "macro_f1": 0.6666666865348816, + "num_tokens": 2923780.0, + "repeat_count": 0.0, + "routers_loss": 0.011236993595957756, + "skip_count": 2.0, + "step": 1812, + "text_loss": 0.1609916388988495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009592758863580248, + "loss": 0.0259, + "macro_f1": 0.5492662787437439, + "num_tokens": 2926259.0, + "repeat_count": 0.0, + "routers_loss": 0.019026532769203186, + "skip_count": 2.0, + "step": 1814, + "text_loss": 0.6460903882980347 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.525975931904902, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009591534462730894, + "loss": 0.0206, + "macro_f1": 0.5492662787437439, + "num_tokens": 2929173.0, + "repeat_count": 2.0, + "routers_loss": 0.0608333982527256, + "skip_count": 0.0, + "step": 1816, + "text_loss": 0.476126492023468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.000959030830239687, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 2932703.0, + "repeat_count": 0.0, + "routers_loss": 0.0093300249427557, + "skip_count": 0.0, + "step": 1818, + "text_loss": 0.5471875667572021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2001953125, + "learning_rate": 0.0009589080383048048, + "loss": 0.0235, + "macro_f1": 0.3333333432674408, + "num_tokens": 2936195.0, + "repeat_count": 0.0, + "routers_loss": 0.010434109717607498, + "skip_count": 0.0, + "step": 1820, + "text_loss": 0.5068115592002869 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009587850705154964, + "loss": 0.0291, + "macro_f1": 0.3333333432674408, + "num_tokens": 2939412.0, + "repeat_count": 0.0, + "routers_loss": 0.004347751382738352, + "skip_count": 0.0, + "step": 1822, + "text_loss": 0.4241984784603119 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.56354564132668, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0859375, + "learning_rate": 0.0009586619269188836, + "loss": 0.0224, + "macro_f1": 0.32098767161369324, + "num_tokens": 2942318.0, + "repeat_count": 0.0, + "routers_loss": 0.034238871186971664, + "skip_count": 1.0, + "step": 1824, + "text_loss": 0.2328975349664688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009585386075621553, + "loss": 0.027, + "macro_f1": 0.3333333432674408, + "num_tokens": 2945731.0, + "repeat_count": 0.0, + "routers_loss": 0.006097695790231228, + "skip_count": 0.0, + "step": 1826, + "text_loss": 0.22816994786262512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.582330496037569, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009584151124925676, + "loss": 0.0208, + "macro_f1": 0.3272727429866791, + "num_tokens": 2948944.0, + "repeat_count": 0.0, + "routers_loss": 0.007790776435285807, + "skip_count": 1.0, + "step": 1828, + "text_loss": 0.5009413361549377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009582914417574438, + "loss": 0.0145, + "macro_f1": 0.6666666865348816, + "num_tokens": 2951723.0, + "repeat_count": 0.0, + "routers_loss": 0.009144559502601624, + "skip_count": 2.0, + "step": 1830, + "text_loss": 0.1402502954006195 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0009581675954041751, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 2954726.0, + "repeat_count": 1.0, + "routers_loss": 0.006593191530555487, + "skip_count": 0.0, + "step": 1832, + "text_loss": 0.4871736466884613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009580435734802196, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 2957853.0, + "repeat_count": 0.0, + "routers_loss": 0.01241068821400404, + "skip_count": 0.0, + "step": 1834, + "text_loss": 0.30100154876708984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009579193760331027, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 2960783.0, + "repeat_count": 0.0, + "routers_loss": 0.002219218760728836, + "skip_count": 0.0, + "step": 1836, + "text_loss": 0.4961516559123993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.629292632814794, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009577950031104169, + "loss": 0.0166, + "macro_f1": 0.6601307392120361, + "num_tokens": 2963328.0, + "repeat_count": 1.0, + "routers_loss": 0.029363535344600677, + "skip_count": 2.0, + "step": 1838, + "text_loss": 0.42814353108406067 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 8.638685060170237, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009576704547598226, + "loss": 0.0257, + "macro_f1": 0.7795917987823486, + "num_tokens": 2966108.0, + "repeat_count": 1.0, + "routers_loss": 0.0579402856528759, + "skip_count": 4.0, + "step": 1840, + "text_loss": 0.20523512363433838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009575457310290463, + "loss": 0.0121, + "macro_f1": 0.3272727429866791, + "num_tokens": 2969137.0, + "repeat_count": 0.0, + "routers_loss": 0.008810589089989662, + "skip_count": 0.0, + "step": 1842, + "text_loss": 0.6199528574943542 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009574208319658831, + "loss": 0.0208, + "macro_f1": 0.6666666865348816, + "num_tokens": 2972407.0, + "repeat_count": 0.0, + "routers_loss": 0.0012295129708945751, + "skip_count": 1.0, + "step": 1844, + "text_loss": 0.66938316822052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 8.666862342236572, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1474609375, + "learning_rate": 0.000957295757618194, + "loss": 0.0152, + "macro_f1": 0.4871794879436493, + "num_tokens": 2976045.0, + "repeat_count": 0.0, + "routers_loss": 0.06162935495376587, + "skip_count": 2.0, + "step": 1846, + "text_loss": 0.5381782650947571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009571705080339079, + "loss": 0.0144, + "macro_f1": 0.3333333432674408, + "num_tokens": 2979025.0, + "repeat_count": 0.0, + "routers_loss": 0.003950524143874645, + "skip_count": 0.0, + "step": 1848, + "text_loss": 0.5831671357154846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.0009570450832610208, + "loss": 0.0209, + "macro_f1": 0.3333333432674408, + "num_tokens": 2982276.0, + "repeat_count": 0.0, + "routers_loss": 0.010354886762797832, + "skip_count": 0.0, + "step": 1850, + "text_loss": 0.27448201179504395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009569194833475956, + "loss": 0.0199, + "macro_f1": 0.3272727429866791, + "num_tokens": 2985691.0, + "repeat_count": 0.0, + "routers_loss": 0.010167439468204975, + "skip_count": 0.0, + "step": 1852, + "text_loss": 0.5264663696289062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.704432051658351, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1328125, + "learning_rate": 0.0009567937083417624, + "loss": 0.0194, + "macro_f1": 0.3272727429866791, + "num_tokens": 2989126.0, + "repeat_count": 0.0, + "routers_loss": 0.0371871180832386, + "skip_count": 1.0, + "step": 1854, + "text_loss": 0.2008018046617508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009566677582917185, + "loss": 0.0184, + "macro_f1": 0.3333333432674408, + "num_tokens": 2992814.0, + "repeat_count": 0.0, + "routers_loss": 0.010190588422119617, + "skip_count": 0.0, + "step": 1856, + "text_loss": 0.749717116355896 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.72321690636924, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009565416332457282, + "loss": 0.0132, + "macro_f1": 0.6538461446762085, + "num_tokens": 2995729.0, + "repeat_count": 1.0, + "routers_loss": 0.022285036742687225, + "skip_count": 1.0, + "step": 1858, + "text_loss": 0.5870219469070435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.732609333724685, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009564153332521228, + "loss": 0.0224, + "macro_f1": 0.3272727429866791, + "num_tokens": 2998812.0, + "repeat_count": 0.0, + "routers_loss": 0.011050296947360039, + "skip_count": 1.0, + "step": 1860, + "text_loss": 0.8444408774375916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0009562888583593005, + "loss": 0.0163, + "macro_f1": 0.3333333432674408, + "num_tokens": 3001799.0, + "repeat_count": 0.0, + "routers_loss": 0.007125461008399725, + "skip_count": 0.0, + "step": 1862, + "text_loss": 0.41510361433029175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009561622086157272, + "loss": 0.0236, + "macro_f1": 0.3333333432674408, + "num_tokens": 3005088.0, + "repeat_count": 0.0, + "routers_loss": 0.0049054501578211784, + "skip_count": 0.0, + "step": 1864, + "text_loss": 0.3801248073577881 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.000956035384069935, + "loss": 0.0238, + "macro_f1": 1.0, + "num_tokens": 3008178.0, + "repeat_count": 1.0, + "routers_loss": 0.005162427201867104, + "skip_count": 1.0, + "step": 1866, + "text_loss": 0.2687684893608093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0009559083847705233, + "loss": 0.0214, + "macro_f1": 0.3272727429866791, + "num_tokens": 3010923.0, + "repeat_count": 0.0, + "routers_loss": 0.028984658420085907, + "skip_count": 1.0, + "step": 1868, + "text_loss": 0.6277349591255188 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.779571470501908, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009557812107661584, + "loss": 0.0208, + "macro_f1": 1.0, + "num_tokens": 3015030.0, + "repeat_count": 1.0, + "routers_loss": 0.012200530618429184, + "skip_count": 1.0, + "step": 1870, + "text_loss": 0.6293368339538574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.788963897857352, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009556538621055739, + "loss": 0.0268, + "macro_f1": 0.3272727429866791, + "num_tokens": 3019067.0, + "repeat_count": 0.0, + "routers_loss": 0.06365182995796204, + "skip_count": 1.0, + "step": 1872, + "text_loss": 0.39046618342399597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009555263388375699, + "loss": 0.014, + "macro_f1": 0.6666666865348816, + "num_tokens": 3022166.0, + "repeat_count": 0.0, + "routers_loss": 0.0041703456081449986, + "skip_count": 1.0, + "step": 1874, + "text_loss": 0.42232340574264526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11572265625, + "learning_rate": 0.0009553986410110134, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3025865.0, + "repeat_count": 0.0, + "routers_loss": 0.005841755773872137, + "skip_count": 0.0, + "step": 1876, + "text_loss": 0.37600573897361755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.817141179923686, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009552707686748388, + "loss": 0.0219, + "macro_f1": 0.3272727429866791, + "num_tokens": 3029950.0, + "repeat_count": 0.0, + "routers_loss": 0.05165952071547508, + "skip_count": 1.0, + "step": 1878, + "text_loss": 0.33717799186706543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009551427218780467, + "loss": 0.0219, + "macro_f1": 0.6666666865348816, + "num_tokens": 3033649.0, + "repeat_count": 0.0, + "routers_loss": 0.020680008456110954, + "skip_count": 2.0, + "step": 1880, + "text_loss": 0.5011783838272095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.835926034634575, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009550145006697048, + "loss": 0.0217, + "macro_f1": 0.32098764181137085, + "num_tokens": 3036847.0, + "repeat_count": 0.0, + "routers_loss": 0.07626450061798096, + "skip_count": 2.0, + "step": 1882, + "text_loss": 0.3066408336162567 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009548861050989482, + "loss": 0.0136, + "macro_f1": 1.0, + "num_tokens": 3040353.0, + "repeat_count": 1.0, + "routers_loss": 0.010884666815400124, + "skip_count": 1.0, + "step": 1884, + "text_loss": 0.49779415130615234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009547575352149778, + "loss": 0.0213, + "macro_f1": 0.6666666865348816, + "num_tokens": 3043504.0, + "repeat_count": 0.0, + "routers_loss": 0.006704333238303661, + "skip_count": 2.0, + "step": 1886, + "text_loss": 0.12284614145755768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.86410331670091, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009546287910670621, + "loss": 0.0211, + "macro_f1": 0.5427350401878357, + "num_tokens": 3046422.0, + "repeat_count": 1.0, + "routers_loss": 0.04799000173807144, + "skip_count": 2.0, + "step": 1888, + "text_loss": 0.1824081838130951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009544998727045361, + "loss": 0.0306, + "macro_f1": 0.3333333432674408, + "num_tokens": 3049819.0, + "repeat_count": 0.0, + "routers_loss": 0.008139612153172493, + "skip_count": 0.0, + "step": 1890, + "text_loss": 0.18929053843021393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.8828881714118, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.09375, + "learning_rate": 0.0009543707801768015, + "loss": 0.0175, + "macro_f1": 0.5934640765190125, + "num_tokens": 3052766.0, + "repeat_count": 0.0, + "routers_loss": 0.02966771461069584, + "skip_count": 3.0, + "step": 1892, + "text_loss": 0.247748002409935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 25.0, + "epoch": 8.892280598767243, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009542415135333267, + "loss": 0.0193, + "macro_f1": 0.44705885648727417, + "num_tokens": 3056427.0, + "repeat_count": 0.0, + "routers_loss": 0.03637036308646202, + "skip_count": 2.0, + "step": 1894, + "text_loss": 0.2583999037742615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009541120728236472, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3059497.0, + "repeat_count": 0.0, + "routers_loss": 0.007026574574410915, + "skip_count": 0.0, + "step": 1896, + "text_loss": 0.5222375988960266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.076171875, + "learning_rate": 0.0009539824580973646, + "loss": 0.0219, + "macro_f1": 0.3333333432674408, + "num_tokens": 3062187.0, + "repeat_count": 0.0, + "routers_loss": 0.003449335927143693, + "skip_count": 0.0, + "step": 1898, + "text_loss": 0.5736427307128906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009538526694041477, + "loss": 0.0163, + "macro_f1": 0.3333333432674408, + "num_tokens": 3066100.0, + "repeat_count": 0.0, + "routers_loss": 0.0035463871899992228, + "skip_count": 0.0, + "step": 1900, + "text_loss": 0.5471583604812622 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009537227067937318, + "loss": 0.0233, + "macro_f1": 1.0, + "num_tokens": 3068737.0, + "repeat_count": 3.0, + "routers_loss": 0.00597514258697629, + "skip_count": 3.0, + "step": 1902, + "text_loss": 0.36644190549850464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.939242735544468, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.166015625, + "learning_rate": 0.0009535925703159186, + "loss": 0.0301, + "macro_f1": 0.32098764181137085, + "num_tokens": 3071686.0, + "repeat_count": 0.0, + "routers_loss": 0.025420479476451874, + "skip_count": 2.0, + "step": 1904, + "text_loss": 0.535789966583252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009534622600205769, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 3074954.0, + "repeat_count": 0.0, + "routers_loss": 0.014377486892044544, + "skip_count": 0.0, + "step": 1906, + "text_loss": 0.19009549915790558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009533317759576416, + "loss": 0.0197, + "macro_f1": 0.3333333432674408, + "num_tokens": 3077540.0, + "repeat_count": 0.0, + "routers_loss": 0.004848944488912821, + "skip_count": 0.0, + "step": 1908, + "text_loss": 0.5022001266479492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009532011181771148, + "loss": 0.0217, + "macro_f1": 0.6666666865348816, + "num_tokens": 3080445.0, + "repeat_count": 0.0, + "routers_loss": 0.009480170905590057, + "skip_count": 2.0, + "step": 1910, + "text_loss": 0.35135936737060547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0009530702867290644, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 3083657.0, + "repeat_count": 0.0, + "routers_loss": 0.0019353039097040892, + "skip_count": 0.0, + "step": 1912, + "text_loss": 0.5123994946479797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.986204872321691, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009529392816636256, + "loss": 0.0249, + "macro_f1": 0.3333333432674408, + "num_tokens": 3086837.0, + "repeat_count": 0.0, + "routers_loss": 0.0010921972570940852, + "skip_count": 0.0, + "step": 1914, + "text_loss": 0.44477662444114685 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19140625, + "learning_rate": 0.0009528081030309995, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 3089892.0, + "repeat_count": 0.0, + "routers_loss": 0.0018027103506028652, + "skip_count": 0.0, + "step": 1916, + "text_loss": 0.7356183528900146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009526767508814542, + "loss": 0.0236, + "macro_f1": 0.3333333432674408, + "num_tokens": 3093058.0, + "repeat_count": 0.0, + "routers_loss": 0.003243023296818137, + "skip_count": 0.0, + "step": 1918, + "text_loss": 0.48823556303977966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009525452252653239, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 3096404.0, + "repeat_count": 0.0, + "routers_loss": 0.009360014460980892, + "skip_count": 0.0, + "step": 1920, + "text_loss": 0.21498437225818634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.023481068388612, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.140625, + "learning_rate": 0.0009524135262330098, + "loss": 0.0224, + "macro_f1": 0.9265305995941162, + "num_tokens": 3099520.0, + "repeat_count": 1.0, + "routers_loss": 0.017444295808672905, + "skip_count": 3.0, + "step": 1922, + "text_loss": 0.27608850598335266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.032873495744056, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009522816538349789, + "loss": 0.0162, + "macro_f1": 0.5492662787437439, + "num_tokens": 3102956.0, + "repeat_count": 0.0, + "routers_loss": 0.06424452364444733, + "skip_count": 2.0, + "step": 1924, + "text_loss": 0.21558666229248047 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009521496081217651, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 3106565.0, + "repeat_count": 1.0, + "routers_loss": 0.002270506462082267, + "skip_count": 0.0, + "step": 1926, + "text_loss": 0.5641813278198242 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009520173891439684, + "loss": 0.0216, + "macro_f1": 0.6666666865348816, + "num_tokens": 3109314.0, + "repeat_count": 0.0, + "routers_loss": 0.011512448079884052, + "skip_count": 1.0, + "step": 1928, + "text_loss": 0.6351624727249146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009518849969522556, + "loss": 0.0198, + "macro_f1": 0.3333333432674408, + "num_tokens": 3112956.0, + "repeat_count": 0.0, + "routers_loss": 0.003883908037096262, + "skip_count": 0.0, + "step": 1930, + "text_loss": 0.35160085558891296 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009517524315973595, + "loss": 0.019, + "macro_f1": 1.0, + "num_tokens": 3115593.0, + "repeat_count": 1.0, + "routers_loss": 0.009479222819209099, + "skip_count": 3.0, + "step": 1932, + "text_loss": 0.2900560200214386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0009516196931300794, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3118516.0, + "repeat_count": 0.0, + "routers_loss": 0.017834696918725967, + "skip_count": 2.0, + "step": 1934, + "text_loss": 0.20094378292560577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009514867816012809, + "loss": 0.02, + "macro_f1": 0.3333333432674408, + "num_tokens": 3122242.0, + "repeat_count": 0.0, + "routers_loss": 0.0017964740982279181, + "skip_count": 0.0, + "step": 1936, + "text_loss": 0.6498590707778931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0009513536970618961, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 3125645.0, + "repeat_count": 0.0, + "routers_loss": 0.007437168620526791, + "skip_count": 2.0, + "step": 1938, + "text_loss": 0.25863033533096313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009512204395629232, + "loss": 0.0184, + "macro_f1": 0.6666666865348816, + "num_tokens": 3128740.0, + "repeat_count": 0.0, + "routers_loss": 0.0008759932243265212, + "skip_count": 1.0, + "step": 1940, + "text_loss": 0.5638351440429688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.117405341943059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009510870091554264, + "loss": 0.0153, + "macro_f1": 0.3272727429866791, + "num_tokens": 3131742.0, + "repeat_count": 1.0, + "routers_loss": 0.019906625151634216, + "skip_count": 0.0, + "step": 1942, + "text_loss": 0.8410717844963074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009509534058905369, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3134407.0, + "repeat_count": 0.0, + "routers_loss": 0.0009229081333614886, + "skip_count": 0.0, + "step": 1944, + "text_loss": 0.47506049275398254 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009508196298194517, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3137053.0, + "repeat_count": 0.0, + "routers_loss": 0.003630586201325059, + "skip_count": 0.0, + "step": 1946, + "text_loss": 0.32225799560546875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009506856809934338, + "loss": 0.0119, + "macro_f1": 0.3333333432674408, + "num_tokens": 3140943.0, + "repeat_count": 0.0, + "routers_loss": 0.007580445148050785, + "skip_count": 0.0, + "step": 1948, + "text_loss": 0.3120577931404114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009505515594638127, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3144298.0, + "repeat_count": 0.0, + "routers_loss": 0.004471861757338047, + "skip_count": 0.0, + "step": 1950, + "text_loss": 0.22052447497844696 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 9.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.09130859375, + "learning_rate": 0.0009504172652819843, + "loss": 0.023, + "macro_f1": 1.0, + "num_tokens": 3147069.0, + "repeat_count": 1.0, + "routers_loss": 0.009606664068996906, + "skip_count": 1.0, + "step": 1952, + "text_loss": 0.34773921966552734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009502827984994099, + "loss": 0.0148, + "macro_f1": 0.6666666865348816, + "num_tokens": 3149992.0, + "repeat_count": 0.0, + "routers_loss": 0.006443799939006567, + "skip_count": 1.0, + "step": 1954, + "text_loss": 0.6442171335220337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009501481591676177, + "loss": 0.0188, + "macro_f1": 0.3333333432674408, + "num_tokens": 3153167.0, + "repeat_count": 0.0, + "routers_loss": 0.003219039412215352, + "skip_count": 0.0, + "step": 1956, + "text_loss": 0.43369221687316895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.192544760786616, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.000950013347338202, + "loss": 0.0152, + "macro_f1": 0.3272727429866791, + "num_tokens": 3156590.0, + "repeat_count": 0.0, + "routers_loss": 0.025551019236445427, + "skip_count": 1.0, + "step": 1958, + "text_loss": 0.294479101896286 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009498783630628225, + "loss": 0.0158, + "macro_f1": 1.0, + "num_tokens": 3159451.0, + "repeat_count": 1.0, + "routers_loss": 0.013802438974380493, + "skip_count": 2.0, + "step": 1960, + "text_loss": 0.20888492465019226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.211329615497505, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009497432063932057, + "loss": 0.0137, + "macro_f1": 0.6601307392120361, + "num_tokens": 3162889.0, + "repeat_count": 1.0, + "routers_loss": 0.02852988988161087, + "skip_count": 2.0, + "step": 1962, + "text_loss": 0.5027125477790833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009496078773811437, + "loss": 0.0136, + "macro_f1": 0.6666666865348816, + "num_tokens": 3165979.0, + "repeat_count": 0.0, + "routers_loss": 0.01784522272646427, + "skip_count": 2.0, + "step": 1964, + "text_loss": 0.1696339100599289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.000949472376078495, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3168683.0, + "repeat_count": 0.0, + "routers_loss": 0.0017019887454807758, + "skip_count": 0.0, + "step": 1966, + "text_loss": 0.48905447125434875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.000949336702537184, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 3171968.0, + "repeat_count": 0.0, + "routers_loss": 0.004817947279661894, + "skip_count": 2.0, + "step": 1968, + "text_loss": 0.20984773337841034 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009492008568092007, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 3175947.0, + "repeat_count": 0.0, + "routers_loss": 0.0012963006738573313, + "skip_count": 0.0, + "step": 1970, + "text_loss": 0.5215106010437012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 9.258291752274728, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.044921875, + "learning_rate": 0.0009490648389466019, + "loss": 0.0135, + "macro_f1": 0.4871794879436493, + "num_tokens": 3179348.0, + "repeat_count": 0.0, + "routers_loss": 0.03950481489300728, + "skip_count": 2.0, + "step": 1972, + "text_loss": 0.24640929698944092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09326171875, + "learning_rate": 0.0009489286490015097, + "loss": 0.0183, + "macro_f1": 0.6666666865348816, + "num_tokens": 3182640.0, + "repeat_count": 0.0, + "routers_loss": 0.0043345349840819836, + "skip_count": 2.0, + "step": 1974, + "text_loss": 0.6362852454185486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009487922870261122, + "loss": 0.0155, + "macro_f1": 0.3333333432674408, + "num_tokens": 3185657.0, + "repeat_count": 0.0, + "routers_loss": 0.0015687479171901941, + "skip_count": 0.0, + "step": 1976, + "text_loss": 0.8977144360542297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009486557530726638, + "loss": 0.0139, + "macro_f1": 0.3333333432674408, + "num_tokens": 3188772.0, + "repeat_count": 0.0, + "routers_loss": 0.0010977238416671753, + "skip_count": 0.0, + "step": 1978, + "text_loss": 0.38512736558914185 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0009485190471934844, + "loss": 0.0196, + "macro_f1": 0.6666666865348816, + "num_tokens": 3193131.0, + "repeat_count": 2.0, + "routers_loss": 0.002264744369313121, + "skip_count": 0.0, + "step": 1980, + "text_loss": 0.4171289801597595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.305253889051952, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.00094838216944096, + "loss": 0.0219, + "macro_f1": 0.3272727429866791, + "num_tokens": 3196668.0, + "repeat_count": 0.0, + "routers_loss": 0.042320676147937775, + "skip_count": 1.0, + "step": 1982, + "text_loss": 0.19008000195026398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 9.314646316407396, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009482451198675424, + "loss": 0.0151, + "macro_f1": 0.32098767161369324, + "num_tokens": 3200282.0, + "repeat_count": 0.0, + "routers_loss": 0.01796630397439003, + "skip_count": 1.0, + "step": 1984, + "text_loss": 0.5009249448776245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0009481078985257494, + "loss": 0.0147, + "macro_f1": 0.6666666865348816, + "num_tokens": 3204439.0, + "repeat_count": 0.0, + "routers_loss": 0.01052347756922245, + "skip_count": 1.0, + "step": 1986, + "text_loss": 0.15319275856018066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.333431171118287, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009479705054681644, + "loss": 0.015, + "macro_f1": 0.3076923191547394, + "num_tokens": 3207590.0, + "repeat_count": 1.0, + "routers_loss": 0.09640293568372726, + "skip_count": 3.0, + "step": 1988, + "text_loss": 0.3654652535915375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.34282359847373, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009478329407474366, + "loss": 0.0183, + "macro_f1": 0.5492662787437439, + "num_tokens": 3211172.0, + "repeat_count": 0.0, + "routers_loss": 0.012670112773776054, + "skip_count": 1.0, + "step": 1990, + "text_loss": 0.5817596316337585 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.000947695204416281, + "loss": 0.0121, + "macro_f1": 0.6666666865348816, + "num_tokens": 3214050.0, + "repeat_count": 1.0, + "routers_loss": 0.005263707600533962, + "skip_count": 0.0, + "step": 1992, + "text_loss": 0.5985888242721558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.361608453184619, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009475572965274787, + "loss": 0.0144, + "macro_f1": 0.3272727429866791, + "num_tokens": 3217318.0, + "repeat_count": 1.0, + "routers_loss": 0.0682850033044815, + "skip_count": 0.0, + "step": 1994, + "text_loss": 0.316506564617157 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.000947419217133876, + "loss": 0.019, + "macro_f1": 0.6666666865348816, + "num_tokens": 3220012.0, + "repeat_count": 0.0, + "routers_loss": 0.008508823812007904, + "skip_count": 2.0, + "step": 1996, + "text_loss": 0.09665893763303757 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0009472809662883852, + "loss": 0.0155, + "macro_f1": 1.0, + "num_tokens": 3223019.0, + "repeat_count": 1.0, + "routers_loss": 0.01100847590714693, + "skip_count": 2.0, + "step": 1998, + "text_loss": 0.4938808083534241 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.389785735250953, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009471425440439844, + "loss": 0.0135, + "macro_f1": 0.8817967176437378, + "num_tokens": 3226013.0, + "repeat_count": 2.0, + "routers_loss": 0.04953207075595856, + "skip_count": 3.0, + "step": 2000, + "text_loss": 0.22258254885673523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 9.399178162606399, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009470039504537173, + "loss": 0.0186, + "macro_f1": 0.31446540355682373, + "num_tokens": 3230031.0, + "repeat_count": 0.0, + "routers_loss": 0.052884332835674286, + "skip_count": 2.0, + "step": 2002, + "text_loss": 0.1741616576910019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009468651855706931, + "loss": 0.0204, + "macro_f1": 0.6666666865348816, + "num_tokens": 3232991.0, + "repeat_count": 1.0, + "routers_loss": 0.008056716993451118, + "skip_count": 0.0, + "step": 2004, + "text_loss": 0.3173636198043823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009467262494480868, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3236390.0, + "repeat_count": 0.0, + "routers_loss": 0.0053409393876791, + "skip_count": 0.0, + "step": 2006, + "text_loss": 0.5806330442428589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.000946587142139139, + "loss": 0.0147, + "macro_f1": 0.3333333432674408, + "num_tokens": 3239267.0, + "repeat_count": 0.0, + "routers_loss": 0.0015652200672775507, + "skip_count": 0.0, + "step": 2008, + "text_loss": 0.6214317679405212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.436747872028178, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.000946447863697156, + "loss": 0.0151, + "macro_f1": 0.6601307392120361, + "num_tokens": 3242569.0, + "repeat_count": 1.0, + "routers_loss": 0.011673987843096256, + "skip_count": 2.0, + "step": 2010, + "text_loss": 0.532565712928772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0009463084141755093, + "loss": 0.0159, + "macro_f1": 0.3272727429866791, + "num_tokens": 3245669.0, + "repeat_count": 0.0, + "routers_loss": 0.028480790555477142, + "skip_count": 1.0, + "step": 2012, + "text_loss": 0.25210800766944885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009461687936276364, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3248751.0, + "repeat_count": 0.0, + "routers_loss": 0.007234727032482624, + "skip_count": 0.0, + "step": 2014, + "text_loss": 0.35922971367836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009460290021070402, + "loss": 0.0195, + "macro_f1": 0.6666666865348816, + "num_tokens": 3252614.0, + "repeat_count": 1.0, + "routers_loss": 0.014691276475787163, + "skip_count": 0.0, + "step": 2016, + "text_loss": 0.2747853398323059 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009458890396672888, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 3256374.0, + "repeat_count": 0.0, + "routers_loss": 0.002385235857218504, + "skip_count": 0.0, + "step": 2018, + "text_loss": 0.5268719792366028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 9.483710008805401, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009457489063620164, + "loss": 0.0133, + "macro_f1": 0.8823530077934265, + "num_tokens": 3259792.0, + "repeat_count": 1.0, + "routers_loss": 0.047268565744161606, + "skip_count": 2.0, + "step": 2020, + "text_loss": 0.7785539627075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.493102436160845, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009456086022449221, + "loss": 0.0218, + "macro_f1": 0.3272727429866791, + "num_tokens": 3262833.0, + "repeat_count": 0.0, + "routers_loss": 0.015878718346357346, + "skip_count": 1.0, + "step": 2022, + "text_loss": 0.42270028591156006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009454681273697711, + "loss": 0.0117, + "macro_f1": 0.3272727429866791, + "num_tokens": 3265718.0, + "repeat_count": 1.0, + "routers_loss": 0.030749641358852386, + "skip_count": 0.0, + "step": 2024, + "text_loss": 0.18668225407600403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0009453274817903931, + "loss": 0.012, + "macro_f1": 0.6666666865348816, + "num_tokens": 3268158.0, + "repeat_count": 0.0, + "routers_loss": 0.011538166552782059, + "skip_count": 1.0, + "step": 2026, + "text_loss": 0.34090787172317505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.000945186665560684, + "loss": 0.0218, + "macro_f1": 0.3333333432674408, + "num_tokens": 3271082.0, + "repeat_count": 0.0, + "routers_loss": 0.009527760557830334, + "skip_count": 0.0, + "step": 2028, + "text_loss": 0.2110334187746048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.530672145582624, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.000945045678734605, + "loss": 0.0175, + "macro_f1": 0.3144654333591461, + "num_tokens": 3273488.0, + "repeat_count": 0.0, + "routers_loss": 0.03317151218652725, + "skip_count": 3.0, + "step": 2030, + "text_loss": 0.2233227640390396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.540064572938068, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009449045213661822, + "loss": 0.0201, + "macro_f1": 0.3272727429866791, + "num_tokens": 3276646.0, + "repeat_count": 0.0, + "routers_loss": 0.018510591238737106, + "skip_count": 1.0, + "step": 2032, + "text_loss": 0.16100332140922546 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 9.549457000293513, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.1318359375, + "learning_rate": 0.0009447631935095077, + "loss": 0.0185, + "macro_f1": 0.9452888369560242, + "num_tokens": 3279441.0, + "repeat_count": 1.0, + "routers_loss": 0.028113311156630516, + "skip_count": 4.0, + "step": 2034, + "text_loss": 0.29208317399024963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009446216952187384, + "loss": 0.0164, + "macro_f1": 0.3333333432674408, + "num_tokens": 3282697.0, + "repeat_count": 0.0, + "routers_loss": 0.008379172533750534, + "skip_count": 0.0, + "step": 2036, + "text_loss": 0.16026398539543152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009444800265480967, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 3285574.0, + "repeat_count": 0.0, + "routers_loss": 0.00941354501992464, + "skip_count": 0.0, + "step": 2038, + "text_loss": 0.29523080587387085 + }, + { + "acc_repeat": 0.75, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.577634282359847, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.8571428656578064, + "f1_skip": 0.800000011920929, + "grad_norm": 0.076171875, + "learning_rate": 0.0009443381875518703, + "loss": 0.0197, + "macro_f1": 0.8600732684135437, + "num_tokens": 3289159.0, + "repeat_count": 4.0, + "routers_loss": 0.04974055662751198, + "skip_count": 6.0, + "step": 2040, + "text_loss": 0.23033179342746735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.587026709715293, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0009441961782844123, + "loss": 0.0146, + "macro_f1": 0.3272727429866791, + "num_tokens": 3293598.0, + "repeat_count": 0.0, + "routers_loss": 0.022241825237870216, + "skip_count": 1.0, + "step": 2042, + "text_loss": 0.8299165368080139 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009440539988001408, + "loss": 0.0159, + "macro_f1": 0.3333333432674408, + "num_tokens": 3296648.0, + "repeat_count": 0.0, + "routers_loss": 0.011019332334399223, + "skip_count": 0.0, + "step": 2044, + "text_loss": 0.18207129836082458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0009439116491535394, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 3300058.0, + "repeat_count": 0.0, + "routers_loss": 0.002889640862122178, + "skip_count": 0.0, + "step": 2046, + "text_loss": 0.7051978707313538 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 9.615203991781627, + "f1_execute": 0.9333333373069763, + "f1_repeat": 0.5, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.078125, + "learning_rate": 0.0009437691293991563, + "loss": 0.0192, + "macro_f1": 0.7634921073913574, + "num_tokens": 3303296.0, + "repeat_count": 3.0, + "routers_loss": 0.07741832733154297, + "skip_count": 4.0, + "step": 2048, + "text_loss": 0.15563532710075378 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.0009436264395916061, + "loss": 0.0209, + "macro_f1": 0.6666666865348816, + "num_tokens": 3306204.0, + "repeat_count": 0.0, + "routers_loss": 0.014225383289158344, + "skip_count": 2.0, + "step": 2050, + "text_loss": 0.18117287755012512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009434835797855672, + "loss": 0.0165, + "macro_f1": 0.3333333432674408, + "num_tokens": 3309444.0, + "repeat_count": 0.0, + "routers_loss": 0.0023932650219649076, + "skip_count": 0.0, + "step": 2052, + "text_loss": 0.4645874798297882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.643381273847961, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009433405500357839, + "loss": 0.0153, + "macro_f1": 0.3272727429866791, + "num_tokens": 3312488.0, + "repeat_count": 0.0, + "routers_loss": 0.03193361684679985, + "skip_count": 1.0, + "step": 2054, + "text_loss": 0.5291082859039307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009431973503970655, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 3315765.0, + "repeat_count": 0.0, + "routers_loss": 0.0020529816392809153, + "skip_count": 0.0, + "step": 2056, + "text_loss": 0.5877931118011475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.66216612855885, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009430539809242864, + "loss": 0.0185, + "macro_f1": 0.32098764181137085, + "num_tokens": 3318877.0, + "repeat_count": 2.0, + "routers_loss": 0.07907948642969131, + "skip_count": 0.0, + "step": 2058, + "text_loss": 0.3836737871170044 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009429104416723862, + "loss": 0.0163, + "macro_f1": 0.6666666865348816, + "num_tokens": 3322576.0, + "repeat_count": 2.0, + "routers_loss": 0.003006070153787732, + "skip_count": 0.0, + "step": 2060, + "text_loss": 0.3480920195579529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009427667326963689, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 3325974.0, + "repeat_count": 0.0, + "routers_loss": 0.005013179033994675, + "skip_count": 0.0, + "step": 2062, + "text_loss": 0.931358814239502 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009426228540513047, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 3329398.0, + "repeat_count": 0.0, + "routers_loss": 0.0059848143719136715, + "skip_count": 0.0, + "step": 2064, + "text_loss": 0.47568953037261963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009424788057923277, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3332029.0, + "repeat_count": 0.0, + "routers_loss": 0.00783882662653923, + "skip_count": 0.0, + "step": 2066, + "text_loss": 0.22887596487998962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.709128265336073, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009423345879746376, + "loss": 0.0128, + "macro_f1": 0.5492662787437439, + "num_tokens": 3334858.0, + "repeat_count": 0.0, + "routers_loss": 0.01866884157061577, + "skip_count": 2.0, + "step": 2068, + "text_loss": 0.17724967002868652 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.718520692691518, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.000942190200653499, + "loss": 0.0162, + "macro_f1": 0.32098764181137085, + "num_tokens": 3338094.0, + "repeat_count": 0.0, + "routers_loss": 0.028636593371629715, + "skip_count": 2.0, + "step": 2070, + "text_loss": 0.34344956278800964 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.727913120046962, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009420456438842413, + "loss": 0.0165, + "macro_f1": 0.5492662787437439, + "num_tokens": 3340526.0, + "repeat_count": 0.0, + "routers_loss": 0.023245645686984062, + "skip_count": 2.0, + "step": 2072, + "text_loss": 0.7276164293289185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.737305547402407, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11328125, + "learning_rate": 0.000941900917722259, + "loss": 0.0143, + "macro_f1": 0.3272727429866791, + "num_tokens": 3343303.0, + "repeat_count": 1.0, + "routers_loss": 0.01565689593553543, + "skip_count": 0.0, + "step": 2074, + "text_loss": 0.5665070414543152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009417560222230115, + "loss": 0.0245, + "macro_f1": 0.3333333432674408, + "num_tokens": 3346409.0, + "repeat_count": 0.0, + "routers_loss": 0.0035056080669164658, + "skip_count": 0.0, + "step": 2076, + "text_loss": 0.5112795233726501 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009416109574420229, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3349220.0, + "repeat_count": 0.0, + "routers_loss": 0.0027565446216613054, + "skip_count": 0.0, + "step": 2078, + "text_loss": 0.5240910053253174 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 9.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009414657234348823, + "loss": 0.0186, + "macro_f1": 1.0, + "num_tokens": 3352627.0, + "repeat_count": 3.0, + "routers_loss": 0.01652451977133751, + "skip_count": 2.0, + "step": 2080, + "text_loss": 1.0217112302780151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.774875256824185, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009413203202572438, + "loss": 0.0179, + "macro_f1": 0.32098764181137085, + "num_tokens": 3355392.0, + "repeat_count": 0.0, + "routers_loss": 0.1012420505285263, + "skip_count": 2.0, + "step": 2082, + "text_loss": 0.4085482358932495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08251953125, + "learning_rate": 0.000941174747964826, + "loss": 0.0154, + "macro_f1": 0.3333333432674408, + "num_tokens": 3358425.0, + "repeat_count": 0.0, + "routers_loss": 0.004962718114256859, + "skip_count": 0.0, + "step": 2084, + "text_loss": 0.5833504796028137 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.793660111535075, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.800000011920929, + "grad_norm": 0.11376953125, + "learning_rate": 0.0009410290066134124, + "loss": 0.0211, + "macro_f1": 0.8083333373069763, + "num_tokens": 3361925.0, + "repeat_count": 2.0, + "routers_loss": 0.07889176905155182, + "skip_count": 3.0, + "step": 2086, + "text_loss": 0.38126569986343384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.803052538890519, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009408830962588517, + "loss": 0.0195, + "macro_f1": 0.6601307392120361, + "num_tokens": 3365963.0, + "repeat_count": 1.0, + "routers_loss": 0.033715736120939255, + "skip_count": 2.0, + "step": 2088, + "text_loss": 0.23213914036750793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009407370169570567, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3369422.0, + "repeat_count": 0.0, + "routers_loss": 0.0014188943896442652, + "skip_count": 0.0, + "step": 2090, + "text_loss": 0.4648318886756897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.82183739360141, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009405907687640054, + "loss": 0.013, + "macro_f1": 0.3272727429866791, + "num_tokens": 3372506.0, + "repeat_count": 0.0, + "routers_loss": 0.015339684672653675, + "skip_count": 1.0, + "step": 2092, + "text_loss": 0.2563800811767578 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.831229820956853, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.0009404443517357404, + "loss": 0.0146, + "macro_f1": 0.542222261428833, + "num_tokens": 3375653.0, + "repeat_count": 4.0, + "routers_loss": 0.06562861055135727, + "skip_count": 0.0, + "step": 2094, + "text_loss": 0.797835111618042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.000940297765928369, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3379018.0, + "repeat_count": 0.0, + "routers_loss": 0.005745889153331518, + "skip_count": 0.0, + "step": 2096, + "text_loss": 0.4238114655017853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009401510113980631, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 3382855.0, + "repeat_count": 0.0, + "routers_loss": 0.0026634482201188803, + "skip_count": 0.0, + "step": 2098, + "text_loss": 0.4967166483402252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009400040882010592, + "loss": 0.0166, + "macro_f1": 0.3333333432674408, + "num_tokens": 3386386.0, + "repeat_count": 0.0, + "routers_loss": 0.0020642587915062904, + "skip_count": 0.0, + "step": 2100, + "text_loss": 0.44390562176704407 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0009398569963936589, + "loss": 0.017, + "macro_f1": 0.3272727429866791, + "num_tokens": 3389958.0, + "repeat_count": 0.0, + "routers_loss": 0.013722737319767475, + "skip_count": 1.0, + "step": 2102, + "text_loss": 0.7207565903663635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009397097360322276, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 3392892.0, + "repeat_count": 0.0, + "routers_loss": 0.002051608171314001, + "skip_count": 0.0, + "step": 2104, + "text_loss": 0.3196398913860321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.000939562307173196, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 3396636.0, + "repeat_count": 0.0, + "routers_loss": 0.007085663266479969, + "skip_count": 0.0, + "step": 2106, + "text_loss": 0.5663776397705078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.896976812444967, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11328125, + "learning_rate": 0.0009394147098730592, + "loss": 0.02, + "macro_f1": 0.5492662787437439, + "num_tokens": 3399475.0, + "repeat_count": 0.0, + "routers_loss": 0.019473131746053696, + "skip_count": 2.0, + "step": 2108, + "text_loss": 0.7708223462104797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0009392669441883767, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 3402350.0, + "repeat_count": 0.0, + "routers_loss": 0.0028328890912234783, + "skip_count": 0.0, + "step": 2110, + "text_loss": 0.5888006091117859 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009391190101757724, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3405561.0, + "repeat_count": 0.0, + "routers_loss": 0.023098422214388847, + "skip_count": 2.0, + "step": 2112, + "text_loss": 0.09865197539329529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.000938970907891935, + "loss": 0.0247, + "macro_f1": 0.3333333432674408, + "num_tokens": 3408513.0, + "repeat_count": 0.0, + "routers_loss": 0.002896632067859173, + "skip_count": 0.0, + "step": 2114, + "text_loss": 0.6613234281539917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009388226373936179, + "loss": 0.0211, + "macro_f1": 0.3333333432674408, + "num_tokens": 3411195.0, + "repeat_count": 0.0, + "routers_loss": 0.015814457088708878, + "skip_count": 0.0, + "step": 2116, + "text_loss": 0.17363053560256958 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.94393894922219, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009386741987376381, + "loss": 0.015, + "macro_f1": 0.6603773832321167, + "num_tokens": 3414875.0, + "repeat_count": 1.0, + "routers_loss": 0.02676783688366413, + "skip_count": 0.0, + "step": 2118, + "text_loss": 0.674056887626648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009385255919808778, + "loss": 0.0203, + "macro_f1": 0.6666666865348816, + "num_tokens": 3418410.0, + "repeat_count": 0.0, + "routers_loss": 0.01022857241332531, + "skip_count": 1.0, + "step": 2120, + "text_loss": 0.235092431306839 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.962723803933079, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009383768171802836, + "loss": 0.0244, + "macro_f1": 0.5492662787437439, + "num_tokens": 3421289.0, + "repeat_count": 0.0, + "routers_loss": 0.013572212308645248, + "skip_count": 2.0, + "step": 2122, + "text_loss": 0.5992844104766846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009382278743928659, + "loss": 0.0201, + "macro_f1": 0.6666666865348816, + "num_tokens": 3424781.0, + "repeat_count": 0.0, + "routers_loss": 0.0051873656921088696, + "skip_count": 2.0, + "step": 2124, + "text_loss": 0.29915499687194824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 9.981508658643968, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.07421875, + "learning_rate": 0.0009380787636757001, + "loss": 0.0155, + "macro_f1": 0.6122449040412903, + "num_tokens": 3427942.0, + "repeat_count": 0.0, + "routers_loss": 0.030079292133450508, + "skip_count": 4.0, + "step": 2126, + "text_loss": 0.24181491136550903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009379294850859256, + "loss": 0.0141, + "macro_f1": 0.3333333432674408, + "num_tokens": 3431314.0, + "repeat_count": 0.0, + "routers_loss": 0.002675612922757864, + "skip_count": 0.0, + "step": 2128, + "text_loss": 0.4669873118400574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009377800386807465, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 3435020.0, + "repeat_count": 0.0, + "routers_loss": 0.009334275498986244, + "skip_count": 0.0, + "step": 2130, + "text_loss": 0.6478219628334045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.134765625, + "learning_rate": 0.0009376304245174306, + "loss": 0.0137, + "macro_f1": 0.6000000238418579, + "num_tokens": 3438276.0, + "repeat_count": 1.0, + "routers_loss": 0.038227908313274384, + "skip_count": 2.0, + "step": 2132, + "text_loss": 0.4401201903820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0009374806426533104, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 3440938.0, + "repeat_count": 0.0, + "routers_loss": 0.006901399698108435, + "skip_count": 0.0, + "step": 2134, + "text_loss": 0.5948942303657532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009373306931457827, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3444028.0, + "repeat_count": 0.0, + "routers_loss": 0.0037061909679323435, + "skip_count": 0.0, + "step": 2136, + "text_loss": 0.5349751114845276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0009371805760523086, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 3448331.0, + "repeat_count": 0.0, + "routers_loss": 0.0025877030566334724, + "skip_count": 0.0, + "step": 2138, + "text_loss": 0.4591051936149597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.046962136777223, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009370302914304129, + "loss": 0.0144, + "macro_f1": 0.5934640765190125, + "num_tokens": 3451434.0, + "repeat_count": 0.0, + "routers_loss": 0.018742674961686134, + "skip_count": 3.0, + "step": 2140, + "text_loss": 0.23470863699913025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.056354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009368798393376851, + "loss": 0.0122, + "macro_f1": 0.3272727429866791, + "num_tokens": 3454375.0, + "repeat_count": 0.0, + "routers_loss": 0.02382594160735607, + "skip_count": 1.0, + "step": 2142, + "text_loss": 0.6077954769134521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.065746991488112, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05517578125, + "learning_rate": 0.0009367292198317787, + "loss": 0.0164, + "macro_f1": 0.5492662787437439, + "num_tokens": 3457591.0, + "repeat_count": 0.0, + "routers_loss": 0.03331060707569122, + "skip_count": 2.0, + "step": 2144, + "text_loss": 0.3691073954105377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009365784329704115, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 3460895.0, + "repeat_count": 0.0, + "routers_loss": 0.0016955457394942641, + "skip_count": 0.0, + "step": 2146, + "text_loss": 0.3947436511516571 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009364274788113651, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 3464101.0, + "repeat_count": 1.0, + "routers_loss": 0.006169239990413189, + "skip_count": 0.0, + "step": 2148, + "text_loss": 0.3348555266857147 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 10.093924273554446, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009362763574124858, + "loss": 0.019, + "macro_f1": 0.9265305995941162, + "num_tokens": 3467417.0, + "repeat_count": 3.0, + "routers_loss": 0.024033790454268456, + "skip_count": 1.0, + "step": 2150, + "text_loss": 0.496633380651474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0009361250688316829, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 3470917.0, + "repeat_count": 0.0, + "routers_loss": 0.0024986129719763994, + "skip_count": 0.0, + "step": 2152, + "text_loss": 0.6857671737670898 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0009359736131269312, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3473624.0, + "repeat_count": 0.0, + "routers_loss": 0.008183322846889496, + "skip_count": 1.0, + "step": 2154, + "text_loss": 0.13883116841316223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009358219903562684, + "loss": 0.0106, + "macro_f1": 0.6666666865348816, + "num_tokens": 3476472.0, + "repeat_count": 0.0, + "routers_loss": 0.011198793537914753, + "skip_count": 3.0, + "step": 2156, + "text_loss": 0.24243666231632233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009356702005777969, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 3479688.0, + "repeat_count": 0.0, + "routers_loss": 0.002520184963941574, + "skip_count": 0.0, + "step": 2158, + "text_loss": 0.6407818794250488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009355182438496825, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 3482598.0, + "repeat_count": 0.0, + "routers_loss": 0.0011065017897635698, + "skip_count": 0.0, + "step": 2160, + "text_loss": 0.7214245796203613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009353661202301557, + "loss": 0.0144, + "macro_f1": 0.3333333432674408, + "num_tokens": 3486271.0, + "repeat_count": 0.0, + "routers_loss": 0.0017824085662141442, + "skip_count": 0.0, + "step": 2162, + "text_loss": 0.5140969157218933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0009352138297775101, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 3489206.0, + "repeat_count": 0.0, + "routers_loss": 0.001542879967018962, + "skip_count": 0.0, + "step": 2164, + "text_loss": 0.7956416606903076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.000935061372550104, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 3492003.0, + "repeat_count": 0.0, + "routers_loss": 0.01420794241130352, + "skip_count": 3.0, + "step": 2166, + "text_loss": 0.27489882707595825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009349087486063594, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3494784.0, + "repeat_count": 0.0, + "routers_loss": 0.003614309709519148, + "skip_count": 1.0, + "step": 2168, + "text_loss": 0.2962227761745453 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.187848547108894, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009347559580047618, + "loss": 0.0175, + "macro_f1": 0.8814815282821655, + "num_tokens": 3497886.0, + "repeat_count": 2.0, + "routers_loss": 0.02122853323817253, + "skip_count": 4.0, + "step": 2170, + "text_loss": 0.5919580459594727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06396484375, + "learning_rate": 0.000934603000803861, + "loss": 0.0135, + "macro_f1": 0.5492662787437439, + "num_tokens": 3500939.0, + "repeat_count": 0.0, + "routers_loss": 0.02042219042778015, + "skip_count": 1.0, + "step": 2172, + "text_loss": 0.28722381591796875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009344498770622704, + "loss": 0.013, + "macro_f1": 0.3333333432674408, + "num_tokens": 3504852.0, + "repeat_count": 0.0, + "routers_loss": 0.004345106892287731, + "skip_count": 0.0, + "step": 2174, + "text_loss": 0.603236734867096 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009342965868386673, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 3508320.0, + "repeat_count": 0.0, + "routers_loss": 0.00368050136603415, + "skip_count": 0.0, + "step": 2176, + "text_loss": 0.6020491719245911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.000934143130191793, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 3511278.0, + "repeat_count": 0.0, + "routers_loss": 0.013425769284367561, + "skip_count": 0.0, + "step": 2178, + "text_loss": 0.5954724550247192 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060546875, + "learning_rate": 0.000933989507180452, + "loss": 0.0149, + "macro_f1": 0.3333333432674408, + "num_tokens": 3514361.0, + "repeat_count": 0.0, + "routers_loss": 0.002896249992772937, + "skip_count": 0.0, + "step": 2180, + "text_loss": 0.39175131916999817 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.244203111241562, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0009338357178635135, + "loss": 0.0147, + "macro_f1": 0.6603773832321167, + "num_tokens": 3517962.0, + "repeat_count": 1.0, + "routers_loss": 0.011538350023329258, + "skip_count": 1.0, + "step": 2182, + "text_loss": 0.4482830762863159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.253595538597006, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009336817622999093, + "loss": 0.011, + "macro_f1": 0.3272727429866791, + "num_tokens": 3521299.0, + "repeat_count": 1.0, + "routers_loss": 0.022787930443882942, + "skip_count": 0.0, + "step": 2184, + "text_loss": 0.35177817940711975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009335276405486357, + "loss": 0.0139, + "macro_f1": 0.3272727429866791, + "num_tokens": 3524611.0, + "repeat_count": 0.0, + "routers_loss": 0.011597735807299614, + "skip_count": 1.0, + "step": 2186, + "text_loss": 0.24868851900100708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009333733526687524, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 3528012.0, + "repeat_count": 0.0, + "routers_loss": 0.014253967441618443, + "skip_count": 0.0, + "step": 2188, + "text_loss": 0.3970910310745239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.000933218898719383, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 3530908.0, + "repeat_count": 0.0, + "routers_loss": 0.001659149187617004, + "skip_count": 0.0, + "step": 2190, + "text_loss": 0.7618573307991028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009330642787597141, + "loss": 0.0159, + "macro_f1": 0.3333333432674408, + "num_tokens": 3533993.0, + "repeat_count": 0.0, + "routers_loss": 0.005574346985667944, + "skip_count": 0.0, + "step": 2192, + "text_loss": 0.16470147669315338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009329094928489969, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3537310.0, + "repeat_count": 0.0, + "routers_loss": 0.0026400673668831587, + "skip_count": 0.0, + "step": 2194, + "text_loss": 0.3400416374206543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009327545410465452, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3540045.0, + "repeat_count": 0.0, + "routers_loss": 0.008448398672044277, + "skip_count": 3.0, + "step": 2196, + "text_loss": 0.3110542297363281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.31934253008512, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009325994234117372, + "loss": 0.0122, + "macro_f1": 0.32098764181137085, + "num_tokens": 3544097.0, + "repeat_count": 0.0, + "routers_loss": 0.037553198635578156, + "skip_count": 2.0, + "step": 2198, + "text_loss": 0.36126700043678284 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.09716796875, + "learning_rate": 0.000932444140004014, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3547054.0, + "repeat_count": 1.0, + "routers_loss": 0.006464479025453329, + "skip_count": 0.0, + "step": 2200, + "text_loss": 0.4947047233581543 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009322886908828805, + "loss": 0.0138, + "macro_f1": 0.6666666865348816, + "num_tokens": 3549903.0, + "repeat_count": 1.0, + "routers_loss": 0.005384812597185373, + "skip_count": 0.0, + "step": 2202, + "text_loss": 0.5923738479614258 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009321330761079052, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 3553745.0, + "repeat_count": 0.0, + "routers_loss": 0.015346619300544262, + "skip_count": 2.0, + "step": 2204, + "text_loss": 0.1904175877571106 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.356912239506897, + "f1_execute": 0.9268292784690857, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06494140625, + "learning_rate": 0.00093197729573872, + "loss": 0.0203, + "macro_f1": 0.8422764539718628, + "num_tokens": 3557235.0, + "repeat_count": 3.0, + "routers_loss": 0.1207597479224205, + "skip_count": 6.0, + "step": 2206, + "text_loss": 0.3904837667942047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0009318213498350202, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3560795.0, + "repeat_count": 0.0, + "routers_loss": 0.003334777895361185, + "skip_count": 0.0, + "step": 2208, + "text_loss": 0.4268290102481842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0009316652384565645, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3563754.0, + "repeat_count": 0.0, + "routers_loss": 0.004230072256177664, + "skip_count": 0.0, + "step": 2210, + "text_loss": 0.40049710869789124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046875, + "learning_rate": 0.0009315089616631751, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 3567173.0, + "repeat_count": 0.0, + "routers_loss": 0.0006645230459980667, + "skip_count": 0.0, + "step": 2212, + "text_loss": 0.42568323016166687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009313525195147376, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3570831.0, + "repeat_count": 0.0, + "routers_loss": 0.0097877848893404, + "skip_count": 0.0, + "step": 2214, + "text_loss": 0.45808279514312744 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 10.40387437628412, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.076171875, + "learning_rate": 0.000931195912071201, + "loss": 0.0187, + "macro_f1": 0.7018141150474548, + "num_tokens": 3573745.0, + "repeat_count": 2.0, + "routers_loss": 0.07351134717464447, + "skip_count": 3.0, + "step": 2216, + "text_loss": 0.285696804523468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009310391393925775, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 3576785.0, + "repeat_count": 0.0, + "routers_loss": 0.0033160944003611803, + "skip_count": 0.0, + "step": 2218, + "text_loss": 0.17516443133354187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.422659230995011, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.047119140625, + "learning_rate": 0.0009308822015389424, + "loss": 0.0241, + "macro_f1": 0.5427350401878357, + "num_tokens": 3580695.0, + "repeat_count": 1.0, + "routers_loss": 0.052930232137441635, + "skip_count": 1.0, + "step": 2220, + "text_loss": 0.5918155908584595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 10.432051658350455, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.072265625, + "learning_rate": 0.0009307250985704352, + "loss": 0.0128, + "macro_f1": 0.6122449040412903, + "num_tokens": 3583729.0, + "repeat_count": 0.0, + "routers_loss": 0.025454653427004814, + "skip_count": 4.0, + "step": 2222, + "text_loss": 0.2652169466018677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0009305678305472575, + "loss": 0.0158, + "macro_f1": 0.3333333432674408, + "num_tokens": 3586775.0, + "repeat_count": 0.0, + "routers_loss": 0.011279845610260963, + "skip_count": 0.0, + "step": 2224, + "text_loss": 0.3511691987514496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.000930410397529675, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 3589676.0, + "repeat_count": 0.0, + "routers_loss": 0.002700264798477292, + "skip_count": 0.0, + "step": 2226, + "text_loss": 0.24045433104038239 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.000930252799578016, + "loss": 0.0146, + "macro_f1": 1.0, + "num_tokens": 3593242.0, + "repeat_count": 1.0, + "routers_loss": 0.00826631672680378, + "skip_count": 2.0, + "step": 2228, + "text_loss": 0.3777645528316498 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.469621367772234, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009300950367526728, + "loss": 0.0131, + "macro_f1": 0.8820862174034119, + "num_tokens": 3596807.0, + "repeat_count": 2.0, + "routers_loss": 0.036221496760845184, + "skip_count": 2.0, + "step": 2230, + "text_loss": 0.502962589263916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009299371091141001, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3600150.0, + "repeat_count": 0.0, + "routers_loss": 0.006449893582612276, + "skip_count": 0.0, + "step": 2232, + "text_loss": 0.20256924629211426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009297790167228161, + "loss": 0.012, + "macro_f1": 0.6666666865348816, + "num_tokens": 3602988.0, + "repeat_count": 0.0, + "routers_loss": 0.007872486487030983, + "skip_count": 2.0, + "step": 2234, + "text_loss": 0.42476826906204224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.497798649838568, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009296207596394022, + "loss": 0.0101, + "macro_f1": 0.32098764181137085, + "num_tokens": 3606071.0, + "repeat_count": 0.0, + "routers_loss": 0.027397040277719498, + "skip_count": 2.0, + "step": 2236, + "text_loss": 0.23432791233062744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009294623379245028, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 3609389.0, + "repeat_count": 0.0, + "routers_loss": 0.01042645052075386, + "skip_count": 0.0, + "step": 2238, + "text_loss": 0.16665785014629364 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009293037516388252, + "loss": 0.0161, + "macro_f1": 0.3333333432674408, + "num_tokens": 3612105.0, + "repeat_count": 0.0, + "routers_loss": 0.0012458425480872393, + "skip_count": 0.0, + "step": 2240, + "text_loss": 0.59421306848526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 10.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009291450008431404, + "loss": 0.0185, + "macro_f1": 1.0, + "num_tokens": 3615439.0, + "repeat_count": 1.0, + "routers_loss": 0.005781981628388166, + "skip_count": 1.0, + "step": 2242, + "text_loss": 0.510798454284668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 10.535368359260346, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009289860855982814, + "loss": 0.0166, + "macro_f1": 0.4871794879436493, + "num_tokens": 3618842.0, + "repeat_count": 0.0, + "routers_loss": 0.031195320188999176, + "skip_count": 3.0, + "step": 2244, + "text_loss": 0.7574363350868225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0009288270059651454, + "loss": 0.0133, + "macro_f1": 0.3333333432674408, + "num_tokens": 3621823.0, + "repeat_count": 0.0, + "routers_loss": 0.001746491645462811, + "skip_count": 0.0, + "step": 2246, + "text_loss": 0.5125683546066284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.554153213971237, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.220703125, + "learning_rate": 0.0009286677620046918, + "loss": 0.0159, + "macro_f1": 0.5492662787437439, + "num_tokens": 3624502.0, + "repeat_count": 0.0, + "routers_loss": 0.03792348504066467, + "skip_count": 2.0, + "step": 2248, + "text_loss": 0.7533677220344543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009285083537779429, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3627057.0, + "repeat_count": 0.0, + "routers_loss": 0.0009684451506473124, + "skip_count": 0.0, + "step": 2250, + "text_loss": 0.2219279706478119 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.572938068682125, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009283487813459845, + "loss": 0.0148, + "macro_f1": 0.5492662787437439, + "num_tokens": 3629720.0, + "repeat_count": 0.0, + "routers_loss": 0.022757573053240776, + "skip_count": 2.0, + "step": 2252, + "text_loss": 0.6903313994407654 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009281890447699652, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 3633234.0, + "repeat_count": 1.0, + "routers_loss": 0.003613058477640152, + "skip_count": 0.0, + "step": 2254, + "text_loss": 0.6278893351554871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0009280291441110961, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3636289.0, + "repeat_count": 0.0, + "routers_loss": 0.006214062683284283, + "skip_count": 0.0, + "step": 2256, + "text_loss": 0.3011114001274109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.60111535074846, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.041015625, + "learning_rate": 0.0009278690794306517, + "loss": 0.014, + "macro_f1": 0.5492662787437439, + "num_tokens": 3640251.0, + "repeat_count": 0.0, + "routers_loss": 0.052556321024894714, + "skip_count": 2.0, + "step": 2258, + "text_loss": 0.19894185662269592 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 10.610507778103903, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.08251953125, + "learning_rate": 0.0009277088507899689, + "loss": 0.0163, + "macro_f1": 0.9452888369560242, + "num_tokens": 3643527.0, + "repeat_count": 4.0, + "routers_loss": 0.0572301521897316, + "skip_count": 1.0, + "step": 2260, + "text_loss": 0.5593410134315491 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009275484582504475, + "loss": 0.0104, + "macro_f1": 0.3333333432674408, + "num_tokens": 3646959.0, + "repeat_count": 0.0, + "routers_loss": 0.008010074496269226, + "skip_count": 0.0, + "step": 2262, + "text_loss": 0.2128177285194397 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 10.629292632814794, + "f1_execute": 0.95652174949646, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.800000011920929, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009273879018735505, + "loss": 0.0138, + "macro_f1": 0.8521739840507507, + "num_tokens": 3651298.0, + "repeat_count": 3.0, + "routers_loss": 0.035729870200157166, + "skip_count": 3.0, + "step": 2264, + "text_loss": 0.2987811267375946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009272271817208031, + "loss": 0.0182, + "macro_f1": 0.3333333432674408, + "num_tokens": 3655609.0, + "repeat_count": 0.0, + "routers_loss": 0.002379779238253832, + "skip_count": 0.0, + "step": 2266, + "text_loss": 0.6024088263511658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009270662978537939, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 3658444.0, + "repeat_count": 0.0, + "routers_loss": 0.008943650871515274, + "skip_count": 0.0, + "step": 2268, + "text_loss": 0.1741207242012024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 10.657469914881126, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009269052503341736, + "loss": 0.0161, + "macro_f1": 0.6595745086669922, + "num_tokens": 3662282.0, + "repeat_count": 1.0, + "routers_loss": 0.030201267451047897, + "skip_count": 4.0, + "step": 2270, + "text_loss": 0.7300035953521729 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0009267440392236562, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 3665531.0, + "repeat_count": 0.0, + "routers_loss": 0.0026635683607310057, + "skip_count": 0.0, + "step": 2272, + "text_loss": 0.31535038352012634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009265826645840178, + "loss": 0.0151, + "macro_f1": 0.3333333432674408, + "num_tokens": 3668407.0, + "repeat_count": 0.0, + "routers_loss": 0.004258926957845688, + "skip_count": 0.0, + "step": 2274, + "text_loss": 0.7272579073905945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 10.68564719694746, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.125, + "learning_rate": 0.0009264211264770976, + "loss": 0.0154, + "macro_f1": 0.6122449040412903, + "num_tokens": 3671503.0, + "repeat_count": 0.0, + "routers_loss": 0.038987524807453156, + "skip_count": 4.0, + "step": 2276, + "text_loss": 0.7488982677459717 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009262594249647975, + "loss": 0.0164, + "macro_f1": 0.6666666865348816, + "num_tokens": 3674107.0, + "repeat_count": 0.0, + "routers_loss": 0.007211760152131319, + "skip_count": 1.0, + "step": 2278, + "text_loss": 0.1992369294166565 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 10.704432051658351, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0009260975601090815, + "loss": 0.0112, + "macro_f1": 0.9446290731430054, + "num_tokens": 3677184.0, + "repeat_count": 4.0, + "routers_loss": 0.02538592554628849, + "skip_count": 3.0, + "step": 2280, + "text_loss": 0.46402135491371155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009259355319719768, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 3680683.0, + "repeat_count": 0.0, + "routers_loss": 0.0038464947137981653, + "skip_count": 0.0, + "step": 2282, + "text_loss": 0.5804527401924133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009257733406155726, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3683928.0, + "repeat_count": 0.0, + "routers_loss": 0.004841136280447245, + "skip_count": 0.0, + "step": 2284, + "text_loss": 0.4834538400173187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009256109861020212, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 3687101.0, + "repeat_count": 0.0, + "routers_loss": 0.002191900508478284, + "skip_count": 0.0, + "step": 2286, + "text_loss": 0.8199604749679565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.742001761080129, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0927734375, + "learning_rate": 0.000925448468493537, + "loss": 0.0162, + "macro_f1": 0.5427350401878357, + "num_tokens": 3690490.0, + "repeat_count": 1.0, + "routers_loss": 0.03488675877451897, + "skip_count": 2.0, + "step": 2288, + "text_loss": 0.33263635635375977 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009252857878523971, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 3694109.0, + "repeat_count": 1.0, + "routers_loss": 0.002897309372201562, + "skip_count": 0.0, + "step": 2290, + "text_loss": 0.47494807839393616 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.000925122944240941, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3697233.0, + "repeat_count": 0.0, + "routers_loss": 0.01842675730586052, + "skip_count": 2.0, + "step": 2292, + "text_loss": 0.14693495631217957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.770179043146463, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.045654296875, + "learning_rate": 0.0009249599377215707, + "loss": 0.0146, + "macro_f1": 0.5866667032241821, + "num_tokens": 3700376.0, + "repeat_count": 1.0, + "routers_loss": 0.04169808700680733, + "skip_count": 3.0, + "step": 2294, + "text_loss": 0.38051268458366394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.779571470501908, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0009247967683567507, + "loss": 0.0112, + "macro_f1": 0.3272727429866791, + "num_tokens": 3703212.0, + "repeat_count": 0.0, + "routers_loss": 0.012183113023638725, + "skip_count": 1.0, + "step": 2296, + "text_loss": 0.23789077997207642 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 10.788963897857352, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05712890625, + "learning_rate": 0.0009246334362090077, + "loss": 0.0137, + "macro_f1": 0.8823530077934265, + "num_tokens": 3706490.0, + "repeat_count": 1.0, + "routers_loss": 0.01880069635808468, + "skip_count": 2.0, + "step": 2298, + "text_loss": 0.29067978262901306 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.798356325212797, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.000924469941340931, + "loss": 0.0173, + "macro_f1": 0.3272727429866791, + "num_tokens": 3709804.0, + "repeat_count": 1.0, + "routers_loss": 0.027359159663319588, + "skip_count": 0.0, + "step": 2300, + "text_loss": 0.67828369140625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.000924306283815172, + "loss": 0.0153, + "macro_f1": 0.3333333432674408, + "num_tokens": 3712824.0, + "repeat_count": 0.0, + "routers_loss": 0.003152279881760478, + "skip_count": 0.0, + "step": 2302, + "text_loss": 0.8333184719085693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.817141179923686, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0703125, + "learning_rate": 0.0009241424636944445, + "loss": 0.0159, + "macro_f1": 0.5492662787437439, + "num_tokens": 3715385.0, + "repeat_count": 0.0, + "routers_loss": 0.0442950464785099, + "skip_count": 2.0, + "step": 2304, + "text_loss": 0.41893699765205383 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 10.826533607279131, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009239784810415249, + "loss": 0.0137, + "macro_f1": 0.8823530077934265, + "num_tokens": 3719080.0, + "repeat_count": 1.0, + "routers_loss": 0.015729321166872978, + "skip_count": 2.0, + "step": 2306, + "text_loss": 0.13360483944416046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.835926034634575, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009238143359192514, + "loss": 0.0136, + "macro_f1": 0.5934640765190125, + "num_tokens": 3722439.0, + "repeat_count": 0.0, + "routers_loss": 0.028816604986786842, + "skip_count": 3.0, + "step": 2308, + "text_loss": 0.39594101905822754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.000923650028390525, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3725092.0, + "repeat_count": 0.0, + "routers_loss": 0.0036455015651881695, + "skip_count": 2.0, + "step": 2310, + "text_loss": 0.6169708371162415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009234855585183086, + "loss": 0.014, + "macro_f1": 0.6666666865348816, + "num_tokens": 3728412.0, + "repeat_count": 0.0, + "routers_loss": 0.007565604057163, + "skip_count": 1.0, + "step": 2312, + "text_loss": 0.21257059276103973 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 10.86410331670091, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0517578125, + "learning_rate": 0.0009233209263656273, + "loss": 0.0184, + "macro_f1": 0.9262410998344421, + "num_tokens": 3731467.0, + "repeat_count": 2.0, + "routers_loss": 0.02510629966855049, + "skip_count": 3.0, + "step": 2314, + "text_loss": 0.21639840304851532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0009231561319955684, + "loss": 0.0154, + "macro_f1": 0.3333333432674408, + "num_tokens": 3734906.0, + "repeat_count": 0.0, + "routers_loss": 0.00872227642685175, + "skip_count": 0.0, + "step": 2316, + "text_loss": 0.35639774799346924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009229911754712815, + "loss": 0.0176, + "macro_f1": 0.3333333432674408, + "num_tokens": 3737943.0, + "repeat_count": 0.0, + "routers_loss": 0.004695790819823742, + "skip_count": 0.0, + "step": 2318, + "text_loss": 0.5269573330879211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.892280598767243, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0009228260568559781, + "loss": 0.0115, + "macro_f1": 0.3272727429866791, + "num_tokens": 3741833.0, + "repeat_count": 1.0, + "routers_loss": 0.0217357836663723, + "skip_count": 0.0, + "step": 2320, + "text_loss": 0.5110208988189697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.901673026122689, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.0009226607762129322, + "loss": 0.0201, + "macro_f1": 0.32098764181137085, + "num_tokens": 3744642.0, + "repeat_count": 1.0, + "routers_loss": 0.05595960095524788, + "skip_count": 1.0, + "step": 2322, + "text_loss": 0.6291998624801636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0009224953336054796, + "loss": 0.0161, + "macro_f1": 0.3333333432674408, + "num_tokens": 3748127.0, + "repeat_count": 0.0, + "routers_loss": 0.0071634589694440365, + "skip_count": 0.0, + "step": 2324, + "text_loss": 0.7404762506484985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.000922329729097018, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3751373.0, + "repeat_count": 0.0, + "routers_loss": 0.0011676300782710314, + "skip_count": 0.0, + "step": 2326, + "text_loss": 0.2915459871292114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009221639627510075, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3754518.0, + "repeat_count": 0.0, + "routers_loss": 0.01039792038500309, + "skip_count": 0.0, + "step": 2328, + "text_loss": 0.22066321969032288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009219980346309702, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 3757621.0, + "repeat_count": 0.0, + "routers_loss": 0.0032070958986878395, + "skip_count": 0.0, + "step": 2330, + "text_loss": 0.5558560490608215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.076171875, + "learning_rate": 0.0009218319448004899, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 3760885.0, + "repeat_count": 0.0, + "routers_loss": 0.007085457909852266, + "skip_count": 0.0, + "step": 2332, + "text_loss": 0.4348253607749939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009216656933232129, + "loss": 0.016, + "macro_f1": 0.6666666865348816, + "num_tokens": 3764462.0, + "repeat_count": 0.0, + "routers_loss": 0.005504854489117861, + "skip_count": 1.0, + "step": 2334, + "text_loss": 0.35828644037246704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0009214992802628463, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3767159.0, + "repeat_count": 0.0, + "routers_loss": 0.0013970810687169433, + "skip_count": 0.0, + "step": 2336, + "text_loss": 0.2956557869911194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009213327056831607, + "loss": 0.0181, + "macro_f1": 0.3272727429866791, + "num_tokens": 3770408.0, + "repeat_count": 0.0, + "routers_loss": 0.0427570566534996, + "skip_count": 1.0, + "step": 2338, + "text_loss": 0.14883014559745789 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.986204872321691, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0009211659696479875, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 3773474.0, + "repeat_count": 0.0, + "routers_loss": 0.0011273405980318785, + "skip_count": 0.0, + "step": 2340, + "text_loss": 0.26011669635772705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.00092099907222122, + "loss": 0.0148, + "macro_f1": 0.3333333432674408, + "num_tokens": 3776909.0, + "repeat_count": 0.0, + "routers_loss": 0.0016178421210497618, + "skip_count": 0.0, + "step": 2342, + "text_loss": 0.49078530073165894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.000920832013466814, + "loss": 0.0129, + "macro_f1": 0.3333333432674408, + "num_tokens": 3780741.0, + "repeat_count": 0.0, + "routers_loss": 0.005510095041245222, + "skip_count": 0.0, + "step": 2344, + "text_loss": 0.4870249927043915 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0009206647934487866, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3784673.0, + "repeat_count": 1.0, + "routers_loss": 0.0047357892617583275, + "skip_count": 0.0, + "step": 2346, + "text_loss": 0.3251725733280182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0009204974122312167, + "loss": 0.0142, + "macro_f1": 0.6666666865348816, + "num_tokens": 3787503.0, + "repeat_count": 0.0, + "routers_loss": 0.00795028731226921, + "skip_count": 1.0, + "step": 2348, + "text_loss": 0.18282145261764526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0009203298698782452, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 3790528.0, + "repeat_count": 1.0, + "routers_loss": 0.0009506374481134117, + "skip_count": 0.0, + "step": 2350, + "text_loss": 0.4093080461025238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009201621664540747, + "loss": 0.0155, + "macro_f1": 0.6666666865348816, + "num_tokens": 3794134.0, + "repeat_count": 1.0, + "routers_loss": 0.005159572698175907, + "skip_count": 0.0, + "step": 2352, + "text_loss": 0.5451981425285339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009199943020229694, + "loss": 0.0148, + "macro_f1": 0.3333333432674408, + "num_tokens": 3797414.0, + "repeat_count": 0.0, + "routers_loss": 0.002356168581172824, + "skip_count": 0.0, + "step": 2354, + "text_loss": 0.3070453405380249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009198262766492554, + "loss": 0.0141, + "macro_f1": 0.6666666865348816, + "num_tokens": 3800094.0, + "repeat_count": 0.0, + "routers_loss": 0.0051761893555521965, + "skip_count": 1.0, + "step": 2356, + "text_loss": 0.5880904197692871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.00091965809039732, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3803280.0, + "repeat_count": 0.0, + "routers_loss": 0.0025952060241252184, + "skip_count": 0.0, + "step": 2358, + "text_loss": 0.5210731625556946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009194897433316127, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 3805866.0, + "repeat_count": 0.0, + "routers_loss": 0.0042560105212032795, + "skip_count": 2.0, + "step": 2360, + "text_loss": 0.6472984552383423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009193212355166446, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3808952.0, + "repeat_count": 0.0, + "routers_loss": 0.0026232977397739887, + "skip_count": 0.0, + "step": 2362, + "text_loss": 0.450063556432724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009191525670169881, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3812080.0, + "repeat_count": 0.0, + "routers_loss": 0.0034355956595391035, + "skip_count": 0.0, + "step": 2364, + "text_loss": 0.49727216362953186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.000918983737897277, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 3815282.0, + "repeat_count": 0.0, + "routers_loss": 0.0055653867311775684, + "skip_count": 1.0, + "step": 2366, + "text_loss": 0.6336377859115601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0009188147482222071, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 3818106.0, + "repeat_count": 2.0, + "routers_loss": 0.011016021482646465, + "skip_count": 2.0, + "step": 2368, + "text_loss": 0.22513329982757568 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009186455980565358, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 3821228.0, + "repeat_count": 1.0, + "routers_loss": 0.014039464294910431, + "skip_count": 0.0, + "step": 2370, + "text_loss": 0.21331638097763062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009184762874650816, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 3825048.0, + "repeat_count": 0.0, + "routers_loss": 0.001088051125407219, + "skip_count": 0.0, + "step": 2372, + "text_loss": 0.6031543612480164 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009183068165127245, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 3828781.0, + "repeat_count": 0.0, + "routers_loss": 0.006263940595090389, + "skip_count": 1.0, + "step": 2374, + "text_loss": 0.6249601244926453 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009181371852644062, + "loss": 0.0133, + "macro_f1": 0.6666666865348816, + "num_tokens": 3832507.0, + "repeat_count": 1.0, + "routers_loss": 0.001987969037145376, + "skip_count": 0.0, + "step": 2376, + "text_loss": 0.37972065806388855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009179673937851299, + "loss": 0.0158, + "macro_f1": 0.6666666865348816, + "num_tokens": 3835644.0, + "repeat_count": 0.0, + "routers_loss": 0.007635094691067934, + "skip_count": 1.0, + "step": 2378, + "text_loss": 0.46319663524627686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009177974421399598, + "loss": 0.0137, + "macro_f1": 0.6666666865348816, + "num_tokens": 3838700.0, + "repeat_count": 0.0, + "routers_loss": 0.01617279462516308, + "skip_count": 2.0, + "step": 2380, + "text_loss": 0.32141056656837463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009176273303940217, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 3841953.0, + "repeat_count": 0.0, + "routers_loss": 0.0022273799404501915, + "skip_count": 2.0, + "step": 2382, + "text_loss": 0.5908139944076538 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.192544760786616, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009174570586125026, + "loss": 0.0122, + "macro_f1": 0.32098767161369324, + "num_tokens": 3845763.0, + "repeat_count": 1.0, + "routers_loss": 0.030915161594748497, + "skip_count": 0.0, + "step": 2384, + "text_loss": 0.41400137543678284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0009172866268606513, + "loss": 0.0122, + "macro_f1": 0.6666666865348816, + "num_tokens": 3848984.0, + "repeat_count": 0.0, + "routers_loss": 0.010480951517820358, + "skip_count": 2.0, + "step": 2386, + "text_loss": 0.2560874819755554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009171160352037775, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3852118.0, + "repeat_count": 0.0, + "routers_loss": 0.00809961836785078, + "skip_count": 1.0, + "step": 2388, + "text_loss": 0.28236693143844604 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009169452837072521, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 3855314.0, + "repeat_count": 1.0, + "routers_loss": 0.005569872446358204, + "skip_count": 1.0, + "step": 2390, + "text_loss": 0.4578137695789337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009167743724365073, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 3858301.0, + "repeat_count": 0.0, + "routers_loss": 0.0038610948249697685, + "skip_count": 1.0, + "step": 2392, + "text_loss": 0.14082716405391693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009166033014570368, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3861296.0, + "repeat_count": 0.0, + "routers_loss": 0.0017607157351449132, + "skip_count": 0.0, + "step": 2394, + "text_loss": 0.384442001581192 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 11.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009164320708343954, + "loss": 0.0131, + "macro_f1": 0.6666666865348816, + "num_tokens": 3863985.0, + "repeat_count": 2.0, + "routers_loss": 0.009627950377762318, + "skip_count": 0.0, + "step": 2396, + "text_loss": 0.6969521045684814 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009162606806341989, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 3866636.0, + "repeat_count": 0.0, + "routers_loss": 0.006915586534887552, + "skip_count": 0.0, + "step": 2398, + "text_loss": 0.48069697618484497 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0009160891309221242, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 3870867.0, + "repeat_count": 1.0, + "routers_loss": 0.0013031222624704242, + "skip_count": 0.0, + "step": 2400, + "text_loss": 0.3882075846195221 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.277076606985618, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009159174217639096, + "loss": 0.0112, + "macro_f1": 0.5427350401878357, + "num_tokens": 3873663.0, + "repeat_count": 2.0, + "routers_loss": 0.06621067970991135, + "skip_count": 1.0, + "step": 2402, + "text_loss": 0.5740041136741638 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0009157455532253547, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 3876788.0, + "repeat_count": 1.0, + "routers_loss": 0.005957918707281351, + "skip_count": 0.0, + "step": 2404, + "text_loss": 0.26025933027267456 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 11.295861461696507, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009155735253723191, + "loss": 0.0126, + "macro_f1": 0.9452888369560242, + "num_tokens": 3879942.0, + "repeat_count": 1.0, + "routers_loss": 0.039429809898138046, + "skip_count": 4.0, + "step": 2406, + "text_loss": 1.1349908113479614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009154013382707251, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 3882682.0, + "repeat_count": 0.0, + "routers_loss": 0.0012570557883009315, + "skip_count": 0.0, + "step": 2408, + "text_loss": 0.5611135363578796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0009152289919865543, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3886425.0, + "repeat_count": 0.0, + "routers_loss": 0.0017455556662753224, + "skip_count": 0.0, + "step": 2410, + "text_loss": 0.7523751854896545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0009150564865858506, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3889273.0, + "repeat_count": 0.0, + "routers_loss": 0.011178011074662209, + "skip_count": 1.0, + "step": 2412, + "text_loss": 0.26942551136016846 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 11.333431171118287, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009148838221347182, + "loss": 0.0107, + "macro_f1": 0.5934640765190125, + "num_tokens": 3892199.0, + "repeat_count": 3.0, + "routers_loss": 0.019628092646598816, + "skip_count": 0.0, + "step": 2414, + "text_loss": 0.5492315888404846 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0009147109986993225, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 3895362.0, + "repeat_count": 1.0, + "routers_loss": 0.012255983427166939, + "skip_count": 0.0, + "step": 2416, + "text_loss": 0.23798216879367828 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009145380163458899, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 3898476.0, + "repeat_count": 0.0, + "routers_loss": 0.007018954027444124, + "skip_count": 0.0, + "step": 2418, + "text_loss": 0.1923145055770874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0009143648751407074, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 3901817.0, + "repeat_count": 0.0, + "routers_loss": 0.0008574824314564466, + "skip_count": 0.0, + "step": 2420, + "text_loss": 0.4001806974411011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.371000880540064, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11328125, + "learning_rate": 0.0009141915751501231, + "loss": 0.0102, + "macro_f1": 0.5492662787437439, + "num_tokens": 3905461.0, + "repeat_count": 0.0, + "routers_loss": 0.01572350226342678, + "skip_count": 2.0, + "step": 2422, + "text_loss": 0.19519129395484924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0009140181164405458, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3908878.0, + "repeat_count": 0.0, + "routers_loss": 0.0005503420252352953, + "skip_count": 0.0, + "step": 2424, + "text_loss": 0.6937088370323181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009138444990784454, + "loss": 0.013, + "macro_f1": 0.3333333432674408, + "num_tokens": 3912053.0, + "repeat_count": 0.0, + "routers_loss": 0.007556677330285311, + "skip_count": 0.0, + "step": 2426, + "text_loss": 0.35431069135665894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.000913670723130352, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 3915192.0, + "repeat_count": 0.0, + "routers_loss": 0.0013609991874545813, + "skip_count": 0.0, + "step": 2428, + "text_loss": 0.5171207189559937 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009134967886628573, + "loss": 0.0115, + "macro_f1": 1.0, + "num_tokens": 3917927.0, + "repeat_count": 2.0, + "routers_loss": 0.010895746760070324, + "skip_count": 2.0, + "step": 2430, + "text_loss": 0.2852934002876282 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.417963017317287, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009133226957426133, + "loss": 0.0132, + "macro_f1": 0.5492662787437439, + "num_tokens": 3921460.0, + "repeat_count": 2.0, + "routers_loss": 0.04196908697485924, + "skip_count": 0.0, + "step": 2432, + "text_loss": 0.4864770770072937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009131484444363324, + "loss": 0.0155, + "macro_f1": 0.3333333432674408, + "num_tokens": 3924662.0, + "repeat_count": 0.0, + "routers_loss": 0.004484197124838829, + "skip_count": 0.0, + "step": 2434, + "text_loss": 0.7568684220314026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0009129740348107882, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3927337.0, + "repeat_count": 0.0, + "routers_loss": 0.004351360257714987, + "skip_count": 2.0, + "step": 2436, + "text_loss": 0.5953161716461182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 11.446140299383622, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.04736328125, + "learning_rate": 0.0009127994669328151, + "loss": 0.0085, + "macro_f1": 0.6122449040412903, + "num_tokens": 3930407.0, + "repeat_count": 0.0, + "routers_loss": 0.01664198748767376, + "skip_count": 4.0, + "step": 2438, + "text_loss": 0.5320524573326111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009126247408693071, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 3933184.0, + "repeat_count": 0.0, + "routers_loss": 0.0017819046042859554, + "skip_count": 1.0, + "step": 2440, + "text_loss": 0.6051273345947266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009124498566872204, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 3936620.0, + "repeat_count": 0.0, + "routers_loss": 0.005519696045666933, + "skip_count": 0.0, + "step": 2442, + "text_loss": 0.12987950444221497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.474317581449956, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009122748144535704, + "loss": 0.0111, + "macro_f1": 0.32098764181137085, + "num_tokens": 3940010.0, + "repeat_count": 0.0, + "routers_loss": 0.04543351009488106, + "skip_count": 2.0, + "step": 2444, + "text_loss": 0.4642033576965332 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009120996142354338, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3943135.0, + "repeat_count": 0.0, + "routers_loss": 0.00550565542653203, + "skip_count": 0.0, + "step": 2446, + "text_loss": 0.5697627067565918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009119242560999477, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3946650.0, + "repeat_count": 0.0, + "routers_loss": 0.008842485956847668, + "skip_count": 0.0, + "step": 2448, + "text_loss": 0.17046524584293365 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0009117487401143095, + "loss": 0.0154, + "macro_f1": 0.6666666865348816, + "num_tokens": 3949470.0, + "repeat_count": 1.0, + "routers_loss": 0.005900127813220024, + "skip_count": 0.0, + "step": 2450, + "text_loss": 0.37260866165161133 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0009115730663457773, + "loss": 0.0137, + "macro_f1": 1.0, + "num_tokens": 3952546.0, + "repeat_count": 1.0, + "routers_loss": 0.003409258322790265, + "skip_count": 1.0, + "step": 2452, + "text_loss": 0.5308008193969727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009113972348616698, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 3955817.0, + "repeat_count": 0.0, + "routers_loss": 0.010098597034811974, + "skip_count": 1.0, + "step": 2454, + "text_loss": 0.39226648211479187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 11.530672145582624, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009112212457293658, + "loss": 0.0102, + "macro_f1": 0.3272727429866791, + "num_tokens": 3958911.0, + "repeat_count": 0.0, + "routers_loss": 0.08184818178415298, + "skip_count": 0.0, + "step": 2456, + "text_loss": 0.45411455631256104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0009110450990163047, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 3962584.0, + "repeat_count": 0.0, + "routers_loss": 0.0009352223132736981, + "skip_count": 0.0, + "step": 2458, + "text_loss": 0.47292324900627136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0009108687947899863, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 3965597.0, + "repeat_count": 1.0, + "routers_loss": 0.008150188252329826, + "skip_count": 2.0, + "step": 2460, + "text_loss": 0.33208340406417847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.558849427648958, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009106923331179707, + "loss": 0.0125, + "macro_f1": 0.5492662787437439, + "num_tokens": 3968664.0, + "repeat_count": 0.0, + "routers_loss": 0.050999004393815994, + "skip_count": 2.0, + "step": 2462, + "text_loss": 0.2459995150566101 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009105157140678782, + "loss": 0.0126, + "macro_f1": 0.6666666865348816, + "num_tokens": 3971772.0, + "repeat_count": 0.0, + "routers_loss": 0.006196586415171623, + "skip_count": 1.0, + "step": 2464, + "text_loss": 0.23956991732120514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.577634282359847, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009103389377073896, + "loss": 0.01, + "macro_f1": 0.3333333432674408, + "num_tokens": 3976224.0, + "repeat_count": 0.0, + "routers_loss": 0.008181816898286343, + "skip_count": 0.0, + "step": 2466, + "text_loss": 0.3235875070095062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.0009101620041042462, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3978876.0, + "repeat_count": 0.0, + "routers_loss": 0.0015451472718268633, + "skip_count": 0.0, + "step": 2468, + "text_loss": 0.4038759469985962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.596419137070736, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09130859375, + "learning_rate": 0.000909984913326249, + "loss": 0.0131, + "macro_f1": 0.3272727429866791, + "num_tokens": 3981992.0, + "repeat_count": 0.0, + "routers_loss": 0.021785033866763115, + "skip_count": 1.0, + "step": 2470, + "text_loss": 0.6346460580825806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009098076654412595, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 3984560.0, + "repeat_count": 0.0, + "routers_loss": 0.0011462471447885036, + "skip_count": 0.0, + "step": 2472, + "text_loss": 0.3449646532535553 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009096302605171996, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 3987548.0, + "repeat_count": 0.0, + "routers_loss": 0.0014367027906700969, + "skip_count": 0.0, + "step": 2474, + "text_loss": 0.5918350219726562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0009094526986220513, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 3990727.0, + "repeat_count": 0.0, + "routers_loss": 0.0008977655088528991, + "skip_count": 0.0, + "step": 2476, + "text_loss": 0.463350385427475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.633988846492516, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0009092749798238563, + "loss": 0.015, + "macro_f1": 0.3272727429866791, + "num_tokens": 3993757.0, + "repeat_count": 1.0, + "routers_loss": 0.016712551936507225, + "skip_count": 0.0, + "step": 2478, + "text_loss": 0.5621229410171509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.643381273847961, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.000909097104190717, + "loss": 0.0172, + "macro_f1": 0.32098764181137085, + "num_tokens": 3997259.0, + "repeat_count": 0.0, + "routers_loss": 0.04134179651737213, + "skip_count": 2.0, + "step": 2480, + "text_loss": 0.375476598739624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0009089190717907956, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4000563.0, + "repeat_count": 0.0, + "routers_loss": 0.003462378401309252, + "skip_count": 0.0, + "step": 2482, + "text_loss": 0.5553798675537109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009087408826923146, + "loss": 0.0182, + "macro_f1": 0.6666666865348816, + "num_tokens": 4004065.0, + "repeat_count": 0.0, + "routers_loss": 0.008057428523898125, + "skip_count": 2.0, + "step": 2484, + "text_loss": 0.4329465329647064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009085625369635564, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4007119.0, + "repeat_count": 0.0, + "routers_loss": 0.005759050603955984, + "skip_count": 0.0, + "step": 2486, + "text_loss": 0.501268744468689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.680950983269739, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009083840346728631, + "loss": 0.0122, + "macro_f1": 0.3272727429866791, + "num_tokens": 4010547.0, + "repeat_count": 1.0, + "routers_loss": 0.020763102918863297, + "skip_count": 0.0, + "step": 2488, + "text_loss": 0.480196475982666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0009082053758886374, + "loss": 0.0117, + "macro_f1": 0.6666666865348816, + "num_tokens": 4014600.0, + "repeat_count": 0.0, + "routers_loss": 0.005801836494356394, + "skip_count": 1.0, + "step": 2490, + "text_loss": 0.18249782919883728 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009080265606793416, + "loss": 0.0128, + "macro_f1": 1.0, + "num_tokens": 4017964.0, + "repeat_count": 1.0, + "routers_loss": 0.004226063843816519, + "skip_count": 1.0, + "step": 2492, + "text_loss": 0.6573076248168945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.000907847589113498, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 4020694.0, + "repeat_count": 0.0, + "routers_loss": 0.004281101748347282, + "skip_count": 2.0, + "step": 2494, + "text_loss": 0.3944586217403412 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.000907668461259689, + "loss": 0.0152, + "macro_f1": 0.6666666865348816, + "num_tokens": 4023757.0, + "repeat_count": 0.0, + "routers_loss": 0.008786370046436787, + "skip_count": 1.0, + "step": 2496, + "text_loss": 0.6452898979187012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009074891771865566, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 4026601.0, + "repeat_count": 0.0, + "routers_loss": 0.005209595896303654, + "skip_count": 0.0, + "step": 2498, + "text_loss": 0.9633619785308838 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 11.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0009073097369628028, + "loss": 0.013, + "macro_f1": 1.0, + "num_tokens": 4030321.0, + "repeat_count": 3.0, + "routers_loss": 0.00860709697008133, + "skip_count": 1.0, + "step": 2500, + "text_loss": 0.48566827178001404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009071301406571893, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 4033234.0, + "repeat_count": 0.0, + "routers_loss": 0.0035277456045150757, + "skip_count": 0.0, + "step": 2502, + "text_loss": 0.3771554231643677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000906950388338538, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 4036417.0, + "repeat_count": 0.0, + "routers_loss": 0.0013424850767478347, + "skip_count": 0.0, + "step": 2504, + "text_loss": 0.8962806463241577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009067704800757301, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4039564.0, + "repeat_count": 0.0, + "routers_loss": 0.0010423909407109022, + "skip_count": 0.0, + "step": 2506, + "text_loss": 0.43170279264450073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.774875256824185, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.000906590415937707, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 4043212.0, + "repeat_count": 0.0, + "routers_loss": 0.021780289709568024, + "skip_count": 1.0, + "step": 2508, + "text_loss": 0.41495826840400696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0009064101959934696, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4046687.0, + "repeat_count": 0.0, + "routers_loss": 0.007261929102241993, + "skip_count": 1.0, + "step": 2510, + "text_loss": 0.21821187436580658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0009062298203120783, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 4050735.0, + "repeat_count": 0.0, + "routers_loss": 0.007447180338203907, + "skip_count": 2.0, + "step": 2512, + "text_loss": 0.1818767935037613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.803052538890519, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0009060492889626535, + "loss": 0.0142, + "macro_f1": 0.3272727429866791, + "num_tokens": 4054426.0, + "repeat_count": 1.0, + "routers_loss": 0.0718490406870842, + "skip_count": 0.0, + "step": 2514, + "text_loss": 0.22798970341682434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009058686020143753, + "loss": 0.0183, + "macro_f1": 0.3333333432674408, + "num_tokens": 4057615.0, + "repeat_count": 0.0, + "routers_loss": 0.0052676633931696415, + "skip_count": 0.0, + "step": 2516, + "text_loss": 0.1712338626384735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0009056877595364832, + "loss": 0.0137, + "macro_f1": 0.3333333432674408, + "num_tokens": 4060338.0, + "repeat_count": 0.0, + "routers_loss": 0.0018052728846669197, + "skip_count": 0.0, + "step": 2518, + "text_loss": 0.6811438798904419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009055067615982761, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4062887.0, + "repeat_count": 0.0, + "routers_loss": 0.0009029926732182503, + "skip_count": 0.0, + "step": 2520, + "text_loss": 0.5480356812477112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009053256082691133, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 4065357.0, + "repeat_count": 0.0, + "routers_loss": 0.0027515271212905645, + "skip_count": 0.0, + "step": 2522, + "text_loss": 0.5234101414680481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009051442996184127, + "loss": 0.0174, + "macro_f1": 0.3333333432674408, + "num_tokens": 4068111.0, + "repeat_count": 0.0, + "routers_loss": 0.002199822571128607, + "skip_count": 0.0, + "step": 2524, + "text_loss": 0.2418575882911682 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009049628357156521, + "loss": 0.0143, + "macro_f1": 0.6666666865348816, + "num_tokens": 4071284.0, + "repeat_count": 0.0, + "routers_loss": 0.006303096655756235, + "skip_count": 2.0, + "step": 2526, + "text_loss": 0.7948065996170044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.868799530378633, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000904781216630369, + "loss": 0.0068, + "macro_f1": 0.6601307392120361, + "num_tokens": 4074750.0, + "repeat_count": 1.0, + "routers_loss": 0.01791904680430889, + "skip_count": 2.0, + "step": 2528, + "text_loss": 0.809726357460022 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009045994424321602, + "loss": 0.0102, + "macro_f1": 1.0, + "num_tokens": 4078617.0, + "repeat_count": 2.0, + "routers_loss": 0.016553178429603577, + "skip_count": 2.0, + "step": 2530, + "text_loss": 0.8755000829696655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0009044175131906817, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 4080936.0, + "repeat_count": 0.0, + "routers_loss": 0.00884837657213211, + "skip_count": 0.0, + "step": 2532, + "text_loss": 0.795871913433075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.896976812444967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009042354289756491, + "loss": 0.0122, + "macro_f1": 0.3333333432674408, + "num_tokens": 4084459.0, + "repeat_count": 0.0, + "routers_loss": 0.0024387789890170097, + "skip_count": 0.0, + "step": 2534, + "text_loss": 0.18875400722026825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009040531898568379, + "loss": 0.0171, + "macro_f1": 0.3333333432674408, + "num_tokens": 4088464.0, + "repeat_count": 0.0, + "routers_loss": 0.00491489190608263, + "skip_count": 0.0, + "step": 2536, + "text_loss": 0.334369033575058 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.091796875, + "learning_rate": 0.000903870795904082, + "loss": 0.0145, + "macro_f1": 0.6666666865348816, + "num_tokens": 4091659.0, + "repeat_count": 0.0, + "routers_loss": 0.004592662677168846, + "skip_count": 2.0, + "step": 2538, + "text_loss": 0.21298295259475708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.925154094511301, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0458984375, + "learning_rate": 0.000903688247187275, + "loss": 0.0137, + "macro_f1": 0.5492662787437439, + "num_tokens": 4095496.0, + "repeat_count": 0.0, + "routers_loss": 0.011647242121398449, + "skip_count": 2.0, + "step": 2540, + "text_loss": 0.2985081672668457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009035055437763704, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4098663.0, + "repeat_count": 0.0, + "routers_loss": 0.0021238960325717926, + "skip_count": 0.0, + "step": 2542, + "text_loss": 0.35359489917755127 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0009033226857413803, + "loss": 0.0163, + "macro_f1": 0.6666666865348816, + "num_tokens": 4101588.0, + "repeat_count": 1.0, + "routers_loss": 0.0024701557122170925, + "skip_count": 0.0, + "step": 2544, + "text_loss": 1.1577601432800293 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.000903139673152376, + "loss": 0.012, + "macro_f1": 0.3333333432674408, + "num_tokens": 4104643.0, + "repeat_count": 0.0, + "routers_loss": 0.002499542199075222, + "skip_count": 0.0, + "step": 2546, + "text_loss": 1.0173401832580566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0009029565060794885, + "loss": 0.0165, + "macro_f1": 0.3333333432674408, + "num_tokens": 4109247.0, + "repeat_count": 0.0, + "routers_loss": 0.0034200598020106554, + "skip_count": 0.0, + "step": 2548, + "text_loss": 0.5690504312515259 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.972116231288524, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009027731845929079, + "loss": 0.0155, + "macro_f1": 0.8823530077934265, + "num_tokens": 4112597.0, + "repeat_count": 1.0, + "routers_loss": 0.015981333330273628, + "skip_count": 1.0, + "step": 2550, + "text_loss": 0.294549822807312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.981508658643968, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06103515625, + "learning_rate": 0.0009025897087628829, + "loss": 0.0064, + "macro_f1": 0.5492662787437439, + "num_tokens": 4115844.0, + "repeat_count": 0.0, + "routers_loss": 0.02606951631605625, + "skip_count": 2.0, + "step": 2552, + "text_loss": 0.22692419588565826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009024060786597222, + "loss": 0.0202, + "macro_f1": 0.3333333432674408, + "num_tokens": 4118634.0, + "repeat_count": 0.0, + "routers_loss": 0.001026194542646408, + "skip_count": 0.0, + "step": 2554, + "text_loss": 0.6807059645652771 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.000902222294353793, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4122024.0, + "repeat_count": 0.0, + "routers_loss": 0.001974924933165312, + "skip_count": 0.0, + "step": 2556, + "text_loss": 0.7373668551445007 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009020383559155219, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 4124803.0, + "repeat_count": 1.0, + "routers_loss": 0.004662613850086927, + "skip_count": 2.0, + "step": 2558, + "text_loss": 0.21808166801929474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0009018542634153943, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 4127680.0, + "repeat_count": 0.0, + "routers_loss": 0.006881687790155411, + "skip_count": 0.0, + "step": 2560, + "text_loss": 0.25192978978157043 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009016700169239551, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 4130431.0, + "repeat_count": 1.0, + "routers_loss": 0.005977808032184839, + "skip_count": 1.0, + "step": 2562, + "text_loss": 0.4700816869735718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009014856165118075, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 4133535.0, + "repeat_count": 0.0, + "routers_loss": 0.007005698047578335, + "skip_count": 1.0, + "step": 2564, + "text_loss": 0.6558199524879456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0009013010622496144, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 4136534.0, + "repeat_count": 0.0, + "routers_loss": 0.007262171246111393, + "skip_count": 0.0, + "step": 2566, + "text_loss": 0.2565421462059021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 12.056354564132668, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009011163542080971, + "loss": 0.0088, + "macro_f1": 0.5934640765190125, + "num_tokens": 4139762.0, + "repeat_count": 0.0, + "routers_loss": 0.05431923270225525, + "skip_count": 3.0, + "step": 2568, + "text_loss": 0.19896510243415833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0009009314924580363, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4143398.0, + "repeat_count": 0.0, + "routers_loss": 0.003667369019240141, + "skip_count": 0.0, + "step": 2570, + "text_loss": 0.6581419110298157 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0009007464770702712, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4146248.0, + "repeat_count": 0.0, + "routers_loss": 0.00132099783513695, + "skip_count": 0.0, + "step": 2572, + "text_loss": 0.5316711068153381 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0009005613081157002, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 4149455.0, + "repeat_count": 0.0, + "routers_loss": 0.0020061524119228125, + "skip_count": 0.0, + "step": 2574, + "text_loss": 0.5400773882865906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0009003759856652802, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 4152774.0, + "repeat_count": 0.0, + "routers_loss": 0.002621434163302183, + "skip_count": 1.0, + "step": 2576, + "text_loss": 0.3672606945037842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009001905097900273, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 4155835.0, + "repeat_count": 0.0, + "routers_loss": 0.005290219560265541, + "skip_count": 0.0, + "step": 2578, + "text_loss": 0.8159038424491882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0009000048805610161, + "loss": 0.0119, + "macro_f1": 0.3333333432674408, + "num_tokens": 4158874.0, + "repeat_count": 0.0, + "routers_loss": 0.0013576085912063718, + "skip_count": 0.0, + "step": 2580, + "text_loss": 0.5518951416015625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.00089981909804938, + "loss": 0.0143, + "macro_f1": 0.3333333432674408, + "num_tokens": 4162076.0, + "repeat_count": 0.0, + "routers_loss": 0.0021483441814780235, + "skip_count": 0.0, + "step": 2582, + "text_loss": 0.43552228808403015 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 12.131493982976226, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.068359375, + "learning_rate": 0.0008996331623263114, + "loss": 0.0117, + "macro_f1": 0.7795917987823486, + "num_tokens": 4165041.0, + "repeat_count": 1.0, + "routers_loss": 0.0544300302863121, + "skip_count": 4.0, + "step": 2584, + "text_loss": 0.24812501668930054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0008994470734630611, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 4168290.0, + "repeat_count": 0.0, + "routers_loss": 0.0017150711501017213, + "skip_count": 0.0, + "step": 2586, + "text_loss": 0.6392097473144531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0008992608315309388, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4171310.0, + "repeat_count": 0.0, + "routers_loss": 0.0046473173424601555, + "skip_count": 2.0, + "step": 2588, + "text_loss": 0.6534156799316406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.15967126504256, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0008990744366013125, + "loss": 0.0105, + "macro_f1": 0.3144654333591461, + "num_tokens": 4174042.0, + "repeat_count": 2.0, + "routers_loss": 0.060913100838661194, + "skip_count": 1.0, + "step": 2590, + "text_loss": 0.5365690588951111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 12.169063692398003, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008988878887456093, + "loss": 0.0118, + "macro_f1": 0.6051587462425232, + "num_tokens": 4177666.0, + "repeat_count": 1.0, + "routers_loss": 0.06268956512212753, + "skip_count": 4.0, + "step": 2592, + "text_loss": 0.226226806640625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.178456119753449, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008987011880353149, + "loss": 0.0089, + "macro_f1": 0.32098764181137085, + "num_tokens": 4180490.0, + "repeat_count": 0.0, + "routers_loss": 0.030141465365886688, + "skip_count": 2.0, + "step": 2594, + "text_loss": 0.2581401765346527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.187848547108894, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008985143345419729, + "loss": 0.0082, + "macro_f1": 0.5492662787437439, + "num_tokens": 4183300.0, + "repeat_count": 0.0, + "routers_loss": 0.018745863810181618, + "skip_count": 2.0, + "step": 2596, + "text_loss": 0.7778542637825012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.064453125, + "learning_rate": 0.0008983273283371862, + "loss": 0.0096, + "macro_f1": 0.5492662787437439, + "num_tokens": 4186535.0, + "repeat_count": 0.0, + "routers_loss": 0.026792079210281372, + "skip_count": 2.0, + "step": 2598, + "text_loss": 0.34700271487236023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008981401694926159, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4189082.0, + "repeat_count": 0.0, + "routers_loss": 0.001914160675369203, + "skip_count": 0.0, + "step": 2600, + "text_loss": 0.6879339218139648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0008979528580799815, + "loss": 0.0136, + "macro_f1": 0.6666666865348816, + "num_tokens": 4192330.0, + "repeat_count": 0.0, + "routers_loss": 0.007978348061442375, + "skip_count": 2.0, + "step": 2602, + "text_loss": 0.3524550497531891 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 12.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008977653941710613, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 4196117.0, + "repeat_count": 2.0, + "routers_loss": 0.0035376469604671, + "skip_count": 0.0, + "step": 2604, + "text_loss": 0.42356348037719727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.0008975777778376916, + "loss": 0.0156, + "macro_f1": 0.6666666865348816, + "num_tokens": 4200423.0, + "repeat_count": 0.0, + "routers_loss": 0.008262477815151215, + "skip_count": 1.0, + "step": 2606, + "text_loss": 0.5272893905639648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.244203111241562, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0008973900091517675, + "loss": 0.0114, + "macro_f1": 0.3272727429866791, + "num_tokens": 4203257.0, + "repeat_count": 0.0, + "routers_loss": 0.022957922890782356, + "skip_count": 1.0, + "step": 2608, + "text_loss": 0.2713734805583954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.253595538597006, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.000897202088185242, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 4206243.0, + "repeat_count": 0.0, + "routers_loss": 0.006623407825827599, + "skip_count": 2.0, + "step": 2610, + "text_loss": 0.5920525789260864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008970140150101274, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4209264.0, + "repeat_count": 0.0, + "routers_loss": 0.0008602747693657875, + "skip_count": 0.0, + "step": 2612, + "text_loss": 0.33421996235847473 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0008968257896984932, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 4212058.0, + "repeat_count": 0.0, + "routers_loss": 0.0024653903674334288, + "skip_count": 1.0, + "step": 2614, + "text_loss": 0.37923356890678406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008966374123224677, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4214929.0, + "repeat_count": 0.0, + "routers_loss": 0.010878405533730984, + "skip_count": 0.0, + "step": 2616, + "text_loss": 0.4350503981113434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.291165248018785, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0008964488829542376, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 4219170.0, + "repeat_count": 0.0, + "routers_loss": 0.02864212542772293, + "skip_count": 1.0, + "step": 2618, + "text_loss": 0.26250728964805603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008962602016660478, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4222077.0, + "repeat_count": 0.0, + "routers_loss": 0.010444172658026218, + "skip_count": 2.0, + "step": 2620, + "text_loss": 0.4718937575817108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008960713685302011, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4225383.0, + "repeat_count": 0.0, + "routers_loss": 0.006409442983567715, + "skip_count": 1.0, + "step": 2622, + "text_loss": 0.30420538783073425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.31934253008512, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0008958823836190588, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 4228349.0, + "repeat_count": 0.0, + "routers_loss": 0.009996986016631126, + "skip_count": 1.0, + "step": 2624, + "text_loss": 0.5392362475395203 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0008956932470050404, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 4232007.0, + "repeat_count": 0.0, + "routers_loss": 0.0014383369125425816, + "skip_count": 0.0, + "step": 2626, + "text_loss": 0.7112401127815247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0008955039587606233, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 4235122.0, + "repeat_count": 0.0, + "routers_loss": 0.00781513936817646, + "skip_count": 3.0, + "step": 2628, + "text_loss": 0.17802883684635162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 12.347519812151454, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008953145189583429, + "loss": 0.0126, + "macro_f1": 0.542222261428833, + "num_tokens": 4238248.0, + "repeat_count": 0.0, + "routers_loss": 0.062252625823020935, + "skip_count": 4.0, + "step": 2630, + "text_loss": 0.5551572442054749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008951249276707933, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4241042.0, + "repeat_count": 0.0, + "routers_loss": 0.0011421777307987213, + "skip_count": 0.0, + "step": 2632, + "text_loss": 0.7092233896255493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0008949351849706261, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4243939.0, + "repeat_count": 0.0, + "routers_loss": 0.0032689040526747704, + "skip_count": 0.0, + "step": 2634, + "text_loss": 0.19925718009471893 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008947452909305509, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 4247535.0, + "repeat_count": 1.0, + "routers_loss": 0.002066014800220728, + "skip_count": 0.0, + "step": 2636, + "text_loss": 0.5249715447425842 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 12.385089521573232, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.09326171875, + "learning_rate": 0.0008945552456233356, + "loss": 0.0169, + "macro_f1": 0.8820862174034119, + "num_tokens": 4251441.0, + "repeat_count": 2.0, + "routers_loss": 0.029332537204027176, + "skip_count": 2.0, + "step": 2638, + "text_loss": 0.19229578971862793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.078125, + "learning_rate": 0.0008943650491218058, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4254314.0, + "repeat_count": 0.0, + "routers_loss": 0.0075911120511591434, + "skip_count": 0.0, + "step": 2640, + "text_loss": 0.27059751749038696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008941747014988453, + "loss": 0.0156, + "macro_f1": 0.3333333432674408, + "num_tokens": 4257442.0, + "repeat_count": 0.0, + "routers_loss": 0.009030844084918499, + "skip_count": 0.0, + "step": 2642, + "text_loss": 0.36747801303863525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.123046875, + "learning_rate": 0.0008939842028273956, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 4260386.0, + "repeat_count": 0.0, + "routers_loss": 0.007844001986086369, + "skip_count": 1.0, + "step": 2644, + "text_loss": 0.6397647857666016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.422659230995011, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008937935531804562, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 4263516.0, + "repeat_count": 0.0, + "routers_loss": 0.0018789108144119382, + "skip_count": 0.0, + "step": 2646, + "text_loss": 0.4795534908771515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.432051658350455, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0008936027526310844, + "loss": 0.0098, + "macro_f1": 0.3272727429866791, + "num_tokens": 4266744.0, + "repeat_count": 0.0, + "routers_loss": 0.0348590686917305, + "skip_count": 1.0, + "step": 2648, + "text_loss": 0.27691999077796936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.000893411801252395, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4269766.0, + "repeat_count": 0.0, + "routers_loss": 0.004543309565633535, + "skip_count": 1.0, + "step": 2650, + "text_loss": 0.18867231905460358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008932206991175615, + "loss": 0.0141, + "macro_f1": 0.6666666865348816, + "num_tokens": 4273513.0, + "repeat_count": 0.0, + "routers_loss": 0.0035277456045150757, + "skip_count": 1.0, + "step": 2652, + "text_loss": 0.45613357424736023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008930294462998143, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4276878.0, + "repeat_count": 1.0, + "routers_loss": 0.011337592266499996, + "skip_count": 0.0, + "step": 2654, + "text_loss": 0.24733254313468933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0008928380428724419, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4279915.0, + "repeat_count": 0.0, + "routers_loss": 0.0010295971296727657, + "skip_count": 1.0, + "step": 2656, + "text_loss": 0.41722849011421204 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008926464889087903, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 4282888.0, + "repeat_count": 0.0, + "routers_loss": 0.0017198545392602682, + "skip_count": 2.0, + "step": 2658, + "text_loss": 0.738322377204895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0008924547844822634, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4285805.0, + "repeat_count": 0.0, + "routers_loss": 0.001339946174994111, + "skip_count": 0.0, + "step": 2660, + "text_loss": 0.4802379906177521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.000892262929666323, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4290282.0, + "repeat_count": 0.0, + "routers_loss": 0.0022340165451169014, + "skip_count": 0.0, + "step": 2662, + "text_loss": 0.6503544449806213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008920709245344878, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4294106.0, + "repeat_count": 0.0, + "routers_loss": 0.005288850050419569, + "skip_count": 1.0, + "step": 2664, + "text_loss": 0.12312037497758865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0008918787691603347, + "loss": 0.0121, + "macro_f1": 0.6666666865348816, + "num_tokens": 4298013.0, + "repeat_count": 0.0, + "routers_loss": 0.004259659443050623, + "skip_count": 1.0, + "step": 2666, + "text_loss": 0.3070000112056732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.000891686463617498, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 4300799.0, + "repeat_count": 0.0, + "routers_loss": 0.009489355608820915, + "skip_count": 1.0, + "step": 2668, + "text_loss": 0.18535588681697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008914940079796696, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4304641.0, + "repeat_count": 0.0, + "routers_loss": 0.0025417013093829155, + "skip_count": 0.0, + "step": 2670, + "text_loss": 0.482585072517395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008913014023205988, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4307462.0, + "repeat_count": 0.0, + "routers_loss": 0.006371749565005302, + "skip_count": 0.0, + "step": 2672, + "text_loss": 0.7064456939697266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008911086467140925, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4310396.0, + "repeat_count": 0.0, + "routers_loss": 0.0027512952219694853, + "skip_count": 0.0, + "step": 2674, + "text_loss": 0.23532851040363312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05712890625, + "learning_rate": 0.000890915741234015, + "loss": 0.0133, + "macro_f1": 0.6666666865348816, + "num_tokens": 4314781.0, + "repeat_count": 0.0, + "routers_loss": 0.008253013715147972, + "skip_count": 1.0, + "step": 2676, + "text_loss": 0.30950358510017395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008907226859542879, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4317988.0, + "repeat_count": 0.0, + "routers_loss": 0.005409995559602976, + "skip_count": 2.0, + "step": 2678, + "text_loss": 0.4930732846260071 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0008905294809488907, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 4321014.0, + "repeat_count": 1.0, + "routers_loss": 0.0029942214023321867, + "skip_count": 1.0, + "step": 2680, + "text_loss": 0.6224040389060974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0008903361262918595, + "loss": 0.0115, + "macro_f1": 0.6666666865348816, + "num_tokens": 4324268.0, + "repeat_count": 0.0, + "routers_loss": 0.008411120623350143, + "skip_count": 1.0, + "step": 2682, + "text_loss": 0.16296671330928802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05126953125, + "learning_rate": 0.0008901426220572884, + "loss": 0.0138, + "macro_f1": 1.0, + "num_tokens": 4327494.0, + "repeat_count": 2.0, + "routers_loss": 0.01039006095379591, + "skip_count": 4.0, + "step": 2684, + "text_loss": 0.43866512179374695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0008899489683193286, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 4330936.0, + "repeat_count": 0.0, + "routers_loss": 0.0009329111780971289, + "skip_count": 0.0, + "step": 2686, + "text_loss": 0.44250962138175964 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0008897551651521885, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 4334123.0, + "repeat_count": 0.0, + "routers_loss": 0.003197216661646962, + "skip_count": 0.0, + "step": 2688, + "text_loss": 0.48313501477241516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.629292632814794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09716796875, + "learning_rate": 0.0008895612126301339, + "loss": 0.0157, + "macro_f1": 0.3333333432674408, + "num_tokens": 4337610.0, + "repeat_count": 0.0, + "routers_loss": 0.0033548236824572086, + "skip_count": 0.0, + "step": 2690, + "text_loss": 0.4715327322483063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0008893671108274877, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 4341026.0, + "repeat_count": 0.0, + "routers_loss": 0.0024757643695920706, + "skip_count": 0.0, + "step": 2692, + "text_loss": 0.43402785062789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008891728598186302, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 4344422.0, + "repeat_count": 0.0, + "routers_loss": 0.003317243419587612, + "skip_count": 0.0, + "step": 2694, + "text_loss": 0.8498559594154358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 12.657469914881126, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008889784596779986, + "loss": 0.009, + "macro_f1": 0.5934640765190125, + "num_tokens": 4347507.0, + "repeat_count": 0.0, + "routers_loss": 0.01577926240861416, + "skip_count": 3.0, + "step": 2696, + "text_loss": 0.5646669864654541 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11328125, + "learning_rate": 0.0008887839104800876, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4350414.0, + "repeat_count": 0.0, + "routers_loss": 0.002953822258859873, + "skip_count": 0.0, + "step": 2698, + "text_loss": 0.5145012140274048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008885892122994486, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 4354110.0, + "repeat_count": 0.0, + "routers_loss": 0.005849295295774937, + "skip_count": 0.0, + "step": 2700, + "text_loss": 0.580982506275177 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008883943652106903, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 4357323.0, + "repeat_count": 1.0, + "routers_loss": 0.012347398325800896, + "skip_count": 2.0, + "step": 2702, + "text_loss": 0.2234988808631897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0008881993692884787, + "loss": 0.0128, + "macro_f1": 0.6666666865348816, + "num_tokens": 4360228.0, + "repeat_count": 0.0, + "routers_loss": 0.003574999049305916, + "skip_count": 1.0, + "step": 2704, + "text_loss": 0.4261806607246399 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.704432051658351, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008880042246075365, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4363905.0, + "repeat_count": 0.0, + "routers_loss": 0.0031574300955981016, + "skip_count": 0.0, + "step": 2706, + "text_loss": 0.691118061542511 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008878089312426433, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 4366736.0, + "repeat_count": 0.0, + "routers_loss": 0.003195564029738307, + "skip_count": 0.0, + "step": 2708, + "text_loss": 0.613926112651825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6000000238418579, + "avg_layers": 25.0, + "epoch": 12.72321690636924, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.0, + "f1_skip": 0.75, + "grad_norm": 0.054443359375, + "learning_rate": 0.0008876134892686363, + "loss": 0.011, + "macro_f1": 0.5694444179534912, + "num_tokens": 4370146.0, + "repeat_count": 0.0, + "routers_loss": 0.038784291595220566, + "skip_count": 5.0, + "step": 2710, + "text_loss": 0.2723451852798462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.000887417898760409, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 4373653.0, + "repeat_count": 0.0, + "routers_loss": 0.0006457131239585578, + "skip_count": 0.0, + "step": 2712, + "text_loss": 0.31667640805244446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.742001761080129, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10498046875, + "learning_rate": 0.000887222159792912, + "loss": 0.0155, + "macro_f1": 0.6603773832321167, + "num_tokens": 4376993.0, + "repeat_count": 1.0, + "routers_loss": 0.045078590512275696, + "skip_count": 1.0, + "step": 2714, + "text_loss": 0.5872798562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0008870262724411528, + "loss": 0.012, + "macro_f1": 0.3333333432674408, + "num_tokens": 4380160.0, + "repeat_count": 0.0, + "routers_loss": 0.003628545207902789, + "skip_count": 0.0, + "step": 2716, + "text_loss": 0.7468157410621643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.760786615791018, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0008868302367801962, + "loss": 0.0118, + "macro_f1": 0.6598639488220215, + "num_tokens": 4383100.0, + "repeat_count": 1.0, + "routers_loss": 0.05404464527964592, + "skip_count": 3.0, + "step": 2718, + "text_loss": 0.2970244884490967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008866340528851629, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4386700.0, + "repeat_count": 0.0, + "routers_loss": 0.007000274024903774, + "skip_count": 0.0, + "step": 2720, + "text_loss": 0.34521186351776123 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 12.779571470501908, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052978515625, + "learning_rate": 0.0008864377208312313, + "loss": 0.0082, + "macro_f1": 0.8823530077934265, + "num_tokens": 4390299.0, + "repeat_count": 1.0, + "routers_loss": 0.02025366574525833, + "skip_count": 2.0, + "step": 2722, + "text_loss": 1.0536936521530151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.788963897857352, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.000886241240693636, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 4393353.0, + "repeat_count": 0.0, + "routers_loss": 0.00251673418097198, + "skip_count": 0.0, + "step": 2724, + "text_loss": 0.5678093433380127 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008860446125476686, + "loss": 0.0135, + "macro_f1": 0.6666666865348816, + "num_tokens": 4396446.0, + "repeat_count": 1.0, + "routers_loss": 0.009532532654702663, + "skip_count": 0.0, + "step": 2726, + "text_loss": 0.23775041103363037 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.091796875, + "learning_rate": 0.0008858478364686776, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 4399977.0, + "repeat_count": 1.0, + "routers_loss": 0.008062181062996387, + "skip_count": 0.0, + "step": 2728, + "text_loss": 0.18888695538043976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.817141179923686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008856509125320678, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 4404406.0, + "repeat_count": 0.0, + "routers_loss": 0.0007731119985692203, + "skip_count": 0.0, + "step": 2730, + "text_loss": 0.47331541776657104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008854538408133006, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 4407165.0, + "repeat_count": 0.0, + "routers_loss": 0.003115242812782526, + "skip_count": 1.0, + "step": 2732, + "text_loss": 0.491370290517807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008852566213878947, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4410101.0, + "repeat_count": 0.0, + "routers_loss": 0.0008958528051152825, + "skip_count": 0.0, + "step": 2734, + "text_loss": 0.42188262939453125 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0008850592543314246, + "loss": 0.0118, + "macro_f1": 1.0, + "num_tokens": 4413015.0, + "repeat_count": 1.0, + "routers_loss": 0.01139112375676632, + "skip_count": 1.0, + "step": 2736, + "text_loss": 0.4716498553752899 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.854710889345466, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0008848617397195218, + "loss": 0.0084, + "macro_f1": 0.6603773832321167, + "num_tokens": 4416404.0, + "repeat_count": 1.0, + "routers_loss": 0.01609630137681961, + "skip_count": 1.0, + "step": 2738, + "text_loss": 0.19490821659564972 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008846640776278745, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 4419408.0, + "repeat_count": 0.0, + "routers_loss": 0.001489170710556209, + "skip_count": 0.0, + "step": 2740, + "text_loss": 0.6443108320236206 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0008844662681322269, + "loss": 0.0144, + "macro_f1": 0.6666666865348816, + "num_tokens": 4422067.0, + "repeat_count": 1.0, + "routers_loss": 0.0014755792217329144, + "skip_count": 0.0, + "step": 2742, + "text_loss": 0.9150356650352478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0008842683113083801, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 4425647.0, + "repeat_count": 0.0, + "routers_loss": 0.008962674997746944, + "skip_count": 1.0, + "step": 2744, + "text_loss": 0.7103227972984314 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.892280598767243, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0008840702072321915, + "loss": 0.0104, + "macro_f1": 0.6598639488220215, + "num_tokens": 4428855.0, + "repeat_count": 1.0, + "routers_loss": 0.02554207295179367, + "skip_count": 3.0, + "step": 2746, + "text_loss": 0.27141591906547546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0008838719559795751, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4432838.0, + "repeat_count": 0.0, + "routers_loss": 0.0011747616808861494, + "skip_count": 0.0, + "step": 2748, + "text_loss": 0.4007738530635834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.911065453478134, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008836735576265009, + "loss": 0.0073, + "macro_f1": 0.5492662787437439, + "num_tokens": 4435793.0, + "repeat_count": 0.0, + "routers_loss": 0.017564335837960243, + "skip_count": 2.0, + "step": 2750, + "text_loss": 0.5972410440444946 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0008834750122489956, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 4438871.0, + "repeat_count": 1.0, + "routers_loss": 0.007004009559750557, + "skip_count": 0.0, + "step": 2752, + "text_loss": 0.2294853925704956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008832763199231423, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 4441846.0, + "repeat_count": 0.0, + "routers_loss": 0.0014562139986082911, + "skip_count": 0.0, + "step": 2754, + "text_loss": 0.722432017326355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.939242735544468, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0008830774807250802, + "loss": 0.013, + "macro_f1": 0.3272727429866791, + "num_tokens": 4444786.0, + "repeat_count": 1.0, + "routers_loss": 0.024773593991994858, + "skip_count": 0.0, + "step": 2756, + "text_loss": 0.507905125617981 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 12.948635162899912, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008828784947310049, + "loss": 0.0129, + "macro_f1": 0.8823530077934265, + "num_tokens": 4448442.0, + "repeat_count": 1.0, + "routers_loss": 0.04959975928068161, + "skip_count": 2.0, + "step": 2758, + "text_loss": 0.3617522418498993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.000882679362017168, + "loss": 0.0149, + "macro_f1": 1.0, + "num_tokens": 4451401.0, + "repeat_count": 1.0, + "routers_loss": 0.005783245898783207, + "skip_count": 2.0, + "step": 2760, + "text_loss": 0.49187400937080383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0008824800826598778, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 4454537.0, + "repeat_count": 0.0, + "routers_loss": 0.00656260596588254, + "skip_count": 0.0, + "step": 2762, + "text_loss": 0.6823583245277405 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0008822806567354983, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 4457706.0, + "repeat_count": 1.0, + "routers_loss": 0.005298966076225042, + "skip_count": 0.0, + "step": 2764, + "text_loss": 0.554322361946106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.986204872321691, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008820810843204501, + "loss": 0.0096, + "macro_f1": 0.3272727429866791, + "num_tokens": 4460710.0, + "repeat_count": 0.0, + "routers_loss": 0.03164982795715332, + "skip_count": 1.0, + "step": 2766, + "text_loss": 0.1656961441040039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0008818813654912095, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 4464001.0, + "repeat_count": 0.0, + "routers_loss": 0.000715116853825748, + "skip_count": 0.0, + "step": 2768, + "text_loss": 0.5818144083023071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008816815003243093, + "loss": 0.0133, + "macro_f1": 0.3333333432674408, + "num_tokens": 4467364.0, + "repeat_count": 0.0, + "routers_loss": 0.002851625671610236, + "skip_count": 0.0, + "step": 2770, + "text_loss": 0.6068631410598755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008814814888963383, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 4470681.0, + "repeat_count": 0.0, + "routers_loss": 0.004729873035103083, + "skip_count": 1.0, + "step": 2772, + "text_loss": 0.5386646389961243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.000881281331283941, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 4473734.0, + "repeat_count": 0.0, + "routers_loss": 0.0031853127293288708, + "skip_count": 1.0, + "step": 2774, + "text_loss": 0.5695263147354126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008810810275638182, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4478404.0, + "repeat_count": 0.0, + "routers_loss": 0.0008977465913631022, + "skip_count": 0.0, + "step": 2776, + "text_loss": 0.4750773310661316 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008808805778127269, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4481287.0, + "repeat_count": 0.0, + "routers_loss": 0.00469845999032259, + "skip_count": 0.0, + "step": 2778, + "text_loss": 0.14078612625598907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 13.051658350454945, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008806799821074796, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 4483929.0, + "repeat_count": 0.0, + "routers_loss": 0.01789761893451214, + "skip_count": 2.0, + "step": 2780, + "text_loss": 0.2167191207408905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008804792405249451, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 4487468.0, + "repeat_count": 0.0, + "routers_loss": 0.001018838956952095, + "skip_count": 0.0, + "step": 2782, + "text_loss": 0.5424665212631226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 13.070443205165835, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.07373046875, + "learning_rate": 0.000880278353142048, + "loss": 0.0077, + "macro_f1": 0.8200000524520874, + "num_tokens": 4490942.0, + "repeat_count": 1.0, + "routers_loss": 0.03260354697704315, + "skip_count": 3.0, + "step": 2784, + "text_loss": 0.20994654297828674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008800773200357683, + "loss": 0.0122, + "macro_f1": 0.3333333432674408, + "num_tokens": 4493986.0, + "repeat_count": 0.0, + "routers_loss": 0.003019835101440549, + "skip_count": 0.0, + "step": 2786, + "text_loss": 0.5709528923034668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008798761412831429, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 4498232.0, + "repeat_count": 0.0, + "routers_loss": 0.00285192858427763, + "skip_count": 0.0, + "step": 2788, + "text_loss": 0.5103896260261536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0008796748169612634, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4501231.0, + "repeat_count": 0.0, + "routers_loss": 0.0012469831854104996, + "skip_count": 0.0, + "step": 2790, + "text_loss": 0.43669697642326355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0008794733471472778, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4504208.0, + "repeat_count": 0.0, + "routers_loss": 0.011512776836752892, + "skip_count": 1.0, + "step": 2792, + "text_loss": 0.2299770563840866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008792717319183899, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 4507013.0, + "repeat_count": 0.0, + "routers_loss": 0.00834917277097702, + "skip_count": 0.0, + "step": 2794, + "text_loss": 0.2130603939294815 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008790699713518587, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 4510286.0, + "repeat_count": 0.0, + "routers_loss": 0.008616939187049866, + "skip_count": 2.0, + "step": 2796, + "text_loss": 0.4377101957798004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0008788680655249994, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4513762.0, + "repeat_count": 0.0, + "routers_loss": 0.003408568911254406, + "skip_count": 0.0, + "step": 2798, + "text_loss": 0.435138463973999 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008786660145151826, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4516696.0, + "repeat_count": 1.0, + "routers_loss": 0.0029398901388049126, + "skip_count": 0.0, + "step": 2800, + "text_loss": 0.3195655047893524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008784638183998348, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4519760.0, + "repeat_count": 0.0, + "routers_loss": 0.0013777425047010183, + "skip_count": 0.0, + "step": 2802, + "text_loss": 0.8129430413246155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0008782614772564379, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4522106.0, + "repeat_count": 0.0, + "routers_loss": 0.0031694830395281315, + "skip_count": 0.0, + "step": 2804, + "text_loss": 0.18083660304546356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0008780589911625293, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4525743.0, + "repeat_count": 0.0, + "routers_loss": 0.002161208540201187, + "skip_count": 0.0, + "step": 2806, + "text_loss": 0.8228182792663574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0008778563601957021, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 4529573.0, + "repeat_count": 0.0, + "routers_loss": 0.0028444856870919466, + "skip_count": 1.0, + "step": 2808, + "text_loss": 0.3715563118457794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008776535844336049, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4532452.0, + "repeat_count": 0.0, + "routers_loss": 0.003807213855907321, + "skip_count": 0.0, + "step": 2810, + "text_loss": 0.6012523174285889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0008774506639539417, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 4536077.0, + "repeat_count": 0.0, + "routers_loss": 0.006698979996144772, + "skip_count": 0.0, + "step": 2812, + "text_loss": 0.27097949385643005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0008772475988344722, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 4539057.0, + "repeat_count": 0.0, + "routers_loss": 0.004849409218877554, + "skip_count": 1.0, + "step": 2814, + "text_loss": 1.026973843574524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 13.22072204285295, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008770443891530109, + "loss": 0.0115, + "macro_f1": 0.5934640765190125, + "num_tokens": 4542253.0, + "repeat_count": 0.0, + "routers_loss": 0.019148651510477066, + "skip_count": 3.0, + "step": 2816, + "text_loss": 0.2717585563659668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.230114470208395, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008768410349874286, + "loss": 0.0098, + "macro_f1": 0.6601307392120361, + "num_tokens": 4545047.0, + "repeat_count": 1.0, + "routers_loss": 0.02231316640973091, + "skip_count": 2.0, + "step": 2818, + "text_loss": 0.274346262216568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008766375364156508, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 4548371.0, + "repeat_count": 0.0, + "routers_loss": 0.008014129474759102, + "skip_count": 2.0, + "step": 2820, + "text_loss": 0.22850871086120605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008764338935156586, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4551276.0, + "repeat_count": 0.0, + "routers_loss": 0.0014544493751600385, + "skip_count": 0.0, + "step": 2822, + "text_loss": 0.6308462023735046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000876230106365488, + "loss": 0.0123, + "macro_f1": 0.6666666865348816, + "num_tokens": 4554143.0, + "repeat_count": 0.0, + "routers_loss": 0.00818584579974413, + "skip_count": 3.0, + "step": 2824, + "text_loss": 0.3484207093715668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0008760261750432312, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 4557256.0, + "repeat_count": 0.0, + "routers_loss": 0.006275608204305172, + "skip_count": 3.0, + "step": 2826, + "text_loss": 0.1927330046892166 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008758220996270348, + "loss": 0.0103, + "macro_f1": 1.0, + "num_tokens": 4560202.0, + "repeat_count": 2.0, + "routers_loss": 0.0055974251590669155, + "skip_count": 2.0, + "step": 2828, + "text_loss": 0.7796496748924255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008756178801951007, + "loss": 0.0129, + "macro_f1": 0.3333333432674408, + "num_tokens": 4563508.0, + "repeat_count": 0.0, + "routers_loss": 0.0019799957517534494, + "skip_count": 0.0, + "step": 2830, + "text_loss": 0.49633297324180603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008754135168256865, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4566776.0, + "repeat_count": 0.0, + "routers_loss": 0.004538947716355324, + "skip_count": 0.0, + "step": 2832, + "text_loss": 0.5346745252609253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008752090095971044, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 4569787.0, + "repeat_count": 0.0, + "routers_loss": 0.001663343166001141, + "skip_count": 0.0, + "step": 2834, + "text_loss": 0.5524004697799683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.000875004358587722, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 4572813.0, + "repeat_count": 0.0, + "routers_loss": 0.0022988212294876575, + "skip_count": 0.0, + "step": 2836, + "text_loss": 0.4232870042324066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.000874799563875962, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 4575563.0, + "repeat_count": 0.0, + "routers_loss": 0.007781553082168102, + "skip_count": 1.0, + "step": 2838, + "text_loss": 0.19239822030067444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 13.333431171118287, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03515625, + "learning_rate": 0.0008745946255403021, + "loss": 0.0072, + "macro_f1": 0.5492662787437439, + "num_tokens": 4578117.0, + "repeat_count": 0.0, + "routers_loss": 0.01872488670051098, + "skip_count": 2.0, + "step": 2840, + "text_loss": 0.2148810178041458 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0008743895436592749, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 4582330.0, + "repeat_count": 1.0, + "routers_loss": 0.005634195636957884, + "skip_count": 1.0, + "step": 2842, + "text_loss": 0.4929640591144562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0008741843183114685, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4585765.0, + "repeat_count": 0.0, + "routers_loss": 0.0008928569150157273, + "skip_count": 0.0, + "step": 2844, + "text_loss": 0.32702967524528503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008739789495755253, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4589000.0, + "repeat_count": 0.0, + "routers_loss": 0.014715569093823433, + "skip_count": 4.0, + "step": 2846, + "text_loss": 0.25125816464424133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008737734375301433, + "loss": 0.0135, + "macro_f1": 0.3333333432674408, + "num_tokens": 4592391.0, + "repeat_count": 0.0, + "routers_loss": 0.0017551190685480833, + "skip_count": 0.0, + "step": 2848, + "text_loss": 0.6595172882080078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0008735677822540749, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 4596662.0, + "repeat_count": 0.0, + "routers_loss": 0.0006456313421949744, + "skip_count": 0.0, + "step": 2850, + "text_loss": 0.6290773153305054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0008733619838261276, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 4599682.0, + "repeat_count": 0.0, + "routers_loss": 0.00765060493722558, + "skip_count": 2.0, + "step": 2852, + "text_loss": 0.3268161416053772 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008731560423251637, + "loss": 0.01, + "macro_f1": 1.0, + "num_tokens": 4603324.0, + "repeat_count": 1.0, + "routers_loss": 0.01161442045122385, + "skip_count": 2.0, + "step": 2854, + "text_loss": 0.3029932975769043 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 13.408570589961844, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008729499578301005, + "loss": 0.0098, + "macro_f1": 0.9555556178092957, + "num_tokens": 4606975.0, + "repeat_count": 1.0, + "routers_loss": 0.02055389992892742, + "skip_count": 5.0, + "step": 2856, + "text_loss": 0.6268532872200012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.00087274373041991, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4609629.0, + "repeat_count": 0.0, + "routers_loss": 0.0013911726418882608, + "skip_count": 0.0, + "step": 2858, + "text_loss": 0.534355640411377 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008725373601736188, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 4612913.0, + "repeat_count": 2.0, + "routers_loss": 0.01010701060295105, + "skip_count": 0.0, + "step": 2860, + "text_loss": 0.3391380310058594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0008723308471703085, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4616718.0, + "repeat_count": 0.0, + "routers_loss": 0.005969462916254997, + "skip_count": 1.0, + "step": 2862, + "text_loss": 0.47250816226005554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.446140299383622, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008721241914891152, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4619680.0, + "repeat_count": 0.0, + "routers_loss": 0.0027780034579336643, + "skip_count": 0.0, + "step": 2864, + "text_loss": 0.3249278664588928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008719173932092295, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 4622700.0, + "repeat_count": 0.0, + "routers_loss": 0.0015912104863673449, + "skip_count": 0.0, + "step": 2866, + "text_loss": 0.7789985537528992 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05126953125, + "learning_rate": 0.0008717104524098973, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4626637.0, + "repeat_count": 0.0, + "routers_loss": 0.0036539011634886265, + "skip_count": 0.0, + "step": 2868, + "text_loss": 0.619088351726532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0008715033691704187, + "loss": 0.0118, + "macro_f1": 0.6666666865348816, + "num_tokens": 4629863.0, + "repeat_count": 0.0, + "routers_loss": 0.008402476087212563, + "skip_count": 1.0, + "step": 2870, + "text_loss": 0.5550018548965454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008712961435701479, + "loss": 0.0161, + "macro_f1": 0.6666666865348816, + "num_tokens": 4632657.0, + "repeat_count": 0.0, + "routers_loss": 0.01400839351117611, + "skip_count": 1.0, + "step": 2872, + "text_loss": 0.17368625104427338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008710887756884947, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 4635885.0, + "repeat_count": 0.0, + "routers_loss": 0.0014573842054232955, + "skip_count": 0.0, + "step": 2874, + "text_loss": 0.5138643383979797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008708812656049225, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 4639341.0, + "repeat_count": 0.0, + "routers_loss": 0.002810224425047636, + "skip_count": 1.0, + "step": 2876, + "text_loss": 0.70310378074646 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 13.511887290871735, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008706736133989497, + "loss": 0.0105, + "macro_f1": 0.9449735879898071, + "num_tokens": 4642163.0, + "repeat_count": 2.0, + "routers_loss": 0.029783209785819054, + "skip_count": 4.0, + "step": 2878, + "text_loss": 0.26898008584976196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008704658191501491, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4645858.0, + "repeat_count": 0.0, + "routers_loss": 0.0009193966398015618, + "skip_count": 0.0, + "step": 2880, + "text_loss": 0.6047570705413818 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.530672145582624, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008702578829381475, + "loss": 0.0131, + "macro_f1": 0.8814815282821655, + "num_tokens": 4649237.0, + "repeat_count": 2.0, + "routers_loss": 0.05698608607053757, + "skip_count": 4.0, + "step": 2882, + "text_loss": 0.10695219784975052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0008700498048426269, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4652362.0, + "repeat_count": 0.0, + "routers_loss": 0.0011786938412114978, + "skip_count": 0.0, + "step": 2884, + "text_loss": 0.4442957937717438 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.549457000293513, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008698415849433229, + "loss": 0.0092, + "macro_f1": 0.5492662787437439, + "num_tokens": 4655616.0, + "repeat_count": 2.0, + "routers_loss": 0.02142646163702011, + "skip_count": 0.0, + "step": 2886, + "text_loss": 0.5820964574813843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008696332233200262, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 4659294.0, + "repeat_count": 0.0, + "routers_loss": 0.004038636106997728, + "skip_count": 0.0, + "step": 2888, + "text_loss": 0.11847645789384842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008694247200525806, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4662512.0, + "repeat_count": 0.0, + "routers_loss": 0.0013256469974294305, + "skip_count": 0.0, + "step": 2890, + "text_loss": 0.4873582720756531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.577634282359847, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008692160752208856, + "loss": 0.0129, + "macro_f1": 0.3272727429866791, + "num_tokens": 4666190.0, + "repeat_count": 0.0, + "routers_loss": 0.04477972164750099, + "skip_count": 1.0, + "step": 2892, + "text_loss": 0.44243401288986206 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.0008690072889048941, + "loss": 0.0127, + "macro_f1": 1.0, + "num_tokens": 4668884.0, + "repeat_count": 1.0, + "routers_loss": 0.004407547414302826, + "skip_count": 2.0, + "step": 2894, + "text_loss": 0.6847127079963684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008687983611846133, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4672093.0, + "repeat_count": 0.0, + "routers_loss": 0.005245382897555828, + "skip_count": 1.0, + "step": 2896, + "text_loss": 0.25583332777023315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008685892921401049, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4674917.0, + "repeat_count": 0.0, + "routers_loss": 0.0010470855049788952, + "skip_count": 0.0, + "step": 2898, + "text_loss": 0.41998377442359924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008683800818514844, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4677739.0, + "repeat_count": 0.0, + "routers_loss": 0.009026622399687767, + "skip_count": 2.0, + "step": 2900, + "text_loss": 0.303053081035614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.0008681707303989215, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4680721.0, + "repeat_count": 0.0, + "routers_loss": 0.004500916693359613, + "skip_count": 0.0, + "step": 2902, + "text_loss": 0.5573288798332214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0008679612378626404, + "loss": 0.0098, + "macro_f1": 0.6666666865348816, + "num_tokens": 4683339.0, + "repeat_count": 0.0, + "routers_loss": 0.005047840531915426, + "skip_count": 1.0, + "step": 2904, + "text_loss": 0.321353554725647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.643381273847961, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008677516043229187, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 4686453.0, + "repeat_count": 0.0, + "routers_loss": 0.010256914421916008, + "skip_count": 1.0, + "step": 2906, + "text_loss": 0.4300784468650818 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008675418298600883, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 4689645.0, + "repeat_count": 1.0, + "routers_loss": 0.0022669637110084295, + "skip_count": 0.0, + "step": 2908, + "text_loss": 0.5064885020256042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008673319145545358, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4692320.0, + "repeat_count": 0.0, + "routers_loss": 0.0011188550852239132, + "skip_count": 0.0, + "step": 2910, + "text_loss": 0.7114819884300232 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008671218584867003, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 4695116.0, + "repeat_count": 0.0, + "routers_loss": 0.002966561820358038, + "skip_count": 2.0, + "step": 2912, + "text_loss": 0.5662392973899841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0008669116617370762, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4698040.0, + "repeat_count": 0.0, + "routers_loss": 0.0012894890969619155, + "skip_count": 0.0, + "step": 2914, + "text_loss": 0.718977689743042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0008667013243862111, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 4700963.0, + "repeat_count": 0.0, + "routers_loss": 0.0007232456118799746, + "skip_count": 0.0, + "step": 2916, + "text_loss": 0.3447718024253845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.699735837980628, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.000866490846514707, + "loss": 0.0075, + "macro_f1": 0.3272727429866791, + "num_tokens": 4704471.0, + "repeat_count": 1.0, + "routers_loss": 0.015166680328547955, + "skip_count": 0.0, + "step": 2918, + "text_loss": 0.454946368932724 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.000866280228203219, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 4707238.0, + "repeat_count": 1.0, + "routers_loss": 0.0061312485486269, + "skip_count": 1.0, + "step": 2920, + "text_loss": 0.721788227558136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008660694695324564, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 4711323.0, + "repeat_count": 0.0, + "routers_loss": 0.00169933564029634, + "skip_count": 0.0, + "step": 2922, + "text_loss": 0.7562121748924255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008658585705831829, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 4714417.0, + "repeat_count": 0.0, + "routers_loss": 0.0022731393110007048, + "skip_count": 0.0, + "step": 2924, + "text_loss": 0.5726147890090942 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.737305547402407, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0008656475314362148, + "loss": 0.0131, + "macro_f1": 0.8817967176437378, + "num_tokens": 4717445.0, + "repeat_count": 2.0, + "routers_loss": 0.06477782875299454, + "skip_count": 3.0, + "step": 2926, + "text_loss": 0.4505867660045624 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 13.74669797475785, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.06396484375, + "learning_rate": 0.0008654363521724229, + "loss": 0.0129, + "macro_f1": 0.9449735879898071, + "num_tokens": 4722253.0, + "repeat_count": 2.0, + "routers_loss": 0.027405790984630585, + "skip_count": 4.0, + "step": 2928, + "text_loss": 0.24767601490020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0008652250328727315, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 4725465.0, + "repeat_count": 0.0, + "routers_loss": 0.006544729229062796, + "skip_count": 2.0, + "step": 2930, + "text_loss": 0.4478724002838135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008650135736181184, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 4729213.0, + "repeat_count": 1.0, + "routers_loss": 0.0055119614116847515, + "skip_count": 0.0, + "step": 2932, + "text_loss": 0.6749323010444641 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008648019744896154, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 4732280.0, + "repeat_count": 0.0, + "routers_loss": 0.008374541997909546, + "skip_count": 0.0, + "step": 2934, + "text_loss": 0.4647359251976013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.78426768417963, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0008645902355683077, + "loss": 0.0091, + "macro_f1": 0.6595745086669922, + "num_tokens": 4736244.0, + "repeat_count": 1.0, + "routers_loss": 0.068686343729496, + "skip_count": 4.0, + "step": 2936, + "text_loss": 0.5356017351150513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008643783569353339, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4739810.0, + "repeat_count": 2.0, + "routers_loss": 0.017954571172595024, + "skip_count": 0.0, + "step": 2938, + "text_loss": 0.3145926296710968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.803052538890519, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.0008641663386718863, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4742720.0, + "repeat_count": 0.0, + "routers_loss": 0.006261351052671671, + "skip_count": 1.0, + "step": 2940, + "text_loss": 0.3200613856315613 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008639541808592109, + "loss": 0.0093, + "macro_f1": 1.0, + "num_tokens": 4745870.0, + "repeat_count": 1.0, + "routers_loss": 0.0025341357104480267, + "skip_count": 1.0, + "step": 2942, + "text_loss": 0.5020416378974915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0008637418835786067, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4748943.0, + "repeat_count": 0.0, + "routers_loss": 0.008970048278570175, + "skip_count": 2.0, + "step": 2944, + "text_loss": 0.14517110586166382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008635294469114265, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 4751360.0, + "repeat_count": 0.0, + "routers_loss": 0.002133632078766823, + "skip_count": 0.0, + "step": 2946, + "text_loss": 0.5367856025695801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0008633168709390766, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4754403.0, + "repeat_count": 0.0, + "routers_loss": 0.0011866620043292642, + "skip_count": 0.0, + "step": 2948, + "text_loss": 0.38302522897720337 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0008631041557430163, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 4757867.0, + "repeat_count": 2.0, + "routers_loss": 0.0026854004245251417, + "skip_count": 0.0, + "step": 2950, + "text_loss": 0.43433454632759094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0008628913014047585, + "loss": 0.0102, + "macro_f1": 0.3333333432674408, + "num_tokens": 4761171.0, + "repeat_count": 0.0, + "routers_loss": 0.002433479530736804, + "skip_count": 0.0, + "step": 2952, + "text_loss": 0.4725971519947052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008626783080058696, + "loss": 0.0066, + "macro_f1": 0.3272727429866791, + "num_tokens": 4764752.0, + "repeat_count": 1.0, + "routers_loss": 0.017182493582367897, + "skip_count": 0.0, + "step": 2954, + "text_loss": 0.460641473531723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0008624651756279687, + "loss": 0.0198, + "macro_f1": 0.3333333432674408, + "num_tokens": 4767453.0, + "repeat_count": 0.0, + "routers_loss": 0.0018134774873033166, + "skip_count": 0.0, + "step": 2956, + "text_loss": 0.4091459810733795 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.887584385089522, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.000862251904352729, + "loss": 0.0108, + "macro_f1": 0.9259259104728699, + "num_tokens": 4771110.0, + "repeat_count": 3.0, + "routers_loss": 0.0365753099322319, + "skip_count": 3.0, + "step": 2958, + "text_loss": 0.22408585250377655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.896976812444967, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.000862038494261876, + "loss": 0.0109, + "macro_f1": 0.3272727429866791, + "num_tokens": 4774464.0, + "repeat_count": 0.0, + "routers_loss": 0.024343067780137062, + "skip_count": 1.0, + "step": 2960, + "text_loss": 0.16483014822006226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008618249454371891, + "loss": 0.01, + "macro_f1": 0.3333333432674408, + "num_tokens": 4777894.0, + "repeat_count": 0.0, + "routers_loss": 0.0008310087723657489, + "skip_count": 0.0, + "step": 2962, + "text_loss": 0.5573428869247437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008616112579605006, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4781116.0, + "repeat_count": 0.0, + "routers_loss": 0.0065494864247739315, + "skip_count": 0.0, + "step": 2964, + "text_loss": 0.18816794455051422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0008613974319136957, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 4784886.0, + "repeat_count": 0.0, + "routers_loss": 0.0019726944155991077, + "skip_count": 0.0, + "step": 2966, + "text_loss": 0.5097305774688721 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0008611834673787134, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 4787563.0, + "repeat_count": 0.0, + "routers_loss": 0.006327496841549873, + "skip_count": 0.0, + "step": 2968, + "text_loss": 0.6953814029693604 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.94393894922219, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.5, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0008609693644375449, + "loss": 0.0086, + "macro_f1": 0.8200000524520874, + "num_tokens": 4790421.0, + "repeat_count": 3.0, + "routers_loss": 0.042896661907434464, + "skip_count": 1.0, + "step": 2970, + "text_loss": 0.2573051154613495 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 13.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.14453125, + "learning_rate": 0.000860755123172235, + "loss": 0.0096, + "macro_f1": 1.0, + "num_tokens": 4793786.0, + "repeat_count": 2.0, + "routers_loss": 0.013228793628513813, + "skip_count": 1.0, + "step": 2972, + "text_loss": 0.46614497900009155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008605407436648815, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4796864.0, + "repeat_count": 0.0, + "routers_loss": 0.007294759154319763, + "skip_count": 2.0, + "step": 2974, + "text_loss": 0.21555091440677643 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008603262259976348, + "loss": 0.0129, + "macro_f1": 1.0, + "num_tokens": 4800080.0, + "repeat_count": 1.0, + "routers_loss": 0.0024024227168411016, + "skip_count": 5.0, + "step": 2976, + "text_loss": 0.7855485081672668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0008601115702526987, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4802899.0, + "repeat_count": 0.0, + "routers_loss": 0.001433031284250319, + "skip_count": 0.0, + "step": 2978, + "text_loss": 0.6777765154838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008598967765123293, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 4805835.0, + "repeat_count": 0.0, + "routers_loss": 0.003073975909501314, + "skip_count": 0.0, + "step": 2980, + "text_loss": 0.5926910638809204 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 14.0, + "f1_execute": 0.9333333373069763, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008596818448588364, + "loss": 0.0139, + "macro_f1": 0.8666667342185974, + "num_tokens": 4809028.0, + "repeat_count": 1.0, + "routers_loss": 0.06438573449850082, + "skip_count": 6.0, + "step": 2982, + "text_loss": 0.23975612223148346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.009392427355445, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0008594667753745821, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 4812831.0, + "repeat_count": 0.0, + "routers_loss": 0.014817612245678902, + "skip_count": 1.0, + "step": 2984, + "text_loss": 0.17292268574237823 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.018784854710889, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0008592515681419813, + "loss": 0.0078, + "macro_f1": 0.5492662787437439, + "num_tokens": 4816005.0, + "repeat_count": 2.0, + "routers_loss": 0.025407327339053154, + "skip_count": 0.0, + "step": 2986, + "text_loss": 0.6403061151504517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0008590362232435018, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4818901.0, + "repeat_count": 0.0, + "routers_loss": 0.006826757453382015, + "skip_count": 0.0, + "step": 2988, + "text_loss": 0.2572069466114044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008588207407616644, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 4823120.0, + "repeat_count": 0.0, + "routers_loss": 0.0009054148104041815, + "skip_count": 0.0, + "step": 2990, + "text_loss": 0.4827076196670532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0008586051207790422, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 4825774.0, + "repeat_count": 0.0, + "routers_loss": 0.0012294676853343844, + "skip_count": 0.0, + "step": 2992, + "text_loss": 0.40157821774482727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 14.056354564132668, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052734375, + "learning_rate": 0.0008583893633782612, + "loss": 0.0084, + "macro_f1": 0.5492662787437439, + "num_tokens": 4828841.0, + "repeat_count": 0.0, + "routers_loss": 0.011474622413516045, + "skip_count": 2.0, + "step": 2994, + "text_loss": 0.14842072129249573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.058837890625, + "learning_rate": 0.0008581734686419999, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4831458.0, + "repeat_count": 0.0, + "routers_loss": 0.009154081344604492, + "skip_count": 2.0, + "step": 2996, + "text_loss": 0.365400105714798 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00085795743665299, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4834609.0, + "repeat_count": 0.0, + "routers_loss": 0.002899336162954569, + "skip_count": 0.0, + "step": 2998, + "text_loss": 0.5574684143066406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008577412674940152, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4838324.0, + "repeat_count": 0.0, + "routers_loss": 0.0034664268605411053, + "skip_count": 0.0, + "step": 3000, + "text_loss": 0.6752855777740479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008575249612479117, + "loss": 0.0127, + "macro_f1": 0.6666666865348816, + "num_tokens": 4841877.0, + "repeat_count": 0.0, + "routers_loss": 0.0036425739526748657, + "skip_count": 2.0, + "step": 3002, + "text_loss": 0.6332980394363403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.0008573085179975685, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4845840.0, + "repeat_count": 0.0, + "routers_loss": 0.0013783496106043458, + "skip_count": 0.0, + "step": 3004, + "text_loss": 0.4219617545604706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008570919378259274, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4848766.0, + "repeat_count": 0.0, + "routers_loss": 0.004823608323931694, + "skip_count": 1.0, + "step": 3006, + "text_loss": 0.7987180948257446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000856875220815982, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4852310.0, + "repeat_count": 0.0, + "routers_loss": 0.0014760984340682626, + "skip_count": 0.0, + "step": 3008, + "text_loss": 0.35592713952064514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008566583670507788, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4856146.0, + "repeat_count": 0.0, + "routers_loss": 0.0031717263627797365, + "skip_count": 1.0, + "step": 3010, + "text_loss": 0.19379083812236786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008564413766134164, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 4859386.0, + "repeat_count": 0.0, + "routers_loss": 0.003361492184922099, + "skip_count": 0.0, + "step": 3012, + "text_loss": 0.39129266142845154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0008562242495870463, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4862661.0, + "repeat_count": 0.0, + "routers_loss": 0.0010563990799710155, + "skip_count": 0.0, + "step": 3014, + "text_loss": 0.5966938734054565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0008560069860548716, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 4865410.0, + "repeat_count": 0.0, + "routers_loss": 0.001233913702890277, + "skip_count": 0.0, + "step": 3016, + "text_loss": 0.3386077880859375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008557895861001484, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 4868931.0, + "repeat_count": 0.0, + "routers_loss": 0.0018066301709041, + "skip_count": 0.0, + "step": 3018, + "text_loss": 0.5222050547599792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008555720498061845, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4873492.0, + "repeat_count": 0.0, + "routers_loss": 0.0050385501235723495, + "skip_count": 1.0, + "step": 3020, + "text_loss": 0.4558849334716797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.187848547108894, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008553543772563403, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 4877026.0, + "repeat_count": 0.0, + "routers_loss": 0.004828717093914747, + "skip_count": 0.0, + "step": 3022, + "text_loss": 0.36598992347717285 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 14.197240974464338, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.06103515625, + "learning_rate": 0.0008551365685340285, + "loss": 0.0084, + "macro_f1": 0.9555556178092957, + "num_tokens": 4879655.0, + "repeat_count": 1.0, + "routers_loss": 0.02049369551241398, + "skip_count": 5.0, + "step": 3024, + "text_loss": 0.5069093704223633 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 14.206633401819783, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008549186237227138, + "loss": 0.0088, + "macro_f1": 0.8823530077934265, + "num_tokens": 4882606.0, + "repeat_count": 1.0, + "routers_loss": 0.03947242721915245, + "skip_count": 2.0, + "step": 3026, + "text_loss": 0.2600715458393097 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 14.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0008547005429059128, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 4885246.0, + "repeat_count": 2.0, + "routers_loss": 0.0026363315992057323, + "skip_count": 0.0, + "step": 3028, + "text_loss": 0.37642326951026917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008544823261671948, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 4888109.0, + "repeat_count": 0.0, + "routers_loss": 0.003858231008052826, + "skip_count": 0.0, + "step": 3030, + "text_loss": 0.5875385999679565 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 14.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0008542639735901804, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 4891168.0, + "repeat_count": 1.0, + "routers_loss": 0.004789089784026146, + "skip_count": 1.0, + "step": 3032, + "text_loss": 0.6417325139045715 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.244203111241562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008540454852585434, + "loss": 0.0115, + "macro_f1": 0.6666666865348816, + "num_tokens": 4894355.0, + "repeat_count": 0.0, + "routers_loss": 0.007334680762141943, + "skip_count": 2.0, + "step": 3034, + "text_loss": 0.23697198927402496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 14.253595538597006, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008538268612560084, + "loss": 0.0058, + "macro_f1": 0.4871794879436493, + "num_tokens": 4897543.0, + "repeat_count": 0.0, + "routers_loss": 0.022096361964941025, + "skip_count": 3.0, + "step": 3036, + "text_loss": 0.1989550143480301 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0008536081016663527, + "loss": 0.0101, + "macro_f1": 1.0, + "num_tokens": 4900752.0, + "repeat_count": 1.0, + "routers_loss": 0.0037680594250559807, + "skip_count": 2.0, + "step": 3038, + "text_loss": 0.5001366138458252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008533892065734055, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4903581.0, + "repeat_count": 0.0, + "routers_loss": 0.0032373068388551474, + "skip_count": 1.0, + "step": 3040, + "text_loss": 0.5019411444664001 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008531701760610476, + "loss": 0.0121, + "macro_f1": 1.0, + "num_tokens": 4907108.0, + "repeat_count": 1.0, + "routers_loss": 0.0078013185411691666, + "skip_count": 2.0, + "step": 3042, + "text_loss": 0.3460627794265747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 14.291165248018785, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.04833984375, + "learning_rate": 0.000852951010213212, + "loss": 0.0089, + "macro_f1": 0.8200000524520874, + "num_tokens": 4911269.0, + "repeat_count": 1.0, + "routers_loss": 0.03576689213514328, + "skip_count": 3.0, + "step": 3044, + "text_loss": 0.268994003534317 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 14.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0008527317091138835, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 4914203.0, + "repeat_count": 1.0, + "routers_loss": 0.0032140621915459633, + "skip_count": 1.0, + "step": 3046, + "text_loss": 0.9998719692230225 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008525122728470987, + "loss": 0.0102, + "macro_f1": 1.0, + "num_tokens": 4918562.0, + "repeat_count": 1.0, + "routers_loss": 0.008559177629649639, + "skip_count": 3.0, + "step": 3048, + "text_loss": 0.3062439560890198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0008522927014969459, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 4921940.0, + "repeat_count": 0.0, + "routers_loss": 0.008735597133636475, + "skip_count": 2.0, + "step": 3050, + "text_loss": 0.3637430965900421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0008520729951475652, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 4925416.0, + "repeat_count": 0.0, + "routers_loss": 0.0012709591537714005, + "skip_count": 0.0, + "step": 3052, + "text_loss": 0.542036235332489 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008518531538831488, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4928695.0, + "repeat_count": 0.0, + "routers_loss": 0.0010660928674042225, + "skip_count": 1.0, + "step": 3054, + "text_loss": 0.43144503235816956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.059326171875, + "learning_rate": 0.00085163317778794, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4931504.0, + "repeat_count": 0.0, + "routers_loss": 0.004558971151709557, + "skip_count": 2.0, + "step": 3056, + "text_loss": 0.5257010459899902 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008514130669462341, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4934935.0, + "repeat_count": 0.0, + "routers_loss": 0.010774781927466393, + "skip_count": 2.0, + "step": 3058, + "text_loss": 0.26061776280403137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.366304666862343, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008511928214423782, + "loss": 0.0103, + "macro_f1": 0.6601307392120361, + "num_tokens": 4938047.0, + "repeat_count": 1.0, + "routers_loss": 0.014763157814741135, + "skip_count": 2.0, + "step": 3060, + "text_loss": 0.2856905460357666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0008509724413607705, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 4941041.0, + "repeat_count": 1.0, + "routers_loss": 0.004613345488905907, + "skip_count": 0.0, + "step": 3062, + "text_loss": 0.2870287001132965 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008507519267858612, + "loss": 0.015, + "macro_f1": 1.0, + "num_tokens": 4944708.0, + "repeat_count": 1.0, + "routers_loss": 0.008584189228713512, + "skip_count": 2.0, + "step": 3064, + "text_loss": 0.15828095376491547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0008505312778021519, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 4948295.0, + "repeat_count": 0.0, + "routers_loss": 0.0014670816017314792, + "skip_count": 0.0, + "step": 3066, + "text_loss": 0.36697930097579956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0008503104944941958, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 4951983.0, + "repeat_count": 0.0, + "routers_loss": 0.005348859820514917, + "skip_count": 2.0, + "step": 3068, + "text_loss": 0.21612997353076935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008500895769465972, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 4955023.0, + "repeat_count": 0.0, + "routers_loss": 0.0013203793205320835, + "skip_count": 0.0, + "step": 3070, + "text_loss": 0.9757798314094543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.422659230995011, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008498685252440124, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 4957600.0, + "repeat_count": 0.0, + "routers_loss": 0.006907356437295675, + "skip_count": 0.0, + "step": 3072, + "text_loss": 0.356107234954834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.432051658350455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0008496473394711487, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 4960746.0, + "repeat_count": 0.0, + "routers_loss": 0.0027704904787242413, + "skip_count": 1.0, + "step": 3074, + "text_loss": 0.6812908053398132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0008494260197127649, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 4963845.0, + "repeat_count": 0.0, + "routers_loss": 0.0036796489730477333, + "skip_count": 2.0, + "step": 3076, + "text_loss": 0.7215370535850525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0008492045660536712, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 4966887.0, + "repeat_count": 0.0, + "routers_loss": 0.0037137691397219896, + "skip_count": 1.0, + "step": 3078, + "text_loss": 0.8700299859046936 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 14.460228940416789, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008489829785787291, + "loss": 0.0078, + "macro_f1": 0.8823530077934265, + "num_tokens": 4969859.0, + "repeat_count": 1.0, + "routers_loss": 0.016492314636707306, + "skip_count": 2.0, + "step": 3080, + "text_loss": 0.6520360112190247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008487612573728513, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4972628.0, + "repeat_count": 0.0, + "routers_loss": 0.004022917244583368, + "skip_count": 2.0, + "step": 3082, + "text_loss": 0.17498187720775604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008485394025210016, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 4975475.0, + "repeat_count": 0.0, + "routers_loss": 0.009141159243881702, + "skip_count": 1.0, + "step": 3084, + "text_loss": 0.5975366234779358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008483174141081956, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4978858.0, + "repeat_count": 0.0, + "routers_loss": 0.0031561285723000765, + "skip_count": 0.0, + "step": 3086, + "text_loss": 0.18748866021633148 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008480952922194991, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4982142.0, + "repeat_count": 0.0, + "routers_loss": 0.0007894713780842721, + "skip_count": 0.0, + "step": 3088, + "text_loss": 0.42083197832107544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008478730369400302, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4984872.0, + "repeat_count": 0.0, + "routers_loss": 0.0005908289458602667, + "skip_count": 0.0, + "step": 3090, + "text_loss": 0.45337188243865967 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0008476506483549573, + "loss": 0.0101, + "macro_f1": 1.0, + "num_tokens": 4988137.0, + "repeat_count": 1.0, + "routers_loss": 0.0016509373672306538, + "skip_count": 2.0, + "step": 3092, + "text_loss": 0.6397262811660767 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0008474281265495002, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 4991164.0, + "repeat_count": 0.0, + "routers_loss": 0.004088304936885834, + "skip_count": 1.0, + "step": 3094, + "text_loss": 0.18352322280406952 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008472054716089295, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 4993876.0, + "repeat_count": 0.0, + "routers_loss": 0.005200014915317297, + "skip_count": 0.0, + "step": 3096, + "text_loss": 0.2776511013507843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.544760786615791, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0008469826836185673, + "loss": 0.01, + "macro_f1": 0.6601307392120361, + "num_tokens": 4997068.0, + "repeat_count": 1.0, + "routers_loss": 0.012686059810221195, + "skip_count": 2.0, + "step": 3098, + "text_loss": 0.23209233582019806 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008467597626637858, + "loss": 0.0074, + "macro_f1": 1.0, + "num_tokens": 5000038.0, + "repeat_count": 1.0, + "routers_loss": 0.006401528604328632, + "skip_count": 2.0, + "step": 3100, + "text_loss": 0.45936745405197144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008465367088300093, + "loss": 0.0075, + "macro_f1": 0.3272727429866791, + "num_tokens": 5002870.0, + "repeat_count": 0.0, + "routers_loss": 0.016640547662973404, + "skip_count": 1.0, + "step": 3102, + "text_loss": 0.44502779841423035 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0008463135222027124, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 5006357.0, + "repeat_count": 0.0, + "routers_loss": 0.008411331102252007, + "skip_count": 2.0, + "step": 3104, + "text_loss": 0.3414570391178131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008460902028674204, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5009059.0, + "repeat_count": 0.0, + "routers_loss": 0.0010406570509076118, + "skip_count": 0.0, + "step": 3106, + "text_loss": 0.5931221842765808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0008458667509097098, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 5012327.0, + "repeat_count": 0.0, + "routers_loss": 0.001959054498001933, + "skip_count": 0.0, + "step": 3108, + "text_loss": 0.5191171169281006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008456431664152078, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 5015472.0, + "repeat_count": 0.0, + "routers_loss": 0.000994380097836256, + "skip_count": 0.0, + "step": 3110, + "text_loss": 0.4455361068248749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0008454194494695923, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 5018901.0, + "repeat_count": 0.0, + "routers_loss": 0.0037662344984710217, + "skip_count": 0.0, + "step": 3112, + "text_loss": 0.5335362553596497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 14.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0008451956001585923, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5022520.0, + "repeat_count": 0.0, + "routers_loss": 0.008664715103805065, + "skip_count": 3.0, + "step": 3114, + "text_loss": 0.16230148077011108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.629292632814794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.000844971618567987, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 5025505.0, + "repeat_count": 0.0, + "routers_loss": 0.0015904927859082818, + "skip_count": 0.0, + "step": 3116, + "text_loss": 0.6989432573318481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008447475047836068, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 5028767.0, + "repeat_count": 0.0, + "routers_loss": 0.005853322334587574, + "skip_count": 1.0, + "step": 3118, + "text_loss": 0.31420737504959106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 14.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008445232588913325, + "loss": 0.0115, + "macro_f1": 0.3272727429866791, + "num_tokens": 5032577.0, + "repeat_count": 0.0, + "routers_loss": 0.012760105542838573, + "skip_count": 0.0, + "step": 3120, + "text_loss": 0.5534627437591553 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008442988809770953, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 5035381.0, + "repeat_count": 0.0, + "routers_loss": 0.0022257440723478794, + "skip_count": 0.0, + "step": 3122, + "text_loss": 0.42492759227752686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008440743711268775, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 5038743.0, + "repeat_count": 0.0, + "routers_loss": 0.004648433532565832, + "skip_count": 0.0, + "step": 3124, + "text_loss": 0.16404685378074646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008438497294267117, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5041492.0, + "repeat_count": 0.0, + "routers_loss": 0.006313877180218697, + "skip_count": 0.0, + "step": 3126, + "text_loss": 0.23191484808921814 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0008436249559626807, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 5043955.0, + "repeat_count": 1.0, + "routers_loss": 0.0036270488053560257, + "skip_count": 0.0, + "step": 3128, + "text_loss": 0.5782018303871155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008434000508209187, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5047571.0, + "repeat_count": 0.0, + "routers_loss": 0.003809858812019229, + "skip_count": 1.0, + "step": 3130, + "text_loss": 0.7129825949668884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.704432051658351, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008431750140876092, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 5051608.0, + "repeat_count": 0.0, + "routers_loss": 0.0022369057405740023, + "skip_count": 0.0, + "step": 3132, + "text_loss": 0.4433445930480957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.000842949845848987, + "loss": 0.0135, + "macro_f1": 0.32098764181137085, + "num_tokens": 5054656.0, + "repeat_count": 0.0, + "routers_loss": 0.0425117202103138, + "skip_count": 2.0, + "step": 3134, + "text_loss": 0.38721024990081787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0008427245461913368, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 5059108.0, + "repeat_count": 0.0, + "routers_loss": 0.0018077283166348934, + "skip_count": 0.0, + "step": 3136, + "text_loss": 0.7496368885040283 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.12109375, + "learning_rate": 0.0008424991152009941, + "loss": 0.0111, + "macro_f1": 1.0, + "num_tokens": 5062371.0, + "repeat_count": 1.0, + "routers_loss": 0.008801834657788277, + "skip_count": 2.0, + "step": 3138, + "text_loss": 0.5337086319923401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 14.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0008422735529643444, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5065593.0, + "repeat_count": 0.0, + "routers_loss": 0.00548676960170269, + "skip_count": 3.0, + "step": 3140, + "text_loss": 0.2561623156070709 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0008420478595678233, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5068271.0, + "repeat_count": 0.0, + "routers_loss": 0.006389956455677748, + "skip_count": 0.0, + "step": 3142, + "text_loss": 0.15605193376541138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0008418220350979175, + "loss": 0.0128, + "macro_f1": 1.0, + "num_tokens": 5071358.0, + "repeat_count": 1.0, + "routers_loss": 0.012387622147798538, + "skip_count": 2.0, + "step": 3144, + "text_loss": 0.3085838258266449 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008415960796411628, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5075584.0, + "repeat_count": 0.0, + "routers_loss": 0.00311864772811532, + "skip_count": 1.0, + "step": 3146, + "text_loss": 0.4786977469921112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.779571470501908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0008413699932841461, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5078388.0, + "repeat_count": 0.0, + "routers_loss": 0.0030679800547659397, + "skip_count": 0.0, + "step": 3148, + "text_loss": 0.5222916603088379 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.788963897857352, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008411437761135039, + "loss": 0.011, + "macro_f1": 1.0, + "num_tokens": 5081584.0, + "repeat_count": 1.0, + "routers_loss": 0.012907958589494228, + "skip_count": 2.0, + "step": 3150, + "text_loss": 0.5369884371757507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0008409174282159232, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 5084450.0, + "repeat_count": 0.0, + "routers_loss": 0.012314042076468468, + "skip_count": 2.0, + "step": 3152, + "text_loss": 0.25685277581214905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.000840690949678141, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 5087865.0, + "repeat_count": 1.0, + "routers_loss": 0.00899206381291151, + "skip_count": 0.0, + "step": 3154, + "text_loss": 0.1717093288898468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.817141179923686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0008404643405869441, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 5090857.0, + "repeat_count": 0.0, + "routers_loss": 0.0013312003575265408, + "skip_count": 0.0, + "step": 3156, + "text_loss": 0.27446436882019043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0008402376010291695, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 5093917.0, + "repeat_count": 0.0, + "routers_loss": 0.002653320087119937, + "skip_count": 0.0, + "step": 3158, + "text_loss": 0.4237489402294159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008400107310917045, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5096656.0, + "repeat_count": 0.0, + "routers_loss": 0.012976993806660175, + "skip_count": 2.0, + "step": 3160, + "text_loss": 0.42361980676651 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.000839783730861486, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5099582.0, + "repeat_count": 0.0, + "routers_loss": 0.006936746649444103, + "skip_count": 2.0, + "step": 3162, + "text_loss": 0.26656073331832886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008395566004255008, + "loss": 0.0127, + "macro_f1": 0.6666666865348816, + "num_tokens": 5102908.0, + "repeat_count": 0.0, + "routers_loss": 0.006619359832257032, + "skip_count": 1.0, + "step": 3164, + "text_loss": 0.590774416923523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0008393293398707858, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 5105829.0, + "repeat_count": 0.0, + "routers_loss": 0.010120268911123276, + "skip_count": 2.0, + "step": 3166, + "text_loss": 0.605930507183075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008391019492844275, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 5109850.0, + "repeat_count": 0.0, + "routers_loss": 0.004940980114042759, + "skip_count": 2.0, + "step": 3168, + "text_loss": 0.12973152101039886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0008388744287535627, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 5113353.0, + "repeat_count": 0.0, + "routers_loss": 0.0031777634285390377, + "skip_count": 1.0, + "step": 3170, + "text_loss": 0.18577200174331665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0008386467783653775, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 5116421.0, + "repeat_count": 0.0, + "routers_loss": 0.005431659985333681, + "skip_count": 0.0, + "step": 3172, + "text_loss": 0.2302747517824173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 14.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.000838418998207108, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5119457.0, + "repeat_count": 0.0, + "routers_loss": 0.0077286697924137115, + "skip_count": 4.0, + "step": 3174, + "text_loss": 0.19606637954711914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0008381910883660399, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5123201.0, + "repeat_count": 0.0, + "routers_loss": 0.003982985392212868, + "skip_count": 0.0, + "step": 3176, + "text_loss": 0.716376006603241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0008379630489295089, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 5126035.0, + "repeat_count": 0.0, + "routers_loss": 0.005626026075333357, + "skip_count": 1.0, + "step": 3178, + "text_loss": 0.5144625902175903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008377348799849, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 5129179.0, + "repeat_count": 0.0, + "routers_loss": 0.015458245761692524, + "skip_count": 2.0, + "step": 3180, + "text_loss": 0.29887503385543823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 14.939242735544468, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008375065816196479, + "loss": 0.0086, + "macro_f1": 0.5492662787437439, + "num_tokens": 5132149.0, + "repeat_count": 0.0, + "routers_loss": 0.012210468761622906, + "skip_count": 2.0, + "step": 3182, + "text_loss": 0.8981851935386658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008372781539212371, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5135287.0, + "repeat_count": 0.0, + "routers_loss": 0.0052537876181304455, + "skip_count": 0.0, + "step": 3184, + "text_loss": 0.4245666563510895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0008370495969772014, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5138589.0, + "repeat_count": 0.0, + "routers_loss": 0.012873421423137188, + "skip_count": 2.0, + "step": 3186, + "text_loss": 0.40581050515174866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 14.9674200176108, + "f1_execute": 0.95652174949646, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0008368209108751244, + "loss": 0.0127, + "macro_f1": 0.6521739363670349, + "num_tokens": 5141635.0, + "repeat_count": 2.0, + "routers_loss": 0.07720445841550827, + "skip_count": 4.0, + "step": 3188, + "text_loss": 0.3755173981189728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0008365920957026389, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5144728.0, + "repeat_count": 0.0, + "routers_loss": 0.001440995605662465, + "skip_count": 0.0, + "step": 3190, + "text_loss": 0.5067034363746643 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.986204872321691, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008363631515474275, + "loss": 0.0089, + "macro_f1": 0.6538461446762085, + "num_tokens": 5147963.0, + "repeat_count": 1.0, + "routers_loss": 0.018752984702587128, + "skip_count": 2.0, + "step": 3192, + "text_loss": 0.20224551856517792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0008361340784972217, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 5151184.0, + "repeat_count": 0.0, + "routers_loss": 0.0005360354552976787, + "skip_count": 0.0, + "step": 3194, + "text_loss": 0.4588058292865753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008359048766398031, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5153889.0, + "repeat_count": 0.0, + "routers_loss": 0.0009184491937048733, + "skip_count": 1.0, + "step": 3196, + "text_loss": 0.2980220317840576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.000835675546063002, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5156758.0, + "repeat_count": 0.0, + "routers_loss": 0.001252970308996737, + "skip_count": 0.0, + "step": 3198, + "text_loss": 0.6775755882263184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008354460868546985, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 5160247.0, + "repeat_count": 0.0, + "routers_loss": 0.0037315806839615107, + "skip_count": 0.0, + "step": 3200, + "text_loss": 0.35867011547088623 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0008352164991028217, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 5163456.0, + "repeat_count": 1.0, + "routers_loss": 0.001497485558502376, + "skip_count": 0.0, + "step": 3202, + "text_loss": 0.690290093421936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008349867828953501, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 5166139.0, + "repeat_count": 0.0, + "routers_loss": 0.001051135826855898, + "skip_count": 0.0, + "step": 3204, + "text_loss": 0.3340415954589844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008347569383203113, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 5169009.0, + "repeat_count": 0.0, + "routers_loss": 0.0010544003453105688, + "skip_count": 0.0, + "step": 3206, + "text_loss": 0.8584878444671631 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008345269654657823, + "loss": 0.0085, + "macro_f1": 1.0, + "num_tokens": 5172618.0, + "repeat_count": 1.0, + "routers_loss": 0.007312417030334473, + "skip_count": 1.0, + "step": 3208, + "text_loss": 0.19500218331813812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008342968644198892, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 5175857.0, + "repeat_count": 0.0, + "routers_loss": 0.00276504410430789, + "skip_count": 0.0, + "step": 3210, + "text_loss": 0.5446314215660095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0008340666352708068, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 5178585.0, + "repeat_count": 0.0, + "routers_loss": 0.002669303445145488, + "skip_count": 0.0, + "step": 3212, + "text_loss": 0.3687484860420227 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008338362781067596, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5181777.0, + "repeat_count": 0.0, + "routers_loss": 0.0031585274264216423, + "skip_count": 0.0, + "step": 3214, + "text_loss": 0.27325859665870667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.000833605793016021, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 5184312.0, + "repeat_count": 0.0, + "routers_loss": 0.008807534351944923, + "skip_count": 2.0, + "step": 3216, + "text_loss": 0.4466548562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008333751800869133, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5187497.0, + "repeat_count": 0.0, + "routers_loss": 0.003171310294419527, + "skip_count": 0.0, + "step": 3218, + "text_loss": 0.5423526763916016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0008331444394078076, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 5190982.0, + "repeat_count": 0.0, + "routers_loss": 0.0016481258207932115, + "skip_count": 2.0, + "step": 3220, + "text_loss": 0.48984917998313904 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000832913571067124, + "loss": 0.0107, + "macro_f1": 1.0, + "num_tokens": 5194044.0, + "repeat_count": 1.0, + "routers_loss": 0.003957313951104879, + "skip_count": 1.0, + "step": 3222, + "text_loss": 0.4533331096172333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008326825751533322, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5197092.0, + "repeat_count": 0.0, + "routers_loss": 0.0016904744552448392, + "skip_count": 0.0, + "step": 3224, + "text_loss": 0.5538802742958069 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0008324514517549501, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5199941.0, + "repeat_count": 0.0, + "routers_loss": 0.005608258303254843, + "skip_count": 1.0, + "step": 3226, + "text_loss": 0.416242778301239 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 15.154975051364836, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008322202009605444, + "loss": 0.0072, + "macro_f1": 0.8823530077934265, + "num_tokens": 5202618.0, + "repeat_count": 1.0, + "routers_loss": 0.020965175703167915, + "skip_count": 2.0, + "step": 3228, + "text_loss": 0.17496295273303986 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 15.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008319888228587311, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 5206414.0, + "repeat_count": 1.0, + "routers_loss": 0.021259209141135216, + "skip_count": 5.0, + "step": 3230, + "text_loss": 0.22471418976783752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0008317573175381745, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 5209768.0, + "repeat_count": 0.0, + "routers_loss": 0.0018647604156285524, + "skip_count": 0.0, + "step": 3232, + "text_loss": 0.4415269196033478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008315256850875881, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5213257.0, + "repeat_count": 0.0, + "routers_loss": 0.002345515415072441, + "skip_count": 0.0, + "step": 3234, + "text_loss": 0.347247838973999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 15.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008312939255957336, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 5215800.0, + "repeat_count": 0.0, + "routers_loss": 0.007112892810255289, + "skip_count": 3.0, + "step": 3236, + "text_loss": 0.31091734766960144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008310620391514219, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5219205.0, + "repeat_count": 0.0, + "routers_loss": 0.00432228296995163, + "skip_count": 0.0, + "step": 3238, + "text_loss": 0.3421775996685028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0008308300258435124, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 5222422.0, + "repeat_count": 0.0, + "routers_loss": 0.0076514314860105515, + "skip_count": 2.0, + "step": 3240, + "text_loss": 0.22378318011760712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008305978857609128, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 5225625.0, + "repeat_count": 0.0, + "routers_loss": 0.0007617069641128182, + "skip_count": 0.0, + "step": 3242, + "text_loss": 0.5880323648452759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0008303656189925799, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5229113.0, + "repeat_count": 0.0, + "routers_loss": 0.0017418119823560119, + "skip_count": 0.0, + "step": 3244, + "text_loss": 0.3302813768386841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008301332256275183, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5232061.0, + "repeat_count": 0.0, + "routers_loss": 0.0026667986530810595, + "skip_count": 0.0, + "step": 3246, + "text_loss": 0.5679706335067749 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0008299007057547821, + "loss": 0.0106, + "macro_f1": 1.0, + "num_tokens": 5235279.0, + "repeat_count": 1.0, + "routers_loss": 0.011016624979674816, + "skip_count": 2.0, + "step": 3248, + "text_loss": 0.5081504583358765 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008296680594634731, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 5239655.0, + "repeat_count": 1.0, + "routers_loss": 0.005492044147104025, + "skip_count": 0.0, + "step": 3250, + "text_loss": 0.14675180613994598 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0008294352868427418, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5243579.0, + "repeat_count": 0.0, + "routers_loss": 0.00404445780441165, + "skip_count": 1.0, + "step": 3252, + "text_loss": 0.4201085865497589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0008292023879817871, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 5247059.0, + "repeat_count": 0.0, + "routers_loss": 0.006886140909045935, + "skip_count": 1.0, + "step": 3254, + "text_loss": 0.2289208322763443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008289693629698564, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 5249940.0, + "repeat_count": 0.0, + "routers_loss": 0.0005736657767556608, + "skip_count": 0.0, + "step": 3256, + "text_loss": 0.5670450925827026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.295861461696507, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0008287362118962452, + "loss": 0.006, + "macro_f1": 0.3272727429866791, + "num_tokens": 5253580.0, + "repeat_count": 0.0, + "routers_loss": 0.011349895037710667, + "skip_count": 1.0, + "step": 3258, + "text_loss": 0.5042323470115662 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0008285029348502973, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5257080.0, + "repeat_count": 0.0, + "routers_loss": 0.0013626761501654983, + "skip_count": 0.0, + "step": 3260, + "text_loss": 0.3227672874927521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0008282695319214053, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5259951.0, + "repeat_count": 0.0, + "routers_loss": 0.00471635302528739, + "skip_count": 0.0, + "step": 3262, + "text_loss": 0.20773714780807495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008280360031990093, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 5263314.0, + "repeat_count": 0.0, + "routers_loss": 0.010472415015101433, + "skip_count": 2.0, + "step": 3264, + "text_loss": 0.34397366642951965 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.333431171118287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.000827802348772598, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 5267358.0, + "repeat_count": 0.0, + "routers_loss": 0.0007814752752892673, + "skip_count": 0.0, + "step": 3266, + "text_loss": 0.747342586517334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008275685687317084, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5270400.0, + "repeat_count": 0.0, + "routers_loss": 0.000902949133887887, + "skip_count": 0.0, + "step": 3268, + "text_loss": 0.43782034516334534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008273346631659252, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5273147.0, + "repeat_count": 0.0, + "routers_loss": 0.00043462219764478505, + "skip_count": 0.0, + "step": 3270, + "text_loss": 0.6358205080032349 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008271006321648816, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 5277638.0, + "repeat_count": 0.0, + "routers_loss": 0.002211218234151602, + "skip_count": 0.0, + "step": 3272, + "text_loss": 0.20220105350017548 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008268664758182589, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5280638.0, + "repeat_count": 1.0, + "routers_loss": 0.010536720044910908, + "skip_count": 0.0, + "step": 3274, + "text_loss": 0.7579061388969421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008266321942157859, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5283847.0, + "repeat_count": 0.0, + "routers_loss": 0.0017158017726615071, + "skip_count": 0.0, + "step": 3276, + "text_loss": 0.669302761554718 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.389785735250953, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008263977874472399, + "loss": 0.0088, + "macro_f1": 0.9544159770011902, + "num_tokens": 5286627.0, + "repeat_count": 5.0, + "routers_loss": 0.011220700107514858, + "skip_count": 4.0, + "step": 3278, + "text_loss": 0.8703984022140503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008261632556024461, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5289766.0, + "repeat_count": 0.0, + "routers_loss": 0.0020442772656679153, + "skip_count": 0.0, + "step": 3280, + "text_loss": 0.5009346008300781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.0008259285987712774, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5293010.0, + "repeat_count": 0.0, + "routers_loss": 0.005645765457302332, + "skip_count": 0.0, + "step": 3282, + "text_loss": 0.2546011209487915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008256938170436549, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 5296732.0, + "repeat_count": 0.0, + "routers_loss": 0.0027385836001485586, + "skip_count": 2.0, + "step": 3284, + "text_loss": 0.5244000554084778 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008254589105095473, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 5299926.0, + "repeat_count": 1.0, + "routers_loss": 0.007451715879142284, + "skip_count": 1.0, + "step": 3286, + "text_loss": 0.28979742527008057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0008252238792589711, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5303006.0, + "repeat_count": 0.0, + "routers_loss": 0.004805843345820904, + "skip_count": 2.0, + "step": 3288, + "text_loss": 0.5131978392601013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000824988723381991, + "loss": 0.0091, + "macro_f1": 0.3272727429866791, + "num_tokens": 5306953.0, + "repeat_count": 0.0, + "routers_loss": 0.010639613494277, + "skip_count": 1.0, + "step": 3290, + "text_loss": 0.4901447296142578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 15.455532726739067, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008247534429687191, + "loss": 0.007, + "macro_f1": 0.5492662787437439, + "num_tokens": 5310516.0, + "repeat_count": 0.0, + "routers_loss": 0.013625577092170715, + "skip_count": 2.0, + "step": 3292, + "text_loss": 0.2124534696340561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008245180381093152, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 5313959.0, + "repeat_count": 0.0, + "routers_loss": 0.004958513658493757, + "skip_count": 1.0, + "step": 3294, + "text_loss": 0.46682238578796387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008242825088939867, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5316609.0, + "repeat_count": 0.0, + "routers_loss": 0.003962756600230932, + "skip_count": 0.0, + "step": 3296, + "text_loss": 0.7010108232498169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008240468554129892, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5319638.0, + "repeat_count": 0.0, + "routers_loss": 0.0006996620795689523, + "skip_count": 0.0, + "step": 3298, + "text_loss": 0.4966355860233307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0008238110777566255, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 5323019.0, + "repeat_count": 0.0, + "routers_loss": 0.0016031896229833364, + "skip_count": 0.0, + "step": 3300, + "text_loss": 0.38668957352638245 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0008235751760152459, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 5326099.0, + "repeat_count": 2.0, + "routers_loss": 0.00344281829893589, + "skip_count": 2.0, + "step": 3302, + "text_loss": 0.5330720543861389 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008233391502792484, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5328993.0, + "repeat_count": 0.0, + "routers_loss": 0.007886730134487152, + "skip_count": 1.0, + "step": 3304, + "text_loss": 0.5470269322395325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008231030006390786, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 5331554.0, + "repeat_count": 0.0, + "routers_loss": 0.008180000819265842, + "skip_count": 1.0, + "step": 3306, + "text_loss": 0.4023340344429016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0008228667271852294, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 5335712.0, + "repeat_count": 0.0, + "routers_loss": 0.0002942821884062141, + "skip_count": 0.0, + "step": 3308, + "text_loss": 0.5306711792945862 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008226303300082414, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5338701.0, + "repeat_count": 0.0, + "routers_loss": 0.0006134595023468137, + "skip_count": 0.0, + "step": 3310, + "text_loss": 0.5906263589859009 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0008223938091987022, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5342274.0, + "repeat_count": 0.0, + "routers_loss": 0.0016656654188409448, + "skip_count": 0.0, + "step": 3312, + "text_loss": 0.5201764106750488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008221571648472472, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5345185.0, + "repeat_count": 0.0, + "routers_loss": 0.0038612703792750835, + "skip_count": 0.0, + "step": 3314, + "text_loss": 0.36633720993995667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.568241855004402, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008219203970445589, + "loss": 0.011, + "macro_f1": 0.3272727429866791, + "num_tokens": 5348804.0, + "repeat_count": 0.0, + "routers_loss": 0.009782899171113968, + "skip_count": 1.0, + "step": 3316, + "text_loss": 0.3117460012435913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.577634282359847, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008216835058813672, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 5351896.0, + "repeat_count": 0.0, + "routers_loss": 0.007713229861110449, + "skip_count": 0.0, + "step": 3318, + "text_loss": 0.253496378660202 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008214464914484492, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 5355058.0, + "repeat_count": 0.0, + "routers_loss": 0.006227815989404917, + "skip_count": 2.0, + "step": 3320, + "text_loss": 0.32693132758140564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008212093538366292, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 5358365.0, + "repeat_count": 0.0, + "routers_loss": 0.002601418411359191, + "skip_count": 0.0, + "step": 3322, + "text_loss": 0.40394455194473267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 15.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000820972093136779, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5360981.0, + "repeat_count": 0.0, + "routers_loss": 0.005545300897210836, + "skip_count": 3.0, + "step": 3324, + "text_loss": 0.6758295893669128 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0008207347094398172, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 5364018.0, + "repeat_count": 1.0, + "routers_loss": 0.001924700103700161, + "skip_count": 0.0, + "step": 3326, + "text_loss": 0.5196860432624817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0008204972028367097, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5366986.0, + "repeat_count": 0.0, + "routers_loss": 0.012254828587174416, + "skip_count": 1.0, + "step": 3328, + "text_loss": 0.24661913514137268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0008202595734184694, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5371463.0, + "repeat_count": 0.0, + "routers_loss": 0.005094083491712809, + "skip_count": 0.0, + "step": 3330, + "text_loss": 0.2525769770145416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.643381273847961, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008200218212761566, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 5374823.0, + "repeat_count": 1.0, + "routers_loss": 0.0025883198250085115, + "skip_count": 0.0, + "step": 3332, + "text_loss": 0.21849912405014038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.000819783946500878, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5377640.0, + "repeat_count": 0.0, + "routers_loss": 0.008240507915616035, + "skip_count": 0.0, + "step": 3334, + "text_loss": 0.2662734091281891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 15.66216612855885, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.050537109375, + "learning_rate": 0.000819545949183788, + "loss": 0.01, + "macro_f1": 0.5934640765190125, + "num_tokens": 5380593.0, + "repeat_count": 0.0, + "routers_loss": 0.038378193974494934, + "skip_count": 3.0, + "step": 3336, + "text_loss": 0.2431795746088028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008193078294160874, + "loss": 0.0097, + "macro_f1": 1.0, + "num_tokens": 5384487.0, + "repeat_count": 1.0, + "routers_loss": 0.005926199723035097, + "skip_count": 1.0, + "step": 3338, + "text_loss": 0.5663705468177795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0008190695872890242, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5387511.0, + "repeat_count": 0.0, + "routers_loss": 0.010842559859156609, + "skip_count": 2.0, + "step": 3340, + "text_loss": 0.11517292261123657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008188312228938933, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 5390698.0, + "repeat_count": 0.0, + "routers_loss": 0.001304097007960081, + "skip_count": 0.0, + "step": 3342, + "text_loss": 0.4827076196670532 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008185927363220363, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5393778.0, + "repeat_count": 1.0, + "routers_loss": 0.005354117136448622, + "skip_count": 0.0, + "step": 3344, + "text_loss": 0.44467049837112427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008183541276648418, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5396925.0, + "repeat_count": 0.0, + "routers_loss": 0.004800073802471161, + "skip_count": 2.0, + "step": 3346, + "text_loss": 0.2032834142446518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0008181153970137449, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 5400522.0, + "repeat_count": 0.0, + "routers_loss": 0.0021674633026123047, + "skip_count": 0.0, + "step": 3348, + "text_loss": 0.4507528841495514 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.727913120046962, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0008178765444602278, + "loss": 0.0117, + "macro_f1": 0.8820862174034119, + "num_tokens": 5403526.0, + "repeat_count": 2.0, + "routers_loss": 0.04263930395245552, + "skip_count": 2.0, + "step": 3350, + "text_loss": 0.3606615960597992 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008176375700958194, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5407127.0, + "repeat_count": 1.0, + "routers_loss": 0.006953123956918716, + "skip_count": 0.0, + "step": 3352, + "text_loss": 0.2290353775024414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008173984740120948, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 5410829.0, + "repeat_count": 0.0, + "routers_loss": 0.0014363783411681652, + "skip_count": 0.0, + "step": 3354, + "text_loss": 0.4220392405986786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0008171592563006762, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5414152.0, + "repeat_count": 0.0, + "routers_loss": 0.00202389364130795, + "skip_count": 1.0, + "step": 3356, + "text_loss": 0.37729766964912415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008169199170532323, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 5417312.0, + "repeat_count": 0.0, + "routers_loss": 0.006253739818930626, + "skip_count": 2.0, + "step": 3358, + "text_loss": 0.1304289996623993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0008166804563614785, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 5421227.0, + "repeat_count": 2.0, + "routers_loss": 0.01622140221297741, + "skip_count": 2.0, + "step": 3360, + "text_loss": 0.298664391040802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0008164408743171763, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 5424646.0, + "repeat_count": 1.0, + "routers_loss": 0.0037176944315433502, + "skip_count": 2.0, + "step": 3362, + "text_loss": 0.12147632241249084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008162011710121339, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 5427897.0, + "repeat_count": 0.0, + "routers_loss": 0.0020403533708304167, + "skip_count": 1.0, + "step": 3364, + "text_loss": 0.2656533420085907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.803052538890519, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008159613465382066, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5430474.0, + "repeat_count": 0.0, + "routers_loss": 0.0018634048756211996, + "skip_count": 0.0, + "step": 3366, + "text_loss": 0.9133086204528809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0008157214009872951, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5433113.0, + "repeat_count": 0.0, + "routers_loss": 0.012944488786160946, + "skip_count": 2.0, + "step": 3368, + "text_loss": 0.24352453649044037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05712890625, + "learning_rate": 0.0008154813344513472, + "loss": 0.0143, + "macro_f1": 0.6666666865348816, + "num_tokens": 5436259.0, + "repeat_count": 0.0, + "routers_loss": 0.002347963862121105, + "skip_count": 2.0, + "step": 3370, + "text_loss": 0.7601244449615479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0008152411470223568, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 5439126.0, + "repeat_count": 0.0, + "routers_loss": 0.0016609140438959002, + "skip_count": 0.0, + "step": 3372, + "text_loss": 0.5551947355270386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008150008387923643, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 5442739.0, + "repeat_count": 0.0, + "routers_loss": 0.008321396075189114, + "skip_count": 0.0, + "step": 3374, + "text_loss": 0.25028282403945923 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 15.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.000814760409853456, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 5445247.0, + "repeat_count": 2.0, + "routers_loss": 0.009738070890307426, + "skip_count": 1.0, + "step": 3376, + "text_loss": 0.37271201610565186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008145198602977651, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5449044.0, + "repeat_count": 0.0, + "routers_loss": 0.0028421466704458, + "skip_count": 0.0, + "step": 3378, + "text_loss": 0.1458655595779419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.868799530378633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0008142791902174701, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 5453063.0, + "repeat_count": 0.0, + "routers_loss": 0.0015170135302469134, + "skip_count": 0.0, + "step": 3380, + "text_loss": 0.5548722743988037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0008140383997047966, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 5455814.0, + "repeat_count": 0.0, + "routers_loss": 0.0022444510832428932, + "skip_count": 1.0, + "step": 3382, + "text_loss": 0.8034513592720032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000813797488852016, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5459392.0, + "repeat_count": 0.0, + "routers_loss": 0.00038578867679461837, + "skip_count": 0.0, + "step": 3384, + "text_loss": 0.6940088868141174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.896976812444967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008135564577514458, + "loss": 0.011, + "macro_f1": 0.3333333432674408, + "num_tokens": 5462413.0, + "repeat_count": 0.0, + "routers_loss": 0.0019727381877601147, + "skip_count": 0.0, + "step": 3386, + "text_loss": 0.5124650597572327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0008133153064954495, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 5465552.0, + "repeat_count": 0.0, + "routers_loss": 0.0019896167796105146, + "skip_count": 0.0, + "step": 3388, + "text_loss": 0.4292517900466919 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008130740351764367, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 5468573.0, + "repeat_count": 1.0, + "routers_loss": 0.0030118159484118223, + "skip_count": 1.0, + "step": 3390, + "text_loss": 0.48903173208236694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.000812832643886863, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5471547.0, + "repeat_count": 0.0, + "routers_loss": 0.005084246397018433, + "skip_count": 2.0, + "step": 3392, + "text_loss": 0.35789889097213745 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008125911327192299, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5474331.0, + "repeat_count": 0.0, + "routers_loss": 0.0008874498889781535, + "skip_count": 0.0, + "step": 3394, + "text_loss": 0.6267408728599548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008123495017660851, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5477633.0, + "repeat_count": 0.0, + "routers_loss": 0.001794386887922883, + "skip_count": 0.0, + "step": 3396, + "text_loss": 0.3701885938644409 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008121077511200221, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5481277.0, + "repeat_count": 0.0, + "routers_loss": 0.002140481723472476, + "skip_count": 0.0, + "step": 3398, + "text_loss": 0.6362857818603516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.00081186588087368, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 5484237.0, + "repeat_count": 0.0, + "routers_loss": 0.000867189432028681, + "skip_count": 0.0, + "step": 3400, + "text_loss": 1.0847382545471191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008116238911197442, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5487423.0, + "repeat_count": 0.0, + "routers_loss": 0.0029817656613886356, + "skip_count": 0.0, + "step": 3402, + "text_loss": 0.3813740313053131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008113817819509454, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5490155.0, + "repeat_count": 0.0, + "routers_loss": 0.0035141287371516228, + "skip_count": 0.0, + "step": 3404, + "text_loss": 0.2113083451986313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0008111395534600603, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5493415.0, + "repeat_count": 0.0, + "routers_loss": 0.003317659953609109, + "skip_count": 0.0, + "step": 3406, + "text_loss": 0.5869330167770386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008108972057399114, + "loss": 0.0123, + "macro_f1": 0.6666666865348816, + "num_tokens": 5496032.0, + "repeat_count": 0.0, + "routers_loss": 0.003833734430372715, + "skip_count": 2.0, + "step": 3408, + "text_loss": 0.2938928008079529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11328125, + "learning_rate": 0.0008106547388833669, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 5498890.0, + "repeat_count": 0.0, + "routers_loss": 0.002622978063300252, + "skip_count": 1.0, + "step": 3410, + "text_loss": 0.3130980432033539 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008104121529833402, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 5502010.0, + "repeat_count": 1.0, + "routers_loss": 0.007447598036378622, + "skip_count": 0.0, + "step": 3412, + "text_loss": 0.4413072466850281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.000810169448132791, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5505212.0, + "repeat_count": 0.0, + "routers_loss": 0.0031087708193808794, + "skip_count": 1.0, + "step": 3414, + "text_loss": 0.2910428047180176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.037569709421778, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008099266244247243, + "loss": 0.0082, + "macro_f1": 0.3272727429866791, + "num_tokens": 5508755.0, + "repeat_count": 0.0, + "routers_loss": 0.02510393038392067, + "skip_count": 1.0, + "step": 3416, + "text_loss": 0.33022749423980713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008096836819521903, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5512034.0, + "repeat_count": 0.0, + "routers_loss": 0.0020537273958325386, + "skip_count": 1.0, + "step": 3418, + "text_loss": 0.4731218218803406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0008094406208082853, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5515707.0, + "repeat_count": 0.0, + "routers_loss": 0.004218162503093481, + "skip_count": 2.0, + "step": 3420, + "text_loss": 0.23429590463638306 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 16.065746991488112, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0869140625, + "learning_rate": 0.0008091974410861507, + "loss": 0.0069, + "macro_f1": 0.9265305995941162, + "num_tokens": 5518436.0, + "repeat_count": 1.0, + "routers_loss": 0.013488355092704296, + "skip_count": 3.0, + "step": 3422, + "text_loss": 0.45768749713897705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008089541428789733, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5522368.0, + "repeat_count": 0.0, + "routers_loss": 0.0010335417464375496, + "skip_count": 1.0, + "step": 3424, + "text_loss": 0.43423423171043396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0008087107262799855, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 5526061.0, + "repeat_count": 0.0, + "routers_loss": 0.002134323585778475, + "skip_count": 0.0, + "step": 3426, + "text_loss": 0.4031757414340973 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.0008084671913824651, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5529284.0, + "repeat_count": 0.0, + "routers_loss": 0.0097216060385108, + "skip_count": 2.0, + "step": 3428, + "text_loss": 0.2836039960384369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.000808223538279735, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 5532159.0, + "repeat_count": 0.0, + "routers_loss": 0.001684269867837429, + "skip_count": 0.0, + "step": 3430, + "text_loss": 0.5804527401924133 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008079797670651637, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 5536050.0, + "repeat_count": 1.0, + "routers_loss": 0.013918434269726276, + "skip_count": 1.0, + "step": 3432, + "text_loss": 0.31325826048851013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008077358778321647, + "loss": 0.011, + "macro_f1": 0.3333333432674408, + "num_tokens": 5538885.0, + "repeat_count": 0.0, + "routers_loss": 0.0007751787197776139, + "skip_count": 0.0, + "step": 3434, + "text_loss": 0.783108115196228 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.131493982976224, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008074918706741966, + "loss": 0.0063, + "macro_f1": 0.9262410998344421, + "num_tokens": 5541909.0, + "repeat_count": 3.0, + "routers_loss": 0.021819550544023514, + "skip_count": 2.0, + "step": 3436, + "text_loss": 0.6558083295822144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.14088641033167, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0008072477456847638, + "loss": 0.0057, + "macro_f1": 0.3272727429866791, + "num_tokens": 5545101.0, + "repeat_count": 1.0, + "routers_loss": 0.03309348225593567, + "skip_count": 0.0, + "step": 3438, + "text_loss": 0.9877075552940369 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008070035029574151, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 5548971.0, + "repeat_count": 1.0, + "routers_loss": 0.008696741424500942, + "skip_count": 1.0, + "step": 3440, + "text_loss": 0.24766330420970917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.000806759142585745, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 5552174.0, + "repeat_count": 0.0, + "routers_loss": 0.004240929149091244, + "skip_count": 3.0, + "step": 3442, + "text_loss": 0.37255001068115234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008065146646633927, + "loss": 0.0088, + "macro_f1": 0.6666666865348816, + "num_tokens": 5555005.0, + "repeat_count": 0.0, + "routers_loss": 0.014345484785735607, + "skip_count": 1.0, + "step": 3444, + "text_loss": 0.26157206296920776 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008062700692840428, + "loss": 0.0083, + "macro_f1": 1.0, + "num_tokens": 5559127.0, + "repeat_count": 1.0, + "routers_loss": 0.008315163664519787, + "skip_count": 2.0, + "step": 3446, + "text_loss": 0.21971040964126587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 16.187848547108892, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008060253565414246, + "loss": 0.009, + "macro_f1": 0.5934640765190125, + "num_tokens": 5562254.0, + "repeat_count": 0.0, + "routers_loss": 0.009582413360476494, + "skip_count": 3.0, + "step": 3448, + "text_loss": 0.6758295893669128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0008057805265293124, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 5565515.0, + "repeat_count": 0.0, + "routers_loss": 0.002429503947496414, + "skip_count": 0.0, + "step": 3450, + "text_loss": 0.696592390537262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008055355793415257, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5568392.0, + "repeat_count": 0.0, + "routers_loss": 0.0007724192109890282, + "skip_count": 0.0, + "step": 3452, + "text_loss": 0.7092870473861694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008052905150719285, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 5571090.0, + "repeat_count": 0.0, + "routers_loss": 0.0010859938338398933, + "skip_count": 0.0, + "step": 3454, + "text_loss": 0.6593860387802124 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008050453338144301, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 5574552.0, + "repeat_count": 1.0, + "routers_loss": 0.0030258705373853445, + "skip_count": 1.0, + "step": 3456, + "text_loss": 0.3479384481906891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008048000356629844, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 5577484.0, + "repeat_count": 0.0, + "routers_loss": 0.005052885971963406, + "skip_count": 2.0, + "step": 3458, + "text_loss": 0.21858671307563782 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0008045546207115901, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 5581605.0, + "repeat_count": 1.0, + "routers_loss": 0.009976249188184738, + "skip_count": 3.0, + "step": 3460, + "text_loss": 0.16868001222610474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0008043090890542904, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5584994.0, + "repeat_count": 0.0, + "routers_loss": 0.00270817126147449, + "skip_count": 0.0, + "step": 3462, + "text_loss": 0.785690426826477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008040634407851739, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 5588067.0, + "repeat_count": 0.0, + "routers_loss": 0.0018436965765431523, + "skip_count": 0.0, + "step": 3464, + "text_loss": 0.5006644129753113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0008038176759983731, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5590789.0, + "repeat_count": 0.0, + "routers_loss": 0.008516279980540276, + "skip_count": 2.0, + "step": 3466, + "text_loss": 0.20963478088378906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0008035717947880659, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 5593472.0, + "repeat_count": 0.0, + "routers_loss": 0.0016293043736368418, + "skip_count": 0.0, + "step": 3468, + "text_loss": 0.7376078963279724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0008033257972484742, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5596108.0, + "repeat_count": 0.0, + "routers_loss": 0.002364142332226038, + "skip_count": 0.0, + "step": 3470, + "text_loss": 0.5156455039978027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008030796834738649, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5599103.0, + "repeat_count": 0.0, + "routers_loss": 0.008872323669493198, + "skip_count": 0.0, + "step": 3472, + "text_loss": 0.2996419668197632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008028334535585491, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5602410.0, + "repeat_count": 0.0, + "routers_loss": 0.011508257128298283, + "skip_count": 3.0, + "step": 3474, + "text_loss": 0.25438693165779114 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0008025871075968827, + "loss": 0.0106, + "macro_f1": 1.0, + "num_tokens": 5605424.0, + "repeat_count": 2.0, + "routers_loss": 0.017225435003638268, + "skip_count": 2.0, + "step": 3476, + "text_loss": 0.2549574077129364 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.328734957440563, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008023406456832657, + "loss": 0.0111, + "macro_f1": 0.9262410998344421, + "num_tokens": 5608266.0, + "repeat_count": 3.0, + "routers_loss": 0.039165645837783813, + "skip_count": 2.0, + "step": 3478, + "text_loss": 0.1797947734594345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0008020940679121429, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5611471.0, + "repeat_count": 0.0, + "routers_loss": 0.0009718866203911602, + "skip_count": 0.0, + "step": 3480, + "text_loss": 0.8267702460289001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008018473743780036, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5615046.0, + "repeat_count": 0.0, + "routers_loss": 0.006087122485041618, + "skip_count": 2.0, + "step": 3482, + "text_loss": 0.7267677187919617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000801600565175381, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5618350.0, + "repeat_count": 0.0, + "routers_loss": 0.0007539413054473698, + "skip_count": 0.0, + "step": 3484, + "text_loss": 0.5910211801528931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008013536403988529, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 5621381.0, + "repeat_count": 0.0, + "routers_loss": 0.0008076327503658831, + "skip_count": 0.0, + "step": 3486, + "text_loss": 0.30616798996925354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 16.375697094217788, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008011066001430412, + "loss": 0.0086, + "macro_f1": 0.6122449040412903, + "num_tokens": 5624617.0, + "repeat_count": 0.0, + "routers_loss": 0.023835813626646996, + "skip_count": 4.0, + "step": 3488, + "text_loss": 0.3376443088054657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008008594445026122, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 5627989.0, + "repeat_count": 0.0, + "routers_loss": 0.004226419143378735, + "skip_count": 2.0, + "step": 3490, + "text_loss": 0.8185343146324158 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.394481948928675, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008006121735722767, + "loss": 0.0084, + "macro_f1": 0.32098764181137085, + "num_tokens": 5632286.0, + "repeat_count": 0.0, + "routers_loss": 0.0366671048104763, + "skip_count": 2.0, + "step": 3492, + "text_loss": 0.2209547609090805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008003647874467892, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 5635368.0, + "repeat_count": 1.0, + "routers_loss": 0.012956378981471062, + "skip_count": 0.0, + "step": 3494, + "text_loss": 0.20468664169311523 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0008001172862209485, + "loss": 0.0103, + "macro_f1": 0.6666666865348816, + "num_tokens": 5638440.0, + "repeat_count": 1.0, + "routers_loss": 0.0017375422175973654, + "skip_count": 0.0, + "step": 3496, + "text_loss": 0.6647221446037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 16.42265923099501, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0007998696699895976, + "loss": 0.0091, + "macro_f1": 0.6592592597007751, + "num_tokens": 5641996.0, + "repeat_count": 1.0, + "routers_loss": 0.025240756571292877, + "skip_count": 5.0, + "step": 3498, + "text_loss": 0.23892143368721008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0007996219388476236, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5645071.0, + "repeat_count": 0.0, + "routers_loss": 0.007436830550432205, + "skip_count": 1.0, + "step": 3500, + "text_loss": 0.7580804228782654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0007993740928899571, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 5648175.0, + "repeat_count": 0.0, + "routers_loss": 0.001126602990552783, + "skip_count": 0.0, + "step": 3502, + "text_loss": 0.5281378626823425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0007991261322115737, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 5650973.0, + "repeat_count": 0.0, + "routers_loss": 0.0007907263352535665, + "skip_count": 0.0, + "step": 3504, + "text_loss": 0.25220927596092224 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.000798878056907492, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 5654252.0, + "repeat_count": 2.0, + "routers_loss": 0.006263538729399443, + "skip_count": 2.0, + "step": 3506, + "text_loss": 0.46569153666496277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0007986298670727752, + "loss": 0.0098, + "macro_f1": 0.6666666865348816, + "num_tokens": 5657229.0, + "repeat_count": 0.0, + "routers_loss": 0.004049144219607115, + "skip_count": 3.0, + "step": 3508, + "text_loss": 0.15174436569213867 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 16.479013795127678, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0791015625, + "learning_rate": 0.0007983815628025301, + "loss": 0.0074, + "macro_f1": 0.9262410998344421, + "num_tokens": 5659974.0, + "repeat_count": 2.0, + "routers_loss": 0.0471976138651371, + "skip_count": 3.0, + "step": 3510, + "text_loss": 0.39072203636169434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.488406222483125, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000798133144191907, + "loss": 0.0082, + "macro_f1": 0.3272727429866791, + "num_tokens": 5662893.0, + "repeat_count": 0.0, + "routers_loss": 0.04030488431453705, + "skip_count": 1.0, + "step": 3512, + "text_loss": 0.3562147617340088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0007978846113361009, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 5666476.0, + "repeat_count": 0.0, + "routers_loss": 0.007475079502910376, + "skip_count": 1.0, + "step": 3514, + "text_loss": 0.26518192887306213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0007976359643303497, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 5669647.0, + "repeat_count": 0.0, + "routers_loss": 0.00558585487306118, + "skip_count": 2.0, + "step": 3516, + "text_loss": 0.29284560680389404 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007973872032699354, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 5673491.0, + "repeat_count": 1.0, + "routers_loss": 0.0026981087867170572, + "skip_count": 1.0, + "step": 3518, + "text_loss": 0.35089045763015747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.000797138328250184, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 5676529.0, + "repeat_count": 1.0, + "routers_loss": 0.0027328627184033394, + "skip_count": 0.0, + "step": 3520, + "text_loss": 0.41077399253845215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 16.535368359260346, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007968893393664646, + "loss": 0.01, + "macro_f1": 0.6592592597007751, + "num_tokens": 5679987.0, + "repeat_count": 1.0, + "routers_loss": 0.02695014327764511, + "skip_count": 5.0, + "step": 3522, + "text_loss": 0.44942837953567505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007966402367141903, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 5683185.0, + "repeat_count": 0.0, + "routers_loss": 0.00817026849836111, + "skip_count": 2.0, + "step": 3524, + "text_loss": 0.14528048038482666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0007963910203888176, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 5686544.0, + "repeat_count": 0.0, + "routers_loss": 0.0021973433904349804, + "skip_count": 0.0, + "step": 3526, + "text_loss": 0.22358648478984833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0007961416904858469, + "loss": 0.0078, + "macro_f1": 0.3272727429866791, + "num_tokens": 5689579.0, + "repeat_count": 0.0, + "routers_loss": 0.033712416887283325, + "skip_count": 1.0, + "step": 3528, + "text_loss": 0.3083649277687073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007958922471008217, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5692869.0, + "repeat_count": 0.0, + "routers_loss": 0.011182719841599464, + "skip_count": 2.0, + "step": 3530, + "text_loss": 0.21288011968135834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0007956426903293292, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5696007.0, + "repeat_count": 0.0, + "routers_loss": 0.0015808293828740716, + "skip_count": 0.0, + "step": 3532, + "text_loss": 0.6068631410598755 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.591722923393014, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007953930202670001, + "loss": 0.0062, + "macro_f1": 0.5492662787437439, + "num_tokens": 5699474.0, + "repeat_count": 2.0, + "routers_loss": 0.03205178305506706, + "skip_count": 0.0, + "step": 3534, + "text_loss": 0.4317135512828827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007951432370095084, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 5703483.0, + "repeat_count": 0.0, + "routers_loss": 0.003518853336572647, + "skip_count": 0.0, + "step": 3536, + "text_loss": 0.5432273149490356 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007948933406525715, + "loss": 0.01, + "macro_f1": 1.0, + "num_tokens": 5707301.0, + "repeat_count": 1.0, + "routers_loss": 0.004982157610356808, + "skip_count": 1.0, + "step": 3538, + "text_loss": 0.40061065554618835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0007946433312919502, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5710847.0, + "repeat_count": 0.0, + "routers_loss": 0.003067734418436885, + "skip_count": 0.0, + "step": 3540, + "text_loss": 0.5396234393119812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 16.629292632814792, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007943932090234486, + "loss": 0.0097, + "macro_f1": 0.5492662787437439, + "num_tokens": 5713683.0, + "repeat_count": 0.0, + "routers_loss": 0.03728383034467697, + "skip_count": 2.0, + "step": 3542, + "text_loss": 0.18310914933681488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007941429739429138, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 5716397.0, + "repeat_count": 0.0, + "routers_loss": 0.0025092530995607376, + "skip_count": 3.0, + "step": 3544, + "text_loss": 0.5806207060813904 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007938926261462366, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5719984.0, + "repeat_count": 0.0, + "routers_loss": 0.002493767999112606, + "skip_count": 0.0, + "step": 3546, + "text_loss": 0.38606807589530945 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 16.657469914881126, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05078125, + "learning_rate": 0.0007936421657293507, + "loss": 0.0094, + "macro_f1": 0.8823530077934265, + "num_tokens": 5723571.0, + "repeat_count": 1.0, + "routers_loss": 0.014810923486948013, + "skip_count": 2.0, + "step": 3548, + "text_loss": 0.49558472633361816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0007933915927882327, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5726405.0, + "repeat_count": 0.0, + "routers_loss": 0.00152928801253438, + "skip_count": 0.0, + "step": 3550, + "text_loss": 0.8674797415733337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000793140907418903, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5729955.0, + "repeat_count": 0.0, + "routers_loss": 0.005522782914340496, + "skip_count": 2.0, + "step": 3552, + "text_loss": 0.3274473249912262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007928901097174248, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5733030.0, + "repeat_count": 0.0, + "routers_loss": 0.009207013063132763, + "skip_count": 2.0, + "step": 3554, + "text_loss": 0.18237128853797913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0007926391997799039, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5735978.0, + "repeat_count": 0.0, + "routers_loss": 0.00695531303063035, + "skip_count": 0.0, + "step": 3556, + "text_loss": 0.3266434967517853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007923881777024898, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 5738901.0, + "repeat_count": 0.0, + "routers_loss": 0.002743212040513754, + "skip_count": 1.0, + "step": 3558, + "text_loss": 0.4971913695335388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0007921370435813741, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5741946.0, + "repeat_count": 1.0, + "routers_loss": 0.007037297356873751, + "skip_count": 0.0, + "step": 3560, + "text_loss": 0.5645473599433899 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007918857975127924, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5744987.0, + "repeat_count": 0.0, + "routers_loss": 0.0030746585689485073, + "skip_count": 0.0, + "step": 3562, + "text_loss": 0.17717665433883667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0007916344395930224, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 5747837.0, + "repeat_count": 0.0, + "routers_loss": 0.004522138275206089, + "skip_count": 0.0, + "step": 3564, + "text_loss": 0.7676118612289429 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.000791382969918385, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5750716.0, + "repeat_count": 0.0, + "routers_loss": 0.0026240211445838213, + "skip_count": 0.0, + "step": 3566, + "text_loss": 0.4975173771381378 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.751394188435572, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.000791131388585244, + "loss": 0.011, + "macro_f1": 0.8820862174034119, + "num_tokens": 5754368.0, + "repeat_count": 2.0, + "routers_loss": 0.021831991150975227, + "skip_count": 2.0, + "step": 3568, + "text_loss": 0.9670342206954956 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0007908796956900055, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5757076.0, + "repeat_count": 1.0, + "routers_loss": 0.0017586691537871957, + "skip_count": 0.0, + "step": 3570, + "text_loss": 0.3057977259159088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.000790627891329119, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5760613.0, + "repeat_count": 0.0, + "routers_loss": 0.005515786819159985, + "skip_count": 0.0, + "step": 3572, + "text_loss": 0.5860086679458618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007903759755990763, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 5763557.0, + "repeat_count": 0.0, + "routers_loss": 0.004096484277397394, + "skip_count": 0.0, + "step": 3574, + "text_loss": 0.17175781726837158 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.000790123948596412, + "loss": 0.0119, + "macro_f1": 0.6666666865348816, + "num_tokens": 5767430.0, + "repeat_count": 1.0, + "routers_loss": 0.005216122139245272, + "skip_count": 0.0, + "step": 3576, + "text_loss": 0.7520374059677124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0007898718104177031, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 5770175.0, + "repeat_count": 0.0, + "routers_loss": 0.0037980107590556145, + "skip_count": 0.0, + "step": 3578, + "text_loss": 0.18117885291576385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007896195611595699, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5773032.0, + "repeat_count": 0.0, + "routers_loss": 0.003672175807878375, + "skip_count": 2.0, + "step": 3580, + "text_loss": 0.7241058349609375 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0007893672009186744, + "loss": 0.0083, + "macro_f1": 1.0, + "num_tokens": 5776077.0, + "repeat_count": 1.0, + "routers_loss": 0.01229850109666586, + "skip_count": 3.0, + "step": 3582, + "text_loss": 0.29140418767929077 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007891147297917216, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5779088.0, + "repeat_count": 1.0, + "routers_loss": 0.0035251814406365156, + "skip_count": 0.0, + "step": 3584, + "text_loss": 0.1727485954761505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.000788862147875459, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 5782201.0, + "repeat_count": 0.0, + "routers_loss": 0.004725661128759384, + "skip_count": 2.0, + "step": 3586, + "text_loss": 0.43512848019599915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0007886094552666765, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5785039.0, + "repeat_count": 0.0, + "routers_loss": 0.005632172804325819, + "skip_count": 0.0, + "step": 3588, + "text_loss": 0.3534786105155945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0007883566520622062, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 5788017.0, + "repeat_count": 0.0, + "routers_loss": 0.006249965168535709, + "skip_count": 1.0, + "step": 3590, + "text_loss": 0.2089710384607315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0007881037383589229, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 5791168.0, + "repeat_count": 0.0, + "routers_loss": 0.0013797614956274629, + "skip_count": 0.0, + "step": 3592, + "text_loss": 0.4349329471588135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0007878507142537436, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 5793927.0, + "repeat_count": 0.0, + "routers_loss": 0.0019719740375876427, + "skip_count": 1.0, + "step": 3594, + "text_loss": 0.6087368726730347 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007875975798436274, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 5797214.0, + "repeat_count": 1.0, + "routers_loss": 0.0037070370744913816, + "skip_count": 0.0, + "step": 3596, + "text_loss": 0.4258122444152832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0007873443352255764, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5800691.0, + "repeat_count": 0.0, + "routers_loss": 0.008431311696767807, + "skip_count": 0.0, + "step": 3598, + "text_loss": 0.6006711721420288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0007870909804966337, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5804712.0, + "repeat_count": 0.0, + "routers_loss": 0.0017720256000757217, + "skip_count": 0.0, + "step": 3600, + "text_loss": 0.6055042743682861 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.911065453478134, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0007868375157538861, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 5807670.0, + "repeat_count": 1.0, + "routers_loss": 0.010697763413190842, + "skip_count": 0.0, + "step": 3602, + "text_loss": 0.8039056658744812 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0007865839410944611, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5810880.0, + "repeat_count": 1.0, + "routers_loss": 0.0030022128485143185, + "skip_count": 0.0, + "step": 3604, + "text_loss": 0.596110463142395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0007863302566155295, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5814171.0, + "repeat_count": 0.0, + "routers_loss": 0.006257854867726564, + "skip_count": 2.0, + "step": 3606, + "text_loss": 0.5700319409370422 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0007860764624143031, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 5817607.0, + "repeat_count": 1.0, + "routers_loss": 0.004838473163545132, + "skip_count": 0.0, + "step": 3608, + "text_loss": 0.8319530487060547 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 16.94863516289991, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08154296875, + "learning_rate": 0.0007858225585880369, + "loss": 0.0067, + "macro_f1": 0.8823530077934265, + "num_tokens": 5821452.0, + "repeat_count": 1.0, + "routers_loss": 0.02173662930727005, + "skip_count": 2.0, + "step": 3610, + "text_loss": 0.3738477826118469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007855685452340269, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5824683.0, + "repeat_count": 0.0, + "routers_loss": 0.0032719180453568697, + "skip_count": 0.0, + "step": 3612, + "text_loss": 0.4054839015007019 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.967420017610802, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007853144224496118, + "loss": 0.0093, + "macro_f1": 0.3272727429866791, + "num_tokens": 5827860.0, + "repeat_count": 1.0, + "routers_loss": 0.032171256840229034, + "skip_count": 0.0, + "step": 3614, + "text_loss": 0.18112395703792572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0007850601903321716, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5831651.0, + "repeat_count": 0.0, + "routers_loss": 0.013230946846306324, + "skip_count": 1.0, + "step": 3616, + "text_loss": 0.2698844075202942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.000784805848979129, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5834369.0, + "repeat_count": 0.0, + "routers_loss": 0.00162619655020535, + "skip_count": 0.0, + "step": 3618, + "text_loss": 0.2430931180715561 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0007845513984879477, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 5838102.0, + "repeat_count": 1.0, + "routers_loss": 0.002781603019684553, + "skip_count": 0.0, + "step": 3620, + "text_loss": 0.4968300759792328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007842968389561337, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 5841029.0, + "repeat_count": 0.0, + "routers_loss": 0.0023873315658420324, + "skip_count": 0.0, + "step": 3622, + "text_loss": 0.5842974781990051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0007840421704812346, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 5845158.0, + "repeat_count": 0.0, + "routers_loss": 0.00400173757225275, + "skip_count": 1.0, + "step": 3624, + "text_loss": 0.8312450647354126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00078378739316084, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 5849175.0, + "repeat_count": 0.0, + "routers_loss": 0.0004974664188921452, + "skip_count": 0.0, + "step": 3626, + "text_loss": 0.48637253046035767 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 17.032873495744056, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.10693359375, + "learning_rate": 0.000783532507092581, + "loss": 0.0079, + "macro_f1": 0.9555556178092957, + "num_tokens": 5852020.0, + "repeat_count": 1.0, + "routers_loss": 0.02555239573121071, + "skip_count": 5.0, + "step": 3628, + "text_loss": 0.5407033562660217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007832775123741306, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5854873.0, + "repeat_count": 0.0, + "routers_loss": 0.0025962977670133114, + "skip_count": 0.0, + "step": 3630, + "text_loss": 0.618230938911438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.000783022409103203, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5858086.0, + "repeat_count": 0.0, + "routers_loss": 0.0029271875973790884, + "skip_count": 0.0, + "step": 3632, + "text_loss": 0.21259798109531403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007827671973775542, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 5860886.0, + "repeat_count": 0.0, + "routers_loss": 0.004102068953216076, + "skip_count": 0.0, + "step": 3634, + "text_loss": 0.4991208016872406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0007825118772949819, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5864291.0, + "repeat_count": 0.0, + "routers_loss": 0.0023497689981013536, + "skip_count": 1.0, + "step": 3636, + "text_loss": 0.3878401517868042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0007822564489533255, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 5867155.0, + "repeat_count": 0.0, + "routers_loss": 0.007680345326662064, + "skip_count": 2.0, + "step": 3638, + "text_loss": 0.6132124066352844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0007820009124504653, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5870325.0, + "repeat_count": 0.0, + "routers_loss": 0.0008242831099778414, + "skip_count": 0.0, + "step": 3640, + "text_loss": 0.3552473187446594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.098620487232168, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007817452678843236, + "loss": 0.0073, + "macro_f1": 0.6601307392120361, + "num_tokens": 5873301.0, + "repeat_count": 1.0, + "routers_loss": 0.023831043392419815, + "skip_count": 2.0, + "step": 3642, + "text_loss": 0.18363867700099945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0007814895153528635, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5876225.0, + "repeat_count": 0.0, + "routers_loss": 0.001999989850446582, + "skip_count": 0.0, + "step": 3644, + "text_loss": 0.17581747472286224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0007812336549540903, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5879501.0, + "repeat_count": 0.0, + "routers_loss": 0.001098626758903265, + "skip_count": 0.0, + "step": 3646, + "text_loss": 0.5040884613990784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.126797769298502, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007809776867860499, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 5882608.0, + "repeat_count": 0.0, + "routers_loss": 0.012210183776915073, + "skip_count": 1.0, + "step": 3648, + "text_loss": 0.27114811539649963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00078072161094683, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 5886106.0, + "repeat_count": 0.0, + "routers_loss": 0.005191771313548088, + "skip_count": 2.0, + "step": 3650, + "text_loss": 0.5167917609214783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0007804654275345591, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5889122.0, + "repeat_count": 0.0, + "routers_loss": 0.0016411367105320096, + "skip_count": 1.0, + "step": 3652, + "text_loss": 0.7691274285316467 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.154975051364836, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0007802091366474074, + "loss": 0.005, + "macro_f1": 0.8823530077934265, + "num_tokens": 5892313.0, + "repeat_count": 2.0, + "routers_loss": 0.015627093613147736, + "skip_count": 1.0, + "step": 3654, + "text_loss": 0.4646325409412384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007799527383835858, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5895577.0, + "repeat_count": 0.0, + "routers_loss": 0.0009879748104140162, + "skip_count": 0.0, + "step": 3656, + "text_loss": 0.5587969422340393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007796962328413469, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5898546.0, + "repeat_count": 0.0, + "routers_loss": 0.004864919930696487, + "skip_count": 0.0, + "step": 3658, + "text_loss": 0.6981375813484192 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007794396201189839, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 5901618.0, + "repeat_count": 1.0, + "routers_loss": 0.006617432460188866, + "skip_count": 2.0, + "step": 3660, + "text_loss": 0.22521957755088806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.192544760786618, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007791829003148312, + "loss": 0.0098, + "macro_f1": 0.6601307392120361, + "num_tokens": 5904540.0, + "repeat_count": 1.0, + "routers_loss": 0.0782252699136734, + "skip_count": 2.0, + "step": 3662, + "text_loss": 0.2649642825126648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0007789260735272647, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 5907827.0, + "repeat_count": 0.0, + "routers_loss": 0.0012057392159476876, + "skip_count": 0.0, + "step": 3664, + "text_loss": 0.6943771243095398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.0007786691398547005, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 5911163.0, + "repeat_count": 0.0, + "routers_loss": 0.007476957980543375, + "skip_count": 2.0, + "step": 3666, + "text_loss": 0.1502683162689209 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007784120993955962, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5913948.0, + "repeat_count": 1.0, + "routers_loss": 0.004082011990249157, + "skip_count": 0.0, + "step": 3668, + "text_loss": 0.4127517640590668 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 17.230114470208395, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007781549522484503, + "loss": 0.0066, + "macro_f1": 0.9265305995941162, + "num_tokens": 5917360.0, + "repeat_count": 3.0, + "routers_loss": 0.027505695819854736, + "skip_count": 1.0, + "step": 3670, + "text_loss": 0.23892618715763092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007778976985118018, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 5920524.0, + "repeat_count": 0.0, + "routers_loss": 0.0024977331049740314, + "skip_count": 2.0, + "step": 3672, + "text_loss": 0.5076471567153931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0007776403382842312, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5923632.0, + "repeat_count": 0.0, + "routers_loss": 0.0015700991498306394, + "skip_count": 0.0, + "step": 3674, + "text_loss": 0.6287924647331238 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.25829175227473, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.0007773828716643591, + "loss": 0.0085, + "macro_f1": 0.3272727429866791, + "num_tokens": 5926438.0, + "repeat_count": 1.0, + "routers_loss": 0.05108916014432907, + "skip_count": 0.0, + "step": 3676, + "text_loss": 0.26517006754875183 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007771252987508474, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5930081.0, + "repeat_count": 0.0, + "routers_loss": 0.003439917229115963, + "skip_count": 0.0, + "step": 3678, + "text_loss": 0.5189079642295837 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0007768676196423984, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 5933463.0, + "repeat_count": 1.0, + "routers_loss": 0.001935846172273159, + "skip_count": 1.0, + "step": 3680, + "text_loss": 0.6703575849533081 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 17.286469034341064, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007766098344377553, + "loss": 0.0082, + "macro_f1": 0.31446540355682373, + "num_tokens": 5937098.0, + "repeat_count": 0.0, + "routers_loss": 0.0384826585650444, + "skip_count": 2.0, + "step": 3682, + "text_loss": 0.6424444913864136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0007763519432357018, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 5940436.0, + "repeat_count": 0.0, + "routers_loss": 0.0008654671837575734, + "skip_count": 0.0, + "step": 3684, + "text_loss": 0.4189988672733307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0007760939461350623, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 5943731.0, + "repeat_count": 0.0, + "routers_loss": 0.007468715775758028, + "skip_count": 2.0, + "step": 3686, + "text_loss": 0.2875453233718872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007758358432347019, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5946707.0, + "repeat_count": 0.0, + "routers_loss": 0.001252831774763763, + "skip_count": 0.0, + "step": 3688, + "text_loss": 0.5093055367469788 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007755776346335259, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 5949833.0, + "repeat_count": 0.0, + "routers_loss": 0.001680848654359579, + "skip_count": 0.0, + "step": 3690, + "text_loss": 0.4031114876270294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0007753193204304807, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 5953095.0, + "repeat_count": 0.0, + "routers_loss": 0.0047258250415325165, + "skip_count": 2.0, + "step": 3692, + "text_loss": 0.17632785439491272 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007750609007245524, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 5955971.0, + "repeat_count": 2.0, + "routers_loss": 0.001980359200388193, + "skip_count": 4.0, + "step": 3694, + "text_loss": 0.3423727750778198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0007748023756147679, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 5958948.0, + "repeat_count": 0.0, + "routers_loss": 0.00511702848598361, + "skip_count": 0.0, + "step": 3696, + "text_loss": 0.28279972076416016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007745437452001949, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 5961819.0, + "repeat_count": 0.0, + "routers_loss": 0.0005220443126745522, + "skip_count": 0.0, + "step": 3698, + "text_loss": 0.4793325662612915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.371000880540066, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007742850095799408, + "loss": 0.0084, + "macro_f1": 0.3272727429866791, + "num_tokens": 5964625.0, + "repeat_count": 1.0, + "routers_loss": 0.06411020457744598, + "skip_count": 0.0, + "step": 3700, + "text_loss": 0.2825184464454651 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0007740261688531536, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 5967134.0, + "repeat_count": 0.0, + "routers_loss": 0.004408109001815319, + "skip_count": 3.0, + "step": 3702, + "text_loss": 0.690429151058197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0007737672231190215, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 5969831.0, + "repeat_count": 0.0, + "routers_loss": 0.0006747521692886949, + "skip_count": 0.0, + "step": 3704, + "text_loss": 0.32556024193763733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007735081724767732, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 5973015.0, + "repeat_count": 0.0, + "routers_loss": 0.0020414739847183228, + "skip_count": 0.0, + "step": 3706, + "text_loss": 0.5876469612121582 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0007732490170256769, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 5975778.0, + "repeat_count": 1.0, + "routers_loss": 0.005610425490885973, + "skip_count": 0.0, + "step": 3708, + "text_loss": 0.2968577444553375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007729897568650422, + "loss": 0.0097, + "macro_f1": 0.3333333432674408, + "num_tokens": 5979115.0, + "repeat_count": 0.0, + "routers_loss": 0.001248046406544745, + "skip_count": 0.0, + "step": 3710, + "text_loss": 0.626361608505249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0007727303920942176, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 5982213.0, + "repeat_count": 0.0, + "routers_loss": 0.005791695322841406, + "skip_count": 2.0, + "step": 3712, + "text_loss": 0.4133484661579132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 17.436747872028178, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08740234375, + "learning_rate": 0.0007724709228125922, + "loss": 0.0105, + "macro_f1": 0.5492662787437439, + "num_tokens": 5984930.0, + "repeat_count": 0.0, + "routers_loss": 0.02114664763212204, + "skip_count": 2.0, + "step": 3714, + "text_loss": 0.4646461308002472 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0007722113491195952, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 5988017.0, + "repeat_count": 2.0, + "routers_loss": 0.005913930479437113, + "skip_count": 5.0, + "step": 3716, + "text_loss": 0.15474505722522736 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0007719516711146957, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 5991562.0, + "repeat_count": 0.0, + "routers_loss": 0.0075925313867628574, + "skip_count": 2.0, + "step": 3718, + "text_loss": 0.5293686985969543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.000771691888897403, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 5994675.0, + "repeat_count": 0.0, + "routers_loss": 0.0012335237115621567, + "skip_count": 0.0, + "step": 3720, + "text_loss": 0.5210637450218201 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0007714320025672657, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 5999070.0, + "repeat_count": 0.0, + "routers_loss": 0.010582062415778637, + "skip_count": 2.0, + "step": 3722, + "text_loss": 0.2783571779727936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.4837100088054, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.000771172012223873, + "loss": 0.0078, + "macro_f1": 0.6598639488220215, + "num_tokens": 6002702.0, + "repeat_count": 1.0, + "routers_loss": 0.015008784830570221, + "skip_count": 3.0, + "step": 3724, + "text_loss": 0.358705073595047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007709119179668538, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6005517.0, + "repeat_count": 0.0, + "routers_loss": 0.00111615180503577, + "skip_count": 0.0, + "step": 3726, + "text_loss": 0.45202162861824036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 17.50249486351629, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007706517198958764, + "loss": 0.0096, + "macro_f1": 0.6595745086669922, + "num_tokens": 6009111.0, + "repeat_count": 1.0, + "routers_loss": 0.05215252563357353, + "skip_count": 4.0, + "step": 3728, + "text_loss": 0.20360413193702698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007703914181106497, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6012989.0, + "repeat_count": 0.0, + "routers_loss": 0.010039499960839748, + "skip_count": 3.0, + "step": 3730, + "text_loss": 0.20334361493587494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.52127971822718, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0007701310127109211, + "loss": 0.0062, + "macro_f1": 0.3272727429866791, + "num_tokens": 6016420.0, + "repeat_count": 0.0, + "routers_loss": 0.01090205181390047, + "skip_count": 1.0, + "step": 3732, + "text_loss": 0.47959551215171814 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 17.530672145582624, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007698705037964791, + "loss": 0.0076, + "macro_f1": 0.6225374937057495, + "num_tokens": 6019551.0, + "repeat_count": 0.0, + "routers_loss": 0.02677762135863304, + "skip_count": 5.0, + "step": 3734, + "text_loss": 0.2621438801288605 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.000769609891467151, + "loss": 0.0119, + "macro_f1": 0.6666666865348816, + "num_tokens": 6022262.0, + "repeat_count": 1.0, + "routers_loss": 0.00460716662928462, + "skip_count": 0.0, + "step": 3736, + "text_loss": 0.3433022201061249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0007693491758228037, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6025723.0, + "repeat_count": 0.0, + "routers_loss": 0.0036111194640398026, + "skip_count": 2.0, + "step": 3738, + "text_loss": 0.38703784346580505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007690883569633442, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6028652.0, + "repeat_count": 0.0, + "routers_loss": 0.003299296135082841, + "skip_count": 0.0, + "step": 3740, + "text_loss": 0.24203069508075714 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0007688274349887188, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 6032280.0, + "repeat_count": 0.0, + "routers_loss": 0.003173880511894822, + "skip_count": 0.0, + "step": 3742, + "text_loss": 0.2827291488647461 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0007685664099989131, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6035111.0, + "repeat_count": 0.0, + "routers_loss": 0.0008576177642680705, + "skip_count": 0.0, + "step": 3744, + "text_loss": 0.43613526225090027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007683052820939524, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6038428.0, + "repeat_count": 0.0, + "routers_loss": 0.004335585981607437, + "skip_count": 2.0, + "step": 3746, + "text_loss": 1.0385624170303345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007680440513739015, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6041185.0, + "repeat_count": 0.0, + "routers_loss": 0.0008210531086660922, + "skip_count": 0.0, + "step": 3748, + "text_loss": 0.7070431709289551 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0007677827179388646, + "loss": 0.0089, + "macro_f1": 1.0, + "num_tokens": 6046333.0, + "repeat_count": 1.0, + "routers_loss": 0.003778942162171006, + "skip_count": 1.0, + "step": 3750, + "text_loss": 0.3682238757610321 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08984375, + "learning_rate": 0.000767521281888985, + "loss": 0.009, + "macro_f1": 1.0, + "num_tokens": 6049528.0, + "repeat_count": 1.0, + "routers_loss": 0.002767334459349513, + "skip_count": 1.0, + "step": 3752, + "text_loss": 0.7619418501853943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0007672597433244455, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 6053202.0, + "repeat_count": 0.0, + "routers_loss": 0.004796457476913929, + "skip_count": 2.0, + "step": 3754, + "text_loss": 0.4157083034515381 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0007669981023454682, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 6056609.0, + "repeat_count": 0.0, + "routers_loss": 0.0013067846884950995, + "skip_count": 0.0, + "step": 3756, + "text_loss": 0.4529118537902832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007667363590523142, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 6060504.0, + "repeat_count": 0.0, + "routers_loss": 0.0010285493917763233, + "skip_count": 0.0, + "step": 3758, + "text_loss": 0.8363246321678162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0007664745135452844, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 6063526.0, + "repeat_count": 0.0, + "routers_loss": 0.006289863493293524, + "skip_count": 3.0, + "step": 3760, + "text_loss": 0.5313657522201538 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0007662125659247183, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6067147.0, + "repeat_count": 0.0, + "routers_loss": 0.0028537956532090902, + "skip_count": 0.0, + "step": 3762, + "text_loss": 0.5668109059333801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007659505162909949, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6070350.0, + "repeat_count": 0.0, + "routers_loss": 0.0026814753655344248, + "skip_count": 0.0, + "step": 3764, + "text_loss": 0.4983512759208679 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0007656883647445318, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 6073091.0, + "repeat_count": 0.0, + "routers_loss": 0.005981382913887501, + "skip_count": 1.0, + "step": 3766, + "text_loss": 0.30372318625450134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0007654261113857863, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6076244.0, + "repeat_count": 0.0, + "routers_loss": 0.000803640519734472, + "skip_count": 0.0, + "step": 3768, + "text_loss": 0.6100738048553467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0007651637563152539, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 6078936.0, + "repeat_count": 0.0, + "routers_loss": 0.0013324898900464177, + "skip_count": 0.0, + "step": 3770, + "text_loss": 0.4733821153640747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0007649012996334701, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 6081951.0, + "repeat_count": 1.0, + "routers_loss": 0.0021543330512940884, + "skip_count": 0.0, + "step": 3772, + "text_loss": 0.6794875860214233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007646387414410085, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 6085165.0, + "repeat_count": 0.0, + "routers_loss": 0.0005426189745776355, + "skip_count": 0.0, + "step": 3774, + "text_loss": 0.5886107683181763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007643760818384819, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6088370.0, + "repeat_count": 0.0, + "routers_loss": 0.002537576947361231, + "skip_count": 0.0, + "step": 3776, + "text_loss": 0.23591920733451843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007641133209265423, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6092319.0, + "repeat_count": 0.0, + "routers_loss": 0.002613696036860347, + "skip_count": 0.0, + "step": 3778, + "text_loss": 0.3217754662036896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0007638504588058796, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 6095799.0, + "repeat_count": 0.0, + "routers_loss": 0.0007219464750960469, + "skip_count": 0.0, + "step": 3780, + "text_loss": 0.4276983141899109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0007635874955772234, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6098789.0, + "repeat_count": 0.0, + "routers_loss": 0.005965052172541618, + "skip_count": 3.0, + "step": 3782, + "text_loss": 0.30936646461486816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0007633244313413417, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6101631.0, + "repeat_count": 0.0, + "routers_loss": 0.0007469559786841273, + "skip_count": 0.0, + "step": 3784, + "text_loss": 0.44460123777389526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007630612661990412, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 6105097.0, + "repeat_count": 0.0, + "routers_loss": 0.004300760570913553, + "skip_count": 1.0, + "step": 3786, + "text_loss": 0.41950157284736633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007627980002511672, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6107847.0, + "repeat_count": 0.0, + "routers_loss": 0.0023050960153341293, + "skip_count": 1.0, + "step": 3788, + "text_loss": 0.48561373353004456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007625346335986039, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6110546.0, + "repeat_count": 0.0, + "routers_loss": 0.0018124044872820377, + "skip_count": 0.0, + "step": 3790, + "text_loss": 0.20882295072078705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007622711663422735, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6113600.0, + "repeat_count": 0.0, + "routers_loss": 0.0007613401976414025, + "skip_count": 0.0, + "step": 3792, + "text_loss": 0.31751760840415955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007620075985831375, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 6116916.0, + "repeat_count": 0.0, + "routers_loss": 0.005452962126582861, + "skip_count": 2.0, + "step": 3794, + "text_loss": 0.3246645927429199 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 17.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007617439304221956, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 6120056.0, + "repeat_count": 2.0, + "routers_loss": 0.0043787881731987, + "skip_count": 0.0, + "step": 3796, + "text_loss": 0.4859195947647095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0007614801619604856, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6122668.0, + "repeat_count": 0.0, + "routers_loss": 0.0033891722559928894, + "skip_count": 0.0, + "step": 3798, + "text_loss": 0.48194369673728943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0007612162932990845, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6126792.0, + "repeat_count": 0.0, + "routers_loss": 0.001883238204754889, + "skip_count": 0.0, + "step": 3800, + "text_loss": 0.3740062117576599 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007609523245391068, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 6129801.0, + "repeat_count": 0.0, + "routers_loss": 0.00882677361369133, + "skip_count": 2.0, + "step": 3802, + "text_loss": 0.5759486556053162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007606882557817062, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6133613.0, + "repeat_count": 0.0, + "routers_loss": 0.009537030011415482, + "skip_count": 2.0, + "step": 3804, + "text_loss": 0.3217554986476898 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0007604240871280742, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6137784.0, + "repeat_count": 0.0, + "routers_loss": 0.0023913346230983734, + "skip_count": 0.0, + "step": 3806, + "text_loss": 0.3718445599079132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.878191957734078, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007601598186794407, + "loss": 0.0081, + "macro_f1": 0.6603773832321167, + "num_tokens": 6141356.0, + "repeat_count": 1.0, + "routers_loss": 0.033796411007642746, + "skip_count": 1.0, + "step": 3808, + "text_loss": 0.2717749774456024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000759895450537074, + "loss": 0.01, + "macro_f1": 0.6666666865348816, + "num_tokens": 6144448.0, + "repeat_count": 0.0, + "routers_loss": 0.0037919918540865183, + "skip_count": 2.0, + "step": 3810, + "text_loss": 0.5935076475143433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007596309828022803, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6147526.0, + "repeat_count": 0.0, + "routers_loss": 0.0008182782912626863, + "skip_count": 0.0, + "step": 3812, + "text_loss": 0.449336439371109 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0007593664155764044, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6150620.0, + "repeat_count": 1.0, + "routers_loss": 0.001734903547912836, + "skip_count": 0.0, + "step": 3814, + "text_loss": 0.6647221446037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.915761667155856, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0007591017489608286, + "loss": 0.0088, + "macro_f1": 0.3272727429866791, + "num_tokens": 6153714.0, + "repeat_count": 1.0, + "routers_loss": 0.04721754416823387, + "skip_count": 0.0, + "step": 3816, + "text_loss": 0.25481200218200684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007588369830569738, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6156974.0, + "repeat_count": 0.0, + "routers_loss": 0.0002484306460246444, + "skip_count": 0.0, + "step": 3818, + "text_loss": 0.7195295691490173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007585721179662988, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6159660.0, + "repeat_count": 0.0, + "routers_loss": 0.0051363613456487656, + "skip_count": 2.0, + "step": 3820, + "text_loss": 0.5073586702346802 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007583071537903005, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6163146.0, + "repeat_count": 0.0, + "routers_loss": 0.006719176657497883, + "skip_count": 0.0, + "step": 3822, + "text_loss": 0.6950558423995972 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0007580420906305136, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 6166257.0, + "repeat_count": 1.0, + "routers_loss": 0.00871267355978489, + "skip_count": 3.0, + "step": 3824, + "text_loss": 0.2549148201942444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0007577769285885109, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 6169624.0, + "repeat_count": 0.0, + "routers_loss": 0.0015642556827515364, + "skip_count": 0.0, + "step": 3826, + "text_loss": 0.3720305860042572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0007575116677659029, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6172673.0, + "repeat_count": 0.0, + "routers_loss": 0.0011551049537956715, + "skip_count": 0.0, + "step": 3828, + "text_loss": 0.6819429397583008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0007572463082643377, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 6175414.0, + "repeat_count": 0.0, + "routers_loss": 0.0008922060951590538, + "skip_count": 0.0, + "step": 3830, + "text_loss": 0.5424665212631226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007569808501855023, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 6178701.0, + "repeat_count": 0.0, + "routers_loss": 0.004167596809566021, + "skip_count": 1.0, + "step": 3832, + "text_loss": 0.4429764151573181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.00075671529363112, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 6183036.0, + "repeat_count": 0.0, + "routers_loss": 0.0008732969872653484, + "skip_count": 0.0, + "step": 3834, + "text_loss": 0.8015334010124207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007564496387029531, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 6186325.0, + "repeat_count": 0.0, + "routers_loss": 0.0021374202333390713, + "skip_count": 1.0, + "step": 3836, + "text_loss": 0.4233771562576294 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000756183885502801, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6189919.0, + "repeat_count": 1.0, + "routers_loss": 0.004017227329313755, + "skip_count": 0.0, + "step": 3838, + "text_loss": 0.33691394329071045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.0007559180341325005, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6193412.0, + "repeat_count": 0.0, + "routers_loss": 0.0013120946241542697, + "skip_count": 0.0, + "step": 3840, + "text_loss": 0.14970099925994873 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 18.037569709421778, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007556520846939265, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 6196588.0, + "repeat_count": 0.0, + "routers_loss": 0.011793316341936588, + "skip_count": 2.0, + "step": 3842, + "text_loss": 0.2714047133922577 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0007553860372889914, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 6200841.0, + "repeat_count": 1.0, + "routers_loss": 0.019968654960393906, + "skip_count": 4.0, + "step": 3844, + "text_loss": 0.23680976033210754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 18.05635456413267, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052490234375, + "learning_rate": 0.0007551198920196452, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 6203797.0, + "repeat_count": 0.0, + "routers_loss": 0.013615630567073822, + "skip_count": 2.0, + "step": 3846, + "text_loss": 0.25839608907699585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.000754853648987875, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6206790.0, + "repeat_count": 0.0, + "routers_loss": 0.002420815173536539, + "skip_count": 1.0, + "step": 3848, + "text_loss": 0.5358025431632996 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 18.07513941884356, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007545873082957057, + "loss": 0.0072, + "macro_f1": 0.9265305995941162, + "num_tokens": 6209791.0, + "repeat_count": 1.0, + "routers_loss": 0.018236197531223297, + "skip_count": 3.0, + "step": 3850, + "text_loss": 0.1463700383901596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0007543208700451998, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6212792.0, + "repeat_count": 0.0, + "routers_loss": 0.006242573726922274, + "skip_count": 3.0, + "step": 3852, + "text_loss": 0.9441591501235962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.093924273554446, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007540543343384565, + "loss": 0.0062, + "macro_f1": 0.3272727429866791, + "num_tokens": 6215747.0, + "repeat_count": 0.0, + "routers_loss": 0.01451140083372593, + "skip_count": 1.0, + "step": 3854, + "text_loss": 0.41610902547836304 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007537877012776132, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6218593.0, + "repeat_count": 0.0, + "routers_loss": 0.00037674361374229193, + "skip_count": 0.0, + "step": 3856, + "text_loss": 0.6048852205276489 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0007535209709648439, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 6221315.0, + "repeat_count": 1.0, + "routers_loss": 0.005776284262537956, + "skip_count": 3.0, + "step": 3858, + "text_loss": 0.35627537965774536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0007532541435023605, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 6225012.0, + "repeat_count": 0.0, + "routers_loss": 0.0009280376834794879, + "skip_count": 0.0, + "step": 3860, + "text_loss": 0.6440183520317078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0007529872189924114, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6227650.0, + "repeat_count": 0.0, + "routers_loss": 0.0009876530384644866, + "skip_count": 0.0, + "step": 3862, + "text_loss": 0.35507893562316895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.14088641033167, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0007527201975372827, + "loss": 0.0045, + "macro_f1": 0.6603773832321167, + "num_tokens": 6230557.0, + "repeat_count": 1.0, + "routers_loss": 0.013780162669718266, + "skip_count": 1.0, + "step": 3864, + "text_loss": 0.38958442211151123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007524530792392977, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 6233371.0, + "repeat_count": 0.0, + "routers_loss": 0.004849869292229414, + "skip_count": 3.0, + "step": 3866, + "text_loss": 0.3826720714569092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0007521858642008163, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6236770.0, + "repeat_count": 0.0, + "routers_loss": 0.008618295192718506, + "skip_count": 1.0, + "step": 3868, + "text_loss": 0.3596078157424927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0007519185525242363, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6239661.0, + "repeat_count": 0.0, + "routers_loss": 0.0013421972980722785, + "skip_count": 0.0, + "step": 3870, + "text_loss": 0.5585550665855408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0007516511443119916, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 6242459.0, + "repeat_count": 0.0, + "routers_loss": 0.0038009448908269405, + "skip_count": 1.0, + "step": 3872, + "text_loss": 0.4418395757675171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007513836396665534, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 6245489.0, + "repeat_count": 1.0, + "routers_loss": 0.002785376040264964, + "skip_count": 2.0, + "step": 3874, + "text_loss": 0.551510751247406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0007511160386904305, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6249014.0, + "repeat_count": 0.0, + "routers_loss": 0.0021424589212983847, + "skip_count": 1.0, + "step": 3876, + "text_loss": 1.0502676963806152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0007508483414861679, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6252357.0, + "repeat_count": 0.0, + "routers_loss": 0.0085759861394763, + "skip_count": 1.0, + "step": 3878, + "text_loss": 0.49212515354156494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007505805481563477, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6254975.0, + "repeat_count": 0.0, + "routers_loss": 0.0010723904706537724, + "skip_count": 0.0, + "step": 3880, + "text_loss": 0.7022985816001892 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0007503126588035887, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 6258001.0, + "repeat_count": 1.0, + "routers_loss": 0.012809890322387218, + "skip_count": 2.0, + "step": 3882, + "text_loss": 0.1829151213169098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007500446735305466, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 6261795.0, + "repeat_count": 0.0, + "routers_loss": 0.0026790346018970013, + "skip_count": 1.0, + "step": 3884, + "text_loss": 0.20436066389083862 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.000749776592439914, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 6265585.0, + "repeat_count": 1.0, + "routers_loss": 0.005243788007646799, + "skip_count": 2.0, + "step": 3886, + "text_loss": 0.4479229748249054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00074950841563442, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 6269039.0, + "repeat_count": 0.0, + "routers_loss": 0.007998534478247166, + "skip_count": 1.0, + "step": 3888, + "text_loss": 0.2154676914215088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0007492401432168303, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6272315.0, + "repeat_count": 0.0, + "routers_loss": 0.004648822825402021, + "skip_count": 1.0, + "step": 3890, + "text_loss": 0.3375042676925659 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.272380393307895, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007489717752899477, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 6275342.0, + "repeat_count": 0.0, + "routers_loss": 0.012154200114309788, + "skip_count": 1.0, + "step": 3892, + "text_loss": 0.1964082419872284 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.000748703311956611, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6278700.0, + "repeat_count": 1.0, + "routers_loss": 0.004610476549714804, + "skip_count": 2.0, + "step": 3894, + "text_loss": 0.26545581221580505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0007484347533196961, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 6281864.0, + "repeat_count": 0.0, + "routers_loss": 0.0075586591847240925, + "skip_count": 2.0, + "step": 3896, + "text_loss": 0.3106999397277832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0007481660994821151, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6284676.0, + "repeat_count": 0.0, + "routers_loss": 0.007845268584787846, + "skip_count": 1.0, + "step": 3898, + "text_loss": 0.4094304144382477 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007478973505468165, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 6287470.0, + "repeat_count": 1.0, + "routers_loss": 0.011116391979157925, + "skip_count": 2.0, + "step": 3900, + "text_loss": 0.1838909536600113 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007476285066167857, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 6290432.0, + "repeat_count": 1.0, + "routers_loss": 0.004599364474415779, + "skip_count": 0.0, + "step": 3902, + "text_loss": 0.25872838497161865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0007473595677950439, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 6293557.0, + "repeat_count": 0.0, + "routers_loss": 0.0016367282951250672, + "skip_count": 1.0, + "step": 3904, + "text_loss": 0.5272360444068909 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007470905341846492, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 6295979.0, + "repeat_count": 0.0, + "routers_loss": 0.0004760588926728815, + "skip_count": 0.0, + "step": 3906, + "text_loss": 0.666959822177887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007468214058886956, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6299215.0, + "repeat_count": 0.0, + "routers_loss": 0.000524883100297302, + "skip_count": 0.0, + "step": 3908, + "text_loss": 0.5144801139831543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007465521830103137, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6302320.0, + "repeat_count": 0.0, + "routers_loss": 0.0016085522947832942, + "skip_count": 0.0, + "step": 3910, + "text_loss": 0.14342890679836273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007462828656526702, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6305212.0, + "repeat_count": 0.0, + "routers_loss": 0.002720315707847476, + "skip_count": 2.0, + "step": 3912, + "text_loss": 0.31109121441841125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0007460134539189681, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 6308964.0, + "repeat_count": 0.0, + "routers_loss": 0.0010418406454846263, + "skip_count": 1.0, + "step": 3914, + "text_loss": 0.5662030577659607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0007457439479124459, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 6313195.0, + "repeat_count": 0.0, + "routers_loss": 0.0020303844939917326, + "skip_count": 0.0, + "step": 3916, + "text_loss": 0.6358339190483093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007454743477363797, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 6315949.0, + "repeat_count": 0.0, + "routers_loss": 0.0006592223653569818, + "skip_count": 0.0, + "step": 3918, + "text_loss": 0.35648423433303833 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.403874376284122, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007452046534940803, + "loss": 0.0075, + "macro_f1": 0.6603773832321167, + "num_tokens": 6319024.0, + "repeat_count": 1.0, + "routers_loss": 0.024555351585149765, + "skip_count": 1.0, + "step": 3920, + "text_loss": 0.21955153346061707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0007449348652888952, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6321633.0, + "repeat_count": 0.0, + "routers_loss": 0.003606822807341814, + "skip_count": 1.0, + "step": 3922, + "text_loss": 0.6079489588737488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007446649832242075, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 6325209.0, + "repeat_count": 0.0, + "routers_loss": 0.0035831446293741465, + "skip_count": 1.0, + "step": 3924, + "text_loss": 0.2774808406829834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0007443950074034368, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6327822.0, + "repeat_count": 0.0, + "routers_loss": 0.006809544749557972, + "skip_count": 2.0, + "step": 3926, + "text_loss": 0.48236769437789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.4414440857059, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0007441249379300381, + "loss": 0.007, + "macro_f1": 0.6601307392120361, + "num_tokens": 6331662.0, + "repeat_count": 1.0, + "routers_loss": 0.023832591250538826, + "skip_count": 2.0, + "step": 3928, + "text_loss": 0.7287537455558777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007438547749075028, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 6335801.0, + "repeat_count": 1.0, + "routers_loss": 0.011755098588764668, + "skip_count": 3.0, + "step": 3930, + "text_loss": 0.17253030836582184 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0007435845184393577, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6338747.0, + "repeat_count": 1.0, + "routers_loss": 0.005972472485154867, + "skip_count": 0.0, + "step": 3932, + "text_loss": 0.6400216817855835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007433141686291657, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 6342772.0, + "repeat_count": 0.0, + "routers_loss": 0.0030393085908144712, + "skip_count": 1.0, + "step": 3934, + "text_loss": 0.6865074038505554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0007430437255805252, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6345957.0, + "repeat_count": 0.0, + "routers_loss": 0.0006984061910770833, + "skip_count": 0.0, + "step": 3936, + "text_loss": 0.40398702025413513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0007427731893970706, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 6349162.0, + "repeat_count": 1.0, + "routers_loss": 0.005219762213528156, + "skip_count": 0.0, + "step": 3938, + "text_loss": 0.5951031446456909 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007425025601824717, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 6352655.0, + "repeat_count": 0.0, + "routers_loss": 0.015575960278511047, + "skip_count": 3.0, + "step": 3940, + "text_loss": 0.26689088344573975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007422318380404346, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6355890.0, + "repeat_count": 0.0, + "routers_loss": 0.0012208883417770267, + "skip_count": 0.0, + "step": 3942, + "text_loss": 0.570725679397583 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0007419610230746999, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6358891.0, + "repeat_count": 1.0, + "routers_loss": 0.0029412026051431894, + "skip_count": 0.0, + "step": 3944, + "text_loss": 0.5521301031112671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007416901153890448, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6361586.0, + "repeat_count": 0.0, + "routers_loss": 0.0010283910669386387, + "skip_count": 0.0, + "step": 3946, + "text_loss": 0.4046417772769928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0007414191150872818, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6364954.0, + "repeat_count": 0.0, + "routers_loss": 0.008222512900829315, + "skip_count": 2.0, + "step": 3948, + "text_loss": 0.2803446352481842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007411480222732583, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6367660.0, + "repeat_count": 0.0, + "routers_loss": 0.001304348581470549, + "skip_count": 0.0, + "step": 3950, + "text_loss": 0.45553359389305115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007408768370508576, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6371585.0, + "repeat_count": 0.0, + "routers_loss": 0.0016345062758773565, + "skip_count": 0.0, + "step": 3952, + "text_loss": 0.25424402952194214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007406055595239986, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6374365.0, + "repeat_count": 0.0, + "routers_loss": 0.0005097290268167853, + "skip_count": 0.0, + "step": 3954, + "text_loss": 0.5856026411056519 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0007403341897966356, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6377335.0, + "repeat_count": 0.0, + "routers_loss": 0.002482263371348381, + "skip_count": 1.0, + "step": 3956, + "text_loss": 0.5145615339279175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0007400627279727574, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 6380799.0, + "repeat_count": 0.0, + "routers_loss": 0.0011743451468646526, + "skip_count": 0.0, + "step": 3958, + "text_loss": 0.31868961453437805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007397911741563892, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6383963.0, + "repeat_count": 1.0, + "routers_loss": 0.009861881844699383, + "skip_count": 0.0, + "step": 3960, + "text_loss": 0.21192194521427155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007395195284515905, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 6387410.0, + "repeat_count": 1.0, + "routers_loss": 0.004189098719507456, + "skip_count": 0.0, + "step": 3962, + "text_loss": 0.5809708833694458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007392477909624567, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6390670.0, + "repeat_count": 0.0, + "routers_loss": 0.001853612600825727, + "skip_count": 0.0, + "step": 3964, + "text_loss": 0.48985618352890015 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0007389759617931182, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6393609.0, + "repeat_count": 1.0, + "routers_loss": 0.003303771372884512, + "skip_count": 0.0, + "step": 3966, + "text_loss": 0.28729453682899475 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.629292632814792, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007387040410477404, + "loss": 0.0058, + "macro_f1": 0.9452888369560242, + "num_tokens": 6396608.0, + "repeat_count": 1.0, + "routers_loss": 0.01791577786207199, + "skip_count": 4.0, + "step": 3968, + "text_loss": 0.30386820435523987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0007384320288305235, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6399793.0, + "repeat_count": 0.0, + "routers_loss": 0.0005771282012574375, + "skip_count": 0.0, + "step": 3970, + "text_loss": 0.47285011410713196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0007381599252457037, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6403365.0, + "repeat_count": 0.0, + "routers_loss": 0.003010645741596818, + "skip_count": 0.0, + "step": 3972, + "text_loss": 0.5313063859939575 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000737887730397551, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6406205.0, + "repeat_count": 1.0, + "routers_loss": 0.006457438692450523, + "skip_count": 0.0, + "step": 3974, + "text_loss": 0.2323843240737915 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007376154443903713, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6409552.0, + "repeat_count": 1.0, + "routers_loss": 0.010693981312215328, + "skip_count": 0.0, + "step": 3976, + "text_loss": 0.6304101943969727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.676254769592017, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007373430673285051, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 6412386.0, + "repeat_count": 1.0, + "routers_loss": 0.03116440214216709, + "skip_count": 0.0, + "step": 3978, + "text_loss": 0.23448467254638672 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.68564719694746, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007370705993163278, + "loss": 0.0111, + "macro_f1": 0.3272727429866791, + "num_tokens": 6416054.0, + "repeat_count": 1.0, + "routers_loss": 0.011973714455962181, + "skip_count": 0.0, + "step": 3980, + "text_loss": 0.6371755599975586 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007367980404582497, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 6419238.0, + "repeat_count": 1.0, + "routers_loss": 0.005117347463965416, + "skip_count": 2.0, + "step": 3982, + "text_loss": 0.19822923839092255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0007365253908587158, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 6422122.0, + "repeat_count": 0.0, + "routers_loss": 0.0010648667812347412, + "skip_count": 0.0, + "step": 3984, + "text_loss": 0.566700279712677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0007362526506222058, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6425313.0, + "repeat_count": 0.0, + "routers_loss": 0.005726494826376438, + "skip_count": 0.0, + "step": 3986, + "text_loss": 0.6568437814712524 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007359798198532343, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 6428422.0, + "repeat_count": 1.0, + "routers_loss": 0.004504100419580936, + "skip_count": 0.0, + "step": 3988, + "text_loss": 0.598754346370697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007357068986563509, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 6431512.0, + "repeat_count": 0.0, + "routers_loss": 0.0019837068393826485, + "skip_count": 1.0, + "step": 3990, + "text_loss": 0.7152895927429199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007354338871361393, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6434358.0, + "repeat_count": 0.0, + "routers_loss": 0.0026031541638076305, + "skip_count": 1.0, + "step": 3992, + "text_loss": 0.4986513555049896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.000735160785397218, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6438175.0, + "repeat_count": 0.0, + "routers_loss": 0.0024831905029714108, + "skip_count": 2.0, + "step": 3994, + "text_loss": 0.4406205713748932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007348875935442401, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6441228.0, + "repeat_count": 0.0, + "routers_loss": 0.0008635876583866775, + "skip_count": 0.0, + "step": 3996, + "text_loss": 0.48884135484695435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007346143116818932, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6444318.0, + "repeat_count": 0.0, + "routers_loss": 0.004007008858025074, + "skip_count": 0.0, + "step": 3998, + "text_loss": 0.6669428944587708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0007343409399148994, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6448317.0, + "repeat_count": 0.0, + "routers_loss": 0.0031380734872072935, + "skip_count": 0.0, + "step": 4000, + "text_loss": 0.6468493938446045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0007340674783480154, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 6451673.0, + "repeat_count": 0.0, + "routers_loss": 0.004996029660105705, + "skip_count": 0.0, + "step": 4002, + "text_loss": 0.28135430812835693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.798356325212797, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007337939270860323, + "loss": 0.009, + "macro_f1": 0.3272727429866791, + "num_tokens": 6456372.0, + "repeat_count": 1.0, + "routers_loss": 0.03784399852156639, + "skip_count": 0.0, + "step": 4004, + "text_loss": 0.41668644547462463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007335202862337753, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6459047.0, + "repeat_count": 0.0, + "routers_loss": 0.0011750755365937948, + "skip_count": 0.0, + "step": 4006, + "text_loss": 0.6853910684585571 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.817141179923688, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.05908203125, + "learning_rate": 0.000733246555896104, + "loss": 0.0062, + "macro_f1": 0.9452888369560242, + "num_tokens": 6462390.0, + "repeat_count": 1.0, + "routers_loss": 0.01630394533276558, + "skip_count": 4.0, + "step": 4008, + "text_loss": 0.7110592126846313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0007329727361779124, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6466057.0, + "repeat_count": 0.0, + "routers_loss": 0.0052404399029910564, + "skip_count": 2.0, + "step": 4010, + "text_loss": 0.13856995105743408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.000732698827184129, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6468878.0, + "repeat_count": 0.0, + "routers_loss": 0.002138581359758973, + "skip_count": 0.0, + "step": 4012, + "text_loss": 0.3999565839767456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000732424829019716, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6472364.0, + "repeat_count": 0.0, + "routers_loss": 0.0037466560024768114, + "skip_count": 0.0, + "step": 4014, + "text_loss": 0.28161346912384033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007321507417896699, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 6475379.0, + "repeat_count": 0.0, + "routers_loss": 0.0010469373082742095, + "skip_count": 0.0, + "step": 4016, + "text_loss": 1.0490952730178833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0007318765655990218, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 6478585.0, + "repeat_count": 0.0, + "routers_loss": 0.009968385100364685, + "skip_count": 2.0, + "step": 4018, + "text_loss": 0.31696680188179016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0007316023005528362, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 6484153.0, + "repeat_count": 0.0, + "routers_loss": 0.002349073765799403, + "skip_count": 1.0, + "step": 4020, + "text_loss": 0.30981555581092834 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.8828881714118, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0007313279467562124, + "loss": 0.0053, + "macro_f1": 0.9452888369560242, + "num_tokens": 6487029.0, + "repeat_count": 1.0, + "routers_loss": 0.011854278855025768, + "skip_count": 4.0, + "step": 4022, + "text_loss": 0.9689550399780273 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007310535043142829, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 6490315.0, + "repeat_count": 1.0, + "routers_loss": 0.00908346101641655, + "skip_count": 3.0, + "step": 4024, + "text_loss": 0.1705625057220459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0007307789733322146, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 6493921.0, + "repeat_count": 0.0, + "routers_loss": 0.0007360641611739993, + "skip_count": 0.0, + "step": 4026, + "text_loss": 0.6252996325492859 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.087890625, + "learning_rate": 0.0007305043539152083, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6496689.0, + "repeat_count": 0.0, + "routers_loss": 0.0017757206223905087, + "skip_count": 0.0, + "step": 4028, + "text_loss": 0.40533265471458435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000730229646168499, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6500090.0, + "repeat_count": 0.0, + "routers_loss": 0.0022657213266938925, + "skip_count": 0.0, + "step": 4030, + "text_loss": 0.25954708456993103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007299548501973548, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6503023.0, + "repeat_count": 0.0, + "routers_loss": 0.0021747269202023745, + "skip_count": 0.0, + "step": 4032, + "text_loss": 0.6223418712615967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 18.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0007296799661070782, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6506382.0, + "repeat_count": 0.0, + "routers_loss": 0.006400502752512693, + "skip_count": 4.0, + "step": 4034, + "text_loss": 0.6873653531074524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.94863516289991, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0007294049940030055, + "loss": 0.0065, + "macro_f1": 0.3272727429866791, + "num_tokens": 6509194.0, + "repeat_count": 0.0, + "routers_loss": 0.0197185929864645, + "skip_count": 1.0, + "step": 4036, + "text_loss": 0.16156800091266632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007291299339905059, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6512271.0, + "repeat_count": 0.0, + "routers_loss": 0.0009541353792883456, + "skip_count": 0.0, + "step": 4038, + "text_loss": 0.5038442015647888 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007288547861749838, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 6516403.0, + "repeat_count": 0.0, + "routers_loss": 0.008226391859352589, + "skip_count": 2.0, + "step": 4040, + "text_loss": 0.3706657588481903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007285795506618758, + "loss": 0.0063, + "macro_f1": 0.3272727429866791, + "num_tokens": 6519310.0, + "repeat_count": 0.0, + "routers_loss": 0.017001887783408165, + "skip_count": 1.0, + "step": 4042, + "text_loss": 0.24296723306179047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0007283042275566528, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 6521979.0, + "repeat_count": 0.0, + "routers_loss": 0.01666323095560074, + "skip_count": 2.0, + "step": 4044, + "text_loss": 0.36904850602149963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0007280288169648192, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 6524976.0, + "repeat_count": 0.0, + "routers_loss": 0.0007593175978399813, + "skip_count": 0.0, + "step": 4046, + "text_loss": 0.7312731146812439 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 19.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0007277533189919127, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 6528638.0, + "repeat_count": 1.0, + "routers_loss": 0.005652119871228933, + "skip_count": 1.0, + "step": 4048, + "text_loss": 0.23326151072978973 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007274777337435046, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6532193.0, + "repeat_count": 0.0, + "routers_loss": 0.010509157553315163, + "skip_count": 2.0, + "step": 4050, + "text_loss": 0.23918013274669647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007272020613251999, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 6534994.0, + "repeat_count": 0.0, + "routers_loss": 0.002153293928131461, + "skip_count": 0.0, + "step": 4052, + "text_loss": 0.5890526175498962 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0007269263018426367, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 6537469.0, + "repeat_count": 1.0, + "routers_loss": 0.0018494052346795797, + "skip_count": 2.0, + "step": 4054, + "text_loss": 0.36058738827705383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0007266504554014866, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6541271.0, + "repeat_count": 0.0, + "routers_loss": 0.0007579320226795971, + "skip_count": 0.0, + "step": 4056, + "text_loss": 0.4089007079601288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.051658350454947, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007263745221074545, + "loss": 0.0086, + "macro_f1": 0.6601307392120361, + "num_tokens": 6544293.0, + "repeat_count": 1.0, + "routers_loss": 0.06202420964837074, + "skip_count": 2.0, + "step": 4058, + "text_loss": 0.2226305454969406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 19.06105077781039, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007260985020662784, + "loss": 0.0049, + "macro_f1": 0.5934640765190125, + "num_tokens": 6547640.0, + "repeat_count": 0.0, + "routers_loss": 0.044639844447374344, + "skip_count": 3.0, + "step": 4060, + "text_loss": 0.23004353046417236 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.0007258223953837298, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 6550840.0, + "repeat_count": 1.0, + "routers_loss": 0.004215611144900322, + "skip_count": 0.0, + "step": 4062, + "text_loss": 0.2891770601272583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0007255462021656132, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6554122.0, + "repeat_count": 0.0, + "routers_loss": 0.0011056234361603856, + "skip_count": 0.0, + "step": 4064, + "text_loss": 0.7485370635986328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007252699225177666, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6557138.0, + "repeat_count": 0.0, + "routers_loss": 0.008258933201432228, + "skip_count": 2.0, + "step": 4066, + "text_loss": 0.25219282507896423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007249935565460606, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6560654.0, + "repeat_count": 0.0, + "routers_loss": 0.005102175287902355, + "skip_count": 0.0, + "step": 4068, + "text_loss": 0.5553314089775085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007247171043563994, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6563814.0, + "repeat_count": 0.0, + "routers_loss": 0.01283820066601038, + "skip_count": 2.0, + "step": 4070, + "text_loss": 0.15729956328868866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0007244405660547199, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6567060.0, + "repeat_count": 0.0, + "routers_loss": 0.0009684927063062787, + "skip_count": 0.0, + "step": 4072, + "text_loss": 0.3725031912326813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01953125, + "learning_rate": 0.000724163941746992, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6571608.0, + "repeat_count": 0.0, + "routers_loss": 0.0007890827837400138, + "skip_count": 0.0, + "step": 4074, + "text_loss": 0.8438301682472229 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 19.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0007238872315392189, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 6575214.0, + "repeat_count": 1.0, + "routers_loss": 0.0040600355714559555, + "skip_count": 1.0, + "step": 4076, + "text_loss": 0.5923112034797668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0007236104355374363, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 6578383.0, + "repeat_count": 0.0, + "routers_loss": 0.0024899677373468876, + "skip_count": 2.0, + "step": 4078, + "text_loss": 0.20302526652812958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.000723333553847713, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6582175.0, + "repeat_count": 0.0, + "routers_loss": 0.006120906211435795, + "skip_count": 2.0, + "step": 4080, + "text_loss": 0.5400223731994629 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0007230565865761504, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 6585516.0, + "repeat_count": 0.0, + "routers_loss": 0.0029941233806312084, + "skip_count": 0.0, + "step": 4082, + "text_loss": 0.19460804760456085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0007227795338288831, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 6588266.0, + "repeat_count": 0.0, + "routers_loss": 0.009357884526252747, + "skip_count": 2.0, + "step": 4084, + "text_loss": 0.35237613320350647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007225023957120782, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 6591009.0, + "repeat_count": 0.0, + "routers_loss": 0.0023083325941115618, + "skip_count": 2.0, + "step": 4086, + "text_loss": 0.4336731433868408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0007222251723319356, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 6594472.0, + "repeat_count": 0.0, + "routers_loss": 0.0008416616474278271, + "skip_count": 0.0, + "step": 4088, + "text_loss": 0.6390535831451416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0007219478637946877, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6597477.0, + "repeat_count": 0.0, + "routers_loss": 0.004390760324895382, + "skip_count": 1.0, + "step": 4090, + "text_loss": 0.525839626789093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0007216704702065997, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 6600431.0, + "repeat_count": 0.0, + "routers_loss": 0.0010311100631952286, + "skip_count": 0.0, + "step": 4092, + "text_loss": 0.5310423374176025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0007213929916739695, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 6603899.0, + "repeat_count": 0.0, + "routers_loss": 0.0032497600186616182, + "skip_count": 1.0, + "step": 4094, + "text_loss": 0.2775326073169708 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000721115428303127, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 6606544.0, + "repeat_count": 1.0, + "routers_loss": 0.004692315589636564, + "skip_count": 3.0, + "step": 4096, + "text_loss": 0.6667124032974243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007208377802004353, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6610097.0, + "repeat_count": 0.0, + "routers_loss": 0.0007263485458679497, + "skip_count": 0.0, + "step": 4098, + "text_loss": 0.6916406750679016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007205600474722897, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6613836.0, + "repeat_count": 0.0, + "routers_loss": 0.0017989488551393151, + "skip_count": 0.0, + "step": 4100, + "text_loss": 0.5257929563522339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000720282230225118, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6616780.0, + "repeat_count": 0.0, + "routers_loss": 0.0011308686807751656, + "skip_count": 1.0, + "step": 4102, + "text_loss": 0.4410906732082367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0007200043285653799, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6620110.0, + "repeat_count": 0.0, + "routers_loss": 0.002058265497907996, + "skip_count": 2.0, + "step": 4104, + "text_loss": 0.8581191897392273 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007197263425995681, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 6622585.0, + "repeat_count": 1.0, + "routers_loss": 0.0017528717871755362, + "skip_count": 0.0, + "step": 4106, + "text_loss": 0.5000449419021606 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0007194482724342075, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6626356.0, + "repeat_count": 0.0, + "routers_loss": 0.0021995846182107925, + "skip_count": 0.0, + "step": 4108, + "text_loss": 0.401346892118454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007191701181758547, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6629738.0, + "repeat_count": 0.0, + "routers_loss": 0.0014869922306388617, + "skip_count": 0.0, + "step": 4110, + "text_loss": 0.9598422050476074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0007188918799310993, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 6632807.0, + "repeat_count": 0.0, + "routers_loss": 0.0012853415682911873, + "skip_count": 0.0, + "step": 4112, + "text_loss": 0.3996548354625702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0007186135578065627, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6636227.0, + "repeat_count": 0.0, + "routers_loss": 0.0009887361666187644, + "skip_count": 0.0, + "step": 4114, + "text_loss": 0.4127283990383148 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007183351519088982, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6639443.0, + "repeat_count": 0.0, + "routers_loss": 0.006282114889472723, + "skip_count": 1.0, + "step": 4116, + "text_loss": 0.20028606057167053 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.333431171118285, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0007180566623447917, + "loss": 0.0114, + "macro_f1": 0.6603773832321167, + "num_tokens": 6642127.0, + "repeat_count": 1.0, + "routers_loss": 0.008101986721158028, + "skip_count": 0.0, + "step": 4118, + "text_loss": 0.763931155204773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0007177780892209607, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6645376.0, + "repeat_count": 0.0, + "routers_loss": 0.001953610684722662, + "skip_count": 0.0, + "step": 4120, + "text_loss": 0.42317715287208557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007174994326441551, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6648150.0, + "repeat_count": 0.0, + "routers_loss": 0.003279355587437749, + "skip_count": 0.0, + "step": 4122, + "text_loss": 0.19656142592430115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007172206927211567, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6650935.0, + "repeat_count": 0.0, + "routers_loss": 0.0032076311763375998, + "skip_count": 0.0, + "step": 4124, + "text_loss": 0.13608409464359283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0007169418695587791, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6654464.0, + "repeat_count": 0.0, + "routers_loss": 0.004065621178597212, + "skip_count": 2.0, + "step": 4126, + "text_loss": 0.4882086217403412 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007166629632638678, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6657749.0, + "repeat_count": 0.0, + "routers_loss": 0.0009243001695722342, + "skip_count": 0.0, + "step": 4128, + "text_loss": 0.31632331013679504 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0007163839739433003, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6660997.0, + "repeat_count": 0.0, + "routers_loss": 0.0018459554994478822, + "skip_count": 0.0, + "step": 4130, + "text_loss": 0.6123947501182556 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.399178162606397, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0007161049017039857, + "loss": 0.0073, + "macro_f1": 0.8820862174034119, + "num_tokens": 6663542.0, + "repeat_count": 2.0, + "routers_loss": 0.030032536014914513, + "skip_count": 2.0, + "step": 4132, + "text_loss": 0.6985659003257751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0007158257466528652, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6666178.0, + "repeat_count": 0.0, + "routers_loss": 0.0013813833938911557, + "skip_count": 0.0, + "step": 4134, + "text_loss": 0.38380664587020874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0007155465088969114, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 6668852.0, + "repeat_count": 0.0, + "routers_loss": 0.00513424864038825, + "skip_count": 3.0, + "step": 4136, + "text_loss": 0.49724283814430237 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0007152671885431288, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 6671430.0, + "repeat_count": 0.0, + "routers_loss": 0.0005165594047866762, + "skip_count": 0.0, + "step": 4138, + "text_loss": 0.666959822177887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0007149877856985535, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6675215.0, + "repeat_count": 0.0, + "routers_loss": 0.001685218419879675, + "skip_count": 0.0, + "step": 4140, + "text_loss": 0.3127259612083435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.000714708300470253, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6678505.0, + "repeat_count": 0.0, + "routers_loss": 0.004025314934551716, + "skip_count": 0.0, + "step": 4142, + "text_loss": 0.3179470896720886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007144287329653269, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 6681127.0, + "repeat_count": 1.0, + "routers_loss": 0.005965690594166517, + "skip_count": 0.0, + "step": 4144, + "text_loss": 0.3862907886505127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.464925154094512, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007141490832909058, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 6683968.0, + "repeat_count": 0.0, + "routers_loss": 0.012896374799311161, + "skip_count": 1.0, + "step": 4146, + "text_loss": 0.48156118392944336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007138693515541519, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6687196.0, + "repeat_count": 0.0, + "routers_loss": 0.0006367767928168178, + "skip_count": 1.0, + "step": 4148, + "text_loss": 0.676702082157135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0007135895378622592, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 6689972.0, + "repeat_count": 0.0, + "routers_loss": 0.004532640799880028, + "skip_count": 3.0, + "step": 4150, + "text_loss": 0.5865558981895447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.493102436160846, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007133096423224526, + "loss": 0.0081, + "macro_f1": 0.3272727429866791, + "num_tokens": 6693568.0, + "repeat_count": 1.0, + "routers_loss": 0.0377078577876091, + "skip_count": 0.0, + "step": 4152, + "text_loss": 0.2790502607822418 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0007130296650419885, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6696468.0, + "repeat_count": 0.0, + "routers_loss": 0.004455826710909605, + "skip_count": 1.0, + "step": 4154, + "text_loss": 0.5869500041007996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0007127496061281551, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6699307.0, + "repeat_count": 0.0, + "routers_loss": 0.001998464809730649, + "skip_count": 0.0, + "step": 4156, + "text_loss": 0.6931945085525513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 19.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007124694656882713, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 6702647.0, + "repeat_count": 3.0, + "routers_loss": 0.004117495380342007, + "skip_count": 0.0, + "step": 4158, + "text_loss": 0.4325876832008362 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0007121892438296874, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6705964.0, + "repeat_count": 0.0, + "routers_loss": 0.0014713290147483349, + "skip_count": 0.0, + "step": 4160, + "text_loss": 0.3672060966491699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007119089406597849, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6710182.0, + "repeat_count": 0.0, + "routers_loss": 0.0037311650812625885, + "skip_count": 1.0, + "step": 4162, + "text_loss": 0.6643805503845215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007116285562859767, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6713410.0, + "repeat_count": 0.0, + "routers_loss": 0.006017287727445364, + "skip_count": 0.0, + "step": 4164, + "text_loss": 0.4606415927410126 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.55884942764896, + "f1_execute": 0.9545454382896423, + "f1_repeat": 0.5, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007113480908157065, + "loss": 0.0108, + "macro_f1": 0.8181818723678589, + "num_tokens": 6716056.0, + "repeat_count": 3.0, + "routers_loss": 0.08640352636575699, + "skip_count": 4.0, + "step": 4166, + "text_loss": 0.3139408528804779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0007110675443564491, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6719497.0, + "repeat_count": 0.0, + "routers_loss": 0.0012731150491163135, + "skip_count": 0.0, + "step": 4168, + "text_loss": 0.7283861637115479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007107869170157108, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 6722297.0, + "repeat_count": 0.0, + "routers_loss": 0.0021509863436222076, + "skip_count": 2.0, + "step": 4170, + "text_loss": 0.5767703056335449 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.000710506208901028, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6725762.0, + "repeat_count": 0.0, + "routers_loss": 0.00257494836114347, + "skip_count": 1.0, + "step": 4172, + "text_loss": 0.33571913838386536 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.000710225420119969, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 6728436.0, + "repeat_count": 1.0, + "routers_loss": 0.00943201594054699, + "skip_count": 3.0, + "step": 4174, + "text_loss": 0.6849368810653687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0007099445507801323, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6731427.0, + "repeat_count": 0.0, + "routers_loss": 0.01046718005090952, + "skip_count": 2.0, + "step": 4176, + "text_loss": 0.3346157670021057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007096636009891477, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6734800.0, + "repeat_count": 0.0, + "routers_loss": 0.0007813365664333105, + "skip_count": 0.0, + "step": 4178, + "text_loss": 0.49989959597587585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.000709382570854676, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6738244.0, + "repeat_count": 0.0, + "routers_loss": 0.002825600327923894, + "skip_count": 0.0, + "step": 4180, + "text_loss": 0.15744923055171967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007091014604844078, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6741695.0, + "repeat_count": 0.0, + "routers_loss": 0.0017124463338404894, + "skip_count": 0.0, + "step": 4182, + "text_loss": 0.3752405643463135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0007088202699860655, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 6744882.0, + "repeat_count": 1.0, + "routers_loss": 0.005134924780577421, + "skip_count": 3.0, + "step": 4184, + "text_loss": 0.18534569442272186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.000708538999467402, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6747811.0, + "repeat_count": 0.0, + "routers_loss": 0.002371585462242365, + "skip_count": 1.0, + "step": 4186, + "text_loss": 0.6251029968261719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007082576490362004, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6750765.0, + "repeat_count": 0.0, + "routers_loss": 0.002088436856865883, + "skip_count": 0.0, + "step": 4188, + "text_loss": 0.35471436381340027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.000707976218800275, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6754021.0, + "repeat_count": 0.0, + "routers_loss": 0.0012272283202037215, + "skip_count": 0.0, + "step": 4190, + "text_loss": 0.5737302899360657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0007076947088674701, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6756793.0, + "repeat_count": 0.0, + "routers_loss": 0.0026050808373838663, + "skip_count": 0.0, + "step": 4192, + "text_loss": 0.526336669921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.000707413119345661, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 6760221.0, + "repeat_count": 0.0, + "routers_loss": 0.0013151296880096197, + "skip_count": 0.0, + "step": 4194, + "text_loss": 0.5678895711898804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0007071314503427532, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6763721.0, + "repeat_count": 0.0, + "routers_loss": 0.001528652966953814, + "skip_count": 0.0, + "step": 4196, + "text_loss": 0.7640175223350525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0007068497019666829, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6768581.0, + "repeat_count": 0.0, + "routers_loss": 0.0019202446565032005, + "skip_count": 0.0, + "step": 4198, + "text_loss": 0.41878414154052734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0007065678743254167, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6772758.0, + "repeat_count": 0.0, + "routers_loss": 0.004667408298701048, + "skip_count": 1.0, + "step": 4200, + "text_loss": 0.3550313413143158 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 19.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0007062859675269513, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6776671.0, + "repeat_count": 3.0, + "routers_loss": 0.00568761583417654, + "skip_count": 0.0, + "step": 4202, + "text_loss": 0.1707649976015091 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007060039816793141, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6780284.0, + "repeat_count": 0.0, + "routers_loss": 0.0030401297844946384, + "skip_count": 0.0, + "step": 4204, + "text_loss": 0.2686377167701721 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 19.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007057219168905625, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 6783525.0, + "repeat_count": 1.0, + "routers_loss": 0.003353122156113386, + "skip_count": 5.0, + "step": 4206, + "text_loss": 0.5235374569892883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.000705439773268784, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6787691.0, + "repeat_count": 0.0, + "routers_loss": 0.0016532237641513348, + "skip_count": 1.0, + "step": 4208, + "text_loss": 0.5002681612968445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007051575509220972, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 6790833.0, + "repeat_count": 0.0, + "routers_loss": 0.0011808308772742748, + "skip_count": 0.0, + "step": 4210, + "text_loss": 0.7251001596450806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0007048752499586497, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 6794260.0, + "repeat_count": 0.0, + "routers_loss": 0.006246297620236874, + "skip_count": 2.0, + "step": 4212, + "text_loss": 0.2430499643087387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.00070459287048662, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6797413.0, + "repeat_count": 0.0, + "routers_loss": 0.0012964420020580292, + "skip_count": 0.0, + "step": 4214, + "text_loss": 0.48889362812042236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0007043104126142163, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6800815.0, + "repeat_count": 0.0, + "routers_loss": 0.0018109704833477736, + "skip_count": 0.0, + "step": 4216, + "text_loss": 0.5617026686668396 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 19.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0007040278764496771, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 6803937.0, + "repeat_count": 2.0, + "routers_loss": 0.0028699536342173815, + "skip_count": 1.0, + "step": 4218, + "text_loss": 0.548405647277832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007037452621012708, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6806946.0, + "repeat_count": 0.0, + "routers_loss": 0.0007951617590151727, + "skip_count": 0.0, + "step": 4220, + "text_loss": 0.5702725648880005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0007034625696772958, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6810083.0, + "repeat_count": 0.0, + "routers_loss": 0.003436052706092596, + "skip_count": 2.0, + "step": 4222, + "text_loss": 0.3898725211620331 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00070317979928608, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6812845.0, + "repeat_count": 0.0, + "routers_loss": 0.0005070401239208877, + "skip_count": 0.0, + "step": 4224, + "text_loss": 0.5244157910346985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.840622248312297, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000702896951035982, + "loss": 0.0101, + "macro_f1": 0.3272727429866791, + "num_tokens": 6815801.0, + "repeat_count": 0.0, + "routers_loss": 0.01560303382575512, + "skip_count": 1.0, + "step": 4226, + "text_loss": 0.26503118872642517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007026140250353896, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 6819464.0, + "repeat_count": 0.0, + "routers_loss": 0.009310240857303143, + "skip_count": 2.0, + "step": 4228, + "text_loss": 0.15597499907016754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0007023310213927208, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6822657.0, + "repeat_count": 0.0, + "routers_loss": 0.005309136584401131, + "skip_count": 0.0, + "step": 4230, + "text_loss": 0.5271651148796082 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046875, + "learning_rate": 0.0007020479402164226, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 6825661.0, + "repeat_count": 0.0, + "routers_loss": 0.005936166271567345, + "skip_count": 2.0, + "step": 4232, + "text_loss": 0.6105108857154846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007017647816149727, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6828688.0, + "repeat_count": 0.0, + "routers_loss": 0.001653556595556438, + "skip_count": 0.0, + "step": 4234, + "text_loss": 0.6966437101364136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.000701481545696878, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 6831850.0, + "repeat_count": 0.0, + "routers_loss": 0.0013501866487786174, + "skip_count": 0.0, + "step": 4236, + "text_loss": 1.259678840637207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0007011982325706747, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6834862.0, + "repeat_count": 0.0, + "routers_loss": 0.008970130234956741, + "skip_count": 1.0, + "step": 4238, + "text_loss": 0.24906545877456665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007009148423449292, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6838148.0, + "repeat_count": 0.0, + "routers_loss": 0.0026013399474322796, + "skip_count": 0.0, + "step": 4240, + "text_loss": 0.291467547416687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.915761667155856, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0007006313751282371, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 6841142.0, + "repeat_count": 0.0, + "routers_loss": 0.021415632218122482, + "skip_count": 1.0, + "step": 4242, + "text_loss": 0.507606029510498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007003478310292236, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6844042.0, + "repeat_count": 0.0, + "routers_loss": 0.0023636550176888704, + "skip_count": 0.0, + "step": 4244, + "text_loss": 0.11626995354890823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.934546521866746, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0007000642101565433, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 6847359.0, + "repeat_count": 1.0, + "routers_loss": 0.025154776871204376, + "skip_count": 0.0, + "step": 4246, + "text_loss": 0.42898693680763245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0006997805126188803, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6850443.0, + "repeat_count": 0.0, + "routers_loss": 0.00540317315608263, + "skip_count": 0.0, + "step": 4248, + "text_loss": 0.18085283041000366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000699496738524948, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 6853495.0, + "repeat_count": 0.0, + "routers_loss": 0.0014433214673772454, + "skip_count": 0.0, + "step": 4250, + "text_loss": 0.5524004697799683 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006992128879834891, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 6856774.0, + "repeat_count": 1.0, + "routers_loss": 0.013381492346525192, + "skip_count": 3.0, + "step": 4252, + "text_loss": 0.19605717062950134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006989289611032758, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 6860313.0, + "repeat_count": 0.0, + "routers_loss": 0.007140172645449638, + "skip_count": 1.0, + "step": 4254, + "text_loss": 0.3182447552680969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006986449579931091, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6863683.0, + "repeat_count": 0.0, + "routers_loss": 0.006486213766038418, + "skip_count": 1.0, + "step": 4256, + "text_loss": 0.19250160455703735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006983608787618201, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6867609.0, + "repeat_count": 0.0, + "routers_loss": 0.001465818495489657, + "skip_count": 0.0, + "step": 4258, + "text_loss": 0.5912898182868958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.000698076723518268, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6870040.0, + "repeat_count": 0.0, + "routers_loss": 0.0031106441747397184, + "skip_count": 0.0, + "step": 4260, + "text_loss": 0.13542121648788452 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006977924923713418, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6873441.0, + "repeat_count": 0.0, + "routers_loss": 0.0005377951893024147, + "skip_count": 0.0, + "step": 4262, + "text_loss": 0.352464497089386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006975081854299594, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 6876637.0, + "repeat_count": 0.0, + "routers_loss": 0.007052485831081867, + "skip_count": 0.0, + "step": 4264, + "text_loss": 0.5023844242095947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0006972238028030678, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6879928.0, + "repeat_count": 0.0, + "routers_loss": 0.0013608322478830814, + "skip_count": 0.0, + "step": 4266, + "text_loss": 0.8664718270301819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0006969393445996429, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6883425.0, + "repeat_count": 0.0, + "routers_loss": 0.0007607188890688121, + "skip_count": 0.0, + "step": 4268, + "text_loss": 0.5131992101669312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006966548109286897, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6886790.0, + "repeat_count": 0.0, + "routers_loss": 0.00035804163780994713, + "skip_count": 0.0, + "step": 4270, + "text_loss": 0.5352054834365845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.000696370201899242, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6889747.0, + "repeat_count": 0.0, + "routers_loss": 0.004451376851648092, + "skip_count": 1.0, + "step": 4272, + "text_loss": 0.47865036129951477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006960855176203623, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6892604.0, + "repeat_count": 0.0, + "routers_loss": 0.0015342880506068468, + "skip_count": 0.0, + "step": 4274, + "text_loss": 0.36278650164604187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0006958007582011425, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6895563.0, + "repeat_count": 0.0, + "routers_loss": 0.0022974940948188305, + "skip_count": 2.0, + "step": 4276, + "text_loss": 0.6695618629455566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006955159237507027, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6898591.0, + "repeat_count": 0.0, + "routers_loss": 0.00859096460044384, + "skip_count": 1.0, + "step": 4278, + "text_loss": 0.44284722208976746 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0006952310143781921, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6903119.0, + "repeat_count": 1.0, + "routers_loss": 0.007919861935079098, + "skip_count": 3.0, + "step": 4280, + "text_loss": 0.5006136298179626 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0006949460301927886, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6906394.0, + "repeat_count": 0.0, + "routers_loss": 0.0008476210059598088, + "skip_count": 0.0, + "step": 4282, + "text_loss": 0.8153555989265442 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.0006946609713036985, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 6909136.0, + "repeat_count": 0.0, + "routers_loss": 0.006711610127240419, + "skip_count": 2.0, + "step": 4284, + "text_loss": 0.43136683106422424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0006943758378201571, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 6912734.0, + "repeat_count": 0.0, + "routers_loss": 0.0038677838165313005, + "skip_count": 0.0, + "step": 4286, + "text_loss": 0.2693749964237213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0006940906298514278, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6915838.0, + "repeat_count": 0.0, + "routers_loss": 0.0012188015971332788, + "skip_count": 0.0, + "step": 4288, + "text_loss": 0.5809219479560852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0006938053475068031, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6919225.0, + "repeat_count": 0.0, + "routers_loss": 0.001955829095095396, + "skip_count": 0.0, + "step": 4290, + "text_loss": 0.5116089582443237 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006935199908956037, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 6922495.0, + "repeat_count": 1.0, + "routers_loss": 0.0035709093790501356, + "skip_count": 0.0, + "step": 4292, + "text_loss": 0.2745901644229889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0006932345601271786, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 6925317.0, + "repeat_count": 0.0, + "routers_loss": 0.0005745319649577141, + "skip_count": 0.0, + "step": 4294, + "text_loss": 0.6039219498634338 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 20.169063692398005, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0006929490553109056, + "loss": 0.0107, + "macro_f1": 0.9247862696647644, + "num_tokens": 6928054.0, + "repeat_count": 3.0, + "routers_loss": 0.061689916998147964, + "skip_count": 6.0, + "step": 4296, + "text_loss": 0.3904837667942047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006926634765561907, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 6931348.0, + "repeat_count": 0.0, + "routers_loss": 0.002007248578593135, + "skip_count": 0.0, + "step": 4298, + "text_loss": 0.5170742273330688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000692377823972468, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6934411.0, + "repeat_count": 0.0, + "routers_loss": 0.0005786226247437298, + "skip_count": 0.0, + "step": 4300, + "text_loss": 0.8032443523406982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.19724097446434, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006920920976692004, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 6938153.0, + "repeat_count": 1.0, + "routers_loss": 0.024602646008133888, + "skip_count": 0.0, + "step": 4302, + "text_loss": 0.446534663438797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006918062977558784, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6940731.0, + "repeat_count": 0.0, + "routers_loss": 0.005759815219789743, + "skip_count": 2.0, + "step": 4304, + "text_loss": 0.15479247272014618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006915204243420214, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6943246.0, + "repeat_count": 0.0, + "routers_loss": 0.005315347574651241, + "skip_count": 1.0, + "step": 4306, + "text_loss": 0.22127842903137207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006912344775371765, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6947197.0, + "repeat_count": 0.0, + "routers_loss": 0.0012061651796102524, + "skip_count": 0.0, + "step": 4308, + "text_loss": 0.7058854103088379 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006909484574509191, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6951817.0, + "repeat_count": 0.0, + "routers_loss": 0.0029203309677541256, + "skip_count": 0.0, + "step": 4310, + "text_loss": 0.6014000773429871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0006906623641928525, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6955094.0, + "repeat_count": 0.0, + "routers_loss": 0.005703397560864687, + "skip_count": 2.0, + "step": 4312, + "text_loss": 0.5923848152160645 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0006903761978726084, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 6958127.0, + "repeat_count": 1.0, + "routers_loss": 0.004489895887672901, + "skip_count": 2.0, + "step": 4314, + "text_loss": 0.36911651492118835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.000690089958599846, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 6960871.0, + "repeat_count": 0.0, + "routers_loss": 0.003871412482112646, + "skip_count": 2.0, + "step": 4316, + "text_loss": 0.442545086145401 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.000689803646484253, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6963980.0, + "repeat_count": 1.0, + "routers_loss": 0.008667866699397564, + "skip_count": 2.0, + "step": 4318, + "text_loss": 0.1987489014863968 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0006895172616355446, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6967132.0, + "repeat_count": 1.0, + "routers_loss": 0.00843339879065752, + "skip_count": 0.0, + "step": 4320, + "text_loss": 0.48267918825149536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006892308041634639, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6969971.0, + "repeat_count": 0.0, + "routers_loss": 0.0004312851815484464, + "skip_count": 0.0, + "step": 4322, + "text_loss": 0.3662732243537903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006889442741777822, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6973114.0, + "repeat_count": 0.0, + "routers_loss": 0.004588035400956869, + "skip_count": 3.0, + "step": 4324, + "text_loss": 0.6707104444503784 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.309950102729672, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0006886576717882982, + "loss": 0.0057, + "macro_f1": 0.8817967176437378, + "num_tokens": 6976013.0, + "repeat_count": 2.0, + "routers_loss": 0.0687296912074089, + "skip_count": 3.0, + "step": 4326, + "text_loss": 0.1662217676639557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006883709971048384, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6979200.0, + "repeat_count": 0.0, + "routers_loss": 0.002950174268335104, + "skip_count": 0.0, + "step": 4328, + "text_loss": 0.21168152987957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006880842502372572, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6982640.0, + "repeat_count": 0.0, + "routers_loss": 0.0032158740796148777, + "skip_count": 0.0, + "step": 4330, + "text_loss": 0.26790961623191833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0006877974312954365, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6985917.0, + "repeat_count": 0.0, + "routers_loss": 0.0005083635332994163, + "skip_count": 0.0, + "step": 4332, + "text_loss": 0.9736502170562744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.347519812151454, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.000687510540389286, + "loss": 0.0053, + "macro_f1": 0.32098764181137085, + "num_tokens": 6988388.0, + "repeat_count": 0.0, + "routers_loss": 0.03473830223083496, + "skip_count": 2.0, + "step": 4334, + "text_loss": 0.21662230789661407 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006872235776287425, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6991360.0, + "repeat_count": 0.0, + "routers_loss": 0.002206524135544896, + "skip_count": 0.0, + "step": 4336, + "text_loss": 0.6026972532272339 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0006869365431237711, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6995080.0, + "repeat_count": 1.0, + "routers_loss": 0.000969731598161161, + "skip_count": 0.0, + "step": 4338, + "text_loss": 0.5833017230033875 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.375697094217788, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006866494369843635, + "loss": 0.0054, + "macro_f1": 0.8820862174034119, + "num_tokens": 6998526.0, + "repeat_count": 2.0, + "routers_loss": 0.013962293043732643, + "skip_count": 2.0, + "step": 4340, + "text_loss": 0.41465985774993896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0006863622593205397, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 7001494.0, + "repeat_count": 0.0, + "routers_loss": 0.0064964210614562035, + "skip_count": 3.0, + "step": 4342, + "text_loss": 0.3774271011352539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 20.394481948928675, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006860750102423464, + "loss": 0.0062, + "macro_f1": 0.6589147448539734, + "num_tokens": 7005544.0, + "repeat_count": 1.0, + "routers_loss": 0.023250726982951164, + "skip_count": 6.0, + "step": 4344, + "text_loss": 0.2732464373111725 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0006857876898598582, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 7008847.0, + "repeat_count": 0.0, + "routers_loss": 0.0038170060142874718, + "skip_count": 2.0, + "step": 4346, + "text_loss": 0.29610875248908997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0006855002982831769, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7012577.0, + "repeat_count": 0.0, + "routers_loss": 0.0012856025714427233, + "skip_count": 0.0, + "step": 4348, + "text_loss": 0.6098502278327942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0006852128356224314, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7015650.0, + "repeat_count": 0.0, + "routers_loss": 0.008162742480635643, + "skip_count": 1.0, + "step": 4350, + "text_loss": 0.20868146419525146 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.432051658350456, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.0006849253019877778, + "loss": 0.0074, + "macro_f1": 0.8817967176437378, + "num_tokens": 7019925.0, + "repeat_count": 2.0, + "routers_loss": 0.023544032126665115, + "skip_count": 3.0, + "step": 4352, + "text_loss": 0.628226101398468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0006846376974893996, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 7023130.0, + "repeat_count": 0.0, + "routers_loss": 0.004982319660484791, + "skip_count": 2.0, + "step": 4354, + "text_loss": 0.7037544250488281 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0006843500222375074, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7026422.0, + "repeat_count": 1.0, + "routers_loss": 0.004015266429632902, + "skip_count": 0.0, + "step": 4356, + "text_loss": 0.22352729737758636 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 20.46022894041679, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006840622763423391, + "loss": 0.0071, + "macro_f1": 0.9449735879898071, + "num_tokens": 7029077.0, + "repeat_count": 2.0, + "routers_loss": 0.021162014454603195, + "skip_count": 4.0, + "step": 4358, + "text_loss": 0.2431403249502182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006837744599141591, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7032582.0, + "repeat_count": 0.0, + "routers_loss": 0.0007044129306450486, + "skip_count": 0.0, + "step": 4360, + "text_loss": 0.26667487621307373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0006834865730632594, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7035642.0, + "repeat_count": 0.0, + "routers_loss": 0.0067853196524083614, + "skip_count": 1.0, + "step": 4362, + "text_loss": 0.20965275168418884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006831986158999588, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7038601.0, + "repeat_count": 0.0, + "routers_loss": 0.00899333506822586, + "skip_count": 2.0, + "step": 4364, + "text_loss": 0.26860126852989197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.000682910588534603, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7042274.0, + "repeat_count": 0.0, + "routers_loss": 0.0019194348715245724, + "skip_count": 0.0, + "step": 4366, + "text_loss": 0.14046810567378998 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0006826224910775647, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 7045268.0, + "repeat_count": 1.0, + "routers_loss": 0.006915684789419174, + "skip_count": 3.0, + "step": 4368, + "text_loss": 0.5900366306304932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0006823343236392432, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7049407.0, + "repeat_count": 0.0, + "routers_loss": 0.001678116386756301, + "skip_count": 0.0, + "step": 4370, + "text_loss": 0.7868026494979858 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.000682046086330065, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7052783.0, + "repeat_count": 0.0, + "routers_loss": 0.0003459530707914382, + "skip_count": 0.0, + "step": 4372, + "text_loss": 0.6349637508392334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0006817577792604831, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7055757.0, + "repeat_count": 0.0, + "routers_loss": 0.0011729507241398096, + "skip_count": 0.0, + "step": 4374, + "text_loss": 0.43258991837501526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0006814694025409773, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 7058684.0, + "repeat_count": 0.0, + "routers_loss": 0.0006664610700681806, + "skip_count": 0.0, + "step": 4376, + "text_loss": 0.5307940244674683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.091796875, + "learning_rate": 0.0006811809562820542, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 7061902.0, + "repeat_count": 0.0, + "routers_loss": 0.004595907870680094, + "skip_count": 2.0, + "step": 4378, + "text_loss": 0.5830042362213135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0006808924405942467, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7065100.0, + "repeat_count": 0.0, + "routers_loss": 0.0032026609405875206, + "skip_count": 0.0, + "step": 4380, + "text_loss": 0.20797798037528992 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0006806038555881148, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 7068556.0, + "repeat_count": 1.0, + "routers_loss": 0.0024626904632896185, + "skip_count": 0.0, + "step": 4382, + "text_loss": 0.5791074633598328 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006803152013742448, + "loss": 0.0075, + "macro_f1": 1.0, + "num_tokens": 7071284.0, + "repeat_count": 1.0, + "routers_loss": 0.010723610408604145, + "skip_count": 2.0, + "step": 4384, + "text_loss": 0.13227243721485138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006800264780632495, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7074428.0, + "repeat_count": 1.0, + "routers_loss": 0.0011231007520109415, + "skip_count": 0.0, + "step": 4386, + "text_loss": 0.4360627233982086 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 20.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0006797376857657681, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 7078313.0, + "repeat_count": 2.0, + "routers_loss": 0.008419238030910492, + "skip_count": 1.0, + "step": 4388, + "text_loss": 0.5183924436569214 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006794488245924664, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 7081258.0, + "repeat_count": 1.0, + "routers_loss": 0.006582668516784906, + "skip_count": 3.0, + "step": 4390, + "text_loss": 0.2797473669052124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006791598946540368, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 7084527.0, + "repeat_count": 0.0, + "routers_loss": 0.00557357631623745, + "skip_count": 2.0, + "step": 4392, + "text_loss": 0.39495575428009033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0006788708960611975, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 7087675.0, + "repeat_count": 0.0, + "routers_loss": 0.007155992556363344, + "skip_count": 0.0, + "step": 4394, + "text_loss": 0.3785299062728882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.0006785818289246934, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7090171.0, + "repeat_count": 0.0, + "routers_loss": 0.0009265039698220789, + "skip_count": 0.0, + "step": 4396, + "text_loss": 0.42634522914886475 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 20.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006782926933552955, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 7092529.0, + "repeat_count": 1.0, + "routers_loss": 0.008679097518324852, + "skip_count": 7.0, + "step": 4398, + "text_loss": 0.4283660054206848 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006780034894638014, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7095141.0, + "repeat_count": 0.0, + "routers_loss": 0.002363949315622449, + "skip_count": 0.0, + "step": 4400, + "text_loss": 0.481539249420166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.000677714217361034, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7098208.0, + "repeat_count": 0.0, + "routers_loss": 0.004005146212875843, + "skip_count": 3.0, + "step": 4402, + "text_loss": 0.6443291902542114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006774248771578435, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7101681.0, + "repeat_count": 0.0, + "routers_loss": 0.0026864963583648205, + "skip_count": 0.0, + "step": 4404, + "text_loss": 0.16315312683582306 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 20.68564719694746, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006771354689651054, + "loss": 0.005, + "macro_f1": 0.9449735879898071, + "num_tokens": 7104719.0, + "repeat_count": 2.0, + "routers_loss": 0.02719845622777939, + "skip_count": 4.0, + "step": 4406, + "text_loss": 0.37855592370033264 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0006768459928937213, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7108697.0, + "repeat_count": 0.0, + "routers_loss": 0.010488593950867653, + "skip_count": 0.0, + "step": 4408, + "text_loss": 0.23133711516857147 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0006765564490546193, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7111426.0, + "repeat_count": 1.0, + "routers_loss": 0.0013637891970574856, + "skip_count": 0.0, + "step": 4410, + "text_loss": 0.41399383544921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0006762668375587528, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7114241.0, + "repeat_count": 0.0, + "routers_loss": 0.000900395680218935, + "skip_count": 0.0, + "step": 4412, + "text_loss": 0.6460412740707397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0006759771585171016, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7117031.0, + "repeat_count": 0.0, + "routers_loss": 0.0024001260753721, + "skip_count": 0.0, + "step": 4414, + "text_loss": 0.7645824551582336 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006756874120406714, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 7120766.0, + "repeat_count": 3.0, + "routers_loss": 0.005034091416746378, + "skip_count": 4.0, + "step": 4416, + "text_loss": 0.31753066182136536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0006753975982404934, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7125243.0, + "repeat_count": 0.0, + "routers_loss": 0.002483269665390253, + "skip_count": 0.0, + "step": 4418, + "text_loss": 0.5304268002510071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.751394188435572, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0006751077172276249, + "loss": 0.0052, + "macro_f1": 0.3272727429866791, + "num_tokens": 7127795.0, + "repeat_count": 0.0, + "routers_loss": 0.02676006779074669, + "skip_count": 1.0, + "step": 4420, + "text_loss": 0.22011354565620422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.000674817769113149, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7130837.0, + "repeat_count": 0.0, + "routers_loss": 0.003267093561589718, + "skip_count": 2.0, + "step": 4422, + "text_loss": 0.2906076908111572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 20.770179043146463, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.027099609375, + "learning_rate": 0.000674527754008174, + "loss": 0.0045, + "macro_f1": 0.5934640765190125, + "num_tokens": 7135090.0, + "repeat_count": 0.0, + "routers_loss": 0.022510390728712082, + "skip_count": 3.0, + "step": 4424, + "text_loss": 0.2544902563095093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006742376720238345, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 7138751.0, + "repeat_count": 0.0, + "routers_loss": 0.0011178571730852127, + "skip_count": 0.0, + "step": 4426, + "text_loss": 0.6811438798904419 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 20.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006739475232712904, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 7141762.0, + "repeat_count": 2.0, + "routers_loss": 0.005595206283032894, + "skip_count": 1.0, + "step": 4428, + "text_loss": 0.38743990659713745 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0006736573078617272, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7145235.0, + "repeat_count": 0.0, + "routers_loss": 0.002793942578136921, + "skip_count": 2.0, + "step": 4430, + "text_loss": 0.21894219517707825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0006733670259063561, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 7149042.0, + "repeat_count": 0.0, + "routers_loss": 0.006146818865090609, + "skip_count": 3.0, + "step": 4432, + "text_loss": 0.17822015285491943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 20.817141179923688, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006730766775164136, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 7152166.0, + "repeat_count": 0.0, + "routers_loss": 0.026045087724924088, + "skip_count": 2.0, + "step": 4434, + "text_loss": 0.2910420000553131 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 20.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0006727862628031618, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7155506.0, + "repeat_count": 2.0, + "routers_loss": 0.0022973387967795134, + "skip_count": 0.0, + "step": 4436, + "text_loss": 0.3502544164657593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0006724957818778882, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7158739.0, + "repeat_count": 0.0, + "routers_loss": 0.002357073128223419, + "skip_count": 1.0, + "step": 4438, + "text_loss": 0.26200664043426514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0006722052348519054, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 7161776.0, + "repeat_count": 0.0, + "routers_loss": 0.0005521026905626059, + "skip_count": 0.0, + "step": 4440, + "text_loss": 0.3922915458679199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000671914621836552, + "loss": 0.0106, + "macro_f1": 0.6666666865348816, + "num_tokens": 7164763.0, + "repeat_count": 0.0, + "routers_loss": 0.007691344246268272, + "skip_count": 2.0, + "step": 4442, + "text_loss": 0.6021351218223572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000671623942943191, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7167924.0, + "repeat_count": 0.0, + "routers_loss": 0.0032181134447455406, + "skip_count": 0.0, + "step": 4444, + "text_loss": 0.23639555275440216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.873495744056356, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0006713331982832113, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 7170743.0, + "repeat_count": 1.0, + "routers_loss": 0.024979131296277046, + "skip_count": 0.0, + "step": 4446, + "text_loss": 0.4957772493362427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006710423879680271, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7174660.0, + "repeat_count": 0.0, + "routers_loss": 0.002571308286860585, + "skip_count": 0.0, + "step": 4448, + "text_loss": 0.47968071699142456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000670751512109077, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7177965.0, + "repeat_count": 0.0, + "routers_loss": 0.00212799571454525, + "skip_count": 0.0, + "step": 4450, + "text_loss": 0.6550716161727905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006704605708178252, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 7181512.0, + "repeat_count": 0.0, + "routers_loss": 0.004176430404186249, + "skip_count": 1.0, + "step": 4452, + "text_loss": 0.36959558725357056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0006701695642057613, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7184555.0, + "repeat_count": 0.0, + "routers_loss": 0.0010968588758260012, + "skip_count": 0.0, + "step": 4454, + "text_loss": 0.6686749458312988 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0006698784923843993, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7187474.0, + "repeat_count": 0.0, + "routers_loss": 0.0014241471653804183, + "skip_count": 0.0, + "step": 4456, + "text_loss": 0.6147221922874451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006695873554652784, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7190649.0, + "repeat_count": 0.0, + "routers_loss": 0.008801907300949097, + "skip_count": 0.0, + "step": 4458, + "text_loss": 0.26381927728652954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006692961535599634, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 7193961.0, + "repeat_count": 0.0, + "routers_loss": 0.009027508087456226, + "skip_count": 1.0, + "step": 4460, + "text_loss": 0.1926470547914505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006690048867800427, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7197456.0, + "repeat_count": 0.0, + "routers_loss": 0.0022697453387081623, + "skip_count": 0.0, + "step": 4462, + "text_loss": 0.6736721992492676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006687135552371305, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7200290.0, + "repeat_count": 0.0, + "routers_loss": 0.006747903767973185, + "skip_count": 1.0, + "step": 4464, + "text_loss": 0.2026437371969223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006684221590428657, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7203320.0, + "repeat_count": 0.0, + "routers_loss": 0.0011565096210688353, + "skip_count": 0.0, + "step": 4466, + "text_loss": 0.7587730288505554 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.976812444966246, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0006681306983089121, + "loss": 0.0083, + "macro_f1": 0.8820862174034119, + "num_tokens": 7206411.0, + "repeat_count": 2.0, + "routers_loss": 0.023645581677556038, + "skip_count": 2.0, + "step": 4468, + "text_loss": 0.8981561660766602 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006678391731469575, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 7209421.0, + "repeat_count": 0.0, + "routers_loss": 0.0035848666448146105, + "skip_count": 0.0, + "step": 4470, + "text_loss": 0.1522839516401291 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 20.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006675475836687152, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 7212267.0, + "repeat_count": 1.0, + "routers_loss": 0.005046425387263298, + "skip_count": 1.0, + "step": 4472, + "text_loss": 0.46007999777793884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006672559299859228, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7215195.0, + "repeat_count": 0.0, + "routers_loss": 0.0019333874806761742, + "skip_count": 0.0, + "step": 4474, + "text_loss": 1.0859547853469849 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006669642122103423, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7217941.0, + "repeat_count": 0.0, + "routers_loss": 0.0005401032394729555, + "skip_count": 0.0, + "step": 4476, + "text_loss": 0.9754356145858765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.023481068388612, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006666724304537611, + "loss": 0.0053, + "macro_f1": 0.3272727429866791, + "num_tokens": 7222494.0, + "repeat_count": 1.0, + "routers_loss": 0.015569722279906273, + "skip_count": 0.0, + "step": 4478, + "text_loss": 0.2896423637866974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006663805848279898, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7225292.0, + "repeat_count": 0.0, + "routers_loss": 0.0020135147497057915, + "skip_count": 0.0, + "step": 4480, + "text_loss": 0.8492724299430847 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006660886754448648, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 7229184.0, + "repeat_count": 1.0, + "routers_loss": 0.002355351345613599, + "skip_count": 0.0, + "step": 4482, + "text_loss": 0.189764603972435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0006657967024162459, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7232906.0, + "repeat_count": 0.0, + "routers_loss": 0.003044391982257366, + "skip_count": 0.0, + "step": 4484, + "text_loss": 0.4239847660064697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006655046658540179, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 7235996.0, + "repeat_count": 0.0, + "routers_loss": 0.00602696230635047, + "skip_count": 2.0, + "step": 4486, + "text_loss": 0.217103973031044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0169677734375, + "learning_rate": 0.0006652125658700896, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 7238882.0, + "repeat_count": 0.0, + "routers_loss": 0.001470155781134963, + "skip_count": 1.0, + "step": 4488, + "text_loss": 0.6090770363807678 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006649204025763945, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 7241815.0, + "repeat_count": 1.0, + "routers_loss": 0.008737480267882347, + "skip_count": 2.0, + "step": 4490, + "text_loss": 0.48314425349235535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.0006646281760848902, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 7244848.0, + "repeat_count": 0.0, + "routers_loss": 0.0008257135050371289, + "skip_count": 0.0, + "step": 4492, + "text_loss": 0.5884748101234436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006643358865075581, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7247930.0, + "repeat_count": 0.0, + "routers_loss": 0.0016262239078059793, + "skip_count": 0.0, + "step": 4494, + "text_loss": 0.21444730460643768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0006640435339564042, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7251776.0, + "repeat_count": 0.0, + "routers_loss": 0.001315156347118318, + "skip_count": 0.0, + "step": 4496, + "text_loss": 0.6890370845794678 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006637511185434588, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 7255070.0, + "repeat_count": 1.0, + "routers_loss": 0.007614497095346451, + "skip_count": 3.0, + "step": 4498, + "text_loss": 0.516417920589447 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 21.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006634586403807758, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 7258115.0, + "repeat_count": 3.0, + "routers_loss": 0.004906686954200268, + "skip_count": 2.0, + "step": 4500, + "text_loss": 0.577463686466217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.13619019665395, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006631660995804334, + "loss": 0.0067, + "macro_f1": 0.6601307392120361, + "num_tokens": 7260769.0, + "repeat_count": 1.0, + "routers_loss": 0.013337121345102787, + "skip_count": 2.0, + "step": 4502, + "text_loss": 0.37124839425086975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006628734962545339, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7263908.0, + "repeat_count": 0.0, + "routers_loss": 0.0023418180644512177, + "skip_count": 0.0, + "step": 4504, + "text_loss": 0.17937727272510529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0006625808305152033, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7267391.0, + "repeat_count": 0.0, + "routers_loss": 0.0006556165171787143, + "skip_count": 0.0, + "step": 4506, + "text_loss": 0.45344987511634827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006622881024745919, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 7271402.0, + "repeat_count": 0.0, + "routers_loss": 0.0021988123189657927, + "skip_count": 0.0, + "step": 4508, + "text_loss": 0.5842905640602112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006619953122448734, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 7274354.0, + "repeat_count": 0.0, + "routers_loss": 0.00774174090474844, + "skip_count": 2.0, + "step": 4510, + "text_loss": 0.27159228920936584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0006617024599382456, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7277378.0, + "repeat_count": 0.0, + "routers_loss": 0.0006942499312572181, + "skip_count": 0.0, + "step": 4512, + "text_loss": 0.4464176297187805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0006614095456669302, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7280526.0, + "repeat_count": 0.0, + "routers_loss": 0.003003394464030862, + "skip_count": 0.0, + "step": 4514, + "text_loss": 0.31188079714775085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006611165695431725, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7283916.0, + "repeat_count": 0.0, + "routers_loss": 0.0006948060472495854, + "skip_count": 0.0, + "step": 4516, + "text_loss": 0.5266574025154114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006608235316792413, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7286843.0, + "repeat_count": 0.0, + "routers_loss": 0.0014080886030569673, + "skip_count": 0.0, + "step": 4518, + "text_loss": 0.5880120396614075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006605304321874295, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7289940.0, + "repeat_count": 0.0, + "routers_loss": 0.0016894340515136719, + "skip_count": 0.0, + "step": 4520, + "text_loss": 0.6623797416687012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006602372711800531, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7292869.0, + "repeat_count": 0.0, + "routers_loss": 0.003522444050759077, + "skip_count": 0.0, + "step": 4522, + "text_loss": 0.5488807559013367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006599440487694521, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7296618.0, + "repeat_count": 0.0, + "routers_loss": 0.0011981099378317595, + "skip_count": 0.0, + "step": 4524, + "text_loss": 0.4128517210483551 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.248899324919282, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00065965076506799, + "loss": 0.0047, + "macro_f1": 0.9262410998344421, + "num_tokens": 7300481.0, + "repeat_count": 3.0, + "routers_loss": 0.010548194870352745, + "skip_count": 2.0, + "step": 4526, + "text_loss": 0.26450902223587036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006593574201880536, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7303272.0, + "repeat_count": 0.0, + "routers_loss": 0.005642973352223635, + "skip_count": 1.0, + "step": 4528, + "text_loss": 0.35269856452941895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000659064014242053, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 7306615.0, + "repeat_count": 0.0, + "routers_loss": 0.004171932581812143, + "skip_count": 1.0, + "step": 4530, + "text_loss": 0.18814080953598022 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006587705473424223, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 7310368.0, + "repeat_count": 0.0, + "routers_loss": 0.002289367141202092, + "skip_count": 2.0, + "step": 4532, + "text_loss": 0.7363705635070801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000658477019601618, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 7313788.0, + "repeat_count": 0.0, + "routers_loss": 0.004440625663846731, + "skip_count": 1.0, + "step": 4534, + "text_loss": 0.8126176595687866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006581834311321211, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 7317864.0, + "repeat_count": 0.0, + "routers_loss": 0.0013160990783944726, + "skip_count": 2.0, + "step": 4536, + "text_loss": 0.7015916109085083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.000657889782046435, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7320693.0, + "repeat_count": 0.0, + "routers_loss": 0.0032275544945150614, + "skip_count": 2.0, + "step": 4538, + "text_loss": 0.6481677293777466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.314646316407398, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0006575960724570865, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 7324335.0, + "repeat_count": 0.0, + "routers_loss": 0.009769129566848278, + "skip_count": 1.0, + "step": 4540, + "text_loss": 0.22194676101207733 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006573023024766258, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 7327431.0, + "repeat_count": 2.0, + "routers_loss": 0.0036973082460463047, + "skip_count": 4.0, + "step": 4542, + "text_loss": 0.475127637386322 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.000657008472217626, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7330262.0, + "repeat_count": 0.0, + "routers_loss": 0.0007046440150588751, + "skip_count": 0.0, + "step": 4544, + "text_loss": 0.2649917006492615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0006567145817926836, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7333110.0, + "repeat_count": 0.0, + "routers_loss": 0.0026714997366070747, + "skip_count": 0.0, + "step": 4546, + "text_loss": 0.5490524768829346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0006564206313144175, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7336101.0, + "repeat_count": 0.0, + "routers_loss": 0.006552211008965969, + "skip_count": 0.0, + "step": 4548, + "text_loss": 0.14098678529262543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006561266208954707, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 7339435.0, + "repeat_count": 0.0, + "routers_loss": 0.0035560601390898228, + "skip_count": 2.0, + "step": 4550, + "text_loss": 0.20412275195121765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006558325506485081, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7342609.0, + "repeat_count": 0.0, + "routers_loss": 0.0020106974989175797, + "skip_count": 1.0, + "step": 4552, + "text_loss": 0.6184256076812744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0006555384206862183, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 7345614.0, + "repeat_count": 0.0, + "routers_loss": 0.0014235252747312188, + "skip_count": 0.0, + "step": 4554, + "text_loss": 1.0108838081359863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.389785735250953, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0006552442311213121, + "loss": 0.0041, + "macro_f1": 0.3272727429866791, + "num_tokens": 7348957.0, + "repeat_count": 1.0, + "routers_loss": 0.01703745685517788, + "skip_count": 0.0, + "step": 4556, + "text_loss": 0.21315747499465942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 21.399178162606397, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006549499820665237, + "loss": 0.0077, + "macro_f1": 0.5934640765190125, + "num_tokens": 7352724.0, + "repeat_count": 0.0, + "routers_loss": 0.013315381482243538, + "skip_count": 3.0, + "step": 4558, + "text_loss": 0.34369465708732605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00065465567363461, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7356592.0, + "repeat_count": 0.0, + "routers_loss": 0.0017354936571791768, + "skip_count": 0.0, + "step": 4560, + "text_loss": 0.6267461180686951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0006543613059383503, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7359774.0, + "repeat_count": 0.0, + "routers_loss": 0.011646085418760777, + "skip_count": 2.0, + "step": 4562, + "text_loss": 0.4400193989276886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006540668790905471, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7362765.0, + "repeat_count": 0.0, + "routers_loss": 0.0019345436012372375, + "skip_count": 0.0, + "step": 4564, + "text_loss": 0.49204275012016296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006537723932040251, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7366337.0, + "repeat_count": 0.0, + "routers_loss": 0.00562885170802474, + "skip_count": 1.0, + "step": 4566, + "text_loss": 0.22566382586956024 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006534778483916319, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 7369851.0, + "repeat_count": 2.0, + "routers_loss": 0.005508176051080227, + "skip_count": 2.0, + "step": 4568, + "text_loss": 0.8057850003242493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006531832447662377, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7373918.0, + "repeat_count": 0.0, + "routers_loss": 0.006460923235863447, + "skip_count": 2.0, + "step": 4570, + "text_loss": 0.5141497254371643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006528885824407351, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7376674.0, + "repeat_count": 0.0, + "routers_loss": 0.0032120654359459877, + "skip_count": 0.0, + "step": 4572, + "text_loss": 0.1281338930130005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0006525938615280394, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 7379791.0, + "repeat_count": 0.0, + "routers_loss": 0.00443810923025012, + "skip_count": 0.0, + "step": 4574, + "text_loss": 0.268352210521698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.000652299082141088, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 7382886.0, + "repeat_count": 0.0, + "routers_loss": 0.008284369483590126, + "skip_count": 2.0, + "step": 4576, + "text_loss": 0.30193832516670227 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.493102436160846, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006520042443928411, + "loss": 0.0068, + "macro_f1": 0.8823530077934265, + "num_tokens": 7386036.0, + "repeat_count": 2.0, + "routers_loss": 0.03383317217230797, + "skip_count": 1.0, + "step": 4578, + "text_loss": 0.23106542229652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.000651709348396281, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7388908.0, + "repeat_count": 0.0, + "routers_loss": 0.0017075951909646392, + "skip_count": 1.0, + "step": 4580, + "text_loss": 0.386099249124527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006514143942644124, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7392004.0, + "repeat_count": 0.0, + "routers_loss": 0.009516917169094086, + "skip_count": 1.0, + "step": 4582, + "text_loss": 0.3162059485912323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0006511193821102623, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 7395538.0, + "repeat_count": 0.0, + "routers_loss": 0.0031392278615385294, + "skip_count": 0.0, + "step": 4584, + "text_loss": 0.5536221861839294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006508243120468799, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7398461.0, + "repeat_count": 0.0, + "routers_loss": 0.0014138511614874005, + "skip_count": 0.0, + "step": 4586, + "text_loss": 0.7934318780899048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006505291841873367, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7401611.0, + "repeat_count": 0.0, + "routers_loss": 0.0005265916115604341, + "skip_count": 0.0, + "step": 4588, + "text_loss": 0.4569905698299408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.000650233998644726, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7404641.0, + "repeat_count": 0.0, + "routers_loss": 0.0024988956283777952, + "skip_count": 0.0, + "step": 4590, + "text_loss": 0.49998772144317627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0006499387555321636, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7407574.0, + "repeat_count": 0.0, + "routers_loss": 0.004110113717615604, + "skip_count": 1.0, + "step": 4592, + "text_loss": 0.5679413676261902 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006496434549627874, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7410806.0, + "repeat_count": 0.0, + "routers_loss": 0.0032845588866621256, + "skip_count": 0.0, + "step": 4594, + "text_loss": 0.35515281558036804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006493480970497568, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7413402.0, + "repeat_count": 0.0, + "routers_loss": 0.010577172972261906, + "skip_count": 1.0, + "step": 4596, + "text_loss": 0.26111698150634766 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006490526819062537, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 7417236.0, + "repeat_count": 1.0, + "routers_loss": 0.002054794691503048, + "skip_count": 2.0, + "step": 4598, + "text_loss": 0.6480993628501892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0006487572096454818, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7420278.0, + "repeat_count": 0.0, + "routers_loss": 0.0017989084590226412, + "skip_count": 0.0, + "step": 4600, + "text_loss": 0.4935401678085327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006484616803806665, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7423866.0, + "repeat_count": 0.0, + "routers_loss": 0.006671485956758261, + "skip_count": 1.0, + "step": 4602, + "text_loss": 0.15030258893966675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0006481660942250552, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7426884.0, + "repeat_count": 0.0, + "routers_loss": 0.008334980346262455, + "skip_count": 3.0, + "step": 4604, + "text_loss": 0.29933279752731323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006478704512919173, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 7431017.0, + "repeat_count": 0.0, + "routers_loss": 0.011923984624445438, + "skip_count": 3.0, + "step": 4606, + "text_loss": 0.35141825675964355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0006475747516945432, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7434406.0, + "repeat_count": 0.0, + "routers_loss": 0.0031092462595552206, + "skip_count": 3.0, + "step": 4608, + "text_loss": 0.21021464467048645 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.000647278995546246, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7437204.0, + "repeat_count": 1.0, + "routers_loss": 0.0006713552866131067, + "skip_count": 0.0, + "step": 4610, + "text_loss": 0.4052635431289673 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006469831829603598, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7439741.0, + "repeat_count": 0.0, + "routers_loss": 0.0022583482787013054, + "skip_count": 2.0, + "step": 4612, + "text_loss": 0.5443860292434692 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006466873140502407, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7443619.0, + "repeat_count": 0.0, + "routers_loss": 0.004187075886875391, + "skip_count": 2.0, + "step": 4614, + "text_loss": 0.30709847807884216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006463913889292661, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7446696.0, + "repeat_count": 0.0, + "routers_loss": 0.008314833045005798, + "skip_count": 0.0, + "step": 4616, + "text_loss": 0.22949637472629547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006460954077108353, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7450377.0, + "repeat_count": 0.0, + "routers_loss": 0.001277514616958797, + "skip_count": 0.0, + "step": 4618, + "text_loss": 0.37715134024620056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006457993705083684, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 7453271.0, + "repeat_count": 0.0, + "routers_loss": 0.0022756033577024937, + "skip_count": 2.0, + "step": 4620, + "text_loss": 0.7373883128166199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0006455032774353078, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7456492.0, + "repeat_count": 0.0, + "routers_loss": 0.0039057908579707146, + "skip_count": 2.0, + "step": 4622, + "text_loss": 0.5058769583702087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0006452071286051169, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 7459619.0, + "repeat_count": 0.0, + "routers_loss": 0.0019458672031760216, + "skip_count": 0.0, + "step": 4624, + "text_loss": 0.5110082030296326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0006449109241312802, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 7462552.0, + "repeat_count": 0.0, + "routers_loss": 0.0002716891176532954, + "skip_count": 1.0, + "step": 4626, + "text_loss": 0.6197522878646851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006446146641273042, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7466769.0, + "repeat_count": 0.0, + "routers_loss": 0.0037578947376459837, + "skip_count": 2.0, + "step": 4628, + "text_loss": 0.1653924286365509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.000644318348706716, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7470216.0, + "repeat_count": 0.0, + "routers_loss": 0.0012791058979928493, + "skip_count": 0.0, + "step": 4630, + "text_loss": 0.7114694118499756 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0006440219779830643, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 7472975.0, + "repeat_count": 0.0, + "routers_loss": 0.00736592011526227, + "skip_count": 2.0, + "step": 4632, + "text_loss": 0.26601463556289673 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000643725552069919, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7475672.0, + "repeat_count": 0.0, + "routers_loss": 0.00045455715735442936, + "skip_count": 0.0, + "step": 4634, + "text_loss": 0.5028402805328369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0006434290710808711, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7478850.0, + "repeat_count": 0.0, + "routers_loss": 0.004247233271598816, + "skip_count": 2.0, + "step": 4636, + "text_loss": 0.12746070325374603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 21.774875256824185, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04052734375, + "learning_rate": 0.0006431325351295324, + "loss": 0.0083, + "macro_f1": 0.5427350401878357, + "num_tokens": 7481747.0, + "repeat_count": 1.0, + "routers_loss": 0.047564394772052765, + "skip_count": 2.0, + "step": 4638, + "text_loss": 0.24056802690029144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0006428359443295362, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7484885.0, + "repeat_count": 0.0, + "routers_loss": 0.0011175100225955248, + "skip_count": 0.0, + "step": 4640, + "text_loss": 0.6265338063240051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 21.793660111535075, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006425392987945369, + "loss": 0.0086, + "macro_f1": 0.5492662787437439, + "num_tokens": 7487973.0, + "repeat_count": 0.0, + "routers_loss": 0.016879938542842865, + "skip_count": 2.0, + "step": 4642, + "text_loss": 0.2523447275161743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 21.80305253889052, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.032958984375, + "learning_rate": 0.0006422425986382093, + "loss": 0.0055, + "macro_f1": 0.5934640765190125, + "num_tokens": 7491024.0, + "repeat_count": 0.0, + "routers_loss": 0.018616504967212677, + "skip_count": 3.0, + "step": 4644, + "text_loss": 0.38890624046325684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.812444966245963, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0006419458439742496, + "loss": 0.0056, + "macro_f1": 0.3272727429866791, + "num_tokens": 7494199.0, + "repeat_count": 0.0, + "routers_loss": 0.023129139095544815, + "skip_count": 1.0, + "step": 4646, + "text_loss": 0.4060848355293274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006416490349163747, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 7497287.0, + "repeat_count": 0.0, + "routers_loss": 0.0018601802876219153, + "skip_count": 0.0, + "step": 4648, + "text_loss": 0.3387545943260193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006413521715783225, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 7500598.0, + "repeat_count": 0.0, + "routers_loss": 0.0017482215771451592, + "skip_count": 0.0, + "step": 4650, + "text_loss": 0.4290996193885803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.840622248312297, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0006410552540738514, + "loss": 0.007, + "macro_f1": 0.3272727429866791, + "num_tokens": 7503252.0, + "repeat_count": 1.0, + "routers_loss": 0.0420118011534214, + "skip_count": 0.0, + "step": 4652, + "text_loss": 0.439496248960495 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.000640758282516741, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 7506382.0, + "repeat_count": 1.0, + "routers_loss": 0.0017782216891646385, + "skip_count": 1.0, + "step": 4654, + "text_loss": 0.8513308167457581 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006404612570207911, + "loss": 0.0102, + "macro_f1": 0.3272727429866791, + "num_tokens": 7510423.0, + "repeat_count": 0.0, + "routers_loss": 0.010385853238403797, + "skip_count": 0.0, + "step": 4656, + "text_loss": 0.7159742712974548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006401641776998223, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7513394.0, + "repeat_count": 0.0, + "routers_loss": 0.0011917101219296455, + "skip_count": 0.0, + "step": 4658, + "text_loss": 0.6165401339530945 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006398670446676766, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7516828.0, + "repeat_count": 3.0, + "routers_loss": 0.008860073052346706, + "skip_count": 4.0, + "step": 4660, + "text_loss": 0.923275887966156 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0006395698580382153, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7519764.0, + "repeat_count": 0.0, + "routers_loss": 0.000505418807733804, + "skip_count": 0.0, + "step": 4662, + "text_loss": 0.6143050789833069 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006392726179253212, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7522390.0, + "repeat_count": 0.0, + "routers_loss": 0.004020806401968002, + "skip_count": 1.0, + "step": 4664, + "text_loss": 0.6935067176818848 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0006389753244428972, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 7525821.0, + "repeat_count": 1.0, + "routers_loss": 0.00957963801920414, + "skip_count": 2.0, + "step": 4666, + "text_loss": 0.3350338637828827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.915761667155856, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0006386779777048666, + "loss": 0.0063, + "macro_f1": 0.6601307392120361, + "num_tokens": 7529513.0, + "repeat_count": 1.0, + "routers_loss": 0.020673364400863647, + "skip_count": 2.0, + "step": 4668, + "text_loss": 0.47800472378730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006383805778251735, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7533450.0, + "repeat_count": 0.0, + "routers_loss": 0.007217096630483866, + "skip_count": 1.0, + "step": 4670, + "text_loss": 0.4506106972694397 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006380831249177817, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 7536287.0, + "repeat_count": 1.0, + "routers_loss": 0.007001714315265417, + "skip_count": 0.0, + "step": 4672, + "text_loss": 0.4081715941429138 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0006377856190966762, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 7539442.0, + "repeat_count": 0.0, + "routers_loss": 0.0015112817054614425, + "skip_count": 0.0, + "step": 4674, + "text_loss": 0.21451139450073242 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0006374880604758615, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 7542594.0, + "repeat_count": 0.0, + "routers_loss": 0.007311929017305374, + "skip_count": 2.0, + "step": 4676, + "text_loss": 0.14785248041152954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006371904491693626, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7545780.0, + "repeat_count": 0.0, + "routers_loss": 0.007489737123250961, + "skip_count": 1.0, + "step": 4678, + "text_loss": 0.2248108983039856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006368927852912247, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 7548287.0, + "repeat_count": 1.0, + "routers_loss": 0.009772555902600288, + "skip_count": 1.0, + "step": 4680, + "text_loss": 0.1566995233297348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006365950689555133, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7551424.0, + "repeat_count": 0.0, + "routers_loss": 0.002134992741048336, + "skip_count": 0.0, + "step": 4682, + "text_loss": 0.7322417497634888 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006362973002763139, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7554182.0, + "repeat_count": 1.0, + "routers_loss": 0.008511497639119625, + "skip_count": 4.0, + "step": 4684, + "text_loss": 0.24387991428375244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0006359994793677319, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 7557044.0, + "repeat_count": 0.0, + "routers_loss": 0.004151526838541031, + "skip_count": 2.0, + "step": 4686, + "text_loss": 0.6139411330223083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006357016063438928, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7560231.0, + "repeat_count": 0.0, + "routers_loss": 0.0009724601986818016, + "skip_count": 0.0, + "step": 4688, + "text_loss": 0.7875718474388123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0006354036813189421, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7562953.0, + "repeat_count": 0.0, + "routers_loss": 0.0008926765876822174, + "skip_count": 0.0, + "step": 4690, + "text_loss": 0.5195512771606445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006351057044070455, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 7566137.0, + "repeat_count": 0.0, + "routers_loss": 0.0031294538639485836, + "skip_count": 0.0, + "step": 4692, + "text_loss": 0.7288873195648193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0006348076757223877, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 7569073.0, + "repeat_count": 0.0, + "routers_loss": 0.0015065820189192891, + "skip_count": 2.0, + "step": 4694, + "text_loss": 0.7242236137390137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0006345095953791746, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7573025.0, + "repeat_count": 0.0, + "routers_loss": 0.0005603441968560219, + "skip_count": 0.0, + "step": 4696, + "text_loss": 0.34443899989128113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0006342114634916307, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7576546.0, + "repeat_count": 0.0, + "routers_loss": 0.0011047758162021637, + "skip_count": 0.0, + "step": 4698, + "text_loss": 0.4892682731151581 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0006339132801740008, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 7580711.0, + "repeat_count": 0.0, + "routers_loss": 0.0019803126342594624, + "skip_count": 2.0, + "step": 4700, + "text_loss": 0.4479489028453827 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0006336150455405494, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 7583385.0, + "repeat_count": 1.0, + "routers_loss": 0.0005326359532773495, + "skip_count": 0.0, + "step": 4702, + "text_loss": 0.627504825592041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006333167597055604, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 7586584.0, + "repeat_count": 0.0, + "routers_loss": 0.0005587987834587693, + "skip_count": 0.0, + "step": 4704, + "text_loss": 0.43891432881355286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0006330184227833376, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 7590408.0, + "repeat_count": 0.0, + "routers_loss": 0.007053783163428307, + "skip_count": 2.0, + "step": 4706, + "text_loss": 0.19946859776973724 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006327200348882043, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7593857.0, + "repeat_count": 1.0, + "routers_loss": 0.0009479080326855183, + "skip_count": 0.0, + "step": 4708, + "text_loss": 0.7973214387893677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006324215961345032, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7596429.0, + "repeat_count": 0.0, + "routers_loss": 0.0012403312139213085, + "skip_count": 0.0, + "step": 4710, + "text_loss": 0.48477989435195923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006321231066365966, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7599618.0, + "repeat_count": 0.0, + "routers_loss": 0.0005520360427908599, + "skip_count": 0.0, + "step": 4712, + "text_loss": 0.44222453236579895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006318245665088665, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 7603180.0, + "repeat_count": 0.0, + "routers_loss": 0.0015553623670712113, + "skip_count": 0.0, + "step": 4714, + "text_loss": 0.5132410526275635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0006315259758657138, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 7606457.0, + "repeat_count": 0.0, + "routers_loss": 0.004210884217172861, + "skip_count": 1.0, + "step": 4716, + "text_loss": 0.39850690960884094 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0006312273348215589, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 7609317.0, + "repeat_count": 1.0, + "routers_loss": 0.001220117206685245, + "skip_count": 0.0, + "step": 4718, + "text_loss": 0.3509018123149872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006309286434908419, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 7613076.0, + "repeat_count": 0.0, + "routers_loss": 0.007768960203975439, + "skip_count": 2.0, + "step": 4720, + "text_loss": 0.33361560106277466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006306299019880217, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7616242.0, + "repeat_count": 0.0, + "routers_loss": 0.006226699333637953, + "skip_count": 0.0, + "step": 4722, + "text_loss": 0.23661087453365326 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.17845611975345, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006303311104275766, + "loss": 0.0073, + "macro_f1": 0.6603773832321167, + "num_tokens": 7619069.0, + "repeat_count": 1.0, + "routers_loss": 0.015590761788189411, + "skip_count": 1.0, + "step": 4724, + "text_loss": 0.23373056948184967 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006300322689240041, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 7622581.0, + "repeat_count": 1.0, + "routers_loss": 0.006862971931695938, + "skip_count": 2.0, + "step": 4726, + "text_loss": 0.8301828503608704 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0006297333775918209, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 7625566.0, + "repeat_count": 1.0, + "routers_loss": 0.006256614346057177, + "skip_count": 1.0, + "step": 4728, + "text_loss": 0.3756707012653351 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0006294344365455626, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 7629047.0, + "repeat_count": 1.0, + "routers_loss": 0.009151885285973549, + "skip_count": 2.0, + "step": 4730, + "text_loss": 0.33362850546836853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006291354458997841, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7631847.0, + "repeat_count": 0.0, + "routers_loss": 0.0009307434665970504, + "skip_count": 0.0, + "step": 4732, + "text_loss": 0.4572524130344391 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0006288364057690591, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7635181.0, + "repeat_count": 0.0, + "routers_loss": 0.00041220212006010115, + "skip_count": 0.0, + "step": 4734, + "text_loss": 0.40211325883865356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0006285373162679804, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7637752.0, + "repeat_count": 0.0, + "routers_loss": 0.0006696670898236334, + "skip_count": 2.0, + "step": 4736, + "text_loss": 0.7588053345680237 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 22.24420311124156, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006282381775111597, + "loss": 0.0081, + "macro_f1": 0.9449735879898071, + "num_tokens": 7640719.0, + "repeat_count": 4.0, + "routers_loss": 0.016283133998513222, + "skip_count": 2.0, + "step": 4738, + "text_loss": 0.5697863101959229 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0006279389896132274, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7643524.0, + "repeat_count": 0.0, + "routers_loss": 0.00763951288536191, + "skip_count": 3.0, + "step": 4740, + "text_loss": 0.548592209815979 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.26298796595245, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006276397526888329, + "loss": 0.0094, + "macro_f1": 0.925203263759613, + "num_tokens": 7646919.0, + "repeat_count": 3.0, + "routers_loss": 0.038590483367443085, + "skip_count": 5.0, + "step": 4742, + "text_loss": 0.27226054668426514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0006273404668526443, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7650404.0, + "repeat_count": 0.0, + "routers_loss": 0.0012555639259517193, + "skip_count": 0.0, + "step": 4744, + "text_loss": 0.47892290353775024 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0006270411322193488, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7652942.0, + "repeat_count": 1.0, + "routers_loss": 0.0015356402145698667, + "skip_count": 0.0, + "step": 4746, + "text_loss": 0.5515767931938171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0006267417489036517, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7656269.0, + "repeat_count": 0.0, + "routers_loss": 0.005182140972465277, + "skip_count": 0.0, + "step": 4748, + "text_loss": 0.3496028184890747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0006264423170202773, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7658664.0, + "repeat_count": 0.0, + "routers_loss": 0.004144361708313227, + "skip_count": 0.0, + "step": 4750, + "text_loss": 0.2786032557487488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0006261428366839685, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7661471.0, + "repeat_count": 0.0, + "routers_loss": 0.00035335420398041606, + "skip_count": 0.0, + "step": 4752, + "text_loss": 0.4838487505912781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0006258433080094868, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7664593.0, + "repeat_count": 0.0, + "routers_loss": 0.0103341368958354, + "skip_count": 2.0, + "step": 4754, + "text_loss": 0.24325360357761383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0006255437311116119, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 7667573.0, + "repeat_count": 0.0, + "routers_loss": 0.014633853919804096, + "skip_count": 2.0, + "step": 4756, + "text_loss": 0.21569855511188507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0006252441061051426, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7671171.0, + "repeat_count": 0.0, + "routers_loss": 0.004900569561868906, + "skip_count": 0.0, + "step": 4758, + "text_loss": 0.12832018733024597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006249444331048955, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 7673932.0, + "repeat_count": 0.0, + "routers_loss": 0.0020371589343994856, + "skip_count": 0.0, + "step": 4760, + "text_loss": 0.38652482628822327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.000624644712225706, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7677396.0, + "repeat_count": 0.0, + "routers_loss": 0.0028059002943336964, + "skip_count": 2.0, + "step": 4762, + "text_loss": 0.7937633395195007 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0006243449435824276, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7680392.0, + "repeat_count": 0.0, + "routers_loss": 0.0007225095760077238, + "skip_count": 0.0, + "step": 4764, + "text_loss": 0.5690395832061768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006240451272899321, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7684121.0, + "repeat_count": 0.0, + "routers_loss": 0.002052050782367587, + "skip_count": 1.0, + "step": 4766, + "text_loss": 0.5321336984634399 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006237452634631099, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 7687236.0, + "repeat_count": 1.0, + "routers_loss": 0.0039039517287164927, + "skip_count": 0.0, + "step": 4768, + "text_loss": 0.30823320150375366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 22.394481948928675, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0006234453522168694, + "loss": 0.0084, + "macro_f1": 0.5492662787437439, + "num_tokens": 7690355.0, + "repeat_count": 0.0, + "routers_loss": 0.014570238068699837, + "skip_count": 2.0, + "step": 4770, + "text_loss": 0.21501587331295013 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 22.403874376284122, + "f1_execute": 0.949999988079071, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.04541015625, + "learning_rate": 0.000623145393666137, + "loss": 0.0069, + "macro_f1": 0.886363685131073, + "num_tokens": 7693559.0, + "repeat_count": 3.0, + "routers_loss": 0.061707716435194016, + "skip_count": 6.0, + "step": 4772, + "text_loss": 0.24371100962162018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006228453879258576, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 7696422.0, + "repeat_count": 0.0, + "routers_loss": 0.005053870379924774, + "skip_count": 2.0, + "step": 4774, + "text_loss": 0.237778440117836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0006225453351109934, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 7700460.0, + "repeat_count": 0.0, + "routers_loss": 0.0017990898340940475, + "skip_count": 0.0, + "step": 4776, + "text_loss": 0.612456738948822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.000622245235336526, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7703330.0, + "repeat_count": 0.0, + "routers_loss": 0.004507021512836218, + "skip_count": 2.0, + "step": 4778, + "text_loss": 0.36898812651634216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006219450887174537, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7707243.0, + "repeat_count": 0.0, + "routers_loss": 0.006295828148722649, + "skip_count": 1.0, + "step": 4780, + "text_loss": 0.14474599063396454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006216448953687932, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7711121.0, + "repeat_count": 0.0, + "routers_loss": 0.005049831233918667, + "skip_count": 0.0, + "step": 4782, + "text_loss": 0.4696790277957916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006213446554055795, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7714889.0, + "repeat_count": 0.0, + "routers_loss": 0.0006010758224874735, + "skip_count": 0.0, + "step": 4784, + "text_loss": 0.46253830194473267 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 22.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006210443689428649, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 7718420.0, + "repeat_count": 3.0, + "routers_loss": 0.006691234186291695, + "skip_count": 1.0, + "step": 4786, + "text_loss": 0.579987645149231 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00062074403609572, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7721720.0, + "repeat_count": 0.0, + "routers_loss": 0.001864895923063159, + "skip_count": 0.0, + "step": 4788, + "text_loss": 0.325242817401886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0006204436569792324, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 7724916.0, + "repeat_count": 0.0, + "routers_loss": 0.00202955212444067, + "skip_count": 0.0, + "step": 4790, + "text_loss": 0.49637556076049805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006201432317085083, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 7728081.0, + "repeat_count": 1.0, + "routers_loss": 0.0037843603640794754, + "skip_count": 0.0, + "step": 4792, + "text_loss": 0.38812628388404846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0006198427603986711, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7731457.0, + "repeat_count": 0.0, + "routers_loss": 0.012036679312586784, + "skip_count": 3.0, + "step": 4794, + "text_loss": 0.2996312379837036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0006195422431648623, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 7734595.0, + "repeat_count": 0.0, + "routers_loss": 0.0008874868508428335, + "skip_count": 1.0, + "step": 4796, + "text_loss": 0.3203189969062805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0006192416801222403, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 7737565.0, + "repeat_count": 1.0, + "routers_loss": 0.0032894534524530172, + "skip_count": 1.0, + "step": 4798, + "text_loss": 0.3283322751522064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0006189410713859815, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 7740439.0, + "repeat_count": 0.0, + "routers_loss": 0.009667043574154377, + "skip_count": 2.0, + "step": 4800, + "text_loss": 0.25219282507896423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 22.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006186404170712797, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 7743813.0, + "repeat_count": 0.0, + "routers_loss": 0.012643060646951199, + "skip_count": 4.0, + "step": 4802, + "text_loss": 0.22567439079284668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006183397172933462, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7747182.0, + "repeat_count": 0.0, + "routers_loss": 0.002678517485037446, + "skip_count": 0.0, + "step": 4804, + "text_loss": 0.19188879430294037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0006180389721674101, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 7750735.0, + "repeat_count": 0.0, + "routers_loss": 0.0013385121710598469, + "skip_count": 0.0, + "step": 4806, + "text_loss": 0.5860441327095032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000617738181808717, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7753843.0, + "repeat_count": 0.0, + "routers_loss": 0.0034869094379246235, + "skip_count": 1.0, + "step": 4808, + "text_loss": 0.4366260766983032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0006174373463325306, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7757039.0, + "repeat_count": 0.0, + "routers_loss": 0.0013648992171511054, + "skip_count": 0.0, + "step": 4810, + "text_loss": 0.5217258334159851 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0006171364658541314, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 7760016.0, + "repeat_count": 1.0, + "routers_loss": 0.0038017008919268847, + "skip_count": 2.0, + "step": 4812, + "text_loss": 0.8130963444709778 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0006168355404888177, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 7762961.0, + "repeat_count": 0.0, + "routers_loss": 0.006867518648505211, + "skip_count": 2.0, + "step": 4814, + "text_loss": 0.17822521924972534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006165345703519043, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7766399.0, + "repeat_count": 0.0, + "routers_loss": 0.0004653502255678177, + "skip_count": 0.0, + "step": 4816, + "text_loss": 0.5316070914268494 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006162335555587238, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 7769039.0, + "repeat_count": 1.0, + "routers_loss": 0.0016906452365219593, + "skip_count": 1.0, + "step": 4818, + "text_loss": 0.5680997967720032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0006159324962246257, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7772768.0, + "repeat_count": 0.0, + "routers_loss": 0.002541248919442296, + "skip_count": 0.0, + "step": 4820, + "text_loss": 0.6169226169586182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006156313924649762, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7775545.0, + "repeat_count": 0.0, + "routers_loss": 0.008644679561257362, + "skip_count": 2.0, + "step": 4822, + "text_loss": 0.2211475968360901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0006153302443951589, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7778837.0, + "repeat_count": 0.0, + "routers_loss": 0.0041346061043441296, + "skip_count": 2.0, + "step": 4824, + "text_loss": 0.5369775891304016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0006150290521305746, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 7782309.0, + "repeat_count": 0.0, + "routers_loss": 0.0012756052892655134, + "skip_count": 0.0, + "step": 4826, + "text_loss": 0.5294989943504333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.666862342236573, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006147278157866403, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 7785565.0, + "repeat_count": 0.0, + "routers_loss": 0.029718991369009018, + "skip_count": 1.0, + "step": 4828, + "text_loss": 0.6920449733734131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006144265354787906, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7788218.0, + "repeat_count": 0.0, + "routers_loss": 0.004829924553632736, + "skip_count": 0.0, + "step": 4830, + "text_loss": 0.17072243988513947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0006141252113224767, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7790788.0, + "repeat_count": 0.0, + "routers_loss": 0.00254037044942379, + "skip_count": 0.0, + "step": 4832, + "text_loss": 0.20075996220111847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01519775390625, + "learning_rate": 0.0006138238434331666, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7793913.0, + "repeat_count": 0.0, + "routers_loss": 0.0004426188243087381, + "skip_count": 0.0, + "step": 4834, + "text_loss": 0.695742130279541 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.000613522431926345, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 7796932.0, + "repeat_count": 1.0, + "routers_loss": 0.005176798906177282, + "skip_count": 3.0, + "step": 4836, + "text_loss": 0.4910822808742523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0006132209769175132, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7800686.0, + "repeat_count": 0.0, + "routers_loss": 0.004120545461773872, + "skip_count": 0.0, + "step": 4838, + "text_loss": 0.3701378405094147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0006129194785221894, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7804765.0, + "repeat_count": 0.0, + "routers_loss": 0.0043835826218128204, + "skip_count": 0.0, + "step": 4840, + "text_loss": 0.343635618686676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006126179368559086, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 7807498.0, + "repeat_count": 0.0, + "routers_loss": 0.001394893741235137, + "skip_count": 1.0, + "step": 4842, + "text_loss": 0.47756674885749817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.000612316352034222, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7810784.0, + "repeat_count": 0.0, + "routers_loss": 0.0031262130942195654, + "skip_count": 2.0, + "step": 4844, + "text_loss": 0.13077901303768158 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.751394188435572, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0006120147241726972, + "loss": 0.0081, + "macro_f1": 0.8823530077934265, + "num_tokens": 7814754.0, + "repeat_count": 2.0, + "routers_loss": 0.016139274463057518, + "skip_count": 1.0, + "step": 4846, + "text_loss": 0.18850074708461761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0006117130533869189, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7818245.0, + "repeat_count": 0.0, + "routers_loss": 0.0009124451316893101, + "skip_count": 0.0, + "step": 4848, + "text_loss": 0.42503559589385986 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006114113397924878, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7822214.0, + "repeat_count": 0.0, + "routers_loss": 0.0015132242115214467, + "skip_count": 0.0, + "step": 4850, + "text_loss": 0.16767354309558868 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006111095835050212, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 7825019.0, + "repeat_count": 2.0, + "routers_loss": 0.006253300234675407, + "skip_count": 2.0, + "step": 4852, + "text_loss": 0.44826745986938477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0006108077846401524, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 7828113.0, + "repeat_count": 0.0, + "routers_loss": 0.0024391328915953636, + "skip_count": 0.0, + "step": 4854, + "text_loss": 0.2009880244731903 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006105059433135317, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 7831177.0, + "repeat_count": 1.0, + "routers_loss": 0.0020866121631115675, + "skip_count": 1.0, + "step": 4856, + "text_loss": 0.7082528471946716 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0006102040596408251, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 7834485.0, + "repeat_count": 0.0, + "routers_loss": 0.004373365081846714, + "skip_count": 1.0, + "step": 4858, + "text_loss": 0.2541539669036865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006099021337377148, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7837749.0, + "repeat_count": 0.0, + "routers_loss": 0.004309024661779404, + "skip_count": 0.0, + "step": 4860, + "text_loss": 0.3163885176181793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 22.82653360727913, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.049072265625, + "learning_rate": 0.0006096001657198995, + "loss": 0.0065, + "macro_f1": 0.6122449040412903, + "num_tokens": 7840979.0, + "repeat_count": 0.0, + "routers_loss": 0.023044804111123085, + "skip_count": 4.0, + "step": 4862, + "text_loss": 0.49609798192977905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0006092981557030941, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 7844905.0, + "repeat_count": 1.0, + "routers_loss": 0.010683654807507992, + "skip_count": 3.0, + "step": 4864, + "text_loss": 0.16866883635520935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006089961038030291, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 7847800.0, + "repeat_count": 0.0, + "routers_loss": 0.0011224723421037197, + "skip_count": 0.0, + "step": 4866, + "text_loss": 0.5093055367469788 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0006086940101354515, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7850983.0, + "repeat_count": 0.0, + "routers_loss": 0.003944621421396732, + "skip_count": 1.0, + "step": 4868, + "text_loss": 0.5753747224807739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 22.86410331670091, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0006083918748161244, + "loss": 0.0069, + "macro_f1": 0.5492662787437439, + "num_tokens": 7855041.0, + "repeat_count": 0.0, + "routers_loss": 0.02532145567238331, + "skip_count": 2.0, + "step": 4870, + "text_loss": 0.8082366585731506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006080896979608262, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7858058.0, + "repeat_count": 0.0, + "routers_loss": 0.0007558314246125519, + "skip_count": 0.0, + "step": 4872, + "text_loss": 0.6476574540138245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.000607787479685352, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7861223.0, + "repeat_count": 0.0, + "routers_loss": 0.0009224560926668346, + "skip_count": 0.0, + "step": 4874, + "text_loss": 0.5012133717536926 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006074852201055121, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7864180.0, + "repeat_count": 0.0, + "routers_loss": 0.0028308273758739233, + "skip_count": 0.0, + "step": 4876, + "text_loss": 0.7447214722633362 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0006071829193371331, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7866726.0, + "repeat_count": 0.0, + "routers_loss": 0.0021505290642380714, + "skip_count": 0.0, + "step": 4878, + "text_loss": 0.5444929599761963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006068805774960573, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7870166.0, + "repeat_count": 0.0, + "routers_loss": 0.0021109723020344973, + "skip_count": 0.0, + "step": 4880, + "text_loss": 0.3577263355255127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0006065781946981425, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7873028.0, + "repeat_count": 0.0, + "routers_loss": 0.0027144821360707283, + "skip_count": 0.0, + "step": 4882, + "text_loss": 0.28464797139167786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006062757710592624, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7876747.0, + "repeat_count": 0.0, + "routers_loss": 0.0004638207610696554, + "skip_count": 0.0, + "step": 4884, + "text_loss": 0.381534606218338 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006059733066953066, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 7879524.0, + "repeat_count": 1.0, + "routers_loss": 0.002225410658866167, + "skip_count": 2.0, + "step": 4886, + "text_loss": 0.5167883634567261 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006056708017221796, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 7882809.0, + "repeat_count": 0.0, + "routers_loss": 0.00419368501752615, + "skip_count": 1.0, + "step": 4888, + "text_loss": 0.22688335180282593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000605368256255802, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7886310.0, + "repeat_count": 0.0, + "routers_loss": 0.0017340193735435605, + "skip_count": 1.0, + "step": 4890, + "text_loss": 1.0128135681152344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0006050656704121098, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 7889483.0, + "repeat_count": 0.0, + "routers_loss": 0.0016647159354761243, + "skip_count": 0.0, + "step": 4892, + "text_loss": 0.2213262915611267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006047630443070547, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7892615.0, + "repeat_count": 0.0, + "routers_loss": 0.0038971947506070137, + "skip_count": 3.0, + "step": 4894, + "text_loss": 0.45751357078552246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0006044603780566032, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 7895747.0, + "repeat_count": 1.0, + "routers_loss": 0.0036852145567536354, + "skip_count": 1.0, + "step": 4896, + "text_loss": 0.13489919900894165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0006041576717767379, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7899155.0, + "repeat_count": 0.0, + "routers_loss": 0.007661987561732531, + "skip_count": 1.0, + "step": 4898, + "text_loss": 0.281853586435318 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006038549255834563, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7901667.0, + "repeat_count": 2.0, + "routers_loss": 0.01836695335805416, + "skip_count": 5.0, + "step": 4900, + "text_loss": 0.24879895150661469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.000603552139592771, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7904506.0, + "repeat_count": 0.0, + "routers_loss": 0.0011829182039946318, + "skip_count": 0.0, + "step": 4902, + "text_loss": 0.7550268769264221 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 23.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006032493139207106, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7907316.0, + "repeat_count": 1.0, + "routers_loss": 0.0022891140542924404, + "skip_count": 0.0, + "step": 4904, + "text_loss": 0.37596020102500916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0006029464486833186, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 7911283.0, + "repeat_count": 0.0, + "routers_loss": 0.001990227960050106, + "skip_count": 0.0, + "step": 4906, + "text_loss": 0.5879577994346619 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0006026435439966531, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 7913907.0, + "repeat_count": 0.0, + "routers_loss": 0.0026039890944957733, + "skip_count": 1.0, + "step": 4908, + "text_loss": 0.41484713554382324 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0006023405999767879, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7916772.0, + "repeat_count": 0.0, + "routers_loss": 0.009183229878544807, + "skip_count": 1.0, + "step": 4910, + "text_loss": 0.20732562243938446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0006020376167398116, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7919346.0, + "repeat_count": 0.0, + "routers_loss": 0.005508727394044399, + "skip_count": 1.0, + "step": 4912, + "text_loss": 0.41416165232658386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 23.070443205165834, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0006017345944018284, + "loss": 0.0051, + "macro_f1": 0.3272727429866791, + "num_tokens": 7922404.0, + "repeat_count": 0.0, + "routers_loss": 0.008651934564113617, + "skip_count": 0.0, + "step": 4914, + "text_loss": 0.4290519952774048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0006014315330789563, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 7925165.0, + "repeat_count": 0.0, + "routers_loss": 0.003601635340601206, + "skip_count": 1.0, + "step": 4916, + "text_loss": 0.8447931408882141 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006011284328873296, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 7928146.0, + "repeat_count": 1.0, + "routers_loss": 0.0049415635876357555, + "skip_count": 2.0, + "step": 4918, + "text_loss": 0.32237401604652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0006008252939430967, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7931163.0, + "repeat_count": 0.0, + "routers_loss": 0.0024150956887751818, + "skip_count": 0.0, + "step": 4920, + "text_loss": 0.2251713126897812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.108012914587615, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006005221163624209, + "loss": 0.0057, + "macro_f1": 0.3272727429866791, + "num_tokens": 7934084.0, + "repeat_count": 1.0, + "routers_loss": 0.03181030973792076, + "skip_count": 0.0, + "step": 4922, + "text_loss": 0.4962928593158722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.0006002189002614806, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 7937021.0, + "repeat_count": 0.0, + "routers_loss": 0.00227518193423748, + "skip_count": 2.0, + "step": 4924, + "text_loss": 0.34440335631370544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005999156457564685, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 7940205.0, + "repeat_count": 0.0, + "routers_loss": 0.004331593867391348, + "skip_count": 1.0, + "step": 4926, + "text_loss": 0.14114083349704742 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005996123529635925, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7945174.0, + "repeat_count": 0.0, + "routers_loss": 0.000612895586527884, + "skip_count": 0.0, + "step": 4928, + "text_loss": 0.3895469009876251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.145582624009393, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000599309021999075, + "loss": 0.006, + "macro_f1": 0.3272727429866791, + "num_tokens": 7948716.0, + "repeat_count": 0.0, + "routers_loss": 0.02319233864545822, + "skip_count": 1.0, + "step": 4930, + "text_loss": 0.38103172183036804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005990056529791528, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7952497.0, + "repeat_count": 0.0, + "routers_loss": 0.003423231653869152, + "skip_count": 0.0, + "step": 4932, + "text_loss": 0.30447322130203247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017822265625, + "learning_rate": 0.0005987022460200778, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7955578.0, + "repeat_count": 0.0, + "routers_loss": 0.0007005351362749934, + "skip_count": 0.0, + "step": 4934, + "text_loss": 0.49621838331222534 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.173759906075727, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0005983988012381159, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 7958741.0, + "repeat_count": 2.0, + "routers_loss": 0.03962617367506027, + "skip_count": 1.0, + "step": 4936, + "text_loss": 0.1920493096113205 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.0005980953187495476, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 7962236.0, + "repeat_count": 0.0, + "routers_loss": 0.0026006060652434826, + "skip_count": 3.0, + "step": 4938, + "text_loss": 0.5286803841590881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0005977917986706681, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7965631.0, + "repeat_count": 0.0, + "routers_loss": 0.005010952707380056, + "skip_count": 0.0, + "step": 4940, + "text_loss": 0.3507745563983917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005974882411177871, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7968516.0, + "repeat_count": 0.0, + "routers_loss": 0.0023964287247508764, + "skip_count": 0.0, + "step": 4942, + "text_loss": 0.9110504388809204 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000597184646207228, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7971310.0, + "repeat_count": 0.0, + "routers_loss": 0.0026230409275740385, + "skip_count": 1.0, + "step": 4944, + "text_loss": 0.4131232798099518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005968810140553292, + "loss": 0.0102, + "macro_f1": 0.3333333432674408, + "num_tokens": 7974809.0, + "repeat_count": 0.0, + "routers_loss": 0.0007397596491500735, + "skip_count": 0.0, + "step": 4946, + "text_loss": 0.5130466222763062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0005965773447784431, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7977800.0, + "repeat_count": 0.0, + "routers_loss": 0.0009955473942682147, + "skip_count": 0.0, + "step": 4948, + "text_loss": 0.5366153717041016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01373291015625, + "learning_rate": 0.0005962736384929362, + "loss": 0.0026, + "macro_f1": 0.3333333432674408, + "num_tokens": 7981027.0, + "repeat_count": 0.0, + "routers_loss": 0.0049227322451770306, + "skip_count": 0.0, + "step": 4950, + "text_loss": 0.17266370356082916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0005959698953151895, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7983580.0, + "repeat_count": 0.0, + "routers_loss": 0.0009975163266062737, + "skip_count": 0.0, + "step": 4952, + "text_loss": 0.2474549114704132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0005956661153615979, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7986711.0, + "repeat_count": 0.0, + "routers_loss": 0.0006475782720372081, + "skip_count": 0.0, + "step": 4954, + "text_loss": 0.5748327970504761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0005953622987485703, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7990194.0, + "repeat_count": 0.0, + "routers_loss": 0.001449751085601747, + "skip_count": 0.0, + "step": 4956, + "text_loss": 0.5163559317588806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0005950584455925301, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7993050.0, + "repeat_count": 0.0, + "routers_loss": 0.0017087773885577917, + "skip_count": 0.0, + "step": 4958, + "text_loss": 0.15892620384693146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0005947545560099142, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 7996383.0, + "repeat_count": 0.0, + "routers_loss": 0.0044417232275009155, + "skip_count": 0.0, + "step": 4960, + "text_loss": 0.48022928833961487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 23.295861461696507, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031982421875, + "learning_rate": 0.0005944506301171734, + "loss": 0.0066, + "macro_f1": 0.5492662787437439, + "num_tokens": 7999843.0, + "repeat_count": 0.0, + "routers_loss": 0.010093312710523605, + "skip_count": 2.0, + "step": 4962, + "text_loss": 0.5050316452980042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005941466680307732, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8003504.0, + "repeat_count": 0.0, + "routers_loss": 0.009699694812297821, + "skip_count": 0.0, + "step": 4964, + "text_loss": 0.30474427342414856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 23.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0005938426698671922, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 8007427.0, + "repeat_count": 1.0, + "routers_loss": 0.0016759657301008701, + "skip_count": 0.0, + "step": 4966, + "text_loss": 0.25060293078422546 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005935386357429232, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8010265.0, + "repeat_count": 2.0, + "routers_loss": 0.006916914135217667, + "skip_count": 3.0, + "step": 4968, + "text_loss": 0.49084481596946716 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0005932345657744723, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 8013733.0, + "repeat_count": 1.0, + "routers_loss": 0.017182426527142525, + "skip_count": 5.0, + "step": 4970, + "text_loss": 0.2705717980861664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00059293046007836, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8017068.0, + "repeat_count": 0.0, + "routers_loss": 0.008485594764351845, + "skip_count": 2.0, + "step": 4972, + "text_loss": 0.18570218980312347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0005926263187711201, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8020185.0, + "repeat_count": 0.0, + "routers_loss": 0.0021750847809016705, + "skip_count": 2.0, + "step": 4974, + "text_loss": 0.4457069933414459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0005923221419693001, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 8023038.0, + "repeat_count": 0.0, + "routers_loss": 0.0020193420350551605, + "skip_count": 0.0, + "step": 4976, + "text_loss": 0.7394505143165588 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.0005920179297894613, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8026236.0, + "repeat_count": 0.0, + "routers_loss": 0.001450369250960648, + "skip_count": 1.0, + "step": 4978, + "text_loss": 0.5914503335952759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.000591713682348178, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8028765.0, + "repeat_count": 0.0, + "routers_loss": 0.0017808573320508003, + "skip_count": 0.0, + "step": 4980, + "text_loss": 0.19231407344341278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005914093997620388, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8032043.0, + "repeat_count": 0.0, + "routers_loss": 0.0018225493840873241, + "skip_count": 0.0, + "step": 4982, + "text_loss": 0.3567875325679779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005911050821476449, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8035086.0, + "repeat_count": 0.0, + "routers_loss": 0.0016285666497424245, + "skip_count": 0.0, + "step": 4984, + "text_loss": 0.34609633684158325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0005908007296216119, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8038193.0, + "repeat_count": 0.0, + "routers_loss": 0.0014699801104143262, + "skip_count": 0.0, + "step": 4986, + "text_loss": 0.4492359757423401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.000590496342300568, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8041099.0, + "repeat_count": 0.0, + "routers_loss": 0.002442725468426943, + "skip_count": 0.0, + "step": 4988, + "text_loss": 0.5162975788116455 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005901919203011548, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8044350.0, + "repeat_count": 0.0, + "routers_loss": 0.008624207228422165, + "skip_count": 2.0, + "step": 4990, + "text_loss": 0.2533033490180969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005898874637400279, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8047467.0, + "repeat_count": 0.0, + "routers_loss": 0.0015421364223584533, + "skip_count": 0.0, + "step": 4992, + "text_loss": 0.4890289306640625 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0005895829727338552, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 8050626.0, + "repeat_count": 1.0, + "routers_loss": 0.0024516626726835966, + "skip_count": 2.0, + "step": 4994, + "text_loss": 0.50797039270401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005892784473993184, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 8053386.0, + "repeat_count": 0.0, + "routers_loss": 0.0018553845584392548, + "skip_count": 2.0, + "step": 4996, + "text_loss": 0.628828763961792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.000588973887853112, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8055941.0, + "repeat_count": 0.0, + "routers_loss": 0.004258487373590469, + "skip_count": 0.0, + "step": 4998, + "text_loss": 0.2643229067325592 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.474317581449956, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005886692942119441, + "loss": 0.0062, + "macro_f1": 0.8820862174034119, + "num_tokens": 8058638.0, + "repeat_count": 2.0, + "routers_loss": 0.019064312800765038, + "skip_count": 2.0, + "step": 5000, + "text_loss": 0.4925006031990051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005883646665925353, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 8062097.0, + "repeat_count": 0.0, + "routers_loss": 0.0007969749276526272, + "skip_count": 0.0, + "step": 5002, + "text_loss": 0.49412909150123596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005880600051116196, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8065202.0, + "repeat_count": 0.0, + "routers_loss": 0.005813780706375837, + "skip_count": 2.0, + "step": 5004, + "text_loss": 0.5681346654891968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0005877553098859439, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8068574.0, + "repeat_count": 0.0, + "routers_loss": 0.005012941546738148, + "skip_count": 0.0, + "step": 5006, + "text_loss": 0.2682424485683441 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005874505810322678, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 8071834.0, + "repeat_count": 0.0, + "routers_loss": 0.005859757773578167, + "skip_count": 3.0, + "step": 5008, + "text_loss": 0.6460036039352417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.000587145818667364, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 8074687.0, + "repeat_count": 0.0, + "routers_loss": 0.002868571551516652, + "skip_count": 2.0, + "step": 5010, + "text_loss": 0.2405751347541809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005868410229080181, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8077617.0, + "repeat_count": 0.0, + "routers_loss": 0.0021759893279522657, + "skip_count": 1.0, + "step": 5012, + "text_loss": 0.7455595135688782 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005865361938710286, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8080734.0, + "repeat_count": 0.0, + "routers_loss": 0.0008311949786730111, + "skip_count": 0.0, + "step": 5014, + "text_loss": 0.44876906275749207 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 23.549457000293515, + "f1_execute": 0.9756097793579102, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.0390625, + "learning_rate": 0.0005862313316732063, + "loss": 0.0054, + "macro_f1": 0.9615669250488281, + "num_tokens": 8085092.0, + "repeat_count": 2.0, + "routers_loss": 0.012511664070189, + "skip_count": 6.0, + "step": 5016, + "text_loss": 0.26010942459106445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.000585926436431375, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 8088333.0, + "repeat_count": 0.0, + "routers_loss": 0.0035441694781184196, + "skip_count": 0.0, + "step": 5018, + "text_loss": 0.28225192427635193 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 23.568241855004402, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005856215082623711, + "loss": 0.0093, + "macro_f1": 0.8823530077934265, + "num_tokens": 8091298.0, + "repeat_count": 1.0, + "routers_loss": 0.023543989285826683, + "skip_count": 2.0, + "step": 5020, + "text_loss": 0.5757577419281006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0005853165472830439, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8094361.0, + "repeat_count": 0.0, + "routers_loss": 0.003124240320175886, + "skip_count": 0.0, + "step": 5022, + "text_loss": 0.4021305739879608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0005850115536102546, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8097514.0, + "repeat_count": 0.0, + "routers_loss": 0.008170558139681816, + "skip_count": 1.0, + "step": 5024, + "text_loss": 0.18926584720611572 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0005847065273608777, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 8100525.0, + "repeat_count": 1.0, + "routers_loss": 0.02127663604915142, + "skip_count": 5.0, + "step": 5026, + "text_loss": 0.18827557563781738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0005844014686517998, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8104016.0, + "repeat_count": 0.0, + "routers_loss": 0.00272122910246253, + "skip_count": 0.0, + "step": 5028, + "text_loss": 0.15534701943397522 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 23.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005840963775999199, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8106697.0, + "repeat_count": 5.0, + "routers_loss": 0.008979840204119682, + "skip_count": 4.0, + "step": 5030, + "text_loss": 0.8123718500137329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005837912543221493, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 8110986.0, + "repeat_count": 0.0, + "routers_loss": 0.005006929859519005, + "skip_count": 0.0, + "step": 5032, + "text_loss": 0.26128846406936646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0005834860989354121, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 8114010.0, + "repeat_count": 0.0, + "routers_loss": 0.0005531277856789529, + "skip_count": 0.0, + "step": 5034, + "text_loss": 0.5100266933441162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.64338127384796, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0005831809115566442, + "loss": 0.0073, + "macro_f1": 0.6538461446762085, + "num_tokens": 8117168.0, + "repeat_count": 2.0, + "routers_loss": 0.04978533461689949, + "skip_count": 1.0, + "step": 5036, + "text_loss": 0.41049885749816895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0005828756923027941, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8119900.0, + "repeat_count": 0.0, + "routers_loss": 0.0006322385743260384, + "skip_count": 0.0, + "step": 5038, + "text_loss": 0.5584380626678467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0005825704412908225, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8123928.0, + "repeat_count": 0.0, + "routers_loss": 0.001000594231300056, + "skip_count": 0.0, + "step": 5040, + "text_loss": 0.6460791230201721 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005822651586377019, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 8127926.0, + "repeat_count": 0.0, + "routers_loss": 0.011595834977924824, + "skip_count": 2.0, + "step": 5042, + "text_loss": 0.3131820261478424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0005819598444604173, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 8131092.0, + "repeat_count": 0.0, + "routers_loss": 0.004449303261935711, + "skip_count": 3.0, + "step": 5044, + "text_loss": 0.2774372696876526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0005816544988759658, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8134051.0, + "repeat_count": 0.0, + "routers_loss": 0.0007877505850046873, + "skip_count": 0.0, + "step": 5046, + "text_loss": 0.39496293663978577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0005813491220013563, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 8138725.0, + "repeat_count": 0.0, + "routers_loss": 0.002868623472750187, + "skip_count": 0.0, + "step": 5048, + "text_loss": 0.3779948651790619 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0005810437139536098, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 8141913.0, + "repeat_count": 2.0, + "routers_loss": 0.006244937423616648, + "skip_count": 4.0, + "step": 5050, + "text_loss": 0.4512978494167328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0005807382748497592, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 8146193.0, + "repeat_count": 0.0, + "routers_loss": 0.0011013929033651948, + "skip_count": 0.0, + "step": 5052, + "text_loss": 0.6194499731063843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0005804328048068493, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8149701.0, + "repeat_count": 0.0, + "routers_loss": 0.005505079869180918, + "skip_count": 1.0, + "step": 5054, + "text_loss": 0.2932305335998535 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005801273039419368, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 8152861.0, + "repeat_count": 1.0, + "routers_loss": 0.0057641929015517235, + "skip_count": 1.0, + "step": 5056, + "text_loss": 0.2631317973136902 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005798217723720904, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 8155843.0, + "repeat_count": 1.0, + "routers_loss": 0.0021671492140740156, + "skip_count": 5.0, + "step": 5058, + "text_loss": 0.2889988422393799 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0005795162102143902, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8158812.0, + "repeat_count": 0.0, + "routers_loss": 0.004476628266274929, + "skip_count": 1.0, + "step": 5060, + "text_loss": 0.48028868436813354 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005792106175859283, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 8162719.0, + "repeat_count": 1.0, + "routers_loss": 0.0038497636560350657, + "skip_count": 3.0, + "step": 5062, + "text_loss": 0.4559471607208252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0005789049946038083, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8165692.0, + "repeat_count": 0.0, + "routers_loss": 0.004451582673937082, + "skip_count": 0.0, + "step": 5064, + "text_loss": 0.3782602548599243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005785993413851456, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8168900.0, + "repeat_count": 0.0, + "routers_loss": 0.002951978938654065, + "skip_count": 0.0, + "step": 5066, + "text_loss": 0.32392629981040955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.000578293658047067, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8171661.0, + "repeat_count": 0.0, + "routers_loss": 0.011171254329383373, + "skip_count": 2.0, + "step": 5068, + "text_loss": 0.24492619931697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0005779879447067109, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 8175075.0, + "repeat_count": 0.0, + "routers_loss": 0.0016067599644884467, + "skip_count": 0.0, + "step": 5070, + "text_loss": 0.7738823294639587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.000577682201481227, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8178515.0, + "repeat_count": 0.0, + "routers_loss": 0.009113503620028496, + "skip_count": 1.0, + "step": 5072, + "text_loss": 0.2082248032093048 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0005773764284877774, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8181790.0, + "repeat_count": 1.0, + "routers_loss": 0.007332196459174156, + "skip_count": 1.0, + "step": 5074, + "text_loss": 0.4557662904262543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0005770706258435342, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8184854.0, + "repeat_count": 0.0, + "routers_loss": 0.0016252279747277498, + "skip_count": 0.0, + "step": 5076, + "text_loss": 0.2888098657131195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0005767647936656818, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8187860.0, + "repeat_count": 0.0, + "routers_loss": 0.003406575648114085, + "skip_count": 0.0, + "step": 5078, + "text_loss": 0.6533790230751038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005764589320714158, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 8191683.0, + "repeat_count": 0.0, + "routers_loss": 0.0006520140450447798, + "skip_count": 0.0, + "step": 5080, + "text_loss": 0.6903796195983887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0005761530411779426, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8195109.0, + "repeat_count": 0.0, + "routers_loss": 0.01188349537551403, + "skip_count": 1.0, + "step": 5082, + "text_loss": 0.20460398495197296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0005758471211024804, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 8198340.0, + "repeat_count": 0.0, + "routers_loss": 0.004826809279620647, + "skip_count": 3.0, + "step": 5084, + "text_loss": 0.2203969657421112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.0005755411719622584, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8200882.0, + "repeat_count": 0.0, + "routers_loss": 0.0019170823507010937, + "skip_count": 0.0, + "step": 5086, + "text_loss": 0.6744595170021057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005752351938745167, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 8203777.0, + "repeat_count": 0.0, + "routers_loss": 0.002110893838107586, + "skip_count": 1.0, + "step": 5088, + "text_loss": 0.4137859046459198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.000574929186956507, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 8207627.0, + "repeat_count": 0.0, + "routers_loss": 0.0018580821342766285, + "skip_count": 1.0, + "step": 5090, + "text_loss": 0.4830456078052521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.906369239800412, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0005746231513254912, + "loss": 0.0066, + "macro_f1": 0.3272727429866791, + "num_tokens": 8210263.0, + "repeat_count": 1.0, + "routers_loss": 0.0194723978638649, + "skip_count": 0.0, + "step": 5092, + "text_loss": 0.17383277416229248 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005743170870987433, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 8214166.0, + "repeat_count": 0.0, + "routers_loss": 0.006944256369024515, + "skip_count": 2.0, + "step": 5094, + "text_loss": 0.20003484189510345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0005740109943935472, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8217545.0, + "repeat_count": 0.0, + "routers_loss": 0.002044794149696827, + "skip_count": 1.0, + "step": 5096, + "text_loss": 0.5117167830467224 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0005737048733271986, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 8220673.0, + "repeat_count": 1.0, + "routers_loss": 0.009966124780476093, + "skip_count": 2.0, + "step": 5098, + "text_loss": 0.2705996036529541 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005733987240170035, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8223796.0, + "repeat_count": 0.0, + "routers_loss": 0.0009675708715803921, + "skip_count": 0.0, + "step": 5100, + "text_loss": 0.7016357183456421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0005730925465802788, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8227048.0, + "repeat_count": 0.0, + "routers_loss": 0.0009548200177960098, + "skip_count": 0.0, + "step": 5102, + "text_loss": 0.30823078751564026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005727863411343526, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8229971.0, + "repeat_count": 0.0, + "routers_loss": 0.0005767418188042939, + "skip_count": 0.0, + "step": 5104, + "text_loss": 0.6897505521774292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0005724801077965629, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8232758.0, + "repeat_count": 0.0, + "routers_loss": 0.009297889657318592, + "skip_count": 3.0, + "step": 5106, + "text_loss": 0.21293514966964722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.981508658643968, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005721738466842592, + "loss": 0.0079, + "macro_f1": 0.3272727429866791, + "num_tokens": 8238154.0, + "repeat_count": 1.0, + "routers_loss": 0.013964693062007427, + "skip_count": 0.0, + "step": 5108, + "text_loss": 0.7273620367050171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 23.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005718675579148014, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8240818.0, + "repeat_count": 3.0, + "routers_loss": 0.007218098267912865, + "skip_count": 1.0, + "step": 5110, + "text_loss": 0.5607150793075562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005715612416055598, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 8244048.0, + "repeat_count": 0.0, + "routers_loss": 0.007558444049209356, + "skip_count": 2.0, + "step": 5112, + "text_loss": 0.23694385588169098 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.009392427355444, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005712548978739154, + "loss": 0.0072, + "macro_f1": 0.6603773832321167, + "num_tokens": 8247240.0, + "repeat_count": 1.0, + "routers_loss": 0.015726923942565918, + "skip_count": 1.0, + "step": 5114, + "text_loss": 0.6032099723815918 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.01878485471089, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0005709485268372598, + "loss": 0.0046, + "macro_f1": 0.9262410998344421, + "num_tokens": 8250585.0, + "repeat_count": 3.0, + "routers_loss": 0.011148860678076744, + "skip_count": 2.0, + "step": 5116, + "text_loss": 0.6825997233390808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005706421286129948, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 8254240.0, + "repeat_count": 0.0, + "routers_loss": 0.006977916229516268, + "skip_count": 0.0, + "step": 5118, + "text_loss": 0.2532844543457031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0005703357033185328, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 8257133.0, + "repeat_count": 0.0, + "routers_loss": 0.006415650714188814, + "skip_count": 2.0, + "step": 5120, + "text_loss": 0.6132124066352844 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005700292510712967, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 8261076.0, + "repeat_count": 1.0, + "routers_loss": 0.0044475216418504715, + "skip_count": 1.0, + "step": 5122, + "text_loss": 0.4277699887752533 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005697227719887194, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8264607.0, + "repeat_count": 0.0, + "routers_loss": 0.005743155721575022, + "skip_count": 2.0, + "step": 5124, + "text_loss": 0.2570968270301819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005694162661882444, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8267992.0, + "repeat_count": 0.0, + "routers_loss": 0.0007581565878354013, + "skip_count": 0.0, + "step": 5126, + "text_loss": 0.5850184559822083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005691097337873252, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 8271010.0, + "repeat_count": 0.0, + "routers_loss": 0.0036611228715628386, + "skip_count": 0.0, + "step": 5128, + "text_loss": 0.660999059677124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0005688031749034258, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 8273638.0, + "repeat_count": 0.0, + "routers_loss": 0.0039906189776957035, + "skip_count": 0.0, + "step": 5130, + "text_loss": 0.5839648246765137 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0005684965896540198, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8276504.0, + "repeat_count": 1.0, + "routers_loss": 0.007539632264524698, + "skip_count": 3.0, + "step": 5132, + "text_loss": 0.27675092220306396 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 24.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005681899781565915, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 8279977.0, + "repeat_count": 2.0, + "routers_loss": 0.0026953567285090685, + "skip_count": 0.0, + "step": 5134, + "text_loss": 0.532974123954773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.000567883340528635, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 8282781.0, + "repeat_count": 0.0, + "routers_loss": 0.005754240322858095, + "skip_count": 1.0, + "step": 5136, + "text_loss": 0.31100207567214966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005675766768876542, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8286533.0, + "repeat_count": 0.0, + "routers_loss": 0.0051517849788069725, + "skip_count": 0.0, + "step": 5138, + "text_loss": 0.5734741687774658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005672699873511635, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 8289858.0, + "repeat_count": 0.0, + "routers_loss": 0.0025852699764072895, + "skip_count": 2.0, + "step": 5140, + "text_loss": 0.37045374512672424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005669632720366868, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8293038.0, + "repeat_count": 0.0, + "routers_loss": 0.0038520018570125103, + "skip_count": 0.0, + "step": 5142, + "text_loss": 0.25952374935150146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005666565310617577, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8295717.0, + "repeat_count": 0.0, + "routers_loss": 0.00026914477348327637, + "skip_count": 0.0, + "step": 5144, + "text_loss": 0.32531213760375977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0005663497645439203, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8299750.0, + "repeat_count": 0.0, + "routers_loss": 0.0055860537104308605, + "skip_count": 2.0, + "step": 5146, + "text_loss": 0.2520618438720703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005660429726007279, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8303075.0, + "repeat_count": 0.0, + "routers_loss": 0.004446739796549082, + "skip_count": 1.0, + "step": 5148, + "text_loss": 0.43672287464141846 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.17845611975345, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.07080078125, + "learning_rate": 0.000565736155349744, + "loss": 0.0076, + "macro_f1": 0.8814815282821655, + "num_tokens": 8306268.0, + "repeat_count": 2.0, + "routers_loss": 0.046915046870708466, + "skip_count": 4.0, + "step": 5150, + "text_loss": 0.35405927896499634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 24.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005654293129085412, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8310480.0, + "repeat_count": 0.0, + "routers_loss": 0.010549088008701801, + "skip_count": 4.0, + "step": 5152, + "text_loss": 0.3523249626159668 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005651224453947023, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8313367.0, + "repeat_count": 1.0, + "routers_loss": 0.002893900265917182, + "skip_count": 0.0, + "step": 5154, + "text_loss": 0.4503810703754425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0005648155529258195, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8318006.0, + "repeat_count": 0.0, + "routers_loss": 0.0018450213829055429, + "skip_count": 0.0, + "step": 5156, + "text_loss": 0.5687127113342285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0005645086356194943, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8320646.0, + "repeat_count": 0.0, + "routers_loss": 0.0026727779768407345, + "skip_count": 0.0, + "step": 5158, + "text_loss": 0.38920050859451294 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0005642016935933385, + "loss": 0.0035, + "macro_f1": 1.0, + "num_tokens": 8323915.0, + "repeat_count": 1.0, + "routers_loss": 0.00611621281132102, + "skip_count": 2.0, + "step": 5160, + "text_loss": 0.3003547787666321 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 24.234810683886117, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0005638947269649726, + "loss": 0.0063, + "macro_f1": 0.9619450569152832, + "num_tokens": 8327073.0, + "repeat_count": 1.0, + "routers_loss": 0.028447439894080162, + "skip_count": 6.0, + "step": 5162, + "text_loss": 0.24053414165973663 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0005635877358520268, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8330388.0, + "repeat_count": 0.0, + "routers_loss": 0.0013072624569758773, + "skip_count": 0.0, + "step": 5164, + "text_loss": 0.43772217631340027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005632807203721406, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 8333241.0, + "repeat_count": 0.0, + "routers_loss": 0.0009456822881475091, + "skip_count": 0.0, + "step": 5166, + "text_loss": 0.5217573046684265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.000562973680642963, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8337257.0, + "repeat_count": 0.0, + "routers_loss": 0.0023840824142098427, + "skip_count": 0.0, + "step": 5168, + "text_loss": 0.31814974546432495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0005626666167821521, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 8340143.0, + "repeat_count": 0.0, + "routers_loss": 0.0020231492817401886, + "skip_count": 3.0, + "step": 5170, + "text_loss": 0.5478505492210388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0162353515625, + "learning_rate": 0.0005623595289073755, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 8343566.0, + "repeat_count": 1.0, + "routers_loss": 0.01070715207606554, + "skip_count": 2.0, + "step": 5172, + "text_loss": 0.23213914036750793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005620524171363099, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8346836.0, + "repeat_count": 0.0, + "routers_loss": 0.003720001084730029, + "skip_count": 3.0, + "step": 5174, + "text_loss": 0.5114789009094238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005617452815866409, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 8349726.0, + "repeat_count": 1.0, + "routers_loss": 0.003322509117424488, + "skip_count": 1.0, + "step": 5176, + "text_loss": 0.4894506335258484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005614381223760635, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 8352478.0, + "repeat_count": 0.0, + "routers_loss": 0.00028752797516062856, + "skip_count": 0.0, + "step": 5178, + "text_loss": 0.6418307423591614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005611309396222817, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 8355766.0, + "repeat_count": 0.0, + "routers_loss": 0.0028724796138703823, + "skip_count": 0.0, + "step": 5180, + "text_loss": 0.23635952174663544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.328734957440563, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005608237334430085, + "loss": 0.0068, + "macro_f1": 0.6601307392120361, + "num_tokens": 8358888.0, + "repeat_count": 1.0, + "routers_loss": 0.058520980179309845, + "skip_count": 2.0, + "step": 5182, + "text_loss": 0.23434793949127197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1015625, + "learning_rate": 0.000560516503955966, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8361761.0, + "repeat_count": 0.0, + "routers_loss": 0.0021356395445764065, + "skip_count": 1.0, + "step": 5184, + "text_loss": 0.40855672955513 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000560209251278885, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 8364376.0, + "repeat_count": 0.0, + "routers_loss": 0.0016185789136216044, + "skip_count": 0.0, + "step": 5186, + "text_loss": 0.6265131831169128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005599019755295053, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8367769.0, + "repeat_count": 0.0, + "routers_loss": 0.0031490204855799675, + "skip_count": 2.0, + "step": 5188, + "text_loss": 0.4716353118419647 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0005595946768255756, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8370705.0, + "repeat_count": 1.0, + "routers_loss": 0.003500689286738634, + "skip_count": 0.0, + "step": 5190, + "text_loss": 0.5467679500579834 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0005592873552848532, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 8374217.0, + "repeat_count": 2.0, + "routers_loss": 0.010764475911855698, + "skip_count": 3.0, + "step": 5192, + "text_loss": 0.4345340132713318 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 24.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005589800110251045, + "loss": 0.0087, + "macro_f1": 1.0, + "num_tokens": 8378182.0, + "repeat_count": 2.0, + "routers_loss": 0.0010365343187004328, + "skip_count": 1.0, + "step": 5194, + "text_loss": 0.46722909808158875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005586726441641044, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8381227.0, + "repeat_count": 0.0, + "routers_loss": 0.006349093746393919, + "skip_count": 2.0, + "step": 5196, + "text_loss": 0.35410359501838684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0005583652548196362, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8384886.0, + "repeat_count": 0.0, + "routers_loss": 0.00038166221929714084, + "skip_count": 0.0, + "step": 5198, + "text_loss": 0.5950250625610352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005580578431094924, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8388939.0, + "repeat_count": 0.0, + "routers_loss": 0.0023578559048473835, + "skip_count": 2.0, + "step": 5200, + "text_loss": 0.6553771495819092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005577504091514735, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 8391629.0, + "repeat_count": 0.0, + "routers_loss": 0.0010771085508167744, + "skip_count": 0.0, + "step": 5202, + "text_loss": 0.4441985785961151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.000557442953063389, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8394440.0, + "repeat_count": 0.0, + "routers_loss": 0.005844325292855501, + "skip_count": 3.0, + "step": 5204, + "text_loss": 0.5807011723518372 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0005571354749630564, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8397731.0, + "repeat_count": 0.0, + "routers_loss": 0.006837233901023865, + "skip_count": 1.0, + "step": 5206, + "text_loss": 0.27780941128730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.000556827974968302, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8400859.0, + "repeat_count": 0.0, + "routers_loss": 0.007656649220734835, + "skip_count": 3.0, + "step": 5208, + "text_loss": 0.4746324121952057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0005565204531969606, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8404164.0, + "repeat_count": 0.0, + "routers_loss": 0.0028129038400948048, + "skip_count": 1.0, + "step": 5210, + "text_loss": 0.8513513803482056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0005562129097668746, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8407196.0, + "repeat_count": 0.0, + "routers_loss": 0.00492360582575202, + "skip_count": 1.0, + "step": 5212, + "text_loss": 0.12255420535802841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005559053447958958, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8410633.0, + "repeat_count": 0.0, + "routers_loss": 0.0020713545382022858, + "skip_count": 0.0, + "step": 5214, + "text_loss": 0.6878522634506226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0005555977584018833, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8413414.0, + "repeat_count": 0.0, + "routers_loss": 0.0007216963567771018, + "skip_count": 0.0, + "step": 5216, + "text_loss": 0.845878541469574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0005552901507027048, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8416817.0, + "repeat_count": 0.0, + "routers_loss": 0.002400130731984973, + "skip_count": 1.0, + "step": 5218, + "text_loss": 0.16753672063350677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0005549825218162365, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 8419617.0, + "repeat_count": 0.0, + "routers_loss": 0.004563181661069393, + "skip_count": 0.0, + "step": 5220, + "text_loss": 0.26107168197631836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.000554674871860362, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 8422686.0, + "repeat_count": 1.0, + "routers_loss": 0.006413881666958332, + "skip_count": 1.0, + "step": 5222, + "text_loss": 0.6333847045898438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005543672009529734, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 8425571.0, + "repeat_count": 0.0, + "routers_loss": 0.0057656955905258656, + "skip_count": 3.0, + "step": 5224, + "text_loss": 0.4552212357521057 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0005540595092119709, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 8429038.0, + "repeat_count": 2.0, + "routers_loss": 0.011755156330764294, + "skip_count": 2.0, + "step": 5226, + "text_loss": 0.16597330570220947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0005537517967552626, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8432117.0, + "repeat_count": 0.0, + "routers_loss": 0.0007519085193052888, + "skip_count": 0.0, + "step": 5228, + "text_loss": 0.6283590197563171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.000553444063700764, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 8435176.0, + "repeat_count": 0.0, + "routers_loss": 0.003066456411033869, + "skip_count": 0.0, + "step": 5230, + "text_loss": 0.2360922247171402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.0005531363101663998, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8438515.0, + "repeat_count": 0.0, + "routers_loss": 0.002865589689463377, + "skip_count": 0.0, + "step": 5232, + "text_loss": 0.8075396418571472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005528285362701011, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 8441731.0, + "repeat_count": 0.0, + "routers_loss": 0.0012521179160103202, + "skip_count": 0.0, + "step": 5234, + "text_loss": 0.584335446357727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0005525207421298077, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8444535.0, + "repeat_count": 0.0, + "routers_loss": 0.005398475099354982, + "skip_count": 3.0, + "step": 5236, + "text_loss": 0.22711622714996338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005522129278634669, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 8448337.0, + "repeat_count": 0.0, + "routers_loss": 0.002957914723083377, + "skip_count": 1.0, + "step": 5238, + "text_loss": 0.3157515823841095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0005519050935890335, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8451530.0, + "repeat_count": 0.0, + "routers_loss": 0.007757039275020361, + "skip_count": 3.0, + "step": 5240, + "text_loss": 0.2815830111503601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.610507778103905, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005515972394244704, + "loss": 0.0063, + "macro_f1": 0.6603773832321167, + "num_tokens": 8454171.0, + "repeat_count": 1.0, + "routers_loss": 0.021602008491754532, + "skip_count": 1.0, + "step": 5242, + "text_loss": 0.6024490594863892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005512893654877478, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8457544.0, + "repeat_count": 0.0, + "routers_loss": 0.006062488537281752, + "skip_count": 0.0, + "step": 5244, + "text_loss": 0.550110936164856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0005509814718968435, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 8460135.0, + "repeat_count": 0.0, + "routers_loss": 0.002793943975120783, + "skip_count": 0.0, + "step": 5246, + "text_loss": 0.4361286163330078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0005506735587697433, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8463516.0, + "repeat_count": 0.0, + "routers_loss": 0.0016669550677761436, + "skip_count": 0.0, + "step": 5248, + "text_loss": 0.4642958641052246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0005503656262244395, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8466406.0, + "repeat_count": 0.0, + "routers_loss": 0.0006051387754268944, + "skip_count": 0.0, + "step": 5250, + "text_loss": 0.3445641100406647 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 24.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005500576743789329, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 8468838.0, + "repeat_count": 2.0, + "routers_loss": 0.00654293829575181, + "skip_count": 1.0, + "step": 5252, + "text_loss": 0.2842808663845062 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.666862342236573, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005497497033512309, + "loss": 0.0077, + "macro_f1": 0.8817967176437378, + "num_tokens": 8471815.0, + "repeat_count": 2.0, + "routers_loss": 0.03845973685383797, + "skip_count": 3.0, + "step": 5254, + "text_loss": 0.2597215175628662 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 24.676254769592017, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0005494417132593487, + "loss": 0.0047, + "macro_f1": 0.9452888369560242, + "num_tokens": 8475202.0, + "repeat_count": 1.0, + "routers_loss": 0.02252381667494774, + "skip_count": 4.0, + "step": 5256, + "text_loss": 0.32269927859306335 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0005491337042213088, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8478650.0, + "repeat_count": 0.0, + "routers_loss": 0.01232751365751028, + "skip_count": 2.0, + "step": 5258, + "text_loss": 0.6523372530937195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005488256763551408, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8481724.0, + "repeat_count": 0.0, + "routers_loss": 0.0028322834987193346, + "skip_count": 0.0, + "step": 5260, + "text_loss": 0.4212580621242523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.0005485176297788814, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 8485833.0, + "repeat_count": 0.0, + "routers_loss": 0.002623105887323618, + "skip_count": 2.0, + "step": 5262, + "text_loss": 0.16906329989433289 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005482095646105748, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 8489089.0, + "repeat_count": 1.0, + "routers_loss": 0.0007179114618338645, + "skip_count": 0.0, + "step": 5264, + "text_loss": 0.4523872137069702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005479014809682721, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 8492905.0, + "repeat_count": 0.0, + "routers_loss": 0.005234059412032366, + "skip_count": 0.0, + "step": 5266, + "text_loss": 0.207139790058136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0005475933789700314, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 8495480.0, + "repeat_count": 0.0, + "routers_loss": 0.0023258263245224953, + "skip_count": 0.0, + "step": 5268, + "text_loss": 0.18060965836048126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005472852587339183, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8499070.0, + "repeat_count": 0.0, + "routers_loss": 0.0013497259933501482, + "skip_count": 0.0, + "step": 5270, + "text_loss": 0.7460769414901733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0005469771203780048, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 8502886.0, + "repeat_count": 0.0, + "routers_loss": 0.0003589815751183778, + "skip_count": 0.0, + "step": 5272, + "text_loss": 0.48119160532951355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005466689640203701, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8506646.0, + "repeat_count": 0.0, + "routers_loss": 0.006619705818593502, + "skip_count": 1.0, + "step": 5274, + "text_loss": 0.15656520426273346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005463607897791005, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 8509450.0, + "repeat_count": 0.0, + "routers_loss": 0.002992175053805113, + "skip_count": 1.0, + "step": 5276, + "text_loss": 0.486930251121521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0005460525977722886, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8512851.0, + "repeat_count": 0.0, + "routers_loss": 0.0027784097474068403, + "skip_count": 0.0, + "step": 5278, + "text_loss": 0.19654682278633118 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0005457443881180345, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8516858.0, + "repeat_count": 0.0, + "routers_loss": 0.0017648129723966122, + "skip_count": 0.0, + "step": 5280, + "text_loss": 0.580982506275177 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005454361609344444, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 8519912.0, + "repeat_count": 2.0, + "routers_loss": 0.010817649774253368, + "skip_count": 3.0, + "step": 5282, + "text_loss": 0.2644204795360565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000545127916339632, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8522396.0, + "repeat_count": 0.0, + "routers_loss": 0.001453282660804689, + "skip_count": 0.0, + "step": 5284, + "text_loss": 0.5014839172363281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005448196544517168, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8525326.0, + "repeat_count": 0.0, + "routers_loss": 0.006645771209150553, + "skip_count": 2.0, + "step": 5286, + "text_loss": 0.2983154058456421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005445113753888254, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8528611.0, + "repeat_count": 0.0, + "routers_loss": 0.0005447337171062827, + "skip_count": 0.0, + "step": 5288, + "text_loss": 0.43598243594169617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.000544203079269091, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8531571.0, + "repeat_count": 0.0, + "routers_loss": 0.0026976624503731728, + "skip_count": 0.0, + "step": 5290, + "text_loss": 0.6454944610595703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005438947662106533, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 8534565.0, + "repeat_count": 0.0, + "routers_loss": 0.002217630622908473, + "skip_count": 0.0, + "step": 5292, + "text_loss": 0.742935836315155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 24.854710889345466, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.029052734375, + "learning_rate": 0.0005435864363316584, + "loss": 0.0073, + "macro_f1": 0.8820862174034119, + "num_tokens": 8537581.0, + "repeat_count": 2.0, + "routers_loss": 0.030740609392523766, + "skip_count": 2.0, + "step": 5294, + "text_loss": 0.48913639783859253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005432780897502588, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8541271.0, + "repeat_count": 0.0, + "routers_loss": 0.005306888837367296, + "skip_count": 1.0, + "step": 5296, + "text_loss": 0.5820846557617188 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0005429697265846137, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8545052.0, + "repeat_count": 1.0, + "routers_loss": 0.002255369909107685, + "skip_count": 0.0, + "step": 5298, + "text_loss": 0.565483808517456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005426613469528881, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8548605.0, + "repeat_count": 0.0, + "routers_loss": 0.0010787079809233546, + "skip_count": 0.0, + "step": 5300, + "text_loss": 0.40154510736465454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000542352950973254, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8552581.0, + "repeat_count": 0.0, + "routers_loss": 0.0017972089117392898, + "skip_count": 0.0, + "step": 5302, + "text_loss": 0.5430748462677002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.0005420445387638891, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 8556360.0, + "repeat_count": 0.0, + "routers_loss": 0.0016180560924112797, + "skip_count": 2.0, + "step": 5304, + "text_loss": 0.544040322303772 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0005417361104429777, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 8559264.0, + "repeat_count": 1.0, + "routers_loss": 0.012688961811363697, + "skip_count": 2.0, + "step": 5306, + "text_loss": 0.2018517404794693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0005414276661287101, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8562169.0, + "repeat_count": 0.0, + "routers_loss": 0.0012141643092036247, + "skip_count": 0.0, + "step": 5308, + "text_loss": 0.5685747265815735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059326171875, + "learning_rate": 0.0005411192059392826, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 8565231.0, + "repeat_count": 0.0, + "routers_loss": 0.0015626107342541218, + "skip_count": 0.0, + "step": 5310, + "text_loss": 0.8073471784591675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0005408107299928979, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8568122.0, + "repeat_count": 0.0, + "routers_loss": 0.004773529712110758, + "skip_count": 0.0, + "step": 5312, + "text_loss": 0.22583355009555817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0005405022384077644, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8571056.0, + "repeat_count": 0.0, + "routers_loss": 0.0025621228851377964, + "skip_count": 1.0, + "step": 5314, + "text_loss": 0.25274428725242615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005401937313020967, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 8574300.0, + "repeat_count": 0.0, + "routers_loss": 0.009726752527058125, + "skip_count": 2.0, + "step": 5316, + "text_loss": 0.3283393979072571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 24.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005398852087941155, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8577424.0, + "repeat_count": 0.0, + "routers_loss": 0.012483839876949787, + "skip_count": 4.0, + "step": 5318, + "text_loss": 0.1876130849123001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.000539576671002047, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 8580309.0, + "repeat_count": 0.0, + "routers_loss": 0.0009830677881836891, + "skip_count": 0.0, + "step": 5320, + "text_loss": 0.6955490708351135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046875, + "learning_rate": 0.0005392681180441235, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8583399.0, + "repeat_count": 0.0, + "routers_loss": 0.0010819481685757637, + "skip_count": 0.0, + "step": 5322, + "text_loss": 0.4708341956138611 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.000538959550038583, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8586259.0, + "repeat_count": 0.0, + "routers_loss": 0.005763369146734476, + "skip_count": 0.0, + "step": 5324, + "text_loss": 0.20463642477989197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005386509671036695, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8589067.0, + "repeat_count": 0.0, + "routers_loss": 0.0006229027640074492, + "skip_count": 0.0, + "step": 5326, + "text_loss": 0.6819888353347778 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 25.014088641033165, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.03466796875, + "learning_rate": 0.0005383423693576325, + "loss": 0.0087, + "macro_f1": 0.9619450569152832, + "num_tokens": 8592837.0, + "repeat_count": 1.0, + "routers_loss": 0.030066559091210365, + "skip_count": 6.0, + "step": 5328, + "text_loss": 0.24606549739837646 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0005380337569187272, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8596293.0, + "repeat_count": 1.0, + "routers_loss": 0.007445990107953548, + "skip_count": 0.0, + "step": 5330, + "text_loss": 0.16730253398418427 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 25.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0005377251299052145, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8599360.0, + "repeat_count": 1.0, + "routers_loss": 0.004563331138342619, + "skip_count": 1.0, + "step": 5332, + "text_loss": 0.6856988668441772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0005374164884353608, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8602376.0, + "repeat_count": 0.0, + "routers_loss": 0.0015491938684135675, + "skip_count": 0.0, + "step": 5334, + "text_loss": 1.3248854875564575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005371078326274382, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8605400.0, + "repeat_count": 0.0, + "routers_loss": 0.0016098044579848647, + "skip_count": 0.0, + "step": 5336, + "text_loss": 0.747150182723999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 25.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0005367991625997243, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 8608100.0, + "repeat_count": 0.0, + "routers_loss": 0.0034471298567950726, + "skip_count": 3.0, + "step": 5338, + "text_loss": 0.6443291902542114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005364904784705015, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8611768.0, + "repeat_count": 0.0, + "routers_loss": 0.007947597652673721, + "skip_count": 1.0, + "step": 5340, + "text_loss": 0.7768037915229797 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 25.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0005361817803580588, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 8614424.0, + "repeat_count": 2.0, + "routers_loss": 0.009964234195649624, + "skip_count": 2.0, + "step": 5342, + "text_loss": 0.22826914489269257 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005358730683806896, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8617826.0, + "repeat_count": 0.0, + "routers_loss": 0.0014116480015218258, + "skip_count": 0.0, + "step": 5344, + "text_loss": 0.49022090435028076 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 25.098620487232168, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005355643426566929, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 8621220.0, + "repeat_count": 1.0, + "routers_loss": 0.013940622098743916, + "skip_count": 2.0, + "step": 5346, + "text_loss": 0.26819515228271484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000535255603304373, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8623957.0, + "repeat_count": 0.0, + "routers_loss": 0.0032230091746896505, + "skip_count": 2.0, + "step": 5348, + "text_loss": 0.46905452013015747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005349468504420395, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8626760.0, + "repeat_count": 0.0, + "routers_loss": 0.002631337149068713, + "skip_count": 1.0, + "step": 5350, + "text_loss": 0.5312309861183167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005346380841880068, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8630207.0, + "repeat_count": 0.0, + "routers_loss": 0.004526057746261358, + "skip_count": 2.0, + "step": 5352, + "text_loss": 0.5810666084289551 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0005343293046605949, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8633241.0, + "repeat_count": 0.0, + "routers_loss": 0.0023941127583384514, + "skip_count": 0.0, + "step": 5354, + "text_loss": 0.18468725681304932 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0005340205119781288, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8636215.0, + "repeat_count": 1.0, + "routers_loss": 0.0017020340310409665, + "skip_count": 0.0, + "step": 5356, + "text_loss": 0.6665788888931274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005337117062589383, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 8639326.0, + "repeat_count": 0.0, + "routers_loss": 0.004964717663824558, + "skip_count": 2.0, + "step": 5358, + "text_loss": 0.19770404696464539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005334028876213585, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8642157.0, + "repeat_count": 0.0, + "routers_loss": 0.006587155628949404, + "skip_count": 0.0, + "step": 5360, + "text_loss": 0.2295130044221878 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0005330940561837291, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8645355.0, + "repeat_count": 0.0, + "routers_loss": 0.0006586945964954793, + "skip_count": 0.0, + "step": 5362, + "text_loss": 0.2701159417629242 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005327852120643947, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8648911.0, + "repeat_count": 1.0, + "routers_loss": 0.0014281768817454576, + "skip_count": 0.0, + "step": 5364, + "text_loss": 0.8957229852676392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0005324763553817053, + "loss": 0.0027, + "macro_f1": 0.3333333432674408, + "num_tokens": 8652037.0, + "repeat_count": 0.0, + "routers_loss": 0.0005899337120354176, + "skip_count": 0.0, + "step": 5366, + "text_loss": 0.38642236590385437 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 25.20193718814206, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005321674862540154, + "loss": 0.0058, + "macro_f1": 0.9265305995941162, + "num_tokens": 8655381.0, + "repeat_count": 3.0, + "routers_loss": 0.024511313065886497, + "skip_count": 1.0, + "step": 5368, + "text_loss": 0.6439879536628723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000531858604799684, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 8658476.0, + "repeat_count": 0.0, + "routers_loss": 0.0012558114249259233, + "skip_count": 0.0, + "step": 5370, + "text_loss": 0.3227672874927521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0005315497111370752, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8661982.0, + "repeat_count": 0.0, + "routers_loss": 0.0013541636290028691, + "skip_count": 0.0, + "step": 5372, + "text_loss": 0.6375321745872498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.230114470208395, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.051513671875, + "learning_rate": 0.0005312408053845575, + "loss": 0.0052, + "macro_f1": 0.5492662787437439, + "num_tokens": 8665071.0, + "repeat_count": 0.0, + "routers_loss": 0.010432626120746136, + "skip_count": 2.0, + "step": 5374, + "text_loss": 0.536924421787262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005309318876605042, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8668411.0, + "repeat_count": 0.0, + "routers_loss": 0.004450209904462099, + "skip_count": 1.0, + "step": 5376, + "text_loss": 0.2643466889858246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005306229580832933, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 8672088.0, + "repeat_count": 1.0, + "routers_loss": 0.011189920827746391, + "skip_count": 3.0, + "step": 5378, + "text_loss": 0.8259533047676086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.000530314016771307, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8675206.0, + "repeat_count": 0.0, + "routers_loss": 0.0020095291547477245, + "skip_count": 0.0, + "step": 5380, + "text_loss": 0.31364113092422485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.267684179630173, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0005300050638429324, + "loss": 0.0078, + "macro_f1": 0.3272727429866791, + "num_tokens": 8678289.0, + "repeat_count": 0.0, + "routers_loss": 0.010738557204604149, + "skip_count": 1.0, + "step": 5382, + "text_loss": 0.19013966619968414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0005296960994165607, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8681555.0, + "repeat_count": 0.0, + "routers_loss": 0.0018534278497099876, + "skip_count": 1.0, + "step": 5384, + "text_loss": 0.762248694896698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0005293871236105877, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 8684413.0, + "repeat_count": 0.0, + "routers_loss": 0.009143726900219917, + "skip_count": 2.0, + "step": 5386, + "text_loss": 0.19994212687015533 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 25.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005290781365434134, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8687450.0, + "repeat_count": 2.0, + "routers_loss": 0.002034468576312065, + "skip_count": 0.0, + "step": 5388, + "text_loss": 0.5519160628318787 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0005287691383334425, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8690651.0, + "repeat_count": 1.0, + "routers_loss": 0.006834167055785656, + "skip_count": 0.0, + "step": 5390, + "text_loss": 0.5439304709434509 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0005284601290990832, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8693929.0, + "repeat_count": 1.0, + "routers_loss": 0.0022327799815684557, + "skip_count": 0.0, + "step": 5392, + "text_loss": 0.24108269810676575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0005281511089587491, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 8696727.0, + "repeat_count": 0.0, + "routers_loss": 0.002669565612450242, + "skip_count": 0.0, + "step": 5394, + "text_loss": 0.8659077286720276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005278420780308568, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8700934.0, + "repeat_count": 0.0, + "routers_loss": 0.007252473384141922, + "skip_count": 0.0, + "step": 5396, + "text_loss": 0.5592793226242065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005275330364338276, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 8704449.0, + "repeat_count": 0.0, + "routers_loss": 0.001793015981093049, + "skip_count": 0.0, + "step": 5398, + "text_loss": 0.5211784243583679 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 25.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0005272239842860868, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 8707384.0, + "repeat_count": 5.0, + "routers_loss": 0.00963665172457695, + "skip_count": 4.0, + "step": 5400, + "text_loss": 0.6092788577079773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.36160845318462, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03515625, + "learning_rate": 0.0005269149217060642, + "loss": 0.0059, + "macro_f1": 0.5492662787437439, + "num_tokens": 8710453.0, + "repeat_count": 0.0, + "routers_loss": 0.01758105307817459, + "skip_count": 2.0, + "step": 5402, + "text_loss": 0.3423936069011688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0005266058488121926, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8713514.0, + "repeat_count": 0.0, + "routers_loss": 0.0025636721402406693, + "skip_count": 1.0, + "step": 5404, + "text_loss": 0.484171986579895 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.38039330789551, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0005262967657229095, + "loss": 0.0064, + "macro_f1": 0.9255813956260681, + "num_tokens": 8717051.0, + "repeat_count": 3.0, + "routers_loss": 0.022406045347452164, + "skip_count": 4.0, + "step": 5406, + "text_loss": 0.23368191719055176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005259876725566563, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8719987.0, + "repeat_count": 0.0, + "routers_loss": 0.004114408977329731, + "skip_count": 2.0, + "step": 5408, + "text_loss": 0.20237496495246887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.000525678569431878, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 8723258.0, + "repeat_count": 0.0, + "routers_loss": 0.006741158664226532, + "skip_count": 2.0, + "step": 5410, + "text_loss": 0.7969435453414917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.0005253694564670233, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 8726294.0, + "repeat_count": 0.0, + "routers_loss": 0.0034468702506273985, + "skip_count": 0.0, + "step": 5412, + "text_loss": 0.5533816814422607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000525060333780545, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8729603.0, + "repeat_count": 0.0, + "routers_loss": 0.01086533535271883, + "skip_count": 2.0, + "step": 5414, + "text_loss": 0.31856611371040344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005247512014908998, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 8733423.0, + "repeat_count": 0.0, + "routers_loss": 0.00512756546959281, + "skip_count": 6.0, + "step": 5416, + "text_loss": 0.6710903644561768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0005244420597165472, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 8736457.0, + "repeat_count": 0.0, + "routers_loss": 0.0026201079599559307, + "skip_count": 0.0, + "step": 5418, + "text_loss": 0.6469964981079102 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0005241329085759514, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 8739617.0, + "repeat_count": 0.0, + "routers_loss": 0.004130818881094456, + "skip_count": 0.0, + "step": 5420, + "text_loss": 0.4868837296962738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005238237481875795, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8742653.0, + "repeat_count": 0.0, + "routers_loss": 0.003171122632920742, + "skip_count": 0.0, + "step": 5422, + "text_loss": 0.12026242166757584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0005235145786699021, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 8745835.0, + "repeat_count": 0.0, + "routers_loss": 0.0008553664083592594, + "skip_count": 0.0, + "step": 5424, + "text_loss": 0.601640522480011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005232054001413941, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8749006.0, + "repeat_count": 0.0, + "routers_loss": 0.0006958908052183688, + "skip_count": 0.0, + "step": 5426, + "text_loss": 0.7083519101142883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0005228962127205329, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 8752493.0, + "repeat_count": 0.0, + "routers_loss": 0.0012221037177368999, + "skip_count": 1.0, + "step": 5428, + "text_loss": 0.3949109613895416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005225870165257997, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 8755294.0, + "repeat_count": 1.0, + "routers_loss": 0.003924673888832331, + "skip_count": 2.0, + "step": 5430, + "text_loss": 0.7487186789512634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005222778116756793, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8758043.0, + "repeat_count": 0.0, + "routers_loss": 0.002388258930295706, + "skip_count": 0.0, + "step": 5432, + "text_loss": 0.4092858135700226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0005219685982886594, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 8760618.0, + "repeat_count": 1.0, + "routers_loss": 0.0045886957086622715, + "skip_count": 0.0, + "step": 5434, + "text_loss": 0.5889580249786377 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0005216593764832311, + "loss": 0.0074, + "macro_f1": 1.0, + "num_tokens": 8764269.0, + "repeat_count": 1.0, + "routers_loss": 0.00704155582934618, + "skip_count": 2.0, + "step": 5436, + "text_loss": 0.2634117007255554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005213501463778889, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8767142.0, + "repeat_count": 0.0, + "routers_loss": 0.00368728069588542, + "skip_count": 2.0, + "step": 5438, + "text_loss": 0.3512301445007324 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0005210409080911304, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8770239.0, + "repeat_count": 0.0, + "routers_loss": 0.0012925115879625082, + "skip_count": 0.0, + "step": 5440, + "text_loss": 0.9330073595046997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005207316617414561, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8772927.0, + "repeat_count": 0.0, + "routers_loss": 0.005604506935924292, + "skip_count": 0.0, + "step": 5442, + "text_loss": 0.23477613925933838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.55884942764896, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0005204224074473701, + "loss": 0.0049, + "macro_f1": 0.6601307392120361, + "num_tokens": 8776451.0, + "repeat_count": 1.0, + "routers_loss": 0.010945434682071209, + "skip_count": 2.0, + "step": 5444, + "text_loss": 0.6184295415878296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0005201131453273789, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 8779481.0, + "repeat_count": 0.0, + "routers_loss": 0.0024414353538304567, + "skip_count": 0.0, + "step": 5446, + "text_loss": 0.16186967492103577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.57763428235985, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005198038754999926, + "loss": 0.0052, + "macro_f1": 0.3272727429866791, + "num_tokens": 8782425.0, + "repeat_count": 1.0, + "routers_loss": 0.013872416689991951, + "skip_count": 0.0, + "step": 5448, + "text_loss": 0.42294546961784363 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0005194945980837237, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8785466.0, + "repeat_count": 0.0, + "routers_loss": 0.0006147907115519047, + "skip_count": 0.0, + "step": 5450, + "text_loss": 0.6285432577133179 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005191853131970881, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8788461.0, + "repeat_count": 0.0, + "routers_loss": 0.0010585964191704988, + "skip_count": 0.0, + "step": 5452, + "text_loss": 0.6032317876815796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005188760209586044, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8791572.0, + "repeat_count": 0.0, + "routers_loss": 0.005267909727990627, + "skip_count": 1.0, + "step": 5454, + "text_loss": 0.3015609681606293 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005185667214867937, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8794697.0, + "repeat_count": 0.0, + "routers_loss": 0.000532392121385783, + "skip_count": 0.0, + "step": 5456, + "text_loss": 0.9596265554428101 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0005182574149001805, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8797880.0, + "repeat_count": 0.0, + "routers_loss": 0.0007176774088293314, + "skip_count": 0.0, + "step": 5458, + "text_loss": 0.5599364638328552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0005179481013172912, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8801995.0, + "repeat_count": 0.0, + "routers_loss": 0.0022756673861294985, + "skip_count": 0.0, + "step": 5460, + "text_loss": 0.47327280044555664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005176387808566558, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8805138.0, + "repeat_count": 0.0, + "routers_loss": 0.0025084633380174637, + "skip_count": 0.0, + "step": 5462, + "text_loss": 0.26674970984458923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0005173294536368061, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 8808102.0, + "repeat_count": 0.0, + "routers_loss": 0.0008814680040813982, + "skip_count": 0.0, + "step": 5464, + "text_loss": 0.5981299877166748 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005170201197762773, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8811431.0, + "repeat_count": 0.0, + "routers_loss": 0.0005443177651613951, + "skip_count": 0.0, + "step": 5466, + "text_loss": 1.037438988685608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0005167107793936065, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8814256.0, + "repeat_count": 0.0, + "routers_loss": 0.000494555220939219, + "skip_count": 0.0, + "step": 5468, + "text_loss": 0.5005733966827393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005164014326073333, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 8817024.0, + "repeat_count": 0.0, + "routers_loss": 0.004793747793883085, + "skip_count": 2.0, + "step": 5470, + "text_loss": 0.6999614834785461 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005160920795360002, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 8819892.0, + "repeat_count": 0.0, + "routers_loss": 0.0020966180600225925, + "skip_count": 0.0, + "step": 5472, + "text_loss": 0.5536707043647766 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0005157827202981521, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8822928.0, + "repeat_count": 0.0, + "routers_loss": 0.0020367507822811604, + "skip_count": 0.0, + "step": 5474, + "text_loss": 0.43655988574028015 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005154733550123356, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8825842.0, + "repeat_count": 0.0, + "routers_loss": 0.0020070383325219154, + "skip_count": 0.0, + "step": 5476, + "text_loss": 0.48149657249450684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0005151639837971004, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8829534.0, + "repeat_count": 0.0, + "routers_loss": 0.0016327418852597475, + "skip_count": 0.0, + "step": 5478, + "text_loss": 0.6693689227104187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000514854606770998, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8833177.0, + "repeat_count": 0.0, + "routers_loss": 0.0012691980227828026, + "skip_count": 0.0, + "step": 5480, + "text_loss": 0.44926801323890686 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005145452240525822, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8836933.0, + "repeat_count": 1.0, + "routers_loss": 0.0007724820752628148, + "skip_count": 0.0, + "step": 5482, + "text_loss": 0.5759884119033813 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005142358357604092, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 8840093.0, + "repeat_count": 1.0, + "routers_loss": 0.008331702090799809, + "skip_count": 7.0, + "step": 5484, + "text_loss": 0.47393685579299927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0005139264420130368, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8843918.0, + "repeat_count": 0.0, + "routers_loss": 0.003124477108940482, + "skip_count": 2.0, + "step": 5486, + "text_loss": 0.5298711061477661 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005136170429290259, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8846558.0, + "repeat_count": 0.0, + "routers_loss": 0.0034127775579690933, + "skip_count": 2.0, + "step": 5488, + "text_loss": 0.43582668900489807 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005133076386269383, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8849724.0, + "repeat_count": 1.0, + "routers_loss": 0.0018056259723380208, + "skip_count": 0.0, + "step": 5490, + "text_loss": 0.8116800785064697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.784267684179632, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0005129982292253384, + "loss": 0.0063, + "macro_f1": 0.6589147448539734, + "num_tokens": 8852447.0, + "repeat_count": 1.0, + "routers_loss": 0.021452350541949272, + "skip_count": 6.0, + "step": 5492, + "text_loss": 0.31878748536109924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0005126888148427927, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8855886.0, + "repeat_count": 0.0, + "routers_loss": 0.0026911941822618246, + "skip_count": 0.0, + "step": 5494, + "text_loss": 0.4021807909011841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.80305253889052, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.025634765625, + "learning_rate": 0.0005123793955978693, + "loss": 0.007, + "macro_f1": 0.5492662787437439, + "num_tokens": 8859378.0, + "repeat_count": 0.0, + "routers_loss": 0.019764510914683342, + "skip_count": 2.0, + "step": 5496, + "text_loss": 0.21608132123947144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0005120699716091379, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 8862310.0, + "repeat_count": 0.0, + "routers_loss": 0.0008988190093077719, + "skip_count": 0.0, + "step": 5498, + "text_loss": 0.34666743874549866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005117605429951707, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8865166.0, + "repeat_count": 0.0, + "routers_loss": 0.011137975379824638, + "skip_count": 2.0, + "step": 5500, + "text_loss": 0.25385144352912903 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 25.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005114511098745412, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8869923.0, + "repeat_count": 1.0, + "routers_loss": 0.006476947572082281, + "skip_count": 4.0, + "step": 5502, + "text_loss": 0.4503856301307678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.000511141672365825, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8872451.0, + "repeat_count": 0.0, + "routers_loss": 0.0022727579344063997, + "skip_count": 0.0, + "step": 5504, + "text_loss": 0.7522464990615845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005108322305875987, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8875968.0, + "repeat_count": 0.0, + "routers_loss": 0.0020014268811792135, + "skip_count": 0.0, + "step": 5506, + "text_loss": 0.30184176564216614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0005105227846584414, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8879705.0, + "repeat_count": 0.0, + "routers_loss": 0.001179999322630465, + "skip_count": 0.0, + "step": 5508, + "text_loss": 0.6187804937362671 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0005102133346969329, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8883535.0, + "repeat_count": 1.0, + "routers_loss": 0.002946492750197649, + "skip_count": 0.0, + "step": 5510, + "text_loss": 0.5961501002311707 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005099038808216555, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 8886683.0, + "repeat_count": 1.0, + "routers_loss": 0.004532935563474894, + "skip_count": 3.0, + "step": 5512, + "text_loss": 0.38462957739830017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0005095944231511922, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 8891049.0, + "repeat_count": 0.0, + "routers_loss": 0.00917842984199524, + "skip_count": 2.0, + "step": 5514, + "text_loss": 0.27541956305503845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0005092849618041279, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 8893604.0, + "repeat_count": 0.0, + "routers_loss": 0.0008756510796956718, + "skip_count": 0.0, + "step": 5516, + "text_loss": 0.681315541267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005089754968990487, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8898072.0, + "repeat_count": 0.0, + "routers_loss": 0.0008704439387656748, + "skip_count": 1.0, + "step": 5518, + "text_loss": 0.5060005187988281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005086660285545422, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8901539.0, + "repeat_count": 0.0, + "routers_loss": 0.004750201944261789, + "skip_count": 1.0, + "step": 5520, + "text_loss": 0.6008047461509705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.000508356556889197, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8904525.0, + "repeat_count": 0.0, + "routers_loss": 0.0026552649214863777, + "skip_count": 0.0, + "step": 5522, + "text_loss": 0.4539012908935547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005080470820216037, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8907624.0, + "repeat_count": 0.0, + "routers_loss": 0.002621029270812869, + "skip_count": 1.0, + "step": 5524, + "text_loss": 0.20088370144367218 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 25.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005077376040703533, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 8910515.0, + "repeat_count": 3.0, + "routers_loss": 0.0028921898920089006, + "skip_count": 0.0, + "step": 5526, + "text_loss": 0.6575983166694641 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8888888955116272, + "avg_layers": 21.0, + "epoch": 25.953331376577633, + "f1_execute": 0.9729729890823364, + "f1_repeat": 1.0, + "f1_skip": 0.9411765336990356, + "grad_norm": 0.02734375, + "learning_rate": 0.0005074281231540384, + "loss": 0.0076, + "macro_f1": 0.9713832139968872, + "num_tokens": 8914419.0, + "repeat_count": 1.0, + "routers_loss": 0.024232301861047745, + "skip_count": 9.0, + "step": 5528, + "text_loss": 0.5435594916343689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005071186393912527, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8917543.0, + "repeat_count": 0.0, + "routers_loss": 0.003731841454282403, + "skip_count": 2.0, + "step": 5530, + "text_loss": 0.5152071118354797 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005068091529005909, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 8920728.0, + "repeat_count": 1.0, + "routers_loss": 0.005905418191105127, + "skip_count": 0.0, + "step": 5532, + "text_loss": 0.29741042852401733 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000506499663800649, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 8924112.0, + "repeat_count": 1.0, + "routers_loss": 0.0021933517418801785, + "skip_count": 0.0, + "step": 5534, + "text_loss": 0.45704230666160583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 25.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005061901722100235, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 8927323.0, + "repeat_count": 0.0, + "routers_loss": 0.009227502159774303, + "skip_count": 4.0, + "step": 5536, + "text_loss": 0.1968434453010559 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0005058806782473125, + "loss": 0.0053, + "macro_f1": 0.6601307392120361, + "num_tokens": 8931052.0, + "repeat_count": 1.0, + "routers_loss": 0.02054760232567787, + "skip_count": 2.0, + "step": 5538, + "text_loss": 0.23851273953914642 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0005055711820311144, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8934215.0, + "repeat_count": 0.0, + "routers_loss": 0.0008434011251665652, + "skip_count": 0.0, + "step": 5540, + "text_loss": 0.85942542552948 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005052616836800288, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8937173.0, + "repeat_count": 0.0, + "routers_loss": 0.011105241253972054, + "skip_count": 4.0, + "step": 5542, + "text_loss": 0.2614556849002838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005049521833126561, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8940553.0, + "repeat_count": 0.0, + "routers_loss": 0.0006273435428738594, + "skip_count": 0.0, + "step": 5544, + "text_loss": 0.6430498957633972 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005046426810475976, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8943753.0, + "repeat_count": 0.0, + "routers_loss": 0.0023464353289455175, + "skip_count": 1.0, + "step": 5546, + "text_loss": 0.7015808820724487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0005043331770034547, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 8947149.0, + "repeat_count": 0.0, + "routers_loss": 0.0016024730866774917, + "skip_count": 1.0, + "step": 5548, + "text_loss": 0.5875257253646851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005040236712988304, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8950374.0, + "repeat_count": 0.0, + "routers_loss": 0.004096277989447117, + "skip_count": 0.0, + "step": 5550, + "text_loss": 0.1712338626384735 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0005037141640523275, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8953256.0, + "repeat_count": 1.0, + "routers_loss": 0.00441550649702549, + "skip_count": 0.0, + "step": 5552, + "text_loss": 0.16560404002666473 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005034046553825501, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 8956845.0, + "repeat_count": 4.0, + "routers_loss": 0.011712636798620224, + "skip_count": 6.0, + "step": 5554, + "text_loss": 0.24278216063976288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005030951454081023, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8961165.0, + "repeat_count": 0.0, + "routers_loss": 0.00235542468726635, + "skip_count": 1.0, + "step": 5556, + "text_loss": 0.17214511334896088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.093924273554446, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0005027856342475888, + "loss": 0.0037, + "macro_f1": 0.3272727429866791, + "num_tokens": 8965262.0, + "repeat_count": 0.0, + "routers_loss": 0.0160827673971653, + "skip_count": 1.0, + "step": 5558, + "text_loss": 0.40229740738868713 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0005024761220196151, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 8968278.0, + "repeat_count": 1.0, + "routers_loss": 0.004786997567862272, + "skip_count": 0.0, + "step": 5560, + "text_loss": 0.24828575551509857 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0005021666088427868, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8971443.0, + "repeat_count": 1.0, + "routers_loss": 0.0015378865646198392, + "skip_count": 0.0, + "step": 5562, + "text_loss": 0.7269657254219055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.0005018570948357099, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8975312.0, + "repeat_count": 0.0, + "routers_loss": 0.0015218508196994662, + "skip_count": 0.0, + "step": 5564, + "text_loss": 0.5198811292648315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0005015475801169908, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 8977951.0, + "repeat_count": 0.0, + "routers_loss": 0.008865317329764366, + "skip_count": 1.0, + "step": 5566, + "text_loss": 0.1541406810283661 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005012380648052359, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8981325.0, + "repeat_count": 1.0, + "routers_loss": 0.0055318837985396385, + "skip_count": 0.0, + "step": 5568, + "text_loss": 0.510314404964447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005009285490190523, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8984661.0, + "repeat_count": 0.0, + "routers_loss": 0.0035060355439782143, + "skip_count": 0.0, + "step": 5570, + "text_loss": 0.29421761631965637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000500619032877047, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8987573.0, + "repeat_count": 0.0, + "routers_loss": 0.0050126477144658566, + "skip_count": 2.0, + "step": 5572, + "text_loss": 0.1984361708164215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005003095164978271, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 8991136.0, + "repeat_count": 0.0, + "routers_loss": 0.0019407360814511776, + "skip_count": 0.0, + "step": 5574, + "text_loss": 0.42751404643058777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8994198.0, + "repeat_count": 0.0, + "routers_loss": 0.0029819176997989416, + "skip_count": 2.0, + "step": 5576, + "text_loss": 0.20589640736579895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004996904835021729, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 8997907.0, + "repeat_count": 0.0, + "routers_loss": 0.000878945691511035, + "skip_count": 1.0, + "step": 5578, + "text_loss": 0.2801406979560852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000499380967122953, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9001141.0, + "repeat_count": 0.0, + "routers_loss": 0.005223734769970179, + "skip_count": 1.0, + "step": 5580, + "text_loss": 0.20542480051517487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004990714509809478, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9004794.0, + "repeat_count": 0.0, + "routers_loss": 0.0015868612099438906, + "skip_count": 0.0, + "step": 5582, + "text_loss": 0.32094934582710266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 26.216025829175226, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004987619351947643, + "loss": 0.0064, + "macro_f1": 0.6122449040412903, + "num_tokens": 9009250.0, + "repeat_count": 0.0, + "routers_loss": 0.031923454254865646, + "skip_count": 4.0, + "step": 5584, + "text_loss": 0.609201967716217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0004984524198830095, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9013254.0, + "repeat_count": 0.0, + "routers_loss": 0.0033124545589089394, + "skip_count": 0.0, + "step": 5586, + "text_loss": 0.3698650300502777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0004981429051642903, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9016598.0, + "repeat_count": 0.0, + "routers_loss": 0.0017190382350236177, + "skip_count": 1.0, + "step": 5588, + "text_loss": 0.5306026935577393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.24420311124156, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004978333911572132, + "loss": 0.0059, + "macro_f1": 0.3272727429866791, + "num_tokens": 9019558.0, + "repeat_count": 0.0, + "routers_loss": 0.02051064372062683, + "skip_count": 1.0, + "step": 5590, + "text_loss": 0.23494470119476318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0004975238779803849, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 9023024.0, + "repeat_count": 0.0, + "routers_loss": 0.0010489600244909525, + "skip_count": 0.0, + "step": 5592, + "text_loss": 0.579275906085968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0004972143657524112, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9026161.0, + "repeat_count": 0.0, + "routers_loss": 0.0012039231369271874, + "skip_count": 0.0, + "step": 5594, + "text_loss": 0.5776295065879822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0004969048545918978, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9028814.0, + "repeat_count": 0.0, + "routers_loss": 0.0010212450288236141, + "skip_count": 1.0, + "step": 5596, + "text_loss": 0.6816855669021606 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 26.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00049659534461745, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9032243.0, + "repeat_count": 2.0, + "routers_loss": 0.0024297661148011684, + "skip_count": 0.0, + "step": 5598, + "text_loss": 0.743188202381134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0004962858359476726, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 9035493.0, + "repeat_count": 0.0, + "routers_loss": 0.002151754219084978, + "skip_count": 0.0, + "step": 5600, + "text_loss": 0.5213983654975891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004959763287011698, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 9038213.0, + "repeat_count": 0.0, + "routers_loss": 0.0028108188416808844, + "skip_count": 2.0, + "step": 5602, + "text_loss": 0.5128397345542908 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004956668229965454, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9041152.0, + "repeat_count": 0.0, + "routers_loss": 0.004022551700472832, + "skip_count": 2.0, + "step": 5604, + "text_loss": 0.15361636877059937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0004953573189524026, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9044503.0, + "repeat_count": 0.0, + "routers_loss": 0.0010689410846680403, + "skip_count": 1.0, + "step": 5606, + "text_loss": 0.6454885005950928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0004950478166873439, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 9047742.0, + "repeat_count": 0.0, + "routers_loss": 0.0025760293938219547, + "skip_count": 0.0, + "step": 5608, + "text_loss": 0.7654000520706177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0004947383163199713, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 9050349.0, + "repeat_count": 0.0, + "routers_loss": 0.0009846165776252747, + "skip_count": 0.0, + "step": 5610, + "text_loss": 0.41533342003822327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0004944288179688858, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 9053667.0, + "repeat_count": 0.0, + "routers_loss": 0.0017193946987390518, + "skip_count": 1.0, + "step": 5612, + "text_loss": 1.0172475576400757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004941193217526875, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9056777.0, + "repeat_count": 0.0, + "routers_loss": 0.0026750199031084776, + "skip_count": 0.0, + "step": 5614, + "text_loss": 0.17584927380084991 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 26.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004938098277899765, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 9060609.0, + "repeat_count": 1.0, + "routers_loss": 0.005259076599031687, + "skip_count": 1.0, + "step": 5616, + "text_loss": 0.5522297024726868 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004935003361993511, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9063633.0, + "repeat_count": 0.0, + "routers_loss": 0.0006837095716036856, + "skip_count": 0.0, + "step": 5618, + "text_loss": 0.5212588310241699 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.38508952157323, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0004931908470994091, + "loss": 0.0059, + "macro_f1": 0.6603773832321167, + "num_tokens": 9067777.0, + "repeat_count": 1.0, + "routers_loss": 0.01067375484853983, + "skip_count": 1.0, + "step": 5620, + "text_loss": 0.5515062808990479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 26.394481948928675, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.019775390625, + "learning_rate": 0.0004928813606087474, + "loss": 0.0043, + "macro_f1": 0.5934640765190125, + "num_tokens": 9070938.0, + "repeat_count": 0.0, + "routers_loss": 0.016635602340102196, + "skip_count": 3.0, + "step": 5622, + "text_loss": 0.3225076198577881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004925718768459617, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9074050.0, + "repeat_count": 0.0, + "routers_loss": 0.002216119086369872, + "skip_count": 0.0, + "step": 5624, + "text_loss": 0.32438889145851135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0004922623959296469, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 9076785.0, + "repeat_count": 1.0, + "routers_loss": 0.012125075794756413, + "skip_count": 5.0, + "step": 5626, + "text_loss": 0.39563658833503723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0004919529179783965, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9080239.0, + "repeat_count": 0.0, + "routers_loss": 0.0026486809365451336, + "skip_count": 0.0, + "step": 5628, + "text_loss": 0.5401569604873657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0004916434431108031, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9083935.0, + "repeat_count": 0.0, + "routers_loss": 0.0011849761940538883, + "skip_count": 0.0, + "step": 5630, + "text_loss": 0.4798774719238281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.000491333971445458, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9087174.0, + "repeat_count": 0.0, + "routers_loss": 0.002799210138618946, + "skip_count": 0.0, + "step": 5632, + "text_loss": 0.22488386929035187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004910245031009515, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 9089803.0, + "repeat_count": 0.0, + "routers_loss": 0.00139117450453341, + "skip_count": 0.0, + "step": 5634, + "text_loss": 0.6237335205078125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0004907150381958723, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9093075.0, + "repeat_count": 0.0, + "routers_loss": 0.006503603886812925, + "skip_count": 1.0, + "step": 5636, + "text_loss": 0.18781614303588867 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0004904055768488077, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9096355.0, + "repeat_count": 0.0, + "routers_loss": 0.0009764843271113932, + "skip_count": 0.0, + "step": 5638, + "text_loss": 0.6821450591087341 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004900961191783445, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9098994.0, + "repeat_count": 1.0, + "routers_loss": 0.00693159457296133, + "skip_count": 3.0, + "step": 5640, + "text_loss": 0.214790940284729 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0004897866653030671, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9102048.0, + "repeat_count": 0.0, + "routers_loss": 0.002469591563567519, + "skip_count": 0.0, + "step": 5642, + "text_loss": 0.1556607335805893 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004894772153415588, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9105379.0, + "repeat_count": 0.0, + "routers_loss": 0.0004824921488761902, + "skip_count": 0.0, + "step": 5644, + "text_loss": 0.499972403049469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004891677694124013, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9108240.0, + "repeat_count": 0.0, + "routers_loss": 0.0029356612358242273, + "skip_count": 1.0, + "step": 5646, + "text_loss": 0.5169754028320312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0174560546875, + "learning_rate": 0.0004888583276341751, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 9111381.0, + "repeat_count": 0.0, + "routers_loss": 0.009489183314144611, + "skip_count": 1.0, + "step": 5648, + "text_loss": 0.23630797863006592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.017822265625, + "learning_rate": 0.0004885488901254588, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9114015.0, + "repeat_count": 0.0, + "routers_loss": 0.004154495894908905, + "skip_count": 1.0, + "step": 5650, + "text_loss": 0.3345947563648224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0004882394570048294, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9117044.0, + "repeat_count": 0.0, + "routers_loss": 0.0018865863094106317, + "skip_count": 0.0, + "step": 5652, + "text_loss": 0.32814112305641174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0004879300283908623, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9120035.0, + "repeat_count": 0.0, + "routers_loss": 0.0035278978757560253, + "skip_count": 1.0, + "step": 5654, + "text_loss": 0.4081386625766754 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00048762060440213096, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 9122955.0, + "repeat_count": 1.0, + "routers_loss": 0.0053498269990086555, + "skip_count": 0.0, + "step": 5656, + "text_loss": 0.31027838587760925 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004873111851572075, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9125635.0, + "repeat_count": 0.0, + "routers_loss": 0.004556098487228155, + "skip_count": 0.0, + "step": 5658, + "text_loss": 0.25703540444374084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004870017707746617, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 9128906.0, + "repeat_count": 0.0, + "routers_loss": 0.0031165245454758406, + "skip_count": 2.0, + "step": 5660, + "text_loss": 0.20663656294345856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004866923613730617, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 9132030.0, + "repeat_count": 1.0, + "routers_loss": 0.004887583665549755, + "skip_count": 2.0, + "step": 5662, + "text_loss": 0.6062649488449097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004863829570709741, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 9135274.0, + "repeat_count": 0.0, + "routers_loss": 0.0021857863757759333, + "skip_count": 0.0, + "step": 5664, + "text_loss": 0.49644309282302856 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.601115350748458, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0004860735579869631, + "loss": 0.0088, + "macro_f1": 0.925203263759613, + "num_tokens": 9139735.0, + "repeat_count": 3.0, + "routers_loss": 0.05413912236690521, + "skip_count": 5.0, + "step": 5666, + "text_loss": 0.25161290168762207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00048576416423959097, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9142419.0, + "repeat_count": 0.0, + "routers_loss": 0.002229376696050167, + "skip_count": 0.0, + "step": 5668, + "text_loss": 0.5332949161529541 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0004854547759474179, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 9145443.0, + "repeat_count": 1.0, + "routers_loss": 0.005968933925032616, + "skip_count": 4.0, + "step": 5670, + "text_loss": 0.5282154083251953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.629292632814792, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0004851453932290021, + "loss": 0.0085, + "macro_f1": 0.3272727429866791, + "num_tokens": 9147754.0, + "repeat_count": 0.0, + "routers_loss": 0.04015754163265228, + "skip_count": 1.0, + "step": 5672, + "text_loss": 0.8564629554748535 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.63868506017024, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00048483601620289974, + "loss": 0.0058, + "macro_f1": 0.8820862174034119, + "num_tokens": 9151714.0, + "repeat_count": 2.0, + "routers_loss": 0.019172413274645805, + "skip_count": 2.0, + "step": 5674, + "text_loss": 0.4149441123008728 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0004845266449876645, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9154524.0, + "repeat_count": 1.0, + "routers_loss": 0.005025535821914673, + "skip_count": 0.0, + "step": 5676, + "text_loss": 0.26525792479515076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.000484217279701848, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9158546.0, + "repeat_count": 0.0, + "routers_loss": 0.0012200147612020373, + "skip_count": 0.0, + "step": 5678, + "text_loss": 0.5532271862030029 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0004839079204639998, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9161003.0, + "repeat_count": 0.0, + "routers_loss": 0.0013485675444826484, + "skip_count": 1.0, + "step": 5680, + "text_loss": 0.36826151609420776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0004835985673926668, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9164741.0, + "repeat_count": 0.0, + "routers_loss": 0.00532014574855566, + "skip_count": 2.0, + "step": 5682, + "text_loss": 0.16154609620571136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0004832892206063938, + "loss": 0.0075, + "macro_f1": 1.0, + "num_tokens": 9168079.0, + "repeat_count": 2.0, + "routers_loss": 0.007782323285937309, + "skip_count": 3.0, + "step": 5684, + "text_loss": 0.4323575496673584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.0004829798802237228, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9171352.0, + "repeat_count": 0.0, + "routers_loss": 0.0024159469176083803, + "skip_count": 2.0, + "step": 5686, + "text_loss": 0.3163119852542877 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.000482670546363194, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 9175197.0, + "repeat_count": 0.0, + "routers_loss": 0.002455134643241763, + "skip_count": 0.0, + "step": 5688, + "text_loss": 0.59735506772995 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.713824479013795, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0004823612191433443, + "loss": 0.0042, + "macro_f1": 0.8820862174034119, + "num_tokens": 9177648.0, + "repeat_count": 2.0, + "routers_loss": 0.015524548478424549, + "skip_count": 2.0, + "step": 5690, + "text_loss": 0.759812593460083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00048205189868270887, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 9180694.0, + "repeat_count": 0.0, + "routers_loss": 0.002112736226990819, + "skip_count": 2.0, + "step": 5692, + "text_loss": 0.3516882061958313 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 26.732609333724685, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.025146484375, + "learning_rate": 0.00048174258509981973, + "loss": 0.0063, + "macro_f1": 0.9262410998344421, + "num_tokens": 9183502.0, + "repeat_count": 2.0, + "routers_loss": 0.03100527822971344, + "skip_count": 3.0, + "step": 5694, + "text_loss": 0.3722715973854065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004814332785132064, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9186417.0, + "repeat_count": 0.0, + "routers_loss": 0.009176591411232948, + "skip_count": 2.0, + "step": 5696, + "text_loss": 0.33363673090934753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.751394188435572, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004811239790413958, + "loss": 0.0076, + "macro_f1": 0.3272727429866791, + "num_tokens": 9189478.0, + "repeat_count": 0.0, + "routers_loss": 0.023586507886648178, + "skip_count": 1.0, + "step": 5698, + "text_loss": 0.19698107242584229 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00048081468680291194, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9192115.0, + "repeat_count": 0.0, + "routers_loss": 0.005083440337330103, + "skip_count": 1.0, + "step": 5700, + "text_loss": 0.3476336896419525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004805054019162764, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9195176.0, + "repeat_count": 0.0, + "routers_loss": 0.007766073569655418, + "skip_count": 1.0, + "step": 5702, + "text_loss": 0.27114811539649963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0004801961245000076, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9199091.0, + "repeat_count": 0.0, + "routers_loss": 0.0009058842551894486, + "skip_count": 0.0, + "step": 5704, + "text_loss": 0.6249846816062927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0004798868546726212, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9202003.0, + "repeat_count": 0.0, + "routers_loss": 0.005479823332279921, + "skip_count": 0.0, + "step": 5706, + "text_loss": 0.47223609685897827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0166015625, + "learning_rate": 0.00047957759255263014, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9205277.0, + "repeat_count": 0.0, + "routers_loss": 0.001055705244652927, + "skip_count": 0.0, + "step": 5708, + "text_loss": 0.677215576171875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00047926833825854377, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9208844.0, + "repeat_count": 0.0, + "routers_loss": 0.003291431115940213, + "skip_count": 2.0, + "step": 5710, + "text_loss": 0.12439999729394913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0004789590919088696, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 9211619.0, + "repeat_count": 0.0, + "routers_loss": 0.005120242480188608, + "skip_count": 2.0, + "step": 5712, + "text_loss": 0.5771954655647278 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0004786498536221111, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 9214914.0, + "repeat_count": 1.0, + "routers_loss": 0.004877795465290546, + "skip_count": 2.0, + "step": 5714, + "text_loss": 0.6432198882102966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.00047834062351676893, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 9218186.0, + "repeat_count": 0.0, + "routers_loss": 0.0026507999282330275, + "skip_count": 0.0, + "step": 5716, + "text_loss": 0.23814935982227325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00047803140171134075, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9221754.0, + "repeat_count": 0.0, + "routers_loss": 0.002605629386380315, + "skip_count": 1.0, + "step": 5718, + "text_loss": 0.2910388708114624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0004777221883243208, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9224502.0, + "repeat_count": 0.0, + "routers_loss": 0.0048494706861674786, + "skip_count": 3.0, + "step": 5720, + "text_loss": 0.6195104122161865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004774129834742004, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 9227350.0, + "repeat_count": 0.0, + "routers_loss": 0.003092368133366108, + "skip_count": 0.0, + "step": 5722, + "text_loss": 0.35447990894317627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00047710378727946725, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 9230166.0, + "repeat_count": 0.0, + "routers_loss": 0.012780336663126945, + "skip_count": 2.0, + "step": 5724, + "text_loss": 0.27581867575645447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00047679459985860604, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9233029.0, + "repeat_count": 0.0, + "routers_loss": 0.005429140292108059, + "skip_count": 1.0, + "step": 5726, + "text_loss": 0.2636827826499939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00047648542133009794, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9236317.0, + "repeat_count": 0.0, + "routers_loss": 0.0023909916635602713, + "skip_count": 0.0, + "step": 5728, + "text_loss": 0.4801979064941406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00047617625181242077, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9239796.0, + "repeat_count": 0.0, + "routers_loss": 0.003603481687605381, + "skip_count": 0.0, + "step": 5730, + "text_loss": 0.8374754786491394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0004758670914240488, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9243489.0, + "repeat_count": 0.0, + "routers_loss": 0.004478964954614639, + "skip_count": 2.0, + "step": 5732, + "text_loss": 0.3870154917240143 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000475557940283453, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9246758.0, + "repeat_count": 0.0, + "routers_loss": 0.00312575395219028, + "skip_count": 1.0, + "step": 5734, + "text_loss": 0.42341071367263794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00047524879850910026, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 9250053.0, + "repeat_count": 0.0, + "routers_loss": 0.010855631902813911, + "skip_count": 4.0, + "step": 5736, + "text_loss": 0.25729796290397644 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0004749396662194549, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 9253691.0, + "repeat_count": 0.0, + "routers_loss": 0.0009250419097952545, + "skip_count": 0.0, + "step": 5738, + "text_loss": 0.6151770949363708 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0004746305435329767, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 9256866.0, + "repeat_count": 1.0, + "routers_loss": 0.007521102204918861, + "skip_count": 3.0, + "step": 5740, + "text_loss": 0.3094986379146576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004743214305681221, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9259790.0, + "repeat_count": 0.0, + "routers_loss": 0.0022241887636482716, + "skip_count": 1.0, + "step": 5742, + "text_loss": 0.5418204069137573 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00047401232744334376, + "loss": 0.0071, + "macro_f1": 1.0, + "num_tokens": 9263205.0, + "repeat_count": 1.0, + "routers_loss": 0.008611299097537994, + "skip_count": 2.0, + "step": 5744, + "text_loss": 0.35824623703956604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 26.976812444966246, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004737032342770906, + "loss": 0.0062, + "macro_f1": 0.5492662787437439, + "num_tokens": 9266126.0, + "repeat_count": 0.0, + "routers_loss": 0.010788857005536556, + "skip_count": 2.0, + "step": 5746, + "text_loss": 0.2172674983739853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0004733941511878074, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9269308.0, + "repeat_count": 0.0, + "routers_loss": 0.005309196189045906, + "skip_count": 2.0, + "step": 5748, + "text_loss": 0.1696814000606537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.00047308507829393594, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 9272801.0, + "repeat_count": 0.0, + "routers_loss": 0.009940510615706444, + "skip_count": 2.0, + "step": 5750, + "text_loss": 0.24295592308044434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00047277601571391314, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9276197.0, + "repeat_count": 0.0, + "routers_loss": 0.000687236781232059, + "skip_count": 0.0, + "step": 5752, + "text_loss": 0.8511804342269897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.014088641033165, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00047246696356617254, + "loss": 0.0059, + "macro_f1": 0.6603773832321167, + "num_tokens": 9278965.0, + "repeat_count": 1.0, + "routers_loss": 0.009816894307732582, + "skip_count": 1.0, + "step": 5754, + "text_loss": 0.45420053601264954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0004721579219691434, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9282076.0, + "repeat_count": 0.0, + "routers_loss": 0.0015747188590466976, + "skip_count": 0.0, + "step": 5756, + "text_loss": 0.21671754121780396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0004718488910412511, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9285465.0, + "repeat_count": 0.0, + "routers_loss": 0.008654040284454823, + "skip_count": 2.0, + "step": 5758, + "text_loss": 0.25920194387435913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00047153987090091674, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9288156.0, + "repeat_count": 0.0, + "routers_loss": 0.0011430777376517653, + "skip_count": 0.0, + "step": 5760, + "text_loss": 0.7655444741249084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004712308616665576, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 9291529.0, + "repeat_count": 0.0, + "routers_loss": 0.003674200503155589, + "skip_count": 2.0, + "step": 5762, + "text_loss": 0.269486665725708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0004709218634565866, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9294699.0, + "repeat_count": 0.0, + "routers_loss": 0.003249827306717634, + "skip_count": 1.0, + "step": 5764, + "text_loss": 0.5073734521865845 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00047061287638941235, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 9297863.0, + "repeat_count": 1.0, + "routers_loss": 0.002763139782473445, + "skip_count": 2.0, + "step": 5766, + "text_loss": 0.2572014033794403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00047030390058343935, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9301124.0, + "repeat_count": 0.0, + "routers_loss": 0.007100266870111227, + "skip_count": 3.0, + "step": 5768, + "text_loss": 0.4147387742996216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0004699949361570676, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 9304330.0, + "repeat_count": 0.0, + "routers_loss": 0.005467240232974291, + "skip_count": 1.0, + "step": 5770, + "text_loss": 0.21510964632034302 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.000469685983228693, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9306882.0, + "repeat_count": 0.0, + "routers_loss": 0.003167890477925539, + "skip_count": 0.0, + "step": 5772, + "text_loss": 0.45717427134513855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.108012914587615, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00046937704191670675, + "loss": 0.0057, + "macro_f1": 0.6601307392120361, + "num_tokens": 9309767.0, + "repeat_count": 1.0, + "routers_loss": 0.014881107024848461, + "skip_count": 2.0, + "step": 5774, + "text_loss": 0.3464985191822052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004690681123394959, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9313045.0, + "repeat_count": 0.0, + "routers_loss": 0.00379011663608253, + "skip_count": 2.0, + "step": 5776, + "text_loss": 0.33194616436958313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00046875919461544265, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 9315736.0, + "repeat_count": 0.0, + "routers_loss": 0.0016733441734686494, + "skip_count": 0.0, + "step": 5778, + "text_loss": 0.5009998679161072 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00046845028886292493, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9318456.0, + "repeat_count": 0.0, + "routers_loss": 0.005318894516676664, + "skip_count": 1.0, + "step": 5780, + "text_loss": 0.17702752351760864 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.145582624009393, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.044921875, + "learning_rate": 0.00046814139520031615, + "loss": 0.006, + "macro_f1": 0.8820862174034119, + "num_tokens": 9323152.0, + "repeat_count": 2.0, + "routers_loss": 0.01133672520518303, + "skip_count": 2.0, + "step": 5782, + "text_loss": 0.2886650860309601 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0004678325137459845, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9326318.0, + "repeat_count": 0.0, + "routers_loss": 0.002458433620631695, + "skip_count": 0.0, + "step": 5784, + "text_loss": 0.5832745432853699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0004675236446182946, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9329779.0, + "repeat_count": 0.0, + "routers_loss": 0.0005402310052886605, + "skip_count": 0.0, + "step": 5786, + "text_loss": 0.5699237585067749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00046721478793560525, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 9333360.0, + "repeat_count": 0.0, + "routers_loss": 0.0002638917067088187, + "skip_count": 0.0, + "step": 5788, + "text_loss": 0.6555714011192322 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00046690594381627106, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9336498.0, + "repeat_count": 0.0, + "routers_loss": 0.003998351749032736, + "skip_count": 2.0, + "step": 5790, + "text_loss": 0.2076750248670578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00046659711237864157, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9339724.0, + "repeat_count": 0.0, + "routers_loss": 0.0045847659930586815, + "skip_count": 1.0, + "step": 5792, + "text_loss": 0.22027169167995453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.00046628829374106167, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 9342835.0, + "repeat_count": 0.0, + "routers_loss": 0.0014064523857086897, + "skip_count": 1.0, + "step": 5794, + "text_loss": 0.5120179057121277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004659794880218712, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9346757.0, + "repeat_count": 0.0, + "routers_loss": 0.0011155207175761461, + "skip_count": 1.0, + "step": 5796, + "text_loss": 0.6415372490882874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004656706953394051, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 9349652.0, + "repeat_count": 0.0, + "routers_loss": 0.0020385095849633217, + "skip_count": 0.0, + "step": 5798, + "text_loss": 0.5410398840904236 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0004653619158119933, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 9354286.0, + "repeat_count": 1.0, + "routers_loss": 0.0012847178149968386, + "skip_count": 0.0, + "step": 5800, + "text_loss": 0.4386860728263855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.00046505314955796074, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9357682.0, + "repeat_count": 0.0, + "routers_loss": 0.0035008061677217484, + "skip_count": 2.0, + "step": 5802, + "text_loss": 0.13655950129032135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00046474439669562715, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9361058.0, + "repeat_count": 0.0, + "routers_loss": 0.0020033426117151976, + "skip_count": 1.0, + "step": 5804, + "text_loss": 0.6293444037437439 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00046443565734330714, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9364173.0, + "repeat_count": 0.0, + "routers_loss": 0.0004935986362397671, + "skip_count": 0.0, + "step": 5806, + "text_loss": 0.2923166751861572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004641269316193104, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9366980.0, + "repeat_count": 0.0, + "routers_loss": 0.001654456602409482, + "skip_count": 0.0, + "step": 5808, + "text_loss": 0.7273373007774353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0004638182196419411, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 9370581.0, + "repeat_count": 0.0, + "routers_loss": 0.0017011919990181923, + "skip_count": 0.0, + "step": 5810, + "text_loss": 0.6029995083808899 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 27.286469034341064, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.038330078125, + "learning_rate": 0.0004635095215294984, + "loss": 0.0072, + "macro_f1": 0.9265305995941162, + "num_tokens": 9374233.0, + "repeat_count": 1.0, + "routers_loss": 0.01361197978258133, + "skip_count": 3.0, + "step": 5812, + "text_loss": 0.14051523804664612 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00046320083740027584, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 9377217.0, + "repeat_count": 0.0, + "routers_loss": 0.004597014281898737, + "skip_count": 0.0, + "step": 5814, + "text_loss": 0.2766880691051483 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 27.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00046289216737256184, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 9380336.0, + "repeat_count": 3.0, + "routers_loss": 0.006628422066569328, + "skip_count": 1.0, + "step": 5816, + "text_loss": 0.8092381954193115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0004625835115646393, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9382968.0, + "repeat_count": 0.0, + "routers_loss": 0.002737772185355425, + "skip_count": 0.0, + "step": 5818, + "text_loss": 0.22090643644332886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 27.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0004622748700947856, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 9386203.0, + "repeat_count": 1.0, + "routers_loss": 0.004552177153527737, + "skip_count": 1.0, + "step": 5820, + "text_loss": 0.42869850993156433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0004619662430812729, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9388968.0, + "repeat_count": 0.0, + "routers_loss": 0.003149240743368864, + "skip_count": 2.0, + "step": 5822, + "text_loss": 0.45137661695480347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0004616576306423677, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 9392487.0, + "repeat_count": 0.0, + "routers_loss": 0.0008133690571412444, + "skip_count": 0.0, + "step": 5824, + "text_loss": 0.638685941696167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0004613490328963307, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 9395665.0, + "repeat_count": 0.0, + "routers_loss": 0.00042717234464362264, + "skip_count": 0.0, + "step": 5826, + "text_loss": 0.8134317398071289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00046104044996141716, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9398831.0, + "repeat_count": 0.0, + "routers_loss": 0.0084775285795331, + "skip_count": 2.0, + "step": 5828, + "text_loss": 0.19263958930969238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0004607318819558768, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 9403118.0, + "repeat_count": 1.0, + "routers_loss": 0.0030239911284297705, + "skip_count": 0.0, + "step": 5830, + "text_loss": 0.45556432008743286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 27.38039330789551, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0458984375, + "learning_rate": 0.00046042332899795313, + "loss": 0.0075, + "macro_f1": 0.5492662787437439, + "num_tokens": 9406206.0, + "repeat_count": 0.0, + "routers_loss": 0.026389889419078827, + "skip_count": 2.0, + "step": 5832, + "text_loss": 0.26458361744880676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0004601147912058845, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 9409806.0, + "repeat_count": 0.0, + "routers_loss": 0.0013476534513756633, + "skip_count": 0.0, + "step": 5834, + "text_loss": 0.7443689107894897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004598062686979033, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9412737.0, + "repeat_count": 0.0, + "routers_loss": 0.004275512881577015, + "skip_count": 1.0, + "step": 5836, + "text_loss": 0.2808683514595032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00045949776159223563, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9415818.0, + "repeat_count": 0.0, + "routers_loss": 0.0027225434314459562, + "skip_count": 0.0, + "step": 5838, + "text_loss": 0.6283587217330933 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0004591892700071022, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 9419119.0, + "repeat_count": 1.0, + "routers_loss": 0.01574302278459072, + "skip_count": 2.0, + "step": 5840, + "text_loss": 0.33239027857780457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00045888079406071746, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 9422257.0, + "repeat_count": 0.0, + "routers_loss": 0.0007227854221127927, + "skip_count": 0.0, + "step": 5842, + "text_loss": 0.6658740043640137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00045857233387129, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9425071.0, + "repeat_count": 0.0, + "routers_loss": 0.0020696306601166725, + "skip_count": 2.0, + "step": 5844, + "text_loss": 0.5773820877075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0004582638895570224, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9427980.0, + "repeat_count": 0.0, + "routers_loss": 0.0019764541648328304, + "skip_count": 0.0, + "step": 5846, + "text_loss": 0.3388919532299042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.455532726739065, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.000457955461236111, + "loss": 0.0058, + "macro_f1": 0.3272727429866791, + "num_tokens": 9430733.0, + "repeat_count": 1.0, + "routers_loss": 0.04235004261136055, + "skip_count": 0.0, + "step": 5848, + "text_loss": 0.44346582889556885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004576470490267462, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 9433347.0, + "repeat_count": 0.0, + "routers_loss": 0.000801609072368592, + "skip_count": 0.0, + "step": 5850, + "text_loss": 0.5825944542884827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0004573386530471121, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9436172.0, + "repeat_count": 0.0, + "routers_loss": 0.0018224078230559826, + "skip_count": 2.0, + "step": 5852, + "text_loss": 0.8111652135848999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004570302734153866, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9439040.0, + "repeat_count": 0.0, + "routers_loss": 0.006614950485527515, + "skip_count": 2.0, + "step": 5854, + "text_loss": 0.31270334124565125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0004567219102497412, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9442138.0, + "repeat_count": 0.0, + "routers_loss": 0.0012984242057427764, + "skip_count": 0.0, + "step": 5856, + "text_loss": 0.6126856803894043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004564135636683416, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9445600.0, + "repeat_count": 0.0, + "routers_loss": 0.0008388847345486283, + "skip_count": 0.0, + "step": 5858, + "text_loss": 0.8526380658149719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046875, + "learning_rate": 0.0004561052337893467, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 9449609.0, + "repeat_count": 0.0, + "routers_loss": 0.008125773631036282, + "skip_count": 2.0, + "step": 5860, + "text_loss": 0.2843833863735199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.000455796920730909, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9452756.0, + "repeat_count": 0.0, + "routers_loss": 0.0019371749367564917, + "skip_count": 0.0, + "step": 5862, + "text_loss": 0.5293750166893005 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0004554886246111746, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9455467.0, + "repeat_count": 1.0, + "routers_loss": 0.005594742484390736, + "skip_count": 2.0, + "step": 5864, + "text_loss": 0.572329044342041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004551803455482833, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9458953.0, + "repeat_count": 0.0, + "routers_loss": 0.005960086826235056, + "skip_count": 3.0, + "step": 5866, + "text_loss": 0.19459208846092224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00045487208366036807, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 9462130.0, + "repeat_count": 0.0, + "routers_loss": 0.0034781871363520622, + "skip_count": 1.0, + "step": 5868, + "text_loss": 0.20467053353786469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00045456383906555554, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9465590.0, + "repeat_count": 0.0, + "routers_loss": 0.0012246103724464774, + "skip_count": 0.0, + "step": 5870, + "text_loss": 0.6086251735687256 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00045425561188196565, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9468092.0, + "repeat_count": 0.0, + "routers_loss": 0.002874316181987524, + "skip_count": 1.0, + "step": 5872, + "text_loss": 0.3430633544921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004539474022277115, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9471433.0, + "repeat_count": 0.0, + "routers_loss": 0.004340244457125664, + "skip_count": 2.0, + "step": 5874, + "text_loss": 0.28219133615493774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0004536392102208997, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9474363.0, + "repeat_count": 0.0, + "routers_loss": 0.0007322742021642625, + "skip_count": 0.0, + "step": 5876, + "text_loss": 0.7305856943130493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0004533310359796299, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9478469.0, + "repeat_count": 0.0, + "routers_loss": 0.0018631393322721124, + "skip_count": 0.0, + "step": 5878, + "text_loss": 0.5821442604064941 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 27.60581156442618, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0004530228796219952, + "loss": 0.0088, + "macro_f1": 0.9262410998344421, + "num_tokens": 9481200.0, + "repeat_count": 2.0, + "routers_loss": 0.026109615340828896, + "skip_count": 3.0, + "step": 5880, + "text_loss": 0.3962891101837158 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00045271474126608167, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9484200.0, + "repeat_count": 0.0, + "routers_loss": 0.0004716445691883564, + "skip_count": 0.0, + "step": 5882, + "text_loss": 0.31901776790618896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004524066210299685, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 9488939.0, + "repeat_count": 0.0, + "routers_loss": 0.0003797562967520207, + "skip_count": 0.0, + "step": 5884, + "text_loss": 0.3992912471294403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004520985190317279, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 9492010.0, + "repeat_count": 0.0, + "routers_loss": 0.005681614391505718, + "skip_count": 1.0, + "step": 5886, + "text_loss": 0.5318995118141174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0004517904353894253, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9494770.0, + "repeat_count": 0.0, + "routers_loss": 0.0021422000136226416, + "skip_count": 0.0, + "step": 5888, + "text_loss": 0.435088187456131 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.652773701203404, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0004514823702211187, + "loss": 0.0052, + "macro_f1": 0.8820862174034119, + "num_tokens": 9497327.0, + "repeat_count": 2.0, + "routers_loss": 0.01593884639441967, + "skip_count": 2.0, + "step": 5890, + "text_loss": 0.5068450570106506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.662166128558848, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00045117432364485927, + "loss": 0.0075, + "macro_f1": 0.6601307392120361, + "num_tokens": 9500488.0, + "repeat_count": 1.0, + "routers_loss": 0.0729660913348198, + "skip_count": 2.0, + "step": 5892, + "text_loss": 0.42718732357025146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.00045086629577869127, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9503593.0, + "repeat_count": 0.0, + "routers_loss": 0.007092897780239582, + "skip_count": 2.0, + "step": 5894, + "text_loss": 0.4264345169067383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.00045055828674065134, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9507188.0, + "repeat_count": 0.0, + "routers_loss": 0.004088073968887329, + "skip_count": 2.0, + "step": 5896, + "text_loss": 0.20932413637638092 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00045025029664876926, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9510126.0, + "repeat_count": 1.0, + "routers_loss": 0.0026970503386110067, + "skip_count": 0.0, + "step": 5898, + "text_loss": 0.47661110758781433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0164794921875, + "learning_rate": 0.0004499423256210673, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9513891.0, + "repeat_count": 0.0, + "routers_loss": 0.003428407246246934, + "skip_count": 0.0, + "step": 5900, + "text_loss": 0.18232668936252594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00044963437377556066, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9516718.0, + "repeat_count": 0.0, + "routers_loss": 0.0020270352251827717, + "skip_count": 0.0, + "step": 5902, + "text_loss": 0.16833586990833282 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.000449326441230257, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9520248.0, + "repeat_count": 0.0, + "routers_loss": 0.0019144838443025947, + "skip_count": 0.0, + "step": 5904, + "text_loss": 0.44434574246406555 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00044901852810315634, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9523651.0, + "repeat_count": 0.0, + "routers_loss": 0.0044578867964446545, + "skip_count": 2.0, + "step": 5906, + "text_loss": 0.1248839721083641 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0004487106345122522, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9527235.0, + "repeat_count": 0.0, + "routers_loss": 0.000827222247608006, + "skip_count": 0.0, + "step": 5908, + "text_loss": 0.6052893996238708 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.74669797475785, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004484027605755296, + "loss": 0.0065, + "macro_f1": 0.5492662787437439, + "num_tokens": 9530407.0, + "repeat_count": 2.0, + "routers_loss": 0.029739778488874435, + "skip_count": 0.0, + "step": 5910, + "text_loss": 0.7625715732574463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.00044809490641096653, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 9533229.0, + "repeat_count": 0.0, + "routers_loss": 0.0025658784434199333, + "skip_count": 0.0, + "step": 5912, + "text_loss": 0.27842655777931213 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 27.76548282946874, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.042724609375, + "learning_rate": 0.00044778707213653324, + "loss": 0.0069, + "macro_f1": 0.9265305995941162, + "num_tokens": 9537397.0, + "repeat_count": 1.0, + "routers_loss": 0.010157953947782516, + "skip_count": 3.0, + "step": 5914, + "text_loss": 0.45196083188056946 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004474792578701924, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9540564.0, + "repeat_count": 3.0, + "routers_loss": 0.011994685977697372, + "skip_count": 5.0, + "step": 5916, + "text_loss": 0.22617442905902863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000447171463729899, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9543602.0, + "repeat_count": 0.0, + "routers_loss": 0.0022214490454643965, + "skip_count": 0.0, + "step": 5918, + "text_loss": 0.5089073777198792 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004468636898336003, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 9546829.0, + "repeat_count": 1.0, + "routers_loss": 0.009353389963507652, + "skip_count": 2.0, + "step": 5920, + "text_loss": 0.7560386657714844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.00044655593629923596, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 9550259.0, + "repeat_count": 0.0, + "routers_loss": 0.005637963302433491, + "skip_count": 0.0, + "step": 5922, + "text_loss": 0.17084793746471405 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00044624820324473766, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9554376.0, + "repeat_count": 1.0, + "routers_loss": 0.008556432090699673, + "skip_count": 2.0, + "step": 5924, + "text_loss": 0.5906872749328613 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004459404907880292, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9558348.0, + "repeat_count": 1.0, + "routers_loss": 0.0016659445827826858, + "skip_count": 0.0, + "step": 5926, + "text_loss": 0.8197194933891296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.00044563279904702674, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9561139.0, + "repeat_count": 0.0, + "routers_loss": 0.01341368816792965, + "skip_count": 3.0, + "step": 5928, + "text_loss": 0.3264874815940857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.000445325128139638, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9564387.0, + "repeat_count": 0.0, + "routers_loss": 0.005023977253586054, + "skip_count": 2.0, + "step": 5930, + "text_loss": 0.9055862426757812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004450174781837635, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9567053.0, + "repeat_count": 0.0, + "routers_loss": 0.0006051476229913533, + "skip_count": 0.0, + "step": 5932, + "text_loss": 0.6908539533615112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0004447098492972951, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9570036.0, + "repeat_count": 0.0, + "routers_loss": 0.003152312943711877, + "skip_count": 0.0, + "step": 5934, + "text_loss": 0.6321061849594116 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0004444022415981167, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 9574146.0, + "repeat_count": 0.0, + "routers_loss": 0.004859412554651499, + "skip_count": 1.0, + "step": 5936, + "text_loss": 0.5905604958534241 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 27.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.00044409465520410426, + "loss": 0.0071, + "macro_f1": 1.0, + "num_tokens": 9577071.0, + "repeat_count": 1.0, + "routers_loss": 0.004376287572085857, + "skip_count": 1.0, + "step": 5938, + "text_loss": 0.6928377747535706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00044378709023312535, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9580537.0, + "repeat_count": 0.0, + "routers_loss": 0.004038849379867315, + "skip_count": 1.0, + "step": 5940, + "text_loss": 0.2686770558357239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0004434795468030396, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9583225.0, + "repeat_count": 0.0, + "routers_loss": 0.005459951236844063, + "skip_count": 2.0, + "step": 5942, + "text_loss": 0.16855180263519287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.000443172025031698, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 9586018.0, + "repeat_count": 0.0, + "routers_loss": 0.0032985717989504337, + "skip_count": 2.0, + "step": 5944, + "text_loss": 0.20335732400417328 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004428645250369437, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 9589321.0, + "repeat_count": 1.0, + "routers_loss": 0.003573323367163539, + "skip_count": 0.0, + "step": 5946, + "text_loss": 0.6318653225898743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00044255704693661117, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9592518.0, + "repeat_count": 0.0, + "routers_loss": 0.002226749900728464, + "skip_count": 0.0, + "step": 5948, + "text_loss": 0.5320658683776855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004422495908485265, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9595664.0, + "repeat_count": 0.0, + "routers_loss": 0.0007805621717125177, + "skip_count": 0.0, + "step": 5950, + "text_loss": 0.6330106258392334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0004419421568905077, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9598885.0, + "repeat_count": 0.0, + "routers_loss": 0.0017050127498805523, + "skip_count": 0.0, + "step": 5952, + "text_loss": 0.6098045706748962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00044163474518036375, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9603021.0, + "repeat_count": 0.0, + "routers_loss": 0.0025974081363528967, + "skip_count": 0.0, + "step": 5954, + "text_loss": 0.2655932903289795 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.00044132735583589567, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 9605841.0, + "repeat_count": 1.0, + "routers_loss": 0.010364850051701069, + "skip_count": 2.0, + "step": 5956, + "text_loss": 0.3028552532196045 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.015869140625, + "learning_rate": 0.00044101998897489553, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 9608810.0, + "repeat_count": 1.0, + "routers_loss": 0.0015063622267916799, + "skip_count": 0.0, + "step": 5958, + "text_loss": 0.5602094531059265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 27.981508658643968, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.02880859375, + "learning_rate": 0.00044071264471514683, + "loss": 0.0051, + "macro_f1": 0.5934640765190125, + "num_tokens": 9611995.0, + "repeat_count": 0.0, + "routers_loss": 0.011538165621459484, + "skip_count": 3.0, + "step": 5960, + "text_loss": 0.14332173764705658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00044040532317442455, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 9615434.0, + "repeat_count": 0.0, + "routers_loss": 0.004693889059126377, + "skip_count": 0.0, + "step": 5962, + "text_loss": 0.334369033575058 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00044009802447049474, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9618056.0, + "repeat_count": 1.0, + "routers_loss": 0.0045085870660841465, + "skip_count": 1.0, + "step": 5964, + "text_loss": 0.8163170218467712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00043979074872111507, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9621428.0, + "repeat_count": 0.0, + "routers_loss": 0.0018220023484900594, + "skip_count": 0.0, + "step": 5966, + "text_loss": 0.2513850927352905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0004394834960440341, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 9625433.0, + "repeat_count": 4.0, + "routers_loss": 0.007051277905702591, + "skip_count": 5.0, + "step": 5968, + "text_loss": 0.6263421177864075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.00043917626655699154, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 9629508.0, + "repeat_count": 0.0, + "routers_loss": 0.0006454752874560654, + "skip_count": 0.0, + "step": 5970, + "text_loss": 0.645618736743927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0004388690603777184, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9632504.0, + "repeat_count": 0.0, + "routers_loss": 0.004847112577408552, + "skip_count": 1.0, + "step": 5972, + "text_loss": 0.47306978702545166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00043856187762393665, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 9636685.0, + "repeat_count": 0.0, + "routers_loss": 0.0006580828921869397, + "skip_count": 0.0, + "step": 5974, + "text_loss": 0.42226532101631165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0004382547184133593, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 9639958.0, + "repeat_count": 0.0, + "routers_loss": 0.002188180573284626, + "skip_count": 0.0, + "step": 5976, + "text_loss": 0.4456600248813629 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004379475828636901, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 9643228.0, + "repeat_count": 1.0, + "routers_loss": 0.0017135308589786291, + "skip_count": 2.0, + "step": 5978, + "text_loss": 0.6295822262763977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004376404710926244, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 9646746.0, + "repeat_count": 0.0, + "routers_loss": 0.0008841048111207783, + "skip_count": 0.0, + "step": 5980, + "text_loss": 0.5102712512016296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00043733338321784784, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9649452.0, + "repeat_count": 0.0, + "routers_loss": 0.0006229099817574024, + "skip_count": 0.0, + "step": 5982, + "text_loss": 0.6944046020507812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000437026319357037, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9652700.0, + "repeat_count": 0.0, + "routers_loss": 0.005293759983032942, + "skip_count": 2.0, + "step": 5984, + "text_loss": 0.6748214960098267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00043671927962785946, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 9655825.0, + "repeat_count": 0.0, + "routers_loss": 0.0013537590857595205, + "skip_count": 0.0, + "step": 5986, + "text_loss": 1.000306248664856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004364122641479733, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9658713.0, + "repeat_count": 0.0, + "routers_loss": 0.004548195283859968, + "skip_count": 0.0, + "step": 5988, + "text_loss": 0.24580086767673492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 28.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004361052730350275, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9661535.0, + "repeat_count": 0.0, + "routers_loss": 0.011149964295327663, + "skip_count": 4.0, + "step": 5990, + "text_loss": 0.5737863779067993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00043579830640666154, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 9664406.0, + "repeat_count": 1.0, + "routers_loss": 0.003783488878980279, + "skip_count": 1.0, + "step": 5992, + "text_loss": 0.7836558222770691 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.00043549136438050573, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 9669050.0, + "repeat_count": 0.0, + "routers_loss": 0.0050374288111925125, + "skip_count": 1.0, + "step": 5994, + "text_loss": 0.13072487711906433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.00043518444707418076, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9672698.0, + "repeat_count": 0.0, + "routers_loss": 0.004047670867294073, + "skip_count": 2.0, + "step": 5996, + "text_loss": 0.4748993217945099 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00043487755460529796, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9676159.0, + "repeat_count": 0.0, + "routers_loss": 0.008628991432487965, + "skip_count": 2.0, + "step": 5998, + "text_loss": 0.1921990066766739 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00043457068709145904, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 9679528.0, + "repeat_count": 3.0, + "routers_loss": 0.01094671618193388, + "skip_count": 3.0, + "step": 6000, + "text_loss": 0.3651769459247589 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 28.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00043426384465025604, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 9682677.0, + "repeat_count": 2.0, + "routers_loss": 0.0011284075444564223, + "skip_count": 0.0, + "step": 6002, + "text_loss": 0.28305181860923767 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.000433957027399272, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9685310.0, + "repeat_count": 0.0, + "routers_loss": 0.0030473743099719286, + "skip_count": 1.0, + "step": 6004, + "text_loss": 0.3650054931640625 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00043365023545607965, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9687944.0, + "repeat_count": 1.0, + "routers_loss": 0.011621905490756035, + "skip_count": 2.0, + "step": 6006, + "text_loss": 0.5409000515937805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004333434689382423, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 9690932.0, + "repeat_count": 0.0, + "routers_loss": 0.0005297541501931846, + "skip_count": 0.0, + "step": 6008, + "text_loss": 0.4311029314994812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.216025829175226, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.00043303672796331336, + "loss": 0.0058, + "macro_f1": 0.3272727429866791, + "num_tokens": 9693972.0, + "repeat_count": 1.0, + "routers_loss": 0.06166421249508858, + "skip_count": 0.0, + "step": 6010, + "text_loss": 0.2658997178077698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00043273001264883655, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 9697712.0, + "repeat_count": 0.0, + "routers_loss": 0.0018419031985104084, + "skip_count": 0.0, + "step": 6012, + "text_loss": 0.5813497304916382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004324233231123458, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 9700746.0, + "repeat_count": 0.0, + "routers_loss": 0.003635555040091276, + "skip_count": 0.0, + "step": 6014, + "text_loss": 0.24211904406547546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 28.24420311124156, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.038330078125, + "learning_rate": 0.0004321166594713651, + "loss": 0.0048, + "macro_f1": 0.5492662787437439, + "num_tokens": 9704087.0, + "repeat_count": 0.0, + "routers_loss": 0.021067705005407333, + "skip_count": 2.0, + "step": 6016, + "text_loss": 0.5908042788505554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00043181002184340857, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 9708695.0, + "repeat_count": 0.0, + "routers_loss": 0.0008712753187865019, + "skip_count": 0.0, + "step": 6018, + "text_loss": 0.7788549661636353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.26298796595245, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0004315034103459803, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 9711631.0, + "repeat_count": 1.0, + "routers_loss": 0.03231092542409897, + "skip_count": 0.0, + "step": 6020, + "text_loss": 0.6127741932868958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0004311968250965743, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9715526.0, + "repeat_count": 0.0, + "routers_loss": 0.0020149527117609978, + "skip_count": 2.0, + "step": 6022, + "text_loss": 0.49970078468322754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0004308902662126748, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9718475.0, + "repeat_count": 0.0, + "routers_loss": 0.0031795913819223642, + "skip_count": 0.0, + "step": 6024, + "text_loss": 0.3254713714122772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.291165248018785, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00043058373381175567, + "loss": 0.004, + "macro_f1": 0.3272727429866791, + "num_tokens": 9722194.0, + "repeat_count": 0.0, + "routers_loss": 0.0148378387093544, + "skip_count": 1.0, + "step": 6026, + "text_loss": 0.17670343816280365 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0004302772280112806, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 9725489.0, + "repeat_count": 1.0, + "routers_loss": 0.005742347799241543, + "skip_count": 2.0, + "step": 6028, + "text_loss": 0.26184776425361633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00042997074892870335, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9729416.0, + "repeat_count": 0.0, + "routers_loss": 0.0023561837151646614, + "skip_count": 0.0, + "step": 6030, + "text_loss": 0.3026008605957031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0004296642966814673, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9732559.0, + "repeat_count": 0.0, + "routers_loss": 0.0010108393616974354, + "skip_count": 1.0, + "step": 6032, + "text_loss": 0.43198078870773315 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00042935787138700525, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 9736324.0, + "repeat_count": 2.0, + "routers_loss": 0.005443581845611334, + "skip_count": 2.0, + "step": 6034, + "text_loss": 0.24883155524730682 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0004290514731627403, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 9739630.0, + "repeat_count": 1.0, + "routers_loss": 0.010645060800015926, + "skip_count": 2.0, + "step": 6036, + "text_loss": 0.24207182228565216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018798828125, + "learning_rate": 0.0004287451021260846, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9742221.0, + "repeat_count": 0.0, + "routers_loss": 0.0008162845042534173, + "skip_count": 0.0, + "step": 6038, + "text_loss": 0.33018553256988525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0004284387583944403, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9744925.0, + "repeat_count": 0.0, + "routers_loss": 0.003782407147809863, + "skip_count": 1.0, + "step": 6040, + "text_loss": 0.6600399613380432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0004281324420851987, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9748103.0, + "repeat_count": 0.0, + "routers_loss": 0.0009834285592660308, + "skip_count": 0.0, + "step": 6042, + "text_loss": 0.6402350664138794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0004278261533157409, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 9751128.0, + "repeat_count": 0.0, + "routers_loss": 0.004100334830582142, + "skip_count": 2.0, + "step": 6044, + "text_loss": 0.1545136719942093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0004275198922034372, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 9754140.0, + "repeat_count": 0.0, + "routers_loss": 0.0017166603356599808, + "skip_count": 1.0, + "step": 6046, + "text_loss": 0.5875935554504395 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00042721365886564766, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 9756945.0, + "repeat_count": 1.0, + "routers_loss": 0.00915827602148056, + "skip_count": 2.0, + "step": 6048, + "text_loss": 0.3885214328765869 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00042690745341972134, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9759738.0, + "repeat_count": 0.0, + "routers_loss": 0.0057020667009055614, + "skip_count": 2.0, + "step": 6050, + "text_loss": 0.3107164204120636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.00042660127598299647, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9762987.0, + "repeat_count": 0.0, + "routers_loss": 0.004196313209831715, + "skip_count": 2.0, + "step": 6052, + "text_loss": 0.3073577582836151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00042629512667280135, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 9765828.0, + "repeat_count": 0.0, + "routers_loss": 0.0023119752295315266, + "skip_count": 1.0, + "step": 6054, + "text_loss": 0.8228643536567688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004259890056064527, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 9769129.0, + "repeat_count": 0.0, + "routers_loss": 0.0021007524337619543, + "skip_count": 1.0, + "step": 6056, + "text_loss": 0.8334706425666809 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0004256829129012568, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 9771821.0, + "repeat_count": 1.0, + "routers_loss": 0.00671970471739769, + "skip_count": 2.0, + "step": 6058, + "text_loss": 0.17845536768436432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00042537684867450875, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 9774566.0, + "repeat_count": 0.0, + "routers_loss": 0.0014770646812394261, + "skip_count": 0.0, + "step": 6060, + "text_loss": 0.4445459246635437 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.46022894041679, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00042507081304349315, + "loss": 0.0067, + "macro_f1": 0.5492662787437439, + "num_tokens": 9777909.0, + "repeat_count": 2.0, + "routers_loss": 0.014822427183389664, + "skip_count": 0.0, + "step": 6062, + "text_loss": 0.45526158809661865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004247648061254833, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9781159.0, + "repeat_count": 0.0, + "routers_loss": 0.00568385748192668, + "skip_count": 1.0, + "step": 6064, + "text_loss": 0.18535588681697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.479013795127678, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00042445882803774173, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 9784960.0, + "repeat_count": 1.0, + "routers_loss": 0.0179694052785635, + "skip_count": 0.0, + "step": 6066, + "text_loss": 0.23591181635856628 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00042415287889751966, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9787941.0, + "repeat_count": 0.0, + "routers_loss": 0.0019039154285565019, + "skip_count": 0.0, + "step": 6068, + "text_loss": 0.9447930455207825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0004238469588220575, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9791096.0, + "repeat_count": 0.0, + "routers_loss": 0.004039563238620758, + "skip_count": 0.0, + "step": 6070, + "text_loss": 0.3134256601333618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00042354106792858446, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 9794082.0, + "repeat_count": 0.0, + "routers_loss": 0.0018352365586906672, + "skip_count": 0.0, + "step": 6072, + "text_loss": 0.5681536197662354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.00042323520633431833, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 9797303.0, + "repeat_count": 0.0, + "routers_loss": 0.0019325513858348131, + "skip_count": 0.0, + "step": 6074, + "text_loss": 0.2835809290409088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00042292937415646574, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 9800435.0, + "repeat_count": 0.0, + "routers_loss": 0.002513401210308075, + "skip_count": 0.0, + "step": 6076, + "text_loss": 0.1931663602590561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00042262357151222265, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9803873.0, + "repeat_count": 0.0, + "routers_loss": 0.004864581860601902, + "skip_count": 0.0, + "step": 6078, + "text_loss": 0.25809767842292786 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004223177985187728, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9806438.0, + "repeat_count": 1.0, + "routers_loss": 0.004932792857289314, + "skip_count": 0.0, + "step": 6080, + "text_loss": 0.6409249305725098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00042201205529328925, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9809400.0, + "repeat_count": 0.0, + "routers_loss": 0.00590938376262784, + "skip_count": 1.0, + "step": 6082, + "text_loss": 0.31158050894737244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00042170634195293314, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9813246.0, + "repeat_count": 0.0, + "routers_loss": 0.006805860437452793, + "skip_count": 0.0, + "step": 6084, + "text_loss": 0.32945963740348816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004214006586148545, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 9816513.0, + "repeat_count": 0.0, + "routers_loss": 0.0010186503641307354, + "skip_count": 0.0, + "step": 6086, + "text_loss": 0.48659923672676086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0004210950053961917, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9819908.0, + "repeat_count": 0.0, + "routers_loss": 0.00402973173186183, + "skip_count": 1.0, + "step": 6088, + "text_loss": 0.6249601244926453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00042078938241407174, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9822950.0, + "repeat_count": 0.0, + "routers_loss": 0.00236532068811357, + "skip_count": 1.0, + "step": 6090, + "text_loss": 0.26589256525039673 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0004204837897856098, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 9826493.0, + "repeat_count": 1.0, + "routers_loss": 0.003072192659601569, + "skip_count": 2.0, + "step": 6092, + "text_loss": 0.5216912627220154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004201782276279096, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9829698.0, + "repeat_count": 0.0, + "routers_loss": 0.0027553171385079622, + "skip_count": 1.0, + "step": 6094, + "text_loss": 0.40127676725387573 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.61990020545935, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00041987269605806325, + "loss": 0.0045, + "macro_f1": 0.9442509412765503, + "num_tokens": 9833719.0, + "repeat_count": 4.0, + "routers_loss": 0.013845407404005527, + "skip_count": 4.0, + "step": 6096, + "text_loss": 0.23114071786403656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0004195671951931509, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 9838235.0, + "repeat_count": 0.0, + "routers_loss": 0.0019887303933501244, + "skip_count": 2.0, + "step": 6098, + "text_loss": 0.7467341423034668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004192617251502409, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 9840867.0, + "repeat_count": 0.0, + "routers_loss": 0.0007213905337266624, + "skip_count": 0.0, + "step": 6100, + "text_loss": 0.6283472180366516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00041895628604639036, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 9843827.0, + "repeat_count": 0.0, + "routers_loss": 0.003863139310851693, + "skip_count": 1.0, + "step": 6102, + "text_loss": 0.3602744936943054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00041865087799864374, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 9846939.0, + "repeat_count": 0.0, + "routers_loss": 0.0013336286647245288, + "skip_count": 0.0, + "step": 6104, + "text_loss": 0.4182434678077698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0004183455011240341, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 9849827.0, + "repeat_count": 0.0, + "routers_loss": 0.00038455065805464983, + "skip_count": 0.0, + "step": 6106, + "text_loss": 0.7122722864151001 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 28.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004180401555395826, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 9853487.0, + "repeat_count": 3.0, + "routers_loss": 0.0038226440083235502, + "skip_count": 1.0, + "step": 6108, + "text_loss": 0.2521185576915741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004177348413622981, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 9856321.0, + "repeat_count": 0.0, + "routers_loss": 0.0015809801407158375, + "skip_count": 0.0, + "step": 6110, + "text_loss": 0.423979252576828 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004174295587091776, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 9859238.0, + "repeat_count": 0.0, + "routers_loss": 0.0007586454739794135, + "skip_count": 0.0, + "step": 6112, + "text_loss": 0.4720100462436676 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00041712430769720593, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 9862282.0, + "repeat_count": 1.0, + "routers_loss": 0.0045816488564014435, + "skip_count": 1.0, + "step": 6114, + "text_loss": 0.279577374458313 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0004168190884433559, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 9865394.0, + "repeat_count": 1.0, + "routers_loss": 0.004728195257484913, + "skip_count": 1.0, + "step": 6116, + "text_loss": 0.3826395571231842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 28.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.0004165139010645881, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9869165.0, + "repeat_count": 0.0, + "routers_loss": 0.006160226184874773, + "skip_count": 3.0, + "step": 6118, + "text_loss": 0.4668935537338257 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 28.732609333724685, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.04736328125, + "learning_rate": 0.0004162087456778509, + "loss": 0.0074, + "macro_f1": 0.9619450569152832, + "num_tokens": 9872381.0, + "repeat_count": 1.0, + "routers_loss": 0.027831824496388435, + "skip_count": 6.0, + "step": 6120, + "text_loss": 0.28708913922309875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004159036224000804, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9875668.0, + "repeat_count": 0.0, + "routers_loss": 0.0030764432158321142, + "skip_count": 1.0, + "step": 6122, + "text_loss": 0.37078607082366943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004155985313482002, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9878533.0, + "repeat_count": 0.0, + "routers_loss": 0.00043521137558855116, + "skip_count": 0.0, + "step": 6124, + "text_loss": 0.34975379705429077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00041529347263912224, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 9881478.0, + "repeat_count": 0.0, + "routers_loss": 0.0016251741908490658, + "skip_count": 0.0, + "step": 6126, + "text_loss": 0.39166271686553955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00041498844638974535, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 9884252.0, + "repeat_count": 1.0, + "routers_loss": 0.019553523510694504, + "skip_count": 0.0, + "step": 6128, + "text_loss": 0.2309480905532837 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0004146834527169562, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 9887485.0, + "repeat_count": 1.0, + "routers_loss": 0.0036251386627554893, + "skip_count": 0.0, + "step": 6130, + "text_loss": 0.4464457631111145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00041437849173762894, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9890711.0, + "repeat_count": 0.0, + "routers_loss": 0.0008515548543073237, + "skip_count": 0.0, + "step": 6132, + "text_loss": 0.5012133717536926 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0004140735635686251, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9894458.0, + "repeat_count": 1.0, + "routers_loss": 0.001084602321498096, + "skip_count": 0.0, + "step": 6134, + "text_loss": 0.32015663385391235 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004137686683267938, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 9897634.0, + "repeat_count": 0.0, + "routers_loss": 0.0025203595869243145, + "skip_count": 0.0, + "step": 6136, + "text_loss": 0.15804508328437805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0004134638061289715, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9901157.0, + "repeat_count": 0.0, + "routers_loss": 0.0029381231870502234, + "skip_count": 0.0, + "step": 6138, + "text_loss": 0.14375236630439758 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0004131589770919819, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 9903958.0, + "repeat_count": 0.0, + "routers_loss": 0.002789110178127885, + "skip_count": 0.0, + "step": 6140, + "text_loss": 0.2474033683538437 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004128541813326361, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 9906799.0, + "repeat_count": 2.0, + "routers_loss": 0.010770512744784355, + "skip_count": 3.0, + "step": 6142, + "text_loss": 0.2304249256849289 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0004125494189677325, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 9909286.0, + "repeat_count": 1.0, + "routers_loss": 0.003122122259810567, + "skip_count": 0.0, + "step": 6144, + "text_loss": 0.3781827688217163 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.00041224469011405643, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 9912416.0, + "repeat_count": 1.0, + "routers_loss": 0.008443298749625683, + "skip_count": 1.0, + "step": 6146, + "text_loss": 0.3004767596721649 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0004119399948883806, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9915290.0, + "repeat_count": 0.0, + "routers_loss": 0.0033219947945326567, + "skip_count": 1.0, + "step": 6148, + "text_loss": 0.748744547367096 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0004116353334074647, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 9918493.0, + "repeat_count": 1.0, + "routers_loss": 0.005501769948750734, + "skip_count": 0.0, + "step": 6150, + "text_loss": 0.330759733915329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.000411330705788056, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 9921027.0, + "repeat_count": 0.0, + "routers_loss": 0.0013694261433556676, + "skip_count": 0.0, + "step": 6152, + "text_loss": 0.43070924282073975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0164794921875, + "learning_rate": 0.000411026112146888, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9924303.0, + "repeat_count": 0.0, + "routers_loss": 0.00046192589798010886, + "skip_count": 0.0, + "step": 6154, + "text_loss": 0.5674887895584106 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004107215526006817, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9927065.0, + "repeat_count": 1.0, + "routers_loss": 0.004311304073780775, + "skip_count": 0.0, + "step": 6156, + "text_loss": 0.16138267517089844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004104170272661449, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9930713.0, + "repeat_count": 0.0, + "routers_loss": 0.0035845425445586443, + "skip_count": 0.0, + "step": 6158, + "text_loss": 0.18728356063365936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00041011253625997227, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 9934393.0, + "repeat_count": 0.0, + "routers_loss": 0.00247366214171052, + "skip_count": 0.0, + "step": 6160, + "text_loss": 0.3624019920825958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0004098080796988452, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 9937457.0, + "repeat_count": 0.0, + "routers_loss": 0.003240241203457117, + "skip_count": 0.0, + "step": 6162, + "text_loss": 0.12348521500825882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.0004095036576994321, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 9940523.0, + "repeat_count": 0.0, + "routers_loss": 0.001985874492675066, + "skip_count": 1.0, + "step": 6164, + "text_loss": 0.2688066363334656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 28.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.00040919927037838815, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9943802.0, + "repeat_count": 0.0, + "routers_loss": 0.004264154937118292, + "skip_count": 3.0, + "step": 6166, + "text_loss": 0.49316367506980896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.00040889491785235513, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 9946649.0, + "repeat_count": 0.0, + "routers_loss": 0.002545441733673215, + "skip_count": 0.0, + "step": 6168, + "text_loss": 0.4079313576221466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004085906002379614, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9949800.0, + "repeat_count": 0.0, + "routers_loss": 0.0009590961271896958, + "skip_count": 0.0, + "step": 6170, + "text_loss": 0.6166561245918274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004082863176518221, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9954008.0, + "repeat_count": 0.0, + "routers_loss": 0.003795337164774537, + "skip_count": 2.0, + "step": 6172, + "text_loss": 0.4791361689567566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0004079820702105388, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 9957153.0, + "repeat_count": 0.0, + "routers_loss": 0.0015634822193533182, + "skip_count": 0.0, + "step": 6174, + "text_loss": 0.7208777666091919 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.995597299677137, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004076778580306999, + "loss": 0.0056, + "macro_f1": 0.8820862174034119, + "num_tokens": 9960060.0, + "repeat_count": 2.0, + "routers_loss": 0.03223998099565506, + "skip_count": 2.0, + "step": 6176, + "text_loss": 0.6617992520332336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00040737368122887983, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9963396.0, + "repeat_count": 0.0, + "routers_loss": 0.0033978577703237534, + "skip_count": 0.0, + "step": 6178, + "text_loss": 0.7339215278625488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00040706953992164, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9966364.0, + "repeat_count": 0.0, + "routers_loss": 0.0005358994239941239, + "skip_count": 0.0, + "step": 6180, + "text_loss": 0.44187214970588684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00040676543422552767, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9969813.0, + "repeat_count": 0.0, + "routers_loss": 0.0018544091144576669, + "skip_count": 1.0, + "step": 6182, + "text_loss": 0.6244927048683167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004064613642570769, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9973015.0, + "repeat_count": 0.0, + "routers_loss": 0.005692692007869482, + "skip_count": 0.0, + "step": 6184, + "text_loss": 0.18860043585300446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00040615733013280784, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 9976201.0, + "repeat_count": 0.0, + "routers_loss": 0.0018737476784735918, + "skip_count": 0.0, + "step": 6186, + "text_loss": 0.21189232170581818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00040585333196922687, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9979711.0, + "repeat_count": 0.0, + "routers_loss": 0.011945146135985851, + "skip_count": 2.0, + "step": 6188, + "text_loss": 0.2628154456615448 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00040554936988282663, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9983003.0, + "repeat_count": 0.0, + "routers_loss": 0.0036045778542757034, + "skip_count": 1.0, + "step": 6190, + "text_loss": 0.5926038026809692 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0004052454439900861, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9986841.0, + "repeat_count": 0.0, + "routers_loss": 0.004170368425548077, + "skip_count": 0.0, + "step": 6192, + "text_loss": 0.3088737726211548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00040494155440747015, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9989596.0, + "repeat_count": 0.0, + "routers_loss": 0.002254750579595566, + "skip_count": 2.0, + "step": 6194, + "text_loss": 0.6309700012207031 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.089228059876724, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00040463770125142987, + "loss": 0.0087, + "macro_f1": 0.8814815282821655, + "num_tokens": 9992789.0, + "repeat_count": 2.0, + "routers_loss": 0.04092822223901749, + "skip_count": 4.0, + "step": 6196, + "text_loss": 0.09625697880983353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00040433388463840213, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 9995782.0, + "repeat_count": 0.0, + "routers_loss": 0.00029065192211419344, + "skip_count": 0.0, + "step": 6198, + "text_loss": 0.5600258111953735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0004040301046848105, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 9998712.0, + "repeat_count": 0.0, + "routers_loss": 0.0005865268758498132, + "skip_count": 0.0, + "step": 6200, + "text_loss": 0.6426429748535156 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 29.11740534194306, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0283203125, + "learning_rate": 0.0004037263615070638, + "loss": 0.0078, + "macro_f1": 0.9265305995941162, + "num_tokens": 10002020.0, + "repeat_count": 1.0, + "routers_loss": 0.025357060134410858, + "skip_count": 3.0, + "step": 6202, + "text_loss": 0.25125735998153687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000403422655221557, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10005381.0, + "repeat_count": 0.0, + "routers_loss": 0.003139561740681529, + "skip_count": 1.0, + "step": 6204, + "text_loss": 0.3639419376850128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00040311898594467085, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 10008348.0, + "repeat_count": 0.0, + "routers_loss": 0.004091196693480015, + "skip_count": 2.0, + "step": 6206, + "text_loss": 0.1602363884449005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00040281535379277204, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10011171.0, + "repeat_count": 0.0, + "routers_loss": 0.005771483760327101, + "skip_count": 0.0, + "step": 6208, + "text_loss": 0.5593504905700684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.000402511758882213, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10014374.0, + "repeat_count": 0.0, + "routers_loss": 0.005212264601141214, + "skip_count": 1.0, + "step": 6210, + "text_loss": 0.15668229758739471 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0004022082013293319, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 10017327.0, + "repeat_count": 0.0, + "routers_loss": 0.0027585842180997133, + "skip_count": 1.0, + "step": 6212, + "text_loss": 0.21188466250896454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.173759906075727, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00040190468125045255, + "loss": 0.0061, + "macro_f1": 0.3272727429866791, + "num_tokens": 10020518.0, + "repeat_count": 0.0, + "routers_loss": 0.013210589066147804, + "skip_count": 1.0, + "step": 6214, + "text_loss": 0.2551073729991913 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 29.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.00040160119876188436, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10023799.0, + "repeat_count": 1.0, + "routers_loss": 0.001590219559147954, + "skip_count": 0.0, + "step": 6216, + "text_loss": 0.5634782314300537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0004012977539799224, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 10027107.0, + "repeat_count": 0.0, + "routers_loss": 0.003917343448847532, + "skip_count": 0.0, + "step": 6218, + "text_loss": 0.6412819027900696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0004009943470208473, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 10030460.0, + "repeat_count": 0.0, + "routers_loss": 0.00874288845807314, + "skip_count": 2.0, + "step": 6220, + "text_loss": 0.13269923627376556 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.211329615497505, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.000400690978000925, + "loss": 0.0075, + "macro_f1": 0.8817967176437378, + "num_tokens": 10034086.0, + "repeat_count": 2.0, + "routers_loss": 0.03736349940299988, + "skip_count": 3.0, + "step": 6222, + "text_loss": 0.4956454336643219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004003876470364075, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10037312.0, + "repeat_count": 0.0, + "routers_loss": 0.008481289260089397, + "skip_count": 2.0, + "step": 6224, + "text_loss": 0.2148810178041458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0152587890625, + "learning_rate": 0.0004000843542435315, + "loss": 0.0028, + "macro_f1": 0.3333333432674408, + "num_tokens": 10040393.0, + "repeat_count": 0.0, + "routers_loss": 0.002235144842416048, + "skip_count": 0.0, + "step": 6226, + "text_loss": 0.17645306885242462 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 29.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0003997810997385195, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 10044386.0, + "repeat_count": 1.0, + "routers_loss": 0.004541373811662197, + "skip_count": 0.0, + "step": 6228, + "text_loss": 0.5098661184310913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00039947788363757915, + "loss": 0.0088, + "macro_f1": 0.6666666865348816, + "num_tokens": 10049046.0, + "repeat_count": 0.0, + "routers_loss": 0.0019183673430234194, + "skip_count": 1.0, + "step": 6230, + "text_loss": 0.6953724026679993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00039917470605690334, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 10051787.0, + "repeat_count": 2.0, + "routers_loss": 0.0032311067916452885, + "skip_count": 4.0, + "step": 6232, + "text_loss": 0.475127637386322 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 29.267684179630173, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00039887156711267043, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 10055396.0, + "repeat_count": 2.0, + "routers_loss": 0.03247373178601265, + "skip_count": 0.0, + "step": 6234, + "text_loss": 0.4239100515842438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.00039856846692104363, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10058395.0, + "repeat_count": 0.0, + "routers_loss": 0.006287421099841595, + "skip_count": 3.0, + "step": 6236, + "text_loss": 0.24084535241127014 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.016357421875, + "learning_rate": 0.0003982654055981718, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10061302.0, + "repeat_count": 1.0, + "routers_loss": 0.0008686117362231016, + "skip_count": 1.0, + "step": 6238, + "text_loss": 0.4740419089794159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0003979623832601884, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10065318.0, + "repeat_count": 0.0, + "routers_loss": 0.0037686119321733713, + "skip_count": 2.0, + "step": 6240, + "text_loss": 0.43965795636177063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0003976594000232123, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10068291.0, + "repeat_count": 0.0, + "routers_loss": 0.005804901942610741, + "skip_count": 0.0, + "step": 6242, + "text_loss": 0.24424348771572113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.00039735645600334714, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 10071645.0, + "repeat_count": 0.0, + "routers_loss": 0.002001055981963873, + "skip_count": 1.0, + "step": 6244, + "text_loss": 0.6524377465248108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0003970535513166815, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 10075136.0, + "repeat_count": 0.0, + "routers_loss": 0.001252001617103815, + "skip_count": 0.0, + "step": 6246, + "text_loss": 0.22803714871406555 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0003967506860792893, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 10078230.0, + "repeat_count": 0.0, + "routers_loss": 0.004913780372589827, + "skip_count": 1.0, + "step": 6248, + "text_loss": 0.9835516214370728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.000396447860407229, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 10080852.0, + "repeat_count": 0.0, + "routers_loss": 0.0037437966093420982, + "skip_count": 2.0, + "step": 6250, + "text_loss": 0.4021640121936798 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.00039614507441654393, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10084139.0, + "repeat_count": 0.0, + "routers_loss": 0.005433002021163702, + "skip_count": 2.0, + "step": 6252, + "text_loss": 0.23060470819473267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00039584232822326224, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10088501.0, + "repeat_count": 0.0, + "routers_loss": 0.0007705377647653222, + "skip_count": 0.0, + "step": 6254, + "text_loss": 0.5994830131530762 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0003955396219433969, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10091506.0, + "repeat_count": 0.0, + "routers_loss": 0.0012310115853324533, + "skip_count": 0.0, + "step": 6256, + "text_loss": 0.4639038145542145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0003952369556929455, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10096236.0, + "repeat_count": 0.0, + "routers_loss": 0.008964627049863338, + "skip_count": 2.0, + "step": 6258, + "text_loss": 0.24845287203788757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003949343295878903, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 10099213.0, + "repeat_count": 0.0, + "routers_loss": 0.0033088945783674717, + "skip_count": 0.0, + "step": 6260, + "text_loss": 0.6527073979377747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 29.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00039463174374419817, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10103160.0, + "repeat_count": 2.0, + "routers_loss": 0.003462672932073474, + "skip_count": 1.0, + "step": 6262, + "text_loss": 0.4209299683570862 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00039432919827782066, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 10105881.0, + "repeat_count": 2.0, + "routers_loss": 0.0027124532498419285, + "skip_count": 2.0, + "step": 6264, + "text_loss": 0.4442266821861267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00039402669330469367, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 10108596.0, + "repeat_count": 0.0, + "routers_loss": 0.005055282264947891, + "skip_count": 2.0, + "step": 6266, + "text_loss": 0.3331456780433655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00039372422894073765, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10111673.0, + "repeat_count": 0.0, + "routers_loss": 0.0009340311517007649, + "skip_count": 0.0, + "step": 6268, + "text_loss": 0.7664456367492676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.00039342180530185745, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10116141.0, + "repeat_count": 0.0, + "routers_loss": 0.00032052272581495345, + "skip_count": 0.0, + "step": 6270, + "text_loss": 0.47610244154930115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00039311942250394274, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 10119151.0, + "repeat_count": 0.0, + "routers_loss": 0.0015820999396964908, + "skip_count": 0.0, + "step": 6272, + "text_loss": 0.3815282881259918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0003928170806628669, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10122684.0, + "repeat_count": 0.0, + "routers_loss": 0.0007423736387863755, + "skip_count": 0.0, + "step": 6274, + "text_loss": 0.4630914628505707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00039251477989448797, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10126751.0, + "repeat_count": 0.0, + "routers_loss": 0.0006216703332029283, + "skip_count": 0.0, + "step": 6276, + "text_loss": 0.4342454671859741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.00039221252031464816, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10129784.0, + "repeat_count": 0.0, + "routers_loss": 0.004239698871970177, + "skip_count": 3.0, + "step": 6278, + "text_loss": 0.24661089479923248 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 29.4837100088054, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0003919103020391738, + "loss": 0.006, + "macro_f1": 0.8803418874740601, + "num_tokens": 10133066.0, + "repeat_count": 2.0, + "routers_loss": 0.027879100292921066, + "skip_count": 7.0, + "step": 6280, + "text_loss": 0.4705188274383545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00039160812518387574, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 10136860.0, + "repeat_count": 0.0, + "routers_loss": 0.002533538034185767, + "skip_count": 0.0, + "step": 6282, + "text_loss": 0.1953880786895752 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00039130598986454845, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 10140066.0, + "repeat_count": 1.0, + "routers_loss": 0.002462630858644843, + "skip_count": 2.0, + "step": 6284, + "text_loss": 0.378487765789032 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.000391003896196971, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 10143646.0, + "repeat_count": 1.0, + "routers_loss": 0.011922914534807205, + "skip_count": 1.0, + "step": 6286, + "text_loss": 0.2467316836118698 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00039070184429690607, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 10146507.0, + "repeat_count": 1.0, + "routers_loss": 0.0059767309576272964, + "skip_count": 1.0, + "step": 6288, + "text_loss": 0.9603674411773682 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0003903998342801006, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10149301.0, + "repeat_count": 1.0, + "routers_loss": 0.0030056277755647898, + "skip_count": 2.0, + "step": 6290, + "text_loss": 0.36631715297698975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00039009786626228543, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 10152158.0, + "repeat_count": 0.0, + "routers_loss": 0.005298118572682142, + "skip_count": 3.0, + "step": 6292, + "text_loss": 0.2876455783843994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003897959403591751, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 10155852.0, + "repeat_count": 0.0, + "routers_loss": 0.004937763791531324, + "skip_count": 2.0, + "step": 6294, + "text_loss": 0.14649681746959686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0003894940566864683, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 10159164.0, + "repeat_count": 0.0, + "routers_loss": 0.0021474575623869896, + "skip_count": 0.0, + "step": 6296, + "text_loss": 0.5694304704666138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 29.568241855004402, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08251953125, + "learning_rate": 0.00038919221535984753, + "loss": 0.0073, + "macro_f1": 0.875, + "num_tokens": 10161806.0, + "repeat_count": 1.0, + "routers_loss": 0.040340203791856766, + "skip_count": 3.0, + "step": 6298, + "text_loss": 0.1574537754058838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.00038889041649497894, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10165669.0, + "repeat_count": 0.0, + "routers_loss": 0.0028486931696534157, + "skip_count": 0.0, + "step": 6300, + "text_loss": 0.9158071279525757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0003885886602075123, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10168945.0, + "repeat_count": 0.0, + "routers_loss": 0.006565484683960676, + "skip_count": 2.0, + "step": 6302, + "text_loss": 0.3530846834182739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.00038828694661308116, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10171914.0, + "repeat_count": 0.0, + "routers_loss": 0.0009084723424166441, + "skip_count": 0.0, + "step": 6304, + "text_loss": 0.4603337347507477 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0003879852758273029, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 10175737.0, + "repeat_count": 1.0, + "routers_loss": 0.004121702630072832, + "skip_count": 2.0, + "step": 6306, + "text_loss": 0.5294032096862793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00038768364796577814, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10178543.0, + "repeat_count": 0.0, + "routers_loss": 0.0013208909658715129, + "skip_count": 0.0, + "step": 6308, + "text_loss": 0.41084006428718567 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 29.62459641913707, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00038738206314409144, + "loss": 0.0079, + "macro_f1": 0.9247862696647644, + "num_tokens": 10181880.0, + "repeat_count": 3.0, + "routers_loss": 0.03674180060625076, + "skip_count": 6.0, + "step": 6310, + "text_loss": 0.6920746564865112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0003870805214778106, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10185173.0, + "repeat_count": 0.0, + "routers_loss": 0.00221974472515285, + "skip_count": 2.0, + "step": 6312, + "text_loss": 0.1376657634973526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0003867790230824869, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10188642.0, + "repeat_count": 0.0, + "routers_loss": 0.001809283159673214, + "skip_count": 0.0, + "step": 6314, + "text_loss": 0.5220870971679688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0003864775680736552, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 10191750.0, + "repeat_count": 0.0, + "routers_loss": 0.0013956360053271055, + "skip_count": 0.0, + "step": 6316, + "text_loss": 0.4109838902950287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00038617615656683356, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 10194578.0, + "repeat_count": 0.0, + "routers_loss": 0.002947692759335041, + "skip_count": 2.0, + "step": 6318, + "text_loss": 0.4818590581417084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0003858747886775232, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10197131.0, + "repeat_count": 0.0, + "routers_loss": 0.0008140999125316739, + "skip_count": 2.0, + "step": 6320, + "text_loss": 0.4004709720611572 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.68095098326974, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0003855734645212093, + "loss": 0.0089, + "macro_f1": 0.8820862174034119, + "num_tokens": 10199965.0, + "repeat_count": 2.0, + "routers_loss": 0.013056626543402672, + "skip_count": 2.0, + "step": 6322, + "text_loss": 0.3367139995098114 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00038527218421335977, + "loss": 0.0087, + "macro_f1": 1.0, + "num_tokens": 10203184.0, + "repeat_count": 1.0, + "routers_loss": 0.0038112467154860497, + "skip_count": 2.0, + "step": 6324, + "text_loss": 0.5747989416122437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0003849709478694255, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 10206436.0, + "repeat_count": 0.0, + "routers_loss": 0.001232540002092719, + "skip_count": 0.0, + "step": 6326, + "text_loss": 0.4981732964515686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00038466975560484115, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 10209889.0, + "repeat_count": 0.0, + "routers_loss": 0.004343799781054258, + "skip_count": 0.0, + "step": 6328, + "text_loss": 0.2160186469554901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.000384368607535024, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10212520.0, + "repeat_count": 0.0, + "routers_loss": 0.0014161963481456041, + "skip_count": 1.0, + "step": 6330, + "text_loss": 0.3556232154369354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0003840675037753745, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10215456.0, + "repeat_count": 0.0, + "routers_loss": 0.0014989010524004698, + "skip_count": 0.0, + "step": 6332, + "text_loss": 0.8510926961898804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003837664444412762, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10218558.0, + "repeat_count": 0.0, + "routers_loss": 0.006702739745378494, + "skip_count": 0.0, + "step": 6334, + "text_loss": 0.3995226323604584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0003834654296480958, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10221862.0, + "repeat_count": 0.0, + "routers_loss": 0.00826781615614891, + "skip_count": 2.0, + "step": 6336, + "text_loss": 0.3534671664237976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0003831644595111825, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10224820.0, + "repeat_count": 0.0, + "routers_loss": 0.002143894787877798, + "skip_count": 0.0, + "step": 6338, + "text_loss": 0.20216144621372223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 29.76548282946874, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04736328125, + "learning_rate": 0.0003828635341458687, + "loss": 0.0064, + "macro_f1": 0.5492662787437439, + "num_tokens": 10227479.0, + "repeat_count": 0.0, + "routers_loss": 0.012319118715822697, + "skip_count": 2.0, + "step": 6340, + "text_loss": 0.26248639822006226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0003825626536674697, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10231347.0, + "repeat_count": 0.0, + "routers_loss": 0.00334449321962893, + "skip_count": 0.0, + "step": 6342, + "text_loss": 0.6357201337814331 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.000382261818191283, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10234347.0, + "repeat_count": 0.0, + "routers_loss": 0.0027788348961621523, + "skip_count": 0.0, + "step": 6344, + "text_loss": 0.2813846468925476 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00038196102783258996, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 10237105.0, + "repeat_count": 0.0, + "routers_loss": 0.001545077539049089, + "skip_count": 0.0, + "step": 6346, + "text_loss": 0.47612661123275757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0003816602827066537, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 10240249.0, + "repeat_count": 0.0, + "routers_loss": 0.005602670833468437, + "skip_count": 2.0, + "step": 6348, + "text_loss": 0.18197228014469147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0003813595829287204, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 10243417.0, + "repeat_count": 0.0, + "routers_loss": 0.0004317959537729621, + "skip_count": 0.0, + "step": 6350, + "text_loss": 0.3818575143814087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.0003810589286140186, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 10246824.0, + "repeat_count": 0.0, + "routers_loss": 0.002225276781246066, + "skip_count": 0.0, + "step": 6352, + "text_loss": 0.14129821956157684 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 29.831229820956853, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0003807583198777599, + "loss": 0.0062, + "macro_f1": 0.9265305995941162, + "num_tokens": 10249836.0, + "repeat_count": 3.0, + "routers_loss": 0.02445496805012226, + "skip_count": 1.0, + "step": 6354, + "text_loss": 0.3237064480781555 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00038045775683513786, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10252900.0, + "repeat_count": 0.0, + "routers_loss": 0.0009264222462661564, + "skip_count": 0.0, + "step": 6356, + "text_loss": 0.6777551174163818 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 29.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0003801572396013289, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 10255526.0, + "repeat_count": 1.0, + "routers_loss": 0.007189550437033176, + "skip_count": 5.0, + "step": 6358, + "text_loss": 0.25438982248306274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00037985676829149187, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10258865.0, + "repeat_count": 0.0, + "routers_loss": 0.0014201018493622541, + "skip_count": 0.0, + "step": 6360, + "text_loss": 0.5063154101371765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0003795563430207678, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 10261677.0, + "repeat_count": 0.0, + "routers_loss": 0.0035477925557643175, + "skip_count": 3.0, + "step": 6362, + "text_loss": 0.4815357029438019 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.878191957734078, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0003792559639042803, + "loss": 0.0049, + "macro_f1": 0.3272727429866791, + "num_tokens": 10264805.0, + "repeat_count": 0.0, + "routers_loss": 0.013723359443247318, + "skip_count": 1.0, + "step": 6364, + "text_loss": 0.5563676357269287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0003789556310571351, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 10267885.0, + "repeat_count": 0.0, + "routers_loss": 0.0028159532230347395, + "skip_count": 0.0, + "step": 6366, + "text_loss": 0.7284183502197266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0003786553445944204, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10270934.0, + "repeat_count": 0.0, + "routers_loss": 0.0005918835522606969, + "skip_count": 0.0, + "step": 6368, + "text_loss": 0.7387746572494507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0003783551046312067, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 10273818.0, + "repeat_count": 0.0, + "routers_loss": 0.0011416864581406116, + "skip_count": 0.0, + "step": 6370, + "text_loss": 0.5360285043716431 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 29.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.00037805491128254645, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 10276494.0, + "repeat_count": 2.0, + "routers_loss": 0.002382483799010515, + "skip_count": 1.0, + "step": 6372, + "text_loss": 0.7536854147911072 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.00037775476466347414, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 10279719.0, + "repeat_count": 0.0, + "routers_loss": 0.0021104486659169197, + "skip_count": 1.0, + "step": 6374, + "text_loss": 0.6807253956794739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0003774546648890066, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 10283000.0, + "repeat_count": 0.0, + "routers_loss": 0.003148776013404131, + "skip_count": 2.0, + "step": 6376, + "text_loss": 0.30774110555648804 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0003771546120741426, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 10285666.0, + "repeat_count": 1.0, + "routers_loss": 0.007700880523771048, + "skip_count": 1.0, + "step": 6378, + "text_loss": 0.4476076364517212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003768546063338631, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10289127.0, + "repeat_count": 0.0, + "routers_loss": 0.0023625255562365055, + "skip_count": 1.0, + "step": 6380, + "text_loss": 0.4350969195365906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.0003765546477831307, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10292485.0, + "repeat_count": 0.0, + "routers_loss": 0.001428726245649159, + "skip_count": 0.0, + "step": 6382, + "text_loss": 0.49078530073165894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0003762547365368902, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 10295361.0, + "repeat_count": 0.0, + "routers_loss": 0.0027160397730767727, + "skip_count": 2.0, + "step": 6384, + "text_loss": 0.3476370573043823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00037595487271006807, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10298717.0, + "repeat_count": 0.0, + "routers_loss": 0.002456068294122815, + "skip_count": 0.0, + "step": 6386, + "text_loss": 0.3634916841983795 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 29.99090108599941, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.021240234375, + "learning_rate": 0.0003756550564175727, + "loss": 0.0049, + "macro_f1": 0.9265305995941162, + "num_tokens": 10302102.0, + "repeat_count": 1.0, + "routers_loss": 0.02546076290309429, + "skip_count": 3.0, + "step": 6388, + "text_loss": 0.2422582060098648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00037535528777429426, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10305060.0, + "repeat_count": 0.0, + "routers_loss": 0.001045907847583294, + "skip_count": 0.0, + "step": 6390, + "text_loss": 0.5563194155693054 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0003750555668951045, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 10307903.0, + "repeat_count": 1.0, + "routers_loss": 0.007391332648694515, + "skip_count": 2.0, + "step": 6392, + "text_loss": 0.3423991799354553 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00037475589389485744, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 10311396.0, + "repeat_count": 1.0, + "routers_loss": 0.0029360291082412004, + "skip_count": 1.0, + "step": 6394, + "text_loss": 0.9877024292945862 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00037445626888838807, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10314250.0, + "repeat_count": 0.0, + "routers_loss": 0.0014932662015780807, + "skip_count": 0.0, + "step": 6396, + "text_loss": 0.3978523313999176 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 30.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003741566919905133, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 10316894.0, + "repeat_count": 1.0, + "routers_loss": 0.007003722712397575, + "skip_count": 5.0, + "step": 6398, + "text_loss": 0.2945566475391388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00037385716331603155, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 10319603.0, + "repeat_count": 1.0, + "routers_loss": 0.006710570305585861, + "skip_count": 1.0, + "step": 6400, + "text_loss": 0.2984389662742615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.00037355768297972275, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 10322670.0, + "repeat_count": 0.0, + "routers_loss": 0.00048738415353000164, + "skip_count": 0.0, + "step": 6402, + "text_loss": 0.483262300491333 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00037325825109634837, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 10326280.0, + "repeat_count": 1.0, + "routers_loss": 0.001625525183044374, + "skip_count": 1.0, + "step": 6404, + "text_loss": 0.42678722739219666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0003729588677806513, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10329008.0, + "repeat_count": 0.0, + "routers_loss": 0.004408636130392551, + "skip_count": 0.0, + "step": 6406, + "text_loss": 0.2264070063829422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0003726595331473557, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 10332533.0, + "repeat_count": 0.0, + "routers_loss": 0.0038099216762930155, + "skip_count": 2.0, + "step": 6408, + "text_loss": 0.6670092940330505 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0003723602473111672, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10335643.0, + "repeat_count": 1.0, + "routers_loss": 0.003097689710557461, + "skip_count": 0.0, + "step": 6410, + "text_loss": 0.45228812098503113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.00037206101038677274, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 10338522.0, + "repeat_count": 0.0, + "routers_loss": 0.005268602631986141, + "skip_count": 1.0, + "step": 6412, + "text_loss": 0.7288079857826233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0003717618224888405, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 10341516.0, + "repeat_count": 0.0, + "routers_loss": 0.004640138708055019, + "skip_count": 2.0, + "step": 6414, + "text_loss": 0.22850871086120605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00037146268373201954, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10344831.0, + "repeat_count": 0.0, + "routers_loss": 0.0006379318656399846, + "skip_count": 0.0, + "step": 6416, + "text_loss": 0.7864460945129395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0003711635942309408, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 10348499.0, + "repeat_count": 0.0, + "routers_loss": 0.0004005273221991956, + "skip_count": 0.0, + "step": 6418, + "text_loss": 0.605839192867279 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0157470703125, + "learning_rate": 0.0003708645541002159, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10351722.0, + "repeat_count": 0.0, + "routers_loss": 0.001061634044162929, + "skip_count": 0.0, + "step": 6420, + "text_loss": 0.8226510286331177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 30.150278837687114, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0003705655634544374, + "loss": 0.0052, + "macro_f1": 0.5492662787437439, + "num_tokens": 10355275.0, + "repeat_count": 0.0, + "routers_loss": 0.013980664312839508, + "skip_count": 2.0, + "step": 6422, + "text_loss": 0.2709597647190094 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0003702666224081792, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10359702.0, + "repeat_count": 1.0, + "routers_loss": 0.0013196271611377597, + "skip_count": 0.0, + "step": 6424, + "text_loss": 0.6451483368873596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00036996773107599604, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10363364.0, + "repeat_count": 0.0, + "routers_loss": 0.0028023163322359324, + "skip_count": 1.0, + "step": 6426, + "text_loss": 0.2770799398422241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01373291015625, + "learning_rate": 0.0003696688895724235, + "loss": 0.0029, + "macro_f1": 0.3333333432674408, + "num_tokens": 10366554.0, + "repeat_count": 0.0, + "routers_loss": 0.0011023655533790588, + "skip_count": 0.0, + "step": 6428, + "text_loss": 0.5466503500938416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0003693700980119784, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10369733.0, + "repeat_count": 0.0, + "routers_loss": 0.00230707717128098, + "skip_count": 0.0, + "step": 6430, + "text_loss": 0.45667049288749695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00036907135650915824, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 10373382.0, + "repeat_count": 0.0, + "routers_loss": 0.0036784098483622074, + "skip_count": 2.0, + "step": 6432, + "text_loss": 0.13856995105743408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00036877266517844115, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 10376202.0, + "repeat_count": 0.0, + "routers_loss": 0.0008461157558485866, + "skip_count": 0.0, + "step": 6434, + "text_loss": 0.27238601446151733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.0003684740241342863, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 10380748.0, + "repeat_count": 0.0, + "routers_loss": 0.0052765593864023685, + "skip_count": 0.0, + "step": 6436, + "text_loss": 0.6182295083999634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.00036817543349113355, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 10386148.0, + "repeat_count": 1.0, + "routers_loss": 0.005562922917306423, + "skip_count": 2.0, + "step": 6438, + "text_loss": 0.5591027140617371 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0003678768933634033, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10389385.0, + "repeat_count": 0.0, + "routers_loss": 0.0008686366491019726, + "skip_count": 0.0, + "step": 6440, + "text_loss": 0.5158660411834717 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0003675784038654968, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 10391893.0, + "repeat_count": 0.0, + "routers_loss": 0.0022222092375159264, + "skip_count": 1.0, + "step": 6442, + "text_loss": 0.2865697741508484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0003672799651117958, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 10395082.0, + "repeat_count": 0.0, + "routers_loss": 0.0030799773521721363, + "skip_count": 2.0, + "step": 6444, + "text_loss": 0.21298295259475708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003669815772166625, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10398015.0, + "repeat_count": 0.0, + "routers_loss": 0.0035721305757761, + "skip_count": 3.0, + "step": 6446, + "text_loss": 0.5286803841590881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 30.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.00036668324029443975, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 10400749.0, + "repeat_count": 0.0, + "routers_loss": 0.00741040613502264, + "skip_count": 4.0, + "step": 6448, + "text_loss": 0.3922366201877594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0003663849544594507, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 10404439.0, + "repeat_count": 0.0, + "routers_loss": 0.002974750241264701, + "skip_count": 2.0, + "step": 6450, + "text_loss": 0.21894219517707825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.00036608671982599927, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10408476.0, + "repeat_count": 0.0, + "routers_loss": 0.004810616374015808, + "skip_count": 0.0, + "step": 6452, + "text_loss": 0.3928622305393219 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0003657885365083694, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 10411533.0, + "repeat_count": 1.0, + "routers_loss": 0.005527745466679335, + "skip_count": 0.0, + "step": 6454, + "text_loss": 0.22816279530525208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.00036549040462082556, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 10414501.0, + "repeat_count": 0.0, + "routers_loss": 0.0021297158673405647, + "skip_count": 0.0, + "step": 6456, + "text_loss": 0.20487719774246216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 30.31934253008512, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003651923242776124, + "loss": 0.0082, + "macro_f1": 0.6592592597007751, + "num_tokens": 10418296.0, + "repeat_count": 1.0, + "routers_loss": 0.046412210911512375, + "skip_count": 5.0, + "step": 6458, + "text_loss": 0.2890419065952301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00036489429559295484, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10421211.0, + "repeat_count": 0.0, + "routers_loss": 0.004002603702247143, + "skip_count": 0.0, + "step": 6460, + "text_loss": 0.23165544867515564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003645963186810581, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 10424231.0, + "repeat_count": 0.0, + "routers_loss": 0.003480088198557496, + "skip_count": 1.0, + "step": 6462, + "text_loss": 0.6286683082580566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003642983936561075, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 10427387.0, + "repeat_count": 0.0, + "routers_loss": 0.009358933195471764, + "skip_count": 2.0, + "step": 6464, + "text_loss": 0.3258316218852997 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.356912239506897, + "f1_execute": 0.9729729890823364, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00036400052063226816, + "loss": 0.0048, + "macro_f1": 0.9539539813995361, + "num_tokens": 10430813.0, + "repeat_count": 5.0, + "routers_loss": 0.03567950055003166, + "skip_count": 5.0, + "step": 6466, + "text_loss": 0.7278715968132019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00036370269972368615, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 10434175.0, + "repeat_count": 1.0, + "routers_loss": 0.00226925453171134, + "skip_count": 2.0, + "step": 6468, + "text_loss": 0.5652450919151306 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0174560546875, + "learning_rate": 0.0003634049310444867, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10437393.0, + "repeat_count": 0.0, + "routers_loss": 0.0013644809368997812, + "skip_count": 0.0, + "step": 6470, + "text_loss": 0.5985191464424133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0003631072147087753, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 10440412.0, + "repeat_count": 0.0, + "routers_loss": 0.0003114990540780127, + "skip_count": 0.0, + "step": 6472, + "text_loss": 0.5588209629058838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00036280955083063747, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 10443471.0, + "repeat_count": 0.0, + "routers_loss": 0.0005486322334036231, + "skip_count": 0.0, + "step": 6474, + "text_loss": 0.6969016194343567 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00036251193952413865, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 10446548.0, + "repeat_count": 1.0, + "routers_loss": 0.008256378583610058, + "skip_count": 2.0, + "step": 6476, + "text_loss": 0.27083566784858704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0003622143809033239, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10449478.0, + "repeat_count": 0.0, + "routers_loss": 0.001008771825581789, + "skip_count": 0.0, + "step": 6478, + "text_loss": 0.1689433604478836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00036191687508221827, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 10453017.0, + "repeat_count": 1.0, + "routers_loss": 0.0014678959269076586, + "skip_count": 0.0, + "step": 6480, + "text_loss": 0.9571998715400696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0003616194221748267, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10456061.0, + "repeat_count": 0.0, + "routers_loss": 0.001516164978966117, + "skip_count": 0.0, + "step": 6482, + "text_loss": 0.5750429034233093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0003613220222951335, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10459130.0, + "repeat_count": 0.0, + "routers_loss": 0.0031315975356847048, + "skip_count": 0.0, + "step": 6484, + "text_loss": 0.47120073437690735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0003610246755571029, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10462190.0, + "repeat_count": 0.0, + "routers_loss": 0.0006079549202695489, + "skip_count": 0.0, + "step": 6486, + "text_loss": 0.8426173329353333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000360727382074679, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10465233.0, + "repeat_count": 0.0, + "routers_loss": 0.00596054969355464, + "skip_count": 0.0, + "step": 6488, + "text_loss": 0.18435880541801453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.469621367772234, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00036043014196178463, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 10468135.0, + "repeat_count": 0.0, + "routers_loss": 0.008584967814385891, + "skip_count": 1.0, + "step": 6490, + "text_loss": 0.3827758729457855 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.00036013295533232344, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10471032.0, + "repeat_count": 2.0, + "routers_loss": 0.005076571833342314, + "skip_count": 5.0, + "step": 6492, + "text_loss": 0.1215854063630104 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 30.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0003598358223001776, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 10474779.0, + "repeat_count": 3.0, + "routers_loss": 0.005972118582576513, + "skip_count": 0.0, + "step": 6494, + "text_loss": 0.22768665850162506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0003595387429792091, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 10478015.0, + "repeat_count": 0.0, + "routers_loss": 0.004733685404062271, + "skip_count": 1.0, + "step": 6496, + "text_loss": 0.5013535618782043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00035924171748325916, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10481113.0, + "repeat_count": 0.0, + "routers_loss": 0.01148980576545, + "skip_count": 2.0, + "step": 6498, + "text_loss": 0.3281762897968292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0003589447459261487, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10484049.0, + "repeat_count": 0.0, + "routers_loss": 0.007726775947958231, + "skip_count": 2.0, + "step": 6500, + "text_loss": 0.46294569969177246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00035864782842167763, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 10487443.0, + "repeat_count": 1.0, + "routers_loss": 0.0013331319205462933, + "skip_count": 0.0, + "step": 6502, + "text_loss": 0.5122153759002686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.00035835096508362544, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10490535.0, + "repeat_count": 0.0, + "routers_loss": 0.0011629529763013124, + "skip_count": 0.0, + "step": 6504, + "text_loss": 0.40683525800704956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00035805415602575054, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10493575.0, + "repeat_count": 0.0, + "routers_loss": 0.004780632443726063, + "skip_count": 0.0, + "step": 6506, + "text_loss": 0.37263134121894836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00035775740136179075, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10496193.0, + "repeat_count": 0.0, + "routers_loss": 0.0018355643842369318, + "skip_count": 0.0, + "step": 6508, + "text_loss": 0.2074306458234787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00035746070120546314, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10500135.0, + "repeat_count": 0.0, + "routers_loss": 0.004067617934197187, + "skip_count": 1.0, + "step": 6510, + "text_loss": 0.26313406229019165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00035716405567046383, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10503533.0, + "repeat_count": 0.0, + "routers_loss": 0.005438363179564476, + "skip_count": 0.0, + "step": 6512, + "text_loss": 0.3448122441768646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.00035686746487046767, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 10506207.0, + "repeat_count": 0.0, + "routers_loss": 0.0012895528925582767, + "skip_count": 0.0, + "step": 6514, + "text_loss": 0.43096476793289185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0003565709289191291, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10509257.0, + "repeat_count": 0.0, + "routers_loss": 0.003141741268336773, + "skip_count": 0.0, + "step": 6516, + "text_loss": 0.22349724173545837 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0003562744479300811, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10512554.0, + "repeat_count": 0.0, + "routers_loss": 0.0005669888923875988, + "skip_count": 0.0, + "step": 6518, + "text_loss": 0.5319190621376038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00035597802201693587, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10515720.0, + "repeat_count": 0.0, + "routers_loss": 0.0020814717281609774, + "skip_count": 0.0, + "step": 6520, + "text_loss": 0.20216144621372223 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0003556816512932841, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 10518517.0, + "repeat_count": 2.0, + "routers_loss": 0.010716461576521397, + "skip_count": 3.0, + "step": 6522, + "text_loss": 0.15843836963176727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.0003553853358726959, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 10521414.0, + "repeat_count": 0.0, + "routers_loss": 0.0014748790999874473, + "skip_count": 0.0, + "step": 6524, + "text_loss": 0.393892377614975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00035508907586871984, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10524210.0, + "repeat_count": 0.0, + "routers_loss": 0.0004757299611810595, + "skip_count": 0.0, + "step": 6526, + "text_loss": 0.2557907700538635 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00035479287139488327, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 10527327.0, + "repeat_count": 1.0, + "routers_loss": 0.002445317106321454, + "skip_count": 0.0, + "step": 6528, + "text_loss": 0.48338422179222107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0003544967225646922, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 10530363.0, + "repeat_count": 0.0, + "routers_loss": 0.0015845977468416095, + "skip_count": 0.0, + "step": 6530, + "text_loss": 0.6474354267120361 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.00035420062949163166, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 10533444.0, + "repeat_count": 0.0, + "routers_loss": 0.002190655330196023, + "skip_count": 0.0, + "step": 6532, + "text_loss": 0.3789777457714081 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0003539045922891649, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10536711.0, + "repeat_count": 0.0, + "routers_loss": 0.00317079434171319, + "skip_count": 0.0, + "step": 6534, + "text_loss": 0.25758084654808044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00035360861107073394, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 10539849.0, + "repeat_count": 0.0, + "routers_loss": 0.0010938458144664764, + "skip_count": 0.0, + "step": 6536, + "text_loss": 0.9821014404296875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0003533126859497592, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10543004.0, + "repeat_count": 0.0, + "routers_loss": 0.003071998478844762, + "skip_count": 2.0, + "step": 6538, + "text_loss": 0.6314182281494141 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0003530168170396401, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 10545965.0, + "repeat_count": 0.0, + "routers_loss": 0.006067665759474039, + "skip_count": 2.0, + "step": 6540, + "text_loss": 0.5021927356719971 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0167236328125, + "learning_rate": 0.000352721004453754, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 10549188.0, + "repeat_count": 0.0, + "routers_loss": 0.0019109295681118965, + "skip_count": 0.0, + "step": 6542, + "text_loss": 0.3008780777454376 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00035242524830545683, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10552298.0, + "repeat_count": 0.0, + "routers_loss": 0.007457790896296501, + "skip_count": 3.0, + "step": 6544, + "text_loss": 0.5675695538520813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0003521295487080829, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 10555123.0, + "repeat_count": 0.0, + "routers_loss": 0.007243642583489418, + "skip_count": 1.0, + "step": 6546, + "text_loss": 0.17955881357192993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00035183390577494476, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 10559653.0, + "repeat_count": 0.0, + "routers_loss": 0.004024330526590347, + "skip_count": 0.0, + "step": 6548, + "text_loss": 0.2634682357311249 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.017578125, + "learning_rate": 0.0003515383196193336, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10563770.0, + "repeat_count": 1.0, + "routers_loss": 0.010837121866643429, + "skip_count": 0.0, + "step": 6550, + "text_loss": 0.1608252227306366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0003512427903545183, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10567117.0, + "repeat_count": 0.0, + "routers_loss": 0.003473864868283272, + "skip_count": 0.0, + "step": 6552, + "text_loss": 0.231611430644989 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0003509473180937464, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10570622.0, + "repeat_count": 0.0, + "routers_loss": 0.004441239405423403, + "skip_count": 1.0, + "step": 6554, + "text_loss": 0.3193909227848053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0003506519029502433, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10573411.0, + "repeat_count": 0.0, + "routers_loss": 0.0008821079391054809, + "skip_count": 0.0, + "step": 6556, + "text_loss": 0.4478783905506134 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0003503565450372128, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 10576422.0, + "repeat_count": 1.0, + "routers_loss": 0.0014448441797867417, + "skip_count": 0.0, + "step": 6558, + "text_loss": 0.46065983176231384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0003500612444678365, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 10579879.0, + "repeat_count": 0.0, + "routers_loss": 0.007939066737890244, + "skip_count": 1.0, + "step": 6560, + "text_loss": 0.3299395740032196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000349766001355274, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 10583067.0, + "repeat_count": 0.0, + "routers_loss": 0.010073966346681118, + "skip_count": 2.0, + "step": 6562, + "text_loss": 0.278255820274353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.00034947081581266335, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 10586276.0, + "repeat_count": 0.0, + "routers_loss": 0.0062315030954778194, + "skip_count": 1.0, + "step": 6564, + "text_loss": 0.22706018388271332 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003491756879531201, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10589257.0, + "repeat_count": 3.0, + "routers_loss": 0.0023778853937983513, + "skip_count": 4.0, + "step": 6566, + "text_loss": 0.5567800998687744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0003488806178897377, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 10592163.0, + "repeat_count": 0.0, + "routers_loss": 0.0004184350254945457, + "skip_count": 0.0, + "step": 6568, + "text_loss": 0.4027897119522095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003485856057355876, + "loss": 0.0027, + "macro_f1": 0.6666666865348816, + "num_tokens": 10595326.0, + "repeat_count": 0.0, + "routers_loss": 0.0035254736430943012, + "skip_count": 1.0, + "step": 6570, + "text_loss": 0.3044572174549103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000348290651603719, + "loss": 0.0029, + "macro_f1": 0.3333333432674408, + "num_tokens": 10598236.0, + "repeat_count": 0.0, + "routers_loss": 0.0030894684605300426, + "skip_count": 0.0, + "step": 6572, + "text_loss": 0.23021161556243896 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.00034799575560715896, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 10601653.0, + "repeat_count": 1.0, + "routers_loss": 0.0036557347048074007, + "skip_count": 0.0, + "step": 6574, + "text_loss": 0.5437754392623901 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0003477009178589121, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10604581.0, + "repeat_count": 2.0, + "routers_loss": 0.021344119682908058, + "skip_count": 4.0, + "step": 6576, + "text_loss": 0.29078927636146545 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0003474061384719608, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10607676.0, + "repeat_count": 1.0, + "routers_loss": 0.0037169242277741432, + "skip_count": 1.0, + "step": 6578, + "text_loss": 1.1790896654129028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0003471114175592649, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 10611269.0, + "repeat_count": 2.0, + "routers_loss": 0.005873420741409063, + "skip_count": 4.0, + "step": 6580, + "text_loss": 0.36204129457473755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0003468167552337624, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 10614335.0, + "repeat_count": 1.0, + "routers_loss": 0.01030842587351799, + "skip_count": 2.0, + "step": 6582, + "text_loss": 0.20400437712669373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.00034652215160836826, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 10617565.0, + "repeat_count": 0.0, + "routers_loss": 0.0025721401907503605, + "skip_count": 0.0, + "step": 6584, + "text_loss": 0.44676345586776733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00034622760679597507, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10620706.0, + "repeat_count": 0.0, + "routers_loss": 0.005751762073487043, + "skip_count": 1.0, + "step": 6586, + "text_loss": 0.4733653664588928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00034593312090945306, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 10623916.0, + "repeat_count": 0.0, + "routers_loss": 0.0029759553726762533, + "skip_count": 3.0, + "step": 6588, + "text_loss": 0.49876922369003296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0003456386940616498, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10628093.0, + "repeat_count": 0.0, + "routers_loss": 0.0010031822603195906, + "skip_count": 0.0, + "step": 6590, + "text_loss": 0.42708611488342285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00034534432636539004, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10631739.0, + "repeat_count": 0.0, + "routers_loss": 0.0014793311711400747, + "skip_count": 0.0, + "step": 6592, + "text_loss": 0.18193726241588593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0003450500179334762, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10634862.0, + "repeat_count": 0.0, + "routers_loss": 0.0059733521193265915, + "skip_count": 2.0, + "step": 6594, + "text_loss": 0.28596529364585876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.967420017610802, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0003447557688786879, + "loss": 0.0043, + "macro_f1": 0.3272727429866791, + "num_tokens": 10637758.0, + "repeat_count": 0.0, + "routers_loss": 0.0076768649742007256, + "skip_count": 1.0, + "step": 6596, + "text_loss": 0.39428210258483887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00034446157931378185, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10640440.0, + "repeat_count": 0.0, + "routers_loss": 0.0015128811355680227, + "skip_count": 0.0, + "step": 6598, + "text_loss": 0.45584383606910706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.00034416744935149193, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 10643600.0, + "repeat_count": 0.0, + "routers_loss": 0.000757391273509711, + "skip_count": 0.0, + "step": 6600, + "text_loss": 0.503209114074707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0003438733791045294, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10646907.0, + "repeat_count": 0.0, + "routers_loss": 0.0025944956578314304, + "skip_count": 2.0, + "step": 6602, + "text_loss": 0.4370735287666321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00034357936868558255, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10649995.0, + "repeat_count": 0.0, + "routers_loss": 0.0006543452036567032, + "skip_count": 0.0, + "step": 6604, + "text_loss": 0.4125586748123169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00034328541820731663, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10653251.0, + "repeat_count": 0.0, + "routers_loss": 0.00027016724925488234, + "skip_count": 1.0, + "step": 6606, + "text_loss": 0.7309898734092712 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 31.023481068388612, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.020751953125, + "learning_rate": 0.00034299152778237413, + "loss": 0.0062, + "macro_f1": 0.8823530077934265, + "num_tokens": 10657229.0, + "repeat_count": 1.0, + "routers_loss": 0.01905548945069313, + "skip_count": 2.0, + "step": 6608, + "text_loss": 0.42367079854011536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0003426976975233744, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10660524.0, + "repeat_count": 0.0, + "routers_loss": 0.0004718089767266065, + "skip_count": 0.0, + "step": 6610, + "text_loss": 0.6613664627075195 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00034240392754291343, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 10663908.0, + "repeat_count": 1.0, + "routers_loss": 0.0027069442439824343, + "skip_count": 0.0, + "step": 6612, + "text_loss": 0.859471321105957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.000342110217953565, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10667814.0, + "repeat_count": 0.0, + "routers_loss": 0.0015497280983254313, + "skip_count": 0.0, + "step": 6614, + "text_loss": 0.18337638676166534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0003418165688678788, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10671630.0, + "repeat_count": 0.0, + "routers_loss": 0.0013396464055404067, + "skip_count": 0.0, + "step": 6616, + "text_loss": 0.860016405582428 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 31.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0003415229803983819, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 10675308.0, + "repeat_count": 0.0, + "routers_loss": 0.007542039267718792, + "skip_count": 3.0, + "step": 6618, + "text_loss": 0.15481022000312805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0003412294526575779, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 10678092.0, + "repeat_count": 0.0, + "routers_loss": 0.002029839437454939, + "skip_count": 2.0, + "step": 6620, + "text_loss": 0.5121933221817017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00034093598575794706, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10681382.0, + "repeat_count": 0.0, + "routers_loss": 0.0013001341139897704, + "skip_count": 0.0, + "step": 6622, + "text_loss": 0.4555061161518097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00034064257981194655, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 10684255.0, + "repeat_count": 0.0, + "routers_loss": 0.0007926415419206023, + "skip_count": 0.0, + "step": 6624, + "text_loss": 0.7298227548599243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003403492349320101, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 10686904.0, + "repeat_count": 0.0, + "routers_loss": 0.0021080176811665297, + "skip_count": 1.0, + "step": 6626, + "text_loss": 0.45434215664863586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.000340055951230548, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10690311.0, + "repeat_count": 0.0, + "routers_loss": 0.004011874087154865, + "skip_count": 0.0, + "step": 6628, + "text_loss": 0.15496443212032318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00033976272881994707, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10693395.0, + "repeat_count": 0.0, + "routers_loss": 0.0031893099658191204, + "skip_count": 2.0, + "step": 6630, + "text_loss": 0.5291517972946167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003394695678125708, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 10697046.0, + "repeat_count": 0.0, + "routers_loss": 0.0033124347683042288, + "skip_count": 1.0, + "step": 6632, + "text_loss": 0.2893230617046356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00033917646832075886, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10700111.0, + "repeat_count": 0.0, + "routers_loss": 0.002547801472246647, + "skip_count": 0.0, + "step": 6634, + "text_loss": 0.10363512486219406 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 31.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0003388834304568275, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 10703939.0, + "repeat_count": 2.0, + "routers_loss": 0.0019040531478822231, + "skip_count": 0.0, + "step": 6636, + "text_loss": 0.5185034275054932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00033859045433306975, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 10707187.0, + "repeat_count": 0.0, + "routers_loss": 0.0074104927480220795, + "skip_count": 2.0, + "step": 6638, + "text_loss": 0.1618153154850006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0003382975400617543, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 10710029.0, + "repeat_count": 0.0, + "routers_loss": 0.0013861875049769878, + "skip_count": 1.0, + "step": 6640, + "text_loss": 0.6674485206604004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0003380046877551266, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10713318.0, + "repeat_count": 0.0, + "routers_loss": 0.0034452753607183695, + "skip_count": 0.0, + "step": 6642, + "text_loss": 0.39299124479293823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0003377118975254082, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 10716130.0, + "repeat_count": 0.0, + "routers_loss": 0.006802885327488184, + "skip_count": 2.0, + "step": 6644, + "text_loss": 0.12942606210708618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.20193718814206, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003374191694847968, + "loss": 0.0052, + "macro_f1": 0.6601307392120361, + "num_tokens": 10719400.0, + "repeat_count": 1.0, + "routers_loss": 0.03718209266662598, + "skip_count": 2.0, + "step": 6646, + "text_loss": 0.34327754378318787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0003371265037454663, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 10722108.0, + "repeat_count": 0.0, + "routers_loss": 0.006016947794705629, + "skip_count": 2.0, + "step": 6648, + "text_loss": 0.15644726157188416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.220722042852948, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00033683390041956663, + "loss": 0.0075, + "macro_f1": 0.6601307392120361, + "num_tokens": 10725709.0, + "repeat_count": 1.0, + "routers_loss": 0.04308273270726204, + "skip_count": 2.0, + "step": 6650, + "text_loss": 0.1875772923231125 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 31.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003365413596192243, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 10728717.0, + "repeat_count": 2.0, + "routers_loss": 0.006372809875756502, + "skip_count": 1.0, + "step": 6652, + "text_loss": 0.4948291778564453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00033624888145654137, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10732082.0, + "repeat_count": 0.0, + "routers_loss": 0.0014530479675158858, + "skip_count": 0.0, + "step": 6654, + "text_loss": 0.44932305812835693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00033595646604359585, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10734663.0, + "repeat_count": 0.0, + "routers_loss": 0.001924810465425253, + "skip_count": 0.0, + "step": 6656, + "text_loss": 0.45626893639564514 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00033566411349244206, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10737470.0, + "repeat_count": 1.0, + "routers_loss": 0.0040014320984482765, + "skip_count": 0.0, + "step": 6658, + "text_loss": 0.2700682580471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00033537182391510996, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10740228.0, + "repeat_count": 0.0, + "routers_loss": 0.0008573737577535212, + "skip_count": 0.0, + "step": 6660, + "text_loss": 0.5626822113990784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003350795974236055, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 10742883.0, + "repeat_count": 0.0, + "routers_loss": 0.011166860349476337, + "skip_count": 1.0, + "step": 6662, + "text_loss": 0.23357805609703064 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 31.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00033478743412991037, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 10746459.0, + "repeat_count": 1.0, + "routers_loss": 0.01719980500638485, + "skip_count": 6.0, + "step": 6664, + "text_loss": 0.150017648935318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00033449533414598223, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 10749984.0, + "repeat_count": 0.0, + "routers_loss": 0.0038280142471194267, + "skip_count": 2.0, + "step": 6666, + "text_loss": 0.6312657594680786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00033420329758375423, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 10752792.0, + "repeat_count": 0.0, + "routers_loss": 0.0007688060286454856, + "skip_count": 1.0, + "step": 6668, + "text_loss": 0.6794863939285278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00033391132455513537, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10756125.0, + "repeat_count": 0.0, + "routers_loss": 0.003196930279955268, + "skip_count": 2.0, + "step": 6670, + "text_loss": 0.22897565364837646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0003336194151720102, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 10759296.0, + "repeat_count": 0.0, + "routers_loss": 0.0026212623342871666, + "skip_count": 0.0, + "step": 6672, + "text_loss": 0.5236268639564514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0003333275695462391, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10762574.0, + "repeat_count": 0.0, + "routers_loss": 0.007855101488530636, + "skip_count": 2.0, + "step": 6674, + "text_loss": 0.2971038818359375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0003330357877896577, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10765758.0, + "repeat_count": 0.0, + "routers_loss": 0.004191791173070669, + "skip_count": 2.0, + "step": 6676, + "text_loss": 0.17358586192131042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0003327440700140774, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 10769396.0, + "repeat_count": 0.0, + "routers_loss": 0.004101858474314213, + "skip_count": 1.0, + "step": 6678, + "text_loss": 0.28932204842567444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.000332452416331285, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 10772605.0, + "repeat_count": 0.0, + "routers_loss": 0.0008305918308906257, + "skip_count": 0.0, + "step": 6680, + "text_loss": 0.47090092301368713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0003321608268530427, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 10776576.0, + "repeat_count": 0.0, + "routers_loss": 0.003022305201739073, + "skip_count": 1.0, + "step": 6682, + "text_loss": 0.4467788338661194 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00033186930169108795, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10779648.0, + "repeat_count": 1.0, + "routers_loss": 0.0021474999375641346, + "skip_count": 0.0, + "step": 6684, + "text_loss": 0.6249470710754395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.00033157784095713417, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 10782665.0, + "repeat_count": 0.0, + "routers_loss": 0.0025120675563812256, + "skip_count": 1.0, + "step": 6686, + "text_loss": 0.6763803958892822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0003312864447628695, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 10785789.0, + "repeat_count": 0.0, + "routers_loss": 0.0013111691223457456, + "skip_count": 1.0, + "step": 6688, + "text_loss": 0.6609058380126953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00033099511321995744, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 10788846.0, + "repeat_count": 0.0, + "routers_loss": 0.0012354454956948757, + "skip_count": 0.0, + "step": 6690, + "text_loss": 0.4421829283237457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0003307038464400368, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10791611.0, + "repeat_count": 0.0, + "routers_loss": 0.0035219944547861814, + "skip_count": 2.0, + "step": 6692, + "text_loss": 0.16222824156284332 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00033041264453472153, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10794868.0, + "repeat_count": 1.0, + "routers_loss": 0.0007216202793642879, + "skip_count": 0.0, + "step": 6694, + "text_loss": 0.37388721108436584 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 31.436747872028178, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0003301215076156008, + "loss": 0.0063, + "macro_f1": 0.8803418874740601, + "num_tokens": 10797737.0, + "repeat_count": 2.0, + "routers_loss": 0.025403080508112907, + "skip_count": 7.0, + "step": 6696, + "text_loss": 0.5086690187454224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0003298304357942389, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 10800972.0, + "repeat_count": 0.0, + "routers_loss": 0.010532539337873459, + "skip_count": 2.0, + "step": 6698, + "text_loss": 0.22500646114349365 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00032953942918217494, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 10803654.0, + "repeat_count": 0.0, + "routers_loss": 0.0009591903653927147, + "skip_count": 0.0, + "step": 6700, + "text_loss": 0.6256277561187744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0003292484878909232, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10807506.0, + "repeat_count": 0.0, + "routers_loss": 0.003801517654210329, + "skip_count": 2.0, + "step": 6702, + "text_loss": 0.522081196308136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00032895761203197317, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 10810163.0, + "repeat_count": 0.0, + "routers_loss": 0.002608039416372776, + "skip_count": 2.0, + "step": 6704, + "text_loss": 0.3600201904773712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00032866680171678874, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10813202.0, + "repeat_count": 0.0, + "routers_loss": 0.0026464913971722126, + "skip_count": 0.0, + "step": 6706, + "text_loss": 0.2513798773288727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00032837605705680895, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 10816484.0, + "repeat_count": 0.0, + "routers_loss": 0.0027157769072800875, + "skip_count": 0.0, + "step": 6708, + "text_loss": 0.34391456842422485 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0003280853781634481, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 10819794.0, + "repeat_count": 1.0, + "routers_loss": 0.0016086180694401264, + "skip_count": 1.0, + "step": 6710, + "text_loss": 0.6535179615020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003277947651480946, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10823033.0, + "repeat_count": 0.0, + "routers_loss": 0.002368347719311714, + "skip_count": 0.0, + "step": 6712, + "text_loss": 0.5596423745155334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0003275042181221119, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 10826276.0, + "repeat_count": 0.0, + "routers_loss": 0.003124286886304617, + "skip_count": 0.0, + "step": 6714, + "text_loss": 0.6584402322769165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0003272137371968382, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10828846.0, + "repeat_count": 0.0, + "routers_loss": 0.0006088328082114458, + "skip_count": 0.0, + "step": 6716, + "text_loss": 0.4602710008621216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.00032692332248358645, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10832025.0, + "repeat_count": 0.0, + "routers_loss": 0.002511275466531515, + "skip_count": 2.0, + "step": 6718, + "text_loss": 0.42790886759757996 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.000326632974093644, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 10835110.0, + "repeat_count": 1.0, + "routers_loss": 0.01076667383313179, + "skip_count": 0.0, + "step": 6720, + "text_loss": 0.5659847855567932 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0003263426921382728, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 10838279.0, + "repeat_count": 2.0, + "routers_loss": 0.004973042290657759, + "skip_count": 2.0, + "step": 6722, + "text_loss": 0.675341010093689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.00032605247672870964, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 10841381.0, + "repeat_count": 0.0, + "routers_loss": 0.0013990222942084074, + "skip_count": 0.0, + "step": 6724, + "text_loss": 0.5389315485954285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00032576232797616554, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 10844583.0, + "repeat_count": 0.0, + "routers_loss": 0.003186358604580164, + "skip_count": 1.0, + "step": 6726, + "text_loss": 0.5603348016738892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003254722459918261, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 10847670.0, + "repeat_count": 0.0, + "routers_loss": 0.001443870598450303, + "skip_count": 0.0, + "step": 6728, + "text_loss": 0.6922405362129211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0003251822308868512, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 10851479.0, + "repeat_count": 0.0, + "routers_loss": 0.004294445738196373, + "skip_count": 0.0, + "step": 6730, + "text_loss": 0.7145437002182007 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.00032489228277237514, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10854489.0, + "repeat_count": 0.0, + "routers_loss": 0.0032078945077955723, + "skip_count": 0.0, + "step": 6732, + "text_loss": 0.4077773094177246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.00032460240175950664, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 10856954.0, + "repeat_count": 1.0, + "routers_loss": 0.0038214854430407286, + "skip_count": 2.0, + "step": 6734, + "text_loss": 0.32071781158447266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0003243125879593286, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10860016.0, + "repeat_count": 0.0, + "routers_loss": 0.0013407845981419086, + "skip_count": 0.0, + "step": 6736, + "text_loss": 0.45335495471954346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0003240228414828984, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 10863021.0, + "repeat_count": 0.0, + "routers_loss": 0.0010989385191351175, + "skip_count": 0.0, + "step": 6738, + "text_loss": 0.562619149684906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0003237331624412473, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 10866548.0, + "repeat_count": 0.0, + "routers_loss": 0.006139552686363459, + "skip_count": 0.0, + "step": 6740, + "text_loss": 0.14510060846805573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00032344355094538087, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10869402.0, + "repeat_count": 0.0, + "routers_loss": 0.004785746335983276, + "skip_count": 0.0, + "step": 6742, + "text_loss": 0.5655979514122009 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00032315400710627876, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 10874165.0, + "repeat_count": 0.0, + "routers_loss": 0.0052397786639630795, + "skip_count": 0.0, + "step": 6744, + "text_loss": 0.4785873591899872 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 31.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0003228645310348948, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 10876919.0, + "repeat_count": 3.0, + "routers_loss": 0.00460197776556015, + "skip_count": 1.0, + "step": 6746, + "text_loss": 0.5683879256248474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0003225751228421566, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10880179.0, + "repeat_count": 0.0, + "routers_loss": 0.0032690472435206175, + "skip_count": 0.0, + "step": 6748, + "text_loss": 0.5268497467041016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.00032228578263896607, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 10883711.0, + "repeat_count": 0.0, + "routers_loss": 0.0036305058747529984, + "skip_count": 0.0, + "step": 6750, + "text_loss": 0.16675594449043274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0003219965105361989, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10887041.0, + "repeat_count": 0.0, + "routers_loss": 0.002453352091833949, + "skip_count": 1.0, + "step": 6752, + "text_loss": 0.7010246515274048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.00032170730664470465, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10890053.0, + "repeat_count": 0.0, + "routers_loss": 0.0020381701178848743, + "skip_count": 0.0, + "step": 6754, + "text_loss": 0.46637895703315735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003214181710753069, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10893501.0, + "repeat_count": 0.0, + "routers_loss": 0.004525696858763695, + "skip_count": 0.0, + "step": 6756, + "text_loss": 0.1768684983253479 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003211291039388026, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10896480.0, + "repeat_count": 1.0, + "routers_loss": 0.0038154330104589462, + "skip_count": 0.0, + "step": 6758, + "text_loss": 0.7908347845077515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.00032084010534596326, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 10899158.0, + "repeat_count": 0.0, + "routers_loss": 0.004711449146270752, + "skip_count": 2.0, + "step": 6760, + "text_loss": 0.37209007143974304 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0003205511754075335, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 10901791.0, + "repeat_count": 1.0, + "routers_loss": 0.0025003373157233, + "skip_count": 1.0, + "step": 6762, + "text_loss": 0.8081201314926147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 31.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00032026231423423204, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10904817.0, + "repeat_count": 0.0, + "routers_loss": 0.007387075573205948, + "skip_count": 3.0, + "step": 6764, + "text_loss": 0.30355480313301086 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.76548282946874, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0003199735219367507, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 10908018.0, + "repeat_count": 2.0, + "routers_loss": 0.04275592789053917, + "skip_count": 0.0, + "step": 6766, + "text_loss": 0.26562029123306274 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.774875256824185, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0003196847986257553, + "loss": 0.008, + "macro_f1": 0.9255813956260681, + "num_tokens": 10911264.0, + "repeat_count": 3.0, + "routers_loss": 0.034824032336473465, + "skip_count": 4.0, + "step": 6768, + "text_loss": 0.2761698067188263 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00031939614441188523, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10915964.0, + "repeat_count": 0.0, + "routers_loss": 0.0011179742868989706, + "skip_count": 0.0, + "step": 6770, + "text_loss": 0.4107927083969116 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00031910755940575344, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 10918678.0, + "repeat_count": 0.0, + "routers_loss": 0.0011521469568833709, + "skip_count": 0.0, + "step": 6772, + "text_loss": 0.43064895272254944 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.000318819043717946, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10921757.0, + "repeat_count": 1.0, + "routers_loss": 0.002861087443307042, + "skip_count": 1.0, + "step": 6774, + "text_loss": 0.5945150852203369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0003185305974590229, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 10924767.0, + "repeat_count": 0.0, + "routers_loss": 0.0011365334503352642, + "skip_count": 0.0, + "step": 6776, + "text_loss": 0.36615172028541565 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0003182422207395171, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10927750.0, + "repeat_count": 1.0, + "routers_loss": 0.0034391419030725956, + "skip_count": 0.0, + "step": 6778, + "text_loss": 0.17081251740455627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003179539136699351, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10930817.0, + "repeat_count": 0.0, + "routers_loss": 0.004941808991134167, + "skip_count": 2.0, + "step": 6780, + "text_loss": 0.7683762311935425 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 31.840622248312297, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.038330078125, + "learning_rate": 0.00031766567636075675, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 10933882.0, + "repeat_count": 1.0, + "routers_loss": 0.017502857372164726, + "skip_count": 2.0, + "step": 6782, + "text_loss": 0.38010457158088684 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003173775089224353, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 10936909.0, + "repeat_count": 1.0, + "routers_loss": 0.0035372809506952763, + "skip_count": 2.0, + "step": 6784, + "text_loss": 0.5760656595230103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00031708941146539707, + "loss": 0.0061, + "macro_f1": 0.3272727429866791, + "num_tokens": 10940032.0, + "repeat_count": 1.0, + "routers_loss": 0.02229934185743332, + "skip_count": 0.0, + "step": 6786, + "text_loss": 0.5767728090286255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00031680138410004123, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 10943217.0, + "repeat_count": 0.0, + "routers_loss": 0.0028649091254919767, + "skip_count": 1.0, + "step": 6788, + "text_loss": 0.9756367802619934 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00031651342693674066, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 10947847.0, + "repeat_count": 0.0, + "routers_loss": 0.0039158593863248825, + "skip_count": 2.0, + "step": 6790, + "text_loss": 0.2504335045814514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.000316225540085841, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 10950879.0, + "repeat_count": 0.0, + "routers_loss": 0.0022091215942054987, + "skip_count": 0.0, + "step": 6792, + "text_loss": 0.525842547416687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00031593772365766105, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 10954960.0, + "repeat_count": 0.0, + "routers_loss": 0.0006841494468972087, + "skip_count": 0.0, + "step": 6794, + "text_loss": 0.6383582353591919 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.906369239800412, + "f1_execute": 0.9729729890823364, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003156499777624926, + "loss": 0.006, + "macro_f1": 0.9539539813995361, + "num_tokens": 10958278.0, + "repeat_count": 5.0, + "routers_loss": 0.03810702636837959, + "skip_count": 5.0, + "step": 6796, + "text_loss": 0.5901661515235901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.0003153623025106005, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 10962412.0, + "repeat_count": 0.0, + "routers_loss": 0.00046833412488922477, + "skip_count": 0.0, + "step": 6798, + "text_loss": 0.42693984508514404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00031507469801222233, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 10966037.0, + "repeat_count": 0.0, + "routers_loss": 0.006818041671067476, + "skip_count": 2.0, + "step": 6800, + "text_loss": 0.5326262712478638 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00031478716437756876, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10969369.0, + "repeat_count": 0.0, + "routers_loss": 0.0029889161232858896, + "skip_count": 0.0, + "step": 6802, + "text_loss": 0.49028220772743225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0003144997017168232, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10972016.0, + "repeat_count": 0.0, + "routers_loss": 0.0038266500923782587, + "skip_count": 2.0, + "step": 6804, + "text_loss": 0.43391722440719604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0003142123101401417, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10975153.0, + "repeat_count": 0.0, + "routers_loss": 0.0005866789724677801, + "skip_count": 0.0, + "step": 6806, + "text_loss": 0.5888382196426392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00031392498975765353, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 10977881.0, + "repeat_count": 0.0, + "routers_loss": 0.002122384263202548, + "skip_count": 0.0, + "step": 6808, + "text_loss": 0.30313390493392944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0003136377406794604, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 10982025.0, + "repeat_count": 0.0, + "routers_loss": 0.0005535652744583786, + "skip_count": 0.0, + "step": 6810, + "text_loss": 0.5788959264755249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003133505630156365, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 10985419.0, + "repeat_count": 0.0, + "routers_loss": 0.010623604990541935, + "skip_count": 2.0, + "step": 6812, + "text_loss": 0.18577243387699127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.00031306345687622905, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10989116.0, + "repeat_count": 0.0, + "routers_loss": 0.0004721239674836397, + "skip_count": 0.0, + "step": 6814, + "text_loss": 0.4818301200866699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0167236328125, + "learning_rate": 0.0003127764223712575, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10992064.0, + "repeat_count": 0.0, + "routers_loss": 0.0004238430701661855, + "skip_count": 0.0, + "step": 6816, + "text_loss": 0.7482771277427673 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003124894596107141, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10994903.0, + "repeat_count": 1.0, + "routers_loss": 0.005224394146353006, + "skip_count": 2.0, + "step": 6818, + "text_loss": 0.186603844165802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.00031220256870456356, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 10998692.0, + "repeat_count": 1.0, + "routers_loss": 0.0021751862950623035, + "skip_count": 2.0, + "step": 6820, + "text_loss": 0.45633986592292786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 32.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00031191574976274284, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11001284.0, + "repeat_count": 0.0, + "routers_loss": 0.004747046157717705, + "skip_count": 4.0, + "step": 6822, + "text_loss": 0.5651670694351196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003116290028951617, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 11004293.0, + "repeat_count": 0.0, + "routers_loss": 0.0008316585444845259, + "skip_count": 0.0, + "step": 6824, + "text_loss": 0.3167279362678528 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.000311342328211702, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11007080.0, + "repeat_count": 0.0, + "routers_loss": 0.0004732926026917994, + "skip_count": 0.0, + "step": 6826, + "text_loss": 0.49171411991119385 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000311055725822218, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 11010078.0, + "repeat_count": 1.0, + "routers_loss": 0.004238729365170002, + "skip_count": 0.0, + "step": 6828, + "text_loss": 0.21484950184822083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0003107691958365361, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 11013368.0, + "repeat_count": 0.0, + "routers_loss": 0.0029175232630223036, + "skip_count": 2.0, + "step": 6830, + "text_loss": 0.3718266189098358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003104827383644555, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11016704.0, + "repeat_count": 0.0, + "routers_loss": 0.00191891985014081, + "skip_count": 0.0, + "step": 6832, + "text_loss": 0.28772637248039246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00031019635351574705, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 11019651.0, + "repeat_count": 0.0, + "routers_loss": 0.004300855100154877, + "skip_count": 2.0, + "step": 6834, + "text_loss": 0.6583508849143982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.000309910041400154, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11023847.0, + "repeat_count": 0.0, + "routers_loss": 0.00037701442488469183, + "skip_count": 0.0, + "step": 6836, + "text_loss": 0.36090534925460815 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 32.10331670090989, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0003096238021273917, + "loss": 0.0077, + "macro_f1": 0.9265305995941162, + "num_tokens": 11027804.0, + "repeat_count": 1.0, + "routers_loss": 0.03601725772023201, + "skip_count": 3.0, + "step": 6838, + "text_loss": 0.24180401861667633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.11270912826534, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00030933763580714757, + "loss": 0.0052, + "macro_f1": 0.6601307392120361, + "num_tokens": 11030778.0, + "repeat_count": 1.0, + "routers_loss": 0.023780640214681625, + "skip_count": 2.0, + "step": 6840, + "text_loss": 0.4978102743625641 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00030905154254908104, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 11034863.0, + "repeat_count": 1.0, + "routers_loss": 0.00565778324380517, + "skip_count": 0.0, + "step": 6842, + "text_loss": 0.558772623538971 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00030876552246282356, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11038488.0, + "repeat_count": 0.0, + "routers_loss": 0.010575232096016407, + "skip_count": 0.0, + "step": 6844, + "text_loss": 0.2955974340438843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0003084795756579787, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11041796.0, + "repeat_count": 0.0, + "routers_loss": 0.0015910190995782614, + "skip_count": 0.0, + "step": 6846, + "text_loss": 0.5009704828262329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0003081937022441217, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 11045141.0, + "repeat_count": 0.0, + "routers_loss": 0.0008034126949496567, + "skip_count": 0.0, + "step": 6848, + "text_loss": 0.3965311646461487 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 32.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0003079079023307999, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11047814.0, + "repeat_count": 2.0, + "routers_loss": 0.00810160581022501, + "skip_count": 0.0, + "step": 6850, + "text_loss": 0.24341927468776703 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0003076221760275321, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 11051330.0, + "repeat_count": 1.0, + "routers_loss": 0.006590691395103931, + "skip_count": 0.0, + "step": 6852, + "text_loss": 0.5887606739997864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00030733652344380936, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11055006.0, + "repeat_count": 0.0, + "routers_loss": 0.0005845054984092712, + "skip_count": 0.0, + "step": 6854, + "text_loss": 0.6621366739273071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003070509446890944, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 11058470.0, + "repeat_count": 0.0, + "routers_loss": 0.0041051446460187435, + "skip_count": 1.0, + "step": 6856, + "text_loss": 0.31603100895881653 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0003067654398728214, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 11061620.0, + "repeat_count": 1.0, + "routers_loss": 0.001603201380930841, + "skip_count": 0.0, + "step": 6858, + "text_loss": 0.5167516469955444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.00030648000910439636, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11064727.0, + "repeat_count": 0.0, + "routers_loss": 0.0024816282093524933, + "skip_count": 0.0, + "step": 6860, + "text_loss": 0.5869330167770386 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00030619465249319693, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11068208.0, + "repeat_count": 1.0, + "routers_loss": 0.003121294779703021, + "skip_count": 0.0, + "step": 6862, + "text_loss": 0.3920222818851471 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0003059093701485722, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11071315.0, + "repeat_count": 0.0, + "routers_loss": 0.0033239589538425207, + "skip_count": 1.0, + "step": 6864, + "text_loss": 0.4201887845993042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00030562416217984296, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 11074144.0, + "repeat_count": 0.0, + "routers_loss": 0.0016117560444399714, + "skip_count": 0.0, + "step": 6866, + "text_loss": 0.5283045172691345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0003053390286963015, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11077152.0, + "repeat_count": 0.0, + "routers_loss": 0.003879208816215396, + "skip_count": 0.0, + "step": 6868, + "text_loss": 0.16188788414001465 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00030505396980721143, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 11080200.0, + "repeat_count": 0.0, + "routers_loss": 0.007632353343069553, + "skip_count": 1.0, + "step": 6870, + "text_loss": 0.25986847281455994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00030476898562180793, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 11083356.0, + "repeat_count": 0.0, + "routers_loss": 0.004322016146034002, + "skip_count": 2.0, + "step": 6872, + "text_loss": 0.49556297063827515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0003044840762492974, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 11086354.0, + "repeat_count": 0.0, + "routers_loss": 0.0031272871419787407, + "skip_count": 2.0, + "step": 6874, + "text_loss": 0.1658666580915451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003041992417988577, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11088850.0, + "repeat_count": 0.0, + "routers_loss": 0.005371398758143187, + "skip_count": 2.0, + "step": 6876, + "text_loss": 0.22437214851379395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003039144823796378, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 11091784.0, + "repeat_count": 0.0, + "routers_loss": 0.0025086402893066406, + "skip_count": 0.0, + "step": 6878, + "text_loss": 0.7293354868888855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0003036297981007581, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11095204.0, + "repeat_count": 0.0, + "routers_loss": 0.015590827912092209, + "skip_count": 1.0, + "step": 6880, + "text_loss": 0.6406328678131104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0003033451890713103, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11098367.0, + "repeat_count": 0.0, + "routers_loss": 0.0013142531970515847, + "skip_count": 0.0, + "step": 6882, + "text_loss": 0.5209086537361145 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 32.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003030606554003571, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 11101047.0, + "repeat_count": 2.0, + "routers_loss": 0.0018484699539840221, + "skip_count": 0.0, + "step": 6884, + "text_loss": 0.743188202381134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00030277619719693217, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11104269.0, + "repeat_count": 0.0, + "routers_loss": 0.0016667681047692895, + "skip_count": 0.0, + "step": 6886, + "text_loss": 0.7918420433998108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0003024918145700406, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 11107248.0, + "repeat_count": 0.0, + "routers_loss": 0.0008098077378235757, + "skip_count": 0.0, + "step": 6888, + "text_loss": 0.3871288299560547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0003022075076286582, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 11111204.0, + "repeat_count": 0.0, + "routers_loss": 0.002324736909940839, + "skip_count": 0.0, + "step": 6890, + "text_loss": 0.3722921907901764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003019232764817321, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 11114363.0, + "repeat_count": 0.0, + "routers_loss": 0.00254769716411829, + "skip_count": 0.0, + "step": 6892, + "text_loss": 0.418519526720047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00030163912123818006, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11117718.0, + "repeat_count": 0.0, + "routers_loss": 0.000547234492842108, + "skip_count": 0.0, + "step": 6894, + "text_loss": 0.6087009310722351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0003013550420068909, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 11120437.0, + "repeat_count": 0.0, + "routers_loss": 0.00015221568173728883, + "skip_count": 0.0, + "step": 6896, + "text_loss": 0.6013991832733154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 32.385089521573235, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.046142578125, + "learning_rate": 0.00030107103889672436, + "loss": 0.0085, + "macro_f1": 0.5492662787437439, + "num_tokens": 11123708.0, + "repeat_count": 0.0, + "routers_loss": 0.024048971012234688, + "skip_count": 2.0, + "step": 6898, + "text_loss": 0.3612423837184906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0003007871120165111, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 11127294.0, + "repeat_count": 0.0, + "routers_loss": 0.0013236473314464092, + "skip_count": 0.0, + "step": 6900, + "text_loss": 0.5277031064033508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00030050326147505226, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11130270.0, + "repeat_count": 0.0, + "routers_loss": 0.0028277861420065165, + "skip_count": 0.0, + "step": 6902, + "text_loss": 0.5726971626281738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0003002194873811197, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11132955.0, + "repeat_count": 0.0, + "routers_loss": 0.0022369837388396263, + "skip_count": 0.0, + "step": 6904, + "text_loss": 0.18510448932647705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00029993578984345673, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 11136387.0, + "repeat_count": 0.0, + "routers_loss": 0.0038351211696863174, + "skip_count": 0.0, + "step": 6906, + "text_loss": 0.28313153982162476 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0002996521689707764, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 11139740.0, + "repeat_count": 0.0, + "routers_loss": 0.00032925375853665173, + "skip_count": 0.0, + "step": 6908, + "text_loss": 0.7315025329589844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0002993686248717629, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 11142587.0, + "repeat_count": 0.0, + "routers_loss": 0.002886304398998618, + "skip_count": 0.0, + "step": 6910, + "text_loss": 0.677378237247467 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00029908515765507084, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 11145415.0, + "repeat_count": 1.0, + "routers_loss": 0.0038471966981887817, + "skip_count": 0.0, + "step": 6912, + "text_loss": 0.5207083225250244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0002988017674293254, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 11148524.0, + "repeat_count": 0.0, + "routers_loss": 0.0023522782139480114, + "skip_count": 0.0, + "step": 6914, + "text_loss": 0.42507871985435486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0002985184543031222, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11152069.0, + "repeat_count": 0.0, + "routers_loss": 0.0012464249739423394, + "skip_count": 0.0, + "step": 6916, + "text_loss": 0.5694169998168945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.0002982352183850274, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 11155675.0, + "repeat_count": 0.0, + "routers_loss": 0.00828156154602766, + "skip_count": 2.0, + "step": 6918, + "text_loss": 0.22304373979568481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.00029795205978357754, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11158555.0, + "repeat_count": 0.0, + "routers_loss": 0.0019234733190387487, + "skip_count": 0.0, + "step": 6920, + "text_loss": 0.5519064664840698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0002976689786072795, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11161407.0, + "repeat_count": 0.0, + "routers_loss": 0.0003542431222740561, + "skip_count": 0.0, + "step": 6922, + "text_loss": 0.6748810410499573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0002973859749646104, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11166007.0, + "repeat_count": 0.0, + "routers_loss": 0.0004024899681098759, + "skip_count": 0.0, + "step": 6924, + "text_loss": 0.6613664627075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 32.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.000297103048964018, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 11169007.0, + "repeat_count": 0.0, + "routers_loss": 0.005519595462828875, + "skip_count": 3.0, + "step": 6926, + "text_loss": 0.3815552592277527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00029682020071392, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11172939.0, + "repeat_count": 0.0, + "routers_loss": 0.0016999440267682076, + "skip_count": 0.0, + "step": 6928, + "text_loss": 0.6727893352508545 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.535368359260346, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0002965374303227044, + "loss": 0.0055, + "macro_f1": 0.5492662787437439, + "num_tokens": 11176232.0, + "repeat_count": 2.0, + "routers_loss": 0.030950307846069336, + "skip_count": 0.0, + "step": 6930, + "text_loss": 0.5577763915061951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00029625473789872923, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11179775.0, + "repeat_count": 0.0, + "routers_loss": 0.00525702815502882, + "skip_count": 1.0, + "step": 6932, + "text_loss": 0.5860039591789246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.000295972123550323, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 11183262.0, + "repeat_count": 1.0, + "routers_loss": 0.0048187971115112305, + "skip_count": 2.0, + "step": 6934, + "text_loss": 0.7328732013702393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.016357421875, + "learning_rate": 0.00029568958738578364, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11186591.0, + "repeat_count": 0.0, + "routers_loss": 0.0015159632312133908, + "skip_count": 0.0, + "step": 6936, + "text_loss": 0.40563541650772095 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 32.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.017333984375, + "learning_rate": 0.0002954071295133801, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 11190056.0, + "repeat_count": 1.0, + "routers_loss": 0.011282073333859444, + "skip_count": 1.0, + "step": 6938, + "text_loss": 0.15986496210098267 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0002951247500413504, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 11193504.0, + "repeat_count": 3.0, + "routers_loss": 0.010220487602055073, + "skip_count": 5.0, + "step": 6940, + "text_loss": 0.2604432702064514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0002948424490779029, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 11196725.0, + "repeat_count": 0.0, + "routers_loss": 0.002620660001412034, + "skip_count": 1.0, + "step": 6942, + "text_loss": 0.48028868436813354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00029456022673121597, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11199303.0, + "repeat_count": 0.0, + "routers_loss": 0.00042651945841498673, + "skip_count": 0.0, + "step": 6944, + "text_loss": 0.5135554671287537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0002942780831094377, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11202319.0, + "repeat_count": 0.0, + "routers_loss": 0.005366047378629446, + "skip_count": 2.0, + "step": 6946, + "text_loss": 0.2809196710586548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0002939960183206861, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 11205622.0, + "repeat_count": 0.0, + "routers_loss": 0.0033479216508567333, + "skip_count": 0.0, + "step": 6948, + "text_loss": 0.2013140618801117 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00029371403247304887, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 11208637.0, + "repeat_count": 1.0, + "routers_loss": 0.0013508419506251812, + "skip_count": 0.0, + "step": 6950, + "text_loss": 0.4427332580089569 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0002934321256745833, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11211618.0, + "repeat_count": 0.0, + "routers_loss": 0.0020944071002304554, + "skip_count": 0.0, + "step": 6952, + "text_loss": 0.5406652688980103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00029315029803331704, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11214432.0, + "repeat_count": 0.0, + "routers_loss": 0.0012655078899115324, + "skip_count": 0.0, + "step": 6954, + "text_loss": 0.7720552086830139 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.00029286854965724686, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 11218127.0, + "repeat_count": 0.0, + "routers_loss": 0.009041395038366318, + "skip_count": 0.0, + "step": 6956, + "text_loss": 0.258109986782074 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 32.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0002925868806543391, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 11221440.0, + "repeat_count": 1.0, + "routers_loss": 0.0034558263141661882, + "skip_count": 1.0, + "step": 6958, + "text_loss": 0.5378029942512512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00029230529113253, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 11225391.0, + "repeat_count": 0.0, + "routers_loss": 0.005263930186629295, + "skip_count": 2.0, + "step": 6960, + "text_loss": 0.3616539537906647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0002920237811997251, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 11228648.0, + "repeat_count": 0.0, + "routers_loss": 0.003730480559170246, + "skip_count": 1.0, + "step": 6962, + "text_loss": 0.46682238578796387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.00029174235096379963, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11231828.0, + "repeat_count": 0.0, + "routers_loss": 0.004831735976040363, + "skip_count": 1.0, + "step": 6964, + "text_loss": 0.5718355178833008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 32.70443205165835, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.046875, + "learning_rate": 0.0002914610005325981, + "loss": 0.0102, + "macro_f1": 0.5492662787437439, + "num_tokens": 11234984.0, + "repeat_count": 0.0, + "routers_loss": 0.03880132734775543, + "skip_count": 2.0, + "step": 6966, + "text_loss": 0.3139013946056366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0002911797300139345, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 11239153.0, + "repeat_count": 0.0, + "routers_loss": 0.0006673726020380855, + "skip_count": 0.0, + "step": 6968, + "text_loss": 0.6040399074554443 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00029089853951559235, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11242178.0, + "repeat_count": 1.0, + "routers_loss": 0.0028971200808882713, + "skip_count": 0.0, + "step": 6970, + "text_loss": 0.304967999458313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00029061742914532427, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11245865.0, + "repeat_count": 0.0, + "routers_loss": 0.0010410466929897666, + "skip_count": 0.0, + "step": 6972, + "text_loss": 0.47892290353775024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0002903363990108524, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11248806.0, + "repeat_count": 0.0, + "routers_loss": 0.002133697969838977, + "skip_count": 0.0, + "step": 6974, + "text_loss": 0.2561415433883667 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 32.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0002900554492198677, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 11251807.0, + "repeat_count": 2.0, + "routers_loss": 0.002402493730187416, + "skip_count": 0.0, + "step": 6976, + "text_loss": 0.652428388595581 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0002897745798800311, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 11254615.0, + "repeat_count": 1.0, + "routers_loss": 0.006423915736377239, + "skip_count": 0.0, + "step": 6978, + "text_loss": 0.22414511442184448 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.000289493791098972, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 11257721.0, + "repeat_count": 0.0, + "routers_loss": 0.002536606043577194, + "skip_count": 0.0, + "step": 6980, + "text_loss": 0.1328018754720688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00028921308298428933, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11260840.0, + "repeat_count": 0.0, + "routers_loss": 0.000745086173992604, + "skip_count": 0.0, + "step": 6982, + "text_loss": 0.61724853515625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0002889324556435509, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 11264279.0, + "repeat_count": 0.0, + "routers_loss": 0.005258981604129076, + "skip_count": 0.0, + "step": 6984, + "text_loss": 0.1664455235004425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00028865190918429356, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 11268096.0, + "repeat_count": 0.0, + "routers_loss": 0.0008756023598834872, + "skip_count": 0.0, + "step": 6986, + "text_loss": 0.45111921429634094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.00028837144371402336, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11270611.0, + "repeat_count": 0.0, + "routers_loss": 0.0008175788098014891, + "skip_count": 0.0, + "step": 6988, + "text_loss": 0.5332239270210266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00028809105934021517, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11273826.0, + "repeat_count": 0.0, + "routers_loss": 0.003494064789265394, + "skip_count": 0.0, + "step": 6990, + "text_loss": 0.20264241099357605 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.82653360727913, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0002878107561703127, + "loss": 0.0056, + "macro_f1": 0.8817967176437378, + "num_tokens": 11276917.0, + "repeat_count": 2.0, + "routers_loss": 0.025257345288991928, + "skip_count": 3.0, + "step": 6992, + "text_loss": 0.18000070750713348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.835926034634575, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0002875305343117289, + "loss": 0.0044, + "macro_f1": 0.6603773832321167, + "num_tokens": 11279637.0, + "repeat_count": 1.0, + "routers_loss": 0.019206687808036804, + "skip_count": 1.0, + "step": 6994, + "text_loss": 0.5872798562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00028725039387184504, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11282717.0, + "repeat_count": 0.0, + "routers_loss": 0.009358765557408333, + "skip_count": 1.0, + "step": 6996, + "text_loss": 0.3412095904350281 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 32.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00028697033495801163, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 11285433.0, + "repeat_count": 1.0, + "routers_loss": 0.0038775671273469925, + "skip_count": 1.0, + "step": 6998, + "text_loss": 0.4316727817058563 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0002866903576775475, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11288414.0, + "repeat_count": 1.0, + "routers_loss": 0.004292591474950314, + "skip_count": 0.0, + "step": 7000, + "text_loss": 0.45106515288352966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.873495744056356, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046875, + "learning_rate": 0.0002864104621377409, + "loss": 0.007, + "macro_f1": 0.6601307392120361, + "num_tokens": 11291811.0, + "repeat_count": 1.0, + "routers_loss": 0.02195967361330986, + "skip_count": 2.0, + "step": 7002, + "text_loss": 0.29841285943984985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0002861306484458481, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11295179.0, + "repeat_count": 0.0, + "routers_loss": 0.0010119527578353882, + "skip_count": 0.0, + "step": 7004, + "text_loss": 0.5218569040298462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00028585091670909436, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11298182.0, + "repeat_count": 0.0, + "routers_loss": 0.002615996403619647, + "skip_count": 0.0, + "step": 7006, + "text_loss": 0.20382621884346008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00028557126703467316, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 11301262.0, + "repeat_count": 0.0, + "routers_loss": 0.002726050792261958, + "skip_count": 0.0, + "step": 7008, + "text_loss": 0.26718559861183167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0002852916995297471, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 11304590.0, + "repeat_count": 0.0, + "routers_loss": 0.0005590448854491115, + "skip_count": 0.0, + "step": 7010, + "text_loss": 0.5392091274261475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00028501221430144667, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11307690.0, + "repeat_count": 0.0, + "routers_loss": 0.004541353322565556, + "skip_count": 2.0, + "step": 7012, + "text_loss": 0.16159705817699432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.00028473281145687137, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11310866.0, + "repeat_count": 0.0, + "routers_loss": 0.0029630991630256176, + "skip_count": 1.0, + "step": 7014, + "text_loss": 0.9148072600364685 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 32.93924273554447, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0302734375, + "learning_rate": 0.0002844534911030888, + "loss": 0.0067, + "macro_f1": 0.9262410998344421, + "num_tokens": 11314517.0, + "repeat_count": 2.0, + "routers_loss": 0.023258809000253677, + "skip_count": 3.0, + "step": 7016, + "text_loss": 0.3853590488433838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.94863516289991, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060546875, + "learning_rate": 0.000284174253347135, + "loss": 0.0064, + "macro_f1": 0.3272727429866791, + "num_tokens": 11317526.0, + "repeat_count": 0.0, + "routers_loss": 0.010060093365609646, + "skip_count": 1.0, + "step": 7018, + "text_loss": 0.3412325382232666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00028389509829601444, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 11321684.0, + "repeat_count": 0.0, + "routers_loss": 0.0016713893273845315, + "skip_count": 0.0, + "step": 7020, + "text_loss": 0.9049796462059021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00028361602605670003, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 11324709.0, + "repeat_count": 0.0, + "routers_loss": 0.004167001228779554, + "skip_count": 2.0, + "step": 7022, + "text_loss": 0.24364058673381805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 32.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00028333703673613224, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 11327449.0, + "repeat_count": 0.0, + "routers_loss": 0.0027954576071351767, + "skip_count": 4.0, + "step": 7024, + "text_loss": 0.2872125506401062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00028305813044122096, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11330846.0, + "repeat_count": 0.0, + "routers_loss": 0.004644687287509441, + "skip_count": 0.0, + "step": 7026, + "text_loss": 0.1717570424079895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.00028277930727884336, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11333575.0, + "repeat_count": 0.0, + "routers_loss": 0.00557848671451211, + "skip_count": 2.0, + "step": 7028, + "text_loss": 0.3501792550086975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00028250056735584496, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11336899.0, + "repeat_count": 0.0, + "routers_loss": 0.0005694970604963601, + "skip_count": 0.0, + "step": 7030, + "text_loss": 0.5541794300079346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00028222191077903946, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11340163.0, + "repeat_count": 0.0, + "routers_loss": 0.0032896639313548803, + "skip_count": 0.0, + "step": 7032, + "text_loss": 0.5618721842765808 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00028194333765520853, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11343494.0, + "repeat_count": 1.0, + "routers_loss": 0.005377276800572872, + "skip_count": 0.0, + "step": 7034, + "text_loss": 0.325153648853302 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.00028166484809110206, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11346126.0, + "repeat_count": 0.0, + "routers_loss": 0.001204605447128415, + "skip_count": 0.0, + "step": 7036, + "text_loss": 0.5016651749610901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.00028138644219343736, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 11348879.0, + "repeat_count": 0.0, + "routers_loss": 0.005026837810873985, + "skip_count": 2.0, + "step": 7038, + "text_loss": 0.2430499643087387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00028110812006890064, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 11352457.0, + "repeat_count": 0.0, + "routers_loss": 0.0019850607495754957, + "skip_count": 0.0, + "step": 7040, + "text_loss": 0.42376917600631714 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.00028082988182414524, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 11356602.0, + "repeat_count": 1.0, + "routers_loss": 0.003362950636073947, + "skip_count": 2.0, + "step": 7042, + "text_loss": 0.4165397882461548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0002805517275657926, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 11359451.0, + "repeat_count": 0.0, + "routers_loss": 0.0019725612364709377, + "skip_count": 1.0, + "step": 7044, + "text_loss": 0.5597621202468872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0002802736574004319, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 11363614.0, + "repeat_count": 0.0, + "routers_loss": 0.0013963640667498112, + "skip_count": 0.0, + "step": 7046, + "text_loss": 0.6112356185913086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00027999567143462015, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11367015.0, + "repeat_count": 0.0, + "routers_loss": 0.0005658161826431751, + "skip_count": 0.0, + "step": 7048, + "text_loss": 0.4920886754989624 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 33.09862048723217, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00027971776977488193, + "loss": 0.0064, + "macro_f1": 0.925203263759613, + "num_tokens": 11370489.0, + "repeat_count": 3.0, + "routers_loss": 0.03657131269574165, + "skip_count": 5.0, + "step": 7050, + "text_loss": 0.28003939986228943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.00027943995252771017, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 11373614.0, + "repeat_count": 0.0, + "routers_loss": 0.004096088465303183, + "skip_count": 2.0, + "step": 7052, + "text_loss": 0.3145081400871277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00027916221979956457, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 11377631.0, + "repeat_count": 0.0, + "routers_loss": 0.0009888096246868372, + "skip_count": 0.0, + "step": 7054, + "text_loss": 0.4898056983947754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.126797769298506, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00027888457169687297, + "loss": 0.0065, + "macro_f1": 0.6603773832321167, + "num_tokens": 11380620.0, + "repeat_count": 1.0, + "routers_loss": 0.013347696512937546, + "skip_count": 1.0, + "step": 7056, + "text_loss": 0.7011964917182922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.00027860700832603056, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11383297.0, + "repeat_count": 0.0, + "routers_loss": 0.000849733711220324, + "skip_count": 1.0, + "step": 7058, + "text_loss": 0.4007014334201813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0002783295297934003, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 11386460.0, + "repeat_count": 0.0, + "routers_loss": 0.001546313869766891, + "skip_count": 1.0, + "step": 7060, + "text_loss": 0.3992713689804077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0002780521362053123, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11389605.0, + "repeat_count": 0.0, + "routers_loss": 0.001045585609972477, + "skip_count": 0.0, + "step": 7062, + "text_loss": 0.4440680146217346 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.00027777482766806446, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 11392105.0, + "repeat_count": 1.0, + "routers_loss": 0.00752411549910903, + "skip_count": 0.0, + "step": 7064, + "text_loss": 0.20152349770069122 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 33.17375990607572, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.031982421875, + "learning_rate": 0.0002774976042879218, + "loss": 0.0088, + "macro_f1": 0.5934640765190125, + "num_tokens": 11396142.0, + "repeat_count": 0.0, + "routers_loss": 0.019917849451303482, + "skip_count": 3.0, + "step": 7066, + "text_loss": 0.24365149438381195 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.00027722046617111696, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 11398827.0, + "repeat_count": 1.0, + "routers_loss": 0.0015933843096718192, + "skip_count": 0.0, + "step": 7068, + "text_loss": 0.31948477029800415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00027694341342384977, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11402623.0, + "repeat_count": 0.0, + "routers_loss": 0.0018986845389008522, + "skip_count": 2.0, + "step": 7070, + "text_loss": 0.47721394896507263 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00027666644615228727, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 11405628.0, + "repeat_count": 0.0, + "routers_loss": 0.002975719515234232, + "skip_count": 1.0, + "step": 7072, + "text_loss": 0.3972358703613281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0002763895644625637, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 11409468.0, + "repeat_count": 0.0, + "routers_loss": 0.005657708737999201, + "skip_count": 1.0, + "step": 7074, + "text_loss": 0.6004229187965393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0002761127684607811, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 11412572.0, + "repeat_count": 0.0, + "routers_loss": 0.0038351903203874826, + "skip_count": 2.0, + "step": 7076, + "text_loss": 1.0837591886520386 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.00027583605825300795, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 11416831.0, + "repeat_count": 2.0, + "routers_loss": 0.005529445596039295, + "skip_count": 2.0, + "step": 7078, + "text_loss": 0.575986921787262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00027555943394528014, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11420557.0, + "repeat_count": 0.0, + "routers_loss": 0.006243749521672726, + "skip_count": 0.0, + "step": 7080, + "text_loss": 0.606263279914856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.248899324919286, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00027528289564360064, + "loss": 0.0058, + "macro_f1": 0.6603773832321167, + "num_tokens": 11423471.0, + "repeat_count": 1.0, + "routers_loss": 0.031515009701251984, + "skip_count": 1.0, + "step": 7082, + "text_loss": 0.19393208622932434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0002750064434539394, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11426732.0, + "repeat_count": 0.0, + "routers_loss": 0.0005052287015132606, + "skip_count": 0.0, + "step": 7084, + "text_loss": 0.7202399969100952 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00027473007748223357, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11429391.0, + "repeat_count": 0.0, + "routers_loss": 0.005099403206259012, + "skip_count": 1.0, + "step": 7086, + "text_loss": 0.20651355385780334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00027445379783438685, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11432161.0, + "repeat_count": 0.0, + "routers_loss": 0.001447655027732253, + "skip_count": 0.0, + "step": 7088, + "text_loss": 0.34758952260017395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00027417760461627037, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11435417.0, + "repeat_count": 0.0, + "routers_loss": 0.000808655982837081, + "skip_count": 0.0, + "step": 7090, + "text_loss": 0.7414838671684265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00027390149793372177, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 11438313.0, + "repeat_count": 0.0, + "routers_loss": 0.005151710007339716, + "skip_count": 0.0, + "step": 7092, + "text_loss": 0.17792417109012604 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.00027362547789254574, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 11441681.0, + "repeat_count": 1.0, + "routers_loss": 0.0037353152874857187, + "skip_count": 3.0, + "step": 7094, + "text_loss": 0.5577781796455383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0002733495445985135, + "loss": 0.0026, + "macro_f1": 0.3333333432674408, + "num_tokens": 11444521.0, + "repeat_count": 0.0, + "routers_loss": 0.00038075417978689075, + "skip_count": 0.0, + "step": 7096, + "text_loss": 0.5052862167358398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.32403874376284, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0002730736981573632, + "loss": 0.0033, + "macro_f1": 0.3272727429866791, + "num_tokens": 11448481.0, + "repeat_count": 0.0, + "routers_loss": 0.007313522044569254, + "skip_count": 1.0, + "step": 7098, + "text_loss": 0.5869139432907104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0002727979386748001, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11452164.0, + "repeat_count": 0.0, + "routers_loss": 0.0020673887338489294, + "skip_count": 0.0, + "step": 7100, + "text_loss": 0.4354212284088135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0002725222662564954, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11455995.0, + "repeat_count": 0.0, + "routers_loss": 0.0008315460290759802, + "skip_count": 0.0, + "step": 7102, + "text_loss": 0.8714128732681274 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.35221602582917, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0002722466810080874, + "loss": 0.0053, + "macro_f1": 0.6603773832321167, + "num_tokens": 11458828.0, + "repeat_count": 1.0, + "routers_loss": 0.010913078673183918, + "skip_count": 1.0, + "step": 7104, + "text_loss": 0.6226683855056763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.36160845318462, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0002719711830351809, + "loss": 0.0076, + "macro_f1": 0.6603773832321167, + "num_tokens": 11462448.0, + "repeat_count": 1.0, + "routers_loss": 0.040428292006254196, + "skip_count": 1.0, + "step": 7106, + "text_loss": 0.2543688118457794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00027169577244334726, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 11465796.0, + "repeat_count": 0.0, + "routers_loss": 0.004473939072340727, + "skip_count": 1.0, + "step": 7108, + "text_loss": 0.12356872111558914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.00027142044933812424, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11469176.0, + "repeat_count": 0.0, + "routers_loss": 0.0017961655976250768, + "skip_count": 0.0, + "step": 7110, + "text_loss": 0.6800211668014526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0002711452138250162, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 11471983.0, + "repeat_count": 2.0, + "routers_loss": 0.003279087832197547, + "skip_count": 2.0, + "step": 7112, + "text_loss": 0.340279757976532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.3991781626064, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00027087006600949403, + "loss": 0.0065, + "macro_f1": 0.6603773832321167, + "num_tokens": 11475656.0, + "repeat_count": 1.0, + "routers_loss": 0.017024178057909012, + "skip_count": 1.0, + "step": 7114, + "text_loss": 0.3556337058544159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0002705950059969948, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 11479410.0, + "repeat_count": 0.0, + "routers_loss": 0.015487123280763626, + "skip_count": 1.0, + "step": 7116, + "text_loss": 0.4404350817203522 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.00027032003389292194, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 11483302.0, + "repeat_count": 0.0, + "routers_loss": 0.0011217560386285186, + "skip_count": 0.0, + "step": 7118, + "text_loss": 0.46771445870399475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0002700451498026454, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11486212.0, + "repeat_count": 0.0, + "routers_loss": 0.0010832607513293624, + "skip_count": 0.0, + "step": 7120, + "text_loss": 0.6795281767845154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00026977035383150106, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 11489320.0, + "repeat_count": 0.0, + "routers_loss": 0.002290027216076851, + "skip_count": 1.0, + "step": 7122, + "text_loss": 0.5304523706436157 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 33.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00026949564608479164, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 11492056.0, + "repeat_count": 2.0, + "routers_loss": 0.009950211271643639, + "skip_count": 6.0, + "step": 7124, + "text_loss": 0.21328973770141602 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 33.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0002692210266677855, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 11495165.0, + "repeat_count": 0.0, + "routers_loss": 0.0079165268689394, + "skip_count": 3.0, + "step": 7126, + "text_loss": 0.19840657711029053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00026894649568571724, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11497636.0, + "repeat_count": 0.0, + "routers_loss": 0.0013852717820554972, + "skip_count": 0.0, + "step": 7128, + "text_loss": 0.3360055088996887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00026867205324378776, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 11500806.0, + "repeat_count": 0.0, + "routers_loss": 0.0010151927126571536, + "skip_count": 0.0, + "step": 7130, + "text_loss": 0.6827390193939209 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00026839769944716373, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11504187.0, + "repeat_count": 0.0, + "routers_loss": 0.001110393786802888, + "skip_count": 0.0, + "step": 7132, + "text_loss": 0.5081584453582764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.49310243616085, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0002681234344009783, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 11507900.0, + "repeat_count": 0.0, + "routers_loss": 0.010587670840322971, + "skip_count": 1.0, + "step": 7134, + "text_loss": 0.28684356808662415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00026784925821033014, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 11510627.0, + "repeat_count": 0.0, + "routers_loss": 0.006658690981566906, + "skip_count": 0.0, + "step": 7136, + "text_loss": 0.24232104420661926 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.00026757517098028417, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 11513304.0, + "repeat_count": 0.0, + "routers_loss": 0.0014556109672412276, + "skip_count": 0.0, + "step": 7138, + "text_loss": 0.4718358516693115 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 33.52127971822718, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00026730117281587116, + "loss": 0.0062, + "macro_f1": 0.9265305995941162, + "num_tokens": 11516593.0, + "repeat_count": 1.0, + "routers_loss": 0.01590067707002163, + "skip_count": 3.0, + "step": 7140, + "text_loss": 0.2810344696044922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00026702726382208774, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 11519776.0, + "repeat_count": 0.0, + "routers_loss": 0.0014479428064078093, + "skip_count": 0.0, + "step": 7142, + "text_loss": 0.48876339197158813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00026675344410389623, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11522499.0, + "repeat_count": 0.0, + "routers_loss": 0.003729258431121707, + "skip_count": 2.0, + "step": 7144, + "text_loss": 0.5350890755653381 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0002664797137662248, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 11525220.0, + "repeat_count": 1.0, + "routers_loss": 0.0015156447188928723, + "skip_count": 1.0, + "step": 7146, + "text_loss": 0.5742373466491699 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00026620607291396773, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 11527926.0, + "repeat_count": 2.0, + "routers_loss": 0.004842780064791441, + "skip_count": 2.0, + "step": 7148, + "text_loss": 0.4994547665119171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.00026593252165198455, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 11531622.0, + "repeat_count": 0.0, + "routers_loss": 0.0026556351222097874, + "skip_count": 0.0, + "step": 7150, + "text_loss": 0.1567893922328949 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00026565906008510064, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11535191.0, + "repeat_count": 0.0, + "routers_loss": 0.008135059848427773, + "skip_count": 1.0, + "step": 7152, + "text_loss": 0.289173424243927 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.000265385688318107, + "loss": 0.0083, + "macro_f1": 1.0, + "num_tokens": 11539060.0, + "repeat_count": 1.0, + "routers_loss": 0.0020754633005708456, + "skip_count": 1.0, + "step": 7154, + "text_loss": 0.35089045763015747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0002651124064557602, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11541662.0, + "repeat_count": 1.0, + "routers_loss": 0.0023738413583487272, + "skip_count": 0.0, + "step": 7156, + "text_loss": 0.5026801228523254 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00026483921460278227, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 11544763.0, + "repeat_count": 0.0, + "routers_loss": 0.003311366541311145, + "skip_count": 1.0, + "step": 7158, + "text_loss": 0.22975654900074005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0002645661128638609, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 11547649.0, + "repeat_count": 0.0, + "routers_loss": 0.0008209354127757251, + "skip_count": 0.0, + "step": 7160, + "text_loss": 0.32840636372566223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00026429310134364926, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 11550648.0, + "repeat_count": 0.0, + "routers_loss": 0.0028574815951287746, + "skip_count": 0.0, + "step": 7162, + "text_loss": 0.23239612579345703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.00026402018014676584, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 11553790.0, + "repeat_count": 0.0, + "routers_loss": 0.005469404626637697, + "skip_count": 1.0, + "step": 7164, + "text_loss": 0.22877025604248047 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0002637473493777943, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 11556802.0, + "repeat_count": 1.0, + "routers_loss": 0.0032242932356894016, + "skip_count": 2.0, + "step": 7166, + "text_loss": 0.6376226544380188 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.00026347460914128443, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 11559607.0, + "repeat_count": 1.0, + "routers_loss": 0.0040627880953252316, + "skip_count": 2.0, + "step": 7168, + "text_loss": 0.6879657506942749 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.00026320195954175043, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 11562677.0, + "repeat_count": 2.0, + "routers_loss": 0.020494163036346436, + "skip_count": 4.0, + "step": 7170, + "text_loss": 0.3710069954395294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.00026292940068367224, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11565948.0, + "repeat_count": 0.0, + "routers_loss": 0.002662271959707141, + "skip_count": 0.0, + "step": 7172, + "text_loss": 0.15041157603263855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00026265693267149494, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 11568836.0, + "repeat_count": 0.0, + "routers_loss": 0.0039914860390126705, + "skip_count": 1.0, + "step": 7174, + "text_loss": 0.5372130870819092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.00026238455560962884, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11572542.0, + "repeat_count": 0.0, + "routers_loss": 0.0034708199091255665, + "skip_count": 0.0, + "step": 7176, + "text_loss": 0.2956286072731018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.00026211226960244914, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11575352.0, + "repeat_count": 0.0, + "routers_loss": 0.007794995326548815, + "skip_count": 2.0, + "step": 7178, + "text_loss": 0.3691073954105377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0002618400747542964, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 11579110.0, + "repeat_count": 0.0, + "routers_loss": 0.0009694626205600798, + "skip_count": 0.0, + "step": 7180, + "text_loss": 0.6523211598396301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0002615679711694764, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 11582476.0, + "repeat_count": 0.0, + "routers_loss": 0.004227840341627598, + "skip_count": 1.0, + "step": 7182, + "text_loss": 0.1997286081314087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.00026129595895225965, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 11585685.0, + "repeat_count": 0.0, + "routers_loss": 0.00126146269030869, + "skip_count": 0.0, + "step": 7184, + "text_loss": 0.486299604177475 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 33.73730554740241, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0002610240382068818, + "loss": 0.006, + "macro_f1": 0.8814815282821655, + "num_tokens": 11588804.0, + "repeat_count": 2.0, + "routers_loss": 0.04553814232349396, + "skip_count": 4.0, + "step": 7186, + "text_loss": 0.1622236669063568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.00026075220903754324, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11591822.0, + "repeat_count": 0.0, + "routers_loss": 0.002460496500134468, + "skip_count": 2.0, + "step": 7188, + "text_loss": 0.5573232173919678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0002604804715484095, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 11594899.0, + "repeat_count": 0.0, + "routers_loss": 0.006854622159153223, + "skip_count": 1.0, + "step": 7190, + "text_loss": 0.4753095507621765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00026020882584361094, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 11598333.0, + "repeat_count": 0.0, + "routers_loss": 0.001945660449564457, + "skip_count": 1.0, + "step": 7192, + "text_loss": 0.8912903666496277 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 31.0, + "epoch": 33.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0002599372720272426, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 11601814.0, + "repeat_count": 4.0, + "routers_loss": 0.005749753676354885, + "skip_count": 1.0, + "step": 7194, + "text_loss": 0.6041871905326843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0002596658102033643, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 11604661.0, + "repeat_count": 0.0, + "routers_loss": 0.0025942171923816204, + "skip_count": 1.0, + "step": 7196, + "text_loss": 0.4760607182979584 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 33.793660111535075, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00025939444047600114, + "loss": 0.0075, + "macro_f1": 0.8807588815689087, + "num_tokens": 11608459.0, + "repeat_count": 2.0, + "routers_loss": 0.020141327753663063, + "skip_count": 6.0, + "step": 7198, + "text_loss": 0.6670252084732056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0002591231629491423, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 11611489.0, + "repeat_count": 0.0, + "routers_loss": 0.005721202120184898, + "skip_count": 1.0, + "step": 7200, + "text_loss": 0.31318753957748413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00025885197772674174, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11615234.0, + "repeat_count": 0.0, + "routers_loss": 0.0027279339265078306, + "skip_count": 1.0, + "step": 7202, + "text_loss": 0.25728851556777954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00025858088491271825, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11618892.0, + "repeat_count": 0.0, + "routers_loss": 0.0006987092201597989, + "skip_count": 0.0, + "step": 7204, + "text_loss": 0.5504243969917297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00025830988461095504, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11622237.0, + "repeat_count": 0.0, + "routers_loss": 0.0029056845232844353, + "skip_count": 0.0, + "step": 7206, + "text_loss": 0.5319080948829651 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.0002580389769253001, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 11624713.0, + "repeat_count": 4.0, + "routers_loss": 0.007346974220126867, + "skip_count": 5.0, + "step": 7208, + "text_loss": 0.8925374746322632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0002577681619595655, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11628689.0, + "repeat_count": 0.0, + "routers_loss": 0.0004166684520896524, + "skip_count": 0.0, + "step": 7210, + "text_loss": 0.37282413244247437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.00025749743981752824, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 11631581.0, + "repeat_count": 0.0, + "routers_loss": 0.013194780796766281, + "skip_count": 2.0, + "step": 7212, + "text_loss": 0.220115065574646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0002572268106029295, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 11634503.0, + "repeat_count": 0.0, + "routers_loss": 0.0009112557163462043, + "skip_count": 0.0, + "step": 7214, + "text_loss": 0.5631879568099976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00025695627441947496, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 11637790.0, + "repeat_count": 0.0, + "routers_loss": 0.011178883723914623, + "skip_count": 2.0, + "step": 7216, + "text_loss": 0.24482154846191406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.887584385089525, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00025668583137083447, + "loss": 0.0047, + "macro_f1": 0.32098764181137085, + "num_tokens": 11640806.0, + "repeat_count": 0.0, + "routers_loss": 0.01877705194056034, + "skip_count": 2.0, + "step": 7218, + "text_loss": 0.2229214459657669 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0002564154815606422, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11644479.0, + "repeat_count": 0.0, + "routers_loss": 0.0030277224723249674, + "skip_count": 0.0, + "step": 7220, + "text_loss": 0.6025711894035339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00025614522509249715, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11647340.0, + "repeat_count": 0.0, + "routers_loss": 0.002354414900764823, + "skip_count": 1.0, + "step": 7222, + "text_loss": 0.6497155427932739 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0002558750620699618, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 11650433.0, + "repeat_count": 1.0, + "routers_loss": 0.009801039472222328, + "skip_count": 2.0, + "step": 7224, + "text_loss": 0.32049307227134705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0002556049925965632, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11654451.0, + "repeat_count": 0.0, + "routers_loss": 0.002949854824692011, + "skip_count": 0.0, + "step": 7226, + "text_loss": 0.17923395335674286 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00025533501677579254, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 11657440.0, + "repeat_count": 1.0, + "routers_loss": 0.0032915703486651182, + "skip_count": 1.0, + "step": 7228, + "text_loss": 0.60064297914505 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0002550651347111049, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 11660599.0, + "repeat_count": 1.0, + "routers_loss": 0.00594533933326602, + "skip_count": 1.0, + "step": 7230, + "text_loss": 0.32829397916793823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00025479534650591976, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 11663387.0, + "repeat_count": 0.0, + "routers_loss": 0.0014214308466762304, + "skip_count": 0.0, + "step": 7232, + "text_loss": 0.7317177653312683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00025452565226362036, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11666729.0, + "repeat_count": 0.0, + "routers_loss": 0.0056374757550656796, + "skip_count": 2.0, + "step": 7234, + "text_loss": 0.3394623398780823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 33.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0152587890625, + "learning_rate": 0.00025425605208755406, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11669871.0, + "repeat_count": 0.0, + "routers_loss": 0.006422565318644047, + "skip_count": 3.0, + "step": 7236, + "text_loss": 0.1725512444972992 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0002539865460810322, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 11673008.0, + "repeat_count": 1.0, + "routers_loss": 0.0023537934757769108, + "skip_count": 0.0, + "step": 7238, + "text_loss": 0.8873519897460938 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00025371713434733, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 11675988.0, + "repeat_count": 0.0, + "routers_loss": 0.0026300614699721336, + "skip_count": 1.0, + "step": 7240, + "text_loss": 0.4877084195613861 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 34.0, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.028076171875, + "learning_rate": 0.0002534478169896864, + "loss": 0.0052, + "macro_f1": 0.9265305995941162, + "num_tokens": 11679068.0, + "repeat_count": 1.0, + "routers_loss": 0.019549336284399033, + "skip_count": 3.0, + "step": 7242, + "text_loss": 0.15101417899131775 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0002531785941113044, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11682205.0, + "repeat_count": 0.0, + "routers_loss": 0.007769173942506313, + "skip_count": 1.0, + "step": 7244, + "text_loss": 0.4035153090953827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0002529094658153508, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 11685162.0, + "repeat_count": 0.0, + "routers_loss": 0.003636054927483201, + "skip_count": 0.0, + "step": 7246, + "text_loss": 0.21048080921173096 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.00025264043220495606, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 11688512.0, + "repeat_count": 0.0, + "routers_loss": 0.0013363865436986089, + "skip_count": 0.0, + "step": 7248, + "text_loss": 0.6582038402557373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00025237149338321437, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 11691753.0, + "repeat_count": 0.0, + "routers_loss": 0.0005587349878624082, + "skip_count": 0.0, + "step": 7250, + "text_loss": 0.6899203658103943 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0002521026494531835, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11694689.0, + "repeat_count": 1.0, + "routers_loss": 0.006221035961061716, + "skip_count": 0.0, + "step": 7252, + "text_loss": 0.17377600073814392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.000251833900517885, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 11697950.0, + "repeat_count": 0.0, + "routers_loss": 0.004368607886135578, + "skip_count": 1.0, + "step": 7254, + "text_loss": 0.4147649109363556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.000251565246680304, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11701214.0, + "repeat_count": 0.0, + "routers_loss": 0.0038269520737230778, + "skip_count": 2.0, + "step": 7256, + "text_loss": 0.42076823115348816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00025129668804338906, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11703935.0, + "repeat_count": 0.0, + "routers_loss": 0.0011755652958527207, + "skip_count": 0.0, + "step": 7258, + "text_loss": 0.5484340190887451 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.084531846199, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00025102822471005247, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 11706818.0, + "repeat_count": 1.0, + "routers_loss": 0.00735129788517952, + "skip_count": 2.0, + "step": 7260, + "text_loss": 0.29214802384376526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00025075985678316983, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 11709979.0, + "repeat_count": 1.0, + "routers_loss": 0.0011552777141332626, + "skip_count": 0.0, + "step": 7262, + "text_loss": 0.6514551639556885 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 34.10331670090989, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0002504915843655802, + "loss": 0.0067, + "macro_f1": 0.8814815282821655, + "num_tokens": 11714075.0, + "repeat_count": 2.0, + "routers_loss": 0.01438678614795208, + "skip_count": 4.0, + "step": 7264, + "text_loss": 0.5144859552383423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0002502234075600862, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11717610.0, + "repeat_count": 0.0, + "routers_loss": 0.0027831171173602343, + "skip_count": 0.0, + "step": 7266, + "text_loss": 0.6494308114051819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00024995532646945336, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 11721415.0, + "repeat_count": 0.0, + "routers_loss": 0.0012327058939263225, + "skip_count": 0.0, + "step": 7268, + "text_loss": 0.5111991763114929 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 34.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0002496873411964113, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 11724488.0, + "repeat_count": 2.0, + "routers_loss": 0.003060065908357501, + "skip_count": 1.0, + "step": 7270, + "text_loss": 0.5780492424964905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0002494194518436523, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 11727708.0, + "repeat_count": 0.0, + "routers_loss": 0.001369593315757811, + "skip_count": 0.0, + "step": 7272, + "text_loss": 0.3151950240135193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00024915165851383203, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 11730897.0, + "repeat_count": 0.0, + "routers_loss": 0.005724756047129631, + "skip_count": 0.0, + "step": 7274, + "text_loss": 0.5267965197563171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00024888396130956947, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 11733870.0, + "repeat_count": 1.0, + "routers_loss": 0.010036137886345387, + "skip_count": 0.0, + "step": 7276, + "text_loss": 0.5330777168273926 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00024861636033344657, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11737413.0, + "repeat_count": 0.0, + "routers_loss": 0.008341848850250244, + "skip_count": 2.0, + "step": 7278, + "text_loss": 0.25949522852897644 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0002483488556880087, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 11740691.0, + "repeat_count": 1.0, + "routers_loss": 0.008208763785660267, + "skip_count": 2.0, + "step": 7280, + "text_loss": 0.1867891401052475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.000248081447475764, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 11743715.0, + "repeat_count": 0.0, + "routers_loss": 0.0038434381131082773, + "skip_count": 0.0, + "step": 7282, + "text_loss": 0.4835410416126251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0002478141357991838, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11746818.0, + "repeat_count": 0.0, + "routers_loss": 0.0019067893736064434, + "skip_count": 0.0, + "step": 7284, + "text_loss": 0.5959038734436035 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00024754692076070256, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 11750160.0, + "repeat_count": 0.0, + "routers_loss": 0.007199060171842575, + "skip_count": 0.0, + "step": 7286, + "text_loss": 0.5068115592002869 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0002472798024627175, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11752836.0, + "repeat_count": 0.0, + "routers_loss": 0.0014214382972568274, + "skip_count": 0.0, + "step": 7288, + "text_loss": 0.5742631554603577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0002470127810075889, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11756276.0, + "repeat_count": 0.0, + "routers_loss": 0.0018025166355073452, + "skip_count": 0.0, + "step": 7290, + "text_loss": 0.6616888642311096 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00024674585649763983, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 11760235.0, + "repeat_count": 1.0, + "routers_loss": 0.0024077212437987328, + "skip_count": 0.0, + "step": 7292, + "text_loss": 0.7984768748283386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.00024647902903515614, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 11763430.0, + "repeat_count": 0.0, + "routers_loss": 0.007843999192118645, + "skip_count": 1.0, + "step": 7294, + "text_loss": 0.1943647861480713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0002462122987223869, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 11766583.0, + "repeat_count": 0.0, + "routers_loss": 0.0019727738108485937, + "skip_count": 0.0, + "step": 7296, + "text_loss": 0.43924200534820557 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6000000238418579, + "avg_layers": 27.0, + "epoch": 34.26298796595245, + "f1_execute": 0.9545454382896423, + "f1_repeat": 1.0, + "f1_skip": 0.75, + "grad_norm": 0.041015625, + "learning_rate": 0.0002459456656615436, + "loss": 0.0069, + "macro_f1": 0.9015151858329773, + "num_tokens": 11770360.0, + "repeat_count": 2.0, + "routers_loss": 0.04594529792666435, + "skip_count": 5.0, + "step": 7298, + "text_loss": 0.32582250237464905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0002456791299548004, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 11773239.0, + "repeat_count": 1.0, + "routers_loss": 0.0011880286037921906, + "skip_count": 0.0, + "step": 7300, + "text_loss": 0.7723727226257324 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00024541269170429435, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11776945.0, + "repeat_count": 0.0, + "routers_loss": 0.0010577787179499865, + "skip_count": 0.0, + "step": 7302, + "text_loss": 0.8173839449882507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0002451463510121252, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11780121.0, + "repeat_count": 0.0, + "routers_loss": 0.0019757342524826527, + "skip_count": 0.0, + "step": 7304, + "text_loss": 0.4015064239501953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000244880107980355, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 11783172.0, + "repeat_count": 0.0, + "routers_loss": 0.002577328821644187, + "skip_count": 0.0, + "step": 7306, + "text_loss": 0.5465171933174133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.00024461396271100876, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 11788608.0, + "repeat_count": 0.0, + "routers_loss": 0.004162502940744162, + "skip_count": 0.0, + "step": 7308, + "text_loss": 0.2419646978378296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0002443479153060735, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 11791912.0, + "repeat_count": 0.0, + "routers_loss": 0.003301614662632346, + "skip_count": 0.0, + "step": 7310, + "text_loss": 0.2568489909172058 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00024408196586749964, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11794849.0, + "repeat_count": 0.0, + "routers_loss": 0.0019893983844667673, + "skip_count": 0.0, + "step": 7312, + "text_loss": 0.7044196128845215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0002438161144971992, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11797587.0, + "repeat_count": 0.0, + "routers_loss": 0.006637922488152981, + "skip_count": 1.0, + "step": 7314, + "text_loss": 0.6863232254981995 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.000243550361297047, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11800173.0, + "repeat_count": 0.0, + "routers_loss": 0.003078785724937916, + "skip_count": 2.0, + "step": 7316, + "text_loss": 0.2868897616863251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00024328470636888005, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11802889.0, + "repeat_count": 0.0, + "routers_loss": 0.0011882453691214323, + "skip_count": 0.0, + "step": 7318, + "text_loss": 0.5522798299789429 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0002430191498144979, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11805607.0, + "repeat_count": 0.0, + "routers_loss": 0.0008720619371160865, + "skip_count": 0.0, + "step": 7320, + "text_loss": 0.5531370639801025 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00024275369173566236, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11808838.0, + "repeat_count": 1.0, + "routers_loss": 0.003213440766558051, + "skip_count": 0.0, + "step": 7322, + "text_loss": 0.5252627730369568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.00024248833223409715, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 11811965.0, + "repeat_count": 0.0, + "routers_loss": 0.004736232105642557, + "skip_count": 1.0, + "step": 7324, + "text_loss": 0.6033701300621033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.00024222307141148907, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11814832.0, + "repeat_count": 0.0, + "routers_loss": 0.0007559265359304845, + "skip_count": 0.0, + "step": 7326, + "text_loss": 0.5607737302780151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.00024195790936948626, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11818802.0, + "repeat_count": 0.0, + "routers_loss": 0.005338212475180626, + "skip_count": 2.0, + "step": 7328, + "text_loss": 0.20618735253810883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 34.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0002416928462096994, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 11821998.0, + "repeat_count": 0.0, + "routers_loss": 0.001919696107506752, + "skip_count": 3.0, + "step": 7330, + "text_loss": 0.42486369609832764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00024142788203370107, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 11824505.0, + "repeat_count": 0.0, + "routers_loss": 0.0013797834981232882, + "skip_count": 0.0, + "step": 7332, + "text_loss": 0.48403388261795044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.43205165835045, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00024116301694302621, + "loss": 0.0053, + "macro_f1": 0.3272727429866791, + "num_tokens": 11828504.0, + "repeat_count": 0.0, + "routers_loss": 0.008978237397968769, + "skip_count": 1.0, + "step": 7334, + "text_loss": 0.43872755765914917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00024089825103917152, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 11831171.0, + "repeat_count": 0.0, + "routers_loss": 0.004589964635670185, + "skip_count": 1.0, + "step": 7336, + "text_loss": 0.5126842260360718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00024063358442359572, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11834387.0, + "repeat_count": 0.0, + "routers_loss": 0.002857893006876111, + "skip_count": 0.0, + "step": 7338, + "text_loss": 0.7521272301673889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0002403690171977197, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 11838693.0, + "repeat_count": 0.0, + "routers_loss": 0.0009023012826219201, + "skip_count": 0.0, + "step": 7340, + "text_loss": 0.6335242390632629 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00024010454946292586, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11841882.0, + "repeat_count": 1.0, + "routers_loss": 0.010992717929184437, + "skip_count": 0.0, + "step": 7342, + "text_loss": 0.64045649766922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0002398401813205592, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11845181.0, + "repeat_count": 0.0, + "routers_loss": 0.002247930970042944, + "skip_count": 2.0, + "step": 7344, + "text_loss": 0.31022098660469055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00023957591287192577, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 11848537.0, + "repeat_count": 0.0, + "routers_loss": 0.003184020286425948, + "skip_count": 2.0, + "step": 7346, + "text_loss": 0.5709269642829895 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.00023931174421829376, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 11851437.0, + "repeat_count": 2.0, + "routers_loss": 0.006582654081285, + "skip_count": 4.0, + "step": 7348, + "text_loss": 0.3547070026397705 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.00023904767546089318, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 11854161.0, + "repeat_count": 1.0, + "routers_loss": 0.0022124287206679583, + "skip_count": 0.0, + "step": 7350, + "text_loss": 0.6984702348709106 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00023878370670091565, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 11856811.0, + "repeat_count": 1.0, + "routers_loss": 0.0029868825804442167, + "skip_count": 0.0, + "step": 7352, + "text_loss": 0.25389090180397034 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.00023851983803951444, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 11860110.0, + "repeat_count": 0.0, + "routers_loss": 0.0028468978125602007, + "skip_count": 1.0, + "step": 7354, + "text_loss": 0.5729252099990845 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00023825606957780454, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 11863058.0, + "repeat_count": 1.0, + "routers_loss": 0.003115740604698658, + "skip_count": 2.0, + "step": 7356, + "text_loss": 0.60753333568573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00023799240141686258, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 11865865.0, + "repeat_count": 0.0, + "routers_loss": 0.0022254586219787598, + "skip_count": 0.0, + "step": 7358, + "text_loss": 0.2568866014480591 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00023772883365772658, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11869133.0, + "repeat_count": 0.0, + "routers_loss": 0.0017388637643307447, + "skip_count": 0.0, + "step": 7360, + "text_loss": 0.7657097578048706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00023746536640139633, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11872988.0, + "repeat_count": 0.0, + "routers_loss": 0.002158832037821412, + "skip_count": 0.0, + "step": 7362, + "text_loss": 0.19717472791671753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00023720199974883294, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11875810.0, + "repeat_count": 0.0, + "routers_loss": 0.001037398586049676, + "skip_count": 0.0, + "step": 7364, + "text_loss": 0.47334593534469604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 34.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00023693873380095876, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 11878558.0, + "repeat_count": 0.0, + "routers_loss": 0.011853457428514957, + "skip_count": 5.0, + "step": 7366, + "text_loss": 0.2567826211452484 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.00023667556865865824, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 11881473.0, + "repeat_count": 1.0, + "routers_loss": 0.0015339091187343001, + "skip_count": 0.0, + "step": 7368, + "text_loss": 0.40981143712997437 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00023641250442277655, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 11885033.0, + "repeat_count": 1.0, + "routers_loss": 0.010062574408948421, + "skip_count": 0.0, + "step": 7370, + "text_loss": 0.3183043301105499 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.00023614954119412042, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11889136.0, + "repeat_count": 0.0, + "routers_loss": 0.0010769609361886978, + "skip_count": 0.0, + "step": 7372, + "text_loss": 0.5279555916786194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 34.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.00023588667907345785, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11893102.0, + "repeat_count": 0.0, + "routers_loss": 0.0032862431835383177, + "skip_count": 3.0, + "step": 7374, + "text_loss": 0.5425930023193359 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 34.629292632814796, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0341796875, + "learning_rate": 0.00023562391816151808, + "loss": 0.0057, + "macro_f1": 0.5934640765190125, + "num_tokens": 11895841.0, + "repeat_count": 0.0, + "routers_loss": 0.02405562624335289, + "skip_count": 3.0, + "step": 7376, + "text_loss": 0.26054954528808594 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00023536125855899153, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 11899594.0, + "repeat_count": 1.0, + "routers_loss": 0.008315852843225002, + "skip_count": 3.0, + "step": 7378, + "text_loss": 0.19068174064159393 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 34.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00023509870036652998, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 11902843.0, + "repeat_count": 1.0, + "routers_loss": 0.006180883850902319, + "skip_count": 4.0, + "step": 7380, + "text_loss": 0.18461982905864716 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00023483624368474614, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 11905786.0, + "repeat_count": 0.0, + "routers_loss": 0.0008856299100443721, + "skip_count": 0.0, + "step": 7382, + "text_loss": 0.5216618180274963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.66686234223657, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00023457388861421397, + "loss": 0.0059, + "macro_f1": 0.32098764181137085, + "num_tokens": 11908706.0, + "repeat_count": 1.0, + "routers_loss": 0.04762765392661095, + "skip_count": 1.0, + "step": 7384, + "text_loss": 0.25329193472862244 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 34.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.00023431163525546833, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 11911862.0, + "repeat_count": 1.0, + "routers_loss": 0.000989250373095274, + "skip_count": 1.0, + "step": 7386, + "text_loss": 0.2657507658004761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.0002340494837090053, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 11915483.0, + "repeat_count": 0.0, + "routers_loss": 0.0008857969660311937, + "skip_count": 0.0, + "step": 7388, + "text_loss": 0.5136669874191284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00023378743407528164, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 11918778.0, + "repeat_count": 0.0, + "routers_loss": 0.0041572838090360165, + "skip_count": 1.0, + "step": 7390, + "text_loss": 0.5212553143501282 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00023352548645471556, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11921916.0, + "repeat_count": 0.0, + "routers_loss": 0.0010537431808188558, + "skip_count": 0.0, + "step": 7392, + "text_loss": 0.48122525215148926 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00023326364094768576, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11924273.0, + "repeat_count": 1.0, + "routers_loss": 0.004077036865055561, + "skip_count": 0.0, + "step": 7394, + "text_loss": 0.2128690630197525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00023300189765453194, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11927424.0, + "repeat_count": 0.0, + "routers_loss": 0.005371362902224064, + "skip_count": 2.0, + "step": 7396, + "text_loss": 0.19448284804821014 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00023274025667555464, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11930919.0, + "repeat_count": 0.0, + "routers_loss": 0.002137752715498209, + "skip_count": 0.0, + "step": 7398, + "text_loss": 0.7537064552307129 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.00023247871811101512, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11933680.0, + "repeat_count": 0.0, + "routers_loss": 0.0002398790093138814, + "skip_count": 0.0, + "step": 7400, + "text_loss": 0.5589297413825989 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.751394188435576, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00023221728206113546, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 11937090.0, + "repeat_count": 0.0, + "routers_loss": 0.019718777388334274, + "skip_count": 1.0, + "step": 7402, + "text_loss": 0.8014751672744751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0002319559486260985, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11940581.0, + "repeat_count": 0.0, + "routers_loss": 0.001230534864589572, + "skip_count": 0.0, + "step": 7404, + "text_loss": 0.5218383073806763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0002316947179060477, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11943832.0, + "repeat_count": 0.0, + "routers_loss": 0.0016393321566283703, + "skip_count": 0.0, + "step": 7406, + "text_loss": 0.17122556269168854 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.00023143359000108704, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11947025.0, + "repeat_count": 0.0, + "routers_loss": 0.005269679240882397, + "skip_count": 2.0, + "step": 7408, + "text_loss": 0.2015499323606491 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00023117256501128136, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 11950077.0, + "repeat_count": 1.0, + "routers_loss": 0.005140089895576239, + "skip_count": 2.0, + "step": 7410, + "text_loss": 0.39068636298179626 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00023091164303665592, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11953800.0, + "repeat_count": 0.0, + "routers_loss": 0.005578748416155577, + "skip_count": 0.0, + "step": 7412, + "text_loss": 0.18851874768733978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.00023065082417719624, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 11956383.0, + "repeat_count": 0.0, + "routers_loss": 0.0006410991190932691, + "skip_count": 0.0, + "step": 7414, + "text_loss": 0.5663703083992004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 34.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0002303901085328491, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11959554.0, + "repeat_count": 0.0, + "routers_loss": 0.0005902954144403338, + "skip_count": 5.0, + "step": 7416, + "text_loss": 0.5225661993026733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0002301294962035209, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 11962582.0, + "repeat_count": 0.0, + "routers_loss": 0.00045644037891179323, + "skip_count": 0.0, + "step": 7418, + "text_loss": 0.40572360157966614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0002298689872890789, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 11965649.0, + "repeat_count": 0.0, + "routers_loss": 0.01017778366804123, + "skip_count": 2.0, + "step": 7420, + "text_loss": 0.12190715968608856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00022960858188935052, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 11968850.0, + "repeat_count": 0.0, + "routers_loss": 0.0008010792662389576, + "skip_count": 0.0, + "step": 7422, + "text_loss": 0.5606820583343506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0002293482801041236, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 11972064.0, + "repeat_count": 0.0, + "routers_loss": 0.001889281440526247, + "skip_count": 0.0, + "step": 7424, + "text_loss": 0.44142210483551025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00022908808203314635, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 11975466.0, + "repeat_count": 0.0, + "routers_loss": 0.00647713290527463, + "skip_count": 2.0, + "step": 7426, + "text_loss": 0.23273423314094543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0002288279877761271, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 11979875.0, + "repeat_count": 0.0, + "routers_loss": 0.004027119372040033, + "skip_count": 0.0, + "step": 7428, + "text_loss": 0.5608086585998535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0002285679974327345, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11982808.0, + "repeat_count": 0.0, + "routers_loss": 0.0009015435934998095, + "skip_count": 0.0, + "step": 7430, + "text_loss": 0.3976539373397827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0002283081111025973, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 11985978.0, + "repeat_count": 0.0, + "routers_loss": 0.00047143330448307097, + "skip_count": 0.0, + "step": 7432, + "text_loss": 0.4280148446559906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00022804832888530447, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11988925.0, + "repeat_count": 0.0, + "routers_loss": 0.0004895820748060942, + "skip_count": 0.0, + "step": 7434, + "text_loss": 0.5137463808059692 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000227788650880405, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11991631.0, + "repeat_count": 0.0, + "routers_loss": 0.0008349024574272335, + "skip_count": 0.0, + "step": 7436, + "text_loss": 0.4306720197200775 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00022752907718740807, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 11995476.0, + "repeat_count": 0.0, + "routers_loss": 0.0038723985198885202, + "skip_count": 0.0, + "step": 7438, + "text_loss": 0.6413722038269043 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.00022726960790578248, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 11998846.0, + "repeat_count": 1.0, + "routers_loss": 0.004433541093021631, + "skip_count": 0.0, + "step": 7440, + "text_loss": 0.6424159407615662 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 23.0, + "epoch": 34.93924273554447, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.036376953125, + "learning_rate": 0.0002270102431349579, + "loss": 0.0062, + "macro_f1": 0.6289562582969666, + "num_tokens": 12002228.0, + "repeat_count": 0.0, + "routers_loss": 0.023979803547263145, + "skip_count": 6.0, + "step": 7442, + "text_loss": 0.16657918691635132 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 34.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00022675098297432307, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 12005003.0, + "repeat_count": 3.0, + "routers_loss": 0.005645833443850279, + "skip_count": 1.0, + "step": 7444, + "text_loss": 0.6388722658157349 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00022649182752322705, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 12007657.0, + "repeat_count": 0.0, + "routers_loss": 0.001629356062039733, + "skip_count": 2.0, + "step": 7446, + "text_loss": 0.35670006275177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00022623277688097864, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 12010652.0, + "repeat_count": 0.0, + "routers_loss": 0.006375396624207497, + "skip_count": 2.0, + "step": 7448, + "text_loss": 0.24273613095283508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0002259738311468466, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 12014042.0, + "repeat_count": 0.0, + "routers_loss": 0.003734540194272995, + "skip_count": 0.0, + "step": 7450, + "text_loss": 0.4262580871582031 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 34.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0002257149904200592, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 12016987.0, + "repeat_count": 1.0, + "routers_loss": 0.0027926203329116106, + "skip_count": 1.0, + "step": 7452, + "text_loss": 0.366216778755188 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.00022545625479980508, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 12021584.0, + "repeat_count": 0.0, + "routers_loss": 0.0008985420572571456, + "skip_count": 0.0, + "step": 7454, + "text_loss": 0.533937394618988 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00022519762438523205, + "loss": 0.0029, + "macro_f1": 0.6666666865348816, + "num_tokens": 12024142.0, + "repeat_count": 0.0, + "routers_loss": 0.005394646432250738, + "skip_count": 1.0, + "step": 7456, + "text_loss": 0.2401239275932312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0002249390992754477, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 12027262.0, + "repeat_count": 0.0, + "routers_loss": 0.00275063537992537, + "skip_count": 0.0, + "step": 7458, + "text_loss": 0.21824975311756134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00022468067956951944, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 12030528.0, + "repeat_count": 0.0, + "routers_loss": 0.0008951274212449789, + "skip_count": 1.0, + "step": 7460, + "text_loss": 0.610903263092041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00022442236536647408, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 12033699.0, + "repeat_count": 0.0, + "routers_loss": 0.004062872380018234, + "skip_count": 2.0, + "step": 7462, + "text_loss": 0.26921433210372925 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00022416415676529823, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 12037402.0, + "repeat_count": 0.0, + "routers_loss": 0.0023089025635272264, + "skip_count": 1.0, + "step": 7464, + "text_loss": 0.4746153950691223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00022390605386493756, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12041129.0, + "repeat_count": 0.0, + "routers_loss": 0.0021355501376092434, + "skip_count": 2.0, + "step": 7466, + "text_loss": 0.4265538454055786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00022364805676429816, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 12044356.0, + "repeat_count": 0.0, + "routers_loss": 0.0061582159250974655, + "skip_count": 1.0, + "step": 7468, + "text_loss": 0.12020833045244217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00022339016556224467, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 12047158.0, + "repeat_count": 0.0, + "routers_loss": 0.003753372235223651, + "skip_count": 1.0, + "step": 7470, + "text_loss": 0.6406939625740051 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 35.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00022313238035760158, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 12050149.0, + "repeat_count": 1.0, + "routers_loss": 0.005371729377657175, + "skip_count": 5.0, + "step": 7472, + "text_loss": 0.5184400677680969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0002228747012491526, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12053560.0, + "repeat_count": 0.0, + "routers_loss": 0.000824139395263046, + "skip_count": 0.0, + "step": 7474, + "text_loss": 0.32644152641296387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0002226171283356409, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12056309.0, + "repeat_count": 0.0, + "routers_loss": 0.0044801668263971806, + "skip_count": 1.0, + "step": 7476, + "text_loss": 0.7027081847190857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.00022235966171576887, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 12059191.0, + "repeat_count": 0.0, + "routers_loss": 0.007496353704482317, + "skip_count": 2.0, + "step": 7478, + "text_loss": 0.28705671429634094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0002221023014881982, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 12062365.0, + "repeat_count": 0.0, + "routers_loss": 0.0018641395727172494, + "skip_count": 1.0, + "step": 7480, + "text_loss": 0.715477466583252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.00022184504775154984, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12065508.0, + "repeat_count": 0.0, + "routers_loss": 0.0005825075786560774, + "skip_count": 0.0, + "step": 7482, + "text_loss": 0.7481293678283691 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00022158790060440394, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 12068043.0, + "repeat_count": 0.0, + "routers_loss": 0.0028906071092933416, + "skip_count": 0.0, + "step": 7484, + "text_loss": 0.6151962876319885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00022133086014529968, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 12070897.0, + "repeat_count": 0.0, + "routers_loss": 0.0030862605199217796, + "skip_count": 1.0, + "step": 7486, + "text_loss": 0.4923575222492218 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00022107392647273527, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 12074644.0, + "repeat_count": 0.0, + "routers_loss": 0.0011101154377683997, + "skip_count": 0.0, + "step": 7488, + "text_loss": 0.5217859148979187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00022081709968516867, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 12077718.0, + "repeat_count": 0.0, + "routers_loss": 0.004303969442844391, + "skip_count": 0.0, + "step": 7490, + "text_loss": 0.18933317065238953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00022056037988101612, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12080509.0, + "repeat_count": 0.0, + "routers_loss": 0.0019941304344683886, + "skip_count": 1.0, + "step": 7492, + "text_loss": 0.6760565042495728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.00022030376715865313, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 12083580.0, + "repeat_count": 0.0, + "routers_loss": 0.0017090907786041498, + "skip_count": 0.0, + "step": 7494, + "text_loss": 0.4140956401824951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0002200472616164142, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12086923.0, + "repeat_count": 0.0, + "routers_loss": 0.005131757352501154, + "skip_count": 1.0, + "step": 7496, + "text_loss": 0.43287888169288635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00021979086335259269, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 12090003.0, + "repeat_count": 0.0, + "routers_loss": 0.0007472267607226968, + "skip_count": 0.0, + "step": 7498, + "text_loss": 0.6692602038383484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00021953457246544095, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 12092936.0, + "repeat_count": 0.0, + "routers_loss": 0.0012374494690448046, + "skip_count": 0.0, + "step": 7500, + "text_loss": 0.5170100331306458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00021927838905317016, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 12096395.0, + "repeat_count": 0.0, + "routers_loss": 0.006784295197576284, + "skip_count": 2.0, + "step": 7502, + "text_loss": 0.340880811214447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00021902231321395017, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 12099743.0, + "repeat_count": 0.0, + "routers_loss": 0.0058755455538630486, + "skip_count": 1.0, + "step": 7504, + "text_loss": 0.5299809575080872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00021876634504590985, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12103121.0, + "repeat_count": 0.0, + "routers_loss": 0.010622406378388405, + "skip_count": 2.0, + "step": 7506, + "text_loss": 0.1817338913679123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 35.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00021851048464713662, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 12105883.0, + "repeat_count": 0.0, + "routers_loss": 0.004382388666272163, + "skip_count": 3.0, + "step": 7508, + "text_loss": 0.5718557834625244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00021825473211567665, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 12108936.0, + "repeat_count": 0.0, + "routers_loss": 0.001638208981603384, + "skip_count": 0.0, + "step": 7510, + "text_loss": 0.4684678316116333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00021799908754953468, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 12112060.0, + "repeat_count": 0.0, + "routers_loss": 0.0007894381997175515, + "skip_count": 2.0, + "step": 7512, + "text_loss": 0.5146099328994751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00021774355104667455, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 12115636.0, + "repeat_count": 0.0, + "routers_loss": 0.01400370616465807, + "skip_count": 2.0, + "step": 7514, + "text_loss": 0.19512294232845306 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 35.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00021748812270501805, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 12119116.0, + "repeat_count": 0.0, + "routers_loss": 0.005261222366243601, + "skip_count": 3.0, + "step": 7516, + "text_loss": 0.17316904664039612 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.0002172328026224459, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12122070.0, + "repeat_count": 0.0, + "routers_loss": 0.01021486520767212, + "skip_count": 2.0, + "step": 7518, + "text_loss": 0.2777172029018402 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00021697759089679713, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 12125386.0, + "repeat_count": 2.0, + "routers_loss": 0.005217147525399923, + "skip_count": 2.0, + "step": 7520, + "text_loss": 0.49744322896003723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00021672248762586948, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 12128753.0, + "repeat_count": 0.0, + "routers_loss": 0.003868246916681528, + "skip_count": 0.0, + "step": 7522, + "text_loss": 0.4209211468696594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 35.32403874376284, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00021646749290741895, + "loss": 0.009, + "macro_f1": 0.6598639488220215, + "num_tokens": 12132425.0, + "repeat_count": 1.0, + "routers_loss": 0.044205982238054276, + "skip_count": 3.0, + "step": 7524, + "text_loss": 0.4180344343185425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00021621260683916005, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 12135740.0, + "repeat_count": 0.0, + "routers_loss": 0.0032584366854280233, + "skip_count": 2.0, + "step": 7526, + "text_loss": 0.21219655871391296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.00021595782951876552, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 12139239.0, + "repeat_count": 0.0, + "routers_loss": 0.002418758114799857, + "skip_count": 2.0, + "step": 7528, + "text_loss": 0.40800613164901733 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0002157031610438665, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 12142572.0, + "repeat_count": 1.0, + "routers_loss": 0.005265383515506983, + "skip_count": 1.0, + "step": 7530, + "text_loss": 0.7539705634117126 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0002154486015120525, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 12145737.0, + "repeat_count": 1.0, + "routers_loss": 0.006648020353168249, + "skip_count": 2.0, + "step": 7532, + "text_loss": 0.7824432253837585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.371000880540066, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0002151941510208712, + "loss": 0.0049, + "macro_f1": 0.3272727429866791, + "num_tokens": 12149376.0, + "repeat_count": 1.0, + "routers_loss": 0.01692759431898594, + "skip_count": 0.0, + "step": 7534, + "text_loss": 0.4476291239261627 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0002149398096678283, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12152191.0, + "repeat_count": 1.0, + "routers_loss": 0.013883143663406372, + "skip_count": 0.0, + "step": 7536, + "text_loss": 0.14996720850467682 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.058837890625, + "learning_rate": 0.00021468557755038826, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 12155084.0, + "repeat_count": 2.0, + "routers_loss": 0.009390740655362606, + "skip_count": 2.0, + "step": 7538, + "text_loss": 0.23685340583324432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0002144314547659731, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 12159366.0, + "repeat_count": 0.0, + "routers_loss": 0.0025363171007484198, + "skip_count": 0.0, + "step": 7540, + "text_loss": 0.6687407493591309 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.00021417744141196315, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12162545.0, + "repeat_count": 0.0, + "routers_loss": 0.004230613354593515, + "skip_count": 1.0, + "step": 7542, + "text_loss": 0.24885894358158112 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.01953125, + "learning_rate": 0.00021392353758569694, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 12165381.0, + "repeat_count": 1.0, + "routers_loss": 0.008058524690568447, + "skip_count": 0.0, + "step": 7544, + "text_loss": 0.15833988785743713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0002136697433844707, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 12168304.0, + "repeat_count": 0.0, + "routers_loss": 0.0018041770672425628, + "skip_count": 0.0, + "step": 7546, + "text_loss": 0.6046217083930969 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.00021341605890553894, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 12171040.0, + "repeat_count": 1.0, + "routers_loss": 0.008584463968873024, + "skip_count": 2.0, + "step": 7548, + "text_loss": 0.3001522719860077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.00021316248424611408, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 12174702.0, + "repeat_count": 0.0, + "routers_loss": 0.0010506469989195466, + "skip_count": 0.0, + "step": 7550, + "text_loss": 0.2998376488685608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0162353515625, + "learning_rate": 0.00021290901950336627, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 12178388.0, + "repeat_count": 0.0, + "routers_loss": 0.0012753128539770842, + "skip_count": 0.0, + "step": 7552, + "text_loss": 0.8125656843185425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.00021265566477442384, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 12181863.0, + "repeat_count": 0.0, + "routers_loss": 0.004343052394688129, + "skip_count": 2.0, + "step": 7554, + "text_loss": 0.14004671573638916 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00021240242015637268, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12185485.0, + "repeat_count": 1.0, + "routers_loss": 0.0005794052849523723, + "skip_count": 0.0, + "step": 7556, + "text_loss": 0.7116519808769226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.4837100088054, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.00021214928574625664, + "loss": 0.0063, + "macro_f1": 0.3272727429866791, + "num_tokens": 12188914.0, + "repeat_count": 1.0, + "routers_loss": 0.01066325418651104, + "skip_count": 0.0, + "step": 7558, + "text_loss": 0.4664429724216461 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.49310243616085, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00021189626164107718, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 12193042.0, + "repeat_count": 0.0, + "routers_loss": 0.0011769415577873588, + "skip_count": 0.0, + "step": 7560, + "text_loss": 0.672637403011322 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.00021164334793779388, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 12195675.0, + "repeat_count": 1.0, + "routers_loss": 0.008653911761939526, + "skip_count": 1.0, + "step": 7562, + "text_loss": 0.5301182866096497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00021139054473332357, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 12198638.0, + "repeat_count": 0.0, + "routers_loss": 0.0058176578022539616, + "skip_count": 0.0, + "step": 7564, + "text_loss": 0.1889677792787552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.000211137852124541, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 12202312.0, + "repeat_count": 0.0, + "routers_loss": 0.0004154018242843449, + "skip_count": 0.0, + "step": 7566, + "text_loss": 0.3610386848449707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00021088527020827848, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 12205112.0, + "repeat_count": 0.0, + "routers_loss": 0.0014722816413268447, + "skip_count": 0.0, + "step": 7568, + "text_loss": 0.15214823186397552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.0002106327990813257, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 12208103.0, + "repeat_count": 0.0, + "routers_loss": 0.0015596678713336587, + "skip_count": 0.0, + "step": 7570, + "text_loss": 0.5034125447273254 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.00021038043884043022, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12211208.0, + "repeat_count": 1.0, + "routers_loss": 0.007482443004846573, + "skip_count": 0.0, + "step": 7572, + "text_loss": 0.6760116219520569 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00021012818958229696, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 12214463.0, + "repeat_count": 0.0, + "routers_loss": 0.003875598544254899, + "skip_count": 2.0, + "step": 7574, + "text_loss": 0.3278147876262665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.00020987605140358824, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12218199.0, + "repeat_count": 0.0, + "routers_loss": 0.007918627932667732, + "skip_count": 2.0, + "step": 7576, + "text_loss": 0.23850615322589874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.00020962402440092388, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12221151.0, + "repeat_count": 0.0, + "routers_loss": 0.005424308590590954, + "skip_count": 1.0, + "step": 7578, + "text_loss": 0.5670642256736755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0002093721086708812, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 12224789.0, + "repeat_count": 1.0, + "routers_loss": 0.0066504343412816525, + "skip_count": 1.0, + "step": 7580, + "text_loss": 0.30404478311538696 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00020912030430999452, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 12228134.0, + "repeat_count": 1.0, + "routers_loss": 0.008815597742795944, + "skip_count": 0.0, + "step": 7582, + "text_loss": 0.32522889971733093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 35.60581156442618, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05126953125, + "learning_rate": 0.0002088686114147561, + "loss": 0.0098, + "macro_f1": 0.5492662787437439, + "num_tokens": 12231335.0, + "repeat_count": 0.0, + "routers_loss": 0.03785836696624756, + "skip_count": 2.0, + "step": 7584, + "text_loss": 0.6277920603752136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00020861703008161504, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 12234619.0, + "repeat_count": 0.0, + "routers_loss": 0.0016183801926672459, + "skip_count": 0.0, + "step": 7586, + "text_loss": 0.38319316506385803 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.00020836556040697767, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 12237296.0, + "repeat_count": 1.0, + "routers_loss": 0.013077575713396072, + "skip_count": 1.0, + "step": 7588, + "text_loss": 0.297571063041687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00020811420248720769, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 12240633.0, + "repeat_count": 0.0, + "routers_loss": 0.002858756808564067, + "skip_count": 0.0, + "step": 7590, + "text_loss": 0.2506035268306732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.000207862956418626, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12244118.0, + "repeat_count": 0.0, + "routers_loss": 0.0032624071463942528, + "skip_count": 1.0, + "step": 7592, + "text_loss": 0.19843827188014984 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056640625, + "learning_rate": 0.00020761182229751045, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 12247367.0, + "repeat_count": 1.0, + "routers_loss": 0.005885142367333174, + "skip_count": 3.0, + "step": 7594, + "text_loss": 0.3347153067588806 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 35.66216612855885, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00020736080022009602, + "loss": 0.0088, + "macro_f1": 0.9452888369560242, + "num_tokens": 12250487.0, + "repeat_count": 1.0, + "routers_loss": 0.021491389721632004, + "skip_count": 4.0, + "step": 7596, + "text_loss": 0.6777212619781494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 35.671558555914295, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.00020710989028257514, + "loss": 0.0061, + "macro_f1": 0.6595745086669922, + "num_tokens": 12253834.0, + "repeat_count": 1.0, + "routers_loss": 0.014164486899971962, + "skip_count": 4.0, + "step": 7598, + "text_loss": 0.741127610206604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0002068590925810968, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 12257289.0, + "repeat_count": 0.0, + "routers_loss": 0.0012773120542988181, + "skip_count": 0.0, + "step": 7600, + "text_loss": 0.5336982607841492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0002066084072117672, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 12260825.0, + "repeat_count": 0.0, + "routers_loss": 0.013102042488753796, + "skip_count": 2.0, + "step": 7602, + "text_loss": 0.30410775542259216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00020635783427064942, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12264609.0, + "repeat_count": 0.0, + "routers_loss": 0.002602101070806384, + "skip_count": 0.0, + "step": 7604, + "text_loss": 0.29835572838783264 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00020610737385376348, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12267537.0, + "repeat_count": 0.0, + "routers_loss": 0.0053265830501914024, + "skip_count": 0.0, + "step": 7606, + "text_loss": 0.2095658779144287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00020585702605708628, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 12271175.0, + "repeat_count": 0.0, + "routers_loss": 0.000614096992649138, + "skip_count": 0.0, + "step": 7608, + "text_loss": 0.8146751523017883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00020560679097655137, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12274067.0, + "repeat_count": 0.0, + "routers_loss": 0.0013201923575252295, + "skip_count": 0.0, + "step": 7610, + "text_loss": 0.40818271040916443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0002053566687080497, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 12276946.0, + "repeat_count": 0.0, + "routers_loss": 0.004304401110857725, + "skip_count": 1.0, + "step": 7612, + "text_loss": 0.7063660025596619 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0002051066593474284, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 12279760.0, + "repeat_count": 0.0, + "routers_loss": 0.0032060579396784306, + "skip_count": 1.0, + "step": 7614, + "text_loss": 0.23671887814998627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00020485676299049154, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 12282737.0, + "repeat_count": 0.0, + "routers_loss": 0.005103024188429117, + "skip_count": 2.0, + "step": 7616, + "text_loss": 0.17571020126342773 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00020460697973299986, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 12286290.0, + "repeat_count": 1.0, + "routers_loss": 0.007189507596194744, + "skip_count": 1.0, + "step": 7618, + "text_loss": 0.30872994661331177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0002043573096706708, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 12289458.0, + "repeat_count": 0.0, + "routers_loss": 0.0010217712260782719, + "skip_count": 0.0, + "step": 7620, + "text_loss": 0.5155487060546875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0002041077528991784, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 12292846.0, + "repeat_count": 0.0, + "routers_loss": 0.0022399788722395897, + "skip_count": 1.0, + "step": 7622, + "text_loss": 0.717949390411377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0002038583095141532, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 12295673.0, + "repeat_count": 0.0, + "routers_loss": 0.0018168877577409148, + "skip_count": 0.0, + "step": 7624, + "text_loss": 0.560361385345459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.00020360897961118246, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 12298624.0, + "repeat_count": 0.0, + "routers_loss": 0.0008487844606861472, + "skip_count": 0.0, + "step": 7626, + "text_loss": 0.6391524076461792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00020335976328580984, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 12302136.0, + "repeat_count": 0.0, + "routers_loss": 0.0006127831293269992, + "skip_count": 0.0, + "step": 7628, + "text_loss": 0.5932226777076721 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.00020311066063353556, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 12305152.0, + "repeat_count": 0.0, + "routers_loss": 0.0018765819258987904, + "skip_count": 0.0, + "step": 7630, + "text_loss": 0.37831631302833557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00020286167174981618, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 12307771.0, + "repeat_count": 0.0, + "routers_loss": 0.0025384656619280577, + "skip_count": 0.0, + "step": 7632, + "text_loss": 0.34806445240974426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0002026127967300645, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12310921.0, + "repeat_count": 0.0, + "routers_loss": 0.008239032700657845, + "skip_count": 2.0, + "step": 7634, + "text_loss": 0.34859901666641235 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00020236403566965027, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12314200.0, + "repeat_count": 0.0, + "routers_loss": 0.0029505928978323936, + "skip_count": 2.0, + "step": 7636, + "text_loss": 0.2647531032562256 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0002021153886638991, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 12319221.0, + "repeat_count": 1.0, + "routers_loss": 0.0014016951899975538, + "skip_count": 0.0, + "step": 7638, + "text_loss": 0.42428603768348694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 35.86879953037863, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04248046875, + "learning_rate": 0.00020186685580809288, + "loss": 0.0059, + "macro_f1": 0.5492662787437439, + "num_tokens": 12322204.0, + "repeat_count": 0.0, + "routers_loss": 0.01761031709611416, + "skip_count": 2.0, + "step": 7640, + "text_loss": 0.25929757952690125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.00020161843719746997, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 12324750.0, + "repeat_count": 0.0, + "routers_loss": 0.0023674629628658295, + "skip_count": 0.0, + "step": 7642, + "text_loss": 0.567159116268158 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0002013701329272248, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 12327933.0, + "repeat_count": 0.0, + "routers_loss": 0.004534341394901276, + "skip_count": 0.0, + "step": 7644, + "text_loss": 0.4765215516090393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00020112194309250797, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12330847.0, + "repeat_count": 0.0, + "routers_loss": 0.003144246758893132, + "skip_count": 2.0, + "step": 7646, + "text_loss": 0.39837369322776794 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.00020087386778842642, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 12333782.0, + "repeat_count": 1.0, + "routers_loss": 0.008137194439768791, + "skip_count": 1.0, + "step": 7648, + "text_loss": 0.42175763845443726 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00020062590711004296, + "loss": 0.0034, + "macro_f1": 1.0, + "num_tokens": 12336837.0, + "repeat_count": 1.0, + "routers_loss": 0.006499455776065588, + "skip_count": 1.0, + "step": 7650, + "text_loss": 0.18695278465747833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.00020037806115237667, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12340414.0, + "repeat_count": 0.0, + "routers_loss": 0.001548365456983447, + "skip_count": 0.0, + "step": 7652, + "text_loss": 0.1981094628572464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00020013033001040255, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 12343209.0, + "repeat_count": 0.0, + "routers_loss": 0.008136926218867302, + "skip_count": 2.0, + "step": 7654, + "text_loss": 0.2231602668762207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00019988271377905165, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12346158.0, + "repeat_count": 0.0, + "routers_loss": 0.00370375020429492, + "skip_count": 1.0, + "step": 7656, + "text_loss": 0.4809921383857727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 35.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00019963521255321077, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 12349279.0, + "repeat_count": 0.0, + "routers_loss": 0.00690054427832365, + "skip_count": 3.0, + "step": 7658, + "text_loss": 0.40473970770835876 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0001993878264277233, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 12352848.0, + "repeat_count": 1.0, + "routers_loss": 0.004367961548268795, + "skip_count": 1.0, + "step": 7660, + "text_loss": 0.3646799921989441 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.00019914055549738775, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 12356737.0, + "repeat_count": 0.0, + "routers_loss": 0.000662159756757319, + "skip_count": 0.0, + "step": 7662, + "text_loss": 0.3703214228153229 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0001988933998569589, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 12360085.0, + "repeat_count": 0.0, + "routers_loss": 0.0023262565955519676, + "skip_count": 0.0, + "step": 7664, + "text_loss": 0.12910836935043335 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0001986463596011473, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12363296.0, + "repeat_count": 0.0, + "routers_loss": 0.002686078194528818, + "skip_count": 1.0, + "step": 7666, + "text_loss": 0.39628392457962036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00019839943482461914, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 12366072.0, + "repeat_count": 0.0, + "routers_loss": 0.007100159768015146, + "skip_count": 1.0, + "step": 7668, + "text_loss": 0.6588287949562073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00019815262562199648, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12368940.0, + "repeat_count": 0.0, + "routers_loss": 0.004194926470518112, + "skip_count": 0.0, + "step": 7670, + "text_loss": 0.36411619186401367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00019790593208785713, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12372031.0, + "repeat_count": 0.0, + "routers_loss": 0.0041313013061881065, + "skip_count": 0.0, + "step": 7672, + "text_loss": 0.23270413279533386 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019765935431673444, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12375115.0, + "repeat_count": 1.0, + "routers_loss": 0.003343774238601327, + "skip_count": 0.0, + "step": 7674, + "text_loss": 0.1686355322599411 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 36.03756970942178, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.038330078125, + "learning_rate": 0.00019741289240311755, + "loss": 0.0058, + "macro_f1": 0.6122449040412903, + "num_tokens": 12379089.0, + "repeat_count": 0.0, + "routers_loss": 0.021328814327716827, + "skip_count": 4.0, + "step": 7676, + "text_loss": 0.9312577247619629 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00019716654644145104, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12383115.0, + "repeat_count": 0.0, + "routers_loss": 0.0004511173174250871, + "skip_count": 0.0, + "step": 7678, + "text_loss": 0.3305695056915283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.00019692031652613522, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 12386064.0, + "repeat_count": 0.0, + "routers_loss": 0.006190002430230379, + "skip_count": 0.0, + "step": 7680, + "text_loss": 0.4829687178134918 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 36.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00019667420275152575, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 12389743.0, + "repeat_count": 2.0, + "routers_loss": 0.004575030412524939, + "skip_count": 1.0, + "step": 7682, + "text_loss": 0.5751548409461975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0001964282052119341, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12392481.0, + "repeat_count": 0.0, + "routers_loss": 0.002718796720728278, + "skip_count": 0.0, + "step": 7684, + "text_loss": 0.5349925756454468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0001961823240016269, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 12395207.0, + "repeat_count": 0.0, + "routers_loss": 0.0027528523933142424, + "skip_count": 0.0, + "step": 7686, + "text_loss": 0.5322592258453369 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00019593655921482624, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12398232.0, + "repeat_count": 1.0, + "routers_loss": 0.008105970919132233, + "skip_count": 0.0, + "step": 7688, + "text_loss": 0.3192061185836792 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.10331670090989, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00019569091094570967, + "loss": 0.0069, + "macro_f1": 0.6603773832321167, + "num_tokens": 12400862.0, + "repeat_count": 1.0, + "routers_loss": 0.024075545370578766, + "skip_count": 1.0, + "step": 7690, + "text_loss": 0.3189752697944641 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 36.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0001954453792884101, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 12404039.0, + "repeat_count": 0.0, + "routers_loss": 0.007513802964240313, + "skip_count": 3.0, + "step": 7692, + "text_loss": 0.5985093712806702 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0001951999643370157, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 12407085.0, + "repeat_count": 1.0, + "routers_loss": 0.009606506675481796, + "skip_count": 2.0, + "step": 7694, + "text_loss": 0.2050790935754776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00019495466618556996, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 12411377.0, + "repeat_count": 0.0, + "routers_loss": 0.0007978329667821527, + "skip_count": 0.0, + "step": 7696, + "text_loss": 0.4705570638179779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00019470948492807154, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 12414427.0, + "repeat_count": 0.0, + "routers_loss": 0.0010737364646047354, + "skip_count": 0.0, + "step": 7698, + "text_loss": 0.6105324029922485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00019446442065847448, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12417442.0, + "repeat_count": 0.0, + "routers_loss": 0.001762967323884368, + "skip_count": 0.0, + "step": 7700, + "text_loss": 0.5638618469238281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00019421947347068774, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12420862.0, + "repeat_count": 0.0, + "routers_loss": 0.0015798417152836919, + "skip_count": 0.0, + "step": 7702, + "text_loss": 0.1939864307641983 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00019397464345857562, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 12423876.0, + "repeat_count": 0.0, + "routers_loss": 0.005659835878759623, + "skip_count": 1.0, + "step": 7704, + "text_loss": 0.20829300582408905 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 36.17845611975345, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.00019372993071595723, + "loss": 0.0072, + "macro_f1": 0.9449735879898071, + "num_tokens": 12427639.0, + "repeat_count": 4.0, + "routers_loss": 0.018665846437215805, + "skip_count": 2.0, + "step": 7706, + "text_loss": 0.47913849353790283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00019348533533660727, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 12431520.0, + "repeat_count": 0.0, + "routers_loss": 0.0006690093432553113, + "skip_count": 0.0, + "step": 7708, + "text_loss": 0.494870662689209 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00019324085741425511, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 12434213.0, + "repeat_count": 0.0, + "routers_loss": 0.004067352041602135, + "skip_count": 1.0, + "step": 7710, + "text_loss": 0.7631711959838867 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.00019299649704258504, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 12437437.0, + "repeat_count": 2.0, + "routers_loss": 0.01157623715698719, + "skip_count": 0.0, + "step": 7712, + "text_loss": 0.3145926296710968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0001927522543152364, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 12440507.0, + "repeat_count": 0.0, + "routers_loss": 0.001888492377474904, + "skip_count": 0.0, + "step": 7714, + "text_loss": 0.576301097869873 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019250812932580352, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 12443484.0, + "repeat_count": 0.0, + "routers_loss": 0.00042988534551113844, + "skip_count": 0.0, + "step": 7716, + "text_loss": 0.5716445446014404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00019226412216783557, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 12446460.0, + "repeat_count": 0.0, + "routers_loss": 0.005063199903815985, + "skip_count": 1.0, + "step": 7718, + "text_loss": 0.2700924873352051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0001920202329348365, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 12449346.0, + "repeat_count": 0.0, + "routers_loss": 0.0010775640839710832, + "skip_count": 0.0, + "step": 7720, + "text_loss": 0.5162558555603027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00019177646172026513, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12452680.0, + "repeat_count": 0.0, + "routers_loss": 0.0014514096546918154, + "skip_count": 0.0, + "step": 7722, + "text_loss": 0.5753642916679382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.00019153280861753497, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12455348.0, + "repeat_count": 0.0, + "routers_loss": 0.002202774863690138, + "skip_count": 1.0, + "step": 7724, + "text_loss": 0.5751997232437134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00019128927372001454, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 12458098.0, + "repeat_count": 0.0, + "routers_loss": 0.005171069409698248, + "skip_count": 0.0, + "step": 7726, + "text_loss": 0.22252975404262543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00019104585712102678, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 12460958.0, + "repeat_count": 0.0, + "routers_loss": 0.0041033923625946045, + "skip_count": 0.0, + "step": 7728, + "text_loss": 0.18611937761306763 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00019080255891384945, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 12463596.0, + "repeat_count": 1.0, + "routers_loss": 0.0012201941572129726, + "skip_count": 0.0, + "step": 7730, + "text_loss": 0.47347909212112427 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 36.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0001905593791917148, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 12467021.0, + "repeat_count": 2.0, + "routers_loss": 0.005837214644998312, + "skip_count": 2.0, + "step": 7732, + "text_loss": 0.2055564969778061 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.00019031631804780974, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 12469743.0, + "repeat_count": 0.0, + "routers_loss": 0.0010269953636452556, + "skip_count": 0.0, + "step": 7734, + "text_loss": 0.45995602011680603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00019007337557527582, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 12473082.0, + "repeat_count": 0.0, + "routers_loss": 0.00436213007196784, + "skip_count": 1.0, + "step": 7736, + "text_loss": 0.4515823721885681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00018983055186720888, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 12476100.0, + "repeat_count": 0.0, + "routers_loss": 0.003051829058676958, + "skip_count": 2.0, + "step": 7738, + "text_loss": 0.12298467755317688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0001895878470166597, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 12480231.0, + "repeat_count": 0.0, + "routers_loss": 0.008164191618561745, + "skip_count": 2.0, + "step": 7740, + "text_loss": 0.17456457018852234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.347519812151454, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.00018934526111663314, + "loss": 0.0069, + "macro_f1": 0.3272727429866791, + "num_tokens": 12483894.0, + "repeat_count": 0.0, + "routers_loss": 0.008653721772134304, + "skip_count": 1.0, + "step": 7742, + "text_loss": 0.7125775814056396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 36.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.00018910279426008857, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 12488077.0, + "repeat_count": 0.0, + "routers_loss": 0.005024447571486235, + "skip_count": 6.0, + "step": 7744, + "text_loss": 0.833778977394104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.00018886044653993966, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 12490999.0, + "repeat_count": 0.0, + "routers_loss": 0.002690888475626707, + "skip_count": 0.0, + "step": 7746, + "text_loss": 0.15594039857387543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00018861821804905466, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 12494765.0, + "repeat_count": 0.0, + "routers_loss": 0.006087568122893572, + "skip_count": 0.0, + "step": 7748, + "text_loss": 0.2696777880191803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00018837610888025586, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12497741.0, + "repeat_count": 0.0, + "routers_loss": 0.0014629303477704525, + "skip_count": 0.0, + "step": 7750, + "text_loss": 0.6801294684410095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11865234375, + "learning_rate": 0.00018813411912631996, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 12500585.0, + "repeat_count": 0.0, + "routers_loss": 0.001163579523563385, + "skip_count": 0.0, + "step": 7752, + "text_loss": 0.41069695353507996 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00018789224887997796, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12503579.0, + "repeat_count": 2.0, + "routers_loss": 0.009436148218810558, + "skip_count": 0.0, + "step": 7754, + "text_loss": 0.6993107795715332 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00018765049823391472, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 12506698.0, + "repeat_count": 1.0, + "routers_loss": 0.002098206663504243, + "skip_count": 2.0, + "step": 7756, + "text_loss": 0.5704247951507568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018740886728077, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 12509869.0, + "repeat_count": 0.0, + "routers_loss": 0.002066673245280981, + "skip_count": 1.0, + "step": 7758, + "text_loss": 0.7605635523796082 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.00018716735611313707, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 12513433.0, + "repeat_count": 0.0, + "routers_loss": 0.0023439819924533367, + "skip_count": 1.0, + "step": 7760, + "text_loss": 0.4746153950691223 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.441444085705896, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.00018692596482356333, + "loss": 0.0057, + "macro_f1": 0.9255813956260681, + "num_tokens": 12516817.0, + "repeat_count": 3.0, + "routers_loss": 0.039019811898469925, + "skip_count": 4.0, + "step": 7762, + "text_loss": 0.3105330467224121 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00018668469350455048, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12519357.0, + "repeat_count": 0.0, + "routers_loss": 0.002269966993480921, + "skip_count": 0.0, + "step": 7764, + "text_loss": 0.3700210452079773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00018644354224855414, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 12522072.0, + "repeat_count": 0.0, + "routers_loss": 0.001265842467546463, + "skip_count": 0.0, + "step": 7766, + "text_loss": 0.6737633943557739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00018620251114798386, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 12524999.0, + "repeat_count": 0.0, + "routers_loss": 0.006547329016029835, + "skip_count": 1.0, + "step": 7768, + "text_loss": 0.24906545877456665 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0001859616002952033, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 12527785.0, + "repeat_count": 2.0, + "routers_loss": 0.010791841894388199, + "skip_count": 3.0, + "step": 7770, + "text_loss": 0.3069820702075958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0001857208097825299, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12530801.0, + "repeat_count": 0.0, + "routers_loss": 0.00492103723809123, + "skip_count": 2.0, + "step": 7772, + "text_loss": 0.2524295151233673 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0001854801397022351, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12533919.0, + "repeat_count": 0.0, + "routers_loss": 0.001942967064678669, + "skip_count": 0.0, + "step": 7774, + "text_loss": 0.7855241894721985 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 36.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00018523959014654407, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 12537265.0, + "repeat_count": 2.0, + "routers_loss": 0.00987488217651844, + "skip_count": 2.0, + "step": 7776, + "text_loss": 0.2767317593097687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.00018499916120763582, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12539695.0, + "repeat_count": 0.0, + "routers_loss": 0.0054283770732581615, + "skip_count": 1.0, + "step": 7778, + "text_loss": 0.43287888169288635 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00018475885297764306, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 12542881.0, + "repeat_count": 2.0, + "routers_loss": 0.00797359924763441, + "skip_count": 0.0, + "step": 7780, + "text_loss": 0.3738224506378174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0001845186655486527, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 12546530.0, + "repeat_count": 0.0, + "routers_loss": 0.0045951665379107, + "skip_count": 0.0, + "step": 7782, + "text_loss": 0.2511517107486725 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 36.54476078661579, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00018427859901270482, + "loss": 0.0055, + "macro_f1": 0.9452888369560242, + "num_tokens": 12549439.0, + "repeat_count": 1.0, + "routers_loss": 0.02312052994966507, + "skip_count": 4.0, + "step": 7784, + "text_loss": 0.3837030827999115 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 36.55415321397123, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.059814453125, + "learning_rate": 0.00018403865346179344, + "loss": 0.0066, + "macro_f1": 0.9265305995941162, + "num_tokens": 12553211.0, + "repeat_count": 1.0, + "routers_loss": 0.014698561280965805, + "skip_count": 3.0, + "step": 7786, + "text_loss": 0.510159432888031 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 36.563545641326684, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.00018379882898786603, + "loss": 0.0075, + "macro_f1": 0.8803418874740601, + "num_tokens": 12556497.0, + "repeat_count": 2.0, + "routers_loss": 0.023926246911287308, + "skip_count": 7.0, + "step": 7788, + "text_loss": 0.44811317324638367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00018355912568282384, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 12559778.0, + "repeat_count": 0.0, + "routers_loss": 0.0011187797645106912, + "skip_count": 0.0, + "step": 7790, + "text_loss": 0.32099616527557373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00018331954363852166, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 12562610.0, + "repeat_count": 0.0, + "routers_loss": 0.0005356677575036883, + "skip_count": 0.0, + "step": 7792, + "text_loss": 0.9754356145858765 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0001830800829467677, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12565886.0, + "repeat_count": 2.0, + "routers_loss": 0.0017101728590205312, + "skip_count": 0.0, + "step": 7794, + "text_loss": 0.4234761595726013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.00018284074369932386, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12568728.0, + "repeat_count": 0.0, + "routers_loss": 0.0012841494753956795, + "skip_count": 0.0, + "step": 7796, + "text_loss": 0.41109147667884827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0001826015259879053, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 12572231.0, + "repeat_count": 0.0, + "routers_loss": 0.0022388407960534096, + "skip_count": 0.0, + "step": 7798, + "text_loss": 0.5459926128387451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00018236242990418074, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 12574968.0, + "repeat_count": 0.0, + "routers_loss": 0.0019992550369352102, + "skip_count": 0.0, + "step": 7800, + "text_loss": 0.5028481483459473 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0001821234555397722, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12579074.0, + "repeat_count": 0.0, + "routers_loss": 0.002936388598755002, + "skip_count": 2.0, + "step": 7802, + "text_loss": 0.2377086579799652 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00018188460298625503, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12581912.0, + "repeat_count": 1.0, + "routers_loss": 0.0026762608904391527, + "skip_count": 0.0, + "step": 7804, + "text_loss": 0.13887254893779755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 36.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00018164587233515824, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 12585020.0, + "repeat_count": 3.0, + "routers_loss": 0.003901638789102435, + "skip_count": 1.0, + "step": 7806, + "text_loss": 0.35454171895980835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00018140726367796373, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 12588310.0, + "repeat_count": 0.0, + "routers_loss": 0.0031358697451651096, + "skip_count": 2.0, + "step": 7808, + "text_loss": 0.3567306697368622 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00018116877710610673, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12591735.0, + "repeat_count": 0.0, + "routers_loss": 0.002310588024556637, + "skip_count": 1.0, + "step": 7810, + "text_loss": 0.45357072353363037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00018093041271097582, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12595232.0, + "repeat_count": 0.0, + "routers_loss": 0.005600228440016508, + "skip_count": 2.0, + "step": 7812, + "text_loss": 0.4179847836494446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.685647196947464, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00018069217058391267, + "loss": 0.006, + "macro_f1": 0.6603773832321167, + "num_tokens": 12598367.0, + "repeat_count": 1.0, + "routers_loss": 0.04015933722257614, + "skip_count": 1.0, + "step": 7814, + "text_loss": 0.17874565720558167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.00018045405081621214, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 12601864.0, + "repeat_count": 0.0, + "routers_loss": 0.005119446665048599, + "skip_count": 1.0, + "step": 7816, + "text_loss": 0.6867854595184326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00018021605349912207, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 12605268.0, + "repeat_count": 0.0, + "routers_loss": 0.0005990012432448566, + "skip_count": 0.0, + "step": 7818, + "text_loss": 0.9084970355033875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00017997817872384358, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 12608093.0, + "repeat_count": 0.0, + "routers_loss": 0.008712377399206161, + "skip_count": 1.0, + "step": 7820, + "text_loss": 0.19413328170776367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00017974042658153066, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 12611001.0, + "repeat_count": 0.0, + "routers_loss": 0.007535711396485567, + "skip_count": 1.0, + "step": 7822, + "text_loss": 0.2672932744026184 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0001795027971632905, + "loss": 0.0042, + "macro_f1": 1.0, + "num_tokens": 12614584.0, + "repeat_count": 1.0, + "routers_loss": 0.006770546548068523, + "skip_count": 3.0, + "step": 7824, + "text_loss": 0.22805163264274597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00017926529056018297, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 12617519.0, + "repeat_count": 0.0, + "routers_loss": 0.0010458873584866524, + "skip_count": 0.0, + "step": 7826, + "text_loss": 0.385499507188797 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00017902790686322102, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12621566.0, + "repeat_count": 1.0, + "routers_loss": 0.00634258147329092, + "skip_count": 0.0, + "step": 7828, + "text_loss": 0.8044118285179138 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 36.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00017879064616337076, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 12624751.0, + "repeat_count": 0.0, + "routers_loss": 0.0053052278235554695, + "skip_count": 3.0, + "step": 7830, + "text_loss": 0.264322966337204 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.00017855350855155088, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 12628478.0, + "repeat_count": 0.0, + "routers_loss": 0.0028291696216911077, + "skip_count": 0.0, + "step": 7832, + "text_loss": 0.20611460506916046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00017831649411863287, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 12632027.0, + "repeat_count": 0.0, + "routers_loss": 0.0009586421074345708, + "skip_count": 1.0, + "step": 7834, + "text_loss": 0.4119716286659241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00017807960295544118, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 12635144.0, + "repeat_count": 0.0, + "routers_loss": 0.012304541654884815, + "skip_count": 2.0, + "step": 7836, + "text_loss": 0.28647977113723755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0001778428351527529, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 12638719.0, + "repeat_count": 0.0, + "routers_loss": 0.005212076939642429, + "skip_count": 2.0, + "step": 7838, + "text_loss": 0.630459189414978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0001776061908012979, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 12642119.0, + "repeat_count": 0.0, + "routers_loss": 0.00183707510586828, + "skip_count": 0.0, + "step": 7840, + "text_loss": 0.5905961990356445 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0001773696699917588, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 12645077.0, + "repeat_count": 1.0, + "routers_loss": 0.0058263009414076805, + "skip_count": 0.0, + "step": 7842, + "text_loss": 0.41949576139450073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00017713327281477077, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12648964.0, + "repeat_count": 0.0, + "routers_loss": 0.001586507773026824, + "skip_count": 0.0, + "step": 7844, + "text_loss": 0.5048848390579224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00017689699936092163, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 12651934.0, + "repeat_count": 0.0, + "routers_loss": 0.002397194504737854, + "skip_count": 0.0, + "step": 7846, + "text_loss": 0.23879878222942352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 36.84531846199002, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0001766608497207518, + "loss": 0.0054, + "macro_f1": 0.5492662787437439, + "num_tokens": 12654907.0, + "repeat_count": 0.0, + "routers_loss": 0.016742069274187088, + "skip_count": 2.0, + "step": 7848, + "text_loss": 0.23400072753429413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0001764248239847544, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 12658765.0, + "repeat_count": 0.0, + "routers_loss": 0.007037387229502201, + "skip_count": 2.0, + "step": 7850, + "text_loss": 0.26165497303009033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 36.86410331670091, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.017822265625, + "learning_rate": 0.00017618892224337463, + "loss": 0.0044, + "macro_f1": 0.5492662787437439, + "num_tokens": 12662024.0, + "repeat_count": 0.0, + "routers_loss": 0.017352160066366196, + "skip_count": 2.0, + "step": 7852, + "text_loss": 0.23813043534755707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 36.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017595314458701084, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 12665751.0, + "repeat_count": 0.0, + "routers_loss": 0.005349365528672934, + "skip_count": 3.0, + "step": 7854, + "text_loss": 0.14920757710933685 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00017571749110601337, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12668823.0, + "repeat_count": 0.0, + "routers_loss": 0.0037689812015742064, + "skip_count": 2.0, + "step": 7856, + "text_loss": 0.2198697030544281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00017548196189068506, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12672367.0, + "repeat_count": 0.0, + "routers_loss": 0.0006363615393638611, + "skip_count": 0.0, + "step": 7858, + "text_loss": 0.5338839888572693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00017524655703128112, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 12675217.0, + "repeat_count": 0.0, + "routers_loss": 0.002691479865461588, + "skip_count": 0.0, + "step": 7860, + "text_loss": 0.17463763058185577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00017501127661800908, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12678796.0, + "repeat_count": 0.0, + "routers_loss": 0.002262329449877143, + "skip_count": 0.0, + "step": 7862, + "text_loss": 0.4637797474861145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.00017477612074102899, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12681631.0, + "repeat_count": 0.0, + "routers_loss": 0.00115531450137496, + "skip_count": 0.0, + "step": 7864, + "text_loss": 0.6089238524436951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00017454108949045295, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 12685647.0, + "repeat_count": 0.0, + "routers_loss": 0.00260268640704453, + "skip_count": 0.0, + "step": 7866, + "text_loss": 0.5876018404960632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.93924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.00017430618295634514, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 12688995.0, + "repeat_count": 0.0, + "routers_loss": 0.002731681102886796, + "skip_count": 0.0, + "step": 7868, + "text_loss": 0.35076001286506653 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 36.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00017407140122872262, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 12692100.0, + "repeat_count": 1.0, + "routers_loss": 0.003314645728096366, + "skip_count": 1.0, + "step": 7870, + "text_loss": 0.5313478112220764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.958027590255355, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017383674439755393, + "loss": 0.0069, + "macro_f1": 0.3272727429866791, + "num_tokens": 12695117.0, + "repeat_count": 0.0, + "routers_loss": 0.010385016910731792, + "skip_count": 1.0, + "step": 7872, + "text_loss": 0.5092368125915527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00017360221255276016, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 12697678.0, + "repeat_count": 0.0, + "routers_loss": 0.001273582922294736, + "skip_count": 0.0, + "step": 7874, + "text_loss": 0.5282881855964661 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.00017336780578421418, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 12702132.0, + "repeat_count": 0.0, + "routers_loss": 0.0007510313298553228, + "skip_count": 0.0, + "step": 7876, + "text_loss": 0.49093571305274963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001731335241817412, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 12705413.0, + "repeat_count": 0.0, + "routers_loss": 0.005138787440955639, + "skip_count": 2.0, + "step": 7878, + "text_loss": 0.7503541111946106 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001728993678351184, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 12708310.0, + "repeat_count": 2.0, + "routers_loss": 0.004379773512482643, + "skip_count": 0.0, + "step": 7880, + "text_loss": 0.5942456126213074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0001726653368340747, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 12711043.0, + "repeat_count": 0.0, + "routers_loss": 0.005271450616419315, + "skip_count": 2.0, + "step": 7882, + "text_loss": 0.348360538482666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00017243143126829163, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 12714473.0, + "repeat_count": 1.0, + "routers_loss": 0.0015764752170071006, + "skip_count": 1.0, + "step": 7884, + "text_loss": 0.45971861481666565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.000172197651227402, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12717832.0, + "repeat_count": 0.0, + "routers_loss": 0.00040649910806678236, + "skip_count": 0.0, + "step": 7886, + "text_loss": 0.5996841788291931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00017196399680099078, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 12720479.0, + "repeat_count": 0.0, + "routers_loss": 0.00473182974383235, + "skip_count": 2.0, + "step": 7888, + "text_loss": 0.40346208214759827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00017173046807859483, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 12723104.0, + "repeat_count": 0.0, + "routers_loss": 0.0020138369873166084, + "skip_count": 0.0, + "step": 7890, + "text_loss": 0.6878634095191956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.05165835045494, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0001714970651497027, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 12725967.0, + "repeat_count": 0.0, + "routers_loss": 0.008381367661058903, + "skip_count": 1.0, + "step": 7892, + "text_loss": 0.9161711931228638 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00017126378810375498, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 12728819.0, + "repeat_count": 1.0, + "routers_loss": 0.0037658829241991043, + "skip_count": 0.0, + "step": 7894, + "text_loss": 0.4447716772556305 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00017103063703014372, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 12731806.0, + "repeat_count": 0.0, + "routers_loss": 0.0022742559667676687, + "skip_count": 0.0, + "step": 7896, + "text_loss": 0.9140825867652893 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00017079761201821298, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 12734649.0, + "repeat_count": 0.0, + "routers_loss": 0.002157264854758978, + "skip_count": 0.0, + "step": 7898, + "text_loss": 0.268303781747818 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.0001705647131572583, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 12737889.0, + "repeat_count": 1.0, + "routers_loss": 0.01064873393625021, + "skip_count": 1.0, + "step": 7900, + "text_loss": 0.36009490489959717 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00017033194053652685, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 12740821.0, + "repeat_count": 1.0, + "routers_loss": 0.0062920586206018925, + "skip_count": 0.0, + "step": 7902, + "text_loss": 0.5301805138587952 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.00017009929424521782, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 12743876.0, + "repeat_count": 1.0, + "routers_loss": 0.0033694824669510126, + "skip_count": 1.0, + "step": 7904, + "text_loss": 1.026949167251587 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.117405341943055, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00016986677437248155, + "loss": 0.0071, + "macro_f1": 0.8817967176437378, + "num_tokens": 12747623.0, + "repeat_count": 2.0, + "routers_loss": 0.05076088383793831, + "skip_count": 3.0, + "step": 7906, + "text_loss": 0.33465588092803955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00016963438100742014, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 12751255.0, + "repeat_count": 0.0, + "routers_loss": 0.0005921403644606471, + "skip_count": 0.0, + "step": 7908, + "text_loss": 0.3498881757259369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.00016940211423908713, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 12754297.0, + "repeat_count": 0.0, + "routers_loss": 0.004132566973567009, + "skip_count": 0.0, + "step": 7910, + "text_loss": 0.2874198853969574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0001691699741564876, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12756969.0, + "repeat_count": 0.0, + "routers_loss": 0.0024724705144762993, + "skip_count": 1.0, + "step": 7912, + "text_loss": 0.10593545436859131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016893796084857806, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 12760261.0, + "repeat_count": 0.0, + "routers_loss": 0.002991671208292246, + "skip_count": 0.0, + "step": 7914, + "text_loss": 0.1331545114517212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00016870607440426643, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12762971.0, + "repeat_count": 0.0, + "routers_loss": 0.0018167285015806556, + "skip_count": 0.0, + "step": 7916, + "text_loss": 0.496826171875 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00016847431491241207, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 12765949.0, + "repeat_count": 1.0, + "routers_loss": 0.0033364067785441875, + "skip_count": 0.0, + "step": 7918, + "text_loss": 0.43522849678993225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0001682426824618256, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 12769201.0, + "repeat_count": 0.0, + "routers_loss": 0.001313596498221159, + "skip_count": 0.0, + "step": 7920, + "text_loss": 0.8691539168357849 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.19254476078662, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.00016801117714126908, + "loss": 0.0108, + "macro_f1": 0.6603773832321167, + "num_tokens": 12773308.0, + "repeat_count": 1.0, + "routers_loss": 0.02579287625849247, + "skip_count": 1.0, + "step": 7922, + "text_loss": 0.275301069021225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00016777979903945568, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 12776166.0, + "repeat_count": 0.0, + "routers_loss": 0.010501758195459843, + "skip_count": 1.0, + "step": 7924, + "text_loss": 0.32124993205070496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0001675485482450499, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 12779965.0, + "repeat_count": 0.0, + "routers_loss": 0.0063389060087502, + "skip_count": 2.0, + "step": 7926, + "text_loss": 0.2527695894241333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00016731742484666774, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 12783019.0, + "repeat_count": 0.0, + "routers_loss": 0.002796935848891735, + "skip_count": 0.0, + "step": 7928, + "text_loss": 0.18767669796943665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0001670864289328759, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 12786291.0, + "repeat_count": 0.0, + "routers_loss": 0.007973561994731426, + "skip_count": 2.0, + "step": 7930, + "text_loss": 0.29628485441207886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00016685556059219253, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 12789566.0, + "repeat_count": 4.0, + "routers_loss": 0.011405733413994312, + "skip_count": 6.0, + "step": 7932, + "text_loss": 0.16635073721408844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00016662481991308682, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 12792533.0, + "repeat_count": 0.0, + "routers_loss": 0.0012368770549073815, + "skip_count": 1.0, + "step": 7934, + "text_loss": 0.4196353852748871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000166394206983979, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 12795619.0, + "repeat_count": 0.0, + "routers_loss": 0.0036002211272716522, + "skip_count": 1.0, + "step": 7936, + "text_loss": 0.17559808492660522 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00016616372189324035, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 12799702.0, + "repeat_count": 1.0, + "routers_loss": 0.0039332108572125435, + "skip_count": 0.0, + "step": 7938, + "text_loss": 0.603410542011261 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00016593336472919324, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 12802704.0, + "repeat_count": 0.0, + "routers_loss": 0.0008303318754769862, + "skip_count": 0.0, + "step": 7940, + "text_loss": 0.5331749320030212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.28646903434106, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00016570313558011098, + "loss": 0.0058, + "macro_f1": 0.6601307392120361, + "num_tokens": 12805630.0, + "repeat_count": 1.0, + "routers_loss": 0.05092398822307587, + "skip_count": 2.0, + "step": 7942, + "text_loss": 0.17398510873317719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00016547303453421774, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 12809065.0, + "repeat_count": 0.0, + "routers_loss": 0.0006886976188980043, + "skip_count": 0.0, + "step": 7944, + "text_loss": 0.3419797718524933 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.00016524306167968878, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 12812641.0, + "repeat_count": 1.0, + "routers_loss": 0.005634502973407507, + "skip_count": 3.0, + "step": 7946, + "text_loss": 0.5877651572227478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00016501321710465005, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 12815527.0, + "repeat_count": 0.0, + "routers_loss": 0.0020598487462848425, + "skip_count": 0.0, + "step": 7948, + "text_loss": 0.3558528423309326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0001647835008971783, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12819103.0, + "repeat_count": 0.0, + "routers_loss": 0.005946476943790913, + "skip_count": 2.0, + "step": 7950, + "text_loss": 0.5800213813781738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00016455391314530154, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12822423.0, + "repeat_count": 0.0, + "routers_loss": 0.010360358282923698, + "skip_count": 2.0, + "step": 7952, + "text_loss": 0.278255820274353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00016432445393699802, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 12826180.0, + "repeat_count": 0.0, + "routers_loss": 0.003017681185156107, + "skip_count": 0.0, + "step": 7954, + "text_loss": 0.1571389138698578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00016409512336019698, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12829196.0, + "repeat_count": 0.0, + "routers_loss": 0.0008854938205331564, + "skip_count": 0.0, + "step": 7956, + "text_loss": 0.2776578366756439 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.00016386592150277834, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 12831983.0, + "repeat_count": 0.0, + "routers_loss": 0.0023990103509277105, + "skip_count": 0.0, + "step": 7958, + "text_loss": 0.46686989068984985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 37.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0001636368484525727, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 12834889.0, + "repeat_count": 0.0, + "routers_loss": 0.009835032746195793, + "skip_count": 5.0, + "step": 7960, + "text_loss": 0.22224856913089752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00016340790429736118, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 12837950.0, + "repeat_count": 0.0, + "routers_loss": 0.0018618656322360039, + "skip_count": 0.0, + "step": 7962, + "text_loss": 0.5101882815361023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.00016317908912487578, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 12840981.0, + "repeat_count": 1.0, + "routers_loss": 0.001275144051760435, + "skip_count": 1.0, + "step": 7964, + "text_loss": 0.40567103028297424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00016295040302279873, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12844044.0, + "repeat_count": 0.0, + "routers_loss": 0.003117429558187723, + "skip_count": 2.0, + "step": 7966, + "text_loss": 0.6888198852539062 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00016272184607876312, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 12847350.0, + "repeat_count": 2.0, + "routers_loss": 0.006585797294974327, + "skip_count": 4.0, + "step": 7968, + "text_loss": 0.19813506305217743 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0001624934183803523, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 12850285.0, + "repeat_count": 1.0, + "routers_loss": 0.0043576788157224655, + "skip_count": 1.0, + "step": 7970, + "text_loss": 0.6108269691467285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 37.427355444672735, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00016226512001510024, + "loss": 0.0039, + "macro_f1": 0.5492662787437439, + "num_tokens": 12853993.0, + "repeat_count": 0.0, + "routers_loss": 0.011879517696797848, + "skip_count": 2.0, + "step": 7972, + "text_loss": 0.42478689551353455 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00016203695107049117, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 12857022.0, + "repeat_count": 0.0, + "routers_loss": 0.0016375730047002435, + "skip_count": 0.0, + "step": 7974, + "text_loss": 0.5130020976066589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0001618089116339601, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 12860764.0, + "repeat_count": 0.0, + "routers_loss": 0.0006649247952736914, + "skip_count": 0.0, + "step": 7976, + "text_loss": 1.0629136562347412 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.455532726739065, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00016158100179289208, + "loss": 0.0062, + "macro_f1": 0.6603773832321167, + "num_tokens": 12864066.0, + "repeat_count": 1.0, + "routers_loss": 0.03140667825937271, + "skip_count": 1.0, + "step": 7978, + "text_loss": 0.4241345226764679 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 37.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0001613532216346226, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 12867555.0, + "repeat_count": 0.0, + "routers_loss": 0.010257012210786343, + "skip_count": 4.0, + "step": 7980, + "text_loss": 0.6085613369941711 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0001611255712464374, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 12871415.0, + "repeat_count": 0.0, + "routers_loss": 0.00783725269138813, + "skip_count": 1.0, + "step": 7982, + "text_loss": 0.15661844611167908 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.017578125, + "learning_rate": 0.00016089805071557256, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 12874195.0, + "repeat_count": 1.0, + "routers_loss": 0.0027650597039610147, + "skip_count": 2.0, + "step": 7984, + "text_loss": 0.4938865005970001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.49310243616085, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016067066012921439, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 12878084.0, + "repeat_count": 1.0, + "routers_loss": 0.04647083953022957, + "skip_count": 0.0, + "step": 7986, + "text_loss": 0.2973119020462036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.00016044339957449938, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 12881182.0, + "repeat_count": 0.0, + "routers_loss": 0.002192265819758177, + "skip_count": 0.0, + "step": 7988, + "text_loss": 0.2623208165168762 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00016021626913851418, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 12884028.0, + "repeat_count": 0.0, + "routers_loss": 0.0023096329532563686, + "skip_count": 0.0, + "step": 7990, + "text_loss": 0.3752247989177704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.52127971822718, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00015998926890829562, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 12887759.0, + "repeat_count": 0.0, + "routers_loss": 0.03038526326417923, + "skip_count": 1.0, + "step": 7992, + "text_loss": 0.2609226405620575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0001597623989708306, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12890976.0, + "repeat_count": 0.0, + "routers_loss": 0.0015199477784335613, + "skip_count": 0.0, + "step": 7994, + "text_loss": 0.6512867212295532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00015953565941305615, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12894112.0, + "repeat_count": 0.0, + "routers_loss": 0.0024166766088455915, + "skip_count": 0.0, + "step": 7996, + "text_loss": 0.5539866089820862 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0001593090503218591, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 12896857.0, + "repeat_count": 1.0, + "routers_loss": 0.005081235896795988, + "skip_count": 2.0, + "step": 7998, + "text_loss": 0.6631022691726685 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00015908257178407682, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12900075.0, + "repeat_count": 1.0, + "routers_loss": 0.0024711282458156347, + "skip_count": 0.0, + "step": 8000, + "text_loss": 0.3309785723686218 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.5682418550044, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.00015885622388649617, + "loss": 0.0059, + "macro_f1": 0.6601307392120361, + "num_tokens": 12903845.0, + "repeat_count": 1.0, + "routers_loss": 0.04024988412857056, + "skip_count": 2.0, + "step": 8002, + "text_loss": 0.2384071946144104 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.00015863000671585405, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 12907694.0, + "repeat_count": 1.0, + "routers_loss": 0.001953886589035392, + "skip_count": 2.0, + "step": 8004, + "text_loss": 0.5001366138458252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00015840392035883726, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 12910871.0, + "repeat_count": 0.0, + "routers_loss": 0.002982128644362092, + "skip_count": 2.0, + "step": 8006, + "text_loss": 0.2589346170425415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0001581779649020827, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12914484.0, + "repeat_count": 0.0, + "routers_loss": 0.0009384988807141781, + "skip_count": 0.0, + "step": 8008, + "text_loss": 0.5727795362472534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.00015795214043217654, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12917480.0, + "repeat_count": 0.0, + "routers_loss": 0.008854437619447708, + "skip_count": 2.0, + "step": 8010, + "text_loss": 0.24354904890060425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.00015772644703565563, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 12920383.0, + "repeat_count": 0.0, + "routers_loss": 0.001689503900706768, + "skip_count": 0.0, + "step": 8012, + "text_loss": 0.5372336506843567 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.00015750088479900588, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 12923886.0, + "repeat_count": 0.0, + "routers_loss": 0.002284591319039464, + "skip_count": 0.0, + "step": 8014, + "text_loss": 0.1708722710609436 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 37.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00015727545380866316, + "loss": 0.0042, + "macro_f1": 1.0, + "num_tokens": 12926998.0, + "repeat_count": 1.0, + "routers_loss": 0.004594483878463507, + "skip_count": 4.0, + "step": 8016, + "text_loss": 0.26784324645996094 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0001570501541510131, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 12929726.0, + "repeat_count": 1.0, + "routers_loss": 0.0021998141892254353, + "skip_count": 0.0, + "step": 8018, + "text_loss": 0.8051869869232178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00015682498591239086, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12932182.0, + "repeat_count": 0.0, + "routers_loss": 0.0032623414881527424, + "skip_count": 1.0, + "step": 8020, + "text_loss": 0.8431181907653809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00015659994917908144, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 12935338.0, + "repeat_count": 0.0, + "routers_loss": 0.0014909361489117146, + "skip_count": 1.0, + "step": 8022, + "text_loss": 0.6168642640113831 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0001563750440373191, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 12938484.0, + "repeat_count": 0.0, + "routers_loss": 0.0010295510292053223, + "skip_count": 0.0, + "step": 8024, + "text_loss": 0.2694014608860016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 37.68095098326974, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.029296875, + "learning_rate": 0.00015615027057328828, + "loss": 0.0066, + "macro_f1": 0.5492662787437439, + "num_tokens": 12942045.0, + "repeat_count": 0.0, + "routers_loss": 0.018341995775699615, + "skip_count": 2.0, + "step": 8026, + "text_loss": 0.8151478171348572 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 37.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0001559256288731224, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 12945547.0, + "repeat_count": 2.0, + "routers_loss": 0.0023289949167519808, + "skip_count": 1.0, + "step": 8028, + "text_loss": 0.613464891910553 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.00015570111902290463, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 12949544.0, + "repeat_count": 0.0, + "routers_loss": 0.006635872647166252, + "skip_count": 2.0, + "step": 8030, + "text_loss": 0.17417465150356293 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.00015547674110866756, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 12952838.0, + "repeat_count": 1.0, + "routers_loss": 0.006023989990353584, + "skip_count": 1.0, + "step": 8032, + "text_loss": 0.4801837205886841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.00015525249521639319, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 12956329.0, + "repeat_count": 0.0, + "routers_loss": 0.005706884432584047, + "skip_count": 0.0, + "step": 8034, + "text_loss": 0.2028084248304367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.000155028381432013, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 12959122.0, + "repeat_count": 0.0, + "routers_loss": 0.003527123713865876, + "skip_count": 2.0, + "step": 8036, + "text_loss": 0.39474430680274963 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.00015480439984140776, + "loss": 0.0029, + "macro_f1": 1.0, + "num_tokens": 12962546.0, + "repeat_count": 1.0, + "routers_loss": 0.010415437631309032, + "skip_count": 2.0, + "step": 8038, + "text_loss": 0.20412345230579376 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0001545805505304077, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12965861.0, + "repeat_count": 0.0, + "routers_loss": 0.001566931139677763, + "skip_count": 0.0, + "step": 8040, + "text_loss": 0.5129821300506592 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 37.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0001543568335847923, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 12968677.0, + "repeat_count": 3.0, + "routers_loss": 0.0037196793127804995, + "skip_count": 0.0, + "step": 8042, + "text_loss": 0.755020260810852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.00015413324909029031, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 12972001.0, + "repeat_count": 0.0, + "routers_loss": 0.0010940275387838483, + "skip_count": 0.0, + "step": 8044, + "text_loss": 0.48672133684158325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00015390979713257968, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 12974765.0, + "repeat_count": 0.0, + "routers_loss": 0.011106903664767742, + "skip_count": 1.0, + "step": 8046, + "text_loss": 0.1727766990661621 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 37.78426768417963, + "f1_execute": 0.949999988079071, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.048828125, + "learning_rate": 0.00015368647779728757, + "loss": 0.006, + "macro_f1": 0.886363685131073, + "num_tokens": 12979127.0, + "repeat_count": 3.0, + "routers_loss": 0.05134248360991478, + "skip_count": 6.0, + "step": 8048, + "text_loss": 0.33233317732810974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00015346329116999057, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 12982812.0, + "repeat_count": 0.0, + "routers_loss": 0.0027500339783728123, + "skip_count": 0.0, + "step": 8050, + "text_loss": 0.8176849484443665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.80305253889052, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00015324023733621412, + "loss": 0.005, + "macro_f1": 0.32098764181137085, + "num_tokens": 12985740.0, + "repeat_count": 0.0, + "routers_loss": 0.030734945088624954, + "skip_count": 2.0, + "step": 8052, + "text_loss": 0.38721024990081787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.00015301731638143285, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 12988646.0, + "repeat_count": 0.0, + "routers_loss": 0.002358534839004278, + "skip_count": 2.0, + "step": 8054, + "text_loss": 0.5656245946884155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0001527945283910705, + "loss": 0.0074, + "macro_f1": 1.0, + "num_tokens": 12991518.0, + "repeat_count": 2.0, + "routers_loss": 0.007991814985871315, + "skip_count": 3.0, + "step": 8056, + "text_loss": 0.26438817381858826 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.00015257187345049983, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 12994847.0, + "repeat_count": 1.0, + "routers_loss": 0.011761264875531197, + "skip_count": 1.0, + "step": 8058, + "text_loss": 0.1801673173904419 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 37.8406222483123, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0001523493516450427, + "loss": 0.004, + "macro_f1": 0.8823530077934265, + "num_tokens": 12997874.0, + "repeat_count": 1.0, + "routers_loss": 0.021669765934348106, + "skip_count": 2.0, + "step": 8060, + "text_loss": 0.3278379738330841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0001521269630599698, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 13000504.0, + "repeat_count": 0.0, + "routers_loss": 0.002388916676864028, + "skip_count": 0.0, + "step": 8062, + "text_loss": 0.5396623611450195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00015190470778050086, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 13003620.0, + "repeat_count": 0.0, + "routers_loss": 0.007719808723777533, + "skip_count": 1.0, + "step": 8064, + "text_loss": 0.1989232450723648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00015168258589180462, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 13007410.0, + "repeat_count": 0.0, + "routers_loss": 0.0007461659261025488, + "skip_count": 0.0, + "step": 8066, + "text_loss": 0.5293997526168823 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.00015146059747899848, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 13010240.0, + "repeat_count": 1.0, + "routers_loss": 0.005515575874596834, + "skip_count": 0.0, + "step": 8068, + "text_loss": 0.2776186466217041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00015123874262714892, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13012728.0, + "repeat_count": 0.0, + "routers_loss": 0.0026730166282504797, + "skip_count": 0.0, + "step": 8070, + "text_loss": 0.5902766585350037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.00015101702142127088, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 13015616.0, + "repeat_count": 0.0, + "routers_loss": 0.002244985429570079, + "skip_count": 0.0, + "step": 8072, + "text_loss": 0.21447396278381348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.00015079543394632878, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 13019846.0, + "repeat_count": 0.0, + "routers_loss": 0.001963787479326129, + "skip_count": 0.0, + "step": 8074, + "text_loss": 0.22974267601966858 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 37.915761667155856, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.053955078125, + "learning_rate": 0.00015057398028723513, + "loss": 0.0064, + "macro_f1": 0.5492662787437439, + "num_tokens": 13023036.0, + "repeat_count": 0.0, + "routers_loss": 0.02271878905594349, + "skip_count": 2.0, + "step": 8076, + "text_loss": 0.26458361744880676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.00015035266052885137, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 13025840.0, + "repeat_count": 0.0, + "routers_loss": 0.0011732397833839059, + "skip_count": 0.0, + "step": 8078, + "text_loss": 0.44129177927970886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0001501314747559877, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 13030031.0, + "repeat_count": 1.0, + "routers_loss": 0.015655985102057457, + "skip_count": 2.0, + "step": 8080, + "text_loss": 0.28889161348342896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00014991042305340286, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 13033603.0, + "repeat_count": 0.0, + "routers_loss": 0.0012988687958568335, + "skip_count": 0.0, + "step": 8082, + "text_loss": 0.16362667083740234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00014968950550580434, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 13036931.0, + "repeat_count": 0.0, + "routers_loss": 0.002425852930173278, + "skip_count": 0.0, + "step": 8084, + "text_loss": 0.35900676250457764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0001494687221978482, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 13040637.0, + "repeat_count": 0.0, + "routers_loss": 0.004092676565051079, + "skip_count": 1.0, + "step": 8086, + "text_loss": 0.20662656426429749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00014924807321413893, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 13043855.0, + "repeat_count": 0.0, + "routers_loss": 0.0009040542645379901, + "skip_count": 0.0, + "step": 8088, + "text_loss": 0.30341213941574097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0001490275586392296, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13046903.0, + "repeat_count": 0.0, + "routers_loss": 0.0019248841563239694, + "skip_count": 0.0, + "step": 8090, + "text_loss": 0.4299648702144623 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000148807178557622, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 13050219.0, + "repeat_count": 0.0, + "routers_loss": 0.0008314658771269023, + "skip_count": 0.0, + "step": 8092, + "text_loss": 0.4521652162075043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00014858693305376598, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 13053076.0, + "repeat_count": 0.0, + "routers_loss": 0.0007470731507055461, + "skip_count": 0.0, + "step": 8094, + "text_loss": 0.46265852451324463 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00014836682221206, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 13056170.0, + "repeat_count": 1.0, + "routers_loss": 0.003292408073320985, + "skip_count": 0.0, + "step": 8096, + "text_loss": 0.6483868956565857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00014814684611685124, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13059181.0, + "repeat_count": 0.0, + "routers_loss": 0.001357200788334012, + "skip_count": 0.0, + "step": 8098, + "text_loss": 0.43141183257102966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.00014792700485243476, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 13062124.0, + "repeat_count": 0.0, + "routers_loss": 0.0030062920413911343, + "skip_count": 0.0, + "step": 8100, + "text_loss": 0.26022693514823914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0001477072985030542, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 13065273.0, + "repeat_count": 0.0, + "routers_loss": 0.0006919128354638815, + "skip_count": 0.0, + "step": 8102, + "text_loss": 0.5927232503890991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00014748772715290144, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 13068346.0, + "repeat_count": 0.0, + "routers_loss": 0.005062389187514782, + "skip_count": 0.0, + "step": 8104, + "text_loss": 0.1255214959383011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.00014726829088611664, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13071384.0, + "repeat_count": 0.0, + "routers_loss": 0.0005492564523592591, + "skip_count": 0.0, + "step": 8106, + "text_loss": 0.6445038914680481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.00014704898978678817, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13074667.0, + "repeat_count": 0.0, + "routers_loss": 0.002470226027071476, + "skip_count": 0.0, + "step": 8108, + "text_loss": 0.5019628405570984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.00014682982393895256, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 13077566.0, + "repeat_count": 0.0, + "routers_loss": 0.0008262090268544853, + "skip_count": 0.0, + "step": 8110, + "text_loss": 0.6075460314750671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00014661079342659467, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 13081042.0, + "repeat_count": 0.0, + "routers_loss": 0.00034181721275672317, + "skip_count": 0.0, + "step": 8112, + "text_loss": 0.7349393963813782 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0001463918983336474, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 13084151.0, + "repeat_count": 1.0, + "routers_loss": 0.01406828872859478, + "skip_count": 2.0, + "step": 8114, + "text_loss": 0.3122454285621643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017822265625, + "learning_rate": 0.00014617313874399173, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13086998.0, + "repeat_count": 0.0, + "routers_loss": 0.002714085392653942, + "skip_count": 0.0, + "step": 8116, + "text_loss": 0.6545852422714233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00014595451474145677, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 13090017.0, + "repeat_count": 0.0, + "routers_loss": 0.0073202489875257015, + "skip_count": 0.0, + "step": 8118, + "text_loss": 0.5487201809883118 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.00014573602640981947, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 13093651.0, + "repeat_count": 0.0, + "routers_loss": 0.000667977670673281, + "skip_count": 0.0, + "step": 8120, + "text_loss": 0.672166109085083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00014551767383280535, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 13097139.0, + "repeat_count": 0.0, + "routers_loss": 0.0020584615413099527, + "skip_count": 0.0, + "step": 8122, + "text_loss": 0.1996239423751831 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.14088641033167, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.00014529945709408726, + "loss": 0.0069, + "macro_f1": 0.6598639488220215, + "num_tokens": 13100493.0, + "repeat_count": 1.0, + "routers_loss": 0.013855135068297386, + "skip_count": 3.0, + "step": 8124, + "text_loss": 0.4099486768245697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0001450813762772863, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13103488.0, + "repeat_count": 0.0, + "routers_loss": 0.0014984552981331944, + "skip_count": 0.0, + "step": 8126, + "text_loss": 0.6307108402252197 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.00014486343146597152, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 13106445.0, + "repeat_count": 1.0, + "routers_loss": 0.00430954247713089, + "skip_count": 0.0, + "step": 8128, + "text_loss": 0.6226127743721008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.00014464562274365972, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 13109258.0, + "repeat_count": 0.0, + "routers_loss": 0.003711461555212736, + "skip_count": 1.0, + "step": 8130, + "text_loss": 0.17819052934646606 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.17845611975345, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00014442795019381567, + "loss": 0.0064, + "macro_f1": 0.6603773832321167, + "num_tokens": 13114206.0, + "repeat_count": 1.0, + "routers_loss": 0.015719098970294, + "skip_count": 1.0, + "step": 8132, + "text_loss": 0.28450697660446167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00014421041389985184, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13117351.0, + "repeat_count": 0.0, + "routers_loss": 0.0013113922905176878, + "skip_count": 0.0, + "step": 8134, + "text_loss": 0.310830682516098 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.00014399301394512858, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 13120228.0, + "repeat_count": 1.0, + "routers_loss": 0.001965439412742853, + "skip_count": 1.0, + "step": 8136, + "text_loss": 0.8635116815567017 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00014377575041295393, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 13123380.0, + "repeat_count": 1.0, + "routers_loss": 0.004898902028799057, + "skip_count": 2.0, + "step": 8138, + "text_loss": 0.5302467346191406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.0001435586233865836, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13126875.0, + "repeat_count": 0.0, + "routers_loss": 0.00031845085322856903, + "skip_count": 0.0, + "step": 8140, + "text_loss": 0.5913560390472412 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0001433416329492213, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 13129563.0, + "repeat_count": 1.0, + "routers_loss": 0.00298812473192811, + "skip_count": 1.0, + "step": 8142, + "text_loss": 0.5153398513793945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00014312477918401807, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 13132608.0, + "repeat_count": 0.0, + "routers_loss": 0.0026608197949826717, + "skip_count": 1.0, + "step": 8144, + "text_loss": 0.4554155766963959 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00014290806217407272, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 13136204.0, + "repeat_count": 1.0, + "routers_loss": 0.0027651884593069553, + "skip_count": 1.0, + "step": 8146, + "text_loss": 0.6349515318870544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00014269148200243148, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 13138895.0, + "repeat_count": 0.0, + "routers_loss": 0.0006579195614904165, + "skip_count": 0.0, + "step": 8148, + "text_loss": 0.4629364013671875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.26298796595245, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.00014247503875208846, + "loss": 0.0059, + "macro_f1": 0.3272727429866791, + "num_tokens": 13142500.0, + "repeat_count": 1.0, + "routers_loss": 0.023065708577632904, + "skip_count": 0.0, + "step": 8150, + "text_loss": 0.4962928593158722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.00014225873250598496, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 13146203.0, + "repeat_count": 0.0, + "routers_loss": 0.007397830951958895, + "skip_count": 1.0, + "step": 8152, + "text_loss": 0.3225953280925751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00014204256334700988, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 13149517.0, + "repeat_count": 0.0, + "routers_loss": 0.004839105997234583, + "skip_count": 1.0, + "step": 8154, + "text_loss": 0.18435558676719666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 38.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00014182653135799995, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 13152643.0, + "repeat_count": 0.0, + "routers_loss": 0.0028303388971835375, + "skip_count": 4.0, + "step": 8156, + "text_loss": 0.5836900472640991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0001416106366217389, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 13155213.0, + "repeat_count": 0.0, + "routers_loss": 0.0004012314020656049, + "skip_count": 0.0, + "step": 8158, + "text_loss": 0.3723861575126648 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 20.0, + "epoch": 38.30995010272967, + "f1_execute": 0.9714285731315613, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0001413948792209579, + "loss": 0.0065, + "macro_f1": 0.8793651461601257, + "num_tokens": 13158440.0, + "repeat_count": 2.0, + "routers_loss": 0.04377155378460884, + "skip_count": 9.0, + "step": 8160, + "text_loss": 0.32476910948753357 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0001411792592383357, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13162651.0, + "repeat_count": 0.0, + "routers_loss": 0.0011163362069055438, + "skip_count": 0.0, + "step": 8162, + "text_loss": 0.4890389144420624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.32873495744057, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00014096377675649823, + "loss": 0.0055, + "macro_f1": 0.6603773832321167, + "num_tokens": 13165406.0, + "repeat_count": 1.0, + "routers_loss": 0.012117774225771427, + "skip_count": 1.0, + "step": 8164, + "text_loss": 0.7763246893882751 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.33812738479601, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00014074843185801883, + "loss": 0.004, + "macro_f1": 0.9262410998344421, + "num_tokens": 13168402.0, + "repeat_count": 3.0, + "routers_loss": 0.009951545856893063, + "skip_count": 2.0, + "step": 8166, + "text_loss": 0.5038266777992249 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00014053322462541802, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 13171423.0, + "repeat_count": 1.0, + "routers_loss": 0.0021372761111706495, + "skip_count": 1.0, + "step": 8168, + "text_loss": 0.5634724497795105 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00014031815514116354, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 13174713.0, + "repeat_count": 0.0, + "routers_loss": 0.0007417177548632026, + "skip_count": 0.0, + "step": 8170, + "text_loss": 0.4009707272052765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 38.36630466686234, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.035888671875, + "learning_rate": 0.00014010322348767057, + "loss": 0.0077, + "macro_f1": 0.5934640765190125, + "num_tokens": 13178012.0, + "repeat_count": 0.0, + "routers_loss": 0.01619168184697628, + "skip_count": 3.0, + "step": 8172, + "text_loss": 0.29182371497154236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00013988842974730137, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13181096.0, + "repeat_count": 0.0, + "routers_loss": 0.0037969043478369713, + "skip_count": 0.0, + "step": 8174, + "text_loss": 0.275851845741272 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00013967377400236515, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 13184116.0, + "repeat_count": 0.0, + "routers_loss": 0.0007759644067846239, + "skip_count": 0.0, + "step": 8176, + "text_loss": 0.7569663524627686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00013945925633511848, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 13187319.0, + "repeat_count": 0.0, + "routers_loss": 0.002708743792027235, + "skip_count": 0.0, + "step": 8178, + "text_loss": 0.4733831286430359 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.00013924487682776492, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 13190796.0, + "repeat_count": 0.0, + "routers_loss": 0.0005060714902356267, + "skip_count": 0.0, + "step": 8180, + "text_loss": 0.5663171410560608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.413266803639566, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0001390306355624551, + "loss": 0.0049, + "macro_f1": 0.3272727429866791, + "num_tokens": 13193705.0, + "repeat_count": 0.0, + "routers_loss": 0.02932601235806942, + "skip_count": 1.0, + "step": 8182, + "text_loss": 0.30700045824050903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0001388165326212867, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13196393.0, + "repeat_count": 0.0, + "routers_loss": 0.0011637522839009762, + "skip_count": 0.0, + "step": 8184, + "text_loss": 0.6897354125976562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00013860256808630427, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 13199526.0, + "repeat_count": 0.0, + "routers_loss": 0.0017184355529025197, + "skip_count": 0.0, + "step": 8186, + "text_loss": 0.6246579885482788 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.00013838874203949954, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 13202963.0, + "repeat_count": 0.0, + "routers_loss": 0.0026622721925377846, + "skip_count": 0.0, + "step": 8188, + "text_loss": 0.506066083908081 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00013817505456281099, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13207408.0, + "repeat_count": 0.0, + "routers_loss": 0.000543750764336437, + "skip_count": 0.0, + "step": 8190, + "text_loss": 0.5192428231239319 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0001379615057381241, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 13211073.0, + "repeat_count": 0.0, + "routers_loss": 0.0010060713393613696, + "skip_count": 0.0, + "step": 8192, + "text_loss": 0.5640166401863098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.00013774809564727104, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 13214203.0, + "repeat_count": 0.0, + "routers_loss": 0.005152868572622538, + "skip_count": 2.0, + "step": 8194, + "text_loss": 0.8643819689750671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.0001375348243720312, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 13217748.0, + "repeat_count": 0.0, + "routers_loss": 0.0017722113989293575, + "skip_count": 2.0, + "step": 8196, + "text_loss": 0.40500834584236145 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0001373216919941304, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 13221341.0, + "repeat_count": 1.0, + "routers_loss": 0.00999271310865879, + "skip_count": 3.0, + "step": 8198, + "text_loss": 0.2317391037940979 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00013710869859524143, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13224288.0, + "repeat_count": 0.0, + "routers_loss": 0.0016836341237649322, + "skip_count": 0.0, + "step": 8200, + "text_loss": 0.31873467564582825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00013689584425698376, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13227342.0, + "repeat_count": 0.0, + "routers_loss": 0.002255793660879135, + "skip_count": 0.0, + "step": 8202, + "text_loss": 0.13513202965259552 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0001366831290609235, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 13230912.0, + "repeat_count": 1.0, + "routers_loss": 0.0062925987876951694, + "skip_count": 4.0, + "step": 8204, + "text_loss": 0.3692396581172943 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00013647055308857353, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 13233961.0, + "repeat_count": 1.0, + "routers_loss": 0.0020471401512622833, + "skip_count": 0.0, + "step": 8206, + "text_loss": 0.5655510425567627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0001362581164213934, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 13237170.0, + "repeat_count": 0.0, + "routers_loss": 0.0009666495025157928, + "skip_count": 0.0, + "step": 8208, + "text_loss": 0.720582902431488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.00013604581914078922, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 13241020.0, + "repeat_count": 0.0, + "routers_loss": 0.0006306356517598033, + "skip_count": 0.0, + "step": 8210, + "text_loss": 0.5686481595039368 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.55415321397123, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.00013583366132811374, + "loss": 0.0058, + "macro_f1": 0.5492662787437439, + "num_tokens": 13244491.0, + "repeat_count": 2.0, + "routers_loss": 0.016230134293437004, + "skip_count": 0.0, + "step": 8212, + "text_loss": 0.55678790807724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00013562164306466624, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 13247551.0, + "repeat_count": 0.0, + "routers_loss": 0.003904943587258458, + "skip_count": 2.0, + "step": 8214, + "text_loss": 0.6521575450897217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00013540976443169244, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 13250863.0, + "repeat_count": 0.0, + "routers_loss": 0.002239734400063753, + "skip_count": 1.0, + "step": 8216, + "text_loss": 0.29757481813430786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.00013519802551038452, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 13254215.0, + "repeat_count": 0.0, + "routers_loss": 0.004978829529136419, + "skip_count": 2.0, + "step": 8218, + "text_loss": 0.30598193407058716 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00013498642638188157, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13257269.0, + "repeat_count": 0.0, + "routers_loss": 0.0040260558016598225, + "skip_count": 0.0, + "step": 8220, + "text_loss": 0.39327144622802734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00013477496712726862, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 13260573.0, + "repeat_count": 0.0, + "routers_loss": 0.002124674618244171, + "skip_count": 0.0, + "step": 8222, + "text_loss": 0.38342708349227905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00013456364782757718, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 13263684.0, + "repeat_count": 0.0, + "routers_loss": 0.00087209593039006, + "skip_count": 0.0, + "step": 8224, + "text_loss": 0.6338301301002502 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00013435246856378526, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 13266879.0, + "repeat_count": 1.0, + "routers_loss": 0.003183641703799367, + "skip_count": 0.0, + "step": 8226, + "text_loss": 0.6073583364486694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0169677734375, + "learning_rate": 0.00013414142941681718, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 13270679.0, + "repeat_count": 0.0, + "routers_loss": 0.001859338372014463, + "skip_count": 0.0, + "step": 8228, + "text_loss": 0.5427029132843018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0001339305304675435, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13273275.0, + "repeat_count": 0.0, + "routers_loss": 0.000655558833386749, + "skip_count": 0.0, + "step": 8230, + "text_loss": 0.29442915320396423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.00013371977179678113, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 13276205.0, + "repeat_count": 0.0, + "routers_loss": 0.0011499621905386448, + "skip_count": 0.0, + "step": 8232, + "text_loss": 0.5601125359535217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00013350915348529313, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 13279242.0, + "repeat_count": 0.0, + "routers_loss": 0.0019823790062218904, + "skip_count": 0.0, + "step": 8234, + "text_loss": 0.43674135208129883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.00013329867561378888, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 13282531.0, + "repeat_count": 0.0, + "routers_loss": 0.005772443953901529, + "skip_count": 3.0, + "step": 8236, + "text_loss": 0.4838809072971344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00013308833826292395, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 13286219.0, + "repeat_count": 0.0, + "routers_loss": 0.0038314659614115953, + "skip_count": 2.0, + "step": 8238, + "text_loss": 0.5002569556236267 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 38.685647196947464, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.031005859375, + "learning_rate": 0.00013287814151329987, + "loss": 0.0075, + "macro_f1": 0.9452888369560242, + "num_tokens": 13290348.0, + "repeat_count": 1.0, + "routers_loss": 0.04819172993302345, + "skip_count": 4.0, + "step": 8240, + "text_loss": 0.3099883198738098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00013266808544546438, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 13293644.0, + "repeat_count": 0.0, + "routers_loss": 0.010334883816540241, + "skip_count": 2.0, + "step": 8242, + "text_loss": 0.17672912776470184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.00013245817013991164, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 13296721.0, + "repeat_count": 0.0, + "routers_loss": 0.00162201386410743, + "skip_count": 0.0, + "step": 8244, + "text_loss": 0.7664286494255066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00013224839567708142, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 13299704.0, + "repeat_count": 0.0, + "routers_loss": 0.0039452011696994305, + "skip_count": 0.0, + "step": 8246, + "text_loss": 0.1827820986509323 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 38.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.00013203876213735972, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 13302553.0, + "repeat_count": 1.0, + "routers_loss": 0.006701917387545109, + "skip_count": 7.0, + "step": 8248, + "text_loss": 0.6020278930664062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0001318292696010785, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13305875.0, + "repeat_count": 0.0, + "routers_loss": 0.00968079548329115, + "skip_count": 2.0, + "step": 8250, + "text_loss": 0.2693248987197876 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00013161991814851571, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 13309115.0, + "repeat_count": 2.0, + "routers_loss": 0.008890608325600624, + "skip_count": 2.0, + "step": 8252, + "text_loss": 0.6325297355651855 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.00013141070785989517, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 13312219.0, + "repeat_count": 1.0, + "routers_loss": 0.00825794693082571, + "skip_count": 4.0, + "step": 8254, + "text_loss": 0.284396767616272 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00013120163881538677, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 13315214.0, + "repeat_count": 0.0, + "routers_loss": 0.003378969384357333, + "skip_count": 1.0, + "step": 8256, + "text_loss": 0.20296992361545563 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.77017904314646, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00013099271109510603, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 13319117.0, + "repeat_count": 1.0, + "routers_loss": 0.0164186954498291, + "skip_count": 0.0, + "step": 8258, + "text_loss": 0.21940068900585175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0001307839247791145, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 13321631.0, + "repeat_count": 0.0, + "routers_loss": 0.0053979759104549885, + "skip_count": 3.0, + "step": 8260, + "text_loss": 0.19442199170589447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00013057527994741946, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 13324759.0, + "repeat_count": 0.0, + "routers_loss": 0.0024567479267716408, + "skip_count": 0.0, + "step": 8262, + "text_loss": 0.5528824925422668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0001303667766799741, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13327554.0, + "repeat_count": 0.0, + "routers_loss": 0.002819873159751296, + "skip_count": 1.0, + "step": 8264, + "text_loss": 0.4418395757675171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.00013015841505667703, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 13331838.0, + "repeat_count": 0.0, + "routers_loss": 0.0030280952341854572, + "skip_count": 1.0, + "step": 8266, + "text_loss": 0.5263079404830933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 38.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0001299501951573731, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 13334968.0, + "repeat_count": 0.0, + "routers_loss": 0.001774887670762837, + "skip_count": 4.0, + "step": 8268, + "text_loss": 0.47985130548477173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00012974211706185247, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 13338052.0, + "repeat_count": 0.0, + "routers_loss": 0.007027842104434967, + "skip_count": 1.0, + "step": 8270, + "text_loss": 0.6588287949562073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00012953418084985107, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13341653.0, + "repeat_count": 0.0, + "routers_loss": 0.0026854060124605894, + "skip_count": 1.0, + "step": 8272, + "text_loss": 0.43156498670578003 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.00012932638660105038, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 13345173.0, + "repeat_count": 0.0, + "routers_loss": 0.0033325920812785625, + "skip_count": 0.0, + "step": 8274, + "text_loss": 0.1679086685180664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00012911873439507766, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 13348635.0, + "repeat_count": 0.0, + "routers_loss": 0.0016183287370949984, + "skip_count": 0.0, + "step": 8276, + "text_loss": 0.5907418131828308 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00012891122431150549, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 13351120.0, + "repeat_count": 0.0, + "routers_loss": 0.0049970983527600765, + "skip_count": 1.0, + "step": 8278, + "text_loss": 0.5437678694725037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.00012870385642985222, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 13353774.0, + "repeat_count": 0.0, + "routers_loss": 0.0027123154141008854, + "skip_count": 0.0, + "step": 8280, + "text_loss": 0.5742796659469604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00012849663082958158, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 13358236.0, + "repeat_count": 0.0, + "routers_loss": 0.0062842960469424725, + "skip_count": 0.0, + "step": 8282, + "text_loss": 0.2340863049030304 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.00012828954759010265, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13360994.0, + "repeat_count": 0.0, + "routers_loss": 0.0006564505747519433, + "skip_count": 0.0, + "step": 8284, + "text_loss": 0.45432794094085693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0001280826067907705, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13363665.0, + "repeat_count": 0.0, + "routers_loss": 0.001298630959354341, + "skip_count": 0.0, + "step": 8286, + "text_loss": 0.7439755201339722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00012787580851088493, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 13367412.0, + "repeat_count": 0.0, + "routers_loss": 0.00464112963527441, + "skip_count": 0.0, + "step": 8288, + "text_loss": 0.2854461669921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0001276691528296916, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 13370745.0, + "repeat_count": 0.0, + "routers_loss": 0.0006090773968026042, + "skip_count": 0.0, + "step": 8290, + "text_loss": 0.6663011312484741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.00012746263982638123, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13373396.0, + "repeat_count": 0.0, + "routers_loss": 0.0038922233507037163, + "skip_count": 0.0, + "step": 8292, + "text_loss": 0.3858443796634674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.93924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00012725626958009007, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 13376172.0, + "repeat_count": 0.0, + "routers_loss": 0.0016941255889832973, + "skip_count": 0.0, + "step": 8294, + "text_loss": 0.4758119285106659 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0001270500421698994, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 13379002.0, + "repeat_count": 1.0, + "routers_loss": 0.001703770598396659, + "skip_count": 0.0, + "step": 8296, + "text_loss": 0.7464606165885925 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00012684395767483626, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 13382221.0, + "repeat_count": 0.0, + "routers_loss": 0.001474690856412053, + "skip_count": 1.0, + "step": 8298, + "text_loss": 0.37309199571609497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00012663801617387245, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 13385276.0, + "repeat_count": 0.0, + "routers_loss": 0.004561704583466053, + "skip_count": 3.0, + "step": 8300, + "text_loss": 0.43284836411476135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 38.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00012643221774592518, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 13388321.0, + "repeat_count": 2.0, + "routers_loss": 0.005136100109666586, + "skip_count": 1.0, + "step": 8302, + "text_loss": 0.669730007648468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00012622656246985675, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 13391222.0, + "repeat_count": 0.0, + "routers_loss": 0.0028521555941551924, + "skip_count": 0.0, + "step": 8304, + "text_loss": 0.16773155331611633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00012602105042447471, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 13395297.0, + "repeat_count": 0.0, + "routers_loss": 0.0033424890134483576, + "skip_count": 2.0, + "step": 8306, + "text_loss": 0.1650846153497696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001258156816885316, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 13398482.0, + "repeat_count": 0.0, + "routers_loss": 0.0012481207959353924, + "skip_count": 0.0, + "step": 8308, + "text_loss": 0.37225499749183655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 39.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00012561045634072515, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 13402199.0, + "repeat_count": 0.0, + "routers_loss": 0.006243644282221794, + "skip_count": 3.0, + "step": 8310, + "text_loss": 0.16000206768512726 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00012540537445969807, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 13404950.0, + "repeat_count": 0.0, + "routers_loss": 0.004267443902790546, + "skip_count": 2.0, + "step": 8312, + "text_loss": 0.400174081325531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00012520043612403815, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 13407883.0, + "repeat_count": 0.0, + "routers_loss": 0.005013707559555769, + "skip_count": 2.0, + "step": 8314, + "text_loss": 0.1331731230020523 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 39.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00012499564141227798, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 13410563.0, + "repeat_count": 1.0, + "routers_loss": 0.00463570561259985, + "skip_count": 0.0, + "step": 8316, + "text_loss": 0.5098661184310913 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0001247909904028956, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 13413730.0, + "repeat_count": 1.0, + "routers_loss": 0.007066591177135706, + "skip_count": 1.0, + "step": 8318, + "text_loss": 0.8059925436973572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 39.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00012458648317431348, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 13416425.0, + "repeat_count": 0.0, + "routers_loss": 0.004210594110190868, + "skip_count": 3.0, + "step": 8320, + "text_loss": 0.6559522151947021 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0001243821198048992, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 13419851.0, + "repeat_count": 1.0, + "routers_loss": 0.005613257177174091, + "skip_count": 2.0, + "step": 8322, + "text_loss": 0.2783811688423157 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00012417790037296523, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 13422588.0, + "repeat_count": 0.0, + "routers_loss": 0.00233642989769578, + "skip_count": 1.0, + "step": 8324, + "text_loss": 0.7659147381782532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00012397382495676874, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 13425275.0, + "repeat_count": 0.0, + "routers_loss": 0.0013295465614646673, + "skip_count": 0.0, + "step": 8326, + "text_loss": 0.5693745017051697 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0001237698936345119, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 13428314.0, + "repeat_count": 1.0, + "routers_loss": 0.005712272133678198, + "skip_count": 1.0, + "step": 8328, + "text_loss": 0.8581340909004211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00012356610648434153, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 13431453.0, + "repeat_count": 0.0, + "routers_loss": 0.0015835616504773498, + "skip_count": 0.0, + "step": 8330, + "text_loss": 0.1395341008901596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.00012336246358434928, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 13434566.0, + "repeat_count": 0.0, + "routers_loss": 0.0012973316479474306, + "skip_count": 0.0, + "step": 8332, + "text_loss": 0.7125005125999451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.00012315896501257145, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 13438056.0, + "repeat_count": 0.0, + "routers_loss": 0.0005822008824907243, + "skip_count": 0.0, + "step": 8334, + "text_loss": 0.7730510234832764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00012295561084698915, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 13441390.0, + "repeat_count": 0.0, + "routers_loss": 0.00547185679897666, + "skip_count": 1.0, + "step": 8336, + "text_loss": 0.3927873373031616 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.000122752401165528, + "loss": 0.0022, + "macro_f1": 0.3333333432674408, + "num_tokens": 13443864.0, + "repeat_count": 0.0, + "routers_loss": 0.0011191967641934752, + "skip_count": 0.0, + "step": 8338, + "text_loss": 0.3996548354625702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00012254933604605828, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 13447070.0, + "repeat_count": 0.0, + "routers_loss": 0.0005196621641516685, + "skip_count": 0.0, + "step": 8340, + "text_loss": 0.5597847104072571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00012234641556639508, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 13450522.0, + "repeat_count": 0.0, + "routers_loss": 0.003857341594994068, + "skip_count": 2.0, + "step": 8342, + "text_loss": 0.14400488138198853 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00012214363980429793, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 13453578.0, + "repeat_count": 1.0, + "routers_loss": 0.006664265412837267, + "skip_count": 3.0, + "step": 8344, + "text_loss": 0.27675092220306396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.00012194100883747078, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 13456480.0, + "repeat_count": 0.0, + "routers_loss": 0.003549816319718957, + "skip_count": 0.0, + "step": 8346, + "text_loss": 0.21776801347732544 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00012173852274356217, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 13459859.0, + "repeat_count": 1.0, + "routers_loss": 0.00446992926299572, + "skip_count": 3.0, + "step": 8348, + "text_loss": 0.1828736811876297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00012153618160016527, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 13463104.0, + "repeat_count": 0.0, + "routers_loss": 0.0024826989974826574, + "skip_count": 1.0, + "step": 8350, + "text_loss": 0.15649555623531342 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0001213339854848175, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 13467051.0, + "repeat_count": 0.0, + "routers_loss": 0.0021385846193879843, + "skip_count": 1.0, + "step": 8352, + "text_loss": 0.49281737208366394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.00012113193447500081, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 13470411.0, + "repeat_count": 0.0, + "routers_loss": 0.0014382716035470366, + "skip_count": 1.0, + "step": 8354, + "text_loss": 0.5984349846839905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00012093002864814151, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 13474666.0, + "repeat_count": 0.0, + "routers_loss": 0.008536498062312603, + "skip_count": 1.0, + "step": 8356, + "text_loss": 0.2851131856441498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00012072826808161036, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13477754.0, + "repeat_count": 0.0, + "routers_loss": 0.0027286717668175697, + "skip_count": 0.0, + "step": 8358, + "text_loss": 0.5987376570701599 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0001205266528527223, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 13481151.0, + "repeat_count": 0.0, + "routers_loss": 0.002780565759167075, + "skip_count": 1.0, + "step": 8360, + "text_loss": 0.1847199648618698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00012032518303873674, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 13484050.0, + "repeat_count": 0.0, + "routers_loss": 0.0006186611135490239, + "skip_count": 0.0, + "step": 8362, + "text_loss": 0.6229772567749023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 39.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.00012012385871685716, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 13488551.0, + "repeat_count": 0.0, + "routers_loss": 0.00956071075052023, + "skip_count": 5.0, + "step": 8364, + "text_loss": 0.2810790538787842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00011992267996423162, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13491420.0, + "repeat_count": 0.0, + "routers_loss": 0.008410792797803879, + "skip_count": 2.0, + "step": 8366, + "text_loss": 0.20509617030620575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00011972164685795212, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 13494736.0, + "repeat_count": 0.0, + "routers_loss": 0.00762166129425168, + "skip_count": 1.0, + "step": 8368, + "text_loss": 0.24739402532577515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.295861461696504, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.00011952075947505486, + "loss": 0.0051, + "macro_f1": 0.3272727429866791, + "num_tokens": 13498363.0, + "repeat_count": 0.0, + "routers_loss": 0.010674391873180866, + "skip_count": 1.0, + "step": 8370, + "text_loss": 0.31931644678115845 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 39.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0001193200178925204, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 13501029.0, + "repeat_count": 2.0, + "routers_loss": 0.0041843741200864315, + "skip_count": 1.0, + "step": 8372, + "text_loss": 0.5103049278259277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00011911942218727312, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 13503854.0, + "repeat_count": 0.0, + "routers_loss": 0.0006344785797409713, + "skip_count": 0.0, + "step": 8374, + "text_loss": 0.4914432764053345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00011891897243618183, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 13508316.0, + "repeat_count": 0.0, + "routers_loss": 0.0003527739318087697, + "skip_count": 0.0, + "step": 8376, + "text_loss": 0.5317551493644714 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00011871866871605913, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 13512603.0, + "repeat_count": 0.0, + "routers_loss": 0.001071247854270041, + "skip_count": 0.0, + "step": 8378, + "text_loss": 0.6693558096885681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00011851851110366185, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 13515928.0, + "repeat_count": 0.0, + "routers_loss": 0.000924977008253336, + "skip_count": 1.0, + "step": 8380, + "text_loss": 0.8004939556121826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0001183184996756908, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13518548.0, + "repeat_count": 0.0, + "routers_loss": 0.0017637151759117842, + "skip_count": 0.0, + "step": 8382, + "text_loss": 0.5012105107307434 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 39.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00011811863450879063, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 13522155.0, + "repeat_count": 2.0, + "routers_loss": 0.0011129514314234257, + "skip_count": 0.0, + "step": 8384, + "text_loss": 0.3866073489189148 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 39.371000880540066, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.00011791891567955009, + "loss": 0.0046, + "macro_f1": 0.8814815282821655, + "num_tokens": 13525352.0, + "repeat_count": 2.0, + "routers_loss": 0.042801812291145325, + "skip_count": 4.0, + "step": 8386, + "text_loss": 0.18817944824695587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018798828125, + "learning_rate": 0.00011771934326450173, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13528537.0, + "repeat_count": 0.0, + "routers_loss": 0.0006869474309496582, + "skip_count": 0.0, + "step": 8388, + "text_loss": 0.6407818794250488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00011751991734012229, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 13531650.0, + "repeat_count": 0.0, + "routers_loss": 0.0008001072565093637, + "skip_count": 0.0, + "step": 8390, + "text_loss": 0.5149344205856323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00011732063798283204, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 13535071.0, + "repeat_count": 0.0, + "routers_loss": 0.0006921148742549121, + "skip_count": 0.0, + "step": 8392, + "text_loss": 0.5906356573104858 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00011712150526899523, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 13537741.0, + "repeat_count": 0.0, + "routers_loss": 0.005221226718276739, + "skip_count": 2.0, + "step": 8394, + "text_loss": 0.3381146192550659 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00011692251927491987, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 13541189.0, + "repeat_count": 1.0, + "routers_loss": 0.0023983579594641924, + "skip_count": 1.0, + "step": 8396, + "text_loss": 0.7345486283302307 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.00011672368007685774, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 13545210.0, + "repeat_count": 1.0, + "routers_loss": 0.005362956319004297, + "skip_count": 2.0, + "step": 8398, + "text_loss": 0.6522865295410156 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.00011652498775100445, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 13548260.0, + "repeat_count": 0.0, + "routers_loss": 0.002955642296001315, + "skip_count": 0.0, + "step": 8400, + "text_loss": 0.3200102150440216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00011632644237349927, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13551519.0, + "repeat_count": 0.0, + "routers_loss": 0.001079231034964323, + "skip_count": 0.0, + "step": 8402, + "text_loss": 0.7251807451248169 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 39.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00011612804402042509, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 13555241.0, + "repeat_count": 1.0, + "routers_loss": 0.013860360719263554, + "skip_count": 0.0, + "step": 8404, + "text_loss": 0.159539595246315 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 39.46492515409451, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.054931640625, + "learning_rate": 0.00011592979276780857, + "loss": 0.0055, + "macro_f1": 0.9555556178092957, + "num_tokens": 13558389.0, + "repeat_count": 1.0, + "routers_loss": 0.017025530338287354, + "skip_count": 5.0, + "step": 8406, + "text_loss": 0.5154430270195007 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.00011573168869162004, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 13561237.0, + "repeat_count": 1.0, + "routers_loss": 0.007349071092903614, + "skip_count": 2.0, + "step": 8408, + "text_loss": 0.20888492465019226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00011553373186777327, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 13564080.0, + "repeat_count": 1.0, + "routers_loss": 0.003303215140476823, + "skip_count": 2.0, + "step": 8410, + "text_loss": 0.21808166801929474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.49310243616085, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00011533592237212558, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 13566649.0, + "repeat_count": 0.0, + "routers_loss": 0.005856195464730263, + "skip_count": 1.0, + "step": 8412, + "text_loss": 0.28037169575691223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001151382602804782, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13570015.0, + "repeat_count": 0.0, + "routers_loss": 0.0007515792385675013, + "skip_count": 0.0, + "step": 8414, + "text_loss": 0.8517835736274719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00011494074566857549, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 13573262.0, + "repeat_count": 0.0, + "routers_loss": 0.0043421462178230286, + "skip_count": 0.0, + "step": 8416, + "text_loss": 0.27418580651283264 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00011474337861210544, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 13576104.0, + "repeat_count": 1.0, + "routers_loss": 0.0108594736084342, + "skip_count": 2.0, + "step": 8418, + "text_loss": 0.4724268317222595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.53067214558262, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00011454615918669948, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 13579138.0, + "repeat_count": 1.0, + "routers_loss": 0.04178442806005478, + "skip_count": 0.0, + "step": 8420, + "text_loss": 0.4065103530883789 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00011434908746793238, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 13582818.0, + "repeat_count": 0.0, + "routers_loss": 0.004756448790431023, + "skip_count": 2.0, + "step": 8422, + "text_loss": 0.2932167947292328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00011415216353132252, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 13586261.0, + "repeat_count": 0.0, + "routers_loss": 0.0033427432645112276, + "skip_count": 1.0, + "step": 8424, + "text_loss": 0.47670233249664307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0001139553874523313, + "loss": 0.003, + "macro_f1": 0.6666666865348816, + "num_tokens": 13589765.0, + "repeat_count": 0.0, + "routers_loss": 0.006597383879125118, + "skip_count": 1.0, + "step": 8426, + "text_loss": 0.31448885798454285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.5682418550044, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.00011375875930636403, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 13592741.0, + "repeat_count": 0.0, + "routers_loss": 0.011398134753108025, + "skip_count": 1.0, + "step": 8428, + "text_loss": 0.17429469525814056 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 39.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.00011356227916876877, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 13595763.0, + "repeat_count": 1.0, + "routers_loss": 0.0038021153304725885, + "skip_count": 0.0, + "step": 8430, + "text_loss": 0.6043882966041565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00011336594711483712, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 13598274.0, + "repeat_count": 0.0, + "routers_loss": 0.00044314167462289333, + "skip_count": 0.0, + "step": 8432, + "text_loss": 0.3818575143814087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00011316976321980388, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13601510.0, + "repeat_count": 0.0, + "routers_loss": 0.001956664025783539, + "skip_count": 0.0, + "step": 8434, + "text_loss": 0.48483794927597046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0001129737275588471, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 13604410.0, + "repeat_count": 0.0, + "routers_loss": 0.005170237272977829, + "skip_count": 0.0, + "step": 8436, + "text_loss": 0.21759741008281708 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00011277784020708803, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 13607207.0, + "repeat_count": 1.0, + "routers_loss": 0.002223948948085308, + "skip_count": 2.0, + "step": 8438, + "text_loss": 0.6877034306526184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.00011258210123959089, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13610981.0, + "repeat_count": 0.0, + "routers_loss": 0.0017733481945469975, + "skip_count": 1.0, + "step": 8440, + "text_loss": 0.7250658273696899 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00011238651073136358, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 13614194.0, + "repeat_count": 1.0, + "routers_loss": 0.00155889883171767, + "skip_count": 1.0, + "step": 8442, + "text_loss": 0.6742649078369141 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00011219106875735652, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 13618011.0, + "repeat_count": 0.0, + "routers_loss": 0.0011234934208914638, + "skip_count": 0.0, + "step": 8444, + "text_loss": 0.8105526566505432 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 39.65277370120341, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00011199577539246347, + "loss": 0.0055, + "macro_f1": 0.6603773832321167, + "num_tokens": 13621852.0, + "repeat_count": 1.0, + "routers_loss": 0.02346695400774479, + "skip_count": 1.0, + "step": 8446, + "text_loss": 0.22664032876491547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0001118006307115213, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 13624711.0, + "repeat_count": 0.0, + "routers_loss": 0.012819754891097546, + "skip_count": 2.0, + "step": 8448, + "text_loss": 0.31696105003356934 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00011160563478930969, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 13627561.0, + "repeat_count": 0.0, + "routers_loss": 0.0060531035996973515, + "skip_count": 2.0, + "step": 8450, + "text_loss": 0.2935826778411865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00011141078770055152, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13630445.0, + "repeat_count": 0.0, + "routers_loss": 0.004288572818040848, + "skip_count": 0.0, + "step": 8452, + "text_loss": 0.5720692873001099 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00011121608951991252, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 13633496.0, + "repeat_count": 0.0, + "routers_loss": 0.005682424642145634, + "skip_count": 1.0, + "step": 8454, + "text_loss": 0.28466710448265076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00011102154032200146, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13635938.0, + "repeat_count": 0.0, + "routers_loss": 0.0009555552969686687, + "skip_count": 0.0, + "step": 8456, + "text_loss": 0.47744694352149963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00011082714018136985, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 13638863.0, + "repeat_count": 0.0, + "routers_loss": 0.0023627313785254955, + "skip_count": 0.0, + "step": 8458, + "text_loss": 0.5212090611457825 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00011063288917251235, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 13641874.0, + "repeat_count": 1.0, + "routers_loss": 0.00791920255869627, + "skip_count": 2.0, + "step": 8460, + "text_loss": 0.31359919905662537 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00011043878736986607, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 13644970.0, + "repeat_count": 1.0, + "routers_loss": 0.0033252311404794455, + "skip_count": 1.0, + "step": 8462, + "text_loss": 0.33621230721473694 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00011024483484781144, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 13648103.0, + "repeat_count": 1.0, + "routers_loss": 0.005567418877035379, + "skip_count": 2.0, + "step": 8464, + "text_loss": 0.48708856105804443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00011005103168067143, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 13651085.0, + "repeat_count": 0.0, + "routers_loss": 0.00047958645154722035, + "skip_count": 0.0, + "step": 8466, + "text_loss": 0.4151248633861542 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00010985737794271161, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 13654175.0, + "repeat_count": 0.0, + "routers_loss": 0.0009806647431105375, + "skip_count": 0.0, + "step": 8468, + "text_loss": 0.7322396039962769 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.00010966387370814057, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13657058.0, + "repeat_count": 0.0, + "routers_loss": 0.0009820344857871532, + "skip_count": 0.0, + "step": 8470, + "text_loss": 0.6350769400596619 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 39.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00010947051905110945, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 13660203.0, + "repeat_count": 2.0, + "routers_loss": 0.002065197564661503, + "skip_count": 0.0, + "step": 8472, + "text_loss": 0.6025850176811218 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00010927731404571211, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 13664021.0, + "repeat_count": 0.0, + "routers_loss": 0.0009939799783751369, + "skip_count": 0.0, + "step": 8474, + "text_loss": 0.3040087819099426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0001090842587659851, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13667055.0, + "repeat_count": 0.0, + "routers_loss": 0.0008282510680146515, + "skip_count": 0.0, + "step": 8476, + "text_loss": 0.7306531667709351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0001088913532859076, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13669940.0, + "repeat_count": 0.0, + "routers_loss": 0.0008349589770659804, + "skip_count": 0.0, + "step": 8478, + "text_loss": 0.32041916251182556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00010869859767940133, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 13672955.0, + "repeat_count": 0.0, + "routers_loss": 0.0007435405277647078, + "skip_count": 0.0, + "step": 8480, + "text_loss": 0.5343614816665649 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.00010850599202033051, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 13676173.0, + "repeat_count": 0.0, + "routers_loss": 0.002763360273092985, + "skip_count": 0.0, + "step": 8482, + "text_loss": 0.6071668267250061 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00010831353638250213, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 13680121.0, + "repeat_count": 0.0, + "routers_loss": 0.00202178000472486, + "skip_count": 0.0, + "step": 8484, + "text_loss": 0.42487844824790955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00010812123083966535, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 13683504.0, + "repeat_count": 0.0, + "routers_loss": 0.0056348275393247604, + "skip_count": 1.0, + "step": 8486, + "text_loss": 0.17678795754909515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00010792907546551229, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 13686870.0, + "repeat_count": 0.0, + "routers_loss": 0.003331703832373023, + "skip_count": 0.0, + "step": 8488, + "text_loss": 0.32238465547561646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00010773707033367708, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 13690429.0, + "repeat_count": 0.0, + "routers_loss": 0.0011620528530329466, + "skip_count": 0.0, + "step": 8490, + "text_loss": 0.4141998291015625 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 39.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.00010754521551773655, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 13693747.0, + "repeat_count": 1.0, + "routers_loss": 0.005236583761870861, + "skip_count": 0.0, + "step": 8492, + "text_loss": 0.557283878326416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 39.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.00010735351109120972, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 13696837.0, + "repeat_count": 0.0, + "routers_loss": 0.005507425405085087, + "skip_count": 6.0, + "step": 8494, + "text_loss": 0.7394861578941345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00010716195712755821, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13700080.0, + "repeat_count": 0.0, + "routers_loss": 0.0008621517335996032, + "skip_count": 0.0, + "step": 8496, + "text_loss": 0.7079368233680725 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00010697055370018572, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13704088.0, + "repeat_count": 0.0, + "routers_loss": 0.0004489862476475537, + "skip_count": 0.0, + "step": 8498, + "text_loss": 0.5672308206558228 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.00010677930088243847, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 13707391.0, + "repeat_count": 1.0, + "routers_loss": 0.009171495214104652, + "skip_count": 2.0, + "step": 8500, + "text_loss": 0.6851600408554077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.00010658819874760495, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 13711238.0, + "repeat_count": 0.0, + "routers_loss": 0.0016714727971702814, + "skip_count": 1.0, + "step": 8502, + "text_loss": 0.7102733850479126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00010639724736891576, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 13714553.0, + "repeat_count": 0.0, + "routers_loss": 0.0012916292762383819, + "skip_count": 0.0, + "step": 8504, + "text_loss": 0.4234752953052521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0001062064468195439, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 13718046.0, + "repeat_count": 0.0, + "routers_loss": 0.0005265420186333358, + "skip_count": 0.0, + "step": 8506, + "text_loss": 0.5576326251029968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0001060157971726045, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13720687.0, + "repeat_count": 0.0, + "routers_loss": 0.0023503501433879137, + "skip_count": 1.0, + "step": 8508, + "text_loss": 0.5259605646133423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00010582529850115469, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 13723946.0, + "repeat_count": 0.0, + "routers_loss": 0.0007593657355755568, + "skip_count": 0.0, + "step": 8510, + "text_loss": 0.3795129954814911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.00010563495087819419, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 13727589.0, + "repeat_count": 0.0, + "routers_loss": 0.0005672222469002008, + "skip_count": 0.0, + "step": 8512, + "text_loss": 0.685897946357727 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.972116231288524, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00010544475437666445, + "loss": 0.0049, + "macro_f1": 0.9262410998344421, + "num_tokens": 13730579.0, + "repeat_count": 3.0, + "routers_loss": 0.01708158478140831, + "skip_count": 2.0, + "step": 8514, + "text_loss": 0.8044925332069397 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.00010525470906944917, + "loss": 0.0113, + "macro_f1": 1.0, + "num_tokens": 13733563.0, + "repeat_count": 1.0, + "routers_loss": 0.010253295302391052, + "skip_count": 2.0, + "step": 8516, + "text_loss": 0.3999447524547577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.00010506481502937398, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 13736645.0, + "repeat_count": 0.0, + "routers_loss": 0.004293019883334637, + "skip_count": 0.0, + "step": 8518, + "text_loss": 0.3128681778907776 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 40.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00010487507232920674, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 13740080.0, + "repeat_count": 1.0, + "routers_loss": 0.0030790462624281645, + "skip_count": 1.0, + "step": 8520, + "text_loss": 0.39142900705337524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00010468548104165709, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 13743085.0, + "repeat_count": 0.0, + "routers_loss": 0.0007342757890000939, + "skip_count": 0.0, + "step": 8522, + "text_loss": 0.7652465105056763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.00010449604123937689, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13746513.0, + "repeat_count": 0.0, + "routers_loss": 0.0030496022664010525, + "skip_count": 0.0, + "step": 8524, + "text_loss": 0.6259746551513672 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 40.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00010430675299495973, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 13749391.0, + "repeat_count": 1.0, + "routers_loss": 0.010060965083539486, + "skip_count": 1.0, + "step": 8526, + "text_loss": 0.2266668826341629 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0001041176163809413, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 13752449.0, + "repeat_count": 1.0, + "routers_loss": 0.002234962536022067, + "skip_count": 2.0, + "step": 8528, + "text_loss": 0.9742465019226074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00010392863146979903, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 13755572.0, + "repeat_count": 0.0, + "routers_loss": 0.0003572004789020866, + "skip_count": 0.0, + "step": 8530, + "text_loss": 0.5757357478141785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.00010373979833395242, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 13759198.0, + "repeat_count": 0.0, + "routers_loss": 0.011161680333316326, + "skip_count": 0.0, + "step": 8532, + "text_loss": 0.6268131136894226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00010355111704576236, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 13761914.0, + "repeat_count": 0.0, + "routers_loss": 0.002053353004157543, + "skip_count": 0.0, + "step": 8534, + "text_loss": 0.22388778626918793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00010336258767753232, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 13765371.0, + "repeat_count": 0.0, + "routers_loss": 0.003634720342233777, + "skip_count": 2.0, + "step": 8536, + "text_loss": 0.5802993178367615 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.084531846199, + "f1_execute": 0.9729729890823364, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00010317421030150692, + "loss": 0.0072, + "macro_f1": 0.9539539813995361, + "num_tokens": 13768276.0, + "repeat_count": 5.0, + "routers_loss": 0.053806692361831665, + "skip_count": 5.0, + "step": 8538, + "text_loss": 0.10888377577066422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.00010298598498987266, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 13772369.0, + "repeat_count": 0.0, + "routers_loss": 0.00501362606883049, + "skip_count": 1.0, + "step": 8540, + "text_loss": 0.5794995427131653 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00010279791181475795, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 13776595.0, + "repeat_count": 1.0, + "routers_loss": 0.002230882178992033, + "skip_count": 2.0, + "step": 8542, + "text_loss": 0.5503702163696289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00010260999084823264, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 13779993.0, + "repeat_count": 0.0, + "routers_loss": 0.0012205395614728332, + "skip_count": 0.0, + "step": 8544, + "text_loss": 0.7248672842979431 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00010242222216230856, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 13782683.0, + "repeat_count": 0.0, + "routers_loss": 0.0003966465883422643, + "skip_count": 0.0, + "step": 8546, + "text_loss": 0.7446619272232056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00010223460582893889, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 13785534.0, + "repeat_count": 0.0, + "routers_loss": 0.004968565888702869, + "skip_count": 1.0, + "step": 8548, + "text_loss": 0.22457796335220337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00010204714192001863, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13788608.0, + "repeat_count": 0.0, + "routers_loss": 0.0033054195810109377, + "skip_count": 2.0, + "step": 8550, + "text_loss": 0.418837308883667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.00010185983050738434, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 13791553.0, + "repeat_count": 0.0, + "routers_loss": 0.001166256028227508, + "skip_count": 0.0, + "step": 8552, + "text_loss": 0.4060337543487549 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00010167267166281402, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 13795304.0, + "repeat_count": 0.0, + "routers_loss": 0.003844029037281871, + "skip_count": 2.0, + "step": 8554, + "text_loss": 0.17412975430488586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00010148566545802718, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 13798445.0, + "repeat_count": 0.0, + "routers_loss": 0.0033507589250802994, + "skip_count": 0.0, + "step": 8556, + "text_loss": 0.24744336307048798 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00010129881196468527, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 13801338.0, + "repeat_count": 0.0, + "routers_loss": 0.004076482728123665, + "skip_count": 0.0, + "step": 8558, + "text_loss": 0.6542767882347107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.00010111211125439069, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 13804157.0, + "repeat_count": 0.0, + "routers_loss": 0.0005654391716234386, + "skip_count": 0.0, + "step": 8560, + "text_loss": 0.527079701423645 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00010092556339868758, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13807411.0, + "repeat_count": 0.0, + "routers_loss": 0.004915264435112476, + "skip_count": 1.0, + "step": 8562, + "text_loss": 0.721017599105835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.00010073916846906139, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 13810489.0, + "repeat_count": 0.0, + "routers_loss": 0.005571382585912943, + "skip_count": 1.0, + "step": 8564, + "text_loss": 0.5802517533302307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00010055292653693903, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 13813526.0, + "repeat_count": 0.0, + "routers_loss": 0.001321605988778174, + "skip_count": 0.0, + "step": 8566, + "text_loss": 0.5485247373580933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.00010036683767368859, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 13817225.0, + "repeat_count": 0.0, + "routers_loss": 0.001876185997389257, + "skip_count": 0.0, + "step": 8568, + "text_loss": 0.08957820385694504 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00010018090195061997, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13820667.0, + "repeat_count": 0.0, + "routers_loss": 0.004593426361680031, + "skip_count": 0.0, + "step": 8570, + "text_loss": 0.24580086767673492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 9.999511943898398e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 13824505.0, + "repeat_count": 0.0, + "routers_loss": 0.0022372701205313206, + "skip_count": 0.0, + "step": 8572, + "text_loss": 0.20976831018924713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 9.980949020997276e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 13827623.0, + "repeat_count": 0.0, + "routers_loss": 0.0030519715510308743, + "skip_count": 0.0, + "step": 8574, + "text_loss": 0.7638732194900513 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 9.962401433471985e-05, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 13831013.0, + "repeat_count": 0.0, + "routers_loss": 0.005036211106926203, + "skip_count": 1.0, + "step": 8576, + "text_loss": 0.3791790306568146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 9.943869188429989e-05, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 13833611.0, + "repeat_count": 0.0, + "routers_loss": 0.002071794355288148, + "skip_count": 2.0, + "step": 8578, + "text_loss": 0.5480846166610718 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 40.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 9.925352292972884e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 13836678.0, + "repeat_count": 1.0, + "routers_loss": 0.008119060657918453, + "skip_count": 0.0, + "step": 8580, + "text_loss": 0.21605457365512848 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 9.906850754196379e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 13839255.0, + "repeat_count": 0.0, + "routers_loss": 0.004017427563667297, + "skip_count": 2.0, + "step": 8582, + "text_loss": 0.4473285973072052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 9.888364579190285e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 13842034.0, + "repeat_count": 0.0, + "routers_loss": 0.005163116846233606, + "skip_count": 1.0, + "step": 8584, + "text_loss": 0.21627424657344818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 9.869893775038557e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 13844648.0, + "repeat_count": 0.0, + "routers_loss": 0.0044358340092003345, + "skip_count": 1.0, + "step": 8586, + "text_loss": 0.5660704970359802 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 9.851438348819247e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13847629.0, + "repeat_count": 0.0, + "routers_loss": 0.00038135924842208624, + "skip_count": 1.0, + "step": 8588, + "text_loss": 0.6401235461235046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 9.832998307604495e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13851409.0, + "repeat_count": 0.0, + "routers_loss": 0.004005341790616512, + "skip_count": 1.0, + "step": 8590, + "text_loss": 0.43975043296813965 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 9.814573658460562e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 13854031.0, + "repeat_count": 0.0, + "routers_loss": 0.006872966885566711, + "skip_count": 2.0, + "step": 8592, + "text_loss": 0.6000451445579529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 9.796164408447811e-05, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 13856813.0, + "repeat_count": 0.0, + "routers_loss": 0.0019872859120368958, + "skip_count": 0.0, + "step": 8594, + "text_loss": 0.6026073098182678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 9.777770564620698e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 13859805.0, + "repeat_count": 0.0, + "routers_loss": 0.013098123483359814, + "skip_count": 2.0, + "step": 8596, + "text_loss": 0.3294500708580017 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 40.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 9.759392134027783e-05, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 13863119.0, + "repeat_count": 1.0, + "routers_loss": 0.001011171261779964, + "skip_count": 1.0, + "step": 8598, + "text_loss": 0.4078965187072754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 9.741029123711708e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 13866239.0, + "repeat_count": 0.0, + "routers_loss": 0.003267963184043765, + "skip_count": 0.0, + "step": 8600, + "text_loss": 0.5064641833305359 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.385089521573235, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 9.722681540709228e-05, + "loss": 0.0045, + "macro_f1": 0.6601307392120361, + "num_tokens": 13869647.0, + "repeat_count": 1.0, + "routers_loss": 0.02431299351155758, + "skip_count": 2.0, + "step": 8602, + "text_loss": 0.2512950301170349 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 9.704349392051155e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13873128.0, + "repeat_count": 0.0, + "routers_loss": 0.0019577480852603912, + "skip_count": 1.0, + "step": 8604, + "text_loss": 0.425156831741333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 9.686032684762408e-05, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 13876603.0, + "repeat_count": 0.0, + "routers_loss": 0.001554530463181436, + "skip_count": 1.0, + "step": 8606, + "text_loss": 0.3596082329750061 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01519775390625, + "learning_rate": 9.667731425861975e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 13879602.0, + "repeat_count": 0.0, + "routers_loss": 0.0027400986291468143, + "skip_count": 0.0, + "step": 8608, + "text_loss": 0.12101534754037857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 9.649445622362957e-05, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 13882204.0, + "repeat_count": 0.0, + "routers_loss": 0.001957559958100319, + "skip_count": 2.0, + "step": 8610, + "text_loss": 0.382834255695343 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 9.631175281272491e-05, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 13886397.0, + "repeat_count": 1.0, + "routers_loss": 0.009613300673663616, + "skip_count": 3.0, + "step": 8612, + "text_loss": 0.24718235433101654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 9.612920409591813e-05, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 13889625.0, + "repeat_count": 0.0, + "routers_loss": 0.0015159029280766845, + "skip_count": 0.0, + "step": 8614, + "text_loss": 0.406452476978302 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 40.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 9.59468101431622e-05, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 13892518.0, + "repeat_count": 0.0, + "routers_loss": 0.008069832809269428, + "skip_count": 3.0, + "step": 8616, + "text_loss": 0.19740329682826996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0157470703125, + "learning_rate": 9.576457102435082e-05, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 13895822.0, + "repeat_count": 0.0, + "routers_loss": 0.0024340536911040545, + "skip_count": 0.0, + "step": 8618, + "text_loss": 0.44761306047439575 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 40.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02880859375, + "learning_rate": 9.558248680931841e-05, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 13898829.0, + "repeat_count": 2.0, + "routers_loss": 0.0053517078049480915, + "skip_count": 1.0, + "step": 8620, + "text_loss": 0.37335118651390076 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.47901379512768, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 9.540055756783994e-05, + "loss": 0.0061, + "macro_f1": 0.9255813956260681, + "num_tokens": 13902122.0, + "repeat_count": 3.0, + "routers_loss": 0.03885587304830551, + "skip_count": 4.0, + "step": 8622, + "text_loss": 0.21311092376708984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051025390625, + "learning_rate": 9.521878336963108e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 13904874.0, + "repeat_count": 0.0, + "routers_loss": 0.007965708151459694, + "skip_count": 1.0, + "step": 8624, + "text_loss": 0.27229398488998413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 9.5037164284348e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 13907755.0, + "repeat_count": 0.0, + "routers_loss": 0.0019825168419629335, + "skip_count": 0.0, + "step": 8626, + "text_loss": 0.6535577178001404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.507191077194015, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 9.485570038158747e-05, + "loss": 0.0085, + "macro_f1": 0.3272727429866791, + "num_tokens": 13910619.0, + "repeat_count": 1.0, + "routers_loss": 0.017803344875574112, + "skip_count": 0.0, + "step": 8628, + "text_loss": 0.26617178320884705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 9.467439173088687e-05, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 13914098.0, + "repeat_count": 0.0, + "routers_loss": 0.0025836096610873938, + "skip_count": 0.0, + "step": 8630, + "text_loss": 0.44465285539627075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 9.44932384017238e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 13917192.0, + "repeat_count": 0.0, + "routers_loss": 0.004438584204763174, + "skip_count": 2.0, + "step": 8632, + "text_loss": 0.33622798323631287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 9.431224046351688e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 13920067.0, + "repeat_count": 0.0, + "routers_loss": 0.017312567681074142, + "skip_count": 2.0, + "step": 8634, + "text_loss": 0.31870952248573303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 9.413139798562476e-05, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 13922887.0, + "repeat_count": 0.0, + "routers_loss": 0.0019389945082366467, + "skip_count": 0.0, + "step": 8636, + "text_loss": 0.18223261833190918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 9.395071103734648e-05, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 13926545.0, + "repeat_count": 0.0, + "routers_loss": 0.0011485094437375665, + "skip_count": 0.0, + "step": 8638, + "text_loss": 0.48031774163246155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 40.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 9.377017968792179e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 13931171.0, + "repeat_count": 1.0, + "routers_loss": 0.003448521951213479, + "skip_count": 0.0, + "step": 8640, + "text_loss": 0.7585139870643616 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 40.57293806868213, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0213623046875, + "learning_rate": 9.35898040065305e-05, + "loss": 0.0048, + "macro_f1": 0.5492662787437439, + "num_tokens": 13934369.0, + "repeat_count": 0.0, + "routers_loss": 0.017959754914045334, + "skip_count": 2.0, + "step": 8642, + "text_loss": 0.49708613753318787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.018310546875, + "learning_rate": 9.3409584062293e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13938166.0, + "repeat_count": 0.0, + "routers_loss": 0.004092653747648001, + "skip_count": 1.0, + "step": 8644, + "text_loss": 0.20662656426429749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 9.322951992426992e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13941922.0, + "repeat_count": 0.0, + "routers_loss": 0.0026206092443317175, + "skip_count": 0.0, + "step": 8646, + "text_loss": 0.4735889434814453 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 40.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 9.304961166146209e-05, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 13945569.0, + "repeat_count": 3.0, + "routers_loss": 0.005156307481229305, + "skip_count": 2.0, + "step": 8648, + "text_loss": 0.5630270838737488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 9.286985934281079e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13948357.0, + "repeat_count": 0.0, + "routers_loss": 0.004913610871881247, + "skip_count": 1.0, + "step": 8650, + "text_loss": 0.4053497016429901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 9.26902630371974e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 13952543.0, + "repeat_count": 0.0, + "routers_loss": 0.003946282435208559, + "skip_count": 2.0, + "step": 8652, + "text_loss": 0.40166863799095154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 9.251082281344358e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13955917.0, + "repeat_count": 0.0, + "routers_loss": 0.0009605551022104919, + "skip_count": 0.0, + "step": 8654, + "text_loss": 0.20477983355522156 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 40.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 9.233153874031102e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 13960071.0, + "repeat_count": 0.0, + "routers_loss": 0.004408199340105057, + "skip_count": 3.0, + "step": 8656, + "text_loss": 0.3349814713001251 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 9.215241088650194e-05, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 13963125.0, + "repeat_count": 1.0, + "routers_loss": 0.005541396792978048, + "skip_count": 2.0, + "step": 8658, + "text_loss": 0.6602919697761536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 9.197343932065843e-05, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13966130.0, + "repeat_count": 0.0, + "routers_loss": 0.001636760076507926, + "skip_count": 0.0, + "step": 8660, + "text_loss": 0.7704628109931946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 9.179462411136263e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13969791.0, + "repeat_count": 0.0, + "routers_loss": 0.0006453761598095298, + "skip_count": 0.0, + "step": 8662, + "text_loss": 0.3898075520992279 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 40.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 9.161596532713695e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 13972987.0, + "repeat_count": 0.0, + "routers_loss": 0.005081792362034321, + "skip_count": 4.0, + "step": 8664, + "text_loss": 0.8477506041526794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 9.143746303644374e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13976505.0, + "repeat_count": 0.0, + "routers_loss": 0.0032063762191683054, + "skip_count": 0.0, + "step": 8666, + "text_loss": 0.23729658126831055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 9.125911730768543e-05, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 13980061.0, + "repeat_count": 0.0, + "routers_loss": 0.00043821477447636425, + "skip_count": 0.0, + "step": 8668, + "text_loss": 0.4233637750148773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 9.108092820920438e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 13983407.0, + "repeat_count": 0.0, + "routers_loss": 0.007779054809361696, + "skip_count": 2.0, + "step": 8670, + "text_loss": 0.5050316452980042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 9.090289580928307e-05, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 13986725.0, + "repeat_count": 0.0, + "routers_loss": 0.0018697676714509726, + "skip_count": 1.0, + "step": 8672, + "text_loss": 1.0568488836288452 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 9.072502017614382e-05, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 13990765.0, + "repeat_count": 0.0, + "routers_loss": 0.002077789744362235, + "skip_count": 0.0, + "step": 8674, + "text_loss": 0.48911142349243164 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 40.73260933372468, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 9.054730137794887e-05, + "loss": 0.0081, + "macro_f1": 0.6598639488220215, + "num_tokens": 13994083.0, + "repeat_count": 1.0, + "routers_loss": 0.044373031705617905, + "skip_count": 3.0, + "step": 8676, + "text_loss": 0.3420281708240509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 9.036973948280048e-05, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 13997500.0, + "repeat_count": 0.0, + "routers_loss": 0.0015431724023073912, + "skip_count": 0.0, + "step": 8678, + "text_loss": 0.21514096856117249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 9.019233455874049e-05, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 14000460.0, + "repeat_count": 0.0, + "routers_loss": 0.006088062655180693, + "skip_count": 1.0, + "step": 8680, + "text_loss": 0.43932875990867615 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 9.001508667375107e-05, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 14003537.0, + "repeat_count": 2.0, + "routers_loss": 0.01006145216524601, + "skip_count": 3.0, + "step": 8682, + "text_loss": 0.2192728966474533 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 8.983799589575393e-05, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 14005943.0, + "repeat_count": 0.0, + "routers_loss": 0.001044525415636599, + "skip_count": 0.0, + "step": 8684, + "text_loss": 0.8686383962631226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 8.96610622926104e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14008954.0, + "repeat_count": 0.0, + "routers_loss": 0.004876079503446817, + "skip_count": 2.0, + "step": 8686, + "text_loss": 0.2513524889945984 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 8.948428593212193e-05, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 14012268.0, + "repeat_count": 1.0, + "routers_loss": 0.007909095846116543, + "skip_count": 2.0, + "step": 8688, + "text_loss": 0.17117907106876373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 8.930766688202946e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 14015192.0, + "repeat_count": 0.0, + "routers_loss": 0.0022194553166627884, + "skip_count": 0.0, + "step": 8690, + "text_loss": 0.637697160243988 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 40.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0162353515625, + "learning_rate": 8.913120521001383e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 14018055.0, + "repeat_count": 1.0, + "routers_loss": 0.0023777696769684553, + "skip_count": 0.0, + "step": 8692, + "text_loss": 0.39099860191345215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 8.895490098369535e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14021035.0, + "repeat_count": 0.0, + "routers_loss": 0.002676652278751135, + "skip_count": 1.0, + "step": 8694, + "text_loss": 0.6112156510353088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 8.877875427063431e-05, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 14023759.0, + "repeat_count": 0.0, + "routers_loss": 0.001040685223415494, + "skip_count": 0.0, + "step": 8696, + "text_loss": 0.3562681972980499 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 40.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 8.86027651383302e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 14026090.0, + "repeat_count": 1.0, + "routers_loss": 0.0011444527190178633, + "skip_count": 0.0, + "step": 8698, + "text_loss": 0.6152632236480713 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.84531846199002, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 8.842693365422266e-05, + "loss": 0.008, + "macro_f1": 0.8817967176437378, + "num_tokens": 14029570.0, + "repeat_count": 2.0, + "routers_loss": 0.024327632039785385, + "skip_count": 3.0, + "step": 8700, + "text_loss": 0.2170596867799759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 8.825125988569061e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 14032418.0, + "repeat_count": 0.0, + "routers_loss": 0.00048010432510636747, + "skip_count": 0.0, + "step": 8702, + "text_loss": 0.4421340525150299 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 8.807574390005241e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 14035610.0, + "repeat_count": 0.0, + "routers_loss": 0.0010498231276869774, + "skip_count": 0.0, + "step": 8704, + "text_loss": 0.3656717538833618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.873495744056356, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 8.790038576456627e-05, + "loss": 0.0045, + "macro_f1": 0.3272727429866791, + "num_tokens": 14039354.0, + "repeat_count": 0.0, + "routers_loss": 0.019302964210510254, + "skip_count": 1.0, + "step": 8706, + "text_loss": 0.6150856018066406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 8.772518554642972e-05, + "loss": 0.0029, + "macro_f1": 0.3333333432674408, + "num_tokens": 14042353.0, + "repeat_count": 0.0, + "routers_loss": 0.004211598541587591, + "skip_count": 0.0, + "step": 8708, + "text_loss": 0.17178772389888763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 8.755014331277972e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14045704.0, + "repeat_count": 0.0, + "routers_loss": 0.0007902922225184739, + "skip_count": 0.0, + "step": 8710, + "text_loss": 0.6289885640144348 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 8.737525913069277e-05, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 14048743.0, + "repeat_count": 1.0, + "routers_loss": 0.007915202528238297, + "skip_count": 2.0, + "step": 8712, + "text_loss": 0.2778690457344055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 40.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 8.720053306718506e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14052762.0, + "repeat_count": 0.0, + "routers_loss": 0.0027877227403223515, + "skip_count": 3.0, + "step": 8714, + "text_loss": 0.3615926504135132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.92045788083358, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0478515625, + "learning_rate": 8.702596518921175e-05, + "loss": 0.0086, + "macro_f1": 0.6603773832321167, + "num_tokens": 14056645.0, + "repeat_count": 1.0, + "routers_loss": 0.03460995852947235, + "skip_count": 1.0, + "step": 8716, + "text_loss": 0.19412031769752502 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 8.685155556366763e-05, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 14059604.0, + "repeat_count": 1.0, + "routers_loss": 0.0026834046002477407, + "skip_count": 2.0, + "step": 8718, + "text_loss": 0.4414670169353485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 40.93924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 8.667730425738679e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14062170.0, + "repeat_count": 0.0, + "routers_loss": 0.01547359861433506, + "skip_count": 4.0, + "step": 8720, + "text_loss": 0.2850716710090637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 8.650321133714267e-05, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 14065526.0, + "repeat_count": 0.0, + "routers_loss": 0.0020194994285702705, + "skip_count": 0.0, + "step": 8722, + "text_loss": 0.1776508241891861 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 8.632927686964798e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 14068525.0, + "repeat_count": 0.0, + "routers_loss": 0.0037195945624262094, + "skip_count": 0.0, + "step": 8724, + "text_loss": 0.2786005735397339 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 40.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 8.615550092155477e-05, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 14071830.0, + "repeat_count": 1.0, + "routers_loss": 0.008169961161911488, + "skip_count": 4.0, + "step": 8726, + "text_loss": 0.43228310346603394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 8.598188355945424e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14074977.0, + "repeat_count": 0.0, + "routers_loss": 0.006407112814486027, + "skip_count": 1.0, + "step": 8728, + "text_loss": 0.24443474411964417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0498046875, + "learning_rate": 8.580842484987689e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14078104.0, + "repeat_count": 0.0, + "routers_loss": 0.001878641895018518, + "skip_count": 1.0, + "step": 8730, + "text_loss": 0.4559098184108734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 8.563512485929253e-05, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 14081934.0, + "repeat_count": 0.0, + "routers_loss": 0.0056114462204277515, + "skip_count": 0.0, + "step": 8732, + "text_loss": 0.3063429594039917 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 8.546198365411007e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 14085097.0, + "repeat_count": 1.0, + "routers_loss": 0.001542840269394219, + "skip_count": 0.0, + "step": 8734, + "text_loss": 0.7624274492263794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 8.528900130067741e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 14088630.0, + "repeat_count": 0.0, + "routers_loss": 0.002677374053746462, + "skip_count": 0.0, + "step": 8736, + "text_loss": 0.18395234644412994 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 8.511617786528175e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 14091513.0, + "repeat_count": 1.0, + "routers_loss": 0.004059800878167152, + "skip_count": 0.0, + "step": 8738, + "text_loss": 0.4567817449569702 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 41.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 8.494351341414947e-05, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 14094500.0, + "repeat_count": 1.0, + "routers_loss": 0.0023724427446722984, + "skip_count": 1.0, + "step": 8740, + "text_loss": 0.6925744414329529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0155029296875, + "learning_rate": 8.477100801344573e-05, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 14097518.0, + "repeat_count": 0.0, + "routers_loss": 0.0013842503540217876, + "skip_count": 2.0, + "step": 8742, + "text_loss": 0.6574832201004028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 8.459866172927505e-05, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 14101219.0, + "repeat_count": 0.0, + "routers_loss": 0.003597316099330783, + "skip_count": 2.0, + "step": 8744, + "text_loss": 0.785912036895752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 41.061050777810394, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.027099609375, + "learning_rate": 8.442647462768082e-05, + "loss": 0.0066, + "macro_f1": 0.6225374937057495, + "num_tokens": 14104460.0, + "repeat_count": 0.0, + "routers_loss": 0.01929798349738121, + "skip_count": 5.0, + "step": 8746, + "text_loss": 0.2111714482307434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 8.425444677464545e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 14107404.0, + "repeat_count": 0.0, + "routers_loss": 0.00048497592797502875, + "skip_count": 0.0, + "step": 8748, + "text_loss": 0.4764930307865143 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 8.408257823609033e-05, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 14109917.0, + "repeat_count": 1.0, + "routers_loss": 0.007886217907071114, + "skip_count": 2.0, + "step": 8750, + "text_loss": 0.2771969735622406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 8.391086907787587e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 14112649.0, + "repeat_count": 0.0, + "routers_loss": 0.006535434629768133, + "skip_count": 0.0, + "step": 8752, + "text_loss": 0.1550854742527008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 8.373931936580114e-05, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 14116044.0, + "repeat_count": 0.0, + "routers_loss": 0.002130605047568679, + "skip_count": 0.0, + "step": 8754, + "text_loss": 0.4055478870868683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 8.356792916560457e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14119097.0, + "repeat_count": 0.0, + "routers_loss": 0.0005611231899820268, + "skip_count": 0.0, + "step": 8756, + "text_loss": 0.47804903984069824 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 41.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 8.339669854296316e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14122079.0, + "repeat_count": 2.0, + "routers_loss": 0.005650801584124565, + "skip_count": 0.0, + "step": 8758, + "text_loss": 0.1968296617269516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 8.322562756349273e-05, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 14124910.0, + "repeat_count": 0.0, + "routers_loss": 0.0035948604345321655, + "skip_count": 1.0, + "step": 8760, + "text_loss": 0.4988253712654114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 8.305471629274802e-05, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 14127767.0, + "repeat_count": 0.0, + "routers_loss": 0.0012090947711840272, + "skip_count": 0.0, + "step": 8762, + "text_loss": 0.6330704689025879 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 8.288396479622262e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 14130766.0, + "repeat_count": 0.0, + "routers_loss": 0.0010853242129087448, + "skip_count": 1.0, + "step": 8764, + "text_loss": 0.43057000637054443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 8.271337313934868e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14133804.0, + "repeat_count": 0.0, + "routers_loss": 0.0037055034190416336, + "skip_count": 2.0, + "step": 8766, + "text_loss": 0.31973564624786377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 8.254294138749741e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 14137164.0, + "repeat_count": 0.0, + "routers_loss": 0.005338407587260008, + "skip_count": 0.0, + "step": 8768, + "text_loss": 0.5066531896591187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 8.237266960597844e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 14140119.0, + "repeat_count": 0.0, + "routers_loss": 0.0014707009540870786, + "skip_count": 1.0, + "step": 8770, + "text_loss": 0.553493857383728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 8.220255786004033e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 14143223.0, + "repeat_count": 0.0, + "routers_loss": 0.002113121096044779, + "skip_count": 0.0, + "step": 8772, + "text_loss": 0.40016281604766846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0179443359375, + "learning_rate": 8.203260621487019e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 14146366.0, + "repeat_count": 0.0, + "routers_loss": 0.002210963051766157, + "skip_count": 1.0, + "step": 8774, + "text_loss": 0.44022905826568604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 8.186281473559382e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 14150009.0, + "repeat_count": 0.0, + "routers_loss": 0.0011857844656333327, + "skip_count": 0.0, + "step": 8776, + "text_loss": 0.572823703289032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 8.169318348727544e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 14153343.0, + "repeat_count": 0.0, + "routers_loss": 0.0020397785119712353, + "skip_count": 1.0, + "step": 8778, + "text_loss": 0.5724276900291443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 8.152371253491841e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 14156392.0, + "repeat_count": 0.0, + "routers_loss": 0.001745635992847383, + "skip_count": 0.0, + "step": 8780, + "text_loss": 0.14162923395633698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 8.135440194346416e-05, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 14159616.0, + "repeat_count": 0.0, + "routers_loss": 0.002799858106300235, + "skip_count": 0.0, + "step": 8782, + "text_loss": 0.18205340206623077 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 8.118525177779284e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14163531.0, + "repeat_count": 1.0, + "routers_loss": 0.0029223538003861904, + "skip_count": 0.0, + "step": 8784, + "text_loss": 0.4107058644294739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 8.101626210272311e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14166776.0, + "repeat_count": 0.0, + "routers_loss": 0.001209643087349832, + "skip_count": 0.0, + "step": 8786, + "text_loss": 0.6441596746444702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 8.084743298301211e-05, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 14169586.0, + "repeat_count": 0.0, + "routers_loss": 0.0015196573222056031, + "skip_count": 0.0, + "step": 8788, + "text_loss": 0.35585930943489075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 8.067876448335549e-05, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 14174180.0, + "repeat_count": 0.0, + "routers_loss": 0.0004388966190163046, + "skip_count": 0.0, + "step": 8790, + "text_loss": 0.31594613194465637 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 8.05102566683873e-05, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 14177950.0, + "repeat_count": 1.0, + "routers_loss": 0.0031201441306620836, + "skip_count": 0.0, + "step": 8792, + "text_loss": 0.3161006569862366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 8.034190960268012e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14180642.0, + "repeat_count": 0.0, + "routers_loss": 0.001848527928814292, + "skip_count": 0.0, + "step": 8794, + "text_loss": 0.47571417689323425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 8.017372335074486e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 14183743.0, + "repeat_count": 0.0, + "routers_loss": 0.0043064444325864315, + "skip_count": 1.0, + "step": 8796, + "text_loss": 0.5976942777633667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 8.000569797703072e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 14187742.0, + "repeat_count": 0.0, + "routers_loss": 0.005383181851357222, + "skip_count": 2.0, + "step": 8798, + "text_loss": 0.2692606449127197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 7.983783354592544e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 14191211.0, + "repeat_count": 0.0, + "routers_loss": 0.001401974936015904, + "skip_count": 0.0, + "step": 8800, + "text_loss": 0.38108205795288086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 7.967013012175478e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14194992.0, + "repeat_count": 0.0, + "routers_loss": 0.001168998540379107, + "skip_count": 0.0, + "step": 8802, + "text_loss": 0.5201764106750488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05322265625, + "learning_rate": 7.950258776878332e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 14198059.0, + "repeat_count": 0.0, + "routers_loss": 0.0032015808392316103, + "skip_count": 2.0, + "step": 8804, + "text_loss": 0.6014752984046936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 7.933520655121351e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14202313.0, + "repeat_count": 0.0, + "routers_loss": 0.0009403078584000468, + "skip_count": 0.0, + "step": 8806, + "text_loss": 0.54194176197052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 7.916798653318607e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14205534.0, + "repeat_count": 0.0, + "routers_loss": 0.0027781077660620213, + "skip_count": 1.0, + "step": 8808, + "text_loss": 0.7181227803230286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 7.900092777878004e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14209357.0, + "repeat_count": 0.0, + "routers_loss": 0.0034586815163493156, + "skip_count": 1.0, + "step": 8810, + "text_loss": 0.21651209890842438 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 41.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 7.883403035201265e-05, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 14212328.0, + "repeat_count": 1.0, + "routers_loss": 0.01194343063980341, + "skip_count": 4.0, + "step": 8812, + "text_loss": 0.20523512363433838 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 41.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0157470703125, + "learning_rate": 7.866729431683938e-05, + "loss": 0.0038, + "macro_f1": 1.0, + "num_tokens": 14214979.0, + "repeat_count": 1.0, + "routers_loss": 0.0045132869854569435, + "skip_count": 1.0, + "step": 8814, + "text_loss": 0.4066837728023529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0181884765625, + "learning_rate": 7.850071973715368e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 14219030.0, + "repeat_count": 0.0, + "routers_loss": 0.005109346006065607, + "skip_count": 2.0, + "step": 8816, + "text_loss": 0.12459450960159302 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 7.833430667678737e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 14222117.0, + "repeat_count": 0.0, + "routers_loss": 0.0036401136312633753, + "skip_count": 0.0, + "step": 8818, + "text_loss": 0.3759046494960785 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 41.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 7.816805519951008e-05, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 14225546.0, + "repeat_count": 2.0, + "routers_loss": 0.006177824921905994, + "skip_count": 1.0, + "step": 8820, + "text_loss": 0.4031941592693329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 41.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 7.800196536902987e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 14228731.0, + "repeat_count": 0.0, + "routers_loss": 0.009549650363624096, + "skip_count": 5.0, + "step": 8822, + "text_loss": 0.2895966172218323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 7.783603724899258e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14231796.0, + "repeat_count": 0.0, + "routers_loss": 0.005532847251743078, + "skip_count": 2.0, + "step": 8824, + "text_loss": 0.32433390617370605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 7.767027090298206e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 14235869.0, + "repeat_count": 0.0, + "routers_loss": 0.0011165215400978923, + "skip_count": 0.0, + "step": 8826, + "text_loss": 0.41239091753959656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 7.750466639452059e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 14238830.0, + "repeat_count": 0.0, + "routers_loss": 0.0007845646468922496, + "skip_count": 0.0, + "step": 8828, + "text_loss": 0.5113243460655212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 7.733922378706787e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 14241672.0, + "repeat_count": 0.0, + "routers_loss": 0.0029602700378745794, + "skip_count": 1.0, + "step": 8830, + "text_loss": 0.22004501521587372 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 41.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 7.717394314402199e-05, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 14244522.0, + "repeat_count": 2.0, + "routers_loss": 0.005297200754284859, + "skip_count": 1.0, + "step": 8832, + "text_loss": 0.6039504408836365 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 7.700882452871872e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 14246964.0, + "repeat_count": 0.0, + "routers_loss": 0.0018059068825095892, + "skip_count": 2.0, + "step": 8834, + "text_loss": 0.46563026309013367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 7.684386800443177e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 14249387.0, + "repeat_count": 0.0, + "routers_loss": 0.005659483838826418, + "skip_count": 2.0, + "step": 8836, + "text_loss": 0.31516948342323303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.49310243616085, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 7.667907363437288e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 14252438.0, + "repeat_count": 0.0, + "routers_loss": 0.011170750483870506, + "skip_count": 1.0, + "step": 8838, + "text_loss": 0.22867503762245178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 7.651444148169157e-05, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 14255490.0, + "repeat_count": 0.0, + "routers_loss": 0.004106760956346989, + "skip_count": 2.0, + "step": 8840, + "text_loss": 0.5757828950881958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 7.634997160947499e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 14258430.0, + "repeat_count": 0.0, + "routers_loss": 0.0008562540751881897, + "skip_count": 0.0, + "step": 8842, + "text_loss": 0.5166661143302917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 7.618566408074862e-05, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 14261275.0, + "repeat_count": 0.0, + "routers_loss": 0.0012901517329737544, + "skip_count": 0.0, + "step": 8844, + "text_loss": 0.7376981973648071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 7.602151895847526e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 14264698.0, + "repeat_count": 0.0, + "routers_loss": 0.00267209205776453, + "skip_count": 0.0, + "step": 8846, + "text_loss": 0.5249470472335815 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 41.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 7.585753630555565e-05, + "loss": 0.009, + "macro_f1": 1.0, + "num_tokens": 14267887.0, + "repeat_count": 1.0, + "routers_loss": 0.015334542840719223, + "skip_count": 7.0, + "step": 8848, + "text_loss": 1.1539889574050903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017578125, + "learning_rate": 7.569371618482818e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 14271392.0, + "repeat_count": 0.0, + "routers_loss": 0.0010222389828413725, + "skip_count": 0.0, + "step": 8850, + "text_loss": 0.33968010544776917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 7.553005865906914e-05, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 14274658.0, + "repeat_count": 0.0, + "routers_loss": 0.0006116362637840211, + "skip_count": 0.0, + "step": 8852, + "text_loss": 0.7514221668243408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 7.536656379099221e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14277763.0, + "repeat_count": 0.0, + "routers_loss": 0.0036474792286753654, + "skip_count": 0.0, + "step": 8854, + "text_loss": 0.3964846134185791 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 7.520323164324921e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14281165.0, + "repeat_count": 0.0, + "routers_loss": 0.005498840939253569, + "skip_count": 1.0, + "step": 8856, + "text_loss": 0.2235594391822815 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 41.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 7.504006227842919e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 14284761.0, + "repeat_count": 2.0, + "routers_loss": 0.006513409782201052, + "skip_count": 0.0, + "step": 8858, + "text_loss": 0.45196816325187683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 7.48770557590589e-05, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 14287844.0, + "repeat_count": 0.0, + "routers_loss": 0.0013065916718915105, + "skip_count": 0.0, + "step": 8860, + "text_loss": 0.2188033014535904 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 7.471421214760287e-05, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 14291280.0, + "repeat_count": 1.0, + "routers_loss": 0.0016644994029775262, + "skip_count": 0.0, + "step": 8862, + "text_loss": 0.7049906253814697 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 7.455153150646299e-05, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 14294330.0, + "repeat_count": 1.0, + "routers_loss": 0.002664943691343069, + "skip_count": 0.0, + "step": 8864, + "text_loss": 0.2160239815711975 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 7.43890138979788e-05, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 14298355.0, + "repeat_count": 1.0, + "routers_loss": 0.0035776710137724876, + "skip_count": 0.0, + "step": 8866, + "text_loss": 0.4922088384628296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 7.422665938442741e-05, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 14301452.0, + "repeat_count": 0.0, + "routers_loss": 0.0029914912302047014, + "skip_count": 2.0, + "step": 8868, + "text_loss": 0.5828475952148438 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 7.406446802802331e-05, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 14304667.0, + "repeat_count": 1.0, + "routers_loss": 0.0010031569981947541, + "skip_count": 2.0, + "step": 8870, + "text_loss": 0.657244861125946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 7.390243989091849e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 14307397.0, + "repeat_count": 0.0, + "routers_loss": 0.007960405200719833, + "skip_count": 1.0, + "step": 8872, + "text_loss": 0.3147352635860443 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 7.37405750352026e-05, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 14310687.0, + "repeat_count": 1.0, + "routers_loss": 0.007953251712024212, + "skip_count": 3.0, + "step": 8874, + "text_loss": 0.30315887928009033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 7.357887352290227e-05, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 14314007.0, + "repeat_count": 0.0, + "routers_loss": 0.0012103051412850618, + "skip_count": 0.0, + "step": 8876, + "text_loss": 0.6356115341186523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 7.341733541598217e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 14316696.0, + "repeat_count": 0.0, + "routers_loss": 0.0017898730002343655, + "skip_count": 1.0, + "step": 8878, + "text_loss": 0.35877764225006104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 7.325596077634383e-05, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 14320172.0, + "repeat_count": 0.0, + "routers_loss": 0.0007144945557229221, + "skip_count": 0.0, + "step": 8880, + "text_loss": 0.7939266562461853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 7.309474966582635e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 14323262.0, + "repeat_count": 0.0, + "routers_loss": 0.001255290349945426, + "skip_count": 0.0, + "step": 8882, + "text_loss": 0.7115976810455322 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 7.293370214620616e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14326826.0, + "repeat_count": 0.0, + "routers_loss": 0.0028131126891821623, + "skip_count": 2.0, + "step": 8884, + "text_loss": 0.24073036015033722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 7.277281827919691e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14329658.0, + "repeat_count": 0.0, + "routers_loss": 0.0024797592777758837, + "skip_count": 1.0, + "step": 8886, + "text_loss": 0.47276070713996887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 41.72791312004696, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 7.26120981264496e-05, + "loss": 0.0081, + "macro_f1": 0.6598639488220215, + "num_tokens": 14333584.0, + "repeat_count": 1.0, + "routers_loss": 0.023670634254813194, + "skip_count": 3.0, + "step": 8888, + "text_loss": 0.47537583112716675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 7.245154174955254e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 14336850.0, + "repeat_count": 0.0, + "routers_loss": 0.0009583478095009923, + "skip_count": 0.0, + "step": 8890, + "text_loss": 0.5258943438529968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 41.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 7.229114921003116e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 14339940.0, + "repeat_count": 0.0, + "routers_loss": 0.006664840504527092, + "skip_count": 3.0, + "step": 8892, + "text_loss": 0.20986922085285187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 7.213092056934833e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14342737.0, + "repeat_count": 0.0, + "routers_loss": 0.0005362578085623682, + "skip_count": 0.0, + "step": 8894, + "text_loss": 0.5174402594566345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 7.197085588890383e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 14345769.0, + "repeat_count": 0.0, + "routers_loss": 0.006428950000554323, + "skip_count": 1.0, + "step": 8896, + "text_loss": 0.657136857509613 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 7.181095523003478e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14348563.0, + "repeat_count": 1.0, + "routers_loss": 0.0015549053205177188, + "skip_count": 0.0, + "step": 8898, + "text_loss": 0.49799686670303345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.78426768417963, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 7.165121865401535e-05, + "loss": 0.0068, + "macro_f1": 0.32098764181137085, + "num_tokens": 14353134.0, + "repeat_count": 0.0, + "routers_loss": 0.030110027641057968, + "skip_count": 2.0, + "step": 8900, + "text_loss": 0.3644331693649292 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 41.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 7.149164622205712e-05, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 14356031.0, + "repeat_count": 1.0, + "routers_loss": 0.0014812488807365298, + "skip_count": 1.0, + "step": 8902, + "text_loss": 0.46983054280281067 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 7.133223799530836e-05, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 14358941.0, + "repeat_count": 0.0, + "routers_loss": 0.001170543720945716, + "skip_count": 0.0, + "step": 8904, + "text_loss": 0.7030026316642761 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 41.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 7.117299403485466e-05, + "loss": 0.0085, + "macro_f1": 1.0, + "num_tokens": 14361807.0, + "repeat_count": 1.0, + "routers_loss": 0.0011649372754618526, + "skip_count": 1.0, + "step": 8906, + "text_loss": 0.44989535212516785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 7.101391440171856e-05, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 14365464.0, + "repeat_count": 0.0, + "routers_loss": 0.0028165180701762438, + "skip_count": 0.0, + "step": 8908, + "text_loss": 0.487165629863739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 7.085499915685978e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 14368149.0, + "repeat_count": 0.0, + "routers_loss": 0.001956705003976822, + "skip_count": 2.0, + "step": 8910, + "text_loss": 0.3717629909515381 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 7.069624836117484e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 14371440.0, + "repeat_count": 0.0, + "routers_loss": 0.0027164234779775143, + "skip_count": 1.0, + "step": 8912, + "text_loss": 0.3683965802192688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 7.053766207549734e-05, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 14374965.0, + "repeat_count": 0.0, + "routers_loss": 0.005999395158141851, + "skip_count": 2.0, + "step": 8914, + "text_loss": 0.6271854639053345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 7.037924036059789e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 14378445.0, + "repeat_count": 0.0, + "routers_loss": 0.000978486379608512, + "skip_count": 0.0, + "step": 8916, + "text_loss": 0.5927628874778748 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 7.022098327718401e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14382851.0, + "repeat_count": 0.0, + "routers_loss": 0.012569266371428967, + "skip_count": 1.0, + "step": 8918, + "text_loss": 0.4092319905757904 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 41.878191957734074, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03564453125, + "learning_rate": 7.006289088590007e-05, + "loss": 0.0065, + "macro_f1": 0.5492662787437439, + "num_tokens": 14386959.0, + "repeat_count": 0.0, + "routers_loss": 0.011032132431864738, + "skip_count": 2.0, + "step": 8920, + "text_loss": 0.6553854942321777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048095703125, + "learning_rate": 6.990496324732737e-05, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 14390031.0, + "repeat_count": 0.0, + "routers_loss": 0.001376329455524683, + "skip_count": 0.0, + "step": 8922, + "text_loss": 0.7792862057685852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 6.974720042198396e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 14392966.0, + "repeat_count": 0.0, + "routers_loss": 0.005924372002482414, + "skip_count": 2.0, + "step": 8924, + "text_loss": 0.4466548562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 6.958960247032515e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 14395619.0, + "repeat_count": 0.0, + "routers_loss": 0.010054769925773144, + "skip_count": 2.0, + "step": 8926, + "text_loss": 0.24784758687019348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 6.943216945274255e-05, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 14398891.0, + "repeat_count": 0.0, + "routers_loss": 0.0006864808965474367, + "skip_count": 0.0, + "step": 8928, + "text_loss": 0.5154114961624146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 6.927490142956489e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 14402991.0, + "repeat_count": 0.0, + "routers_loss": 0.000996887218207121, + "skip_count": 0.0, + "step": 8930, + "text_loss": 0.5888006091117859 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 6.911779846105753e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 14406276.0, + "repeat_count": 1.0, + "routers_loss": 0.0007863475475460291, + "skip_count": 0.0, + "step": 8932, + "text_loss": 0.6862632632255554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 6.896086060742262e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 14409005.0, + "repeat_count": 0.0, + "routers_loss": 0.0020060581155121326, + "skip_count": 1.0, + "step": 8934, + "text_loss": 0.8998132348060608 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 6.880408792879905e-05, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 14411902.0, + "repeat_count": 2.0, + "routers_loss": 0.008094016462564468, + "skip_count": 3.0, + "step": 8936, + "text_loss": 0.3411460518836975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 6.864748048526237e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 14414683.0, + "repeat_count": 0.0, + "routers_loss": 0.004374993033707142, + "skip_count": 0.0, + "step": 8938, + "text_loss": 0.24222217500209808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 6.84910383368249e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 14417740.0, + "repeat_count": 0.0, + "routers_loss": 0.003004335332661867, + "skip_count": 2.0, + "step": 8940, + "text_loss": 0.5524137020111084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 6.83347615434356e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 14420678.0, + "repeat_count": 0.0, + "routers_loss": 0.007001105695962906, + "skip_count": 2.0, + "step": 8942, + "text_loss": 0.3124033212661743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 6.817865016497993e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 14424259.0, + "repeat_count": 0.0, + "routers_loss": 0.0038414683658629656, + "skip_count": 0.0, + "step": 8944, + "text_loss": 0.509667694568634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.060791015625, + "learning_rate": 6.80227042612801e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 14427084.0, + "repeat_count": 1.0, + "routers_loss": 0.008573584258556366, + "skip_count": 0.0, + "step": 8946, + "text_loss": 0.2533438205718994 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 6.786692389209482e-05, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 14429690.0, + "repeat_count": 1.0, + "routers_loss": 0.003758789971470833, + "skip_count": 2.0, + "step": 8948, + "text_loss": 0.14571085572242737 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06640625, + "learning_rate": 6.771130911711953e-05, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 14432983.0, + "repeat_count": 0.0, + "routers_loss": 0.005996126215904951, + "skip_count": 2.0, + "step": 8950, + "text_loss": 0.24994049966335297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 6.755585999598613e-05, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 14435772.0, + "repeat_count": 0.0, + "routers_loss": 0.0012271527666598558, + "skip_count": 0.0, + "step": 8952, + "text_loss": 0.3705698549747467 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 42.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 6.740057658826293e-05, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 14438912.0, + "repeat_count": 1.0, + "routers_loss": 0.0017618577694520354, + "skip_count": 1.0, + "step": 8954, + "text_loss": 0.6691124439239502 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 6.72454589534548e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 14441959.0, + "repeat_count": 0.0, + "routers_loss": 0.0016956349136307836, + "skip_count": 1.0, + "step": 8956, + "text_loss": 0.45412346720695496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 6.709050715100324e-05, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 14444804.0, + "repeat_count": 0.0, + "routers_loss": 0.017321301624178886, + "skip_count": 2.0, + "step": 8958, + "text_loss": 0.2668265998363495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 6.69357212402859e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 14447390.0, + "repeat_count": 0.0, + "routers_loss": 0.005267233122140169, + "skip_count": 2.0, + "step": 8960, + "text_loss": 0.35546016693115234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 42.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.017578125, + "learning_rate": 6.67811012806172e-05, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 14451286.0, + "repeat_count": 0.0, + "routers_loss": 0.0045175012201070786, + "skip_count": 3.0, + "step": 8962, + "text_loss": 0.14669834077358246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.084531846199, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 6.662664733124768e-05, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 14454335.0, + "repeat_count": 1.0, + "routers_loss": 0.004905698820948601, + "skip_count": 3.0, + "step": 8964, + "text_loss": 0.28777357935905457 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 42.09392427355445, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 6.647235945136442e-05, + "loss": 0.0074, + "macro_f1": 0.8823530077934265, + "num_tokens": 14457708.0, + "repeat_count": 2.0, + "routers_loss": 0.032136883586645126, + "skip_count": 1.0, + "step": 8966, + "text_loss": 0.2317836582660675 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 42.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 6.631823770009088e-05, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 14460721.0, + "repeat_count": 1.0, + "routers_loss": 0.0038611628115177155, + "skip_count": 1.0, + "step": 8968, + "text_loss": 0.28979742527008057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 6.616428213648656e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 14463467.0, + "repeat_count": 0.0, + "routers_loss": 0.0006560821202583611, + "skip_count": 0.0, + "step": 8970, + "text_loss": 0.3474387526512146 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 6.60104928195479e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 14466586.0, + "repeat_count": 1.0, + "routers_loss": 0.0016879125032573938, + "skip_count": 0.0, + "step": 8972, + "text_loss": 0.5454491972923279 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 6.58568698082071e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 14470125.0, + "repeat_count": 0.0, + "routers_loss": 0.0004945555119775236, + "skip_count": 0.0, + "step": 8974, + "text_loss": 0.4728975296020508 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 6.570341316133272e-05, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 14473887.0, + "repeat_count": 2.0, + "routers_loss": 0.010141569189727306, + "skip_count": 3.0, + "step": 8976, + "text_loss": 0.24756617844104767 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 6.555012293772967e-05, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 14477046.0, + "repeat_count": 1.0, + "routers_loss": 0.011950359679758549, + "skip_count": 2.0, + "step": 8978, + "text_loss": 0.25375646352767944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 6.539699919613911e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 14480638.0, + "repeat_count": 0.0, + "routers_loss": 0.0007824545609764755, + "skip_count": 0.0, + "step": 8980, + "text_loss": 0.6888379454612732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 6.524404199523826e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 14483723.0, + "repeat_count": 0.0, + "routers_loss": 0.004318726249039173, + "skip_count": 1.0, + "step": 8982, + "text_loss": 0.3603152334690094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.17845611975345, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 6.509125139364058e-05, + "loss": 0.0064, + "macro_f1": 0.3272727429866791, + "num_tokens": 14486876.0, + "repeat_count": 0.0, + "routers_loss": 0.010652635246515274, + "skip_count": 1.0, + "step": 8984, + "text_loss": 0.43394285440444946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 6.493862744989587e-05, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 14489944.0, + "repeat_count": 0.0, + "routers_loss": 0.0010475299786776304, + "skip_count": 0.0, + "step": 8986, + "text_loss": 0.5952020287513733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 6.478617022248984e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 14493094.0, + "repeat_count": 0.0, + "routers_loss": 0.004329503979533911, + "skip_count": 1.0, + "step": 8988, + "text_loss": 0.7284399271011353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 6.463387976984437e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14496944.0, + "repeat_count": 0.0, + "routers_loss": 0.0019588395953178406, + "skip_count": 1.0, + "step": 8990, + "text_loss": 0.8103306889533997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 6.448175615031749e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 14499997.0, + "repeat_count": 0.0, + "routers_loss": 0.008046228438615799, + "skip_count": 1.0, + "step": 8992, + "text_loss": 0.14758773148059845 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 6.432979942220319e-05, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 14503247.0, + "repeat_count": 1.0, + "routers_loss": 0.0028899910394102335, + "skip_count": 0.0, + "step": 8994, + "text_loss": 0.2568151652812958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 6.417800964373161e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 14506244.0, + "repeat_count": 0.0, + "routers_loss": 0.0042211092077195644, + "skip_count": 2.0, + "step": 8996, + "text_loss": 0.3506850600242615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 6.402638687306872e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 14510502.0, + "repeat_count": 0.0, + "routers_loss": 0.003309462917968631, + "skip_count": 0.0, + "step": 8998, + "text_loss": 0.5852319598197937 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 42.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 6.387493116831699e-05, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 14513679.0, + "repeat_count": 1.0, + "routers_loss": 0.015246274881064892, + "skip_count": 5.0, + "step": 9000, + "text_loss": 0.4266709089279175 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 42.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 6.372364258751434e-05, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 14516862.0, + "repeat_count": 2.0, + "routers_loss": 0.005648075137287378, + "skip_count": 2.0, + "step": 9002, + "text_loss": 0.34153711795806885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 42.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 6.357252118863482e-05, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 14519660.0, + "repeat_count": 0.0, + "routers_loss": 0.005153972655534744, + "skip_count": 3.0, + "step": 9004, + "text_loss": 0.3911980092525482 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 6.342156702958851e-05, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 14522261.0, + "repeat_count": 0.0, + "routers_loss": 0.001209715730510652, + "skip_count": 0.0, + "step": 9006, + "text_loss": 0.45400822162628174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023193359375, + "learning_rate": 6.327078016822124e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 14525368.0, + "repeat_count": 0.0, + "routers_loss": 0.00367624219506979, + "skip_count": 1.0, + "step": 9008, + "text_loss": 0.5327706336975098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 6.31201606623149e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14528253.0, + "repeat_count": 0.0, + "routers_loss": 0.0018971028039231896, + "skip_count": 0.0, + "step": 9010, + "text_loss": 0.19216643273830414 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 6.296970856958712e-05, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 14531214.0, + "repeat_count": 1.0, + "routers_loss": 0.003927265293896198, + "skip_count": 0.0, + "step": 9012, + "text_loss": 0.3931650221347809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 6.281942394769142e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14535063.0, + "repeat_count": 0.0, + "routers_loss": 0.00801338441669941, + "skip_count": 0.0, + "step": 9014, + "text_loss": 0.1605554074048996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 6.266930685421717e-05, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 14538690.0, + "repeat_count": 0.0, + "routers_loss": 0.0013267790200188756, + "skip_count": 0.0, + "step": 9016, + "text_loss": 0.4797641932964325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 6.251935734668957e-05, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 14542591.0, + "repeat_count": 0.0, + "routers_loss": 0.0013866537483409047, + "skip_count": 1.0, + "step": 9018, + "text_loss": 0.4539037346839905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 6.236957548256945e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 14545259.0, + "repeat_count": 0.0, + "routers_loss": 0.001481749233789742, + "skip_count": 0.0, + "step": 9020, + "text_loss": 0.6693689227104187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 6.22199613192535e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 14548362.0, + "repeat_count": 0.0, + "routers_loss": 0.005995423533022404, + "skip_count": 1.0, + "step": 9022, + "text_loss": 0.6533607244491577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 42.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 6.207051491407428e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 14551694.0, + "repeat_count": 0.0, + "routers_loss": 0.015427720732986927, + "skip_count": 4.0, + "step": 9024, + "text_loss": 0.33537840843200684 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 6.192123632429986e-05, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 14554614.0, + "repeat_count": 1.0, + "routers_loss": 0.0017432396998628974, + "skip_count": 0.0, + "step": 9026, + "text_loss": 0.9725127220153809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 6.177212560713413e-05, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 14559474.0, + "repeat_count": 0.0, + "routers_loss": 0.002909898292273283, + "skip_count": 2.0, + "step": 9028, + "text_loss": 0.16944198310375214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 6.162318281971652e-05, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 14563046.0, + "repeat_count": 0.0, + "routers_loss": 0.00274385092779994, + "skip_count": 0.0, + "step": 9030, + "text_loss": 0.43176764249801636 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.022216796875, + "learning_rate": 6.147440801912218e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 14565829.0, + "repeat_count": 1.0, + "routers_loss": 0.0024230771232396364, + "skip_count": 0.0, + "step": 9032, + "text_loss": 0.5683854818344116 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 6.132580126236197e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14569016.0, + "repeat_count": 0.0, + "routers_loss": 0.004686394706368446, + "skip_count": 1.0, + "step": 9034, + "text_loss": 0.5422781705856323 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 42.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 6.117736260638223e-05, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 14572558.0, + "repeat_count": 2.0, + "routers_loss": 0.0010892068967223167, + "skip_count": 1.0, + "step": 9036, + "text_loss": 0.5740243196487427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.43205165835045, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 6.102909210806495e-05, + "loss": 0.006, + "macro_f1": 0.3272727429866791, + "num_tokens": 14575969.0, + "repeat_count": 1.0, + "routers_loss": 0.0163960512727499, + "skip_count": 0.0, + "step": 9038, + "text_loss": 0.4803958535194397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 6.088098982422768e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14578746.0, + "repeat_count": 0.0, + "routers_loss": 0.0020733694545924664, + "skip_count": 0.0, + "step": 9040, + "text_loss": 0.30313390493392944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.45083651306135, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 6.073305581162342e-05, + "loss": 0.0066, + "macro_f1": 0.6601307392120361, + "num_tokens": 14581856.0, + "repeat_count": 1.0, + "routers_loss": 0.022739989683032036, + "skip_count": 2.0, + "step": 9042, + "text_loss": 0.5871608257293701 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 6.058529012694086e-05, + "loss": 0.0034, + "macro_f1": 1.0, + "num_tokens": 14584754.0, + "repeat_count": 1.0, + "routers_loss": 0.012138293124735355, + "skip_count": 2.0, + "step": 9044, + "text_loss": 0.18492890894412994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053466796875, + "learning_rate": 6.0437692826803893e-05, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 14587867.0, + "repeat_count": 0.0, + "routers_loss": 0.0009839123813435435, + "skip_count": 0.0, + "step": 9046, + "text_loss": 0.5532476902008057 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 42.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.11376953125, + "learning_rate": 6.029026396777237e-05, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 14591521.0, + "repeat_count": 2.0, + "routers_loss": 0.01392262615263462, + "skip_count": 5.0, + "step": 9048, + "text_loss": 0.20356278121471405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.48840622248312, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 6.0143003606341174e-05, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 14595358.0, + "repeat_count": 0.0, + "routers_loss": 0.018218200653791428, + "skip_count": 1.0, + "step": 9050, + "text_loss": 0.3070164620876312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.019775390625, + "learning_rate": 5.9995911798940764e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 14598696.0, + "repeat_count": 0.0, + "routers_loss": 0.0002688709646463394, + "skip_count": 1.0, + "step": 9052, + "text_loss": 0.5637917518615723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 5.984898860193694e-05, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 14602301.0, + "repeat_count": 0.0, + "routers_loss": 0.003135781968012452, + "skip_count": 0.0, + "step": 9054, + "text_loss": 0.345111608505249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 5.9702234071631e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 14606625.0, + "repeat_count": 0.0, + "routers_loss": 0.002299862913787365, + "skip_count": 0.0, + "step": 9056, + "text_loss": 0.30707255005836487 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 5.9555648264259576e-05, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 14610303.0, + "repeat_count": 1.0, + "routers_loss": 0.0007164468406699598, + "skip_count": 0.0, + "step": 9058, + "text_loss": 0.56083083152771 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 5.940923123599462e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 14613211.0, + "repeat_count": 0.0, + "routers_loss": 0.00136603566352278, + "skip_count": 0.0, + "step": 9060, + "text_loss": 0.4455239474773407 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 5.926298304294336e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 14615844.0, + "repeat_count": 0.0, + "routers_loss": 0.001727075781673193, + "skip_count": 0.0, + "step": 9062, + "text_loss": 0.5928102731704712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 5.911690374114842e-05, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 14619190.0, + "repeat_count": 0.0, + "routers_loss": 0.0022300337441265583, + "skip_count": 0.0, + "step": 9064, + "text_loss": 0.9456163048744202 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 5.8970993386587676e-05, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 14622304.0, + "repeat_count": 0.0, + "routers_loss": 0.006507525686174631, + "skip_count": 2.0, + "step": 9066, + "text_loss": 0.1809750199317932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 5.882525203517419e-05, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 14625386.0, + "repeat_count": 0.0, + "routers_loss": 0.0022866397630423307, + "skip_count": 0.0, + "step": 9068, + "text_loss": 0.1849939227104187 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.048095703125, + "learning_rate": 5.867967974275629e-05, + "loss": 0.0097, + "macro_f1": 1.0, + "num_tokens": 14628472.0, + "repeat_count": 1.0, + "routers_loss": 0.0058460538275539875, + "skip_count": 2.0, + "step": 9070, + "text_loss": 0.2627561688423157 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 5.853427656511773e-05, + "loss": 0.0071, + "macro_f1": 1.0, + "num_tokens": 14631187.0, + "repeat_count": 1.0, + "routers_loss": 0.0085217310115695, + "skip_count": 2.0, + "step": 9072, + "text_loss": 0.18039973080158234 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 42.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 5.838904255797717e-05, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 14633919.0, + "repeat_count": 1.0, + "routers_loss": 0.007423012051731348, + "skip_count": 4.0, + "step": 9074, + "text_loss": 0.23746201395988464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 5.8243977776988585e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 14636674.0, + "repeat_count": 0.0, + "routers_loss": 0.0011181328445672989, + "skip_count": 0.0, + "step": 9076, + "text_loss": 0.38140806555747986 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 42.619900205459345, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 5.8099082277741024e-05, + "loss": 0.0052, + "macro_f1": 0.9262410998344421, + "num_tokens": 14639506.0, + "repeat_count": 3.0, + "routers_loss": 0.03306882083415985, + "skip_count": 2.0, + "step": 9078, + "text_loss": 0.2627770006656647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 5.795435611575872e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 14642955.0, + "repeat_count": 0.0, + "routers_loss": 0.0014759303303435445, + "skip_count": 0.0, + "step": 9080, + "text_loss": 0.47112786769866943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 5.78097993465011e-05, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 14646018.0, + "repeat_count": 0.0, + "routers_loss": 0.003744201036170125, + "skip_count": 0.0, + "step": 9082, + "text_loss": 0.36873605847358704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 5.7665412025362516e-05, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 14649402.0, + "repeat_count": 0.0, + "routers_loss": 0.002992798574268818, + "skip_count": 2.0, + "step": 9084, + "text_loss": 0.6350628137588501 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 5.752119420767243e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 14652248.0, + "repeat_count": 0.0, + "routers_loss": 0.005798593629151583, + "skip_count": 2.0, + "step": 9086, + "text_loss": 0.2512637972831726 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 5.7377145948695474e-05, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 14655060.0, + "repeat_count": 0.0, + "routers_loss": 0.0024162146728485823, + "skip_count": 0.0, + "step": 9088, + "text_loss": 0.4233066439628601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 42.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 5.723326730363115e-05, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 14658873.0, + "repeat_count": 1.0, + "routers_loss": 0.004826475866138935, + "skip_count": 4.0, + "step": 9090, + "text_loss": 0.45946353673934937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 5.7089558327614036e-05, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 14661865.0, + "repeat_count": 0.0, + "routers_loss": 0.0020765739027410746, + "skip_count": 2.0, + "step": 9092, + "text_loss": 0.9425542950630188 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 5.694601907571356e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 14666085.0, + "repeat_count": 0.0, + "routers_loss": 0.0012533976696431637, + "skip_count": 0.0, + "step": 9094, + "text_loss": 0.6307007670402527 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 42.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 5.680264960293446e-05, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 14668992.0, + "repeat_count": 1.0, + "routers_loss": 0.013796845450997353, + "skip_count": 5.0, + "step": 9096, + "text_loss": 0.21720129251480103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 5.665944996421612e-05, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 14672365.0, + "repeat_count": 0.0, + "routers_loss": 0.004391494672745466, + "skip_count": 0.0, + "step": 9098, + "text_loss": 0.28794240951538086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 5.651642021443287e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 14676232.0, + "repeat_count": 0.0, + "routers_loss": 0.0006779583054594696, + "skip_count": 0.0, + "step": 9100, + "text_loss": 0.45190441608428955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 23.0, + "epoch": 42.73260933372468, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.0213623046875, + "learning_rate": 5.637356040839398e-05, + "loss": 0.0049, + "macro_f1": 0.6289562582969666, + "num_tokens": 14679582.0, + "repeat_count": 0.0, + "routers_loss": 0.02379363216459751, + "skip_count": 6.0, + "step": 9102, + "text_loss": 0.3395652770996094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 42.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 5.623087060084364e-05, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 14683438.0, + "repeat_count": 0.0, + "routers_loss": 0.00344930961728096, + "skip_count": 4.0, + "step": 9104, + "text_loss": 0.4345538914203644 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 42.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 5.60883508464608e-05, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 14686333.0, + "repeat_count": 0.0, + "routers_loss": 0.005554547533392906, + "skip_count": 3.0, + "step": 9106, + "text_loss": 0.5202528238296509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 5.594600119985932e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 14690754.0, + "repeat_count": 0.0, + "routers_loss": 0.004589532967656851, + "skip_count": 1.0, + "step": 9108, + "text_loss": 0.3040390610694885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.77017904314646, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 5.580382171558784e-05, + "loss": 0.0055, + "macro_f1": 0.32098764181137085, + "num_tokens": 14693793.0, + "repeat_count": 0.0, + "routers_loss": 0.029969461262226105, + "skip_count": 2.0, + "step": 9110, + "text_loss": 0.3644331693649292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 5.566181244812979e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14697290.0, + "repeat_count": 0.0, + "routers_loss": 0.003387648146599531, + "skip_count": 0.0, + "step": 9112, + "text_loss": 0.5177932977676392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 5.5519973451903404e-05, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 14700597.0, + "repeat_count": 0.0, + "routers_loss": 0.004790942650288343, + "skip_count": 1.0, + "step": 9114, + "text_loss": 0.2132686972618103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 5.5378304781261715e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 14703852.0, + "repeat_count": 0.0, + "routers_loss": 0.0007685191812925041, + "skip_count": 0.0, + "step": 9116, + "text_loss": 0.6690551042556763 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 5.523680649049234e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 14707218.0, + "repeat_count": 1.0, + "routers_loss": 0.0033531817607581615, + "skip_count": 0.0, + "step": 9118, + "text_loss": 0.26232191920280457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.81714117992369, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 5.509547863381781e-05, + "loss": 0.0084, + "macro_f1": 0.3272727429866791, + "num_tokens": 14710244.0, + "repeat_count": 1.0, + "routers_loss": 0.025616342201828957, + "skip_count": 0.0, + "step": 9120, + "text_loss": 0.2897983193397522 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 5.495432126539507e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14713495.0, + "repeat_count": 0.0, + "routers_loss": 0.0014400121290236712, + "skip_count": 0.0, + "step": 9122, + "text_loss": 0.4580271244049072 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 5.481333443931602e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 14716703.0, + "repeat_count": 0.0, + "routers_loss": 0.0008548611658625305, + "skip_count": 0.0, + "step": 9124, + "text_loss": 0.5140601992607117 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.84531846199002, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 5.4672518209607e-05, + "loss": 0.0075, + "macro_f1": 0.9255813956260681, + "num_tokens": 14719443.0, + "repeat_count": 3.0, + "routers_loss": 0.02092800848186016, + "skip_count": 4.0, + "step": 9126, + "text_loss": 0.2842077314853668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 5.4531872630228965e-05, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 14722711.0, + "repeat_count": 0.0, + "routers_loss": 0.0037711653858423233, + "skip_count": 0.0, + "step": 9128, + "text_loss": 0.3268158733844757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 5.4391397755077784e-05, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 14725635.0, + "repeat_count": 0.0, + "routers_loss": 0.005959369707852602, + "skip_count": 0.0, + "step": 9130, + "text_loss": 0.44725099205970764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0181884765625, + "learning_rate": 5.425109363798358e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 14728945.0, + "repeat_count": 0.0, + "routers_loss": 0.0011272960109636188, + "skip_count": 0.0, + "step": 9132, + "text_loss": 0.45580998063087463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0167236328125, + "learning_rate": 5.411096033271118e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 14732271.0, + "repeat_count": 0.0, + "routers_loss": 0.0015554855344817042, + "skip_count": 0.0, + "step": 9134, + "text_loss": 0.16767354309558868 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 5.3970997892959894e-05, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 14735462.0, + "repeat_count": 4.0, + "routers_loss": 0.007287262007594109, + "skip_count": 5.0, + "step": 9136, + "text_loss": 0.8925374746322632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 5.383120637236366e-05, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 14739288.0, + "repeat_count": 0.0, + "routers_loss": 0.004336730111390352, + "skip_count": 0.0, + "step": 9138, + "text_loss": 0.29503148794174194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 5.369158582449074e-05, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 14742058.0, + "repeat_count": 0.0, + "routers_loss": 0.004528806544840336, + "skip_count": 0.0, + "step": 9140, + "text_loss": 0.16937516629695892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 5.3552136302844e-05, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 14745628.0, + "repeat_count": 0.0, + "routers_loss": 0.0005676734144799411, + "skip_count": 0.0, + "step": 9142, + "text_loss": 0.48764488101005554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 5.3412857860860917e-05, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 14748482.0, + "repeat_count": 0.0, + "routers_loss": 0.0017468055011704564, + "skip_count": 0.0, + "step": 9144, + "text_loss": 0.46164339780807495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.93924273554447, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 5.327375055191314e-05, + "loss": 0.0051, + "macro_f1": 0.3272727429866791, + "num_tokens": 14751091.0, + "repeat_count": 0.0, + "routers_loss": 0.007167307659983635, + "skip_count": 1.0, + "step": 9146, + "text_loss": 0.37566086649894714 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 5.3134814429306896e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 14753850.0, + "repeat_count": 0.0, + "routers_loss": 0.003801940008997917, + "skip_count": 2.0, + "step": 9148, + "text_loss": 0.17589576542377472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 5.299604954628268e-05, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 14756779.0, + "repeat_count": 0.0, + "routers_loss": 0.00396628538146615, + "skip_count": 1.0, + "step": 9150, + "text_loss": 0.4118746817111969 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 42.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 5.2857455956015544e-05, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 14759574.0, + "repeat_count": 2.0, + "routers_loss": 0.003950111567974091, + "skip_count": 0.0, + "step": 9152, + "text_loss": 0.5839328169822693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 5.271903371161479e-05, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 14762802.0, + "repeat_count": 0.0, + "routers_loss": 0.0006622051005251706, + "skip_count": 1.0, + "step": 9154, + "text_loss": 0.40162989497184753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 5.2580782866124054e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 14766136.0, + "repeat_count": 0.0, + "routers_loss": 0.003140404587611556, + "skip_count": 0.0, + "step": 9156, + "text_loss": 0.2028028815984726 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 5.244270347252139e-05, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 14769306.0, + "repeat_count": 0.0, + "routers_loss": 0.0035792726557701826, + "skip_count": 1.0, + "step": 9158, + "text_loss": 0.5611430406570435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 5.2304795583719034e-05, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 14771928.0, + "repeat_count": 0.0, + "routers_loss": 0.007276696152985096, + "skip_count": 2.0, + "step": 9160, + "text_loss": 0.1382172554731369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 5.2167059252563485e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 14775047.0, + "repeat_count": 0.0, + "routers_loss": 0.003121814923360944, + "skip_count": 0.0, + "step": 9162, + "text_loss": 0.6130381226539612 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 31.0, + "epoch": 43.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 5.2029494531835695e-05, + "loss": 0.0071, + "macro_f1": 1.0, + "num_tokens": 14777746.0, + "repeat_count": 4.0, + "routers_loss": 0.006029475014656782, + "skip_count": 1.0, + "step": 9164, + "text_loss": 0.5901363492012024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 43.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 5.189210147425061e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 14780813.0, + "repeat_count": 0.0, + "routers_loss": 0.0034428017679601908, + "skip_count": 5.0, + "step": 9166, + "text_loss": 0.5909968018531799 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 5.1754880132457494e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 14785178.0, + "repeat_count": 0.0, + "routers_loss": 0.0025068193208426237, + "skip_count": 2.0, + "step": 9168, + "text_loss": 0.20257101953029633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 5.161783055904001e-05, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 14788307.0, + "repeat_count": 0.0, + "routers_loss": 0.003352245781570673, + "skip_count": 0.0, + "step": 9170, + "text_loss": 0.20024186372756958 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 43.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 5.1480952806515654e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 14791053.0, + "repeat_count": 1.0, + "routers_loss": 0.0009423785959370434, + "skip_count": 0.0, + "step": 9172, + "text_loss": 0.6944412589073181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 5.13442469273363e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14794259.0, + "repeat_count": 0.0, + "routers_loss": 0.0016676477389410138, + "skip_count": 0.0, + "step": 9174, + "text_loss": 0.10889370739459991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 5.1207712973887875e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 14797345.0, + "repeat_count": 0.0, + "routers_loss": 0.005842766724526882, + "skip_count": 2.0, + "step": 9176, + "text_loss": 0.17763052880764008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 5.107135099849042e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 14800819.0, + "repeat_count": 0.0, + "routers_loss": 0.0004951528972014785, + "skip_count": 0.0, + "step": 9178, + "text_loss": 0.43891432881355286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 5.093516105339818e-05, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 14803924.0, + "repeat_count": 0.0, + "routers_loss": 0.0031010014936327934, + "skip_count": 1.0, + "step": 9180, + "text_loss": 0.39177098870277405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 5.079914319079931e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14807083.0, + "repeat_count": 0.0, + "routers_loss": 0.00047361713950522244, + "skip_count": 0.0, + "step": 9182, + "text_loss": 0.39144888520240784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 5.066329746281617e-05, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 14810263.0, + "repeat_count": 0.0, + "routers_loss": 0.0018734827172011137, + "skip_count": 0.0, + "step": 9184, + "text_loss": 0.531446099281311 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 5.052762392150506e-05, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 14813761.0, + "repeat_count": 0.0, + "routers_loss": 0.00503428652882576, + "skip_count": 0.0, + "step": 9186, + "text_loss": 0.19398775696754456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 5.039212261885634e-05, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 14817708.0, + "repeat_count": 0.0, + "routers_loss": 0.0010842647170647979, + "skip_count": 0.0, + "step": 9188, + "text_loss": 0.5365647077560425 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 43.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0172119140625, + "learning_rate": 5.025679360679442e-05, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 14820912.0, + "repeat_count": 2.0, + "routers_loss": 0.004775309935212135, + "skip_count": 2.0, + "step": 9190, + "text_loss": 0.6473321318626404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 5.012163693717747e-05, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 14824115.0, + "repeat_count": 0.0, + "routers_loss": 0.004022061824798584, + "skip_count": 0.0, + "step": 9192, + "text_loss": 0.24432586133480072 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 4.9986652661798025e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14827404.0, + "repeat_count": 0.0, + "routers_loss": 0.00231996551156044, + "skip_count": 1.0, + "step": 9194, + "text_loss": 0.7459486722946167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 4.98518408323822e-05, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 14830077.0, + "repeat_count": 0.0, + "routers_loss": 0.000999651150777936, + "skip_count": 0.0, + "step": 9196, + "text_loss": 0.5136345624923706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 4.971720150059012e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 14833231.0, + "repeat_count": 0.0, + "routers_loss": 0.0033226648811250925, + "skip_count": 2.0, + "step": 9198, + "text_loss": 0.1597593128681183 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 4.958273471801583e-05, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 14836534.0, + "repeat_count": 0.0, + "routers_loss": 0.00400200579315424, + "skip_count": 0.0, + "step": 9200, + "text_loss": 0.16248664259910583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 4.94484405361873e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 14840301.0, + "repeat_count": 0.0, + "routers_loss": 0.0038636941462755203, + "skip_count": 0.0, + "step": 9202, + "text_loss": 0.20964740216732025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 4.9314319006566296e-05, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 14844094.0, + "repeat_count": 0.0, + "routers_loss": 0.00593461561948061, + "skip_count": 2.0, + "step": 9204, + "text_loss": 0.43311986327171326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0166015625, + "learning_rate": 4.918037018054844e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 14847148.0, + "repeat_count": 0.0, + "routers_loss": 0.0007939442875795066, + "skip_count": 0.0, + "step": 9206, + "text_loss": 0.8805840015411377 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 4.904659410946311e-05, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 14851556.0, + "repeat_count": 2.0, + "routers_loss": 0.0058822291903197765, + "skip_count": 4.0, + "step": 9208, + "text_loss": 0.2123873233795166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 4.891299084457362e-05, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 14855208.0, + "repeat_count": 0.0, + "routers_loss": 0.0024413811042904854, + "skip_count": 0.0, + "step": 9210, + "text_loss": 0.4408712685108185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 4.8779560437076983e-05, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 14858433.0, + "repeat_count": 0.0, + "routers_loss": 0.007487752009183168, + "skip_count": 1.0, + "step": 9212, + "text_loss": 0.7417129874229431 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 4.864630293810401e-05, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 14861739.0, + "repeat_count": 0.0, + "routers_loss": 0.007972145453095436, + "skip_count": 2.0, + "step": 9214, + "text_loss": 0.3347324728965759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 4.851321839871908e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 14865220.0, + "repeat_count": 0.0, + "routers_loss": 0.006238576490432024, + "skip_count": 1.0, + "step": 9216, + "text_loss": 0.49660998582839966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 4.838030686992062e-05, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 14868179.0, + "repeat_count": 0.0, + "routers_loss": 0.003592922119423747, + "skip_count": 0.0, + "step": 9218, + "text_loss": 0.316535621881485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 43.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 4.824756840264055e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 14870950.0, + "repeat_count": 0.0, + "routers_loss": 0.012321153655648232, + "skip_count": 3.0, + "step": 9220, + "text_loss": 0.270915150642395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 4.8115003047744466e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 14873749.0, + "repeat_count": 0.0, + "routers_loss": 0.0008396002231165767, + "skip_count": 0.0, + "step": 9222, + "text_loss": 0.4190096855163574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0169677734375, + "learning_rate": 4.798261085603162e-05, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 14877349.0, + "repeat_count": 0.0, + "routers_loss": 0.002983161248266697, + "skip_count": 1.0, + "step": 9224, + "text_loss": 0.8203139901161194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 4.785039187823503e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 14881192.0, + "repeat_count": 0.0, + "routers_loss": 0.003951616585254669, + "skip_count": 2.0, + "step": 9226, + "text_loss": 0.36447709798812866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 4.771834616502119e-05, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 14884608.0, + "repeat_count": 0.0, + "routers_loss": 0.001604852732270956, + "skip_count": 0.0, + "step": 9228, + "text_loss": 0.733951985836029 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.333431171118285, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 4.758647376699032e-05, + "loss": 0.0053, + "macro_f1": 0.8820862174034119, + "num_tokens": 14887963.0, + "repeat_count": 2.0, + "routers_loss": 0.041028670966625214, + "skip_count": 2.0, + "step": 9230, + "text_loss": 0.1800784021615982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 4.7454774734676074e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 14890769.0, + "repeat_count": 0.0, + "routers_loss": 0.0027380166575312614, + "skip_count": 0.0, + "step": 9232, + "text_loss": 0.6017972230911255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 4.732324911854591e-05, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 14894162.0, + "repeat_count": 0.0, + "routers_loss": 0.0018064725445583463, + "skip_count": 2.0, + "step": 9234, + "text_loss": 0.5853637456893921 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 43.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 4.7191896969000617e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 14897248.0, + "repeat_count": 1.0, + "routers_loss": 0.005479716695845127, + "skip_count": 0.0, + "step": 9236, + "text_loss": 0.6206526756286621 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 43.371000880540066, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 4.706071833637454e-05, + "loss": 0.0059, + "macro_f1": 0.9446290731430054, + "num_tokens": 14900186.0, + "repeat_count": 4.0, + "routers_loss": 0.013435420580208302, + "skip_count": 3.0, + "step": 9238, + "text_loss": 0.46402135491371155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 43.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 4.692971327093559e-05, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 14903080.0, + "repeat_count": 1.0, + "routers_loss": 0.007366253528743982, + "skip_count": 4.0, + "step": 9240, + "text_loss": 0.6870771646499634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 4.6798881822885276e-05, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 14906837.0, + "repeat_count": 1.0, + "routers_loss": 0.004979560151696205, + "skip_count": 2.0, + "step": 9242, + "text_loss": 0.46396589279174805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 4.666822404235838e-05, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 14909541.0, + "repeat_count": 0.0, + "routers_loss": 0.00023516178771387786, + "skip_count": 0.0, + "step": 9244, + "text_loss": 0.5960518717765808 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 43.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 4.6537739979423174e-05, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 14912820.0, + "repeat_count": 1.0, + "routers_loss": 0.0014796241885051131, + "skip_count": 1.0, + "step": 9246, + "text_loss": 0.48075684905052185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 4.640742968408146e-05, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 14916283.0, + "repeat_count": 0.0, + "routers_loss": 0.001386807532981038, + "skip_count": 0.0, + "step": 9248, + "text_loss": 0.3950015902519226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 43.427355444672735, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.037109375, + "learning_rate": 4.627729320626833e-05, + "loss": 0.0061, + "macro_f1": 0.9452888369560242, + "num_tokens": 14918958.0, + "repeat_count": 1.0, + "routers_loss": 0.020335515961050987, + "skip_count": 4.0, + "step": 9250, + "text_loss": 0.6995832324028015 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 4.6147330595852354e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14921888.0, + "repeat_count": 0.0, + "routers_loss": 0.005387732293456793, + "skip_count": 2.0, + "step": 9252, + "text_loss": 0.2771800756454468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 4.601754190263552e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14925135.0, + "repeat_count": 0.0, + "routers_loss": 0.001703745685517788, + "skip_count": 1.0, + "step": 9254, + "text_loss": 0.7100088596343994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 4.5887927176352875e-05, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 14929198.0, + "repeat_count": 0.0, + "routers_loss": 0.0058114733546972275, + "skip_count": 2.0, + "step": 9256, + "text_loss": 0.21729083359241486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 4.5758486466673244e-05, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 14932685.0, + "repeat_count": 0.0, + "routers_loss": 0.0026105218566954136, + "skip_count": 0.0, + "step": 9258, + "text_loss": 0.20695121586322784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 4.5629219823198564e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 14937901.0, + "repeat_count": 0.0, + "routers_loss": 0.006947176996618509, + "skip_count": 2.0, + "step": 9260, + "text_loss": 0.15886647999286652 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 4.550012729546393e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 14941406.0, + "repeat_count": 0.0, + "routers_loss": 0.0011366386897861958, + "skip_count": 0.0, + "step": 9262, + "text_loss": 0.49892309308052063 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 43.49310243616085, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 4.537120893293789e-05, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 14944200.0, + "repeat_count": 1.0, + "routers_loss": 0.002686526160687208, + "skip_count": 1.0, + "step": 9264, + "text_loss": 0.6201852560043335 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 4.5242464785022256e-05, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 14947592.0, + "repeat_count": 0.0, + "routers_loss": 0.0007816873257979751, + "skip_count": 0.0, + "step": 9266, + "text_loss": 0.49434536695480347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 4.5113894901051944e-05, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 14950382.0, + "repeat_count": 0.0, + "routers_loss": 0.0013167982688173652, + "skip_count": 0.0, + "step": 9268, + "text_loss": 0.696306586265564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 43.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 4.498549933029511e-05, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 14953424.0, + "repeat_count": 0.0, + "routers_loss": 0.006240467075258493, + "skip_count": 3.0, + "step": 9270, + "text_loss": 0.14193731546401978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 4.485727812195339e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14956937.0, + "repeat_count": 0.0, + "routers_loss": 0.006212725769728422, + "skip_count": 2.0, + "step": 9272, + "text_loss": 0.40858668088912964 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 4.472923132516132e-05, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 14960398.0, + "repeat_count": 0.0, + "routers_loss": 0.003120801877230406, + "skip_count": 2.0, + "step": 9274, + "text_loss": 0.4740981459617615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 4.46013589889866e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 14963037.0, + "repeat_count": 0.0, + "routers_loss": 0.0027343074325472116, + "skip_count": 0.0, + "step": 9276, + "text_loss": 0.1420614868402481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 4.4473661162430176e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14965604.0, + "repeat_count": 0.0, + "routers_loss": 0.0006372901843860745, + "skip_count": 0.0, + "step": 9278, + "text_loss": 0.4628531336784363 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 4.4346137894426155e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 14968803.0, + "repeat_count": 0.0, + "routers_loss": 0.0062922025099396706, + "skip_count": 2.0, + "step": 9280, + "text_loss": 0.29813849925994873 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 4.421878923384159e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 14972557.0, + "repeat_count": 0.0, + "routers_loss": 0.006071912590414286, + "skip_count": 2.0, + "step": 9282, + "text_loss": 0.19581027328968048 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 43.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 4.40916152294768e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 14975358.0, + "repeat_count": 1.0, + "routers_loss": 0.001606325968168676, + "skip_count": 0.0, + "step": 9284, + "text_loss": 0.6929896473884583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 4.3964615930065124e-05, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 14978045.0, + "repeat_count": 0.0, + "routers_loss": 0.002845643786713481, + "skip_count": 1.0, + "step": 9286, + "text_loss": 0.49997636675834656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 4.3837791384272744e-05, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 14981606.0, + "repeat_count": 0.0, + "routers_loss": 0.005257320590317249, + "skip_count": 1.0, + "step": 9288, + "text_loss": 0.3391074538230896 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.61520399178163, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 4.3711141640699395e-05, + "loss": 0.0045, + "macro_f1": 0.8820862174034119, + "num_tokens": 14984404.0, + "repeat_count": 2.0, + "routers_loss": 0.02914038859307766, + "skip_count": 2.0, + "step": 9290, + "text_loss": 0.29165980219841003 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 4.3584666747877254e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 14987280.0, + "repeat_count": 0.0, + "routers_loss": 0.005831835325807333, + "skip_count": 1.0, + "step": 9292, + "text_loss": 0.5312305688858032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 4.345836675427184e-05, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 14990071.0, + "repeat_count": 0.0, + "routers_loss": 0.0035566375590860844, + "skip_count": 0.0, + "step": 9294, + "text_loss": 0.25595441460609436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 4.333224170828149e-05, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 14993809.0, + "repeat_count": 0.0, + "routers_loss": 0.0026552488561719656, + "skip_count": 0.0, + "step": 9296, + "text_loss": 0.18538808822631836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 43.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 4.3206291658237586e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 14996794.0, + "repeat_count": 0.0, + "routers_loss": 0.010047328658401966, + "skip_count": 4.0, + "step": 9298, + "text_loss": 0.37891554832458496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 4.308051665240442e-05, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 15000911.0, + "repeat_count": 0.0, + "routers_loss": 0.0030308531131595373, + "skip_count": 0.0, + "step": 9300, + "text_loss": 0.20204831659793854 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 4.295491673897922e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 15004106.0, + "repeat_count": 0.0, + "routers_loss": 0.003695673542097211, + "skip_count": 1.0, + "step": 9302, + "text_loss": 0.84013831615448 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 4.282949196609215e-05, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 15007482.0, + "repeat_count": 0.0, + "routers_loss": 0.000820459274109453, + "skip_count": 0.0, + "step": 9304, + "text_loss": 0.4521652162075043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 4.2704242381806144e-05, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 15010579.0, + "repeat_count": 0.0, + "routers_loss": 0.006170184817165136, + "skip_count": 1.0, + "step": 9306, + "text_loss": 0.22438007593154907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 43.699735837980626, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.051025390625, + "learning_rate": 4.25791680341171e-05, + "loss": 0.0065, + "macro_f1": 0.6122449040412903, + "num_tokens": 15013835.0, + "repeat_count": 0.0, + "routers_loss": 0.021745599806308746, + "skip_count": 4.0, + "step": 9308, + "text_loss": 0.5847432613372803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 4.245426897095372e-05, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 15017268.0, + "repeat_count": 0.0, + "routers_loss": 0.0022570823784917593, + "skip_count": 1.0, + "step": 9310, + "text_loss": 0.345931738615036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 4.232954524017763e-05, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 15020095.0, + "repeat_count": 0.0, + "routers_loss": 0.0009895693510770798, + "skip_count": 0.0, + "step": 9312, + "text_loss": 0.5374923944473267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 4.220499688958307e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 15022763.0, + "repeat_count": 0.0, + "routers_loss": 0.005146807990968227, + "skip_count": 0.0, + "step": 9314, + "text_loss": 0.7208939790725708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 4.208062396689738e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 15025926.0, + "repeat_count": 0.0, + "routers_loss": 0.00369556387886405, + "skip_count": 1.0, + "step": 9316, + "text_loss": 0.36686572432518005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 4.1956426519780435e-05, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 15029120.0, + "repeat_count": 0.0, + "routers_loss": 0.00971714872866869, + "skip_count": 2.0, + "step": 9318, + "text_loss": 0.20697914063930511 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 43.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 4.183240459582488e-05, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 15032000.0, + "repeat_count": 1.0, + "routers_loss": 0.002361048012971878, + "skip_count": 1.0, + "step": 9320, + "text_loss": 0.6737313866615295 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 4.1708558242556207e-05, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 15034831.0, + "repeat_count": 0.0, + "routers_loss": 0.001238204538822174, + "skip_count": 0.0, + "step": 9322, + "text_loss": 0.823642373085022 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 4.1584887507432556e-05, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 15037487.0, + "repeat_count": 0.0, + "routers_loss": 0.005211949814110994, + "skip_count": 1.0, + "step": 9324, + "text_loss": 0.3821350634098053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 4.146139243784475e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 15040167.0, + "repeat_count": 0.0, + "routers_loss": 0.007513152435421944, + "skip_count": 0.0, + "step": 9326, + "text_loss": 0.18124167621135712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 4.133807308111637e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 15043777.0, + "repeat_count": 0.0, + "routers_loss": 0.0029832208529114723, + "skip_count": 0.0, + "step": 9328, + "text_loss": 0.47313618659973145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 4.1214929484503615e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 15046622.0, + "repeat_count": 0.0, + "routers_loss": 0.009155526757240295, + "skip_count": 1.0, + "step": 9330, + "text_loss": 0.20556017756462097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 4.1091961695195304e-05, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 15049543.0, + "repeat_count": 0.0, + "routers_loss": 0.003529169363901019, + "skip_count": 0.0, + "step": 9332, + "text_loss": 0.18752245604991913 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 4.0969169760313005e-05, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 15052924.0, + "repeat_count": 1.0, + "routers_loss": 0.002136822324246168, + "skip_count": 2.0, + "step": 9334, + "text_loss": 0.85563725233078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053466796875, + "learning_rate": 4.084655372691076e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 15056579.0, + "repeat_count": 0.0, + "routers_loss": 0.003167972667142749, + "skip_count": 2.0, + "step": 9336, + "text_loss": 0.45709627866744995 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 43.8406222483123, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0240478515625, + "learning_rate": 4.07241136419752e-05, + "loss": 0.0048, + "macro_f1": 0.5492662787437439, + "num_tokens": 15059739.0, + "repeat_count": 0.0, + "routers_loss": 0.03742539510130882, + "skip_count": 2.0, + "step": 9338, + "text_loss": 0.19531641900539398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 4.06018495524258e-05, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 15062795.0, + "repeat_count": 0.0, + "routers_loss": 0.002699678996577859, + "skip_count": 0.0, + "step": 9340, + "text_loss": 0.31032654643058777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 4.047976150511423e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 15066591.0, + "repeat_count": 0.0, + "routers_loss": 0.0026099481619894505, + "skip_count": 0.0, + "step": 9342, + "text_loss": 0.4676157832145691 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.052490234375, + "learning_rate": 4.035784954682486e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 15069509.0, + "repeat_count": 0.0, + "routers_loss": 0.006772278342396021, + "skip_count": 1.0, + "step": 9344, + "text_loss": 0.23385995626449585 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 43.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 4.0236113724274713e-05, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 15072898.0, + "repeat_count": 1.0, + "routers_loss": 0.0005968905170448124, + "skip_count": 0.0, + "step": 9346, + "text_loss": 0.6250094175338745 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 4.011455408411302e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 15075547.0, + "repeat_count": 0.0, + "routers_loss": 0.012884319759905338, + "skip_count": 2.0, + "step": 9348, + "text_loss": 0.23720405995845795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 3.9993170672921794e-05, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 15078902.0, + "repeat_count": 0.0, + "routers_loss": 0.0018171088304370642, + "skip_count": 0.0, + "step": 9350, + "text_loss": 0.23975110054016113 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 43.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0458984375, + "learning_rate": 3.9871963537215284e-05, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 15082292.0, + "repeat_count": 1.0, + "routers_loss": 0.001974726328626275, + "skip_count": 1.0, + "step": 9352, + "text_loss": 0.354034423828125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 3.975093272344038e-05, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 15085288.0, + "repeat_count": 0.0, + "routers_loss": 0.0014760299818590283, + "skip_count": 0.0, + "step": 9354, + "text_loss": 0.6398947834968567 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 43.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 3.963007827797627e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 15089089.0, + "repeat_count": 0.0, + "routers_loss": 0.004467889666557312, + "skip_count": 3.0, + "step": 9356, + "text_loss": 0.26422595977783203 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 3.950940024713462e-05, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 15092178.0, + "repeat_count": 0.0, + "routers_loss": 0.0048953029327094555, + "skip_count": 1.0, + "step": 9358, + "text_loss": 0.7519236207008362 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 43.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 3.9388898677159446e-05, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 15094825.0, + "repeat_count": 1.0, + "routers_loss": 0.004229324869811535, + "skip_count": 1.0, + "step": 9360, + "text_loss": 0.522379457950592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 43.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 3.9268573614227146e-05, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 15098119.0, + "repeat_count": 0.0, + "routers_loss": 0.0028480603359639645, + "skip_count": 3.0, + "step": 9362, + "text_loss": 0.47443902492523193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 3.914842510444666e-05, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 15101362.0, + "repeat_count": 0.0, + "routers_loss": 0.0024998984299600124, + "skip_count": 1.0, + "step": 9364, + "text_loss": 0.6255060434341431 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 43.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 3.9028453193859006e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 15104544.0, + "repeat_count": 0.0, + "routers_loss": 0.008692052215337753, + "skip_count": 1.0, + "step": 9366, + "text_loss": 0.26974618434906006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 43.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 3.890865792843768e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 15107619.0, + "repeat_count": 0.0, + "routers_loss": 0.002779777627438307, + "skip_count": 2.0, + "step": 9368, + "text_loss": 0.4157184064388275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 43.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 3.878903935408845e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 15111352.0, + "repeat_count": 0.0, + "routers_loss": 0.0010220289696007967, + "skip_count": 0.0, + "step": 9370, + "text_loss": 0.5674155950546265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 3.866959751664939e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 15114088.0, + "repeat_count": 0.0, + "routers_loss": 0.004387985449284315, + "skip_count": 1.0, + "step": 9372, + "text_loss": 0.3638002276420593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 3.8550332461890824e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 15117271.0, + "repeat_count": 0.0, + "routers_loss": 0.0005855522467754781, + "skip_count": 0.0, + "step": 9374, + "text_loss": 0.6257871389389038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 3.843124423551536e-05, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 15119936.0, + "repeat_count": 0.0, + "routers_loss": 0.0026496360078454018, + "skip_count": 0.0, + "step": 9376, + "text_loss": 0.7019506096839905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 3.8312332883157774e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 15123407.0, + "repeat_count": 0.0, + "routers_loss": 0.0024072150699794292, + "skip_count": 0.0, + "step": 9378, + "text_loss": 0.45380696654319763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 3.819359845038517e-05, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 15126742.0, + "repeat_count": 0.0, + "routers_loss": 0.00031929166289046407, + "skip_count": 0.0, + "step": 9380, + "text_loss": 0.5322204828262329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 3.807504098269682e-05, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 15130854.0, + "repeat_count": 0.0, + "routers_loss": 0.00177620945032686, + "skip_count": 0.0, + "step": 9382, + "text_loss": 0.5220870971679688 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 44.05635456413267, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.02783203125, + "learning_rate": 3.7956660525524156e-05, + "loss": 0.0071, + "macro_f1": 0.8823530077934265, + "num_tokens": 15135054.0, + "repeat_count": 1.0, + "routers_loss": 0.013358182273805141, + "skip_count": 2.0, + "step": 9384, + "text_loss": 0.39796701073646545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 3.783845712423067e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 15139179.0, + "repeat_count": 0.0, + "routers_loss": 0.0030253338627517223, + "skip_count": 0.0, + "step": 9386, + "text_loss": 0.13592341542243958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 3.772043082411236e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 15142436.0, + "repeat_count": 0.0, + "routers_loss": 0.0008311813580803573, + "skip_count": 0.0, + "step": 9388, + "text_loss": 0.7804215550422668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 3.760258167039704e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 15146071.0, + "repeat_count": 0.0, + "routers_loss": 0.012432600371539593, + "skip_count": 1.0, + "step": 9390, + "text_loss": 0.37692421674728394 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8571428656578064, + "avg_layers": 23.0, + "epoch": 44.09392427355445, + "f1_execute": 0.9756097793579102, + "f1_repeat": 1.0, + "f1_skip": 0.9230769276618958, + "grad_norm": 0.053955078125, + "learning_rate": 3.748490970824464e-05, + "loss": 0.0074, + "macro_f1": 0.9662289023399353, + "num_tokens": 15149020.0, + "repeat_count": 1.0, + "routers_loss": 0.03158312290906906, + "skip_count": 7.0, + "step": 9392, + "text_loss": 0.6111845374107361 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0166015625, + "learning_rate": 3.7367414982747374e-05, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 15151887.0, + "repeat_count": 0.0, + "routers_loss": 0.000898235070053488, + "skip_count": 0.0, + "step": 9394, + "text_loss": 0.42988476157188416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 3.7250097538929384e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 15155395.0, + "repeat_count": 0.0, + "routers_loss": 0.0024584042839705944, + "skip_count": 1.0, + "step": 9396, + "text_loss": 0.4083070456981659 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 3.713295742174694e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 15158275.0, + "repeat_count": 0.0, + "routers_loss": 0.0012269694125279784, + "skip_count": 0.0, + "step": 9398, + "text_loss": 0.529385507106781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 3.701599467608835e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 15161533.0, + "repeat_count": 0.0, + "routers_loss": 0.002610012423247099, + "skip_count": 1.0, + "step": 9400, + "text_loss": 0.1785552203655243 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 44.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 3.6899209346773986e-05, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 15164799.0, + "repeat_count": 1.0, + "routers_loss": 0.0012146600056439638, + "skip_count": 0.0, + "step": 9402, + "text_loss": 0.9209059476852417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 3.678260147855628e-05, + "loss": 0.0028, + "macro_f1": 0.6666666865348816, + "num_tokens": 15168111.0, + "repeat_count": 0.0, + "routers_loss": 0.001716976286843419, + "skip_count": 1.0, + "step": 9404, + "text_loss": 0.5762659907341003 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0213623046875, + "learning_rate": 3.6666171116119474e-05, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 15171285.0, + "repeat_count": 1.0, + "routers_loss": 0.005656248424202204, + "skip_count": 2.0, + "step": 9406, + "text_loss": 0.3065127432346344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 3.6549918304079946e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 15174838.0, + "repeat_count": 0.0, + "routers_loss": 0.002362997969612479, + "skip_count": 2.0, + "step": 9408, + "text_loss": 0.5256759524345398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 3.643384308698594e-05, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 15177713.0, + "repeat_count": 0.0, + "routers_loss": 0.002327109221369028, + "skip_count": 1.0, + "step": 9410, + "text_loss": 0.27613985538482666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 44.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 3.6317945509317716e-05, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 15180863.0, + "repeat_count": 1.0, + "routers_loss": 0.008501979522407055, + "skip_count": 0.0, + "step": 9412, + "text_loss": 0.3379829525947571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 3.6202225615487525e-05, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 15184531.0, + "repeat_count": 0.0, + "routers_loss": 0.004115676507353783, + "skip_count": 0.0, + "step": 9414, + "text_loss": 0.24313601851463318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 3.6086683449839454e-05, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 15187699.0, + "repeat_count": 0.0, + "routers_loss": 0.0017425924306735396, + "skip_count": 0.0, + "step": 9416, + "text_loss": 0.47485142946243286 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 44.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 3.597131905664935e-05, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 15190528.0, + "repeat_count": 1.0, + "routers_loss": 0.0031498887110501528, + "skip_count": 1.0, + "step": 9418, + "text_loss": 0.5356660485267639 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 3.585613248012515e-05, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 15194165.0, + "repeat_count": 0.0, + "routers_loss": 0.006833057850599289, + "skip_count": 1.0, + "step": 9420, + "text_loss": 0.21593274176120758 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 3.574112376440658e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 15197612.0, + "repeat_count": 0.0, + "routers_loss": 0.0013788710348308086, + "skip_count": 1.0, + "step": 9422, + "text_loss": 0.5275097489356995 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022216796875, + "learning_rate": 3.5626292953565175e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 15201103.0, + "repeat_count": 0.0, + "routers_loss": 0.0021296890918165445, + "skip_count": 0.0, + "step": 9424, + "text_loss": 0.3420610725879669 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 3.551164009160429e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 15204007.0, + "repeat_count": 0.0, + "routers_loss": 0.0025281559210270643, + "skip_count": 0.0, + "step": 9426, + "text_loss": 0.4756413996219635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 3.539716522245917e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 15208066.0, + "repeat_count": 0.0, + "routers_loss": 0.0008577071712352335, + "skip_count": 0.0, + "step": 9428, + "text_loss": 0.7672523260116577 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 44.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 3.528286838999672e-05, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 15211118.0, + "repeat_count": 1.0, + "routers_loss": 0.002977409167215228, + "skip_count": 0.0, + "step": 9430, + "text_loss": 0.5010796785354614 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 44.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 3.5168749638015806e-05, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 15214245.0, + "repeat_count": 1.0, + "routers_loss": 0.0009552660631015897, + "skip_count": 0.0, + "step": 9432, + "text_loss": 0.6633321642875671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 3.505480901024677e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 15217449.0, + "repeat_count": 0.0, + "routers_loss": 0.005598205607384443, + "skip_count": 2.0, + "step": 9434, + "text_loss": 0.545702338218689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 44.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 3.494104655035213e-05, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 15220391.0, + "repeat_count": 0.0, + "routers_loss": 0.0154950562864542, + "skip_count": 4.0, + "step": 9436, + "text_loss": 0.211164191365242 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 3.4827462301925735e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 15224061.0, + "repeat_count": 0.0, + "routers_loss": 0.001531782210804522, + "skip_count": 0.0, + "step": 9438, + "text_loss": 0.49369096755981445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 3.471405630849328e-05, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 15227586.0, + "repeat_count": 0.0, + "routers_loss": 0.004152537789195776, + "skip_count": 1.0, + "step": 9440, + "text_loss": 0.1624782234430313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046875, + "learning_rate": 3.4600828613512156e-05, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 15230713.0, + "repeat_count": 0.0, + "routers_loss": 0.0026113570202142, + "skip_count": 0.0, + "step": 9442, + "text_loss": 0.1921689808368683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 44.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 3.44877792603715e-05, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 15233925.0, + "repeat_count": 0.0, + "routers_loss": 0.008077848702669144, + "skip_count": 3.0, + "step": 9444, + "text_loss": 0.32417818903923035 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 3.437490829239193e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 15236684.0, + "repeat_count": 0.0, + "routers_loss": 0.0005273211863823235, + "skip_count": 0.0, + "step": 9446, + "text_loss": 0.3497772812843323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 3.4262215752825895e-05, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 15239866.0, + "repeat_count": 0.0, + "routers_loss": 0.0015295564662665129, + "skip_count": 0.0, + "step": 9448, + "text_loss": 0.7613807320594788 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 44.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 3.414970168485737e-05, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 15243615.0, + "repeat_count": 1.0, + "routers_loss": 0.0039047773461788893, + "skip_count": 0.0, + "step": 9450, + "text_loss": 0.3325706720352173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.375697094217784, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 3.403736613160191e-05, + "loss": 0.0049, + "macro_f1": 0.32098764181137085, + "num_tokens": 15246714.0, + "repeat_count": 0.0, + "routers_loss": 0.0300968699157238, + "skip_count": 2.0, + "step": 9452, + "text_loss": 0.3441869020462036 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 44.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 3.392520913610681e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 15249520.0, + "repeat_count": 1.0, + "routers_loss": 0.0037529836408793926, + "skip_count": 0.0, + "step": 9454, + "text_loss": 0.5083104968070984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 3.381323074135073e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 15252527.0, + "repeat_count": 0.0, + "routers_loss": 0.0019368440844118595, + "skip_count": 2.0, + "step": 9456, + "text_loss": 0.49744489789009094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 3.3701430990244085e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 15255330.0, + "repeat_count": 0.0, + "routers_loss": 0.0033424650318920612, + "skip_count": 1.0, + "step": 9458, + "text_loss": 0.5603348016738892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 3.35898099256286e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 15257961.0, + "repeat_count": 0.0, + "routers_loss": 0.0006928095244802535, + "skip_count": 0.0, + "step": 9460, + "text_loss": 0.5270714163780212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 3.347836759027789e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 15261137.0, + "repeat_count": 0.0, + "routers_loss": 0.0030718250200152397, + "skip_count": 2.0, + "step": 9462, + "text_loss": 0.11651179939508438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.43205165835045, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 3.33671040268968e-05, + "loss": 0.0064, + "macro_f1": 0.6601307392120361, + "num_tokens": 15264234.0, + "repeat_count": 1.0, + "routers_loss": 0.03508305177092552, + "skip_count": 2.0, + "step": 9464, + "text_loss": 0.14562347531318665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.441444085705896, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 3.3256019278121717e-05, + "loss": 0.0066, + "macro_f1": 0.3272727429866791, + "num_tokens": 15267047.0, + "repeat_count": 0.0, + "routers_loss": 0.008365205489099026, + "skip_count": 1.0, + "step": 9466, + "text_loss": 0.8550931215286255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 3.3145113386520485e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 15270442.0, + "repeat_count": 0.0, + "routers_loss": 0.0036910634953528643, + "skip_count": 0.0, + "step": 9468, + "text_loss": 0.24741731584072113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 3.30343863945925e-05, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 15273845.0, + "repeat_count": 0.0, + "routers_loss": 0.0014966290909796953, + "skip_count": 0.0, + "step": 9470, + "text_loss": 0.5137372612953186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 3.2923838344768534e-05, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 15277940.0, + "repeat_count": 0.0, + "routers_loss": 0.0028104602824896574, + "skip_count": 0.0, + "step": 9472, + "text_loss": 0.5737728476524353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 3.281346927941087e-05, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 15281640.0, + "repeat_count": 0.0, + "routers_loss": 0.007870957255363464, + "skip_count": 2.0, + "step": 9474, + "text_loss": 0.27684518694877625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 3.270327924081301e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 15284877.0, + "repeat_count": 0.0, + "routers_loss": 0.006224945653229952, + "skip_count": 0.0, + "step": 9476, + "text_loss": 0.35599255561828613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 3.259326827120013e-05, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 15287945.0, + "repeat_count": 0.0, + "routers_loss": 0.001179040758870542, + "skip_count": 0.0, + "step": 9478, + "text_loss": 0.26802319288253784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 3.2483436412728553e-05, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 15290754.0, + "repeat_count": 0.0, + "routers_loss": 0.001992281526327133, + "skip_count": 0.0, + "step": 9480, + "text_loss": 0.40124714374542236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 3.2373783707486057e-05, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 15294841.0, + "repeat_count": 0.0, + "routers_loss": 0.0012830843916162848, + "skip_count": 0.0, + "step": 9482, + "text_loss": 0.6739225387573242 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 3.226431019749171e-05, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 15298397.0, + "repeat_count": 0.0, + "routers_loss": 0.003624147269874811, + "skip_count": 2.0, + "step": 9484, + "text_loss": 0.5250326991081238 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.016357421875, + "learning_rate": 3.2155015924696105e-05, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 15301499.0, + "repeat_count": 0.0, + "routers_loss": 0.0019682408310472965, + "skip_count": 0.0, + "step": 9486, + "text_loss": 0.5574567317962646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 3.204590093098098e-05, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 15304531.0, + "repeat_count": 0.0, + "routers_loss": 0.002245094161480665, + "skip_count": 0.0, + "step": 9488, + "text_loss": 0.4065501093864441 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 3.1936965258159366e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 15307826.0, + "repeat_count": 0.0, + "routers_loss": 0.002919224789366126, + "skip_count": 1.0, + "step": 9490, + "text_loss": 0.5183609127998352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 3.1828208947975615e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 15311420.0, + "repeat_count": 0.0, + "routers_loss": 0.004961747210472822, + "skip_count": 1.0, + "step": 9492, + "text_loss": 0.1962234377861023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 3.171963204210537e-05, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 15314196.0, + "repeat_count": 0.0, + "routers_loss": 0.0026044815313071012, + "skip_count": 0.0, + "step": 9494, + "text_loss": 0.223251610994339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 3.161123458215553e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 15317174.0, + "repeat_count": 0.0, + "routers_loss": 0.0029661289881914854, + "skip_count": 0.0, + "step": 9496, + "text_loss": 0.32970958948135376 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 3.150301660966415e-05, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 15320343.0, + "repeat_count": 0.0, + "routers_loss": 0.0011696632718667388, + "skip_count": 0.0, + "step": 9498, + "text_loss": 0.8590811491012573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 3.13949781661006e-05, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 15324138.0, + "repeat_count": 0.0, + "routers_loss": 0.0015035583637654781, + "skip_count": 0.0, + "step": 9500, + "text_loss": 0.6658036708831787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 3.1287119292865375e-05, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 15328395.0, + "repeat_count": 0.0, + "routers_loss": 0.001930502592585981, + "skip_count": 0.0, + "step": 9502, + "text_loss": 0.4104210138320923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 3.117944003129025e-05, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 15332196.0, + "repeat_count": 0.0, + "routers_loss": 0.0010025398805737495, + "skip_count": 0.0, + "step": 9504, + "text_loss": 0.7272399663925171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 44.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 3.107194042263806e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 15335253.0, + "repeat_count": 1.0, + "routers_loss": 0.004520092159509659, + "skip_count": 0.0, + "step": 9506, + "text_loss": 0.29173022508621216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 3.096462050810284e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 15338129.0, + "repeat_count": 0.0, + "routers_loss": 0.0009707154240459204, + "skip_count": 0.0, + "step": 9508, + "text_loss": 0.6530287861824036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 3.0857480328809916e-05, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 15341487.0, + "repeat_count": 0.0, + "routers_loss": 0.0008689566748216748, + "skip_count": 0.0, + "step": 9510, + "text_loss": 0.36988505721092224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 3.0750519925815565e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 15344460.0, + "repeat_count": 0.0, + "routers_loss": 0.0022587007842957973, + "skip_count": 0.0, + "step": 9512, + "text_loss": 0.2447768598794937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 3.064373934010711e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 15348135.0, + "repeat_count": 0.0, + "routers_loss": 0.001986770424991846, + "skip_count": 0.0, + "step": 9514, + "text_loss": 0.43159469962120056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 3.053713861260321e-05, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 15351073.0, + "repeat_count": 0.0, + "routers_loss": 0.0003514432755764574, + "skip_count": 0.0, + "step": 9516, + "text_loss": 0.3638324737548828 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 3.043071778415335e-05, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 15353633.0, + "repeat_count": 0.0, + "routers_loss": 0.003395392093807459, + "skip_count": 0.0, + "step": 9518, + "text_loss": 0.5728140473365784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 3.03244768955383e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 15357322.0, + "repeat_count": 0.0, + "routers_loss": 0.0016641782131046057, + "skip_count": 0.0, + "step": 9520, + "text_loss": 0.666814386844635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0177001953125, + "learning_rate": 3.021841598746966e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 15360771.0, + "repeat_count": 0.0, + "routers_loss": 0.0024721708614379168, + "skip_count": 0.0, + "step": 9522, + "text_loss": 0.7148030400276184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 3.01125351005902e-05, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 15364281.0, + "repeat_count": 0.0, + "routers_loss": 0.004133665468543768, + "skip_count": 0.0, + "step": 9524, + "text_loss": 0.2985752820968628 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 3.0006834275473737e-05, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 15367354.0, + "repeat_count": 0.0, + "routers_loss": 0.003016186412423849, + "skip_count": 1.0, + "step": 9526, + "text_loss": 0.22689883410930634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 44.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01531982421875, + "learning_rate": 2.9901313552624932e-05, + "loss": 0.003, + "macro_f1": 1.0, + "num_tokens": 15371027.0, + "repeat_count": 1.0, + "routers_loss": 0.015333639457821846, + "skip_count": 7.0, + "step": 9528, + "text_loss": 0.8308720588684082 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 2.97959729724796e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 15373948.0, + "repeat_count": 0.0, + "routers_loss": 0.001420815708115697, + "skip_count": 0.0, + "step": 9530, + "text_loss": 0.5439777970314026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 2.9690812575404456e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 15377366.0, + "repeat_count": 0.0, + "routers_loss": 0.0007130459416657686, + "skip_count": 0.0, + "step": 9532, + "text_loss": 0.45405295491218567 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.76078661579102, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 2.95858324016971e-05, + "loss": 0.0067, + "macro_f1": 0.3272727429866791, + "num_tokens": 15380115.0, + "repeat_count": 1.0, + "routers_loss": 0.04256885498762131, + "skip_count": 0.0, + "step": 9534, + "text_loss": 0.39998912811279297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 44.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 2.9481032491586178e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 15383205.0, + "repeat_count": 0.0, + "routers_loss": 0.004944019019603729, + "skip_count": 4.0, + "step": 9536, + "text_loss": 0.1882237195968628 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 2.937641288523124e-05, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 15386619.0, + "repeat_count": 0.0, + "routers_loss": 0.007820523343980312, + "skip_count": 1.0, + "step": 9538, + "text_loss": 0.26401394605636597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 2.9271973622722603e-05, + "loss": 0.0026, + "macro_f1": 0.3333333432674408, + "num_tokens": 15389135.0, + "repeat_count": 0.0, + "routers_loss": 0.0010751578956842422, + "skip_count": 0.0, + "step": 9540, + "text_loss": 0.39813846349716187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 2.9167714744081643e-05, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 15392150.0, + "repeat_count": 0.0, + "routers_loss": 0.0031554463785141706, + "skip_count": 2.0, + "step": 9542, + "text_loss": 0.669784665107727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 2.9063636289260677e-05, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 15394974.0, + "repeat_count": 0.0, + "routers_loss": 0.00287301791831851, + "skip_count": 1.0, + "step": 9544, + "text_loss": 0.176493301987648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 2.8959738298142635e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 15398432.0, + "repeat_count": 0.0, + "routers_loss": 0.0011708475649356842, + "skip_count": 0.0, + "step": 9546, + "text_loss": 0.8762983083724976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 2.885602081054145e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 15401121.0, + "repeat_count": 0.0, + "routers_loss": 0.003167103510349989, + "skip_count": 1.0, + "step": 9548, + "text_loss": 0.2538717985153198 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 44.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 2.8752483866201885e-05, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 15404105.0, + "repeat_count": 1.0, + "routers_loss": 0.007552143186330795, + "skip_count": 5.0, + "step": 9550, + "text_loss": 0.37045153975486755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 2.8649127504799423e-05, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 15407232.0, + "repeat_count": 1.0, + "routers_loss": 0.007718692068010569, + "skip_count": 2.0, + "step": 9552, + "text_loss": 0.15780900418758392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 2.8545951765940547e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 15410425.0, + "repeat_count": 0.0, + "routers_loss": 0.0003527951193973422, + "skip_count": 0.0, + "step": 9554, + "text_loss": 0.5931823253631592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 2.8442956689162193e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 15413724.0, + "repeat_count": 0.0, + "routers_loss": 0.00146177364513278, + "skip_count": 0.0, + "step": 9556, + "text_loss": 0.691118061542511 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 2.8340142313932448e-05, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 15416776.0, + "repeat_count": 0.0, + "routers_loss": 0.0010256811510771513, + "skip_count": 0.0, + "step": 9558, + "text_loss": 0.40814271569252014 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 2.823750867964997e-05, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 15419815.0, + "repeat_count": 0.0, + "routers_loss": 0.0047921910881996155, + "skip_count": 0.0, + "step": 9560, + "text_loss": 0.28953713178634644 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 2.8135055825644072e-05, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 15422806.0, + "repeat_count": 0.0, + "routers_loss": 0.002010057680308819, + "skip_count": 1.0, + "step": 9562, + "text_loss": 0.8377944231033325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 44.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 2.803278379117491e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 15425405.0, + "repeat_count": 0.0, + "routers_loss": 0.005009239539504051, + "skip_count": 1.0, + "step": 9564, + "text_loss": 0.5936337113380432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 2.793069261543335e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 15428233.0, + "repeat_count": 0.0, + "routers_loss": 0.007967893034219742, + "skip_count": 2.0, + "step": 9566, + "text_loss": 0.49891290068626404 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 2.7828782337540882e-05, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 15431095.0, + "repeat_count": 2.0, + "routers_loss": 0.00638923142105341, + "skip_count": 4.0, + "step": 9568, + "text_loss": 0.30928006768226624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 44.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 2.7727052996549763e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 15434933.0, + "repeat_count": 0.0, + "routers_loss": 0.0060427505522966385, + "skip_count": 3.0, + "step": 9570, + "text_loss": 0.21274788677692413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.93924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 2.762550463144281e-05, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 15437655.0, + "repeat_count": 0.0, + "routers_loss": 0.0012480237055569887, + "skip_count": 0.0, + "step": 9572, + "text_loss": 0.31049492955207825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 2.7524137281133567e-05, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 15440643.0, + "repeat_count": 0.0, + "routers_loss": 0.005919245071709156, + "skip_count": 0.0, + "step": 9574, + "text_loss": 0.16459886729717255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 2.7422950984466233e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 15443532.0, + "repeat_count": 0.0, + "routers_loss": 0.0061412835493683815, + "skip_count": 2.0, + "step": 9576, + "text_loss": 0.7102797031402588 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 2.7321945780215573e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 15447027.0, + "repeat_count": 0.0, + "routers_loss": 0.001149018993601203, + "skip_count": 0.0, + "step": 9578, + "text_loss": 0.22778025269508362 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 2.722112170708696e-05, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 15450173.0, + "repeat_count": 0.0, + "routers_loss": 0.002216014079749584, + "skip_count": 0.0, + "step": 9580, + "text_loss": 0.21447396278381348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 44.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 2.7120478803716264e-05, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 15452838.0, + "repeat_count": 0.0, + "routers_loss": 0.00498749827966094, + "skip_count": 0.0, + "step": 9582, + "text_loss": 0.1664455235004425 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 44.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 2.7020017108670246e-05, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 15455928.0, + "repeat_count": 1.0, + "routers_loss": 0.005886784754693508, + "skip_count": 3.0, + "step": 9584, + "text_loss": 0.3929266631603241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 2.691973666044589e-05, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 15459447.0, + "repeat_count": 0.0, + "routers_loss": 0.0029895263724029064, + "skip_count": 1.0, + "step": 9586, + "text_loss": 0.27535343170166016 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 2.681963749747085e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 15462340.0, + "repeat_count": 1.0, + "routers_loss": 0.0038893253076821566, + "skip_count": 0.0, + "step": 9588, + "text_loss": 0.6950465440750122 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 2.671971965810338e-05, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 15465432.0, + "repeat_count": 1.0, + "routers_loss": 0.0016947018448263407, + "skip_count": 0.0, + "step": 9590, + "text_loss": 0.41451266407966614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 2.6619983180632134e-05, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 15468300.0, + "repeat_count": 0.0, + "routers_loss": 0.0011597154662013054, + "skip_count": 0.0, + "step": 9592, + "text_loss": 0.5846080780029297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 2.6520428103276316e-05, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 15471084.0, + "repeat_count": 0.0, + "routers_loss": 0.005555236246436834, + "skip_count": 2.0, + "step": 9594, + "text_loss": 0.4151473939418793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 2.6421054464185633e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 15474348.0, + "repeat_count": 0.0, + "routers_loss": 0.0015279205981642008, + "skip_count": 0.0, + "step": 9596, + "text_loss": 0.28742483258247375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 2.6321862301440234e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 15477493.0, + "repeat_count": 0.0, + "routers_loss": 0.0019169533625245094, + "skip_count": 0.0, + "step": 9598, + "text_loss": 0.338019460439682 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048095703125, + "learning_rate": 2.6222851653050773e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 15480257.0, + "repeat_count": 0.0, + "routers_loss": 0.0015131557593122125, + "skip_count": 1.0, + "step": 9600, + "text_loss": 0.5982558727264404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 2.612402255695828e-05, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 15482838.0, + "repeat_count": 0.0, + "routers_loss": 0.0026768618263304234, + "skip_count": 0.0, + "step": 9602, + "text_loss": 0.32012176513671875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 2.6025375051034306e-05, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 15485746.0, + "repeat_count": 0.0, + "routers_loss": 0.002152341417968273, + "skip_count": 0.0, + "step": 9604, + "text_loss": 0.16942192614078522 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 45.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 2.5926909173080658e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 15488669.0, + "repeat_count": 0.0, + "routers_loss": 0.003325721947476268, + "skip_count": 3.0, + "step": 9606, + "text_loss": 0.47950080037117004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 2.582862496082977e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 15491512.0, + "repeat_count": 0.0, + "routers_loss": 0.0023114588111639023, + "skip_count": 1.0, + "step": 9608, + "text_loss": 0.3907585144042969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 2.5730522451944292e-05, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 15494479.0, + "repeat_count": 0.0, + "routers_loss": 0.003140041371807456, + "skip_count": 2.0, + "step": 9610, + "text_loss": 0.198005810379982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 2.5632601684017264e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 15497900.0, + "repeat_count": 0.0, + "routers_loss": 0.0015117402654141188, + "skip_count": 0.0, + "step": 9612, + "text_loss": 0.874154269695282 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 2.5534862694572114e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 15501817.0, + "repeat_count": 0.0, + "routers_loss": 0.00551232136785984, + "skip_count": 2.0, + "step": 9614, + "text_loss": 0.1933375597000122 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 2.543730552106266e-05, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 15504872.0, + "repeat_count": 0.0, + "routers_loss": 0.001090583624318242, + "skip_count": 0.0, + "step": 9616, + "text_loss": 0.4030717611312866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 2.533993020087294e-05, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 15507727.0, + "repeat_count": 0.0, + "routers_loss": 0.007001800462603569, + "skip_count": 0.0, + "step": 9618, + "text_loss": 0.4812186062335968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 2.5242736771317333e-05, + "loss": 0.0025, + "macro_f1": 0.3333333432674408, + "num_tokens": 15510689.0, + "repeat_count": 0.0, + "routers_loss": 0.0016861478798091412, + "skip_count": 0.0, + "step": 9620, + "text_loss": 0.4578339457511902 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.17375990607572, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.05517578125, + "learning_rate": 2.514572526964065e-05, + "loss": 0.0068, + "macro_f1": 0.8817967176437378, + "num_tokens": 15513419.0, + "repeat_count": 2.0, + "routers_loss": 0.050852373242378235, + "skip_count": 3.0, + "step": 9622, + "text_loss": 0.4038950204849243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 2.5048895733017772e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 15516289.0, + "repeat_count": 0.0, + "routers_loss": 0.0015001936117187142, + "skip_count": 0.0, + "step": 9624, + "text_loss": 0.8331962823867798 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 2.4952248198554073e-05, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 15519476.0, + "repeat_count": 0.0, + "routers_loss": 0.0009114370332099497, + "skip_count": 1.0, + "step": 9626, + "text_loss": 0.4997985363006592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017822265625, + "learning_rate": 2.4855782703284925e-05, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 15523363.0, + "repeat_count": 0.0, + "routers_loss": 0.0011186953634023666, + "skip_count": 0.0, + "step": 9628, + "text_loss": 0.2572024464607239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 45.211329615497505, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0262451171875, + "learning_rate": 2.4759499284176145e-05, + "loss": 0.0059, + "macro_f1": 0.6122449040412903, + "num_tokens": 15526289.0, + "repeat_count": 0.0, + "routers_loss": 0.019600817933678627, + "skip_count": 4.0, + "step": 9630, + "text_loss": 0.6323924660682678 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 45.22072204285295, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 2.466339797812378e-05, + "loss": 0.0065, + "macro_f1": 0.9265305995941162, + "num_tokens": 15530260.0, + "repeat_count": 3.0, + "routers_loss": 0.02459629252552986, + "skip_count": 1.0, + "step": 9632, + "text_loss": 0.1824527233839035 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 45.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 2.4567478821954038e-05, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 15533916.0, + "repeat_count": 2.0, + "routers_loss": 0.009077859111130238, + "skip_count": 2.0, + "step": 9634, + "text_loss": 0.4518069326877594 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 2.4471741852423235e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 15536958.0, + "repeat_count": 1.0, + "routers_loss": 0.002355317585170269, + "skip_count": 0.0, + "step": 9636, + "text_loss": 0.8873519897460938 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 2.437618710621803e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 15540544.0, + "repeat_count": 0.0, + "routers_loss": 0.001198371173813939, + "skip_count": 0.0, + "step": 9638, + "text_loss": 0.4845949709415436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 2.4280814619955128e-05, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 15543355.0, + "repeat_count": 0.0, + "routers_loss": 0.0009287866414524615, + "skip_count": 0.0, + "step": 9640, + "text_loss": 0.5979563593864441 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 2.4185624430181464e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 15547215.0, + "repeat_count": 0.0, + "routers_loss": 0.0028763876762241125, + "skip_count": 0.0, + "step": 9642, + "text_loss": 0.16279318928718567 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0196533203125, + "learning_rate": 2.4090616573374135e-05, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 15550412.0, + "repeat_count": 0.0, + "routers_loss": 0.0013361044693738222, + "skip_count": 0.0, + "step": 9644, + "text_loss": 0.2864333987236023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 45.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 2.3995791085940244e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 15553660.0, + "repeat_count": 2.0, + "routers_loss": 0.0019316677935421467, + "skip_count": 0.0, + "step": 9646, + "text_loss": 0.6333117485046387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023193359375, + "learning_rate": 2.390114800421722e-05, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 15556287.0, + "repeat_count": 0.0, + "routers_loss": 0.0011288017267361283, + "skip_count": 1.0, + "step": 9648, + "text_loss": 0.6050677299499512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 2.380668736447239e-05, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 15559246.0, + "repeat_count": 0.0, + "routers_loss": 0.0014249378582462668, + "skip_count": 0.0, + "step": 9650, + "text_loss": 0.9484158754348755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 45.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 2.371240920290324e-05, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 15562251.0, + "repeat_count": 1.0, + "routers_loss": 0.00741320988163352, + "skip_count": 4.0, + "step": 9652, + "text_loss": 0.24387991428375244 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 2.361831355563726e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 15565704.0, + "repeat_count": 1.0, + "routers_loss": 0.000942508690059185, + "skip_count": 0.0, + "step": 9654, + "text_loss": 0.6523539423942566 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 45.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 2.352440045873233e-05, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 15568797.0, + "repeat_count": 1.0, + "routers_loss": 0.0064352210611104965, + "skip_count": 4.0, + "step": 9656, + "text_loss": 0.3206343650817871 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 2.3430669948175943e-05, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 15571855.0, + "repeat_count": 1.0, + "routers_loss": 0.0013390982057899237, + "skip_count": 0.0, + "step": 9658, + "text_loss": 0.8397402763366699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 2.3337122059885806e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 15575379.0, + "repeat_count": 0.0, + "routers_loss": 0.0012212366564199328, + "skip_count": 0.0, + "step": 9660, + "text_loss": 0.5116108655929565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 2.324375682970975e-05, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 15578108.0, + "repeat_count": 0.0, + "routers_loss": 0.003829900873824954, + "skip_count": 0.0, + "step": 9662, + "text_loss": 0.1423535794019699 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 45.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 2.3150574293425376e-05, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 15581830.0, + "repeat_count": 1.0, + "routers_loss": 0.012756838463246822, + "skip_count": 1.0, + "step": 9664, + "text_loss": 0.24676625430583954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 2.3057574486740507e-05, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 15584872.0, + "repeat_count": 0.0, + "routers_loss": 0.0020642473828047514, + "skip_count": 0.0, + "step": 9666, + "text_loss": 0.4851650893688202 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0184326171875, + "learning_rate": 2.2964757445292806e-05, + "loss": 0.0029, + "macro_f1": 1.0, + "num_tokens": 15588000.0, + "repeat_count": 2.0, + "routers_loss": 0.007441115565598011, + "skip_count": 3.0, + "step": 9668, + "text_loss": 0.6416954398155212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017333984375, + "learning_rate": 2.287212320464993e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 15591065.0, + "repeat_count": 0.0, + "routers_loss": 0.0015504831681028008, + "skip_count": 0.0, + "step": 9670, + "text_loss": 0.5852687358856201 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 45.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 2.2779671800309433e-05, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 15594631.0, + "repeat_count": 2.0, + "routers_loss": 0.005648284684866667, + "skip_count": 2.0, + "step": 9672, + "text_loss": 0.7172279357910156 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 2.2687403267699024e-05, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 15598664.0, + "repeat_count": 1.0, + "routers_loss": 0.003756999270990491, + "skip_count": 2.0, + "step": 9674, + "text_loss": 0.18986566364765167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 2.259531764217604e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 15601616.0, + "repeat_count": 0.0, + "routers_loss": 0.002155672525987029, + "skip_count": 0.0, + "step": 9676, + "text_loss": 0.4410690367221832 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 2.250341495902797e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 15604291.0, + "repeat_count": 1.0, + "routers_loss": 0.0020037787035107613, + "skip_count": 0.0, + "step": 9678, + "text_loss": 0.5565816164016724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 2.241169525347203e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 15607203.0, + "repeat_count": 0.0, + "routers_loss": 0.0014305647928267717, + "skip_count": 0.0, + "step": 9680, + "text_loss": 0.4879189729690552 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 2.2320158560655447e-05, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 15610475.0, + "repeat_count": 1.0, + "routers_loss": 0.016029199585318565, + "skip_count": 3.0, + "step": 9682, + "text_loss": 0.36342933773994446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 2.2228804915655153e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 15613810.0, + "repeat_count": 0.0, + "routers_loss": 0.0023584216833114624, + "skip_count": 0.0, + "step": 9684, + "text_loss": 0.18480375409126282 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 2.2137634353478043e-05, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 15617854.0, + "repeat_count": 0.0, + "routers_loss": 0.004325680434703827, + "skip_count": 1.0, + "step": 9686, + "text_loss": 0.5345974564552307 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 45.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 2.2046646909060996e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 15620874.0, + "repeat_count": 3.0, + "routers_loss": 0.006946994923055172, + "skip_count": 0.0, + "step": 9688, + "text_loss": 0.29016008973121643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.49310243616085, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 2.195584261727046e-05, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 15623875.0, + "repeat_count": 0.0, + "routers_loss": 0.0034732038620859385, + "skip_count": 1.0, + "step": 9690, + "text_loss": 0.2831312119960785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 2.1865221512902766e-05, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 15626371.0, + "repeat_count": 0.0, + "routers_loss": 0.002495788736268878, + "skip_count": 1.0, + "step": 9692, + "text_loss": 0.6090453267097473 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 45.511887290871734, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 2.1774783630684246e-05, + "loss": 0.0076, + "macro_f1": 0.6598639488220215, + "num_tokens": 15630129.0, + "repeat_count": 3.0, + "routers_loss": 0.017551302909851074, + "skip_count": 1.0, + "step": 9694, + "text_loss": 0.5127915740013123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 2.168452900527068e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 15633179.0, + "repeat_count": 0.0, + "routers_loss": 0.0004413482965901494, + "skip_count": 0.0, + "step": 9696, + "text_loss": 0.5901434421539307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 2.159445767124796e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 15636508.0, + "repeat_count": 0.0, + "routers_loss": 0.005992567166686058, + "skip_count": 1.0, + "step": 9698, + "text_loss": 0.8493689298629761 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 2.1504569663131523e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 15639371.0, + "repeat_count": 1.0, + "routers_loss": 0.0033268092665821314, + "skip_count": 0.0, + "step": 9700, + "text_loss": 0.2814267873764038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 2.1414865015366548e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 15643025.0, + "repeat_count": 0.0, + "routers_loss": 0.004418607335537672, + "skip_count": 0.0, + "step": 9702, + "text_loss": 0.2619725167751312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 45.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 2.1325343762328197e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 15646996.0, + "repeat_count": 0.0, + "routers_loss": 0.0050115580670535564, + "skip_count": 4.0, + "step": 9704, + "text_loss": 0.8204038143157959 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 2.123600593832109e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 15650194.0, + "repeat_count": 0.0, + "routers_loss": 0.0018730501178652048, + "skip_count": 1.0, + "step": 9706, + "text_loss": 0.694500744342804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 2.1146851577579673e-05, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 15653743.0, + "repeat_count": 0.0, + "routers_loss": 0.0016657712403684855, + "skip_count": 0.0, + "step": 9708, + "text_loss": 0.8211735486984253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 2.1057880714268064e-05, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 15657325.0, + "repeat_count": 0.0, + "routers_loss": 0.0029736643191426992, + "skip_count": 0.0, + "step": 9710, + "text_loss": 0.2846751809120178 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 45.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 2.0969093382479987e-05, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 15660522.0, + "repeat_count": 1.0, + "routers_loss": 0.01233653537929058, + "skip_count": 4.0, + "step": 9712, + "text_loss": 0.23991759121418 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 2.0880489616239062e-05, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 15663254.0, + "repeat_count": 0.0, + "routers_loss": 0.0012792183551937342, + "skip_count": 0.0, + "step": 9714, + "text_loss": 0.6943771243095398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 2.0792069449498297e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 15666283.0, + "repeat_count": 0.0, + "routers_loss": 0.0033134319819509983, + "skip_count": 0.0, + "step": 9716, + "text_loss": 0.4161235988140106 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 45.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 2.0703832916140476e-05, + "loss": 0.0034, + "macro_f1": 1.0, + "num_tokens": 15669774.0, + "repeat_count": 2.0, + "routers_loss": 0.006201022770255804, + "skip_count": 1.0, + "step": 9718, + "text_loss": 0.42691144347190857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 2.061578004997805e-05, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 15672943.0, + "repeat_count": 0.0, + "routers_loss": 0.0033355073537677526, + "skip_count": 1.0, + "step": 9720, + "text_loss": 0.9724727869033813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 2.0527910884753033e-05, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 15677847.0, + "repeat_count": 0.0, + "routers_loss": 0.0019593657925724983, + "skip_count": 0.0, + "step": 9722, + "text_loss": 0.417218416929245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 2.0440225454137097e-05, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 15681460.0, + "repeat_count": 0.0, + "routers_loss": 0.007862947881221771, + "skip_count": 2.0, + "step": 9724, + "text_loss": 0.24983589351177216 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 2.0352723791731364e-05, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 15685496.0, + "repeat_count": 1.0, + "routers_loss": 0.004811233840882778, + "skip_count": 0.0, + "step": 9726, + "text_loss": 0.32930606603622437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.8571428656578064, + "avg_layers": 22.0, + "epoch": 45.671558555914295, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.0, + "f1_skip": 0.9230769276618958, + "grad_norm": 0.045166015625, + "learning_rate": 2.0265405931066626e-05, + "loss": 0.0057, + "macro_f1": 0.633273720741272, + "num_tokens": 15688661.0, + "repeat_count": 0.0, + "routers_loss": 0.02648334763944149, + "skip_count": 7.0, + "step": 9728, + "text_loss": 0.42316386103630066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 45.68095098326974, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 2.0178271905603395e-05, + "loss": 0.0054, + "macro_f1": 0.6598639488220215, + "num_tokens": 15692778.0, + "repeat_count": 1.0, + "routers_loss": 0.04439396783709526, + "skip_count": 3.0, + "step": 9730, + "text_loss": 0.32248371839523315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 2.0091321748731517e-05, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 15695821.0, + "repeat_count": 0.0, + "routers_loss": 0.0020437403582036495, + "skip_count": 2.0, + "step": 9732, + "text_loss": 0.5959160923957825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 2.000455549377045e-05, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 15699324.0, + "repeat_count": 0.0, + "routers_loss": 0.0002844796108547598, + "skip_count": 0.0, + "step": 9734, + "text_loss": 0.45465928316116333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 1.9917973173969204e-05, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 15702044.0, + "repeat_count": 0.0, + "routers_loss": 0.003548701060935855, + "skip_count": 0.0, + "step": 9736, + "text_loss": 0.7129027843475342 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 23.0, + "epoch": 45.71852069269152, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.0279541015625, + "learning_rate": 1.9831574822506248e-05, + "loss": 0.0089, + "macro_f1": 0.6289562582969666, + "num_tokens": 15705474.0, + "repeat_count": 0.0, + "routers_loss": 0.023800918832421303, + "skip_count": 6.0, + "step": 9738, + "text_loss": 0.28479668498039246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 1.9745360472489648e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 15708323.0, + "repeat_count": 0.0, + "routers_loss": 0.01043168269097805, + "skip_count": 2.0, + "step": 9740, + "text_loss": 0.4760739803314209 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 45.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 1.9659330156956867e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 15711390.0, + "repeat_count": 0.0, + "routers_loss": 0.006430295296013355, + "skip_count": 2.0, + "step": 9742, + "text_loss": 0.13933971524238586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 45.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 1.957348390887487e-05, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 15714077.0, + "repeat_count": 0.0, + "routers_loss": 0.005738302133977413, + "skip_count": 3.0, + "step": 9744, + "text_loss": 0.49661460518836975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 1.948782176114017e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 15716818.0, + "repeat_count": 0.0, + "routers_loss": 0.0011776578612625599, + "skip_count": 0.0, + "step": 9746, + "text_loss": 0.36066678166389465 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 1.9402343746578567e-05, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 15720756.0, + "repeat_count": 0.0, + "routers_loss": 0.0005322427023202181, + "skip_count": 0.0, + "step": 9748, + "text_loss": 0.5549091696739197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 1.931704989794547e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 15724516.0, + "repeat_count": 0.0, + "routers_loss": 0.001399765140376985, + "skip_count": 0.0, + "step": 9750, + "text_loss": 0.21269696950912476 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 1.9231940247925572e-05, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 15727142.0, + "repeat_count": 0.0, + "routers_loss": 0.0018337799701839685, + "skip_count": 1.0, + "step": 9752, + "text_loss": 0.18105024099349976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 1.914701482913317e-05, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 15730023.0, + "repeat_count": 0.0, + "routers_loss": 0.0010057559702545404, + "skip_count": 0.0, + "step": 9754, + "text_loss": 0.477859228849411 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 45.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0198974609375, + "learning_rate": 1.906227367411173e-05, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 15733108.0, + "repeat_count": 0.0, + "routers_loss": 0.002486895304173231, + "skip_count": 3.0, + "step": 9756, + "text_loss": 0.4802452027797699 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 45.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 1.8977716815334335e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 15736130.0, + "repeat_count": 1.0, + "routers_loss": 0.004353616386651993, + "skip_count": 0.0, + "step": 9758, + "text_loss": 0.5479429960250854 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 1.8893344285203228e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 15738691.0, + "repeat_count": 0.0, + "routers_loss": 0.0031500225886702538, + "skip_count": 1.0, + "step": 9760, + "text_loss": 0.6871381402015686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 1.8809156116050164e-05, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 15741682.0, + "repeat_count": 0.0, + "routers_loss": 0.0023419202771037817, + "skip_count": 0.0, + "step": 9762, + "text_loss": 0.6725277900695801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 1.8725152340136163e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 15745314.0, + "repeat_count": 0.0, + "routers_loss": 0.0018769606249406934, + "skip_count": 0.0, + "step": 9764, + "text_loss": 0.4549144506454468 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 1.864133298965176e-05, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 15747982.0, + "repeat_count": 1.0, + "routers_loss": 0.0030958254355937243, + "skip_count": 2.0, + "step": 9766, + "text_loss": 0.4970727264881134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 1.8557698096716534e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 15750453.0, + "repeat_count": 0.0, + "routers_loss": 0.0020812496077269316, + "skip_count": 1.0, + "step": 9768, + "text_loss": 0.7540801167488098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 1.847424769337963e-05, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 15753857.0, + "repeat_count": 0.0, + "routers_loss": 0.0031040434259921312, + "skip_count": 0.0, + "step": 9770, + "text_loss": 0.5154248476028442 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 1.8390981811619356e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 15756742.0, + "repeat_count": 0.0, + "routers_loss": 0.002128311200067401, + "skip_count": 0.0, + "step": 9772, + "text_loss": 0.7327702045440674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 1.8307900483343354e-05, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 15759833.0, + "repeat_count": 0.0, + "routers_loss": 0.003279880853369832, + "skip_count": 1.0, + "step": 9774, + "text_loss": 0.2673797607421875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 1.8225003740388545e-05, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 15762768.0, + "repeat_count": 0.0, + "routers_loss": 0.004170822445303202, + "skip_count": 0.0, + "step": 9776, + "text_loss": 0.1820847988128662 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8888888955116272, + "avg_layers": 21.0, + "epoch": 45.90636923980041, + "f1_execute": 0.9729729890823364, + "f1_repeat": 1.0, + "f1_skip": 0.9411765336990356, + "grad_norm": 0.0194091796875, + "learning_rate": 1.8142291614521132e-05, + "loss": 0.0045, + "macro_f1": 0.9713832139968872, + "num_tokens": 15766965.0, + "repeat_count": 1.0, + "routers_loss": 0.022715313360095024, + "skip_count": 9.0, + "step": 9778, + "text_loss": 0.5590897798538208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 1.8059764137436596e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 15770199.0, + "repeat_count": 0.0, + "routers_loss": 0.007280370220541954, + "skip_count": 1.0, + "step": 9780, + "text_loss": 0.28117987513542175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 1.7977421340759582e-05, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 15773367.0, + "repeat_count": 0.0, + "routers_loss": 0.003529706271365285, + "skip_count": 0.0, + "step": 9782, + "text_loss": 0.18752245604991913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 1.7895263256044013e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 15776976.0, + "repeat_count": 0.0, + "routers_loss": 0.0025916248559951782, + "skip_count": 1.0, + "step": 9784, + "text_loss": 0.6330561637878418 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 45.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 1.781328991477299e-05, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 15780848.0, + "repeat_count": 0.0, + "routers_loss": 0.0049234069883823395, + "skip_count": 1.0, + "step": 9786, + "text_loss": 0.15685316920280457 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 45.95333137657764, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 1.7731501348358882e-05, + "loss": 0.0067, + "macro_f1": 0.8823530077934265, + "num_tokens": 15783808.0, + "repeat_count": 2.0, + "routers_loss": 0.011918511241674423, + "skip_count": 1.0, + "step": 9788, + "text_loss": 0.23963648080825806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 1.7649897588143226e-05, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 15787421.0, + "repeat_count": 0.0, + "routers_loss": 0.0018508053617551923, + "skip_count": 0.0, + "step": 9790, + "text_loss": 0.49311593174934387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 1.7568478665396736e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 15790274.0, + "repeat_count": 0.0, + "routers_loss": 0.0006157457246445119, + "skip_count": 0.0, + "step": 9792, + "text_loss": 0.4567435085773468 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 45.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 1.7487244611319285e-05, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 15794462.0, + "repeat_count": 3.0, + "routers_loss": 0.0031584864482283592, + "skip_count": 0.0, + "step": 9794, + "text_loss": 0.4325876832008362 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 45.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 1.740619545703992e-05, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 15797775.0, + "repeat_count": 0.0, + "routers_loss": 0.0028455168940126896, + "skip_count": 0.0, + "step": 9796, + "text_loss": 0.1487245261669159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 46.0, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.06201171875, + "learning_rate": 1.7325331233616847e-05, + "loss": 0.0078, + "macro_f1": 0.6122449040412903, + "num_tokens": 15801092.0, + "repeat_count": 0.0, + "routers_loss": 0.02560117095708847, + "skip_count": 4.0, + "step": 9798, + "text_loss": 0.5299228429794312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 46.00939242735544, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 1.7244651972037284e-05, + "loss": 0.0046, + "macro_f1": 0.6598639488220215, + "num_tokens": 15804049.0, + "repeat_count": 1.0, + "routers_loss": 0.010446238331496716, + "skip_count": 3.0, + "step": 9800, + "text_loss": 0.6591248512268066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 1.7164157703217886e-05, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 15807683.0, + "repeat_count": 0.0, + "routers_loss": 0.0017791346181184053, + "skip_count": 0.0, + "step": 9802, + "text_loss": 0.45421653985977173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 1.7083848458004035e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 15810743.0, + "repeat_count": 0.0, + "routers_loss": 0.0008831496234051883, + "skip_count": 0.0, + "step": 9804, + "text_loss": 0.5535439848899841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 1.7003724267170394e-05, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 15813880.0, + "repeat_count": 0.0, + "routers_loss": 0.002800740534439683, + "skip_count": 0.0, + "step": 9806, + "text_loss": 0.5228974223136902 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 46.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 1.6923785161420845e-05, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 15816808.0, + "repeat_count": 0.0, + "routers_loss": 0.006823428440839052, + "skip_count": 3.0, + "step": 9808, + "text_loss": 0.48018959164619446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 1.6844031171388052e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 15819803.0, + "repeat_count": 0.0, + "routers_loss": 0.004808149300515652, + "skip_count": 0.0, + "step": 9810, + "text_loss": 0.31094294786453247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 1.6764462327633955e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 15822861.0, + "repeat_count": 0.0, + "routers_loss": 0.0026099751703441143, + "skip_count": 0.0, + "step": 9812, + "text_loss": 0.5534207224845886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0478515625, + "learning_rate": 1.668507866064939e-05, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 15825960.0, + "repeat_count": 1.0, + "routers_loss": 0.008356450125575066, + "skip_count": 2.0, + "step": 9814, + "text_loss": 0.40162262320518494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0198974609375, + "learning_rate": 1.660588020085452e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 15828906.0, + "repeat_count": 0.0, + "routers_loss": 0.006548966746777296, + "skip_count": 2.0, + "step": 9816, + "text_loss": 0.2071811705827713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 1.652686697859823e-05, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 15831935.0, + "repeat_count": 0.0, + "routers_loss": 0.0007895465241745114, + "skip_count": 0.0, + "step": 9818, + "text_loss": 0.6879562735557556 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 1.6448039024158534e-05, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 15835745.0, + "repeat_count": 1.0, + "routers_loss": 0.00370208453387022, + "skip_count": 2.0, + "step": 9820, + "text_loss": 0.6139163970947266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 1.6369396367742483e-05, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 15838373.0, + "repeat_count": 0.0, + "routers_loss": 0.002627170644700527, + "skip_count": 0.0, + "step": 9822, + "text_loss": 0.3881947100162506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.018798828125, + "learning_rate": 1.6290939039486084e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 15841156.0, + "repeat_count": 0.0, + "routers_loss": 0.005191941745579243, + "skip_count": 2.0, + "step": 9824, + "text_loss": 0.6564247608184814 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 46.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 1.621266706945429e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 15843877.0, + "repeat_count": 1.0, + "routers_loss": 0.003889352548867464, + "skip_count": 0.0, + "step": 9826, + "text_loss": 0.7128682136535645 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 46.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 1.6134580487641047e-05, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 15846880.0, + "repeat_count": 0.0, + "routers_loss": 0.00674893194809556, + "skip_count": 4.0, + "step": 9828, + "text_loss": 0.30893367528915405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 1.6056679323969425e-05, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 15850130.0, + "repeat_count": 0.0, + "routers_loss": 0.0009898045100271702, + "skip_count": 0.0, + "step": 9830, + "text_loss": 0.6550688743591309 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 46.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 1.5978963608291154e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 15853578.0, + "repeat_count": 1.0, + "routers_loss": 0.0046016750857234, + "skip_count": 0.0, + "step": 9832, + "text_loss": 0.43872204422950745 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02880859375, + "learning_rate": 1.5901433370387132e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 15857939.0, + "repeat_count": 0.0, + "routers_loss": 0.004589201882481575, + "skip_count": 1.0, + "step": 9834, + "text_loss": 0.41940808296203613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 1.5824088639967094e-05, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 15860584.0, + "repeat_count": 0.0, + "routers_loss": 0.0018899316200986505, + "skip_count": 1.0, + "step": 9836, + "text_loss": 0.5105440616607666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 1.5746929446669556e-05, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 15864386.0, + "repeat_count": 0.0, + "routers_loss": 0.0006366848247125745, + "skip_count": 0.0, + "step": 9838, + "text_loss": 0.5686481595039368 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.017333984375, + "learning_rate": 1.5669955820062254e-05, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 15869103.0, + "repeat_count": 0.0, + "routers_loss": 0.0043256948702037334, + "skip_count": 1.0, + "step": 9840, + "text_loss": 0.16309607028961182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 1.5593167789641483e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 15872384.0, + "repeat_count": 0.0, + "routers_loss": 0.00406000716611743, + "skip_count": 1.0, + "step": 9842, + "text_loss": 0.21662485599517822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 46.21602582917523, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.029541015625, + "learning_rate": 1.551656538483259e-05, + "loss": 0.0076, + "macro_f1": 0.5492662787437439, + "num_tokens": 15875261.0, + "repeat_count": 0.0, + "routers_loss": 0.020087692886590958, + "skip_count": 2.0, + "step": 9844, + "text_loss": 0.6189377903938293 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 1.5440148634989826e-05, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 15878132.0, + "repeat_count": 0.0, + "routers_loss": 0.0005302145145833492, + "skip_count": 0.0, + "step": 9846, + "text_loss": 0.34496018290519714 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 1.536391756939609e-05, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 15881381.0, + "repeat_count": 0.0, + "routers_loss": 0.008405420929193497, + "skip_count": 2.0, + "step": 9848, + "text_loss": 0.2865080237388611 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 1.528787221726341e-05, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 15884621.0, + "repeat_count": 0.0, + "routers_loss": 0.0016017532907426357, + "skip_count": 0.0, + "step": 9850, + "text_loss": 0.6104921102523804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 1.5212012607732528e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 15888157.0, + "repeat_count": 0.0, + "routers_loss": 0.0015318389050662518, + "skip_count": 0.0, + "step": 9852, + "text_loss": 0.2622036933898926 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 1.5136338769872915e-05, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 15891080.0, + "repeat_count": 2.0, + "routers_loss": 0.006494096480309963, + "skip_count": 4.0, + "step": 9854, + "text_loss": 0.23415961861610413 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 1.5060850732682928e-05, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 15895486.0, + "repeat_count": 2.0, + "routers_loss": 0.007511078380048275, + "skip_count": 3.0, + "step": 9856, + "text_loss": 0.7389219999313354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 1.4985548525089709e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 15898747.0, + "repeat_count": 0.0, + "routers_loss": 0.004874013364315033, + "skip_count": 2.0, + "step": 9858, + "text_loss": 0.6853085160255432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 1.4910432175949285e-05, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 15902157.0, + "repeat_count": 0.0, + "routers_loss": 0.0009244410903193057, + "skip_count": 0.0, + "step": 9860, + "text_loss": 0.8172202110290527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 46.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 1.4835501714046296e-05, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 15905012.0, + "repeat_count": 0.0, + "routers_loss": 0.00456853536888957, + "skip_count": 3.0, + "step": 9862, + "text_loss": 0.7527797818183899 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 1.4760757168094275e-05, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 15908302.0, + "repeat_count": 0.0, + "routers_loss": 0.0009686833946034312, + "skip_count": 0.0, + "step": 9864, + "text_loss": 0.5548131465911865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 1.4686198566735531e-05, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 15911923.0, + "repeat_count": 0.0, + "routers_loss": 0.0008255072170868516, + "skip_count": 0.0, + "step": 9866, + "text_loss": 0.5995872020721436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 1.4611825938540935e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 15914858.0, + "repeat_count": 0.0, + "routers_loss": 0.002459712326526642, + "skip_count": 0.0, + "step": 9868, + "text_loss": 0.6777655482292175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017578125, + "learning_rate": 1.4537639312010298e-05, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 15918091.0, + "repeat_count": 0.0, + "routers_loss": 0.0014664786867797375, + "skip_count": 0.0, + "step": 9870, + "text_loss": 0.42750120162963867 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 46.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 1.4463638715572103e-05, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 15920943.0, + "repeat_count": 1.0, + "routers_loss": 0.005549794062972069, + "skip_count": 1.0, + "step": 9872, + "text_loss": 0.27477580308914185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 1.4389824177583388e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 15924212.0, + "repeat_count": 0.0, + "routers_loss": 0.007967505604028702, + "skip_count": 2.0, + "step": 9874, + "text_loss": 0.3174900412559509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 1.4316195726330139e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 15929143.0, + "repeat_count": 0.0, + "routers_loss": 0.0014913028571754694, + "skip_count": 2.0, + "step": 9876, + "text_loss": 0.40919792652130127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 1.4242753390026953e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 15931702.0, + "repeat_count": 0.0, + "routers_loss": 0.0003994424478150904, + "skip_count": 0.0, + "step": 9878, + "text_loss": 0.35346853733062744 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 1.4169497196816983e-05, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 15935225.0, + "repeat_count": 1.0, + "routers_loss": 0.008424114435911179, + "skip_count": 3.0, + "step": 9880, + "text_loss": 0.230825275182724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 1.4096427174772164e-05, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 15938630.0, + "repeat_count": 0.0, + "routers_loss": 0.004314251709729433, + "skip_count": 1.0, + "step": 9882, + "text_loss": 0.8749642968177795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 1.4023543351893043e-05, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 15941779.0, + "repeat_count": 0.0, + "routers_loss": 0.0008999531855806708, + "skip_count": 0.0, + "step": 9884, + "text_loss": 0.6549318432807922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 1.3950845756108943e-05, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 15944779.0, + "repeat_count": 0.0, + "routers_loss": 0.0010829231468960643, + "skip_count": 0.0, + "step": 9886, + "text_loss": 0.5681273341178894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 1.3878334415277583e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 15947757.0, + "repeat_count": 0.0, + "routers_loss": 0.0038863453082740307, + "skip_count": 1.0, + "step": 9888, + "text_loss": 0.4282133877277374 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 46.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.017822265625, + "learning_rate": 1.3806009357185512e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 15952223.0, + "repeat_count": 1.0, + "routers_loss": 0.0006428947090171278, + "skip_count": 0.0, + "step": 9890, + "text_loss": 0.4455379247665405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 1.3733870609547838e-05, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 15955968.0, + "repeat_count": 0.0, + "routers_loss": 0.00048406270798295736, + "skip_count": 0.0, + "step": 9892, + "text_loss": 0.37554407119750977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 1.3661918200008228e-05, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 15959376.0, + "repeat_count": 0.0, + "routers_loss": 0.004503594245761633, + "skip_count": 1.0, + "step": 9894, + "text_loss": 0.22027169167995453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 1.3590152156139012e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 15962882.0, + "repeat_count": 0.0, + "routers_loss": 0.0011738749453797936, + "skip_count": 0.0, + "step": 9896, + "text_loss": 0.4203954041004181 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 1.3518572505440973e-05, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 15965816.0, + "repeat_count": 1.0, + "routers_loss": 0.00806320272386074, + "skip_count": 2.0, + "step": 9898, + "text_loss": 0.18884631991386414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 1.3447179275343779e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 15968840.0, + "repeat_count": 0.0, + "routers_loss": 0.004962162580341101, + "skip_count": 1.0, + "step": 9900, + "text_loss": 0.22457796335220337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 1.3375972493205268e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 15972768.0, + "repeat_count": 0.0, + "routers_loss": 0.0025535912718623877, + "skip_count": 0.0, + "step": 9902, + "text_loss": 0.14859545230865479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 1.3304952186312114e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 15975380.0, + "repeat_count": 0.0, + "routers_loss": 0.002036662772297859, + "skip_count": 0.0, + "step": 9904, + "text_loss": 0.5820382833480835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0191650390625, + "learning_rate": 1.3234118381879378e-05, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 15978335.0, + "repeat_count": 0.0, + "routers_loss": 0.0055219330824911594, + "skip_count": 2.0, + "step": 9906, + "text_loss": 0.29671815037727356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 1.316347110705074e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 15982003.0, + "repeat_count": 0.0, + "routers_loss": 0.005196230486035347, + "skip_count": 0.0, + "step": 9908, + "text_loss": 0.5204919576644897 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.023193359375, + "learning_rate": 1.3093010388898319e-05, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 15984937.0, + "repeat_count": 1.0, + "routers_loss": 0.0032779101748019457, + "skip_count": 2.0, + "step": 9910, + "text_loss": 0.6803483366966248 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 1.3022736254422851e-05, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 15988992.0, + "repeat_count": 0.0, + "routers_loss": 0.002347869798541069, + "skip_count": 0.0, + "step": 9912, + "text_loss": 0.5335546731948853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 1.2952648730553462e-05, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 15992828.0, + "repeat_count": 0.0, + "routers_loss": 0.0011128517799079418, + "skip_count": 0.0, + "step": 9914, + "text_loss": 0.686739981174469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 1.288274784414789e-05, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 15995984.0, + "repeat_count": 0.0, + "routers_loss": 0.0031158174388110638, + "skip_count": 0.0, + "step": 9916, + "text_loss": 0.16102474927902222 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.059814453125, + "learning_rate": 1.2813033621992264e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 15999606.0, + "repeat_count": 0.0, + "routers_loss": 0.0029228583443909883, + "skip_count": 1.0, + "step": 9918, + "text_loss": 0.6022558212280273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 1.274350609080116e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 16002456.0, + "repeat_count": 0.0, + "routers_loss": 0.0031404250767081976, + "skip_count": 2.0, + "step": 9920, + "text_loss": 0.7529577016830444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 1.2674165277217653e-05, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 16005547.0, + "repeat_count": 0.0, + "routers_loss": 0.0038669302593916655, + "skip_count": 0.0, + "step": 9922, + "text_loss": 0.47488540410995483 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 1.2605011207813378e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 16009520.0, + "repeat_count": 0.0, + "routers_loss": 0.004838052671402693, + "skip_count": 0.0, + "step": 9924, + "text_loss": 0.5252779722213745 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 46.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 1.2536043909088191e-05, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 16012730.0, + "repeat_count": 1.0, + "routers_loss": 0.0017430823063477874, + "skip_count": 0.0, + "step": 9926, + "text_loss": 0.40845534205436707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0167236328125, + "learning_rate": 1.2467263407470619e-05, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 16015940.0, + "repeat_count": 0.0, + "routers_loss": 0.0010244545992463827, + "skip_count": 0.0, + "step": 9928, + "text_loss": 0.8465730547904968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 1.2398669729317357e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 16018851.0, + "repeat_count": 0.0, + "routers_loss": 0.0007380630704574287, + "skip_count": 0.0, + "step": 9930, + "text_loss": 0.37603214383125305 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.629292632814796, + "f1_execute": 0.9729729890823364, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 1.2330262900913657e-05, + "loss": 0.0087, + "macro_f1": 0.9539539813995361, + "num_tokens": 16022351.0, + "repeat_count": 5.0, + "routers_loss": 0.053848277777433395, + "skip_count": 5.0, + "step": 9932, + "text_loss": 0.2047014981508255 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 46.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 1.2262042948473163e-05, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 16024902.0, + "repeat_count": 1.0, + "routers_loss": 0.0020845322869718075, + "skip_count": 0.0, + "step": 9934, + "text_loss": 0.6269918084144592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 1.2194009898137903e-05, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 16028056.0, + "repeat_count": 0.0, + "routers_loss": 0.0008686805376783013, + "skip_count": 0.0, + "step": 9936, + "text_loss": 0.4100899398326874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 46.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 1.212616377597825e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 16032111.0, + "repeat_count": 0.0, + "routers_loss": 0.004883588291704655, + "skip_count": 3.0, + "step": 9938, + "text_loss": 0.3921346664428711 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 1.2058504607993015e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 16035872.0, + "repeat_count": 0.0, + "routers_loss": 0.0005067490856163204, + "skip_count": 0.0, + "step": 9940, + "text_loss": 0.44368258118629456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06787109375, + "learning_rate": 1.1991032420109238e-05, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 16038923.0, + "repeat_count": 0.0, + "routers_loss": 0.005819452460855246, + "skip_count": 2.0, + "step": 9942, + "text_loss": 0.27500197291374207 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.685647196947464, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 1.1923747238182403e-05, + "loss": 0.0059, + "macro_f1": 0.8817967176437378, + "num_tokens": 16041803.0, + "repeat_count": 2.0, + "routers_loss": 0.035794492810964584, + "skip_count": 3.0, + "step": 9944, + "text_loss": 0.5083543062210083 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 1.1856649087996384e-05, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 16045258.0, + "repeat_count": 1.0, + "routers_loss": 0.002845201175659895, + "skip_count": 2.0, + "step": 9946, + "text_loss": 0.6859534382820129 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 1.1789737995263228e-05, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 16048618.0, + "repeat_count": 0.0, + "routers_loss": 0.0007575460476800799, + "skip_count": 0.0, + "step": 9948, + "text_loss": 0.4512535333633423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 1.1723013985623477e-05, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 16051595.0, + "repeat_count": 0.0, + "routers_loss": 0.002697878750041127, + "skip_count": 1.0, + "step": 9950, + "text_loss": 0.3572070300579071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 1.16564770846459e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 16054494.0, + "repeat_count": 0.0, + "routers_loss": 0.0062429774552583694, + "skip_count": 1.0, + "step": 9952, + "text_loss": 0.5479834079742432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 1.1590127317827492e-05, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 16057555.0, + "repeat_count": 0.0, + "routers_loss": 0.0009302232647314668, + "skip_count": 0.0, + "step": 9954, + "text_loss": 0.44800761342048645 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 1.1523964710593637e-05, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 16061072.0, + "repeat_count": 0.0, + "routers_loss": 0.002112898975610733, + "skip_count": 0.0, + "step": 9956, + "text_loss": 0.3274081349372864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 1.1457989288297942e-05, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 16064165.0, + "repeat_count": 0.0, + "routers_loss": 0.00028447998920455575, + "skip_count": 0.0, + "step": 9958, + "text_loss": 0.5712385773658752 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 1.1392201076222352e-05, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 16067293.0, + "repeat_count": 1.0, + "routers_loss": 0.009599249809980392, + "skip_count": 2.0, + "step": 9960, + "text_loss": 0.26818037033081055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 1.132660009957709e-05, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 16069852.0, + "repeat_count": 0.0, + "routers_loss": 0.005338563583791256, + "skip_count": 0.0, + "step": 9962, + "text_loss": 0.6658869981765747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 1.1261186383500487e-05, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 16072633.0, + "repeat_count": 0.0, + "routers_loss": 0.001175224082544446, + "skip_count": 1.0, + "step": 9964, + "text_loss": 0.4461731016635895 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 46.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 1.1195959953059221e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 16076065.0, + "repeat_count": 1.0, + "routers_loss": 0.0036650802940130234, + "skip_count": 0.0, + "step": 9966, + "text_loss": 0.6107141971588135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 1.113092083324818e-05, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 16079309.0, + "repeat_count": 0.0, + "routers_loss": 0.005924097262322903, + "skip_count": 2.0, + "step": 9968, + "text_loss": 0.5104627013206482 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 46.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 1.1066069048990545e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 16082180.0, + "repeat_count": 3.0, + "routers_loss": 0.010777595452964306, + "skip_count": 0.0, + "step": 9970, + "text_loss": 0.5205907225608826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 1.100140462513749e-05, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 16084654.0, + "repeat_count": 0.0, + "routers_loss": 0.0019593914039433002, + "skip_count": 0.0, + "step": 9972, + "text_loss": 0.36411789059638977 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 46.82653360727913, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0255126953125, + "learning_rate": 1.0936927586468693e-05, + "loss": 0.0048, + "macro_f1": 0.9452888369560242, + "num_tokens": 16087736.0, + "repeat_count": 1.0, + "routers_loss": 0.0233579371124506, + "skip_count": 4.0, + "step": 9974, + "text_loss": 0.267604261636734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 1.0872637957691833e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 16090838.0, + "repeat_count": 0.0, + "routers_loss": 0.00034629934816621244, + "skip_count": 0.0, + "step": 9976, + "text_loss": 0.576068103313446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 1.0808535763442761e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 16094084.0, + "repeat_count": 0.0, + "routers_loss": 0.0004253332444932312, + "skip_count": 0.0, + "step": 9978, + "text_loss": 0.5883988738059998 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 1.0744621028285662e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 16097432.0, + "repeat_count": 0.0, + "routers_loss": 0.0005800648941658437, + "skip_count": 0.0, + "step": 9980, + "text_loss": 0.3358926475048065 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 46.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 1.068089377671272e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 16100711.0, + "repeat_count": 1.0, + "routers_loss": 0.0015245937975123525, + "skip_count": 0.0, + "step": 9982, + "text_loss": 0.6802405714988708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 1.061735403314429e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 16103952.0, + "repeat_count": 0.0, + "routers_loss": 0.002281307242810726, + "skip_count": 1.0, + "step": 9984, + "text_loss": 0.3086298406124115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 1.055400182192906e-05, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 16107101.0, + "repeat_count": 0.0, + "routers_loss": 0.0007910717977210879, + "skip_count": 0.0, + "step": 9986, + "text_loss": 0.7036139965057373 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 46.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 1.0490837167343559e-05, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 16110316.0, + "repeat_count": 1.0, + "routers_loss": 0.0030006880406290293, + "skip_count": 1.0, + "step": 9988, + "text_loss": 0.4638058841228485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 1.04278600935927e-05, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 16113206.0, + "repeat_count": 0.0, + "routers_loss": 0.0006434856331907213, + "skip_count": 0.0, + "step": 9990, + "text_loss": 0.6155068874359131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 1.0365070624809403e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 16116098.0, + "repeat_count": 0.0, + "routers_loss": 0.0007891099085099995, + "skip_count": 0.0, + "step": 9992, + "text_loss": 0.4537872076034546 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 46.92045788083358, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 1.0302468785054641e-05, + "loss": 0.0054, + "macro_f1": 0.8823530077934265, + "num_tokens": 16119344.0, + "repeat_count": 2.0, + "routers_loss": 0.011918486095964909, + "skip_count": 1.0, + "step": 9994, + "text_loss": 0.18828579783439636 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 1.0240054598317672e-05, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 16122615.0, + "repeat_count": 1.0, + "routers_loss": 0.016306765377521515, + "skip_count": 2.0, + "step": 9996, + "text_loss": 0.2876183092594147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 46.93924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 1.0177828088515694e-05, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 16125506.0, + "repeat_count": 0.0, + "routers_loss": 0.00393108231946826, + "skip_count": 1.0, + "step": 9998, + "text_loss": 0.6387818455696106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 1.011578927949397e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 16128499.0, + "repeat_count": 0.0, + "routers_loss": 0.001175055862404406, + "skip_count": 0.0, + "step": 10000, + "text_loss": 0.4085952639579773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01953125, + "learning_rate": 1.0053938195025925e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 16130888.0, + "repeat_count": 0.0, + "routers_loss": 0.0029882853850722313, + "skip_count": 0.0, + "step": 10002, + "text_loss": 0.36795294284820557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 9.992274858812988e-06, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 16133875.0, + "repeat_count": 0.0, + "routers_loss": 0.0064101857133209705, + "skip_count": 2.0, + "step": 10004, + "text_loss": 0.30780166387557983 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 46.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 9.930799294484704e-06, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 16136826.0, + "repeat_count": 2.0, + "routers_loss": 0.004496502690017223, + "skip_count": 0.0, + "step": 10006, + "text_loss": 0.321386456489563 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 46.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 9.869511525598617e-06, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 16140429.0, + "repeat_count": 0.0, + "routers_loss": 0.007862923666834831, + "skip_count": 2.0, + "step": 10008, + "text_loss": 0.3304281234741211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 46.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 9.80841157564033e-06, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 16143280.0, + "repeat_count": 0.0, + "routers_loss": 0.0007891185232438147, + "skip_count": 0.0, + "step": 10010, + "text_loss": 0.6880549788475037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 9.747499468023391e-06, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 16146124.0, + "repeat_count": 0.0, + "routers_loss": 0.00044356059515848756, + "skip_count": 0.0, + "step": 10012, + "text_loss": 0.7140262126922607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 9.686775226089462e-06, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 16148732.0, + "repeat_count": 0.0, + "routers_loss": 0.003097282024100423, + "skip_count": 0.0, + "step": 10014, + "text_loss": 0.5629494190216064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 47.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 9.626238873108262e-06, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 16151364.0, + "repeat_count": 0.0, + "routers_loss": 0.006588284857571125, + "skip_count": 1.0, + "step": 10016, + "text_loss": 0.20520731806755066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 9.565890432277346e-06, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 16154526.0, + "repeat_count": 0.0, + "routers_loss": 0.000600519881118089, + "skip_count": 0.0, + "step": 10018, + "text_loss": 0.428753525018692 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 47.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0185546875, + "learning_rate": 9.50572992672233e-06, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 16158182.0, + "repeat_count": 1.0, + "routers_loss": 0.003753028344362974, + "skip_count": 0.0, + "step": 10020, + "text_loss": 0.4269808232784271 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.05165835045494, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 9.445757379496933e-06, + "loss": 0.0055, + "macro_f1": 0.8817967176437378, + "num_tokens": 16161691.0, + "repeat_count": 2.0, + "routers_loss": 0.02429025247693062, + "skip_count": 3.0, + "step": 10022, + "text_loss": 0.26357248425483704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 47.061050777810394, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.04296875, + "learning_rate": 9.385972813582721e-06, + "loss": 0.0056, + "macro_f1": 0.6122449040412903, + "num_tokens": 16164862.0, + "repeat_count": 0.0, + "routers_loss": 0.021486395969986916, + "skip_count": 4.0, + "step": 10024, + "text_loss": 0.4035261273384094 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 47.07044320516584, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 9.326376251889201e-06, + "loss": 0.0059, + "macro_f1": 0.6601307392120361, + "num_tokens": 16169410.0, + "repeat_count": 2.0, + "routers_loss": 0.017894137650728226, + "skip_count": 1.0, + "step": 10026, + "text_loss": 0.5168870091438293 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 9.266967717253938e-06, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 16172430.0, + "repeat_count": 0.0, + "routers_loss": 0.0033336186315864325, + "skip_count": 0.0, + "step": 10028, + "text_loss": 0.5204904079437256 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 9.207747232442331e-06, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 16175797.0, + "repeat_count": 0.0, + "routers_loss": 0.0022511237766593695, + "skip_count": 0.0, + "step": 10030, + "text_loss": 0.19971035420894623 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 47.09862048723217, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 9.148714820147841e-06, + "loss": 0.0082, + "macro_f1": 0.6603773832321167, + "num_tokens": 16178636.0, + "repeat_count": 1.0, + "routers_loss": 0.03046531230211258, + "skip_count": 1.0, + "step": 10032, + "text_loss": 0.7068908214569092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 47.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 9.089870502991815e-06, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 16182658.0, + "repeat_count": 0.0, + "routers_loss": 0.0013325439067557454, + "skip_count": 1.0, + "step": 10034, + "text_loss": 0.5161240100860596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 9.031214303523493e-06, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 16186669.0, + "repeat_count": 0.0, + "routers_loss": 0.0041415193118155, + "skip_count": 0.0, + "step": 10036, + "text_loss": 0.17281492054462433 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 47.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 8.972746244219953e-06, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 16189676.0, + "repeat_count": 1.0, + "routers_loss": 0.00235518510453403, + "skip_count": 0.0, + "step": 10038, + "text_loss": 0.776432991027832 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 47.13619019665395, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 8.914466347486382e-06, + "loss": 0.0072, + "macro_f1": 0.9265305995941162, + "num_tokens": 16193068.0, + "repeat_count": 3.0, + "routers_loss": 0.020981203764677048, + "skip_count": 1.0, + "step": 10040, + "text_loss": 0.6855355501174927 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 47.14558262400939, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.046875, + "learning_rate": 8.856374635655695e-06, + "loss": 0.006, + "macro_f1": 0.9555556178092957, + "num_tokens": 16195878.0, + "repeat_count": 1.0, + "routers_loss": 0.017154231667518616, + "skip_count": 5.0, + "step": 10042, + "text_loss": 0.7087341547012329 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 47.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.018310546875, + "learning_rate": 8.798471130988695e-06, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 16198502.0, + "repeat_count": 1.0, + "routers_loss": 0.0036271605640649796, + "skip_count": 1.0, + "step": 10044, + "text_loss": 0.6096780300140381 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 47.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 8.740755855674243e-06, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 16201043.0, + "repeat_count": 0.0, + "routers_loss": 0.00554735166952014, + "skip_count": 3.0, + "step": 10046, + "text_loss": 0.4441182613372803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 8.683228831828816e-06, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 16204332.0, + "repeat_count": 0.0, + "routers_loss": 0.0031374485697597265, + "skip_count": 2.0, + "step": 10048, + "text_loss": 0.7983347773551941 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 8.625890081497001e-06, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 16207800.0, + "repeat_count": 0.0, + "routers_loss": 0.00201304629445076, + "skip_count": 0.0, + "step": 10050, + "text_loss": 0.34401828050613403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 8.568739626651002e-06, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 16210826.0, + "repeat_count": 0.0, + "routers_loss": 0.0021288148127496243, + "skip_count": 0.0, + "step": 10052, + "text_loss": 0.27440160512924194 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 47.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 8.51177748919102e-06, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 16213643.0, + "repeat_count": 1.0, + "routers_loss": 0.002644419437274337, + "skip_count": 0.0, + "step": 10054, + "text_loss": 0.33396100997924805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 47.211329615497505, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.04296875, + "learning_rate": 8.45500369094504e-06, + "loss": 0.0052, + "macro_f1": 0.9452888369560242, + "num_tokens": 16216646.0, + "repeat_count": 1.0, + "routers_loss": 0.048469074070453644, + "skip_count": 4.0, + "step": 10056, + "text_loss": 0.3018307089805603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.016845703125, + "learning_rate": 8.398418253668937e-06, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 16219499.0, + "repeat_count": 0.0, + "routers_loss": 0.0013763440074399114, + "skip_count": 0.0, + "step": 10058, + "text_loss": 0.39421531558036804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 47.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0576171875, + "learning_rate": 8.342021199046312e-06, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 16223062.0, + "repeat_count": 0.0, + "routers_loss": 0.004151828121393919, + "skip_count": 1.0, + "step": 10060, + "text_loss": 0.16675396263599396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 47.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0191650390625, + "learning_rate": 8.285812548688654e-06, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 16226201.0, + "repeat_count": 0.0, + "routers_loss": 0.003218848491087556, + "skip_count": 1.0, + "step": 10062, + "text_loss": 0.6134784817695618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 8.229792324135177e-06, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 16229230.0, + "repeat_count": 0.0, + "routers_loss": 0.0058194492012262344, + "skip_count": 2.0, + "step": 10064, + "text_loss": 0.19825725257396698 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 47.25829175227473, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 8.173960546852987e-06, + "loss": 0.0073, + "macro_f1": 0.8814815282821655, + "num_tokens": 16232222.0, + "repeat_count": 2.0, + "routers_loss": 0.03991774469614029, + "skip_count": 4.0, + "step": 10066, + "text_loss": 0.2851788103580475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 8.11831723823686e-06, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 16236960.0, + "repeat_count": 0.0, + "routers_loss": 0.0011416112538427114, + "skip_count": 0.0, + "step": 10068, + "text_loss": 0.32021182775497437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.27707660698562, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 8.062862419609519e-06, + "loss": 0.0073, + "macro_f1": 0.3272727429866791, + "num_tokens": 16240419.0, + "repeat_count": 1.0, + "routers_loss": 0.015871701762080193, + "skip_count": 0.0, + "step": 10070, + "text_loss": 0.21992693841457367 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 47.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 8.007596112221293e-06, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 16243273.0, + "repeat_count": 1.0, + "routers_loss": 0.004018099047243595, + "skip_count": 1.0, + "step": 10072, + "text_loss": 0.4440346658229828 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 7.952518337250303e-06, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 16247268.0, + "repeat_count": 0.0, + "routers_loss": 0.004422081634402275, + "skip_count": 0.0, + "step": 10074, + "text_loss": 0.3484672009944916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 7.897629115802551e-06, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 16250590.0, + "repeat_count": 0.0, + "routers_loss": 0.003315444104373455, + "skip_count": 0.0, + "step": 10076, + "text_loss": 0.32249578833580017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 7.842928468911603e-06, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 16253605.0, + "repeat_count": 0.0, + "routers_loss": 0.002227222314104438, + "skip_count": 0.0, + "step": 10078, + "text_loss": 0.4467211961746216 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 7.788416417538857e-06, + "loss": 0.0117, + "macro_f1": 1.0, + "num_tokens": 16256521.0, + "repeat_count": 1.0, + "routers_loss": 0.010048549622297287, + "skip_count": 3.0, + "step": 10080, + "text_loss": 0.29726436734199524 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 47.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 7.734092982573493e-06, + "loss": 0.0035, + "macro_f1": 1.0, + "num_tokens": 16259721.0, + "repeat_count": 1.0, + "routers_loss": 0.0012925490736961365, + "skip_count": 1.0, + "step": 10082, + "text_loss": 0.45976048707962036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 7.679958184832302e-06, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 16262741.0, + "repeat_count": 0.0, + "routers_loss": 0.006292753387242556, + "skip_count": 0.0, + "step": 10084, + "text_loss": 0.32511985301971436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047119140625, + "learning_rate": 7.626012045059916e-06, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 16266080.0, + "repeat_count": 0.0, + "routers_loss": 0.005420933943241835, + "skip_count": 2.0, + "step": 10086, + "text_loss": 0.20795102417469025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 7.572254583928406e-06, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 16269733.0, + "repeat_count": 0.0, + "routers_loss": 0.003182400716468692, + "skip_count": 2.0, + "step": 10088, + "text_loss": 0.13773657381534576 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 47.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 7.5186858220379625e-06, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 16273812.0, + "repeat_count": 1.0, + "routers_loss": 0.008067524060606956, + "skip_count": 2.0, + "step": 10090, + "text_loss": 0.26591432094573975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 7.4653057799161096e-06, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 16276550.0, + "repeat_count": 0.0, + "routers_loss": 0.0017690346576273441, + "skip_count": 0.0, + "step": 10092, + "text_loss": 0.6460638642311096 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 7.412114478018261e-06, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 16280012.0, + "repeat_count": 0.0, + "routers_loss": 0.0009686960838735104, + "skip_count": 0.0, + "step": 10094, + "text_loss": 0.5548131465911865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 7.359111936727281e-06, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 16282986.0, + "repeat_count": 0.0, + "routers_loss": 0.003071374725550413, + "skip_count": 2.0, + "step": 10096, + "text_loss": 0.09838774055242538 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 7.306298176354032e-06, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 16285647.0, + "repeat_count": 0.0, + "routers_loss": 0.0028131429571658373, + "skip_count": 0.0, + "step": 10098, + "text_loss": 0.15995968878269196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 7.253673217136658e-06, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 16289663.0, + "repeat_count": 0.0, + "routers_loss": 0.003445233218371868, + "skip_count": 0.0, + "step": 10100, + "text_loss": 0.2618424892425537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 7.201237079241252e-06, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 16293270.0, + "repeat_count": 0.0, + "routers_loss": 0.006494173314422369, + "skip_count": 0.0, + "step": 10102, + "text_loss": 0.26529571413993835 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 47.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 7.1489897827614615e-06, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 16296633.0, + "repeat_count": 1.0, + "routers_loss": 0.0019948924891650677, + "skip_count": 0.0, + "step": 10104, + "text_loss": 0.774922251701355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 47.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 7.096931347718494e-06, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 16299679.0, + "repeat_count": 0.0, + "routers_loss": 0.0020289800595492125, + "skip_count": 1.0, + "step": 10106, + "text_loss": 0.715824544429779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 7.04506179406128e-06, + "loss": 0.0029, + "macro_f1": 0.3333333432674408, + "num_tokens": 16303207.0, + "repeat_count": 0.0, + "routers_loss": 0.0012691093143075705, + "skip_count": 0.0, + "step": 10108, + "text_loss": 0.4474022090435028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 6.993381141666255e-06, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 16306842.0, + "repeat_count": 0.0, + "routers_loss": 0.004444579128175974, + "skip_count": 0.0, + "step": 10110, + "text_loss": 0.3689751625061035 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 6.9418894103376315e-06, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 16310919.0, + "repeat_count": 0.0, + "routers_loss": 0.0007234106888063252, + "skip_count": 0.0, + "step": 10112, + "text_loss": 0.7767618298530579 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 47.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 6.890586619807126e-06, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 16313780.0, + "repeat_count": 0.0, + "routers_loss": 0.0017169835045933723, + "skip_count": 1.0, + "step": 10114, + "text_loss": 0.4885733127593994 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 47.49310243616085, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 6.839472789733958e-06, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 16317529.0, + "repeat_count": 4.0, + "routers_loss": 0.007271626964211464, + "skip_count": 5.0, + "step": 10116, + "text_loss": 0.6611388921737671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 6.788547939705181e-06, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 16320105.0, + "repeat_count": 0.0, + "routers_loss": 0.0022054670844227076, + "skip_count": 0.0, + "step": 10118, + "text_loss": 0.18132901191711426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.016845703125, + "learning_rate": 6.737812089235185e-06, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 16323080.0, + "repeat_count": 0.0, + "routers_loss": 0.0006391640636138618, + "skip_count": 0.0, + "step": 10120, + "text_loss": 0.32267218828201294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 6.68726525776614e-06, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 16326006.0, + "repeat_count": 0.0, + "routers_loss": 0.00046651664888486266, + "skip_count": 0.0, + "step": 10122, + "text_loss": 0.4213443100452423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 6.6369074646676635e-06, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 16329108.0, + "repeat_count": 0.0, + "routers_loss": 0.0025715050287544727, + "skip_count": 2.0, + "step": 10124, + "text_loss": 0.48734065890312195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 6.58673872923693e-06, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 16332106.0, + "repeat_count": 0.0, + "routers_loss": 0.001850960310548544, + "skip_count": 0.0, + "step": 10126, + "text_loss": 1.0562689304351807 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 47.549457000293515, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 6.536759070698672e-06, + "loss": 0.0045, + "macro_f1": 0.8814815282821655, + "num_tokens": 16334960.0, + "repeat_count": 2.0, + "routers_loss": 0.014950240030884743, + "skip_count": 4.0, + "step": 10128, + "text_loss": 0.8084779381752014 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 6.486968508205237e-06, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 16338086.0, + "repeat_count": 0.0, + "routers_loss": 0.0018889640923589468, + "skip_count": 0.0, + "step": 10130, + "text_loss": 0.5870251059532166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 6.437367060836419e-06, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 16341036.0, + "repeat_count": 0.0, + "routers_loss": 0.001758610364049673, + "skip_count": 0.0, + "step": 10132, + "text_loss": 0.46824970841407776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 47.577634282359845, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.026611328125, + "learning_rate": 6.387954747599622e-06, + "loss": 0.0067, + "macro_f1": 0.5934640765190125, + "num_tokens": 16344236.0, + "repeat_count": 0.0, + "routers_loss": 0.013333287090063095, + "skip_count": 3.0, + "step": 10134, + "text_loss": 0.28457126021385193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 6.338731587429758e-06, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 16348312.0, + "repeat_count": 0.0, + "routers_loss": 0.003430357202887535, + "skip_count": 0.0, + "step": 10136, + "text_loss": 0.2896702289581299 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 6.289697599189181e-06, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 16351044.0, + "repeat_count": 0.0, + "routers_loss": 0.001170355360955, + "skip_count": 0.0, + "step": 10138, + "text_loss": 0.6347740292549133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 6.240852801667752e-06, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 16354443.0, + "repeat_count": 0.0, + "routers_loss": 0.0033906467724591494, + "skip_count": 2.0, + "step": 10140, + "text_loss": 0.5276535749435425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0184326171875, + "learning_rate": 6.192197213583051e-06, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 16357435.0, + "repeat_count": 0.0, + "routers_loss": 0.001492051873356104, + "skip_count": 2.0, + "step": 10142, + "text_loss": 0.49688321352005005 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 47.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0191650390625, + "learning_rate": 6.143730853579887e-06, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 16360993.0, + "repeat_count": 1.0, + "routers_loss": 0.0024281898513436317, + "skip_count": 1.0, + "step": 10144, + "text_loss": 0.49487611651420593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 47.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05615234375, + "learning_rate": 6.095453740230683e-06, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 16364196.0, + "repeat_count": 0.0, + "routers_loss": 0.0006147443782538176, + "skip_count": 5.0, + "step": 10146, + "text_loss": 0.3056519329547882 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 47.64338127384796, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 6.047365892035361e-06, + "loss": 0.005, + "macro_f1": 0.5492662787437439, + "num_tokens": 16368152.0, + "repeat_count": 2.0, + "routers_loss": 0.015886440873146057, + "skip_count": 0.0, + "step": 10148, + "text_loss": 0.6246888637542725 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 5.999467327421182e-06, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 16371538.0, + "repeat_count": 0.0, + "routers_loss": 0.004134364426136017, + "skip_count": 0.0, + "step": 10150, + "text_loss": 0.38278883695602417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 5.951758064743018e-06, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 16374324.0, + "repeat_count": 0.0, + "routers_loss": 0.0050895679742097855, + "skip_count": 2.0, + "step": 10152, + "text_loss": 0.7034569978713989 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 5.9042381222831345e-06, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 16377386.0, + "repeat_count": 0.0, + "routers_loss": 0.0024351398460566998, + "skip_count": 0.0, + "step": 10154, + "text_loss": 0.5222152471542358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05029296875, + "learning_rate": 5.856907518251298e-06, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 16380449.0, + "repeat_count": 0.0, + "routers_loss": 0.010605348274111748, + "skip_count": 2.0, + "step": 10156, + "text_loss": 0.3262309730052948 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 47.69034341062518, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0289306640625, + "learning_rate": 5.8097662707846664e-06, + "loss": 0.0062, + "macro_f1": 0.9262410998344421, + "num_tokens": 16383519.0, + "repeat_count": 2.0, + "routers_loss": 0.022603167220950127, + "skip_count": 3.0, + "step": 10158, + "text_loss": 0.28901928663253784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 5.7628143979478465e-06, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 16386345.0, + "repeat_count": 0.0, + "routers_loss": 0.004141980782151222, + "skip_count": 0.0, + "step": 10160, + "text_loss": 0.2058449685573578 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 5.7160519177328344e-06, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 16389766.0, + "repeat_count": 1.0, + "routers_loss": 0.004226053133606911, + "skip_count": 3.0, + "step": 10162, + "text_loss": 0.5554977655410767 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 47.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 5.66947884805924e-06, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 16392323.0, + "repeat_count": 0.0, + "routers_loss": 0.0036407415755093098, + "skip_count": 1.0, + "step": 10164, + "text_loss": 0.43077412247657776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 5.623095206773788e-06, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 16395555.0, + "repeat_count": 0.0, + "routers_loss": 0.0020233208779245615, + "skip_count": 0.0, + "step": 10166, + "text_loss": 0.654839813709259 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 47.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 5.57690101165087e-06, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 16398949.0, + "repeat_count": 1.0, + "routers_loss": 0.006491049658507109, + "skip_count": 2.0, + "step": 10168, + "text_loss": 0.2042955756187439 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 47.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 5.530896280392217e-06, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 16402654.0, + "repeat_count": 0.0, + "routers_loss": 0.0032798724714666605, + "skip_count": 1.0, + "step": 10170, + "text_loss": 0.303030401468277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 5.485081030626838e-06, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 16405701.0, + "repeat_count": 0.0, + "routers_loss": 0.0010711143258959055, + "skip_count": 2.0, + "step": 10172, + "text_loss": 0.3775373101234436 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 47.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 5.4394552799112985e-06, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 16408999.0, + "repeat_count": 1.0, + "routers_loss": 0.0038391631096601486, + "skip_count": 1.0, + "step": 10174, + "text_loss": 0.20590868592262268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 5.394019045729448e-06, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 16412045.0, + "repeat_count": 0.0, + "routers_loss": 0.0016695939702913165, + "skip_count": 0.0, + "step": 10176, + "text_loss": 0.5118611454963684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 5.348772345492525e-06, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 16415107.0, + "repeat_count": 0.0, + "routers_loss": 0.0007850619731470942, + "skip_count": 0.0, + "step": 10178, + "text_loss": 0.6818836331367493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 5.30371519653916e-06, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 16418012.0, + "repeat_count": 0.0, + "routers_loss": 0.0039045598823577166, + "skip_count": 0.0, + "step": 10180, + "text_loss": 0.5973153710365295 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 5.258847616135376e-06, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 16420715.0, + "repeat_count": 0.0, + "routers_loss": 0.0035636175889521837, + "skip_count": 2.0, + "step": 10182, + "text_loss": 0.5864625573158264 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 5.214169621474419e-06, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 16423561.0, + "repeat_count": 1.0, + "routers_loss": 0.0038354399148374796, + "skip_count": 3.0, + "step": 10184, + "text_loss": 0.6486931443214417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 5.169681229677037e-06, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 16426737.0, + "repeat_count": 0.0, + "routers_loss": 0.0029660905711352825, + "skip_count": 0.0, + "step": 10186, + "text_loss": 0.32970958948135376 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017333984375, + "learning_rate": 5.125382457791316e-06, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 16429810.0, + "repeat_count": 0.0, + "routers_loss": 0.0022391141392290592, + "skip_count": 0.0, + "step": 10188, + "text_loss": 0.5421582460403442 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 47.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 5.081273322792512e-06, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 16433324.0, + "repeat_count": 1.0, + "routers_loss": 0.009630356915295124, + "skip_count": 2.0, + "step": 10190, + "text_loss": 0.29760071635246277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 5.037353841583436e-06, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 16436466.0, + "repeat_count": 0.0, + "routers_loss": 0.005291678477078676, + "skip_count": 0.0, + "step": 10192, + "text_loss": 0.31106626987457275 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 47.85940710302319, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 4.99362403099407e-06, + "loss": 0.0065, + "macro_f1": 0.8823530077934265, + "num_tokens": 16439152.0, + "repeat_count": 2.0, + "routers_loss": 0.01734933815896511, + "skip_count": 1.0, + "step": 10194, + "text_loss": 0.575576901435852 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 47.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 4.950083907781733e-06, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 16442387.0, + "repeat_count": 2.0, + "routers_loss": 0.011718297377228737, + "skip_count": 2.0, + "step": 10196, + "text_loss": 0.19005915522575378 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 4.906733488631187e-06, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 16445391.0, + "repeat_count": 0.0, + "routers_loss": 0.0015040510334074497, + "skip_count": 0.0, + "step": 10198, + "text_loss": 0.6865255236625671 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 47.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 4.863572790154258e-06, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 16449229.0, + "repeat_count": 1.0, + "routers_loss": 0.001746732392348349, + "skip_count": 0.0, + "step": 10200, + "text_loss": 0.4538392722606659 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 4.82060182889027e-06, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 16452590.0, + "repeat_count": 0.0, + "routers_loss": 0.0009971166728064418, + "skip_count": 0.0, + "step": 10202, + "text_loss": 0.7585988640785217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 4.777820621305828e-06, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 16455458.0, + "repeat_count": 0.0, + "routers_loss": 0.005328746512532234, + "skip_count": 2.0, + "step": 10204, + "text_loss": 0.558459460735321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.915761667155856, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 4.735229183794709e-06, + "loss": 0.0056, + "macro_f1": 0.6601307392120361, + "num_tokens": 16458749.0, + "repeat_count": 1.0, + "routers_loss": 0.04486622288823128, + "skip_count": 2.0, + "step": 10206, + "text_loss": 0.15466898679733276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 47.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 4.692827532678023e-06, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 16461257.0, + "repeat_count": 0.0, + "routers_loss": 0.005598196294158697, + "skip_count": 1.0, + "step": 10208, + "text_loss": 0.1840037852525711 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0172119140625, + "learning_rate": 4.650615684204163e-06, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 16465271.0, + "repeat_count": 0.0, + "routers_loss": 0.0015303631080314517, + "skip_count": 0.0, + "step": 10210, + "text_loss": 0.45189639925956726 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 47.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 4.608593654548854e-06, + "loss": 0.0074, + "macro_f1": 1.0, + "num_tokens": 16468804.0, + "repeat_count": 1.0, + "routers_loss": 0.015223458409309387, + "skip_count": 5.0, + "step": 10212, + "text_loss": 0.34667012095451355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 4.566761459814939e-06, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 16471953.0, + "repeat_count": 0.0, + "routers_loss": 0.004154558759182692, + "skip_count": 0.0, + "step": 10214, + "text_loss": 0.19757303595542908 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 47.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 4.52511911603265e-06, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 16475292.0, + "repeat_count": 0.0, + "routers_loss": 0.002932488452643156, + "skip_count": 1.0, + "step": 10216, + "text_loss": 0.4767858684062958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.972116231288524, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 4.483666639159389e-06, + "loss": 0.0052, + "macro_f1": 0.6601307392120361, + "num_tokens": 16478361.0, + "repeat_count": 1.0, + "routers_loss": 0.009086701087653637, + "skip_count": 2.0, + "step": 10218, + "text_loss": 0.3097109794616699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 47.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 4.442404045079784e-06, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 16481058.0, + "repeat_count": 0.0, + "routers_loss": 0.007684580981731415, + "skip_count": 2.0, + "step": 10220, + "text_loss": 0.4293085038661957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 47.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 4.401331349605797e-06, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 16484933.0, + "repeat_count": 0.0, + "routers_loss": 0.004087725654244423, + "skip_count": 0.0, + "step": 10222, + "text_loss": 0.2643229067325592 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 48.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 4.360448568476561e-06, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 16488096.0, + "repeat_count": 2.0, + "routers_loss": 0.003739884588867426, + "skip_count": 1.0, + "step": 10224, + "text_loss": 0.5812314748764038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 48.00939242735544, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0234375, + "learning_rate": 4.319755717358431e-06, + "loss": 0.0066, + "macro_f1": 0.5492662787437439, + "num_tokens": 16490918.0, + "repeat_count": 0.0, + "routers_loss": 0.01758190058171749, + "skip_count": 2.0, + "step": 10226, + "text_loss": 0.35358762741088867 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 4.2792528118449356e-06, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 16493996.0, + "repeat_count": 0.0, + "routers_loss": 0.003696965519338846, + "skip_count": 1.0, + "step": 10228, + "text_loss": 0.28963083028793335 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 4.238939867456937e-06, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 16498163.0, + "repeat_count": 0.0, + "routers_loss": 0.0009691051091067493, + "skip_count": 0.0, + "step": 10230, + "text_loss": 0.6794275045394897 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 48.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 4.198816899642355e-06, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 16501075.0, + "repeat_count": 2.0, + "routers_loss": 0.00915334839373827, + "skip_count": 0.0, + "step": 10232, + "text_loss": 0.6993107795715332 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.046962136777225, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 4.158883923776447e-06, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 16504533.0, + "repeat_count": 0.0, + "routers_loss": 0.010835417546331882, + "skip_count": 1.0, + "step": 10234, + "text_loss": 0.46092382073402405 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 48.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 4.119140955161582e-06, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 16507612.0, + "repeat_count": 1.0, + "routers_loss": 0.006133808754384518, + "skip_count": 7.0, + "step": 10236, + "text_loss": 0.5992426872253418 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 4.079588009027357e-06, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 16510623.0, + "repeat_count": 0.0, + "routers_loss": 0.001170355360955, + "skip_count": 0.0, + "step": 10238, + "text_loss": 0.6200118660926819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 4.040225100530536e-06, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 16513709.0, + "repeat_count": 0.0, + "routers_loss": 0.0013292148942127824, + "skip_count": 0.0, + "step": 10240, + "text_loss": 0.41305387020111084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 4.001052244754999e-06, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 16516793.0, + "repeat_count": 0.0, + "routers_loss": 0.003694178769364953, + "skip_count": 0.0, + "step": 10242, + "text_loss": 0.36737722158432007 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 3.962069456711903e-06, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 16519837.0, + "repeat_count": 0.0, + "routers_loss": 0.004149764310568571, + "skip_count": 2.0, + "step": 10244, + "text_loss": 0.20932413637638092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 3.9232767513395215e-06, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 16523034.0, + "repeat_count": 0.0, + "routers_loss": 0.005588968750089407, + "skip_count": 1.0, + "step": 10246, + "text_loss": 0.22806818783283234 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 48.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 3.884674143503353e-06, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 16525916.0, + "repeat_count": 1.0, + "routers_loss": 0.0011802187655121088, + "skip_count": 1.0, + "step": 10248, + "text_loss": 0.36658138036727905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 3.846261647995897e-06, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 16529040.0, + "repeat_count": 2.0, + "routers_loss": 0.010508419014513493, + "skip_count": 4.0, + "step": 10250, + "text_loss": 0.20360486209392548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 3.8080392795369347e-06, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 16532088.0, + "repeat_count": 0.0, + "routers_loss": 0.0004971205489709973, + "skip_count": 1.0, + "step": 10252, + "text_loss": 0.5355691313743591 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 3.770007052773361e-06, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 16534996.0, + "repeat_count": 0.0, + "routers_loss": 0.003430357202887535, + "skip_count": 0.0, + "step": 10254, + "text_loss": 0.2113809734582901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 3.732164982279185e-06, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 16538077.0, + "repeat_count": 0.0, + "routers_loss": 0.0017093889182433486, + "skip_count": 0.0, + "step": 10256, + "text_loss": 0.8436145782470703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 3.6945130825555284e-06, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 16541341.0, + "repeat_count": 0.0, + "routers_loss": 0.0028951778076589108, + "skip_count": 0.0, + "step": 10258, + "text_loss": 0.6505146026611328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 3.6570513680307395e-06, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 16544277.0, + "repeat_count": 0.0, + "routers_loss": 0.007293363101780415, + "skip_count": 1.0, + "step": 10260, + "text_loss": 0.35743454098701477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 3.6197798530601124e-06, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 16547527.0, + "repeat_count": 0.0, + "routers_loss": 0.00111240369733423, + "skip_count": 0.0, + "step": 10262, + "text_loss": 0.7323034405708313 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 48.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 3.582698551926278e-06, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 16550798.0, + "repeat_count": 3.0, + "routers_loss": 0.005441631190478802, + "skip_count": 3.0, + "step": 10264, + "text_loss": 0.2366604059934616 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 3.5458074788387585e-06, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 16553653.0, + "repeat_count": 0.0, + "routers_loss": 0.0033211156260222197, + "skip_count": 2.0, + "step": 10266, + "text_loss": 0.17687638103961945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 3.5091066479344125e-06, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 16556646.0, + "repeat_count": 0.0, + "routers_loss": 0.0005611624801531434, + "skip_count": 0.0, + "step": 10268, + "text_loss": 0.5710030198097229 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 48.21602582917523, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 3.4725960732769345e-06, + "loss": 0.0045, + "macro_f1": 0.6603773832321167, + "num_tokens": 16560057.0, + "repeat_count": 1.0, + "routers_loss": 0.025627652183175087, + "skip_count": 1.0, + "step": 10270, + "text_loss": 0.45811519026756287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 3.4362757688573555e-06, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 16562955.0, + "repeat_count": 0.0, + "routers_loss": 0.0016429986571893096, + "skip_count": 0.0, + "step": 10272, + "text_loss": 0.6733152866363525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 3.4001457485935416e-06, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 16565900.0, + "repeat_count": 0.0, + "routers_loss": 0.002614749362692237, + "skip_count": 0.0, + "step": 10274, + "text_loss": 0.659094512462616 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 48.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 3.3642060263307515e-06, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 16568986.0, + "repeat_count": 1.0, + "routers_loss": 0.0007164402049966156, + "skip_count": 0.0, + "step": 10276, + "text_loss": 0.687470018863678 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 48.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 3.3284566158410244e-06, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 16571680.0, + "repeat_count": 1.0, + "routers_loss": 0.0013704646844416857, + "skip_count": 0.0, + "step": 10278, + "text_loss": 0.6212679743766785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 3.29289753082368e-06, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 16575388.0, + "repeat_count": 0.0, + "routers_loss": 0.004897230304777622, + "skip_count": 2.0, + "step": 10280, + "text_loss": 0.2466924786567688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 3.2575287849050394e-06, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 16578717.0, + "repeat_count": 0.0, + "routers_loss": 0.002897132420912385, + "skip_count": 0.0, + "step": 10282, + "text_loss": 0.3043138384819031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 3.2223503916383736e-06, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 16581703.0, + "repeat_count": 0.0, + "routers_loss": 0.0013213366037234664, + "skip_count": 0.0, + "step": 10284, + "text_loss": 0.4567781686782837 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 3.187362364504176e-06, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 16584502.0, + "repeat_count": 0.0, + "routers_loss": 0.0033954589162021875, + "skip_count": 0.0, + "step": 10286, + "text_loss": 0.7037429809570312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05126953125, + "learning_rate": 3.152564716909889e-06, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 16587621.0, + "repeat_count": 0.0, + "routers_loss": 0.0013146435376256704, + "skip_count": 0.0, + "step": 10288, + "text_loss": 0.681390643119812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 3.1179574621901243e-06, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 16590566.0, + "repeat_count": 0.0, + "routers_loss": 0.013315175659954548, + "skip_count": 1.0, + "step": 10290, + "text_loss": 0.28952887654304504 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 48.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 3.0835406136063837e-06, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 16593642.0, + "repeat_count": 1.0, + "routers_loss": 0.010560612194240093, + "skip_count": 1.0, + "step": 10292, + "text_loss": 0.19317017495632172 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 48.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 3.0493141843472293e-06, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 16596938.0, + "repeat_count": 1.0, + "routers_loss": 0.00572188850492239, + "skip_count": 0.0, + "step": 10294, + "text_loss": 0.2277865707874298 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 3.0152781875283918e-06, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 16599989.0, + "repeat_count": 0.0, + "routers_loss": 0.002278512343764305, + "skip_count": 2.0, + "step": 10296, + "text_loss": 0.6504809260368347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 2.981432636192438e-06, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 16603008.0, + "repeat_count": 0.0, + "routers_loss": 0.00433303089812398, + "skip_count": 1.0, + "step": 10298, + "text_loss": 0.4959591031074524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 2.9477775433091047e-06, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 16606208.0, + "repeat_count": 0.0, + "routers_loss": 0.00256242579780519, + "skip_count": 0.0, + "step": 10300, + "text_loss": 0.68474280834198 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 48.36630466686234, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03125, + "learning_rate": 2.91431292177502e-06, + "loss": 0.0055, + "macro_f1": 0.8823530077934265, + "num_tokens": 16610278.0, + "repeat_count": 1.0, + "routers_loss": 0.019528929144144058, + "skip_count": 2.0, + "step": 10302, + "text_loss": 0.5476719737052917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 2.8810387844139807e-06, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 16613922.0, + "repeat_count": 0.0, + "routers_loss": 0.003266195533797145, + "skip_count": 2.0, + "step": 10304, + "text_loss": 0.24820174276828766 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 2.8479551439766215e-06, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 16617359.0, + "repeat_count": 0.0, + "routers_loss": 0.007189559284597635, + "skip_count": 2.0, + "step": 10306, + "text_loss": 0.5665034055709839 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.052490234375, + "learning_rate": 2.8150620131407456e-06, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 16620207.0, + "repeat_count": 0.0, + "routers_loss": 0.0014458110090345144, + "skip_count": 1.0, + "step": 10308, + "text_loss": 0.6184256076812744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 48.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 2.782359404510937e-06, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 16624196.0, + "repeat_count": 0.0, + "routers_loss": 0.008068135008215904, + "skip_count": 3.0, + "step": 10310, + "text_loss": 0.22482043504714966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01495361328125, + "learning_rate": 2.7498473306190043e-06, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 16627754.0, + "repeat_count": 0.0, + "routers_loss": 0.0026512884069234133, + "skip_count": 0.0, + "step": 10312, + "text_loss": 0.597885012626648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 2.717525803923593e-06, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 16630642.0, + "repeat_count": 0.0, + "routers_loss": 0.0003541568876244128, + "skip_count": 0.0, + "step": 10314, + "text_loss": 0.5806127190589905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 48.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.01495361328125, + "learning_rate": 2.685394836810351e-06, + "loss": 0.0027, + "macro_f1": 0.6666666865348816, + "num_tokens": 16634219.0, + "repeat_count": 1.0, + "routers_loss": 0.0009424841264262795, + "skip_count": 0.0, + "step": 10316, + "text_loss": 0.5818291902542114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 2.653454441591985e-06, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 16637963.0, + "repeat_count": 0.0, + "routers_loss": 0.0026975939981639385, + "skip_count": 1.0, + "step": 10318, + "text_loss": 0.4503914713859558 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 2.6217046305080926e-06, + "loss": 0.0042, + "macro_f1": 1.0, + "num_tokens": 16640931.0, + "repeat_count": 2.0, + "routers_loss": 0.009299893863499165, + "skip_count": 3.0, + "step": 10320, + "text_loss": 0.4027388393878937 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 2.5901454157252204e-06, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 16644763.0, + "repeat_count": 1.0, + "routers_loss": 0.005888781510293484, + "skip_count": 3.0, + "step": 10322, + "text_loss": 0.3544044494628906 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 48.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 2.5587768093369713e-06, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 16647642.0, + "repeat_count": 2.0, + "routers_loss": 0.005154009442776442, + "skip_count": 1.0, + "step": 10324, + "text_loss": 0.5421624183654785 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.47901379512768, + "f1_execute": 0.9729729890823364, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 2.527598823363786e-06, + "loss": 0.0084, + "macro_f1": 0.9539539813995361, + "num_tokens": 16650930.0, + "repeat_count": 5.0, + "routers_loss": 0.05385079234838486, + "skip_count": 5.0, + "step": 10326, + "text_loss": 0.11125081777572632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 48.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 2.4966114697532185e-06, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 16655012.0, + "repeat_count": 0.0, + "routers_loss": 0.010021892376244068, + "skip_count": 4.0, + "step": 10328, + "text_loss": 0.6925008296966553 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 48.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 2.4658147603796587e-06, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 16657811.0, + "repeat_count": 2.0, + "routers_loss": 0.00567783834412694, + "skip_count": 2.0, + "step": 10330, + "text_loss": 0.25555673241615295 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 2.4352087070443895e-06, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 16661346.0, + "repeat_count": 0.0, + "routers_loss": 0.0003201630897819996, + "skip_count": 0.0, + "step": 10332, + "text_loss": 0.41918623447418213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0164794921875, + "learning_rate": 2.404793321475751e-06, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 16664219.0, + "repeat_count": 0.0, + "routers_loss": 0.0063372207805514336, + "skip_count": 1.0, + "step": 10334, + "text_loss": 0.2512246072292328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 2.3745686153290315e-06, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 16667608.0, + "repeat_count": 0.0, + "routers_loss": 0.004200387746095657, + "skip_count": 1.0, + "step": 10336, + "text_loss": 0.27055928111076355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 2.344534600186299e-06, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 16670193.0, + "repeat_count": 0.0, + "routers_loss": 0.005420933943241835, + "skip_count": 2.0, + "step": 10338, + "text_loss": 0.19804859161376953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 2.314691287556736e-06, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 16673922.0, + "repeat_count": 0.0, + "routers_loss": 0.0022731551434844732, + "skip_count": 2.0, + "step": 10340, + "text_loss": 0.7323333024978638 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 48.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 2.2850386888763063e-06, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 16677264.0, + "repeat_count": 1.0, + "routers_loss": 0.004557183478027582, + "skip_count": 4.0, + "step": 10342, + "text_loss": 0.34720420837402344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 2.2555768155079203e-06, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 16680472.0, + "repeat_count": 0.0, + "routers_loss": 0.001710049225948751, + "skip_count": 0.0, + "step": 10344, + "text_loss": 0.5197516679763794 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.57293806868213, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 2.2263056787414916e-06, + "loss": 0.0038, + "macro_f1": 0.8820862174034119, + "num_tokens": 16684337.0, + "repeat_count": 2.0, + "routers_loss": 0.01958944834768772, + "skip_count": 2.0, + "step": 10346, + "text_loss": 0.3011045753955841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 2.197225289793714e-06, + "loss": 0.0028, + "macro_f1": 0.3333333432674408, + "num_tokens": 16687209.0, + "repeat_count": 0.0, + "routers_loss": 0.0017791363643482327, + "skip_count": 0.0, + "step": 10348, + "text_loss": 0.33468589186668396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 2.168335659808285e-06, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 16690418.0, + "repeat_count": 0.0, + "routers_loss": 0.003162781707942486, + "skip_count": 1.0, + "step": 10350, + "text_loss": 0.6261059641838074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 2.1396367998557375e-06, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 16693215.0, + "repeat_count": 0.0, + "routers_loss": 0.005255573894828558, + "skip_count": 0.0, + "step": 10352, + "text_loss": 0.40527454018592834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 2.1111287209335527e-06, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 16696668.0, + "repeat_count": 0.0, + "routers_loss": 0.0033432829659432173, + "skip_count": 1.0, + "step": 10354, + "text_loss": 0.28645285964012146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 2.082811433966103e-06, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 16699985.0, + "repeat_count": 0.0, + "routers_loss": 0.0011671687243506312, + "skip_count": 0.0, + "step": 10356, + "text_loss": 0.746609628200531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 48.629292632814796, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0230712890625, + "learning_rate": 2.054684949804542e-06, + "loss": 0.0038, + "macro_f1": 0.5934640765190125, + "num_tokens": 16703300.0, + "repeat_count": 0.0, + "routers_loss": 0.01967054046690464, + "skip_count": 3.0, + "step": 10358, + "text_loss": 0.33314839005470276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.63868506017024, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 2.026749279227025e-06, + "loss": 0.0071, + "macro_f1": 0.6601307392120361, + "num_tokens": 16706630.0, + "repeat_count": 1.0, + "routers_loss": 0.053273484110832214, + "skip_count": 2.0, + "step": 10360, + "text_loss": 0.28726521134376526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 1.9990044329386004e-06, + "loss": 0.003, + "macro_f1": 0.3333333432674408, + "num_tokens": 16709809.0, + "repeat_count": 0.0, + "routers_loss": 0.0005693111452274024, + "skip_count": 0.0, + "step": 10362, + "text_loss": 0.4472726285457611 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 1.9714504215711528e-06, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 16713089.0, + "repeat_count": 0.0, + "routers_loss": 0.008012447506189346, + "skip_count": 1.0, + "step": 10364, + "text_loss": 0.3002646863460541 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.080078125, + "learning_rate": 1.9440872556833466e-06, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 16715826.0, + "repeat_count": 0.0, + "routers_loss": 0.0015855961246415973, + "skip_count": 1.0, + "step": 10366, + "text_loss": 0.4461057484149933 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 1.9169149457608504e-06, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 16719155.0, + "repeat_count": 1.0, + "routers_loss": 0.006313335616141558, + "skip_count": 2.0, + "step": 10368, + "text_loss": 0.4553263485431671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 1.889933502216168e-06, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 16722191.0, + "repeat_count": 0.0, + "routers_loss": 0.0025730058550834656, + "skip_count": 0.0, + "step": 10370, + "text_loss": 0.31290385127067566 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 48.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 1.8631429353885842e-06, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 16725460.0, + "repeat_count": 1.0, + "routers_loss": 0.005240040831267834, + "skip_count": 0.0, + "step": 10372, + "text_loss": 0.4621378779411316 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 1.8365432555443318e-06, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 16728690.0, + "repeat_count": 0.0, + "routers_loss": 0.001968455035239458, + "skip_count": 1.0, + "step": 10374, + "text_loss": 0.5022224187850952 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 48.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 1.8101344728764234e-06, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 16733192.0, + "repeat_count": 2.0, + "routers_loss": 0.003168600145727396, + "skip_count": 2.0, + "step": 10376, + "text_loss": 0.4973319470882416 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.72321690636924, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.02880859375, + "learning_rate": 1.78391659750482e-06, + "loss": 0.0072, + "macro_f1": 0.8820862174034119, + "num_tokens": 16736219.0, + "repeat_count": 2.0, + "routers_loss": 0.04101128876209259, + "skip_count": 2.0, + "step": 10378, + "text_loss": 0.13770700991153717 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 48.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 1.7578896394762067e-06, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 16738840.0, + "repeat_count": 1.0, + "routers_loss": 0.0013390687527135015, + "skip_count": 0.0, + "step": 10380, + "text_loss": 0.8668286800384521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 48.74200176108013, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 1.7320536087641613e-06, + "loss": 0.0074, + "macro_f1": 0.6595745086669922, + "num_tokens": 16742145.0, + "repeat_count": 1.0, + "routers_loss": 0.04137809947133064, + "skip_count": 4.0, + "step": 10382, + "text_loss": 0.19390869140625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 1.7064085152691534e-06, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 16745293.0, + "repeat_count": 0.0, + "routers_loss": 0.0013169923331588507, + "skip_count": 0.0, + "step": 10384, + "text_loss": 0.6248905658721924 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 48.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 1.6809543688183771e-06, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 16748438.0, + "repeat_count": 2.0, + "routers_loss": 0.005269711371511221, + "skip_count": 1.0, + "step": 10386, + "text_loss": 0.8555964827537537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 1.655691179165919e-06, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 16752155.0, + "repeat_count": 0.0, + "routers_loss": 0.005495068617165089, + "skip_count": 1.0, + "step": 10388, + "text_loss": 0.17478284239768982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 1.630618955992702e-06, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 16755072.0, + "repeat_count": 0.0, + "routers_loss": 0.0012614501174539328, + "skip_count": 0.0, + "step": 10390, + "text_loss": 0.4476284384727478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 1.605737708906374e-06, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 16759622.0, + "repeat_count": 0.0, + "routers_loss": 0.0016320135910063982, + "skip_count": 1.0, + "step": 10392, + "text_loss": 0.6159437894821167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 1.5810474474415858e-06, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 16763114.0, + "repeat_count": 0.0, + "routers_loss": 0.00686453003436327, + "skip_count": 2.0, + "step": 10394, + "text_loss": 0.2532145082950592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 48.807748752568244, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 1.5565481810596582e-06, + "loss": 0.0052, + "macro_f1": 0.6598639488220215, + "num_tokens": 16765899.0, + "repeat_count": 1.0, + "routers_loss": 0.010446256957948208, + "skip_count": 3.0, + "step": 10396, + "text_loss": 0.3457704186439514 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.81714117992369, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 1.5322399191487479e-06, + "loss": 0.006, + "macro_f1": 0.8817967176437378, + "num_tokens": 16769020.0, + "repeat_count": 2.0, + "routers_loss": 0.028018560260534286, + "skip_count": 3.0, + "step": 10398, + "text_loss": 0.2568260133266449 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 1.5081226710237927e-06, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 16771946.0, + "repeat_count": 0.0, + "routers_loss": 0.0017342215869575739, + "skip_count": 0.0, + "step": 10400, + "text_loss": 0.21244384348392487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 48.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 1.4841964459266221e-06, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 16775398.0, + "repeat_count": 0.0, + "routers_loss": 0.007773366756737232, + "skip_count": 3.0, + "step": 10402, + "text_loss": 0.2011307328939438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 1.4604612530257356e-06, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 16778556.0, + "repeat_count": 0.0, + "routers_loss": 0.0032797956373542547, + "skip_count": 1.0, + "step": 10404, + "text_loss": 0.2331003099679947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 1.4369171014165793e-06, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 16782298.0, + "repeat_count": 0.0, + "routers_loss": 0.000855600053910166, + "skip_count": 0.0, + "step": 10406, + "text_loss": 0.37070924043655396 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 48.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 1.41356400012127e-06, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 16785157.0, + "repeat_count": 1.0, + "routers_loss": 0.0023640329018235207, + "skip_count": 0.0, + "step": 10408, + "text_loss": 0.2712402939796448 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 1.390401958088816e-06, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 16787842.0, + "repeat_count": 0.0, + "routers_loss": 0.0042669083923101425, + "skip_count": 2.0, + "step": 10410, + "text_loss": 0.1989891678094864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 1.367430984194895e-06, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 16791063.0, + "repeat_count": 0.0, + "routers_loss": 0.010778319090604782, + "skip_count": 2.0, + "step": 10412, + "text_loss": 0.2656673491001129 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.89228059876724, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 1.3446510872420214e-06, + "loss": 0.0076, + "macro_f1": 0.3272727429866791, + "num_tokens": 16794468.0, + "repeat_count": 0.0, + "routers_loss": 0.014451594091951847, + "skip_count": 1.0, + "step": 10414, + "text_loss": 0.615280032157898 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 1.3220622759596014e-06, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 16797197.0, + "repeat_count": 0.0, + "routers_loss": 0.002590922173112631, + "skip_count": 1.0, + "step": 10416, + "text_loss": 0.6224665641784668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 1.2996645590035439e-06, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 16800435.0, + "repeat_count": 0.0, + "routers_loss": 0.0002690292603801936, + "skip_count": 0.0, + "step": 10418, + "text_loss": 0.5916928052902222 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 1.2774579449568723e-06, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 16803488.0, + "repeat_count": 0.0, + "routers_loss": 0.005886071361601353, + "skip_count": 2.0, + "step": 10420, + "text_loss": 0.33671438694000244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 48.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 1.2554424423290578e-06, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 16807339.0, + "repeat_count": 0.0, + "routers_loss": 0.0038914172910153866, + "skip_count": 2.0, + "step": 10422, + "text_loss": 0.11040981113910675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.93924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 1.2336180595565738e-06, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 16810409.0, + "repeat_count": 0.0, + "routers_loss": 0.001565443933941424, + "skip_count": 1.0, + "step": 10424, + "text_loss": 0.5290043950080872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 1.2119848050025083e-06, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 16813888.0, + "repeat_count": 0.0, + "routers_loss": 0.0023584216833114624, + "skip_count": 0.0, + "step": 10426, + "text_loss": 0.21560436487197876 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 48.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 1.1905426869567859e-06, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 16816678.0, + "repeat_count": 1.0, + "routers_loss": 0.004424029495567083, + "skip_count": 0.0, + "step": 10428, + "text_loss": 0.36319077014923096 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 48.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 1.1692917136361115e-06, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 16819763.0, + "repeat_count": 2.0, + "routers_loss": 0.004053499549627304, + "skip_count": 1.0, + "step": 10430, + "text_loss": 0.6534333825111389 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 1.1482318931838043e-06, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 16823415.0, + "repeat_count": 0.0, + "routers_loss": 0.0022409996017813683, + "skip_count": 1.0, + "step": 10432, + "text_loss": 0.33003750443458557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 48.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 1.1273632336700756e-06, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 16826676.0, + "repeat_count": 0.0, + "routers_loss": 0.0061734220944345, + "skip_count": 0.0, + "step": 10434, + "text_loss": 0.23123329877853394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 48.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 1.106685743091862e-06, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 16829866.0, + "repeat_count": 0.0, + "routers_loss": 0.0038321982137858868, + "skip_count": 1.0, + "step": 10436, + "text_loss": 0.2427562028169632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 1.086199429372825e-06, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 16833765.0, + "repeat_count": 0.0, + "routers_loss": 0.00676750298589468, + "skip_count": 2.0, + "step": 10438, + "text_loss": 0.42610102891921997 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 1.0659043003632962e-06, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 16837285.0, + "repeat_count": 4.0, + "routers_loss": 0.007271626964211464, + "skip_count": 5.0, + "step": 10440, + "text_loss": 0.8925374746322632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.02348106838861, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 1.0458003638404434e-06, + "loss": 0.0043, + "macro_f1": 0.6603773832321167, + "num_tokens": 16840273.0, + "repeat_count": 1.0, + "routers_loss": 0.02480674348771572, + "skip_count": 1.0, + "step": 10442, + "text_loss": 0.445250540971756 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 1.0258876275081043e-06, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 16844115.0, + "repeat_count": 0.0, + "routers_loss": 0.003030754392966628, + "skip_count": 1.0, + "step": 10444, + "text_loss": 0.5095187425613403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 1.0061660989969523e-06, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 16847375.0, + "repeat_count": 0.0, + "routers_loss": 0.006397911347448826, + "skip_count": 2.0, + "step": 10446, + "text_loss": 0.2943403720855713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 9.866357858642206e-07, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 16850247.0, + "repeat_count": 0.0, + "routers_loss": 0.007977386936545372, + "skip_count": 2.0, + "step": 10448, + "text_loss": 0.3035532832145691 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 9.672966955940331e-07, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 16853383.0, + "repeat_count": 0.0, + "routers_loss": 0.003959330730140209, + "skip_count": 1.0, + "step": 10450, + "text_loss": 0.5030179619789124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 9.481488355971291e-07, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 16856733.0, + "repeat_count": 0.0, + "routers_loss": 0.003481166437268257, + "skip_count": 1.0, + "step": 10452, + "text_loss": 0.6293197870254517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 9.29192213210972e-07, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 16859753.0, + "repeat_count": 0.0, + "routers_loss": 0.002803726587444544, + "skip_count": 0.0, + "step": 10454, + "text_loss": 0.6037408113479614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 9.104268356998624e-07, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 16862632.0, + "repeat_count": 0.0, + "routers_loss": 0.0033954589162021875, + "skip_count": 0.0, + "step": 10456, + "text_loss": 0.631564199924469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 8.918527102546592e-07, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 16866025.0, + "repeat_count": 0.0, + "routers_loss": 0.002237692940980196, + "skip_count": 0.0, + "step": 10458, + "text_loss": 0.18825361132621765 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 8.734698439930577e-07, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 16869194.0, + "repeat_count": 1.0, + "routers_loss": 0.015361418016254902, + "skip_count": 2.0, + "step": 10460, + "text_loss": 0.15555702149868011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 49.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 8.552782439593121e-07, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 16872400.0, + "repeat_count": 0.0, + "routers_loss": 0.010845578275620937, + "skip_count": 4.0, + "step": 10462, + "text_loss": 0.2473229318857193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 8.372779171245681e-07, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 16874842.0, + "repeat_count": 0.0, + "routers_loss": 0.0031175457406789064, + "skip_count": 2.0, + "step": 10464, + "text_loss": 0.21604472398757935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 8.19468870386586e-07, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 16877754.0, + "repeat_count": 0.0, + "routers_loss": 0.00562032638117671, + "skip_count": 2.0, + "step": 10466, + "text_loss": 0.7601249814033508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 8.018511105697957e-07, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 16880755.0, + "repeat_count": 0.0, + "routers_loss": 0.003651288105174899, + "skip_count": 0.0, + "step": 10468, + "text_loss": 0.15034520626068115 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.154975051364836, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 7.844246444253522e-07, + "loss": 0.0066, + "macro_f1": 0.8817967176437378, + "num_tokens": 16884024.0, + "repeat_count": 2.0, + "routers_loss": 0.03286674618721008, + "skip_count": 3.0, + "step": 10470, + "text_loss": 0.2850193977355957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 7.671894786310807e-07, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 16887512.0, + "repeat_count": 0.0, + "routers_loss": 0.005333275999873877, + "skip_count": 2.0, + "step": 10472, + "text_loss": 0.27574512362480164 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 7.501456197915868e-07, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 16890505.0, + "repeat_count": 0.0, + "routers_loss": 0.008608506061136723, + "skip_count": 2.0, + "step": 10474, + "text_loss": 0.110866978764534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 7.332930744380905e-07, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 16893806.0, + "repeat_count": 0.0, + "routers_loss": 0.001401562592945993, + "skip_count": 0.0, + "step": 10476, + "text_loss": 0.35479840636253357 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 49.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.019775390625, + "learning_rate": 7.166318490284818e-07, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 16897076.0, + "repeat_count": 0.0, + "routers_loss": 0.00925722997635603, + "skip_count": 4.0, + "step": 10478, + "text_loss": 0.20996634662151337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 7.001619499474309e-07, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 16900449.0, + "repeat_count": 0.0, + "routers_loss": 0.0017279108287766576, + "skip_count": 0.0, + "step": 10480, + "text_loss": 0.6246579885482788 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 49.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 6.83883383506223e-07, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 16903275.0, + "repeat_count": 1.0, + "routers_loss": 0.0030049486085772514, + "skip_count": 0.0, + "step": 10482, + "text_loss": 0.4425566494464874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 6.677961559428125e-07, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 16906789.0, + "repeat_count": 0.0, + "routers_loss": 0.0028844536282122135, + "skip_count": 2.0, + "step": 10484, + "text_loss": 0.6716867685317993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 6.519002734218793e-07, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 16910099.0, + "repeat_count": 0.0, + "routers_loss": 0.0009280550293624401, + "skip_count": 0.0, + "step": 10486, + "text_loss": 0.7250060439109802 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 6.361957420347175e-07, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 16912752.0, + "repeat_count": 0.0, + "routers_loss": 0.0034732224885374308, + "skip_count": 0.0, + "step": 10488, + "text_loss": 0.23244275152683258 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 6.206825677993466e-07, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 16916677.0, + "repeat_count": 0.0, + "routers_loss": 0.004304944537580013, + "skip_count": 2.0, + "step": 10490, + "text_loss": 0.5831108093261719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 6.053607566604557e-07, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 16919460.0, + "repeat_count": 0.0, + "routers_loss": 0.002612794516608119, + "skip_count": 0.0, + "step": 10492, + "text_loss": 0.2705974280834198 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 5.902303144894039e-07, + "loss": 0.0033, + "macro_f1": 1.0, + "num_tokens": 16922572.0, + "repeat_count": 2.0, + "routers_loss": 0.006487889215350151, + "skip_count": 4.0, + "step": 10494, + "text_loss": 0.23415961861610413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 5.752912470842198e-07, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 16925184.0, + "repeat_count": 0.0, + "routers_loss": 0.0015968878287822008, + "skip_count": 0.0, + "step": 10496, + "text_loss": 0.4943143427371979 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 5.605435601695464e-07, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 16928598.0, + "repeat_count": 0.0, + "routers_loss": 0.0010248057078570127, + "skip_count": 0.0, + "step": 10498, + "text_loss": 0.36662834882736206 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 5.459872593966963e-07, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 16931370.0, + "repeat_count": 0.0, + "routers_loss": 0.0012296726927161217, + "skip_count": 1.0, + "step": 10500, + "text_loss": 0.19061364233493805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 5.316223503437079e-07, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 16934631.0, + "repeat_count": 0.0, + "routers_loss": 0.0026445689145475626, + "skip_count": 0.0, + "step": 10502, + "text_loss": 0.1848333775997162 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 49.3146463164074, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 5.174488385152887e-07, + "loss": 0.0042, + "macro_f1": 0.8823530077934265, + "num_tokens": 16938097.0, + "repeat_count": 2.0, + "routers_loss": 0.011918487958610058, + "skip_count": 1.0, + "step": 10504, + "text_loss": 0.18828579783439636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 5.034667293427053e-07, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 16941223.0, + "repeat_count": 0.0, + "routers_loss": 0.004649503156542778, + "skip_count": 2.0, + "step": 10506, + "text_loss": 0.4231431484222412 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 49.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 4.896760281838942e-07, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 16944572.0, + "repeat_count": 2.0, + "routers_loss": 0.0019313854863867164, + "skip_count": 0.0, + "step": 10508, + "text_loss": 0.7520577311515808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 4.7607674032351666e-07, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 16947661.0, + "repeat_count": 0.0, + "routers_loss": 0.0014383324887603521, + "skip_count": 0.0, + "step": 10510, + "text_loss": 0.6348366737365723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 4.626688709728488e-07, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 16951242.0, + "repeat_count": 0.0, + "routers_loss": 0.0007596072391606867, + "skip_count": 0.0, + "step": 10512, + "text_loss": 0.40759870409965515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 4.494524252698362e-07, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 16955082.0, + "repeat_count": 0.0, + "routers_loss": 0.0018172853160649538, + "skip_count": 0.0, + "step": 10514, + "text_loss": 0.18837586045265198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 4.364274082789832e-07, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 16957903.0, + "repeat_count": 0.0, + "routers_loss": 0.003865955863147974, + "skip_count": 0.0, + "step": 10516, + "text_loss": 0.7716887593269348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 4.2359382499151945e-07, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 16960653.0, + "repeat_count": 0.0, + "routers_loss": 0.002676841337233782, + "skip_count": 0.0, + "step": 10518, + "text_loss": 0.5054554343223572 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 49.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 4.1095168032534437e-07, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 16964472.0, + "repeat_count": 1.0, + "routers_loss": 0.0015017857076600194, + "skip_count": 0.0, + "step": 10520, + "text_loss": 0.9150356650352478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 3.985009791249161e-07, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 16967992.0, + "repeat_count": 0.0, + "routers_loss": 0.006224984303116798, + "skip_count": 0.0, + "step": 10522, + "text_loss": 0.26261746883392334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 3.8624172616136265e-07, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 16971407.0, + "repeat_count": 0.0, + "routers_loss": 0.004404739011079073, + "skip_count": 2.0, + "step": 10524, + "text_loss": 0.37001657485961914 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 49.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 3.741739261324817e-07, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 16974847.0, + "repeat_count": 1.0, + "routers_loss": 0.0010904704686254263, + "skip_count": 1.0, + "step": 10526, + "text_loss": 0.3782288432121277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 49.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 3.6229758366262967e-07, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 16977743.0, + "repeat_count": 0.0, + "routers_loss": 0.005372707732021809, + "skip_count": 3.0, + "step": 10528, + "text_loss": 0.2069653421640396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 3.506127033028883e-07, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 16982217.0, + "repeat_count": 0.0, + "routers_loss": 0.0017930102767422795, + "skip_count": 1.0, + "step": 10530, + "text_loss": 0.23420299589633942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 49.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 3.391192895308981e-07, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 16985440.0, + "repeat_count": 0.0, + "routers_loss": 0.01549626886844635, + "skip_count": 4.0, + "step": 10532, + "text_loss": 0.2651829421520233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 49.455532726739065, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0264892578125, + "learning_rate": 3.278173467509693e-07, + "loss": 0.0091, + "macro_f1": 0.6122449040412903, + "num_tokens": 16988716.0, + "repeat_count": 0.0, + "routers_loss": 0.014724464155733585, + "skip_count": 4.0, + "step": 10534, + "text_loss": 0.9998418688774109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 3.167068792940264e-07, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 16992076.0, + "repeat_count": 0.0, + "routers_loss": 0.00042533327359706163, + "skip_count": 0.0, + "step": 10536, + "text_loss": 0.7315229177474976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 3.057878914176082e-07, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 16994960.0, + "repeat_count": 0.0, + "routers_loss": 0.0006869849166832864, + "skip_count": 0.0, + "step": 10538, + "text_loss": 1.1293457746505737 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 49.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 2.9506038730592323e-07, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 16997778.0, + "repeat_count": 2.0, + "routers_loss": 0.0009595098090358078, + "skip_count": 0.0, + "step": 10540, + "text_loss": 0.6721776723861694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.49310243616085, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 2.845243710697387e-07, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 17001173.0, + "repeat_count": 0.0, + "routers_loss": 0.003649777267128229, + "skip_count": 0.0, + "step": 10542, + "text_loss": 0.44033801555633545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022216796875, + "learning_rate": 2.741798467464918e-07, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 17005213.0, + "repeat_count": 0.0, + "routers_loss": 0.0005361626390367746, + "skip_count": 0.0, + "step": 10544, + "text_loss": 0.3993811309337616 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 2.6402681830023365e-07, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 17008027.0, + "repeat_count": 0.0, + "routers_loss": 0.0047687748447060585, + "skip_count": 0.0, + "step": 10546, + "text_loss": 0.3197088837623596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 2.540652896215745e-07, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 17010934.0, + "repeat_count": 0.0, + "routers_loss": 0.003123556962236762, + "skip_count": 1.0, + "step": 10548, + "text_loss": 0.33580848574638367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0172119140625, + "learning_rate": 2.4429526452784955e-07, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 17014097.0, + "repeat_count": 0.0, + "routers_loss": 0.0012946722563356161, + "skip_count": 0.0, + "step": 10550, + "text_loss": 0.48477989435195923 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 49.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031005859375, + "learning_rate": 2.3471674676295296e-07, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 17017643.0, + "repeat_count": 1.0, + "routers_loss": 0.016718504950404167, + "skip_count": 1.0, + "step": 10552, + "text_loss": 0.23426192998886108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 2.2532973999733751e-07, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 17020900.0, + "repeat_count": 0.0, + "routers_loss": 0.0041206348687410355, + "skip_count": 1.0, + "step": 10554, + "text_loss": 0.15234927833080292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 2.1613424782812584e-07, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 17024504.0, + "repeat_count": 0.0, + "routers_loss": 0.002981720957905054, + "skip_count": 0.0, + "step": 10556, + "text_loss": 0.3161900043487549 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 49.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 2.0713027377911032e-07, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 17027482.0, + "repeat_count": 1.0, + "routers_loss": 0.004473469685763121, + "skip_count": 0.0, + "step": 10558, + "text_loss": 0.5996923446655273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 1.983178213005865e-07, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 17029843.0, + "repeat_count": 0.0, + "routers_loss": 0.0032688030041754246, + "skip_count": 0.0, + "step": 10560, + "text_loss": 0.5574228763580322 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031005859375, + "learning_rate": 1.8969689376951981e-07, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 17032786.0, + "repeat_count": 0.0, + "routers_loss": 0.004503661300987005, + "skip_count": 1.0, + "step": 10562, + "text_loss": 0.2402963787317276 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 1.8126749448943435e-07, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 17035399.0, + "repeat_count": 2.0, + "routers_loss": 0.00628487067297101, + "skip_count": 4.0, + "step": 10564, + "text_loss": 0.29870063066482544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 1.7302962669052402e-07, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 17038486.0, + "repeat_count": 0.0, + "routers_loss": 0.000694084505084902, + "skip_count": 0.0, + "step": 10566, + "text_loss": 0.5111265778541565 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 49.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 1.6498329352954143e-07, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 17042070.0, + "repeat_count": 2.0, + "routers_loss": 0.002611940260976553, + "skip_count": 2.0, + "step": 10568, + "text_loss": 0.4722840189933777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 1.5712849808985353e-07, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 17045164.0, + "repeat_count": 0.0, + "routers_loss": 0.0020359482150524855, + "skip_count": 1.0, + "step": 10570, + "text_loss": 0.5299108028411865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 1.494652433814414e-07, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 17048468.0, + "repeat_count": 0.0, + "routers_loss": 0.0017503987764939666, + "skip_count": 0.0, + "step": 10572, + "text_loss": 0.5245226621627808 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 49.64338127384796, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.029296875, + "learning_rate": 1.4199353234090052e-07, + "loss": 0.0069, + "macro_f1": 0.9262410998344421, + "num_tokens": 17051716.0, + "repeat_count": 2.0, + "routers_loss": 0.02260318584740162, + "skip_count": 3.0, + "step": 10574, + "text_loss": 0.34682315587997437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 49.65277370120341, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 1.347133678313295e-07, + "loss": 0.0055, + "macro_f1": 0.6595745086669922, + "num_tokens": 17054642.0, + "repeat_count": 1.0, + "routers_loss": 0.04137809947133064, + "skip_count": 4.0, + "step": 10576, + "text_loss": 0.2545051574707031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 1.2762475264260775e-07, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 17058378.0, + "repeat_count": 0.0, + "routers_loss": 0.006063911598175764, + "skip_count": 0.0, + "step": 10578, + "text_loss": 0.5370165109634399 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 1.2072768949100698e-07, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 17061416.0, + "repeat_count": 0.0, + "routers_loss": 0.0011335996678099036, + "skip_count": 0.0, + "step": 10580, + "text_loss": 0.4543360471725464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 1.140221810195241e-07, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 17064236.0, + "repeat_count": 0.0, + "routers_loss": 0.0033164035994559526, + "skip_count": 0.0, + "step": 10582, + "text_loss": 0.2804311215877533 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.018798828125, + "learning_rate": 1.075082297977703e-07, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 17067446.0, + "repeat_count": 0.0, + "routers_loss": 0.0023050394374877214, + "skip_count": 2.0, + "step": 10584, + "text_loss": 0.23257072269916534 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 49.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 1.0118583832186001e-07, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 17070365.0, + "repeat_count": 1.0, + "routers_loss": 0.003029540413990617, + "skip_count": 0.0, + "step": 10586, + "text_loss": 0.5026201605796814 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 9.505500901457742e-08, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 17074316.0, + "repeat_count": 0.0, + "routers_loss": 0.0036497078835964203, + "skip_count": 0.0, + "step": 10588, + "text_loss": 0.6814579367637634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 8.911574422520997e-08, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 17077668.0, + "repeat_count": 0.0, + "routers_loss": 0.007934805005788803, + "skip_count": 1.0, + "step": 10590, + "text_loss": 0.14940814673900604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 8.336804622977034e-08, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 17080434.0, + "repeat_count": 0.0, + "routers_loss": 0.0012133397394791245, + "skip_count": 0.0, + "step": 10592, + "text_loss": 0.6377768516540527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 7.781191723071902e-08, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 17083362.0, + "repeat_count": 0.0, + "routers_loss": 0.0009114379645325243, + "skip_count": 1.0, + "step": 10594, + "text_loss": 0.41287705302238464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 7.244735935724167e-08, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 17086649.0, + "repeat_count": 0.0, + "routers_loss": 0.0018709978321567178, + "skip_count": 0.0, + "step": 10596, + "text_loss": 0.48996540904045105 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 49.756090402113294, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.02783203125, + "learning_rate": 6.727437466497177e-08, + "loss": 0.0062, + "macro_f1": 0.5492662787437439, + "num_tokens": 17090390.0, + "repeat_count": 0.0, + "routers_loss": 0.017759598791599274, + "skip_count": 2.0, + "step": 10598, + "text_loss": 0.16886916756629944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 6.229296513621253e-08, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 17092940.0, + "repeat_count": 0.0, + "routers_loss": 0.0013719784328714013, + "skip_count": 0.0, + "step": 10600, + "text_loss": 0.6593959927558899 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 49.77487525682419, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0220947265625, + "learning_rate": 5.7503132679936896e-08, + "loss": 0.0028, + "macro_f1": 0.5492662787437439, + "num_tokens": 17096497.0, + "repeat_count": 0.0, + "routers_loss": 0.012247482314705849, + "skip_count": 2.0, + "step": 10602, + "text_loss": 0.4913390874862671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 49.78426768417963, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 5.290487913156561e-08, + "loss": 0.0062, + "macro_f1": 0.6598639488220215, + "num_tokens": 17099651.0, + "repeat_count": 1.0, + "routers_loss": 0.013324257917702198, + "skip_count": 3.0, + "step": 10604, + "text_loss": 0.18341897428035736 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 49.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 4.8498206253133614e-08, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 17102154.0, + "repeat_count": 0.0, + "routers_loss": 0.007073273416608572, + "skip_count": 3.0, + "step": 10606, + "text_loss": 0.5444790720939636 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 49.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 4.4283115733290134e-08, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 17105299.0, + "repeat_count": 1.0, + "routers_loss": 0.001649016048759222, + "skip_count": 0.0, + "step": 10608, + "text_loss": 0.5396550297737122 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 4.0259609187298654e-08, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 17108786.0, + "repeat_count": 0.0, + "routers_loss": 0.00029506805003620684, + "skip_count": 0.0, + "step": 10610, + "text_loss": 0.5690585374832153 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.821837393601406, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 3.6427688156981384e-08, + "loss": 0.0077, + "macro_f1": 0.6601307392120361, + "num_tokens": 17111796.0, + "repeat_count": 1.0, + "routers_loss": 0.04010998085141182, + "skip_count": 2.0, + "step": 10612, + "text_loss": 0.3106518089771271 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 3.2787354110663804e-08, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 17114975.0, + "repeat_count": 1.0, + "routers_loss": 0.013439279049634933, + "skip_count": 2.0, + "step": 10614, + "text_loss": 0.19681362807750702 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 49.8406222483123, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 2.933860844345215e-08, + "loss": 0.0061, + "macro_f1": 0.9265305995941162, + "num_tokens": 17118217.0, + "repeat_count": 3.0, + "routers_loss": 0.020981203764677048, + "skip_count": 1.0, + "step": 10616, + "text_loss": 0.6071886420249939 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 2.6081452476789392e-08, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 17121656.0, + "repeat_count": 1.0, + "routers_loss": 0.013027530163526535, + "skip_count": 3.0, + "step": 10618, + "text_loss": 0.21379177272319794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 2.3015887458899266e-08, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 17124584.0, + "repeat_count": 0.0, + "routers_loss": 0.0056997365318238735, + "skip_count": 2.0, + "step": 10620, + "text_loss": 0.22514000535011292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.86879953037863, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 2.0141914564453245e-08, + "loss": 0.0064, + "macro_f1": 0.3272727429866791, + "num_tokens": 17127853.0, + "repeat_count": 0.0, + "routers_loss": 0.016820410266518593, + "skip_count": 1.0, + "step": 10622, + "text_loss": 0.22637426853179932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 1.745953489479257e-08, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 17131958.0, + "repeat_count": 0.0, + "routers_loss": 0.0029321140609681606, + "skip_count": 0.0, + "step": 10624, + "text_loss": 0.3751795291900635 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 49.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 1.4968749477872744e-08, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 17135482.0, + "repeat_count": 1.0, + "routers_loss": 0.0027504474855959415, + "skip_count": 0.0, + "step": 10626, + "text_loss": 0.3414074778556824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 1.2669559268041475e-08, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 17138415.0, + "repeat_count": 0.0, + "routers_loss": 0.0012815104564651847, + "skip_count": 1.0, + "step": 10628, + "text_loss": 0.4166540801525116 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.017578125, + "learning_rate": 1.0561965146482777e-08, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 17142241.0, + "repeat_count": 2.0, + "routers_loss": 0.010521184653043747, + "skip_count": 4.0, + "step": 10630, + "text_loss": 0.3614460825920105 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 8.645967920717369e-09, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 17145305.0, + "repeat_count": 0.0, + "routers_loss": 0.002076479373499751, + "skip_count": 0.0, + "step": 10632, + "text_loss": 0.4676922857761383 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 49.9251540945113, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.03173828125, + "learning_rate": 6.921568325046756e-09, + "loss": 0.0064, + "macro_f1": 0.9555556178092957, + "num_tokens": 17149574.0, + "repeat_count": 1.0, + "routers_loss": 0.020939050242304802, + "skip_count": 5.0, + "step": 10634, + "text_loss": 0.4579739570617676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018798828125, + "learning_rate": 5.388767020220176e-09, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 17152561.0, + "repeat_count": 0.0, + "routers_loss": 0.0007589405868202448, + "skip_count": 0.0, + "step": 10636, + "text_loss": 0.531318187713623 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 49.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 4.047564593601116e-09, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 17155284.0, + "repeat_count": 1.0, + "routers_loss": 0.0013623902341350913, + "skip_count": 2.0, + "step": 10638, + "text_loss": 0.533105194568634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 49.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 2.8979615591673283e-09, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 17158345.0, + "repeat_count": 0.0, + "routers_loss": 0.008068135008215904, + "skip_count": 3.0, + "step": 10640, + "text_loss": 0.2997605800628662 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 1.939958357455307e-09, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 17161982.0, + "repeat_count": 0.0, + "routers_loss": 0.006473845802247524, + "skip_count": 2.0, + "step": 10642, + "text_loss": 0.24127982556819916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 49.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 1.1735553555602963e-09, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 17166156.0, + "repeat_count": 0.0, + "routers_loss": 0.000686702027451247, + "skip_count": 0.0, + "step": 10644, + "text_loss": 0.5044453144073486 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 49.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 5.987528471362857e-10, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 17169311.0, + "repeat_count": 1.0, + "routers_loss": 0.0015337419463321567, + "skip_count": 0.0, + "step": 10646, + "text_loss": 0.7889845371246338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 49.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 2.1555105250703476e-10, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 17171990.0, + "repeat_count": 0.0, + "routers_loss": 0.0028676397632807493, + "skip_count": 2.0, + "step": 10648, + "text_loss": 0.4312690794467926 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 50.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 2.395011849953832e-11, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 17175100.0, + "repeat_count": 1.0, + "routers_loss": 0.0016953344456851482, + "skip_count": 0.0, + "step": 10650, + "text_loss": 0.2874845862388611 + } + ], + "logging_steps": 2, + "max_steps": 10650, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.916096880361167e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-10650/training_args.bin b/checkpoint-10650/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a3d3ae372faf14539639f54454aa52b6ee730c4a --- /dev/null +++ b/checkpoint-10650/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8 +size 5880 diff --git a/checkpoint-2000/model-00002-of-00002.safetensors b/checkpoint-2000/model-00002-of-00002.safetensors index 892caec7a1b07ac8579989849dcbdbaa51cbb1be..8de1a284813a836fcdcc8636624510f406abe203 100644 --- a/checkpoint-2000/model-00002-of-00002.safetensors +++ b/checkpoint-2000/model-00002-of-00002.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0abc54f09e3d7a9e90771cb6b93f11508f30c7eaa03a0fd91cbba011629d9925 +oid sha256:a56bbbae6071c88ab17f5e1938dd6e10a779f5f8d5c5d7800a83096e7dc5cab2 size 1481790520 diff --git a/checkpoint-2000/optimizer.pt b/checkpoint-2000/optimizer.pt index c77d4f5a912cdc04592164edac03612621ef90ec..c94c2b2394bf4185a8aad72c35646c7836d48eab 100644 --- a/checkpoint-2000/optimizer.pt +++ b/checkpoint-2000/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4842a0c1ea33f6a8e2147db0772c3310a9d361f955e89265e54b367b8904402 +oid sha256:24cc1cbc00725be45237fa31c2687929be78debb22d0f2fffda8a79fcca60778 size 44191162 diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json index ddc4fe0f098b52cf925457a0c5b2f81a1f624b41..51f9545fe8fe57be9c7cd88438c3f257b3e4de47 100644 --- a/checkpoint-2000/trainer_state.json +++ b/checkpoint-2000/trainer_state.json @@ -12,18 +12,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 31.0, + "avg_layers": 25.0, "epoch": 0.009392427355444672, - "f1_execute": 0.4864864945411682, + "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 2.40625, + "grad_norm": 2.25, "learning_rate": 2e-06, - "loss": 0.5484, - "macro_f1": 0.1621621698141098, + "loss": 0.4974, + "macro_f1": 0.23255813121795654, "num_tokens": 3175.0, "repeat_count": 0.0, - "routers_loss": 0.503563642501831, + "routers_loss": 0.4339469373226166, "skip_count": 0.0, "step": 2, "text_loss": 0.3330848515033722 @@ -31,18 +31,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 23.0, "epoch": 0.018784854710889344, - "f1_execute": 0.4864864945411682, + "f1_execute": 0.7272726893424988, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.9140625, + "grad_norm": 1.8359375, "learning_rate": 6e-06, - "loss": 0.536, - "macro_f1": 0.1621621698141098, + "loss": 0.4988, + "macro_f1": 0.24242423474788666, "num_tokens": 5816.0, "repeat_count": 0.0, - "routers_loss": 0.4589468538761139, + "routers_loss": 0.4511934816837311, "skip_count": 1.0, "step": 4, "text_loss": 0.4571273922920227 @@ -50,37 +50,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 32.0, + "avg_layers": 28.0, "epoch": 0.02817728206633402, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 2.375, + "grad_norm": 2.234375, "learning_rate": 1e-05, - "loss": 0.5469, - "macro_f1": 0.19999998807907104, + "loss": 0.5113, + "macro_f1": 0.222222238779068, "num_tokens": 9739.0, "repeat_count": 0.0, - "routers_loss": 0.5736724138259888, + "routers_loss": 0.49306994676589966, "skip_count": 0.0, "step": 6, "text_loss": 0.41060560941696167 }, { - "acc_repeat": 1.0, - "acc_skip": 0.5, - "avg_layers": 33.0, + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 0.03756970942177869, - "f1_execute": 0.47058823704719543, - "f1_repeat": 0.1538461595773697, - "f1_skip": 0.222222238779068, - "grad_norm": 1.8515625, + "f1_execute": 0.5641025900840759, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7265625, "learning_rate": 1.4e-05, - "loss": 0.5291, - "macro_f1": 0.28221890330314636, + "loss": 0.4766, + "macro_f1": 0.18803420662879944, "num_tokens": 12869.0, "repeat_count": 1.0, - "routers_loss": 0.49970296025276184, + "routers_loss": 0.48872503638267517, "skip_count": 2.0, "step": 8, "text_loss": 0.36678561568260193 @@ -88,37 +88,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 27.0, "epoch": 0.046962136777223364, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.953125, + "grad_norm": 1.78125, "learning_rate": 1.8e-05, - "loss": 0.5316, - "macro_f1": 0.19999998807907104, + "loss": 0.4806, + "macro_f1": 0.23255813121795654, "num_tokens": 15845.0, "repeat_count": 0.0, - "routers_loss": 0.5153562426567078, + "routers_loss": 0.45077216625213623, "skip_count": 0.0, "step": 10, "text_loss": 0.5597779154777527 }, { - "acc_repeat": 0.0, + "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, - "avg_layers": 34.0, + "avg_layers": 26.0, "epoch": 0.05635456413266804, - "f1_execute": 0.5714285373687744, - "f1_repeat": 0.0, - "f1_skip": 0.25, - "grad_norm": 1.6328125, + "f1_execute": 0.7179487347602844, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.20000000298023224, + "grad_norm": 1.5390625, "learning_rate": 2.2e-05, - "loss": 0.5051, - "macro_f1": 0.2738095223903656, + "loss": 0.4557, + "macro_f1": 0.40122103691101074, "num_tokens": 19353.0, "repeat_count": 2.0, - "routers_loss": 0.46214747428894043, + "routers_loss": 0.4130440056324005, "skip_count": 3.0, "step": 12, "text_loss": 0.2056603729724884 @@ -126,37 +126,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 27.0, "epoch": 0.06574699148811271, - "f1_execute": 0.5263157486915588, + "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 2.671875, + "grad_norm": 2.4375, "learning_rate": 2.6e-05, - "loss": 0.5653, - "macro_f1": 0.17543858289718628, + "loss": 0.5129, + "macro_f1": 0.23255813121795654, "num_tokens": 22675.0, "repeat_count": 0.0, - "routers_loss": 0.5300976634025574, + "routers_loss": 0.4582902193069458, "skip_count": 0.0, "step": 14, "text_loss": 0.32989829778671265 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 34.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 0.07513941884355738, - "f1_execute": 0.6153846383094788, + "f1_execute": 0.6829268336296082, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 1.8828125, + "f1_skip": 0.2222222238779068, + "grad_norm": 1.7421875, "learning_rate": 3e-05, - "loss": 0.5225, - "macro_f1": 0.20512822270393372, + "loss": 0.4729, + "macro_f1": 0.3017163574695587, "num_tokens": 26022.0, "repeat_count": 0.0, - "routers_loss": 0.473240464925766, + "routers_loss": 0.42910993099212646, "skip_count": 1.0, "step": 16, "text_loss": 0.1353905349969864 @@ -164,18 +164,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 38.0, + "avg_layers": 27.0, "epoch": 0.08453184619900206, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.7555555105209351, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.6015625, + "grad_norm": 1.4765625, "learning_rate": 3.4000000000000007e-05, - "loss": 0.4867, - "macro_f1": 0.19999998807907104, + "loss": 0.4274, + "macro_f1": 0.2518518567085266, "num_tokens": 29251.0, "repeat_count": 0.0, - "routers_loss": 0.4795944094657898, + "routers_loss": 0.3990713059902191, "skip_count": 0.0, "step": 18, "text_loss": 0.3806765377521515 @@ -183,18 +183,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 36.0, + "avg_layers": 26.0, "epoch": 0.09392427355444673, - "f1_execute": 0.6153846383094788, - "f1_repeat": 0.1538461595773697, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.2857142984867096, "f1_skip": 0.0, - "grad_norm": 1.3984375, + "grad_norm": 1.3125, "learning_rate": 3.8e-05, - "loss": 0.4718, - "macro_f1": 0.25641027092933655, + "loss": 0.4261, + "macro_f1": 0.3228803873062134, "num_tokens": 32545.0, "repeat_count": 1.0, - "routers_loss": 0.41872408986091614, + "routers_loss": 0.40146592259407043, "skip_count": 0.0, "step": 20, "text_loss": 0.25648367404937744 @@ -202,18 +202,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 26.0, "epoch": 0.1033167009098914, - "f1_execute": 0.6341463327407837, + "f1_execute": 0.7272727489471436, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.7734375, + "grad_norm": 1.625, "learning_rate": 4.2000000000000004e-05, - "loss": 0.4472, - "macro_f1": 0.21138212084770203, + "loss": 0.404, + "macro_f1": 0.24242424964904785, "num_tokens": 36560.0, "repeat_count": 0.0, - "routers_loss": 0.4152105450630188, + "routers_loss": 0.372715026140213, "skip_count": 0.0, "step": 22, "text_loss": 0.2799522578716278 @@ -221,18 +221,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 32.0, + "avg_layers": 27.0, "epoch": 0.11270912826533608, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.7555555105209351, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.8046875, + "grad_norm": 1.6328125, "learning_rate": 4.6e-05, - "loss": 0.4554, - "macro_f1": 0.19999998807907104, + "loss": 0.4218, + "macro_f1": 0.2518518567085266, "num_tokens": 39597.0, "repeat_count": 0.0, - "routers_loss": 0.47541096806526184, + "routers_loss": 0.4504941403865814, "skip_count": 0.0, "step": 24, "text_loss": 0.6635695695877075 @@ -240,18 +240,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 34.0, + "avg_layers": 27.0, "epoch": 0.12210155562078075, - "f1_execute": 0.7826087474822998, + "f1_execute": 0.8085106015205383, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.875, + "grad_norm": 1.7109375, "learning_rate": 5e-05, - "loss": 0.4182, - "macro_f1": 0.2608695924282074, + "loss": 0.3886, + "macro_f1": 0.26950353384017944, "num_tokens": 43080.0, "repeat_count": 0.0, - "routers_loss": 0.37319275736808777, + "routers_loss": 0.3498791456222534, "skip_count": 0.0, "step": 26, "text_loss": 0.7035041451454163 @@ -259,18 +259,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 0.13149398297622542, - "f1_execute": 0.7826087474822998, + "f1_execute": 0.8085106015205383, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.4375, + "grad_norm": 1.34375, "learning_rate": 5.4e-05, - "loss": 0.3991, - "macro_f1": 0.2608695924282074, + "loss": 0.3724, + "macro_f1": 0.26950353384017944, "num_tokens": 46406.0, "repeat_count": 0.0, - "routers_loss": 0.3604123294353485, + "routers_loss": 0.31265875697135925, "skip_count": 0.0, "step": 28, "text_loss": 0.6388277411460876 @@ -280,16 +280,16 @@ "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.1408864103316701, - "f1_execute": 0.8979591727256775, + "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.421875, + "grad_norm": 1.2578125, "learning_rate": 5.800000000000001e-05, - "loss": 0.3827, - "macro_f1": 0.2993197441101074, + "loss": 0.341, + "macro_f1": 0.2857142686843872, "num_tokens": 49966.0, "repeat_count": 0.0, - "routers_loss": 0.35880225896835327, + "routers_loss": 0.3200918138027191, "skip_count": 2.0, "step": 30, "text_loss": 0.17372547090053558 @@ -297,18 +297,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 24.0, + "avg_layers": 25.0, "epoch": 0.15027883768711475, - "f1_execute": 0.9200000166893005, + "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.4609375, + "grad_norm": 1.4140625, "learning_rate": 6.2e-05, - "loss": 0.3452, - "macro_f1": 0.30666667222976685, + "loss": 0.3207, + "macro_f1": 0.2857142686843872, "num_tokens": 53378.0, "repeat_count": 1.0, - "routers_loss": 0.31086465716362, + "routers_loss": 0.32304447889328003, "skip_count": 1.0, "step": 32, "text_loss": 0.18196581304073334 @@ -316,18 +316,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 25.0, "epoch": 0.15967126504255943, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.3671875, + "grad_norm": 1.46875, "learning_rate": 6.6e-05, - "loss": 0.3283, - "macro_f1": 0.3144654333591461, + "loss": 0.3304, + "macro_f1": 0.3006536364555359, "num_tokens": 56933.0, "repeat_count": 0.0, - "routers_loss": 0.2674171030521393, + "routers_loss": 0.24814388155937195, "skip_count": 0.0, "step": 34, "text_loss": 0.28823015093803406 @@ -335,18 +335,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 0.16906369239800412, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.1015625, + "grad_norm": 1.1171875, "learning_rate": 7.000000000000001e-05, - "loss": 0.2849, - "macro_f1": 0.3205128312110901, + "loss": 0.2778, + "macro_f1": 0.3006536066532135, "num_tokens": 60744.0, "repeat_count": 1.0, - "routers_loss": 0.24587315320968628, + "routers_loss": 0.22411039471626282, "skip_count": 0.0, "step": 36, "text_loss": 0.5260357856750488 @@ -354,18 +354,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 31.0, + "avg_layers": 27.0, "epoch": 0.17845611975344877, - "f1_execute": 0.8085106015205383, + "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.3046875, + "grad_norm": 1.484375, "learning_rate": 7.4e-05, - "loss": 0.2616, - "macro_f1": 0.26950353384017944, + "loss": 0.2738, + "macro_f1": 0.2857142984867096, "num_tokens": 64900.0, "repeat_count": 0.0, - "routers_loss": 0.32050269842147827, + "routers_loss": 0.44355395436286926, "skip_count": 0.0, "step": 38, "text_loss": 0.5382097363471985 @@ -373,18 +373,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 0.18784854710889345, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.1796875, + "grad_norm": 1.3828125, "learning_rate": 7.8e-05, - "loss": 0.2084, - "macro_f1": 0.3144654333591461, + "loss": 0.2137, + "macro_f1": 0.3076923191547394, "num_tokens": 68000.0, "repeat_count": 0.0, - "routers_loss": 0.15196125209331512, + "routers_loss": 0.202330082654953, "skip_count": 0.0, "step": 40, "text_loss": 0.5946118831634521 @@ -392,18 +392,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 25.0, "epoch": 0.19724097446433814, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.61328125, + "grad_norm": 0.78125, "learning_rate": 8.2e-05, - "loss": 0.1947, + "loss": 0.21, "macro_f1": 0.3144654333591461, "num_tokens": 70529.0, "repeat_count": 0.0, - "routers_loss": 0.14121046662330627, + "routers_loss": 0.18023855984210968, "skip_count": 0.0, "step": 42, "text_loss": 0.5550904273986816 @@ -416,13 +416,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.50390625, + "grad_norm": 0.609375, "learning_rate": 8.599999999999999e-05, - "loss": 0.1884, + "loss": 0.1918, "macro_f1": 0.32098764181137085, "num_tokens": 73427.0, "repeat_count": 2.0, - "routers_loss": 0.21312278509140015, + "routers_loss": 0.2101590931415558, "skip_count": 0.0, "step": 44, "text_loss": 0.4636923372745514 @@ -435,13 +435,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.45703125, + "grad_norm": 0.53125, "learning_rate": 8.999999999999999e-05, - "loss": 0.166, + "loss": 0.1881, "macro_f1": 0.3333333432674408, "num_tokens": 76472.0, "repeat_count": 0.0, - "routers_loss": 0.1184137836098671, + "routers_loss": 0.11800424009561539, "skip_count": 0.0, "step": 46, "text_loss": 0.4187001883983612 @@ -454,13 +454,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.62890625, + "grad_norm": 0.953125, "learning_rate": 9.400000000000001e-05, - "loss": 0.1313, + "loss": 0.1446, "macro_f1": 0.3272727429866791, "num_tokens": 79124.0, "repeat_count": 1.0, - "routers_loss": 0.10897563397884369, + "routers_loss": 0.11632519960403442, "skip_count": 0.0, "step": 48, "text_loss": 0.2253919243812561 @@ -468,18 +468,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 0.2348106838861168, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.4375, + "grad_norm": 0.58984375, "learning_rate": 9.800000000000001e-05, - "loss": 0.1531, - "macro_f1": 0.3272727429866791, + "loss": 0.1543, + "macro_f1": 0.32098767161369324, "num_tokens": 81980.0, "repeat_count": 1.0, - "routers_loss": 0.09979952871799469, + "routers_loss": 0.09669367223978043, "skip_count": 0.0, "step": 50, "text_loss": 0.6053179502487183 @@ -487,18 +487,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 0.2442031112415615, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.515625, + "grad_norm": 0.8515625, "learning_rate": 0.000102, - "loss": 0.1265, - "macro_f1": 0.3272727429866791, + "loss": 0.1393, + "macro_f1": 0.32098764181137085, "num_tokens": 85236.0, "repeat_count": 0.0, - "routers_loss": 0.05543195456266403, + "routers_loss": 0.12471720576286316, "skip_count": 0.0, "step": 52, "text_loss": 0.6027331948280334 @@ -511,13 +511,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.328125, + "grad_norm": 0.421875, "learning_rate": 0.000106, - "loss": 0.1436, + "loss": 0.1473, "macro_f1": 0.32098764181137085, "num_tokens": 88238.0, "repeat_count": 0.0, - "routers_loss": 0.15049344301223755, + "routers_loss": 0.1376056969165802, "skip_count": 2.0, "step": 54, "text_loss": 0.2861751616001129 @@ -530,13 +530,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.263671875, + "grad_norm": 0.35546875, "learning_rate": 0.00011, - "loss": 0.1021, + "loss": 0.1082, "macro_f1": 0.3333333432674408, "num_tokens": 91056.0, "repeat_count": 0.0, - "routers_loss": 0.07367338240146637, + "routers_loss": 0.07449393719434738, "skip_count": 0.0, "step": 56, "text_loss": 0.48106974363327026 @@ -544,18 +544,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 26.0, "epoch": 0.2723803933078955, - "f1_execute": 1.0, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25, + "grad_norm": 0.271484375, "learning_rate": 0.000114, - "loss": 0.114, - "macro_f1": 0.3333333432674408, + "loss": 0.1123, + "macro_f1": 0.32098764181137085, "num_tokens": 94987.0, "repeat_count": 0.0, - "routers_loss": 0.03782692551612854, + "routers_loss": 0.07064720243215561, "skip_count": 0.0, "step": 58, "text_loss": 0.3554874658584595 @@ -568,13 +568,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.333984375, + "grad_norm": 0.5390625, "learning_rate": 0.000118, - "loss": 0.1197, + "loss": 0.1234, "macro_f1": 0.32098764181137085, "num_tokens": 97909.0, "repeat_count": 0.0, - "routers_loss": 0.14074955880641937, + "routers_loss": 0.16835889220237732, "skip_count": 2.0, "step": 60, "text_loss": 0.5475804805755615 @@ -587,13 +587,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.2353515625, "learning_rate": 0.000122, - "loss": 0.1174, + "loss": 0.1224, "macro_f1": 0.3333333432674408, "num_tokens": 101043.0, "repeat_count": 0.0, - "routers_loss": 0.058013737201690674, + "routers_loss": 0.06127442046999931, "skip_count": 0.0, "step": 62, "text_loss": 0.5966938734054565 @@ -606,13 +606,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.212890625, "learning_rate": 0.000126, - "loss": 0.0911, + "loss": 0.0931, "macro_f1": 0.3333333432674408, "num_tokens": 104103.0, "repeat_count": 0.0, - "routers_loss": 0.04936821386218071, + "routers_loss": 0.047825805842876434, "skip_count": 0.0, "step": 64, "text_loss": 0.5480486750602722 @@ -625,13 +625,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.220703125, + "grad_norm": 0.2294921875, "learning_rate": 0.00013000000000000002, - "loss": 0.1107, + "loss": 0.1088, "macro_f1": 0.3006536364555359, "num_tokens": 107009.0, "repeat_count": 1.0, - "routers_loss": 0.2628525495529175, + "routers_loss": 0.275174081325531, "skip_count": 4.0, "step": 66, "text_loss": 0.41714492440223694 @@ -644,13 +644,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.1923828125, "learning_rate": 0.000134, - "loss": 0.1109, + "loss": 0.1123, "macro_f1": 0.3333333432674408, "num_tokens": 110486.0, "repeat_count": 0.0, - "routers_loss": 0.02859785594046116, + "routers_loss": 0.029025178402662277, "skip_count": 0.0, "step": 68, "text_loss": 0.6775627732276917 @@ -663,13 +663,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.298828125, + "grad_norm": 0.314453125, "learning_rate": 0.00013800000000000002, - "loss": 0.1067, + "loss": 0.1049, "macro_f1": 0.3272727429866791, "num_tokens": 113878.0, "repeat_count": 0.0, - "routers_loss": 0.10459086298942566, + "routers_loss": 0.10141710191965103, "skip_count": 1.0, "step": 70, "text_loss": 0.6678873896598816 @@ -682,13 +682,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2109375, + "grad_norm": 0.248046875, "learning_rate": 0.00014199999999999998, - "loss": 0.1166, + "loss": 0.1119, "macro_f1": 0.3272727429866791, "num_tokens": 116989.0, "repeat_count": 0.0, - "routers_loss": 0.0718551054596901, + "routers_loss": 0.08002066612243652, "skip_count": 1.0, "step": 72, "text_loss": 0.405692994594574 @@ -701,13 +701,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1787109375, "learning_rate": 0.000146, - "loss": 0.1007, + "loss": 0.0944, "macro_f1": 0.3144654333591461, "num_tokens": 119883.0, "repeat_count": 0.0, - "routers_loss": 0.1850946843624115, + "routers_loss": 0.1867009848356247, "skip_count": 3.0, "step": 74, "text_loss": 0.44616150856018066 @@ -720,13 +720,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.34375, + "grad_norm": 0.333984375, "learning_rate": 0.00015, - "loss": 0.1019, + "loss": 0.1003, "macro_f1": 0.32098764181137085, "num_tokens": 123325.0, "repeat_count": 0.0, - "routers_loss": 0.09809529036283493, + "routers_loss": 0.07042168825864792, "skip_count": 2.0, "step": 76, "text_loss": 0.11340200901031494 @@ -739,13 +739,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.259765625, + "grad_norm": 0.26171875, "learning_rate": 0.000154, - "loss": 0.1088, + "loss": 0.1066, "macro_f1": 0.32098764181137085, "num_tokens": 126131.0, "repeat_count": 0.0, - "routers_loss": 0.11277207732200623, + "routers_loss": 0.11535373330116272, "skip_count": 2.0, "step": 78, "text_loss": 0.3269135355949402 @@ -758,13 +758,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2412109375, + "grad_norm": 0.255859375, "learning_rate": 0.000158, - "loss": 0.0866, + "loss": 0.0891, "macro_f1": 0.3272727429866791, "num_tokens": 130349.0, "repeat_count": 0.0, - "routers_loss": 0.09079254418611526, + "routers_loss": 0.09497501701116562, "skip_count": 1.0, "step": 80, "text_loss": 0.15273472666740417 @@ -777,13 +777,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1943359375, "learning_rate": 0.000162, - "loss": 0.0928, + "loss": 0.0929, "macro_f1": 0.3333333432674408, "num_tokens": 133607.0, "repeat_count": 0.0, - "routers_loss": 0.02900076098740101, + "routers_loss": 0.030639523640275, "skip_count": 0.0, "step": 82, "text_loss": 0.282884806394577 @@ -796,13 +796,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.1806640625, "learning_rate": 0.00016600000000000002, - "loss": 0.1251, + "loss": 0.1254, "macro_f1": 0.3272727429866791, "num_tokens": 136694.0, "repeat_count": 0.0, - "routers_loss": 0.0763339251279831, + "routers_loss": 0.07906441390514374, "skip_count": 1.0, "step": 84, "text_loss": 0.459094375371933 @@ -817,11 +817,11 @@ "f1_skip": 0.0, "grad_norm": 0.212890625, "learning_rate": 0.00017, - "loss": 0.1064, + "loss": 0.1071, "macro_f1": 0.3144654333591461, "num_tokens": 139966.0, "repeat_count": 1.0, - "routers_loss": 0.13191410899162292, + "routers_loss": 0.1124570444226265, "skip_count": 2.0, "step": 86, "text_loss": 0.29985448718070984 @@ -834,13 +834,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.25390625, "learning_rate": 0.000174, - "loss": 0.1055, + "loss": 0.1031, "macro_f1": 0.32098764181137085, "num_tokens": 142788.0, "repeat_count": 2.0, - "routers_loss": 0.21200031042099, + "routers_loss": 0.1966402679681778, "skip_count": 0.0, "step": 88, "text_loss": 0.6435291767120361 @@ -853,13 +853,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.318359375, + "grad_norm": 0.349609375, "learning_rate": 0.000178, - "loss": 0.0971, + "loss": 0.0963, "macro_f1": 0.3333333432674408, "num_tokens": 146192.0, "repeat_count": 0.0, - "routers_loss": 0.031911369413137436, + "routers_loss": 0.0325632207095623, "skip_count": 0.0, "step": 90, "text_loss": 0.35170626640319824 @@ -872,13 +872,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.216796875, + "grad_norm": 0.2265625, "learning_rate": 0.000182, - "loss": 0.1056, + "loss": 0.1073, "macro_f1": 0.32098764181137085, "num_tokens": 149792.0, "repeat_count": 1.0, - "routers_loss": 0.14131835103034973, + "routers_loss": 0.15115146338939667, "skip_count": 1.0, "step": 92, "text_loss": 0.83159339427948 @@ -891,13 +891,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.205078125, "learning_rate": 0.000186, - "loss": 0.1059, + "loss": 0.1073, "macro_f1": 0.3333333432674408, "num_tokens": 152766.0, "repeat_count": 0.0, - "routers_loss": 0.04137955233454704, + "routers_loss": 0.043313540518283844, "skip_count": 0.0, "step": 94, "text_loss": 0.49707934260368347 @@ -910,13 +910,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.2138671875, "learning_rate": 0.00019, - "loss": 0.0934, + "loss": 0.0947, "macro_f1": 0.3333333432674408, "num_tokens": 156112.0, "repeat_count": 0.0, - "routers_loss": 0.03163003921508789, + "routers_loss": 0.032021280378103256, "skip_count": 0.0, "step": 96, "text_loss": 0.27608928084373474 @@ -929,13 +929,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.2099609375, "learning_rate": 0.000194, - "loss": 0.0847, + "loss": 0.0846, "macro_f1": 0.3076923191547394, "num_tokens": 159454.0, "repeat_count": 2.0, - "routers_loss": 0.2567490339279175, + "routers_loss": 0.24473154544830322, "skip_count": 2.0, "step": 98, "text_loss": 0.6026689410209656 @@ -948,13 +948,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.30859375, + "grad_norm": 0.271484375, "learning_rate": 0.00019800000000000002, - "loss": 0.1077, + "loss": 0.1028, "macro_f1": 0.32098764181137085, "num_tokens": 163661.0, "repeat_count": 0.0, - "routers_loss": 0.11468870937824249, + "routers_loss": 0.11468276381492615, "skip_count": 2.0, "step": 100, "text_loss": 0.46733155846595764 @@ -967,13 +967,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.17578125, + "grad_norm": 0.1806640625, "learning_rate": 0.000202, - "loss": 0.1131, + "loss": 0.1089, "macro_f1": 0.3333333432674408, "num_tokens": 167134.0, "repeat_count": 0.0, - "routers_loss": 0.02124219387769699, + "routers_loss": 0.021144939586520195, "skip_count": 0.0, "step": 102, "text_loss": 0.6362994909286499 @@ -986,13 +986,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.1943359375, "learning_rate": 0.000206, - "loss": 0.0624, + "loss": 0.0621, "macro_f1": 0.3272727429866791, "num_tokens": 170433.0, "repeat_count": 0.0, - "routers_loss": 0.06983796507120132, + "routers_loss": 0.06594710797071457, "skip_count": 1.0, "step": 104, "text_loss": 0.4515477120876312 @@ -1005,13 +1005,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.1591796875, "learning_rate": 0.00021, - "loss": 0.0951, + "loss": 0.0929, "macro_f1": 0.3333333432674408, "num_tokens": 173387.0, "repeat_count": 0.0, - "routers_loss": 0.03467355668544769, + "routers_loss": 0.032923027873039246, "skip_count": 0.0, "step": 106, "text_loss": 0.6638453006744385 @@ -1024,13 +1024,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.240234375, "learning_rate": 0.000214, - "loss": 0.0881, + "loss": 0.0883, "macro_f1": 0.3272727429866791, "num_tokens": 176170.0, "repeat_count": 1.0, - "routers_loss": 0.08142061531543732, + "routers_loss": 0.08034781366586685, "skip_count": 0.0, "step": 108, "text_loss": 1.186936855316162 @@ -1043,13 +1043,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.267578125, "learning_rate": 0.000218, - "loss": 0.0795, + "loss": 0.0794, "macro_f1": 0.3272727429866791, "num_tokens": 179877.0, "repeat_count": 0.0, - "routers_loss": 0.08327355235815048, + "routers_loss": 0.07814185321331024, "skip_count": 1.0, "step": 110, "text_loss": 0.5488709211349487 @@ -1062,13 +1062,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.2353515625, "learning_rate": 0.000222, - "loss": 0.0943, + "loss": 0.0946, "macro_f1": 0.3333333432674408, "num_tokens": 182726.0, "repeat_count": 0.0, - "routers_loss": 0.019890006631612778, + "routers_loss": 0.01884695515036583, "skip_count": 0.0, "step": 112, "text_loss": 0.5195863842964172 @@ -1081,13 +1081,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2001953125, + "grad_norm": 0.19921875, "learning_rate": 0.00022600000000000002, - "loss": 0.0933, + "loss": 0.0974, "macro_f1": 0.32098764181137085, "num_tokens": 185624.0, "repeat_count": 0.0, - "routers_loss": 0.09992363303899765, + "routers_loss": 0.09657823294401169, "skip_count": 2.0, "step": 114, "text_loss": 0.43858134746551514 @@ -1100,13 +1100,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.3046875, "learning_rate": 0.00023, - "loss": 0.0762, + "loss": 0.0753, "macro_f1": 0.3333333432674408, "num_tokens": 188155.0, "repeat_count": 0.0, - "routers_loss": 0.014119029976427555, + "routers_loss": 0.01463601179420948, "skip_count": 0.0, "step": 116, "text_loss": 0.392981618642807 @@ -1119,13 +1119,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.423828125, + "grad_norm": 0.439453125, "learning_rate": 0.00023400000000000002, - "loss": 0.0842, + "loss": 0.0843, "macro_f1": 0.3333333432674408, "num_tokens": 190970.0, "repeat_count": 0.0, - "routers_loss": 0.03976766765117645, + "routers_loss": 0.03859659656882286, "skip_count": 0.0, "step": 118, "text_loss": 0.309179425239563 @@ -1138,13 +1138,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.2255859375, "learning_rate": 0.00023799999999999998, - "loss": 0.0517, + "loss": 0.053, "macro_f1": 0.3333333432674408, "num_tokens": 193988.0, "repeat_count": 0.0, - "routers_loss": 0.017428619787096977, + "routers_loss": 0.019092386588454247, "skip_count": 0.0, "step": 120, "text_loss": 0.48543134331703186 @@ -1157,13 +1157,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.296875, + "grad_norm": 0.35546875, "learning_rate": 0.000242, - "loss": 0.1134, + "loss": 0.1203, "macro_f1": 0.3272727429866791, "num_tokens": 196475.0, "repeat_count": 0.0, - "routers_loss": 0.06965513527393341, + "routers_loss": 0.0619138665497303, "skip_count": 1.0, "step": 122, "text_loss": 0.4615364074707031 @@ -1176,13 +1176,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1875, "learning_rate": 0.000246, - "loss": 0.0984, + "loss": 0.1002, "macro_f1": 0.3272727429866791, "num_tokens": 200045.0, "repeat_count": 1.0, - "routers_loss": 0.10476501286029816, + "routers_loss": 0.09752107411623001, "skip_count": 0.0, "step": 124, "text_loss": 0.15802054107189178 @@ -1195,13 +1195,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.1728515625, "learning_rate": 0.00025, - "loss": 0.0771, + "loss": 0.0773, "macro_f1": 0.3333333432674408, "num_tokens": 203214.0, "repeat_count": 0.0, - "routers_loss": 0.028317544609308243, + "routers_loss": 0.02896115928888321, "skip_count": 0.0, "step": 126, "text_loss": 0.4543360471725464 @@ -1214,13 +1214,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.390625, + "grad_norm": 0.4296875, "learning_rate": 0.000254, - "loss": 0.0933, + "loss": 0.0973, "macro_f1": 0.3333333432674408, "num_tokens": 206168.0, "repeat_count": 0.0, - "routers_loss": 0.012766432017087936, + "routers_loss": 0.011423567309975624, "skip_count": 0.0, "step": 128, "text_loss": 0.4730179011821747 @@ -1233,13 +1233,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.353515625, + "grad_norm": 0.365234375, "learning_rate": 0.00025800000000000004, - "loss": 0.0989, + "loss": 0.099, "macro_f1": 0.3333333432674408, "num_tokens": 209907.0, "repeat_count": 0.0, - "routers_loss": 0.021400077268481255, + "routers_loss": 0.01957600563764572, "skip_count": 0.0, "step": 130, "text_loss": 0.45122358202934265 @@ -1252,13 +1252,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.2060546875, "learning_rate": 0.000262, - "loss": 0.0873, + "loss": 0.0868, "macro_f1": 0.3272727429866791, "num_tokens": 213521.0, "repeat_count": 0.0, - "routers_loss": 0.05025051161646843, + "routers_loss": 0.04882373288273811, "skip_count": 1.0, "step": 132, "text_loss": 0.4341491758823395 @@ -1271,13 +1271,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1611328125, + "grad_norm": 0.1708984375, "learning_rate": 0.000266, - "loss": 0.085, + "loss": 0.0834, "macro_f1": 0.3333333432674408, "num_tokens": 216484.0, "repeat_count": 0.0, - "routers_loss": 0.017420046031475067, + "routers_loss": 0.016083380207419395, "skip_count": 0.0, "step": 134, "text_loss": 0.46990111470222473 @@ -1290,13 +1290,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2041015625, + "grad_norm": 0.220703125, "learning_rate": 0.00027, - "loss": 0.086, + "loss": 0.0863, "macro_f1": 0.3333333432674408, "num_tokens": 219398.0, "repeat_count": 0.0, - "routers_loss": 0.018217921257019043, + "routers_loss": 0.01733536459505558, "skip_count": 0.0, "step": 136, "text_loss": 0.4455361068248749 @@ -1309,13 +1309,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1806640625, "learning_rate": 0.00027400000000000005, - "loss": 0.0985, + "loss": 0.0997, "macro_f1": 0.3333333432674408, "num_tokens": 222430.0, "repeat_count": 0.0, - "routers_loss": 0.012350660748779774, + "routers_loss": 0.01332803163677454, "skip_count": 0.0, "step": 138, "text_loss": 0.47699397802352905 @@ -1328,13 +1328,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.302734375, + "grad_norm": 0.333984375, "learning_rate": 0.00027800000000000004, "loss": 0.0922, "macro_f1": 0.3144654333591461, "num_tokens": 225458.0, "repeat_count": 1.0, - "routers_loss": 0.14993029832839966, + "routers_loss": 0.14924728870391846, "skip_count": 2.0, "step": 140, "text_loss": 0.5858222842216492 @@ -1347,13 +1347,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.251953125, + "grad_norm": 0.25, "learning_rate": 0.00028199999999999997, - "loss": 0.0791, + "loss": 0.0798, "macro_f1": 0.3144654333591461, "num_tokens": 229365.0, "repeat_count": 1.0, - "routers_loss": 0.17921413481235504, + "routers_loss": 0.1860177218914032, "skip_count": 2.0, "step": 142, "text_loss": 0.5003137588500977 @@ -1366,13 +1366,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.2294921875, "learning_rate": 0.00028599999999999996, - "loss": 0.0535, + "loss": 0.054, "macro_f1": 0.32098764181137085, "num_tokens": 231787.0, "repeat_count": 1.0, - "routers_loss": 0.1420905590057373, + "routers_loss": 0.16498211026191711, "skip_count": 1.0, "step": 144, "text_loss": 0.5026470422744751 @@ -1385,13 +1385,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.29296875, + "grad_norm": 0.306640625, "learning_rate": 0.00029, - "loss": 0.0956, + "loss": 0.0936, "macro_f1": 0.32098764181137085, "num_tokens": 235014.0, "repeat_count": 1.0, - "routers_loss": 0.12468750029802322, + "routers_loss": 0.11801310628652573, "skip_count": 1.0, "step": 146, "text_loss": 0.611888587474823 @@ -1404,13 +1404,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1806640625, "learning_rate": 0.000294, - "loss": 0.0879, + "loss": 0.0878, "macro_f1": 0.3333333432674408, "num_tokens": 238210.0, "repeat_count": 0.0, - "routers_loss": 0.024295611307024956, + "routers_loss": 0.02422776259481907, "skip_count": 0.0, "step": 148, "text_loss": 0.2876914143562317 @@ -1423,13 +1423,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.1728515625, "learning_rate": 0.000298, - "loss": 0.087, + "loss": 0.0858, "macro_f1": 0.32098764181137085, "num_tokens": 241582.0, "repeat_count": 0.0, - "routers_loss": 0.07016433775424957, + "routers_loss": 0.07282499223947525, "skip_count": 2.0, "step": 150, "text_loss": 0.3919292390346527 @@ -1442,13 +1442,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3828125, + "grad_norm": 0.37890625, "learning_rate": 0.000302, - "loss": 0.0782, + "loss": 0.0797, "macro_f1": 0.32098764181137085, "num_tokens": 244621.0, "repeat_count": 1.0, - "routers_loss": 0.18942493200302124, + "routers_loss": 0.20659038424491882, "skip_count": 1.0, "step": 152, "text_loss": 0.4294498860836029 @@ -1461,13 +1461,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1787109375, "learning_rate": 0.000306, - "loss": 0.0713, + "loss": 0.072, "macro_f1": 0.3333333432674408, "num_tokens": 247833.0, "repeat_count": 0.0, - "routers_loss": 0.02319060079753399, + "routers_loss": 0.02428400330245495, "skip_count": 0.0, "step": 154, "text_loss": 0.5930765867233276 @@ -1480,13 +1480,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.15234375, + "grad_norm": 0.1533203125, "learning_rate": 0.00031, - "loss": 0.0778, + "loss": 0.0772, "macro_f1": 0.3333333432674408, "num_tokens": 251349.0, "repeat_count": 0.0, - "routers_loss": 0.01764747127890587, + "routers_loss": 0.0167869683355093, "skip_count": 0.0, "step": 156, "text_loss": 0.41063904762268066 @@ -1499,13 +1499,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1572265625, "learning_rate": 0.000314, - "loss": 0.0829, + "loss": 0.0821, "macro_f1": 0.3333333432674408, "num_tokens": 254886.0, "repeat_count": 0.0, - "routers_loss": 0.02268100716173649, + "routers_loss": 0.02531604655086994, "skip_count": 0.0, "step": 158, "text_loss": 0.6739020347595215 @@ -1518,13 +1518,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.201171875, "learning_rate": 0.00031800000000000003, - "loss": 0.0889, + "loss": 0.09, "macro_f1": 0.3333333432674408, "num_tokens": 258260.0, "repeat_count": 0.0, - "routers_loss": 0.016952091827988625, + "routers_loss": 0.017772775143384933, "skip_count": 0.0, "step": 160, "text_loss": 0.46873849630355835 @@ -1537,13 +1537,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2216796875, + "grad_norm": 0.224609375, "learning_rate": 0.000322, - "loss": 0.0923, + "loss": 0.0893, "macro_f1": 0.3272727429866791, "num_tokens": 261846.0, "repeat_count": 0.0, - "routers_loss": 0.03669808804988861, + "routers_loss": 0.034902360290288925, "skip_count": 1.0, "step": 162, "text_loss": 0.3727971017360687 @@ -1556,13 +1556,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.212890625, "learning_rate": 0.000326, - "loss": 0.0769, + "loss": 0.076, "macro_f1": 0.3333333432674408, "num_tokens": 264348.0, "repeat_count": 0.0, - "routers_loss": 0.012101447209715843, + "routers_loss": 0.013553355820477009, "skip_count": 0.0, "step": 164, "text_loss": 0.5798237323760986 @@ -1575,13 +1575,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.37109375, + "grad_norm": 0.408203125, "learning_rate": 0.00033, - "loss": 0.0897, + "loss": 0.0926, "macro_f1": 0.32098764181137085, "num_tokens": 267479.0, "repeat_count": 1.0, - "routers_loss": 0.1562056541442871, + "routers_loss": 0.13571743667125702, "skip_count": 1.0, "step": 166, "text_loss": 0.8084776997566223 @@ -1594,13 +1594,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.2431640625, "learning_rate": 0.00033400000000000004, - "loss": 0.0829, + "loss": 0.0817, "macro_f1": 0.32098764181137085, "num_tokens": 270268.0, "repeat_count": 2.0, - "routers_loss": 0.20807914435863495, + "routers_loss": 0.19884146749973297, "skip_count": 0.0, "step": 168, "text_loss": 0.7366134524345398 @@ -1613,13 +1613,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2236328125, + "grad_norm": 0.267578125, "learning_rate": 0.00033800000000000003, - "loss": 0.0987, + "loss": 0.1022, "macro_f1": 0.32098764181137085, "num_tokens": 273518.0, "repeat_count": 1.0, - "routers_loss": 0.1530539095401764, + "routers_loss": 0.15469175577163696, "skip_count": 1.0, "step": 170, "text_loss": 0.27204006910324097 @@ -1632,13 +1632,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.17578125, "learning_rate": 0.000342, - "loss": 0.087, + "loss": 0.0865, "macro_f1": 0.32098764181137085, "num_tokens": 277210.0, "repeat_count": 0.0, - "routers_loss": 0.08004544675350189, + "routers_loss": 0.08603330701589584, "skip_count": 2.0, "step": 172, "text_loss": 0.7137667536735535 @@ -1651,13 +1651,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1767578125, + "grad_norm": 0.189453125, "learning_rate": 0.000346, - "loss": 0.0916, + "loss": 0.0902, "macro_f1": 0.3076923191547394, "num_tokens": 280389.0, "repeat_count": 0.0, - "routers_loss": 0.19228078424930573, + "routers_loss": 0.17851492762565613, "skip_count": 4.0, "step": 174, "text_loss": 0.5148105621337891 @@ -1670,13 +1670,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1611328125, + "grad_norm": 0.1494140625, "learning_rate": 0.00035, - "loss": 0.0863, + "loss": 0.0853, "macro_f1": 0.3333333432674408, "num_tokens": 283501.0, "repeat_count": 0.0, - "routers_loss": 0.024507170543074608, + "routers_loss": 0.021331604570150375, "skip_count": 0.0, "step": 176, "text_loss": 0.301013320684433 @@ -1689,13 +1689,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.2158203125, "learning_rate": 0.000354, - "loss": 0.0898, + "loss": 0.0911, "macro_f1": 0.32098764181137085, "num_tokens": 287154.0, "repeat_count": 0.0, - "routers_loss": 0.05055495724081993, + "routers_loss": 0.057273946702480316, "skip_count": 2.0, "step": 178, "text_loss": 0.4740981459617615 @@ -1708,13 +1708,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.240234375, "learning_rate": 0.000358, - "loss": 0.0865, + "loss": 0.0904, "macro_f1": 0.3272727429866791, "num_tokens": 289929.0, "repeat_count": 0.0, - "routers_loss": 0.03999815881252289, + "routers_loss": 0.04116598889231682, "skip_count": 1.0, "step": 180, "text_loss": 0.4838573932647705 @@ -1727,13 +1727,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.14453125, "learning_rate": 0.000362, - "loss": 0.0983, + "loss": 0.0991, "macro_f1": 0.3333333432674408, "num_tokens": 294293.0, "repeat_count": 0.0, - "routers_loss": 0.025158070027828217, + "routers_loss": 0.027111956849694252, "skip_count": 0.0, "step": 182, "text_loss": 0.7495553493499756 @@ -1746,32 +1746,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.158203125, "learning_rate": 0.000366, - "loss": 0.1015, + "loss": 0.1038, "macro_f1": 0.3333333432674408, "num_tokens": 297730.0, "repeat_count": 0.0, - "routers_loss": 0.01825365424156189, + "routers_loss": 0.019166452810168266, "skip_count": 0.0, "step": 184, "text_loss": 0.534831166267395 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 0.8734957440563546, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.2158203125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2236328125, "learning_rate": 0.00037, - "loss": 0.0736, - "macro_f1": 0.3144654333591461, + "loss": 0.0784, + "macro_f1": 0.5427350401878357, "num_tokens": 300593.0, "repeat_count": 1.0, - "routers_loss": 0.22729666531085968, + "routers_loss": 0.2349659502506256, "skip_count": 2.0, "step": 186, "text_loss": 0.3549048602581024 @@ -1784,13 +1784,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.2041015625, "learning_rate": 0.000374, - "loss": 0.0838, + "loss": 0.0827, "macro_f1": 0.3076923191547394, "num_tokens": 303456.0, "repeat_count": 2.0, - "routers_loss": 0.24516475200653076, + "routers_loss": 0.22502389550209045, "skip_count": 2.0, "step": 188, "text_loss": 0.8837642073631287 @@ -1803,13 +1803,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2470703125, + "grad_norm": 0.271484375, "learning_rate": 0.000378, - "loss": 0.1056, + "loss": 0.1085, "macro_f1": 0.3272727429866791, "num_tokens": 306241.0, "repeat_count": 1.0, - "routers_loss": 0.1307530701160431, + "routers_loss": 0.12291611731052399, "skip_count": 0.0, "step": 190, "text_loss": 0.73353511095047 @@ -1822,13 +1822,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.15625, "learning_rate": 0.000382, - "loss": 0.0961, + "loss": 0.0969, "macro_f1": 0.3272727429866791, "num_tokens": 310606.0, "repeat_count": 0.0, - "routers_loss": 0.06541688740253448, + "routers_loss": 0.055988848209381104, "skip_count": 1.0, "step": 192, "text_loss": 0.6261917352676392 @@ -1841,13 +1841,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.333984375, + "grad_norm": 0.34375, "learning_rate": 0.000386, - "loss": 0.1058, + "loss": 0.1055, "macro_f1": 0.3144654333591461, "num_tokens": 313564.0, "repeat_count": 0.0, - "routers_loss": 0.12492545694112778, + "routers_loss": 0.12363404780626297, "skip_count": 3.0, "step": 194, "text_loss": 0.2790874242782593 @@ -1860,13 +1860,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28515625, + "grad_norm": 0.27734375, "learning_rate": 0.00039000000000000005, - "loss": 0.0966, + "loss": 0.0964, "macro_f1": 0.3076923191547394, "num_tokens": 316958.0, "repeat_count": 2.0, - "routers_loss": 0.2838033139705658, + "routers_loss": 0.2718356251716614, "skip_count": 2.0, "step": 196, "text_loss": 0.14428086578845978 @@ -1881,11 +1881,11 @@ "f1_skip": 0.0, "grad_norm": 0.2021484375, "learning_rate": 0.00039400000000000004, - "loss": 0.0929, + "loss": 0.0917, "macro_f1": 0.32098764181137085, "num_tokens": 320103.0, "repeat_count": 0.0, - "routers_loss": 0.07692629098892212, + "routers_loss": 0.07188102602958679, "skip_count": 2.0, "step": 198, "text_loss": 0.27155816555023193 @@ -1898,13 +1898,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.201171875, "learning_rate": 0.000398, "loss": 0.0809, "macro_f1": 0.32098764181137085, "num_tokens": 323566.0, "repeat_count": 1.0, - "routers_loss": 0.18504399061203003, + "routers_loss": 0.18038256466388702, "skip_count": 1.0, "step": 200, "text_loss": 0.8453494310379028 @@ -1917,13 +1917,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.2490234375, "learning_rate": 0.000402, - "loss": 0.078, + "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 326385.0, "repeat_count": 0.0, - "routers_loss": 0.014647359028458595, + "routers_loss": 0.014639763161540031, "skip_count": 0.0, "step": 202, "text_loss": 0.5733131766319275 @@ -1936,13 +1936,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2041015625, + "grad_norm": 0.21875, "learning_rate": 0.00040600000000000006, - "loss": 0.1028, + "loss": 0.104, "macro_f1": 0.3333333432674408, "num_tokens": 329266.0, "repeat_count": 0.0, - "routers_loss": 0.017848484218120575, + "routers_loss": 0.015269627794623375, "skip_count": 0.0, "step": 204, "text_loss": 0.7355639934539795 @@ -1955,13 +1955,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.27734375, "learning_rate": 0.00041, - "loss": 0.0832, + "loss": 0.0833, "macro_f1": 0.3333333432674408, "num_tokens": 332984.0, "repeat_count": 0.0, - "routers_loss": 0.01900508813560009, + "routers_loss": 0.018046971410512924, "skip_count": 0.0, "step": 206, "text_loss": 0.587641179561615 @@ -1974,13 +1974,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.166015625, + "grad_norm": 0.185546875, "learning_rate": 0.000414, "loss": 0.0588, "macro_f1": 0.3272727429866791, "num_tokens": 335739.0, "repeat_count": 1.0, - "routers_loss": 0.13018715381622314, + "routers_loss": 0.12791286408901215, "skip_count": 0.0, "step": 208, "text_loss": 0.6538406610488892 @@ -1993,13 +1993,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.24609375, "learning_rate": 0.00041799999999999997, - "loss": 0.0697, + "loss": 0.0732, "macro_f1": 0.3272727429866791, "num_tokens": 338966.0, "repeat_count": 0.0, - "routers_loss": 0.055288366973400116, + "routers_loss": 0.050490595400333405, "skip_count": 1.0, "step": 210, "text_loss": 0.4188295602798462 @@ -2012,13 +2012,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.271484375, "learning_rate": 0.000422, - "loss": 0.0576, + "loss": 0.0588, "macro_f1": 0.3144654333591461, "num_tokens": 342063.0, "repeat_count": 0.0, - "routers_loss": 0.10952572524547577, + "routers_loss": 0.11652113497257233, "skip_count": 3.0, "step": 212, "text_loss": 0.21822240948677063 @@ -2031,13 +2031,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.197265625, + "grad_norm": 0.2060546875, "learning_rate": 0.000426, - "loss": 0.062, + "loss": 0.0621, "macro_f1": 0.3333333432674408, "num_tokens": 344887.0, "repeat_count": 0.0, - "routers_loss": 0.02415696159005165, + "routers_loss": 0.023898238316178322, "skip_count": 0.0, "step": 214, "text_loss": 0.24692800641059875 @@ -2050,13 +2050,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.353515625, + "grad_norm": 0.3671875, "learning_rate": 0.00043, - "loss": 0.1011, + "loss": 0.1005, "macro_f1": 0.3272727429866791, "num_tokens": 348700.0, "repeat_count": 1.0, - "routers_loss": 0.06956391036510468, + "routers_loss": 0.06414655596017838, "skip_count": 0.0, "step": 216, "text_loss": 0.4744548797607422 @@ -2069,13 +2069,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1904296875, + "grad_norm": 0.1962890625, "learning_rate": 0.00043400000000000003, - "loss": 0.076, + "loss": 0.0753, "macro_f1": 0.32098764181137085, "num_tokens": 351507.0, "repeat_count": 1.0, - "routers_loss": 0.1140352189540863, + "routers_loss": 0.11702914535999298, "skip_count": 1.0, "step": 218, "text_loss": 0.5614864826202393 @@ -2090,11 +2090,11 @@ "f1_skip": 0.0, "grad_norm": 0.189453125, "learning_rate": 0.000438, - "loss": 0.0788, + "loss": 0.0792, "macro_f1": 0.3333333432674408, "num_tokens": 354484.0, "repeat_count": 0.0, - "routers_loss": 0.011621571145951748, + "routers_loss": 0.014991643838584423, "skip_count": 0.0, "step": 220, "text_loss": 0.47209832072257996 @@ -2107,13 +2107,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.240234375, + "grad_norm": 0.251953125, "learning_rate": 0.000442, "loss": 0.106, "macro_f1": 0.3272727429866791, "num_tokens": 357954.0, "repeat_count": 0.0, - "routers_loss": 0.05813701078295708, + "routers_loss": 0.04747112840414047, "skip_count": 1.0, "step": 222, "text_loss": 0.2968728244304657 @@ -2126,13 +2126,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.357421875, + "grad_norm": 0.40234375, "learning_rate": 0.000446, - "loss": 0.0827, + "loss": 0.0853, "macro_f1": 0.32098764181137085, "num_tokens": 360547.0, "repeat_count": 0.0, - "routers_loss": 0.0646885335445404, + "routers_loss": 0.06754162162542343, "skip_count": 2.0, "step": 224, "text_loss": 0.2364148646593094 @@ -2145,13 +2145,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.244140625, + "grad_norm": 0.2412109375, "learning_rate": 0.00045000000000000004, - "loss": 0.1011, + "loss": 0.1016, "macro_f1": 0.3272727429866791, "num_tokens": 364529.0, "repeat_count": 0.0, - "routers_loss": 0.07224348932504654, + "routers_loss": 0.07830183953046799, "skip_count": 1.0, "step": 226, "text_loss": 0.4787476360797882 @@ -2164,13 +2164,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.1953125, "learning_rate": 0.00045400000000000003, - "loss": 0.0781, + "loss": 0.0792, "macro_f1": 0.3333333432674408, "num_tokens": 367683.0, "repeat_count": 0.0, - "routers_loss": 0.015971746295690536, + "routers_loss": 0.015735948458313942, "skip_count": 0.0, "step": 228, "text_loss": 0.37148505449295044 @@ -2183,13 +2183,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.25, "learning_rate": 0.000458, - "loss": 0.099, + "loss": 0.0995, "macro_f1": 0.3333333432674408, "num_tokens": 371402.0, "repeat_count": 0.0, - "routers_loss": 0.017818331718444824, + "routers_loss": 0.013354359194636345, "skip_count": 0.0, "step": 230, "text_loss": 0.7464763522148132 @@ -2202,13 +2202,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1494140625, "learning_rate": 0.000462, - "loss": 0.0757, + "loss": 0.0731, "macro_f1": 0.3333333432674408, "num_tokens": 374587.0, "repeat_count": 0.0, - "routers_loss": 0.01582280732691288, + "routers_loss": 0.013763721100986004, "skip_count": 0.0, "step": 232, "text_loss": 0.8754443526268005 @@ -2221,13 +2221,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.42578125, + "grad_norm": 0.3984375, "learning_rate": 0.00046600000000000005, - "loss": 0.0876, + "loss": 0.0861, "macro_f1": 0.3333333432674408, "num_tokens": 377513.0, "repeat_count": 0.0, - "routers_loss": 0.011417915113270283, + "routers_loss": 0.010075435042381287, "skip_count": 0.0, "step": 234, "text_loss": 0.31534913182258606 @@ -2240,13 +2240,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.17578125, "learning_rate": 0.00047, - "loss": 0.0801, + "loss": 0.0791, "macro_f1": 0.3272727429866791, "num_tokens": 380736.0, "repeat_count": 0.0, - "routers_loss": 0.05787832289934158, + "routers_loss": 0.059825167059898376, "skip_count": 1.0, "step": 236, "text_loss": 0.5936337113380432 @@ -2259,13 +2259,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.236328125, + "grad_norm": 0.267578125, "learning_rate": 0.000474, - "loss": 0.0508, + "loss": 0.0514, "macro_f1": 0.32098764181137085, "num_tokens": 383236.0, "repeat_count": 0.0, - "routers_loss": 0.09476690739393234, + "routers_loss": 0.09134846180677414, "skip_count": 2.0, "step": 238, "text_loss": 0.5976157784461975 @@ -2278,13 +2278,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.208984375, "learning_rate": 0.00047799999999999996, - "loss": 0.0833, + "loss": 0.0858, "macro_f1": 0.32098764181137085, "num_tokens": 385778.0, "repeat_count": 1.0, - "routers_loss": 0.1099705696105957, + "routers_loss": 0.11989791691303253, "skip_count": 1.0, "step": 240, "text_loss": 0.3554210960865021 @@ -2297,13 +2297,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.171875, "learning_rate": 0.000482, - "loss": 0.0745, + "loss": 0.0734, "macro_f1": 0.3333333432674408, "num_tokens": 388777.0, "repeat_count": 0.0, - "routers_loss": 0.01269970741122961, + "routers_loss": 0.013591105118393898, "skip_count": 0.0, "step": 242, "text_loss": 0.4829460382461548 @@ -2316,13 +2316,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11962890625, + "grad_norm": 0.12060546875, "learning_rate": 0.000486, - "loss": 0.061, + "loss": 0.0625, "macro_f1": 0.32098764181137085, "num_tokens": 391797.0, "repeat_count": 0.0, - "routers_loss": 0.08505752682685852, + "routers_loss": 0.0920003354549408, "skip_count": 2.0, "step": 244, "text_loss": 0.3085818886756897 @@ -2335,13 +2335,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1552734375, "learning_rate": 0.00049, - "loss": 0.0504, + "loss": 0.0501, "macro_f1": 0.3333333432674408, "num_tokens": 396485.0, "repeat_count": 0.0, - "routers_loss": 0.012750142253935337, + "routers_loss": 0.0129330949857831, "skip_count": 0.0, "step": 246, "text_loss": 0.42803969979286194 @@ -2354,13 +2354,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.291015625, + "grad_norm": 0.296875, "learning_rate": 0.000494, - "loss": 0.0962, + "loss": 0.0945, "macro_f1": 0.3144654333591461, "num_tokens": 399923.0, "repeat_count": 0.0, - "routers_loss": 0.11287309974431992, + "routers_loss": 0.10677755624055862, "skip_count": 3.0, "step": 248, "text_loss": 0.2908555567264557 @@ -2373,32 +2373,32 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.203125, "learning_rate": 0.000498, - "loss": 0.0821, + "loss": 0.0812, "macro_f1": 0.3144654333591461, "num_tokens": 403647.0, "repeat_count": 0.0, - "routers_loss": 0.1486474722623825, + "routers_loss": 0.1504337340593338, "skip_count": 3.0, "step": 250, "text_loss": 0.333095908164978 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 1.183152333431171, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, + "f1_skip": 0.0, "grad_norm": 0.22265625, "learning_rate": 0.0005020000000000001, - "loss": 0.0832, - "macro_f1": 0.5492662787437439, + "loss": 0.0828, + "macro_f1": 0.32098764181137085, "num_tokens": 409147.0, "repeat_count": 0.0, - "routers_loss": 0.06636594980955124, + "routers_loss": 0.06503184884786606, "skip_count": 2.0, "step": 252, "text_loss": 0.16117942333221436 @@ -2411,13 +2411,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.267578125, + "grad_norm": 0.287109375, "learning_rate": 0.000506, - "loss": 0.1, + "loss": 0.0995, "macro_f1": 0.3333333432674408, "num_tokens": 412072.0, "repeat_count": 0.0, - "routers_loss": 0.015062150545418262, + "routers_loss": 0.016280122101306915, "skip_count": 0.0, "step": 254, "text_loss": 0.4217492640018463 @@ -2430,13 +2430,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2138671875, + "grad_norm": 0.21484375, "learning_rate": 0.00051, - "loss": 0.0808, + "loss": 0.0803, "macro_f1": 0.3144654333591461, "num_tokens": 415052.0, "repeat_count": 2.0, - "routers_loss": 0.2051105946302414, + "routers_loss": 0.2117508500814438, "skip_count": 1.0, "step": 256, "text_loss": 0.5795308947563171 @@ -2449,13 +2449,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2412109375, + "grad_norm": 0.2421875, "learning_rate": 0.000514, - "loss": 0.068, + "loss": 0.0668, "macro_f1": 0.3272727429866791, "num_tokens": 418099.0, "repeat_count": 1.0, - "routers_loss": 0.1467045396566391, + "routers_loss": 0.15002092719078064, "skip_count": 0.0, "step": 258, "text_loss": 0.4840938448905945 @@ -2468,13 +2468,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1533203125, "learning_rate": 0.000518, - "loss": 0.0543, + "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 422526.0, "repeat_count": 0.0, - "routers_loss": 0.013022038154304028, + "routers_loss": 0.012834074907004833, "skip_count": 0.0, "step": 260, "text_loss": 0.36141225695610046 @@ -2487,13 +2487,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.216796875, + "grad_norm": 0.2294921875, "learning_rate": 0.000522, - "loss": 0.0848, + "loss": 0.085, "macro_f1": 0.3076923191547394, "num_tokens": 425765.0, "repeat_count": 2.0, - "routers_loss": 0.2575930058956146, + "routers_loss": 0.23808011412620544, "skip_count": 2.0, "step": 262, "text_loss": 0.27572691440582275 @@ -2506,13 +2506,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.17578125, "learning_rate": 0.000526, - "loss": 0.07, + "loss": 0.0708, "macro_f1": 0.3272727429866791, "num_tokens": 429048.0, "repeat_count": 0.0, - "routers_loss": 0.0558602549135685, + "routers_loss": 0.055687375366687775, "skip_count": 1.0, "step": 264, "text_loss": 0.37020301818847656 @@ -2525,13 +2525,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.2080078125, "learning_rate": 0.0005300000000000001, - "loss": 0.082, + "loss": 0.0839, "macro_f1": 0.3272727429866791, "num_tokens": 431784.0, "repeat_count": 0.0, - "routers_loss": 0.09126655012369156, + "routers_loss": 0.0872957780957222, "skip_count": 1.0, "step": 266, "text_loss": 0.5937283039093018 @@ -2544,13 +2544,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.263671875, "learning_rate": 0.0005340000000000001, - "loss": 0.0764, + "loss": 0.0733, "macro_f1": 0.32098764181137085, "num_tokens": 434297.0, "repeat_count": 2.0, - "routers_loss": 0.24805288016796112, + "routers_loss": 0.23507654666900635, "skip_count": 0.0, "step": 268, "text_loss": 0.3367372453212738 @@ -2563,13 +2563,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.22265625, + "grad_norm": 0.2431640625, "learning_rate": 0.0005380000000000001, - "loss": 0.0686, + "loss": 0.0708, "macro_f1": 0.32098764181137085, "num_tokens": 437586.0, "repeat_count": 0.0, - "routers_loss": 0.13135533034801483, + "routers_loss": 0.12860390543937683, "skip_count": 2.0, "step": 270, "text_loss": 0.7149854302406311 @@ -2582,13 +2582,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.2451171875, "learning_rate": 0.0005420000000000001, - "loss": 0.1083, + "loss": 0.1072, "macro_f1": 0.3272727429866791, "num_tokens": 440649.0, "repeat_count": 0.0, - "routers_loss": 0.04991440102458, + "routers_loss": 0.044308312237262726, "skip_count": 1.0, "step": 272, "text_loss": 0.26778292655944824 @@ -2601,13 +2601,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.455078125, + "grad_norm": 0.44921875, "learning_rate": 0.000546, - "loss": 0.0991, + "loss": 0.0938, "macro_f1": 0.3144654333591461, "num_tokens": 443907.0, "repeat_count": 0.0, - "routers_loss": 0.12236632406711578, + "routers_loss": 0.11514109373092651, "skip_count": 3.0, "step": 274, "text_loss": 0.23578761518001556 @@ -2620,13 +2620,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.25, + "grad_norm": 0.2578125, "learning_rate": 0.00055, - "loss": 0.0936, + "loss": 0.0932, "macro_f1": 0.5492662787437439, "num_tokens": 447147.0, "repeat_count": 0.0, - "routers_loss": 0.053506772965192795, + "routers_loss": 0.055705297738313675, "skip_count": 2.0, "step": 276, "text_loss": 0.2513524889945984 @@ -2639,13 +2639,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.265625, + "grad_norm": 0.29296875, "learning_rate": 0.000554, - "loss": 0.066, + "loss": 0.0667, "macro_f1": 0.32098764181137085, "num_tokens": 450032.0, "repeat_count": 0.0, - "routers_loss": 0.13446088135242462, + "routers_loss": 0.13778971135616302, "skip_count": 2.0, "step": 278, "text_loss": 0.4857243597507477 @@ -2658,32 +2658,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.185546875, "learning_rate": 0.000558, - "loss": 0.0682, + "loss": 0.0672, "macro_f1": 0.3272727429866791, "num_tokens": 453195.0, "repeat_count": 1.0, - "routers_loss": 0.07270720601081848, + "routers_loss": 0.0700262188911438, "skip_count": 0.0, "step": 280, "text_loss": 0.7589789628982544 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 1.3240387437628411, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.943396270275116, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.28125, + "f1_skip": 0.0, + "grad_norm": 0.25, "learning_rate": 0.0005620000000000001, - "loss": 0.0648, - "macro_f1": 0.5427350401878357, + "loss": 0.0603, + "macro_f1": 0.3144654333591461, "num_tokens": 455942.0, "repeat_count": 1.0, - "routers_loss": 0.13866399228572845, + "routers_loss": 0.11706235259771347, "skip_count": 2.0, "step": 282, "text_loss": 0.4783432185649872 @@ -2696,13 +2696,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.236328125, + "grad_norm": 0.265625, "learning_rate": 0.000566, - "loss": 0.0782, + "loss": 0.0793, "macro_f1": 0.3272727429866791, "num_tokens": 458932.0, "repeat_count": 0.0, - "routers_loss": 0.0645354762673378, + "routers_loss": 0.07073967158794403, "skip_count": 1.0, "step": 284, "text_loss": 0.7117193937301636 @@ -2715,13 +2715,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1650390625, "learning_rate": 0.00057, - "loss": 0.0892, + "loss": 0.0915, "macro_f1": 0.3272727429866791, "num_tokens": 462650.0, "repeat_count": 0.0, - "routers_loss": 0.05967628210783005, + "routers_loss": 0.05301115661859512, "skip_count": 1.0, "step": 286, "text_loss": 0.4175460636615753 @@ -2734,13 +2734,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23828125, + "grad_norm": 0.2158203125, "learning_rate": 0.000574, - "loss": 0.0676, + "loss": 0.0675, "macro_f1": 0.3272727429866791, "num_tokens": 466290.0, "repeat_count": 0.0, - "routers_loss": 0.06438407301902771, + "routers_loss": 0.06356479972600937, "skip_count": 1.0, "step": 288, "text_loss": 0.5832946300506592 @@ -2753,13 +2753,13 @@ "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.28515625, "learning_rate": 0.000578, - "loss": 0.0781, + "loss": 0.0805, "macro_f1": 0.3006536066532135, "num_tokens": 469296.0, "repeat_count": 1.0, - "routers_loss": 0.21225209534168243, + "routers_loss": 0.21032999455928802, "skip_count": 3.0, "step": 290, "text_loss": 0.36023473739624023 @@ -2772,13 +2772,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.244140625, + "grad_norm": 0.27734375, "learning_rate": 0.0005819999999999999, - "loss": 0.0664, + "loss": 0.0685, "macro_f1": 0.32098764181137085, "num_tokens": 472272.0, "repeat_count": 1.0, - "routers_loss": 0.08085516840219498, + "routers_loss": 0.08062280714511871, "skip_count": 1.0, "step": 292, "text_loss": 0.37197956442832947 @@ -2791,13 +2791,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.28125, "learning_rate": 0.0005859999999999999, - "loss": 0.0874, + "loss": 0.0878, "macro_f1": 0.32098764181137085, "num_tokens": 475864.0, "repeat_count": 0.0, - "routers_loss": 0.05378658324480057, + "routers_loss": 0.05023600533604622, "skip_count": 2.0, "step": 294, "text_loss": 0.4765273630619049 @@ -2810,13 +2810,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.2177734375, "learning_rate": 0.00059, - "loss": 0.0715, + "loss": 0.0728, "macro_f1": 0.3333333432674408, "num_tokens": 478916.0, "repeat_count": 0.0, - "routers_loss": 0.01145261898636818, + "routers_loss": 0.011689410544931889, "skip_count": 0.0, "step": 296, "text_loss": 0.5878773927688599 @@ -2831,11 +2831,11 @@ "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.000594, - "loss": 0.0737, + "loss": 0.0727, "macro_f1": 0.3333333432674408, "num_tokens": 482369.0, "repeat_count": 0.0, - "routers_loss": 0.009397956542670727, + "routers_loss": 0.010772093199193478, "skip_count": 0.0, "step": 298, "text_loss": 0.4424116313457489 @@ -2848,13 +2848,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.181640625, "learning_rate": 0.000598, - "loss": 0.0802, + "loss": 0.0787, "macro_f1": 0.3076923191547394, "num_tokens": 486049.0, "repeat_count": 2.0, - "routers_loss": 0.2389357089996338, + "routers_loss": 0.23482851684093475, "skip_count": 2.0, "step": 300, "text_loss": 0.21217775344848633 @@ -2862,18 +2862,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 1.417963017317288, - "f1_execute": 0.9019607901573181, + "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.2080078125, "learning_rate": 0.000602, - "loss": 0.0745, - "macro_f1": 0.3006536066532135, + "loss": 0.073, + "macro_f1": 0.3076923191547394, "num_tokens": 488683.0, "repeat_count": 1.0, - "routers_loss": 0.18252353370189667, + "routers_loss": 0.18843084573745728, "skip_count": 3.0, "step": 302, "text_loss": 0.2109498232603073 @@ -2886,13 +2886,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.27734375, + "grad_norm": 0.279296875, "learning_rate": 0.000606, - "loss": 0.0935, + "loss": 0.0945, "macro_f1": 0.3144654333591461, "num_tokens": 492010.0, "repeat_count": 0.0, - "routers_loss": 0.18185268342494965, + "routers_loss": 0.17861786484718323, "skip_count": 3.0, "step": 304, "text_loss": 0.8446305394172668 @@ -2905,13 +2905,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.1943359375, "learning_rate": 0.00061, - "loss": 0.0853, + "loss": 0.0827, "macro_f1": 0.3333333432674408, "num_tokens": 494764.0, "repeat_count": 0.0, - "routers_loss": 0.013210167177021503, + "routers_loss": 0.014124520123004913, "skip_count": 0.0, "step": 306, "text_loss": 0.742735743522644 @@ -2924,13 +2924,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.26953125, "learning_rate": 0.000614, - "loss": 0.1089, + "loss": 0.1071, "macro_f1": 0.3333333432674408, "num_tokens": 497820.0, "repeat_count": 0.0, - "routers_loss": 0.016936838626861572, + "routers_loss": 0.017968112602829933, "skip_count": 0.0, "step": 308, "text_loss": 0.28305482864379883 @@ -2943,13 +2943,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.1689453125, "learning_rate": 0.0006180000000000001, - "loss": 0.077, + "loss": 0.0775, "macro_f1": 0.32098764181137085, "num_tokens": 500694.0, "repeat_count": 0.0, - "routers_loss": 0.08630389720201492, + "routers_loss": 0.08593655377626419, "skip_count": 2.0, "step": 310, "text_loss": 0.3496848940849304 @@ -2962,13 +2962,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.19140625, "learning_rate": 0.000622, - "loss": 0.0602, + "loss": 0.061, "macro_f1": 0.3333333432674408, "num_tokens": 503871.0, "repeat_count": 0.0, - "routers_loss": 0.013665963895618916, + "routers_loss": 0.016449492424726486, "skip_count": 0.0, "step": 312, "text_loss": 0.6691372990608215 @@ -2981,13 +2981,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.205078125, "learning_rate": 0.000626, - "loss": 0.0794, + "loss": 0.0815, "macro_f1": 0.3333333432674408, "num_tokens": 506730.0, "repeat_count": 0.0, - "routers_loss": 0.01584783010184765, + "routers_loss": 0.014532964676618576, "skip_count": 0.0, "step": 314, "text_loss": 0.6118118166923523 @@ -3000,13 +3000,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.224609375, + "grad_norm": 0.2216796875, "learning_rate": 0.00063, - "loss": 0.0762, + "loss": 0.0742, "macro_f1": 0.3333333432674408, "num_tokens": 510323.0, "repeat_count": 0.0, - "routers_loss": 0.01368923019617796, + "routers_loss": 0.013093139044940472, "skip_count": 0.0, "step": 316, "text_loss": 0.38126271963119507 @@ -3019,13 +3019,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.388671875, + "grad_norm": 0.400390625, "learning_rate": 0.000634, - "loss": 0.0908, + "loss": 0.0915, "macro_f1": 0.3333333432674408, "num_tokens": 514075.0, "repeat_count": 0.0, - "routers_loss": 0.009135022759437561, + "routers_loss": 0.008627045899629593, "skip_count": 0.0, "step": 318, "text_loss": 0.5983037948608398 @@ -3038,13 +3038,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.15234375, "learning_rate": 0.000638, - "loss": 0.0949, + "loss": 0.1008, "macro_f1": 0.3272727429866791, "num_tokens": 517418.0, "repeat_count": 0.0, - "routers_loss": 0.046641621738672256, + "routers_loss": 0.04561378434300423, "skip_count": 1.0, "step": 320, "text_loss": 0.767257034778595 @@ -3052,18 +3052,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 1.5118872908717347, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23046875, + "grad_norm": 0.259765625, "learning_rate": 0.000642, - "loss": 0.0925, - "macro_f1": 0.3333333432674408, + "loss": 0.0926, + "macro_f1": 0.3272727429866791, "num_tokens": 520443.0, "repeat_count": 0.0, - "routers_loss": 0.020637936890125275, + "routers_loss": 0.024372953921556473, "skip_count": 0.0, "step": 322, "text_loss": 0.6572105884552002 @@ -3076,13 +3076,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26953125, + "grad_norm": 0.30078125, "learning_rate": 0.000646, "loss": 0.0822, "macro_f1": 0.3272727429866791, "num_tokens": 523317.0, "repeat_count": 1.0, - "routers_loss": 0.08289298415184021, + "routers_loss": 0.08099937438964844, "skip_count": 0.0, "step": 324, "text_loss": 0.205499529838562 @@ -3090,18 +3090,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 1.530672145582624, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23828125, + "grad_norm": 0.2294921875, "learning_rate": 0.0006500000000000001, - "loss": 0.0823, - "macro_f1": 0.3272727429866791, + "loss": 0.0809, + "macro_f1": 0.32098767161369324, "num_tokens": 526355.0, "repeat_count": 0.0, - "routers_loss": 0.06960040330886841, + "routers_loss": 0.0657225176692009, "skip_count": 1.0, "step": 326, "text_loss": 0.2587239742279053 @@ -3114,13 +3114,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1162109375, + "grad_norm": 0.111328125, "learning_rate": 0.0006540000000000001, - "loss": 0.0799, + "loss": 0.0779, "macro_f1": 0.3333333432674408, "num_tokens": 529689.0, "repeat_count": 0.0, - "routers_loss": 0.02087482251226902, + "routers_loss": 0.01849208027124405, "skip_count": 0.0, "step": 328, "text_loss": 0.2172023057937622 @@ -3133,13 +3133,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.1845703125, "learning_rate": 0.0006580000000000001, - "loss": 0.0757, + "loss": 0.0758, "macro_f1": 0.3333333432674408, "num_tokens": 532603.0, "repeat_count": 0.0, - "routers_loss": 0.016592051833868027, + "routers_loss": 0.016184113919734955, "skip_count": 0.0, "step": 330, "text_loss": 0.5980568528175354 @@ -3152,32 +3152,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.22265625, + "grad_norm": 0.220703125, "learning_rate": 0.000662, - "loss": 0.0438, + "loss": 0.0439, "macro_f1": 0.3333333432674408, "num_tokens": 536056.0, "repeat_count": 0.0, - "routers_loss": 0.012950568459928036, + "routers_loss": 0.01303898449987173, "skip_count": 0.0, "step": 332, "text_loss": 0.5421966314315796 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, "epoch": 1.5682418550044028, - "f1_execute": 0.8799999952316284, + "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.310546875, + "f1_skip": 0.5, + "grad_norm": 0.296875, "learning_rate": 0.000666, - "loss": 0.0964, - "macro_f1": 0.29333335161209106, + "loss": 0.0963, + "macro_f1": 0.465986430644989, "num_tokens": 539231.0, "repeat_count": 3.0, - "routers_loss": 0.3373340964317322, + "routers_loss": 0.3075675964355469, "skip_count": 3.0, "step": 334, "text_loss": 0.19719554483890533 @@ -3190,13 +3190,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.171875, + "grad_norm": 0.173828125, "learning_rate": 0.00067, "loss": 0.0706, "macro_f1": 0.3333333432674408, "num_tokens": 542038.0, "repeat_count": 0.0, - "routers_loss": 0.008110735565423965, + "routers_loss": 0.009116224013268948, "skip_count": 0.0, "step": 336, "text_loss": 0.3407036066055298 @@ -3209,13 +3209,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.248046875, + "grad_norm": 0.2421875, "learning_rate": 0.000674, - "loss": 0.0771, + "loss": 0.0768, "macro_f1": 0.3333333432674408, "num_tokens": 545019.0, "repeat_count": 0.0, - "routers_loss": 0.01841609925031662, + "routers_loss": 0.021463042125105858, "skip_count": 0.0, "step": 338, "text_loss": 0.24486012756824493 @@ -3228,13 +3228,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1708984375, "learning_rate": 0.0006780000000000001, - "loss": 0.0894, + "loss": 0.0889, "macro_f1": 0.3333333432674408, "num_tokens": 548036.0, "repeat_count": 0.0, - "routers_loss": 0.01612614095211029, + "routers_loss": 0.01857556402683258, "skip_count": 0.0, "step": 340, "text_loss": 0.28140124678611755 @@ -3247,13 +3247,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.125, + "grad_norm": 0.130859375, "learning_rate": 0.0006820000000000001, - "loss": 0.0611, + "loss": 0.0617, "macro_f1": 0.3006536364555359, "num_tokens": 551419.0, "repeat_count": 2.0, - "routers_loss": 0.26202192902565, + "routers_loss": 0.27090007066726685, "skip_count": 3.0, "step": 342, "text_loss": 0.20690307021141052 @@ -3266,13 +3266,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.3046875, "learning_rate": 0.0006860000000000001, - "loss": 0.1013, + "loss": 0.1047, "macro_f1": 0.32098764181137085, "num_tokens": 554037.0, "repeat_count": 0.0, - "routers_loss": 0.09235779196023941, + "routers_loss": 0.09231195598840714, "skip_count": 2.0, "step": 344, "text_loss": 0.4479128420352936 @@ -3285,13 +3285,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.255859375, "learning_rate": 0.00069, - "loss": 0.0856, + "loss": 0.0883, "macro_f1": 0.3333333432674408, "num_tokens": 556672.0, "repeat_count": 0.0, - "routers_loss": 0.010735333897173405, + "routers_loss": 0.00935924518853426, "skip_count": 0.0, "step": 346, "text_loss": 0.6377320289611816 @@ -3304,13 +3304,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.2138671875, "learning_rate": 0.000694, - "loss": 0.0778, + "loss": 0.0781, "macro_f1": 0.32098764181137085, "num_tokens": 559756.0, "repeat_count": 0.0, - "routers_loss": 0.14742356538772583, + "routers_loss": 0.17641772329807281, "skip_count": 2.0, "step": 348, "text_loss": 0.6097636222839355 @@ -3323,13 +3323,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.30859375, + "grad_norm": 0.30078125, "learning_rate": 0.0006979999999999999, - "loss": 0.0614, + "loss": 0.0616, "macro_f1": 0.5492662787437439, "num_tokens": 563415.0, "repeat_count": 0.0, - "routers_loss": 0.06606879830360413, + "routers_loss": 0.06240406632423401, "skip_count": 2.0, "step": 350, "text_loss": 0.5291631817817688 @@ -3342,13 +3342,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.322265625, + "grad_norm": 0.296875, "learning_rate": 0.0007019999999999999, - "loss": 0.1033, + "loss": 0.1026, "macro_f1": 0.3333333432674408, "num_tokens": 566357.0, "repeat_count": 0.0, - "routers_loss": 0.012873432599008083, + "routers_loss": 0.012269247323274612, "skip_count": 0.0, "step": 352, "text_loss": 0.5170195698738098 @@ -3361,13 +3361,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.1435546875, "learning_rate": 0.0007059999999999999, - "loss": 0.0819, + "loss": 0.0815, "macro_f1": 0.32098764181137085, "num_tokens": 569449.0, "repeat_count": 0.0, - "routers_loss": 0.07853665202856064, + "routers_loss": 0.07515309751033783, "skip_count": 2.0, "step": 354, "text_loss": 0.34507250785827637 @@ -3380,13 +3380,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.251953125, + "grad_norm": 0.263671875, "learning_rate": 0.00071, - "loss": 0.0804, + "loss": 0.0791, "macro_f1": 0.3144654333591461, "num_tokens": 572761.0, "repeat_count": 1.0, - "routers_loss": 0.2216549813747406, + "routers_loss": 0.20768006145954132, "skip_count": 2.0, "step": 356, "text_loss": 0.3158532381057739 @@ -3399,13 +3399,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.185546875, + "grad_norm": 0.1884765625, "learning_rate": 0.000714, - "loss": 0.0675, + "loss": 0.0682, "macro_f1": 0.3333333432674408, "num_tokens": 575909.0, "repeat_count": 0.0, - "routers_loss": 0.02423691377043724, + "routers_loss": 0.025329967960715294, "skip_count": 0.0, "step": 358, "text_loss": 0.21455390751361847 @@ -3413,18 +3413,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 1.6903434106251836, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.21484375, "learning_rate": 0.000718, - "loss": 0.0781, - "macro_f1": 0.3272727429866791, + "loss": 0.0775, + "macro_f1": 0.32098767161369324, "num_tokens": 579186.0, "repeat_count": 1.0, - "routers_loss": 0.07496294379234314, + "routers_loss": 0.07676175981760025, "skip_count": 0.0, "step": 360, "text_loss": 0.61895352602005 @@ -3437,13 +3437,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2138671875, + "grad_norm": 0.197265625, "learning_rate": 0.000722, - "loss": 0.0778, + "loss": 0.0781, "macro_f1": 0.32098767161369324, "num_tokens": 582437.0, "repeat_count": 0.0, - "routers_loss": 0.08181872963905334, + "routers_loss": 0.08070661872625351, "skip_count": 1.0, "step": 362, "text_loss": 0.20557661354541779 @@ -3456,13 +3456,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.2216796875, "learning_rate": 0.000726, - "loss": 0.1112, + "loss": 0.11, "macro_f1": 0.3333333432674408, "num_tokens": 586096.0, "repeat_count": 0.0, - "routers_loss": 0.016959719359874725, + "routers_loss": 0.015891313552856445, "skip_count": 0.0, "step": 364, "text_loss": 0.597991943359375 @@ -3475,13 +3475,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.15625, "learning_rate": 0.00073, - "loss": 0.0577, + "loss": 0.0573, "macro_f1": 0.3076923191547394, "num_tokens": 589520.0, "repeat_count": 1.0, - "routers_loss": 0.13295969367027283, + "routers_loss": 0.12844261527061462, "skip_count": 3.0, "step": 366, "text_loss": 0.2944789230823517 @@ -3494,13 +3494,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1455078125, + "grad_norm": 0.150390625, "learning_rate": 0.000734, - "loss": 0.0986, + "loss": 0.1005, "macro_f1": 0.3333333432674408, "num_tokens": 592691.0, "repeat_count": 0.0, - "routers_loss": 0.02476893551647663, + "routers_loss": 0.02382199838757515, "skip_count": 0.0, "step": 368, "text_loss": 0.23989969491958618 @@ -3513,13 +3513,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1796875, "learning_rate": 0.000738, - "loss": 0.0682, + "loss": 0.0661, "macro_f1": 0.3333333432674408, "num_tokens": 596004.0, "repeat_count": 0.0, - "routers_loss": 0.019863395020365715, + "routers_loss": 0.018812084570527077, "skip_count": 0.0, "step": 370, "text_loss": 0.22111408412456512 @@ -3532,13 +3532,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.2412109375, "learning_rate": 0.000742, - "loss": 0.0663, + "loss": 0.0666, "macro_f1": 0.3272727429866791, "num_tokens": 599087.0, "repeat_count": 0.0, - "routers_loss": 0.07230417430400848, + "routers_loss": 0.08290331065654755, "skip_count": 1.0, "step": 372, "text_loss": 0.2567356526851654 @@ -3551,13 +3551,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.2412109375, "learning_rate": 0.000746, - "loss": 0.0986, + "loss": 0.0941, "macro_f1": 0.32098764181137085, "num_tokens": 602330.0, "repeat_count": 1.0, - "routers_loss": 0.11727793514728546, + "routers_loss": 0.11482042074203491, "skip_count": 1.0, "step": 374, "text_loss": 0.7217292785644531 @@ -3570,13 +3570,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.224609375, + "grad_norm": 0.2265625, "learning_rate": 0.00075, - "loss": 0.0724, + "loss": 0.0728, "macro_f1": 0.3272727429866791, "num_tokens": 605503.0, "repeat_count": 1.0, - "routers_loss": 0.13495951890945435, + "routers_loss": 0.11849870532751083, "skip_count": 0.0, "step": 376, "text_loss": 0.5122153759002686 @@ -3589,13 +3589,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23046875, + "grad_norm": 0.2333984375, "learning_rate": 0.000754, - "loss": 0.0823, + "loss": 0.0835, "macro_f1": 0.32098767161369324, "num_tokens": 608505.0, "repeat_count": 0.0, - "routers_loss": 0.07612533867359161, + "routers_loss": 0.07090992480516434, "skip_count": 1.0, "step": 378, "text_loss": 0.2204965502023697 @@ -3608,13 +3608,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.1826171875, "learning_rate": 0.000758, - "loss": 0.0803, + "loss": 0.0794, "macro_f1": 0.3272727429866791, "num_tokens": 611193.0, "repeat_count": 0.0, - "routers_loss": 0.0484120175242424, + "routers_loss": 0.03812089189887047, "skip_count": 1.0, "step": 380, "text_loss": 0.44909021258354187 @@ -3627,13 +3627,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1689453125, "learning_rate": 0.000762, - "loss": 0.0866, + "loss": 0.0882, "macro_f1": 0.3272727429866791, "num_tokens": 614231.0, "repeat_count": 1.0, - "routers_loss": 0.10939671844244003, + "routers_loss": 0.10270529240369797, "skip_count": 0.0, "step": 382, "text_loss": 0.13624964654445648 @@ -3646,13 +3646,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.326171875, + "grad_norm": 0.330078125, "learning_rate": 0.0007660000000000001, - "loss": 0.1083, + "loss": 0.1107, "macro_f1": 0.32098764181137085, "num_tokens": 617090.0, "repeat_count": 1.0, - "routers_loss": 0.11382336914539337, + "routers_loss": 0.11624004691839218, "skip_count": 1.0, "step": 384, "text_loss": 0.7314052581787109 @@ -3667,11 +3667,11 @@ "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.0007700000000000001, - "loss": 0.0616, + "loss": 0.0628, "macro_f1": 0.32098764181137085, "num_tokens": 620596.0, "repeat_count": 0.0, - "routers_loss": 0.07494530081748962, + "routers_loss": 0.07114322483539581, "skip_count": 2.0, "step": 386, "text_loss": 0.503322958946228 @@ -3684,13 +3684,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.298828125, + "grad_norm": 0.306640625, "learning_rate": 0.0007740000000000001, - "loss": 0.0816, + "loss": 0.0829, "macro_f1": 0.32098764181137085, "num_tokens": 624108.0, "repeat_count": 0.0, - "routers_loss": 0.05718417093157768, + "routers_loss": 0.06061873584985733, "skip_count": 2.0, "step": 388, "text_loss": 0.11481904983520508 @@ -3703,13 +3703,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1982421875, + "grad_norm": 0.2099609375, "learning_rate": 0.000778, - "loss": 0.0783, + "loss": 0.0791, "macro_f1": 0.3006536364555359, "num_tokens": 626895.0, "repeat_count": 1.0, - "routers_loss": 0.2848989963531494, + "routers_loss": 0.2921771705150604, "skip_count": 4.0, "step": 390, "text_loss": 0.3069624602794647 @@ -3722,13 +3722,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.30078125, + "grad_norm": 0.30859375, "learning_rate": 0.000782, - "loss": 0.0608, + "loss": 0.0605, "macro_f1": 0.3076923191547394, "num_tokens": 630204.0, "repeat_count": 0.0, - "routers_loss": 0.2050076276063919, + "routers_loss": 0.202707901597023, "skip_count": 4.0, "step": 392, "text_loss": 0.6022785305976868 @@ -3741,13 +3741,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28125, + "grad_norm": 0.29296875, "learning_rate": 0.000786, - "loss": 0.0863, + "loss": 0.0877, "macro_f1": 0.3333333432674408, "num_tokens": 634373.0, "repeat_count": 0.0, - "routers_loss": 0.020946886390447617, + "routers_loss": 0.0221510399132967, "skip_count": 0.0, "step": 394, "text_loss": 0.26787394285202026 @@ -3760,13 +3760,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.376953125, + "grad_norm": 0.37890625, "learning_rate": 0.00079, - "loss": 0.0798, + "loss": 0.0805, "macro_f1": 0.32098764181137085, "num_tokens": 637442.0, "repeat_count": 2.0, - "routers_loss": 0.1270289123058319, + "routers_loss": 0.12636390328407288, "skip_count": 0.0, "step": 396, "text_loss": 0.2799781560897827 @@ -3779,13 +3779,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.2080078125, "learning_rate": 0.0007940000000000001, - "loss": 0.0701, + "loss": 0.0724, "macro_f1": 0.32098764181137085, "num_tokens": 641231.0, "repeat_count": 0.0, - "routers_loss": 0.08012636005878448, + "routers_loss": 0.07933453470468521, "skip_count": 2.0, "step": 398, "text_loss": 0.2507784366607666 @@ -3798,13 +3798,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.2138671875, "learning_rate": 0.0007980000000000001, - "loss": 0.0901, + "loss": 0.0909, "macro_f1": 0.3272727429866791, "num_tokens": 644560.0, "repeat_count": 1.0, - "routers_loss": 0.09315784275531769, + "routers_loss": 0.10324911028146744, "skip_count": 0.0, "step": 400, "text_loss": 0.7756280303001404 @@ -3817,13 +3817,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2294921875, + "grad_norm": 0.2275390625, "learning_rate": 0.0008020000000000001, - "loss": 0.078, + "loss": 0.0783, "macro_f1": 0.3144654333591461, "num_tokens": 647393.0, "repeat_count": 1.0, - "routers_loss": 0.18492189049720764, + "routers_loss": 0.18546262383460999, "skip_count": 2.0, "step": 402, "text_loss": 0.5013328194618225 @@ -3836,13 +3836,13 @@ "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.283203125, "learning_rate": 0.0008060000000000001, - "loss": 0.0801, + "loss": 0.0787, "macro_f1": 0.2857142984867096, "num_tokens": 650355.0, "repeat_count": 3.0, - "routers_loss": 0.32641324400901794, + "routers_loss": 0.3280293643474579, "skip_count": 4.0, "step": 404, "text_loss": 0.2842077314853668 @@ -3855,13 +3855,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2080078125, + "grad_norm": 0.2138671875, "learning_rate": 0.0008100000000000001, - "loss": 0.0905, + "loss": 0.0901, "macro_f1": 0.3333333432674408, "num_tokens": 654280.0, "repeat_count": 0.0, - "routers_loss": 0.02722037397325039, + "routers_loss": 0.02623247355222702, "skip_count": 0.0, "step": 406, "text_loss": 0.46742817759513855 @@ -3874,13 +3874,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.216796875, "learning_rate": 0.0008139999999999999, - "loss": 0.0958, + "loss": 0.0945, "macro_f1": 0.3333333432674408, "num_tokens": 657568.0, "repeat_count": 0.0, - "routers_loss": 0.010129833593964577, + "routers_loss": 0.009744114242494106, "skip_count": 0.0, "step": 408, "text_loss": 0.7168047428131104 @@ -3893,13 +3893,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2373046875, + "grad_norm": 0.2158203125, "learning_rate": 0.0008179999999999999, - "loss": 0.1084, + "loss": 0.1065, "macro_f1": 0.32098764181137085, "num_tokens": 660593.0, "repeat_count": 0.0, - "routers_loss": 0.07298308610916138, + "routers_loss": 0.07591600716114044, "skip_count": 2.0, "step": 410, "text_loss": 0.449823260307312 @@ -3912,13 +3912,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.15625, + "grad_norm": 0.1396484375, "learning_rate": 0.0008219999999999999, - "loss": 0.0802, + "loss": 0.0795, "macro_f1": 0.3333333432674408, "num_tokens": 663916.0, "repeat_count": 0.0, - "routers_loss": 0.024257874116301537, + "routers_loss": 0.02076602540910244, "skip_count": 0.0, "step": 412, "text_loss": 0.4764713943004608 @@ -3931,13 +3931,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1904296875, + "grad_norm": 0.1650390625, "learning_rate": 0.000826, - "loss": 0.0842, + "loss": 0.0836, "macro_f1": 0.3272727429866791, "num_tokens": 667502.0, "repeat_count": 0.0, - "routers_loss": 0.048864223062992096, + "routers_loss": 0.049170155078172684, "skip_count": 1.0, "step": 414, "text_loss": 0.30333325266838074 @@ -3950,13 +3950,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1513671875, "learning_rate": 0.00083, - "loss": 0.1026, + "loss": 0.1021, "macro_f1": 0.3272727429866791, "num_tokens": 670510.0, "repeat_count": 1.0, - "routers_loss": 0.1592330038547516, + "routers_loss": 0.15554003417491913, "skip_count": 0.0, "step": 416, "text_loss": 0.3691870868206024 @@ -3969,13 +3969,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25390625, + "grad_norm": 0.263671875, "learning_rate": 0.000834, - "loss": 0.0963, + "loss": 0.1013, "macro_f1": 0.3333333432674408, "num_tokens": 674761.0, "repeat_count": 0.0, - "routers_loss": 0.02291976846754551, + "routers_loss": 0.024516675621271133, "skip_count": 0.0, "step": 418, "text_loss": 0.32850381731987 @@ -3988,13 +3988,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.10888671875, "learning_rate": 0.000838, - "loss": 0.0634, + "loss": 0.0649, "macro_f1": 0.3333333432674408, "num_tokens": 678055.0, "repeat_count": 0.0, - "routers_loss": 0.010272650048136711, + "routers_loss": 0.011026890948414803, "skip_count": 0.0, "step": 420, "text_loss": 0.6637290716171265 @@ -4007,13 +4007,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28125, + "grad_norm": 0.263671875, "learning_rate": 0.000842, - "loss": 0.0786, + "loss": 0.0771, "macro_f1": 0.3272727429866791, "num_tokens": 680979.0, "repeat_count": 0.0, - "routers_loss": 0.0692613497376442, + "routers_loss": 0.07451887428760529, "skip_count": 1.0, "step": 422, "text_loss": 0.27131685614585876 @@ -4026,13 +4026,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12890625, + "grad_norm": 0.1318359375, "learning_rate": 0.000846, - "loss": 0.0706, + "loss": 0.0714, "macro_f1": 0.32098764181137085, "num_tokens": 684144.0, "repeat_count": 1.0, - "routers_loss": 0.12713804841041565, + "routers_loss": 0.11341800540685654, "skip_count": 1.0, "step": 424, "text_loss": 0.652126669883728 @@ -4045,13 +4045,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.2158203125, "learning_rate": 0.00085, - "loss": 0.0758, + "loss": 0.0754, "macro_f1": 0.3272727429866791, "num_tokens": 687004.0, "repeat_count": 1.0, - "routers_loss": 0.08670130372047424, + "routers_loss": 0.08985847979784012, "skip_count": 0.0, "step": 426, "text_loss": 0.2589428424835205 @@ -4064,13 +4064,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.240234375, + "grad_norm": 0.23828125, "learning_rate": 0.000854, - "loss": 0.0857, + "loss": 0.0866, "macro_f1": 0.3333333432674408, "num_tokens": 689702.0, "repeat_count": 0.0, - "routers_loss": 0.01053862925618887, + "routers_loss": 0.011355436407029629, "skip_count": 0.0, "step": 428, "text_loss": 0.8909716010093689 @@ -4083,13 +4083,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1435546875, "learning_rate": 0.000858, - "loss": 0.0615, + "loss": 0.0623, "macro_f1": 0.3333333432674408, "num_tokens": 692698.0, "repeat_count": 0.0, - "routers_loss": 0.012946994043886662, + "routers_loss": 0.013788948766887188, "skip_count": 0.0, "step": 430, "text_loss": 0.19141142070293427 @@ -4102,13 +4102,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1552734375, "learning_rate": 0.000862, - "loss": 0.0498, + "loss": 0.0499, "macro_f1": 0.32098764181137085, "num_tokens": 696007.0, "repeat_count": 0.0, - "routers_loss": 0.08222822099924088, + "routers_loss": 0.07998392730951309, "skip_count": 2.0, "step": 432, "text_loss": 0.1611809879541397 @@ -4121,13 +4121,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.173828125, "learning_rate": 0.000866, - "loss": 0.0532, + "loss": 0.0541, "macro_f1": 0.32098764181137085, "num_tokens": 700271.0, "repeat_count": 0.0, - "routers_loss": 0.07086442410945892, + "routers_loss": 0.06988382339477539, "skip_count": 2.0, "step": 434, "text_loss": 0.37254223227500916 @@ -4140,13 +4140,13 @@ "f1_execute": 0.8333333730697632, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.1943359375, "learning_rate": 0.00087, - "loss": 0.0825, + "loss": 0.0834, "macro_f1": 0.2777777910232544, "num_tokens": 703519.0, "repeat_count": 3.0, - "routers_loss": 0.29007306694984436, + "routers_loss": 0.28240787982940674, "skip_count": 5.0, "step": 436, "text_loss": 0.29636648297309875 @@ -4159,13 +4159,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.396484375, + "grad_norm": 0.423828125, "learning_rate": 0.000874, - "loss": 0.0658, + "loss": 0.0657, "macro_f1": 0.3333333432674408, "num_tokens": 706826.0, "repeat_count": 0.0, - "routers_loss": 0.014652491547167301, + "routers_loss": 0.013924967497587204, "skip_count": 0.0, "step": 438, "text_loss": 0.20867908000946045 @@ -4178,13 +4178,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2294921875, + "grad_norm": 0.2353515625, "learning_rate": 0.000878, - "loss": 0.0685, + "loss": 0.0657, "macro_f1": 0.3333333432674408, "num_tokens": 710530.0, "repeat_count": 0.0, - "routers_loss": 0.013720969669520855, + "routers_loss": 0.01170142088085413, "skip_count": 0.0, "step": 440, "text_loss": 0.7273373007774353 @@ -4197,13 +4197,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.173828125, + "grad_norm": 0.171875, "learning_rate": 0.000882, - "loss": 0.0771, + "loss": 0.076, "macro_f1": 0.3333333432674408, "num_tokens": 713503.0, "repeat_count": 0.0, - "routers_loss": 0.011687638238072395, + "routers_loss": 0.011930872686207294, "skip_count": 0.0, "step": 442, "text_loss": 0.39314430952072144 @@ -4216,13 +4216,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.2490234375, "learning_rate": 0.0008860000000000001, - "loss": 0.0604, + "loss": 0.0592, "macro_f1": 0.3333333432674408, "num_tokens": 716582.0, "repeat_count": 0.0, - "routers_loss": 0.007869532331824303, + "routers_loss": 0.008630385622382164, "skip_count": 0.0, "step": 444, "text_loss": 0.5925271511077881 @@ -4230,18 +4230,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 2.0939242735544465, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.23046875, "learning_rate": 0.0008900000000000001, - "loss": 0.0797, - "macro_f1": 0.3076923191547394, + "loss": 0.0811, + "macro_f1": 0.3006536066532135, "num_tokens": 719941.0, "repeat_count": 3.0, - "routers_loss": 0.3034668564796448, + "routers_loss": 0.3015584945678711, "skip_count": 1.0, "step": 446, "text_loss": 0.5059905052185059 @@ -4254,13 +4254,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2314453125, + "grad_norm": 0.203125, "learning_rate": 0.000894, - "loss": 0.0823, + "loss": 0.0822, "macro_f1": 0.31446540355682373, "num_tokens": 723113.0, "repeat_count": 1.0, - "routers_loss": 0.11066079139709473, + "routers_loss": 0.10897493362426758, "skip_count": 1.0, "step": 448, "text_loss": 0.19616436958312988 @@ -4273,13 +4273,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3046875, + "grad_norm": 0.33984375, "learning_rate": 0.000898, - "loss": 0.0773, + "loss": 0.0782, "macro_f1": 0.32098764181137085, "num_tokens": 726193.0, "repeat_count": 0.0, - "routers_loss": 0.0755370482802391, + "routers_loss": 0.07236456125974655, "skip_count": 2.0, "step": 450, "text_loss": 0.1773054152727127 @@ -4292,13 +4292,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28125, + "grad_norm": 0.3203125, "learning_rate": 0.000902, - "loss": 0.0596, + "loss": 0.058, "macro_f1": 0.3272727429866791, "num_tokens": 729275.0, "repeat_count": 1.0, - "routers_loss": 0.08470689505338669, + "routers_loss": 0.08184371143579483, "skip_count": 0.0, "step": 452, "text_loss": 0.4927310049533844 @@ -4311,13 +4311,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19921875, + "grad_norm": 0.1953125, "learning_rate": 0.000906, - "loss": 0.0608, + "loss": 0.0607, "macro_f1": 0.3333333432674408, "num_tokens": 731948.0, "repeat_count": 0.0, - "routers_loss": 0.0130238626152277, + "routers_loss": 0.014033539220690727, "skip_count": 0.0, "step": 454, "text_loss": 0.4745742678642273 @@ -4330,13 +4330,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.154296875, "learning_rate": 0.00091, - "loss": 0.0652, + "loss": 0.0651, "macro_f1": 0.3333333432674408, "num_tokens": 735351.0, "repeat_count": 0.0, - "routers_loss": 0.007108641788363457, + "routers_loss": 0.0071774693205952644, "skip_count": 0.0, "step": 456, "text_loss": 0.18523462116718292 @@ -4351,11 +4351,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.400390625, "learning_rate": 0.0009140000000000001, - "loss": 0.0746, + "loss": 0.0738, "macro_f1": 0.5492662787437439, "num_tokens": 738587.0, "repeat_count": 0.0, - "routers_loss": 0.06834109872579575, + "routers_loss": 0.07781517505645752, "skip_count": 2.0, "step": 458, "text_loss": 0.3459635376930237 @@ -4368,13 +4368,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.28125, "learning_rate": 0.0009180000000000001, - "loss": 0.0733, + "loss": 0.0723, "macro_f1": 0.3076923191547394, "num_tokens": 741779.0, "repeat_count": 0.0, - "routers_loss": 0.10230778902769089, + "routers_loss": 0.09529037028551102, "skip_count": 2.0, "step": 460, "text_loss": 0.20197433233261108 @@ -4387,13 +4387,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.1865234375, "learning_rate": 0.0009220000000000001, - "loss": 0.0528, + "loss": 0.0519, "macro_f1": 0.3333333432674408, "num_tokens": 745355.0, "repeat_count": 0.0, - "routers_loss": 0.009987542405724525, + "routers_loss": 0.009765669703483582, "skip_count": 0.0, "step": 462, "text_loss": 0.7031404376029968 @@ -4406,13 +4406,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.125, + "grad_norm": 0.1298828125, "learning_rate": 0.0009260000000000001, - "loss": 0.0536, + "loss": 0.0527, "macro_f1": 0.3272727429866791, "num_tokens": 748628.0, "repeat_count": 0.0, - "routers_loss": 0.03448869287967682, + "routers_loss": 0.03344850242137909, "skip_count": 1.0, "step": 464, "text_loss": 0.21274663507938385 @@ -4425,13 +4425,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.173828125, "learning_rate": 0.00093, - "loss": 0.053, + "loss": 0.0534, "macro_f1": 0.3076923191547394, "num_tokens": 751472.0, "repeat_count": 2.0, - "routers_loss": 0.13631699979305267, + "routers_loss": 0.1354292333126068, "skip_count": 2.0, "step": 466, "text_loss": 0.5350717306137085 @@ -4444,13 +4444,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.142578125, "learning_rate": 0.000934, - "loss": 0.06, + "loss": 0.0598, "macro_f1": 0.3272727429866791, "num_tokens": 754479.0, "repeat_count": 0.0, - "routers_loss": 0.053951870650053024, + "routers_loss": 0.056420840322971344, "skip_count": 1.0, "step": 468, "text_loss": 0.28153330087661743 @@ -4463,13 +4463,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.228515625, + "grad_norm": 0.234375, "learning_rate": 0.0009379999999999999, - "loss": 0.059, + "loss": 0.0597, "macro_f1": 0.31446540355682373, "num_tokens": 757872.0, "repeat_count": 1.0, - "routers_loss": 0.14479905366897583, + "routers_loss": 0.1622387170791626, "skip_count": 1.0, "step": 470, "text_loss": 0.22956843674182892 @@ -4482,13 +4482,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.44140625, + "grad_norm": 0.5, "learning_rate": 0.000942, - "loss": 0.0913, + "loss": 0.0953, "macro_f1": 0.32098764181137085, "num_tokens": 760468.0, "repeat_count": 0.0, - "routers_loss": 0.056221429258584976, + "routers_loss": 0.05146972835063934, "skip_count": 2.0, "step": 472, "text_loss": 0.4513966739177704 @@ -4501,13 +4501,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1904296875, + "grad_norm": 0.212890625, "learning_rate": 0.000946, - "loss": 0.0591, + "loss": 0.0592, "macro_f1": 0.3272727429866791, "num_tokens": 763519.0, "repeat_count": 1.0, - "routers_loss": 0.09729792177677155, + "routers_loss": 0.09022669494152069, "skip_count": 0.0, "step": 474, "text_loss": 0.25758957862854004 @@ -4520,13 +4520,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12158203125, + "grad_norm": 0.1259765625, "learning_rate": 0.00095, - "loss": 0.0496, + "loss": 0.0498, "macro_f1": 0.3272727429866791, "num_tokens": 767391.0, "repeat_count": 0.0, - "routers_loss": 0.029447713866829872, + "routers_loss": 0.03044828027486801, "skip_count": 1.0, "step": 476, "text_loss": 0.21366681158542633 @@ -4539,13 +4539,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.271484375, + "grad_norm": 0.291015625, "learning_rate": 0.000954, - "loss": 0.0801, + "loss": 0.0802, "macro_f1": 0.3272727429866791, "num_tokens": 770338.0, "repeat_count": 0.0, - "routers_loss": 0.09337342530488968, + "routers_loss": 0.10397060960531235, "skip_count": 1.0, "step": 478, "text_loss": 1.0396177768707275 @@ -4560,11 +4560,11 @@ "f1_skip": 0.0, "grad_norm": 0.267578125, "learning_rate": 0.000958, - "loss": 0.1102, + "loss": 0.1099, "macro_f1": 0.285714328289032, "num_tokens": 773699.0, "repeat_count": 2.0, - "routers_loss": 0.23193210363388062, + "routers_loss": 0.22604143619537354, "skip_count": 4.0, "step": 480, "text_loss": 0.2570283114910126 @@ -4572,18 +4572,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 2.2629879659524508, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.146484375, "learning_rate": 0.000962, - "loss": 0.0669, - "macro_f1": 0.3272727429866791, + "loss": 0.0667, + "macro_f1": 0.32098767161369324, "num_tokens": 777473.0, "repeat_count": 0.0, - "routers_loss": 0.046257760375738144, + "routers_loss": 0.048258859664201736, "skip_count": 1.0, "step": 482, "text_loss": 0.2540103495121002 @@ -4596,13 +4596,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1708984375, + "grad_norm": 0.197265625, "learning_rate": 0.000966, - "loss": 0.0552, + "loss": 0.0592, "macro_f1": 0.3333333432674408, "num_tokens": 780833.0, "repeat_count": 0.0, - "routers_loss": 0.01683143898844719, + "routers_loss": 0.023018671199679375, "skip_count": 0.0, "step": 484, "text_loss": 0.38524550199508667 @@ -4615,13 +4615,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.326171875, + "grad_norm": 0.314453125, "learning_rate": 0.0009699999999999999, - "loss": 0.071, + "loss": 0.0709, "macro_f1": 0.3272727429866791, "num_tokens": 783656.0, "repeat_count": 0.0, - "routers_loss": 0.04129387438297272, + "routers_loss": 0.044845327734947205, "skip_count": 1.0, "step": 486, "text_loss": 0.5859048366546631 @@ -4634,13 +4634,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.263671875, "learning_rate": 0.000974, - "loss": 0.0605, + "loss": 0.0615, "macro_f1": 0.3333333432674408, "num_tokens": 787173.0, "repeat_count": 0.0, - "routers_loss": 0.01262948103249073, + "routers_loss": 0.010898692533373833, "skip_count": 0.0, "step": 488, "text_loss": 0.3456067442893982 @@ -4653,13 +4653,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.263671875, "learning_rate": 0.000978, - "loss": 0.081, + "loss": 0.0796, "macro_f1": 0.32098764181137085, "num_tokens": 790395.0, "repeat_count": 0.0, - "routers_loss": 0.07404553890228271, + "routers_loss": 0.06497956812381744, "skip_count": 2.0, "step": 490, "text_loss": 0.3751123249530792 @@ -4672,13 +4672,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.2158203125, "learning_rate": 0.000982, - "loss": 0.0751, + "loss": 0.0772, "macro_f1": 0.3272727429866791, "num_tokens": 793137.0, "repeat_count": 0.0, - "routers_loss": 0.06795930862426758, + "routers_loss": 0.07763728499412537, "skip_count": 1.0, "step": 492, "text_loss": 0.43296709656715393 @@ -4691,13 +4691,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.1416015625, "learning_rate": 0.0009860000000000001, - "loss": 0.0804, + "loss": 0.0819, "macro_f1": 0.3333333432674408, "num_tokens": 796497.0, "repeat_count": 0.0, - "routers_loss": 0.02233024686574936, + "routers_loss": 0.02127906307578087, "skip_count": 0.0, "step": 494, "text_loss": 0.4841311275959015 @@ -4710,13 +4710,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1953125, + "grad_norm": 0.2138671875, "learning_rate": 0.00099, - "loss": 0.0731, + "loss": 0.073, "macro_f1": 0.3272727429866791, "num_tokens": 799361.0, "repeat_count": 1.0, - "routers_loss": 0.07979031652212143, + "routers_loss": 0.09518691152334213, "skip_count": 0.0, "step": 496, "text_loss": 0.5094487071037292 @@ -4729,13 +4729,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1298828125, + "grad_norm": 0.130859375, "learning_rate": 0.000994, - "loss": 0.0795, + "loss": 0.0789, "macro_f1": 0.5492662787437439, "num_tokens": 802629.0, "repeat_count": 0.0, - "routers_loss": 0.045646365731954575, + "routers_loss": 0.0563947930932045, "skip_count": 2.0, "step": 498, "text_loss": 0.42783617973327637 @@ -4748,13 +4748,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1953125, + "grad_norm": 0.1865234375, "learning_rate": 0.000998, "loss": 0.0476, "macro_f1": 0.3272727429866791, "num_tokens": 805881.0, "repeat_count": 1.0, - "routers_loss": 0.09717849642038345, + "routers_loss": 0.10570426285266876, "skip_count": 0.0, "step": 500, "text_loss": 0.28395503759384155 @@ -4767,13 +4767,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.30078125, + "grad_norm": 0.2275390625, "learning_rate": 0.0009999999760498814, - "loss": 0.0894, + "loss": 0.0849, "macro_f1": 0.5492662787437439, "num_tokens": 809283.0, "repeat_count": 0.0, - "routers_loss": 0.03948225453495979, + "routers_loss": 0.031202208250761032, "skip_count": 2.0, "step": 502, "text_loss": 0.32970911264419556 @@ -4786,13 +4786,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.15625, + "grad_norm": 0.1455078125, "learning_rate": 0.0009999997844489475, - "loss": 0.0557, + "loss": 0.0574, "macro_f1": 0.3272727429866791, "num_tokens": 812440.0, "repeat_count": 0.0, - "routers_loss": 0.0742638111114502, + "routers_loss": 0.07647835463285446, "skip_count": 1.0, "step": 504, "text_loss": 0.4901447296142578 @@ -4805,13 +4805,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.25, "learning_rate": 0.000999999401247153, - "loss": 0.0682, + "loss": 0.0668, "macro_f1": 0.32098764181137085, "num_tokens": 815716.0, "repeat_count": 0.0, - "routers_loss": 0.08293049037456512, + "routers_loss": 0.08515176922082901, "skip_count": 2.0, "step": 506, "text_loss": 0.6157599687576294 @@ -4824,13 +4824,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.25390625, "learning_rate": 0.0009999988264446445, - "loss": 0.0697, + "loss": 0.0686, "macro_f1": 0.3333333432674408, "num_tokens": 819086.0, "repeat_count": 0.0, - "routers_loss": 0.010080376639962196, + "routers_loss": 0.00946938619017601, "skip_count": 0.0, "step": 508, "text_loss": 0.5053519010543823 @@ -4843,13 +4843,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1640625, "learning_rate": 0.0009999980600416424, - "loss": 0.0611, + "loss": 0.0574, "macro_f1": 0.3333333432674408, "num_tokens": 822268.0, "repeat_count": 0.0, - "routers_loss": 0.009179878048598766, + "routers_loss": 0.01058756373822689, "skip_count": 0.0, "step": 510, "text_loss": 0.5570021867752075 @@ -4862,13 +4862,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11083984375, + "grad_norm": 0.1240234375, "learning_rate": 0.000999997102038441, - "loss": 0.0689, + "loss": 0.0678, "macro_f1": 0.3333333432674408, "num_tokens": 825728.0, "repeat_count": 0.0, - "routers_loss": 0.006718529388308525, + "routers_loss": 0.008705209009349346, "skip_count": 0.0, "step": 512, "text_loss": 0.6519040465354919 @@ -4881,13 +4881,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.220703125, "learning_rate": 0.0009999959524354064, - "loss": 0.0826, + "loss": 0.083, "macro_f1": 0.3272727429866791, "num_tokens": 829459.0, "repeat_count": 0.0, - "routers_loss": 0.049344487488269806, + "routers_loss": 0.04024193435907364, "skip_count": 1.0, "step": 514, "text_loss": 0.5290043950080872 @@ -4900,13 +4900,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.259765625, + "grad_norm": 0.25390625, "learning_rate": 0.00099999461123298, - "loss": 0.0739, + "loss": 0.0727, "macro_f1": 0.3333333432674408, "num_tokens": 832291.0, "repeat_count": 0.0, - "routers_loss": 0.013402626849710941, + "routers_loss": 0.015742862597107887, "skip_count": 0.0, "step": 516, "text_loss": 0.7910057902336121 @@ -4919,13 +4919,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.2275390625, "learning_rate": 0.000999993078431675, - "loss": 0.0761, + "loss": 0.0759, "macro_f1": 0.3076923191547394, "num_tokens": 835399.0, "repeat_count": 1.0, - "routers_loss": 0.16964484751224518, + "routers_loss": 0.16753782331943512, "skip_count": 3.0, "step": 518, "text_loss": 0.45196083188056946 @@ -4938,13 +4938,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2236328125, + "grad_norm": 0.236328125, "learning_rate": 0.0009999913540320792, - "loss": 0.095, + "loss": 0.0968, "macro_f1": 0.31446540355682373, "num_tokens": 838993.0, "repeat_count": 0.0, - "routers_loss": 0.08609295636415482, + "routers_loss": 0.09357143193483353, "skip_count": 2.0, "step": 520, "text_loss": 0.5499435663223267 @@ -4957,13 +4957,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.2392578125, + "grad_norm": 0.2451171875, "learning_rate": 0.0009999894380348536, - "loss": 0.0816, + "loss": 0.0821, "macro_f1": 0.5492662787437439, "num_tokens": 842652.0, "repeat_count": 0.0, - "routers_loss": 0.05354784056544304, + "routers_loss": 0.056803856045007706, "skip_count": 2.0, "step": 522, "text_loss": 0.197520449757576 @@ -4976,13 +4976,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.2236328125, + "grad_norm": 0.2333984375, "learning_rate": 0.000999987330440732, - "loss": 0.0715, + "loss": 0.0725, "macro_f1": 0.4871794879436493, "num_tokens": 847061.0, "repeat_count": 0.0, - "routers_loss": 0.09146631509065628, + "routers_loss": 0.08962195366621017, "skip_count": 3.0, "step": 524, "text_loss": 0.27509039640426636 @@ -4995,13 +4995,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.189453125, "learning_rate": 0.000999985031250522, - "loss": 0.0574, + "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 850780.0, "repeat_count": 0.0, - "routers_loss": 0.02344255894422531, + "routers_loss": 0.022930558770895004, "skip_count": 0.0, "step": 526, "text_loss": 0.13291706144809723 @@ -5014,13 +5014,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1982421875, + "grad_norm": 0.197265625, "learning_rate": 0.0009999825404651053, - "loss": 0.0621, + "loss": 0.0614, "macro_f1": 0.3333333432674408, "num_tokens": 853886.0, "repeat_count": 0.0, - "routers_loss": 0.018271517008543015, + "routers_loss": 0.017097990959882736, "skip_count": 0.0, "step": 528, "text_loss": 0.21706295013427734 @@ -5033,13 +5033,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2060546875, + "grad_norm": 0.212890625, "learning_rate": 0.0009999798580854356, - "loss": 0.0717, + "loss": 0.0724, "macro_f1": 0.3333333432674408, "num_tokens": 857364.0, "repeat_count": 0.0, - "routers_loss": 0.026990914717316628, + "routers_loss": 0.02831801027059555, "skip_count": 0.0, "step": 530, "text_loss": 0.9035662412643433 @@ -5052,13 +5052,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16015625, + "grad_norm": 0.1591796875, "learning_rate": 0.000999976984112541, - "loss": 0.0681, + "loss": 0.0674, "macro_f1": 0.3333333432674408, "num_tokens": 860661.0, "repeat_count": 0.0, - "routers_loss": 0.019737249240279198, + "routers_loss": 0.019671892747282982, "skip_count": 0.0, "step": 532, "text_loss": 0.8354863524436951 @@ -5071,13 +5071,13 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.3046875, + "grad_norm": 0.2890625, "learning_rate": 0.0009999739185475231, - "loss": 0.0978, + "loss": 0.0963, "macro_f1": 0.47333335876464844, "num_tokens": 864124.0, "repeat_count": 2.0, - "routers_loss": 0.212640181183815, + "routers_loss": 0.21383361518383026, "skip_count": 3.0, "step": 534, "text_loss": 0.23422949016094208 @@ -5090,13 +5090,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.2490234375, "learning_rate": 0.0009999706613915565, - "loss": 0.0602, + "loss": 0.0598, "macro_f1": 0.32098767161369324, "num_tokens": 866976.0, "repeat_count": 0.0, - "routers_loss": 0.07302755117416382, + "routers_loss": 0.07158871740102768, "skip_count": 1.0, "step": 536, "text_loss": 0.11800774186849594 @@ -5109,13 +5109,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.296875, + "grad_norm": 0.26953125, "learning_rate": 0.0009999672126458894, - "loss": 0.0825, + "loss": 0.0822, "macro_f1": 0.3272727429866791, "num_tokens": 870549.0, "repeat_count": 0.0, - "routers_loss": 0.08667246252298355, + "routers_loss": 0.08185924589633942, "skip_count": 1.0, "step": 538, "text_loss": 0.19232480227947235 @@ -5128,13 +5128,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1318359375, + "grad_norm": 0.1396484375, "learning_rate": 0.000999963572311843, - "loss": 0.0597, + "loss": 0.0604, "macro_f1": 0.3333333432674408, "num_tokens": 873733.0, "repeat_count": 0.0, - "routers_loss": 0.015047167427837849, + "routers_loss": 0.01633382774889469, "skip_count": 0.0, "step": 540, "text_loss": 0.3725031912326813 @@ -5147,13 +5147,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.15234375, "learning_rate": 0.0009999597403908128, - "loss": 0.076, + "loss": 0.0761, "macro_f1": 0.3272727429866791, "num_tokens": 877099.0, "repeat_count": 0.0, - "routers_loss": 0.07481446117162704, + "routers_loss": 0.0782657191157341, "skip_count": 1.0, "step": 542, "text_loss": 0.17589199542999268 @@ -5166,13 +5166,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1943359375, + "grad_norm": 0.2177734375, "learning_rate": 0.0009999557168842669, - "loss": 0.0724, + "loss": 0.0716, "macro_f1": 0.5492662787437439, "num_tokens": 879883.0, "repeat_count": 0.0, - "routers_loss": 0.049495212733745575, + "routers_loss": 0.05275818333029747, "skip_count": 2.0, "step": 544, "text_loss": 0.26448264718055725 @@ -5185,13 +5185,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25390625, + "grad_norm": 0.2490234375, "learning_rate": 0.0009999515017937468, - "loss": 0.0718, + "loss": 0.071, "macro_f1": 0.32098764181137085, "num_tokens": 882223.0, "repeat_count": 0.0, - "routers_loss": 0.08043002337217331, + "routers_loss": 0.09335892647504807, "skip_count": 2.0, "step": 546, "text_loss": 0.208544060587883 @@ -5204,13 +5204,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.34765625, + "grad_norm": 0.376953125, "learning_rate": 0.0009999470951208684, - "loss": 0.086, + "loss": 0.0855, "macro_f1": 0.32098764181137085, "num_tokens": 885241.0, "repeat_count": 2.0, - "routers_loss": 0.22461950778961182, + "routers_loss": 0.22983254492282867, "skip_count": 0.0, "step": 548, "text_loss": 0.6612338423728943 @@ -5223,13 +5223,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.216796875, "learning_rate": 0.00099994249686732, - "loss": 0.0798, + "loss": 0.0786, "macro_f1": 0.3272727429866791, "num_tokens": 887897.0, "repeat_count": 1.0, - "routers_loss": 0.11754962801933289, + "routers_loss": 0.12858282029628754, "skip_count": 0.0, "step": 550, "text_loss": 0.4673548936843872 @@ -5242,13 +5242,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1611328125, + "grad_norm": 0.1591796875, "learning_rate": 0.0009999377070348638, - "loss": 0.0978, + "loss": 0.0944, "macro_f1": 0.3333333432674408, "num_tokens": 891224.0, "repeat_count": 0.0, - "routers_loss": 0.017412789165973663, + "routers_loss": 0.017421770840883255, "skip_count": 0.0, "step": 552, "text_loss": 0.6419258117675781 @@ -5261,13 +5261,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.15625, "learning_rate": 0.000999932725625335, - "loss": 0.0792, + "loss": 0.0791, "macro_f1": 0.32098764181137085, "num_tokens": 894578.0, "repeat_count": 0.0, - "routers_loss": 0.08969525247812271, + "routers_loss": 0.07890026271343231, "skip_count": 2.0, "step": 554, "text_loss": 0.5970752239227295 @@ -5280,13 +5280,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2158203125, + "grad_norm": 0.216796875, "learning_rate": 0.0009999275526406427, - "loss": 0.0803, + "loss": 0.0796, "macro_f1": 0.31446540355682373, "num_tokens": 897145.0, "repeat_count": 1.0, - "routers_loss": 0.09876437485218048, + "routers_loss": 0.09836960583925247, "skip_count": 1.0, "step": 556, "text_loss": 0.752425491809845 @@ -5299,13 +5299,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.1875, "learning_rate": 0.0009999221880827693, - "loss": 0.0887, + "loss": 0.0882, "macro_f1": 0.3333333432674408, "num_tokens": 900565.0, "repeat_count": 0.0, - "routers_loss": 0.019108204171061516, + "routers_loss": 0.017694659531116486, "skip_count": 0.0, "step": 558, "text_loss": 0.195619136095047 @@ -5318,32 +5318,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.2021484375, "learning_rate": 0.0009999166319537703, - "loss": 0.0573, + "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 903506.0, "repeat_count": 0.0, - "routers_loss": 0.019048813730478287, + "routers_loss": 0.019375264644622803, "skip_count": 0.0, "step": 560, "text_loss": 0.4603337347507477 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, "epoch": 2.638685060170238, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1435546875, + "f1_skip": 0.5, + "grad_norm": 0.146484375, "learning_rate": 0.0009999108842557748, - "loss": 0.0947, - "macro_f1": 0.3144654333591461, + "loss": 0.0953, + "macro_f1": 0.4871794879436493, "num_tokens": 906380.0, "repeat_count": 0.0, - "routers_loss": 0.11889495700597763, + "routers_loss": 0.12013207376003265, "skip_count": 3.0, "step": 562, "text_loss": 0.6279402375221252 @@ -5356,13 +5356,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.228515625, + "grad_norm": 0.255859375, "learning_rate": 0.0009999049449909854, - "loss": 0.0771, + "loss": 0.0799, "macro_f1": 0.3272727429866791, "num_tokens": 909116.0, "repeat_count": 0.0, - "routers_loss": 0.06202332302927971, + "routers_loss": 0.06441342830657959, "skip_count": 1.0, "step": 564, "text_loss": 0.23741699755191803 @@ -5375,13 +5375,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1513671875, + "grad_norm": 0.15234375, "learning_rate": 0.0009998988141616781, - "loss": 0.0623, + "loss": 0.064, "macro_f1": 0.32098767161369324, "num_tokens": 912189.0, "repeat_count": 0.0, - "routers_loss": 0.08294244855642319, + "routers_loss": 0.08309414982795715, "skip_count": 1.0, "step": 566, "text_loss": 0.27780941128730774 @@ -5394,13 +5394,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.1962890625, "learning_rate": 0.0009998924917702023, - "loss": 0.0885, + "loss": 0.0876, "macro_f1": 0.3272727429866791, "num_tokens": 916279.0, "repeat_count": 1.0, - "routers_loss": 0.07545182853937149, + "routers_loss": 0.07197169959545135, "skip_count": 0.0, "step": 568, "text_loss": 0.6371755599975586 @@ -5413,13 +5413,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.2255859375, "learning_rate": 0.0009998859778189806, - "loss": 0.0712, + "loss": 0.0706, "macro_f1": 0.3333333432674408, "num_tokens": 919490.0, "repeat_count": 0.0, - "routers_loss": 0.008711219765245914, + "routers_loss": 0.008022273890674114, "skip_count": 0.0, "step": 570, "text_loss": 0.6028938889503479 @@ -5432,13 +5432,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.1650390625, "learning_rate": 0.000999879272310509, - "loss": 0.0837, + "loss": 0.084, "macro_f1": 0.3333333432674408, "num_tokens": 923694.0, "repeat_count": 0.0, - "routers_loss": 0.01639273390173912, + "routers_loss": 0.01634674146771431, "skip_count": 0.0, "step": 572, "text_loss": 0.7177054286003113 @@ -5451,13 +5451,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1669921875, + "grad_norm": 0.17578125, "learning_rate": 0.0009998723752473574, - "loss": 0.0707, + "loss": 0.0716, "macro_f1": 0.3272727429866791, "num_tokens": 926933.0, "repeat_count": 0.0, - "routers_loss": 0.04997137933969498, + "routers_loss": 0.060559045523405075, "skip_count": 1.0, "step": 574, "text_loss": 0.5203254818916321 @@ -5470,13 +5470,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1845703125, + "grad_norm": 0.185546875, "learning_rate": 0.0009998652866321687, - "loss": 0.0799, + "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 929832.0, "repeat_count": 0.0, - "routers_loss": 0.011360209435224533, + "routers_loss": 0.011485611088573933, "skip_count": 0.0, "step": 576, "text_loss": 0.6147452592849731 @@ -5489,13 +5489,13 @@ "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1669921875, + "grad_norm": 0.1552734375, "learning_rate": 0.000999858006467659, - "loss": 0.0658, + "loss": 0.0649, "macro_f1": 0.29333335161209106, "num_tokens": 933266.0, "repeat_count": 2.0, - "routers_loss": 0.31349560618400574, + "routers_loss": 0.2929030954837799, "skip_count": 4.0, "step": 578, "text_loss": 0.1720666140317917 @@ -5508,13 +5508,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.24609375, "learning_rate": 0.0009998505347566186, - "loss": 0.0801, + "loss": 0.0782, "macro_f1": 0.32098764181137085, "num_tokens": 937545.0, "repeat_count": 0.0, - "routers_loss": 0.058660347014665604, + "routers_loss": 0.053780000656843185, "skip_count": 2.0, "step": 580, "text_loss": 0.3258405327796936 @@ -5527,13 +5527,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.1416015625, "learning_rate": 0.00099984287150191, - "loss": 0.0578, + "loss": 0.0582, "macro_f1": 0.3333333432674408, "num_tokens": 941001.0, "repeat_count": 0.0, - "routers_loss": 0.025836754590272903, + "routers_loss": 0.02637636847794056, "skip_count": 0.0, "step": 582, "text_loss": 0.23762771487236023 @@ -5546,13 +5546,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1552734375, "learning_rate": 0.0009998350167064705, - "loss": 0.0683, + "loss": 0.0672, "macro_f1": 0.3333333432674408, "num_tokens": 943989.0, "repeat_count": 0.0, - "routers_loss": 0.016504868865013123, + "routers_loss": 0.01637580618262291, "skip_count": 0.0, "step": 584, "text_loss": 0.7460582852363586 @@ -5565,13 +5565,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1787109375, + "grad_norm": 0.1884765625, "learning_rate": 0.0009998269703733096, - "loss": 0.0685, + "loss": 0.0686, "macro_f1": 0.3272727429866791, "num_tokens": 947245.0, "repeat_count": 1.0, - "routers_loss": 0.1379794180393219, + "routers_loss": 0.13934117555618286, "skip_count": 0.0, "step": 586, "text_loss": 0.5284690260887146 @@ -5584,13 +5584,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.13671875, "learning_rate": 0.0009998187325055106, - "loss": 0.0657, + "loss": 0.0667, "macro_f1": 0.3333333432674408, "num_tokens": 950116.0, "repeat_count": 0.0, - "routers_loss": 0.01802757754921913, + "routers_loss": 0.02138397842645645, "skip_count": 0.0, "step": 588, "text_loss": 0.3920256197452545 @@ -5603,13 +5603,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.1533203125, "learning_rate": 0.0009998103031062305, - "loss": 0.0762, + "loss": 0.0778, "macro_f1": 0.3333333432674408, "num_tokens": 953277.0, "repeat_count": 0.0, - "routers_loss": 0.006902900990098715, + "routers_loss": 0.007098200265318155, "skip_count": 0.0, "step": 590, "text_loss": 0.7472905516624451 @@ -5622,13 +5622,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3046875, + "grad_norm": 0.318359375, "learning_rate": 0.0009998016821786994, - "loss": 0.0912, + "loss": 0.0872, "macro_f1": 0.32098764181137085, "num_tokens": 958229.0, "repeat_count": 1.0, - "routers_loss": 0.08348741382360458, + "routers_loss": 0.07946522533893585, "skip_count": 1.0, "step": 592, "text_loss": 0.5506448745727539 @@ -5641,13 +5641,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1357421875, "learning_rate": 0.000999792869726221, - "loss": 0.0527, + "loss": 0.0523, "macro_f1": 0.3272727429866791, "num_tokens": 961016.0, "repeat_count": 0.0, - "routers_loss": 0.08290062099695206, + "routers_loss": 0.0850791186094284, "skip_count": 1.0, "step": 594, "text_loss": 0.3824431002140045 @@ -5660,13 +5660,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1650390625, "learning_rate": 0.0009997838657521717, - "loss": 0.0643, + "loss": 0.0632, "macro_f1": 0.3333333432674408, "num_tokens": 963847.0, "repeat_count": 0.0, - "routers_loss": 0.018620988354086876, + "routers_loss": 0.016370445489883423, "skip_count": 0.0, "step": 596, "text_loss": 0.2139475792646408 @@ -5679,13 +5679,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.12890625, "learning_rate": 0.0009997746702600026, - "loss": 0.073, + "loss": 0.0702, "macro_f1": 0.307692289352417, "num_tokens": 966619.0, "repeat_count": 0.0, - "routers_loss": 0.1211671382188797, + "routers_loss": 0.1310746818780899, "skip_count": 3.0, "step": 598, "text_loss": 0.3651018440723419 @@ -5698,13 +5698,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.23828125, "learning_rate": 0.0009997652832532372, - "loss": 0.079, + "loss": 0.0792, "macro_f1": 0.3272727429866791, "num_tokens": 970418.0, "repeat_count": 1.0, - "routers_loss": 0.15485027432441711, + "routers_loss": 0.14303378760814667, "skip_count": 0.0, "step": 600, "text_loss": 0.7094736099243164 @@ -5717,13 +5717,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009997557047354722, - "loss": 0.0562, + "loss": 0.0531, "macro_f1": 0.3272727429866791, "num_tokens": 973491.0, "repeat_count": 0.0, - "routers_loss": 0.036684274673461914, + "routers_loss": 0.03334212675690651, "skip_count": 1.0, "step": 602, "text_loss": 0.4812237024307251 @@ -5731,18 +5731,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 2.835926034634576, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.302734375, + "grad_norm": 0.2890625, "learning_rate": 0.0009997459347103783, - "loss": 0.0985, - "macro_f1": 0.3333333432674408, + "loss": 0.0956, + "macro_f1": 0.3272727429866791, "num_tokens": 976672.0, "repeat_count": 0.0, - "routers_loss": 0.026901578530669212, + "routers_loss": 0.02831871062517166, "skip_count": 0.0, "step": 604, "text_loss": 0.21737146377563477 @@ -5755,13 +5755,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12158203125, + "grad_norm": 0.1298828125, "learning_rate": 0.0009997359731816998, - "loss": 0.0632, + "loss": 0.0646, "macro_f1": 0.3333333432674408, "num_tokens": 979898.0, "repeat_count": 0.0, - "routers_loss": 0.01700405217707157, + "routers_loss": 0.017968013882637024, "skip_count": 0.0, "step": 606, "text_loss": 0.5458008050918579 @@ -5774,13 +5774,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2099609375, + "grad_norm": 0.224609375, "learning_rate": 0.0009997258201532536, - "loss": 0.0758, + "loss": 0.0751, "macro_f1": 0.3333333432674408, "num_tokens": 982811.0, "repeat_count": 0.0, - "routers_loss": 0.015013590455055237, + "routers_loss": 0.016256732866168022, "skip_count": 0.0, "step": 608, "text_loss": 0.8643257021903992 @@ -5793,13 +5793,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.2275390625, "learning_rate": 0.0009997154756289303, - "loss": 0.0576, + "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 985245.0, "repeat_count": 0.0, - "routers_loss": 0.02037946693599224, + "routers_loss": 0.021214161068201065, "skip_count": 0.0, "step": 610, "text_loss": 0.2204967886209488 @@ -5812,13 +5812,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.150390625, "learning_rate": 0.000999704939612694, - "loss": 0.0648, + "loss": 0.0636, "macro_f1": 0.3006536364555359, "num_tokens": 988539.0, "repeat_count": 3.0, - "routers_loss": 0.22834022343158722, + "routers_loss": 0.23249399662017822, "skip_count": 2.0, "step": 612, "text_loss": 0.32489025592803955 @@ -5831,13 +5831,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.095703125, "learning_rate": 0.0009996942121085824, - "loss": 0.0449, + "loss": 0.0445, "macro_f1": 0.3333333432674408, "num_tokens": 991660.0, "repeat_count": 0.0, - "routers_loss": 0.009838113561272621, + "routers_loss": 0.010706410743296146, "skip_count": 0.0, "step": 614, "text_loss": 0.4551754891872406 @@ -5850,13 +5850,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.353515625, + "grad_norm": 0.3671875, "learning_rate": 0.000999683293120706, - "loss": 0.1009, + "loss": 0.1016, "macro_f1": 0.3333333432674408, "num_tokens": 994828.0, "repeat_count": 0.0, - "routers_loss": 0.005943270865827799, + "routers_loss": 0.006676184479147196, "skip_count": 0.0, "step": 616, "text_loss": 0.6212068200111389 @@ -5869,13 +5869,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.38671875, + "grad_norm": 0.408203125, "learning_rate": 0.0009996721826532491, - "loss": 0.0941, + "loss": 0.0976, "macro_f1": 0.3076923191547394, "num_tokens": 997951.0, "repeat_count": 2.0, - "routers_loss": 0.21597740054130554, + "routers_loss": 0.2148125320672989, "skip_count": 2.0, "step": 618, "text_loss": 0.26514527201652527 @@ -5888,13 +5888,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.1904296875, "learning_rate": 0.000999660880710469, - "loss": 0.0896, + "loss": 0.0909, "macro_f1": 0.3333333432674408, "num_tokens": 1001139.0, "repeat_count": 0.0, - "routers_loss": 0.023726588115096092, + "routers_loss": 0.022332455962896347, "skip_count": 0.0, "step": 620, "text_loss": 0.26131340861320496 @@ -5907,13 +5907,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.169921875, "learning_rate": 0.0009996493872966971, "loss": 0.0732, "macro_f1": 0.3272727429866791, "num_tokens": 1003678.0, "repeat_count": 1.0, - "routers_loss": 0.08467255532741547, + "routers_loss": 0.08348730951547623, "skip_count": 0.0, "step": 622, "text_loss": 0.19151706993579865 @@ -5926,13 +5926,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.173828125, "learning_rate": 0.0009996377024163374, - "loss": 0.0816, + "loss": 0.0822, "macro_f1": 0.3333333432674408, "num_tokens": 1007082.0, "repeat_count": 0.0, - "routers_loss": 0.029468854889273643, + "routers_loss": 0.028577150776982307, "skip_count": 0.0, "step": 624, "text_loss": 0.305387407541275 @@ -5945,13 +5945,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.11279296875, "learning_rate": 0.0009996258260738676, - "loss": 0.0891, + "loss": 0.0892, "macro_f1": 0.3272727429866791, "num_tokens": 1010064.0, "repeat_count": 1.0, - "routers_loss": 0.09438466280698776, + "routers_loss": 0.08312026411294937, "skip_count": 0.0, "step": 626, "text_loss": 0.49436143040657043 @@ -5964,13 +5964,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1611328125, "learning_rate": 0.0009996137582738388, - "loss": 0.0581, + "loss": 0.0591, "macro_f1": 0.3333333432674408, "num_tokens": 1013462.0, "repeat_count": 0.0, - "routers_loss": 0.013679586350917816, + "routers_loss": 0.013337327167391777, "skip_count": 0.0, "step": 628, "text_loss": 0.6515294313430786 @@ -5983,13 +5983,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.140625, "learning_rate": 0.000999601499020875, - "loss": 0.0528, + "loss": 0.0537, "macro_f1": 0.3333333432674408, "num_tokens": 1016246.0, "repeat_count": 0.0, - "routers_loss": 0.029532987624406815, + "routers_loss": 0.029126765206456184, "skip_count": 0.0, "step": 630, "text_loss": 0.18834827840328217 @@ -6002,13 +6002,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.095703125, "learning_rate": 0.0009995890483196746, - "loss": 0.0601, + "loss": 0.0602, "macro_f1": 0.3272727429866791, "num_tokens": 1019286.0, "repeat_count": 0.0, - "routers_loss": 0.05516733601689339, + "routers_loss": 0.054844800382852554, "skip_count": 1.0, "step": 632, "text_loss": 0.6988179087638855 @@ -6021,13 +6021,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.357421875, + "grad_norm": 0.322265625, "learning_rate": 0.0009995764061750086, - "loss": 0.0785, + "loss": 0.0767, "macro_f1": 0.3333333432674408, "num_tokens": 1022207.0, "repeat_count": 0.0, - "routers_loss": 0.010254866443574429, + "routers_loss": 0.010095693171024323, "skip_count": 0.0, "step": 634, "text_loss": 0.558451771736145 @@ -6040,13 +6040,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.2890625, "learning_rate": 0.000999563572591721, - "loss": 0.0518, + "loss": 0.0521, "macro_f1": 0.32098764181137085, "num_tokens": 1025319.0, "repeat_count": 1.0, - "routers_loss": 0.07528360933065414, + "routers_loss": 0.0698433518409729, "skip_count": 1.0, "step": 636, "text_loss": 0.5961872935295105 @@ -6059,13 +6059,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1064453125, + "grad_norm": 0.11083984375, "learning_rate": 0.0009995505475747302, - "loss": 0.0844, + "loss": 0.0849, "macro_f1": 0.3272727429866791, "num_tokens": 1028362.0, "repeat_count": 0.0, - "routers_loss": 0.04301584139466286, + "routers_loss": 0.040211405605077744, "skip_count": 1.0, "step": 638, "text_loss": 0.546863317489624 @@ -6078,13 +6078,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11572265625, + "grad_norm": 0.119140625, "learning_rate": 0.0009995373311290272, - "loss": 0.0699, + "loss": 0.0709, "macro_f1": 0.3144654333591461, "num_tokens": 1032199.0, "repeat_count": 2.0, - "routers_loss": 0.14521080255508423, + "routers_loss": 0.1457643061876297, "skip_count": 1.0, "step": 640, "text_loss": 0.2137298285961151 @@ -6097,13 +6097,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1328125, + "grad_norm": 0.1279296875, "learning_rate": 0.0009995239232596764, - "loss": 0.0543, + "loss": 0.0545, "macro_f1": 0.3333333432674408, "num_tokens": 1035801.0, "repeat_count": 0.0, - "routers_loss": 0.01074797473847866, + "routers_loss": 0.011394930072128773, "skip_count": 0.0, "step": 642, "text_loss": 0.43054503202438354 @@ -6116,13 +6116,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1005859375, + "grad_norm": 0.1015625, "learning_rate": 0.0009995103239718163, - "loss": 0.0659, + "loss": 0.0665, "macro_f1": 0.3333333432674408, "num_tokens": 1039223.0, "repeat_count": 0.0, - "routers_loss": 0.009271817281842232, + "routers_loss": 0.00997432041913271, "skip_count": 0.0, "step": 644, "text_loss": 0.7749615907669067 @@ -6135,13 +6135,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1953125, + "grad_norm": 0.2275390625, "learning_rate": 0.0009994965332706573, - "loss": 0.0737, + "loss": 0.0755, "macro_f1": 0.3144654333591461, "num_tokens": 1042154.0, "repeat_count": 3.0, - "routers_loss": 0.10257050395011902, + "routers_loss": 0.10589150339365005, "skip_count": 0.0, "step": 646, "text_loss": 0.7812211513519287 @@ -6154,13 +6154,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.1943359375, "learning_rate": 0.0009994825511614846, - "loss": 0.0363, + "loss": 0.0383, "macro_f1": 0.3272727429866791, "num_tokens": 1045250.0, "repeat_count": 0.0, - "routers_loss": 0.07091924548149109, + "routers_loss": 0.0748734176158905, "skip_count": 1.0, "step": 648, "text_loss": 0.844803512096405 @@ -6173,13 +6173,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11572265625, + "grad_norm": 0.1220703125, "learning_rate": 0.0009994683776496562, - "loss": 0.0421, + "loss": 0.0433, "macro_f1": 0.3272727429866791, "num_tokens": 1048446.0, "repeat_count": 0.0, - "routers_loss": 0.034446243196725845, + "routers_loss": 0.03742415830492973, "skip_count": 1.0, "step": 650, "text_loss": 0.2098839282989502 @@ -6192,13 +6192,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.12890625, "learning_rate": 0.0009994540127406034, - "loss": 0.0593, + "loss": 0.0591, "macro_f1": 0.32098764181137085, "num_tokens": 1051840.0, "repeat_count": 0.0, - "routers_loss": 0.06077485531568527, + "routers_loss": 0.06025516986846924, "skip_count": 2.0, "step": 652, "text_loss": 0.27727583050727844 @@ -6211,13 +6211,13 @@ "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.2294921875, + "grad_norm": 0.181640625, "learning_rate": 0.0009994394564398306, - "loss": 0.0537, + "loss": 0.0519, "macro_f1": 0.521541953086853, "num_tokens": 1055142.0, "repeat_count": 4.0, - "routers_loss": 0.2382282167673111, + "routers_loss": 0.22807340323925018, "skip_count": 2.0, "step": 654, "text_loss": 0.9672397971153259 @@ -6230,13 +6230,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.142578125, + "grad_norm": 0.130859375, "learning_rate": 0.0009994247087529158, - "loss": 0.0613, + "loss": 0.0618, "macro_f1": 0.3333333432674408, "num_tokens": 1057698.0, "repeat_count": 0.0, - "routers_loss": 0.011971636675298214, + "routers_loss": 0.01348950993269682, "skip_count": 0.0, "step": 656, "text_loss": 0.6375506520271301 @@ -6249,13 +6249,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.212890625, + "grad_norm": 0.1953125, "learning_rate": 0.0009994097696855106, - "loss": 0.0414, + "loss": 0.0412, "macro_f1": 0.3333333432674408, "num_tokens": 1060624.0, "repeat_count": 0.0, - "routers_loss": 0.010221127420663834, + "routers_loss": 0.009649243205785751, "skip_count": 0.0, "step": 658, "text_loss": 0.5315385460853577 @@ -6268,13 +6268,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2265625, + "grad_norm": 0.2041015625, "learning_rate": 0.0009993946392433395, - "loss": 0.061, + "loss": 0.0609, "macro_f1": 0.307692289352417, "num_tokens": 1065076.0, "repeat_count": 0.0, - "routers_loss": 0.11860335618257523, + "routers_loss": 0.1250980943441391, "skip_count": 3.0, "step": 660, "text_loss": 0.25780341029167175 @@ -6287,13 +6287,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.1640625, "learning_rate": 0.0009993793174322006, - "loss": 0.0485, + "loss": 0.0471, "macro_f1": 0.3333333432674408, "num_tokens": 1068365.0, "repeat_count": 0.0, - "routers_loss": 0.011139829643070698, + "routers_loss": 0.011544390581548214, "skip_count": 0.0, "step": 662, "text_loss": 0.34876301884651184 @@ -6306,13 +6306,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.166015625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009993638042579654, - "loss": 0.0478, + "loss": 0.0473, "macro_f1": 0.3272727429866791, "num_tokens": 1071693.0, "repeat_count": 0.0, - "routers_loss": 0.03978770971298218, + "routers_loss": 0.03777370601892471, "skip_count": 1.0, "step": 664, "text_loss": 0.21811571717262268 @@ -6327,11 +6327,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.203125, "learning_rate": 0.0009993480997265783, - "loss": 0.0481, + "loss": 0.0475, "macro_f1": 0.5492662787437439, "num_tokens": 1074733.0, "repeat_count": 0.0, - "routers_loss": 0.051231011748313904, + "routers_loss": 0.049949806183576584, "skip_count": 2.0, "step": 666, "text_loss": 0.38410288095474243 @@ -6344,13 +6344,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.10302734375, "learning_rate": 0.0009993322038440572, - "loss": 0.0615, + "loss": 0.0605, "macro_f1": 0.3333333432674408, "num_tokens": 1077993.0, "repeat_count": 0.0, - "routers_loss": 0.024917088449001312, + "routers_loss": 0.0247171800583601, "skip_count": 0.0, "step": 668, "text_loss": 0.25576895475387573 @@ -6363,13 +6363,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1982421875, + "grad_norm": 0.216796875, "learning_rate": 0.000999316116616494, - "loss": 0.0627, + "loss": 0.0619, "macro_f1": 0.3333333432674408, "num_tokens": 1080491.0, "repeat_count": 0.0, - "routers_loss": 0.008834881708025932, + "routers_loss": 0.008118715137243271, "skip_count": 0.0, "step": 670, "text_loss": 0.6269792914390564 @@ -6382,13 +6382,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.173828125, "learning_rate": 0.0009992998380500527, "loss": 0.0462, "macro_f1": 0.3272727429866791, "num_tokens": 1083817.0, "repeat_count": 0.0, - "routers_loss": 0.033405229449272156, + "routers_loss": 0.03366057574748993, "skip_count": 1.0, "step": 672, "text_loss": 0.26891493797302246 @@ -6401,13 +6401,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.13671875, + "grad_norm": 0.1474609375, "learning_rate": 0.0009992833681509716, - "loss": 0.0523, + "loss": 0.0529, "macro_f1": 0.3333333432674408, "num_tokens": 1087368.0, "repeat_count": 0.0, - "routers_loss": 0.020753704011440277, + "routers_loss": 0.020552074536681175, "skip_count": 0.0, "step": 674, "text_loss": 0.14421936869621277 @@ -6420,13 +6420,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.18359375, "learning_rate": 0.0009992667069255619, - "loss": 0.0698, + "loss": 0.0696, "macro_f1": 0.31446540355682373, "num_tokens": 1090452.0, "repeat_count": 0.0, - "routers_loss": 0.06932353973388672, + "routers_loss": 0.06937336176633835, "skip_count": 2.0, "step": 676, "text_loss": 0.24999259412288666 @@ -6439,13 +6439,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.08740234375, "learning_rate": 0.0009992498543802085, - "loss": 0.059, + "loss": 0.0588, "macro_f1": 0.3272727429866791, "num_tokens": 1093996.0, "repeat_count": 1.0, - "routers_loss": 0.032903749495744705, + "routers_loss": 0.0380021296441555, "skip_count": 0.0, "step": 678, "text_loss": 0.42473849654197693 @@ -6458,32 +6458,32 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.2099609375, + "grad_norm": 0.2119140625, "learning_rate": 0.0009992328105213688, - "loss": 0.0417, + "loss": 0.0411, "macro_f1": 0.4400000274181366, "num_tokens": 1096837.0, "repeat_count": 1.0, - "routers_loss": 0.19733747839927673, + "routers_loss": 0.20885063707828522, "skip_count": 4.0, "step": 680, "text_loss": 0.3829527199268341 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 26.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 3.2019371881420606, - "f1_execute": 1.0, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.154296875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1474609375, "learning_rate": 0.0009992155753555747, - "loss": 0.0729, - "macro_f1": 0.6666666865348816, + "loss": 0.0722, + "macro_f1": 0.5492662787437439, "num_tokens": 1100320.0, "repeat_count": 0.0, - "routers_loss": 0.013452666811645031, + "routers_loss": 0.018230699002742767, "skip_count": 2.0, "step": 682, "text_loss": 0.6190969944000244 @@ -6496,13 +6496,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.30859375, "learning_rate": 0.0009991981488894303, "loss": 0.0681, "macro_f1": 0.32098767161369324, "num_tokens": 1103682.0, "repeat_count": 0.0, - "routers_loss": 0.05302857980132103, + "routers_loss": 0.05550144240260124, "skip_count": 1.0, "step": 684, "text_loss": 0.44418027997016907 @@ -6515,13 +6515,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.2158203125, "learning_rate": 0.0009991805311296133, - "loss": 0.0527, + "loss": 0.0507, "macro_f1": 0.32098764181137085, "num_tokens": 1106427.0, "repeat_count": 0.0, - "routers_loss": 0.08124994486570358, + "routers_loss": 0.07990608364343643, "skip_count": 2.0, "step": 686, "text_loss": 0.5577231645584106 @@ -6534,13 +6534,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.22265625, + "grad_norm": 0.1962890625, "learning_rate": 0.0009991627220828753, - "loss": 0.0579, + "loss": 0.0568, "macro_f1": 0.32098764181137085, "num_tokens": 1109314.0, "repeat_count": 0.0, - "routers_loss": 0.058633625507354736, + "routers_loss": 0.05167485028505325, "skip_count": 2.0, "step": 688, "text_loss": 0.27325430512428284 @@ -6553,13 +6553,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1142578125, + "grad_norm": 0.10693359375, "learning_rate": 0.0009991447217560408, - "loss": 0.0533, + "loss": 0.0521, "macro_f1": 0.5492662787437439, "num_tokens": 1112748.0, "repeat_count": 0.0, - "routers_loss": 0.04703643172979355, + "routers_loss": 0.04621964320540428, "skip_count": 2.0, "step": 690, "text_loss": 0.5288321375846863 @@ -6572,13 +6572,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.1962890625, "learning_rate": 0.000999126530156007, - "loss": 0.0485, + "loss": 0.0499, "macro_f1": 0.307692289352417, "num_tokens": 1116965.0, "repeat_count": 1.0, - "routers_loss": 0.11615128815174103, + "routers_loss": 0.11950276792049408, "skip_count": 2.0, "step": 692, "text_loss": 0.14215624332427979 @@ -6591,13 +6591,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2314453125, + "grad_norm": 0.2353515625, "learning_rate": 0.0009991081472897454, - "loss": 0.0718, + "loss": 0.0722, "macro_f1": 0.3333333432674408, "num_tokens": 1120570.0, "repeat_count": 0.0, - "routers_loss": 0.017403846606612206, + "routers_loss": 0.01905500330030918, "skip_count": 0.0, "step": 694, "text_loss": 0.41862696409225464 @@ -6610,13 +6610,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1357421875, "learning_rate": 0.0009990895731643002, - "loss": 0.0444, + "loss": 0.0464, "macro_f1": 0.3272727429866791, "num_tokens": 1124009.0, "repeat_count": 1.0, - "routers_loss": 0.07067303359508514, + "routers_loss": 0.06974572688341141, "skip_count": 0.0, "step": 696, "text_loss": 0.41160130500793457 @@ -6629,13 +6629,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1923828125, "learning_rate": 0.000999070807786789, - "loss": 0.0527, + "loss": 0.0531, "macro_f1": 0.3272727429866791, "num_tokens": 1127370.0, "repeat_count": 1.0, - "routers_loss": 0.07131028175354004, + "routers_loss": 0.07055293023586273, "skip_count": 0.0, "step": 698, "text_loss": 0.48068273067474365 @@ -6648,13 +6648,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.18359375, + "grad_norm": 0.197265625, "learning_rate": 0.000999051851164403, - "loss": 0.0629, + "loss": 0.0619, "macro_f1": 0.32098764181137085, "num_tokens": 1130234.0, "repeat_count": 1.0, - "routers_loss": 0.1152748316526413, + "routers_loss": 0.12506946921348572, "skip_count": 1.0, "step": 700, "text_loss": 0.47925490140914917 @@ -6667,13 +6667,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.216796875, + "grad_norm": 0.1943359375, "learning_rate": 0.000999032703304406, - "loss": 0.0663, + "loss": 0.0674, "macro_f1": 0.3333333432674408, "num_tokens": 1132874.0, "repeat_count": 0.0, - "routers_loss": 0.0077212234027683735, + "routers_loss": 0.00809287466108799, "skip_count": 0.0, "step": 702, "text_loss": 0.47433632612228394 @@ -6686,13 +6686,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.099609375, + "grad_norm": 0.1064453125, "learning_rate": 0.0009990133642141358, - "loss": 0.0494, + "loss": 0.0497, "macro_f1": 0.5492662787437439, "num_tokens": 1136011.0, "repeat_count": 0.0, - "routers_loss": 0.02726336568593979, + "routers_loss": 0.0319170281291008, "skip_count": 2.0, "step": 704, "text_loss": 0.6574832201004028 @@ -6705,13 +6705,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.32421875, + "grad_norm": 0.33984375, "learning_rate": 0.000998993833901003, - "loss": 0.0615, + "loss": 0.0619, "macro_f1": 0.32098764181137085, "num_tokens": 1139674.0, "repeat_count": 0.0, - "routers_loss": 0.0958542674779892, + "routers_loss": 0.09850362688302994, "skip_count": 2.0, "step": 706, "text_loss": 0.7660127282142639 @@ -6724,13 +6724,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.12158203125, "learning_rate": 0.0009989741123724919, - "loss": 0.0583, + "loss": 0.0574, "macro_f1": 0.3333333432674408, "num_tokens": 1143558.0, "repeat_count": 0.0, - "routers_loss": 0.007100600749254227, + "routers_loss": 0.006673311349004507, "skip_count": 0.0, "step": 708, "text_loss": 0.5976111888885498 @@ -6743,13 +6743,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.154296875, "learning_rate": 0.0009989541996361594, - "loss": 0.0445, + "loss": 0.045, "macro_f1": 0.3333333432674408, "num_tokens": 1146122.0, "repeat_count": 0.0, - "routers_loss": 0.0047812811098992825, + "routers_loss": 0.004988791421055794, "skip_count": 0.0, "step": 710, "text_loss": 0.5256119966506958 @@ -6762,13 +6762,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1015625, + "grad_norm": 0.1044921875, "learning_rate": 0.0009989340956996367, - "loss": 0.052, + "loss": 0.0528, "macro_f1": 0.3333333432674408, "num_tokens": 1149546.0, "repeat_count": 0.0, - "routers_loss": 0.006643407512456179, + "routers_loss": 0.0067769973538815975, "skip_count": 0.0, "step": 712, "text_loss": 0.5040497779846191 @@ -6781,13 +6781,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2890625, + "grad_norm": 0.26953125, "learning_rate": 0.0009989138005706273, - "loss": 0.0719, + "loss": 0.0735, "macro_f1": 0.32098764181137085, "num_tokens": 1153195.0, "repeat_count": 0.0, - "routers_loss": 0.0910436138510704, + "routers_loss": 0.09899546951055527, "skip_count": 2.0, "step": 714, "text_loss": 0.20803412795066833 @@ -6800,13 +6800,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1484375, + "grad_norm": 0.1396484375, "learning_rate": 0.000998893314256908, - "loss": 0.0649, + "loss": 0.064, "macro_f1": 0.3333333432674408, "num_tokens": 1157081.0, "repeat_count": 0.0, - "routers_loss": 0.010978946462273598, + "routers_loss": 0.010492355562746525, "skip_count": 0.0, "step": 716, "text_loss": 0.23077639937400818 @@ -6819,13 +6819,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.123046875, + "grad_norm": 0.1298828125, "learning_rate": 0.0009988726367663298, - "loss": 0.0543, + "loss": 0.0539, "macro_f1": 0.3333333432674408, "num_tokens": 1160079.0, "repeat_count": 0.0, - "routers_loss": 0.009956461377441883, + "routers_loss": 0.01063773687928915, "skip_count": 0.0, "step": 718, "text_loss": 0.6085864901542664 @@ -6838,13 +6838,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1640625, "learning_rate": 0.0009988517681068163, - "loss": 0.0412, + "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1163249.0, "repeat_count": 1.0, - "routers_loss": 0.057210199534893036, + "routers_loss": 0.05981874838471413, "skip_count": 0.0, "step": 720, "text_loss": 0.4047050476074219 @@ -6857,32 +6857,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.171875, "learning_rate": 0.0009988307082863638, - "loss": 0.0364, + "loss": 0.0361, "macro_f1": 0.3333333432674408, "num_tokens": 1166259.0, "repeat_count": 0.0, - "routers_loss": 0.01035996899008751, + "routers_loss": 0.009750043973326683, "skip_count": 0.0, "step": 722, "text_loss": 0.5306474566459656 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 3.3991781626063986, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.2412109375, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.240234375, "learning_rate": 0.0009988094573130434, - "loss": 0.0661, - "macro_f1": 0.3076923191547394, + "loss": 0.063, + "macro_f1": 0.5359477400779724, "num_tokens": 1168887.0, "repeat_count": 2.0, - "routers_loss": 0.18087820708751678, + "routers_loss": 0.18601104617118835, "skip_count": 2.0, "step": 724, "text_loss": 0.53528892993927 @@ -6895,32 +6895,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.142578125, "learning_rate": 0.0009987880151949974, - "loss": 0.0505, + "loss": 0.0496, "macro_f1": 0.3272727429866791, "num_tokens": 1172625.0, "repeat_count": 0.0, - "routers_loss": 0.04720238968729973, + "routers_loss": 0.02845010720193386, "skip_count": 1.0, "step": 726, "text_loss": 0.4760453701019287 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 26.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 3.417963017317288, - "f1_execute": 1.0, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.2216796875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, "learning_rate": 0.0009987663819404434, - "loss": 0.0603, - "macro_f1": 0.6666666865348816, + "loss": 0.06, + "macro_f1": 0.5492662787437439, "num_tokens": 1176580.0, "repeat_count": 0.0, - "routers_loss": 0.015407778322696686, + "routers_loss": 0.017596980556845665, "skip_count": 2.0, "step": 728, "text_loss": 0.5146099328994751 @@ -6933,13 +6933,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.1318359375, "learning_rate": 0.000998744557557671, - "loss": 0.0489, + "loss": 0.0484, "macro_f1": 0.3272727429866791, "num_tokens": 1179804.0, "repeat_count": 0.0, - "routers_loss": 0.060891781002283096, + "routers_loss": 0.0625474750995636, "skip_count": 1.0, "step": 730, "text_loss": 0.27738022804260254 @@ -6947,18 +6947,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 3.436747872028177, - "f1_execute": 0.943396270275116, + "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.203125, "learning_rate": 0.0009987225420550433, - "loss": 0.0825, - "macro_f1": 0.3144654333591461, + "loss": 0.0796, + "macro_f1": 0.307692289352417, "num_tokens": 1182658.0, "repeat_count": 1.0, - "routers_loss": 0.1661442220211029, + "routers_loss": 0.16188351809978485, "skip_count": 2.0, "step": 732, "text_loss": 0.23231445252895355 @@ -6966,18 +6966,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 3.446140299383622, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.2001953125, "learning_rate": 0.0009987003354409965, - "loss": 0.0634, - "macro_f1": 0.3333333432674408, + "loss": 0.0626, + "macro_f1": 0.3272727429866791, "num_tokens": 1185451.0, "repeat_count": 0.0, - "routers_loss": 0.02108248695731163, + "routers_loss": 0.02391529455780983, "skip_count": 0.0, "step": 734, "text_loss": 0.4496627151966095 @@ -6990,13 +6990,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.248046875, + "grad_norm": 0.234375, "learning_rate": 0.0009986779377240405, - "loss": 0.0534, + "loss": 0.0513, "macro_f1": 0.32098767161369324, "num_tokens": 1188666.0, "repeat_count": 0.0, - "routers_loss": 0.08318125456571579, + "routers_loss": 0.08435963839292526, "skip_count": 1.0, "step": 736, "text_loss": 0.4950787127017975 @@ -7009,13 +7009,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11962890625, + "grad_norm": 0.1220703125, "learning_rate": 0.000998655348912758, - "loss": 0.0514, + "loss": 0.0515, "macro_f1": 0.3333333432674408, "num_tokens": 1193035.0, "repeat_count": 0.0, - "routers_loss": 0.015889234840869904, + "routers_loss": 0.01648722216486931, "skip_count": 0.0, "step": 738, "text_loss": 0.24761848151683807 @@ -7028,13 +7028,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1513671875, "learning_rate": 0.0009986325690158051, "loss": 0.0435, "macro_f1": 0.3333333432674408, "num_tokens": 1196840.0, "repeat_count": 0.0, - "routers_loss": 0.01378484908491373, + "routers_loss": 0.013143910095095634, "skip_count": 0.0, "step": 740, "text_loss": 0.15662719309329987 @@ -7047,13 +7047,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1787109375, + "grad_norm": 0.1611328125, "learning_rate": 0.0009986095980419113, - "loss": 0.076, + "loss": 0.0757, "macro_f1": 0.3333333432674408, "num_tokens": 1200573.0, "repeat_count": 0.0, - "routers_loss": 0.02673683874309063, + "routers_loss": 0.026706280186772346, "skip_count": 0.0, "step": 742, "text_loss": 0.16725164651870728 @@ -7066,13 +7066,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.185546875, + "grad_norm": 0.1982421875, "learning_rate": 0.0009985864359998787, - "loss": 0.0778, + "loss": 0.0795, "macro_f1": 0.3006536364555359, "num_tokens": 1203589.0, "repeat_count": 2.0, - "routers_loss": 0.27776041626930237, + "routers_loss": 0.28607678413391113, "skip_count": 3.0, "step": 744, "text_loss": 0.6350882053375244 @@ -7085,13 +7085,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1513671875, + "grad_norm": 0.1474609375, "learning_rate": 0.0009985630828985835, - "loss": 0.0575, + "loss": 0.0572, "macro_f1": 0.3272727429866791, "num_tokens": 1206422.0, "repeat_count": 0.0, - "routers_loss": 0.0575483962893486, + "routers_loss": 0.05685260891914368, "skip_count": 1.0, "step": 746, "text_loss": 0.33779552578926086 @@ -7104,13 +7104,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1005859375, + "grad_norm": 0.09814453125, "learning_rate": 0.0009985395387469742, - "loss": 0.0478, + "loss": 0.0458, "macro_f1": 0.5492662787437439, "num_tokens": 1211588.0, "repeat_count": 0.0, - "routers_loss": 0.0458797849714756, + "routers_loss": 0.0437830351293087, "skip_count": 2.0, "step": 748, "text_loss": 0.28664472699165344 @@ -7123,13 +7123,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.15625, "learning_rate": 0.0009985158035540735, - "loss": 0.0701, + "loss": 0.0714, "macro_f1": 0.32098764181137085, "num_tokens": 1214580.0, "repeat_count": 2.0, - "routers_loss": 0.07850238680839539, + "routers_loss": 0.07074898481369019, "skip_count": 0.0, "step": 750, "text_loss": 0.3939313292503357 @@ -7142,13 +7142,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.21484375, "learning_rate": 0.0009984918773289762, - "loss": 0.0702, + "loss": 0.0699, "macro_f1": 0.3333333432674408, "num_tokens": 1217388.0, "repeat_count": 0.0, - "routers_loss": 0.009507967159152031, + "routers_loss": 0.009757856838405132, "skip_count": 0.0, "step": 752, "text_loss": 0.37641215324401855 @@ -7161,13 +7161,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1484375, + "grad_norm": 0.140625, "learning_rate": 0.0009984677600808512, - "loss": 0.0543, + "loss": 0.054, "macro_f1": 0.3333333432674408, "num_tokens": 1219960.0, "repeat_count": 0.0, - "routers_loss": 0.02620997279882431, + "routers_loss": 0.02515069581568241, "skip_count": 0.0, "step": 754, "text_loss": 0.155938982963562 @@ -7180,13 +7180,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3359375, + "grad_norm": 0.30078125, "learning_rate": 0.0009984434518189405, - "loss": 0.0791, + "loss": 0.0764, "macro_f1": 0.3333333432674408, "num_tokens": 1223234.0, "repeat_count": 0.0, - "routers_loss": 0.02798631228506565, + "routers_loss": 0.025766927748918533, "skip_count": 0.0, "step": 756, "text_loss": 0.691118061542511 @@ -7201,11 +7201,11 @@ "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.0009984189525525584, - "loss": 0.046, + "loss": 0.0451, "macro_f1": 0.5359477400779724, "num_tokens": 1225764.0, "repeat_count": 2.0, - "routers_loss": 0.16614431142807007, + "routers_loss": 0.1782722771167755, "skip_count": 2.0, "step": 758, "text_loss": 0.3592209219932556 @@ -7218,13 +7218,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.193359375, + "grad_norm": 0.189453125, "learning_rate": 0.0009983942622910935, - "loss": 0.0669, + "loss": 0.0659, "macro_f1": 0.3333333432674408, "num_tokens": 1230097.0, "repeat_count": 0.0, - "routers_loss": 0.008541896007955074, + "routers_loss": 0.00825568474829197, "skip_count": 0.0, "step": 760, "text_loss": 0.4646475315093994 @@ -7237,13 +7237,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.1962890625, "learning_rate": 0.0009983693810440074, - "loss": 0.0478, + "loss": 0.0477, "macro_f1": 0.32098764181137085, "num_tokens": 1233140.0, "repeat_count": 0.0, - "routers_loss": 0.045411624014377594, + "routers_loss": 0.04156976938247681, "skip_count": 2.0, "step": 762, "text_loss": 0.298682302236557 @@ -7256,13 +7256,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.380859375, + "grad_norm": 0.3515625, "learning_rate": 0.000998344308820834, - "loss": 0.0689, + "loss": 0.0666, "macro_f1": 0.3272727429866791, "num_tokens": 1236305.0, "repeat_count": 0.0, - "routers_loss": 0.052299100905656815, + "routers_loss": 0.05697929114103317, "skip_count": 1.0, "step": 764, "text_loss": 0.5249121189117432 @@ -7275,13 +7275,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.18359375, "learning_rate": 0.0009983190456311817, - "loss": 0.0602, + "loss": 0.0592, "macro_f1": 0.3144654333591461, "num_tokens": 1239673.0, "repeat_count": 0.0, - "routers_loss": 0.09140212833881378, + "routers_loss": 0.09547408670186996, "skip_count": 3.0, "step": 766, "text_loss": 0.41277334094047546 @@ -7294,13 +7294,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.201171875, + "grad_norm": 0.185546875, "learning_rate": 0.000998293591484731, - "loss": 0.0475, + "loss": 0.0484, "macro_f1": 0.5492662787437439, "num_tokens": 1242292.0, "repeat_count": 0.0, - "routers_loss": 0.030750583857297897, + "routers_loss": 0.030693158507347107, "skip_count": 2.0, "step": 768, "text_loss": 0.1583656519651413 @@ -7313,13 +7313,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16796875, + "grad_norm": 0.15234375, "learning_rate": 0.000998267946391236, - "loss": 0.052, + "loss": 0.051, "macro_f1": 0.3333333432674408, "num_tokens": 1244661.0, "repeat_count": 0.0, - "routers_loss": 0.010202950797975063, + "routers_loss": 0.01211300864815712, "skip_count": 0.0, "step": 770, "text_loss": 0.4629349112510681 @@ -7332,13 +7332,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.0927734375, "learning_rate": 0.0009982421103605238, - "loss": 0.0434, + "loss": 0.0441, "macro_f1": 0.32098764181137085, "num_tokens": 1248688.0, "repeat_count": 0.0, - "routers_loss": 0.07364192605018616, + "routers_loss": 0.0665968507528305, "skip_count": 2.0, "step": 772, "text_loss": 0.4019293785095215 @@ -7353,11 +7353,11 @@ "f1_skip": 0.0, "grad_norm": 0.2890625, "learning_rate": 0.000998216083402495, - "loss": 0.0606, + "loss": 0.0613, "macro_f1": 0.32098764181137085, "num_tokens": 1251395.0, "repeat_count": 0.0, - "routers_loss": 0.06553081423044205, + "routers_loss": 0.07186859846115112, "skip_count": 2.0, "step": 774, "text_loss": 0.4659276604652405 @@ -7370,13 +7370,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.302734375, "learning_rate": 0.0009981898655271235, - "loss": 0.0475, + "loss": 0.0488, "macro_f1": 0.3333333432674408, "num_tokens": 1254888.0, "repeat_count": 0.0, - "routers_loss": 0.008751659654080868, + "routers_loss": 0.007823926396667957, "skip_count": 0.0, "step": 776, "text_loss": 0.5160359740257263 @@ -7389,13 +7389,13 @@ "f1_execute": 0.9130434989929199, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.12060546875, + "grad_norm": 0.11962890625, "learning_rate": 0.0009981634567444557, - "loss": 0.0777, + "loss": 0.0775, "macro_f1": 0.590062141418457, "num_tokens": 1258250.0, "repeat_count": 3.0, - "routers_loss": 0.24522721767425537, + "routers_loss": 0.24624499678611755, "skip_count": 4.0, "step": 778, "text_loss": 0.29319918155670166 @@ -7408,13 +7408,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.263671875, "learning_rate": 0.0009981368570646115, "loss": 0.0885, "macro_f1": 0.3272727429866791, "num_tokens": 1260916.0, "repeat_count": 0.0, - "routers_loss": 0.03767623379826546, + "routers_loss": 0.030730176717042923, "skip_count": 1.0, "step": 780, "text_loss": 0.624981164932251 @@ -7427,13 +7427,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.142578125, "learning_rate": 0.0009981100664977838, - "loss": 0.0708, + "loss": 0.0699, "macro_f1": 0.3333333432674408, "num_tokens": 1264004.0, "repeat_count": 0.0, - "routers_loss": 0.006098059006035328, + "routers_loss": 0.006829176563769579, "skip_count": 0.0, "step": 782, "text_loss": 0.6137266159057617 @@ -7446,13 +7446,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1748046875, "learning_rate": 0.0009980830850542391, - "loss": 0.0589, + "loss": 0.058, "macro_f1": 0.3333333432674408, "num_tokens": 1267130.0, "repeat_count": 0.0, - "routers_loss": 0.01731623336672783, + "routers_loss": 0.018471000716090202, "skip_count": 0.0, "step": 784, "text_loss": 0.15213175117969513 @@ -7465,13 +7465,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2294921875, + "grad_norm": 0.2353515625, "learning_rate": 0.0009980559127443166, - "loss": 0.0526, + "loss": 0.052, "macro_f1": 0.3333333432674408, "num_tokens": 1271129.0, "repeat_count": 0.0, - "routers_loss": 0.0076471962966024876, + "routers_loss": 0.007903140969574451, "skip_count": 0.0, "step": 786, "text_loss": 0.5768613219261169 @@ -7484,13 +7484,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12353515625, + "grad_norm": 0.130859375, "learning_rate": 0.000998028549578429, - "loss": 0.0745, + "loss": 0.0719, "macro_f1": 0.307692289352417, "num_tokens": 1274232.0, "repeat_count": 0.0, - "routers_loss": 0.0637628585100174, + "routers_loss": 0.06737866252660751, "skip_count": 3.0, "step": 788, "text_loss": 0.2877073585987091 @@ -7503,13 +7503,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1748046875, "learning_rate": 0.0009980009955670615, - "loss": 0.0699, + "loss": 0.0698, "macro_f1": 0.3144654333591461, "num_tokens": 1277193.0, "repeat_count": 0.0, - "routers_loss": 0.10882514715194702, + "routers_loss": 0.10194934904575348, "skip_count": 3.0, "step": 790, "text_loss": 0.11860492825508118 @@ -7522,13 +7522,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1298828125, + "grad_norm": 0.126953125, "learning_rate": 0.000997973250720773, - "loss": 0.056, + "loss": 0.0552, "macro_f1": 0.32098764181137085, "num_tokens": 1280960.0, "repeat_count": 0.0, - "routers_loss": 0.10924118757247925, + "routers_loss": 0.10297708213329315, "skip_count": 2.0, "step": 792, "text_loss": 0.13477706909179688 @@ -7541,13 +7541,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1533203125, + "grad_norm": 0.1611328125, "learning_rate": 0.0009979453150501954, - "loss": 0.0664, + "loss": 0.0663, "macro_f1": 0.32098764181137085, "num_tokens": 1284611.0, "repeat_count": 1.0, - "routers_loss": 0.06571807712316513, + "routers_loss": 0.06122037023305893, "skip_count": 1.0, "step": 794, "text_loss": 0.40569379925727844 @@ -7560,13 +7560,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1181640625, + "grad_norm": 0.1279296875, "learning_rate": 0.000997917188566034, - "loss": 0.0616, + "loss": 0.062, "macro_f1": 0.32098764181137085, "num_tokens": 1287834.0, "repeat_count": 0.0, - "routers_loss": 0.058966971933841705, + "routers_loss": 0.061135001480579376, "skip_count": 2.0, "step": 796, "text_loss": 0.2829287648200989 @@ -7579,32 +7579,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.109375, "learning_rate": 0.0009978888712790664, - "loss": 0.067, + "loss": 0.0654, "macro_f1": 0.3272727429866791, "num_tokens": 1291666.0, "repeat_count": 0.0, - "routers_loss": 0.04844636470079422, + "routers_loss": 0.04841872677206993, "skip_count": 1.0, "step": 798, "text_loss": 1.011757254600525 }, { "acc_repeat": 0.0, - "acc_skip": 0.4000000059604645, - "avg_layers": 26.0, + "acc_skip": 0.20000000298023224, + "avg_layers": 27.0, "epoch": 3.756090402113296, - "f1_execute": 0.9166666865348816, + "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, - "f1_skip": 0.5714285969734192, - "grad_norm": 0.1416015625, + "f1_skip": 0.3333333134651184, + "grad_norm": 0.14453125, "learning_rate": 0.0009978603632001444, - "loss": 0.0634, - "macro_f1": 0.4960317611694336, + "loss": 0.0636, + "macro_f1": 0.4104308485984802, "num_tokens": 1294627.0, "repeat_count": 1.0, - "routers_loss": 0.1591777801513672, + "routers_loss": 0.15698759257793427, "skip_count": 5.0, "step": 800, "text_loss": 0.4457623362541199 @@ -7617,13 +7617,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.283203125, "learning_rate": 0.0009978316643401916, - "loss": 0.0694, + "loss": 0.0688, "macro_f1": 0.3333333432674408, "num_tokens": 1297711.0, "repeat_count": 0.0, - "routers_loss": 0.017735568806529045, + "routers_loss": 0.018952010199427605, "skip_count": 0.0, "step": 802, "text_loss": 0.2069481462240219 @@ -7636,13 +7636,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.14453125, "learning_rate": 0.0009978027747102062, - "loss": 0.0477, + "loss": 0.0479, "macro_f1": 0.3333333432674408, "num_tokens": 1300569.0, "repeat_count": 0.0, - "routers_loss": 0.012401525862514973, + "routers_loss": 0.014538386836647987, "skip_count": 0.0, "step": 804, "text_loss": 0.4983852505683899 @@ -7655,13 +7655,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2080078125, + "grad_norm": 0.2109375, "learning_rate": 0.0009977736943212584, - "loss": 0.0735, + "loss": 0.0721, "macro_f1": 0.32098764181137085, "num_tokens": 1303969.0, "repeat_count": 0.0, - "routers_loss": 0.10736164450645447, + "routers_loss": 0.11164087057113647, "skip_count": 2.0, "step": 806, "text_loss": 0.2910642921924591 @@ -7674,13 +7674,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2001953125, + "grad_norm": 0.1826171875, "learning_rate": 0.000997744423184492, - "loss": 0.0428, + "loss": 0.0424, "macro_f1": 0.3272727429866791, "num_tokens": 1307263.0, "repeat_count": 0.0, - "routers_loss": 0.0595436617732048, + "routers_loss": 0.06073406711220741, "skip_count": 1.0, "step": 808, "text_loss": 0.18831779062747955 @@ -7693,13 +7693,13 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.240234375, + "grad_norm": 0.26171875, "learning_rate": 0.0009977149613111236, - "loss": 0.0494, + "loss": 0.0486, "macro_f1": 0.4400000274181366, "num_tokens": 1309953.0, "repeat_count": 1.0, - "routers_loss": 0.12617000937461853, + "routers_loss": 0.11035524308681488, "skip_count": 4.0, "step": 810, "text_loss": 0.7872759699821472 @@ -7712,13 +7712,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1669921875, + "grad_norm": 0.1650390625, "learning_rate": 0.0009976853087124433, - "loss": 0.0537, + "loss": 0.0536, "macro_f1": 0.3333333432674408, "num_tokens": 1313243.0, "repeat_count": 0.0, - "routers_loss": 0.021242506802082062, + "routers_loss": 0.021804286167025566, "skip_count": 0.0, "step": 812, "text_loss": 0.22349292039871216 @@ -7731,13 +7731,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.318359375, + "grad_norm": 0.28125, "learning_rate": 0.0009976554653998138, - "loss": 0.0617, + "loss": 0.0612, "macro_f1": 0.31446540355682373, "num_tokens": 1316165.0, "repeat_count": 0.0, - "routers_loss": 0.10387415438890457, + "routers_loss": 0.10715524107217789, "skip_count": 2.0, "step": 814, "text_loss": 0.18035532534122467 @@ -7750,13 +7750,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.1279296875, "learning_rate": 0.000997625431384671, - "loss": 0.0565, + "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1319206.0, "repeat_count": 0.0, - "routers_loss": 0.007816939614713192, + "routers_loss": 0.007173649035394192, "skip_count": 0.0, "step": 816, "text_loss": 0.48928648233413696 @@ -7769,13 +7769,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.1357421875, "learning_rate": 0.0009975952066785243, - "loss": 0.0654, + "loss": 0.0655, "macro_f1": 0.3006536364555359, "num_tokens": 1322549.0, "repeat_count": 1.0, - "routers_loss": 0.22526368498802185, + "routers_loss": 0.22308112680912018, "skip_count": 4.0, "step": 818, "text_loss": 0.5211259722709656 @@ -7788,13 +7788,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.1337890625, "learning_rate": 0.0009975647912929557, - "loss": 0.056, + "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1325213.0, "repeat_count": 0.0, - "routers_loss": 0.010998851619660854, + "routers_loss": 0.00998698640614748, "skip_count": 0.0, "step": 820, "text_loss": 0.7117052674293518 @@ -7807,13 +7807,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.15234375, "learning_rate": 0.0009975341852396205, - "loss": 0.0712, + "loss": 0.0723, "macro_f1": 0.32098764181137085, "num_tokens": 1328383.0, "repeat_count": 0.0, - "routers_loss": 0.07115054875612259, + "routers_loss": 0.07454588264226913, "skip_count": 2.0, "step": 822, "text_loss": 0.34539610147476196 @@ -7826,13 +7826,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1630859375, "learning_rate": 0.0009975033885302469, - "loss": 0.0611, + "loss": 0.0604, "macro_f1": 0.3333333432674408, "num_tokens": 1331406.0, "repeat_count": 0.0, - "routers_loss": 0.008062695153057575, + "routers_loss": 0.009157589636743069, "skip_count": 0.0, "step": 824, "text_loss": 0.7484824657440186 @@ -7845,13 +7845,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1923828125, "learning_rate": 0.0009974724011766363, - "loss": 0.0496, + "loss": 0.0474, "macro_f1": 0.3272727429866791, "num_tokens": 1334410.0, "repeat_count": 1.0, - "routers_loss": 0.16666285693645477, + "routers_loss": 0.17149391770362854, "skip_count": 0.0, "step": 826, "text_loss": 0.5913820266723633 @@ -7864,13 +7864,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1708984375, + "grad_norm": 0.1884765625, "learning_rate": 0.0009974412231906632, - "loss": 0.0567, + "loss": 0.058, "macro_f1": 0.32098764181137085, "num_tokens": 1337653.0, "repeat_count": 1.0, - "routers_loss": 0.0908689796924591, + "routers_loss": 0.09743282198905945, "skip_count": 1.0, "step": 828, "text_loss": 0.2505693733692169 @@ -7883,13 +7883,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16015625, + "grad_norm": 0.1533203125, "learning_rate": 0.0009974098545842748, - "loss": 0.0648, + "loss": 0.0638, "macro_f1": 0.3272727429866791, "num_tokens": 1340860.0, "repeat_count": 0.0, - "routers_loss": 0.04364728182554245, + "routers_loss": 0.041490405797958374, "skip_count": 1.0, "step": 830, "text_loss": 0.5585370063781738 @@ -7897,18 +7897,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 3.906369239800411, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2060546875, + "grad_norm": 0.193359375, "learning_rate": 0.0009973782953694918, - "loss": 0.0772, - "macro_f1": 0.3076923191547394, - "num_tokens": 1344232.0, + "loss": 0.0746, + "macro_f1": 0.3006536066532135, + "num_tokens": 1344232.0, "repeat_count": 1.0, - "routers_loss": 0.15315109491348267, + "routers_loss": 0.16080693900585175, "skip_count": 3.0, "step": 832, "text_loss": 0.4782734513282776 @@ -7921,13 +7921,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.1298828125, "learning_rate": 0.000997346545558408, - "loss": 0.0527, + "loss": 0.0522, "macro_f1": 0.3333333432674408, "num_tokens": 1347667.0, "repeat_count": 0.0, - "routers_loss": 0.01342768594622612, + "routers_loss": 0.01173500344157219, "skip_count": 0.0, "step": 834, "text_loss": 0.25036177039146423 @@ -7940,13 +7940,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1748046875, + "grad_norm": 0.173828125, "learning_rate": 0.0009973146051631895, - "loss": 0.0513, + "loss": 0.0522, "macro_f1": 0.3333333432674408, "num_tokens": 1350707.0, "repeat_count": 0.0, - "routers_loss": 0.01158806961029768, + "routers_loss": 0.011477196589112282, "skip_count": 0.0, "step": 836, "text_loss": 0.5482863187789917 @@ -7959,13 +7959,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1650390625, "learning_rate": 0.0009972824741960764, - "loss": 0.0549, + "loss": 0.0536, "macro_f1": 0.3333333432674408, "num_tokens": 1353704.0, "repeat_count": 0.0, - "routers_loss": 0.01255605649203062, + "routers_loss": 0.010528896935284138, "skip_count": 0.0, "step": 838, "text_loss": 0.6732596158981323 @@ -7978,13 +7978,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12255859375, + "grad_norm": 0.1181640625, "learning_rate": 0.000997250152669381, - "loss": 0.0578, + "loss": 0.0573, "macro_f1": 0.3333333432674408, "num_tokens": 1356608.0, "repeat_count": 0.0, - "routers_loss": 0.010225459933280945, + "routers_loss": 0.010678744874894619, "skip_count": 0.0, "step": 840, "text_loss": 0.5479338765144348 @@ -7997,13 +7997,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.181640625, "learning_rate": 0.000997217640595489, - "loss": 0.0633, + "loss": 0.0631, "macro_f1": 0.3333333432674408, "num_tokens": 1359809.0, "repeat_count": 0.0, - "routers_loss": 0.007837744429707527, + "routers_loss": 0.00835978239774704, "skip_count": 0.0, "step": 842, "text_loss": 0.42543259263038635 @@ -8016,13 +8016,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.1923828125, "learning_rate": 0.0009971849379868593, - "loss": 0.0674, + "loss": 0.0653, "macro_f1": 0.3333333432674408, "num_tokens": 1362201.0, "repeat_count": 0.0, - "routers_loss": 0.008631376549601555, + "routers_loss": 0.009930923581123352, "skip_count": 0.0, "step": 844, "text_loss": 0.720462441444397 @@ -8035,13 +8035,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.1123046875, "learning_rate": 0.0009971520448560235, - "loss": 0.0612, + "loss": 0.0615, "macro_f1": 0.3272727429866791, "num_tokens": 1365790.0, "repeat_count": 0.0, - "routers_loss": 0.06206027418375015, + "routers_loss": 0.06344373524188995, "skip_count": 1.0, "step": 846, "text_loss": 0.8423607349395752 @@ -8049,18 +8049,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 3.9815086586439685, - "f1_execute": 0.9411765336990356, + "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, - "f1_skip": 0.5, - "grad_norm": 0.16015625, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.16796875, "learning_rate": 0.000997118961215586, - "loss": 0.0678, - "macro_f1": 0.480392187833786, + "loss": 0.0674, + "macro_f1": 0.4533333480358124, "num_tokens": 1368387.0, "repeat_count": 1.0, - "routers_loss": 0.1463794708251953, + "routers_loss": 0.14688406884670258, "skip_count": 3.0, "step": 848, "text_loss": 0.3933577537536621 @@ -8073,13 +8073,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.263671875, "learning_rate": 0.000997085687078225, - "loss": 0.052, + "loss": 0.0518, "macro_f1": 0.3333333432674408, "num_tokens": 1371189.0, "repeat_count": 0.0, - "routers_loss": 0.01140492781996727, + "routers_loss": 0.009953443892300129, "skip_count": 0.0, "step": 850, "text_loss": 0.41469162702560425 @@ -8092,13 +8092,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.15625, "learning_rate": 0.0009970522224566909, - "loss": 0.0563, + "loss": 0.0555, "macro_f1": 0.32098767161369324, "num_tokens": 1374008.0, "repeat_count": 0.0, - "routers_loss": 0.05136030167341232, + "routers_loss": 0.048870690166950226, "skip_count": 1.0, "step": 852, "text_loss": 0.613615870475769 @@ -8111,32 +8111,32 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25390625, + "grad_norm": 0.283203125, "learning_rate": 0.0009970185673638075, - "loss": 0.0627, + "loss": 0.0629, "macro_f1": 0.32098764181137085, "num_tokens": 1376662.0, "repeat_count": 1.0, - "routers_loss": 0.07274381071329117, + "routers_loss": 0.06865929812192917, "skip_count": 1.0, "step": 854, "text_loss": 0.4392736256122589 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 4.01878485471089, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.162109375, "learning_rate": 0.0009969847218124716, - "loss": 0.0503, - "macro_f1": 0.3272727429866791, + "loss": 0.0506, + "macro_f1": 0.5492662787437439, "num_tokens": 1380049.0, "repeat_count": 0.0, - "routers_loss": 0.024335317313671112, + "routers_loss": 0.02382219396531582, "skip_count": 1.0, "step": 856, "text_loss": 0.19115346670150757 @@ -8149,13 +8149,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.240234375, + "grad_norm": 0.1884765625, "learning_rate": 0.0009969506858156527, - "loss": 0.0359, + "loss": 0.0344, "macro_f1": 0.3272727429866791, "num_tokens": 1383008.0, "repeat_count": 0.0, - "routers_loss": 0.046614740043878555, + "routers_loss": 0.03907281160354614, "skip_count": 1.0, "step": 858, "text_loss": 0.34842637181282043 @@ -8168,13 +8168,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11181640625, + "grad_norm": 0.12060546875, "learning_rate": 0.0009969164593863935, - "loss": 0.0372, + "loss": 0.0365, "macro_f1": 0.3333333432674408, "num_tokens": 1387051.0, "repeat_count": 0.0, - "routers_loss": 0.006380240898579359, + "routers_loss": 0.007645803038030863, "skip_count": 0.0, "step": 860, "text_loss": 0.3810436725616455 @@ -8187,13 +8187,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.1484375, "learning_rate": 0.0009968820425378098, - "loss": 0.0473, + "loss": 0.0463, "macro_f1": 0.3272727429866791, "num_tokens": 1390244.0, "repeat_count": 1.0, - "routers_loss": 0.04770716652274132, + "routers_loss": 0.04435238987207413, "skip_count": 0.0, "step": 862, "text_loss": 0.34853485226631165 @@ -8206,32 +8206,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3359375, + "grad_norm": 0.28515625, "learning_rate": 0.00099684743528309, - "loss": 0.0434, + "loss": 0.0424, "macro_f1": 0.3333333432674408, "num_tokens": 1392976.0, "repeat_count": 0.0, - "routers_loss": 0.006983708590269089, + "routers_loss": 0.006071661598980427, "skip_count": 0.0, "step": 864, "text_loss": 0.6395178437232971 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 4.065746991488113, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.080078125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0810546875, "learning_rate": 0.0009968126376354958, - "loss": 0.0476, - "macro_f1": 0.32098764181137085, + "loss": 0.0477, + "macro_f1": 0.5492662787437439, "num_tokens": 1396061.0, "repeat_count": 0.0, - "routers_loss": 0.046313900500535965, + "routers_loss": 0.05011235550045967, "skip_count": 2.0, "step": 866, "text_loss": 0.09103966504335403 @@ -8244,32 +8244,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.154296875, "learning_rate": 0.0009967776496083616, "loss": 0.0509, "macro_f1": 0.3272727429866791, "num_tokens": 1398993.0, "repeat_count": 1.0, - "routers_loss": 0.0401870422065258, + "routers_loss": 0.03979124873876572, "skip_count": 0.0, "step": 868, "text_loss": 0.27257058024406433 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 4.084531846199002, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.14453125, "learning_rate": 0.000996742471215095, - "loss": 0.0505, - "macro_f1": 0.32098764181137085, + "loss": 0.0516, + "macro_f1": 0.5492662787437439, "num_tokens": 1402080.0, "repeat_count": 0.0, - "routers_loss": 0.03313451260328293, + "routers_loss": 0.030823837965726852, "skip_count": 2.0, "step": 870, "text_loss": 0.7047103047370911 @@ -8282,13 +8282,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16796875, + "grad_norm": 0.1611328125, "learning_rate": 0.0009967071024691763, - "loss": 0.0468, + "loss": 0.0461, "macro_f1": 0.3333333432674408, "num_tokens": 1404890.0, "repeat_count": 0.0, - "routers_loss": 0.010118982754647732, + "routers_loss": 0.009721715934574604, "skip_count": 0.0, "step": 872, "text_loss": 0.959106981754303 @@ -8301,13 +8301,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1142578125, "learning_rate": 0.000996671543384159, - "loss": 0.0498, + "loss": 0.05, "macro_f1": 0.3333333432674408, "num_tokens": 1407853.0, "repeat_count": 0.0, - "routers_loss": 0.005856200121343136, + "routers_loss": 0.006025883834809065, "skip_count": 0.0, "step": 874, "text_loss": 0.47571972012519836 @@ -8320,13 +8320,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.09765625, "learning_rate": 0.0009966357939736692, - "loss": 0.0417, + "loss": 0.0416, "macro_f1": 0.3272727429866791, "num_tokens": 1410723.0, "repeat_count": 0.0, - "routers_loss": 0.02768322452902794, + "routers_loss": 0.025964925065636635, "skip_count": 0.0, "step": 876, "text_loss": 0.4964611530303955 @@ -8339,13 +8339,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1025390625, + "grad_norm": 0.09423828125, "learning_rate": 0.0009965998542514065, - "loss": 0.0419, + "loss": 0.0415, "macro_f1": 0.32098764181137085, "num_tokens": 1414008.0, "repeat_count": 0.0, - "routers_loss": 0.09382032603025436, + "routers_loss": 0.09509637206792831, "skip_count": 2.0, "step": 878, "text_loss": 0.621494710445404 @@ -8358,32 +8358,32 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.103515625, + "grad_norm": 0.11083984375, "learning_rate": 0.0009965637242311427, - "loss": 0.0466, + "loss": 0.0472, "macro_f1": 0.542222261428833, "num_tokens": 1417447.0, "repeat_count": 0.0, - "routers_loss": 0.026867631822824478, + "routers_loss": 0.02520318515598774, "skip_count": 4.0, "step": 880, "text_loss": 0.40209758281707764 }, { "acc_repeat": 0.0, - "acc_skip": 0.6666666865348816, - "avg_layers": 24.0, + "acc_skip": 0.5, + "avg_layers": 25.0, "epoch": 4.14088641033167, - "f1_execute": 0.95652174949646, + "f1_execute": 0.936170220375061, "f1_repeat": 0.0, - "f1_skip": 0.800000011920929, - "grad_norm": 0.26171875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.263671875, "learning_rate": 0.000996527403926723, - "loss": 0.0496, - "macro_f1": 0.5855072736740112, + "loss": 0.0495, + "macro_f1": 0.5342789888381958, "num_tokens": 1419905.0, "repeat_count": 0.0, - "routers_loss": 0.12731307744979858, + "routers_loss": 0.13183781504631042, "skip_count": 6.0, "step": 882, "text_loss": 0.642185389995575 @@ -8396,13 +8396,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.1201171875, "learning_rate": 0.0009964908933520655, - "loss": 0.039, + "loss": 0.0375, "macro_f1": 0.3333333432674408, "num_tokens": 1423436.0, "repeat_count": 0.0, - "routers_loss": 0.008483970537781715, + "routers_loss": 0.009429510682821274, "skip_count": 0.0, "step": 884, "text_loss": 0.48232755064964294 @@ -8415,13 +8415,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.18359375, + "grad_norm": 0.1669921875, "learning_rate": 0.0009964541925211613, - "loss": 0.0348, + "loss": 0.0349, "macro_f1": 0.32098764181137085, "num_tokens": 1426842.0, "repeat_count": 0.0, - "routers_loss": 0.07847871631383896, + "routers_loss": 0.07629609107971191, "skip_count": 2.0, "step": 886, "text_loss": 0.16620934009552002 @@ -8434,13 +8434,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09716796875, + "grad_norm": 0.0927734375, "learning_rate": 0.0009964173014480738, - "loss": 0.036, + "loss": 0.0348, "macro_f1": 0.5492662787437439, "num_tokens": 1430430.0, "repeat_count": 0.0, - "routers_loss": 0.04574459046125412, + "routers_loss": 0.036814019083976746, "skip_count": 2.0, "step": 888, "text_loss": 0.4866008758544922 @@ -8453,13 +8453,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10595703125, + "grad_norm": 0.1123046875, "learning_rate": 0.0009963802201469398, - "loss": 0.0485, + "loss": 0.0476, "macro_f1": 0.3333333432674408, "num_tokens": 1433821.0, "repeat_count": 0.0, - "routers_loss": 0.004683624487370253, + "routers_loss": 0.0041250260546803474, "skip_count": 0.0, "step": 890, "text_loss": 0.578216552734375 @@ -8472,13 +8472,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2158203125, + "grad_norm": 0.2373046875, "learning_rate": 0.0009963429486319693, - "loss": 0.0476, + "loss": 0.0463, "macro_f1": 0.32098764181137085, "num_tokens": 1436976.0, "repeat_count": 0.0, - "routers_loss": 0.06499828398227692, + "routers_loss": 0.06213559955358505, "skip_count": 2.0, "step": 892, "text_loss": 0.221701517701149 @@ -8486,18 +8486,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.5, - "avg_layers": 25.0, + "avg_layers": 26.0, "epoch": 4.197240974464338, - "f1_execute": 0.9411764740943909, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.4000000059604645, - "grad_norm": 0.310546875, + "f1_skip": 0.5, + "grad_norm": 0.361328125, "learning_rate": 0.0009963054869174446, - "loss": 0.0326, - "macro_f1": 0.44705885648727417, + "loss": 0.0313, + "macro_f1": 0.4871794879436493, "num_tokens": 1440397.0, "repeat_count": 0.0, - "routers_loss": 0.08285653591156006, + "routers_loss": 0.07532428950071335, "skip_count": 2.0, "step": 894, "text_loss": 0.6922838091850281 @@ -8510,13 +8510,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.154296875, + "grad_norm": 0.1572265625, "learning_rate": 0.0009962678350177209, - "loss": 0.0497, + "loss": 0.0472, "macro_f1": 0.3272727429866791, "num_tokens": 1443604.0, "repeat_count": 0.0, - "routers_loss": 0.04252336546778679, + "routers_loss": 0.0419243648648262, "skip_count": 1.0, "step": 896, "text_loss": 0.22092342376708984 @@ -8524,18 +8524,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 4.216025829175227, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10302734375, + "grad_norm": 0.1015625, "learning_rate": 0.0009962299929472268, - "loss": 0.0349, - "macro_f1": 0.31446540355682373, + "loss": 0.034, + "macro_f1": 0.32098764181137085, "num_tokens": 1446257.0, "repeat_count": 2.0, - "routers_loss": 0.126711905002594, + "routers_loss": 0.10849297791719437, "skip_count": 0.0, "step": 898, "text_loss": 0.26394811272621155 @@ -8548,13 +8548,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10546875, + "grad_norm": 0.10205078125, "learning_rate": 0.000996191960720463, - "loss": 0.0392, + "loss": 0.0394, "macro_f1": 0.3333333432674408, "num_tokens": 1449669.0, "repeat_count": 0.0, - "routers_loss": 0.00955706462264061, + "routers_loss": 0.0092767970636487, "skip_count": 0.0, "step": 900, "text_loss": 0.5338577628135681 @@ -8567,13 +8567,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.154296875, "learning_rate": 0.0009961537383520042, - "loss": 0.0377, + "loss": 0.0354, "macro_f1": 0.3272727429866791, "num_tokens": 1452450.0, "repeat_count": 1.0, - "routers_loss": 0.03127318620681763, + "routers_loss": 0.02985367365181446, "skip_count": 0.0, "step": 902, "text_loss": 0.5875228047370911 @@ -8586,13 +8586,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.10205078125, "learning_rate": 0.0009961153258564966, - "loss": 0.0389, + "loss": 0.0378, "macro_f1": 0.3144654333591461, "num_tokens": 1456909.0, "repeat_count": 0.0, - "routers_loss": 0.06743519753217697, + "routers_loss": 0.06794842332601547, "skip_count": 3.0, "step": 904, "text_loss": 0.40959444642066956 @@ -8605,13 +8605,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009960767232486604, - "loss": 0.0477, + "loss": 0.0476, "macro_f1": 0.3333333432674408, "num_tokens": 1461712.0, "repeat_count": 0.0, - "routers_loss": 0.0025313226506114006, + "routers_loss": 0.0023562447167932987, "skip_count": 0.0, "step": 906, "text_loss": 0.3932875096797943 @@ -8624,13 +8624,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.08203125, "learning_rate": 0.000996037930543288, - "loss": 0.052, + "loss": 0.0505, "macro_f1": 0.3272727429866791, "num_tokens": 1464817.0, "repeat_count": 0.0, - "routers_loss": 0.037147488445043564, + "routers_loss": 0.03880339860916138, "skip_count": 1.0, "step": 908, "text_loss": 0.17482402920722961 @@ -8643,13 +8643,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.2119140625, "learning_rate": 0.000995998947755245, - "loss": 0.0501, + "loss": 0.0479, "macro_f1": 0.3272727429866791, "num_tokens": 1467810.0, "repeat_count": 0.0, - "routers_loss": 0.021232586354017258, + "routers_loss": 0.01736828312277794, "skip_count": 1.0, "step": 910, "text_loss": 0.4140470325946808 @@ -8662,13 +8662,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.169921875, "learning_rate": 0.0009959597748994695, - "loss": 0.0759, + "loss": 0.0752, "macro_f1": 0.3333333432674408, "num_tokens": 1470802.0, "repeat_count": 0.0, - "routers_loss": 0.010563847608864307, + "routers_loss": 0.011824851855635643, "skip_count": 0.0, "step": 912, "text_loss": 0.7153383493423462 @@ -8681,13 +8681,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1455078125, "learning_rate": 0.0009959204119909726, - "loss": 0.0425, + "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1474539.0, "repeat_count": 0.0, - "routers_loss": 0.0267612524330616, + "routers_loss": 0.025456594303250313, "skip_count": 0.0, "step": 914, "text_loss": 0.42812058329582214 @@ -8700,13 +8700,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1533203125, + "grad_norm": 0.142578125, "learning_rate": 0.0009958808590448385, - "loss": 0.0501, + "loss": 0.0489, "macro_f1": 0.3333333432674408, "num_tokens": 1477552.0, "repeat_count": 0.0, - "routers_loss": 0.005838244222104549, + "routers_loss": 0.006795851048082113, "skip_count": 0.0, "step": 916, "text_loss": 0.5402814149856567 @@ -8719,13 +8719,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.1083984375, "learning_rate": 0.0009958411160762234, - "loss": 0.0383, + "loss": 0.039, "macro_f1": 0.3333333432674408, "num_tokens": 1482547.0, "repeat_count": 0.0, - "routers_loss": 0.014642171561717987, + "routers_loss": 0.015615932643413544, "skip_count": 0.0, "step": 918, "text_loss": 0.3836168050765991 @@ -8738,32 +8738,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08984375, "learning_rate": 0.0009958011831003577, - "loss": 0.0457, + "loss": 0.0448, "macro_f1": 0.3272727429866791, "num_tokens": 1485807.0, "repeat_count": 0.0, - "routers_loss": 0.04119620472192764, + "routers_loss": 0.043541423976421356, "skip_count": 1.0, "step": 920, "text_loss": 0.4333936274051666 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 4.328734957440563, - "f1_execute": 0.943396270275116, - "f1_repeat": 0.0, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.154296875, + "grad_norm": 0.1337890625, "learning_rate": 0.000995761060132543, - "loss": 0.0433, - "macro_f1": 0.3144654333591461, + "loss": 0.0418, + "macro_f1": 0.6538461446762085, "num_tokens": 1488941.0, "repeat_count": 1.0, - "routers_loss": 0.06713195145130157, + "routers_loss": 0.05866432189941406, "skip_count": 2.0, "step": 922, "text_loss": 0.4106994867324829 @@ -8776,13 +8776,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1572265625, + "grad_norm": 0.1630859375, "learning_rate": 0.0009957207471881552, - "loss": 0.0533, + "loss": 0.0531, "macro_f1": 0.5492662787437439, "num_tokens": 1492026.0, "repeat_count": 0.0, - "routers_loss": 0.024023180827498436, + "routers_loss": 0.02714901603758335, "skip_count": 2.0, "step": 924, "text_loss": 0.542091429233551 @@ -8795,13 +8795,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.17578125, + "grad_norm": 0.1796875, "learning_rate": 0.0009956802442826415, - "loss": 0.0373, + "loss": 0.0386, "macro_f1": 0.3272727429866791, "num_tokens": 1494543.0, "repeat_count": 1.0, - "routers_loss": 0.05399841442704201, + "routers_loss": 0.0563737191259861, "skip_count": 0.0, "step": 926, "text_loss": 0.47209203243255615 @@ -8814,13 +8814,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1259765625, "learning_rate": 0.0009956395514315235, - "loss": 0.0488, + "loss": 0.0496, "macro_f1": 0.3272727429866791, "num_tokens": 1497831.0, "repeat_count": 1.0, - "routers_loss": 0.0299264844506979, + "routers_loss": 0.03285066783428192, "skip_count": 0.0, "step": 928, "text_loss": 0.6628931164741516 @@ -8833,13 +8833,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.154296875, "learning_rate": 0.0009955986686503943, - "loss": 0.0467, + "loss": 0.0466, "macro_f1": 0.3272727429866791, "num_tokens": 1501375.0, "repeat_count": 0.0, - "routers_loss": 0.023478010669350624, + "routers_loss": 0.024297121912240982, "skip_count": 1.0, "step": 930, "text_loss": 0.495676189661026 @@ -8852,13 +8852,13 @@ "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.1103515625, + "grad_norm": 0.11181640625, "learning_rate": 0.0009955575959549202, - "loss": 0.0447, + "loss": 0.0424, "macro_f1": 0.7795917987823486, "num_tokens": 1504363.0, "repeat_count": 1.0, - "routers_loss": 0.12116194516420364, + "routers_loss": 0.12196464836597443, "skip_count": 4.0, "step": 932, "text_loss": 0.26123273372650146 @@ -8871,13 +8871,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1708984375, "learning_rate": 0.0009955163333608408, - "loss": 0.053, + "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 1507178.0, "repeat_count": 0.0, - "routers_loss": 0.011879723519086838, + "routers_loss": 0.012947078794240952, "skip_count": 0.0, "step": 934, "text_loss": 0.32552677392959595 @@ -8890,13 +8890,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.154296875, "learning_rate": 0.0009954748808839674, - "loss": 0.0373, + "loss": 0.0379, "macro_f1": 0.3333333432674408, "num_tokens": 1509910.0, "repeat_count": 0.0, - "routers_loss": 0.009245929308235645, + "routers_loss": 0.008946365676820278, "skip_count": 0.0, "step": 936, "text_loss": 0.533141016960144 @@ -8909,13 +8909,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.140625, "learning_rate": 0.000995433238540185, - "loss": 0.0461, + "loss": 0.0466, "macro_f1": 0.6538461446762085, "num_tokens": 1512826.0, "repeat_count": 1.0, - "routers_loss": 0.032464127987623215, + "routers_loss": 0.029975678771734238, "skip_count": 1.0, "step": 938, "text_loss": 0.2953577935695648 @@ -8928,13 +8928,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.10888671875, "learning_rate": 0.0009953914063454512, - "loss": 0.0515, + "loss": 0.0497, "macro_f1": 0.3144654333591461, "num_tokens": 1517230.0, "repeat_count": 1.0, - "routers_loss": 0.08835392445325851, + "routers_loss": 0.0889134630560875, "skip_count": 2.0, "step": 940, "text_loss": 0.5368834733963013 @@ -8947,13 +8947,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.193359375, "learning_rate": 0.000995349384315796, - "loss": 0.0405, + "loss": 0.0413, "macro_f1": 0.3333333432674408, "num_tokens": 1519876.0, "repeat_count": 0.0, - "routers_loss": 0.014307246543467045, + "routers_loss": 0.013458753935992718, "skip_count": 0.0, "step": 942, "text_loss": 0.2005518227815628 @@ -8966,13 +8966,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1357421875, "learning_rate": 0.000995307172467322, - "loss": 0.0449, + "loss": 0.0444, "macro_f1": 0.31446540355682373, "num_tokens": 1522998.0, "repeat_count": 1.0, - "routers_loss": 0.10261563211679459, + "routers_loss": 0.08850377053022385, "skip_count": 1.0, "step": 944, "text_loss": 0.227926567196846 @@ -8985,13 +8985,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.1435546875, "learning_rate": 0.0009952647708162054, - "loss": 0.0507, + "loss": 0.0503, "macro_f1": 0.3272727429866791, "num_tokens": 1527100.0, "repeat_count": 0.0, - "routers_loss": 0.03316422924399376, + "routers_loss": 0.03199794515967369, "skip_count": 1.0, "step": 946, "text_loss": 0.4859686493873596 @@ -9004,13 +9004,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1748046875, "learning_rate": 0.0009952221793786942, - "loss": 0.0352, + "loss": 0.0354, "macro_f1": 0.3333333432674408, "num_tokens": 1530028.0, "repeat_count": 0.0, - "routers_loss": 0.00902469176799059, + "routers_loss": 0.006507779937237501, "skip_count": 0.0, "step": 948, "text_loss": 0.6855354905128479 @@ -9023,13 +9023,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.10986328125, "learning_rate": 0.0009951793981711097, - "loss": 0.0581, + "loss": 0.0584, "macro_f1": 0.6538461446762085, "num_tokens": 1533254.0, "repeat_count": 1.0, - "routers_loss": 0.06710167229175568, + "routers_loss": 0.06175103038549423, "skip_count": 1.0, "step": 950, "text_loss": 0.7590400576591492 @@ -9042,13 +9042,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1015625, + "grad_norm": 0.1025390625, "learning_rate": 0.0009951364272098458, - "loss": 0.0294, + "loss": 0.0295, "macro_f1": 0.5492662787437439, "num_tokens": 1536239.0, "repeat_count": 0.0, - "routers_loss": 0.04208769276738167, + "routers_loss": 0.03773383051156998, "skip_count": 2.0, "step": 952, "text_loss": 0.669784665107727 @@ -9061,13 +9061,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.1748046875, "learning_rate": 0.0009950932665113688, - "loss": 0.0505, + "loss": 0.0507, "macro_f1": 0.32098764181137085, "num_tokens": 1539682.0, "repeat_count": 0.0, - "routers_loss": 0.06530380249023438, + "routers_loss": 0.07280613481998444, "skip_count": 2.0, "step": 954, "text_loss": 0.3365570902824402 @@ -9080,13 +9080,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.12255859375, "learning_rate": 0.0009950499160922184, - "loss": 0.0545, + "loss": 0.0541, "macro_f1": 0.3333333432674408, "num_tokens": 1542875.0, "repeat_count": 0.0, - "routers_loss": 0.01803453080356121, + "routers_loss": 0.01770266517996788, "skip_count": 0.0, "step": 956, "text_loss": 0.0921545997262001 @@ -9099,13 +9099,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.09375, "learning_rate": 0.000995006375969006, - "loss": 0.0481, + "loss": 0.0473, "macro_f1": 0.3272727429866791, "num_tokens": 1547135.0, "repeat_count": 1.0, - "routers_loss": 0.08461762219667435, + "routers_loss": 0.07672002166509628, "skip_count": 0.0, "step": 958, "text_loss": 0.5887606739997864 @@ -9120,11 +9120,11 @@ "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0009949626461584165, - "loss": 0.0441, + "loss": 0.043, "macro_f1": 0.3333333432674408, "num_tokens": 1550100.0, "repeat_count": 0.0, - "routers_loss": 0.007111486047506332, + "routers_loss": 0.006247182376682758, "skip_count": 0.0, "step": 960, "text_loss": 0.5777931213378906 @@ -9137,13 +9137,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.11181640625, + "grad_norm": 0.119140625, "learning_rate": 0.0009949187266772076, - "loss": 0.0361, + "loss": 0.0366, "macro_f1": 0.5492662787437439, "num_tokens": 1553192.0, "repeat_count": 0.0, - "routers_loss": 0.029776185750961304, + "routers_loss": 0.030319908633828163, "skip_count": 2.0, "step": 962, "text_loss": 0.2370252162218094 @@ -9156,13 +9156,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.126953125, + "grad_norm": 0.1474609375, "learning_rate": 0.0009948746175422088, - "loss": 0.0506, + "loss": 0.0511, "macro_f1": 0.3333333432674408, "num_tokens": 1556318.0, "repeat_count": 0.0, - "routers_loss": 0.007108999416232109, + "routers_loss": 0.006004320923238993, "skip_count": 0.0, "step": 964, "text_loss": 0.6271032094955444 @@ -9175,13 +9175,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.15234375, "learning_rate": 0.000994830318770323, - "loss": 0.0498, + "loss": 0.0514, "macro_f1": 0.3333333432674408, "num_tokens": 1559195.0, "repeat_count": 0.0, - "routers_loss": 0.01126947533339262, + "routers_loss": 0.011544366367161274, "skip_count": 0.0, "step": 966, "text_loss": 0.47256720066070557 @@ -9194,13 +9194,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.162109375, + "grad_norm": 0.171875, "learning_rate": 0.0009947858303785255, - "loss": 0.0366, + "loss": 0.0374, "macro_f1": 0.6603773832321167, "num_tokens": 1561813.0, "repeat_count": 1.0, - "routers_loss": 0.05142999067902565, + "routers_loss": 0.05258861929178238, "skip_count": 1.0, "step": 968, "text_loss": 0.7703132629394531 @@ -9213,13 +9213,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.1142578125, "learning_rate": 0.0009947411523838648, - "loss": 0.0461, + "loss": 0.0453, "macro_f1": 0.3333333432674408, "num_tokens": 1564634.0, "repeat_count": 0.0, - "routers_loss": 0.010770819149911404, + "routers_loss": 0.011216280050575733, "skip_count": 0.0, "step": 970, "text_loss": 0.4666804075241089 @@ -9232,13 +9232,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.1533203125, "learning_rate": 0.0009946962848034608, - "loss": 0.0692, + "loss": 0.0696, "macro_f1": 0.3333333432674408, "num_tokens": 1567959.0, "repeat_count": 0.0, - "routers_loss": 0.008775795809924603, + "routers_loss": 0.009387624450027943, "skip_count": 0.0, "step": 972, "text_loss": 0.4067264199256897 @@ -9251,13 +9251,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.203125, "learning_rate": 0.0009946512276545075, - "loss": 0.0403, + "loss": 0.0397, "macro_f1": 0.3272727429866791, "num_tokens": 1571221.0, "repeat_count": 1.0, - "routers_loss": 0.05100395902991295, + "routers_loss": 0.041713520884513855, "skip_count": 0.0, "step": 974, "text_loss": 0.5242366194725037 @@ -9270,13 +9270,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, - "grad_norm": 0.25390625, + "grad_norm": 0.228515625, "learning_rate": 0.0009946059809542705, - "loss": 0.0503, + "loss": 0.0487, "macro_f1": 0.7644445300102234, "num_tokens": 1575033.0, "repeat_count": 2.0, - "routers_loss": 0.06653711199760437, + "routers_loss": 0.05748331546783447, "skip_count": 2.0, "step": 976, "text_loss": 0.5704690217971802 @@ -9284,18 +9284,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 4.591722923393014, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1396484375, "learning_rate": 0.0009945605447200887, - "loss": 0.0435, - "macro_f1": 0.3333333432674408, + "loss": 0.0445, + "macro_f1": 0.3272727429866791, "num_tokens": 1579050.0, "repeat_count": 0.0, - "routers_loss": 0.009865665808320045, + "routers_loss": 0.016765203326940536, "skip_count": 0.0, "step": 978, "text_loss": 0.4804173707962036 @@ -9308,13 +9308,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.130859375, + "grad_norm": 0.1337890625, "learning_rate": 0.0009945149189693732, - "loss": 0.0399, + "loss": 0.0406, "macro_f1": 0.5492662787437439, "num_tokens": 1582967.0, "repeat_count": 0.0, - "routers_loss": 0.021175632253289223, + "routers_loss": 0.021518222987651825, "skip_count": 2.0, "step": 980, "text_loss": 0.4138598144054413 @@ -9327,32 +9327,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11181640625, + "grad_norm": 0.11474609375, "learning_rate": 0.0009944691037196078, - "loss": 0.0472, + "loss": 0.0456, "macro_f1": 0.3333333432674408, "num_tokens": 1586282.0, "repeat_count": 0.0, - "routers_loss": 0.011803832836449146, + "routers_loss": 0.012246460653841496, "skip_count": 0.0, "step": 982, "text_loss": 0.22561736404895782 }, { - "acc_repeat": 0.0, + "acc_repeat": 0.5, "acc_skip": 0.800000011920929, - "avg_layers": 23.0, + "avg_layers": 24.0, "epoch": 4.6199002054593485, - "f1_execute": 0.9090908765792847, - "f1_repeat": 0.0, + "f1_execute": 0.930232584476471, + "f1_repeat": 0.6666666865348816, "f1_skip": 0.8000000715255737, - "grad_norm": 0.142578125, + "grad_norm": 0.1455078125, "learning_rate": 0.0009944230989883491, - "loss": 0.0467, - "macro_f1": 0.5696970224380493, + "loss": 0.0456, + "macro_f1": 0.7989664077758789, "num_tokens": 1589279.0, "repeat_count": 2.0, - "routers_loss": 0.08856551349163055, + "routers_loss": 0.09344895929098129, "skip_count": 5.0, "step": 984, "text_loss": 0.4416656494140625 @@ -9365,13 +9365,13 @@ "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1240234375, + "grad_norm": 0.111328125, "learning_rate": 0.0009943769047932264, - "loss": 0.0413, + "loss": 0.0404, "macro_f1": 0.5359477400779724, "num_tokens": 1592398.0, "repeat_count": 2.0, - "routers_loss": 0.08593414723873138, + "routers_loss": 0.08916857838630676, "skip_count": 2.0, "step": 986, "text_loss": 0.5536438822746277 @@ -9384,13 +9384,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.154296875, + "grad_norm": 0.15234375, "learning_rate": 0.000994330521151941, - "loss": 0.0399, + "loss": 0.039, "macro_f1": 0.32098764181137085, "num_tokens": 1596213.0, "repeat_count": 1.0, - "routers_loss": 0.07049509882926941, + "routers_loss": 0.06114347651600838, "skip_count": 1.0, "step": 988, "text_loss": 0.5835405588150024 @@ -9403,13 +9403,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.1953125, "learning_rate": 0.000994283948082267, - "loss": 0.0595, + "loss": 0.0573, "macro_f1": 0.3333333432674408, "num_tokens": 1598827.0, "repeat_count": 0.0, - "routers_loss": 0.0019258069805800915, + "routers_loss": 0.0017335431184619665, "skip_count": 0.0, "step": 990, "text_loss": 0.5857380032539368 @@ -9422,13 +9422,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.10693359375, "learning_rate": 0.0009942371856020522, - "loss": 0.0335, + "loss": 0.0341, "macro_f1": 0.3333333432674408, "num_tokens": 1602915.0, "repeat_count": 0.0, - "routers_loss": 0.014094089157879353, + "routers_loss": 0.014606470242142677, "skip_count": 0.0, "step": 992, "text_loss": 0.6939892768859863 @@ -9436,18 +9436,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 31.0, "epoch": 4.666862342236572, - "f1_execute": 0.9583333134651184, + "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.140625, "learning_rate": 0.0009941902337292155, - "loss": 0.0603, - "macro_f1": 0.6527777910232544, + "loss": 0.06, + "macro_f1": 0.6598639488220215, "num_tokens": 1605776.0, "repeat_count": 3.0, - "routers_loss": 0.06360147893428802, + "routers_loss": 0.06297315657138824, "skip_count": 1.0, "step": 994, "text_loss": 0.37616831064224243 @@ -9460,13 +9460,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.10546875, + "grad_norm": 0.1083984375, "learning_rate": 0.0009941430924817487, - "loss": 0.0573, + "loss": 0.0572, "macro_f1": 0.5492662787437439, "num_tokens": 1609856.0, "repeat_count": 0.0, - "routers_loss": 0.0326208658516407, + "routers_loss": 0.03297794610261917, "skip_count": 2.0, "step": 996, "text_loss": 0.2098303586244583 @@ -9479,13 +9479,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09912109375, + "grad_norm": 0.10107421875, "learning_rate": 0.000994095761877717, - "loss": 0.0502, + "loss": 0.0499, "macro_f1": 0.3333333432674408, "num_tokens": 1612904.0, "repeat_count": 0.0, - "routers_loss": 0.012660752050578594, + "routers_loss": 0.012901155278086662, "skip_count": 0.0, "step": 998, "text_loss": 0.20103533565998077 @@ -9498,13 +9498,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.265625, + "grad_norm": 0.259765625, "learning_rate": 0.000994048241935257, - "loss": 0.0537, + "loss": 0.0535, "macro_f1": 0.3272727429866791, "num_tokens": 1615540.0, "repeat_count": 0.0, - "routers_loss": 0.021756287664175034, + "routers_loss": 0.020434845238924026, "skip_count": 0.0, "step": 1000, "text_loss": 0.32709044218063354 @@ -9512,37 +9512,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 4.70443205165835, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1669921875, "learning_rate": 0.0009940005326725789, - "loss": 0.0447, - "macro_f1": 0.31446540355682373, + "loss": 0.0453, + "macro_f1": 0.32098764181137085, "num_tokens": 1618786.0, "repeat_count": 0.0, - "routers_loss": 0.07292548567056656, + "routers_loss": 0.07831378281116486, "skip_count": 2.0, "step": 1002, "text_loss": 0.5789632797241211 }, { - "acc_repeat": 0.5, + "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 4.713824479013795, - "f1_execute": 0.9811320900917053, - "f1_repeat": 0.6666666865348816, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1787109375, + "grad_norm": 0.21875, "learning_rate": 0.0009939526341079647, - "loss": 0.0505, - "macro_f1": 0.5492662787437439, + "loss": 0.0511, + "macro_f1": 0.32098764181137085, "num_tokens": 1621736.0, "repeat_count": 2.0, - "routers_loss": 0.03397528454661369, + "routers_loss": 0.04863874986767769, "skip_count": 0.0, "step": 1004, "text_loss": 0.6128849387168884 @@ -9555,13 +9555,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.123046875, + "grad_norm": 0.1435546875, "learning_rate": 0.0009939045462597693, - "loss": 0.0544, + "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 1624649.0, "repeat_count": 0.0, - "routers_loss": 0.005987613927572966, + "routers_loss": 0.00677989237010479, "skip_count": 0.0, "step": 1006, "text_loss": 0.6168264150619507 @@ -9574,13 +9574,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.1611328125, "learning_rate": 0.0009938562691464202, - "loss": 0.0522, + "loss": 0.0524, "macro_f1": 0.3333333432674408, "num_tokens": 1627700.0, "repeat_count": 0.0, - "routers_loss": 0.021656684577465057, + "routers_loss": 0.019490402191877365, "skip_count": 0.0, "step": 1008, "text_loss": 0.17463822662830353 @@ -9593,32 +9593,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1298828125, "learning_rate": 0.000993807802786417, - "loss": 0.0487, + "loss": 0.0475, "macro_f1": 0.3333333432674408, "num_tokens": 1630714.0, "repeat_count": 0.0, - "routers_loss": 0.0014992234064266086, + "routers_loss": 0.0019022391643375158, "skip_count": 0.0, "step": 1010, "text_loss": 0.5675593018531799 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.5, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 4.751394188435574, - "f1_execute": 0.9411764740943909, - "f1_repeat": 0.0, + "f1_execute": 0.9599999785423279, + "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, - "grad_norm": 0.158203125, + "grad_norm": 0.1640625, "learning_rate": 0.0009937591471983322, - "loss": 0.0491, - "macro_f1": 0.5359477400779724, + "loss": 0.0501, + "macro_f1": 0.7644444704055786, "num_tokens": 1633770.0, "repeat_count": 1.0, - "routers_loss": 0.03448791801929474, + "routers_loss": 0.042485643178224564, "skip_count": 2.0, "step": 1012, "text_loss": 0.42387229204177856 @@ -9631,13 +9631,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1357421875, + "grad_norm": 0.1396484375, "learning_rate": 0.0009937103024008109, - "loss": 0.0541, + "loss": 0.0545, "macro_f1": 0.3272727429866791, "num_tokens": 1637120.0, "repeat_count": 0.0, - "routers_loss": 0.08285929262638092, + "routers_loss": 0.09427817165851593, "skip_count": 1.0, "step": 1014, "text_loss": 0.49511051177978516 @@ -9650,13 +9650,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.125, + "grad_norm": 0.12890625, "learning_rate": 0.0009936612684125702, - "loss": 0.0515, + "loss": 0.0503, "macro_f1": 0.3333333432674408, "num_tokens": 1640165.0, "repeat_count": 0.0, - "routers_loss": 0.00486504752188921, + "routers_loss": 0.005106127820909023, "skip_count": 0.0, "step": 1016, "text_loss": 0.5398799180984497 @@ -9669,13 +9669,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.2734375, "learning_rate": 0.0009936120452524004, - "loss": 0.051, + "loss": 0.0506, "macro_f1": 0.3333333432674408, "num_tokens": 1643251.0, "repeat_count": 0.0, - "routers_loss": 0.017805909737944603, + "routers_loss": 0.016914300620555878, "skip_count": 0.0, "step": 1018, "text_loss": 0.20882178843021393 @@ -9688,13 +9688,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1962890625, "learning_rate": 0.0009935626329391637, - "loss": 0.0547, + "loss": 0.0537, "macro_f1": 0.32098764181137085, "num_tokens": 1646560.0, "repeat_count": 0.0, - "routers_loss": 0.12958799302577972, + "routers_loss": 0.13481520116329193, "skip_count": 2.0, "step": 1020, "text_loss": 0.5719883441925049 @@ -9707,13 +9707,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1533203125, + "grad_norm": 0.1552734375, "learning_rate": 0.0009935130314917948, - "loss": 0.0595, + "loss": 0.0602, "macro_f1": 0.5492662787437439, "num_tokens": 1649538.0, "repeat_count": 0.0, - "routers_loss": 0.07447081059217453, + "routers_loss": 0.07700438797473907, "skip_count": 2.0, "step": 1022, "text_loss": 0.1303367167711258 @@ -9726,13 +9726,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1494140625, "learning_rate": 0.0009934632409293015, - "loss": 0.0619, + "loss": 0.0611, "macro_f1": 0.32098764181137085, "num_tokens": 1652397.0, "repeat_count": 1.0, - "routers_loss": 0.12529553472995758, + "routers_loss": 0.11416907608509064, "skip_count": 1.0, "step": 1024, "text_loss": 0.24076920747756958 @@ -9745,13 +9745,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.306640625, "learning_rate": 0.0009934132612707631, - "loss": 0.0491, + "loss": 0.0507, "macro_f1": 0.31446540355682373, "num_tokens": 1654938.0, "repeat_count": 0.0, - "routers_loss": 0.08664281666278839, + "routers_loss": 0.09484589844942093, "skip_count": 2.0, "step": 1026, "text_loss": 0.1652517318725586 @@ -9764,13 +9764,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1435546875, "learning_rate": 0.0009933630925353324, - "loss": 0.0394, + "loss": 0.0395, "macro_f1": 0.3333333432674408, "num_tokens": 1658536.0, "repeat_count": 0.0, - "routers_loss": 0.0067965323105454445, + "routers_loss": 0.00741987070068717, "skip_count": 0.0, "step": 1028, "text_loss": 0.49296700954437256 @@ -9783,13 +9783,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1845703125, "learning_rate": 0.0009933127347422337, - "loss": 0.0607, + "loss": 0.0602, "macro_f1": 0.32098764181137085, "num_tokens": 1661446.0, "repeat_count": 0.0, - "routers_loss": 0.08319470286369324, + "routers_loss": 0.08399344235658646, "skip_count": 2.0, "step": 1030, "text_loss": 0.22363591194152832 @@ -9802,13 +9802,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.158203125, "learning_rate": 0.0009932621879107648, - "loss": 0.0476, + "loss": 0.0475, "macro_f1": 0.3333333432674408, "num_tokens": 1664612.0, "repeat_count": 0.0, - "routers_loss": 0.002826537238433957, + "routers_loss": 0.0031781597062945366, "skip_count": 0.0, "step": 1032, "text_loss": 0.36083245277404785 @@ -9823,11 +9823,11 @@ "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.000993211452060295, - "loss": 0.0431, + "loss": 0.042, "macro_f1": 0.3272727429866791, "num_tokens": 1667467.0, "repeat_count": 0.0, - "routers_loss": 0.03491095453500748, + "routers_loss": 0.03595469892024994, "skip_count": 1.0, "step": 1034, "text_loss": 0.16372856497764587 @@ -9840,13 +9840,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.173828125, + "grad_norm": 0.189453125, "learning_rate": 0.000993160527210266, - "loss": 0.0616, + "loss": 0.061, "macro_f1": 0.3144654333591461, "num_tokens": 1670675.0, "repeat_count": 3.0, - "routers_loss": 0.1828247457742691, + "routers_loss": 0.1597205102443695, "skip_count": 0.0, "step": 1036, "text_loss": 0.6049913763999939 @@ -9859,13 +9859,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2099609375, + "grad_norm": 0.2197265625, "learning_rate": 0.000993109413380193, - "loss": 0.0563, + "loss": 0.0562, "macro_f1": 0.3333333432674408, "num_tokens": 1673477.0, "repeat_count": 0.0, - "routers_loss": 0.010931054130196571, + "routers_loss": 0.009756010957062244, "skip_count": 0.0, "step": 1038, "text_loss": 0.7034620642662048 @@ -9878,13 +9878,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.158203125, + "grad_norm": 0.1806640625, "learning_rate": 0.0009930581105896624, - "loss": 0.0569, + "loss": 0.0559, "macro_f1": 0.3272727429866791, "num_tokens": 1676809.0, "repeat_count": 0.0, - "routers_loss": 0.023222090676426888, + "routers_loss": 0.020718922838568687, "skip_count": 0.0, "step": 1040, "text_loss": 0.2814720571041107 @@ -9897,13 +9897,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.1923828125, "learning_rate": 0.0009930066188583338, - "loss": 0.0453, + "loss": 0.0445, "macro_f1": 0.32098764181137085, "num_tokens": 1679398.0, "repeat_count": 1.0, - "routers_loss": 0.07085686922073364, + "routers_loss": 0.04755603149533272, "skip_count": 1.0, "step": 1042, "text_loss": 0.5445759296417236 @@ -9916,13 +9916,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12353515625, + "grad_norm": 0.126953125, "learning_rate": 0.0009929549382059388, - "loss": 0.0515, + "loss": 0.0509, "macro_f1": 0.3333333432674408, "num_tokens": 1682269.0, "repeat_count": 0.0, - "routers_loss": 0.010158216580748558, + "routers_loss": 0.01040949858725071, "skip_count": 0.0, "step": 1044, "text_loss": 0.2876914143562317 @@ -9935,13 +9935,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.1259765625, "learning_rate": 0.0009929030686522816, - "loss": 0.0372, + "loss": 0.0363, "macro_f1": 0.3333333432674408, "num_tokens": 1685428.0, "repeat_count": 0.0, - "routers_loss": 0.007876895368099213, + "routers_loss": 0.008158888667821884, "skip_count": 0.0, "step": 1046, "text_loss": 0.49053525924682617 @@ -9954,13 +9954,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1630859375, "learning_rate": 0.0009928510102172386, - "loss": 0.0501, + "loss": 0.0498, "macro_f1": 0.3333333432674408, "num_tokens": 1688252.0, "repeat_count": 0.0, - "routers_loss": 0.004859173204749823, + "routers_loss": 0.005102572031319141, "skip_count": 0.0, "step": 1048, "text_loss": 0.5274341106414795 @@ -9973,13 +9973,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.17578125, + "grad_norm": 0.1591796875, "learning_rate": 0.0009927987629207587, - "loss": 0.0582, + "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1691289.0, "repeat_count": 0.0, - "routers_loss": 0.01798083633184433, + "routers_loss": 0.016768503934144974, "skip_count": 0.0, "step": 1050, "text_loss": 0.9935035109519958 @@ -9987,18 +9987,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 4.939242735544467, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1455078125, "learning_rate": 0.0009927463267828634, "loss": 0.0488, - "macro_f1": 0.3272727429866791, + "macro_f1": 0.3333333432674408, "num_tokens": 1694148.0, "repeat_count": 0.0, - "routers_loss": 0.014295363798737526, + "routers_loss": 0.010905829258263111, "skip_count": 0.0, "step": 1052, "text_loss": 0.20895758271217346 @@ -10011,13 +10011,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.1455078125, "learning_rate": 0.000992693701823646, - "loss": 0.0635, + "loss": 0.0624, "macro_f1": 0.3272727429866791, "num_tokens": 1698543.0, "repeat_count": 1.0, - "routers_loss": 0.1038367822766304, + "routers_loss": 0.10533971339464188, "skip_count": 0.0, "step": 1054, "text_loss": 0.5776236653327942 @@ -10030,13 +10030,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.255859375, "learning_rate": 0.0009926408880632726, - "loss": 0.057, + "loss": 0.0556, "macro_f1": 0.3272727429866791, "num_tokens": 1702460.0, "repeat_count": 0.0, - "routers_loss": 0.029780643060803413, + "routers_loss": 0.026313411071896553, "skip_count": 1.0, "step": 1056, "text_loss": 0.34990596771240234 @@ -10049,13 +10049,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10107421875, + "grad_norm": 0.099609375, "learning_rate": 0.0009925878855219818, - "loss": 0.0398, + "loss": 0.0391, "macro_f1": 0.3333333432674408, "num_tokens": 1705686.0, "repeat_count": 0.0, - "routers_loss": 0.008537676185369492, + "routers_loss": 0.007763393223285675, "skip_count": 0.0, "step": 1058, "text_loss": 0.4980163276195526 @@ -10068,13 +10068,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.171875, + "grad_norm": 0.177734375, "learning_rate": 0.000992534694220084, - "loss": 0.0617, + "loss": 0.0613, "macro_f1": 0.3272727429866791, "num_tokens": 1708739.0, "repeat_count": 0.0, - "routers_loss": 0.03966755419969559, + "routers_loss": 0.03998444974422455, "skip_count": 1.0, "step": 1060, "text_loss": 0.29092350602149963 @@ -10087,13 +10087,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1484375, + "grad_norm": 0.1572265625, "learning_rate": 0.000992481314177962, - "loss": 0.0311, + "loss": 0.0312, "macro_f1": 0.32098764181137085, "num_tokens": 1711903.0, "repeat_count": 1.0, - "routers_loss": 0.06651833653450012, + "routers_loss": 0.06966045498847961, "skip_count": 1.0, "step": 1062, "text_loss": 0.6267179250717163 @@ -10106,13 +10106,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2431640625, + "grad_norm": 0.244140625, "learning_rate": 0.0009924277454160717, - "loss": 0.0557, + "loss": 0.0548, "macro_f1": 0.3272727429866791, "num_tokens": 1715974.0, "repeat_count": 0.0, - "routers_loss": 0.05130369961261749, + "routers_loss": 0.05536063387989998, "skip_count": 1.0, "step": 1064, "text_loss": 0.5813798904418945 @@ -10125,13 +10125,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.134765625, "learning_rate": 0.0009923739879549402, - "loss": 0.0435, + "loss": 0.0423, "macro_f1": 0.3333333432674408, "num_tokens": 1718828.0, "repeat_count": 0.0, - "routers_loss": 0.020534176379442215, + "routers_loss": 0.020993782207369804, "skip_count": 0.0, "step": 1066, "text_loss": 0.22665327787399292 @@ -10144,13 +10144,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.0888671875, "learning_rate": 0.0009923200418151677, - "loss": 0.0305, + "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 1722419.0, "repeat_count": 0.0, - "routers_loss": 0.007514918688684702, + "routers_loss": 0.007351701147854328, "skip_count": 0.0, "step": 1068, "text_loss": 0.5796169638633728 @@ -10163,13 +10163,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.142578125, "learning_rate": 0.0009922659070174264, - "loss": 0.0461, + "loss": 0.0452, "macro_f1": 0.3272727429866791, "num_tokens": 1725663.0, "repeat_count": 1.0, - "routers_loss": 0.024598751217126846, + "routers_loss": 0.026033315807580948, "skip_count": 0.0, "step": 1070, "text_loss": 0.25742828845977783 @@ -10182,32 +10182,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.10595703125, "learning_rate": 0.0009922115835824612, - "loss": 0.0408, + "loss": 0.041, "macro_f1": 0.3333333432674408, "num_tokens": 1729239.0, "repeat_count": 0.0, - "routers_loss": 0.011866633780300617, + "routers_loss": 0.0118600158020854, "skip_count": 0.0, "step": 1072, "text_loss": 0.21630282700061798 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 5.042265923099501, - "f1_execute": 0.9818181991577148, - "f1_repeat": 0.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.12158203125, "learning_rate": 0.0009921570715310884, - "loss": 0.036, - "macro_f1": 0.3272727429866791, + "loss": 0.0364, + "macro_f1": 0.6666666865348816, "num_tokens": 1732507.0, "repeat_count": 1.0, - "routers_loss": 0.01755746826529503, + "routers_loss": 0.016118815168738365, "skip_count": 0.0, "step": 1074, "text_loss": 0.5639925003051758 @@ -10220,13 +10220,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.0791015625, "learning_rate": 0.0009921023708841974, - "loss": 0.0415, + "loss": 0.0407, "macro_f1": 0.3333333432674408, "num_tokens": 1736182.0, "repeat_count": 0.0, - "routers_loss": 0.003976983483880758, + "routers_loss": 0.004275390412658453, "skip_count": 0.0, "step": 1076, "text_loss": 0.5758615136146545 @@ -10239,13 +10239,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.103515625, + "grad_norm": 0.1103515625, "learning_rate": 0.0009920474816627496, - "loss": 0.0378, + "loss": 0.037, "macro_f1": 0.3333333432674408, "num_tokens": 1739559.0, "repeat_count": 0.0, - "routers_loss": 0.013548235408961773, + "routers_loss": 0.01299292128533125, "skip_count": 0.0, "step": 1078, "text_loss": 0.18221625685691833 @@ -10258,13 +10258,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1630859375, "learning_rate": 0.0009919924038877788, "loss": 0.0343, "macro_f1": 0.32098764181137085, "num_tokens": 1742890.0, "repeat_count": 0.0, - "routers_loss": 0.03923165053129196, + "routers_loss": 0.038295745849609375, "skip_count": 2.0, "step": 1080, "text_loss": 0.17354349792003632 @@ -10277,13 +10277,13 @@ "f1_execute": 0.9583333134651184, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.1923828125, + "grad_norm": 0.1884765625, "learning_rate": 0.0009919371375803905, - "loss": 0.0464, + "loss": 0.0455, "macro_f1": 0.8194444179534912, "num_tokens": 1746433.0, "repeat_count": 2.0, - "routers_loss": 0.046429626643657684, + "routers_loss": 0.04052971675992012, "skip_count": 3.0, "step": 1082, "text_loss": 0.2250112146139145 @@ -10296,13 +10296,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1025390625, + "grad_norm": 0.10595703125, "learning_rate": 0.0009918816827617632, - "loss": 0.0346, + "loss": 0.0353, "macro_f1": 0.3333333432674408, "num_tokens": 1750802.0, "repeat_count": 0.0, - "routers_loss": 0.008998732082545757, + "routers_loss": 0.009114136919379234, "skip_count": 0.0, "step": 1084, "text_loss": 0.2526719272136688 @@ -10315,13 +10315,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1279296875, "learning_rate": 0.000991826039453147, - "loss": 0.0386, + "loss": 0.0392, "macro_f1": 0.3333333432674408, "num_tokens": 1754272.0, "repeat_count": 0.0, - "routers_loss": 0.005173585377633572, + "routers_loss": 0.004904678091406822, "skip_count": 0.0, "step": 1086, "text_loss": 0.7308789491653442 @@ -10334,13 +10334,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.138671875, "learning_rate": 0.000991770207675865, - "loss": 0.0308, + "loss": 0.0327, "macro_f1": 0.6666666865348816, "num_tokens": 1757231.0, "repeat_count": 0.0, - "routers_loss": 0.024098891764879227, + "routers_loss": 0.02129189297556877, "skip_count": 2.0, "step": 1088, "text_loss": 0.21764220297336578 @@ -10353,13 +10353,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1611328125, "learning_rate": 0.0009917141874513113, "loss": 0.0315, "macro_f1": 0.3333333432674408, "num_tokens": 1760003.0, "repeat_count": 0.0, - "routers_loss": 0.014002764597535133, + "routers_loss": 0.01310618408024311, "skip_count": 0.0, "step": 1090, "text_loss": 0.33892181515693665 @@ -10372,32 +10372,32 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.14453125, + "grad_norm": 0.171875, "learning_rate": 0.0009916579788009537, - "loss": 0.0462, + "loss": 0.0457, "macro_f1": 0.5492662787437439, "num_tokens": 1763052.0, "repeat_count": 0.0, - "routers_loss": 0.017871137708425522, + "routers_loss": 0.02059309557080269, "skip_count": 2.0, "step": 1092, "text_loss": 0.6551769375801086 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 5.136190196653947, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1044921875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10546875, "learning_rate": 0.0009916015817463312, "loss": 0.0385, - "macro_f1": 0.32098764181137085, + "macro_f1": 0.5492662787437439, "num_tokens": 1766655.0, "repeat_count": 0.0, - "routers_loss": 0.033123619854450226, + "routers_loss": 0.0274797435849905, "skip_count": 2.0, "step": 1094, "text_loss": 0.3984372019767761 @@ -10410,13 +10410,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.11181640625, "learning_rate": 0.000991544996309055, - "loss": 0.0267, + "loss": 0.0271, "macro_f1": 0.3333333432674408, "num_tokens": 1769997.0, "repeat_count": 0.0, - "routers_loss": 0.01279227901250124, + "routers_loss": 0.01437368243932724, "skip_count": 0.0, "step": 1096, "text_loss": 0.4203338921070099 @@ -10429,13 +10429,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1103515625, "learning_rate": 0.000991488222510809, - "loss": 0.0295, + "loss": 0.0292, "macro_f1": 0.3333333432674408, "num_tokens": 1773130.0, "repeat_count": 0.0, - "routers_loss": 0.001354650012217462, + "routers_loss": 0.001382062560878694, "skip_count": 0.0, "step": 1098, "text_loss": 0.43132516741752625 @@ -10448,13 +10448,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.123046875, "learning_rate": 0.000991431260373349, - "loss": 0.0326, + "loss": 0.0329, "macro_f1": 0.3144654333591461, "num_tokens": 1775682.0, "repeat_count": 1.0, - "routers_loss": 0.1097714751958847, + "routers_loss": 0.1115434318780899, "skip_count": 2.0, "step": 1100, "text_loss": 0.3218227028846741 @@ -10467,13 +10467,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.111328125, "learning_rate": 0.000991374109918503, - "loss": 0.0187, + "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 1778407.0, "repeat_count": 0.0, - "routers_loss": 0.009649592451751232, + "routers_loss": 0.009529678151011467, "skip_count": 0.0, "step": 1102, "text_loss": 0.17183731496334076 @@ -10486,13 +10486,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.11083984375, + "grad_norm": 0.1142578125, "learning_rate": 0.000991316771168171, - "loss": 0.0447, + "loss": 0.044, "macro_f1": 0.5492662787437439, "num_tokens": 1781518.0, "repeat_count": 0.0, - "routers_loss": 0.020858706906437874, + "routers_loss": 0.018668074160814285, "skip_count": 2.0, "step": 1104, "text_loss": 1.1324785947799683 @@ -10505,13 +10505,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.125, "learning_rate": 0.0009912592441443258, - "loss": 0.0428, + "loss": 0.0411, "macro_f1": 0.3272727429866791, "num_tokens": 1784878.0, "repeat_count": 0.0, - "routers_loss": 0.048101235181093216, + "routers_loss": 0.04145100712776184, "skip_count": 1.0, "step": 1106, "text_loss": 0.6082063317298889 @@ -10524,13 +10524,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.08984375, "learning_rate": 0.0009912015288690112, - "loss": 0.0435, + "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1788978.0, "repeat_count": 0.0, - "routers_loss": 0.02875671721994877, + "routers_loss": 0.021450644358992577, "skip_count": 1.0, "step": 1108, "text_loss": 0.5597621202468872 @@ -10543,13 +10543,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08349609375, + "grad_norm": 0.083984375, "learning_rate": 0.0009911436253643444, - "loss": 0.0247, + "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 1792321.0, "repeat_count": 0.0, - "routers_loss": 0.019005145877599716, + "routers_loss": 0.017405325546860695, "skip_count": 0.0, "step": 1110, "text_loss": 0.2560598850250244 @@ -10562,13 +10562,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.2294921875, "learning_rate": 0.0009910855336525137, - "loss": 0.0393, + "loss": 0.0383, "macro_f1": 0.3333333432674408, "num_tokens": 1795182.0, "repeat_count": 0.0, - "routers_loss": 0.007238700054585934, + "routers_loss": 0.007162237539887428, "skip_count": 0.0, "step": 1112, "text_loss": 0.3438240587711334 @@ -10581,13 +10581,13 @@ "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.125, + "grad_norm": 0.115234375, "learning_rate": 0.00099102725375578, "loss": 0.0326, "macro_f1": 0.480392187833786, "num_tokens": 1798987.0, "repeat_count": 1.0, - "routers_loss": 0.12206140905618668, + "routers_loss": 0.11149197816848755, "skip_count": 3.0, "step": 1114, "text_loss": 0.20455503463745117 @@ -10595,18 +10595,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 5.239506897563839, - "f1_execute": 0.8799999952316284, + "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.10791015625, "learning_rate": 0.0009909687856964767, - "loss": 0.0366, - "macro_f1": 0.29333335161209106, + "loss": 0.035, + "macro_f1": 0.3006536364555359, "num_tokens": 1802064.0, "repeat_count": 2.0, - "routers_loss": 0.15721899271011353, + "routers_loss": 0.12679415941238403, "skip_count": 3.0, "step": 1116, "text_loss": 0.11996729671955109 @@ -10619,32 +10619,32 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.125, + "grad_norm": 0.12451171875, "learning_rate": 0.0009909101294970082, - "loss": 0.0366, + "loss": 0.0365, "macro_f1": 0.5492662787437439, "num_tokens": 1805412.0, "repeat_count": 0.0, - "routers_loss": 0.05058665946125984, + "routers_loss": 0.05108053982257843, "skip_count": 2.0, "step": 1118, "text_loss": 0.13224145770072937 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 5.258291752274729, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "f1_skip": 1.0, + "grad_norm": 0.123046875, "learning_rate": 0.0009908512851798522, - "loss": 0.0454, - "macro_f1": 0.32098764181137085, + "loss": 0.0455, + "macro_f1": 0.6603773832321167, "num_tokens": 1808196.0, "repeat_count": 1.0, - "routers_loss": 0.023021472617983818, + "routers_loss": 0.02131766639649868, "skip_count": 1.0, "step": 1120, "text_loss": 0.7824069261550903 @@ -10657,13 +10657,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1435546875, + "grad_norm": 0.138671875, "learning_rate": 0.0009907922527675576, - "loss": 0.0409, + "loss": 0.0405, "macro_f1": 0.3333333432674408, "num_tokens": 1811622.0, "repeat_count": 0.0, - "routers_loss": 0.006660689599812031, + "routers_loss": 0.006226244382560253, "skip_count": 0.0, "step": 1122, "text_loss": 0.5419743061065674 @@ -10676,13 +10676,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.146484375, + "grad_norm": 0.12890625, "learning_rate": 0.000990733032282746, - "loss": 0.0547, + "loss": 0.0535, "macro_f1": 0.5492662787437439, "num_tokens": 1814628.0, "repeat_count": 0.0, - "routers_loss": 0.031727343797683716, + "routers_loss": 0.03088250942528248, "skip_count": 2.0, "step": 1124, "text_loss": 0.37100958824157715 @@ -10695,13 +10695,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.0810546875, "learning_rate": 0.000990673623748111, - "loss": 0.0351, + "loss": 0.0348, "macro_f1": 0.32098767161369324, "num_tokens": 1817205.0, "repeat_count": 0.0, - "routers_loss": 0.06140992045402527, + "routers_loss": 0.05495348572731018, "skip_count": 1.0, "step": 1126, "text_loss": 0.20241330564022064 @@ -10709,18 +10709,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.5, - "avg_layers": 25.0, + "avg_layers": 26.0, "epoch": 5.295861461696507, - "f1_execute": 0.9411764740943909, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.4000000059604645, - "grad_norm": 0.09814453125, + "f1_skip": 0.5, + "grad_norm": 0.0927734375, "learning_rate": 0.0009906140271864173, - "loss": 0.0436, - "macro_f1": 0.44705885648727417, + "loss": 0.0433, + "macro_f1": 0.4871794879436493, "num_tokens": 1820141.0, "repeat_count": 0.0, - "routers_loss": 0.03872275352478027, + "routers_loss": 0.037809282541275024, "skip_count": 2.0, "step": 1128, "text_loss": 0.32965806126594543 @@ -10728,18 +10728,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 5.305253889051952, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09228515625, + "grad_norm": 0.0908203125, "learning_rate": 0.0009905542426205032, - "loss": 0.0353, - "macro_f1": 0.3272727429866791, + "loss": 0.0348, + "macro_f1": 0.32098767161369324, "num_tokens": 1824011.0, "repeat_count": 0.0, - "routers_loss": 0.031013142317533493, + "routers_loss": 0.03320181369781494, "skip_count": 1.0, "step": 1130, "text_loss": 0.36329755187034607 @@ -10752,13 +10752,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1123046875, + "grad_norm": 0.10595703125, "learning_rate": 0.0009904942700732777, - "loss": 0.0333, + "loss": 0.0335, "macro_f1": 0.3333333432674408, "num_tokens": 1826873.0, "repeat_count": 0.0, - "routers_loss": 0.004357635974884033, + "routers_loss": 0.004102326463907957, "skip_count": 0.0, "step": 1132, "text_loss": 0.6692602038383484 @@ -10771,13 +10771,13 @@ "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11279296875, + "grad_norm": 0.08544921875, "learning_rate": 0.0009904341095677226, "loss": 0.03, "macro_f1": 0.29333335161209106, "num_tokens": 1830103.0, "repeat_count": 2.0, - "routers_loss": 0.2376353144645691, + "routers_loss": 0.2376193106174469, "skip_count": 4.0, "step": 1134, "text_loss": 0.19212862849235535 @@ -10790,13 +10790,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10888671875, + "grad_norm": 0.119140625, "learning_rate": 0.0009903737611268919, - "loss": 0.0446, + "loss": 0.0445, "macro_f1": 0.3333333432674408, "num_tokens": 1833201.0, "repeat_count": 0.0, - "routers_loss": 0.004978097043931484, + "routers_loss": 0.005253395065665245, "skip_count": 0.0, "step": 1136, "text_loss": 0.6773360371589661 @@ -10809,13 +10809,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10546875, + "grad_norm": 0.09814453125, "learning_rate": 0.0009903132247739107, - "loss": 0.0309, + "loss": 0.0305, "macro_f1": 0.3076923191547394, "num_tokens": 1836045.0, "repeat_count": 1.0, - "routers_loss": 0.14195409417152405, + "routers_loss": 0.14382585883140564, "skip_count": 3.0, "step": 1138, "text_loss": 0.2882297933101654 @@ -10828,13 +10828,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.15234375, + "grad_norm": 0.150390625, "learning_rate": 0.0009902525005319766, - "loss": 0.0403, + "loss": 0.04, "macro_f1": 0.5427350401878357, "num_tokens": 1839721.0, "repeat_count": 1.0, - "routers_loss": 0.04005253314971924, + "routers_loss": 0.04033960774540901, "skip_count": 2.0, "step": 1140, "text_loss": 0.7172559499740601 @@ -10847,13 +10847,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.12109375, "learning_rate": 0.0009901915884243597, - "loss": 0.0353, + "loss": 0.0351, "macro_f1": 0.6666666865348816, "num_tokens": 1842614.0, "repeat_count": 1.0, - "routers_loss": 0.006839688867330551, + "routers_loss": 0.005162308923900127, "skip_count": 0.0, "step": 1142, "text_loss": 0.42892804741859436 @@ -10866,13 +10866,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1240234375, "learning_rate": 0.0009901304884744014, - "loss": 0.0396, + "loss": 0.0386, "macro_f1": 0.3144654333591461, "num_tokens": 1845444.0, "repeat_count": 1.0, - "routers_loss": 0.10174567997455597, + "routers_loss": 0.10117656737565994, "skip_count": 2.0, "step": 1144, "text_loss": 0.20806430280208588 @@ -10885,13 +10885,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.130859375, "learning_rate": 0.0009900692007055152, - "loss": 0.0365, + "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 1848558.0, "repeat_count": 0.0, - "routers_loss": 0.014655748382210732, + "routers_loss": 0.014107038266956806, "skip_count": 0.0, "step": 1146, "text_loss": 0.5355974435806274 @@ -10904,13 +10904,13 @@ "f1_execute": 0.9166666865348816, "f1_repeat": 0.4000000059604645, "f1_skip": 0.6666666865348816, - "grad_norm": 0.158203125, + "grad_norm": 0.16015625, "learning_rate": 0.000990007725141187, - "loss": 0.0467, + "loss": 0.0449, "macro_f1": 0.6611111164093018, "num_tokens": 1852723.0, "repeat_count": 4.0, - "routers_loss": 0.16960746049880981, + "routers_loss": 0.15537866950035095, "skip_count": 2.0, "step": 1148, "text_loss": 0.6388513445854187 @@ -10923,32 +10923,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1220703125, + "grad_norm": 0.1181640625, "learning_rate": 0.0009899460618049741, - "loss": 0.0399, + "loss": 0.0397, "macro_f1": 0.3333333432674408, "num_tokens": 1856181.0, "repeat_count": 0.0, - "routers_loss": 0.011591178365051746, + "routers_loss": 0.011800912208855152, "skip_count": 0.0, "step": 1150, "text_loss": 0.6113069653511047 }, { - "acc_repeat": 0.5, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 30.0, "epoch": 5.408570589961843, - "f1_execute": 0.9811320900917053, - "f1_repeat": 0.6666666865348816, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09912109375, + "grad_norm": 0.1005859375, "learning_rate": 0.000989884210720506, - "loss": 0.0332, - "macro_f1": 0.5492662787437439, + "loss": 0.0331, + "macro_f1": 0.6666666865348816, "num_tokens": 1859685.0, "repeat_count": 2.0, - "routers_loss": 0.04036068916320801, + "routers_loss": 0.022900646552443504, "skip_count": 0.0, "step": 1152, "text_loss": 0.25718021392822266 @@ -10961,13 +10961,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12353515625, + "grad_norm": 0.10595703125, "learning_rate": 0.0009898221719114844, - "loss": 0.0366, + "loss": 0.0354, "macro_f1": 0.3272727429866791, "num_tokens": 1862505.0, "repeat_count": 0.0, - "routers_loss": 0.030165785923600197, + "routers_loss": 0.026814989745616913, "skip_count": 1.0, "step": 1154, "text_loss": 0.5426549911499023 @@ -10980,13 +10980,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0966796875, + "grad_norm": 0.1015625, "learning_rate": 0.0009897599454016823, - "loss": 0.0421, + "loss": 0.0401, "macro_f1": 0.3333333432674408, "num_tokens": 1866266.0, "repeat_count": 0.0, - "routers_loss": 0.003615695284679532, + "routers_loss": 0.0032623792067170143, "skip_count": 0.0, "step": 1156, "text_loss": 0.37752896547317505 @@ -10999,13 +10999,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.07080078125, "learning_rate": 0.0009896975312149454, - "loss": 0.0377, + "loss": 0.0369, "macro_f1": 0.3333333432674408, "num_tokens": 1870216.0, "repeat_count": 0.0, - "routers_loss": 0.01679840311408043, + "routers_loss": 0.015617577359080315, "skip_count": 0.0, "step": 1158, "text_loss": 0.18207129836082458 @@ -11018,13 +11018,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.11669921875, "learning_rate": 0.0009896349293751906, - "loss": 0.0422, + "loss": 0.0423, "macro_f1": 0.3272727429866791, "num_tokens": 1873338.0, "repeat_count": 0.0, - "routers_loss": 0.024936161935329437, + "routers_loss": 0.02250153198838234, "skip_count": 1.0, "step": 1160, "text_loss": 0.548884391784668 @@ -11037,13 +11037,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.1484375, "learning_rate": 0.0009895721399064072, - "loss": 0.0407, + "loss": 0.0388, "macro_f1": 0.32098764181137085, "num_tokens": 1876470.0, "repeat_count": 1.0, - "routers_loss": 0.06472968310117722, + "routers_loss": 0.055204521864652634, "skip_count": 1.0, "step": 1162, "text_loss": 0.48052409291267395 @@ -11056,13 +11056,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.07373046875, "learning_rate": 0.0009895091628326564, - "loss": 0.031, + "loss": 0.0293, "macro_f1": 0.3333333432674408, "num_tokens": 1879354.0, "repeat_count": 0.0, - "routers_loss": 0.009633494541049004, + "routers_loss": 0.009093789383769035, "skip_count": 0.0, "step": 1164, "text_loss": 0.3908069431781769 @@ -11075,13 +11075,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.126953125, + "grad_norm": 0.140625, "learning_rate": 0.000989445998178071, "loss": 0.0323, "macro_f1": 0.3272727429866791, "num_tokens": 1881941.0, "repeat_count": 0.0, - "routers_loss": 0.01458993274718523, + "routers_loss": 0.015086972154676914, "skip_count": 1.0, "step": 1166, "text_loss": 0.4884725511074066 @@ -11094,13 +11094,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.134765625, "learning_rate": 0.0009893826459668558, - "loss": 0.0389, + "loss": 0.0386, "macro_f1": 0.3144654333591461, "num_tokens": 1885374.0, "repeat_count": 0.0, - "routers_loss": 0.06636982411146164, + "routers_loss": 0.06587666273117065, "skip_count": 3.0, "step": 1168, "text_loss": 0.12760137021541595 @@ -11113,13 +11113,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1591796875, "learning_rate": 0.0009893191062232873, - "loss": 0.0325, + "loss": 0.0322, "macro_f1": 0.3333333432674408, "num_tokens": 1888612.0, "repeat_count": 0.0, - "routers_loss": 0.005644182674586773, + "routers_loss": 0.006088624242693186, "skip_count": 0.0, "step": 1170, "text_loss": 0.4821319580078125 @@ -11132,13 +11132,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.1279296875, "learning_rate": 0.0009892553789717143, - "loss": 0.0402, + "loss": 0.0389, "macro_f1": 0.3333333432674408, "num_tokens": 1891463.0, "repeat_count": 0.0, - "routers_loss": 0.010273848660290241, + "routers_loss": 0.010113578289747238, "skip_count": 0.0, "step": 1172, "text_loss": 0.3613642454147339 @@ -11151,13 +11151,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.099609375, + "grad_norm": 0.1025390625, "learning_rate": 0.0009891914642365573, - "loss": 0.0415, + "loss": 0.0404, "macro_f1": 0.3333333432674408, "num_tokens": 1894230.0, "repeat_count": 0.0, - "routers_loss": 0.004529652185738087, + "routers_loss": 0.004947459790855646, "skip_count": 0.0, "step": 1174, "text_loss": 0.5037549138069153 @@ -11170,13 +11170,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2236328125, + "grad_norm": 0.1572265625, "learning_rate": 0.0009891273620423083, - "loss": 0.045, + "loss": 0.0428, "macro_f1": 0.3272727429866791, "num_tokens": 1897294.0, "repeat_count": 1.0, - "routers_loss": 0.024671228602528572, + "routers_loss": 0.026075217872858047, "skip_count": 0.0, "step": 1176, "text_loss": 0.32558977603912354 @@ -11189,13 +11189,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.12158203125, "learning_rate": 0.0009890630724135314, - "loss": 0.0354, + "loss": 0.0351, "macro_f1": 0.3272727429866791, "num_tokens": 1901553.0, "repeat_count": 0.0, - "routers_loss": 0.06466450542211533, + "routers_loss": 0.06650999188423157, "skip_count": 1.0, "step": 1178, "text_loss": 0.23473620414733887 @@ -11208,13 +11208,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1767578125, + "grad_norm": 0.1474609375, "learning_rate": 0.0009889985953748625, - "loss": 0.0278, + "loss": 0.0268, "macro_f1": 0.6666666865348816, "num_tokens": 1904556.0, "repeat_count": 0.0, - "routers_loss": 0.010566026903688908, + "routers_loss": 0.010361116379499435, "skip_count": 1.0, "step": 1180, "text_loss": 0.6927042007446289 @@ -11227,13 +11227,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1103515625, + "grad_norm": 0.103515625, "learning_rate": 0.0009889339309510094, - "loss": 0.037, + "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 1908053.0, "repeat_count": 0.0, - "routers_loss": 0.013842248357832432, + "routers_loss": 0.013286533765494823, "skip_count": 0.0, "step": 1182, "text_loss": 0.19977325201034546 @@ -11246,13 +11246,13 @@ "f1_execute": 0.9387754797935486, "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, - "grad_norm": 0.07373046875, + "grad_norm": 0.058837890625, "learning_rate": 0.0009888690791667518, - "loss": 0.0215, + "loss": 0.0204, "macro_f1": 0.7018141150474548, "num_tokens": 1911754.0, "repeat_count": 2.0, - "routers_loss": 0.122759610414505, + "routers_loss": 0.11920545995235443, "skip_count": 3.0, "step": 1184, "text_loss": 0.4072858691215515 @@ -11265,32 +11265,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.11083984375, "learning_rate": 0.0009888040400469408, - "loss": 0.0402, + "loss": 0.0391, "macro_f1": 0.3272727429866791, "num_tokens": 1914862.0, "repeat_count": 0.0, - "routers_loss": 0.035315629094839096, + "routers_loss": 0.03652849420905113, "skip_count": 1.0, "step": 1186, "text_loss": 0.2654043138027191 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 5.577634282359847, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1689453125, "learning_rate": 0.0009887388136164996, - "loss": 0.034, - "macro_f1": 0.32098764181137085, + "loss": 0.0336, + "macro_f1": 0.5492662787437439, "num_tokens": 1918542.0, "repeat_count": 0.0, - "routers_loss": 0.040048226714134216, + "routers_loss": 0.03991910070180893, "skip_count": 2.0, "step": 1188, "text_loss": 0.21130657196044922 @@ -11298,18 +11298,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 5.587026709715292, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1005859375, + "grad_norm": 0.09521484375, "learning_rate": 0.000988673399900423, - "loss": 0.044, - "macro_f1": 0.3333333432674408, + "loss": 0.0429, + "macro_f1": 0.3272727429866791, "num_tokens": 1921589.0, "repeat_count": 0.0, - "routers_loss": 0.012814820744097233, + "routers_loss": 0.014900135807693005, "skip_count": 0.0, "step": 1190, "text_loss": 0.5519335865974426 @@ -11322,13 +11322,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.1884765625, "learning_rate": 0.0009886077989237777, - "loss": 0.0407, + "loss": 0.0405, "macro_f1": 0.3272727429866791, "num_tokens": 1924320.0, "repeat_count": 0.0, - "routers_loss": 0.05977959558367729, + "routers_loss": 0.06271552294492722, "skip_count": 1.0, "step": 1192, "text_loss": 0.213813915848732 @@ -11341,13 +11341,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, - "grad_norm": 0.1533203125, + "grad_norm": 0.1875, "learning_rate": 0.000988542010711702, - "loss": 0.0334, + "loss": 0.0342, "macro_f1": 0.6225374937057495, "num_tokens": 1927178.0, "repeat_count": 0.0, - "routers_loss": 0.031448643654584885, + "routers_loss": 0.03081391751766205, "skip_count": 5.0, "step": 1194, "text_loss": 0.7524349093437195 @@ -11360,13 +11360,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.265625, + "grad_norm": 0.255859375, "learning_rate": 0.0009884760352894064, - "loss": 0.0523, + "loss": 0.0518, "macro_f1": 0.3333333432674408, "num_tokens": 1930216.0, "repeat_count": 0.0, - "routers_loss": 0.008164947852492332, + "routers_loss": 0.008556773886084557, "skip_count": 0.0, "step": 1196, "text_loss": 0.28230375051498413 @@ -11379,32 +11379,32 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.5, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.1064453125, "learning_rate": 0.0009884098726821726, - "loss": 0.0478, + "loss": 0.0472, "macro_f1": 0.4871794879436493, "num_tokens": 1933312.0, "repeat_count": 3.0, - "routers_loss": 0.04045635461807251, + "routers_loss": 0.05344727262854576, "skip_count": 0.0, "step": 1198, "text_loss": 0.5509607195854187 }, { "acc_repeat": 0.0, - "acc_skip": 0.6666666865348816, - "avg_layers": 26.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, "epoch": 5.633988846492516, - "f1_execute": 0.9600000381469727, + "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, - "f1_skip": 0.800000011920929, - "grad_norm": 0.1240234375, + "f1_skip": 0.5, + "grad_norm": 0.1298828125, "learning_rate": 0.000988343522915354, - "loss": 0.0447, - "macro_f1": 0.5866667032241821, + "loss": 0.0441, + "macro_f1": 0.480392187833786, "num_tokens": 1936160.0, "repeat_count": 1.0, - "routers_loss": 0.06872973591089249, + "routers_loss": 0.07324771583080292, "skip_count": 3.0, "step": 1200, "text_loss": 0.30565372109413147 @@ -11412,18 +11412,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, - "avg_layers": 24.0, + "avg_layers": 25.0, "epoch": 5.64338127384796, - "f1_execute": 0.8695651888847351, + "f1_execute": 0.8936169743537903, "f1_repeat": 0.0, - "f1_skip": 0.4000000059604645, - "grad_norm": 0.25390625, + "f1_skip": 0.444444477558136, + "grad_norm": 0.2470703125, "learning_rate": 0.0009882769860143764, - "loss": 0.0331, - "macro_f1": 0.4231884181499481, + "loss": 0.0317, + "macro_f1": 0.4460204839706421, "num_tokens": 1939266.0, "repeat_count": 0.0, - "routers_loss": 0.20964151620864868, + "routers_loss": 0.18620699644088745, "skip_count": 6.0, "step": 1202, "text_loss": 0.976121723651886 @@ -11442,26 +11442,26 @@ "macro_f1": 0.6666666865348816, "num_tokens": 1942173.0, "repeat_count": 0.0, - "routers_loss": 0.00690250750631094, + "routers_loss": 0.007703613489866257, "skip_count": 1.0, "step": 1204, "text_loss": 0.5647401809692383 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 5.66216612855885, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.14453125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1484375, "learning_rate": 0.0009881433509120036, - "loss": 0.0372, - "macro_f1": 0.32098764181137085, + "loss": 0.0376, + "macro_f1": 0.5492662787437439, "num_tokens": 1945071.0, "repeat_count": 0.0, - "routers_loss": 0.022315658628940582, + "routers_loss": 0.02162683941423893, "skip_count": 2.0, "step": 1206, "text_loss": 0.24229218065738678 @@ -11474,13 +11474,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1083984375, + "grad_norm": 0.0966796875, "learning_rate": 0.0009880762527618176, - "loss": 0.0388, + "loss": 0.0383, "macro_f1": 0.3333333432674408, "num_tokens": 1949060.0, "repeat_count": 0.0, - "routers_loss": 0.017015069723129272, + "routers_loss": 0.017667081207036972, "skip_count": 0.0, "step": 1208, "text_loss": 0.4035970866680145 @@ -11493,13 +11493,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.154296875, "learning_rate": 0.0009880089675798908, - "loss": 0.0372, + "loss": 0.0367, "macro_f1": 0.3333333432674408, "num_tokens": 1951698.0, "repeat_count": 0.0, - "routers_loss": 0.006532609928399324, + "routers_loss": 0.006405784282833338, "skip_count": 0.0, "step": 1210, "text_loss": 0.5319879055023193 @@ -11512,13 +11512,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.09814453125, "learning_rate": 0.0009879414953920071, - "loss": 0.0301, + "loss": 0.0294, "macro_f1": 0.3333333432674408, "num_tokens": 1955266.0, "repeat_count": 0.0, - "routers_loss": 0.009720963425934315, + "routers_loss": 0.009859707206487656, "skip_count": 0.0, "step": 1212, "text_loss": 0.6687407493591309 @@ -11531,32 +11531,32 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1240234375, + "grad_norm": 0.130859375, "learning_rate": 0.0009878738362240219, - "loss": 0.046, + "loss": 0.045, "macro_f1": 0.5492662787437439, "num_tokens": 1958538.0, "repeat_count": 0.0, - "routers_loss": 0.03176085278391838, + "routers_loss": 0.030890554189682007, "skip_count": 2.0, "step": 1214, "text_loss": 0.20820017158985138 }, { "acc_repeat": 0.5, - "acc_skip": 0.5, - "avg_layers": 29.0, + "acc_skip": 0.0, + "avg_layers": 30.0, "epoch": 5.709128265336073, - "f1_execute": 0.9387754797935486, + "f1_execute": 0.9200000166893005, "f1_repeat": 0.5, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.2021484375, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, "learning_rate": 0.000987805990101862, - "loss": 0.0323, - "macro_f1": 0.7018141150474548, + "loss": 0.0317, + "macro_f1": 0.47333335876464844, "num_tokens": 1961419.0, "repeat_count": 2.0, - "routers_loss": 0.08626245707273483, + "routers_loss": 0.10383198410272598, "skip_count": 2.0, "step": 1216, "text_loss": 0.8664976358413696 @@ -11569,13 +11569,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1240234375, + "grad_norm": 0.1435546875, "learning_rate": 0.0009877379570515268, - "loss": 0.0374, + "loss": 0.0366, "macro_f1": 0.3333333432674408, "num_tokens": 1964836.0, "repeat_count": 0.0, - "routers_loss": 0.012099343352019787, + "routers_loss": 0.013376163318753242, "skip_count": 0.0, "step": 1218, "text_loss": 0.4223395884037018 @@ -11588,13 +11588,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.0859375, "learning_rate": 0.0009876697370990865, - "loss": 0.0342, + "loss": 0.0343, "macro_f1": 0.3333333432674408, "num_tokens": 1967620.0, "repeat_count": 0.0, - "routers_loss": 0.007713846862316132, + "routers_loss": 0.008577900938689709, "skip_count": 0.0, "step": 1220, "text_loss": 0.4789901375770569 @@ -11607,13 +11607,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.1728515625, "learning_rate": 0.0009876013302706828, - "loss": 0.0499, + "loss": 0.049, "macro_f1": 0.3333333432674408, "num_tokens": 1971100.0, "repeat_count": 0.0, - "routers_loss": 0.004629489034414291, + "routers_loss": 0.004730266984552145, "skip_count": 0.0, "step": 1222, "text_loss": 0.6799837946891785 @@ -11626,13 +11626,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08837890625, + "grad_norm": 0.08349609375, "learning_rate": 0.0009875327365925295, - "loss": 0.035, + "loss": 0.0341, "macro_f1": 0.3333333432674408, "num_tokens": 1974408.0, "repeat_count": 0.0, - "routers_loss": 0.010654795914888382, + "routers_loss": 0.010849526152014732, "skip_count": 0.0, "step": 1224, "text_loss": 0.18967926502227783 @@ -11640,18 +11640,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 26.0, + "avg_layers": 27.0, "epoch": 5.756090402113296, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.169921875, "learning_rate": 0.0009874639560909118, - "loss": 0.0516, - "macro_f1": 0.31446540355682373, + "loss": 0.0498, + "macro_f1": 0.32098767161369324, "num_tokens": 1977046.0, "repeat_count": 0.0, - "routers_loss": 0.05963074415922165, + "routers_loss": 0.04841252416372299, "skip_count": 1.0, "step": 1226, "text_loss": 0.6133310198783875 @@ -11664,13 +11664,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1328125, + "grad_norm": 0.1318359375, "learning_rate": 0.0009873949887921867, - "loss": 0.04, + "loss": 0.0402, "macro_f1": 0.3272727429866791, "num_tokens": 1980330.0, "repeat_count": 0.0, - "routers_loss": 0.028920643031597137, + "routers_loss": 0.029638588428497314, "skip_count": 1.0, "step": 1228, "text_loss": 0.15649555623531342 @@ -11678,18 +11678,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 5.774875256824186, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10595703125, + "grad_norm": 0.1103515625, "learning_rate": 0.0009873258347227823, - "loss": 0.0327, - "macro_f1": 0.3333333432674408, + "loss": 0.0331, + "macro_f1": 0.3272727429866791, "num_tokens": 1983173.0, "repeat_count": 0.0, - "routers_loss": 0.006852717138826847, + "routers_loss": 0.009955910965800285, "skip_count": 0.0, "step": 1230, "text_loss": 0.4741005599498749 @@ -11702,13 +11702,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.0849609375, "learning_rate": 0.0009872564939091989, - "loss": 0.0346, + "loss": 0.0342, "macro_f1": 0.3333333432674408, "num_tokens": 1986825.0, "repeat_count": 0.0, - "routers_loss": 0.010968753136694431, + "routers_loss": 0.010205300524830818, "skip_count": 0.0, "step": 1232, "text_loss": 0.5315462350845337 @@ -11721,13 +11721,13 @@ "f1_execute": 0.9302325248718262, "f1_repeat": 1.0, "f1_skip": 0.7272727489471436, - "grad_norm": 0.1240234375, + "grad_norm": 0.11865234375, "learning_rate": 0.0009871869663780077, - "loss": 0.0344, + "loss": 0.0336, "macro_f1": 0.8858351111412048, "num_tokens": 1990448.0, "repeat_count": 1.0, - "routers_loss": 0.0906950980424881, + "routers_loss": 0.09120134264230728, "skip_count": 7.0, "step": 1234, "text_loss": 0.6187508702278137 @@ -11740,13 +11740,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.125, "learning_rate": 0.0009871172521558522, - "loss": 0.0484, + "loss": 0.0475, "macro_f1": 0.6666666865348816, "num_tokens": 1993474.0, "repeat_count": 0.0, - "routers_loss": 0.016306072473526, + "routers_loss": 0.016188839450478554, "skip_count": 1.0, "step": 1236, "text_loss": 0.20783066749572754 @@ -11759,13 +11759,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.208984375, + "grad_norm": 0.216796875, "learning_rate": 0.0009870473512694465, - "loss": 0.038, + "loss": 0.0373, "macro_f1": 0.5934640765190125, "num_tokens": 1996536.0, "repeat_count": 0.0, - "routers_loss": 0.05804471671581268, + "routers_loss": 0.05046704784035683, "skip_count": 3.0, "step": 1238, "text_loss": 0.247748002409935 @@ -11773,18 +11773,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.5, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 5.821837393601409, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.091796875, + "f1_skip": 0.5, + "grad_norm": 0.09033203125, "learning_rate": 0.0009869772637455772, - "loss": 0.0256, - "macro_f1": 0.5492662787437439, + "loss": 0.0251, + "macro_f1": 0.4871794879436493, "num_tokens": 1999530.0, "repeat_count": 0.0, - "routers_loss": 0.045395996421575546, + "routers_loss": 0.044926248490810394, "skip_count": 2.0, "step": 1240, "text_loss": 0.26001980900764465 @@ -11797,13 +11797,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11767578125, + "grad_norm": 0.1513671875, "learning_rate": 0.000986906989611102, - "loss": 0.0438, + "loss": 0.0446, "macro_f1": 0.3272727429866791, "num_tokens": 2002782.0, "repeat_count": 0.0, - "routers_loss": 0.020834850147366524, + "routers_loss": 0.025911526754498482, "skip_count": 0.0, "step": 1242, "text_loss": 0.9009982943534851 @@ -11816,13 +11816,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1123046875, + "grad_norm": 0.115234375, "learning_rate": 0.0009868365288929492, - "loss": 0.0377, + "loss": 0.0371, "macro_f1": 0.3333333432674408, "num_tokens": 2005331.0, "repeat_count": 0.0, - "routers_loss": 0.005241698585450649, + "routers_loss": 0.0043760035187006, "skip_count": 0.0, "step": 1244, "text_loss": 0.5547386407852173 @@ -11835,13 +11835,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0966796875, + "grad_norm": 0.1005859375, "learning_rate": 0.0009867658816181206, - "loss": 0.038, + "loss": 0.0374, "macro_f1": 0.3333333432674408, "num_tokens": 2008115.0, "repeat_count": 0.0, - "routers_loss": 0.008387803100049496, + "routers_loss": 0.009227181784808636, "skip_count": 0.0, "step": 1246, "text_loss": 1.0067731142044067 @@ -11854,13 +11854,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.126953125, "learning_rate": 0.000986695047813688, - "loss": 0.0256, + "loss": 0.0261, "macro_f1": 0.3272727429866791, "num_tokens": 2011137.0, "repeat_count": 1.0, - "routers_loss": 0.02261745184659958, + "routers_loss": 0.023822437971830368, "skip_count": 0.0, "step": 1248, "text_loss": 0.30058956146240234 @@ -11873,32 +11873,32 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.10693359375, + "grad_norm": 0.1044921875, "learning_rate": 0.0009866240275067948, - "loss": 0.0435, + "loss": 0.044, "macro_f1": 0.47333335876464844, "num_tokens": 2014159.0, "repeat_count": 2.0, - "routers_loss": 0.21678555011749268, + "routers_loss": 0.21523773670196533, "skip_count": 3.0, "step": 1250, "text_loss": 0.39072203636169434 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 5.878191957734077, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1201171875, "learning_rate": 0.0009865528207246563, - "loss": 0.0358, - "macro_f1": 0.32098764181137085, + "loss": 0.0351, + "macro_f1": 0.5492662787437439, "num_tokens": 2017731.0, "repeat_count": 0.0, - "routers_loss": 0.06554054468870163, + "routers_loss": 0.06184682995080948, "skip_count": 2.0, "step": 1252, "text_loss": 0.35751575231552124 @@ -11911,13 +11911,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.166015625, "learning_rate": 0.000986481427494559, - "loss": 0.0337, + "loss": 0.0336, "macro_f1": 0.3333333432674408, "num_tokens": 2020485.0, "repeat_count": 0.0, - "routers_loss": 0.007237187586724758, + "routers_loss": 0.007573372684419155, "skip_count": 0.0, "step": 1254, "text_loss": 0.4061077833175659 @@ -11930,13 +11930,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1845703125, + "grad_norm": 0.1708984375, "learning_rate": 0.000986409847843861, - "loss": 0.0387, + "loss": 0.0382, "macro_f1": 0.3272727429866791, "num_tokens": 2024149.0, "repeat_count": 1.0, - "routers_loss": 0.08003793656826019, + "routers_loss": 0.07447971403598785, "skip_count": 0.0, "step": 1256, "text_loss": 0.41876497864723206 @@ -11949,13 +11949,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.17578125, "learning_rate": 0.000986338081799992, - "loss": 0.0341, + "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 2026545.0, "repeat_count": 0.0, - "routers_loss": 0.006424390245229006, + "routers_loss": 0.006609147880226374, "skip_count": 0.0, "step": 1258, "text_loss": 0.4673794209957123 @@ -11968,13 +11968,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10009765625, + "grad_norm": 0.1123046875, "learning_rate": 0.0009862661293904523, - "loss": 0.0482, + "loss": 0.0498, "macro_f1": 0.32098764181137085, "num_tokens": 2029581.0, "repeat_count": 0.0, - "routers_loss": 0.10797854512929916, + "routers_loss": 0.10624702274799347, "skip_count": 2.0, "step": 1260, "text_loss": 0.3483233153820038 @@ -11987,13 +11987,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.111328125, + "grad_norm": 0.1201171875, "learning_rate": 0.0009861939906428145, - "loss": 0.053, + "loss": 0.0525, "macro_f1": 0.3333333432674408, "num_tokens": 2033936.0, "repeat_count": 0.0, - "routers_loss": 0.006734046153724194, + "routers_loss": 0.007944886572659016, "skip_count": 0.0, "step": 1262, "text_loss": 0.16362667083740234 @@ -12006,13 +12006,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.11669921875, "learning_rate": 0.0009861216655847225, - "loss": 0.0373, + "loss": 0.0376, "macro_f1": 0.6666666865348816, "num_tokens": 2037876.0, "repeat_count": 1.0, - "routers_loss": 0.00564212491735816, + "routers_loss": 0.007004092447459698, "skip_count": 0.0, "step": 1264, "text_loss": 0.43228110671043396 @@ -12025,13 +12025,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1044921875, + "grad_norm": 0.1005859375, "learning_rate": 0.0009860491542438912, - "loss": 0.0472, + "loss": 0.047, "macro_f1": 0.3272727429866791, "num_tokens": 2040842.0, "repeat_count": 0.0, - "routers_loss": 0.026137735694646835, + "routers_loss": 0.026916226372122765, "skip_count": 1.0, "step": 1266, "text_loss": 0.5901188850402832 @@ -12044,13 +12044,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08203125, + "grad_norm": 0.0986328125, "learning_rate": 0.000985976456648107, - "loss": 0.0343, + "loss": 0.0353, "macro_f1": 0.3333333432674408, "num_tokens": 2043890.0, "repeat_count": 0.0, - "routers_loss": 0.0069669694639742374, + "routers_loss": 0.007325216196477413, "skip_count": 0.0, "step": 1268, "text_loss": 0.8780109882354736 @@ -12063,13 +12063,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.1142578125, + "grad_norm": 0.10205078125, "learning_rate": 0.000985903572825228, - "loss": 0.0323, + "loss": 0.0306, "macro_f1": 0.4871794879436493, "num_tokens": 2048848.0, "repeat_count": 0.0, - "routers_loss": 0.05618409812450409, + "routers_loss": 0.05007527023553848, "skip_count": 2.0, "step": 1270, "text_loss": 0.5863722562789917 @@ -12084,11 +12084,11 @@ "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.000985830502803183, - "loss": 0.0391, + "loss": 0.0396, "macro_f1": 0.3272727429866791, "num_tokens": 2051561.0, "repeat_count": 0.0, - "routers_loss": 0.025900620967149734, + "routers_loss": 0.023995524272322655, "skip_count": 0.0, "step": 1272, "text_loss": 0.7460709810256958 @@ -12101,13 +12101,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.10205078125, "learning_rate": 0.0009857572466099732, - "loss": 0.0426, + "loss": 0.0431, "macro_f1": 0.3333333432674408, "num_tokens": 2054752.0, "repeat_count": 0.0, - "routers_loss": 0.006236737594008446, + "routers_loss": 0.006928362417966127, "skip_count": 0.0, "step": 1274, "text_loss": 0.5130293369293213 @@ -12120,13 +12120,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.171875, + "grad_norm": 0.162109375, "learning_rate": 0.0009856838042736698, - "loss": 0.0503, + "loss": 0.0501, "macro_f1": 0.3333333432674408, "num_tokens": 2058151.0, "repeat_count": 0.0, - "routers_loss": 0.006367063149809837, + "routers_loss": 0.006969396956264973, "skip_count": 0.0, "step": 1276, "text_loss": 0.5911393761634827 @@ -12139,13 +12139,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1103515625, + "grad_norm": 0.1357421875, "learning_rate": 0.0009856101758224166, - "loss": 0.0442, + "loss": 0.0441, "macro_f1": 0.3333333432674408, "num_tokens": 2061012.0, "repeat_count": 0.0, - "routers_loss": 0.003392914542928338, + "routers_loss": 0.003499418031424284, "skip_count": 0.0, "step": 1278, "text_loss": 0.25347545742988586 @@ -12158,13 +12158,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0693359375, "learning_rate": 0.000985536361284428, - "loss": 0.0231, + "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2064597.0, "repeat_count": 0.0, - "routers_loss": 0.007376343477517366, + "routers_loss": 0.007856054231524467, "skip_count": 0.0, "step": 1280, "text_loss": 0.7476963400840759 @@ -12177,13 +12177,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.0888671875, "learning_rate": 0.0009854623606879898, - "loss": 0.0243, + "loss": 0.0245, "macro_f1": 0.3272727429866791, "num_tokens": 2067972.0, "repeat_count": 0.0, - "routers_loss": 0.02773376554250717, + "routers_loss": 0.02617792971432209, "skip_count": 1.0, "step": 1282, "text_loss": 0.5775872468948364 @@ -12196,13 +12196,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.09033203125, "learning_rate": 0.000985388174061459, - "loss": 0.0363, + "loss": 0.0356, "macro_f1": 0.32098767161369324, "num_tokens": 2071812.0, "repeat_count": 0.0, - "routers_loss": 0.03535797819495201, + "routers_loss": 0.035979997366666794, "skip_count": 1.0, "step": 1284, "text_loss": 0.2933400869369507 @@ -12215,13 +12215,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08447265625, "learning_rate": 0.0009853138014332646, - "loss": 0.0269, + "loss": 0.0273, "macro_f1": 0.3333333432674408, "num_tokens": 2074868.0, "repeat_count": 0.0, - "routers_loss": 0.004910993855446577, + "routers_loss": 0.005142854526638985, "skip_count": 0.0, "step": 1286, "text_loss": 0.29085102677345276 @@ -12234,13 +12234,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0888671875, + "grad_norm": 0.09033203125, "learning_rate": 0.0009852392428319058, - "loss": 0.0301, + "loss": 0.0306, "macro_f1": 0.3333333432674408, "num_tokens": 2078225.0, "repeat_count": 0.0, - "routers_loss": 0.0032444109674543142, + "routers_loss": 0.0032799106556922197, "skip_count": 0.0, "step": 1288, "text_loss": 0.7293626070022583 @@ -12253,13 +12253,13 @@ "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.0947265625, + "grad_norm": 0.08935546875, "learning_rate": 0.0009851644982859537, - "loss": 0.0272, + "loss": 0.0273, "macro_f1": 0.480392187833786, "num_tokens": 2081495.0, "repeat_count": 1.0, - "routers_loss": 0.12451831251382828, + "routers_loss": 0.12224318832159042, "skip_count": 3.0, "step": 1290, "text_loss": 0.26125892996788025 @@ -12272,13 +12272,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.1435546875, "learning_rate": 0.0009850895678240508, - "loss": 0.0289, + "loss": 0.0283, "macro_f1": 0.6666666865348816, "num_tokens": 2084390.0, "repeat_count": 1.0, - "routers_loss": 0.011074979789555073, + "routers_loss": 0.010662888176739216, "skip_count": 0.0, "step": 1292, "text_loss": 0.3510764539241791 @@ -12291,13 +12291,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1806640625, + "grad_norm": 0.1689453125, "learning_rate": 0.0009850144514749104, - "loss": 0.0336, + "loss": 0.0332, "macro_f1": 0.5492662787437439, "num_tokens": 2087210.0, "repeat_count": 0.0, - "routers_loss": 0.01774786226451397, + "routers_loss": 0.01979079470038414, "skip_count": 2.0, "step": 1294, "text_loss": 0.40202176570892334 @@ -12310,13 +12310,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.11669921875, "learning_rate": 0.000984939149267317, - "loss": 0.0251, + "loss": 0.0253, "macro_f1": 0.6666666865348816, "num_tokens": 2090777.0, "repeat_count": 0.0, - "routers_loss": 0.0052874404937028885, + "routers_loss": 0.005172552540898323, "skip_count": 1.0, "step": 1296, "text_loss": 0.5275651216506958 @@ -12329,13 +12329,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10107421875, + "grad_norm": 0.095703125, "learning_rate": 0.0009848636612301272, - "loss": 0.031, + "loss": 0.0299, "macro_f1": 0.3333333432674408, "num_tokens": 2094248.0, "repeat_count": 0.0, - "routers_loss": 0.0034106262028217316, + "routers_loss": 0.0029599082190543413, "skip_count": 0.0, "step": 1298, "text_loss": 0.4517653286457062 @@ -12348,13 +12348,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2177734375, + "grad_norm": 0.23046875, "learning_rate": 0.0009847879873922675, "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 2097139.0, "repeat_count": 0.0, - "routers_loss": 0.010383229702711105, + "routers_loss": 0.011455860920250416, "skip_count": 0.0, "step": 1300, "text_loss": 0.16888445615768433 @@ -12367,13 +12367,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0966796875, + "grad_norm": 0.09619140625, "learning_rate": 0.0009847121277827366, - "loss": 0.0304, + "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 2100415.0, "repeat_count": 0.0, - "routers_loss": 0.0076674893498420715, + "routers_loss": 0.008091195486485958, "skip_count": 0.0, "step": 1302, "text_loss": 0.40061676502227783 @@ -12386,13 +12386,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.109375, + "grad_norm": 0.1123046875, "learning_rate": 0.000984636082430604, - "loss": 0.0287, + "loss": 0.0285, "macro_f1": 0.3333333432674408, "num_tokens": 2103285.0, "repeat_count": 0.0, - "routers_loss": 0.010486516170203686, + "routers_loss": 0.009593960829079151, "skip_count": 0.0, "step": 1304, "text_loss": 0.7211073637008667 @@ -12405,13 +12405,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1142578125, + "grad_norm": 0.107421875, "learning_rate": 0.0009845598513650103, - "loss": 0.0237, + "loss": 0.0231, "macro_f1": 0.3333333432674408, "num_tokens": 2106255.0, "repeat_count": 0.0, - "routers_loss": 0.0023783023934811354, + "routers_loss": 0.0023068038281053305, "skip_count": 0.0, "step": 1306, "text_loss": 0.7077119946479797 @@ -12424,13 +12424,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.171875, "learning_rate": 0.0009844834346151674, - "loss": 0.044, + "loss": 0.043, "macro_f1": 0.3333333432674408, "num_tokens": 2109305.0, "repeat_count": 0.0, - "routers_loss": 0.006714595016092062, + "routers_loss": 0.007703019306063652, "skip_count": 0.0, "step": 1308, "text_loss": 0.3534316122531891 @@ -12443,13 +12443,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.1025390625, "learning_rate": 0.0009844068322103585, - "loss": 0.0281, + "loss": 0.0287, "macro_f1": 0.3272727429866791, "num_tokens": 2112216.0, "repeat_count": 0.0, - "routers_loss": 0.022373953834176064, + "routers_loss": 0.023549847304821014, "skip_count": 1.0, "step": 1310, "text_loss": 0.6792599558830261 @@ -12462,13 +12462,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1513671875, + "grad_norm": 0.150390625, "learning_rate": 0.0009843300441799378, - "loss": 0.0205, + "loss": 0.0211, "macro_f1": 0.3333333432674408, "num_tokens": 2114925.0, "repeat_count": 0.0, - "routers_loss": 0.007452849764376879, + "routers_loss": 0.007605871185660362, "skip_count": 0.0, "step": 1312, "text_loss": 0.1571389138698578 @@ -12481,13 +12481,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.134765625, "learning_rate": 0.0009842530705533304, - "loss": 0.0251, + "loss": 0.0253, "macro_f1": 0.3272727429866791, "num_tokens": 2117744.0, "repeat_count": 0.0, - "routers_loss": 0.016413308680057526, + "routers_loss": 0.014964760281145573, "skip_count": 0.0, "step": 1314, "text_loss": 0.7840361595153809 @@ -12500,13 +12500,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.10595703125, "learning_rate": 0.000984175911360033, - "loss": 0.0243, + "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2120848.0, "repeat_count": 0.0, - "routers_loss": 0.004676427226513624, + "routers_loss": 0.004663798492401838, "skip_count": 0.0, "step": 1316, "text_loss": 0.536246120929718 @@ -12519,13 +12519,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.123046875, + "grad_norm": 0.1201171875, "learning_rate": 0.000984098566629613, - "loss": 0.0284, + "loss": 0.0288, "macro_f1": 0.5492662787437439, "num_tokens": 2123651.0, "repeat_count": 0.0, - "routers_loss": 0.024454625323414803, + "routers_loss": 0.022852955386042595, "skip_count": 2.0, "step": 1318, "text_loss": 0.43372172117233276 @@ -12538,13 +12538,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.07958984375, "learning_rate": 0.0009840210363917087, - "loss": 0.022, + "loss": 0.0216, "macro_f1": 0.3333333432674408, "num_tokens": 2128011.0, "repeat_count": 0.0, - "routers_loss": 0.013495884835720062, + "routers_loss": 0.012578422203660011, "skip_count": 0.0, "step": 1320, "text_loss": 0.28190380334854126 @@ -12557,13 +12557,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.10986328125, "learning_rate": 0.0009839433206760306, - "loss": 0.0213, + "loss": 0.0204, "macro_f1": 0.3333333432674408, "num_tokens": 2131035.0, "repeat_count": 0.0, - "routers_loss": 0.006397814955562353, + "routers_loss": 0.006863643880933523, "skip_count": 0.0, "step": 1322, "text_loss": 0.6340444087982178 @@ -12576,13 +12576,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1796875, "learning_rate": 0.0009838654195123589, - "loss": 0.0246, + "loss": 0.0243, "macro_f1": 0.3333333432674408, "num_tokens": 2133856.0, "repeat_count": 0.0, - "routers_loss": 0.00503434706479311, + "routers_loss": 0.00468854233622551, "skip_count": 0.0, "step": 1324, "text_loss": 0.5138425827026367 @@ -12595,13 +12595,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1240234375, + "grad_norm": 0.115234375, "learning_rate": 0.0009837873329305458, - "loss": 0.0402, + "loss": 0.0396, "macro_f1": 0.6666666865348816, "num_tokens": 2136451.0, "repeat_count": 1.0, - "routers_loss": 0.005150494631379843, + "routers_loss": 0.005731126759201288, "skip_count": 0.0, "step": 1326, "text_loss": 0.742124617099762 @@ -12614,13 +12614,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1533203125, + "grad_norm": 0.17578125, "learning_rate": 0.000983709060960514, - "loss": 0.041, + "loss": 0.0416, "macro_f1": 0.3333333432674408, "num_tokens": 2139496.0, "repeat_count": 0.0, - "routers_loss": 0.004570818971842527, + "routers_loss": 0.0056343949399888515, "skip_count": 0.0, "step": 1328, "text_loss": 0.7317464351654053 @@ -12633,13 +12633,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09326171875, + "grad_norm": 0.10791015625, "learning_rate": 0.0009836306036322576, - "loss": 0.0314, + "loss": 0.0312, "macro_f1": 0.3333333432674408, "num_tokens": 2143120.0, "repeat_count": 0.0, - "routers_loss": 0.005299333017319441, + "routers_loss": 0.005127966403961182, "skip_count": 0.0, "step": 1330, "text_loss": 0.538652241230011 @@ -12652,13 +12652,13 @@ "f1_execute": 0.9130434989929199, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.111328125, + "grad_norm": 0.11083984375, "learning_rate": 0.0009835519609758415, - "loss": 0.0303, + "loss": 0.0301, "macro_f1": 0.590062141418457, "num_tokens": 2145807.0, "repeat_count": 3.0, - "routers_loss": 0.168672576546669, + "routers_loss": 0.1673707216978073, "skip_count": 4.0, "step": 1332, "text_loss": 0.3498198091983795 @@ -12671,32 +12671,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009834731330214017, - "loss": 0.0302, + "loss": 0.0293, "macro_f1": 0.3272727429866791, "num_tokens": 2148397.0, "repeat_count": 1.0, - "routers_loss": 0.05187409743666649, + "routers_loss": 0.04026653990149498, "skip_count": 0.0, "step": 1334, "text_loss": 0.8153424859046936 }, { "acc_repeat": 1.0, - "acc_skip": 1.0, - "avg_layers": 26.0, + "acc_skip": 0.800000011920929, + "avg_layers": 27.0, "epoch": 6.272380393307896, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.8999999761581421, "f1_repeat": 0.6666666865348816, - "f1_skip": 0.9090909361839294, - "grad_norm": 0.1669921875, + "f1_skip": 0.8000000715255737, + "grad_norm": 0.16015625, "learning_rate": 0.0009833941197991455, - "loss": 0.0339, - "macro_f1": 0.8329448699951172, + "loss": 0.0329, + "macro_f1": 0.7888889312744141, "num_tokens": 2152226.0, "repeat_count": 2.0, - "routers_loss": 0.05786697566509247, + "routers_loss": 0.05481519177556038, "skip_count": 5.0, "step": 1336, "text_loss": 0.7802760004997253 @@ -12709,13 +12709,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16796875, + "grad_norm": 0.1474609375, "learning_rate": 0.0009833149213393506, - "loss": 0.0315, + "loss": 0.0304, "macro_f1": 0.3272727429866791, "num_tokens": 2156023.0, "repeat_count": 0.0, - "routers_loss": 0.017055779695510864, + "routers_loss": 0.01760484278202057, "skip_count": 0.0, "step": 1338, "text_loss": 0.19721226394176483 @@ -12728,13 +12728,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.099609375, + "grad_norm": 0.11474609375, "learning_rate": 0.000983235537672366, - "loss": 0.0249, + "loss": 0.0256, "macro_f1": 0.3333333432674408, "num_tokens": 2160037.0, "repeat_count": 0.0, - "routers_loss": 0.011614206247031689, + "routers_loss": 0.013206037692725658, "skip_count": 0.0, "step": 1340, "text_loss": 0.5003817081451416 @@ -12747,13 +12747,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.1474609375, "learning_rate": 0.000983155968828612, - "loss": 0.033, + "loss": 0.0315, "macro_f1": 0.6666666865348816, "num_tokens": 2163910.0, "repeat_count": 1.0, - "routers_loss": 0.012611300684511662, + "routers_loss": 0.01256406120955944, "skip_count": 0.0, "step": 1342, "text_loss": 0.5996923446655273 @@ -12766,13 +12766,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.11962890625, "learning_rate": 0.0009830762148385793, - "loss": 0.0315, + "loss": 0.0313, "macro_f1": 0.3272727429866791, "num_tokens": 2166921.0, "repeat_count": 0.0, - "routers_loss": 0.018757276237010956, + "routers_loss": 0.015086234547197819, "skip_count": 1.0, "step": 1344, "text_loss": 0.45356282591819763 @@ -12785,13 +12785,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08447265625, "learning_rate": 0.0009829962757328297, - "loss": 0.0229, + "loss": 0.0223, "macro_f1": 0.32098764181137085, "num_tokens": 2170135.0, "repeat_count": 0.0, - "routers_loss": 0.08197146654129028, + "routers_loss": 0.07909081131219864, "skip_count": 2.0, "step": 1346, "text_loss": 0.2874644994735718 @@ -12804,13 +12804,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.068359375, "learning_rate": 0.0009829161515419959, - "loss": 0.0256, + "loss": 0.0246, "macro_f1": 0.6666666865348816, "num_tokens": 2173029.0, "repeat_count": 0.0, - "routers_loss": 0.014122758992016315, + "routers_loss": 0.013569854199886322, "skip_count": 2.0, "step": 1348, "text_loss": 0.25533875823020935 @@ -12823,13 +12823,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06005859375, + "grad_norm": 0.064453125, "learning_rate": 0.0009828358422967823, - "loss": 0.0221, + "loss": 0.0226, "macro_f1": 0.32098764181137085, "num_tokens": 2176605.0, "repeat_count": 1.0, - "routers_loss": 0.08215996623039246, + "routers_loss": 0.08111091703176498, "skip_count": 1.0, "step": 1350, "text_loss": 0.32827726006507874 @@ -12842,13 +12842,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09375, + "grad_norm": 0.091796875, "learning_rate": 0.0009827553480279627, - "loss": 0.0312, + "loss": 0.03, "macro_f1": 0.5427350401878357, "num_tokens": 2179406.0, "repeat_count": 0.0, - "routers_loss": 0.026304977014660835, + "routers_loss": 0.026550088077783585, "skip_count": 2.0, "step": 1352, "text_loss": 0.2966301143169403 @@ -12861,13 +12861,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.0791015625, "learning_rate": 0.0009826746687663832, - "loss": 0.0302, + "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 2182353.0, "repeat_count": 0.0, - "routers_loss": 0.003616038942709565, + "routers_loss": 0.003914554137736559, "skip_count": 0.0, "step": 1354, "text_loss": 0.7596251964569092 @@ -12880,13 +12880,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.0849609375, + "grad_norm": 0.0859375, "learning_rate": 0.0009825938045429602, - "loss": 0.0323, + "loss": 0.0324, "macro_f1": 0.5866667032241821, "num_tokens": 2185786.0, "repeat_count": 1.0, - "routers_loss": 0.060399893671274185, + "routers_loss": 0.059612665325403214, "skip_count": 3.0, "step": 1356, "text_loss": 0.12325898557901382 @@ -12899,13 +12899,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10302734375, + "grad_norm": 0.10009765625, "learning_rate": 0.0009825127553886807, - "loss": 0.0384, + "loss": 0.0375, "macro_f1": 0.3333333432674408, "num_tokens": 2190157.0, "repeat_count": 0.0, - "routers_loss": 0.007164204493165016, + "routers_loss": 0.0071132429875433445, "skip_count": 0.0, "step": 1358, "text_loss": 0.9287898540496826 @@ -12918,13 +12918,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0947265625, + "grad_norm": 0.0986328125, "learning_rate": 0.0009824315213346033, - "loss": 0.0343, + "loss": 0.0348, "macro_f1": 0.3333333432674408, "num_tokens": 2193077.0, "repeat_count": 0.0, - "routers_loss": 0.010965060442686081, + "routers_loss": 0.009611099027097225, "skip_count": 0.0, "step": 1360, "text_loss": 0.20427259802818298 @@ -12937,13 +12937,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.10888671875, "learning_rate": 0.0009823501024118569, - "loss": 0.0276, + "loss": 0.0285, "macro_f1": 0.3333333432674408, "num_tokens": 2196494.0, "repeat_count": 0.0, - "routers_loss": 0.00784136913716793, + "routers_loss": 0.006913455203175545, "skip_count": 0.0, "step": 1362, "text_loss": 0.574759840965271 @@ -12956,13 +12956,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.095703125, + "grad_norm": 0.10595703125, "learning_rate": 0.0009822684986516411, - "loss": 0.0251, + "loss": 0.0245, "macro_f1": 0.3333333432674408, "num_tokens": 2199839.0, "repeat_count": 0.0, - "routers_loss": 0.009101065807044506, + "routers_loss": 0.009208920411765575, "skip_count": 0.0, "step": 1364, "text_loss": 0.42422571778297424 @@ -12970,37 +12970,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 6.413266803639566, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.0927734375, "learning_rate": 0.000982186710085227, - "loss": 0.0206, - "macro_f1": 0.31446540355682373, + "loss": 0.0208, + "macro_f1": 0.32098764181137085, "num_tokens": 2203212.0, "repeat_count": 1.0, - "routers_loss": 0.05967295169830322, + "routers_loss": 0.059975091367959976, "skip_count": 1.0, "step": 1366, "text_loss": 0.29213017225265503 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 26.0, + "acc_skip": 0.25, + "avg_layers": 27.0, "epoch": 6.42265923099501, - "f1_execute": 0.9600000381469727, + "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.1875, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.181640625, "learning_rate": 0.0009821047367439561, - "loss": 0.0356, - "macro_f1": 0.542222261428833, + "loss": 0.0358, + "macro_f1": 0.44705885648727417, "num_tokens": 2206240.0, "repeat_count": 0.0, - "routers_loss": 0.05016552656888962, + "routers_loss": 0.048244867473840714, "skip_count": 4.0, "step": 1368, "text_loss": 0.3072395324707031 @@ -13013,13 +13013,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.11181640625, "learning_rate": 0.0009820225786592405, - "loss": 0.038, + "loss": 0.0375, "macro_f1": 0.3272727429866791, "num_tokens": 2209903.0, "repeat_count": 1.0, - "routers_loss": 0.02483060024678707, + "routers_loss": 0.026068156585097313, "skip_count": 0.0, "step": 1370, "text_loss": 0.5961400270462036 @@ -13032,13 +13032,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.109375, "learning_rate": 0.0009819402358625634, - "loss": 0.0373, + "loss": 0.0366, "macro_f1": 0.3272727429866791, "num_tokens": 2213439.0, "repeat_count": 0.0, - "routers_loss": 0.01982821337878704, + "routers_loss": 0.022615568712353706, "skip_count": 1.0, "step": 1372, "text_loss": 0.19375644624233246 @@ -13051,13 +13051,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1298828125, + "grad_norm": 0.1240234375, "learning_rate": 0.000981857708385479, - "loss": 0.0353, + "loss": 0.0346, "macro_f1": 0.3333333432674408, "num_tokens": 2216457.0, "repeat_count": 0.0, - "routers_loss": 0.004753436427563429, + "routers_loss": 0.005855285096913576, "skip_count": 0.0, "step": 1374, "text_loss": 0.5123368501663208 @@ -13070,13 +13070,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09912109375, + "grad_norm": 0.09423828125, "learning_rate": 0.0009817749962596114, - "loss": 0.0246, + "loss": 0.0249, "macro_f1": 0.3272727429866791, "num_tokens": 2219975.0, "repeat_count": 1.0, - "routers_loss": 0.06541594862937927, + "routers_loss": 0.0651634931564331, "skip_count": 0.0, "step": 1376, "text_loss": 0.5999220609664917 @@ -13089,13 +13089,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.09912109375, "learning_rate": 0.0009816920995166568, - "loss": 0.0376, + "loss": 0.0371, "macro_f1": 0.6666666865348816, "num_tokens": 2222833.0, "repeat_count": 1.0, - "routers_loss": 0.01156456395983696, + "routers_loss": 0.011408994905650616, "skip_count": 0.0, "step": 1378, "text_loss": 0.5323230624198914 @@ -13108,13 +13108,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2392578125, + "grad_norm": 0.205078125, "learning_rate": 0.0009816090181883807, - "loss": 0.033, + "loss": 0.0313, "macro_f1": 0.32098764181137085, "num_tokens": 2225842.0, "repeat_count": 0.0, - "routers_loss": 0.05175521597266197, + "routers_loss": 0.039720915257930756, "skip_count": 2.0, "step": 1380, "text_loss": 0.23363439738750458 @@ -13127,13 +13127,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.12255859375, "learning_rate": 0.0009815257523066204, - "loss": 0.0251, + "loss": 0.0249, "macro_f1": 0.3333333432674408, "num_tokens": 2229430.0, "repeat_count": 0.0, - "routers_loss": 0.002684591803699732, + "routers_loss": 0.002765297656878829, "skip_count": 0.0, "step": 1382, "text_loss": 0.718977689743042 @@ -13146,13 +13146,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.12890625, + "grad_norm": 0.130859375, "learning_rate": 0.0009814423019032835, - "loss": 0.0397, + "loss": 0.0396, "macro_f1": 0.5492662787437439, "num_tokens": 2232594.0, "repeat_count": 2.0, - "routers_loss": 0.054509978741407394, + "routers_loss": 0.05362323671579361, "skip_count": 0.0, "step": 1384, "text_loss": 0.6392166614532471 @@ -13165,13 +13165,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.150390625, "learning_rate": 0.0009813586670103483, "loss": 0.0426, "macro_f1": 0.6603773832321167, "num_tokens": 2236327.0, "repeat_count": 1.0, - "routers_loss": 0.04031623527407646, + "routers_loss": 0.031728316098451614, "skip_count": 1.0, "step": 1386, "text_loss": 0.5951619148254395 @@ -13184,13 +13184,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1142578125, + "grad_norm": 0.126953125, "learning_rate": 0.0009812748476598638, - "loss": 0.0308, + "loss": 0.031, "macro_f1": 0.5492662787437439, "num_tokens": 2239746.0, "repeat_count": 0.0, - "routers_loss": 0.039687711745500565, + "routers_loss": 0.03981253132224083, "skip_count": 2.0, "step": 1388, "text_loss": 0.22756551206111908 @@ -13203,13 +13203,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.12353515625, + "grad_norm": 0.12451171875, "learning_rate": 0.0009811908438839498, - "loss": 0.0329, + "loss": 0.0331, "macro_f1": 0.5492662787437439, "num_tokens": 2242786.0, "repeat_count": 0.0, - "routers_loss": 0.04785723611712456, + "routers_loss": 0.04617162421345711, "skip_count": 2.0, "step": 1390, "text_loss": 0.3233799934387207 @@ -13222,13 +13222,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1298828125, + "grad_norm": 0.154296875, "learning_rate": 0.000981106655714797, - "loss": 0.0359, + "loss": 0.0358, "macro_f1": 0.3272727429866791, "num_tokens": 2245696.0, "repeat_count": 0.0, - "routers_loss": 0.046765491366386414, + "routers_loss": 0.046828847378492355, "skip_count": 1.0, "step": 1392, "text_loss": 0.24273279309272766 @@ -13241,13 +13241,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0771484375, + "grad_norm": 0.07373046875, "learning_rate": 0.0009810222831846656, - "loss": 0.0303, + "loss": 0.0307, "macro_f1": 0.5492662787437439, "num_tokens": 2249326.0, "repeat_count": 0.0, - "routers_loss": 0.015151665546000004, + "routers_loss": 0.010921589098870754, "skip_count": 2.0, "step": 1394, "text_loss": 0.3921460807323456 @@ -13260,13 +13260,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.09423828125, "learning_rate": 0.0009809377263258882, - "loss": 0.0321, + "loss": 0.0315, "macro_f1": 0.32098767161369324, "num_tokens": 2253393.0, "repeat_count": 0.0, - "routers_loss": 0.04431106895208359, + "routers_loss": 0.04564022272825241, "skip_count": 1.0, "step": 1396, "text_loss": 0.582602858543396 @@ -13279,13 +13279,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09814453125, + "grad_norm": 0.103515625, "learning_rate": 0.000980852985170867, - "loss": 0.0317, + "loss": 0.0328, "macro_f1": 0.3272727429866791, "num_tokens": 2256626.0, "repeat_count": 0.0, - "routers_loss": 0.012700649909675121, + "routers_loss": 0.013289985246956348, "skip_count": 0.0, "step": 1398, "text_loss": 0.41031694412231445 @@ -13298,13 +13298,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1728515625, "learning_rate": 0.0009807680597520745, - "loss": 0.0256, + "loss": 0.0264, "macro_f1": 0.3333333432674408, "num_tokens": 2259326.0, "repeat_count": 0.0, - "routers_loss": 0.005919010378420353, + "routers_loss": 0.0065213534981012344, "skip_count": 0.0, "step": 1400, "text_loss": 0.2888098657131195 @@ -13317,13 +13317,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.23046875, "learning_rate": 0.0009806829501020546, - "loss": 0.0372, + "loss": 0.0358, "macro_f1": 0.3272727429866791, "num_tokens": 2262344.0, "repeat_count": 0.0, - "routers_loss": 0.04717765748500824, + "routers_loss": 0.04199840500950813, "skip_count": 1.0, "step": 1402, "text_loss": 0.31973034143447876 @@ -13336,13 +13336,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0771484375, + "grad_norm": 0.08935546875, "learning_rate": 0.0009805976562534215, "loss": 0.0317, "macro_f1": 0.6603773832321167, "num_tokens": 2266354.0, "repeat_count": 1.0, - "routers_loss": 0.015415813773870468, + "routers_loss": 0.015434930101037025, "skip_count": 1.0, "step": 1404, "text_loss": 0.508630633354187 @@ -13355,13 +13355,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.140625, "learning_rate": 0.0009805121782388599, "loss": 0.0339, "macro_f1": 0.6533333659172058, "num_tokens": 2269660.0, "repeat_count": 2.0, - "routers_loss": 0.06812979280948639, + "routers_loss": 0.0720924660563469, "skip_count": 2.0, "step": 1406, "text_loss": 0.40927737951278687 @@ -13374,13 +13374,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05908203125, + "grad_norm": 0.0634765625, "learning_rate": 0.0009804265160911253, - "loss": 0.0265, + "loss": 0.0266, "macro_f1": 0.5492662787437439, "num_tokens": 2273335.0, "repeat_count": 0.0, - "routers_loss": 0.025383235886693, + "routers_loss": 0.02400495670735836, "skip_count": 2.0, "step": 1408, "text_loss": 0.1777762621641159 @@ -13393,13 +13393,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.2314453125, "learning_rate": 0.0009803406698430433, - "loss": 0.0367, + "loss": 0.0371, "macro_f1": 0.3272727429866791, "num_tokens": 2277107.0, "repeat_count": 0.0, - "routers_loss": 0.026493225246667862, + "routers_loss": 0.02560107782483101, "skip_count": 1.0, "step": 1410, "text_loss": 0.17955881357192993 @@ -13412,13 +13412,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.07470703125, "learning_rate": 0.0009802546395275104, - "loss": 0.0342, + "loss": 0.0349, "macro_f1": 0.3333333432674408, "num_tokens": 2281638.0, "repeat_count": 0.0, - "routers_loss": 0.006616846192628145, + "routers_loss": 0.006655813194811344, "skip_count": 0.0, "step": 1412, "text_loss": 0.20882295072078705 @@ -13431,32 +13431,32 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.0888671875, + "grad_norm": 0.08740234375, "learning_rate": 0.000980168425177494, - "loss": 0.0328, + "loss": 0.0342, "macro_f1": 0.8200000524520874, "num_tokens": 2284876.0, "repeat_count": 1.0, - "routers_loss": 0.060631848871707916, + "routers_loss": 0.06325097382068634, "skip_count": 3.0, "step": 1414, "text_loss": 0.26035264134407043 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 6.648077487525683, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.138671875, "learning_rate": 0.000980082026826031, - "loss": 0.0317, - "macro_f1": 0.6666666865348816, + "loss": 0.0315, + "macro_f1": 0.3272727429866791, "num_tokens": 2288938.0, "repeat_count": 1.0, - "routers_loss": 0.011199389584362507, + "routers_loss": 0.013436575420200825, "skip_count": 0.0, "step": 1416, "text_loss": 0.5502325892448425 @@ -13469,13 +13469,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.07177734375, "learning_rate": 0.0009799954445062296, - "loss": 0.0192, + "loss": 0.0193, "macro_f1": 0.6603773832321167, "num_tokens": 2292317.0, "repeat_count": 1.0, - "routers_loss": 0.01120354700833559, + "routers_loss": 0.011264479719102383, "skip_count": 1.0, "step": 1418, "text_loss": 0.48075684905052185 @@ -13488,13 +13488,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.16796875, + "grad_norm": 0.1611328125, "learning_rate": 0.0009799086782512686, - "loss": 0.0294, + "loss": 0.0292, "macro_f1": 0.5492662787437439, "num_tokens": 2295935.0, "repeat_count": 0.0, - "routers_loss": 0.030204148963093758, + "routers_loss": 0.02833271212875843, "skip_count": 2.0, "step": 1420, "text_loss": 0.18221206963062286 @@ -13507,13 +13507,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0888671875, + "grad_norm": 0.09375, "learning_rate": 0.0009798217280943967, - "loss": 0.0348, + "loss": 0.0356, "macro_f1": 0.6666666865348816, "num_tokens": 2298927.0, "repeat_count": 0.0, - "routers_loss": 0.008244800381362438, + "routers_loss": 0.009208574891090393, "skip_count": 1.0, "step": 1422, "text_loss": 0.48686322569847107 @@ -13526,32 +13526,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09716796875, + "grad_norm": 0.09423828125, "learning_rate": 0.0009797345940689335, - "loss": 0.0269, + "loss": 0.0267, "macro_f1": 0.3272727429866791, "num_tokens": 2301541.0, "repeat_count": 0.0, - "routers_loss": 0.015340043231844902, + "routers_loss": 0.015011847950518131, "skip_count": 0.0, "step": 1424, "text_loss": 0.49446266889572144 }, { "acc_repeat": 0.0, - "acc_skip": 0.6000000238418579, - "avg_layers": 25.0, + "acc_skip": 0.4000000059604645, + "avg_layers": 26.0, "epoch": 6.695039624302906, - "f1_execute": 0.9583333134651184, + "f1_execute": 0.9387754797935486, "f1_repeat": 0.0, - "f1_skip": 0.75, - "grad_norm": 0.1318359375, + "f1_skip": 0.5714285969734192, + "grad_norm": 0.1337890625, "learning_rate": 0.0009796472762082687, - "loss": 0.0341, - "macro_f1": 0.5694444179534912, + "loss": 0.0338, + "macro_f1": 0.5034013986587524, "num_tokens": 2304589.0, "repeat_count": 0.0, - "routers_loss": 0.058681465685367584, + "routers_loss": 0.05912091210484505, "skip_count": 5.0, "step": 1426, "text_loss": 0.23945684731006622 @@ -13564,32 +13564,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.095703125, + "grad_norm": 0.09765625, "learning_rate": 0.000979559774545863, - "loss": 0.0423, + "loss": 0.0405, "macro_f1": 0.3272727429866791, "num_tokens": 2307860.0, "repeat_count": 0.0, - "routers_loss": 0.020810559391975403, + "routers_loss": 0.021242303773760796, "skip_count": 1.0, "step": 1428, "text_loss": 0.531273365020752 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 6.713824479013795, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.09033203125, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, "learning_rate": 0.000979472089115247, - "loss": 0.0268, - "macro_f1": 0.5492662787437439, + "loss": 0.0276, + "macro_f1": 0.32098764181137085, "num_tokens": 2311581.0, "repeat_count": 0.0, - "routers_loss": 0.030001837760210037, + "routers_loss": 0.02768544852733612, "skip_count": 2.0, "step": 1430, "text_loss": 0.2497459501028061 @@ -13602,13 +13602,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1318359375, + "grad_norm": 0.12255859375, "learning_rate": 0.000979384219950022, - "loss": 0.034, + "loss": 0.0346, "macro_f1": 0.3333333432674408, "num_tokens": 2314639.0, "repeat_count": 0.0, - "routers_loss": 0.010381575673818588, + "routers_loss": 0.008678150363266468, "skip_count": 0.0, "step": 1432, "text_loss": 0.6579355001449585 @@ -13621,32 +13621,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08056640625, "learning_rate": 0.0009792961670838595, - "loss": 0.0365, + "loss": 0.0362, "macro_f1": 0.3272727429866791, "num_tokens": 2317927.0, "repeat_count": 1.0, - "routers_loss": 0.03234704211354256, + "routers_loss": 0.03325597569346428, "skip_count": 0.0, "step": 1434, "text_loss": 0.5209436416625977 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 6.742001761080129, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.1494140625, "learning_rate": 0.0009792079305505016, - "loss": 0.0303, - "macro_f1": 0.6666666865348816, + "loss": 0.0306, + "macro_f1": 0.3272727429866791, "num_tokens": 2321065.0, "repeat_count": 1.0, - "routers_loss": 0.015481291338801384, + "routers_loss": 0.019228918477892876, "skip_count": 0.0, "step": 1436, "text_loss": 0.41087067127227783 @@ -13659,13 +13659,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1103515625, + "grad_norm": 0.10986328125, "learning_rate": 0.000979119510383761, - "loss": 0.0366, + "loss": 0.0371, "macro_f1": 0.3333333432674408, "num_tokens": 2323714.0, "repeat_count": 0.0, - "routers_loss": 0.018170451745390892, + "routers_loss": 0.017071325331926346, "skip_count": 0.0, "step": 1438, "text_loss": 0.21490029990673065 @@ -13678,13 +13678,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.2060546875, "learning_rate": 0.00097903090661752, - "loss": 0.0306, + "loss": 0.0309, "macro_f1": 0.3333333432674408, "num_tokens": 2326454.0, "repeat_count": 0.0, - "routers_loss": 0.010385681875050068, + "routers_loss": 0.00991755723953247, "skip_count": 0.0, "step": 1440, "text_loss": 0.23847346007823944 @@ -13697,13 +13697,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.232421875, "learning_rate": 0.000978942119285732, - "loss": 0.0407, + "loss": 0.0404, "macro_f1": 0.3272727429866791, "num_tokens": 2329462.0, "repeat_count": 0.0, - "routers_loss": 0.04976538568735123, + "routers_loss": 0.04908733069896698, "skip_count": 1.0, "step": 1442, "text_loss": 0.23343028128147125 @@ -13716,13 +13716,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.091796875, + "grad_norm": 0.1044921875, "learning_rate": 0.0009788531484224204, - "loss": 0.0255, + "loss": 0.0264, "macro_f1": 0.3333333432674408, "num_tokens": 2332146.0, "repeat_count": 0.0, - "routers_loss": 0.0030266831163316965, + "routers_loss": 0.0032628148328512907, "skip_count": 0.0, "step": 1444, "text_loss": 0.47423800826072693 @@ -13730,18 +13730,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 6.788963897857353, - "f1_execute": 0.9600000381469727, - "f1_repeat": 1.0, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, - "grad_norm": 0.107421875, + "grad_norm": 0.10693359375, "learning_rate": 0.0009787639940616788, - "loss": 0.0411, - "macro_f1": 0.8200000524520874, + "loss": 0.0405, + "macro_f1": 0.7018141150474548, "num_tokens": 2335738.0, "repeat_count": 1.0, - "routers_loss": 0.13420957326889038, + "routers_loss": 0.14336998760700226, "skip_count": 3.0, "step": 1446, "text_loss": 0.21837592124938965 @@ -13754,13 +13754,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1953125, + "grad_norm": 0.189453125, "learning_rate": 0.0009786746562376717, - "loss": 0.0251, + "loss": 0.0241, "macro_f1": 0.6666666865348816, "num_tokens": 2338488.0, "repeat_count": 0.0, - "routers_loss": 0.012779864482581615, + "routers_loss": 0.010542908683419228, "skip_count": 1.0, "step": 1448, "text_loss": 1.0614757537841797 @@ -13773,13 +13773,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1728515625, "learning_rate": 0.0009785851349846334, - "loss": 0.0266, + "loss": 0.0268, "macro_f1": 0.3333333432674408, "num_tokens": 2342074.0, "repeat_count": 0.0, - "routers_loss": 0.005545398220419884, + "routers_loss": 0.005998016335070133, "skip_count": 0.0, "step": 1450, "text_loss": 0.4269719421863556 @@ -13792,13 +13792,13 @@ "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.09814453125, + "grad_norm": 0.1083984375, "learning_rate": 0.0009784954303368686, - "loss": 0.0395, + "loss": 0.0384, "macro_f1": 0.44705885648727417, "num_tokens": 2345838.0, "repeat_count": 0.0, - "routers_loss": 0.0899835154414177, + "routers_loss": 0.0959126204252243, "skip_count": 3.0, "step": 1452, "text_loss": 0.3315916955471039 @@ -13811,13 +13811,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09716796875, + "grad_norm": 0.1005859375, "learning_rate": 0.0009784055423287521, "loss": 0.0218, "macro_f1": 0.3333333432674408, "num_tokens": 2348939.0, "repeat_count": 0.0, - "routers_loss": 0.002738836221396923, + "routers_loss": 0.0025467623490840197, "skip_count": 0.0, "step": 1454, "text_loss": 0.6162732839584351 @@ -13830,13 +13830,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.115234375, "learning_rate": 0.0009783154709947293, - "loss": 0.0266, + "loss": 0.0256, "macro_f1": 0.3272727429866791, "num_tokens": 2352232.0, "repeat_count": 0.0, - "routers_loss": 0.020522192120552063, + "routers_loss": 0.01860538125038147, "skip_count": 1.0, "step": 1456, "text_loss": 0.23928768932819366 @@ -13844,18 +13844,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 6.84531846199002, - "f1_execute": 0.9629629850387573, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.09912109375, "learning_rate": 0.0009782252163693158, - "loss": 0.0197, - "macro_f1": 0.32098767161369324, + "loss": 0.0201, + "macro_f1": 0.3272727429866791, "num_tokens": 2355159.0, "repeat_count": 0.0, - "routers_loss": 0.04245268926024437, + "routers_loss": 0.04412713274359703, "skip_count": 1.0, "step": 1458, "text_loss": 0.3371323347091675 @@ -13868,13 +13868,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.224609375, + "grad_norm": 0.21484375, "learning_rate": 0.0009781347784870973, - "loss": 0.0376, + "loss": 0.0379, "macro_f1": 0.3333333432674408, "num_tokens": 2358175.0, "repeat_count": 0.0, - "routers_loss": 0.009142685681581497, + "routers_loss": 0.006809141952544451, "skip_count": 0.0, "step": 1460, "text_loss": 0.547267735004425 @@ -13887,13 +13887,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.095703125, "learning_rate": 0.0009780441573827296, - "loss": 0.0295, + "loss": 0.03, "macro_f1": 0.3076923191547394, "num_tokens": 2360991.0, "repeat_count": 0.0, - "routers_loss": 0.08038893342018127, + "routers_loss": 0.08924390375614166, "skip_count": 4.0, "step": 1462, "text_loss": 0.7026563882827759 @@ -13906,13 +13906,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.1865234375, "learning_rate": 0.000977953353090939, - "loss": 0.027, + "loss": 0.0272, "macro_f1": 0.3333333432674408, "num_tokens": 2363894.0, "repeat_count": 0.0, - "routers_loss": 0.02107175625860691, + "routers_loss": 0.021858472377061844, "skip_count": 0.0, "step": 1464, "text_loss": 0.2718065083026886 @@ -13925,13 +13925,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.11474609375, "learning_rate": 0.0009778623656465219, - "loss": 0.0349, + "loss": 0.0338, "macro_f1": 0.32098764181137085, "num_tokens": 2367265.0, "repeat_count": 0.0, - "routers_loss": 0.042030055075883865, + "routers_loss": 0.044781096279621124, "skip_count": 0.0, "step": 1466, "text_loss": 0.5008095502853394 @@ -13944,13 +13944,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07470703125, + "grad_norm": 0.06689453125, "learning_rate": 0.0009777711950843448, - "loss": 0.022, + "loss": 0.0212, "macro_f1": 0.3333333432674408, "num_tokens": 2370186.0, "repeat_count": 0.0, - "routers_loss": 0.004230673424899578, + "routers_loss": 0.0040459707379341125, "skip_count": 0.0, "step": 1468, "text_loss": 0.5242461562156677 @@ -13963,13 +13963,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.134765625, "learning_rate": 0.0009776798414393446, - "loss": 0.0284, + "loss": 0.0279, "macro_f1": 0.6598639488220215, "num_tokens": 2373314.0, "repeat_count": 1.0, - "routers_loss": 0.06986775249242783, + "routers_loss": 0.0708528608083725, "skip_count": 3.0, "step": 1470, "text_loss": 0.2821732461452484 @@ -13982,13 +13982,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.142578125, + "grad_norm": 0.1328125, "learning_rate": 0.0009775883047465279, - "loss": 0.0431, + "loss": 0.0414, "macro_f1": 0.31446540355682373, "num_tokens": 2376435.0, "repeat_count": 1.0, - "routers_loss": 0.0439564548432827, + "routers_loss": 0.0290578193962574, "skip_count": 1.0, "step": 1472, "text_loss": 0.8438440561294556 @@ -14001,13 +14001,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1123046875, + "grad_norm": 0.10546875, "learning_rate": 0.000977496585040972, - "loss": 0.0376, + "loss": 0.0373, "macro_f1": 0.3333333432674408, "num_tokens": 2380244.0, "repeat_count": 0.0, - "routers_loss": 0.011889892630279064, + "routers_loss": 0.010360375046730042, "skip_count": 0.0, "step": 1474, "text_loss": 0.4356135427951813 @@ -14020,13 +14020,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1015625, + "grad_norm": 0.09912109375, "learning_rate": 0.000977404682357824, - "loss": 0.0295, + "loss": 0.0294, "macro_f1": 0.3272727429866791, "num_tokens": 2383498.0, "repeat_count": 0.0, - "routers_loss": 0.022536326199769974, + "routers_loss": 0.023518972098827362, "skip_count": 0.0, "step": 1476, "text_loss": 0.25195425748825073 @@ -14039,13 +14039,13 @@ "f1_execute": 0.9743589162826538, "f1_repeat": 0.888888955116272, "f1_skip": 1.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.11181640625, "learning_rate": 0.000977312596732301, - "loss": 0.0388, + "loss": 0.0375, "macro_f1": 0.9544159770011902, "num_tokens": 2386414.0, "repeat_count": 5.0, - "routers_loss": 0.07959948480129242, + "routers_loss": 0.08190606534481049, "skip_count": 4.0, "step": 1478, "text_loss": 0.6586798429489136 @@ -14058,13 +14058,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.095703125, + "grad_norm": 0.10546875, "learning_rate": 0.0009772203281996905, - "loss": 0.0341, + "loss": 0.0336, "macro_f1": 1.0, "num_tokens": 2389399.0, "repeat_count": 1.0, - "routers_loss": 0.019112225621938705, + "routers_loss": 0.016441475600004196, "skip_count": 2.0, "step": 1480, "text_loss": 0.3671986758708954 @@ -14077,13 +14077,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0888671875, + "grad_norm": 0.09814453125, "learning_rate": 0.0009771278767953502, - "loss": 0.0345, + "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 2392400.0, "repeat_count": 0.0, - "routers_loss": 0.018750866875052452, + "routers_loss": 0.019211363047361374, "skip_count": 0.0, "step": 1482, "text_loss": 0.27418580651283264 @@ -14096,32 +14096,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09228515625, + "grad_norm": 0.0947265625, "learning_rate": 0.0009770352425547072, - "loss": 0.0291, + "loss": 0.0292, "macro_f1": 0.3333333432674408, "num_tokens": 2395123.0, "repeat_count": 0.0, - "routers_loss": 0.015407348051667213, + "routers_loss": 0.015800386667251587, "skip_count": 0.0, "step": 1484, "text_loss": 0.19896622002124786 }, { - "acc_repeat": 0.6666666865348816, + "acc_repeat": 0.3333333432674408, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 29.0, "epoch": 6.976812444966246, - "f1_execute": 0.9803921580314636, - "f1_repeat": 0.800000011920929, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.5, "f1_skip": 0.0, - "grad_norm": 0.11474609375, + "grad_norm": 0.12890625, "learning_rate": 0.0009769424255132596, - "loss": 0.0258, - "macro_f1": 0.5934640765190125, + "loss": 0.0256, + "macro_f1": 0.4871794879436493, "num_tokens": 2397359.0, "repeat_count": 3.0, - "routers_loss": 0.06514479219913483, + "routers_loss": 0.06670158356428146, "skip_count": 0.0, "step": 1486, "text_loss": 0.4229799509048462 @@ -14134,13 +14134,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.111328125, + "grad_norm": 0.1162109375, "learning_rate": 0.0009768494257065747, - "loss": 0.0217, + "loss": 0.0218, "macro_f1": 0.3272727429866791, "num_tokens": 2400387.0, "repeat_count": 0.0, - "routers_loss": 0.013567833229899406, + "routers_loss": 0.011144762858748436, "skip_count": 1.0, "step": 1488, "text_loss": 0.4264226257801056 @@ -14153,13 +14153,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12255859375, + "grad_norm": 0.12353515625, "learning_rate": 0.0009767562431702904, - "loss": 0.0389, + "loss": 0.0387, "macro_f1": 0.3006536364555359, "num_tokens": 2403241.0, "repeat_count": 2.0, - "routers_loss": 0.13762018084526062, + "routers_loss": 0.12339717149734497, "skip_count": 3.0, "step": 1490, "text_loss": 0.2850193977355957 @@ -14172,13 +14172,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.061767578125, + "grad_norm": 0.07177734375, "learning_rate": 0.0009766628779401142, - "loss": 0.0214, + "loss": 0.0215, "macro_f1": 0.6666666865348816, "num_tokens": 2406087.0, "repeat_count": 0.0, - "routers_loss": 0.008640666492283344, + "routers_loss": 0.008174685761332512, "skip_count": 1.0, "step": 1492, "text_loss": 0.6756544709205627 @@ -14191,13 +14191,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05712890625, + "grad_norm": 0.0673828125, "learning_rate": 0.000976569330051824, - "loss": 0.0182, + "loss": 0.0186, "macro_f1": 0.3333333432674408, "num_tokens": 2409312.0, "repeat_count": 0.0, - "routers_loss": 0.0018257038900628686, + "routers_loss": 0.0021256296895444393, "skip_count": 0.0, "step": 1494, "text_loss": 0.4789894223213196 @@ -14210,13 +14210,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.053955078125, "learning_rate": 0.0009764755995412677, "loss": 0.0193, "macro_f1": 0.3333333432674408, "num_tokens": 2412758.0, "repeat_count": 0.0, - "routers_loss": 0.003656312357634306, + "routers_loss": 0.003944927826523781, "skip_count": 0.0, "step": 1496, "text_loss": 0.5157490968704224 @@ -14229,13 +14229,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1005859375, + "grad_norm": 0.09228515625, "learning_rate": 0.0009763816864443627, - "loss": 0.0246, + "loss": 0.0239, "macro_f1": 0.3272727429866791, "num_tokens": 2416079.0, "repeat_count": 1.0, - "routers_loss": 0.044268425554037094, + "routers_loss": 0.03893325850367546, "skip_count": 0.0, "step": 1498, "text_loss": 0.28045418858528137 @@ -14248,13 +14248,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.1279296875, "learning_rate": 0.0009762875907970968, - "loss": 0.0207, + "loss": 0.0199, "macro_f1": 0.3333333432674408, "num_tokens": 2420340.0, "repeat_count": 0.0, - "routers_loss": 0.0018966116476804018, + "routers_loss": 0.0017725443467497826, "skip_count": 0.0, "step": 1500, "text_loss": 0.35550856590270996 @@ -14267,32 +14267,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.06298828125, "learning_rate": 0.0009761933126355277, - "loss": 0.0249, + "loss": 0.0245, "macro_f1": 0.3272727429866791, "num_tokens": 2424735.0, "repeat_count": 0.0, - "routers_loss": 0.01729201152920723, + "routers_loss": 0.01393749937415123, "skip_count": 1.0, "step": 1502, "text_loss": 0.38840189576148987 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 7.06105077781039, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.11962890625, + "f1_skip": 1.0, + "grad_norm": 0.1630859375, "learning_rate": 0.0009760988519957828, - "loss": 0.0248, - "macro_f1": 0.5492662787437439, + "loss": 0.0249, + "macro_f1": 0.6666666865348816, "num_tokens": 2428132.0, "repeat_count": 0.0, - "routers_loss": 0.01693531684577465, + "routers_loss": 0.01687910407781601, "skip_count": 2.0, "step": 1504, "text_loss": 0.3031681478023529 @@ -14305,13 +14305,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.064453125, "learning_rate": 0.0009760042089140598, - "loss": 0.0197, + "loss": 0.0193, "macro_f1": 0.3144654333591461, "num_tokens": 2431592.0, "repeat_count": 1.0, - "routers_loss": 0.04939094930887222, + "routers_loss": 0.04704280197620392, "skip_count": 2.0, "step": 1506, "text_loss": 0.16355200111865997 @@ -14324,13 +14324,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.0986328125, "learning_rate": 0.0009759093834266259, - "loss": 0.0213, + "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 2434236.0, "repeat_count": 0.0, - "routers_loss": 0.0016892930725589395, + "routers_loss": 0.0016075772000476718, "skip_count": 0.0, "step": 1508, "text_loss": 0.6080073118209839 @@ -14343,13 +14343,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10009765625, + "grad_norm": 0.1025390625, "learning_rate": 0.0009758143755698186, - "loss": 0.0147, + "loss": 0.015, "macro_f1": 0.3333333432674408, "num_tokens": 2437170.0, "repeat_count": 0.0, - "routers_loss": 0.008671467192471027, + "routers_loss": 0.008451299741864204, "skip_count": 0.0, "step": 1510, "text_loss": 0.22100484371185303 @@ -14362,13 +14362,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.068359375, + "grad_norm": 0.06689453125, "learning_rate": 0.0009757191853800449, - "loss": 0.0228, + "loss": 0.0227, "macro_f1": 0.5866667032241821, "num_tokens": 2441187.0, "repeat_count": 1.0, - "routers_loss": 0.042682576924562454, + "routers_loss": 0.046565692871809006, "skip_count": 3.0, "step": 1512, "text_loss": 0.25098952651023865 @@ -14381,13 +14381,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.11279296875, "learning_rate": 0.000975623812893782, - "loss": 0.028, + "loss": 0.0276, "macro_f1": 0.3272727429866791, "num_tokens": 2444664.0, "repeat_count": 0.0, - "routers_loss": 0.02905822917819023, + "routers_loss": 0.02872578240931034, "skip_count": 1.0, "step": 1514, "text_loss": 0.4952253997325897 @@ -14400,13 +14400,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09326171875, + "grad_norm": 0.1142578125, "learning_rate": 0.0009755282581475768, - "loss": 0.0223, + "loss": 0.0233, "macro_f1": 0.3333333432674408, "num_tokens": 2447748.0, "repeat_count": 0.0, - "routers_loss": 0.0018810008186846972, + "routers_loss": 0.002055214950814843, "skip_count": 0.0, "step": 1516, "text_loss": 0.7465500831604004 @@ -14419,13 +14419,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10693359375, + "grad_norm": 0.10302734375, "learning_rate": 0.000975432521178046, - "loss": 0.0219, + "loss": 0.0216, "macro_f1": 0.3272727429866791, "num_tokens": 2450834.0, "repeat_count": 1.0, - "routers_loss": 0.04308714717626572, + "routers_loss": 0.04498551785945892, "skip_count": 0.0, "step": 1518, "text_loss": 0.28144413232803345 @@ -14438,13 +14438,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.095703125, + "grad_norm": 0.09912109375, "learning_rate": 0.0009753366020218763, - "loss": 0.0232, + "loss": 0.0234, "macro_f1": 0.3333333432674408, "num_tokens": 2454233.0, "repeat_count": 0.0, - "routers_loss": 0.003754811594262719, + "routers_loss": 0.003669742727652192, "skip_count": 0.0, "step": 1520, "text_loss": 0.5667551755905151 @@ -14457,32 +14457,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08837890625, + "grad_norm": 0.0830078125, "learning_rate": 0.0009752405007158238, - "loss": 0.0246, + "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2457331.0, "repeat_count": 0.0, - "routers_loss": 0.010853761807084084, + "routers_loss": 0.010455607436597347, "skip_count": 0.0, "step": 1522, "text_loss": 0.19575810432434082 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.5, "acc_skip": 1.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 7.154975051364837, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.0771484375, + "grad_norm": 0.0751953125, "learning_rate": 0.0009751442172967151, - "loss": 0.0196, - "macro_f1": 1.0, + "loss": 0.0193, + "macro_f1": 0.8823530077934265, "num_tokens": 2459935.0, "repeat_count": 2.0, - "routers_loss": 0.015100379474461079, + "routers_loss": 0.025189083069562912, "skip_count": 1.0, "step": 1524, "text_loss": 0.45453405380249023 @@ -14495,13 +14495,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08203125, + "grad_norm": 0.0927734375, "learning_rate": 0.000975047751801446, - "loss": 0.0189, + "loss": 0.0187, "macro_f1": 0.3272727429866791, "num_tokens": 2463008.0, "repeat_count": 0.0, - "routers_loss": 0.011991916224360466, + "routers_loss": 0.012297490611672401, "skip_count": 0.0, "step": 1526, "text_loss": 0.31437572836875916 @@ -14514,32 +14514,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.1044921875, "learning_rate": 0.0009749511042669823, - "loss": 0.0226, + "loss": 0.0233, "macro_f1": 0.3333333432674408, "num_tokens": 2466475.0, "repeat_count": 0.0, - "routers_loss": 0.008201062679290771, + "routers_loss": 0.011026266030967236, "skip_count": 0.0, "step": 1528, "text_loss": 0.46604859828948975 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 7.183152333431171, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.1181640625, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, "learning_rate": 0.0009748542747303595, - "loss": 0.0174, - "macro_f1": 0.6666666865348816, + "loss": 0.0182, + "macro_f1": 0.3272727429866791, "num_tokens": 2469320.0, "repeat_count": 0.0, - "routers_loss": 0.008513177745044231, + "routers_loss": 0.011934996582567692, "skip_count": 1.0, "step": 1530, "text_loss": 0.7764923572540283 @@ -14552,13 +14552,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.091796875, + "grad_norm": 0.0966796875, "learning_rate": 0.0009747572632286827, - "loss": 0.02, + "loss": 0.0203, "macro_f1": 0.3333333432674408, "num_tokens": 2472468.0, "repeat_count": 0.0, - "routers_loss": 0.004850955214351416, + "routers_loss": 0.005786920432001352, "skip_count": 0.0, "step": 1532, "text_loss": 0.3555782437324524 @@ -14571,32 +14571,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.087890625, + "grad_norm": 0.0849609375, "learning_rate": 0.0009746600697991271, - "loss": 0.0206, + "loss": 0.02, "macro_f1": 0.6666666865348816, "num_tokens": 2475736.0, "repeat_count": 1.0, - "routers_loss": 0.0027650354895740747, + "routers_loss": 0.0026990731712430716, "skip_count": 0.0, "step": 1534, "text_loss": 0.49561792612075806 }, { "acc_repeat": 1.0, - "acc_skip": 0.0, - "avg_layers": 29.0, + "acc_skip": 0.5, + "avg_layers": 28.0, "epoch": 7.2113296154975055, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, - "f1_skip": 0.0, - "grad_norm": 0.0615234375, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0556640625, "learning_rate": 0.0009745626944789375, - "loss": 0.0209, - "macro_f1": 0.6538461446762085, + "loss": 0.0204, + "macro_f1": 0.8823530077934265, "num_tokens": 2478887.0, "repeat_count": 1.0, - "routers_loss": 0.023268593475222588, + "routers_loss": 0.020221207290887833, "skip_count": 2.0, "step": 1536, "text_loss": 0.5375416278839111 @@ -14609,13 +14609,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11669921875, + "grad_norm": 0.12158203125, "learning_rate": 0.0009744651373054279, "loss": 0.0286, "macro_f1": 0.3272727429866791, "num_tokens": 2481293.0, "repeat_count": 0.0, - "routers_loss": 0.031235001981258392, + "routers_loss": 0.03131086751818657, "skip_count": 1.0, "step": 1538, "text_loss": 0.5241039395332336 @@ -14628,13 +14628,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.080078125, + "grad_norm": 0.08984375, "learning_rate": 0.0009743673983159828, - "loss": 0.023, + "loss": 0.0241, "macro_f1": 0.6122449040412903, "num_tokens": 2484403.0, "repeat_count": 0.0, - "routers_loss": 0.042398080229759216, + "routers_loss": 0.04448170214891434, "skip_count": 4.0, "step": 1540, "text_loss": 0.7465724349021912 @@ -14647,13 +14647,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.099609375, + "grad_norm": 0.08935546875, "learning_rate": 0.0009742694775480557, - "loss": 0.0268, + "loss": 0.0265, "macro_f1": 0.6666666865348816, "num_tokens": 2487952.0, "repeat_count": 0.0, - "routers_loss": 0.007361465133726597, + "routers_loss": 0.007171491626650095, "skip_count": 1.0, "step": 1542, "text_loss": 0.2877117097377777 @@ -14666,13 +14666,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.07275390625, "learning_rate": 0.0009741713750391703, - "loss": 0.0166, + "loss": 0.0171, "macro_f1": 0.6666666865348816, "num_tokens": 2490815.0, "repeat_count": 1.0, - "routers_loss": 0.0052334014326334, + "routers_loss": 0.004559285007417202, "skip_count": 0.0, "step": 1544, "text_loss": 0.6097800135612488 @@ -14685,13 +14685,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.06787109375, "learning_rate": 0.0009740730908269193, "loss": 0.0174, "macro_f1": 0.3333333432674408, "num_tokens": 2494727.0, "repeat_count": 0.0, - "routers_loss": 0.004993532784283161, + "routers_loss": 0.005271553061902523, "skip_count": 0.0, "step": 1546, "text_loss": 0.5431114435195923 @@ -14704,13 +14704,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0703125, "learning_rate": 0.0009739746249489658, - "loss": 0.0248, + "loss": 0.0239, "macro_f1": 0.3333333432674408, "num_tokens": 2499266.0, "repeat_count": 0.0, - "routers_loss": 0.001611889572814107, + "routers_loss": 0.0015409323386847973, "skip_count": 0.0, "step": 1548, "text_loss": 0.4702678322792053 @@ -14723,13 +14723,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.1171875, "learning_rate": 0.0009738759774430417, - "loss": 0.0209, + "loss": 0.0216, "macro_f1": 0.32098764181137085, "num_tokens": 2502273.0, "repeat_count": 1.0, - "routers_loss": 0.03059260919690132, + "routers_loss": 0.030183158814907074, "skip_count": 1.0, "step": 1550, "text_loss": 0.3239189088344574 @@ -14742,32 +14742,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056396484375, + "grad_norm": 0.0498046875, "learning_rate": 0.0009737771483469493, - "loss": 0.0195, + "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 2507624.0, "repeat_count": 0.0, - "routers_loss": 0.00508903618901968, + "routers_loss": 0.005410848651081324, "skip_count": 0.0, "step": 1552, "text_loss": 0.4014642834663391 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 7.295861461696507, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, + "f1_skip": 1.0, "grad_norm": 0.07763671875, "learning_rate": 0.0009736781376985598, - "loss": 0.0174, - "macro_f1": 0.3272727429866791, + "loss": 0.0168, + "macro_f1": 0.6666666865348816, "num_tokens": 2510366.0, "repeat_count": 0.0, - "routers_loss": 0.007860450074076653, + "routers_loss": 0.0066976165398955345, "skip_count": 1.0, "step": 1554, "text_loss": 0.5924848914146423 @@ -14780,13 +14780,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11669921875, + "grad_norm": 0.13671875, "learning_rate": 0.0009735789455358144, - "loss": 0.0217, + "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 2513317.0, "repeat_count": 0.0, - "routers_loss": 0.0027370608877390623, + "routers_loss": 0.002763477386906743, "skip_count": 0.0, "step": 1556, "text_loss": 0.3222943842411041 @@ -14799,13 +14799,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10302734375, + "grad_norm": 0.11767578125, "learning_rate": 0.0009734795718967237, - "loss": 0.0276, + "loss": 0.0283, "macro_f1": 0.32098764181137085, "num_tokens": 2516628.0, "repeat_count": 0.0, - "routers_loss": 0.061584725975990295, + "routers_loss": 0.061566028743982315, "skip_count": 2.0, "step": 1558, "text_loss": 0.3249334692955017 @@ -14818,13 +14818,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.10693359375, + "grad_norm": 0.095703125, "learning_rate": 0.0009733800168193679, "loss": 0.0228, "macro_f1": 1.0, "num_tokens": 2519424.0, "repeat_count": 2.0, - "routers_loss": 0.01694316789507866, + "routers_loss": 0.017976421862840652, "skip_count": 4.0, "step": 1560, "text_loss": 0.3341919481754303 @@ -14837,13 +14837,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1826171875, "learning_rate": 0.0009732802803418966, - "loss": 0.0234, + "loss": 0.023, "macro_f1": 0.3333333432674408, "num_tokens": 2522922.0, "repeat_count": 0.0, - "routers_loss": 0.0023331891279667616, + "routers_loss": 0.002525332849472761, "skip_count": 0.0, "step": 1562, "text_loss": 0.3176332712173462 @@ -14856,13 +14856,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.07861328125, "learning_rate": 0.0009731803625025292, - "loss": 0.0203, + "loss": 0.0196, "macro_f1": 0.3272727429866791, "num_tokens": 2525811.0, "repeat_count": 0.0, - "routers_loss": 0.021300682798027992, + "routers_loss": 0.015524424612522125, "skip_count": 1.0, "step": 1564, "text_loss": 0.532774031162262 @@ -14875,13 +14875,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.10205078125, "learning_rate": 0.0009730802633395541, - "loss": 0.026, + "loss": 0.0257, "macro_f1": 0.6603773832321167, "num_tokens": 2529157.0, "repeat_count": 1.0, - "routers_loss": 0.08335043489933014, + "routers_loss": 0.08138631284236908, "skip_count": 1.0, "step": 1566, "text_loss": 0.529487133026123 @@ -14894,13 +14894,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.07666015625, "learning_rate": 0.0009729799828913298, - "loss": 0.0224, + "loss": 0.0223, "macro_f1": 0.3333333432674408, "num_tokens": 2532249.0, "repeat_count": 0.0, - "routers_loss": 0.003535634372383356, + "routers_loss": 0.0035867292899638414, "skip_count": 0.0, "step": 1568, "text_loss": 0.503160297870636 @@ -14913,13 +14913,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.06298828125, + "grad_norm": 0.06884765625, "learning_rate": 0.0009728795211962838, "loss": 0.0259, "macro_f1": 0.5492662787437439, "num_tokens": 2535904.0, "repeat_count": 0.0, - "routers_loss": 0.025729363784193993, + "routers_loss": 0.02987455204129219, "skip_count": 2.0, "step": 1570, "text_loss": 0.9170270562171936 @@ -14932,13 +14932,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1357421875, + "grad_norm": 0.11865234375, "learning_rate": 0.0009727788782929131, - "loss": 0.0287, + "loss": 0.0273, "macro_f1": 0.3272727429866791, "num_tokens": 2538943.0, "repeat_count": 1.0, - "routers_loss": 0.059166863560676575, + "routers_loss": 0.04676021635532379, "skip_count": 0.0, "step": 1572, "text_loss": 0.29146310687065125 @@ -14951,13 +14951,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.0654296875, "learning_rate": 0.0009726780542197844, - "loss": 0.0173, + "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 2541805.0, "repeat_count": 0.0, - "routers_loss": 0.002580022206529975, + "routers_loss": 0.002127803163602948, "skip_count": 0.0, "step": 1574, "text_loss": 1.0126502513885498 @@ -14970,13 +14970,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.142578125, "learning_rate": 0.0009725770490155338, - "loss": 0.0257, + "loss": 0.0262, "macro_f1": 0.3333333432674408, "num_tokens": 2546213.0, "repeat_count": 0.0, - "routers_loss": 0.007746981456875801, + "routers_loss": 0.007609677035361528, "skip_count": 0.0, "step": 1576, "text_loss": 0.190168559551239 @@ -14989,13 +14989,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.083984375, "learning_rate": 0.0009724758627188665, - "loss": 0.0344, + "loss": 0.0356, "macro_f1": 0.3272727429866791, "num_tokens": 2549554.0, "repeat_count": 0.0, - "routers_loss": 0.027308562770485878, + "routers_loss": 0.033554721623659134, "skip_count": 1.0, "step": 1578, "text_loss": 0.2977406084537506 @@ -15008,13 +15008,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.140625, "learning_rate": 0.0009723744953685572, - "loss": 0.0277, + "loss": 0.028, "macro_f1": 0.3272727429866791, "num_tokens": 2552785.0, "repeat_count": 1.0, - "routers_loss": 0.029863199219107628, + "routers_loss": 0.027864238247275352, "skip_count": 0.0, "step": 1580, "text_loss": 0.2700682580471039 @@ -15027,13 +15027,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.19921875, "learning_rate": 0.0009722729470034503, - "loss": 0.0218, + "loss": 0.0224, "macro_f1": 0.3333333432674408, "num_tokens": 2556550.0, "repeat_count": 0.0, - "routers_loss": 0.004019706044346094, + "routers_loss": 0.004798175301402807, "skip_count": 0.0, "step": 1582, "text_loss": 0.6559903025627136 @@ -15046,32 +15046,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07177734375, + "grad_norm": 0.078125, "learning_rate": 0.0009721712176624591, - "loss": 0.0239, + "loss": 0.0242, "macro_f1": 0.3333333432674408, "num_tokens": 2559862.0, "repeat_count": 0.0, - "routers_loss": 0.014162382110953331, + "routers_loss": 0.013764148578047752, "skip_count": 0.0, "step": 1584, "text_loss": 0.2257535308599472 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 7.446140299383622, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10986328125, "learning_rate": 0.0009720693073845667, - "loss": 0.0338, - "macro_f1": 0.32098764181137085, + "loss": 0.032, + "macro_f1": 0.5492662787437439, "num_tokens": 2562766.0, "repeat_count": 0.0, - "routers_loss": 0.023485012352466583, + "routers_loss": 0.01937069371342659, "skip_count": 2.0, "step": 1586, "text_loss": 0.178413525223732 @@ -15079,37 +15079,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 7.455532726739067, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.126953125, + "grad_norm": 0.150390625, "learning_rate": 0.0009719672162088252, - "loss": 0.0308, - "macro_f1": 0.3272727429866791, + "loss": 0.0306, + "macro_f1": 0.32098767161369324, "num_tokens": 2566583.0, "repeat_count": 1.0, - "routers_loss": 0.05822715163230896, + "routers_loss": 0.06224144622683525, "skip_count": 0.0, "step": 1588, "text_loss": 0.3992367684841156 }, { - "acc_repeat": 0.5, - "acc_skip": 0.5, + "acc_repeat": 1.0, + "acc_skip": 0.75, "avg_layers": 27.0, "epoch": 7.464925154094511, - "f1_execute": 0.936170220375061, - "f1_repeat": 0.6666666865348816, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.189453125, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.185546875, "learning_rate": 0.0009718649441743559, - "loss": 0.0243, - "macro_f1": 0.7565011978149414, + "loss": 0.0239, + "macro_f1": 0.9449735879898071, "num_tokens": 2569516.0, "repeat_count": 2.0, - "routers_loss": 0.07448136061429977, + "routers_loss": 0.06937911361455917, "skip_count": 4.0, "step": 1590, "text_loss": 0.1945122629404068 @@ -15122,13 +15122,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.0654296875, "learning_rate": 0.00097176249132035, - "loss": 0.0228, + "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2572418.0, "repeat_count": 0.0, - "routers_loss": 0.0038424162194132805, + "routers_loss": 0.0034326619934290648, "skip_count": 0.0, "step": 1592, "text_loss": 0.6259906888008118 @@ -15141,13 +15141,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.08642578125, "learning_rate": 0.0009716598576860676, - "loss": 0.0277, + "loss": 0.0278, "macro_f1": 0.6666666865348816, "num_tokens": 2575235.0, "repeat_count": 1.0, - "routers_loss": 0.005674343090504408, + "routers_loss": 0.004557516425848007, "skip_count": 0.0, "step": 1594, "text_loss": 0.6638736724853516 @@ -15160,13 +15160,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.189453125, + "grad_norm": 0.193359375, "learning_rate": 0.0009715570433108378, - "loss": 0.0209, + "loss": 0.0198, "macro_f1": 1.0, "num_tokens": 2578157.0, "repeat_count": 1.0, - "routers_loss": 0.015544800087809563, + "routers_loss": 0.015363055281341076, "skip_count": 1.0, "step": 1596, "text_loss": 0.6530464887619019 @@ -15179,13 +15179,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1484375, "learning_rate": 0.0009714540482340595, - "loss": 0.0279, + "loss": 0.0268, "macro_f1": 0.6666666865348816, "num_tokens": 2581801.0, "repeat_count": 1.0, - "routers_loss": 0.013199405744671822, + "routers_loss": 0.01257144846022129, "skip_count": 0.0, "step": 1598, "text_loss": 0.5916110277175903 @@ -15198,13 +15198,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059326171875, + "grad_norm": 0.058837890625, "learning_rate": 0.0009713508724952006, - "loss": 0.0178, + "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 2585204.0, "repeat_count": 0.0, - "routers_loss": 0.0032487998250871897, + "routers_loss": 0.003175645601004362, "skip_count": 0.0, "step": 1600, "text_loss": 0.27901601791381836 @@ -15217,13 +15217,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12255859375, + "grad_norm": 0.12353515625, "learning_rate": 0.0009712475161337981, - "loss": 0.0253, + "loss": 0.0261, "macro_f1": 0.3333333432674408, "num_tokens": 2588286.0, "repeat_count": 0.0, - "routers_loss": 0.0041928659193217754, + "routers_loss": 0.004122321493923664, "skip_count": 0.0, "step": 1602, "text_loss": 0.42420244216918945 @@ -15236,13 +15236,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.07470703125, "learning_rate": 0.0009711439791894585, - "loss": 0.0343, + "loss": 0.0341, "macro_f1": 0.6666666865348816, "num_tokens": 2591476.0, "repeat_count": 0.0, - "routers_loss": 0.011576149612665176, + "routers_loss": 0.011215819045901299, "skip_count": 1.0, "step": 1604, "text_loss": 0.5549933910369873 @@ -15255,13 +15255,13 @@ "f1_execute": 0.9599999785423279, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.07568359375, + "grad_norm": 0.0703125, "learning_rate": 0.0009710402617018574, - "loss": 0.0179, + "loss": 0.0172, "macro_f1": 0.8200000524520874, "num_tokens": 2594336.0, "repeat_count": 1.0, - "routers_loss": 0.03026912547647953, + "routers_loss": 0.02916567400097847, "skip_count": 2.0, "step": 1606, "text_loss": 0.3263779282569885 @@ -15276,11 +15276,11 @@ "f1_skip": 1.0, "grad_norm": 0.068359375, "learning_rate": 0.0009709363637107393, - "loss": 0.021, + "loss": 0.0209, "macro_f1": 0.6666666865348816, "num_tokens": 2597462.0, "repeat_count": 0.0, - "routers_loss": 0.014957098290324211, + "routers_loss": 0.015897957608103752, "skip_count": 1.0, "step": 1608, "text_loss": 0.20917139947414398 @@ -15293,13 +15293,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.1611328125, "learning_rate": 0.0009708322852559184, - "loss": 0.0226, + "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2601543.0, "repeat_count": 0.0, - "routers_loss": 0.00254683755338192, + "routers_loss": 0.002211357234045863, "skip_count": 0.0, "step": 1610, "text_loss": 0.450550377368927 @@ -15312,13 +15312,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1748046875, + "grad_norm": 0.1728515625, "learning_rate": 0.0009707280263772776, - "loss": 0.0286, + "loss": 0.0277, "macro_f1": 0.6666666865348816, "num_tokens": 2604462.0, "repeat_count": 0.0, - "routers_loss": 0.018759876489639282, + "routers_loss": 0.01615734025835991, "skip_count": 2.0, "step": 1612, "text_loss": 0.6908381581306458 @@ -15337,7 +15337,7 @@ "macro_f1": 0.5492662787437439, "num_tokens": 2607484.0, "repeat_count": 0.0, - "routers_loss": 0.022694367915391922, + "routers_loss": 0.022048067301511765, "skip_count": 2.0, "step": 1614, "text_loss": 0.36691340804100037 @@ -15350,13 +15350,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.103515625, + "grad_norm": 0.10546875, "learning_rate": 0.0009705189675084138, - "loss": 0.0181, + "loss": 0.0176, "macro_f1": 0.6666666865348816, "num_tokens": 2610204.0, "repeat_count": 0.0, - "routers_loss": 0.010102321393787861, + "routers_loss": 0.008503952994942665, "skip_count": 1.0, "step": 1616, "text_loss": 0.5226598381996155 @@ -15369,13 +15369,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08984375, + "grad_norm": 0.09228515625, "learning_rate": 0.0009704141675983029, - "loss": 0.0252, + "loss": 0.0248, "macro_f1": 0.3333333432674408, "num_tokens": 2613128.0, "repeat_count": 0.0, - "routers_loss": 0.0020994991064071655, + "routers_loss": 0.0019020626787096262, "skip_count": 0.0, "step": 1618, "text_loss": 0.6465088725090027 @@ -15388,13 +15388,13 @@ "f1_execute": 0.9333333373069763, "f1_repeat": 0.0, "f1_skip": 0.7272727489471436, - "grad_norm": 0.10009765625, + "grad_norm": 0.107421875, "learning_rate": 0.0009703091874245956, - "loss": 0.0323, + "loss": 0.032, "macro_f1": 0.5535354018211365, "num_tokens": 2616360.0, "repeat_count": 0.0, - "routers_loss": 0.11748704314231873, + "routers_loss": 0.11837691068649292, "skip_count": 7.0, "step": 1620, "text_loss": 0.2987039089202881 @@ -15407,32 +15407,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.061767578125, + "grad_norm": 0.06689453125, "learning_rate": 0.0009702040270275204, - "loss": 0.018, + "loss": 0.0181, "macro_f1": 0.3333333432674408, "num_tokens": 2619606.0, "repeat_count": 0.0, - "routers_loss": 0.007642311509698629, + "routers_loss": 0.0065958453342318535, "skip_count": 0.0, "step": 1622, "text_loss": 0.6262096166610718 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 7.62459641913707, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.10595703125, + "f1_skip": 1.0, + "grad_norm": 0.103515625, "learning_rate": 0.000970098686447375, - "loss": 0.0258, - "macro_f1": 0.3272727429866791, + "loss": 0.0257, + "macro_f1": 0.6666666865348816, "num_tokens": 2622499.0, "repeat_count": 0.0, - "routers_loss": 0.016890225932002068, + "routers_loss": 0.013632026500999928, "skip_count": 1.0, "step": 1624, "text_loss": 0.2392602562904358 @@ -15445,13 +15445,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1240234375, + "grad_norm": 0.125, "learning_rate": 0.0009699931657245264, - "loss": 0.0242, + "loss": 0.0245, "macro_f1": 0.5492662787437439, "num_tokens": 2626002.0, "repeat_count": 0.0, - "routers_loss": 0.010900186374783516, + "routers_loss": 0.012147823348641396, "skip_count": 2.0, "step": 1626, "text_loss": 0.4742976129055023 @@ -15464,13 +15464,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0810546875, + "grad_norm": 0.0849609375, "learning_rate": 0.0009698874648994098, - "loss": 0.0279, + "loss": 0.0285, "macro_f1": 1.0, "num_tokens": 2629847.0, "repeat_count": 1.0, - "routers_loss": 0.011229799129068851, + "routers_loss": 0.010692884214222431, "skip_count": 3.0, "step": 1628, "text_loss": 0.5090685486793518 @@ -15483,13 +15483,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.1240234375, "learning_rate": 0.0009697815840125304, - "loss": 0.0275, + "loss": 0.0265, "macro_f1": 0.3333333432674408, "num_tokens": 2633529.0, "repeat_count": 0.0, - "routers_loss": 0.0105878422036767, + "routers_loss": 0.011442207731306553, "skip_count": 0.0, "step": 1630, "text_loss": 0.1874329298734665 @@ -15502,13 +15502,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.2119140625, "learning_rate": 0.0009696755231044618, - "loss": 0.0209, + "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 2636321.0, "repeat_count": 0.0, - "routers_loss": 0.002953991526737809, + "routers_loss": 0.0026681360322982073, "skip_count": 0.0, "step": 1632, "text_loss": 0.7650400400161743 @@ -15521,13 +15521,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10888671875, + "grad_norm": 0.10498046875, "learning_rate": 0.0009695692822158466, - "loss": 0.0241, + "loss": 0.0242, "macro_f1": 0.3272727429866791, "num_tokens": 2638840.0, "repeat_count": 1.0, - "routers_loss": 0.04717390984296799, + "routers_loss": 0.033965807408094406, "skip_count": 0.0, "step": 1634, "text_loss": 0.6175784468650818 @@ -15540,13 +15540,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.058349609375, "learning_rate": 0.0009694628613873968, - "loss": 0.0179, + "loss": 0.018, "macro_f1": 0.3333333432674408, "num_tokens": 2641886.0, "repeat_count": 0.0, - "routers_loss": 0.0073657832108438015, + "routers_loss": 0.007568214554339647, "skip_count": 0.0, "step": 1636, "text_loss": 0.43139931559562683 @@ -15559,13 +15559,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.193359375, "learning_rate": 0.0009693562606598929, - "loss": 0.0259, + "loss": 0.025, "macro_f1": 0.3333333432674408, "num_tokens": 2645028.0, "repeat_count": 0.0, - "routers_loss": 0.005212752148509026, + "routers_loss": 0.004973865579813719, "skip_count": 0.0, "step": 1638, "text_loss": 0.6430339217185974 @@ -15578,13 +15578,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.06982421875, "learning_rate": 0.0009692494800741844, - "loss": 0.0304, + "loss": 0.0313, "macro_f1": 0.3272727429866791, "num_tokens": 2648209.0, "repeat_count": 1.0, - "routers_loss": 0.04311618581414223, + "routers_loss": 0.049863800406455994, "skip_count": 0.0, "step": 1640, "text_loss": 0.28138160705566406 @@ -15597,13 +15597,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08251953125, + "grad_norm": 0.08544921875, "learning_rate": 0.0009691425196711901, - "loss": 0.039, + "loss": 0.0398, "macro_f1": 0.3272727429866791, "num_tokens": 2651171.0, "repeat_count": 0.0, - "routers_loss": 0.02027471922338009, + "routers_loss": 0.02112230286002159, "skip_count": 0.0, "step": 1642, "text_loss": 0.3745322525501251 @@ -15616,13 +15616,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.0703125, "learning_rate": 0.0009690353794918971, - "loss": 0.0279, + "loss": 0.0275, "macro_f1": 0.3333333432674408, "num_tokens": 2654093.0, "repeat_count": 0.0, - "routers_loss": 0.003074956126511097, + "routers_loss": 0.0024304776452481747, "skip_count": 0.0, "step": 1644, "text_loss": 0.4275154173374176 @@ -15635,13 +15635,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.0771484375, "learning_rate": 0.000968928059577362, - "loss": 0.0241, + "loss": 0.0244, "macro_f1": 0.6666666865348816, "num_tokens": 2657079.0, "repeat_count": 0.0, - "routers_loss": 0.009374706074595451, + "routers_loss": 0.009320619516074657, "skip_count": 1.0, "step": 1646, "text_loss": 0.46650025248527527 @@ -15654,13 +15654,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1162109375, + "grad_norm": 0.09814453125, "learning_rate": 0.0009688205599687099, - "loss": 0.0218, + "loss": 0.0209, "macro_f1": 0.3272727429866791, "num_tokens": 2660951.0, "repeat_count": 0.0, - "routers_loss": 0.01204691268503666, + "routers_loss": 0.011913162656128407, "skip_count": 0.0, "step": 1648, "text_loss": 0.46644100546836853 @@ -15673,13 +15673,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.1083984375, "learning_rate": 0.0009687128807071347, "loss": 0.0284, "macro_f1": 0.3333333432674408, "num_tokens": 2663823.0, "repeat_count": 0.0, - "routers_loss": 0.01376053225249052, + "routers_loss": 0.013754756189882755, "skip_count": 0.0, "step": 1650, "text_loss": 0.40808847546577454 @@ -15692,13 +15692,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.103515625, "learning_rate": 0.0009686050218338996, - "loss": 0.0285, + "loss": 0.0286, "macro_f1": 0.3333333432674408, "num_tokens": 2667079.0, "repeat_count": 0.0, - "routers_loss": 0.009346984326839447, + "routers_loss": 0.009099726565182209, "skip_count": 0.0, "step": 1652, "text_loss": 0.2389989197254181 @@ -15711,13 +15711,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.08837890625, "learning_rate": 0.0009684969833903359, - "loss": 0.0291, + "loss": 0.0283, "macro_f1": 0.6666666865348816, "num_tokens": 2670162.0, "repeat_count": 0.0, - "routers_loss": 0.002724624238908291, + "routers_loss": 0.0034928603563457727, "skip_count": 1.0, "step": 1654, "text_loss": 0.6930749416351318 @@ -15730,13 +15730,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.123046875, + "grad_norm": 0.10888671875, "learning_rate": 0.0009683887654178445, - "loss": 0.0271, + "loss": 0.0261, "macro_f1": 0.6666666865348816, "num_tokens": 2673031.0, "repeat_count": 0.0, - "routers_loss": 0.00823777075856924, + "routers_loss": 0.008340462110936642, "skip_count": 1.0, "step": 1656, "text_loss": 0.277752548456192 @@ -15749,32 +15749,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07373046875, + "grad_norm": 0.06884765625, "learning_rate": 0.0009682803679578947, - "loss": 0.0262, + "loss": 0.0259, "macro_f1": 0.3333333432674408, "num_tokens": 2676092.0, "repeat_count": 0.0, - "routers_loss": 0.004393119364976883, + "routers_loss": 0.004337446764111519, "skip_count": 0.0, "step": 1658, "text_loss": 0.5176776051521301 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 7.7936601115350745, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.1513671875, + "f1_skip": 0.0, + "grad_norm": 0.169921875, "learning_rate": 0.0009681717910520244, - "loss": 0.024, - "macro_f1": 0.5492662787437439, + "loss": 0.0242, + "macro_f1": 0.32098764181137085, "num_tokens": 2679479.0, "repeat_count": 0.0, - "routers_loss": 0.031827569007873535, + "routers_loss": 0.034611742943525314, "skip_count": 2.0, "step": 1660, "text_loss": 0.21485982835292816 @@ -15789,11 +15789,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.07958984375, "learning_rate": 0.0009680630347418406, - "loss": 0.0216, + "loss": 0.022, "macro_f1": 0.5492662787437439, "num_tokens": 2683289.0, "repeat_count": 0.0, - "routers_loss": 0.03329647704958916, + "routers_loss": 0.03297121450304985, "skip_count": 2.0, "step": 1662, "text_loss": 0.33801013231277466 @@ -15806,13 +15806,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1728515625, "learning_rate": 0.000967954099069019, - "loss": 0.0415, + "loss": 0.0411, "macro_f1": 0.32098764181137085, "num_tokens": 2685879.0, "repeat_count": 1.0, - "routers_loss": 0.047317031770944595, + "routers_loss": 0.04551183059811592, "skip_count": 1.0, "step": 1664, "text_loss": 0.41123488545417786 @@ -15827,11 +15827,11 @@ "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.0009678449840753038, - "loss": 0.0325, + "loss": 0.0324, "macro_f1": 0.32098764181137085, "num_tokens": 2688910.0, "repeat_count": 0.0, - "routers_loss": 0.05649980902671814, + "routers_loss": 0.05866450071334839, "skip_count": 2.0, "step": 1666, "text_loss": 0.1740892380475998 @@ -15844,13 +15844,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.09228515625, "learning_rate": 0.0009677356898025082, - "loss": 0.0229, + "loss": 0.023, "macro_f1": 0.3333333432674408, "num_tokens": 2691680.0, "repeat_count": 0.0, - "routers_loss": 0.01004624180495739, + "routers_loss": 0.009243223816156387, "skip_count": 0.0, "step": 1668, "text_loss": 0.2512350380420685 @@ -15863,13 +15863,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.09619140625, "learning_rate": 0.000967626216292514, - "loss": 0.0194, + "loss": 0.0195, "macro_f1": 0.3333333432674408, "num_tokens": 2694895.0, "repeat_count": 0.0, - "routers_loss": 0.0054973396472632885, + "routers_loss": 0.005576452240347862, "skip_count": 0.0, "step": 1670, "text_loss": 0.43294376134872437 @@ -15882,13 +15882,13 @@ "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.09619140625, + "grad_norm": 0.09130859375, "learning_rate": 0.0009675165635872715, - "loss": 0.031, + "loss": 0.0306, "macro_f1": 0.44705885648727417, "num_tokens": 2697806.0, "repeat_count": 0.0, - "routers_loss": 0.05615650862455368, + "routers_loss": 0.05372785031795502, "skip_count": 3.0, "step": 1672, "text_loss": 0.1614082306623459 @@ -15901,13 +15901,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.11669921875, "learning_rate": 0.0009674067317288, - "loss": 0.0301, + "loss": 0.0296, "macro_f1": 0.6666666865348816, "num_tokens": 2700529.0, "repeat_count": 1.0, - "routers_loss": 0.012819192372262478, + "routers_loss": 0.018131591379642487, "skip_count": 0.0, "step": 1674, "text_loss": 0.2093173861503601 @@ -15920,13 +15920,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.08203125, "learning_rate": 0.0009672967207591869, - "loss": 0.0253, + "loss": 0.0257, "macro_f1": 0.3272727429866791, "num_tokens": 2703650.0, "repeat_count": 0.0, - "routers_loss": 0.07059332728385925, + "routers_loss": 0.0673515796661377, "skip_count": 1.0, "step": 1676, "text_loss": 0.3029400110244751 @@ -15939,13 +15939,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.11669921875, "learning_rate": 0.0009671865307205892, - "loss": 0.0198, + "loss": 0.021, "macro_f1": 0.32098767161369324, "num_tokens": 2707615.0, "repeat_count": 0.0, - "routers_loss": 0.029778441414237022, + "routers_loss": 0.03821169584989548, "skip_count": 1.0, "step": 1678, "text_loss": 0.2262786477804184 @@ -15958,13 +15958,13 @@ "f1_execute": 0.9756097793579102, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, - "grad_norm": 0.1416015625, + "grad_norm": 0.1396484375, "learning_rate": 0.0009670761616552315, - "loss": 0.0474, + "loss": 0.0465, "macro_f1": 0.9615669250488281, "num_tokens": 2710894.0, "repeat_count": 2.0, - "routers_loss": 0.04371272772550583, + "routers_loss": 0.042625464498996735, "skip_count": 6.0, "step": 1680, "text_loss": 0.29623574018478394 @@ -15977,13 +15977,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.169921875, "learning_rate": 0.0009669656136054074, - "loss": 0.0293, + "loss": 0.0289, "macro_f1": 0.3333333432674408, "num_tokens": 2714330.0, "repeat_count": 0.0, - "routers_loss": 0.0033591394312679768, + "routers_loss": 0.0037571541033685207, "skip_count": 0.0, "step": 1682, "text_loss": 0.7510389089584351 @@ -15996,13 +15996,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.072265625, + "grad_norm": 0.07421875, "learning_rate": 0.0009668548866134795, - "loss": 0.0259, + "loss": 0.0256, "macro_f1": 0.3333333432674408, "num_tokens": 2717176.0, "repeat_count": 0.0, - "routers_loss": 0.005085585173219442, + "routers_loss": 0.004142968449741602, "skip_count": 0.0, "step": 1684, "text_loss": 0.3273485600948334 @@ -16015,13 +16015,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0712890625, + "grad_norm": 0.07373046875, "learning_rate": 0.0009667439807218783, - "loss": 0.0243, + "loss": 0.0233, "macro_f1": 0.6666666865348816, "num_tokens": 2720628.0, "repeat_count": 0.0, - "routers_loss": 0.008569681085646152, + "routers_loss": 0.008753842674195766, "skip_count": 2.0, "step": 1686, "text_loss": 0.4314708709716797 @@ -16034,32 +16034,32 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.0732421875, "learning_rate": 0.0009666328959731033, - "loss": 0.022, + "loss": 0.0211, "macro_f1": 0.6603773832321167, "num_tokens": 2723739.0, "repeat_count": 1.0, - "routers_loss": 0.024587804451584816, + "routers_loss": 0.022674910724163055, "skip_count": 1.0, "step": 1688, "text_loss": 0.25734150409698486 }, { "acc_repeat": 0.0, - "acc_skip": 0.3333333432674408, - "avg_layers": 27.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, "epoch": 7.934546521866745, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, - "f1_skip": 0.5, - "grad_norm": 0.169921875, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1552734375, "learning_rate": 0.0009665216324097222, - "loss": 0.0332, - "macro_f1": 0.4871794879436493, + "loss": 0.0324, + "macro_f1": 0.5934640765190125, "num_tokens": 2726644.0, "repeat_count": 0.0, - "routers_loss": 0.037516288459300995, + "routers_loss": 0.03932750225067139, "skip_count": 3.0, "step": 1690, "text_loss": 0.24511034786701202 @@ -16072,13 +16072,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.09765625, "learning_rate": 0.0009664101900743714, - "loss": 0.0262, + "loss": 0.0255, "macro_f1": 0.3272727429866791, "num_tokens": 2729662.0, "repeat_count": 0.0, - "routers_loss": 0.01287431176751852, + "routers_loss": 0.012672754004597664, "skip_count": 1.0, "step": 1692, "text_loss": 0.39431414008140564 @@ -16091,13 +16091,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.07763671875, + "grad_norm": 0.076171875, "learning_rate": 0.000966298569009756, - "loss": 0.0227, + "loss": 0.0231, "macro_f1": 0.5492662787437439, "num_tokens": 2732578.0, "repeat_count": 0.0, - "routers_loss": 0.015499880537390709, + "routers_loss": 0.01548632513731718, "skip_count": 2.0, "step": 1694, "text_loss": 0.12439999729394913 @@ -16110,13 +16110,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.080078125, + "grad_norm": 0.0849609375, "learning_rate": 0.0009661867692586494, - "loss": 0.0144, + "loss": 0.0153, "macro_f1": 0.32098764181137085, "num_tokens": 2735887.0, "repeat_count": 0.0, - "routers_loss": 0.049878787249326706, + "routers_loss": 0.05622401833534241, "skip_count": 2.0, "step": 1696, "text_loss": 0.29024389386177063 @@ -16129,13 +16129,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10009765625, + "grad_norm": 0.087890625, "learning_rate": 0.0009660747908638933, - "loss": 0.0206, + "loss": 0.0205, "macro_f1": 0.3272727429866791, "num_tokens": 2739293.0, "repeat_count": 0.0, - "routers_loss": 0.04108169302344322, + "routers_loss": 0.041060201823711395, "skip_count": 1.0, "step": 1698, "text_loss": 0.39461007714271545 @@ -16148,13 +16148,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.1767578125, "learning_rate": 0.0009659626338683981, - "loss": 0.0367, + "loss": 0.0369, "macro_f1": 0.3333333432674408, "num_tokens": 2742468.0, "repeat_count": 0.0, - "routers_loss": 0.007651917636394501, + "routers_loss": 0.007251353468745947, "skip_count": 0.0, "step": 1700, "text_loss": 0.2751767635345459 @@ -16167,13 +16167,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.07763671875, "learning_rate": 0.0009658502983151427, - "loss": 0.0182, + "loss": 0.0186, "macro_f1": 0.3272727429866791, "num_tokens": 2745123.0, "repeat_count": 0.0, - "routers_loss": 0.015448091551661491, + "routers_loss": 0.012847424484789371, "skip_count": 1.0, "step": 1702, "text_loss": 0.4756404757499695 @@ -16186,13 +16186,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.11767578125, "learning_rate": 0.0009657377842471742, - "loss": 0.0324, + "loss": 0.0313, "macro_f1": 0.6666666865348816, "num_tokens": 2748016.0, "repeat_count": 0.0, - "routers_loss": 0.009139287285506725, + "routers_loss": 0.007060411386191845, "skip_count": 1.0, "step": 1704, "text_loss": 0.9571210145950317 @@ -16205,13 +16205,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0869140625, + "grad_norm": 0.10009765625, "learning_rate": 0.0009656250917076081, - "loss": 0.0191, + "loss": 0.0188, "macro_f1": 0.5492662787437439, "num_tokens": 2750717.0, "repeat_count": 0.0, - "routers_loss": 0.015412120148539543, + "routers_loss": 0.016748681664466858, "skip_count": 2.0, "step": 1706, "text_loss": 0.14542843401432037 @@ -16224,13 +16224,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.060302734375, "learning_rate": 0.0009655122207396285, - "loss": 0.0175, + "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 2753635.0, "repeat_count": 0.0, - "routers_loss": 0.012735052965581417, + "routers_loss": 0.013607042841613293, "skip_count": 0.0, "step": 1708, "text_loss": 0.21836471557617188 @@ -16243,13 +16243,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07177734375, + "grad_norm": 0.0732421875, "learning_rate": 0.0009653991713864878, - "loss": 0.0192, + "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2756643.0, "repeat_count": 0.0, - "routers_loss": 0.00114025070797652, + "routers_loss": 0.0012097888393327594, "skip_count": 0.0, "step": 1710, "text_loss": 0.635187029838562 @@ -16262,13 +16262,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1171875, "learning_rate": 0.0009652859436915066, - "loss": 0.0243, + "loss": 0.0231, "macro_f1": 0.3333333432674408, "num_tokens": 2759432.0, "repeat_count": 0.0, - "routers_loss": 0.006401443853974342, + "routers_loss": 0.006196760106831789, "skip_count": 0.0, "step": 1712, "text_loss": 0.5629420876502991 @@ -16281,13 +16281,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.0615234375, "learning_rate": 0.0009651725376980743, - "loss": 0.0185, + "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 2762538.0, "repeat_count": 0.0, - "routers_loss": 0.004316259175539017, + "routers_loss": 0.0042513771913945675, "skip_count": 0.0, "step": 1714, "text_loss": 0.39522525668144226 @@ -16300,13 +16300,13 @@ "f1_execute": 0.9583333134651184, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.125, + "grad_norm": 0.1494140625, "learning_rate": 0.0009650589534496479, - "loss": 0.0201, + "loss": 0.0194, "macro_f1": 0.8194444179534912, "num_tokens": 2765571.0, "repeat_count": 2.0, - "routers_loss": 0.043461959809064865, + "routers_loss": 0.03596706688404083, "skip_count": 3.0, "step": 1716, "text_loss": 0.6252416968345642 @@ -16319,13 +16319,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.04833984375, "learning_rate": 0.0009649451909897532, "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 2769206.0, "repeat_count": 0.0, - "routers_loss": 0.0024530428927391768, + "routers_loss": 0.0025788163766264915, "skip_count": 0.0, "step": 1718, "text_loss": 0.8851634860038757 @@ -16338,13 +16338,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1015625, + "grad_norm": 0.10791015625, "learning_rate": 0.0009648312503619843, - "loss": 0.026, + "loss": 0.0265, "macro_f1": 0.3333333432674408, "num_tokens": 2772488.0, "repeat_count": 0.0, - "routers_loss": 0.0046626063995063305, + "routers_loss": 0.004443451762199402, "skip_count": 0.0, "step": 1720, "text_loss": 0.8568580746650696 @@ -16357,13 +16357,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.1513671875, + "grad_norm": 0.1552734375, "learning_rate": 0.0009647171316100034, - "loss": 0.0257, + "loss": 0.0265, "macro_f1": 0.9265305995941162, "num_tokens": 2776482.0, "repeat_count": 1.0, - "routers_loss": 0.02480102889239788, + "routers_loss": 0.022948263213038445, "skip_count": 3.0, "step": 1722, "text_loss": 0.13431036472320557 @@ -16376,13 +16376,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.126953125, + "grad_norm": 0.1572265625, "learning_rate": 0.0009646028347775409, - "loss": 0.02, + "loss": 0.0204, "macro_f1": 0.6666666865348816, "num_tokens": 2778966.0, "repeat_count": 0.0, - "routers_loss": 0.012629947625100613, + "routers_loss": 0.011328035034239292, "skip_count": 1.0, "step": 1724, "text_loss": 0.2085491120815277 @@ -16395,13 +16395,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08447265625, + "grad_norm": 0.08984375, "learning_rate": 0.0009644883599083958, "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2781968.0, "repeat_count": 0.0, - "routers_loss": 0.0024127380456775427, + "routers_loss": 0.002208018908277154, "skip_count": 0.0, "step": 1726, "text_loss": 0.4948323965072632 @@ -16414,13 +16414,13 @@ "f1_execute": 0.9411764740943909, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.054443359375, + "grad_norm": 0.062255859375, "learning_rate": 0.0009643737070464349, - "loss": 0.0162, + "loss": 0.0158, "macro_f1": 0.6470588445663452, "num_tokens": 2784666.0, "repeat_count": 1.0, - "routers_loss": 0.0415453165769577, + "routers_loss": 0.04391832649707794, "skip_count": 2.0, "step": 1728, "text_loss": 0.39060094952583313 @@ -16433,13 +16433,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.046630859375, "learning_rate": 0.0009642588762355935, - "loss": 0.0211, + "loss": 0.0212, "macro_f1": 0.6666666865348816, "num_tokens": 2787558.0, "repeat_count": 0.0, - "routers_loss": 0.0056681083515286446, + "routers_loss": 0.004497280344367027, "skip_count": 1.0, "step": 1730, "text_loss": 0.34908708930015564 @@ -16452,13 +16452,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.07275390625, "learning_rate": 0.0009641438675198748, - "loss": 0.0189, + "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 2790474.0, "repeat_count": 0.0, - "routers_loss": 0.006391602102667093, + "routers_loss": 0.00583475548774004, "skip_count": 0.0, "step": 1732, "text_loss": 0.5720033049583435 @@ -16471,13 +16471,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0595703125, + "grad_norm": 0.08154296875, "learning_rate": 0.0009640286809433508, - "loss": 0.0229, + "loss": 0.0235, "macro_f1": 0.3333333432674408, "num_tokens": 2793272.0, "repeat_count": 0.0, - "routers_loss": 0.007466991897672415, + "routers_loss": 0.007826375775039196, "skip_count": 0.0, "step": 1734, "text_loss": 0.32181721925735474 @@ -16490,13 +16490,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.05419921875, "learning_rate": 0.0009639133165501606, - "loss": 0.0197, + "loss": 0.0192, "macro_f1": 0.3333333432674408, "num_tokens": 2797726.0, "repeat_count": 0.0, - "routers_loss": 0.001953453291207552, + "routers_loss": 0.0019055595621466637, "skip_count": 0.0, "step": 1736, "text_loss": 0.620936393737793 @@ -16509,13 +16509,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.134765625, "learning_rate": 0.0009637977743845124, - "loss": 0.0223, + "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2800706.0, "repeat_count": 0.0, - "routers_loss": 0.003612719476222992, + "routers_loss": 0.0028302327264100313, "skip_count": 0.0, "step": 1738, "text_loss": 0.6473138332366943 @@ -16528,13 +16528,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.049072265625, + "grad_norm": 0.0634765625, "learning_rate": 0.0009636820544906823, - "loss": 0.0145, + "loss": 0.0146, "macro_f1": 1.0, "num_tokens": 2803847.0, "repeat_count": 1.0, - "routers_loss": 0.009977150708436966, + "routers_loss": 0.01105099730193615, "skip_count": 2.0, "step": 1740, "text_loss": 0.4401201903820038 @@ -16547,13 +16547,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.10791015625, + "grad_norm": 0.1455078125, "learning_rate": 0.0009635661569130141, "loss": 0.0195, "macro_f1": 0.5934640765190125, "num_tokens": 2807235.0, "repeat_count": 0.0, - "routers_loss": 0.026468059048056602, + "routers_loss": 0.02619045600295067, "skip_count": 3.0, "step": 1742, "text_loss": 0.459264874458313 @@ -16566,13 +16566,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.06396484375, "learning_rate": 0.0009634500816959202, - "loss": 0.0165, + "loss": 0.0162, "macro_f1": 0.6666666865348816, "num_tokens": 2810396.0, "repeat_count": 0.0, - "routers_loss": 0.00849854201078415, + "routers_loss": 0.007915694266557693, "skip_count": 2.0, "step": 1744, "text_loss": 0.5084020495414734 @@ -16585,13 +16585,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.1748046875, "learning_rate": 0.0009633338288838805, - "loss": 0.0275, + "loss": 0.0271, "macro_f1": 0.5492662787437439, "num_tokens": 2813215.0, "repeat_count": 2.0, - "routers_loss": 0.08082596957683563, + "routers_loss": 0.08364596217870712, "skip_count": 0.0, "step": 1746, "text_loss": 0.27681824564933777 @@ -16604,13 +16604,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.046142578125, + "grad_norm": 0.051025390625, "learning_rate": 0.0009632173985214438, - "loss": 0.015, + "loss": 0.0156, "macro_f1": 0.8817967176437378, "num_tokens": 2816452.0, "repeat_count": 3.0, - "routers_loss": 0.029500717297196388, + "routers_loss": 0.028805451467633247, "skip_count": 2.0, "step": 1748, "text_loss": 0.4678419530391693 @@ -16623,13 +16623,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.0625, "learning_rate": 0.000963100790653226, - "loss": 0.0183, + "loss": 0.0188, "macro_f1": 0.3272727429866791, "num_tokens": 2819364.0, "repeat_count": 0.0, - "routers_loss": 0.025238536298274994, + "routers_loss": 0.03056817688047886, "skip_count": 1.0, "step": 1750, "text_loss": 0.3078109920024872 @@ -16642,13 +16642,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0703125, + "grad_norm": 0.06689453125, "learning_rate": 0.0009629840053239116, - "loss": 0.0204, + "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2823469.0, "repeat_count": 0.0, - "routers_loss": 0.002069319598376751, + "routers_loss": 0.0019477814203128219, "skip_count": 0.0, "step": 1752, "text_loss": 0.45501336455345154 @@ -16661,13 +16661,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05224609375, + "grad_norm": 0.057373046875, "learning_rate": 0.000962867042578253, - "loss": 0.0169, + "loss": 0.0173, "macro_f1": 0.3333333432674408, "num_tokens": 2826716.0, "repeat_count": 0.0, - "routers_loss": 0.002853946527466178, + "routers_loss": 0.0032963966950774193, "skip_count": 0.0, "step": 1754, "text_loss": 0.49234694242477417 @@ -16680,13 +16680,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0947265625, "learning_rate": 0.0009627499024610707, - "loss": 0.0236, + "loss": 0.0239, "macro_f1": 0.3272727429866791, "num_tokens": 2829733.0, "repeat_count": 0.0, - "routers_loss": 0.0100983502343297, + "routers_loss": 0.010289114899933338, "skip_count": 1.0, "step": 1756, "text_loss": 0.22335539758205414 @@ -16699,13 +16699,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09228515625, + "grad_norm": 0.0888671875, "learning_rate": 0.0009626325850172527, - "loss": 0.0173, + "loss": 0.0174, "macro_f1": 0.3272727429866791, "num_tokens": 2833350.0, "repeat_count": 0.0, - "routers_loss": 0.031218983232975006, + "routers_loss": 0.03249066323041916, "skip_count": 1.0, "step": 1758, "text_loss": 0.6581931114196777 @@ -16718,13 +16718,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.0703125, "learning_rate": 0.0009625150902917555, - "loss": 0.019, + "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 2836558.0, "repeat_count": 0.0, - "routers_loss": 0.010347879491746426, + "routers_loss": 0.00870000571012497, "skip_count": 0.0, "step": 1760, "text_loss": 0.22938725352287292 @@ -16737,13 +16737,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1455078125, + "grad_norm": 0.1259765625, "learning_rate": 0.0009623974183296031, - "loss": 0.0193, + "loss": 0.0192, "macro_f1": 0.3333333432674408, "num_tokens": 2840560.0, "repeat_count": 0.0, - "routers_loss": 0.007768871728330851, + "routers_loss": 0.007767196744680405, "skip_count": 0.0, "step": 1762, "text_loss": 0.24473799765110016 @@ -16756,13 +16756,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.09228515625, "learning_rate": 0.0009622795691758876, - "loss": 0.0253, + "loss": 0.0244, "macro_f1": 0.3333333432674408, "num_tokens": 2843548.0, "repeat_count": 0.0, - "routers_loss": 0.002887974726036191, + "routers_loss": 0.0021693643648177385, "skip_count": 0.0, "step": 1764, "text_loss": 0.3084608018398285 @@ -16777,11 +16777,11 @@ "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.0009621615428757693, - "loss": 0.0147, + "loss": 0.0149, "macro_f1": 0.3333333432674408, "num_tokens": 2847076.0, "repeat_count": 0.0, - "routers_loss": 0.0027294005267322063, + "routers_loss": 0.0024727333802729845, "skip_count": 0.0, "step": 1766, "text_loss": 0.5251734852790833 @@ -16794,13 +16794,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.0673828125, "learning_rate": 0.000962043339474476, - "loss": 0.0193, + "loss": 0.0194, "macro_f1": 0.3333333432674408, "num_tokens": 2849751.0, "repeat_count": 0.0, - "routers_loss": 0.00543541694059968, + "routers_loss": 0.005174890160560608, "skip_count": 0.0, "step": 1768, "text_loss": 0.4410129189491272 @@ -16813,13 +16813,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.06103515625, "learning_rate": 0.0009619249590173032, - "loss": 0.0167, + "loss": 0.016, "macro_f1": 0.6666666865348816, "num_tokens": 2853916.0, "repeat_count": 0.0, - "routers_loss": 0.006514009553939104, + "routers_loss": 0.006785830482840538, "skip_count": 2.0, "step": 1770, "text_loss": 0.550076425075531 @@ -16832,13 +16832,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.06396484375, + "grad_norm": 0.06591796875, "learning_rate": 0.0009618064015496149, - "loss": 0.019, + "loss": 0.0192, "macro_f1": 0.5934640765190125, "num_tokens": 2857372.0, "repeat_count": 0.0, - "routers_loss": 0.02333846502006054, + "routers_loss": 0.021370256319642067, "skip_count": 3.0, "step": 1772, "text_loss": 0.1988629847764969 @@ -16851,13 +16851,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0732421875, + "grad_norm": 0.072265625, "learning_rate": 0.0009616876671168423, - "loss": 0.0165, + "loss": 0.0162, "macro_f1": 0.6666666865348816, "num_tokens": 2861028.0, "repeat_count": 0.0, - "routers_loss": 0.004471905063837767, + "routers_loss": 0.004313841462135315, "skip_count": 1.0, "step": 1774, "text_loss": 0.42581331729888916 @@ -16870,13 +16870,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.1103515625, "learning_rate": 0.0009615687557644847, - "loss": 0.0261, + "loss": 0.0268, "macro_f1": 0.3333333432674408, "num_tokens": 2864847.0, "repeat_count": 0.0, - "routers_loss": 0.0024362702388316393, + "routers_loss": 0.0025742491707205772, "skip_count": 0.0, "step": 1776, "text_loss": 0.46510905027389526 @@ -16889,13 +16889,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1494140625, "learning_rate": 0.0009614496675381093, - "loss": 0.0116, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 2867392.0, "repeat_count": 0.0, - "routers_loss": 0.0021166049409657717, + "routers_loss": 0.0016813480760902166, "skip_count": 0.0, "step": 1778, "text_loss": 0.5922174453735352 @@ -16908,13 +16908,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0712890625, + "grad_norm": 0.0810546875, "learning_rate": 0.0009613304024833507, "loss": 0.0166, "macro_f1": 0.3333333432674408, "num_tokens": 2871273.0, "repeat_count": 0.0, - "routers_loss": 0.004722296260297298, + "routers_loss": 0.004948933608829975, "skip_count": 0.0, "step": 1780, "text_loss": 0.6776977777481079 @@ -16929,11 +16929,11 @@ "f1_skip": 1.0, "grad_norm": 0.07470703125, "learning_rate": 0.0009612109606459117, - "loss": 0.0199, + "loss": 0.0186, "macro_f1": 1.0, "num_tokens": 2874172.0, "repeat_count": 1.0, - "routers_loss": 0.014188882894814014, + "routers_loss": 0.016950147226452827, "skip_count": 2.0, "step": 1782, "text_loss": 0.48758944869041443 @@ -16946,13 +16946,13 @@ "f1_execute": 0.9599999785423279, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, - "grad_norm": 0.076171875, + "grad_norm": 0.08251953125, "learning_rate": 0.0009610913420715623, - "loss": 0.0241, + "loss": 0.0237, "macro_f1": 0.7644444704055786, "num_tokens": 2877528.0, "repeat_count": 2.0, - "routers_loss": 0.04599560424685478, + "routers_loss": 0.04880943149328232, "skip_count": 1.0, "step": 1784, "text_loss": 0.4404778480529785 @@ -16965,13 +16965,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.06201171875, "learning_rate": 0.0009609715468061411, - "loss": 0.0216, + "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2880627.0, "repeat_count": 0.0, - "routers_loss": 0.004942454397678375, + "routers_loss": 0.004678630735725164, "skip_count": 0.0, "step": 1786, "text_loss": 0.7295402884483337 @@ -16984,13 +16984,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08349609375, + "grad_norm": 0.07958984375, "learning_rate": 0.0009608515748955535, - "loss": 0.021, + "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2883333.0, "repeat_count": 0.0, - "routers_loss": 0.0020542226266115904, + "routers_loss": 0.0026695074047893286, "skip_count": 0.0, "step": 1788, "text_loss": 0.9697831273078918 @@ -17003,13 +17003,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.1171875, + "grad_norm": 0.107421875, "learning_rate": 0.000960731426385773, - "loss": 0.0155, + "loss": 0.0157, "macro_f1": 0.4871794879436493, "num_tokens": 2887444.0, "repeat_count": 0.0, - "routers_loss": 0.0397041030228138, + "routers_loss": 0.029743613675236702, "skip_count": 2.0, "step": 1790, "text_loss": 0.4737568199634552 @@ -17022,13 +17022,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.103515625, + "grad_norm": 0.10107421875, "learning_rate": 0.0009606111013228407, - "loss": 0.0204, + "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 2890221.0, "repeat_count": 0.0, - "routers_loss": 0.0017490010941401124, + "routers_loss": 0.0016153788892552257, "skip_count": 0.0, "step": 1792, "text_loss": 0.6693558096885681 @@ -17041,13 +17041,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08251953125, + "grad_norm": 0.08349609375, "learning_rate": 0.0009604905997528655, - "loss": 0.021, + "loss": 0.02, "macro_f1": 0.3272727429866791, "num_tokens": 2893262.0, "repeat_count": 0.0, - "routers_loss": 0.023590171709656715, + "routers_loss": 0.01965433731675148, "skip_count": 1.0, "step": 1794, "text_loss": 0.45227760076522827 @@ -17060,13 +17060,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.103515625, + "grad_norm": 0.08642578125, "learning_rate": 0.0009603699217220239, - "loss": 0.0125, + "loss": 0.0117, "macro_f1": 0.6601307392120361, "num_tokens": 2896823.0, "repeat_count": 1.0, - "routers_loss": 0.02458076737821102, + "routers_loss": 0.024017298594117165, "skip_count": 2.0, "step": 1796, "text_loss": 0.48865509033203125 @@ -17079,13 +17079,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.08837890625, "learning_rate": 0.0009602490672765597, - "loss": 0.019, + "loss": 0.0182, "macro_f1": 0.3333333432674408, "num_tokens": 2899707.0, "repeat_count": 0.0, - "routers_loss": 0.0014341498026624322, + "routers_loss": 0.0012420224957168102, "skip_count": 0.0, "step": 1798, "text_loss": 0.43292415142059326 @@ -17098,13 +17098,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08056640625, + "grad_norm": 0.07861328125, "learning_rate": 0.0009601280364627848, - "loss": 0.02, + "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 2902795.0, "repeat_count": 0.0, - "routers_loss": 0.00213223067112267, + "routers_loss": 0.0020389219280332327, "skip_count": 0.0, "step": 1800, "text_loss": 0.41021591424942017 @@ -17117,13 +17117,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07275390625, + "grad_norm": 0.06689453125, "learning_rate": 0.0009600068293270783, - "loss": 0.0147, + "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 2905769.0, "repeat_count": 0.0, - "routers_loss": 0.0027340995147824287, + "routers_loss": 0.002006303984671831, "skip_count": 0.0, "step": 1802, "text_loss": 0.46892106533050537 @@ -17136,32 +17136,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.08740234375, "learning_rate": 0.000959885445915887, - "loss": 0.0172, + "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 2909475.0, "repeat_count": 0.0, - "routers_loss": 0.0035587961319833994, + "routers_loss": 0.003734810510650277, "skip_count": 0.0, "step": 1804, "text_loss": 0.45364710688591003 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.5, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 8.479013795127678, - "f1_execute": 0.9615384340286255, - "f1_repeat": 0.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09765625, + "grad_norm": 0.11669921875, "learning_rate": 0.0009597638862757254, - "loss": 0.0187, - "macro_f1": 0.5427350401878357, + "loss": 0.0182, + "macro_f1": 0.8823530077934265, "num_tokens": 2914348.0, "repeat_count": 1.0, - "routers_loss": 0.04446055367588997, + "routers_loss": 0.038971323519945145, "skip_count": 2.0, "step": 1806, "text_loss": 0.42913779616355896 @@ -17174,13 +17174,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08447265625, + "grad_norm": 0.080078125, "learning_rate": 0.0009596421504531751, - "loss": 0.0244, + "loss": 0.0249, "macro_f1": 0.3272727429866791, "num_tokens": 2917467.0, "repeat_count": 1.0, - "routers_loss": 0.05095123499631882, + "routers_loss": 0.04800829663872719, "skip_count": 0.0, "step": 1808, "text_loss": 0.17332297563552856 @@ -17193,13 +17193,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.10693359375, + "grad_norm": 0.1083984375, "learning_rate": 0.0009595202384948858, - "loss": 0.0232, + "loss": 0.0227, "macro_f1": 0.6666666865348816, "num_tokens": 2920223.0, "repeat_count": 1.0, - "routers_loss": 0.008440068922936916, + "routers_loss": 0.009164143353700638, "skip_count": 0.0, "step": 1810, "text_loss": 0.33740702271461487 @@ -17212,13 +17212,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0927734375, + "grad_norm": 0.0947265625, "learning_rate": 0.0009593981504475742, - "loss": 0.0273, + "loss": 0.0275, "macro_f1": 0.6666666865348816, "num_tokens": 2923780.0, "repeat_count": 0.0, - "routers_loss": 0.012230116873979568, + "routers_loss": 0.011236993595957756, "skip_count": 2.0, "step": 1812, "text_loss": 0.1609916388988495 @@ -17231,13 +17231,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1005859375, + "grad_norm": 0.10595703125, "learning_rate": 0.0009592758863580248, - "loss": 0.026, + "loss": 0.0259, "macro_f1": 0.5492662787437439, "num_tokens": 2926259.0, "repeat_count": 0.0, - "routers_loss": 0.017307188361883163, + "routers_loss": 0.019026532769203186, "skip_count": 2.0, "step": 1814, "text_loss": 0.6460903882980347 @@ -17250,13 +17250,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.099609375, + "grad_norm": 0.09814453125, "learning_rate": 0.0009591534462730894, - "loss": 0.0215, + "loss": 0.0206, "macro_f1": 0.5492662787437439, "num_tokens": 2929173.0, "repeat_count": 2.0, - "routers_loss": 0.07191162556409836, + "routers_loss": 0.0608333982527256, "skip_count": 0.0, "step": 1816, "text_loss": 0.476126492023468 @@ -17269,13 +17269,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.06640625, "learning_rate": 0.000959030830239687, - "loss": 0.0182, + "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 2932703.0, "repeat_count": 0.0, - "routers_loss": 0.008753604255616665, + "routers_loss": 0.0093300249427557, "skip_count": 0.0, "step": 1818, "text_loss": 0.5471875667572021 @@ -17288,13 +17288,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19921875, + "grad_norm": 0.2001953125, "learning_rate": 0.0009589080383048048, - "loss": 0.0233, + "loss": 0.0235, "macro_f1": 0.3333333432674408, "num_tokens": 2936195.0, "repeat_count": 0.0, - "routers_loss": 0.008390828967094421, + "routers_loss": 0.010434109717607498, "skip_count": 0.0, "step": 1820, "text_loss": 0.5068115592002869 @@ -17307,13 +17307,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.0986328125, "learning_rate": 0.0009587850705154964, "loss": 0.0291, "macro_f1": 0.3333333432674408, "num_tokens": 2939412.0, "repeat_count": 0.0, - "routers_loss": 0.005617359187453985, + "routers_loss": 0.004347751382738352, "skip_count": 0.0, "step": 1822, "text_loss": 0.4241984784603119 @@ -17326,13 +17326,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.0859375, "learning_rate": 0.0009586619269188836, - "loss": 0.0227, + "loss": 0.0224, "macro_f1": 0.32098767161369324, "num_tokens": 2942318.0, "repeat_count": 0.0, - "routers_loss": 0.0346846878528595, + "routers_loss": 0.034238871186971664, "skip_count": 1.0, "step": 1824, "text_loss": 0.2328975349664688 @@ -17345,32 +17345,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.11181640625, "learning_rate": 0.0009585386075621553, "loss": 0.027, "macro_f1": 0.3333333432674408, "num_tokens": 2945731.0, "repeat_count": 0.0, - "routers_loss": 0.006601692643016577, + "routers_loss": 0.006097695790231228, "skip_count": 0.0, "step": 1826, "text_loss": 0.22816994786262512 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 8.582330496037569, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.08837890625, + "f1_skip": 0.0, + "grad_norm": 0.0908203125, "learning_rate": 0.0009584151124925676, - "loss": 0.0207, - "macro_f1": 0.6666666865348816, + "loss": 0.0208, + "macro_f1": 0.3272727429866791, "num_tokens": 2948944.0, "repeat_count": 0.0, - "routers_loss": 0.0065619745291769505, + "routers_loss": 0.007790776435285807, "skip_count": 1.0, "step": 1828, "text_loss": 0.5009413361549377 @@ -17383,13 +17383,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.07275390625, "learning_rate": 0.0009582914417574438, - "loss": 0.0149, + "loss": 0.0145, "macro_f1": 0.6666666865348816, "num_tokens": 2951723.0, "repeat_count": 0.0, - "routers_loss": 0.011109639890491962, + "routers_loss": 0.009144559502601624, "skip_count": 2.0, "step": 1830, "text_loss": 0.1402502954006195 @@ -17404,11 +17404,11 @@ "f1_skip": 0.0, "grad_norm": 0.06201171875, "learning_rate": 0.0009581675954041751, - "loss": 0.0167, + "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 2954726.0, "repeat_count": 1.0, - "routers_loss": 0.008432094007730484, + "routers_loss": 0.006593191530555487, "skip_count": 0.0, "step": 1832, "text_loss": 0.4871736466884613 @@ -17421,13 +17421,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0859375, + "grad_norm": 0.0869140625, "learning_rate": 0.0009580435734802196, - "loss": 0.0208, + "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 2957853.0, "repeat_count": 0.0, - "routers_loss": 0.011518111452460289, + "routers_loss": 0.01241068821400404, "skip_count": 0.0, "step": 1834, "text_loss": 0.30100154876708984 @@ -17440,13 +17440,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.1298828125, "learning_rate": 0.0009579193760331027, - "loss": 0.0211, + "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 2960783.0, "repeat_count": 0.0, - "routers_loss": 0.0026744187343865633, + "routers_loss": 0.002219218760728836, "skip_count": 0.0, "step": 1836, "text_loss": 0.4961516559123993 @@ -17459,13 +17459,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.12255859375, "learning_rate": 0.0009577950031104169, - "loss": 0.0165, + "loss": 0.0166, "macro_f1": 0.6601307392120361, "num_tokens": 2963328.0, "repeat_count": 1.0, - "routers_loss": 0.028107430785894394, + "routers_loss": 0.029363535344600677, "skip_count": 2.0, "step": 1838, "text_loss": 0.42814353108406067 @@ -17478,13 +17478,13 @@ "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.095703125, + "grad_norm": 0.1044921875, "learning_rate": 0.0009576704547598226, - "loss": 0.0263, + "loss": 0.0257, "macro_f1": 0.7795917987823486, "num_tokens": 2966108.0, "repeat_count": 1.0, - "routers_loss": 0.060007549822330475, + "routers_loss": 0.0579402856528759, "skip_count": 4.0, "step": 1840, "text_loss": 0.20523512363433838 @@ -17497,13 +17497,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.0625, "learning_rate": 0.0009575457310290463, "loss": 0.0121, "macro_f1": 0.3272727429866791, "num_tokens": 2969137.0, "repeat_count": 0.0, - "routers_loss": 0.01074182614684105, + "routers_loss": 0.008810589089989662, "skip_count": 0.0, "step": 1842, "text_loss": 0.6199528574943542 @@ -17516,13 +17516,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0732421875, + "grad_norm": 0.0693359375, "learning_rate": 0.0009574208319658831, - "loss": 0.0213, + "loss": 0.0208, "macro_f1": 0.6666666865348816, "num_tokens": 2972407.0, "repeat_count": 0.0, - "routers_loss": 0.0019638657104223967, + "routers_loss": 0.0012295129708945751, "skip_count": 1.0, "step": 1844, "text_loss": 0.66938316822052 @@ -17535,13 +17535,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.1572265625, + "grad_norm": 0.1474609375, "learning_rate": 0.000957295757618194, - "loss": 0.0156, + "loss": 0.0152, "macro_f1": 0.4871794879436493, "num_tokens": 2976045.0, "repeat_count": 0.0, - "routers_loss": 0.06953249871730804, + "routers_loss": 0.06162935495376587, "skip_count": 2.0, "step": 1846, "text_loss": 0.5381782650947571 @@ -17554,13 +17554,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.0830078125, "learning_rate": 0.0009571705080339079, - "loss": 0.0154, + "loss": 0.0144, "macro_f1": 0.3333333432674408, "num_tokens": 2979025.0, "repeat_count": 0.0, - "routers_loss": 0.003563052974641323, + "routers_loss": 0.003950524143874645, "skip_count": 0.0, "step": 1848, "text_loss": 0.5831671357154846 @@ -17573,13 +17573,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.11376953125, "learning_rate": 0.0009570450832610208, - "loss": 0.0216, + "loss": 0.0209, "macro_f1": 0.3333333432674408, "num_tokens": 2982276.0, "repeat_count": 0.0, - "routers_loss": 0.010409255512058735, + "routers_loss": 0.010354886762797832, "skip_count": 0.0, "step": 1850, "text_loss": 0.27448201179504395 @@ -17592,13 +17592,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0625, + "grad_norm": 0.061279296875, "learning_rate": 0.0009569194833475956, - "loss": 0.0195, + "loss": 0.0199, "macro_f1": 0.3272727429866791, "num_tokens": 2985691.0, "repeat_count": 0.0, - "routers_loss": 0.009769548662006855, + "routers_loss": 0.010167439468204975, "skip_count": 0.0, "step": 1852, "text_loss": 0.5264663696289062 @@ -17611,13 +17611,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1181640625, + "grad_norm": 0.1328125, "learning_rate": 0.0009567937083417624, - "loss": 0.0184, + "loss": 0.0194, "macro_f1": 0.3272727429866791, "num_tokens": 2989126.0, "repeat_count": 0.0, - "routers_loss": 0.036616452038288116, + "routers_loss": 0.0371871180832386, "skip_count": 1.0, "step": 1854, "text_loss": 0.2008018046617508 @@ -17630,13 +17630,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.0673828125, "learning_rate": 0.0009566677582917185, - "loss": 0.0192, + "loss": 0.0184, "macro_f1": 0.3333333432674408, "num_tokens": 2992814.0, "repeat_count": 0.0, - "routers_loss": 0.009581349790096283, + "routers_loss": 0.010190588422119617, "skip_count": 0.0, "step": 1856, "text_loss": 0.749717116355896 @@ -17649,13 +17649,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09814453125, + "grad_norm": 0.080078125, "learning_rate": 0.0009565416332457282, - "loss": 0.0138, + "loss": 0.0132, "macro_f1": 0.6538461446762085, "num_tokens": 2995729.0, "repeat_count": 1.0, - "routers_loss": 0.02330300398170948, + "routers_loss": 0.022285036742687225, "skip_count": 1.0, "step": 1858, "text_loss": 0.5870219469070435 @@ -17668,13 +17668,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0771484375, + "grad_norm": 0.07666015625, "learning_rate": 0.0009564153332521228, - "loss": 0.0226, + "loss": 0.0224, "macro_f1": 0.3272727429866791, "num_tokens": 2998812.0, "repeat_count": 0.0, - "routers_loss": 0.011985735036432743, + "routers_loss": 0.011050296947360039, "skip_count": 1.0, "step": 1860, "text_loss": 0.8444408774375916 @@ -17687,13 +17687,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0634765625, + "grad_norm": 0.06005859375, "learning_rate": 0.0009562888583593005, - "loss": 0.0162, + "loss": 0.0163, "macro_f1": 0.3333333432674408, "num_tokens": 3001799.0, "repeat_count": 0.0, - "routers_loss": 0.005997250322252512, + "routers_loss": 0.007125461008399725, "skip_count": 0.0, "step": 1862, "text_loss": 0.41510361433029175 @@ -17706,13 +17706,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.06884765625, "learning_rate": 0.0009561622086157272, - "loss": 0.0243, + "loss": 0.0236, "macro_f1": 0.3333333432674408, "num_tokens": 3005088.0, "repeat_count": 0.0, - "routers_loss": 0.004814761225134134, + "routers_loss": 0.0049054501578211784, "skip_count": 0.0, "step": 1864, "text_loss": 0.3801248073577881 @@ -17725,13 +17725,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.056884765625, + "grad_norm": 0.054443359375, "learning_rate": 0.000956035384069935, - "loss": 0.0242, + "loss": 0.0238, "macro_f1": 1.0, "num_tokens": 3008178.0, "repeat_count": 1.0, - "routers_loss": 0.004750931169837713, + "routers_loss": 0.005162427201867104, "skip_count": 1.0, "step": 1866, "text_loss": 0.2687684893608093 @@ -17744,13 +17744,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1123046875, + "grad_norm": 0.10400390625, "learning_rate": 0.0009559083847705233, - "loss": 0.0216, + "loss": 0.0214, "macro_f1": 0.3272727429866791, "num_tokens": 3010923.0, "repeat_count": 0.0, - "routers_loss": 0.038251202553510666, + "routers_loss": 0.028984658420085907, "skip_count": 1.0, "step": 1868, "text_loss": 0.6277349591255188 @@ -17763,13 +17763,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.06640625, + "grad_norm": 0.08349609375, "learning_rate": 0.0009557812107661584, - "loss": 0.0204, + "loss": 0.0208, "macro_f1": 1.0, "num_tokens": 3015030.0, "repeat_count": 1.0, - "routers_loss": 0.010951942764222622, + "routers_loss": 0.012200530618429184, "skip_count": 1.0, "step": 1870, "text_loss": 0.6293368339538574 @@ -17782,13 +17782,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.11962890625, "learning_rate": 0.0009556538621055739, - "loss": 0.0265, + "loss": 0.0268, "macro_f1": 0.3272727429866791, "num_tokens": 3019067.0, "repeat_count": 0.0, - "routers_loss": 0.06582094728946686, + "routers_loss": 0.06365182995796204, "skip_count": 1.0, "step": 1872, "text_loss": 0.39046618342399597 @@ -17796,18 +17796,18 @@ { "acc_repeat": 0.0, "acc_skip": 1.0, - "avg_layers": 26.0, + "avg_layers": 27.0, "epoch": 8.798356325212797, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.12353515625, + "f1_skip": 1.0, + "grad_norm": 0.115234375, "learning_rate": 0.0009555263388375699, - "loss": 0.0143, - "macro_f1": 0.5492662787437439, + "loss": 0.014, + "macro_f1": 0.6666666865348816, "num_tokens": 3022166.0, "repeat_count": 0.0, - "routers_loss": 0.008920271880924702, + "routers_loss": 0.0041703456081449986, "skip_count": 1.0, "step": 1874, "text_loss": 0.42232340574264526 @@ -17820,13 +17820,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1220703125, + "grad_norm": 0.11572265625, "learning_rate": 0.0009553986410110134, "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3025865.0, "repeat_count": 0.0, - "routers_loss": 0.006444344762712717, + "routers_loss": 0.005841755773872137, "skip_count": 0.0, "step": 1876, "text_loss": 0.37600573897361755 @@ -17839,13 +17839,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.09228515625, "learning_rate": 0.0009552707686748388, - "loss": 0.022, + "loss": 0.0219, "macro_f1": 0.3272727429866791, "num_tokens": 3029950.0, "repeat_count": 0.0, - "routers_loss": 0.05197767913341522, + "routers_loss": 0.05165952071547508, "skip_count": 1.0, "step": 1878, "text_loss": 0.33717799186706543 @@ -17858,13 +17858,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.0849609375, "learning_rate": 0.0009551427218780467, - "loss": 0.0224, + "loss": 0.0219, "macro_f1": 0.6666666865348816, "num_tokens": 3033649.0, "repeat_count": 0.0, - "routers_loss": 0.017570581287145615, + "routers_loss": 0.020680008456110954, "skip_count": 2.0, "step": 1880, "text_loss": 0.5011783838272095 @@ -17877,13 +17877,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.173828125, + "grad_norm": 0.15625, "learning_rate": 0.0009550145006697048, - "loss": 0.0225, + "loss": 0.0217, "macro_f1": 0.32098764181137085, "num_tokens": 3036847.0, "repeat_count": 0.0, - "routers_loss": 0.07106777280569077, + "routers_loss": 0.07626450061798096, "skip_count": 2.0, "step": 1882, "text_loss": 0.3066408336162567 @@ -17896,13 +17896,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.056396484375, "learning_rate": 0.0009548861050989482, - "loss": 0.0139, + "loss": 0.0136, "macro_f1": 1.0, "num_tokens": 3040353.0, "repeat_count": 1.0, - "routers_loss": 0.009862381964921951, + "routers_loss": 0.010884666815400124, "skip_count": 1.0, "step": 1884, "text_loss": 0.49779415130615234 @@ -17915,13 +17915,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0908203125, "learning_rate": 0.0009547575352149778, - "loss": 0.0209, + "loss": 0.0213, "macro_f1": 0.6666666865348816, "num_tokens": 3043504.0, "repeat_count": 0.0, - "routers_loss": 0.006928981747478247, + "routers_loss": 0.006704333238303661, "skip_count": 2.0, "step": 1886, "text_loss": 0.12284614145755768 @@ -17934,13 +17934,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09423828125, + "grad_norm": 0.11474609375, "learning_rate": 0.0009546287910670621, "loss": 0.0211, "macro_f1": 0.5427350401878357, "num_tokens": 3046422.0, "repeat_count": 1.0, - "routers_loss": 0.04788029566407204, + "routers_loss": 0.04799000173807144, "skip_count": 2.0, "step": 1888, "text_loss": 0.1824081838130951 @@ -17953,13 +17953,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1357421875, + "grad_norm": 0.1484375, "learning_rate": 0.0009544998727045361, - "loss": 0.0299, + "loss": 0.0306, "macro_f1": 0.3333333432674408, "num_tokens": 3049819.0, "repeat_count": 0.0, - "routers_loss": 0.008282946422696114, + "routers_loss": 0.008139612153172493, "skip_count": 0.0, "step": 1890, "text_loss": 0.18929053843021393 @@ -17972,32 +17972,32 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.09716796875, + "grad_norm": 0.09375, "learning_rate": 0.0009543707801768015, - "loss": 0.0181, + "loss": 0.0175, "macro_f1": 0.5934640765190125, "num_tokens": 3052766.0, "repeat_count": 0.0, - "routers_loss": 0.03251546248793602, + "routers_loss": 0.02966771461069584, "skip_count": 3.0, "step": 1892, "text_loss": 0.247748002409935 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 24.0, + "acc_skip": 0.5, + "avg_layers": 25.0, "epoch": 8.892280598767243, - "f1_execute": 0.9600000381469727, + "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.06640625, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.06689453125, "learning_rate": 0.0009542415135333267, - "loss": 0.0195, - "macro_f1": 0.542222261428833, + "loss": 0.0193, + "macro_f1": 0.44705885648727417, "num_tokens": 3056427.0, "repeat_count": 0.0, - "routers_loss": 0.03368280455470085, + "routers_loss": 0.03637036308646202, "skip_count": 2.0, "step": 1894, "text_loss": 0.2583999037742615 @@ -18010,13 +18010,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06640625, + "grad_norm": 0.0595703125, "learning_rate": 0.0009541120728236472, - "loss": 0.0133, + "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 3059497.0, "repeat_count": 0.0, - "routers_loss": 0.0069940583780407906, + "routers_loss": 0.007026574574410915, "skip_count": 0.0, "step": 1896, "text_loss": 0.5222375988960266 @@ -18029,13 +18029,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0810546875, + "grad_norm": 0.076171875, "learning_rate": 0.0009539824580973646, - "loss": 0.0221, + "loss": 0.0219, "macro_f1": 0.3333333432674408, "num_tokens": 3062187.0, "repeat_count": 0.0, - "routers_loss": 0.004268508404493332, + "routers_loss": 0.003449335927143693, "skip_count": 0.0, "step": 1898, "text_loss": 0.5736427307128906 @@ -18048,13 +18048,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.05224609375, "learning_rate": 0.0009538526694041477, - "loss": 0.0159, + "loss": 0.0163, "macro_f1": 0.3333333432674408, "num_tokens": 3066100.0, "repeat_count": 0.0, - "routers_loss": 0.0032616283278912306, + "routers_loss": 0.0035463871899992228, "skip_count": 0.0, "step": 1900, "text_loss": 0.5471583604812622 @@ -18067,13 +18067,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.08056640625, + "grad_norm": 0.080078125, "learning_rate": 0.0009537227067937318, - "loss": 0.023, + "loss": 0.0233, "macro_f1": 1.0, "num_tokens": 3068737.0, "repeat_count": 3.0, - "routers_loss": 0.005389219615608454, + "routers_loss": 0.00597514258697629, "skip_count": 3.0, "step": 1902, "text_loss": 0.36644190549850464 @@ -18086,13 +18086,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.166015625, "learning_rate": 0.0009535925703159186, - "loss": 0.0311, + "loss": 0.0301, "macro_f1": 0.32098764181137085, "num_tokens": 3071686.0, "repeat_count": 0.0, - "routers_loss": 0.024814991280436516, + "routers_loss": 0.025420479476451874, "skip_count": 2.0, "step": 1904, "text_loss": 0.535789966583252 @@ -18107,11 +18107,11 @@ "f1_skip": 0.0, "grad_norm": 0.07568359375, "learning_rate": 0.0009534622600205769, - "loss": 0.0151, + "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 3074954.0, "repeat_count": 0.0, - "routers_loss": 0.013415839523077011, + "routers_loss": 0.014377486892044544, "skip_count": 0.0, "step": 1906, "text_loss": 0.19009549915790558 @@ -18124,13 +18124,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.11083984375, "learning_rate": 0.0009533317759576416, - "loss": 0.019, + "loss": 0.0197, "macro_f1": 0.3333333432674408, "num_tokens": 3077540.0, "repeat_count": 0.0, - "routers_loss": 0.005814475007355213, + "routers_loss": 0.004848944488912821, "skip_count": 0.0, "step": 1908, "text_loss": 0.5022001266479492 @@ -18143,13 +18143,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0732421875, + "grad_norm": 0.07470703125, "learning_rate": 0.0009532011181771148, - "loss": 0.0218, + "loss": 0.0217, "macro_f1": 0.6666666865348816, "num_tokens": 3080445.0, "repeat_count": 0.0, - "routers_loss": 0.007621586322784424, + "routers_loss": 0.009480170905590057, "skip_count": 2.0, "step": 1910, "text_loss": 0.35135936737060547 @@ -18162,13 +18162,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.10400390625, "learning_rate": 0.0009530702867290644, - "loss": 0.0178, + "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 3083657.0, "repeat_count": 0.0, - "routers_loss": 0.0020917020738124847, + "routers_loss": 0.0019353039097040892, "skip_count": 0.0, "step": 1912, "text_loss": 0.5123994946479797 @@ -18181,13 +18181,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.123046875, + "grad_norm": 0.1455078125, "learning_rate": 0.0009529392816636256, - "loss": 0.025, + "loss": 0.0249, "macro_f1": 0.3333333432674408, "num_tokens": 3086837.0, "repeat_count": 0.0, - "routers_loss": 0.0010824954370036721, + "routers_loss": 0.0010921972570940852, "skip_count": 0.0, "step": 1914, "text_loss": 0.44477662444114685 @@ -18200,13 +18200,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.19140625, "learning_rate": 0.0009528081030309995, - "loss": 0.0353, + "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 3089892.0, "repeat_count": 0.0, - "routers_loss": 0.0018075350672006607, + "routers_loss": 0.0018027103506028652, "skip_count": 0.0, "step": 1916, "text_loss": 0.7356183528900146 @@ -18219,13 +18219,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07958984375, + "grad_norm": 0.07568359375, "learning_rate": 0.0009526767508814542, - "loss": 0.0235, + "loss": 0.0236, "macro_f1": 0.3333333432674408, "num_tokens": 3093058.0, "repeat_count": 0.0, - "routers_loss": 0.0032930250745266676, + "routers_loss": 0.003243023296818137, "skip_count": 0.0, "step": 1918, "text_loss": 0.48823556303977966 @@ -18238,13 +18238,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08837890625, + "grad_norm": 0.080078125, "learning_rate": 0.0009525452252653239, - "loss": 0.0184, + "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 3096404.0, "repeat_count": 0.0, - "routers_loss": 0.009042349644005299, + "routers_loss": 0.009360014460980892, "skip_count": 0.0, "step": 1920, "text_loss": 0.21498437225818634 @@ -18257,13 +18257,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.1103515625, + "grad_norm": 0.140625, "learning_rate": 0.0009524135262330098, - "loss": 0.022, + "loss": 0.0224, "macro_f1": 0.9265305995941162, "num_tokens": 3099520.0, "repeat_count": 1.0, - "routers_loss": 0.016776500269770622, + "routers_loss": 0.017444295808672905, "skip_count": 3.0, "step": 1922, "text_loss": 0.27608850598335266 @@ -18276,13 +18276,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05029296875, + "grad_norm": 0.050537109375, "learning_rate": 0.0009522816538349789, - "loss": 0.016, + "loss": 0.0162, "macro_f1": 0.5492662787437439, "num_tokens": 3102956.0, "repeat_count": 0.0, - "routers_loss": 0.06579705327749252, + "routers_loss": 0.06424452364444733, "skip_count": 2.0, "step": 1924, "text_loss": 0.21558666229248047 @@ -18295,13 +18295,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.058349609375, + "grad_norm": 0.05224609375, "learning_rate": 0.0009521496081217651, - "loss": 0.0113, + "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 3106565.0, "repeat_count": 1.0, - "routers_loss": 0.0022786022163927555, + "routers_loss": 0.002270506462082267, "skip_count": 0.0, "step": 1926, "text_loss": 0.5641813278198242 @@ -18314,13 +18314,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.09033203125, + "grad_norm": 0.095703125, "learning_rate": 0.0009520173891439684, "loss": 0.0216, "macro_f1": 0.6666666865348816, "num_tokens": 3109314.0, "repeat_count": 0.0, - "routers_loss": 0.01074281521141529, + "routers_loss": 0.011512448079884052, "skip_count": 1.0, "step": 1928, "text_loss": 0.6351624727249146 @@ -18333,13 +18333,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.0830078125, "learning_rate": 0.0009518849969522556, - "loss": 0.0201, + "loss": 0.0198, "macro_f1": 0.3333333432674408, "num_tokens": 3112956.0, "repeat_count": 0.0, - "routers_loss": 0.0032052614260464907, + "routers_loss": 0.003883908037096262, "skip_count": 0.0, "step": 1930, "text_loss": 0.35160085558891296 @@ -18352,32 +18352,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.10888671875, "learning_rate": 0.0009517524315973595, - "loss": 0.0186, + "loss": 0.019, "macro_f1": 1.0, "num_tokens": 3115593.0, "repeat_count": 1.0, - "routers_loss": 0.008593574166297913, + "routers_loss": 0.009479222819209099, "skip_count": 3.0, "step": 1932, "text_loss": 0.2900560200214386 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 9.079835632521279, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.07373046875, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, "learning_rate": 0.0009516196931300794, - "loss": 0.0152, - "macro_f1": 0.5492662787437439, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, "num_tokens": 3118516.0, "repeat_count": 0.0, - "routers_loss": 0.0201246440410614, + "routers_loss": 0.017834696918725967, "skip_count": 2.0, "step": 1934, "text_loss": 0.20094378292560577 @@ -18390,13 +18390,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1357421875, + "grad_norm": 0.12890625, "learning_rate": 0.0009514867816012809, - "loss": 0.0199, + "loss": 0.02, "macro_f1": 0.3333333432674408, "num_tokens": 3122242.0, "repeat_count": 0.0, - "routers_loss": 0.001721356064081192, + "routers_loss": 0.0017964740982279181, "skip_count": 0.0, "step": 1936, "text_loss": 0.6498590707778931 @@ -18409,13 +18409,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.049072265625, + "grad_norm": 0.048828125, "learning_rate": 0.0009513536970618961, - "loss": 0.0135, + "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 3125645.0, "repeat_count": 0.0, - "routers_loss": 0.010442634113132954, + "routers_loss": 0.007437168620526791, "skip_count": 2.0, "step": 1938, "text_loss": 0.25863033533096313 @@ -18428,13 +18428,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.058349609375, + "grad_norm": 0.0625, "learning_rate": 0.0009512204395629232, - "loss": 0.019, + "loss": 0.0184, "macro_f1": 0.6666666865348816, "num_tokens": 3128740.0, "repeat_count": 0.0, - "routers_loss": 0.0009493798715993762, + "routers_loss": 0.0008759932243265212, "skip_count": 1.0, "step": 1940, "text_loss": 0.5638351440429688 @@ -18447,13 +18447,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05517578125, + "grad_norm": 0.06884765625, "learning_rate": 0.0009510870091554264, - "loss": 0.0149, + "loss": 0.0153, "macro_f1": 0.3272727429866791, "num_tokens": 3131742.0, "repeat_count": 1.0, - "routers_loss": 0.022104881703853607, + "routers_loss": 0.019906625151634216, "skip_count": 0.0, "step": 1942, "text_loss": 0.8410717844963074 @@ -18466,13 +18466,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.12255859375, "learning_rate": 0.0009509534058905369, - "loss": 0.0164, + "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3134407.0, "repeat_count": 0.0, - "routers_loss": 0.0009013625676743686, + "routers_loss": 0.0009229081333614886, "skip_count": 0.0, "step": 1944, "text_loss": 0.47506049275398254 @@ -18485,13 +18485,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06103515625, + "grad_norm": 0.0576171875, "learning_rate": 0.0009508196298194517, - "loss": 0.0121, + "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3137053.0, "repeat_count": 0.0, - "routers_loss": 0.0028069843538105488, + "routers_loss": 0.003630586201325059, "skip_count": 0.0, "step": 1946, "text_loss": 0.32225799560546875 @@ -18504,13 +18504,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.08349609375, "learning_rate": 0.0009506856809934338, - "loss": 0.0116, + "loss": 0.0119, "macro_f1": 0.3333333432674408, "num_tokens": 3140943.0, "repeat_count": 0.0, - "routers_loss": 0.006877045147120953, + "routers_loss": 0.007580445148050785, "skip_count": 0.0, "step": 1948, "text_loss": 0.3120577931404114 @@ -18523,13 +18523,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.050048828125, "learning_rate": 0.0009505515594638127, - "loss": 0.0127, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 3144298.0, "repeat_count": 0.0, - "routers_loss": 0.004543667659163475, + "routers_loss": 0.004471861757338047, "skip_count": 0.0, "step": 1950, "text_loss": 0.22052447497844696 @@ -18542,13 +18542,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.09130859375, "learning_rate": 0.0009504172652819843, - "loss": 0.0232, + "loss": 0.023, "macro_f1": 1.0, "num_tokens": 3147069.0, "repeat_count": 1.0, - "routers_loss": 0.007053609937429428, + "routers_loss": 0.009606664068996906, "skip_count": 1.0, "step": 1952, "text_loss": 0.34773921966552734 @@ -18561,13 +18561,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0537109375, + "grad_norm": 0.0625, "learning_rate": 0.0009502827984994099, - "loss": 0.0146, + "loss": 0.0148, "macro_f1": 0.6666666865348816, "num_tokens": 3149992.0, "repeat_count": 0.0, - "routers_loss": 0.006783280987292528, + "routers_loss": 0.006443799939006567, "skip_count": 1.0, "step": 1954, "text_loss": 0.6442171335220337 @@ -18580,13 +18580,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06640625, + "grad_norm": 0.0673828125, "learning_rate": 0.0009501481591676177, - "loss": 0.0181, + "loss": 0.0188, "macro_f1": 0.3333333432674408, "num_tokens": 3153167.0, "repeat_count": 0.0, - "routers_loss": 0.002531677018851042, + "routers_loss": 0.003219039412215352, "skip_count": 0.0, "step": 1956, "text_loss": 0.43369221687316895 @@ -18599,32 +18599,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.078125, + "grad_norm": 0.07470703125, "learning_rate": 0.000950013347338202, - "loss": 0.0154, + "loss": 0.0152, "macro_f1": 0.3272727429866791, "num_tokens": 3156590.0, "repeat_count": 0.0, - "routers_loss": 0.027040868997573853, + "routers_loss": 0.025551019236445427, "skip_count": 1.0, "step": 1958, "text_loss": 0.294479101896286 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 1.0, - "avg_layers": 26.0, + "avg_layers": 27.0, "epoch": 9.201937188142061, - "f1_execute": 0.9803921580314636, - "f1_repeat": 0.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.1142578125, + "grad_norm": 0.1630859375, "learning_rate": 0.0009498783630628225, - "loss": 0.0154, - "macro_f1": 0.6601307392120361, + "loss": 0.0158, + "macro_f1": 1.0, "num_tokens": 3159451.0, "repeat_count": 1.0, - "routers_loss": 0.01573321223258972, + "routers_loss": 0.013802438974380493, "skip_count": 2.0, "step": 1960, "text_loss": 0.20888492465019226 @@ -18637,13 +18637,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06689453125, + "grad_norm": 0.07666015625, "learning_rate": 0.0009497432063932057, - "loss": 0.0135, + "loss": 0.0137, "macro_f1": 0.6601307392120361, "num_tokens": 3162889.0, "repeat_count": 1.0, - "routers_loss": 0.02442278526723385, + "routers_loss": 0.02852988988161087, "skip_count": 2.0, "step": 1962, "text_loss": 0.5027125477790833 @@ -18656,13 +18656,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.046630859375, + "grad_norm": 0.045166015625, "learning_rate": 0.0009496078773811437, - "loss": 0.0142, + "loss": 0.0136, "macro_f1": 0.6666666865348816, "num_tokens": 3165979.0, "repeat_count": 0.0, - "routers_loss": 0.018267054110765457, + "routers_loss": 0.01784522272646427, "skip_count": 2.0, "step": 1964, "text_loss": 0.1696339100599289 @@ -18675,13 +18675,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0625, + "grad_norm": 0.060302734375, "learning_rate": 0.000949472376078495, - "loss": 0.0162, + "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3168683.0, "repeat_count": 0.0, - "routers_loss": 0.0016024474753066897, + "routers_loss": 0.0017019887454807758, "skip_count": 0.0, "step": 1966, "text_loss": 0.48905447125434875 @@ -18694,13 +18694,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.052978515625, + "grad_norm": 0.051025390625, "learning_rate": 0.000949336702537184, - "loss": 0.011, + "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 3171968.0, "repeat_count": 0.0, - "routers_loss": 0.004668849054723978, + "routers_loss": 0.004817947279661894, "skip_count": 2.0, "step": 1968, "text_loss": 0.20984773337841034 @@ -18713,13 +18713,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.05419921875, "learning_rate": 0.0009492008568092007, - "loss": 0.0098, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 3175947.0, "repeat_count": 0.0, - "routers_loss": 0.0011657609138637781, + "routers_loss": 0.0012963006738573313, "skip_count": 0.0, "step": 1970, "text_loss": 0.5215106010437012 @@ -18732,13 +18732,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.04248046875, + "grad_norm": 0.044921875, "learning_rate": 0.0009490648389466019, - "loss": 0.0133, + "loss": 0.0135, "macro_f1": 0.4871794879436493, "num_tokens": 3179348.0, "repeat_count": 0.0, - "routers_loss": 0.03806794434785843, + "routers_loss": 0.03950481489300728, "skip_count": 2.0, "step": 1972, "text_loss": 0.24640929698944092 @@ -18751,13 +18751,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08837890625, + "grad_norm": 0.09326171875, "learning_rate": 0.0009489286490015097, - "loss": 0.0189, + "loss": 0.0183, "macro_f1": 0.6666666865348816, "num_tokens": 3182640.0, "repeat_count": 0.0, - "routers_loss": 0.005107097327709198, + "routers_loss": 0.0043345349840819836, "skip_count": 2.0, "step": 1974, "text_loss": 0.6362852454185486 @@ -18770,13 +18770,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.078125, + "grad_norm": 0.07958984375, "learning_rate": 0.0009487922870261122, - "loss": 0.0156, + "loss": 0.0155, "macro_f1": 0.3333333432674408, "num_tokens": 3185657.0, "repeat_count": 0.0, - "routers_loss": 0.0013696947135031223, + "routers_loss": 0.0015687479171901941, "skip_count": 0.0, "step": 1976, "text_loss": 0.8977144360542297 @@ -18789,13 +18789,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0634765625, + "grad_norm": 0.061279296875, "learning_rate": 0.0009486557530726638, - "loss": 0.0136, + "loss": 0.0139, "macro_f1": 0.3333333432674408, "num_tokens": 3188772.0, "repeat_count": 0.0, - "routers_loss": 0.0012224154779687524, + "routers_loss": 0.0010977238416671753, "skip_count": 0.0, "step": 1978, "text_loss": 0.38512736558914185 @@ -18808,13 +18808,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09423828125, + "grad_norm": 0.11279296875, "learning_rate": 0.0009485190471934844, "loss": 0.0196, "macro_f1": 0.6666666865348816, "num_tokens": 3193131.0, "repeat_count": 2.0, - "routers_loss": 0.0030119111761450768, + "routers_loss": 0.002264744369313121, "skip_count": 0.0, "step": 1980, "text_loss": 0.4171289801597595 @@ -18827,13 +18827,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.09033203125, "learning_rate": 0.00094838216944096, - "loss": 0.0222, + "loss": 0.0219, "macro_f1": 0.3272727429866791, "num_tokens": 3196668.0, "repeat_count": 0.0, - "routers_loss": 0.04286033287644386, + "routers_loss": 0.042320676147937775, "skip_count": 1.0, "step": 1982, "text_loss": 0.19008000195026398 @@ -18846,32 +18846,32 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.053466796875, + "grad_norm": 0.052490234375, "learning_rate": 0.0009482451198675424, - "loss": 0.0158, + "loss": 0.0151, "macro_f1": 0.32098767161369324, "num_tokens": 3200282.0, "repeat_count": 0.0, - "routers_loss": 0.019988590851426125, + "routers_loss": 0.01796630397439003, "skip_count": 1.0, "step": 1984, "text_loss": 0.5009249448776245 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 9.324038743762841, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.0634765625, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, "learning_rate": 0.0009481078985257494, - "loss": 0.0154, - "macro_f1": 0.3272727429866791, + "loss": 0.0147, + "macro_f1": 0.6666666865348816, "num_tokens": 3204439.0, "repeat_count": 0.0, - "routers_loss": 0.012215938419103622, + "routers_loss": 0.01052347756922245, "skip_count": 1.0, "step": 1986, "text_loss": 0.15319275856018066 @@ -18884,13 +18884,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07861328125, + "grad_norm": 0.0732421875, "learning_rate": 0.0009479705054681644, - "loss": 0.0149, + "loss": 0.015, "macro_f1": 0.3076923191547394, "num_tokens": 3207590.0, "repeat_count": 1.0, - "routers_loss": 0.10747655481100082, + "routers_loss": 0.09640293568372726, "skip_count": 3.0, "step": 1988, "text_loss": 0.3654652535915375 @@ -18903,13 +18903,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.068359375, + "grad_norm": 0.06689453125, "learning_rate": 0.0009478329407474366, - "loss": 0.0186, + "loss": 0.0183, "macro_f1": 0.5492662787437439, "num_tokens": 3211172.0, "repeat_count": 0.0, - "routers_loss": 0.016109853982925415, + "routers_loss": 0.012670112773776054, "skip_count": 1.0, "step": 1990, "text_loss": 0.5817596316337585 @@ -18922,13 +18922,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.05859375, "learning_rate": 0.000947695204416281, - "loss": 0.0116, + "loss": 0.0121, "macro_f1": 0.6666666865348816, "num_tokens": 3214050.0, "repeat_count": 1.0, - "routers_loss": 0.006929324474185705, + "routers_loss": 0.005263707600533962, "skip_count": 0.0, "step": 1992, "text_loss": 0.5985888242721558 @@ -18941,13 +18941,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.0634765625, "learning_rate": 0.0009475572965274787, - "loss": 0.0147, + "loss": 0.0144, "macro_f1": 0.3272727429866791, "num_tokens": 3217318.0, "repeat_count": 1.0, - "routers_loss": 0.0715102106332779, + "routers_loss": 0.0682850033044815, "skip_count": 0.0, "step": 1994, "text_loss": 0.316506564617157 @@ -18960,13 +18960,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.052490234375, + "grad_norm": 0.0595703125, "learning_rate": 0.000947419217133876, - "loss": 0.0187, + "loss": 0.019, "macro_f1": 0.6666666865348816, "num_tokens": 3220012.0, "repeat_count": 0.0, - "routers_loss": 0.008499355986714363, + "routers_loss": 0.008508823812007904, "skip_count": 2.0, "step": 1996, "text_loss": 0.09665893763303757 @@ -18979,13 +18979,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.048583984375, + "grad_norm": 0.053466796875, "learning_rate": 0.0009472809662883852, - "loss": 0.0162, + "loss": 0.0155, "macro_f1": 1.0, "num_tokens": 3223019.0, "repeat_count": 1.0, - "routers_loss": 0.012003371492028236, + "routers_loss": 0.01100847590714693, "skip_count": 2.0, "step": 1998, "text_loss": 0.4938808083534241 @@ -18998,13 +18998,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.0625, + "grad_norm": 0.06396484375, "learning_rate": 0.0009471425440439844, - "loss": 0.0137, + "loss": 0.0135, "macro_f1": 0.8817967176437378, "num_tokens": 3226013.0, "repeat_count": 2.0, - "routers_loss": 0.0529167577624321, + "routers_loss": 0.04953207075595856, "skip_count": 3.0, "step": 2000, "text_loss": 0.22258254885673523 diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin index deeea733277b4031781a5b299881dd8e675e7606..a3d3ae372faf14539639f54454aa52b6ee730c4a 100644 --- a/checkpoint-2000/training_args.bin +++ b/checkpoint-2000/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b3f5975f57762b552c7ee29776bf32a4dbb125781a0658488d3884fb25c5296 +oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8 size 5880 diff --git a/checkpoint-3000/model-00002-of-00002.safetensors b/checkpoint-3000/model-00002-of-00002.safetensors index 90e60903b10ee645ae44e95a07ca692b662c0b11..f35eb1877a2531abd7604388b55f0e2f227e0139 100644 --- a/checkpoint-3000/model-00002-of-00002.safetensors +++ b/checkpoint-3000/model-00002-of-00002.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:556a7e0a1afc9189ba05912546bcbd5642f962428969c3a6175460e4f7ed088d +oid sha256:74237309fd851d3e6a87c2ecae9fdf046cda24a2b071142d227d3596658c57de size 1481790520 diff --git a/checkpoint-3000/optimizer.pt b/checkpoint-3000/optimizer.pt index 7cdbe7097aa1559dbc1d224433bc639415e56007..de25de043e1925d01a3a27e8c32e731639eb50cf 100644 --- a/checkpoint-3000/optimizer.pt +++ b/checkpoint-3000/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5fc4ae4a4dcddd8241f1b24d63a0e756f40bb65d4eea6c288b5406b68fe3ad1 +oid sha256:0c95beb972e19eb9beaf599780a940fbae8dc2eb2b781515cf6fba5f661673d4 size 44191162 diff --git a/checkpoint-3000/trainer_state.json b/checkpoint-3000/trainer_state.json index 71e24cdcfd8eb68dc8d169c82346790853fec879..5b23440931215970ba54e98fb0e391e46eef8b91 100644 --- a/checkpoint-3000/trainer_state.json +++ b/checkpoint-3000/trainer_state.json @@ -12,18 +12,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 31.0, + "avg_layers": 25.0, "epoch": 0.009392427355444672, - "f1_execute": 0.4864864945411682, + "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 2.40625, + "grad_norm": 2.25, "learning_rate": 2e-06, - "loss": 0.5484, - "macro_f1": 0.1621621698141098, + "loss": 0.4974, + "macro_f1": 0.23255813121795654, "num_tokens": 3175.0, "repeat_count": 0.0, - "routers_loss": 0.503563642501831, + "routers_loss": 0.4339469373226166, "skip_count": 0.0, "step": 2, "text_loss": 0.3330848515033722 @@ -31,18 +31,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 23.0, "epoch": 0.018784854710889344, - "f1_execute": 0.4864864945411682, + "f1_execute": 0.7272726893424988, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.9140625, + "grad_norm": 1.8359375, "learning_rate": 6e-06, - "loss": 0.536, - "macro_f1": 0.1621621698141098, + "loss": 0.4988, + "macro_f1": 0.24242423474788666, "num_tokens": 5816.0, "repeat_count": 0.0, - "routers_loss": 0.4589468538761139, + "routers_loss": 0.4511934816837311, "skip_count": 1.0, "step": 4, "text_loss": 0.4571273922920227 @@ -50,37 +50,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 32.0, + "avg_layers": 28.0, "epoch": 0.02817728206633402, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 2.375, + "grad_norm": 2.234375, "learning_rate": 1e-05, - "loss": 0.5469, - "macro_f1": 0.19999998807907104, + "loss": 0.5113, + "macro_f1": 0.222222238779068, "num_tokens": 9739.0, "repeat_count": 0.0, - "routers_loss": 0.5736724138259888, + "routers_loss": 0.49306994676589966, "skip_count": 0.0, "step": 6, "text_loss": 0.41060560941696167 }, { - "acc_repeat": 1.0, - "acc_skip": 0.5, - "avg_layers": 33.0, + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 0.03756970942177869, - "f1_execute": 0.47058823704719543, - "f1_repeat": 0.1538461595773697, - "f1_skip": 0.222222238779068, - "grad_norm": 1.8515625, + "f1_execute": 0.5641025900840759, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7265625, "learning_rate": 1.4e-05, - "loss": 0.5291, - "macro_f1": 0.28221890330314636, + "loss": 0.4766, + "macro_f1": 0.18803420662879944, "num_tokens": 12869.0, "repeat_count": 1.0, - "routers_loss": 0.49970296025276184, + "routers_loss": 0.48872503638267517, "skip_count": 2.0, "step": 8, "text_loss": 0.36678561568260193 @@ -88,37 +88,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 27.0, "epoch": 0.046962136777223364, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.953125, + "grad_norm": 1.78125, "learning_rate": 1.8e-05, - "loss": 0.5316, - "macro_f1": 0.19999998807907104, + "loss": 0.4806, + "macro_f1": 0.23255813121795654, "num_tokens": 15845.0, "repeat_count": 0.0, - "routers_loss": 0.5153562426567078, + "routers_loss": 0.45077216625213623, "skip_count": 0.0, "step": 10, "text_loss": 0.5597779154777527 }, { - "acc_repeat": 0.0, + "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, - "avg_layers": 34.0, + "avg_layers": 26.0, "epoch": 0.05635456413266804, - "f1_execute": 0.5714285373687744, - "f1_repeat": 0.0, - "f1_skip": 0.25, - "grad_norm": 1.6328125, + "f1_execute": 0.7179487347602844, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.20000000298023224, + "grad_norm": 1.5390625, "learning_rate": 2.2e-05, - "loss": 0.5051, - "macro_f1": 0.2738095223903656, + "loss": 0.4557, + "macro_f1": 0.40122103691101074, "num_tokens": 19353.0, "repeat_count": 2.0, - "routers_loss": 0.46214747428894043, + "routers_loss": 0.4130440056324005, "skip_count": 3.0, "step": 12, "text_loss": 0.2056603729724884 @@ -126,37 +126,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 27.0, "epoch": 0.06574699148811271, - "f1_execute": 0.5263157486915588, + "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 2.671875, + "grad_norm": 2.4375, "learning_rate": 2.6e-05, - "loss": 0.5653, - "macro_f1": 0.17543858289718628, + "loss": 0.5129, + "macro_f1": 0.23255813121795654, "num_tokens": 22675.0, "repeat_count": 0.0, - "routers_loss": 0.5300976634025574, + "routers_loss": 0.4582902193069458, "skip_count": 0.0, "step": 14, "text_loss": 0.32989829778671265 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 34.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 0.07513941884355738, - "f1_execute": 0.6153846383094788, + "f1_execute": 0.6829268336296082, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 1.8828125, + "f1_skip": 0.2222222238779068, + "grad_norm": 1.7421875, "learning_rate": 3e-05, - "loss": 0.5225, - "macro_f1": 0.20512822270393372, + "loss": 0.4729, + "macro_f1": 0.3017163574695587, "num_tokens": 26022.0, "repeat_count": 0.0, - "routers_loss": 0.473240464925766, + "routers_loss": 0.42910993099212646, "skip_count": 1.0, "step": 16, "text_loss": 0.1353905349969864 @@ -164,18 +164,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 38.0, + "avg_layers": 27.0, "epoch": 0.08453184619900206, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.7555555105209351, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.6015625, + "grad_norm": 1.4765625, "learning_rate": 3.4000000000000007e-05, - "loss": 0.4867, - "macro_f1": 0.19999998807907104, + "loss": 0.4274, + "macro_f1": 0.2518518567085266, "num_tokens": 29251.0, "repeat_count": 0.0, - "routers_loss": 0.4795944094657898, + "routers_loss": 0.3990713059902191, "skip_count": 0.0, "step": 18, "text_loss": 0.3806765377521515 @@ -183,18 +183,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 36.0, + "avg_layers": 26.0, "epoch": 0.09392427355444673, - "f1_execute": 0.6153846383094788, - "f1_repeat": 0.1538461595773697, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.2857142984867096, "f1_skip": 0.0, - "grad_norm": 1.3984375, + "grad_norm": 1.3125, "learning_rate": 3.8e-05, - "loss": 0.4718, - "macro_f1": 0.25641027092933655, + "loss": 0.4261, + "macro_f1": 0.3228803873062134, "num_tokens": 32545.0, "repeat_count": 1.0, - "routers_loss": 0.41872408986091614, + "routers_loss": 0.40146592259407043, "skip_count": 0.0, "step": 20, "text_loss": 0.25648367404937744 @@ -202,18 +202,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 26.0, "epoch": 0.1033167009098914, - "f1_execute": 0.6341463327407837, + "f1_execute": 0.7272727489471436, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.7734375, + "grad_norm": 1.625, "learning_rate": 4.2000000000000004e-05, - "loss": 0.4472, - "macro_f1": 0.21138212084770203, + "loss": 0.404, + "macro_f1": 0.24242424964904785, "num_tokens": 36560.0, "repeat_count": 0.0, - "routers_loss": 0.4152105450630188, + "routers_loss": 0.372715026140213, "skip_count": 0.0, "step": 22, "text_loss": 0.2799522578716278 @@ -221,18 +221,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 32.0, + "avg_layers": 27.0, "epoch": 0.11270912826533608, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.7555555105209351, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.8046875, + "grad_norm": 1.6328125, "learning_rate": 4.6e-05, - "loss": 0.4554, - "macro_f1": 0.19999998807907104, + "loss": 0.4218, + "macro_f1": 0.2518518567085266, "num_tokens": 39597.0, "repeat_count": 0.0, - "routers_loss": 0.47541096806526184, + "routers_loss": 0.4504941403865814, "skip_count": 0.0, "step": 24, "text_loss": 0.6635695695877075 @@ -240,18 +240,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 34.0, + "avg_layers": 27.0, "epoch": 0.12210155562078075, - "f1_execute": 0.7826087474822998, + "f1_execute": 0.8085106015205383, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.875, + "grad_norm": 1.7109375, "learning_rate": 5e-05, - "loss": 0.4182, - "macro_f1": 0.2608695924282074, + "loss": 0.3886, + "macro_f1": 0.26950353384017944, "num_tokens": 43080.0, "repeat_count": 0.0, - "routers_loss": 0.37319275736808777, + "routers_loss": 0.3498791456222534, "skip_count": 0.0, "step": 26, "text_loss": 0.7035041451454163 @@ -259,18 +259,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 0.13149398297622542, - "f1_execute": 0.7826087474822998, + "f1_execute": 0.8085106015205383, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.4375, + "grad_norm": 1.34375, "learning_rate": 5.4e-05, - "loss": 0.3991, - "macro_f1": 0.2608695924282074, + "loss": 0.3724, + "macro_f1": 0.26950353384017944, "num_tokens": 46406.0, "repeat_count": 0.0, - "routers_loss": 0.3604123294353485, + "routers_loss": 0.31265875697135925, "skip_count": 0.0, "step": 28, "text_loss": 0.6388277411460876 @@ -280,16 +280,16 @@ "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.1408864103316701, - "f1_execute": 0.8979591727256775, + "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.421875, + "grad_norm": 1.2578125, "learning_rate": 5.800000000000001e-05, - "loss": 0.3827, - "macro_f1": 0.2993197441101074, + "loss": 0.341, + "macro_f1": 0.2857142686843872, "num_tokens": 49966.0, "repeat_count": 0.0, - "routers_loss": 0.35880225896835327, + "routers_loss": 0.3200918138027191, "skip_count": 2.0, "step": 30, "text_loss": 0.17372547090053558 @@ -297,18 +297,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 24.0, + "avg_layers": 25.0, "epoch": 0.15027883768711475, - "f1_execute": 0.9200000166893005, + "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.4609375, + "grad_norm": 1.4140625, "learning_rate": 6.2e-05, - "loss": 0.3452, - "macro_f1": 0.30666667222976685, + "loss": 0.3207, + "macro_f1": 0.2857142686843872, "num_tokens": 53378.0, "repeat_count": 1.0, - "routers_loss": 0.31086465716362, + "routers_loss": 0.32304447889328003, "skip_count": 1.0, "step": 32, "text_loss": 0.18196581304073334 @@ -316,18 +316,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 25.0, "epoch": 0.15967126504255943, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.3671875, + "grad_norm": 1.46875, "learning_rate": 6.6e-05, - "loss": 0.3283, - "macro_f1": 0.3144654333591461, + "loss": 0.3304, + "macro_f1": 0.3006536364555359, "num_tokens": 56933.0, "repeat_count": 0.0, - "routers_loss": 0.2674171030521393, + "routers_loss": 0.24814388155937195, "skip_count": 0.0, "step": 34, "text_loss": 0.28823015093803406 @@ -335,18 +335,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 0.16906369239800412, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.1015625, + "grad_norm": 1.1171875, "learning_rate": 7.000000000000001e-05, - "loss": 0.2849, - "macro_f1": 0.3205128312110901, + "loss": 0.2778, + "macro_f1": 0.3006536066532135, "num_tokens": 60744.0, "repeat_count": 1.0, - "routers_loss": 0.24587315320968628, + "routers_loss": 0.22411039471626282, "skip_count": 0.0, "step": 36, "text_loss": 0.5260357856750488 @@ -354,18 +354,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 31.0, + "avg_layers": 27.0, "epoch": 0.17845611975344877, - "f1_execute": 0.8085106015205383, + "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.3046875, + "grad_norm": 1.484375, "learning_rate": 7.4e-05, - "loss": 0.2616, - "macro_f1": 0.26950353384017944, + "loss": 0.2738, + "macro_f1": 0.2857142984867096, "num_tokens": 64900.0, "repeat_count": 0.0, - "routers_loss": 0.32050269842147827, + "routers_loss": 0.44355395436286926, "skip_count": 0.0, "step": 38, "text_loss": 0.5382097363471985 @@ -373,18 +373,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 0.18784854710889345, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.1796875, + "grad_norm": 1.3828125, "learning_rate": 7.8e-05, - "loss": 0.2084, - "macro_f1": 0.3144654333591461, + "loss": 0.2137, + "macro_f1": 0.3076923191547394, "num_tokens": 68000.0, "repeat_count": 0.0, - "routers_loss": 0.15196125209331512, + "routers_loss": 0.202330082654953, "skip_count": 0.0, "step": 40, "text_loss": 0.5946118831634521 @@ -392,18 +392,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 25.0, "epoch": 0.19724097446433814, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.61328125, + "grad_norm": 0.78125, "learning_rate": 8.2e-05, - "loss": 0.1947, + "loss": 0.21, "macro_f1": 0.3144654333591461, "num_tokens": 70529.0, "repeat_count": 0.0, - "routers_loss": 0.14121046662330627, + "routers_loss": 0.18023855984210968, "skip_count": 0.0, "step": 42, "text_loss": 0.5550904273986816 @@ -416,13 +416,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.50390625, + "grad_norm": 0.609375, "learning_rate": 8.599999999999999e-05, - "loss": 0.1884, + "loss": 0.1918, "macro_f1": 0.32098764181137085, "num_tokens": 73427.0, "repeat_count": 2.0, - "routers_loss": 0.21312278509140015, + "routers_loss": 0.2101590931415558, "skip_count": 0.0, "step": 44, "text_loss": 0.4636923372745514 @@ -435,13 +435,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.45703125, + "grad_norm": 0.53125, "learning_rate": 8.999999999999999e-05, - "loss": 0.166, + "loss": 0.1881, "macro_f1": 0.3333333432674408, "num_tokens": 76472.0, "repeat_count": 0.0, - "routers_loss": 0.1184137836098671, + "routers_loss": 0.11800424009561539, "skip_count": 0.0, "step": 46, "text_loss": 0.4187001883983612 @@ -454,13 +454,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.62890625, + "grad_norm": 0.953125, "learning_rate": 9.400000000000001e-05, - "loss": 0.1313, + "loss": 0.1446, "macro_f1": 0.3272727429866791, "num_tokens": 79124.0, "repeat_count": 1.0, - "routers_loss": 0.10897563397884369, + "routers_loss": 0.11632519960403442, "skip_count": 0.0, "step": 48, "text_loss": 0.2253919243812561 @@ -468,18 +468,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 0.2348106838861168, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.4375, + "grad_norm": 0.58984375, "learning_rate": 9.800000000000001e-05, - "loss": 0.1531, - "macro_f1": 0.3272727429866791, + "loss": 0.1543, + "macro_f1": 0.32098767161369324, "num_tokens": 81980.0, "repeat_count": 1.0, - "routers_loss": 0.09979952871799469, + "routers_loss": 0.09669367223978043, "skip_count": 0.0, "step": 50, "text_loss": 0.6053179502487183 @@ -487,18 +487,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 0.2442031112415615, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.515625, + "grad_norm": 0.8515625, "learning_rate": 0.000102, - "loss": 0.1265, - "macro_f1": 0.3272727429866791, + "loss": 0.1393, + "macro_f1": 0.32098764181137085, "num_tokens": 85236.0, "repeat_count": 0.0, - "routers_loss": 0.05543195456266403, + "routers_loss": 0.12471720576286316, "skip_count": 0.0, "step": 52, "text_loss": 0.6027331948280334 @@ -511,13 +511,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.328125, + "grad_norm": 0.421875, "learning_rate": 0.000106, - "loss": 0.1436, + "loss": 0.1473, "macro_f1": 0.32098764181137085, "num_tokens": 88238.0, "repeat_count": 0.0, - "routers_loss": 0.15049344301223755, + "routers_loss": 0.1376056969165802, "skip_count": 2.0, "step": 54, "text_loss": 0.2861751616001129 @@ -530,13 +530,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.263671875, + "grad_norm": 0.35546875, "learning_rate": 0.00011, - "loss": 0.1021, + "loss": 0.1082, "macro_f1": 0.3333333432674408, "num_tokens": 91056.0, "repeat_count": 0.0, - "routers_loss": 0.07367338240146637, + "routers_loss": 0.07449393719434738, "skip_count": 0.0, "step": 56, "text_loss": 0.48106974363327026 @@ -544,18 +544,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 26.0, "epoch": 0.2723803933078955, - "f1_execute": 1.0, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25, + "grad_norm": 0.271484375, "learning_rate": 0.000114, - "loss": 0.114, - "macro_f1": 0.3333333432674408, + "loss": 0.1123, + "macro_f1": 0.32098764181137085, "num_tokens": 94987.0, "repeat_count": 0.0, - "routers_loss": 0.03782692551612854, + "routers_loss": 0.07064720243215561, "skip_count": 0.0, "step": 58, "text_loss": 0.3554874658584595 @@ -568,13 +568,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.333984375, + "grad_norm": 0.5390625, "learning_rate": 0.000118, - "loss": 0.1197, + "loss": 0.1234, "macro_f1": 0.32098764181137085, "num_tokens": 97909.0, "repeat_count": 0.0, - "routers_loss": 0.14074955880641937, + "routers_loss": 0.16835889220237732, "skip_count": 2.0, "step": 60, "text_loss": 0.5475804805755615 @@ -587,13 +587,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.2353515625, "learning_rate": 0.000122, - "loss": 0.1174, + "loss": 0.1224, "macro_f1": 0.3333333432674408, "num_tokens": 101043.0, "repeat_count": 0.0, - "routers_loss": 0.058013737201690674, + "routers_loss": 0.06127442046999931, "skip_count": 0.0, "step": 62, "text_loss": 0.5966938734054565 @@ -606,13 +606,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.212890625, "learning_rate": 0.000126, - "loss": 0.0911, + "loss": 0.0931, "macro_f1": 0.3333333432674408, "num_tokens": 104103.0, "repeat_count": 0.0, - "routers_loss": 0.04936821386218071, + "routers_loss": 0.047825805842876434, "skip_count": 0.0, "step": 64, "text_loss": 0.5480486750602722 @@ -625,13 +625,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.220703125, + "grad_norm": 0.2294921875, "learning_rate": 0.00013000000000000002, - "loss": 0.1107, + "loss": 0.1088, "macro_f1": 0.3006536364555359, "num_tokens": 107009.0, "repeat_count": 1.0, - "routers_loss": 0.2628525495529175, + "routers_loss": 0.275174081325531, "skip_count": 4.0, "step": 66, "text_loss": 0.41714492440223694 @@ -644,13 +644,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.1923828125, "learning_rate": 0.000134, - "loss": 0.1109, + "loss": 0.1123, "macro_f1": 0.3333333432674408, "num_tokens": 110486.0, "repeat_count": 0.0, - "routers_loss": 0.02859785594046116, + "routers_loss": 0.029025178402662277, "skip_count": 0.0, "step": 68, "text_loss": 0.6775627732276917 @@ -663,13 +663,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.298828125, + "grad_norm": 0.314453125, "learning_rate": 0.00013800000000000002, - "loss": 0.1067, + "loss": 0.1049, "macro_f1": 0.3272727429866791, "num_tokens": 113878.0, "repeat_count": 0.0, - "routers_loss": 0.10459086298942566, + "routers_loss": 0.10141710191965103, "skip_count": 1.0, "step": 70, "text_loss": 0.6678873896598816 @@ -682,13 +682,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2109375, + "grad_norm": 0.248046875, "learning_rate": 0.00014199999999999998, - "loss": 0.1166, + "loss": 0.1119, "macro_f1": 0.3272727429866791, "num_tokens": 116989.0, "repeat_count": 0.0, - "routers_loss": 0.0718551054596901, + "routers_loss": 0.08002066612243652, "skip_count": 1.0, "step": 72, "text_loss": 0.405692994594574 @@ -701,13 +701,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1787109375, "learning_rate": 0.000146, - "loss": 0.1007, + "loss": 0.0944, "macro_f1": 0.3144654333591461, "num_tokens": 119883.0, "repeat_count": 0.0, - "routers_loss": 0.1850946843624115, + "routers_loss": 0.1867009848356247, "skip_count": 3.0, "step": 74, "text_loss": 0.44616150856018066 @@ -720,13 +720,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.34375, + "grad_norm": 0.333984375, "learning_rate": 0.00015, - "loss": 0.1019, + "loss": 0.1003, "macro_f1": 0.32098764181137085, "num_tokens": 123325.0, "repeat_count": 0.0, - "routers_loss": 0.09809529036283493, + "routers_loss": 0.07042168825864792, "skip_count": 2.0, "step": 76, "text_loss": 0.11340200901031494 @@ -739,13 +739,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.259765625, + "grad_norm": 0.26171875, "learning_rate": 0.000154, - "loss": 0.1088, + "loss": 0.1066, "macro_f1": 0.32098764181137085, "num_tokens": 126131.0, "repeat_count": 0.0, - "routers_loss": 0.11277207732200623, + "routers_loss": 0.11535373330116272, "skip_count": 2.0, "step": 78, "text_loss": 0.3269135355949402 @@ -758,13 +758,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2412109375, + "grad_norm": 0.255859375, "learning_rate": 0.000158, - "loss": 0.0866, + "loss": 0.0891, "macro_f1": 0.3272727429866791, "num_tokens": 130349.0, "repeat_count": 0.0, - "routers_loss": 0.09079254418611526, + "routers_loss": 0.09497501701116562, "skip_count": 1.0, "step": 80, "text_loss": 0.15273472666740417 @@ -777,13 +777,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1943359375, "learning_rate": 0.000162, - "loss": 0.0928, + "loss": 0.0929, "macro_f1": 0.3333333432674408, "num_tokens": 133607.0, "repeat_count": 0.0, - "routers_loss": 0.02900076098740101, + "routers_loss": 0.030639523640275, "skip_count": 0.0, "step": 82, "text_loss": 0.282884806394577 @@ -796,13 +796,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.1806640625, "learning_rate": 0.00016600000000000002, - "loss": 0.1251, + "loss": 0.1254, "macro_f1": 0.3272727429866791, "num_tokens": 136694.0, "repeat_count": 0.0, - "routers_loss": 0.0763339251279831, + "routers_loss": 0.07906441390514374, "skip_count": 1.0, "step": 84, "text_loss": 0.459094375371933 @@ -817,11 +817,11 @@ "f1_skip": 0.0, "grad_norm": 0.212890625, "learning_rate": 0.00017, - "loss": 0.1064, + "loss": 0.1071, "macro_f1": 0.3144654333591461, "num_tokens": 139966.0, "repeat_count": 1.0, - "routers_loss": 0.13191410899162292, + "routers_loss": 0.1124570444226265, "skip_count": 2.0, "step": 86, "text_loss": 0.29985448718070984 @@ -834,13 +834,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.25390625, "learning_rate": 0.000174, - "loss": 0.1055, + "loss": 0.1031, "macro_f1": 0.32098764181137085, "num_tokens": 142788.0, "repeat_count": 2.0, - "routers_loss": 0.21200031042099, + "routers_loss": 0.1966402679681778, "skip_count": 0.0, "step": 88, "text_loss": 0.6435291767120361 @@ -853,13 +853,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.318359375, + "grad_norm": 0.349609375, "learning_rate": 0.000178, - "loss": 0.0971, + "loss": 0.0963, "macro_f1": 0.3333333432674408, "num_tokens": 146192.0, "repeat_count": 0.0, - "routers_loss": 0.031911369413137436, + "routers_loss": 0.0325632207095623, "skip_count": 0.0, "step": 90, "text_loss": 0.35170626640319824 @@ -872,13 +872,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.216796875, + "grad_norm": 0.2265625, "learning_rate": 0.000182, - "loss": 0.1056, + "loss": 0.1073, "macro_f1": 0.32098764181137085, "num_tokens": 149792.0, "repeat_count": 1.0, - "routers_loss": 0.14131835103034973, + "routers_loss": 0.15115146338939667, "skip_count": 1.0, "step": 92, "text_loss": 0.83159339427948 @@ -891,13 +891,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.205078125, "learning_rate": 0.000186, - "loss": 0.1059, + "loss": 0.1073, "macro_f1": 0.3333333432674408, "num_tokens": 152766.0, "repeat_count": 0.0, - "routers_loss": 0.04137955233454704, + "routers_loss": 0.043313540518283844, "skip_count": 0.0, "step": 94, "text_loss": 0.49707934260368347 @@ -910,13 +910,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.2138671875, "learning_rate": 0.00019, - "loss": 0.0934, + "loss": 0.0947, "macro_f1": 0.3333333432674408, "num_tokens": 156112.0, "repeat_count": 0.0, - "routers_loss": 0.03163003921508789, + "routers_loss": 0.032021280378103256, "skip_count": 0.0, "step": 96, "text_loss": 0.27608928084373474 @@ -929,13 +929,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.2099609375, "learning_rate": 0.000194, - "loss": 0.0847, + "loss": 0.0846, "macro_f1": 0.3076923191547394, "num_tokens": 159454.0, "repeat_count": 2.0, - "routers_loss": 0.2567490339279175, + "routers_loss": 0.24473154544830322, "skip_count": 2.0, "step": 98, "text_loss": 0.6026689410209656 @@ -948,13 +948,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.30859375, + "grad_norm": 0.271484375, "learning_rate": 0.00019800000000000002, - "loss": 0.1077, + "loss": 0.1028, "macro_f1": 0.32098764181137085, "num_tokens": 163661.0, "repeat_count": 0.0, - "routers_loss": 0.11468870937824249, + "routers_loss": 0.11468276381492615, "skip_count": 2.0, "step": 100, "text_loss": 0.46733155846595764 @@ -967,13 +967,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.17578125, + "grad_norm": 0.1806640625, "learning_rate": 0.000202, - "loss": 0.1131, + "loss": 0.1089, "macro_f1": 0.3333333432674408, "num_tokens": 167134.0, "repeat_count": 0.0, - "routers_loss": 0.02124219387769699, + "routers_loss": 0.021144939586520195, "skip_count": 0.0, "step": 102, "text_loss": 0.6362994909286499 @@ -986,13 +986,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.1943359375, "learning_rate": 0.000206, - "loss": 0.0624, + "loss": 0.0621, "macro_f1": 0.3272727429866791, "num_tokens": 170433.0, "repeat_count": 0.0, - "routers_loss": 0.06983796507120132, + "routers_loss": 0.06594710797071457, "skip_count": 1.0, "step": 104, "text_loss": 0.4515477120876312 @@ -1005,13 +1005,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.1591796875, "learning_rate": 0.00021, - "loss": 0.0951, + "loss": 0.0929, "macro_f1": 0.3333333432674408, "num_tokens": 173387.0, "repeat_count": 0.0, - "routers_loss": 0.03467355668544769, + "routers_loss": 0.032923027873039246, "skip_count": 0.0, "step": 106, "text_loss": 0.6638453006744385 @@ -1024,13 +1024,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.240234375, "learning_rate": 0.000214, - "loss": 0.0881, + "loss": 0.0883, "macro_f1": 0.3272727429866791, "num_tokens": 176170.0, "repeat_count": 1.0, - "routers_loss": 0.08142061531543732, + "routers_loss": 0.08034781366586685, "skip_count": 0.0, "step": 108, "text_loss": 1.186936855316162 @@ -1043,13 +1043,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.267578125, "learning_rate": 0.000218, - "loss": 0.0795, + "loss": 0.0794, "macro_f1": 0.3272727429866791, "num_tokens": 179877.0, "repeat_count": 0.0, - "routers_loss": 0.08327355235815048, + "routers_loss": 0.07814185321331024, "skip_count": 1.0, "step": 110, "text_loss": 0.5488709211349487 @@ -1062,13 +1062,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.2353515625, "learning_rate": 0.000222, - "loss": 0.0943, + "loss": 0.0946, "macro_f1": 0.3333333432674408, "num_tokens": 182726.0, "repeat_count": 0.0, - "routers_loss": 0.019890006631612778, + "routers_loss": 0.01884695515036583, "skip_count": 0.0, "step": 112, "text_loss": 0.5195863842964172 @@ -1081,13 +1081,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2001953125, + "grad_norm": 0.19921875, "learning_rate": 0.00022600000000000002, - "loss": 0.0933, + "loss": 0.0974, "macro_f1": 0.32098764181137085, "num_tokens": 185624.0, "repeat_count": 0.0, - "routers_loss": 0.09992363303899765, + "routers_loss": 0.09657823294401169, "skip_count": 2.0, "step": 114, "text_loss": 0.43858134746551514 @@ -1100,13 +1100,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.3046875, "learning_rate": 0.00023, - "loss": 0.0762, + "loss": 0.0753, "macro_f1": 0.3333333432674408, "num_tokens": 188155.0, "repeat_count": 0.0, - "routers_loss": 0.014119029976427555, + "routers_loss": 0.01463601179420948, "skip_count": 0.0, "step": 116, "text_loss": 0.392981618642807 @@ -1119,13 +1119,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.423828125, + "grad_norm": 0.439453125, "learning_rate": 0.00023400000000000002, - "loss": 0.0842, + "loss": 0.0843, "macro_f1": 0.3333333432674408, "num_tokens": 190970.0, "repeat_count": 0.0, - "routers_loss": 0.03976766765117645, + "routers_loss": 0.03859659656882286, "skip_count": 0.0, "step": 118, "text_loss": 0.309179425239563 @@ -1138,13 +1138,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.2255859375, "learning_rate": 0.00023799999999999998, - "loss": 0.0517, + "loss": 0.053, "macro_f1": 0.3333333432674408, "num_tokens": 193988.0, "repeat_count": 0.0, - "routers_loss": 0.017428619787096977, + "routers_loss": 0.019092386588454247, "skip_count": 0.0, "step": 120, "text_loss": 0.48543134331703186 @@ -1157,13 +1157,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.296875, + "grad_norm": 0.35546875, "learning_rate": 0.000242, - "loss": 0.1134, + "loss": 0.1203, "macro_f1": 0.3272727429866791, "num_tokens": 196475.0, "repeat_count": 0.0, - "routers_loss": 0.06965513527393341, + "routers_loss": 0.0619138665497303, "skip_count": 1.0, "step": 122, "text_loss": 0.4615364074707031 @@ -1176,13 +1176,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1875, "learning_rate": 0.000246, - "loss": 0.0984, + "loss": 0.1002, "macro_f1": 0.3272727429866791, "num_tokens": 200045.0, "repeat_count": 1.0, - "routers_loss": 0.10476501286029816, + "routers_loss": 0.09752107411623001, "skip_count": 0.0, "step": 124, "text_loss": 0.15802054107189178 @@ -1195,13 +1195,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.1728515625, "learning_rate": 0.00025, - "loss": 0.0771, + "loss": 0.0773, "macro_f1": 0.3333333432674408, "num_tokens": 203214.0, "repeat_count": 0.0, - "routers_loss": 0.028317544609308243, + "routers_loss": 0.02896115928888321, "skip_count": 0.0, "step": 126, "text_loss": 0.4543360471725464 @@ -1214,13 +1214,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.390625, + "grad_norm": 0.4296875, "learning_rate": 0.000254, - "loss": 0.0933, + "loss": 0.0973, "macro_f1": 0.3333333432674408, "num_tokens": 206168.0, "repeat_count": 0.0, - "routers_loss": 0.012766432017087936, + "routers_loss": 0.011423567309975624, "skip_count": 0.0, "step": 128, "text_loss": 0.4730179011821747 @@ -1233,13 +1233,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.353515625, + "grad_norm": 0.365234375, "learning_rate": 0.00025800000000000004, - "loss": 0.0989, + "loss": 0.099, "macro_f1": 0.3333333432674408, "num_tokens": 209907.0, "repeat_count": 0.0, - "routers_loss": 0.021400077268481255, + "routers_loss": 0.01957600563764572, "skip_count": 0.0, "step": 130, "text_loss": 0.45122358202934265 @@ -1252,13 +1252,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.2060546875, "learning_rate": 0.000262, - "loss": 0.0873, + "loss": 0.0868, "macro_f1": 0.3272727429866791, "num_tokens": 213521.0, "repeat_count": 0.0, - "routers_loss": 0.05025051161646843, + "routers_loss": 0.04882373288273811, "skip_count": 1.0, "step": 132, "text_loss": 0.4341491758823395 @@ -1271,13 +1271,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1611328125, + "grad_norm": 0.1708984375, "learning_rate": 0.000266, - "loss": 0.085, + "loss": 0.0834, "macro_f1": 0.3333333432674408, "num_tokens": 216484.0, "repeat_count": 0.0, - "routers_loss": 0.017420046031475067, + "routers_loss": 0.016083380207419395, "skip_count": 0.0, "step": 134, "text_loss": 0.46990111470222473 @@ -1290,13 +1290,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2041015625, + "grad_norm": 0.220703125, "learning_rate": 0.00027, - "loss": 0.086, + "loss": 0.0863, "macro_f1": 0.3333333432674408, "num_tokens": 219398.0, "repeat_count": 0.0, - "routers_loss": 0.018217921257019043, + "routers_loss": 0.01733536459505558, "skip_count": 0.0, "step": 136, "text_loss": 0.4455361068248749 @@ -1309,13 +1309,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1806640625, "learning_rate": 0.00027400000000000005, - "loss": 0.0985, + "loss": 0.0997, "macro_f1": 0.3333333432674408, "num_tokens": 222430.0, "repeat_count": 0.0, - "routers_loss": 0.012350660748779774, + "routers_loss": 0.01332803163677454, "skip_count": 0.0, "step": 138, "text_loss": 0.47699397802352905 @@ -1328,13 +1328,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.302734375, + "grad_norm": 0.333984375, "learning_rate": 0.00027800000000000004, "loss": 0.0922, "macro_f1": 0.3144654333591461, "num_tokens": 225458.0, "repeat_count": 1.0, - "routers_loss": 0.14993029832839966, + "routers_loss": 0.14924728870391846, "skip_count": 2.0, "step": 140, "text_loss": 0.5858222842216492 @@ -1347,13 +1347,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.251953125, + "grad_norm": 0.25, "learning_rate": 0.00028199999999999997, - "loss": 0.0791, + "loss": 0.0798, "macro_f1": 0.3144654333591461, "num_tokens": 229365.0, "repeat_count": 1.0, - "routers_loss": 0.17921413481235504, + "routers_loss": 0.1860177218914032, "skip_count": 2.0, "step": 142, "text_loss": 0.5003137588500977 @@ -1366,13 +1366,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.2294921875, "learning_rate": 0.00028599999999999996, - "loss": 0.0535, + "loss": 0.054, "macro_f1": 0.32098764181137085, "num_tokens": 231787.0, "repeat_count": 1.0, - "routers_loss": 0.1420905590057373, + "routers_loss": 0.16498211026191711, "skip_count": 1.0, "step": 144, "text_loss": 0.5026470422744751 @@ -1385,13 +1385,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.29296875, + "grad_norm": 0.306640625, "learning_rate": 0.00029, - "loss": 0.0956, + "loss": 0.0936, "macro_f1": 0.32098764181137085, "num_tokens": 235014.0, "repeat_count": 1.0, - "routers_loss": 0.12468750029802322, + "routers_loss": 0.11801310628652573, "skip_count": 1.0, "step": 146, "text_loss": 0.611888587474823 @@ -1404,13 +1404,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1806640625, "learning_rate": 0.000294, - "loss": 0.0879, + "loss": 0.0878, "macro_f1": 0.3333333432674408, "num_tokens": 238210.0, "repeat_count": 0.0, - "routers_loss": 0.024295611307024956, + "routers_loss": 0.02422776259481907, "skip_count": 0.0, "step": 148, "text_loss": 0.2876914143562317 @@ -1423,13 +1423,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.1728515625, "learning_rate": 0.000298, - "loss": 0.087, + "loss": 0.0858, "macro_f1": 0.32098764181137085, "num_tokens": 241582.0, "repeat_count": 0.0, - "routers_loss": 0.07016433775424957, + "routers_loss": 0.07282499223947525, "skip_count": 2.0, "step": 150, "text_loss": 0.3919292390346527 @@ -1442,13 +1442,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3828125, + "grad_norm": 0.37890625, "learning_rate": 0.000302, - "loss": 0.0782, + "loss": 0.0797, "macro_f1": 0.32098764181137085, "num_tokens": 244621.0, "repeat_count": 1.0, - "routers_loss": 0.18942493200302124, + "routers_loss": 0.20659038424491882, "skip_count": 1.0, "step": 152, "text_loss": 0.4294498860836029 @@ -1461,13 +1461,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1787109375, "learning_rate": 0.000306, - "loss": 0.0713, + "loss": 0.072, "macro_f1": 0.3333333432674408, "num_tokens": 247833.0, "repeat_count": 0.0, - "routers_loss": 0.02319060079753399, + "routers_loss": 0.02428400330245495, "skip_count": 0.0, "step": 154, "text_loss": 0.5930765867233276 @@ -1480,13 +1480,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.15234375, + "grad_norm": 0.1533203125, "learning_rate": 0.00031, - "loss": 0.0778, + "loss": 0.0772, "macro_f1": 0.3333333432674408, "num_tokens": 251349.0, "repeat_count": 0.0, - "routers_loss": 0.01764747127890587, + "routers_loss": 0.0167869683355093, "skip_count": 0.0, "step": 156, "text_loss": 0.41063904762268066 @@ -1499,13 +1499,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1572265625, "learning_rate": 0.000314, - "loss": 0.0829, + "loss": 0.0821, "macro_f1": 0.3333333432674408, "num_tokens": 254886.0, "repeat_count": 0.0, - "routers_loss": 0.02268100716173649, + "routers_loss": 0.02531604655086994, "skip_count": 0.0, "step": 158, "text_loss": 0.6739020347595215 @@ -1518,13 +1518,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.201171875, "learning_rate": 0.00031800000000000003, - "loss": 0.0889, + "loss": 0.09, "macro_f1": 0.3333333432674408, "num_tokens": 258260.0, "repeat_count": 0.0, - "routers_loss": 0.016952091827988625, + "routers_loss": 0.017772775143384933, "skip_count": 0.0, "step": 160, "text_loss": 0.46873849630355835 @@ -1537,13 +1537,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2216796875, + "grad_norm": 0.224609375, "learning_rate": 0.000322, - "loss": 0.0923, + "loss": 0.0893, "macro_f1": 0.3272727429866791, "num_tokens": 261846.0, "repeat_count": 0.0, - "routers_loss": 0.03669808804988861, + "routers_loss": 0.034902360290288925, "skip_count": 1.0, "step": 162, "text_loss": 0.3727971017360687 @@ -1556,13 +1556,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.212890625, "learning_rate": 0.000326, - "loss": 0.0769, + "loss": 0.076, "macro_f1": 0.3333333432674408, "num_tokens": 264348.0, "repeat_count": 0.0, - "routers_loss": 0.012101447209715843, + "routers_loss": 0.013553355820477009, "skip_count": 0.0, "step": 164, "text_loss": 0.5798237323760986 @@ -1575,13 +1575,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.37109375, + "grad_norm": 0.408203125, "learning_rate": 0.00033, - "loss": 0.0897, + "loss": 0.0926, "macro_f1": 0.32098764181137085, "num_tokens": 267479.0, "repeat_count": 1.0, - "routers_loss": 0.1562056541442871, + "routers_loss": 0.13571743667125702, "skip_count": 1.0, "step": 166, "text_loss": 0.8084776997566223 @@ -1594,13 +1594,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.2431640625, "learning_rate": 0.00033400000000000004, - "loss": 0.0829, + "loss": 0.0817, "macro_f1": 0.32098764181137085, "num_tokens": 270268.0, "repeat_count": 2.0, - "routers_loss": 0.20807914435863495, + "routers_loss": 0.19884146749973297, "skip_count": 0.0, "step": 168, "text_loss": 0.7366134524345398 @@ -1613,13 +1613,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2236328125, + "grad_norm": 0.267578125, "learning_rate": 0.00033800000000000003, - "loss": 0.0987, + "loss": 0.1022, "macro_f1": 0.32098764181137085, "num_tokens": 273518.0, "repeat_count": 1.0, - "routers_loss": 0.1530539095401764, + "routers_loss": 0.15469175577163696, "skip_count": 1.0, "step": 170, "text_loss": 0.27204006910324097 @@ -1632,13 +1632,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.17578125, "learning_rate": 0.000342, - "loss": 0.087, + "loss": 0.0865, "macro_f1": 0.32098764181137085, "num_tokens": 277210.0, "repeat_count": 0.0, - "routers_loss": 0.08004544675350189, + "routers_loss": 0.08603330701589584, "skip_count": 2.0, "step": 172, "text_loss": 0.7137667536735535 @@ -1651,13 +1651,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1767578125, + "grad_norm": 0.189453125, "learning_rate": 0.000346, - "loss": 0.0916, + "loss": 0.0902, "macro_f1": 0.3076923191547394, "num_tokens": 280389.0, "repeat_count": 0.0, - "routers_loss": 0.19228078424930573, + "routers_loss": 0.17851492762565613, "skip_count": 4.0, "step": 174, "text_loss": 0.5148105621337891 @@ -1670,13 +1670,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1611328125, + "grad_norm": 0.1494140625, "learning_rate": 0.00035, - "loss": 0.0863, + "loss": 0.0853, "macro_f1": 0.3333333432674408, "num_tokens": 283501.0, "repeat_count": 0.0, - "routers_loss": 0.024507170543074608, + "routers_loss": 0.021331604570150375, "skip_count": 0.0, "step": 176, "text_loss": 0.301013320684433 @@ -1689,13 +1689,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.2158203125, "learning_rate": 0.000354, - "loss": 0.0898, + "loss": 0.0911, "macro_f1": 0.32098764181137085, "num_tokens": 287154.0, "repeat_count": 0.0, - "routers_loss": 0.05055495724081993, + "routers_loss": 0.057273946702480316, "skip_count": 2.0, "step": 178, "text_loss": 0.4740981459617615 @@ -1708,13 +1708,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.240234375, "learning_rate": 0.000358, - "loss": 0.0865, + "loss": 0.0904, "macro_f1": 0.3272727429866791, "num_tokens": 289929.0, "repeat_count": 0.0, - "routers_loss": 0.03999815881252289, + "routers_loss": 0.04116598889231682, "skip_count": 1.0, "step": 180, "text_loss": 0.4838573932647705 @@ -1727,13 +1727,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.14453125, "learning_rate": 0.000362, - "loss": 0.0983, + "loss": 0.0991, "macro_f1": 0.3333333432674408, "num_tokens": 294293.0, "repeat_count": 0.0, - "routers_loss": 0.025158070027828217, + "routers_loss": 0.027111956849694252, "skip_count": 0.0, "step": 182, "text_loss": 0.7495553493499756 @@ -1746,32 +1746,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.158203125, "learning_rate": 0.000366, - "loss": 0.1015, + "loss": 0.1038, "macro_f1": 0.3333333432674408, "num_tokens": 297730.0, "repeat_count": 0.0, - "routers_loss": 0.01825365424156189, + "routers_loss": 0.019166452810168266, "skip_count": 0.0, "step": 184, "text_loss": 0.534831166267395 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 0.8734957440563546, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.2158203125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2236328125, "learning_rate": 0.00037, - "loss": 0.0736, - "macro_f1": 0.3144654333591461, + "loss": 0.0784, + "macro_f1": 0.5427350401878357, "num_tokens": 300593.0, "repeat_count": 1.0, - "routers_loss": 0.22729666531085968, + "routers_loss": 0.2349659502506256, "skip_count": 2.0, "step": 186, "text_loss": 0.3549048602581024 @@ -1784,13 +1784,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.2041015625, "learning_rate": 0.000374, - "loss": 0.0838, + "loss": 0.0827, "macro_f1": 0.3076923191547394, "num_tokens": 303456.0, "repeat_count": 2.0, - "routers_loss": 0.24516475200653076, + "routers_loss": 0.22502389550209045, "skip_count": 2.0, "step": 188, "text_loss": 0.8837642073631287 @@ -1803,13 +1803,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2470703125, + "grad_norm": 0.271484375, "learning_rate": 0.000378, - "loss": 0.1056, + "loss": 0.1085, "macro_f1": 0.3272727429866791, "num_tokens": 306241.0, "repeat_count": 1.0, - "routers_loss": 0.1307530701160431, + "routers_loss": 0.12291611731052399, "skip_count": 0.0, "step": 190, "text_loss": 0.73353511095047 @@ -1822,13 +1822,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.15625, "learning_rate": 0.000382, - "loss": 0.0961, + "loss": 0.0969, "macro_f1": 0.3272727429866791, "num_tokens": 310606.0, "repeat_count": 0.0, - "routers_loss": 0.06541688740253448, + "routers_loss": 0.055988848209381104, "skip_count": 1.0, "step": 192, "text_loss": 0.6261917352676392 @@ -1841,13 +1841,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.333984375, + "grad_norm": 0.34375, "learning_rate": 0.000386, - "loss": 0.1058, + "loss": 0.1055, "macro_f1": 0.3144654333591461, "num_tokens": 313564.0, "repeat_count": 0.0, - "routers_loss": 0.12492545694112778, + "routers_loss": 0.12363404780626297, "skip_count": 3.0, "step": 194, "text_loss": 0.2790874242782593 @@ -1860,13 +1860,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28515625, + "grad_norm": 0.27734375, "learning_rate": 0.00039000000000000005, - "loss": 0.0966, + "loss": 0.0964, "macro_f1": 0.3076923191547394, "num_tokens": 316958.0, "repeat_count": 2.0, - "routers_loss": 0.2838033139705658, + "routers_loss": 0.2718356251716614, "skip_count": 2.0, "step": 196, "text_loss": 0.14428086578845978 @@ -1881,11 +1881,11 @@ "f1_skip": 0.0, "grad_norm": 0.2021484375, "learning_rate": 0.00039400000000000004, - "loss": 0.0929, + "loss": 0.0917, "macro_f1": 0.32098764181137085, "num_tokens": 320103.0, "repeat_count": 0.0, - "routers_loss": 0.07692629098892212, + "routers_loss": 0.07188102602958679, "skip_count": 2.0, "step": 198, "text_loss": 0.27155816555023193 @@ -1898,13 +1898,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.201171875, "learning_rate": 0.000398, "loss": 0.0809, "macro_f1": 0.32098764181137085, "num_tokens": 323566.0, "repeat_count": 1.0, - "routers_loss": 0.18504399061203003, + "routers_loss": 0.18038256466388702, "skip_count": 1.0, "step": 200, "text_loss": 0.8453494310379028 @@ -1917,13 +1917,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.2490234375, "learning_rate": 0.000402, - "loss": 0.078, + "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 326385.0, "repeat_count": 0.0, - "routers_loss": 0.014647359028458595, + "routers_loss": 0.014639763161540031, "skip_count": 0.0, "step": 202, "text_loss": 0.5733131766319275 @@ -1936,13 +1936,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2041015625, + "grad_norm": 0.21875, "learning_rate": 0.00040600000000000006, - "loss": 0.1028, + "loss": 0.104, "macro_f1": 0.3333333432674408, "num_tokens": 329266.0, "repeat_count": 0.0, - "routers_loss": 0.017848484218120575, + "routers_loss": 0.015269627794623375, "skip_count": 0.0, "step": 204, "text_loss": 0.7355639934539795 @@ -1955,13 +1955,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.27734375, "learning_rate": 0.00041, - "loss": 0.0832, + "loss": 0.0833, "macro_f1": 0.3333333432674408, "num_tokens": 332984.0, "repeat_count": 0.0, - "routers_loss": 0.01900508813560009, + "routers_loss": 0.018046971410512924, "skip_count": 0.0, "step": 206, "text_loss": 0.587641179561615 @@ -1974,13 +1974,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.166015625, + "grad_norm": 0.185546875, "learning_rate": 0.000414, "loss": 0.0588, "macro_f1": 0.3272727429866791, "num_tokens": 335739.0, "repeat_count": 1.0, - "routers_loss": 0.13018715381622314, + "routers_loss": 0.12791286408901215, "skip_count": 0.0, "step": 208, "text_loss": 0.6538406610488892 @@ -1993,13 +1993,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.24609375, "learning_rate": 0.00041799999999999997, - "loss": 0.0697, + "loss": 0.0732, "macro_f1": 0.3272727429866791, "num_tokens": 338966.0, "repeat_count": 0.0, - "routers_loss": 0.055288366973400116, + "routers_loss": 0.050490595400333405, "skip_count": 1.0, "step": 210, "text_loss": 0.4188295602798462 @@ -2012,13 +2012,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.271484375, "learning_rate": 0.000422, - "loss": 0.0576, + "loss": 0.0588, "macro_f1": 0.3144654333591461, "num_tokens": 342063.0, "repeat_count": 0.0, - "routers_loss": 0.10952572524547577, + "routers_loss": 0.11652113497257233, "skip_count": 3.0, "step": 212, "text_loss": 0.21822240948677063 @@ -2031,13 +2031,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.197265625, + "grad_norm": 0.2060546875, "learning_rate": 0.000426, - "loss": 0.062, + "loss": 0.0621, "macro_f1": 0.3333333432674408, "num_tokens": 344887.0, "repeat_count": 0.0, - "routers_loss": 0.02415696159005165, + "routers_loss": 0.023898238316178322, "skip_count": 0.0, "step": 214, "text_loss": 0.24692800641059875 @@ -2050,13 +2050,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.353515625, + "grad_norm": 0.3671875, "learning_rate": 0.00043, - "loss": 0.1011, + "loss": 0.1005, "macro_f1": 0.3272727429866791, "num_tokens": 348700.0, "repeat_count": 1.0, - "routers_loss": 0.06956391036510468, + "routers_loss": 0.06414655596017838, "skip_count": 0.0, "step": 216, "text_loss": 0.4744548797607422 @@ -2069,13 +2069,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1904296875, + "grad_norm": 0.1962890625, "learning_rate": 0.00043400000000000003, - "loss": 0.076, + "loss": 0.0753, "macro_f1": 0.32098764181137085, "num_tokens": 351507.0, "repeat_count": 1.0, - "routers_loss": 0.1140352189540863, + "routers_loss": 0.11702914535999298, "skip_count": 1.0, "step": 218, "text_loss": 0.5614864826202393 @@ -2090,11 +2090,11 @@ "f1_skip": 0.0, "grad_norm": 0.189453125, "learning_rate": 0.000438, - "loss": 0.0788, + "loss": 0.0792, "macro_f1": 0.3333333432674408, "num_tokens": 354484.0, "repeat_count": 0.0, - "routers_loss": 0.011621571145951748, + "routers_loss": 0.014991643838584423, "skip_count": 0.0, "step": 220, "text_loss": 0.47209832072257996 @@ -2107,13 +2107,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.240234375, + "grad_norm": 0.251953125, "learning_rate": 0.000442, "loss": 0.106, "macro_f1": 0.3272727429866791, "num_tokens": 357954.0, "repeat_count": 0.0, - "routers_loss": 0.05813701078295708, + "routers_loss": 0.04747112840414047, "skip_count": 1.0, "step": 222, "text_loss": 0.2968728244304657 @@ -2126,13 +2126,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.357421875, + "grad_norm": 0.40234375, "learning_rate": 0.000446, - "loss": 0.0827, + "loss": 0.0853, "macro_f1": 0.32098764181137085, "num_tokens": 360547.0, "repeat_count": 0.0, - "routers_loss": 0.0646885335445404, + "routers_loss": 0.06754162162542343, "skip_count": 2.0, "step": 224, "text_loss": 0.2364148646593094 @@ -2145,13 +2145,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.244140625, + "grad_norm": 0.2412109375, "learning_rate": 0.00045000000000000004, - "loss": 0.1011, + "loss": 0.1016, "macro_f1": 0.3272727429866791, "num_tokens": 364529.0, "repeat_count": 0.0, - "routers_loss": 0.07224348932504654, + "routers_loss": 0.07830183953046799, "skip_count": 1.0, "step": 226, "text_loss": 0.4787476360797882 @@ -2164,13 +2164,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.1953125, "learning_rate": 0.00045400000000000003, - "loss": 0.0781, + "loss": 0.0792, "macro_f1": 0.3333333432674408, "num_tokens": 367683.0, "repeat_count": 0.0, - "routers_loss": 0.015971746295690536, + "routers_loss": 0.015735948458313942, "skip_count": 0.0, "step": 228, "text_loss": 0.37148505449295044 @@ -2183,13 +2183,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.25, "learning_rate": 0.000458, - "loss": 0.099, + "loss": 0.0995, "macro_f1": 0.3333333432674408, "num_tokens": 371402.0, "repeat_count": 0.0, - "routers_loss": 0.017818331718444824, + "routers_loss": 0.013354359194636345, "skip_count": 0.0, "step": 230, "text_loss": 0.7464763522148132 @@ -2202,13 +2202,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1494140625, "learning_rate": 0.000462, - "loss": 0.0757, + "loss": 0.0731, "macro_f1": 0.3333333432674408, "num_tokens": 374587.0, "repeat_count": 0.0, - "routers_loss": 0.01582280732691288, + "routers_loss": 0.013763721100986004, "skip_count": 0.0, "step": 232, "text_loss": 0.8754443526268005 @@ -2221,13 +2221,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.42578125, + "grad_norm": 0.3984375, "learning_rate": 0.00046600000000000005, - "loss": 0.0876, + "loss": 0.0861, "macro_f1": 0.3333333432674408, "num_tokens": 377513.0, "repeat_count": 0.0, - "routers_loss": 0.011417915113270283, + "routers_loss": 0.010075435042381287, "skip_count": 0.0, "step": 234, "text_loss": 0.31534913182258606 @@ -2240,13 +2240,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.17578125, "learning_rate": 0.00047, - "loss": 0.0801, + "loss": 0.0791, "macro_f1": 0.3272727429866791, "num_tokens": 380736.0, "repeat_count": 0.0, - "routers_loss": 0.05787832289934158, + "routers_loss": 0.059825167059898376, "skip_count": 1.0, "step": 236, "text_loss": 0.5936337113380432 @@ -2259,13 +2259,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.236328125, + "grad_norm": 0.267578125, "learning_rate": 0.000474, - "loss": 0.0508, + "loss": 0.0514, "macro_f1": 0.32098764181137085, "num_tokens": 383236.0, "repeat_count": 0.0, - "routers_loss": 0.09476690739393234, + "routers_loss": 0.09134846180677414, "skip_count": 2.0, "step": 238, "text_loss": 0.5976157784461975 @@ -2278,13 +2278,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.208984375, "learning_rate": 0.00047799999999999996, - "loss": 0.0833, + "loss": 0.0858, "macro_f1": 0.32098764181137085, "num_tokens": 385778.0, "repeat_count": 1.0, - "routers_loss": 0.1099705696105957, + "routers_loss": 0.11989791691303253, "skip_count": 1.0, "step": 240, "text_loss": 0.3554210960865021 @@ -2297,13 +2297,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.171875, "learning_rate": 0.000482, - "loss": 0.0745, + "loss": 0.0734, "macro_f1": 0.3333333432674408, "num_tokens": 388777.0, "repeat_count": 0.0, - "routers_loss": 0.01269970741122961, + "routers_loss": 0.013591105118393898, "skip_count": 0.0, "step": 242, "text_loss": 0.4829460382461548 @@ -2316,13 +2316,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11962890625, + "grad_norm": 0.12060546875, "learning_rate": 0.000486, - "loss": 0.061, + "loss": 0.0625, "macro_f1": 0.32098764181137085, "num_tokens": 391797.0, "repeat_count": 0.0, - "routers_loss": 0.08505752682685852, + "routers_loss": 0.0920003354549408, "skip_count": 2.0, "step": 244, "text_loss": 0.3085818886756897 @@ -2335,13 +2335,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1552734375, "learning_rate": 0.00049, - "loss": 0.0504, + "loss": 0.0501, "macro_f1": 0.3333333432674408, "num_tokens": 396485.0, "repeat_count": 0.0, - "routers_loss": 0.012750142253935337, + "routers_loss": 0.0129330949857831, "skip_count": 0.0, "step": 246, "text_loss": 0.42803969979286194 @@ -2354,13 +2354,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.291015625, + "grad_norm": 0.296875, "learning_rate": 0.000494, - "loss": 0.0962, + "loss": 0.0945, "macro_f1": 0.3144654333591461, "num_tokens": 399923.0, "repeat_count": 0.0, - "routers_loss": 0.11287309974431992, + "routers_loss": 0.10677755624055862, "skip_count": 3.0, "step": 248, "text_loss": 0.2908555567264557 @@ -2373,32 +2373,32 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.203125, "learning_rate": 0.000498, - "loss": 0.0821, + "loss": 0.0812, "macro_f1": 0.3144654333591461, "num_tokens": 403647.0, "repeat_count": 0.0, - "routers_loss": 0.1486474722623825, + "routers_loss": 0.1504337340593338, "skip_count": 3.0, "step": 250, "text_loss": 0.333095908164978 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 1.183152333431171, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, + "f1_skip": 0.0, "grad_norm": 0.22265625, "learning_rate": 0.0005020000000000001, - "loss": 0.0832, - "macro_f1": 0.5492662787437439, + "loss": 0.0828, + "macro_f1": 0.32098764181137085, "num_tokens": 409147.0, "repeat_count": 0.0, - "routers_loss": 0.06636594980955124, + "routers_loss": 0.06503184884786606, "skip_count": 2.0, "step": 252, "text_loss": 0.16117942333221436 @@ -2411,13 +2411,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.267578125, + "grad_norm": 0.287109375, "learning_rate": 0.000506, - "loss": 0.1, + "loss": 0.0995, "macro_f1": 0.3333333432674408, "num_tokens": 412072.0, "repeat_count": 0.0, - "routers_loss": 0.015062150545418262, + "routers_loss": 0.016280122101306915, "skip_count": 0.0, "step": 254, "text_loss": 0.4217492640018463 @@ -2430,13 +2430,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2138671875, + "grad_norm": 0.21484375, "learning_rate": 0.00051, - "loss": 0.0808, + "loss": 0.0803, "macro_f1": 0.3144654333591461, "num_tokens": 415052.0, "repeat_count": 2.0, - "routers_loss": 0.2051105946302414, + "routers_loss": 0.2117508500814438, "skip_count": 1.0, "step": 256, "text_loss": 0.5795308947563171 @@ -2449,13 +2449,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2412109375, + "grad_norm": 0.2421875, "learning_rate": 0.000514, - "loss": 0.068, + "loss": 0.0668, "macro_f1": 0.3272727429866791, "num_tokens": 418099.0, "repeat_count": 1.0, - "routers_loss": 0.1467045396566391, + "routers_loss": 0.15002092719078064, "skip_count": 0.0, "step": 258, "text_loss": 0.4840938448905945 @@ -2468,13 +2468,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1533203125, "learning_rate": 0.000518, - "loss": 0.0543, + "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 422526.0, "repeat_count": 0.0, - "routers_loss": 0.013022038154304028, + "routers_loss": 0.012834074907004833, "skip_count": 0.0, "step": 260, "text_loss": 0.36141225695610046 @@ -2487,13 +2487,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.216796875, + "grad_norm": 0.2294921875, "learning_rate": 0.000522, - "loss": 0.0848, + "loss": 0.085, "macro_f1": 0.3076923191547394, "num_tokens": 425765.0, "repeat_count": 2.0, - "routers_loss": 0.2575930058956146, + "routers_loss": 0.23808011412620544, "skip_count": 2.0, "step": 262, "text_loss": 0.27572691440582275 @@ -2506,13 +2506,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.17578125, "learning_rate": 0.000526, - "loss": 0.07, + "loss": 0.0708, "macro_f1": 0.3272727429866791, "num_tokens": 429048.0, "repeat_count": 0.0, - "routers_loss": 0.0558602549135685, + "routers_loss": 0.055687375366687775, "skip_count": 1.0, "step": 264, "text_loss": 0.37020301818847656 @@ -2525,13 +2525,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.2080078125, "learning_rate": 0.0005300000000000001, - "loss": 0.082, + "loss": 0.0839, "macro_f1": 0.3272727429866791, "num_tokens": 431784.0, "repeat_count": 0.0, - "routers_loss": 0.09126655012369156, + "routers_loss": 0.0872957780957222, "skip_count": 1.0, "step": 266, "text_loss": 0.5937283039093018 @@ -2544,13 +2544,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.263671875, "learning_rate": 0.0005340000000000001, - "loss": 0.0764, + "loss": 0.0733, "macro_f1": 0.32098764181137085, "num_tokens": 434297.0, "repeat_count": 2.0, - "routers_loss": 0.24805288016796112, + "routers_loss": 0.23507654666900635, "skip_count": 0.0, "step": 268, "text_loss": 0.3367372453212738 @@ -2563,13 +2563,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.22265625, + "grad_norm": 0.2431640625, "learning_rate": 0.0005380000000000001, - "loss": 0.0686, + "loss": 0.0708, "macro_f1": 0.32098764181137085, "num_tokens": 437586.0, "repeat_count": 0.0, - "routers_loss": 0.13135533034801483, + "routers_loss": 0.12860390543937683, "skip_count": 2.0, "step": 270, "text_loss": 0.7149854302406311 @@ -2582,13 +2582,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.2451171875, "learning_rate": 0.0005420000000000001, - "loss": 0.1083, + "loss": 0.1072, "macro_f1": 0.3272727429866791, "num_tokens": 440649.0, "repeat_count": 0.0, - "routers_loss": 0.04991440102458, + "routers_loss": 0.044308312237262726, "skip_count": 1.0, "step": 272, "text_loss": 0.26778292655944824 @@ -2601,13 +2601,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.455078125, + "grad_norm": 0.44921875, "learning_rate": 0.000546, - "loss": 0.0991, + "loss": 0.0938, "macro_f1": 0.3144654333591461, "num_tokens": 443907.0, "repeat_count": 0.0, - "routers_loss": 0.12236632406711578, + "routers_loss": 0.11514109373092651, "skip_count": 3.0, "step": 274, "text_loss": 0.23578761518001556 @@ -2620,13 +2620,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.25, + "grad_norm": 0.2578125, "learning_rate": 0.00055, - "loss": 0.0936, + "loss": 0.0932, "macro_f1": 0.5492662787437439, "num_tokens": 447147.0, "repeat_count": 0.0, - "routers_loss": 0.053506772965192795, + "routers_loss": 0.055705297738313675, "skip_count": 2.0, "step": 276, "text_loss": 0.2513524889945984 @@ -2639,13 +2639,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.265625, + "grad_norm": 0.29296875, "learning_rate": 0.000554, - "loss": 0.066, + "loss": 0.0667, "macro_f1": 0.32098764181137085, "num_tokens": 450032.0, "repeat_count": 0.0, - "routers_loss": 0.13446088135242462, + "routers_loss": 0.13778971135616302, "skip_count": 2.0, "step": 278, "text_loss": 0.4857243597507477 @@ -2658,32 +2658,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.185546875, "learning_rate": 0.000558, - "loss": 0.0682, + "loss": 0.0672, "macro_f1": 0.3272727429866791, "num_tokens": 453195.0, "repeat_count": 1.0, - "routers_loss": 0.07270720601081848, + "routers_loss": 0.0700262188911438, "skip_count": 0.0, "step": 280, "text_loss": 0.7589789628982544 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 1.3240387437628411, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.943396270275116, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.28125, + "f1_skip": 0.0, + "grad_norm": 0.25, "learning_rate": 0.0005620000000000001, - "loss": 0.0648, - "macro_f1": 0.5427350401878357, + "loss": 0.0603, + "macro_f1": 0.3144654333591461, "num_tokens": 455942.0, "repeat_count": 1.0, - "routers_loss": 0.13866399228572845, + "routers_loss": 0.11706235259771347, "skip_count": 2.0, "step": 282, "text_loss": 0.4783432185649872 @@ -2696,13 +2696,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.236328125, + "grad_norm": 0.265625, "learning_rate": 0.000566, - "loss": 0.0782, + "loss": 0.0793, "macro_f1": 0.3272727429866791, "num_tokens": 458932.0, "repeat_count": 0.0, - "routers_loss": 0.0645354762673378, + "routers_loss": 0.07073967158794403, "skip_count": 1.0, "step": 284, "text_loss": 0.7117193937301636 @@ -2715,13 +2715,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1650390625, "learning_rate": 0.00057, - "loss": 0.0892, + "loss": 0.0915, "macro_f1": 0.3272727429866791, "num_tokens": 462650.0, "repeat_count": 0.0, - "routers_loss": 0.05967628210783005, + "routers_loss": 0.05301115661859512, "skip_count": 1.0, "step": 286, "text_loss": 0.4175460636615753 @@ -2734,13 +2734,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23828125, + "grad_norm": 0.2158203125, "learning_rate": 0.000574, - "loss": 0.0676, + "loss": 0.0675, "macro_f1": 0.3272727429866791, "num_tokens": 466290.0, "repeat_count": 0.0, - "routers_loss": 0.06438407301902771, + "routers_loss": 0.06356479972600937, "skip_count": 1.0, "step": 288, "text_loss": 0.5832946300506592 @@ -2753,13 +2753,13 @@ "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.28515625, "learning_rate": 0.000578, - "loss": 0.0781, + "loss": 0.0805, "macro_f1": 0.3006536066532135, "num_tokens": 469296.0, "repeat_count": 1.0, - "routers_loss": 0.21225209534168243, + "routers_loss": 0.21032999455928802, "skip_count": 3.0, "step": 290, "text_loss": 0.36023473739624023 @@ -2772,13 +2772,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.244140625, + "grad_norm": 0.27734375, "learning_rate": 0.0005819999999999999, - "loss": 0.0664, + "loss": 0.0685, "macro_f1": 0.32098764181137085, "num_tokens": 472272.0, "repeat_count": 1.0, - "routers_loss": 0.08085516840219498, + "routers_loss": 0.08062280714511871, "skip_count": 1.0, "step": 292, "text_loss": 0.37197956442832947 @@ -2791,13 +2791,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.28125, "learning_rate": 0.0005859999999999999, - "loss": 0.0874, + "loss": 0.0878, "macro_f1": 0.32098764181137085, "num_tokens": 475864.0, "repeat_count": 0.0, - "routers_loss": 0.05378658324480057, + "routers_loss": 0.05023600533604622, "skip_count": 2.0, "step": 294, "text_loss": 0.4765273630619049 @@ -2810,13 +2810,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.2177734375, "learning_rate": 0.00059, - "loss": 0.0715, + "loss": 0.0728, "macro_f1": 0.3333333432674408, "num_tokens": 478916.0, "repeat_count": 0.0, - "routers_loss": 0.01145261898636818, + "routers_loss": 0.011689410544931889, "skip_count": 0.0, "step": 296, "text_loss": 0.5878773927688599 @@ -2831,11 +2831,11 @@ "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.000594, - "loss": 0.0737, + "loss": 0.0727, "macro_f1": 0.3333333432674408, "num_tokens": 482369.0, "repeat_count": 0.0, - "routers_loss": 0.009397956542670727, + "routers_loss": 0.010772093199193478, "skip_count": 0.0, "step": 298, "text_loss": 0.4424116313457489 @@ -2848,13 +2848,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.181640625, "learning_rate": 0.000598, - "loss": 0.0802, + "loss": 0.0787, "macro_f1": 0.3076923191547394, "num_tokens": 486049.0, "repeat_count": 2.0, - "routers_loss": 0.2389357089996338, + "routers_loss": 0.23482851684093475, "skip_count": 2.0, "step": 300, "text_loss": 0.21217775344848633 @@ -2862,18 +2862,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 1.417963017317288, - "f1_execute": 0.9019607901573181, + "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.2080078125, "learning_rate": 0.000602, - "loss": 0.0745, - "macro_f1": 0.3006536066532135, + "loss": 0.073, + "macro_f1": 0.3076923191547394, "num_tokens": 488683.0, "repeat_count": 1.0, - "routers_loss": 0.18252353370189667, + "routers_loss": 0.18843084573745728, "skip_count": 3.0, "step": 302, "text_loss": 0.2109498232603073 @@ -2886,13 +2886,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.27734375, + "grad_norm": 0.279296875, "learning_rate": 0.000606, - "loss": 0.0935, + "loss": 0.0945, "macro_f1": 0.3144654333591461, "num_tokens": 492010.0, "repeat_count": 0.0, - "routers_loss": 0.18185268342494965, + "routers_loss": 0.17861786484718323, "skip_count": 3.0, "step": 304, "text_loss": 0.8446305394172668 @@ -2905,13 +2905,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.1943359375, "learning_rate": 0.00061, - "loss": 0.0853, + "loss": 0.0827, "macro_f1": 0.3333333432674408, "num_tokens": 494764.0, "repeat_count": 0.0, - "routers_loss": 0.013210167177021503, + "routers_loss": 0.014124520123004913, "skip_count": 0.0, "step": 306, "text_loss": 0.742735743522644 @@ -2924,13 +2924,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.26953125, "learning_rate": 0.000614, - "loss": 0.1089, + "loss": 0.1071, "macro_f1": 0.3333333432674408, "num_tokens": 497820.0, "repeat_count": 0.0, - "routers_loss": 0.016936838626861572, + "routers_loss": 0.017968112602829933, "skip_count": 0.0, "step": 308, "text_loss": 0.28305482864379883 @@ -2943,13 +2943,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.1689453125, "learning_rate": 0.0006180000000000001, - "loss": 0.077, + "loss": 0.0775, "macro_f1": 0.32098764181137085, "num_tokens": 500694.0, "repeat_count": 0.0, - "routers_loss": 0.08630389720201492, + "routers_loss": 0.08593655377626419, "skip_count": 2.0, "step": 310, "text_loss": 0.3496848940849304 @@ -2962,13 +2962,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.19140625, "learning_rate": 0.000622, - "loss": 0.0602, + "loss": 0.061, "macro_f1": 0.3333333432674408, "num_tokens": 503871.0, "repeat_count": 0.0, - "routers_loss": 0.013665963895618916, + "routers_loss": 0.016449492424726486, "skip_count": 0.0, "step": 312, "text_loss": 0.6691372990608215 @@ -2981,13 +2981,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.205078125, "learning_rate": 0.000626, - "loss": 0.0794, + "loss": 0.0815, "macro_f1": 0.3333333432674408, "num_tokens": 506730.0, "repeat_count": 0.0, - "routers_loss": 0.01584783010184765, + "routers_loss": 0.014532964676618576, "skip_count": 0.0, "step": 314, "text_loss": 0.6118118166923523 @@ -3000,13 +3000,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.224609375, + "grad_norm": 0.2216796875, "learning_rate": 0.00063, - "loss": 0.0762, + "loss": 0.0742, "macro_f1": 0.3333333432674408, "num_tokens": 510323.0, "repeat_count": 0.0, - "routers_loss": 0.01368923019617796, + "routers_loss": 0.013093139044940472, "skip_count": 0.0, "step": 316, "text_loss": 0.38126271963119507 @@ -3019,13 +3019,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.388671875, + "grad_norm": 0.400390625, "learning_rate": 0.000634, - "loss": 0.0908, + "loss": 0.0915, "macro_f1": 0.3333333432674408, "num_tokens": 514075.0, "repeat_count": 0.0, - "routers_loss": 0.009135022759437561, + "routers_loss": 0.008627045899629593, "skip_count": 0.0, "step": 318, "text_loss": 0.5983037948608398 @@ -3038,13 +3038,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.15234375, "learning_rate": 0.000638, - "loss": 0.0949, + "loss": 0.1008, "macro_f1": 0.3272727429866791, "num_tokens": 517418.0, "repeat_count": 0.0, - "routers_loss": 0.046641621738672256, + "routers_loss": 0.04561378434300423, "skip_count": 1.0, "step": 320, "text_loss": 0.767257034778595 @@ -3052,18 +3052,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 1.5118872908717347, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23046875, + "grad_norm": 0.259765625, "learning_rate": 0.000642, - "loss": 0.0925, - "macro_f1": 0.3333333432674408, + "loss": 0.0926, + "macro_f1": 0.3272727429866791, "num_tokens": 520443.0, "repeat_count": 0.0, - "routers_loss": 0.020637936890125275, + "routers_loss": 0.024372953921556473, "skip_count": 0.0, "step": 322, "text_loss": 0.6572105884552002 @@ -3076,13 +3076,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26953125, + "grad_norm": 0.30078125, "learning_rate": 0.000646, "loss": 0.0822, "macro_f1": 0.3272727429866791, "num_tokens": 523317.0, "repeat_count": 1.0, - "routers_loss": 0.08289298415184021, + "routers_loss": 0.08099937438964844, "skip_count": 0.0, "step": 324, "text_loss": 0.205499529838562 @@ -3090,18 +3090,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 1.530672145582624, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23828125, + "grad_norm": 0.2294921875, "learning_rate": 0.0006500000000000001, - "loss": 0.0823, - "macro_f1": 0.3272727429866791, + "loss": 0.0809, + "macro_f1": 0.32098767161369324, "num_tokens": 526355.0, "repeat_count": 0.0, - "routers_loss": 0.06960040330886841, + "routers_loss": 0.0657225176692009, "skip_count": 1.0, "step": 326, "text_loss": 0.2587239742279053 @@ -3114,13 +3114,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1162109375, + "grad_norm": 0.111328125, "learning_rate": 0.0006540000000000001, - "loss": 0.0799, + "loss": 0.0779, "macro_f1": 0.3333333432674408, "num_tokens": 529689.0, "repeat_count": 0.0, - "routers_loss": 0.02087482251226902, + "routers_loss": 0.01849208027124405, "skip_count": 0.0, "step": 328, "text_loss": 0.2172023057937622 @@ -3133,13 +3133,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.1845703125, "learning_rate": 0.0006580000000000001, - "loss": 0.0757, + "loss": 0.0758, "macro_f1": 0.3333333432674408, "num_tokens": 532603.0, "repeat_count": 0.0, - "routers_loss": 0.016592051833868027, + "routers_loss": 0.016184113919734955, "skip_count": 0.0, "step": 330, "text_loss": 0.5980568528175354 @@ -3152,32 +3152,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.22265625, + "grad_norm": 0.220703125, "learning_rate": 0.000662, - "loss": 0.0438, + "loss": 0.0439, "macro_f1": 0.3333333432674408, "num_tokens": 536056.0, "repeat_count": 0.0, - "routers_loss": 0.012950568459928036, + "routers_loss": 0.01303898449987173, "skip_count": 0.0, "step": 332, "text_loss": 0.5421966314315796 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, "epoch": 1.5682418550044028, - "f1_execute": 0.8799999952316284, + "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.310546875, + "f1_skip": 0.5, + "grad_norm": 0.296875, "learning_rate": 0.000666, - "loss": 0.0964, - "macro_f1": 0.29333335161209106, + "loss": 0.0963, + "macro_f1": 0.465986430644989, "num_tokens": 539231.0, "repeat_count": 3.0, - "routers_loss": 0.3373340964317322, + "routers_loss": 0.3075675964355469, "skip_count": 3.0, "step": 334, "text_loss": 0.19719554483890533 @@ -3190,13 +3190,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.171875, + "grad_norm": 0.173828125, "learning_rate": 0.00067, "loss": 0.0706, "macro_f1": 0.3333333432674408, "num_tokens": 542038.0, "repeat_count": 0.0, - "routers_loss": 0.008110735565423965, + "routers_loss": 0.009116224013268948, "skip_count": 0.0, "step": 336, "text_loss": 0.3407036066055298 @@ -3209,13 +3209,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.248046875, + "grad_norm": 0.2421875, "learning_rate": 0.000674, - "loss": 0.0771, + "loss": 0.0768, "macro_f1": 0.3333333432674408, "num_tokens": 545019.0, "repeat_count": 0.0, - "routers_loss": 0.01841609925031662, + "routers_loss": 0.021463042125105858, "skip_count": 0.0, "step": 338, "text_loss": 0.24486012756824493 @@ -3228,13 +3228,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1708984375, "learning_rate": 0.0006780000000000001, - "loss": 0.0894, + "loss": 0.0889, "macro_f1": 0.3333333432674408, "num_tokens": 548036.0, "repeat_count": 0.0, - "routers_loss": 0.01612614095211029, + "routers_loss": 0.01857556402683258, "skip_count": 0.0, "step": 340, "text_loss": 0.28140124678611755 @@ -3247,13 +3247,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.125, + "grad_norm": 0.130859375, "learning_rate": 0.0006820000000000001, - "loss": 0.0611, + "loss": 0.0617, "macro_f1": 0.3006536364555359, "num_tokens": 551419.0, "repeat_count": 2.0, - "routers_loss": 0.26202192902565, + "routers_loss": 0.27090007066726685, "skip_count": 3.0, "step": 342, "text_loss": 0.20690307021141052 @@ -3266,13 +3266,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.3046875, "learning_rate": 0.0006860000000000001, - "loss": 0.1013, + "loss": 0.1047, "macro_f1": 0.32098764181137085, "num_tokens": 554037.0, "repeat_count": 0.0, - "routers_loss": 0.09235779196023941, + "routers_loss": 0.09231195598840714, "skip_count": 2.0, "step": 344, "text_loss": 0.4479128420352936 @@ -3285,13 +3285,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.255859375, "learning_rate": 0.00069, - "loss": 0.0856, + "loss": 0.0883, "macro_f1": 0.3333333432674408, "num_tokens": 556672.0, "repeat_count": 0.0, - "routers_loss": 0.010735333897173405, + "routers_loss": 0.00935924518853426, "skip_count": 0.0, "step": 346, "text_loss": 0.6377320289611816 @@ -3304,13 +3304,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.2138671875, "learning_rate": 0.000694, - "loss": 0.0778, + "loss": 0.0781, "macro_f1": 0.32098764181137085, "num_tokens": 559756.0, "repeat_count": 0.0, - "routers_loss": 0.14742356538772583, + "routers_loss": 0.17641772329807281, "skip_count": 2.0, "step": 348, "text_loss": 0.6097636222839355 @@ -3323,13 +3323,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.30859375, + "grad_norm": 0.30078125, "learning_rate": 0.0006979999999999999, - "loss": 0.0614, + "loss": 0.0616, "macro_f1": 0.5492662787437439, "num_tokens": 563415.0, "repeat_count": 0.0, - "routers_loss": 0.06606879830360413, + "routers_loss": 0.06240406632423401, "skip_count": 2.0, "step": 350, "text_loss": 0.5291631817817688 @@ -3342,13 +3342,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.322265625, + "grad_norm": 0.296875, "learning_rate": 0.0007019999999999999, - "loss": 0.1033, + "loss": 0.1026, "macro_f1": 0.3333333432674408, "num_tokens": 566357.0, "repeat_count": 0.0, - "routers_loss": 0.012873432599008083, + "routers_loss": 0.012269247323274612, "skip_count": 0.0, "step": 352, "text_loss": 0.5170195698738098 @@ -3361,13 +3361,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.1435546875, "learning_rate": 0.0007059999999999999, - "loss": 0.0819, + "loss": 0.0815, "macro_f1": 0.32098764181137085, "num_tokens": 569449.0, "repeat_count": 0.0, - "routers_loss": 0.07853665202856064, + "routers_loss": 0.07515309751033783, "skip_count": 2.0, "step": 354, "text_loss": 0.34507250785827637 @@ -3380,13 +3380,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.251953125, + "grad_norm": 0.263671875, "learning_rate": 0.00071, - "loss": 0.0804, + "loss": 0.0791, "macro_f1": 0.3144654333591461, "num_tokens": 572761.0, "repeat_count": 1.0, - "routers_loss": 0.2216549813747406, + "routers_loss": 0.20768006145954132, "skip_count": 2.0, "step": 356, "text_loss": 0.3158532381057739 @@ -3399,13 +3399,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.185546875, + "grad_norm": 0.1884765625, "learning_rate": 0.000714, - "loss": 0.0675, + "loss": 0.0682, "macro_f1": 0.3333333432674408, "num_tokens": 575909.0, "repeat_count": 0.0, - "routers_loss": 0.02423691377043724, + "routers_loss": 0.025329967960715294, "skip_count": 0.0, "step": 358, "text_loss": 0.21455390751361847 @@ -3413,18 +3413,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 1.6903434106251836, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.21484375, "learning_rate": 0.000718, - "loss": 0.0781, - "macro_f1": 0.3272727429866791, + "loss": 0.0775, + "macro_f1": 0.32098767161369324, "num_tokens": 579186.0, "repeat_count": 1.0, - "routers_loss": 0.07496294379234314, + "routers_loss": 0.07676175981760025, "skip_count": 0.0, "step": 360, "text_loss": 0.61895352602005 @@ -3437,13 +3437,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2138671875, + "grad_norm": 0.197265625, "learning_rate": 0.000722, - "loss": 0.0778, + "loss": 0.0781, "macro_f1": 0.32098767161369324, "num_tokens": 582437.0, "repeat_count": 0.0, - "routers_loss": 0.08181872963905334, + "routers_loss": 0.08070661872625351, "skip_count": 1.0, "step": 362, "text_loss": 0.20557661354541779 @@ -3456,13 +3456,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.2216796875, "learning_rate": 0.000726, - "loss": 0.1112, + "loss": 0.11, "macro_f1": 0.3333333432674408, "num_tokens": 586096.0, "repeat_count": 0.0, - "routers_loss": 0.016959719359874725, + "routers_loss": 0.015891313552856445, "skip_count": 0.0, "step": 364, "text_loss": 0.597991943359375 @@ -3475,13 +3475,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.15625, "learning_rate": 0.00073, - "loss": 0.0577, + "loss": 0.0573, "macro_f1": 0.3076923191547394, "num_tokens": 589520.0, "repeat_count": 1.0, - "routers_loss": 0.13295969367027283, + "routers_loss": 0.12844261527061462, "skip_count": 3.0, "step": 366, "text_loss": 0.2944789230823517 @@ -3494,13 +3494,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1455078125, + "grad_norm": 0.150390625, "learning_rate": 0.000734, - "loss": 0.0986, + "loss": 0.1005, "macro_f1": 0.3333333432674408, "num_tokens": 592691.0, "repeat_count": 0.0, - "routers_loss": 0.02476893551647663, + "routers_loss": 0.02382199838757515, "skip_count": 0.0, "step": 368, "text_loss": 0.23989969491958618 @@ -3513,13 +3513,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1796875, "learning_rate": 0.000738, - "loss": 0.0682, + "loss": 0.0661, "macro_f1": 0.3333333432674408, "num_tokens": 596004.0, "repeat_count": 0.0, - "routers_loss": 0.019863395020365715, + "routers_loss": 0.018812084570527077, "skip_count": 0.0, "step": 370, "text_loss": 0.22111408412456512 @@ -3532,13 +3532,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.2412109375, "learning_rate": 0.000742, - "loss": 0.0663, + "loss": 0.0666, "macro_f1": 0.3272727429866791, "num_tokens": 599087.0, "repeat_count": 0.0, - "routers_loss": 0.07230417430400848, + "routers_loss": 0.08290331065654755, "skip_count": 1.0, "step": 372, "text_loss": 0.2567356526851654 @@ -3551,13 +3551,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.2412109375, "learning_rate": 0.000746, - "loss": 0.0986, + "loss": 0.0941, "macro_f1": 0.32098764181137085, "num_tokens": 602330.0, "repeat_count": 1.0, - "routers_loss": 0.11727793514728546, + "routers_loss": 0.11482042074203491, "skip_count": 1.0, "step": 374, "text_loss": 0.7217292785644531 @@ -3570,13 +3570,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.224609375, + "grad_norm": 0.2265625, "learning_rate": 0.00075, - "loss": 0.0724, + "loss": 0.0728, "macro_f1": 0.3272727429866791, "num_tokens": 605503.0, "repeat_count": 1.0, - "routers_loss": 0.13495951890945435, + "routers_loss": 0.11849870532751083, "skip_count": 0.0, "step": 376, "text_loss": 0.5122153759002686 @@ -3589,13 +3589,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23046875, + "grad_norm": 0.2333984375, "learning_rate": 0.000754, - "loss": 0.0823, + "loss": 0.0835, "macro_f1": 0.32098767161369324, "num_tokens": 608505.0, "repeat_count": 0.0, - "routers_loss": 0.07612533867359161, + "routers_loss": 0.07090992480516434, "skip_count": 1.0, "step": 378, "text_loss": 0.2204965502023697 @@ -3608,13 +3608,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.1826171875, "learning_rate": 0.000758, - "loss": 0.0803, + "loss": 0.0794, "macro_f1": 0.3272727429866791, "num_tokens": 611193.0, "repeat_count": 0.0, - "routers_loss": 0.0484120175242424, + "routers_loss": 0.03812089189887047, "skip_count": 1.0, "step": 380, "text_loss": 0.44909021258354187 @@ -3627,13 +3627,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1689453125, "learning_rate": 0.000762, - "loss": 0.0866, + "loss": 0.0882, "macro_f1": 0.3272727429866791, "num_tokens": 614231.0, "repeat_count": 1.0, - "routers_loss": 0.10939671844244003, + "routers_loss": 0.10270529240369797, "skip_count": 0.0, "step": 382, "text_loss": 0.13624964654445648 @@ -3646,13 +3646,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.326171875, + "grad_norm": 0.330078125, "learning_rate": 0.0007660000000000001, - "loss": 0.1083, + "loss": 0.1107, "macro_f1": 0.32098764181137085, "num_tokens": 617090.0, "repeat_count": 1.0, - "routers_loss": 0.11382336914539337, + "routers_loss": 0.11624004691839218, "skip_count": 1.0, "step": 384, "text_loss": 0.7314052581787109 @@ -3667,11 +3667,11 @@ "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.0007700000000000001, - "loss": 0.0616, + "loss": 0.0628, "macro_f1": 0.32098764181137085, "num_tokens": 620596.0, "repeat_count": 0.0, - "routers_loss": 0.07494530081748962, + "routers_loss": 0.07114322483539581, "skip_count": 2.0, "step": 386, "text_loss": 0.503322958946228 @@ -3684,13 +3684,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.298828125, + "grad_norm": 0.306640625, "learning_rate": 0.0007740000000000001, - "loss": 0.0816, + "loss": 0.0829, "macro_f1": 0.32098764181137085, "num_tokens": 624108.0, "repeat_count": 0.0, - "routers_loss": 0.05718417093157768, + "routers_loss": 0.06061873584985733, "skip_count": 2.0, "step": 388, "text_loss": 0.11481904983520508 @@ -3703,13 +3703,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1982421875, + "grad_norm": 0.2099609375, "learning_rate": 0.000778, - "loss": 0.0783, + "loss": 0.0791, "macro_f1": 0.3006536364555359, "num_tokens": 626895.0, "repeat_count": 1.0, - "routers_loss": 0.2848989963531494, + "routers_loss": 0.2921771705150604, "skip_count": 4.0, "step": 390, "text_loss": 0.3069624602794647 @@ -3722,13 +3722,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.30078125, + "grad_norm": 0.30859375, "learning_rate": 0.000782, - "loss": 0.0608, + "loss": 0.0605, "macro_f1": 0.3076923191547394, "num_tokens": 630204.0, "repeat_count": 0.0, - "routers_loss": 0.2050076276063919, + "routers_loss": 0.202707901597023, "skip_count": 4.0, "step": 392, "text_loss": 0.6022785305976868 @@ -3741,13 +3741,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28125, + "grad_norm": 0.29296875, "learning_rate": 0.000786, - "loss": 0.0863, + "loss": 0.0877, "macro_f1": 0.3333333432674408, "num_tokens": 634373.0, "repeat_count": 0.0, - "routers_loss": 0.020946886390447617, + "routers_loss": 0.0221510399132967, "skip_count": 0.0, "step": 394, "text_loss": 0.26787394285202026 @@ -3760,13 +3760,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.376953125, + "grad_norm": 0.37890625, "learning_rate": 0.00079, - "loss": 0.0798, + "loss": 0.0805, "macro_f1": 0.32098764181137085, "num_tokens": 637442.0, "repeat_count": 2.0, - "routers_loss": 0.1270289123058319, + "routers_loss": 0.12636390328407288, "skip_count": 0.0, "step": 396, "text_loss": 0.2799781560897827 @@ -3779,13 +3779,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.2080078125, "learning_rate": 0.0007940000000000001, - "loss": 0.0701, + "loss": 0.0724, "macro_f1": 0.32098764181137085, "num_tokens": 641231.0, "repeat_count": 0.0, - "routers_loss": 0.08012636005878448, + "routers_loss": 0.07933453470468521, "skip_count": 2.0, "step": 398, "text_loss": 0.2507784366607666 @@ -3798,13 +3798,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.2138671875, "learning_rate": 0.0007980000000000001, - "loss": 0.0901, + "loss": 0.0909, "macro_f1": 0.3272727429866791, "num_tokens": 644560.0, "repeat_count": 1.0, - "routers_loss": 0.09315784275531769, + "routers_loss": 0.10324911028146744, "skip_count": 0.0, "step": 400, "text_loss": 0.7756280303001404 @@ -3817,13 +3817,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2294921875, + "grad_norm": 0.2275390625, "learning_rate": 0.0008020000000000001, - "loss": 0.078, + "loss": 0.0783, "macro_f1": 0.3144654333591461, "num_tokens": 647393.0, "repeat_count": 1.0, - "routers_loss": 0.18492189049720764, + "routers_loss": 0.18546262383460999, "skip_count": 2.0, "step": 402, "text_loss": 0.5013328194618225 @@ -3836,13 +3836,13 @@ "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.283203125, "learning_rate": 0.0008060000000000001, - "loss": 0.0801, + "loss": 0.0787, "macro_f1": 0.2857142984867096, "num_tokens": 650355.0, "repeat_count": 3.0, - "routers_loss": 0.32641324400901794, + "routers_loss": 0.3280293643474579, "skip_count": 4.0, "step": 404, "text_loss": 0.2842077314853668 @@ -3855,13 +3855,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2080078125, + "grad_norm": 0.2138671875, "learning_rate": 0.0008100000000000001, - "loss": 0.0905, + "loss": 0.0901, "macro_f1": 0.3333333432674408, "num_tokens": 654280.0, "repeat_count": 0.0, - "routers_loss": 0.02722037397325039, + "routers_loss": 0.02623247355222702, "skip_count": 0.0, "step": 406, "text_loss": 0.46742817759513855 @@ -3874,13 +3874,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.216796875, "learning_rate": 0.0008139999999999999, - "loss": 0.0958, + "loss": 0.0945, "macro_f1": 0.3333333432674408, "num_tokens": 657568.0, "repeat_count": 0.0, - "routers_loss": 0.010129833593964577, + "routers_loss": 0.009744114242494106, "skip_count": 0.0, "step": 408, "text_loss": 0.7168047428131104 @@ -3893,13 +3893,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2373046875, + "grad_norm": 0.2158203125, "learning_rate": 0.0008179999999999999, - "loss": 0.1084, + "loss": 0.1065, "macro_f1": 0.32098764181137085, "num_tokens": 660593.0, "repeat_count": 0.0, - "routers_loss": 0.07298308610916138, + "routers_loss": 0.07591600716114044, "skip_count": 2.0, "step": 410, "text_loss": 0.449823260307312 @@ -3912,13 +3912,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.15625, + "grad_norm": 0.1396484375, "learning_rate": 0.0008219999999999999, - "loss": 0.0802, + "loss": 0.0795, "macro_f1": 0.3333333432674408, "num_tokens": 663916.0, "repeat_count": 0.0, - "routers_loss": 0.024257874116301537, + "routers_loss": 0.02076602540910244, "skip_count": 0.0, "step": 412, "text_loss": 0.4764713943004608 @@ -3931,13 +3931,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1904296875, + "grad_norm": 0.1650390625, "learning_rate": 0.000826, - "loss": 0.0842, + "loss": 0.0836, "macro_f1": 0.3272727429866791, "num_tokens": 667502.0, "repeat_count": 0.0, - "routers_loss": 0.048864223062992096, + "routers_loss": 0.049170155078172684, "skip_count": 1.0, "step": 414, "text_loss": 0.30333325266838074 @@ -3950,13 +3950,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1513671875, "learning_rate": 0.00083, - "loss": 0.1026, + "loss": 0.1021, "macro_f1": 0.3272727429866791, "num_tokens": 670510.0, "repeat_count": 1.0, - "routers_loss": 0.1592330038547516, + "routers_loss": 0.15554003417491913, "skip_count": 0.0, "step": 416, "text_loss": 0.3691870868206024 @@ -3969,13 +3969,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25390625, + "grad_norm": 0.263671875, "learning_rate": 0.000834, - "loss": 0.0963, + "loss": 0.1013, "macro_f1": 0.3333333432674408, "num_tokens": 674761.0, "repeat_count": 0.0, - "routers_loss": 0.02291976846754551, + "routers_loss": 0.024516675621271133, "skip_count": 0.0, "step": 418, "text_loss": 0.32850381731987 @@ -3988,13 +3988,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.10888671875, "learning_rate": 0.000838, - "loss": 0.0634, + "loss": 0.0649, "macro_f1": 0.3333333432674408, "num_tokens": 678055.0, "repeat_count": 0.0, - "routers_loss": 0.010272650048136711, + "routers_loss": 0.011026890948414803, "skip_count": 0.0, "step": 420, "text_loss": 0.6637290716171265 @@ -4007,13 +4007,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28125, + "grad_norm": 0.263671875, "learning_rate": 0.000842, - "loss": 0.0786, + "loss": 0.0771, "macro_f1": 0.3272727429866791, "num_tokens": 680979.0, "repeat_count": 0.0, - "routers_loss": 0.0692613497376442, + "routers_loss": 0.07451887428760529, "skip_count": 1.0, "step": 422, "text_loss": 0.27131685614585876 @@ -4026,13 +4026,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12890625, + "grad_norm": 0.1318359375, "learning_rate": 0.000846, - "loss": 0.0706, + "loss": 0.0714, "macro_f1": 0.32098764181137085, "num_tokens": 684144.0, "repeat_count": 1.0, - "routers_loss": 0.12713804841041565, + "routers_loss": 0.11341800540685654, "skip_count": 1.0, "step": 424, "text_loss": 0.652126669883728 @@ -4045,13 +4045,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.2158203125, "learning_rate": 0.00085, - "loss": 0.0758, + "loss": 0.0754, "macro_f1": 0.3272727429866791, "num_tokens": 687004.0, "repeat_count": 1.0, - "routers_loss": 0.08670130372047424, + "routers_loss": 0.08985847979784012, "skip_count": 0.0, "step": 426, "text_loss": 0.2589428424835205 @@ -4064,13 +4064,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.240234375, + "grad_norm": 0.23828125, "learning_rate": 0.000854, - "loss": 0.0857, + "loss": 0.0866, "macro_f1": 0.3333333432674408, "num_tokens": 689702.0, "repeat_count": 0.0, - "routers_loss": 0.01053862925618887, + "routers_loss": 0.011355436407029629, "skip_count": 0.0, "step": 428, "text_loss": 0.8909716010093689 @@ -4083,13 +4083,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1435546875, "learning_rate": 0.000858, - "loss": 0.0615, + "loss": 0.0623, "macro_f1": 0.3333333432674408, "num_tokens": 692698.0, "repeat_count": 0.0, - "routers_loss": 0.012946994043886662, + "routers_loss": 0.013788948766887188, "skip_count": 0.0, "step": 430, "text_loss": 0.19141142070293427 @@ -4102,13 +4102,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1552734375, "learning_rate": 0.000862, - "loss": 0.0498, + "loss": 0.0499, "macro_f1": 0.32098764181137085, "num_tokens": 696007.0, "repeat_count": 0.0, - "routers_loss": 0.08222822099924088, + "routers_loss": 0.07998392730951309, "skip_count": 2.0, "step": 432, "text_loss": 0.1611809879541397 @@ -4121,13 +4121,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.173828125, "learning_rate": 0.000866, - "loss": 0.0532, + "loss": 0.0541, "macro_f1": 0.32098764181137085, "num_tokens": 700271.0, "repeat_count": 0.0, - "routers_loss": 0.07086442410945892, + "routers_loss": 0.06988382339477539, "skip_count": 2.0, "step": 434, "text_loss": 0.37254223227500916 @@ -4140,13 +4140,13 @@ "f1_execute": 0.8333333730697632, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.1943359375, "learning_rate": 0.00087, - "loss": 0.0825, + "loss": 0.0834, "macro_f1": 0.2777777910232544, "num_tokens": 703519.0, "repeat_count": 3.0, - "routers_loss": 0.29007306694984436, + "routers_loss": 0.28240787982940674, "skip_count": 5.0, "step": 436, "text_loss": 0.29636648297309875 @@ -4159,13 +4159,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.396484375, + "grad_norm": 0.423828125, "learning_rate": 0.000874, - "loss": 0.0658, + "loss": 0.0657, "macro_f1": 0.3333333432674408, "num_tokens": 706826.0, "repeat_count": 0.0, - "routers_loss": 0.014652491547167301, + "routers_loss": 0.013924967497587204, "skip_count": 0.0, "step": 438, "text_loss": 0.20867908000946045 @@ -4178,13 +4178,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2294921875, + "grad_norm": 0.2353515625, "learning_rate": 0.000878, - "loss": 0.0685, + "loss": 0.0657, "macro_f1": 0.3333333432674408, "num_tokens": 710530.0, "repeat_count": 0.0, - "routers_loss": 0.013720969669520855, + "routers_loss": 0.01170142088085413, "skip_count": 0.0, "step": 440, "text_loss": 0.7273373007774353 @@ -4197,13 +4197,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.173828125, + "grad_norm": 0.171875, "learning_rate": 0.000882, - "loss": 0.0771, + "loss": 0.076, "macro_f1": 0.3333333432674408, "num_tokens": 713503.0, "repeat_count": 0.0, - "routers_loss": 0.011687638238072395, + "routers_loss": 0.011930872686207294, "skip_count": 0.0, "step": 442, "text_loss": 0.39314430952072144 @@ -4216,13 +4216,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.2490234375, "learning_rate": 0.0008860000000000001, - "loss": 0.0604, + "loss": 0.0592, "macro_f1": 0.3333333432674408, "num_tokens": 716582.0, "repeat_count": 0.0, - "routers_loss": 0.007869532331824303, + "routers_loss": 0.008630385622382164, "skip_count": 0.0, "step": 444, "text_loss": 0.5925271511077881 @@ -4230,18 +4230,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 2.0939242735544465, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.23046875, "learning_rate": 0.0008900000000000001, - "loss": 0.0797, - "macro_f1": 0.3076923191547394, + "loss": 0.0811, + "macro_f1": 0.3006536066532135, "num_tokens": 719941.0, "repeat_count": 3.0, - "routers_loss": 0.3034668564796448, + "routers_loss": 0.3015584945678711, "skip_count": 1.0, "step": 446, "text_loss": 0.5059905052185059 @@ -4254,13 +4254,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2314453125, + "grad_norm": 0.203125, "learning_rate": 0.000894, - "loss": 0.0823, + "loss": 0.0822, "macro_f1": 0.31446540355682373, "num_tokens": 723113.0, "repeat_count": 1.0, - "routers_loss": 0.11066079139709473, + "routers_loss": 0.10897493362426758, "skip_count": 1.0, "step": 448, "text_loss": 0.19616436958312988 @@ -4273,13 +4273,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3046875, + "grad_norm": 0.33984375, "learning_rate": 0.000898, - "loss": 0.0773, + "loss": 0.0782, "macro_f1": 0.32098764181137085, "num_tokens": 726193.0, "repeat_count": 0.0, - "routers_loss": 0.0755370482802391, + "routers_loss": 0.07236456125974655, "skip_count": 2.0, "step": 450, "text_loss": 0.1773054152727127 @@ -4292,13 +4292,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28125, + "grad_norm": 0.3203125, "learning_rate": 0.000902, - "loss": 0.0596, + "loss": 0.058, "macro_f1": 0.3272727429866791, "num_tokens": 729275.0, "repeat_count": 1.0, - "routers_loss": 0.08470689505338669, + "routers_loss": 0.08184371143579483, "skip_count": 0.0, "step": 452, "text_loss": 0.4927310049533844 @@ -4311,13 +4311,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19921875, + "grad_norm": 0.1953125, "learning_rate": 0.000906, - "loss": 0.0608, + "loss": 0.0607, "macro_f1": 0.3333333432674408, "num_tokens": 731948.0, "repeat_count": 0.0, - "routers_loss": 0.0130238626152277, + "routers_loss": 0.014033539220690727, "skip_count": 0.0, "step": 454, "text_loss": 0.4745742678642273 @@ -4330,13 +4330,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.154296875, "learning_rate": 0.00091, - "loss": 0.0652, + "loss": 0.0651, "macro_f1": 0.3333333432674408, "num_tokens": 735351.0, "repeat_count": 0.0, - "routers_loss": 0.007108641788363457, + "routers_loss": 0.0071774693205952644, "skip_count": 0.0, "step": 456, "text_loss": 0.18523462116718292 @@ -4351,11 +4351,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.400390625, "learning_rate": 0.0009140000000000001, - "loss": 0.0746, + "loss": 0.0738, "macro_f1": 0.5492662787437439, "num_tokens": 738587.0, "repeat_count": 0.0, - "routers_loss": 0.06834109872579575, + "routers_loss": 0.07781517505645752, "skip_count": 2.0, "step": 458, "text_loss": 0.3459635376930237 @@ -4368,13 +4368,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.28125, "learning_rate": 0.0009180000000000001, - "loss": 0.0733, + "loss": 0.0723, "macro_f1": 0.3076923191547394, "num_tokens": 741779.0, "repeat_count": 0.0, - "routers_loss": 0.10230778902769089, + "routers_loss": 0.09529037028551102, "skip_count": 2.0, "step": 460, "text_loss": 0.20197433233261108 @@ -4387,13 +4387,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.1865234375, "learning_rate": 0.0009220000000000001, - "loss": 0.0528, + "loss": 0.0519, "macro_f1": 0.3333333432674408, "num_tokens": 745355.0, "repeat_count": 0.0, - "routers_loss": 0.009987542405724525, + "routers_loss": 0.009765669703483582, "skip_count": 0.0, "step": 462, "text_loss": 0.7031404376029968 @@ -4406,13 +4406,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.125, + "grad_norm": 0.1298828125, "learning_rate": 0.0009260000000000001, - "loss": 0.0536, + "loss": 0.0527, "macro_f1": 0.3272727429866791, "num_tokens": 748628.0, "repeat_count": 0.0, - "routers_loss": 0.03448869287967682, + "routers_loss": 0.03344850242137909, "skip_count": 1.0, "step": 464, "text_loss": 0.21274663507938385 @@ -4425,13 +4425,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.173828125, "learning_rate": 0.00093, - "loss": 0.053, + "loss": 0.0534, "macro_f1": 0.3076923191547394, "num_tokens": 751472.0, "repeat_count": 2.0, - "routers_loss": 0.13631699979305267, + "routers_loss": 0.1354292333126068, "skip_count": 2.0, "step": 466, "text_loss": 0.5350717306137085 @@ -4444,13 +4444,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.142578125, "learning_rate": 0.000934, - "loss": 0.06, + "loss": 0.0598, "macro_f1": 0.3272727429866791, "num_tokens": 754479.0, "repeat_count": 0.0, - "routers_loss": 0.053951870650053024, + "routers_loss": 0.056420840322971344, "skip_count": 1.0, "step": 468, "text_loss": 0.28153330087661743 @@ -4463,13 +4463,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.228515625, + "grad_norm": 0.234375, "learning_rate": 0.0009379999999999999, - "loss": 0.059, + "loss": 0.0597, "macro_f1": 0.31446540355682373, "num_tokens": 757872.0, "repeat_count": 1.0, - "routers_loss": 0.14479905366897583, + "routers_loss": 0.1622387170791626, "skip_count": 1.0, "step": 470, "text_loss": 0.22956843674182892 @@ -4482,13 +4482,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.44140625, + "grad_norm": 0.5, "learning_rate": 0.000942, - "loss": 0.0913, + "loss": 0.0953, "macro_f1": 0.32098764181137085, "num_tokens": 760468.0, "repeat_count": 0.0, - "routers_loss": 0.056221429258584976, + "routers_loss": 0.05146972835063934, "skip_count": 2.0, "step": 472, "text_loss": 0.4513966739177704 @@ -4501,13 +4501,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1904296875, + "grad_norm": 0.212890625, "learning_rate": 0.000946, - "loss": 0.0591, + "loss": 0.0592, "macro_f1": 0.3272727429866791, "num_tokens": 763519.0, "repeat_count": 1.0, - "routers_loss": 0.09729792177677155, + "routers_loss": 0.09022669494152069, "skip_count": 0.0, "step": 474, "text_loss": 0.25758957862854004 @@ -4520,13 +4520,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12158203125, + "grad_norm": 0.1259765625, "learning_rate": 0.00095, - "loss": 0.0496, + "loss": 0.0498, "macro_f1": 0.3272727429866791, "num_tokens": 767391.0, "repeat_count": 0.0, - "routers_loss": 0.029447713866829872, + "routers_loss": 0.03044828027486801, "skip_count": 1.0, "step": 476, "text_loss": 0.21366681158542633 @@ -4539,13 +4539,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.271484375, + "grad_norm": 0.291015625, "learning_rate": 0.000954, - "loss": 0.0801, + "loss": 0.0802, "macro_f1": 0.3272727429866791, "num_tokens": 770338.0, "repeat_count": 0.0, - "routers_loss": 0.09337342530488968, + "routers_loss": 0.10397060960531235, "skip_count": 1.0, "step": 478, "text_loss": 1.0396177768707275 @@ -4560,11 +4560,11 @@ "f1_skip": 0.0, "grad_norm": 0.267578125, "learning_rate": 0.000958, - "loss": 0.1102, + "loss": 0.1099, "macro_f1": 0.285714328289032, "num_tokens": 773699.0, "repeat_count": 2.0, - "routers_loss": 0.23193210363388062, + "routers_loss": 0.22604143619537354, "skip_count": 4.0, "step": 480, "text_loss": 0.2570283114910126 @@ -4572,18 +4572,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 2.2629879659524508, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.146484375, "learning_rate": 0.000962, - "loss": 0.0669, - "macro_f1": 0.3272727429866791, + "loss": 0.0667, + "macro_f1": 0.32098767161369324, "num_tokens": 777473.0, "repeat_count": 0.0, - "routers_loss": 0.046257760375738144, + "routers_loss": 0.048258859664201736, "skip_count": 1.0, "step": 482, "text_loss": 0.2540103495121002 @@ -4596,13 +4596,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1708984375, + "grad_norm": 0.197265625, "learning_rate": 0.000966, - "loss": 0.0552, + "loss": 0.0592, "macro_f1": 0.3333333432674408, "num_tokens": 780833.0, "repeat_count": 0.0, - "routers_loss": 0.01683143898844719, + "routers_loss": 0.023018671199679375, "skip_count": 0.0, "step": 484, "text_loss": 0.38524550199508667 @@ -4615,13 +4615,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.326171875, + "grad_norm": 0.314453125, "learning_rate": 0.0009699999999999999, - "loss": 0.071, + "loss": 0.0709, "macro_f1": 0.3272727429866791, "num_tokens": 783656.0, "repeat_count": 0.0, - "routers_loss": 0.04129387438297272, + "routers_loss": 0.044845327734947205, "skip_count": 1.0, "step": 486, "text_loss": 0.5859048366546631 @@ -4634,13 +4634,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.263671875, "learning_rate": 0.000974, - "loss": 0.0605, + "loss": 0.0615, "macro_f1": 0.3333333432674408, "num_tokens": 787173.0, "repeat_count": 0.0, - "routers_loss": 0.01262948103249073, + "routers_loss": 0.010898692533373833, "skip_count": 0.0, "step": 488, "text_loss": 0.3456067442893982 @@ -4653,13 +4653,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.263671875, "learning_rate": 0.000978, - "loss": 0.081, + "loss": 0.0796, "macro_f1": 0.32098764181137085, "num_tokens": 790395.0, "repeat_count": 0.0, - "routers_loss": 0.07404553890228271, + "routers_loss": 0.06497956812381744, "skip_count": 2.0, "step": 490, "text_loss": 0.3751123249530792 @@ -4672,13 +4672,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.2158203125, "learning_rate": 0.000982, - "loss": 0.0751, + "loss": 0.0772, "macro_f1": 0.3272727429866791, "num_tokens": 793137.0, "repeat_count": 0.0, - "routers_loss": 0.06795930862426758, + "routers_loss": 0.07763728499412537, "skip_count": 1.0, "step": 492, "text_loss": 0.43296709656715393 @@ -4691,13 +4691,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.1416015625, "learning_rate": 0.0009860000000000001, - "loss": 0.0804, + "loss": 0.0819, "macro_f1": 0.3333333432674408, "num_tokens": 796497.0, "repeat_count": 0.0, - "routers_loss": 0.02233024686574936, + "routers_loss": 0.02127906307578087, "skip_count": 0.0, "step": 494, "text_loss": 0.4841311275959015 @@ -4710,13 +4710,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1953125, + "grad_norm": 0.2138671875, "learning_rate": 0.00099, - "loss": 0.0731, + "loss": 0.073, "macro_f1": 0.3272727429866791, "num_tokens": 799361.0, "repeat_count": 1.0, - "routers_loss": 0.07979031652212143, + "routers_loss": 0.09518691152334213, "skip_count": 0.0, "step": 496, "text_loss": 0.5094487071037292 @@ -4729,13 +4729,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1298828125, + "grad_norm": 0.130859375, "learning_rate": 0.000994, - "loss": 0.0795, + "loss": 0.0789, "macro_f1": 0.5492662787437439, "num_tokens": 802629.0, "repeat_count": 0.0, - "routers_loss": 0.045646365731954575, + "routers_loss": 0.0563947930932045, "skip_count": 2.0, "step": 498, "text_loss": 0.42783617973327637 @@ -4748,13 +4748,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1953125, + "grad_norm": 0.1865234375, "learning_rate": 0.000998, "loss": 0.0476, "macro_f1": 0.3272727429866791, "num_tokens": 805881.0, "repeat_count": 1.0, - "routers_loss": 0.09717849642038345, + "routers_loss": 0.10570426285266876, "skip_count": 0.0, "step": 500, "text_loss": 0.28395503759384155 @@ -4767,13 +4767,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.30078125, + "grad_norm": 0.2275390625, "learning_rate": 0.0009999999760498814, - "loss": 0.0894, + "loss": 0.0849, "macro_f1": 0.5492662787437439, "num_tokens": 809283.0, "repeat_count": 0.0, - "routers_loss": 0.03948225453495979, + "routers_loss": 0.031202208250761032, "skip_count": 2.0, "step": 502, "text_loss": 0.32970911264419556 @@ -4786,13 +4786,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.15625, + "grad_norm": 0.1455078125, "learning_rate": 0.0009999997844489475, - "loss": 0.0557, + "loss": 0.0574, "macro_f1": 0.3272727429866791, "num_tokens": 812440.0, "repeat_count": 0.0, - "routers_loss": 0.0742638111114502, + "routers_loss": 0.07647835463285446, "skip_count": 1.0, "step": 504, "text_loss": 0.4901447296142578 @@ -4805,13 +4805,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.25, "learning_rate": 0.000999999401247153, - "loss": 0.0682, + "loss": 0.0668, "macro_f1": 0.32098764181137085, "num_tokens": 815716.0, "repeat_count": 0.0, - "routers_loss": 0.08293049037456512, + "routers_loss": 0.08515176922082901, "skip_count": 2.0, "step": 506, "text_loss": 0.6157599687576294 @@ -4824,13 +4824,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.25390625, "learning_rate": 0.0009999988264446445, - "loss": 0.0697, + "loss": 0.0686, "macro_f1": 0.3333333432674408, "num_tokens": 819086.0, "repeat_count": 0.0, - "routers_loss": 0.010080376639962196, + "routers_loss": 0.00946938619017601, "skip_count": 0.0, "step": 508, "text_loss": 0.5053519010543823 @@ -4843,13 +4843,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1640625, "learning_rate": 0.0009999980600416424, - "loss": 0.0611, + "loss": 0.0574, "macro_f1": 0.3333333432674408, "num_tokens": 822268.0, "repeat_count": 0.0, - "routers_loss": 0.009179878048598766, + "routers_loss": 0.01058756373822689, "skip_count": 0.0, "step": 510, "text_loss": 0.5570021867752075 @@ -4862,13 +4862,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11083984375, + "grad_norm": 0.1240234375, "learning_rate": 0.000999997102038441, - "loss": 0.0689, + "loss": 0.0678, "macro_f1": 0.3333333432674408, "num_tokens": 825728.0, "repeat_count": 0.0, - "routers_loss": 0.006718529388308525, + "routers_loss": 0.008705209009349346, "skip_count": 0.0, "step": 512, "text_loss": 0.6519040465354919 @@ -4881,13 +4881,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.220703125, "learning_rate": 0.0009999959524354064, - "loss": 0.0826, + "loss": 0.083, "macro_f1": 0.3272727429866791, "num_tokens": 829459.0, "repeat_count": 0.0, - "routers_loss": 0.049344487488269806, + "routers_loss": 0.04024193435907364, "skip_count": 1.0, "step": 514, "text_loss": 0.5290043950080872 @@ -4900,13 +4900,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.259765625, + "grad_norm": 0.25390625, "learning_rate": 0.00099999461123298, - "loss": 0.0739, + "loss": 0.0727, "macro_f1": 0.3333333432674408, "num_tokens": 832291.0, "repeat_count": 0.0, - "routers_loss": 0.013402626849710941, + "routers_loss": 0.015742862597107887, "skip_count": 0.0, "step": 516, "text_loss": 0.7910057902336121 @@ -4919,13 +4919,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.2275390625, "learning_rate": 0.000999993078431675, - "loss": 0.0761, + "loss": 0.0759, "macro_f1": 0.3076923191547394, "num_tokens": 835399.0, "repeat_count": 1.0, - "routers_loss": 0.16964484751224518, + "routers_loss": 0.16753782331943512, "skip_count": 3.0, "step": 518, "text_loss": 0.45196083188056946 @@ -4938,13 +4938,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2236328125, + "grad_norm": 0.236328125, "learning_rate": 0.0009999913540320792, - "loss": 0.095, + "loss": 0.0968, "macro_f1": 0.31446540355682373, "num_tokens": 838993.0, "repeat_count": 0.0, - "routers_loss": 0.08609295636415482, + "routers_loss": 0.09357143193483353, "skip_count": 2.0, "step": 520, "text_loss": 0.5499435663223267 @@ -4957,13 +4957,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.2392578125, + "grad_norm": 0.2451171875, "learning_rate": 0.0009999894380348536, - "loss": 0.0816, + "loss": 0.0821, "macro_f1": 0.5492662787437439, "num_tokens": 842652.0, "repeat_count": 0.0, - "routers_loss": 0.05354784056544304, + "routers_loss": 0.056803856045007706, "skip_count": 2.0, "step": 522, "text_loss": 0.197520449757576 @@ -4976,13 +4976,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.2236328125, + "grad_norm": 0.2333984375, "learning_rate": 0.000999987330440732, - "loss": 0.0715, + "loss": 0.0725, "macro_f1": 0.4871794879436493, "num_tokens": 847061.0, "repeat_count": 0.0, - "routers_loss": 0.09146631509065628, + "routers_loss": 0.08962195366621017, "skip_count": 3.0, "step": 524, "text_loss": 0.27509039640426636 @@ -4995,13 +4995,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.189453125, "learning_rate": 0.000999985031250522, - "loss": 0.0574, + "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 850780.0, "repeat_count": 0.0, - "routers_loss": 0.02344255894422531, + "routers_loss": 0.022930558770895004, "skip_count": 0.0, "step": 526, "text_loss": 0.13291706144809723 @@ -5014,13 +5014,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1982421875, + "grad_norm": 0.197265625, "learning_rate": 0.0009999825404651053, - "loss": 0.0621, + "loss": 0.0614, "macro_f1": 0.3333333432674408, "num_tokens": 853886.0, "repeat_count": 0.0, - "routers_loss": 0.018271517008543015, + "routers_loss": 0.017097990959882736, "skip_count": 0.0, "step": 528, "text_loss": 0.21706295013427734 @@ -5033,13 +5033,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2060546875, + "grad_norm": 0.212890625, "learning_rate": 0.0009999798580854356, - "loss": 0.0717, + "loss": 0.0724, "macro_f1": 0.3333333432674408, "num_tokens": 857364.0, "repeat_count": 0.0, - "routers_loss": 0.026990914717316628, + "routers_loss": 0.02831801027059555, "skip_count": 0.0, "step": 530, "text_loss": 0.9035662412643433 @@ -5052,13 +5052,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16015625, + "grad_norm": 0.1591796875, "learning_rate": 0.000999976984112541, - "loss": 0.0681, + "loss": 0.0674, "macro_f1": 0.3333333432674408, "num_tokens": 860661.0, "repeat_count": 0.0, - "routers_loss": 0.019737249240279198, + "routers_loss": 0.019671892747282982, "skip_count": 0.0, "step": 532, "text_loss": 0.8354863524436951 @@ -5071,13 +5071,13 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.3046875, + "grad_norm": 0.2890625, "learning_rate": 0.0009999739185475231, - "loss": 0.0978, + "loss": 0.0963, "macro_f1": 0.47333335876464844, "num_tokens": 864124.0, "repeat_count": 2.0, - "routers_loss": 0.212640181183815, + "routers_loss": 0.21383361518383026, "skip_count": 3.0, "step": 534, "text_loss": 0.23422949016094208 @@ -5090,13 +5090,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.2490234375, "learning_rate": 0.0009999706613915565, - "loss": 0.0602, + "loss": 0.0598, "macro_f1": 0.32098767161369324, "num_tokens": 866976.0, "repeat_count": 0.0, - "routers_loss": 0.07302755117416382, + "routers_loss": 0.07158871740102768, "skip_count": 1.0, "step": 536, "text_loss": 0.11800774186849594 @@ -5109,13 +5109,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.296875, + "grad_norm": 0.26953125, "learning_rate": 0.0009999672126458894, - "loss": 0.0825, + "loss": 0.0822, "macro_f1": 0.3272727429866791, "num_tokens": 870549.0, "repeat_count": 0.0, - "routers_loss": 0.08667246252298355, + "routers_loss": 0.08185924589633942, "skip_count": 1.0, "step": 538, "text_loss": 0.19232480227947235 @@ -5128,13 +5128,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1318359375, + "grad_norm": 0.1396484375, "learning_rate": 0.000999963572311843, - "loss": 0.0597, + "loss": 0.0604, "macro_f1": 0.3333333432674408, "num_tokens": 873733.0, "repeat_count": 0.0, - "routers_loss": 0.015047167427837849, + "routers_loss": 0.01633382774889469, "skip_count": 0.0, "step": 540, "text_loss": 0.3725031912326813 @@ -5147,13 +5147,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.15234375, "learning_rate": 0.0009999597403908128, - "loss": 0.076, + "loss": 0.0761, "macro_f1": 0.3272727429866791, "num_tokens": 877099.0, "repeat_count": 0.0, - "routers_loss": 0.07481446117162704, + "routers_loss": 0.0782657191157341, "skip_count": 1.0, "step": 542, "text_loss": 0.17589199542999268 @@ -5166,13 +5166,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1943359375, + "grad_norm": 0.2177734375, "learning_rate": 0.0009999557168842669, - "loss": 0.0724, + "loss": 0.0716, "macro_f1": 0.5492662787437439, "num_tokens": 879883.0, "repeat_count": 0.0, - "routers_loss": 0.049495212733745575, + "routers_loss": 0.05275818333029747, "skip_count": 2.0, "step": 544, "text_loss": 0.26448264718055725 @@ -5185,13 +5185,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25390625, + "grad_norm": 0.2490234375, "learning_rate": 0.0009999515017937468, - "loss": 0.0718, + "loss": 0.071, "macro_f1": 0.32098764181137085, "num_tokens": 882223.0, "repeat_count": 0.0, - "routers_loss": 0.08043002337217331, + "routers_loss": 0.09335892647504807, "skip_count": 2.0, "step": 546, "text_loss": 0.208544060587883 @@ -5204,13 +5204,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.34765625, + "grad_norm": 0.376953125, "learning_rate": 0.0009999470951208684, - "loss": 0.086, + "loss": 0.0855, "macro_f1": 0.32098764181137085, "num_tokens": 885241.0, "repeat_count": 2.0, - "routers_loss": 0.22461950778961182, + "routers_loss": 0.22983254492282867, "skip_count": 0.0, "step": 548, "text_loss": 0.6612338423728943 @@ -5223,13 +5223,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.216796875, "learning_rate": 0.00099994249686732, - "loss": 0.0798, + "loss": 0.0786, "macro_f1": 0.3272727429866791, "num_tokens": 887897.0, "repeat_count": 1.0, - "routers_loss": 0.11754962801933289, + "routers_loss": 0.12858282029628754, "skip_count": 0.0, "step": 550, "text_loss": 0.4673548936843872 @@ -5242,13 +5242,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1611328125, + "grad_norm": 0.1591796875, "learning_rate": 0.0009999377070348638, - "loss": 0.0978, + "loss": 0.0944, "macro_f1": 0.3333333432674408, "num_tokens": 891224.0, "repeat_count": 0.0, - "routers_loss": 0.017412789165973663, + "routers_loss": 0.017421770840883255, "skip_count": 0.0, "step": 552, "text_loss": 0.6419258117675781 @@ -5261,13 +5261,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.15625, "learning_rate": 0.000999932725625335, - "loss": 0.0792, + "loss": 0.0791, "macro_f1": 0.32098764181137085, "num_tokens": 894578.0, "repeat_count": 0.0, - "routers_loss": 0.08969525247812271, + "routers_loss": 0.07890026271343231, "skip_count": 2.0, "step": 554, "text_loss": 0.5970752239227295 @@ -5280,13 +5280,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2158203125, + "grad_norm": 0.216796875, "learning_rate": 0.0009999275526406427, - "loss": 0.0803, + "loss": 0.0796, "macro_f1": 0.31446540355682373, "num_tokens": 897145.0, "repeat_count": 1.0, - "routers_loss": 0.09876437485218048, + "routers_loss": 0.09836960583925247, "skip_count": 1.0, "step": 556, "text_loss": 0.752425491809845 @@ -5299,13 +5299,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.1875, "learning_rate": 0.0009999221880827693, - "loss": 0.0887, + "loss": 0.0882, "macro_f1": 0.3333333432674408, "num_tokens": 900565.0, "repeat_count": 0.0, - "routers_loss": 0.019108204171061516, + "routers_loss": 0.017694659531116486, "skip_count": 0.0, "step": 558, "text_loss": 0.195619136095047 @@ -5318,32 +5318,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.2021484375, "learning_rate": 0.0009999166319537703, - "loss": 0.0573, + "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 903506.0, "repeat_count": 0.0, - "routers_loss": 0.019048813730478287, + "routers_loss": 0.019375264644622803, "skip_count": 0.0, "step": 560, "text_loss": 0.4603337347507477 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, "epoch": 2.638685060170238, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1435546875, + "f1_skip": 0.5, + "grad_norm": 0.146484375, "learning_rate": 0.0009999108842557748, - "loss": 0.0947, - "macro_f1": 0.3144654333591461, + "loss": 0.0953, + "macro_f1": 0.4871794879436493, "num_tokens": 906380.0, "repeat_count": 0.0, - "routers_loss": 0.11889495700597763, + "routers_loss": 0.12013207376003265, "skip_count": 3.0, "step": 562, "text_loss": 0.6279402375221252 @@ -5356,13 +5356,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.228515625, + "grad_norm": 0.255859375, "learning_rate": 0.0009999049449909854, - "loss": 0.0771, + "loss": 0.0799, "macro_f1": 0.3272727429866791, "num_tokens": 909116.0, "repeat_count": 0.0, - "routers_loss": 0.06202332302927971, + "routers_loss": 0.06441342830657959, "skip_count": 1.0, "step": 564, "text_loss": 0.23741699755191803 @@ -5375,13 +5375,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1513671875, + "grad_norm": 0.15234375, "learning_rate": 0.0009998988141616781, - "loss": 0.0623, + "loss": 0.064, "macro_f1": 0.32098767161369324, "num_tokens": 912189.0, "repeat_count": 0.0, - "routers_loss": 0.08294244855642319, + "routers_loss": 0.08309414982795715, "skip_count": 1.0, "step": 566, "text_loss": 0.27780941128730774 @@ -5394,13 +5394,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.1962890625, "learning_rate": 0.0009998924917702023, - "loss": 0.0885, + "loss": 0.0876, "macro_f1": 0.3272727429866791, "num_tokens": 916279.0, "repeat_count": 1.0, - "routers_loss": 0.07545182853937149, + "routers_loss": 0.07197169959545135, "skip_count": 0.0, "step": 568, "text_loss": 0.6371755599975586 @@ -5413,13 +5413,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.2255859375, "learning_rate": 0.0009998859778189806, - "loss": 0.0712, + "loss": 0.0706, "macro_f1": 0.3333333432674408, "num_tokens": 919490.0, "repeat_count": 0.0, - "routers_loss": 0.008711219765245914, + "routers_loss": 0.008022273890674114, "skip_count": 0.0, "step": 570, "text_loss": 0.6028938889503479 @@ -5432,13 +5432,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.1650390625, "learning_rate": 0.000999879272310509, - "loss": 0.0837, + "loss": 0.084, "macro_f1": 0.3333333432674408, "num_tokens": 923694.0, "repeat_count": 0.0, - "routers_loss": 0.01639273390173912, + "routers_loss": 0.01634674146771431, "skip_count": 0.0, "step": 572, "text_loss": 0.7177054286003113 @@ -5451,13 +5451,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1669921875, + "grad_norm": 0.17578125, "learning_rate": 0.0009998723752473574, - "loss": 0.0707, + "loss": 0.0716, "macro_f1": 0.3272727429866791, "num_tokens": 926933.0, "repeat_count": 0.0, - "routers_loss": 0.04997137933969498, + "routers_loss": 0.060559045523405075, "skip_count": 1.0, "step": 574, "text_loss": 0.5203254818916321 @@ -5470,13 +5470,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1845703125, + "grad_norm": 0.185546875, "learning_rate": 0.0009998652866321687, - "loss": 0.0799, + "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 929832.0, "repeat_count": 0.0, - "routers_loss": 0.011360209435224533, + "routers_loss": 0.011485611088573933, "skip_count": 0.0, "step": 576, "text_loss": 0.6147452592849731 @@ -5489,13 +5489,13 @@ "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1669921875, + "grad_norm": 0.1552734375, "learning_rate": 0.000999858006467659, - "loss": 0.0658, + "loss": 0.0649, "macro_f1": 0.29333335161209106, "num_tokens": 933266.0, "repeat_count": 2.0, - "routers_loss": 0.31349560618400574, + "routers_loss": 0.2929030954837799, "skip_count": 4.0, "step": 578, "text_loss": 0.1720666140317917 @@ -5508,13 +5508,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.24609375, "learning_rate": 0.0009998505347566186, - "loss": 0.0801, + "loss": 0.0782, "macro_f1": 0.32098764181137085, "num_tokens": 937545.0, "repeat_count": 0.0, - "routers_loss": 0.058660347014665604, + "routers_loss": 0.053780000656843185, "skip_count": 2.0, "step": 580, "text_loss": 0.3258405327796936 @@ -5527,13 +5527,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.1416015625, "learning_rate": 0.00099984287150191, - "loss": 0.0578, + "loss": 0.0582, "macro_f1": 0.3333333432674408, "num_tokens": 941001.0, "repeat_count": 0.0, - "routers_loss": 0.025836754590272903, + "routers_loss": 0.02637636847794056, "skip_count": 0.0, "step": 582, "text_loss": 0.23762771487236023 @@ -5546,13 +5546,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1552734375, "learning_rate": 0.0009998350167064705, - "loss": 0.0683, + "loss": 0.0672, "macro_f1": 0.3333333432674408, "num_tokens": 943989.0, "repeat_count": 0.0, - "routers_loss": 0.016504868865013123, + "routers_loss": 0.01637580618262291, "skip_count": 0.0, "step": 584, "text_loss": 0.7460582852363586 @@ -5565,13 +5565,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1787109375, + "grad_norm": 0.1884765625, "learning_rate": 0.0009998269703733096, - "loss": 0.0685, + "loss": 0.0686, "macro_f1": 0.3272727429866791, "num_tokens": 947245.0, "repeat_count": 1.0, - "routers_loss": 0.1379794180393219, + "routers_loss": 0.13934117555618286, "skip_count": 0.0, "step": 586, "text_loss": 0.5284690260887146 @@ -5584,13 +5584,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.13671875, "learning_rate": 0.0009998187325055106, - "loss": 0.0657, + "loss": 0.0667, "macro_f1": 0.3333333432674408, "num_tokens": 950116.0, "repeat_count": 0.0, - "routers_loss": 0.01802757754921913, + "routers_loss": 0.02138397842645645, "skip_count": 0.0, "step": 588, "text_loss": 0.3920256197452545 @@ -5603,13 +5603,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.1533203125, "learning_rate": 0.0009998103031062305, - "loss": 0.0762, + "loss": 0.0778, "macro_f1": 0.3333333432674408, "num_tokens": 953277.0, "repeat_count": 0.0, - "routers_loss": 0.006902900990098715, + "routers_loss": 0.007098200265318155, "skip_count": 0.0, "step": 590, "text_loss": 0.7472905516624451 @@ -5622,13 +5622,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3046875, + "grad_norm": 0.318359375, "learning_rate": 0.0009998016821786994, - "loss": 0.0912, + "loss": 0.0872, "macro_f1": 0.32098764181137085, "num_tokens": 958229.0, "repeat_count": 1.0, - "routers_loss": 0.08348741382360458, + "routers_loss": 0.07946522533893585, "skip_count": 1.0, "step": 592, "text_loss": 0.5506448745727539 @@ -5641,13 +5641,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1357421875, "learning_rate": 0.000999792869726221, - "loss": 0.0527, + "loss": 0.0523, "macro_f1": 0.3272727429866791, "num_tokens": 961016.0, "repeat_count": 0.0, - "routers_loss": 0.08290062099695206, + "routers_loss": 0.0850791186094284, "skip_count": 1.0, "step": 594, "text_loss": 0.3824431002140045 @@ -5660,13 +5660,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1650390625, "learning_rate": 0.0009997838657521717, - "loss": 0.0643, + "loss": 0.0632, "macro_f1": 0.3333333432674408, "num_tokens": 963847.0, "repeat_count": 0.0, - "routers_loss": 0.018620988354086876, + "routers_loss": 0.016370445489883423, "skip_count": 0.0, "step": 596, "text_loss": 0.2139475792646408 @@ -5679,13 +5679,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.12890625, "learning_rate": 0.0009997746702600026, - "loss": 0.073, + "loss": 0.0702, "macro_f1": 0.307692289352417, "num_tokens": 966619.0, "repeat_count": 0.0, - "routers_loss": 0.1211671382188797, + "routers_loss": 0.1310746818780899, "skip_count": 3.0, "step": 598, "text_loss": 0.3651018440723419 @@ -5698,13 +5698,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.23828125, "learning_rate": 0.0009997652832532372, - "loss": 0.079, + "loss": 0.0792, "macro_f1": 0.3272727429866791, "num_tokens": 970418.0, "repeat_count": 1.0, - "routers_loss": 0.15485027432441711, + "routers_loss": 0.14303378760814667, "skip_count": 0.0, "step": 600, "text_loss": 0.7094736099243164 @@ -5717,13 +5717,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009997557047354722, - "loss": 0.0562, + "loss": 0.0531, "macro_f1": 0.3272727429866791, "num_tokens": 973491.0, "repeat_count": 0.0, - "routers_loss": 0.036684274673461914, + "routers_loss": 0.03334212675690651, "skip_count": 1.0, "step": 602, "text_loss": 0.4812237024307251 @@ -5731,18 +5731,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 2.835926034634576, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.302734375, + "grad_norm": 0.2890625, "learning_rate": 0.0009997459347103783, - "loss": 0.0985, - "macro_f1": 0.3333333432674408, + "loss": 0.0956, + "macro_f1": 0.3272727429866791, "num_tokens": 976672.0, "repeat_count": 0.0, - "routers_loss": 0.026901578530669212, + "routers_loss": 0.02831871062517166, "skip_count": 0.0, "step": 604, "text_loss": 0.21737146377563477 @@ -5755,13 +5755,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12158203125, + "grad_norm": 0.1298828125, "learning_rate": 0.0009997359731816998, - "loss": 0.0632, + "loss": 0.0646, "macro_f1": 0.3333333432674408, "num_tokens": 979898.0, "repeat_count": 0.0, - "routers_loss": 0.01700405217707157, + "routers_loss": 0.017968013882637024, "skip_count": 0.0, "step": 606, "text_loss": 0.5458008050918579 @@ -5774,13 +5774,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2099609375, + "grad_norm": 0.224609375, "learning_rate": 0.0009997258201532536, - "loss": 0.0758, + "loss": 0.0751, "macro_f1": 0.3333333432674408, "num_tokens": 982811.0, "repeat_count": 0.0, - "routers_loss": 0.015013590455055237, + "routers_loss": 0.016256732866168022, "skip_count": 0.0, "step": 608, "text_loss": 0.8643257021903992 @@ -5793,13 +5793,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.2275390625, "learning_rate": 0.0009997154756289303, - "loss": 0.0576, + "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 985245.0, "repeat_count": 0.0, - "routers_loss": 0.02037946693599224, + "routers_loss": 0.021214161068201065, "skip_count": 0.0, "step": 610, "text_loss": 0.2204967886209488 @@ -5812,13 +5812,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.150390625, "learning_rate": 0.000999704939612694, - "loss": 0.0648, + "loss": 0.0636, "macro_f1": 0.3006536364555359, "num_tokens": 988539.0, "repeat_count": 3.0, - "routers_loss": 0.22834022343158722, + "routers_loss": 0.23249399662017822, "skip_count": 2.0, "step": 612, "text_loss": 0.32489025592803955 @@ -5831,13 +5831,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.095703125, "learning_rate": 0.0009996942121085824, - "loss": 0.0449, + "loss": 0.0445, "macro_f1": 0.3333333432674408, "num_tokens": 991660.0, "repeat_count": 0.0, - "routers_loss": 0.009838113561272621, + "routers_loss": 0.010706410743296146, "skip_count": 0.0, "step": 614, "text_loss": 0.4551754891872406 @@ -5850,13 +5850,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.353515625, + "grad_norm": 0.3671875, "learning_rate": 0.000999683293120706, - "loss": 0.1009, + "loss": 0.1016, "macro_f1": 0.3333333432674408, "num_tokens": 994828.0, "repeat_count": 0.0, - "routers_loss": 0.005943270865827799, + "routers_loss": 0.006676184479147196, "skip_count": 0.0, "step": 616, "text_loss": 0.6212068200111389 @@ -5869,13 +5869,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.38671875, + "grad_norm": 0.408203125, "learning_rate": 0.0009996721826532491, - "loss": 0.0941, + "loss": 0.0976, "macro_f1": 0.3076923191547394, "num_tokens": 997951.0, "repeat_count": 2.0, - "routers_loss": 0.21597740054130554, + "routers_loss": 0.2148125320672989, "skip_count": 2.0, "step": 618, "text_loss": 0.26514527201652527 @@ -5888,13 +5888,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.1904296875, "learning_rate": 0.000999660880710469, - "loss": 0.0896, + "loss": 0.0909, "macro_f1": 0.3333333432674408, "num_tokens": 1001139.0, "repeat_count": 0.0, - "routers_loss": 0.023726588115096092, + "routers_loss": 0.022332455962896347, "skip_count": 0.0, "step": 620, "text_loss": 0.26131340861320496 @@ -5907,13 +5907,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.169921875, "learning_rate": 0.0009996493872966971, "loss": 0.0732, "macro_f1": 0.3272727429866791, "num_tokens": 1003678.0, "repeat_count": 1.0, - "routers_loss": 0.08467255532741547, + "routers_loss": 0.08348730951547623, "skip_count": 0.0, "step": 622, "text_loss": 0.19151706993579865 @@ -5926,13 +5926,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.173828125, "learning_rate": 0.0009996377024163374, - "loss": 0.0816, + "loss": 0.0822, "macro_f1": 0.3333333432674408, "num_tokens": 1007082.0, "repeat_count": 0.0, - "routers_loss": 0.029468854889273643, + "routers_loss": 0.028577150776982307, "skip_count": 0.0, "step": 624, "text_loss": 0.305387407541275 @@ -5945,13 +5945,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.11279296875, "learning_rate": 0.0009996258260738676, - "loss": 0.0891, + "loss": 0.0892, "macro_f1": 0.3272727429866791, "num_tokens": 1010064.0, "repeat_count": 1.0, - "routers_loss": 0.09438466280698776, + "routers_loss": 0.08312026411294937, "skip_count": 0.0, "step": 626, "text_loss": 0.49436143040657043 @@ -5964,13 +5964,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1611328125, "learning_rate": 0.0009996137582738388, - "loss": 0.0581, + "loss": 0.0591, "macro_f1": 0.3333333432674408, "num_tokens": 1013462.0, "repeat_count": 0.0, - "routers_loss": 0.013679586350917816, + "routers_loss": 0.013337327167391777, "skip_count": 0.0, "step": 628, "text_loss": 0.6515294313430786 @@ -5983,13 +5983,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.140625, "learning_rate": 0.000999601499020875, - "loss": 0.0528, + "loss": 0.0537, "macro_f1": 0.3333333432674408, "num_tokens": 1016246.0, "repeat_count": 0.0, - "routers_loss": 0.029532987624406815, + "routers_loss": 0.029126765206456184, "skip_count": 0.0, "step": 630, "text_loss": 0.18834827840328217 @@ -6002,13 +6002,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.095703125, "learning_rate": 0.0009995890483196746, - "loss": 0.0601, + "loss": 0.0602, "macro_f1": 0.3272727429866791, "num_tokens": 1019286.0, "repeat_count": 0.0, - "routers_loss": 0.05516733601689339, + "routers_loss": 0.054844800382852554, "skip_count": 1.0, "step": 632, "text_loss": 0.6988179087638855 @@ -6021,13 +6021,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.357421875, + "grad_norm": 0.322265625, "learning_rate": 0.0009995764061750086, - "loss": 0.0785, + "loss": 0.0767, "macro_f1": 0.3333333432674408, "num_tokens": 1022207.0, "repeat_count": 0.0, - "routers_loss": 0.010254866443574429, + "routers_loss": 0.010095693171024323, "skip_count": 0.0, "step": 634, "text_loss": 0.558451771736145 @@ -6040,13 +6040,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.2890625, "learning_rate": 0.000999563572591721, - "loss": 0.0518, + "loss": 0.0521, "macro_f1": 0.32098764181137085, "num_tokens": 1025319.0, "repeat_count": 1.0, - "routers_loss": 0.07528360933065414, + "routers_loss": 0.0698433518409729, "skip_count": 1.0, "step": 636, "text_loss": 0.5961872935295105 @@ -6059,13 +6059,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1064453125, + "grad_norm": 0.11083984375, "learning_rate": 0.0009995505475747302, - "loss": 0.0844, + "loss": 0.0849, "macro_f1": 0.3272727429866791, "num_tokens": 1028362.0, "repeat_count": 0.0, - "routers_loss": 0.04301584139466286, + "routers_loss": 0.040211405605077744, "skip_count": 1.0, "step": 638, "text_loss": 0.546863317489624 @@ -6078,13 +6078,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11572265625, + "grad_norm": 0.119140625, "learning_rate": 0.0009995373311290272, - "loss": 0.0699, + "loss": 0.0709, "macro_f1": 0.3144654333591461, "num_tokens": 1032199.0, "repeat_count": 2.0, - "routers_loss": 0.14521080255508423, + "routers_loss": 0.1457643061876297, "skip_count": 1.0, "step": 640, "text_loss": 0.2137298285961151 @@ -6097,13 +6097,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1328125, + "grad_norm": 0.1279296875, "learning_rate": 0.0009995239232596764, - "loss": 0.0543, + "loss": 0.0545, "macro_f1": 0.3333333432674408, "num_tokens": 1035801.0, "repeat_count": 0.0, - "routers_loss": 0.01074797473847866, + "routers_loss": 0.011394930072128773, "skip_count": 0.0, "step": 642, "text_loss": 0.43054503202438354 @@ -6116,13 +6116,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1005859375, + "grad_norm": 0.1015625, "learning_rate": 0.0009995103239718163, - "loss": 0.0659, + "loss": 0.0665, "macro_f1": 0.3333333432674408, "num_tokens": 1039223.0, "repeat_count": 0.0, - "routers_loss": 0.009271817281842232, + "routers_loss": 0.00997432041913271, "skip_count": 0.0, "step": 644, "text_loss": 0.7749615907669067 @@ -6135,13 +6135,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1953125, + "grad_norm": 0.2275390625, "learning_rate": 0.0009994965332706573, - "loss": 0.0737, + "loss": 0.0755, "macro_f1": 0.3144654333591461, "num_tokens": 1042154.0, "repeat_count": 3.0, - "routers_loss": 0.10257050395011902, + "routers_loss": 0.10589150339365005, "skip_count": 0.0, "step": 646, "text_loss": 0.7812211513519287 @@ -6154,13 +6154,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.1943359375, "learning_rate": 0.0009994825511614846, - "loss": 0.0363, + "loss": 0.0383, "macro_f1": 0.3272727429866791, "num_tokens": 1045250.0, "repeat_count": 0.0, - "routers_loss": 0.07091924548149109, + "routers_loss": 0.0748734176158905, "skip_count": 1.0, "step": 648, "text_loss": 0.844803512096405 @@ -6173,13 +6173,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11572265625, + "grad_norm": 0.1220703125, "learning_rate": 0.0009994683776496562, - "loss": 0.0421, + "loss": 0.0433, "macro_f1": 0.3272727429866791, "num_tokens": 1048446.0, "repeat_count": 0.0, - "routers_loss": 0.034446243196725845, + "routers_loss": 0.03742415830492973, "skip_count": 1.0, "step": 650, "text_loss": 0.2098839282989502 @@ -6192,13 +6192,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.12890625, "learning_rate": 0.0009994540127406034, - "loss": 0.0593, + "loss": 0.0591, "macro_f1": 0.32098764181137085, "num_tokens": 1051840.0, "repeat_count": 0.0, - "routers_loss": 0.06077485531568527, + "routers_loss": 0.06025516986846924, "skip_count": 2.0, "step": 652, "text_loss": 0.27727583050727844 @@ -6211,13 +6211,13 @@ "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.2294921875, + "grad_norm": 0.181640625, "learning_rate": 0.0009994394564398306, - "loss": 0.0537, + "loss": 0.0519, "macro_f1": 0.521541953086853, "num_tokens": 1055142.0, "repeat_count": 4.0, - "routers_loss": 0.2382282167673111, + "routers_loss": 0.22807340323925018, "skip_count": 2.0, "step": 654, "text_loss": 0.9672397971153259 @@ -6230,13 +6230,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.142578125, + "grad_norm": 0.130859375, "learning_rate": 0.0009994247087529158, - "loss": 0.0613, + "loss": 0.0618, "macro_f1": 0.3333333432674408, "num_tokens": 1057698.0, "repeat_count": 0.0, - "routers_loss": 0.011971636675298214, + "routers_loss": 0.01348950993269682, "skip_count": 0.0, "step": 656, "text_loss": 0.6375506520271301 @@ -6249,13 +6249,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.212890625, + "grad_norm": 0.1953125, "learning_rate": 0.0009994097696855106, - "loss": 0.0414, + "loss": 0.0412, "macro_f1": 0.3333333432674408, "num_tokens": 1060624.0, "repeat_count": 0.0, - "routers_loss": 0.010221127420663834, + "routers_loss": 0.009649243205785751, "skip_count": 0.0, "step": 658, "text_loss": 0.5315385460853577 @@ -6268,13 +6268,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2265625, + "grad_norm": 0.2041015625, "learning_rate": 0.0009993946392433395, - "loss": 0.061, + "loss": 0.0609, "macro_f1": 0.307692289352417, "num_tokens": 1065076.0, "repeat_count": 0.0, - "routers_loss": 0.11860335618257523, + "routers_loss": 0.1250980943441391, "skip_count": 3.0, "step": 660, "text_loss": 0.25780341029167175 @@ -6287,13 +6287,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.1640625, "learning_rate": 0.0009993793174322006, - "loss": 0.0485, + "loss": 0.0471, "macro_f1": 0.3333333432674408, "num_tokens": 1068365.0, "repeat_count": 0.0, - "routers_loss": 0.011139829643070698, + "routers_loss": 0.011544390581548214, "skip_count": 0.0, "step": 662, "text_loss": 0.34876301884651184 @@ -6306,13 +6306,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.166015625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009993638042579654, - "loss": 0.0478, + "loss": 0.0473, "macro_f1": 0.3272727429866791, "num_tokens": 1071693.0, "repeat_count": 0.0, - "routers_loss": 0.03978770971298218, + "routers_loss": 0.03777370601892471, "skip_count": 1.0, "step": 664, "text_loss": 0.21811571717262268 @@ -6327,11 +6327,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.203125, "learning_rate": 0.0009993480997265783, - "loss": 0.0481, + "loss": 0.0475, "macro_f1": 0.5492662787437439, "num_tokens": 1074733.0, "repeat_count": 0.0, - "routers_loss": 0.051231011748313904, + "routers_loss": 0.049949806183576584, "skip_count": 2.0, "step": 666, "text_loss": 0.38410288095474243 @@ -6344,13 +6344,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.10302734375, "learning_rate": 0.0009993322038440572, - "loss": 0.0615, + "loss": 0.0605, "macro_f1": 0.3333333432674408, "num_tokens": 1077993.0, "repeat_count": 0.0, - "routers_loss": 0.024917088449001312, + "routers_loss": 0.0247171800583601, "skip_count": 0.0, "step": 668, "text_loss": 0.25576895475387573 @@ -6363,13 +6363,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1982421875, + "grad_norm": 0.216796875, "learning_rate": 0.000999316116616494, - "loss": 0.0627, + "loss": 0.0619, "macro_f1": 0.3333333432674408, "num_tokens": 1080491.0, "repeat_count": 0.0, - "routers_loss": 0.008834881708025932, + "routers_loss": 0.008118715137243271, "skip_count": 0.0, "step": 670, "text_loss": 0.6269792914390564 @@ -6382,13 +6382,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.173828125, "learning_rate": 0.0009992998380500527, "loss": 0.0462, "macro_f1": 0.3272727429866791, "num_tokens": 1083817.0, "repeat_count": 0.0, - "routers_loss": 0.033405229449272156, + "routers_loss": 0.03366057574748993, "skip_count": 1.0, "step": 672, "text_loss": 0.26891493797302246 @@ -6401,13 +6401,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.13671875, + "grad_norm": 0.1474609375, "learning_rate": 0.0009992833681509716, - "loss": 0.0523, + "loss": 0.0529, "macro_f1": 0.3333333432674408, "num_tokens": 1087368.0, "repeat_count": 0.0, - "routers_loss": 0.020753704011440277, + "routers_loss": 0.020552074536681175, "skip_count": 0.0, "step": 674, "text_loss": 0.14421936869621277 @@ -6420,13 +6420,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.18359375, "learning_rate": 0.0009992667069255619, - "loss": 0.0698, + "loss": 0.0696, "macro_f1": 0.31446540355682373, "num_tokens": 1090452.0, "repeat_count": 0.0, - "routers_loss": 0.06932353973388672, + "routers_loss": 0.06937336176633835, "skip_count": 2.0, "step": 676, "text_loss": 0.24999259412288666 @@ -6439,13 +6439,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.08740234375, "learning_rate": 0.0009992498543802085, - "loss": 0.059, + "loss": 0.0588, "macro_f1": 0.3272727429866791, "num_tokens": 1093996.0, "repeat_count": 1.0, - "routers_loss": 0.032903749495744705, + "routers_loss": 0.0380021296441555, "skip_count": 0.0, "step": 678, "text_loss": 0.42473849654197693 @@ -6458,32 +6458,32 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.2099609375, + "grad_norm": 0.2119140625, "learning_rate": 0.0009992328105213688, - "loss": 0.0417, + "loss": 0.0411, "macro_f1": 0.4400000274181366, "num_tokens": 1096837.0, "repeat_count": 1.0, - "routers_loss": 0.19733747839927673, + "routers_loss": 0.20885063707828522, "skip_count": 4.0, "step": 680, "text_loss": 0.3829527199268341 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 26.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 3.2019371881420606, - "f1_execute": 1.0, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.154296875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1474609375, "learning_rate": 0.0009992155753555747, - "loss": 0.0729, - "macro_f1": 0.6666666865348816, + "loss": 0.0722, + "macro_f1": 0.5492662787437439, "num_tokens": 1100320.0, "repeat_count": 0.0, - "routers_loss": 0.013452666811645031, + "routers_loss": 0.018230699002742767, "skip_count": 2.0, "step": 682, "text_loss": 0.6190969944000244 @@ -6496,13 +6496,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.30859375, "learning_rate": 0.0009991981488894303, "loss": 0.0681, "macro_f1": 0.32098767161369324, "num_tokens": 1103682.0, "repeat_count": 0.0, - "routers_loss": 0.05302857980132103, + "routers_loss": 0.05550144240260124, "skip_count": 1.0, "step": 684, "text_loss": 0.44418027997016907 @@ -6515,13 +6515,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.2158203125, "learning_rate": 0.0009991805311296133, - "loss": 0.0527, + "loss": 0.0507, "macro_f1": 0.32098764181137085, "num_tokens": 1106427.0, "repeat_count": 0.0, - "routers_loss": 0.08124994486570358, + "routers_loss": 0.07990608364343643, "skip_count": 2.0, "step": 686, "text_loss": 0.5577231645584106 @@ -6534,13 +6534,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.22265625, + "grad_norm": 0.1962890625, "learning_rate": 0.0009991627220828753, - "loss": 0.0579, + "loss": 0.0568, "macro_f1": 0.32098764181137085, "num_tokens": 1109314.0, "repeat_count": 0.0, - "routers_loss": 0.058633625507354736, + "routers_loss": 0.05167485028505325, "skip_count": 2.0, "step": 688, "text_loss": 0.27325430512428284 @@ -6553,13 +6553,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1142578125, + "grad_norm": 0.10693359375, "learning_rate": 0.0009991447217560408, - "loss": 0.0533, + "loss": 0.0521, "macro_f1": 0.5492662787437439, "num_tokens": 1112748.0, "repeat_count": 0.0, - "routers_loss": 0.04703643172979355, + "routers_loss": 0.04621964320540428, "skip_count": 2.0, "step": 690, "text_loss": 0.5288321375846863 @@ -6572,13 +6572,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.1962890625, "learning_rate": 0.000999126530156007, - "loss": 0.0485, + "loss": 0.0499, "macro_f1": 0.307692289352417, "num_tokens": 1116965.0, "repeat_count": 1.0, - "routers_loss": 0.11615128815174103, + "routers_loss": 0.11950276792049408, "skip_count": 2.0, "step": 692, "text_loss": 0.14215624332427979 @@ -6591,13 +6591,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2314453125, + "grad_norm": 0.2353515625, "learning_rate": 0.0009991081472897454, - "loss": 0.0718, + "loss": 0.0722, "macro_f1": 0.3333333432674408, "num_tokens": 1120570.0, "repeat_count": 0.0, - "routers_loss": 0.017403846606612206, + "routers_loss": 0.01905500330030918, "skip_count": 0.0, "step": 694, "text_loss": 0.41862696409225464 @@ -6610,13 +6610,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1357421875, "learning_rate": 0.0009990895731643002, - "loss": 0.0444, + "loss": 0.0464, "macro_f1": 0.3272727429866791, "num_tokens": 1124009.0, "repeat_count": 1.0, - "routers_loss": 0.07067303359508514, + "routers_loss": 0.06974572688341141, "skip_count": 0.0, "step": 696, "text_loss": 0.41160130500793457 @@ -6629,13 +6629,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1923828125, "learning_rate": 0.000999070807786789, - "loss": 0.0527, + "loss": 0.0531, "macro_f1": 0.3272727429866791, "num_tokens": 1127370.0, "repeat_count": 1.0, - "routers_loss": 0.07131028175354004, + "routers_loss": 0.07055293023586273, "skip_count": 0.0, "step": 698, "text_loss": 0.48068273067474365 @@ -6648,13 +6648,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.18359375, + "grad_norm": 0.197265625, "learning_rate": 0.000999051851164403, - "loss": 0.0629, + "loss": 0.0619, "macro_f1": 0.32098764181137085, "num_tokens": 1130234.0, "repeat_count": 1.0, - "routers_loss": 0.1152748316526413, + "routers_loss": 0.12506946921348572, "skip_count": 1.0, "step": 700, "text_loss": 0.47925490140914917 @@ -6667,13 +6667,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.216796875, + "grad_norm": 0.1943359375, "learning_rate": 0.000999032703304406, - "loss": 0.0663, + "loss": 0.0674, "macro_f1": 0.3333333432674408, "num_tokens": 1132874.0, "repeat_count": 0.0, - "routers_loss": 0.0077212234027683735, + "routers_loss": 0.00809287466108799, "skip_count": 0.0, "step": 702, "text_loss": 0.47433632612228394 @@ -6686,13 +6686,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.099609375, + "grad_norm": 0.1064453125, "learning_rate": 0.0009990133642141358, - "loss": 0.0494, + "loss": 0.0497, "macro_f1": 0.5492662787437439, "num_tokens": 1136011.0, "repeat_count": 0.0, - "routers_loss": 0.02726336568593979, + "routers_loss": 0.0319170281291008, "skip_count": 2.0, "step": 704, "text_loss": 0.6574832201004028 @@ -6705,13 +6705,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.32421875, + "grad_norm": 0.33984375, "learning_rate": 0.000998993833901003, - "loss": 0.0615, + "loss": 0.0619, "macro_f1": 0.32098764181137085, "num_tokens": 1139674.0, "repeat_count": 0.0, - "routers_loss": 0.0958542674779892, + "routers_loss": 0.09850362688302994, "skip_count": 2.0, "step": 706, "text_loss": 0.7660127282142639 @@ -6724,13 +6724,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.12158203125, "learning_rate": 0.0009989741123724919, - "loss": 0.0583, + "loss": 0.0574, "macro_f1": 0.3333333432674408, "num_tokens": 1143558.0, "repeat_count": 0.0, - "routers_loss": 0.007100600749254227, + "routers_loss": 0.006673311349004507, "skip_count": 0.0, "step": 708, "text_loss": 0.5976111888885498 @@ -6743,13 +6743,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.154296875, "learning_rate": 0.0009989541996361594, - "loss": 0.0445, + "loss": 0.045, "macro_f1": 0.3333333432674408, "num_tokens": 1146122.0, "repeat_count": 0.0, - "routers_loss": 0.0047812811098992825, + "routers_loss": 0.004988791421055794, "skip_count": 0.0, "step": 710, "text_loss": 0.5256119966506958 @@ -6762,13 +6762,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1015625, + "grad_norm": 0.1044921875, "learning_rate": 0.0009989340956996367, - "loss": 0.052, + "loss": 0.0528, "macro_f1": 0.3333333432674408, "num_tokens": 1149546.0, "repeat_count": 0.0, - "routers_loss": 0.006643407512456179, + "routers_loss": 0.0067769973538815975, "skip_count": 0.0, "step": 712, "text_loss": 0.5040497779846191 @@ -6781,13 +6781,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2890625, + "grad_norm": 0.26953125, "learning_rate": 0.0009989138005706273, - "loss": 0.0719, + "loss": 0.0735, "macro_f1": 0.32098764181137085, "num_tokens": 1153195.0, "repeat_count": 0.0, - "routers_loss": 0.0910436138510704, + "routers_loss": 0.09899546951055527, "skip_count": 2.0, "step": 714, "text_loss": 0.20803412795066833 @@ -6800,13 +6800,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1484375, + "grad_norm": 0.1396484375, "learning_rate": 0.000998893314256908, - "loss": 0.0649, + "loss": 0.064, "macro_f1": 0.3333333432674408, "num_tokens": 1157081.0, "repeat_count": 0.0, - "routers_loss": 0.010978946462273598, + "routers_loss": 0.010492355562746525, "skip_count": 0.0, "step": 716, "text_loss": 0.23077639937400818 @@ -6819,13 +6819,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.123046875, + "grad_norm": 0.1298828125, "learning_rate": 0.0009988726367663298, - "loss": 0.0543, + "loss": 0.0539, "macro_f1": 0.3333333432674408, "num_tokens": 1160079.0, "repeat_count": 0.0, - "routers_loss": 0.009956461377441883, + "routers_loss": 0.01063773687928915, "skip_count": 0.0, "step": 718, "text_loss": 0.6085864901542664 @@ -6838,13 +6838,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1640625, "learning_rate": 0.0009988517681068163, - "loss": 0.0412, + "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1163249.0, "repeat_count": 1.0, - "routers_loss": 0.057210199534893036, + "routers_loss": 0.05981874838471413, "skip_count": 0.0, "step": 720, "text_loss": 0.4047050476074219 @@ -6857,32 +6857,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.171875, "learning_rate": 0.0009988307082863638, - "loss": 0.0364, + "loss": 0.0361, "macro_f1": 0.3333333432674408, "num_tokens": 1166259.0, "repeat_count": 0.0, - "routers_loss": 0.01035996899008751, + "routers_loss": 0.009750043973326683, "skip_count": 0.0, "step": 722, "text_loss": 0.5306474566459656 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 3.3991781626063986, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.2412109375, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.240234375, "learning_rate": 0.0009988094573130434, - "loss": 0.0661, - "macro_f1": 0.3076923191547394, + "loss": 0.063, + "macro_f1": 0.5359477400779724, "num_tokens": 1168887.0, "repeat_count": 2.0, - "routers_loss": 0.18087820708751678, + "routers_loss": 0.18601104617118835, "skip_count": 2.0, "step": 724, "text_loss": 0.53528892993927 @@ -6895,32 +6895,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.142578125, "learning_rate": 0.0009987880151949974, - "loss": 0.0505, + "loss": 0.0496, "macro_f1": 0.3272727429866791, "num_tokens": 1172625.0, "repeat_count": 0.0, - "routers_loss": 0.04720238968729973, + "routers_loss": 0.02845010720193386, "skip_count": 1.0, "step": 726, "text_loss": 0.4760453701019287 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 26.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 3.417963017317288, - "f1_execute": 1.0, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.2216796875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, "learning_rate": 0.0009987663819404434, - "loss": 0.0603, - "macro_f1": 0.6666666865348816, + "loss": 0.06, + "macro_f1": 0.5492662787437439, "num_tokens": 1176580.0, "repeat_count": 0.0, - "routers_loss": 0.015407778322696686, + "routers_loss": 0.017596980556845665, "skip_count": 2.0, "step": 728, "text_loss": 0.5146099328994751 @@ -6933,13 +6933,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.1318359375, "learning_rate": 0.000998744557557671, - "loss": 0.0489, + "loss": 0.0484, "macro_f1": 0.3272727429866791, "num_tokens": 1179804.0, "repeat_count": 0.0, - "routers_loss": 0.060891781002283096, + "routers_loss": 0.0625474750995636, "skip_count": 1.0, "step": 730, "text_loss": 0.27738022804260254 @@ -6947,18 +6947,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 3.436747872028177, - "f1_execute": 0.943396270275116, + "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.203125, "learning_rate": 0.0009987225420550433, - "loss": 0.0825, - "macro_f1": 0.3144654333591461, + "loss": 0.0796, + "macro_f1": 0.307692289352417, "num_tokens": 1182658.0, "repeat_count": 1.0, - "routers_loss": 0.1661442220211029, + "routers_loss": 0.16188351809978485, "skip_count": 2.0, "step": 732, "text_loss": 0.23231445252895355 @@ -6966,18 +6966,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 3.446140299383622, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.2001953125, "learning_rate": 0.0009987003354409965, - "loss": 0.0634, - "macro_f1": 0.3333333432674408, + "loss": 0.0626, + "macro_f1": 0.3272727429866791, "num_tokens": 1185451.0, "repeat_count": 0.0, - "routers_loss": 0.02108248695731163, + "routers_loss": 0.02391529455780983, "skip_count": 0.0, "step": 734, "text_loss": 0.4496627151966095 @@ -6990,13 +6990,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.248046875, + "grad_norm": 0.234375, "learning_rate": 0.0009986779377240405, - "loss": 0.0534, + "loss": 0.0513, "macro_f1": 0.32098767161369324, "num_tokens": 1188666.0, "repeat_count": 0.0, - "routers_loss": 0.08318125456571579, + "routers_loss": 0.08435963839292526, "skip_count": 1.0, "step": 736, "text_loss": 0.4950787127017975 @@ -7009,13 +7009,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11962890625, + "grad_norm": 0.1220703125, "learning_rate": 0.000998655348912758, - "loss": 0.0514, + "loss": 0.0515, "macro_f1": 0.3333333432674408, "num_tokens": 1193035.0, "repeat_count": 0.0, - "routers_loss": 0.015889234840869904, + "routers_loss": 0.01648722216486931, "skip_count": 0.0, "step": 738, "text_loss": 0.24761848151683807 @@ -7028,13 +7028,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1513671875, "learning_rate": 0.0009986325690158051, "loss": 0.0435, "macro_f1": 0.3333333432674408, "num_tokens": 1196840.0, "repeat_count": 0.0, - "routers_loss": 0.01378484908491373, + "routers_loss": 0.013143910095095634, "skip_count": 0.0, "step": 740, "text_loss": 0.15662719309329987 @@ -7047,13 +7047,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1787109375, + "grad_norm": 0.1611328125, "learning_rate": 0.0009986095980419113, - "loss": 0.076, + "loss": 0.0757, "macro_f1": 0.3333333432674408, "num_tokens": 1200573.0, "repeat_count": 0.0, - "routers_loss": 0.02673683874309063, + "routers_loss": 0.026706280186772346, "skip_count": 0.0, "step": 742, "text_loss": 0.16725164651870728 @@ -7066,13 +7066,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.185546875, + "grad_norm": 0.1982421875, "learning_rate": 0.0009985864359998787, - "loss": 0.0778, + "loss": 0.0795, "macro_f1": 0.3006536364555359, "num_tokens": 1203589.0, "repeat_count": 2.0, - "routers_loss": 0.27776041626930237, + "routers_loss": 0.28607678413391113, "skip_count": 3.0, "step": 744, "text_loss": 0.6350882053375244 @@ -7085,13 +7085,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1513671875, + "grad_norm": 0.1474609375, "learning_rate": 0.0009985630828985835, - "loss": 0.0575, + "loss": 0.0572, "macro_f1": 0.3272727429866791, "num_tokens": 1206422.0, "repeat_count": 0.0, - "routers_loss": 0.0575483962893486, + "routers_loss": 0.05685260891914368, "skip_count": 1.0, "step": 746, "text_loss": 0.33779552578926086 @@ -7104,13 +7104,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1005859375, + "grad_norm": 0.09814453125, "learning_rate": 0.0009985395387469742, - "loss": 0.0478, + "loss": 0.0458, "macro_f1": 0.5492662787437439, "num_tokens": 1211588.0, "repeat_count": 0.0, - "routers_loss": 0.0458797849714756, + "routers_loss": 0.0437830351293087, "skip_count": 2.0, "step": 748, "text_loss": 0.28664472699165344 @@ -7123,13 +7123,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.15625, "learning_rate": 0.0009985158035540735, - "loss": 0.0701, + "loss": 0.0714, "macro_f1": 0.32098764181137085, "num_tokens": 1214580.0, "repeat_count": 2.0, - "routers_loss": 0.07850238680839539, + "routers_loss": 0.07074898481369019, "skip_count": 0.0, "step": 750, "text_loss": 0.3939313292503357 @@ -7142,13 +7142,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.21484375, "learning_rate": 0.0009984918773289762, - "loss": 0.0702, + "loss": 0.0699, "macro_f1": 0.3333333432674408, "num_tokens": 1217388.0, "repeat_count": 0.0, - "routers_loss": 0.009507967159152031, + "routers_loss": 0.009757856838405132, "skip_count": 0.0, "step": 752, "text_loss": 0.37641215324401855 @@ -7161,13 +7161,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1484375, + "grad_norm": 0.140625, "learning_rate": 0.0009984677600808512, - "loss": 0.0543, + "loss": 0.054, "macro_f1": 0.3333333432674408, "num_tokens": 1219960.0, "repeat_count": 0.0, - "routers_loss": 0.02620997279882431, + "routers_loss": 0.02515069581568241, "skip_count": 0.0, "step": 754, "text_loss": 0.155938982963562 @@ -7180,13 +7180,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3359375, + "grad_norm": 0.30078125, "learning_rate": 0.0009984434518189405, - "loss": 0.0791, + "loss": 0.0764, "macro_f1": 0.3333333432674408, "num_tokens": 1223234.0, "repeat_count": 0.0, - "routers_loss": 0.02798631228506565, + "routers_loss": 0.025766927748918533, "skip_count": 0.0, "step": 756, "text_loss": 0.691118061542511 @@ -7201,11 +7201,11 @@ "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.0009984189525525584, - "loss": 0.046, + "loss": 0.0451, "macro_f1": 0.5359477400779724, "num_tokens": 1225764.0, "repeat_count": 2.0, - "routers_loss": 0.16614431142807007, + "routers_loss": 0.1782722771167755, "skip_count": 2.0, "step": 758, "text_loss": 0.3592209219932556 @@ -7218,13 +7218,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.193359375, + "grad_norm": 0.189453125, "learning_rate": 0.0009983942622910935, - "loss": 0.0669, + "loss": 0.0659, "macro_f1": 0.3333333432674408, "num_tokens": 1230097.0, "repeat_count": 0.0, - "routers_loss": 0.008541896007955074, + "routers_loss": 0.00825568474829197, "skip_count": 0.0, "step": 760, "text_loss": 0.4646475315093994 @@ -7237,13 +7237,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.1962890625, "learning_rate": 0.0009983693810440074, - "loss": 0.0478, + "loss": 0.0477, "macro_f1": 0.32098764181137085, "num_tokens": 1233140.0, "repeat_count": 0.0, - "routers_loss": 0.045411624014377594, + "routers_loss": 0.04156976938247681, "skip_count": 2.0, "step": 762, "text_loss": 0.298682302236557 @@ -7256,13 +7256,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.380859375, + "grad_norm": 0.3515625, "learning_rate": 0.000998344308820834, - "loss": 0.0689, + "loss": 0.0666, "macro_f1": 0.3272727429866791, "num_tokens": 1236305.0, "repeat_count": 0.0, - "routers_loss": 0.052299100905656815, + "routers_loss": 0.05697929114103317, "skip_count": 1.0, "step": 764, "text_loss": 0.5249121189117432 @@ -7275,13 +7275,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.18359375, "learning_rate": 0.0009983190456311817, - "loss": 0.0602, + "loss": 0.0592, "macro_f1": 0.3144654333591461, "num_tokens": 1239673.0, "repeat_count": 0.0, - "routers_loss": 0.09140212833881378, + "routers_loss": 0.09547408670186996, "skip_count": 3.0, "step": 766, "text_loss": 0.41277334094047546 @@ -7294,13 +7294,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.201171875, + "grad_norm": 0.185546875, "learning_rate": 0.000998293591484731, - "loss": 0.0475, + "loss": 0.0484, "macro_f1": 0.5492662787437439, "num_tokens": 1242292.0, "repeat_count": 0.0, - "routers_loss": 0.030750583857297897, + "routers_loss": 0.030693158507347107, "skip_count": 2.0, "step": 768, "text_loss": 0.1583656519651413 @@ -7313,13 +7313,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16796875, + "grad_norm": 0.15234375, "learning_rate": 0.000998267946391236, - "loss": 0.052, + "loss": 0.051, "macro_f1": 0.3333333432674408, "num_tokens": 1244661.0, "repeat_count": 0.0, - "routers_loss": 0.010202950797975063, + "routers_loss": 0.01211300864815712, "skip_count": 0.0, "step": 770, "text_loss": 0.4629349112510681 @@ -7332,13 +7332,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.0927734375, "learning_rate": 0.0009982421103605238, - "loss": 0.0434, + "loss": 0.0441, "macro_f1": 0.32098764181137085, "num_tokens": 1248688.0, "repeat_count": 0.0, - "routers_loss": 0.07364192605018616, + "routers_loss": 0.0665968507528305, "skip_count": 2.0, "step": 772, "text_loss": 0.4019293785095215 @@ -7353,11 +7353,11 @@ "f1_skip": 0.0, "grad_norm": 0.2890625, "learning_rate": 0.000998216083402495, - "loss": 0.0606, + "loss": 0.0613, "macro_f1": 0.32098764181137085, "num_tokens": 1251395.0, "repeat_count": 0.0, - "routers_loss": 0.06553081423044205, + "routers_loss": 0.07186859846115112, "skip_count": 2.0, "step": 774, "text_loss": 0.4659276604652405 @@ -7370,13 +7370,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.302734375, "learning_rate": 0.0009981898655271235, - "loss": 0.0475, + "loss": 0.0488, "macro_f1": 0.3333333432674408, "num_tokens": 1254888.0, "repeat_count": 0.0, - "routers_loss": 0.008751659654080868, + "routers_loss": 0.007823926396667957, "skip_count": 0.0, "step": 776, "text_loss": 0.5160359740257263 @@ -7389,13 +7389,13 @@ "f1_execute": 0.9130434989929199, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.12060546875, + "grad_norm": 0.11962890625, "learning_rate": 0.0009981634567444557, - "loss": 0.0777, + "loss": 0.0775, "macro_f1": 0.590062141418457, "num_tokens": 1258250.0, "repeat_count": 3.0, - "routers_loss": 0.24522721767425537, + "routers_loss": 0.24624499678611755, "skip_count": 4.0, "step": 778, "text_loss": 0.29319918155670166 @@ -7408,13 +7408,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.263671875, "learning_rate": 0.0009981368570646115, "loss": 0.0885, "macro_f1": 0.3272727429866791, "num_tokens": 1260916.0, "repeat_count": 0.0, - "routers_loss": 0.03767623379826546, + "routers_loss": 0.030730176717042923, "skip_count": 1.0, "step": 780, "text_loss": 0.624981164932251 @@ -7427,13 +7427,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.142578125, "learning_rate": 0.0009981100664977838, - "loss": 0.0708, + "loss": 0.0699, "macro_f1": 0.3333333432674408, "num_tokens": 1264004.0, "repeat_count": 0.0, - "routers_loss": 0.006098059006035328, + "routers_loss": 0.006829176563769579, "skip_count": 0.0, "step": 782, "text_loss": 0.6137266159057617 @@ -7446,13 +7446,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1748046875, "learning_rate": 0.0009980830850542391, - "loss": 0.0589, + "loss": 0.058, "macro_f1": 0.3333333432674408, "num_tokens": 1267130.0, "repeat_count": 0.0, - "routers_loss": 0.01731623336672783, + "routers_loss": 0.018471000716090202, "skip_count": 0.0, "step": 784, "text_loss": 0.15213175117969513 @@ -7465,13 +7465,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2294921875, + "grad_norm": 0.2353515625, "learning_rate": 0.0009980559127443166, - "loss": 0.0526, + "loss": 0.052, "macro_f1": 0.3333333432674408, "num_tokens": 1271129.0, "repeat_count": 0.0, - "routers_loss": 0.0076471962966024876, + "routers_loss": 0.007903140969574451, "skip_count": 0.0, "step": 786, "text_loss": 0.5768613219261169 @@ -7484,13 +7484,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12353515625, + "grad_norm": 0.130859375, "learning_rate": 0.000998028549578429, - "loss": 0.0745, + "loss": 0.0719, "macro_f1": 0.307692289352417, "num_tokens": 1274232.0, "repeat_count": 0.0, - "routers_loss": 0.0637628585100174, + "routers_loss": 0.06737866252660751, "skip_count": 3.0, "step": 788, "text_loss": 0.2877073585987091 @@ -7503,13 +7503,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1748046875, "learning_rate": 0.0009980009955670615, - "loss": 0.0699, + "loss": 0.0698, "macro_f1": 0.3144654333591461, "num_tokens": 1277193.0, "repeat_count": 0.0, - "routers_loss": 0.10882514715194702, + "routers_loss": 0.10194934904575348, "skip_count": 3.0, "step": 790, "text_loss": 0.11860492825508118 @@ -7522,13 +7522,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1298828125, + "grad_norm": 0.126953125, "learning_rate": 0.000997973250720773, - "loss": 0.056, + "loss": 0.0552, "macro_f1": 0.32098764181137085, "num_tokens": 1280960.0, "repeat_count": 0.0, - "routers_loss": 0.10924118757247925, + "routers_loss": 0.10297708213329315, "skip_count": 2.0, "step": 792, "text_loss": 0.13477706909179688 @@ -7541,13 +7541,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1533203125, + "grad_norm": 0.1611328125, "learning_rate": 0.0009979453150501954, - "loss": 0.0664, + "loss": 0.0663, "macro_f1": 0.32098764181137085, "num_tokens": 1284611.0, "repeat_count": 1.0, - "routers_loss": 0.06571807712316513, + "routers_loss": 0.06122037023305893, "skip_count": 1.0, "step": 794, "text_loss": 0.40569379925727844 @@ -7560,13 +7560,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1181640625, + "grad_norm": 0.1279296875, "learning_rate": 0.000997917188566034, - "loss": 0.0616, + "loss": 0.062, "macro_f1": 0.32098764181137085, "num_tokens": 1287834.0, "repeat_count": 0.0, - "routers_loss": 0.058966971933841705, + "routers_loss": 0.061135001480579376, "skip_count": 2.0, "step": 796, "text_loss": 0.2829287648200989 @@ -7579,32 +7579,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.109375, "learning_rate": 0.0009978888712790664, - "loss": 0.067, + "loss": 0.0654, "macro_f1": 0.3272727429866791, "num_tokens": 1291666.0, "repeat_count": 0.0, - "routers_loss": 0.04844636470079422, + "routers_loss": 0.04841872677206993, "skip_count": 1.0, "step": 798, "text_loss": 1.011757254600525 }, { "acc_repeat": 0.0, - "acc_skip": 0.4000000059604645, - "avg_layers": 26.0, + "acc_skip": 0.20000000298023224, + "avg_layers": 27.0, "epoch": 3.756090402113296, - "f1_execute": 0.9166666865348816, + "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, - "f1_skip": 0.5714285969734192, - "grad_norm": 0.1416015625, + "f1_skip": 0.3333333134651184, + "grad_norm": 0.14453125, "learning_rate": 0.0009978603632001444, - "loss": 0.0634, - "macro_f1": 0.4960317611694336, + "loss": 0.0636, + "macro_f1": 0.4104308485984802, "num_tokens": 1294627.0, "repeat_count": 1.0, - "routers_loss": 0.1591777801513672, + "routers_loss": 0.15698759257793427, "skip_count": 5.0, "step": 800, "text_loss": 0.4457623362541199 @@ -7617,13 +7617,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.283203125, "learning_rate": 0.0009978316643401916, - "loss": 0.0694, + "loss": 0.0688, "macro_f1": 0.3333333432674408, "num_tokens": 1297711.0, "repeat_count": 0.0, - "routers_loss": 0.017735568806529045, + "routers_loss": 0.018952010199427605, "skip_count": 0.0, "step": 802, "text_loss": 0.2069481462240219 @@ -7636,13 +7636,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.14453125, "learning_rate": 0.0009978027747102062, - "loss": 0.0477, + "loss": 0.0479, "macro_f1": 0.3333333432674408, "num_tokens": 1300569.0, "repeat_count": 0.0, - "routers_loss": 0.012401525862514973, + "routers_loss": 0.014538386836647987, "skip_count": 0.0, "step": 804, "text_loss": 0.4983852505683899 @@ -7655,13 +7655,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2080078125, + "grad_norm": 0.2109375, "learning_rate": 0.0009977736943212584, - "loss": 0.0735, + "loss": 0.0721, "macro_f1": 0.32098764181137085, "num_tokens": 1303969.0, "repeat_count": 0.0, - "routers_loss": 0.10736164450645447, + "routers_loss": 0.11164087057113647, "skip_count": 2.0, "step": 806, "text_loss": 0.2910642921924591 @@ -7674,13 +7674,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2001953125, + "grad_norm": 0.1826171875, "learning_rate": 0.000997744423184492, - "loss": 0.0428, + "loss": 0.0424, "macro_f1": 0.3272727429866791, "num_tokens": 1307263.0, "repeat_count": 0.0, - "routers_loss": 0.0595436617732048, + "routers_loss": 0.06073406711220741, "skip_count": 1.0, "step": 808, "text_loss": 0.18831779062747955 @@ -7693,13 +7693,13 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.240234375, + "grad_norm": 0.26171875, "learning_rate": 0.0009977149613111236, - "loss": 0.0494, + "loss": 0.0486, "macro_f1": 0.4400000274181366, "num_tokens": 1309953.0, "repeat_count": 1.0, - "routers_loss": 0.12617000937461853, + "routers_loss": 0.11035524308681488, "skip_count": 4.0, "step": 810, "text_loss": 0.7872759699821472 @@ -7712,13 +7712,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1669921875, + "grad_norm": 0.1650390625, "learning_rate": 0.0009976853087124433, - "loss": 0.0537, + "loss": 0.0536, "macro_f1": 0.3333333432674408, "num_tokens": 1313243.0, "repeat_count": 0.0, - "routers_loss": 0.021242506802082062, + "routers_loss": 0.021804286167025566, "skip_count": 0.0, "step": 812, "text_loss": 0.22349292039871216 @@ -7731,13 +7731,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.318359375, + "grad_norm": 0.28125, "learning_rate": 0.0009976554653998138, - "loss": 0.0617, + "loss": 0.0612, "macro_f1": 0.31446540355682373, "num_tokens": 1316165.0, "repeat_count": 0.0, - "routers_loss": 0.10387415438890457, + "routers_loss": 0.10715524107217789, "skip_count": 2.0, "step": 814, "text_loss": 0.18035532534122467 @@ -7750,13 +7750,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.1279296875, "learning_rate": 0.000997625431384671, - "loss": 0.0565, + "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1319206.0, "repeat_count": 0.0, - "routers_loss": 0.007816939614713192, + "routers_loss": 0.007173649035394192, "skip_count": 0.0, "step": 816, "text_loss": 0.48928648233413696 @@ -7769,13 +7769,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.1357421875, "learning_rate": 0.0009975952066785243, - "loss": 0.0654, + "loss": 0.0655, "macro_f1": 0.3006536364555359, "num_tokens": 1322549.0, "repeat_count": 1.0, - "routers_loss": 0.22526368498802185, + "routers_loss": 0.22308112680912018, "skip_count": 4.0, "step": 818, "text_loss": 0.5211259722709656 @@ -7788,13 +7788,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.1337890625, "learning_rate": 0.0009975647912929557, - "loss": 0.056, + "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1325213.0, "repeat_count": 0.0, - "routers_loss": 0.010998851619660854, + "routers_loss": 0.00998698640614748, "skip_count": 0.0, "step": 820, "text_loss": 0.7117052674293518 @@ -7807,13 +7807,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.15234375, "learning_rate": 0.0009975341852396205, - "loss": 0.0712, + "loss": 0.0723, "macro_f1": 0.32098764181137085, "num_tokens": 1328383.0, "repeat_count": 0.0, - "routers_loss": 0.07115054875612259, + "routers_loss": 0.07454588264226913, "skip_count": 2.0, "step": 822, "text_loss": 0.34539610147476196 @@ -7826,13 +7826,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1630859375, "learning_rate": 0.0009975033885302469, - "loss": 0.0611, + "loss": 0.0604, "macro_f1": 0.3333333432674408, "num_tokens": 1331406.0, "repeat_count": 0.0, - "routers_loss": 0.008062695153057575, + "routers_loss": 0.009157589636743069, "skip_count": 0.0, "step": 824, "text_loss": 0.7484824657440186 @@ -7845,13 +7845,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1923828125, "learning_rate": 0.0009974724011766363, - "loss": 0.0496, + "loss": 0.0474, "macro_f1": 0.3272727429866791, "num_tokens": 1334410.0, "repeat_count": 1.0, - "routers_loss": 0.16666285693645477, + "routers_loss": 0.17149391770362854, "skip_count": 0.0, "step": 826, "text_loss": 0.5913820266723633 @@ -7864,13 +7864,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1708984375, + "grad_norm": 0.1884765625, "learning_rate": 0.0009974412231906632, - "loss": 0.0567, + "loss": 0.058, "macro_f1": 0.32098764181137085, "num_tokens": 1337653.0, "repeat_count": 1.0, - "routers_loss": 0.0908689796924591, + "routers_loss": 0.09743282198905945, "skip_count": 1.0, "step": 828, "text_loss": 0.2505693733692169 @@ -7883,13 +7883,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16015625, + "grad_norm": 0.1533203125, "learning_rate": 0.0009974098545842748, - "loss": 0.0648, + "loss": 0.0638, "macro_f1": 0.3272727429866791, "num_tokens": 1340860.0, "repeat_count": 0.0, - "routers_loss": 0.04364728182554245, + "routers_loss": 0.041490405797958374, "skip_count": 1.0, "step": 830, "text_loss": 0.5585370063781738 @@ -7897,18 +7897,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 3.906369239800411, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2060546875, + "grad_norm": 0.193359375, "learning_rate": 0.0009973782953694918, - "loss": 0.0772, - "macro_f1": 0.3076923191547394, - "num_tokens": 1344232.0, + "loss": 0.0746, + "macro_f1": 0.3006536066532135, + "num_tokens": 1344232.0, "repeat_count": 1.0, - "routers_loss": 0.15315109491348267, + "routers_loss": 0.16080693900585175, "skip_count": 3.0, "step": 832, "text_loss": 0.4782734513282776 @@ -7921,13 +7921,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.1298828125, "learning_rate": 0.000997346545558408, - "loss": 0.0527, + "loss": 0.0522, "macro_f1": 0.3333333432674408, "num_tokens": 1347667.0, "repeat_count": 0.0, - "routers_loss": 0.01342768594622612, + "routers_loss": 0.01173500344157219, "skip_count": 0.0, "step": 834, "text_loss": 0.25036177039146423 @@ -7940,13 +7940,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1748046875, + "grad_norm": 0.173828125, "learning_rate": 0.0009973146051631895, - "loss": 0.0513, + "loss": 0.0522, "macro_f1": 0.3333333432674408, "num_tokens": 1350707.0, "repeat_count": 0.0, - "routers_loss": 0.01158806961029768, + "routers_loss": 0.011477196589112282, "skip_count": 0.0, "step": 836, "text_loss": 0.5482863187789917 @@ -7959,13 +7959,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1650390625, "learning_rate": 0.0009972824741960764, - "loss": 0.0549, + "loss": 0.0536, "macro_f1": 0.3333333432674408, "num_tokens": 1353704.0, "repeat_count": 0.0, - "routers_loss": 0.01255605649203062, + "routers_loss": 0.010528896935284138, "skip_count": 0.0, "step": 838, "text_loss": 0.6732596158981323 @@ -7978,13 +7978,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12255859375, + "grad_norm": 0.1181640625, "learning_rate": 0.000997250152669381, - "loss": 0.0578, + "loss": 0.0573, "macro_f1": 0.3333333432674408, "num_tokens": 1356608.0, "repeat_count": 0.0, - "routers_loss": 0.010225459933280945, + "routers_loss": 0.010678744874894619, "skip_count": 0.0, "step": 840, "text_loss": 0.5479338765144348 @@ -7997,13 +7997,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.181640625, "learning_rate": 0.000997217640595489, - "loss": 0.0633, + "loss": 0.0631, "macro_f1": 0.3333333432674408, "num_tokens": 1359809.0, "repeat_count": 0.0, - "routers_loss": 0.007837744429707527, + "routers_loss": 0.00835978239774704, "skip_count": 0.0, "step": 842, "text_loss": 0.42543259263038635 @@ -8016,13 +8016,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.1923828125, "learning_rate": 0.0009971849379868593, - "loss": 0.0674, + "loss": 0.0653, "macro_f1": 0.3333333432674408, "num_tokens": 1362201.0, "repeat_count": 0.0, - "routers_loss": 0.008631376549601555, + "routers_loss": 0.009930923581123352, "skip_count": 0.0, "step": 844, "text_loss": 0.720462441444397 @@ -8035,13 +8035,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.1123046875, "learning_rate": 0.0009971520448560235, - "loss": 0.0612, + "loss": 0.0615, "macro_f1": 0.3272727429866791, "num_tokens": 1365790.0, "repeat_count": 0.0, - "routers_loss": 0.06206027418375015, + "routers_loss": 0.06344373524188995, "skip_count": 1.0, "step": 846, "text_loss": 0.8423607349395752 @@ -8049,18 +8049,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 3.9815086586439685, - "f1_execute": 0.9411765336990356, + "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, - "f1_skip": 0.5, - "grad_norm": 0.16015625, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.16796875, "learning_rate": 0.000997118961215586, - "loss": 0.0678, - "macro_f1": 0.480392187833786, + "loss": 0.0674, + "macro_f1": 0.4533333480358124, "num_tokens": 1368387.0, "repeat_count": 1.0, - "routers_loss": 0.1463794708251953, + "routers_loss": 0.14688406884670258, "skip_count": 3.0, "step": 848, "text_loss": 0.3933577537536621 @@ -8073,13 +8073,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.263671875, "learning_rate": 0.000997085687078225, - "loss": 0.052, + "loss": 0.0518, "macro_f1": 0.3333333432674408, "num_tokens": 1371189.0, "repeat_count": 0.0, - "routers_loss": 0.01140492781996727, + "routers_loss": 0.009953443892300129, "skip_count": 0.0, "step": 850, "text_loss": 0.41469162702560425 @@ -8092,13 +8092,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.15625, "learning_rate": 0.0009970522224566909, - "loss": 0.0563, + "loss": 0.0555, "macro_f1": 0.32098767161369324, "num_tokens": 1374008.0, "repeat_count": 0.0, - "routers_loss": 0.05136030167341232, + "routers_loss": 0.048870690166950226, "skip_count": 1.0, "step": 852, "text_loss": 0.613615870475769 @@ -8111,32 +8111,32 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25390625, + "grad_norm": 0.283203125, "learning_rate": 0.0009970185673638075, - "loss": 0.0627, + "loss": 0.0629, "macro_f1": 0.32098764181137085, "num_tokens": 1376662.0, "repeat_count": 1.0, - "routers_loss": 0.07274381071329117, + "routers_loss": 0.06865929812192917, "skip_count": 1.0, "step": 854, "text_loss": 0.4392736256122589 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 4.01878485471089, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.162109375, "learning_rate": 0.0009969847218124716, - "loss": 0.0503, - "macro_f1": 0.3272727429866791, + "loss": 0.0506, + "macro_f1": 0.5492662787437439, "num_tokens": 1380049.0, "repeat_count": 0.0, - "routers_loss": 0.024335317313671112, + "routers_loss": 0.02382219396531582, "skip_count": 1.0, "step": 856, "text_loss": 0.19115346670150757 @@ -8149,13 +8149,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.240234375, + "grad_norm": 0.1884765625, "learning_rate": 0.0009969506858156527, - "loss": 0.0359, + "loss": 0.0344, "macro_f1": 0.3272727429866791, "num_tokens": 1383008.0, "repeat_count": 0.0, - "routers_loss": 0.046614740043878555, + "routers_loss": 0.03907281160354614, "skip_count": 1.0, "step": 858, "text_loss": 0.34842637181282043 @@ -8168,13 +8168,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11181640625, + "grad_norm": 0.12060546875, "learning_rate": 0.0009969164593863935, - "loss": 0.0372, + "loss": 0.0365, "macro_f1": 0.3333333432674408, "num_tokens": 1387051.0, "repeat_count": 0.0, - "routers_loss": 0.006380240898579359, + "routers_loss": 0.007645803038030863, "skip_count": 0.0, "step": 860, "text_loss": 0.3810436725616455 @@ -8187,13 +8187,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.1484375, "learning_rate": 0.0009968820425378098, - "loss": 0.0473, + "loss": 0.0463, "macro_f1": 0.3272727429866791, "num_tokens": 1390244.0, "repeat_count": 1.0, - "routers_loss": 0.04770716652274132, + "routers_loss": 0.04435238987207413, "skip_count": 0.0, "step": 862, "text_loss": 0.34853485226631165 @@ -8206,32 +8206,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3359375, + "grad_norm": 0.28515625, "learning_rate": 0.00099684743528309, - "loss": 0.0434, + "loss": 0.0424, "macro_f1": 0.3333333432674408, "num_tokens": 1392976.0, "repeat_count": 0.0, - "routers_loss": 0.006983708590269089, + "routers_loss": 0.006071661598980427, "skip_count": 0.0, "step": 864, "text_loss": 0.6395178437232971 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 4.065746991488113, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.080078125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0810546875, "learning_rate": 0.0009968126376354958, - "loss": 0.0476, - "macro_f1": 0.32098764181137085, + "loss": 0.0477, + "macro_f1": 0.5492662787437439, "num_tokens": 1396061.0, "repeat_count": 0.0, - "routers_loss": 0.046313900500535965, + "routers_loss": 0.05011235550045967, "skip_count": 2.0, "step": 866, "text_loss": 0.09103966504335403 @@ -8244,32 +8244,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.154296875, "learning_rate": 0.0009967776496083616, "loss": 0.0509, "macro_f1": 0.3272727429866791, "num_tokens": 1398993.0, "repeat_count": 1.0, - "routers_loss": 0.0401870422065258, + "routers_loss": 0.03979124873876572, "skip_count": 0.0, "step": 868, "text_loss": 0.27257058024406433 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 4.084531846199002, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.14453125, "learning_rate": 0.000996742471215095, - "loss": 0.0505, - "macro_f1": 0.32098764181137085, + "loss": 0.0516, + "macro_f1": 0.5492662787437439, "num_tokens": 1402080.0, "repeat_count": 0.0, - "routers_loss": 0.03313451260328293, + "routers_loss": 0.030823837965726852, "skip_count": 2.0, "step": 870, "text_loss": 0.7047103047370911 @@ -8282,13 +8282,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16796875, + "grad_norm": 0.1611328125, "learning_rate": 0.0009967071024691763, - "loss": 0.0468, + "loss": 0.0461, "macro_f1": 0.3333333432674408, "num_tokens": 1404890.0, "repeat_count": 0.0, - "routers_loss": 0.010118982754647732, + "routers_loss": 0.009721715934574604, "skip_count": 0.0, "step": 872, "text_loss": 0.959106981754303 @@ -8301,13 +8301,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1142578125, "learning_rate": 0.000996671543384159, - "loss": 0.0498, + "loss": 0.05, "macro_f1": 0.3333333432674408, "num_tokens": 1407853.0, "repeat_count": 0.0, - "routers_loss": 0.005856200121343136, + "routers_loss": 0.006025883834809065, "skip_count": 0.0, "step": 874, "text_loss": 0.47571972012519836 @@ -8320,13 +8320,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.09765625, "learning_rate": 0.0009966357939736692, - "loss": 0.0417, + "loss": 0.0416, "macro_f1": 0.3272727429866791, "num_tokens": 1410723.0, "repeat_count": 0.0, - "routers_loss": 0.02768322452902794, + "routers_loss": 0.025964925065636635, "skip_count": 0.0, "step": 876, "text_loss": 0.4964611530303955 @@ -8339,13 +8339,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1025390625, + "grad_norm": 0.09423828125, "learning_rate": 0.0009965998542514065, - "loss": 0.0419, + "loss": 0.0415, "macro_f1": 0.32098764181137085, "num_tokens": 1414008.0, "repeat_count": 0.0, - "routers_loss": 0.09382032603025436, + "routers_loss": 0.09509637206792831, "skip_count": 2.0, "step": 878, "text_loss": 0.621494710445404 @@ -8358,32 +8358,32 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.103515625, + "grad_norm": 0.11083984375, "learning_rate": 0.0009965637242311427, - "loss": 0.0466, + "loss": 0.0472, "macro_f1": 0.542222261428833, "num_tokens": 1417447.0, "repeat_count": 0.0, - "routers_loss": 0.026867631822824478, + "routers_loss": 0.02520318515598774, "skip_count": 4.0, "step": 880, "text_loss": 0.40209758281707764 }, { "acc_repeat": 0.0, - "acc_skip": 0.6666666865348816, - "avg_layers": 24.0, + "acc_skip": 0.5, + "avg_layers": 25.0, "epoch": 4.14088641033167, - "f1_execute": 0.95652174949646, + "f1_execute": 0.936170220375061, "f1_repeat": 0.0, - "f1_skip": 0.800000011920929, - "grad_norm": 0.26171875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.263671875, "learning_rate": 0.000996527403926723, - "loss": 0.0496, - "macro_f1": 0.5855072736740112, + "loss": 0.0495, + "macro_f1": 0.5342789888381958, "num_tokens": 1419905.0, "repeat_count": 0.0, - "routers_loss": 0.12731307744979858, + "routers_loss": 0.13183781504631042, "skip_count": 6.0, "step": 882, "text_loss": 0.642185389995575 @@ -8396,13 +8396,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.1201171875, "learning_rate": 0.0009964908933520655, - "loss": 0.039, + "loss": 0.0375, "macro_f1": 0.3333333432674408, "num_tokens": 1423436.0, "repeat_count": 0.0, - "routers_loss": 0.008483970537781715, + "routers_loss": 0.009429510682821274, "skip_count": 0.0, "step": 884, "text_loss": 0.48232755064964294 @@ -8415,13 +8415,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.18359375, + "grad_norm": 0.1669921875, "learning_rate": 0.0009964541925211613, - "loss": 0.0348, + "loss": 0.0349, "macro_f1": 0.32098764181137085, "num_tokens": 1426842.0, "repeat_count": 0.0, - "routers_loss": 0.07847871631383896, + "routers_loss": 0.07629609107971191, "skip_count": 2.0, "step": 886, "text_loss": 0.16620934009552002 @@ -8434,13 +8434,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09716796875, + "grad_norm": 0.0927734375, "learning_rate": 0.0009964173014480738, - "loss": 0.036, + "loss": 0.0348, "macro_f1": 0.5492662787437439, "num_tokens": 1430430.0, "repeat_count": 0.0, - "routers_loss": 0.04574459046125412, + "routers_loss": 0.036814019083976746, "skip_count": 2.0, "step": 888, "text_loss": 0.4866008758544922 @@ -8453,13 +8453,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10595703125, + "grad_norm": 0.1123046875, "learning_rate": 0.0009963802201469398, - "loss": 0.0485, + "loss": 0.0476, "macro_f1": 0.3333333432674408, "num_tokens": 1433821.0, "repeat_count": 0.0, - "routers_loss": 0.004683624487370253, + "routers_loss": 0.0041250260546803474, "skip_count": 0.0, "step": 890, "text_loss": 0.578216552734375 @@ -8472,13 +8472,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2158203125, + "grad_norm": 0.2373046875, "learning_rate": 0.0009963429486319693, - "loss": 0.0476, + "loss": 0.0463, "macro_f1": 0.32098764181137085, "num_tokens": 1436976.0, "repeat_count": 0.0, - "routers_loss": 0.06499828398227692, + "routers_loss": 0.06213559955358505, "skip_count": 2.0, "step": 892, "text_loss": 0.221701517701149 @@ -8486,18 +8486,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.5, - "avg_layers": 25.0, + "avg_layers": 26.0, "epoch": 4.197240974464338, - "f1_execute": 0.9411764740943909, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.4000000059604645, - "grad_norm": 0.310546875, + "f1_skip": 0.5, + "grad_norm": 0.361328125, "learning_rate": 0.0009963054869174446, - "loss": 0.0326, - "macro_f1": 0.44705885648727417, + "loss": 0.0313, + "macro_f1": 0.4871794879436493, "num_tokens": 1440397.0, "repeat_count": 0.0, - "routers_loss": 0.08285653591156006, + "routers_loss": 0.07532428950071335, "skip_count": 2.0, "step": 894, "text_loss": 0.6922838091850281 @@ -8510,13 +8510,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.154296875, + "grad_norm": 0.1572265625, "learning_rate": 0.0009962678350177209, - "loss": 0.0497, + "loss": 0.0472, "macro_f1": 0.3272727429866791, "num_tokens": 1443604.0, "repeat_count": 0.0, - "routers_loss": 0.04252336546778679, + "routers_loss": 0.0419243648648262, "skip_count": 1.0, "step": 896, "text_loss": 0.22092342376708984 @@ -8524,18 +8524,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 4.216025829175227, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10302734375, + "grad_norm": 0.1015625, "learning_rate": 0.0009962299929472268, - "loss": 0.0349, - "macro_f1": 0.31446540355682373, + "loss": 0.034, + "macro_f1": 0.32098764181137085, "num_tokens": 1446257.0, "repeat_count": 2.0, - "routers_loss": 0.126711905002594, + "routers_loss": 0.10849297791719437, "skip_count": 0.0, "step": 898, "text_loss": 0.26394811272621155 @@ -8548,13 +8548,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10546875, + "grad_norm": 0.10205078125, "learning_rate": 0.000996191960720463, - "loss": 0.0392, + "loss": 0.0394, "macro_f1": 0.3333333432674408, "num_tokens": 1449669.0, "repeat_count": 0.0, - "routers_loss": 0.00955706462264061, + "routers_loss": 0.0092767970636487, "skip_count": 0.0, "step": 900, "text_loss": 0.5338577628135681 @@ -8567,13 +8567,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.154296875, "learning_rate": 0.0009961537383520042, - "loss": 0.0377, + "loss": 0.0354, "macro_f1": 0.3272727429866791, "num_tokens": 1452450.0, "repeat_count": 1.0, - "routers_loss": 0.03127318620681763, + "routers_loss": 0.02985367365181446, "skip_count": 0.0, "step": 902, "text_loss": 0.5875228047370911 @@ -8586,13 +8586,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.10205078125, "learning_rate": 0.0009961153258564966, - "loss": 0.0389, + "loss": 0.0378, "macro_f1": 0.3144654333591461, "num_tokens": 1456909.0, "repeat_count": 0.0, - "routers_loss": 0.06743519753217697, + "routers_loss": 0.06794842332601547, "skip_count": 3.0, "step": 904, "text_loss": 0.40959444642066956 @@ -8605,13 +8605,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009960767232486604, - "loss": 0.0477, + "loss": 0.0476, "macro_f1": 0.3333333432674408, "num_tokens": 1461712.0, "repeat_count": 0.0, - "routers_loss": 0.0025313226506114006, + "routers_loss": 0.0023562447167932987, "skip_count": 0.0, "step": 906, "text_loss": 0.3932875096797943 @@ -8624,13 +8624,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.08203125, "learning_rate": 0.000996037930543288, - "loss": 0.052, + "loss": 0.0505, "macro_f1": 0.3272727429866791, "num_tokens": 1464817.0, "repeat_count": 0.0, - "routers_loss": 0.037147488445043564, + "routers_loss": 0.03880339860916138, "skip_count": 1.0, "step": 908, "text_loss": 0.17482402920722961 @@ -8643,13 +8643,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.2119140625, "learning_rate": 0.000995998947755245, - "loss": 0.0501, + "loss": 0.0479, "macro_f1": 0.3272727429866791, "num_tokens": 1467810.0, "repeat_count": 0.0, - "routers_loss": 0.021232586354017258, + "routers_loss": 0.01736828312277794, "skip_count": 1.0, "step": 910, "text_loss": 0.4140470325946808 @@ -8662,13 +8662,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.169921875, "learning_rate": 0.0009959597748994695, - "loss": 0.0759, + "loss": 0.0752, "macro_f1": 0.3333333432674408, "num_tokens": 1470802.0, "repeat_count": 0.0, - "routers_loss": 0.010563847608864307, + "routers_loss": 0.011824851855635643, "skip_count": 0.0, "step": 912, "text_loss": 0.7153383493423462 @@ -8681,13 +8681,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1455078125, "learning_rate": 0.0009959204119909726, - "loss": 0.0425, + "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1474539.0, "repeat_count": 0.0, - "routers_loss": 0.0267612524330616, + "routers_loss": 0.025456594303250313, "skip_count": 0.0, "step": 914, "text_loss": 0.42812058329582214 @@ -8700,13 +8700,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1533203125, + "grad_norm": 0.142578125, "learning_rate": 0.0009958808590448385, - "loss": 0.0501, + "loss": 0.0489, "macro_f1": 0.3333333432674408, "num_tokens": 1477552.0, "repeat_count": 0.0, - "routers_loss": 0.005838244222104549, + "routers_loss": 0.006795851048082113, "skip_count": 0.0, "step": 916, "text_loss": 0.5402814149856567 @@ -8719,13 +8719,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.1083984375, "learning_rate": 0.0009958411160762234, - "loss": 0.0383, + "loss": 0.039, "macro_f1": 0.3333333432674408, "num_tokens": 1482547.0, "repeat_count": 0.0, - "routers_loss": 0.014642171561717987, + "routers_loss": 0.015615932643413544, "skip_count": 0.0, "step": 918, "text_loss": 0.3836168050765991 @@ -8738,32 +8738,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08984375, "learning_rate": 0.0009958011831003577, - "loss": 0.0457, + "loss": 0.0448, "macro_f1": 0.3272727429866791, "num_tokens": 1485807.0, "repeat_count": 0.0, - "routers_loss": 0.04119620472192764, + "routers_loss": 0.043541423976421356, "skip_count": 1.0, "step": 920, "text_loss": 0.4333936274051666 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 4.328734957440563, - "f1_execute": 0.943396270275116, - "f1_repeat": 0.0, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.154296875, + "grad_norm": 0.1337890625, "learning_rate": 0.000995761060132543, - "loss": 0.0433, - "macro_f1": 0.3144654333591461, + "loss": 0.0418, + "macro_f1": 0.6538461446762085, "num_tokens": 1488941.0, "repeat_count": 1.0, - "routers_loss": 0.06713195145130157, + "routers_loss": 0.05866432189941406, "skip_count": 2.0, "step": 922, "text_loss": 0.4106994867324829 @@ -8776,13 +8776,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1572265625, + "grad_norm": 0.1630859375, "learning_rate": 0.0009957207471881552, - "loss": 0.0533, + "loss": 0.0531, "macro_f1": 0.5492662787437439, "num_tokens": 1492026.0, "repeat_count": 0.0, - "routers_loss": 0.024023180827498436, + "routers_loss": 0.02714901603758335, "skip_count": 2.0, "step": 924, "text_loss": 0.542091429233551 @@ -8795,13 +8795,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.17578125, + "grad_norm": 0.1796875, "learning_rate": 0.0009956802442826415, - "loss": 0.0373, + "loss": 0.0386, "macro_f1": 0.3272727429866791, "num_tokens": 1494543.0, "repeat_count": 1.0, - "routers_loss": 0.05399841442704201, + "routers_loss": 0.0563737191259861, "skip_count": 0.0, "step": 926, "text_loss": 0.47209203243255615 @@ -8814,13 +8814,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1259765625, "learning_rate": 0.0009956395514315235, - "loss": 0.0488, + "loss": 0.0496, "macro_f1": 0.3272727429866791, "num_tokens": 1497831.0, "repeat_count": 1.0, - "routers_loss": 0.0299264844506979, + "routers_loss": 0.03285066783428192, "skip_count": 0.0, "step": 928, "text_loss": 0.6628931164741516 @@ -8833,13 +8833,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.154296875, "learning_rate": 0.0009955986686503943, - "loss": 0.0467, + "loss": 0.0466, "macro_f1": 0.3272727429866791, "num_tokens": 1501375.0, "repeat_count": 0.0, - "routers_loss": 0.023478010669350624, + "routers_loss": 0.024297121912240982, "skip_count": 1.0, "step": 930, "text_loss": 0.495676189661026 @@ -8852,13 +8852,13 @@ "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.1103515625, + "grad_norm": 0.11181640625, "learning_rate": 0.0009955575959549202, - "loss": 0.0447, + "loss": 0.0424, "macro_f1": 0.7795917987823486, "num_tokens": 1504363.0, "repeat_count": 1.0, - "routers_loss": 0.12116194516420364, + "routers_loss": 0.12196464836597443, "skip_count": 4.0, "step": 932, "text_loss": 0.26123273372650146 @@ -8871,13 +8871,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1708984375, "learning_rate": 0.0009955163333608408, - "loss": 0.053, + "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 1507178.0, "repeat_count": 0.0, - "routers_loss": 0.011879723519086838, + "routers_loss": 0.012947078794240952, "skip_count": 0.0, "step": 934, "text_loss": 0.32552677392959595 @@ -8890,13 +8890,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.154296875, "learning_rate": 0.0009954748808839674, - "loss": 0.0373, + "loss": 0.0379, "macro_f1": 0.3333333432674408, "num_tokens": 1509910.0, "repeat_count": 0.0, - "routers_loss": 0.009245929308235645, + "routers_loss": 0.008946365676820278, "skip_count": 0.0, "step": 936, "text_loss": 0.533141016960144 @@ -8909,13 +8909,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.140625, "learning_rate": 0.000995433238540185, - "loss": 0.0461, + "loss": 0.0466, "macro_f1": 0.6538461446762085, "num_tokens": 1512826.0, "repeat_count": 1.0, - "routers_loss": 0.032464127987623215, + "routers_loss": 0.029975678771734238, "skip_count": 1.0, "step": 938, "text_loss": 0.2953577935695648 @@ -8928,13 +8928,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.10888671875, "learning_rate": 0.0009953914063454512, - "loss": 0.0515, + "loss": 0.0497, "macro_f1": 0.3144654333591461, "num_tokens": 1517230.0, "repeat_count": 1.0, - "routers_loss": 0.08835392445325851, + "routers_loss": 0.0889134630560875, "skip_count": 2.0, "step": 940, "text_loss": 0.5368834733963013 @@ -8947,13 +8947,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.193359375, "learning_rate": 0.000995349384315796, - "loss": 0.0405, + "loss": 0.0413, "macro_f1": 0.3333333432674408, "num_tokens": 1519876.0, "repeat_count": 0.0, - "routers_loss": 0.014307246543467045, + "routers_loss": 0.013458753935992718, "skip_count": 0.0, "step": 942, "text_loss": 0.2005518227815628 @@ -8966,13 +8966,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1357421875, "learning_rate": 0.000995307172467322, - "loss": 0.0449, + "loss": 0.0444, "macro_f1": 0.31446540355682373, "num_tokens": 1522998.0, "repeat_count": 1.0, - "routers_loss": 0.10261563211679459, + "routers_loss": 0.08850377053022385, "skip_count": 1.0, "step": 944, "text_loss": 0.227926567196846 @@ -8985,13 +8985,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.1435546875, "learning_rate": 0.0009952647708162054, - "loss": 0.0507, + "loss": 0.0503, "macro_f1": 0.3272727429866791, "num_tokens": 1527100.0, "repeat_count": 0.0, - "routers_loss": 0.03316422924399376, + "routers_loss": 0.03199794515967369, "skip_count": 1.0, "step": 946, "text_loss": 0.4859686493873596 @@ -9004,13 +9004,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1748046875, "learning_rate": 0.0009952221793786942, - "loss": 0.0352, + "loss": 0.0354, "macro_f1": 0.3333333432674408, "num_tokens": 1530028.0, "repeat_count": 0.0, - "routers_loss": 0.00902469176799059, + "routers_loss": 0.006507779937237501, "skip_count": 0.0, "step": 948, "text_loss": 0.6855354905128479 @@ -9023,13 +9023,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.10986328125, "learning_rate": 0.0009951793981711097, - "loss": 0.0581, + "loss": 0.0584, "macro_f1": 0.6538461446762085, "num_tokens": 1533254.0, "repeat_count": 1.0, - "routers_loss": 0.06710167229175568, + "routers_loss": 0.06175103038549423, "skip_count": 1.0, "step": 950, "text_loss": 0.7590400576591492 @@ -9042,13 +9042,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1015625, + "grad_norm": 0.1025390625, "learning_rate": 0.0009951364272098458, - "loss": 0.0294, + "loss": 0.0295, "macro_f1": 0.5492662787437439, "num_tokens": 1536239.0, "repeat_count": 0.0, - "routers_loss": 0.04208769276738167, + "routers_loss": 0.03773383051156998, "skip_count": 2.0, "step": 952, "text_loss": 0.669784665107727 @@ -9061,13 +9061,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.1748046875, "learning_rate": 0.0009950932665113688, - "loss": 0.0505, + "loss": 0.0507, "macro_f1": 0.32098764181137085, "num_tokens": 1539682.0, "repeat_count": 0.0, - "routers_loss": 0.06530380249023438, + "routers_loss": 0.07280613481998444, "skip_count": 2.0, "step": 954, "text_loss": 0.3365570902824402 @@ -9080,13 +9080,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.12255859375, "learning_rate": 0.0009950499160922184, - "loss": 0.0545, + "loss": 0.0541, "macro_f1": 0.3333333432674408, "num_tokens": 1542875.0, "repeat_count": 0.0, - "routers_loss": 0.01803453080356121, + "routers_loss": 0.01770266517996788, "skip_count": 0.0, "step": 956, "text_loss": 0.0921545997262001 @@ -9099,13 +9099,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.09375, "learning_rate": 0.000995006375969006, - "loss": 0.0481, + "loss": 0.0473, "macro_f1": 0.3272727429866791, "num_tokens": 1547135.0, "repeat_count": 1.0, - "routers_loss": 0.08461762219667435, + "routers_loss": 0.07672002166509628, "skip_count": 0.0, "step": 958, "text_loss": 0.5887606739997864 @@ -9120,11 +9120,11 @@ "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0009949626461584165, - "loss": 0.0441, + "loss": 0.043, "macro_f1": 0.3333333432674408, "num_tokens": 1550100.0, "repeat_count": 0.0, - "routers_loss": 0.007111486047506332, + "routers_loss": 0.006247182376682758, "skip_count": 0.0, "step": 960, "text_loss": 0.5777931213378906 @@ -9137,13 +9137,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.11181640625, + "grad_norm": 0.119140625, "learning_rate": 0.0009949187266772076, - "loss": 0.0361, + "loss": 0.0366, "macro_f1": 0.5492662787437439, "num_tokens": 1553192.0, "repeat_count": 0.0, - "routers_loss": 0.029776185750961304, + "routers_loss": 0.030319908633828163, "skip_count": 2.0, "step": 962, "text_loss": 0.2370252162218094 @@ -9156,13 +9156,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.126953125, + "grad_norm": 0.1474609375, "learning_rate": 0.0009948746175422088, - "loss": 0.0506, + "loss": 0.0511, "macro_f1": 0.3333333432674408, "num_tokens": 1556318.0, "repeat_count": 0.0, - "routers_loss": 0.007108999416232109, + "routers_loss": 0.006004320923238993, "skip_count": 0.0, "step": 964, "text_loss": 0.6271032094955444 @@ -9175,13 +9175,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.15234375, "learning_rate": 0.000994830318770323, - "loss": 0.0498, + "loss": 0.0514, "macro_f1": 0.3333333432674408, "num_tokens": 1559195.0, "repeat_count": 0.0, - "routers_loss": 0.01126947533339262, + "routers_loss": 0.011544366367161274, "skip_count": 0.0, "step": 966, "text_loss": 0.47256720066070557 @@ -9194,13 +9194,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.162109375, + "grad_norm": 0.171875, "learning_rate": 0.0009947858303785255, - "loss": 0.0366, + "loss": 0.0374, "macro_f1": 0.6603773832321167, "num_tokens": 1561813.0, "repeat_count": 1.0, - "routers_loss": 0.05142999067902565, + "routers_loss": 0.05258861929178238, "skip_count": 1.0, "step": 968, "text_loss": 0.7703132629394531 @@ -9213,13 +9213,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.1142578125, "learning_rate": 0.0009947411523838648, - "loss": 0.0461, + "loss": 0.0453, "macro_f1": 0.3333333432674408, "num_tokens": 1564634.0, "repeat_count": 0.0, - "routers_loss": 0.010770819149911404, + "routers_loss": 0.011216280050575733, "skip_count": 0.0, "step": 970, "text_loss": 0.4666804075241089 @@ -9232,13 +9232,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.1533203125, "learning_rate": 0.0009946962848034608, - "loss": 0.0692, + "loss": 0.0696, "macro_f1": 0.3333333432674408, "num_tokens": 1567959.0, "repeat_count": 0.0, - "routers_loss": 0.008775795809924603, + "routers_loss": 0.009387624450027943, "skip_count": 0.0, "step": 972, "text_loss": 0.4067264199256897 @@ -9251,13 +9251,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.203125, "learning_rate": 0.0009946512276545075, - "loss": 0.0403, + "loss": 0.0397, "macro_f1": 0.3272727429866791, "num_tokens": 1571221.0, "repeat_count": 1.0, - "routers_loss": 0.05100395902991295, + "routers_loss": 0.041713520884513855, "skip_count": 0.0, "step": 974, "text_loss": 0.5242366194725037 @@ -9270,13 +9270,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, - "grad_norm": 0.25390625, + "grad_norm": 0.228515625, "learning_rate": 0.0009946059809542705, - "loss": 0.0503, + "loss": 0.0487, "macro_f1": 0.7644445300102234, "num_tokens": 1575033.0, "repeat_count": 2.0, - "routers_loss": 0.06653711199760437, + "routers_loss": 0.05748331546783447, "skip_count": 2.0, "step": 976, "text_loss": 0.5704690217971802 @@ -9284,18 +9284,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 4.591722923393014, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1396484375, "learning_rate": 0.0009945605447200887, - "loss": 0.0435, - "macro_f1": 0.3333333432674408, + "loss": 0.0445, + "macro_f1": 0.3272727429866791, "num_tokens": 1579050.0, "repeat_count": 0.0, - "routers_loss": 0.009865665808320045, + "routers_loss": 0.016765203326940536, "skip_count": 0.0, "step": 978, "text_loss": 0.4804173707962036 @@ -9308,13 +9308,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.130859375, + "grad_norm": 0.1337890625, "learning_rate": 0.0009945149189693732, - "loss": 0.0399, + "loss": 0.0406, "macro_f1": 0.5492662787437439, "num_tokens": 1582967.0, "repeat_count": 0.0, - "routers_loss": 0.021175632253289223, + "routers_loss": 0.021518222987651825, "skip_count": 2.0, "step": 980, "text_loss": 0.4138598144054413 @@ -9327,32 +9327,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11181640625, + "grad_norm": 0.11474609375, "learning_rate": 0.0009944691037196078, - "loss": 0.0472, + "loss": 0.0456, "macro_f1": 0.3333333432674408, "num_tokens": 1586282.0, "repeat_count": 0.0, - "routers_loss": 0.011803832836449146, + "routers_loss": 0.012246460653841496, "skip_count": 0.0, "step": 982, "text_loss": 0.22561736404895782 }, { - "acc_repeat": 0.0, + "acc_repeat": 0.5, "acc_skip": 0.800000011920929, - "avg_layers": 23.0, + "avg_layers": 24.0, "epoch": 4.6199002054593485, - "f1_execute": 0.9090908765792847, - "f1_repeat": 0.0, + "f1_execute": 0.930232584476471, + "f1_repeat": 0.6666666865348816, "f1_skip": 0.8000000715255737, - "grad_norm": 0.142578125, + "grad_norm": 0.1455078125, "learning_rate": 0.0009944230989883491, - "loss": 0.0467, - "macro_f1": 0.5696970224380493, + "loss": 0.0456, + "macro_f1": 0.7989664077758789, "num_tokens": 1589279.0, "repeat_count": 2.0, - "routers_loss": 0.08856551349163055, + "routers_loss": 0.09344895929098129, "skip_count": 5.0, "step": 984, "text_loss": 0.4416656494140625 @@ -9365,13 +9365,13 @@ "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1240234375, + "grad_norm": 0.111328125, "learning_rate": 0.0009943769047932264, - "loss": 0.0413, + "loss": 0.0404, "macro_f1": 0.5359477400779724, "num_tokens": 1592398.0, "repeat_count": 2.0, - "routers_loss": 0.08593414723873138, + "routers_loss": 0.08916857838630676, "skip_count": 2.0, "step": 986, "text_loss": 0.5536438822746277 @@ -9384,13 +9384,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.154296875, + "grad_norm": 0.15234375, "learning_rate": 0.000994330521151941, - "loss": 0.0399, + "loss": 0.039, "macro_f1": 0.32098764181137085, "num_tokens": 1596213.0, "repeat_count": 1.0, - "routers_loss": 0.07049509882926941, + "routers_loss": 0.06114347651600838, "skip_count": 1.0, "step": 988, "text_loss": 0.5835405588150024 @@ -9403,13 +9403,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.1953125, "learning_rate": 0.000994283948082267, - "loss": 0.0595, + "loss": 0.0573, "macro_f1": 0.3333333432674408, "num_tokens": 1598827.0, "repeat_count": 0.0, - "routers_loss": 0.0019258069805800915, + "routers_loss": 0.0017335431184619665, "skip_count": 0.0, "step": 990, "text_loss": 0.5857380032539368 @@ -9422,13 +9422,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.10693359375, "learning_rate": 0.0009942371856020522, - "loss": 0.0335, + "loss": 0.0341, "macro_f1": 0.3333333432674408, "num_tokens": 1602915.0, "repeat_count": 0.0, - "routers_loss": 0.014094089157879353, + "routers_loss": 0.014606470242142677, "skip_count": 0.0, "step": 992, "text_loss": 0.6939892768859863 @@ -9436,18 +9436,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 31.0, "epoch": 4.666862342236572, - "f1_execute": 0.9583333134651184, + "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.140625, "learning_rate": 0.0009941902337292155, - "loss": 0.0603, - "macro_f1": 0.6527777910232544, + "loss": 0.06, + "macro_f1": 0.6598639488220215, "num_tokens": 1605776.0, "repeat_count": 3.0, - "routers_loss": 0.06360147893428802, + "routers_loss": 0.06297315657138824, "skip_count": 1.0, "step": 994, "text_loss": 0.37616831064224243 @@ -9460,13 +9460,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.10546875, + "grad_norm": 0.1083984375, "learning_rate": 0.0009941430924817487, - "loss": 0.0573, + "loss": 0.0572, "macro_f1": 0.5492662787437439, "num_tokens": 1609856.0, "repeat_count": 0.0, - "routers_loss": 0.0326208658516407, + "routers_loss": 0.03297794610261917, "skip_count": 2.0, "step": 996, "text_loss": 0.2098303586244583 @@ -9479,13 +9479,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09912109375, + "grad_norm": 0.10107421875, "learning_rate": 0.000994095761877717, - "loss": 0.0502, + "loss": 0.0499, "macro_f1": 0.3333333432674408, "num_tokens": 1612904.0, "repeat_count": 0.0, - "routers_loss": 0.012660752050578594, + "routers_loss": 0.012901155278086662, "skip_count": 0.0, "step": 998, "text_loss": 0.20103533565998077 @@ -9498,13 +9498,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.265625, + "grad_norm": 0.259765625, "learning_rate": 0.000994048241935257, - "loss": 0.0537, + "loss": 0.0535, "macro_f1": 0.3272727429866791, "num_tokens": 1615540.0, "repeat_count": 0.0, - "routers_loss": 0.021756287664175034, + "routers_loss": 0.020434845238924026, "skip_count": 0.0, "step": 1000, "text_loss": 0.32709044218063354 @@ -9512,37 +9512,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 4.70443205165835, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1669921875, "learning_rate": 0.0009940005326725789, - "loss": 0.0447, - "macro_f1": 0.31446540355682373, + "loss": 0.0453, + "macro_f1": 0.32098764181137085, "num_tokens": 1618786.0, "repeat_count": 0.0, - "routers_loss": 0.07292548567056656, + "routers_loss": 0.07831378281116486, "skip_count": 2.0, "step": 1002, "text_loss": 0.5789632797241211 }, { - "acc_repeat": 0.5, + "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 4.713824479013795, - "f1_execute": 0.9811320900917053, - "f1_repeat": 0.6666666865348816, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1787109375, + "grad_norm": 0.21875, "learning_rate": 0.0009939526341079647, - "loss": 0.0505, - "macro_f1": 0.5492662787437439, + "loss": 0.0511, + "macro_f1": 0.32098764181137085, "num_tokens": 1621736.0, "repeat_count": 2.0, - "routers_loss": 0.03397528454661369, + "routers_loss": 0.04863874986767769, "skip_count": 0.0, "step": 1004, "text_loss": 0.6128849387168884 @@ -9555,13 +9555,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.123046875, + "grad_norm": 0.1435546875, "learning_rate": 0.0009939045462597693, - "loss": 0.0544, + "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 1624649.0, "repeat_count": 0.0, - "routers_loss": 0.005987613927572966, + "routers_loss": 0.00677989237010479, "skip_count": 0.0, "step": 1006, "text_loss": 0.6168264150619507 @@ -9574,13 +9574,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.1611328125, "learning_rate": 0.0009938562691464202, - "loss": 0.0522, + "loss": 0.0524, "macro_f1": 0.3333333432674408, "num_tokens": 1627700.0, "repeat_count": 0.0, - "routers_loss": 0.021656684577465057, + "routers_loss": 0.019490402191877365, "skip_count": 0.0, "step": 1008, "text_loss": 0.17463822662830353 @@ -9593,32 +9593,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1298828125, "learning_rate": 0.000993807802786417, - "loss": 0.0487, + "loss": 0.0475, "macro_f1": 0.3333333432674408, "num_tokens": 1630714.0, "repeat_count": 0.0, - "routers_loss": 0.0014992234064266086, + "routers_loss": 0.0019022391643375158, "skip_count": 0.0, "step": 1010, "text_loss": 0.5675593018531799 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.5, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 4.751394188435574, - "f1_execute": 0.9411764740943909, - "f1_repeat": 0.0, + "f1_execute": 0.9599999785423279, + "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, - "grad_norm": 0.158203125, + "grad_norm": 0.1640625, "learning_rate": 0.0009937591471983322, - "loss": 0.0491, - "macro_f1": 0.5359477400779724, + "loss": 0.0501, + "macro_f1": 0.7644444704055786, "num_tokens": 1633770.0, "repeat_count": 1.0, - "routers_loss": 0.03448791801929474, + "routers_loss": 0.042485643178224564, "skip_count": 2.0, "step": 1012, "text_loss": 0.42387229204177856 @@ -9631,13 +9631,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1357421875, + "grad_norm": 0.1396484375, "learning_rate": 0.0009937103024008109, - "loss": 0.0541, + "loss": 0.0545, "macro_f1": 0.3272727429866791, "num_tokens": 1637120.0, "repeat_count": 0.0, - "routers_loss": 0.08285929262638092, + "routers_loss": 0.09427817165851593, "skip_count": 1.0, "step": 1014, "text_loss": 0.49511051177978516 @@ -9650,13 +9650,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.125, + "grad_norm": 0.12890625, "learning_rate": 0.0009936612684125702, - "loss": 0.0515, + "loss": 0.0503, "macro_f1": 0.3333333432674408, "num_tokens": 1640165.0, "repeat_count": 0.0, - "routers_loss": 0.00486504752188921, + "routers_loss": 0.005106127820909023, "skip_count": 0.0, "step": 1016, "text_loss": 0.5398799180984497 @@ -9669,13 +9669,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.2734375, "learning_rate": 0.0009936120452524004, - "loss": 0.051, + "loss": 0.0506, "macro_f1": 0.3333333432674408, "num_tokens": 1643251.0, "repeat_count": 0.0, - "routers_loss": 0.017805909737944603, + "routers_loss": 0.016914300620555878, "skip_count": 0.0, "step": 1018, "text_loss": 0.20882178843021393 @@ -9688,13 +9688,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1962890625, "learning_rate": 0.0009935626329391637, - "loss": 0.0547, + "loss": 0.0537, "macro_f1": 0.32098764181137085, "num_tokens": 1646560.0, "repeat_count": 0.0, - "routers_loss": 0.12958799302577972, + "routers_loss": 0.13481520116329193, "skip_count": 2.0, "step": 1020, "text_loss": 0.5719883441925049 @@ -9707,13 +9707,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1533203125, + "grad_norm": 0.1552734375, "learning_rate": 0.0009935130314917948, - "loss": 0.0595, + "loss": 0.0602, "macro_f1": 0.5492662787437439, "num_tokens": 1649538.0, "repeat_count": 0.0, - "routers_loss": 0.07447081059217453, + "routers_loss": 0.07700438797473907, "skip_count": 2.0, "step": 1022, "text_loss": 0.1303367167711258 @@ -9726,13 +9726,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1494140625, "learning_rate": 0.0009934632409293015, - "loss": 0.0619, + "loss": 0.0611, "macro_f1": 0.32098764181137085, "num_tokens": 1652397.0, "repeat_count": 1.0, - "routers_loss": 0.12529553472995758, + "routers_loss": 0.11416907608509064, "skip_count": 1.0, "step": 1024, "text_loss": 0.24076920747756958 @@ -9745,13 +9745,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.306640625, "learning_rate": 0.0009934132612707631, - "loss": 0.0491, + "loss": 0.0507, "macro_f1": 0.31446540355682373, "num_tokens": 1654938.0, "repeat_count": 0.0, - "routers_loss": 0.08664281666278839, + "routers_loss": 0.09484589844942093, "skip_count": 2.0, "step": 1026, "text_loss": 0.1652517318725586 @@ -9764,13 +9764,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1435546875, "learning_rate": 0.0009933630925353324, - "loss": 0.0394, + "loss": 0.0395, "macro_f1": 0.3333333432674408, "num_tokens": 1658536.0, "repeat_count": 0.0, - "routers_loss": 0.0067965323105454445, + "routers_loss": 0.00741987070068717, "skip_count": 0.0, "step": 1028, "text_loss": 0.49296700954437256 @@ -9783,13 +9783,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1845703125, "learning_rate": 0.0009933127347422337, - "loss": 0.0607, + "loss": 0.0602, "macro_f1": 0.32098764181137085, "num_tokens": 1661446.0, "repeat_count": 0.0, - "routers_loss": 0.08319470286369324, + "routers_loss": 0.08399344235658646, "skip_count": 2.0, "step": 1030, "text_loss": 0.22363591194152832 @@ -9802,13 +9802,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.158203125, "learning_rate": 0.0009932621879107648, - "loss": 0.0476, + "loss": 0.0475, "macro_f1": 0.3333333432674408, "num_tokens": 1664612.0, "repeat_count": 0.0, - "routers_loss": 0.002826537238433957, + "routers_loss": 0.0031781597062945366, "skip_count": 0.0, "step": 1032, "text_loss": 0.36083245277404785 @@ -9823,11 +9823,11 @@ "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.000993211452060295, - "loss": 0.0431, + "loss": 0.042, "macro_f1": 0.3272727429866791, "num_tokens": 1667467.0, "repeat_count": 0.0, - "routers_loss": 0.03491095453500748, + "routers_loss": 0.03595469892024994, "skip_count": 1.0, "step": 1034, "text_loss": 0.16372856497764587 @@ -9840,13 +9840,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.173828125, + "grad_norm": 0.189453125, "learning_rate": 0.000993160527210266, - "loss": 0.0616, + "loss": 0.061, "macro_f1": 0.3144654333591461, "num_tokens": 1670675.0, "repeat_count": 3.0, - "routers_loss": 0.1828247457742691, + "routers_loss": 0.1597205102443695, "skip_count": 0.0, "step": 1036, "text_loss": 0.6049913763999939 @@ -9859,13 +9859,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2099609375, + "grad_norm": 0.2197265625, "learning_rate": 0.000993109413380193, - "loss": 0.0563, + "loss": 0.0562, "macro_f1": 0.3333333432674408, "num_tokens": 1673477.0, "repeat_count": 0.0, - "routers_loss": 0.010931054130196571, + "routers_loss": 0.009756010957062244, "skip_count": 0.0, "step": 1038, "text_loss": 0.7034620642662048 @@ -9878,13 +9878,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.158203125, + "grad_norm": 0.1806640625, "learning_rate": 0.0009930581105896624, - "loss": 0.0569, + "loss": 0.0559, "macro_f1": 0.3272727429866791, "num_tokens": 1676809.0, "repeat_count": 0.0, - "routers_loss": 0.023222090676426888, + "routers_loss": 0.020718922838568687, "skip_count": 0.0, "step": 1040, "text_loss": 0.2814720571041107 @@ -9897,13 +9897,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.1923828125, "learning_rate": 0.0009930066188583338, - "loss": 0.0453, + "loss": 0.0445, "macro_f1": 0.32098764181137085, "num_tokens": 1679398.0, "repeat_count": 1.0, - "routers_loss": 0.07085686922073364, + "routers_loss": 0.04755603149533272, "skip_count": 1.0, "step": 1042, "text_loss": 0.5445759296417236 @@ -9916,13 +9916,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12353515625, + "grad_norm": 0.126953125, "learning_rate": 0.0009929549382059388, - "loss": 0.0515, + "loss": 0.0509, "macro_f1": 0.3333333432674408, "num_tokens": 1682269.0, "repeat_count": 0.0, - "routers_loss": 0.010158216580748558, + "routers_loss": 0.01040949858725071, "skip_count": 0.0, "step": 1044, "text_loss": 0.2876914143562317 @@ -9935,13 +9935,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.1259765625, "learning_rate": 0.0009929030686522816, - "loss": 0.0372, + "loss": 0.0363, "macro_f1": 0.3333333432674408, "num_tokens": 1685428.0, "repeat_count": 0.0, - "routers_loss": 0.007876895368099213, + "routers_loss": 0.008158888667821884, "skip_count": 0.0, "step": 1046, "text_loss": 0.49053525924682617 @@ -9954,13 +9954,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1630859375, "learning_rate": 0.0009928510102172386, - "loss": 0.0501, + "loss": 0.0498, "macro_f1": 0.3333333432674408, "num_tokens": 1688252.0, "repeat_count": 0.0, - "routers_loss": 0.004859173204749823, + "routers_loss": 0.005102572031319141, "skip_count": 0.0, "step": 1048, "text_loss": 0.5274341106414795 @@ -9973,13 +9973,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.17578125, + "grad_norm": 0.1591796875, "learning_rate": 0.0009927987629207587, - "loss": 0.0582, + "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1691289.0, "repeat_count": 0.0, - "routers_loss": 0.01798083633184433, + "routers_loss": 0.016768503934144974, "skip_count": 0.0, "step": 1050, "text_loss": 0.9935035109519958 @@ -9987,18 +9987,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 4.939242735544467, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1455078125, "learning_rate": 0.0009927463267828634, "loss": 0.0488, - "macro_f1": 0.3272727429866791, + "macro_f1": 0.3333333432674408, "num_tokens": 1694148.0, "repeat_count": 0.0, - "routers_loss": 0.014295363798737526, + "routers_loss": 0.010905829258263111, "skip_count": 0.0, "step": 1052, "text_loss": 0.20895758271217346 @@ -10011,13 +10011,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.1455078125, "learning_rate": 0.000992693701823646, - "loss": 0.0635, + "loss": 0.0624, "macro_f1": 0.3272727429866791, "num_tokens": 1698543.0, "repeat_count": 1.0, - "routers_loss": 0.1038367822766304, + "routers_loss": 0.10533971339464188, "skip_count": 0.0, "step": 1054, "text_loss": 0.5776236653327942 @@ -10030,13 +10030,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.255859375, "learning_rate": 0.0009926408880632726, - "loss": 0.057, + "loss": 0.0556, "macro_f1": 0.3272727429866791, "num_tokens": 1702460.0, "repeat_count": 0.0, - "routers_loss": 0.029780643060803413, + "routers_loss": 0.026313411071896553, "skip_count": 1.0, "step": 1056, "text_loss": 0.34990596771240234 @@ -10049,13 +10049,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10107421875, + "grad_norm": 0.099609375, "learning_rate": 0.0009925878855219818, - "loss": 0.0398, + "loss": 0.0391, "macro_f1": 0.3333333432674408, "num_tokens": 1705686.0, "repeat_count": 0.0, - "routers_loss": 0.008537676185369492, + "routers_loss": 0.007763393223285675, "skip_count": 0.0, "step": 1058, "text_loss": 0.4980163276195526 @@ -10068,13 +10068,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.171875, + "grad_norm": 0.177734375, "learning_rate": 0.000992534694220084, - "loss": 0.0617, + "loss": 0.0613, "macro_f1": 0.3272727429866791, "num_tokens": 1708739.0, "repeat_count": 0.0, - "routers_loss": 0.03966755419969559, + "routers_loss": 0.03998444974422455, "skip_count": 1.0, "step": 1060, "text_loss": 0.29092350602149963 @@ -10087,13 +10087,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1484375, + "grad_norm": 0.1572265625, "learning_rate": 0.000992481314177962, - "loss": 0.0311, + "loss": 0.0312, "macro_f1": 0.32098764181137085, "num_tokens": 1711903.0, "repeat_count": 1.0, - "routers_loss": 0.06651833653450012, + "routers_loss": 0.06966045498847961, "skip_count": 1.0, "step": 1062, "text_loss": 0.6267179250717163 @@ -10106,13 +10106,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2431640625, + "grad_norm": 0.244140625, "learning_rate": 0.0009924277454160717, - "loss": 0.0557, + "loss": 0.0548, "macro_f1": 0.3272727429866791, "num_tokens": 1715974.0, "repeat_count": 0.0, - "routers_loss": 0.05130369961261749, + "routers_loss": 0.05536063387989998, "skip_count": 1.0, "step": 1064, "text_loss": 0.5813798904418945 @@ -10125,13 +10125,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.134765625, "learning_rate": 0.0009923739879549402, - "loss": 0.0435, + "loss": 0.0423, "macro_f1": 0.3333333432674408, "num_tokens": 1718828.0, "repeat_count": 0.0, - "routers_loss": 0.020534176379442215, + "routers_loss": 0.020993782207369804, "skip_count": 0.0, "step": 1066, "text_loss": 0.22665327787399292 @@ -10144,13 +10144,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.0888671875, "learning_rate": 0.0009923200418151677, - "loss": 0.0305, + "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 1722419.0, "repeat_count": 0.0, - "routers_loss": 0.007514918688684702, + "routers_loss": 0.007351701147854328, "skip_count": 0.0, "step": 1068, "text_loss": 0.5796169638633728 @@ -10163,13 +10163,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.142578125, "learning_rate": 0.0009922659070174264, - "loss": 0.0461, + "loss": 0.0452, "macro_f1": 0.3272727429866791, "num_tokens": 1725663.0, "repeat_count": 1.0, - "routers_loss": 0.024598751217126846, + "routers_loss": 0.026033315807580948, "skip_count": 0.0, "step": 1070, "text_loss": 0.25742828845977783 @@ -10182,32 +10182,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.10595703125, "learning_rate": 0.0009922115835824612, - "loss": 0.0408, + "loss": 0.041, "macro_f1": 0.3333333432674408, "num_tokens": 1729239.0, "repeat_count": 0.0, - "routers_loss": 0.011866633780300617, + "routers_loss": 0.0118600158020854, "skip_count": 0.0, "step": 1072, "text_loss": 0.21630282700061798 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 5.042265923099501, - "f1_execute": 0.9818181991577148, - "f1_repeat": 0.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.12158203125, "learning_rate": 0.0009921570715310884, - "loss": 0.036, - "macro_f1": 0.3272727429866791, + "loss": 0.0364, + "macro_f1": 0.6666666865348816, "num_tokens": 1732507.0, "repeat_count": 1.0, - "routers_loss": 0.01755746826529503, + "routers_loss": 0.016118815168738365, "skip_count": 0.0, "step": 1074, "text_loss": 0.5639925003051758 @@ -10220,13 +10220,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.0791015625, "learning_rate": 0.0009921023708841974, - "loss": 0.0415, + "loss": 0.0407, "macro_f1": 0.3333333432674408, "num_tokens": 1736182.0, "repeat_count": 0.0, - "routers_loss": 0.003976983483880758, + "routers_loss": 0.004275390412658453, "skip_count": 0.0, "step": 1076, "text_loss": 0.5758615136146545 @@ -10239,13 +10239,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.103515625, + "grad_norm": 0.1103515625, "learning_rate": 0.0009920474816627496, - "loss": 0.0378, + "loss": 0.037, "macro_f1": 0.3333333432674408, "num_tokens": 1739559.0, "repeat_count": 0.0, - "routers_loss": 0.013548235408961773, + "routers_loss": 0.01299292128533125, "skip_count": 0.0, "step": 1078, "text_loss": 0.18221625685691833 @@ -10258,13 +10258,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1630859375, "learning_rate": 0.0009919924038877788, "loss": 0.0343, "macro_f1": 0.32098764181137085, "num_tokens": 1742890.0, "repeat_count": 0.0, - "routers_loss": 0.03923165053129196, + "routers_loss": 0.038295745849609375, "skip_count": 2.0, "step": 1080, "text_loss": 0.17354349792003632 @@ -10277,13 +10277,13 @@ "f1_execute": 0.9583333134651184, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.1923828125, + "grad_norm": 0.1884765625, "learning_rate": 0.0009919371375803905, - "loss": 0.0464, + "loss": 0.0455, "macro_f1": 0.8194444179534912, "num_tokens": 1746433.0, "repeat_count": 2.0, - "routers_loss": 0.046429626643657684, + "routers_loss": 0.04052971675992012, "skip_count": 3.0, "step": 1082, "text_loss": 0.2250112146139145 @@ -10296,13 +10296,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1025390625, + "grad_norm": 0.10595703125, "learning_rate": 0.0009918816827617632, - "loss": 0.0346, + "loss": 0.0353, "macro_f1": 0.3333333432674408, "num_tokens": 1750802.0, "repeat_count": 0.0, - "routers_loss": 0.008998732082545757, + "routers_loss": 0.009114136919379234, "skip_count": 0.0, "step": 1084, "text_loss": 0.2526719272136688 @@ -10315,13 +10315,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1279296875, "learning_rate": 0.000991826039453147, - "loss": 0.0386, + "loss": 0.0392, "macro_f1": 0.3333333432674408, "num_tokens": 1754272.0, "repeat_count": 0.0, - "routers_loss": 0.005173585377633572, + "routers_loss": 0.004904678091406822, "skip_count": 0.0, "step": 1086, "text_loss": 0.7308789491653442 @@ -10334,13 +10334,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.138671875, "learning_rate": 0.000991770207675865, - "loss": 0.0308, + "loss": 0.0327, "macro_f1": 0.6666666865348816, "num_tokens": 1757231.0, "repeat_count": 0.0, - "routers_loss": 0.024098891764879227, + "routers_loss": 0.02129189297556877, "skip_count": 2.0, "step": 1088, "text_loss": 0.21764220297336578 @@ -10353,13 +10353,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1611328125, "learning_rate": 0.0009917141874513113, "loss": 0.0315, "macro_f1": 0.3333333432674408, "num_tokens": 1760003.0, "repeat_count": 0.0, - "routers_loss": 0.014002764597535133, + "routers_loss": 0.01310618408024311, "skip_count": 0.0, "step": 1090, "text_loss": 0.33892181515693665 @@ -10372,32 +10372,32 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.14453125, + "grad_norm": 0.171875, "learning_rate": 0.0009916579788009537, - "loss": 0.0462, + "loss": 0.0457, "macro_f1": 0.5492662787437439, "num_tokens": 1763052.0, "repeat_count": 0.0, - "routers_loss": 0.017871137708425522, + "routers_loss": 0.02059309557080269, "skip_count": 2.0, "step": 1092, "text_loss": 0.6551769375801086 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 5.136190196653947, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1044921875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10546875, "learning_rate": 0.0009916015817463312, "loss": 0.0385, - "macro_f1": 0.32098764181137085, + "macro_f1": 0.5492662787437439, "num_tokens": 1766655.0, "repeat_count": 0.0, - "routers_loss": 0.033123619854450226, + "routers_loss": 0.0274797435849905, "skip_count": 2.0, "step": 1094, "text_loss": 0.3984372019767761 @@ -10410,13 +10410,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.11181640625, "learning_rate": 0.000991544996309055, - "loss": 0.0267, + "loss": 0.0271, "macro_f1": 0.3333333432674408, "num_tokens": 1769997.0, "repeat_count": 0.0, - "routers_loss": 0.01279227901250124, + "routers_loss": 0.01437368243932724, "skip_count": 0.0, "step": 1096, "text_loss": 0.4203338921070099 @@ -10429,13 +10429,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1103515625, "learning_rate": 0.000991488222510809, - "loss": 0.0295, + "loss": 0.0292, "macro_f1": 0.3333333432674408, "num_tokens": 1773130.0, "repeat_count": 0.0, - "routers_loss": 0.001354650012217462, + "routers_loss": 0.001382062560878694, "skip_count": 0.0, "step": 1098, "text_loss": 0.43132516741752625 @@ -10448,13 +10448,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.123046875, "learning_rate": 0.000991431260373349, - "loss": 0.0326, + "loss": 0.0329, "macro_f1": 0.3144654333591461, "num_tokens": 1775682.0, "repeat_count": 1.0, - "routers_loss": 0.1097714751958847, + "routers_loss": 0.1115434318780899, "skip_count": 2.0, "step": 1100, "text_loss": 0.3218227028846741 @@ -10467,13 +10467,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.111328125, "learning_rate": 0.000991374109918503, - "loss": 0.0187, + "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 1778407.0, "repeat_count": 0.0, - "routers_loss": 0.009649592451751232, + "routers_loss": 0.009529678151011467, "skip_count": 0.0, "step": 1102, "text_loss": 0.17183731496334076 @@ -10486,13 +10486,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.11083984375, + "grad_norm": 0.1142578125, "learning_rate": 0.000991316771168171, - "loss": 0.0447, + "loss": 0.044, "macro_f1": 0.5492662787437439, "num_tokens": 1781518.0, "repeat_count": 0.0, - "routers_loss": 0.020858706906437874, + "routers_loss": 0.018668074160814285, "skip_count": 2.0, "step": 1104, "text_loss": 1.1324785947799683 @@ -10505,13 +10505,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.125, "learning_rate": 0.0009912592441443258, - "loss": 0.0428, + "loss": 0.0411, "macro_f1": 0.3272727429866791, "num_tokens": 1784878.0, "repeat_count": 0.0, - "routers_loss": 0.048101235181093216, + "routers_loss": 0.04145100712776184, "skip_count": 1.0, "step": 1106, "text_loss": 0.6082063317298889 @@ -10524,13 +10524,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.08984375, "learning_rate": 0.0009912015288690112, - "loss": 0.0435, + "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1788978.0, "repeat_count": 0.0, - "routers_loss": 0.02875671721994877, + "routers_loss": 0.021450644358992577, "skip_count": 1.0, "step": 1108, "text_loss": 0.5597621202468872 @@ -10543,13 +10543,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08349609375, + "grad_norm": 0.083984375, "learning_rate": 0.0009911436253643444, - "loss": 0.0247, + "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 1792321.0, "repeat_count": 0.0, - "routers_loss": 0.019005145877599716, + "routers_loss": 0.017405325546860695, "skip_count": 0.0, "step": 1110, "text_loss": 0.2560598850250244 @@ -10562,13 +10562,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.2294921875, "learning_rate": 0.0009910855336525137, - "loss": 0.0393, + "loss": 0.0383, "macro_f1": 0.3333333432674408, "num_tokens": 1795182.0, "repeat_count": 0.0, - "routers_loss": 0.007238700054585934, + "routers_loss": 0.007162237539887428, "skip_count": 0.0, "step": 1112, "text_loss": 0.3438240587711334 @@ -10581,13 +10581,13 @@ "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.125, + "grad_norm": 0.115234375, "learning_rate": 0.00099102725375578, "loss": 0.0326, "macro_f1": 0.480392187833786, "num_tokens": 1798987.0, "repeat_count": 1.0, - "routers_loss": 0.12206140905618668, + "routers_loss": 0.11149197816848755, "skip_count": 3.0, "step": 1114, "text_loss": 0.20455503463745117 @@ -10595,18 +10595,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 5.239506897563839, - "f1_execute": 0.8799999952316284, + "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.10791015625, "learning_rate": 0.0009909687856964767, - "loss": 0.0366, - "macro_f1": 0.29333335161209106, + "loss": 0.035, + "macro_f1": 0.3006536364555359, "num_tokens": 1802064.0, "repeat_count": 2.0, - "routers_loss": 0.15721899271011353, + "routers_loss": 0.12679415941238403, "skip_count": 3.0, "step": 1116, "text_loss": 0.11996729671955109 @@ -10619,32 +10619,32 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.125, + "grad_norm": 0.12451171875, "learning_rate": 0.0009909101294970082, - "loss": 0.0366, + "loss": 0.0365, "macro_f1": 0.5492662787437439, "num_tokens": 1805412.0, "repeat_count": 0.0, - "routers_loss": 0.05058665946125984, + "routers_loss": 0.05108053982257843, "skip_count": 2.0, "step": 1118, "text_loss": 0.13224145770072937 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 5.258291752274729, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "f1_skip": 1.0, + "grad_norm": 0.123046875, "learning_rate": 0.0009908512851798522, - "loss": 0.0454, - "macro_f1": 0.32098764181137085, + "loss": 0.0455, + "macro_f1": 0.6603773832321167, "num_tokens": 1808196.0, "repeat_count": 1.0, - "routers_loss": 0.023021472617983818, + "routers_loss": 0.02131766639649868, "skip_count": 1.0, "step": 1120, "text_loss": 0.7824069261550903 @@ -10657,13 +10657,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1435546875, + "grad_norm": 0.138671875, "learning_rate": 0.0009907922527675576, - "loss": 0.0409, + "loss": 0.0405, "macro_f1": 0.3333333432674408, "num_tokens": 1811622.0, "repeat_count": 0.0, - "routers_loss": 0.006660689599812031, + "routers_loss": 0.006226244382560253, "skip_count": 0.0, "step": 1122, "text_loss": 0.5419743061065674 @@ -10676,13 +10676,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.146484375, + "grad_norm": 0.12890625, "learning_rate": 0.000990733032282746, - "loss": 0.0547, + "loss": 0.0535, "macro_f1": 0.5492662787437439, "num_tokens": 1814628.0, "repeat_count": 0.0, - "routers_loss": 0.031727343797683716, + "routers_loss": 0.03088250942528248, "skip_count": 2.0, "step": 1124, "text_loss": 0.37100958824157715 @@ -10695,13 +10695,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.0810546875, "learning_rate": 0.000990673623748111, - "loss": 0.0351, + "loss": 0.0348, "macro_f1": 0.32098767161369324, "num_tokens": 1817205.0, "repeat_count": 0.0, - "routers_loss": 0.06140992045402527, + "routers_loss": 0.05495348572731018, "skip_count": 1.0, "step": 1126, "text_loss": 0.20241330564022064 @@ -10709,18 +10709,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.5, - "avg_layers": 25.0, + "avg_layers": 26.0, "epoch": 5.295861461696507, - "f1_execute": 0.9411764740943909, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.4000000059604645, - "grad_norm": 0.09814453125, + "f1_skip": 0.5, + "grad_norm": 0.0927734375, "learning_rate": 0.0009906140271864173, - "loss": 0.0436, - "macro_f1": 0.44705885648727417, + "loss": 0.0433, + "macro_f1": 0.4871794879436493, "num_tokens": 1820141.0, "repeat_count": 0.0, - "routers_loss": 0.03872275352478027, + "routers_loss": 0.037809282541275024, "skip_count": 2.0, "step": 1128, "text_loss": 0.32965806126594543 @@ -10728,18 +10728,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 5.305253889051952, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09228515625, + "grad_norm": 0.0908203125, "learning_rate": 0.0009905542426205032, - "loss": 0.0353, - "macro_f1": 0.3272727429866791, + "loss": 0.0348, + "macro_f1": 0.32098767161369324, "num_tokens": 1824011.0, "repeat_count": 0.0, - "routers_loss": 0.031013142317533493, + "routers_loss": 0.03320181369781494, "skip_count": 1.0, "step": 1130, "text_loss": 0.36329755187034607 @@ -10752,13 +10752,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1123046875, + "grad_norm": 0.10595703125, "learning_rate": 0.0009904942700732777, - "loss": 0.0333, + "loss": 0.0335, "macro_f1": 0.3333333432674408, "num_tokens": 1826873.0, "repeat_count": 0.0, - "routers_loss": 0.004357635974884033, + "routers_loss": 0.004102326463907957, "skip_count": 0.0, "step": 1132, "text_loss": 0.6692602038383484 @@ -10771,13 +10771,13 @@ "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11279296875, + "grad_norm": 0.08544921875, "learning_rate": 0.0009904341095677226, "loss": 0.03, "macro_f1": 0.29333335161209106, "num_tokens": 1830103.0, "repeat_count": 2.0, - "routers_loss": 0.2376353144645691, + "routers_loss": 0.2376193106174469, "skip_count": 4.0, "step": 1134, "text_loss": 0.19212862849235535 @@ -10790,13 +10790,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10888671875, + "grad_norm": 0.119140625, "learning_rate": 0.0009903737611268919, - "loss": 0.0446, + "loss": 0.0445, "macro_f1": 0.3333333432674408, "num_tokens": 1833201.0, "repeat_count": 0.0, - "routers_loss": 0.004978097043931484, + "routers_loss": 0.005253395065665245, "skip_count": 0.0, "step": 1136, "text_loss": 0.6773360371589661 @@ -10809,13 +10809,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10546875, + "grad_norm": 0.09814453125, "learning_rate": 0.0009903132247739107, - "loss": 0.0309, + "loss": 0.0305, "macro_f1": 0.3076923191547394, "num_tokens": 1836045.0, "repeat_count": 1.0, - "routers_loss": 0.14195409417152405, + "routers_loss": 0.14382585883140564, "skip_count": 3.0, "step": 1138, "text_loss": 0.2882297933101654 @@ -10828,13 +10828,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.15234375, + "grad_norm": 0.150390625, "learning_rate": 0.0009902525005319766, - "loss": 0.0403, + "loss": 0.04, "macro_f1": 0.5427350401878357, "num_tokens": 1839721.0, "repeat_count": 1.0, - "routers_loss": 0.04005253314971924, + "routers_loss": 0.04033960774540901, "skip_count": 2.0, "step": 1140, "text_loss": 0.7172559499740601 @@ -10847,13 +10847,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.12109375, "learning_rate": 0.0009901915884243597, - "loss": 0.0353, + "loss": 0.0351, "macro_f1": 0.6666666865348816, "num_tokens": 1842614.0, "repeat_count": 1.0, - "routers_loss": 0.006839688867330551, + "routers_loss": 0.005162308923900127, "skip_count": 0.0, "step": 1142, "text_loss": 0.42892804741859436 @@ -10866,13 +10866,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1240234375, "learning_rate": 0.0009901304884744014, - "loss": 0.0396, + "loss": 0.0386, "macro_f1": 0.3144654333591461, "num_tokens": 1845444.0, "repeat_count": 1.0, - "routers_loss": 0.10174567997455597, + "routers_loss": 0.10117656737565994, "skip_count": 2.0, "step": 1144, "text_loss": 0.20806430280208588 @@ -10885,13 +10885,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.130859375, "learning_rate": 0.0009900692007055152, - "loss": 0.0365, + "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 1848558.0, "repeat_count": 0.0, - "routers_loss": 0.014655748382210732, + "routers_loss": 0.014107038266956806, "skip_count": 0.0, "step": 1146, "text_loss": 0.5355974435806274 @@ -10904,13 +10904,13 @@ "f1_execute": 0.9166666865348816, "f1_repeat": 0.4000000059604645, "f1_skip": 0.6666666865348816, - "grad_norm": 0.158203125, + "grad_norm": 0.16015625, "learning_rate": 0.000990007725141187, - "loss": 0.0467, + "loss": 0.0449, "macro_f1": 0.6611111164093018, "num_tokens": 1852723.0, "repeat_count": 4.0, - "routers_loss": 0.16960746049880981, + "routers_loss": 0.15537866950035095, "skip_count": 2.0, "step": 1148, "text_loss": 0.6388513445854187 @@ -10923,32 +10923,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1220703125, + "grad_norm": 0.1181640625, "learning_rate": 0.0009899460618049741, - "loss": 0.0399, + "loss": 0.0397, "macro_f1": 0.3333333432674408, "num_tokens": 1856181.0, "repeat_count": 0.0, - "routers_loss": 0.011591178365051746, + "routers_loss": 0.011800912208855152, "skip_count": 0.0, "step": 1150, "text_loss": 0.6113069653511047 }, { - "acc_repeat": 0.5, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 30.0, "epoch": 5.408570589961843, - "f1_execute": 0.9811320900917053, - "f1_repeat": 0.6666666865348816, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09912109375, + "grad_norm": 0.1005859375, "learning_rate": 0.000989884210720506, - "loss": 0.0332, - "macro_f1": 0.5492662787437439, + "loss": 0.0331, + "macro_f1": 0.6666666865348816, "num_tokens": 1859685.0, "repeat_count": 2.0, - "routers_loss": 0.04036068916320801, + "routers_loss": 0.022900646552443504, "skip_count": 0.0, "step": 1152, "text_loss": 0.25718021392822266 @@ -10961,13 +10961,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12353515625, + "grad_norm": 0.10595703125, "learning_rate": 0.0009898221719114844, - "loss": 0.0366, + "loss": 0.0354, "macro_f1": 0.3272727429866791, "num_tokens": 1862505.0, "repeat_count": 0.0, - "routers_loss": 0.030165785923600197, + "routers_loss": 0.026814989745616913, "skip_count": 1.0, "step": 1154, "text_loss": 0.5426549911499023 @@ -10980,13 +10980,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0966796875, + "grad_norm": 0.1015625, "learning_rate": 0.0009897599454016823, - "loss": 0.0421, + "loss": 0.0401, "macro_f1": 0.3333333432674408, "num_tokens": 1866266.0, "repeat_count": 0.0, - "routers_loss": 0.003615695284679532, + "routers_loss": 0.0032623792067170143, "skip_count": 0.0, "step": 1156, "text_loss": 0.37752896547317505 @@ -10999,13 +10999,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.07080078125, "learning_rate": 0.0009896975312149454, - "loss": 0.0377, + "loss": 0.0369, "macro_f1": 0.3333333432674408, "num_tokens": 1870216.0, "repeat_count": 0.0, - "routers_loss": 0.01679840311408043, + "routers_loss": 0.015617577359080315, "skip_count": 0.0, "step": 1158, "text_loss": 0.18207129836082458 @@ -11018,13 +11018,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.11669921875, "learning_rate": 0.0009896349293751906, - "loss": 0.0422, + "loss": 0.0423, "macro_f1": 0.3272727429866791, "num_tokens": 1873338.0, "repeat_count": 0.0, - "routers_loss": 0.024936161935329437, + "routers_loss": 0.02250153198838234, "skip_count": 1.0, "step": 1160, "text_loss": 0.548884391784668 @@ -11037,13 +11037,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.1484375, "learning_rate": 0.0009895721399064072, - "loss": 0.0407, + "loss": 0.0388, "macro_f1": 0.32098764181137085, "num_tokens": 1876470.0, "repeat_count": 1.0, - "routers_loss": 0.06472968310117722, + "routers_loss": 0.055204521864652634, "skip_count": 1.0, "step": 1162, "text_loss": 0.48052409291267395 @@ -11056,13 +11056,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.07373046875, "learning_rate": 0.0009895091628326564, - "loss": 0.031, + "loss": 0.0293, "macro_f1": 0.3333333432674408, "num_tokens": 1879354.0, "repeat_count": 0.0, - "routers_loss": 0.009633494541049004, + "routers_loss": 0.009093789383769035, "skip_count": 0.0, "step": 1164, "text_loss": 0.3908069431781769 @@ -11075,13 +11075,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.126953125, + "grad_norm": 0.140625, "learning_rate": 0.000989445998178071, "loss": 0.0323, "macro_f1": 0.3272727429866791, "num_tokens": 1881941.0, "repeat_count": 0.0, - "routers_loss": 0.01458993274718523, + "routers_loss": 0.015086972154676914, "skip_count": 1.0, "step": 1166, "text_loss": 0.4884725511074066 @@ -11094,13 +11094,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.134765625, "learning_rate": 0.0009893826459668558, - "loss": 0.0389, + "loss": 0.0386, "macro_f1": 0.3144654333591461, "num_tokens": 1885374.0, "repeat_count": 0.0, - "routers_loss": 0.06636982411146164, + "routers_loss": 0.06587666273117065, "skip_count": 3.0, "step": 1168, "text_loss": 0.12760137021541595 @@ -11113,13 +11113,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1591796875, "learning_rate": 0.0009893191062232873, - "loss": 0.0325, + "loss": 0.0322, "macro_f1": 0.3333333432674408, "num_tokens": 1888612.0, "repeat_count": 0.0, - "routers_loss": 0.005644182674586773, + "routers_loss": 0.006088624242693186, "skip_count": 0.0, "step": 1170, "text_loss": 0.4821319580078125 @@ -11132,13 +11132,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.1279296875, "learning_rate": 0.0009892553789717143, - "loss": 0.0402, + "loss": 0.0389, "macro_f1": 0.3333333432674408, "num_tokens": 1891463.0, "repeat_count": 0.0, - "routers_loss": 0.010273848660290241, + "routers_loss": 0.010113578289747238, "skip_count": 0.0, "step": 1172, "text_loss": 0.3613642454147339 @@ -11151,13 +11151,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.099609375, + "grad_norm": 0.1025390625, "learning_rate": 0.0009891914642365573, - "loss": 0.0415, + "loss": 0.0404, "macro_f1": 0.3333333432674408, "num_tokens": 1894230.0, "repeat_count": 0.0, - "routers_loss": 0.004529652185738087, + "routers_loss": 0.004947459790855646, "skip_count": 0.0, "step": 1174, "text_loss": 0.5037549138069153 @@ -11170,13 +11170,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2236328125, + "grad_norm": 0.1572265625, "learning_rate": 0.0009891273620423083, - "loss": 0.045, + "loss": 0.0428, "macro_f1": 0.3272727429866791, "num_tokens": 1897294.0, "repeat_count": 1.0, - "routers_loss": 0.024671228602528572, + "routers_loss": 0.026075217872858047, "skip_count": 0.0, "step": 1176, "text_loss": 0.32558977603912354 @@ -11189,13 +11189,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.12158203125, "learning_rate": 0.0009890630724135314, - "loss": 0.0354, + "loss": 0.0351, "macro_f1": 0.3272727429866791, "num_tokens": 1901553.0, "repeat_count": 0.0, - "routers_loss": 0.06466450542211533, + "routers_loss": 0.06650999188423157, "skip_count": 1.0, "step": 1178, "text_loss": 0.23473620414733887 @@ -11208,13 +11208,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1767578125, + "grad_norm": 0.1474609375, "learning_rate": 0.0009889985953748625, - "loss": 0.0278, + "loss": 0.0268, "macro_f1": 0.6666666865348816, "num_tokens": 1904556.0, "repeat_count": 0.0, - "routers_loss": 0.010566026903688908, + "routers_loss": 0.010361116379499435, "skip_count": 1.0, "step": 1180, "text_loss": 0.6927042007446289 @@ -11227,13 +11227,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1103515625, + "grad_norm": 0.103515625, "learning_rate": 0.0009889339309510094, - "loss": 0.037, + "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 1908053.0, "repeat_count": 0.0, - "routers_loss": 0.013842248357832432, + "routers_loss": 0.013286533765494823, "skip_count": 0.0, "step": 1182, "text_loss": 0.19977325201034546 @@ -11246,13 +11246,13 @@ "f1_execute": 0.9387754797935486, "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, - "grad_norm": 0.07373046875, + "grad_norm": 0.058837890625, "learning_rate": 0.0009888690791667518, - "loss": 0.0215, + "loss": 0.0204, "macro_f1": 0.7018141150474548, "num_tokens": 1911754.0, "repeat_count": 2.0, - "routers_loss": 0.122759610414505, + "routers_loss": 0.11920545995235443, "skip_count": 3.0, "step": 1184, "text_loss": 0.4072858691215515 @@ -11265,32 +11265,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.11083984375, "learning_rate": 0.0009888040400469408, - "loss": 0.0402, + "loss": 0.0391, "macro_f1": 0.3272727429866791, "num_tokens": 1914862.0, "repeat_count": 0.0, - "routers_loss": 0.035315629094839096, + "routers_loss": 0.03652849420905113, "skip_count": 1.0, "step": 1186, "text_loss": 0.2654043138027191 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 5.577634282359847, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1689453125, "learning_rate": 0.0009887388136164996, - "loss": 0.034, - "macro_f1": 0.32098764181137085, + "loss": 0.0336, + "macro_f1": 0.5492662787437439, "num_tokens": 1918542.0, "repeat_count": 0.0, - "routers_loss": 0.040048226714134216, + "routers_loss": 0.03991910070180893, "skip_count": 2.0, "step": 1188, "text_loss": 0.21130657196044922 @@ -11298,18 +11298,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 5.587026709715292, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1005859375, + "grad_norm": 0.09521484375, "learning_rate": 0.000988673399900423, - "loss": 0.044, - "macro_f1": 0.3333333432674408, + "loss": 0.0429, + "macro_f1": 0.3272727429866791, "num_tokens": 1921589.0, "repeat_count": 0.0, - "routers_loss": 0.012814820744097233, + "routers_loss": 0.014900135807693005, "skip_count": 0.0, "step": 1190, "text_loss": 0.5519335865974426 @@ -11322,13 +11322,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.1884765625, "learning_rate": 0.0009886077989237777, - "loss": 0.0407, + "loss": 0.0405, "macro_f1": 0.3272727429866791, "num_tokens": 1924320.0, "repeat_count": 0.0, - "routers_loss": 0.05977959558367729, + "routers_loss": 0.06271552294492722, "skip_count": 1.0, "step": 1192, "text_loss": 0.213813915848732 @@ -11341,13 +11341,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, - "grad_norm": 0.1533203125, + "grad_norm": 0.1875, "learning_rate": 0.000988542010711702, - "loss": 0.0334, + "loss": 0.0342, "macro_f1": 0.6225374937057495, "num_tokens": 1927178.0, "repeat_count": 0.0, - "routers_loss": 0.031448643654584885, + "routers_loss": 0.03081391751766205, "skip_count": 5.0, "step": 1194, "text_loss": 0.7524349093437195 @@ -11360,13 +11360,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.265625, + "grad_norm": 0.255859375, "learning_rate": 0.0009884760352894064, - "loss": 0.0523, + "loss": 0.0518, "macro_f1": 0.3333333432674408, "num_tokens": 1930216.0, "repeat_count": 0.0, - "routers_loss": 0.008164947852492332, + "routers_loss": 0.008556773886084557, "skip_count": 0.0, "step": 1196, "text_loss": 0.28230375051498413 @@ -11379,32 +11379,32 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.5, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.1064453125, "learning_rate": 0.0009884098726821726, - "loss": 0.0478, + "loss": 0.0472, "macro_f1": 0.4871794879436493, "num_tokens": 1933312.0, "repeat_count": 3.0, - "routers_loss": 0.04045635461807251, + "routers_loss": 0.05344727262854576, "skip_count": 0.0, "step": 1198, "text_loss": 0.5509607195854187 }, { "acc_repeat": 0.0, - "acc_skip": 0.6666666865348816, - "avg_layers": 26.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, "epoch": 5.633988846492516, - "f1_execute": 0.9600000381469727, + "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, - "f1_skip": 0.800000011920929, - "grad_norm": 0.1240234375, + "f1_skip": 0.5, + "grad_norm": 0.1298828125, "learning_rate": 0.000988343522915354, - "loss": 0.0447, - "macro_f1": 0.5866667032241821, + "loss": 0.0441, + "macro_f1": 0.480392187833786, "num_tokens": 1936160.0, "repeat_count": 1.0, - "routers_loss": 0.06872973591089249, + "routers_loss": 0.07324771583080292, "skip_count": 3.0, "step": 1200, "text_loss": 0.30565372109413147 @@ -11412,18 +11412,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, - "avg_layers": 24.0, + "avg_layers": 25.0, "epoch": 5.64338127384796, - "f1_execute": 0.8695651888847351, + "f1_execute": 0.8936169743537903, "f1_repeat": 0.0, - "f1_skip": 0.4000000059604645, - "grad_norm": 0.25390625, + "f1_skip": 0.444444477558136, + "grad_norm": 0.2470703125, "learning_rate": 0.0009882769860143764, - "loss": 0.0331, - "macro_f1": 0.4231884181499481, + "loss": 0.0317, + "macro_f1": 0.4460204839706421, "num_tokens": 1939266.0, "repeat_count": 0.0, - "routers_loss": 0.20964151620864868, + "routers_loss": 0.18620699644088745, "skip_count": 6.0, "step": 1202, "text_loss": 0.976121723651886 @@ -11442,26 +11442,26 @@ "macro_f1": 0.6666666865348816, "num_tokens": 1942173.0, "repeat_count": 0.0, - "routers_loss": 0.00690250750631094, + "routers_loss": 0.007703613489866257, "skip_count": 1.0, "step": 1204, "text_loss": 0.5647401809692383 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 5.66216612855885, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.14453125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1484375, "learning_rate": 0.0009881433509120036, - "loss": 0.0372, - "macro_f1": 0.32098764181137085, + "loss": 0.0376, + "macro_f1": 0.5492662787437439, "num_tokens": 1945071.0, "repeat_count": 0.0, - "routers_loss": 0.022315658628940582, + "routers_loss": 0.02162683941423893, "skip_count": 2.0, "step": 1206, "text_loss": 0.24229218065738678 @@ -11474,13 +11474,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1083984375, + "grad_norm": 0.0966796875, "learning_rate": 0.0009880762527618176, - "loss": 0.0388, + "loss": 0.0383, "macro_f1": 0.3333333432674408, "num_tokens": 1949060.0, "repeat_count": 0.0, - "routers_loss": 0.017015069723129272, + "routers_loss": 0.017667081207036972, "skip_count": 0.0, "step": 1208, "text_loss": 0.4035970866680145 @@ -11493,13 +11493,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.154296875, "learning_rate": 0.0009880089675798908, - "loss": 0.0372, + "loss": 0.0367, "macro_f1": 0.3333333432674408, "num_tokens": 1951698.0, "repeat_count": 0.0, - "routers_loss": 0.006532609928399324, + "routers_loss": 0.006405784282833338, "skip_count": 0.0, "step": 1210, "text_loss": 0.5319879055023193 @@ -11512,13 +11512,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.09814453125, "learning_rate": 0.0009879414953920071, - "loss": 0.0301, + "loss": 0.0294, "macro_f1": 0.3333333432674408, "num_tokens": 1955266.0, "repeat_count": 0.0, - "routers_loss": 0.009720963425934315, + "routers_loss": 0.009859707206487656, "skip_count": 0.0, "step": 1212, "text_loss": 0.6687407493591309 @@ -11531,32 +11531,32 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1240234375, + "grad_norm": 0.130859375, "learning_rate": 0.0009878738362240219, - "loss": 0.046, + "loss": 0.045, "macro_f1": 0.5492662787437439, "num_tokens": 1958538.0, "repeat_count": 0.0, - "routers_loss": 0.03176085278391838, + "routers_loss": 0.030890554189682007, "skip_count": 2.0, "step": 1214, "text_loss": 0.20820017158985138 }, { "acc_repeat": 0.5, - "acc_skip": 0.5, - "avg_layers": 29.0, + "acc_skip": 0.0, + "avg_layers": 30.0, "epoch": 5.709128265336073, - "f1_execute": 0.9387754797935486, + "f1_execute": 0.9200000166893005, "f1_repeat": 0.5, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.2021484375, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, "learning_rate": 0.000987805990101862, - "loss": 0.0323, - "macro_f1": 0.7018141150474548, + "loss": 0.0317, + "macro_f1": 0.47333335876464844, "num_tokens": 1961419.0, "repeat_count": 2.0, - "routers_loss": 0.08626245707273483, + "routers_loss": 0.10383198410272598, "skip_count": 2.0, "step": 1216, "text_loss": 0.8664976358413696 @@ -11569,13 +11569,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1240234375, + "grad_norm": 0.1435546875, "learning_rate": 0.0009877379570515268, - "loss": 0.0374, + "loss": 0.0366, "macro_f1": 0.3333333432674408, "num_tokens": 1964836.0, "repeat_count": 0.0, - "routers_loss": 0.012099343352019787, + "routers_loss": 0.013376163318753242, "skip_count": 0.0, "step": 1218, "text_loss": 0.4223395884037018 @@ -11588,13 +11588,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.0859375, "learning_rate": 0.0009876697370990865, - "loss": 0.0342, + "loss": 0.0343, "macro_f1": 0.3333333432674408, "num_tokens": 1967620.0, "repeat_count": 0.0, - "routers_loss": 0.007713846862316132, + "routers_loss": 0.008577900938689709, "skip_count": 0.0, "step": 1220, "text_loss": 0.4789901375770569 @@ -11607,13 +11607,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.1728515625, "learning_rate": 0.0009876013302706828, - "loss": 0.0499, + "loss": 0.049, "macro_f1": 0.3333333432674408, "num_tokens": 1971100.0, "repeat_count": 0.0, - "routers_loss": 0.004629489034414291, + "routers_loss": 0.004730266984552145, "skip_count": 0.0, "step": 1222, "text_loss": 0.6799837946891785 @@ -11626,13 +11626,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08837890625, + "grad_norm": 0.08349609375, "learning_rate": 0.0009875327365925295, - "loss": 0.035, + "loss": 0.0341, "macro_f1": 0.3333333432674408, "num_tokens": 1974408.0, "repeat_count": 0.0, - "routers_loss": 0.010654795914888382, + "routers_loss": 0.010849526152014732, "skip_count": 0.0, "step": 1224, "text_loss": 0.18967926502227783 @@ -11640,18 +11640,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 26.0, + "avg_layers": 27.0, "epoch": 5.756090402113296, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.169921875, "learning_rate": 0.0009874639560909118, - "loss": 0.0516, - "macro_f1": 0.31446540355682373, + "loss": 0.0498, + "macro_f1": 0.32098767161369324, "num_tokens": 1977046.0, "repeat_count": 0.0, - "routers_loss": 0.05963074415922165, + "routers_loss": 0.04841252416372299, "skip_count": 1.0, "step": 1226, "text_loss": 0.6133310198783875 @@ -11664,13 +11664,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1328125, + "grad_norm": 0.1318359375, "learning_rate": 0.0009873949887921867, - "loss": 0.04, + "loss": 0.0402, "macro_f1": 0.3272727429866791, "num_tokens": 1980330.0, "repeat_count": 0.0, - "routers_loss": 0.028920643031597137, + "routers_loss": 0.029638588428497314, "skip_count": 1.0, "step": 1228, "text_loss": 0.15649555623531342 @@ -11678,18 +11678,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 5.774875256824186, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10595703125, + "grad_norm": 0.1103515625, "learning_rate": 0.0009873258347227823, - "loss": 0.0327, - "macro_f1": 0.3333333432674408, + "loss": 0.0331, + "macro_f1": 0.3272727429866791, "num_tokens": 1983173.0, "repeat_count": 0.0, - "routers_loss": 0.006852717138826847, + "routers_loss": 0.009955910965800285, "skip_count": 0.0, "step": 1230, "text_loss": 0.4741005599498749 @@ -11702,13 +11702,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.0849609375, "learning_rate": 0.0009872564939091989, - "loss": 0.0346, + "loss": 0.0342, "macro_f1": 0.3333333432674408, "num_tokens": 1986825.0, "repeat_count": 0.0, - "routers_loss": 0.010968753136694431, + "routers_loss": 0.010205300524830818, "skip_count": 0.0, "step": 1232, "text_loss": 0.5315462350845337 @@ -11721,13 +11721,13 @@ "f1_execute": 0.9302325248718262, "f1_repeat": 1.0, "f1_skip": 0.7272727489471436, - "grad_norm": 0.1240234375, + "grad_norm": 0.11865234375, "learning_rate": 0.0009871869663780077, - "loss": 0.0344, + "loss": 0.0336, "macro_f1": 0.8858351111412048, "num_tokens": 1990448.0, "repeat_count": 1.0, - "routers_loss": 0.0906950980424881, + "routers_loss": 0.09120134264230728, "skip_count": 7.0, "step": 1234, "text_loss": 0.6187508702278137 @@ -11740,13 +11740,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.125, "learning_rate": 0.0009871172521558522, - "loss": 0.0484, + "loss": 0.0475, "macro_f1": 0.6666666865348816, "num_tokens": 1993474.0, "repeat_count": 0.0, - "routers_loss": 0.016306072473526, + "routers_loss": 0.016188839450478554, "skip_count": 1.0, "step": 1236, "text_loss": 0.20783066749572754 @@ -11759,13 +11759,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.208984375, + "grad_norm": 0.216796875, "learning_rate": 0.0009870473512694465, - "loss": 0.038, + "loss": 0.0373, "macro_f1": 0.5934640765190125, "num_tokens": 1996536.0, "repeat_count": 0.0, - "routers_loss": 0.05804471671581268, + "routers_loss": 0.05046704784035683, "skip_count": 3.0, "step": 1238, "text_loss": 0.247748002409935 @@ -11773,18 +11773,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.5, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 5.821837393601409, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.091796875, + "f1_skip": 0.5, + "grad_norm": 0.09033203125, "learning_rate": 0.0009869772637455772, - "loss": 0.0256, - "macro_f1": 0.5492662787437439, + "loss": 0.0251, + "macro_f1": 0.4871794879436493, "num_tokens": 1999530.0, "repeat_count": 0.0, - "routers_loss": 0.045395996421575546, + "routers_loss": 0.044926248490810394, "skip_count": 2.0, "step": 1240, "text_loss": 0.26001980900764465 @@ -11797,13 +11797,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11767578125, + "grad_norm": 0.1513671875, "learning_rate": 0.000986906989611102, - "loss": 0.0438, + "loss": 0.0446, "macro_f1": 0.3272727429866791, "num_tokens": 2002782.0, "repeat_count": 0.0, - "routers_loss": 0.020834850147366524, + "routers_loss": 0.025911526754498482, "skip_count": 0.0, "step": 1242, "text_loss": 0.9009982943534851 @@ -11816,13 +11816,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1123046875, + "grad_norm": 0.115234375, "learning_rate": 0.0009868365288929492, - "loss": 0.0377, + "loss": 0.0371, "macro_f1": 0.3333333432674408, "num_tokens": 2005331.0, "repeat_count": 0.0, - "routers_loss": 0.005241698585450649, + "routers_loss": 0.0043760035187006, "skip_count": 0.0, "step": 1244, "text_loss": 0.5547386407852173 @@ -11835,13 +11835,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0966796875, + "grad_norm": 0.1005859375, "learning_rate": 0.0009867658816181206, - "loss": 0.038, + "loss": 0.0374, "macro_f1": 0.3333333432674408, "num_tokens": 2008115.0, "repeat_count": 0.0, - "routers_loss": 0.008387803100049496, + "routers_loss": 0.009227181784808636, "skip_count": 0.0, "step": 1246, "text_loss": 1.0067731142044067 @@ -11854,13 +11854,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.126953125, "learning_rate": 0.000986695047813688, - "loss": 0.0256, + "loss": 0.0261, "macro_f1": 0.3272727429866791, "num_tokens": 2011137.0, "repeat_count": 1.0, - "routers_loss": 0.02261745184659958, + "routers_loss": 0.023822437971830368, "skip_count": 0.0, "step": 1248, "text_loss": 0.30058956146240234 @@ -11873,32 +11873,32 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.10693359375, + "grad_norm": 0.1044921875, "learning_rate": 0.0009866240275067948, - "loss": 0.0435, + "loss": 0.044, "macro_f1": 0.47333335876464844, "num_tokens": 2014159.0, "repeat_count": 2.0, - "routers_loss": 0.21678555011749268, + "routers_loss": 0.21523773670196533, "skip_count": 3.0, "step": 1250, "text_loss": 0.39072203636169434 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 5.878191957734077, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1201171875, "learning_rate": 0.0009865528207246563, - "loss": 0.0358, - "macro_f1": 0.32098764181137085, + "loss": 0.0351, + "macro_f1": 0.5492662787437439, "num_tokens": 2017731.0, "repeat_count": 0.0, - "routers_loss": 0.06554054468870163, + "routers_loss": 0.06184682995080948, "skip_count": 2.0, "step": 1252, "text_loss": 0.35751575231552124 @@ -11911,13 +11911,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.166015625, "learning_rate": 0.000986481427494559, - "loss": 0.0337, + "loss": 0.0336, "macro_f1": 0.3333333432674408, "num_tokens": 2020485.0, "repeat_count": 0.0, - "routers_loss": 0.007237187586724758, + "routers_loss": 0.007573372684419155, "skip_count": 0.0, "step": 1254, "text_loss": 0.4061077833175659 @@ -11930,13 +11930,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1845703125, + "grad_norm": 0.1708984375, "learning_rate": 0.000986409847843861, - "loss": 0.0387, + "loss": 0.0382, "macro_f1": 0.3272727429866791, "num_tokens": 2024149.0, "repeat_count": 1.0, - "routers_loss": 0.08003793656826019, + "routers_loss": 0.07447971403598785, "skip_count": 0.0, "step": 1256, "text_loss": 0.41876497864723206 @@ -11949,13 +11949,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.17578125, "learning_rate": 0.000986338081799992, - "loss": 0.0341, + "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 2026545.0, "repeat_count": 0.0, - "routers_loss": 0.006424390245229006, + "routers_loss": 0.006609147880226374, "skip_count": 0.0, "step": 1258, "text_loss": 0.4673794209957123 @@ -11968,13 +11968,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10009765625, + "grad_norm": 0.1123046875, "learning_rate": 0.0009862661293904523, - "loss": 0.0482, + "loss": 0.0498, "macro_f1": 0.32098764181137085, "num_tokens": 2029581.0, "repeat_count": 0.0, - "routers_loss": 0.10797854512929916, + "routers_loss": 0.10624702274799347, "skip_count": 2.0, "step": 1260, "text_loss": 0.3483233153820038 @@ -11987,13 +11987,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.111328125, + "grad_norm": 0.1201171875, "learning_rate": 0.0009861939906428145, - "loss": 0.053, + "loss": 0.0525, "macro_f1": 0.3333333432674408, "num_tokens": 2033936.0, "repeat_count": 0.0, - "routers_loss": 0.006734046153724194, + "routers_loss": 0.007944886572659016, "skip_count": 0.0, "step": 1262, "text_loss": 0.16362667083740234 @@ -12006,13 +12006,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.11669921875, "learning_rate": 0.0009861216655847225, - "loss": 0.0373, + "loss": 0.0376, "macro_f1": 0.6666666865348816, "num_tokens": 2037876.0, "repeat_count": 1.0, - "routers_loss": 0.00564212491735816, + "routers_loss": 0.007004092447459698, "skip_count": 0.0, "step": 1264, "text_loss": 0.43228110671043396 @@ -12025,13 +12025,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1044921875, + "grad_norm": 0.1005859375, "learning_rate": 0.0009860491542438912, - "loss": 0.0472, + "loss": 0.047, "macro_f1": 0.3272727429866791, "num_tokens": 2040842.0, "repeat_count": 0.0, - "routers_loss": 0.026137735694646835, + "routers_loss": 0.026916226372122765, "skip_count": 1.0, "step": 1266, "text_loss": 0.5901188850402832 @@ -12044,13 +12044,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08203125, + "grad_norm": 0.0986328125, "learning_rate": 0.000985976456648107, - "loss": 0.0343, + "loss": 0.0353, "macro_f1": 0.3333333432674408, "num_tokens": 2043890.0, "repeat_count": 0.0, - "routers_loss": 0.0069669694639742374, + "routers_loss": 0.007325216196477413, "skip_count": 0.0, "step": 1268, "text_loss": 0.8780109882354736 @@ -12063,13 +12063,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.1142578125, + "grad_norm": 0.10205078125, "learning_rate": 0.000985903572825228, - "loss": 0.0323, + "loss": 0.0306, "macro_f1": 0.4871794879436493, "num_tokens": 2048848.0, "repeat_count": 0.0, - "routers_loss": 0.05618409812450409, + "routers_loss": 0.05007527023553848, "skip_count": 2.0, "step": 1270, "text_loss": 0.5863722562789917 @@ -12084,11 +12084,11 @@ "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.000985830502803183, - "loss": 0.0391, + "loss": 0.0396, "macro_f1": 0.3272727429866791, "num_tokens": 2051561.0, "repeat_count": 0.0, - "routers_loss": 0.025900620967149734, + "routers_loss": 0.023995524272322655, "skip_count": 0.0, "step": 1272, "text_loss": 0.7460709810256958 @@ -12101,13 +12101,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.10205078125, "learning_rate": 0.0009857572466099732, - "loss": 0.0426, + "loss": 0.0431, "macro_f1": 0.3333333432674408, "num_tokens": 2054752.0, "repeat_count": 0.0, - "routers_loss": 0.006236737594008446, + "routers_loss": 0.006928362417966127, "skip_count": 0.0, "step": 1274, "text_loss": 0.5130293369293213 @@ -12120,13 +12120,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.171875, + "grad_norm": 0.162109375, "learning_rate": 0.0009856838042736698, - "loss": 0.0503, + "loss": 0.0501, "macro_f1": 0.3333333432674408, "num_tokens": 2058151.0, "repeat_count": 0.0, - "routers_loss": 0.006367063149809837, + "routers_loss": 0.006969396956264973, "skip_count": 0.0, "step": 1276, "text_loss": 0.5911393761634827 @@ -12139,13 +12139,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1103515625, + "grad_norm": 0.1357421875, "learning_rate": 0.0009856101758224166, - "loss": 0.0442, + "loss": 0.0441, "macro_f1": 0.3333333432674408, "num_tokens": 2061012.0, "repeat_count": 0.0, - "routers_loss": 0.003392914542928338, + "routers_loss": 0.003499418031424284, "skip_count": 0.0, "step": 1278, "text_loss": 0.25347545742988586 @@ -12158,13 +12158,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0693359375, "learning_rate": 0.000985536361284428, - "loss": 0.0231, + "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2064597.0, "repeat_count": 0.0, - "routers_loss": 0.007376343477517366, + "routers_loss": 0.007856054231524467, "skip_count": 0.0, "step": 1280, "text_loss": 0.7476963400840759 @@ -12177,13 +12177,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.0888671875, "learning_rate": 0.0009854623606879898, - "loss": 0.0243, + "loss": 0.0245, "macro_f1": 0.3272727429866791, "num_tokens": 2067972.0, "repeat_count": 0.0, - "routers_loss": 0.02773376554250717, + "routers_loss": 0.02617792971432209, "skip_count": 1.0, "step": 1282, "text_loss": 0.5775872468948364 @@ -12196,13 +12196,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.09033203125, "learning_rate": 0.000985388174061459, - "loss": 0.0363, + "loss": 0.0356, "macro_f1": 0.32098767161369324, "num_tokens": 2071812.0, "repeat_count": 0.0, - "routers_loss": 0.03535797819495201, + "routers_loss": 0.035979997366666794, "skip_count": 1.0, "step": 1284, "text_loss": 0.2933400869369507 @@ -12215,13 +12215,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08447265625, "learning_rate": 0.0009853138014332646, - "loss": 0.0269, + "loss": 0.0273, "macro_f1": 0.3333333432674408, "num_tokens": 2074868.0, "repeat_count": 0.0, - "routers_loss": 0.004910993855446577, + "routers_loss": 0.005142854526638985, "skip_count": 0.0, "step": 1286, "text_loss": 0.29085102677345276 @@ -12234,13 +12234,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0888671875, + "grad_norm": 0.09033203125, "learning_rate": 0.0009852392428319058, - "loss": 0.0301, + "loss": 0.0306, "macro_f1": 0.3333333432674408, "num_tokens": 2078225.0, "repeat_count": 0.0, - "routers_loss": 0.0032444109674543142, + "routers_loss": 0.0032799106556922197, "skip_count": 0.0, "step": 1288, "text_loss": 0.7293626070022583 @@ -12253,13 +12253,13 @@ "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.0947265625, + "grad_norm": 0.08935546875, "learning_rate": 0.0009851644982859537, - "loss": 0.0272, + "loss": 0.0273, "macro_f1": 0.480392187833786, "num_tokens": 2081495.0, "repeat_count": 1.0, - "routers_loss": 0.12451831251382828, + "routers_loss": 0.12224318832159042, "skip_count": 3.0, "step": 1290, "text_loss": 0.26125892996788025 @@ -12272,13 +12272,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.1435546875, "learning_rate": 0.0009850895678240508, - "loss": 0.0289, + "loss": 0.0283, "macro_f1": 0.6666666865348816, "num_tokens": 2084390.0, "repeat_count": 1.0, - "routers_loss": 0.011074979789555073, + "routers_loss": 0.010662888176739216, "skip_count": 0.0, "step": 1292, "text_loss": 0.3510764539241791 @@ -12291,13 +12291,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1806640625, + "grad_norm": 0.1689453125, "learning_rate": 0.0009850144514749104, - "loss": 0.0336, + "loss": 0.0332, "macro_f1": 0.5492662787437439, "num_tokens": 2087210.0, "repeat_count": 0.0, - "routers_loss": 0.01774786226451397, + "routers_loss": 0.01979079470038414, "skip_count": 2.0, "step": 1294, "text_loss": 0.40202176570892334 @@ -12310,13 +12310,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.11669921875, "learning_rate": 0.000984939149267317, - "loss": 0.0251, + "loss": 0.0253, "macro_f1": 0.6666666865348816, "num_tokens": 2090777.0, "repeat_count": 0.0, - "routers_loss": 0.0052874404937028885, + "routers_loss": 0.005172552540898323, "skip_count": 1.0, "step": 1296, "text_loss": 0.5275651216506958 @@ -12329,13 +12329,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10107421875, + "grad_norm": 0.095703125, "learning_rate": 0.0009848636612301272, - "loss": 0.031, + "loss": 0.0299, "macro_f1": 0.3333333432674408, "num_tokens": 2094248.0, "repeat_count": 0.0, - "routers_loss": 0.0034106262028217316, + "routers_loss": 0.0029599082190543413, "skip_count": 0.0, "step": 1298, "text_loss": 0.4517653286457062 @@ -12348,13 +12348,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2177734375, + "grad_norm": 0.23046875, "learning_rate": 0.0009847879873922675, "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 2097139.0, "repeat_count": 0.0, - "routers_loss": 0.010383229702711105, + "routers_loss": 0.011455860920250416, "skip_count": 0.0, "step": 1300, "text_loss": 0.16888445615768433 @@ -12367,13 +12367,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0966796875, + "grad_norm": 0.09619140625, "learning_rate": 0.0009847121277827366, - "loss": 0.0304, + "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 2100415.0, "repeat_count": 0.0, - "routers_loss": 0.0076674893498420715, + "routers_loss": 0.008091195486485958, "skip_count": 0.0, "step": 1302, "text_loss": 0.40061676502227783 @@ -12386,13 +12386,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.109375, + "grad_norm": 0.1123046875, "learning_rate": 0.000984636082430604, - "loss": 0.0287, + "loss": 0.0285, "macro_f1": 0.3333333432674408, "num_tokens": 2103285.0, "repeat_count": 0.0, - "routers_loss": 0.010486516170203686, + "routers_loss": 0.009593960829079151, "skip_count": 0.0, "step": 1304, "text_loss": 0.7211073637008667 @@ -12405,13 +12405,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1142578125, + "grad_norm": 0.107421875, "learning_rate": 0.0009845598513650103, - "loss": 0.0237, + "loss": 0.0231, "macro_f1": 0.3333333432674408, "num_tokens": 2106255.0, "repeat_count": 0.0, - "routers_loss": 0.0023783023934811354, + "routers_loss": 0.0023068038281053305, "skip_count": 0.0, "step": 1306, "text_loss": 0.7077119946479797 @@ -12424,13 +12424,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.171875, "learning_rate": 0.0009844834346151674, - "loss": 0.044, + "loss": 0.043, "macro_f1": 0.3333333432674408, "num_tokens": 2109305.0, "repeat_count": 0.0, - "routers_loss": 0.006714595016092062, + "routers_loss": 0.007703019306063652, "skip_count": 0.0, "step": 1308, "text_loss": 0.3534316122531891 @@ -12443,13 +12443,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.1025390625, "learning_rate": 0.0009844068322103585, - "loss": 0.0281, + "loss": 0.0287, "macro_f1": 0.3272727429866791, "num_tokens": 2112216.0, "repeat_count": 0.0, - "routers_loss": 0.022373953834176064, + "routers_loss": 0.023549847304821014, "skip_count": 1.0, "step": 1310, "text_loss": 0.6792599558830261 @@ -12462,13 +12462,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1513671875, + "grad_norm": 0.150390625, "learning_rate": 0.0009843300441799378, - "loss": 0.0205, + "loss": 0.0211, "macro_f1": 0.3333333432674408, "num_tokens": 2114925.0, "repeat_count": 0.0, - "routers_loss": 0.007452849764376879, + "routers_loss": 0.007605871185660362, "skip_count": 0.0, "step": 1312, "text_loss": 0.1571389138698578 @@ -12481,13 +12481,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.134765625, "learning_rate": 0.0009842530705533304, - "loss": 0.0251, + "loss": 0.0253, "macro_f1": 0.3272727429866791, "num_tokens": 2117744.0, "repeat_count": 0.0, - "routers_loss": 0.016413308680057526, + "routers_loss": 0.014964760281145573, "skip_count": 0.0, "step": 1314, "text_loss": 0.7840361595153809 @@ -12500,13 +12500,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.10595703125, "learning_rate": 0.000984175911360033, - "loss": 0.0243, + "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2120848.0, "repeat_count": 0.0, - "routers_loss": 0.004676427226513624, + "routers_loss": 0.004663798492401838, "skip_count": 0.0, "step": 1316, "text_loss": 0.536246120929718 @@ -12519,13 +12519,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.123046875, + "grad_norm": 0.1201171875, "learning_rate": 0.000984098566629613, - "loss": 0.0284, + "loss": 0.0288, "macro_f1": 0.5492662787437439, "num_tokens": 2123651.0, "repeat_count": 0.0, - "routers_loss": 0.024454625323414803, + "routers_loss": 0.022852955386042595, "skip_count": 2.0, "step": 1318, "text_loss": 0.43372172117233276 @@ -12538,13 +12538,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.07958984375, "learning_rate": 0.0009840210363917087, - "loss": 0.022, + "loss": 0.0216, "macro_f1": 0.3333333432674408, "num_tokens": 2128011.0, "repeat_count": 0.0, - "routers_loss": 0.013495884835720062, + "routers_loss": 0.012578422203660011, "skip_count": 0.0, "step": 1320, "text_loss": 0.28190380334854126 @@ -12557,13 +12557,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.10986328125, "learning_rate": 0.0009839433206760306, - "loss": 0.0213, + "loss": 0.0204, "macro_f1": 0.3333333432674408, "num_tokens": 2131035.0, "repeat_count": 0.0, - "routers_loss": 0.006397814955562353, + "routers_loss": 0.006863643880933523, "skip_count": 0.0, "step": 1322, "text_loss": 0.6340444087982178 @@ -12576,13 +12576,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1796875, "learning_rate": 0.0009838654195123589, - "loss": 0.0246, + "loss": 0.0243, "macro_f1": 0.3333333432674408, "num_tokens": 2133856.0, "repeat_count": 0.0, - "routers_loss": 0.00503434706479311, + "routers_loss": 0.00468854233622551, "skip_count": 0.0, "step": 1324, "text_loss": 0.5138425827026367 @@ -12595,13 +12595,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1240234375, + "grad_norm": 0.115234375, "learning_rate": 0.0009837873329305458, - "loss": 0.0402, + "loss": 0.0396, "macro_f1": 0.6666666865348816, "num_tokens": 2136451.0, "repeat_count": 1.0, - "routers_loss": 0.005150494631379843, + "routers_loss": 0.005731126759201288, "skip_count": 0.0, "step": 1326, "text_loss": 0.742124617099762 @@ -12614,13 +12614,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1533203125, + "grad_norm": 0.17578125, "learning_rate": 0.000983709060960514, - "loss": 0.041, + "loss": 0.0416, "macro_f1": 0.3333333432674408, "num_tokens": 2139496.0, "repeat_count": 0.0, - "routers_loss": 0.004570818971842527, + "routers_loss": 0.0056343949399888515, "skip_count": 0.0, "step": 1328, "text_loss": 0.7317464351654053 @@ -12633,13 +12633,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09326171875, + "grad_norm": 0.10791015625, "learning_rate": 0.0009836306036322576, - "loss": 0.0314, + "loss": 0.0312, "macro_f1": 0.3333333432674408, "num_tokens": 2143120.0, "repeat_count": 0.0, - "routers_loss": 0.005299333017319441, + "routers_loss": 0.005127966403961182, "skip_count": 0.0, "step": 1330, "text_loss": 0.538652241230011 @@ -12652,13 +12652,13 @@ "f1_execute": 0.9130434989929199, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.111328125, + "grad_norm": 0.11083984375, "learning_rate": 0.0009835519609758415, - "loss": 0.0303, + "loss": 0.0301, "macro_f1": 0.590062141418457, "num_tokens": 2145807.0, "repeat_count": 3.0, - "routers_loss": 0.168672576546669, + "routers_loss": 0.1673707216978073, "skip_count": 4.0, "step": 1332, "text_loss": 0.3498198091983795 @@ -12671,32 +12671,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009834731330214017, - "loss": 0.0302, + "loss": 0.0293, "macro_f1": 0.3272727429866791, "num_tokens": 2148397.0, "repeat_count": 1.0, - "routers_loss": 0.05187409743666649, + "routers_loss": 0.04026653990149498, "skip_count": 0.0, "step": 1334, "text_loss": 0.8153424859046936 }, { "acc_repeat": 1.0, - "acc_skip": 1.0, - "avg_layers": 26.0, + "acc_skip": 0.800000011920929, + "avg_layers": 27.0, "epoch": 6.272380393307896, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.8999999761581421, "f1_repeat": 0.6666666865348816, - "f1_skip": 0.9090909361839294, - "grad_norm": 0.1669921875, + "f1_skip": 0.8000000715255737, + "grad_norm": 0.16015625, "learning_rate": 0.0009833941197991455, - "loss": 0.0339, - "macro_f1": 0.8329448699951172, + "loss": 0.0329, + "macro_f1": 0.7888889312744141, "num_tokens": 2152226.0, "repeat_count": 2.0, - "routers_loss": 0.05786697566509247, + "routers_loss": 0.05481519177556038, "skip_count": 5.0, "step": 1336, "text_loss": 0.7802760004997253 @@ -12709,13 +12709,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16796875, + "grad_norm": 0.1474609375, "learning_rate": 0.0009833149213393506, - "loss": 0.0315, + "loss": 0.0304, "macro_f1": 0.3272727429866791, "num_tokens": 2156023.0, "repeat_count": 0.0, - "routers_loss": 0.017055779695510864, + "routers_loss": 0.01760484278202057, "skip_count": 0.0, "step": 1338, "text_loss": 0.19721226394176483 @@ -12728,13 +12728,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.099609375, + "grad_norm": 0.11474609375, "learning_rate": 0.000983235537672366, - "loss": 0.0249, + "loss": 0.0256, "macro_f1": 0.3333333432674408, "num_tokens": 2160037.0, "repeat_count": 0.0, - "routers_loss": 0.011614206247031689, + "routers_loss": 0.013206037692725658, "skip_count": 0.0, "step": 1340, "text_loss": 0.5003817081451416 @@ -12747,13 +12747,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.1474609375, "learning_rate": 0.000983155968828612, - "loss": 0.033, + "loss": 0.0315, "macro_f1": 0.6666666865348816, "num_tokens": 2163910.0, "repeat_count": 1.0, - "routers_loss": 0.012611300684511662, + "routers_loss": 0.01256406120955944, "skip_count": 0.0, "step": 1342, "text_loss": 0.5996923446655273 @@ -12766,13 +12766,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.11962890625, "learning_rate": 0.0009830762148385793, - "loss": 0.0315, + "loss": 0.0313, "macro_f1": 0.3272727429866791, "num_tokens": 2166921.0, "repeat_count": 0.0, - "routers_loss": 0.018757276237010956, + "routers_loss": 0.015086234547197819, "skip_count": 1.0, "step": 1344, "text_loss": 0.45356282591819763 @@ -12785,13 +12785,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08447265625, "learning_rate": 0.0009829962757328297, - "loss": 0.0229, + "loss": 0.0223, "macro_f1": 0.32098764181137085, "num_tokens": 2170135.0, "repeat_count": 0.0, - "routers_loss": 0.08197146654129028, + "routers_loss": 0.07909081131219864, "skip_count": 2.0, "step": 1346, "text_loss": 0.2874644994735718 @@ -12804,13 +12804,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.068359375, "learning_rate": 0.0009829161515419959, - "loss": 0.0256, + "loss": 0.0246, "macro_f1": 0.6666666865348816, "num_tokens": 2173029.0, "repeat_count": 0.0, - "routers_loss": 0.014122758992016315, + "routers_loss": 0.013569854199886322, "skip_count": 2.0, "step": 1348, "text_loss": 0.25533875823020935 @@ -12823,13 +12823,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06005859375, + "grad_norm": 0.064453125, "learning_rate": 0.0009828358422967823, - "loss": 0.0221, + "loss": 0.0226, "macro_f1": 0.32098764181137085, "num_tokens": 2176605.0, "repeat_count": 1.0, - "routers_loss": 0.08215996623039246, + "routers_loss": 0.08111091703176498, "skip_count": 1.0, "step": 1350, "text_loss": 0.32827726006507874 @@ -12842,13 +12842,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09375, + "grad_norm": 0.091796875, "learning_rate": 0.0009827553480279627, - "loss": 0.0312, + "loss": 0.03, "macro_f1": 0.5427350401878357, "num_tokens": 2179406.0, "repeat_count": 0.0, - "routers_loss": 0.026304977014660835, + "routers_loss": 0.026550088077783585, "skip_count": 2.0, "step": 1352, "text_loss": 0.2966301143169403 @@ -12861,13 +12861,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.0791015625, "learning_rate": 0.0009826746687663832, - "loss": 0.0302, + "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 2182353.0, "repeat_count": 0.0, - "routers_loss": 0.003616038942709565, + "routers_loss": 0.003914554137736559, "skip_count": 0.0, "step": 1354, "text_loss": 0.7596251964569092 @@ -12880,13 +12880,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.0849609375, + "grad_norm": 0.0859375, "learning_rate": 0.0009825938045429602, - "loss": 0.0323, + "loss": 0.0324, "macro_f1": 0.5866667032241821, "num_tokens": 2185786.0, "repeat_count": 1.0, - "routers_loss": 0.060399893671274185, + "routers_loss": 0.059612665325403214, "skip_count": 3.0, "step": 1356, "text_loss": 0.12325898557901382 @@ -12899,13 +12899,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10302734375, + "grad_norm": 0.10009765625, "learning_rate": 0.0009825127553886807, - "loss": 0.0384, + "loss": 0.0375, "macro_f1": 0.3333333432674408, "num_tokens": 2190157.0, "repeat_count": 0.0, - "routers_loss": 0.007164204493165016, + "routers_loss": 0.0071132429875433445, "skip_count": 0.0, "step": 1358, "text_loss": 0.9287898540496826 @@ -12918,13 +12918,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0947265625, + "grad_norm": 0.0986328125, "learning_rate": 0.0009824315213346033, - "loss": 0.0343, + "loss": 0.0348, "macro_f1": 0.3333333432674408, "num_tokens": 2193077.0, "repeat_count": 0.0, - "routers_loss": 0.010965060442686081, + "routers_loss": 0.009611099027097225, "skip_count": 0.0, "step": 1360, "text_loss": 0.20427259802818298 @@ -12937,13 +12937,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.10888671875, "learning_rate": 0.0009823501024118569, - "loss": 0.0276, + "loss": 0.0285, "macro_f1": 0.3333333432674408, "num_tokens": 2196494.0, "repeat_count": 0.0, - "routers_loss": 0.00784136913716793, + "routers_loss": 0.006913455203175545, "skip_count": 0.0, "step": 1362, "text_loss": 0.574759840965271 @@ -12956,13 +12956,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.095703125, + "grad_norm": 0.10595703125, "learning_rate": 0.0009822684986516411, - "loss": 0.0251, + "loss": 0.0245, "macro_f1": 0.3333333432674408, "num_tokens": 2199839.0, "repeat_count": 0.0, - "routers_loss": 0.009101065807044506, + "routers_loss": 0.009208920411765575, "skip_count": 0.0, "step": 1364, "text_loss": 0.42422571778297424 @@ -12970,37 +12970,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 6.413266803639566, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.0927734375, "learning_rate": 0.000982186710085227, - "loss": 0.0206, - "macro_f1": 0.31446540355682373, + "loss": 0.0208, + "macro_f1": 0.32098764181137085, "num_tokens": 2203212.0, "repeat_count": 1.0, - "routers_loss": 0.05967295169830322, + "routers_loss": 0.059975091367959976, "skip_count": 1.0, "step": 1366, "text_loss": 0.29213017225265503 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 26.0, + "acc_skip": 0.25, + "avg_layers": 27.0, "epoch": 6.42265923099501, - "f1_execute": 0.9600000381469727, + "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.1875, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.181640625, "learning_rate": 0.0009821047367439561, - "loss": 0.0356, - "macro_f1": 0.542222261428833, + "loss": 0.0358, + "macro_f1": 0.44705885648727417, "num_tokens": 2206240.0, "repeat_count": 0.0, - "routers_loss": 0.05016552656888962, + "routers_loss": 0.048244867473840714, "skip_count": 4.0, "step": 1368, "text_loss": 0.3072395324707031 @@ -13013,13 +13013,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.11181640625, "learning_rate": 0.0009820225786592405, - "loss": 0.038, + "loss": 0.0375, "macro_f1": 0.3272727429866791, "num_tokens": 2209903.0, "repeat_count": 1.0, - "routers_loss": 0.02483060024678707, + "routers_loss": 0.026068156585097313, "skip_count": 0.0, "step": 1370, "text_loss": 0.5961400270462036 @@ -13032,13 +13032,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.109375, "learning_rate": 0.0009819402358625634, - "loss": 0.0373, + "loss": 0.0366, "macro_f1": 0.3272727429866791, "num_tokens": 2213439.0, "repeat_count": 0.0, - "routers_loss": 0.01982821337878704, + "routers_loss": 0.022615568712353706, "skip_count": 1.0, "step": 1372, "text_loss": 0.19375644624233246 @@ -13051,13 +13051,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1298828125, + "grad_norm": 0.1240234375, "learning_rate": 0.000981857708385479, - "loss": 0.0353, + "loss": 0.0346, "macro_f1": 0.3333333432674408, "num_tokens": 2216457.0, "repeat_count": 0.0, - "routers_loss": 0.004753436427563429, + "routers_loss": 0.005855285096913576, "skip_count": 0.0, "step": 1374, "text_loss": 0.5123368501663208 @@ -13070,13 +13070,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09912109375, + "grad_norm": 0.09423828125, "learning_rate": 0.0009817749962596114, - "loss": 0.0246, + "loss": 0.0249, "macro_f1": 0.3272727429866791, "num_tokens": 2219975.0, "repeat_count": 1.0, - "routers_loss": 0.06541594862937927, + "routers_loss": 0.0651634931564331, "skip_count": 0.0, "step": 1376, "text_loss": 0.5999220609664917 @@ -13089,13 +13089,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.09912109375, "learning_rate": 0.0009816920995166568, - "loss": 0.0376, + "loss": 0.0371, "macro_f1": 0.6666666865348816, "num_tokens": 2222833.0, "repeat_count": 1.0, - "routers_loss": 0.01156456395983696, + "routers_loss": 0.011408994905650616, "skip_count": 0.0, "step": 1378, "text_loss": 0.5323230624198914 @@ -13108,13 +13108,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2392578125, + "grad_norm": 0.205078125, "learning_rate": 0.0009816090181883807, - "loss": 0.033, + "loss": 0.0313, "macro_f1": 0.32098764181137085, "num_tokens": 2225842.0, "repeat_count": 0.0, - "routers_loss": 0.05175521597266197, + "routers_loss": 0.039720915257930756, "skip_count": 2.0, "step": 1380, "text_loss": 0.23363439738750458 @@ -13127,13 +13127,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.12255859375, "learning_rate": 0.0009815257523066204, - "loss": 0.0251, + "loss": 0.0249, "macro_f1": 0.3333333432674408, "num_tokens": 2229430.0, "repeat_count": 0.0, - "routers_loss": 0.002684591803699732, + "routers_loss": 0.002765297656878829, "skip_count": 0.0, "step": 1382, "text_loss": 0.718977689743042 @@ -13146,13 +13146,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.12890625, + "grad_norm": 0.130859375, "learning_rate": 0.0009814423019032835, - "loss": 0.0397, + "loss": 0.0396, "macro_f1": 0.5492662787437439, "num_tokens": 2232594.0, "repeat_count": 2.0, - "routers_loss": 0.054509978741407394, + "routers_loss": 0.05362323671579361, "skip_count": 0.0, "step": 1384, "text_loss": 0.6392166614532471 @@ -13165,13 +13165,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.150390625, "learning_rate": 0.0009813586670103483, "loss": 0.0426, "macro_f1": 0.6603773832321167, "num_tokens": 2236327.0, "repeat_count": 1.0, - "routers_loss": 0.04031623527407646, + "routers_loss": 0.031728316098451614, "skip_count": 1.0, "step": 1386, "text_loss": 0.5951619148254395 @@ -13184,13 +13184,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1142578125, + "grad_norm": 0.126953125, "learning_rate": 0.0009812748476598638, - "loss": 0.0308, + "loss": 0.031, "macro_f1": 0.5492662787437439, "num_tokens": 2239746.0, "repeat_count": 0.0, - "routers_loss": 0.039687711745500565, + "routers_loss": 0.03981253132224083, "skip_count": 2.0, "step": 1388, "text_loss": 0.22756551206111908 @@ -13203,13 +13203,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.12353515625, + "grad_norm": 0.12451171875, "learning_rate": 0.0009811908438839498, - "loss": 0.0329, + "loss": 0.0331, "macro_f1": 0.5492662787437439, "num_tokens": 2242786.0, "repeat_count": 0.0, - "routers_loss": 0.04785723611712456, + "routers_loss": 0.04617162421345711, "skip_count": 2.0, "step": 1390, "text_loss": 0.3233799934387207 @@ -13222,13 +13222,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1298828125, + "grad_norm": 0.154296875, "learning_rate": 0.000981106655714797, - "loss": 0.0359, + "loss": 0.0358, "macro_f1": 0.3272727429866791, "num_tokens": 2245696.0, "repeat_count": 0.0, - "routers_loss": 0.046765491366386414, + "routers_loss": 0.046828847378492355, "skip_count": 1.0, "step": 1392, "text_loss": 0.24273279309272766 @@ -13241,13 +13241,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0771484375, + "grad_norm": 0.07373046875, "learning_rate": 0.0009810222831846656, - "loss": 0.0303, + "loss": 0.0307, "macro_f1": 0.5492662787437439, "num_tokens": 2249326.0, "repeat_count": 0.0, - "routers_loss": 0.015151665546000004, + "routers_loss": 0.010921589098870754, "skip_count": 2.0, "step": 1394, "text_loss": 0.3921460807323456 @@ -13260,13 +13260,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.09423828125, "learning_rate": 0.0009809377263258882, - "loss": 0.0321, + "loss": 0.0315, "macro_f1": 0.32098767161369324, "num_tokens": 2253393.0, "repeat_count": 0.0, - "routers_loss": 0.04431106895208359, + "routers_loss": 0.04564022272825241, "skip_count": 1.0, "step": 1396, "text_loss": 0.582602858543396 @@ -13279,13 +13279,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09814453125, + "grad_norm": 0.103515625, "learning_rate": 0.000980852985170867, - "loss": 0.0317, + "loss": 0.0328, "macro_f1": 0.3272727429866791, "num_tokens": 2256626.0, "repeat_count": 0.0, - "routers_loss": 0.012700649909675121, + "routers_loss": 0.013289985246956348, "skip_count": 0.0, "step": 1398, "text_loss": 0.41031694412231445 @@ -13298,13 +13298,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1728515625, "learning_rate": 0.0009807680597520745, - "loss": 0.0256, + "loss": 0.0264, "macro_f1": 0.3333333432674408, "num_tokens": 2259326.0, "repeat_count": 0.0, - "routers_loss": 0.005919010378420353, + "routers_loss": 0.0065213534981012344, "skip_count": 0.0, "step": 1400, "text_loss": 0.2888098657131195 @@ -13317,13 +13317,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.23046875, "learning_rate": 0.0009806829501020546, - "loss": 0.0372, + "loss": 0.0358, "macro_f1": 0.3272727429866791, "num_tokens": 2262344.0, "repeat_count": 0.0, - "routers_loss": 0.04717765748500824, + "routers_loss": 0.04199840500950813, "skip_count": 1.0, "step": 1402, "text_loss": 0.31973034143447876 @@ -13336,13 +13336,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0771484375, + "grad_norm": 0.08935546875, "learning_rate": 0.0009805976562534215, "loss": 0.0317, "macro_f1": 0.6603773832321167, "num_tokens": 2266354.0, "repeat_count": 1.0, - "routers_loss": 0.015415813773870468, + "routers_loss": 0.015434930101037025, "skip_count": 1.0, "step": 1404, "text_loss": 0.508630633354187 @@ -13355,13 +13355,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.140625, "learning_rate": 0.0009805121782388599, "loss": 0.0339, "macro_f1": 0.6533333659172058, "num_tokens": 2269660.0, "repeat_count": 2.0, - "routers_loss": 0.06812979280948639, + "routers_loss": 0.0720924660563469, "skip_count": 2.0, "step": 1406, "text_loss": 0.40927737951278687 @@ -13374,13 +13374,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05908203125, + "grad_norm": 0.0634765625, "learning_rate": 0.0009804265160911253, - "loss": 0.0265, + "loss": 0.0266, "macro_f1": 0.5492662787437439, "num_tokens": 2273335.0, "repeat_count": 0.0, - "routers_loss": 0.025383235886693, + "routers_loss": 0.02400495670735836, "skip_count": 2.0, "step": 1408, "text_loss": 0.1777762621641159 @@ -13393,13 +13393,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.2314453125, "learning_rate": 0.0009803406698430433, - "loss": 0.0367, + "loss": 0.0371, "macro_f1": 0.3272727429866791, "num_tokens": 2277107.0, "repeat_count": 0.0, - "routers_loss": 0.026493225246667862, + "routers_loss": 0.02560107782483101, "skip_count": 1.0, "step": 1410, "text_loss": 0.17955881357192993 @@ -13412,13 +13412,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.07470703125, "learning_rate": 0.0009802546395275104, - "loss": 0.0342, + "loss": 0.0349, "macro_f1": 0.3333333432674408, "num_tokens": 2281638.0, "repeat_count": 0.0, - "routers_loss": 0.006616846192628145, + "routers_loss": 0.006655813194811344, "skip_count": 0.0, "step": 1412, "text_loss": 0.20882295072078705 @@ -13431,32 +13431,32 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.0888671875, + "grad_norm": 0.08740234375, "learning_rate": 0.000980168425177494, - "loss": 0.0328, + "loss": 0.0342, "macro_f1": 0.8200000524520874, "num_tokens": 2284876.0, "repeat_count": 1.0, - "routers_loss": 0.060631848871707916, + "routers_loss": 0.06325097382068634, "skip_count": 3.0, "step": 1414, "text_loss": 0.26035264134407043 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 6.648077487525683, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.138671875, "learning_rate": 0.000980082026826031, - "loss": 0.0317, - "macro_f1": 0.6666666865348816, + "loss": 0.0315, + "macro_f1": 0.3272727429866791, "num_tokens": 2288938.0, "repeat_count": 1.0, - "routers_loss": 0.011199389584362507, + "routers_loss": 0.013436575420200825, "skip_count": 0.0, "step": 1416, "text_loss": 0.5502325892448425 @@ -13469,13 +13469,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.07177734375, "learning_rate": 0.0009799954445062296, - "loss": 0.0192, + "loss": 0.0193, "macro_f1": 0.6603773832321167, "num_tokens": 2292317.0, "repeat_count": 1.0, - "routers_loss": 0.01120354700833559, + "routers_loss": 0.011264479719102383, "skip_count": 1.0, "step": 1418, "text_loss": 0.48075684905052185 @@ -13488,13 +13488,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.16796875, + "grad_norm": 0.1611328125, "learning_rate": 0.0009799086782512686, - "loss": 0.0294, + "loss": 0.0292, "macro_f1": 0.5492662787437439, "num_tokens": 2295935.0, "repeat_count": 0.0, - "routers_loss": 0.030204148963093758, + "routers_loss": 0.02833271212875843, "skip_count": 2.0, "step": 1420, "text_loss": 0.18221206963062286 @@ -13507,13 +13507,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0888671875, + "grad_norm": 0.09375, "learning_rate": 0.0009798217280943967, - "loss": 0.0348, + "loss": 0.0356, "macro_f1": 0.6666666865348816, "num_tokens": 2298927.0, "repeat_count": 0.0, - "routers_loss": 0.008244800381362438, + "routers_loss": 0.009208574891090393, "skip_count": 1.0, "step": 1422, "text_loss": 0.48686322569847107 @@ -13526,32 +13526,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09716796875, + "grad_norm": 0.09423828125, "learning_rate": 0.0009797345940689335, - "loss": 0.0269, + "loss": 0.0267, "macro_f1": 0.3272727429866791, "num_tokens": 2301541.0, "repeat_count": 0.0, - "routers_loss": 0.015340043231844902, + "routers_loss": 0.015011847950518131, "skip_count": 0.0, "step": 1424, "text_loss": 0.49446266889572144 }, { "acc_repeat": 0.0, - "acc_skip": 0.6000000238418579, - "avg_layers": 25.0, + "acc_skip": 0.4000000059604645, + "avg_layers": 26.0, "epoch": 6.695039624302906, - "f1_execute": 0.9583333134651184, + "f1_execute": 0.9387754797935486, "f1_repeat": 0.0, - "f1_skip": 0.75, - "grad_norm": 0.1318359375, + "f1_skip": 0.5714285969734192, + "grad_norm": 0.1337890625, "learning_rate": 0.0009796472762082687, - "loss": 0.0341, - "macro_f1": 0.5694444179534912, + "loss": 0.0338, + "macro_f1": 0.5034013986587524, "num_tokens": 2304589.0, "repeat_count": 0.0, - "routers_loss": 0.058681465685367584, + "routers_loss": 0.05912091210484505, "skip_count": 5.0, "step": 1426, "text_loss": 0.23945684731006622 @@ -13564,32 +13564,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.095703125, + "grad_norm": 0.09765625, "learning_rate": 0.000979559774545863, - "loss": 0.0423, + "loss": 0.0405, "macro_f1": 0.3272727429866791, "num_tokens": 2307860.0, "repeat_count": 0.0, - "routers_loss": 0.020810559391975403, + "routers_loss": 0.021242303773760796, "skip_count": 1.0, "step": 1428, "text_loss": 0.531273365020752 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 6.713824479013795, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.09033203125, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, "learning_rate": 0.000979472089115247, - "loss": 0.0268, - "macro_f1": 0.5492662787437439, + "loss": 0.0276, + "macro_f1": 0.32098764181137085, "num_tokens": 2311581.0, "repeat_count": 0.0, - "routers_loss": 0.030001837760210037, + "routers_loss": 0.02768544852733612, "skip_count": 2.0, "step": 1430, "text_loss": 0.2497459501028061 @@ -13602,13 +13602,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1318359375, + "grad_norm": 0.12255859375, "learning_rate": 0.000979384219950022, - "loss": 0.034, + "loss": 0.0346, "macro_f1": 0.3333333432674408, "num_tokens": 2314639.0, "repeat_count": 0.0, - "routers_loss": 0.010381575673818588, + "routers_loss": 0.008678150363266468, "skip_count": 0.0, "step": 1432, "text_loss": 0.6579355001449585 @@ -13621,32 +13621,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08056640625, "learning_rate": 0.0009792961670838595, - "loss": 0.0365, + "loss": 0.0362, "macro_f1": 0.3272727429866791, "num_tokens": 2317927.0, "repeat_count": 1.0, - "routers_loss": 0.03234704211354256, + "routers_loss": 0.03325597569346428, "skip_count": 0.0, "step": 1434, "text_loss": 0.5209436416625977 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 6.742001761080129, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.1494140625, "learning_rate": 0.0009792079305505016, - "loss": 0.0303, - "macro_f1": 0.6666666865348816, + "loss": 0.0306, + "macro_f1": 0.3272727429866791, "num_tokens": 2321065.0, "repeat_count": 1.0, - "routers_loss": 0.015481291338801384, + "routers_loss": 0.019228918477892876, "skip_count": 0.0, "step": 1436, "text_loss": 0.41087067127227783 @@ -13659,13 +13659,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1103515625, + "grad_norm": 0.10986328125, "learning_rate": 0.000979119510383761, - "loss": 0.0366, + "loss": 0.0371, "macro_f1": 0.3333333432674408, "num_tokens": 2323714.0, "repeat_count": 0.0, - "routers_loss": 0.018170451745390892, + "routers_loss": 0.017071325331926346, "skip_count": 0.0, "step": 1438, "text_loss": 0.21490029990673065 @@ -13678,13 +13678,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.2060546875, "learning_rate": 0.00097903090661752, - "loss": 0.0306, + "loss": 0.0309, "macro_f1": 0.3333333432674408, "num_tokens": 2326454.0, "repeat_count": 0.0, - "routers_loss": 0.010385681875050068, + "routers_loss": 0.00991755723953247, "skip_count": 0.0, "step": 1440, "text_loss": 0.23847346007823944 @@ -13697,13 +13697,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.232421875, "learning_rate": 0.000978942119285732, - "loss": 0.0407, + "loss": 0.0404, "macro_f1": 0.3272727429866791, "num_tokens": 2329462.0, "repeat_count": 0.0, - "routers_loss": 0.04976538568735123, + "routers_loss": 0.04908733069896698, "skip_count": 1.0, "step": 1442, "text_loss": 0.23343028128147125 @@ -13716,13 +13716,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.091796875, + "grad_norm": 0.1044921875, "learning_rate": 0.0009788531484224204, - "loss": 0.0255, + "loss": 0.0264, "macro_f1": 0.3333333432674408, "num_tokens": 2332146.0, "repeat_count": 0.0, - "routers_loss": 0.0030266831163316965, + "routers_loss": 0.0032628148328512907, "skip_count": 0.0, "step": 1444, "text_loss": 0.47423800826072693 @@ -13730,18 +13730,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 6.788963897857353, - "f1_execute": 0.9600000381469727, - "f1_repeat": 1.0, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, - "grad_norm": 0.107421875, + "grad_norm": 0.10693359375, "learning_rate": 0.0009787639940616788, - "loss": 0.0411, - "macro_f1": 0.8200000524520874, + "loss": 0.0405, + "macro_f1": 0.7018141150474548, "num_tokens": 2335738.0, "repeat_count": 1.0, - "routers_loss": 0.13420957326889038, + "routers_loss": 0.14336998760700226, "skip_count": 3.0, "step": 1446, "text_loss": 0.21837592124938965 @@ -13754,13 +13754,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1953125, + "grad_norm": 0.189453125, "learning_rate": 0.0009786746562376717, - "loss": 0.0251, + "loss": 0.0241, "macro_f1": 0.6666666865348816, "num_tokens": 2338488.0, "repeat_count": 0.0, - "routers_loss": 0.012779864482581615, + "routers_loss": 0.010542908683419228, "skip_count": 1.0, "step": 1448, "text_loss": 1.0614757537841797 @@ -13773,13 +13773,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1728515625, "learning_rate": 0.0009785851349846334, - "loss": 0.0266, + "loss": 0.0268, "macro_f1": 0.3333333432674408, "num_tokens": 2342074.0, "repeat_count": 0.0, - "routers_loss": 0.005545398220419884, + "routers_loss": 0.005998016335070133, "skip_count": 0.0, "step": 1450, "text_loss": 0.4269719421863556 @@ -13792,13 +13792,13 @@ "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.09814453125, + "grad_norm": 0.1083984375, "learning_rate": 0.0009784954303368686, - "loss": 0.0395, + "loss": 0.0384, "macro_f1": 0.44705885648727417, "num_tokens": 2345838.0, "repeat_count": 0.0, - "routers_loss": 0.0899835154414177, + "routers_loss": 0.0959126204252243, "skip_count": 3.0, "step": 1452, "text_loss": 0.3315916955471039 @@ -13811,13 +13811,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09716796875, + "grad_norm": 0.1005859375, "learning_rate": 0.0009784055423287521, "loss": 0.0218, "macro_f1": 0.3333333432674408, "num_tokens": 2348939.0, "repeat_count": 0.0, - "routers_loss": 0.002738836221396923, + "routers_loss": 0.0025467623490840197, "skip_count": 0.0, "step": 1454, "text_loss": 0.6162732839584351 @@ -13830,13 +13830,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.115234375, "learning_rate": 0.0009783154709947293, - "loss": 0.0266, + "loss": 0.0256, "macro_f1": 0.3272727429866791, "num_tokens": 2352232.0, "repeat_count": 0.0, - "routers_loss": 0.020522192120552063, + "routers_loss": 0.01860538125038147, "skip_count": 1.0, "step": 1456, "text_loss": 0.23928768932819366 @@ -13844,18 +13844,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 6.84531846199002, - "f1_execute": 0.9629629850387573, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.09912109375, "learning_rate": 0.0009782252163693158, - "loss": 0.0197, - "macro_f1": 0.32098767161369324, + "loss": 0.0201, + "macro_f1": 0.3272727429866791, "num_tokens": 2355159.0, "repeat_count": 0.0, - "routers_loss": 0.04245268926024437, + "routers_loss": 0.04412713274359703, "skip_count": 1.0, "step": 1458, "text_loss": 0.3371323347091675 @@ -13868,13 +13868,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.224609375, + "grad_norm": 0.21484375, "learning_rate": 0.0009781347784870973, - "loss": 0.0376, + "loss": 0.0379, "macro_f1": 0.3333333432674408, "num_tokens": 2358175.0, "repeat_count": 0.0, - "routers_loss": 0.009142685681581497, + "routers_loss": 0.006809141952544451, "skip_count": 0.0, "step": 1460, "text_loss": 0.547267735004425 @@ -13887,13 +13887,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.095703125, "learning_rate": 0.0009780441573827296, - "loss": 0.0295, + "loss": 0.03, "macro_f1": 0.3076923191547394, "num_tokens": 2360991.0, "repeat_count": 0.0, - "routers_loss": 0.08038893342018127, + "routers_loss": 0.08924390375614166, "skip_count": 4.0, "step": 1462, "text_loss": 0.7026563882827759 @@ -13906,13 +13906,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.1865234375, "learning_rate": 0.000977953353090939, - "loss": 0.027, + "loss": 0.0272, "macro_f1": 0.3333333432674408, "num_tokens": 2363894.0, "repeat_count": 0.0, - "routers_loss": 0.02107175625860691, + "routers_loss": 0.021858472377061844, "skip_count": 0.0, "step": 1464, "text_loss": 0.2718065083026886 @@ -13925,13 +13925,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.11474609375, "learning_rate": 0.0009778623656465219, - "loss": 0.0349, + "loss": 0.0338, "macro_f1": 0.32098764181137085, "num_tokens": 2367265.0, "repeat_count": 0.0, - "routers_loss": 0.042030055075883865, + "routers_loss": 0.044781096279621124, "skip_count": 0.0, "step": 1466, "text_loss": 0.5008095502853394 @@ -13944,13 +13944,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07470703125, + "grad_norm": 0.06689453125, "learning_rate": 0.0009777711950843448, - "loss": 0.022, + "loss": 0.0212, "macro_f1": 0.3333333432674408, "num_tokens": 2370186.0, "repeat_count": 0.0, - "routers_loss": 0.004230673424899578, + "routers_loss": 0.0040459707379341125, "skip_count": 0.0, "step": 1468, "text_loss": 0.5242461562156677 @@ -13963,13 +13963,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.134765625, "learning_rate": 0.0009776798414393446, - "loss": 0.0284, + "loss": 0.0279, "macro_f1": 0.6598639488220215, "num_tokens": 2373314.0, "repeat_count": 1.0, - "routers_loss": 0.06986775249242783, + "routers_loss": 0.0708528608083725, "skip_count": 3.0, "step": 1470, "text_loss": 0.2821732461452484 @@ -13982,13 +13982,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.142578125, + "grad_norm": 0.1328125, "learning_rate": 0.0009775883047465279, - "loss": 0.0431, + "loss": 0.0414, "macro_f1": 0.31446540355682373, "num_tokens": 2376435.0, "repeat_count": 1.0, - "routers_loss": 0.0439564548432827, + "routers_loss": 0.0290578193962574, "skip_count": 1.0, "step": 1472, "text_loss": 0.8438440561294556 @@ -14001,13 +14001,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1123046875, + "grad_norm": 0.10546875, "learning_rate": 0.000977496585040972, - "loss": 0.0376, + "loss": 0.0373, "macro_f1": 0.3333333432674408, "num_tokens": 2380244.0, "repeat_count": 0.0, - "routers_loss": 0.011889892630279064, + "routers_loss": 0.010360375046730042, "skip_count": 0.0, "step": 1474, "text_loss": 0.4356135427951813 @@ -14020,13 +14020,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1015625, + "grad_norm": 0.09912109375, "learning_rate": 0.000977404682357824, - "loss": 0.0295, + "loss": 0.0294, "macro_f1": 0.3272727429866791, "num_tokens": 2383498.0, "repeat_count": 0.0, - "routers_loss": 0.022536326199769974, + "routers_loss": 0.023518972098827362, "skip_count": 0.0, "step": 1476, "text_loss": 0.25195425748825073 @@ -14039,13 +14039,13 @@ "f1_execute": 0.9743589162826538, "f1_repeat": 0.888888955116272, "f1_skip": 1.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.11181640625, "learning_rate": 0.000977312596732301, - "loss": 0.0388, + "loss": 0.0375, "macro_f1": 0.9544159770011902, "num_tokens": 2386414.0, "repeat_count": 5.0, - "routers_loss": 0.07959948480129242, + "routers_loss": 0.08190606534481049, "skip_count": 4.0, "step": 1478, "text_loss": 0.6586798429489136 @@ -14058,13 +14058,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.095703125, + "grad_norm": 0.10546875, "learning_rate": 0.0009772203281996905, - "loss": 0.0341, + "loss": 0.0336, "macro_f1": 1.0, "num_tokens": 2389399.0, "repeat_count": 1.0, - "routers_loss": 0.019112225621938705, + "routers_loss": 0.016441475600004196, "skip_count": 2.0, "step": 1480, "text_loss": 0.3671986758708954 @@ -14077,13 +14077,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0888671875, + "grad_norm": 0.09814453125, "learning_rate": 0.0009771278767953502, - "loss": 0.0345, + "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 2392400.0, "repeat_count": 0.0, - "routers_loss": 0.018750866875052452, + "routers_loss": 0.019211363047361374, "skip_count": 0.0, "step": 1482, "text_loss": 0.27418580651283264 @@ -14096,32 +14096,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09228515625, + "grad_norm": 0.0947265625, "learning_rate": 0.0009770352425547072, - "loss": 0.0291, + "loss": 0.0292, "macro_f1": 0.3333333432674408, "num_tokens": 2395123.0, "repeat_count": 0.0, - "routers_loss": 0.015407348051667213, + "routers_loss": 0.015800386667251587, "skip_count": 0.0, "step": 1484, "text_loss": 0.19896622002124786 }, { - "acc_repeat": 0.6666666865348816, + "acc_repeat": 0.3333333432674408, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 29.0, "epoch": 6.976812444966246, - "f1_execute": 0.9803921580314636, - "f1_repeat": 0.800000011920929, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.5, "f1_skip": 0.0, - "grad_norm": 0.11474609375, + "grad_norm": 0.12890625, "learning_rate": 0.0009769424255132596, - "loss": 0.0258, - "macro_f1": 0.5934640765190125, + "loss": 0.0256, + "macro_f1": 0.4871794879436493, "num_tokens": 2397359.0, "repeat_count": 3.0, - "routers_loss": 0.06514479219913483, + "routers_loss": 0.06670158356428146, "skip_count": 0.0, "step": 1486, "text_loss": 0.4229799509048462 @@ -14134,13 +14134,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.111328125, + "grad_norm": 0.1162109375, "learning_rate": 0.0009768494257065747, - "loss": 0.0217, + "loss": 0.0218, "macro_f1": 0.3272727429866791, "num_tokens": 2400387.0, "repeat_count": 0.0, - "routers_loss": 0.013567833229899406, + "routers_loss": 0.011144762858748436, "skip_count": 1.0, "step": 1488, "text_loss": 0.4264226257801056 @@ -14153,13 +14153,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12255859375, + "grad_norm": 0.12353515625, "learning_rate": 0.0009767562431702904, - "loss": 0.0389, + "loss": 0.0387, "macro_f1": 0.3006536364555359, "num_tokens": 2403241.0, "repeat_count": 2.0, - "routers_loss": 0.13762018084526062, + "routers_loss": 0.12339717149734497, "skip_count": 3.0, "step": 1490, "text_loss": 0.2850193977355957 @@ -14172,13 +14172,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.061767578125, + "grad_norm": 0.07177734375, "learning_rate": 0.0009766628779401142, - "loss": 0.0214, + "loss": 0.0215, "macro_f1": 0.6666666865348816, "num_tokens": 2406087.0, "repeat_count": 0.0, - "routers_loss": 0.008640666492283344, + "routers_loss": 0.008174685761332512, "skip_count": 1.0, "step": 1492, "text_loss": 0.6756544709205627 @@ -14191,13 +14191,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05712890625, + "grad_norm": 0.0673828125, "learning_rate": 0.000976569330051824, - "loss": 0.0182, + "loss": 0.0186, "macro_f1": 0.3333333432674408, "num_tokens": 2409312.0, "repeat_count": 0.0, - "routers_loss": 0.0018257038900628686, + "routers_loss": 0.0021256296895444393, "skip_count": 0.0, "step": 1494, "text_loss": 0.4789894223213196 @@ -14210,13 +14210,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.053955078125, "learning_rate": 0.0009764755995412677, "loss": 0.0193, "macro_f1": 0.3333333432674408, "num_tokens": 2412758.0, "repeat_count": 0.0, - "routers_loss": 0.003656312357634306, + "routers_loss": 0.003944927826523781, "skip_count": 0.0, "step": 1496, "text_loss": 0.5157490968704224 @@ -14229,13 +14229,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1005859375, + "grad_norm": 0.09228515625, "learning_rate": 0.0009763816864443627, - "loss": 0.0246, + "loss": 0.0239, "macro_f1": 0.3272727429866791, "num_tokens": 2416079.0, "repeat_count": 1.0, - "routers_loss": 0.044268425554037094, + "routers_loss": 0.03893325850367546, "skip_count": 0.0, "step": 1498, "text_loss": 0.28045418858528137 @@ -14248,13 +14248,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.1279296875, "learning_rate": 0.0009762875907970968, - "loss": 0.0207, + "loss": 0.0199, "macro_f1": 0.3333333432674408, "num_tokens": 2420340.0, "repeat_count": 0.0, - "routers_loss": 0.0018966116476804018, + "routers_loss": 0.0017725443467497826, "skip_count": 0.0, "step": 1500, "text_loss": 0.35550856590270996 @@ -14267,32 +14267,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.06298828125, "learning_rate": 0.0009761933126355277, - "loss": 0.0249, + "loss": 0.0245, "macro_f1": 0.3272727429866791, "num_tokens": 2424735.0, "repeat_count": 0.0, - "routers_loss": 0.01729201152920723, + "routers_loss": 0.01393749937415123, "skip_count": 1.0, "step": 1502, "text_loss": 0.38840189576148987 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 7.06105077781039, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.11962890625, + "f1_skip": 1.0, + "grad_norm": 0.1630859375, "learning_rate": 0.0009760988519957828, - "loss": 0.0248, - "macro_f1": 0.5492662787437439, + "loss": 0.0249, + "macro_f1": 0.6666666865348816, "num_tokens": 2428132.0, "repeat_count": 0.0, - "routers_loss": 0.01693531684577465, + "routers_loss": 0.01687910407781601, "skip_count": 2.0, "step": 1504, "text_loss": 0.3031681478023529 @@ -14305,13 +14305,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.064453125, "learning_rate": 0.0009760042089140598, - "loss": 0.0197, + "loss": 0.0193, "macro_f1": 0.3144654333591461, "num_tokens": 2431592.0, "repeat_count": 1.0, - "routers_loss": 0.04939094930887222, + "routers_loss": 0.04704280197620392, "skip_count": 2.0, "step": 1506, "text_loss": 0.16355200111865997 @@ -14324,13 +14324,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.0986328125, "learning_rate": 0.0009759093834266259, - "loss": 0.0213, + "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 2434236.0, "repeat_count": 0.0, - "routers_loss": 0.0016892930725589395, + "routers_loss": 0.0016075772000476718, "skip_count": 0.0, "step": 1508, "text_loss": 0.6080073118209839 @@ -14343,13 +14343,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10009765625, + "grad_norm": 0.1025390625, "learning_rate": 0.0009758143755698186, - "loss": 0.0147, + "loss": 0.015, "macro_f1": 0.3333333432674408, "num_tokens": 2437170.0, "repeat_count": 0.0, - "routers_loss": 0.008671467192471027, + "routers_loss": 0.008451299741864204, "skip_count": 0.0, "step": 1510, "text_loss": 0.22100484371185303 @@ -14362,13 +14362,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.068359375, + "grad_norm": 0.06689453125, "learning_rate": 0.0009757191853800449, - "loss": 0.0228, + "loss": 0.0227, "macro_f1": 0.5866667032241821, "num_tokens": 2441187.0, "repeat_count": 1.0, - "routers_loss": 0.042682576924562454, + "routers_loss": 0.046565692871809006, "skip_count": 3.0, "step": 1512, "text_loss": 0.25098952651023865 @@ -14381,13 +14381,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.11279296875, "learning_rate": 0.000975623812893782, - "loss": 0.028, + "loss": 0.0276, "macro_f1": 0.3272727429866791, "num_tokens": 2444664.0, "repeat_count": 0.0, - "routers_loss": 0.02905822917819023, + "routers_loss": 0.02872578240931034, "skip_count": 1.0, "step": 1514, "text_loss": 0.4952253997325897 @@ -14400,13 +14400,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09326171875, + "grad_norm": 0.1142578125, "learning_rate": 0.0009755282581475768, - "loss": 0.0223, + "loss": 0.0233, "macro_f1": 0.3333333432674408, "num_tokens": 2447748.0, "repeat_count": 0.0, - "routers_loss": 0.0018810008186846972, + "routers_loss": 0.002055214950814843, "skip_count": 0.0, "step": 1516, "text_loss": 0.7465500831604004 @@ -14419,13 +14419,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10693359375, + "grad_norm": 0.10302734375, "learning_rate": 0.000975432521178046, - "loss": 0.0219, + "loss": 0.0216, "macro_f1": 0.3272727429866791, "num_tokens": 2450834.0, "repeat_count": 1.0, - "routers_loss": 0.04308714717626572, + "routers_loss": 0.04498551785945892, "skip_count": 0.0, "step": 1518, "text_loss": 0.28144413232803345 @@ -14438,13 +14438,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.095703125, + "grad_norm": 0.09912109375, "learning_rate": 0.0009753366020218763, - "loss": 0.0232, + "loss": 0.0234, "macro_f1": 0.3333333432674408, "num_tokens": 2454233.0, "repeat_count": 0.0, - "routers_loss": 0.003754811594262719, + "routers_loss": 0.003669742727652192, "skip_count": 0.0, "step": 1520, "text_loss": 0.5667551755905151 @@ -14457,32 +14457,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08837890625, + "grad_norm": 0.0830078125, "learning_rate": 0.0009752405007158238, - "loss": 0.0246, + "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2457331.0, "repeat_count": 0.0, - "routers_loss": 0.010853761807084084, + "routers_loss": 0.010455607436597347, "skip_count": 0.0, "step": 1522, "text_loss": 0.19575810432434082 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.5, "acc_skip": 1.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 7.154975051364837, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.0771484375, + "grad_norm": 0.0751953125, "learning_rate": 0.0009751442172967151, - "loss": 0.0196, - "macro_f1": 1.0, + "loss": 0.0193, + "macro_f1": 0.8823530077934265, "num_tokens": 2459935.0, "repeat_count": 2.0, - "routers_loss": 0.015100379474461079, + "routers_loss": 0.025189083069562912, "skip_count": 1.0, "step": 1524, "text_loss": 0.45453405380249023 @@ -14495,13 +14495,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08203125, + "grad_norm": 0.0927734375, "learning_rate": 0.000975047751801446, - "loss": 0.0189, + "loss": 0.0187, "macro_f1": 0.3272727429866791, "num_tokens": 2463008.0, "repeat_count": 0.0, - "routers_loss": 0.011991916224360466, + "routers_loss": 0.012297490611672401, "skip_count": 0.0, "step": 1526, "text_loss": 0.31437572836875916 @@ -14514,32 +14514,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.1044921875, "learning_rate": 0.0009749511042669823, - "loss": 0.0226, + "loss": 0.0233, "macro_f1": 0.3333333432674408, "num_tokens": 2466475.0, "repeat_count": 0.0, - "routers_loss": 0.008201062679290771, + "routers_loss": 0.011026266030967236, "skip_count": 0.0, "step": 1528, "text_loss": 0.46604859828948975 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 7.183152333431171, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.1181640625, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, "learning_rate": 0.0009748542747303595, - "loss": 0.0174, - "macro_f1": 0.6666666865348816, + "loss": 0.0182, + "macro_f1": 0.3272727429866791, "num_tokens": 2469320.0, "repeat_count": 0.0, - "routers_loss": 0.008513177745044231, + "routers_loss": 0.011934996582567692, "skip_count": 1.0, "step": 1530, "text_loss": 0.7764923572540283 @@ -14552,13 +14552,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.091796875, + "grad_norm": 0.0966796875, "learning_rate": 0.0009747572632286827, - "loss": 0.02, + "loss": 0.0203, "macro_f1": 0.3333333432674408, "num_tokens": 2472468.0, "repeat_count": 0.0, - "routers_loss": 0.004850955214351416, + "routers_loss": 0.005786920432001352, "skip_count": 0.0, "step": 1532, "text_loss": 0.3555782437324524 @@ -14571,32 +14571,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.087890625, + "grad_norm": 0.0849609375, "learning_rate": 0.0009746600697991271, - "loss": 0.0206, + "loss": 0.02, "macro_f1": 0.6666666865348816, "num_tokens": 2475736.0, "repeat_count": 1.0, - "routers_loss": 0.0027650354895740747, + "routers_loss": 0.0026990731712430716, "skip_count": 0.0, "step": 1534, "text_loss": 0.49561792612075806 }, { "acc_repeat": 1.0, - "acc_skip": 0.0, - "avg_layers": 29.0, + "acc_skip": 0.5, + "avg_layers": 28.0, "epoch": 7.2113296154975055, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, - "f1_skip": 0.0, - "grad_norm": 0.0615234375, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0556640625, "learning_rate": 0.0009745626944789375, - "loss": 0.0209, - "macro_f1": 0.6538461446762085, + "loss": 0.0204, + "macro_f1": 0.8823530077934265, "num_tokens": 2478887.0, "repeat_count": 1.0, - "routers_loss": 0.023268593475222588, + "routers_loss": 0.020221207290887833, "skip_count": 2.0, "step": 1536, "text_loss": 0.5375416278839111 @@ -14609,13 +14609,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11669921875, + "grad_norm": 0.12158203125, "learning_rate": 0.0009744651373054279, "loss": 0.0286, "macro_f1": 0.3272727429866791, "num_tokens": 2481293.0, "repeat_count": 0.0, - "routers_loss": 0.031235001981258392, + "routers_loss": 0.03131086751818657, "skip_count": 1.0, "step": 1538, "text_loss": 0.5241039395332336 @@ -14628,13 +14628,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.080078125, + "grad_norm": 0.08984375, "learning_rate": 0.0009743673983159828, - "loss": 0.023, + "loss": 0.0241, "macro_f1": 0.6122449040412903, "num_tokens": 2484403.0, "repeat_count": 0.0, - "routers_loss": 0.042398080229759216, + "routers_loss": 0.04448170214891434, "skip_count": 4.0, "step": 1540, "text_loss": 0.7465724349021912 @@ -14647,13 +14647,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.099609375, + "grad_norm": 0.08935546875, "learning_rate": 0.0009742694775480557, - "loss": 0.0268, + "loss": 0.0265, "macro_f1": 0.6666666865348816, "num_tokens": 2487952.0, "repeat_count": 0.0, - "routers_loss": 0.007361465133726597, + "routers_loss": 0.007171491626650095, "skip_count": 1.0, "step": 1542, "text_loss": 0.2877117097377777 @@ -14666,13 +14666,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.07275390625, "learning_rate": 0.0009741713750391703, - "loss": 0.0166, + "loss": 0.0171, "macro_f1": 0.6666666865348816, "num_tokens": 2490815.0, "repeat_count": 1.0, - "routers_loss": 0.0052334014326334, + "routers_loss": 0.004559285007417202, "skip_count": 0.0, "step": 1544, "text_loss": 0.6097800135612488 @@ -14685,13 +14685,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.06787109375, "learning_rate": 0.0009740730908269193, "loss": 0.0174, "macro_f1": 0.3333333432674408, "num_tokens": 2494727.0, "repeat_count": 0.0, - "routers_loss": 0.004993532784283161, + "routers_loss": 0.005271553061902523, "skip_count": 0.0, "step": 1546, "text_loss": 0.5431114435195923 @@ -14704,13 +14704,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0703125, "learning_rate": 0.0009739746249489658, - "loss": 0.0248, + "loss": 0.0239, "macro_f1": 0.3333333432674408, "num_tokens": 2499266.0, "repeat_count": 0.0, - "routers_loss": 0.001611889572814107, + "routers_loss": 0.0015409323386847973, "skip_count": 0.0, "step": 1548, "text_loss": 0.4702678322792053 @@ -14723,13 +14723,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.1171875, "learning_rate": 0.0009738759774430417, - "loss": 0.0209, + "loss": 0.0216, "macro_f1": 0.32098764181137085, "num_tokens": 2502273.0, "repeat_count": 1.0, - "routers_loss": 0.03059260919690132, + "routers_loss": 0.030183158814907074, "skip_count": 1.0, "step": 1550, "text_loss": 0.3239189088344574 @@ -14742,32 +14742,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056396484375, + "grad_norm": 0.0498046875, "learning_rate": 0.0009737771483469493, - "loss": 0.0195, + "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 2507624.0, "repeat_count": 0.0, - "routers_loss": 0.00508903618901968, + "routers_loss": 0.005410848651081324, "skip_count": 0.0, "step": 1552, "text_loss": 0.4014642834663391 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 7.295861461696507, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, + "f1_skip": 1.0, "grad_norm": 0.07763671875, "learning_rate": 0.0009736781376985598, - "loss": 0.0174, - "macro_f1": 0.3272727429866791, + "loss": 0.0168, + "macro_f1": 0.6666666865348816, "num_tokens": 2510366.0, "repeat_count": 0.0, - "routers_loss": 0.007860450074076653, + "routers_loss": 0.0066976165398955345, "skip_count": 1.0, "step": 1554, "text_loss": 0.5924848914146423 @@ -14780,13 +14780,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11669921875, + "grad_norm": 0.13671875, "learning_rate": 0.0009735789455358144, - "loss": 0.0217, + "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 2513317.0, "repeat_count": 0.0, - "routers_loss": 0.0027370608877390623, + "routers_loss": 0.002763477386906743, "skip_count": 0.0, "step": 1556, "text_loss": 0.3222943842411041 @@ -14799,13 +14799,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10302734375, + "grad_norm": 0.11767578125, "learning_rate": 0.0009734795718967237, - "loss": 0.0276, + "loss": 0.0283, "macro_f1": 0.32098764181137085, "num_tokens": 2516628.0, "repeat_count": 0.0, - "routers_loss": 0.061584725975990295, + "routers_loss": 0.061566028743982315, "skip_count": 2.0, "step": 1558, "text_loss": 0.3249334692955017 @@ -14818,13 +14818,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.10693359375, + "grad_norm": 0.095703125, "learning_rate": 0.0009733800168193679, "loss": 0.0228, "macro_f1": 1.0, "num_tokens": 2519424.0, "repeat_count": 2.0, - "routers_loss": 0.01694316789507866, + "routers_loss": 0.017976421862840652, "skip_count": 4.0, "step": 1560, "text_loss": 0.3341919481754303 @@ -14837,13 +14837,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1826171875, "learning_rate": 0.0009732802803418966, - "loss": 0.0234, + "loss": 0.023, "macro_f1": 0.3333333432674408, "num_tokens": 2522922.0, "repeat_count": 0.0, - "routers_loss": 0.0023331891279667616, + "routers_loss": 0.002525332849472761, "skip_count": 0.0, "step": 1562, "text_loss": 0.3176332712173462 @@ -14856,13 +14856,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.07861328125, "learning_rate": 0.0009731803625025292, - "loss": 0.0203, + "loss": 0.0196, "macro_f1": 0.3272727429866791, "num_tokens": 2525811.0, "repeat_count": 0.0, - "routers_loss": 0.021300682798027992, + "routers_loss": 0.015524424612522125, "skip_count": 1.0, "step": 1564, "text_loss": 0.532774031162262 @@ -14875,13 +14875,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.10205078125, "learning_rate": 0.0009730802633395541, - "loss": 0.026, + "loss": 0.0257, "macro_f1": 0.6603773832321167, "num_tokens": 2529157.0, "repeat_count": 1.0, - "routers_loss": 0.08335043489933014, + "routers_loss": 0.08138631284236908, "skip_count": 1.0, "step": 1566, "text_loss": 0.529487133026123 @@ -14894,13 +14894,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.07666015625, "learning_rate": 0.0009729799828913298, - "loss": 0.0224, + "loss": 0.0223, "macro_f1": 0.3333333432674408, "num_tokens": 2532249.0, "repeat_count": 0.0, - "routers_loss": 0.003535634372383356, + "routers_loss": 0.0035867292899638414, "skip_count": 0.0, "step": 1568, "text_loss": 0.503160297870636 @@ -14913,13 +14913,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.06298828125, + "grad_norm": 0.06884765625, "learning_rate": 0.0009728795211962838, "loss": 0.0259, "macro_f1": 0.5492662787437439, "num_tokens": 2535904.0, "repeat_count": 0.0, - "routers_loss": 0.025729363784193993, + "routers_loss": 0.02987455204129219, "skip_count": 2.0, "step": 1570, "text_loss": 0.9170270562171936 @@ -14932,13 +14932,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1357421875, + "grad_norm": 0.11865234375, "learning_rate": 0.0009727788782929131, - "loss": 0.0287, + "loss": 0.0273, "macro_f1": 0.3272727429866791, "num_tokens": 2538943.0, "repeat_count": 1.0, - "routers_loss": 0.059166863560676575, + "routers_loss": 0.04676021635532379, "skip_count": 0.0, "step": 1572, "text_loss": 0.29146310687065125 @@ -14951,13 +14951,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.0654296875, "learning_rate": 0.0009726780542197844, - "loss": 0.0173, + "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 2541805.0, "repeat_count": 0.0, - "routers_loss": 0.002580022206529975, + "routers_loss": 0.002127803163602948, "skip_count": 0.0, "step": 1574, "text_loss": 1.0126502513885498 @@ -14970,13 +14970,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.142578125, "learning_rate": 0.0009725770490155338, - "loss": 0.0257, + "loss": 0.0262, "macro_f1": 0.3333333432674408, "num_tokens": 2546213.0, "repeat_count": 0.0, - "routers_loss": 0.007746981456875801, + "routers_loss": 0.007609677035361528, "skip_count": 0.0, "step": 1576, "text_loss": 0.190168559551239 @@ -14989,13 +14989,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.083984375, "learning_rate": 0.0009724758627188665, - "loss": 0.0344, + "loss": 0.0356, "macro_f1": 0.3272727429866791, "num_tokens": 2549554.0, "repeat_count": 0.0, - "routers_loss": 0.027308562770485878, + "routers_loss": 0.033554721623659134, "skip_count": 1.0, "step": 1578, "text_loss": 0.2977406084537506 @@ -15008,13 +15008,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.140625, "learning_rate": 0.0009723744953685572, - "loss": 0.0277, + "loss": 0.028, "macro_f1": 0.3272727429866791, "num_tokens": 2552785.0, "repeat_count": 1.0, - "routers_loss": 0.029863199219107628, + "routers_loss": 0.027864238247275352, "skip_count": 0.0, "step": 1580, "text_loss": 0.2700682580471039 @@ -15027,13 +15027,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.19921875, "learning_rate": 0.0009722729470034503, - "loss": 0.0218, + "loss": 0.0224, "macro_f1": 0.3333333432674408, "num_tokens": 2556550.0, "repeat_count": 0.0, - "routers_loss": 0.004019706044346094, + "routers_loss": 0.004798175301402807, "skip_count": 0.0, "step": 1582, "text_loss": 0.6559903025627136 @@ -15046,32 +15046,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07177734375, + "grad_norm": 0.078125, "learning_rate": 0.0009721712176624591, - "loss": 0.0239, + "loss": 0.0242, "macro_f1": 0.3333333432674408, "num_tokens": 2559862.0, "repeat_count": 0.0, - "routers_loss": 0.014162382110953331, + "routers_loss": 0.013764148578047752, "skip_count": 0.0, "step": 1584, "text_loss": 0.2257535308599472 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 7.446140299383622, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10986328125, "learning_rate": 0.0009720693073845667, - "loss": 0.0338, - "macro_f1": 0.32098764181137085, + "loss": 0.032, + "macro_f1": 0.5492662787437439, "num_tokens": 2562766.0, "repeat_count": 0.0, - "routers_loss": 0.023485012352466583, + "routers_loss": 0.01937069371342659, "skip_count": 2.0, "step": 1586, "text_loss": 0.178413525223732 @@ -15079,37 +15079,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 7.455532726739067, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.126953125, + "grad_norm": 0.150390625, "learning_rate": 0.0009719672162088252, - "loss": 0.0308, - "macro_f1": 0.3272727429866791, + "loss": 0.0306, + "macro_f1": 0.32098767161369324, "num_tokens": 2566583.0, "repeat_count": 1.0, - "routers_loss": 0.05822715163230896, + "routers_loss": 0.06224144622683525, "skip_count": 0.0, "step": 1588, "text_loss": 0.3992367684841156 }, { - "acc_repeat": 0.5, - "acc_skip": 0.5, + "acc_repeat": 1.0, + "acc_skip": 0.75, "avg_layers": 27.0, "epoch": 7.464925154094511, - "f1_execute": 0.936170220375061, - "f1_repeat": 0.6666666865348816, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.189453125, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.185546875, "learning_rate": 0.0009718649441743559, - "loss": 0.0243, - "macro_f1": 0.7565011978149414, + "loss": 0.0239, + "macro_f1": 0.9449735879898071, "num_tokens": 2569516.0, "repeat_count": 2.0, - "routers_loss": 0.07448136061429977, + "routers_loss": 0.06937911361455917, "skip_count": 4.0, "step": 1590, "text_loss": 0.1945122629404068 @@ -15122,13 +15122,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.0654296875, "learning_rate": 0.00097176249132035, - "loss": 0.0228, + "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2572418.0, "repeat_count": 0.0, - "routers_loss": 0.0038424162194132805, + "routers_loss": 0.0034326619934290648, "skip_count": 0.0, "step": 1592, "text_loss": 0.6259906888008118 @@ -15141,13 +15141,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.08642578125, "learning_rate": 0.0009716598576860676, - "loss": 0.0277, + "loss": 0.0278, "macro_f1": 0.6666666865348816, "num_tokens": 2575235.0, "repeat_count": 1.0, - "routers_loss": 0.005674343090504408, + "routers_loss": 0.004557516425848007, "skip_count": 0.0, "step": 1594, "text_loss": 0.6638736724853516 @@ -15160,13 +15160,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.189453125, + "grad_norm": 0.193359375, "learning_rate": 0.0009715570433108378, - "loss": 0.0209, + "loss": 0.0198, "macro_f1": 1.0, "num_tokens": 2578157.0, "repeat_count": 1.0, - "routers_loss": 0.015544800087809563, + "routers_loss": 0.015363055281341076, "skip_count": 1.0, "step": 1596, "text_loss": 0.6530464887619019 @@ -15179,13 +15179,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1484375, "learning_rate": 0.0009714540482340595, - "loss": 0.0279, + "loss": 0.0268, "macro_f1": 0.6666666865348816, "num_tokens": 2581801.0, "repeat_count": 1.0, - "routers_loss": 0.013199405744671822, + "routers_loss": 0.01257144846022129, "skip_count": 0.0, "step": 1598, "text_loss": 0.5916110277175903 @@ -15198,13 +15198,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059326171875, + "grad_norm": 0.058837890625, "learning_rate": 0.0009713508724952006, - "loss": 0.0178, + "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 2585204.0, "repeat_count": 0.0, - "routers_loss": 0.0032487998250871897, + "routers_loss": 0.003175645601004362, "skip_count": 0.0, "step": 1600, "text_loss": 0.27901601791381836 @@ -15217,13 +15217,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12255859375, + "grad_norm": 0.12353515625, "learning_rate": 0.0009712475161337981, - "loss": 0.0253, + "loss": 0.0261, "macro_f1": 0.3333333432674408, "num_tokens": 2588286.0, "repeat_count": 0.0, - "routers_loss": 0.0041928659193217754, + "routers_loss": 0.004122321493923664, "skip_count": 0.0, "step": 1602, "text_loss": 0.42420244216918945 @@ -15236,13 +15236,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.07470703125, "learning_rate": 0.0009711439791894585, - "loss": 0.0343, + "loss": 0.0341, "macro_f1": 0.6666666865348816, "num_tokens": 2591476.0, "repeat_count": 0.0, - "routers_loss": 0.011576149612665176, + "routers_loss": 0.011215819045901299, "skip_count": 1.0, "step": 1604, "text_loss": 0.5549933910369873 @@ -15255,13 +15255,13 @@ "f1_execute": 0.9599999785423279, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.07568359375, + "grad_norm": 0.0703125, "learning_rate": 0.0009710402617018574, - "loss": 0.0179, + "loss": 0.0172, "macro_f1": 0.8200000524520874, "num_tokens": 2594336.0, "repeat_count": 1.0, - "routers_loss": 0.03026912547647953, + "routers_loss": 0.02916567400097847, "skip_count": 2.0, "step": 1606, "text_loss": 0.3263779282569885 @@ -15276,11 +15276,11 @@ "f1_skip": 1.0, "grad_norm": 0.068359375, "learning_rate": 0.0009709363637107393, - "loss": 0.021, + "loss": 0.0209, "macro_f1": 0.6666666865348816, "num_tokens": 2597462.0, "repeat_count": 0.0, - "routers_loss": 0.014957098290324211, + "routers_loss": 0.015897957608103752, "skip_count": 1.0, "step": 1608, "text_loss": 0.20917139947414398 @@ -15293,13 +15293,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.1611328125, "learning_rate": 0.0009708322852559184, - "loss": 0.0226, + "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2601543.0, "repeat_count": 0.0, - "routers_loss": 0.00254683755338192, + "routers_loss": 0.002211357234045863, "skip_count": 0.0, "step": 1610, "text_loss": 0.450550377368927 @@ -15312,13 +15312,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1748046875, + "grad_norm": 0.1728515625, "learning_rate": 0.0009707280263772776, - "loss": 0.0286, + "loss": 0.0277, "macro_f1": 0.6666666865348816, "num_tokens": 2604462.0, "repeat_count": 0.0, - "routers_loss": 0.018759876489639282, + "routers_loss": 0.01615734025835991, "skip_count": 2.0, "step": 1612, "text_loss": 0.6908381581306458 @@ -15337,7 +15337,7 @@ "macro_f1": 0.5492662787437439, "num_tokens": 2607484.0, "repeat_count": 0.0, - "routers_loss": 0.022694367915391922, + "routers_loss": 0.022048067301511765, "skip_count": 2.0, "step": 1614, "text_loss": 0.36691340804100037 @@ -15350,13 +15350,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.103515625, + "grad_norm": 0.10546875, "learning_rate": 0.0009705189675084138, - "loss": 0.0181, + "loss": 0.0176, "macro_f1": 0.6666666865348816, "num_tokens": 2610204.0, "repeat_count": 0.0, - "routers_loss": 0.010102321393787861, + "routers_loss": 0.008503952994942665, "skip_count": 1.0, "step": 1616, "text_loss": 0.5226598381996155 @@ -15369,13 +15369,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08984375, + "grad_norm": 0.09228515625, "learning_rate": 0.0009704141675983029, - "loss": 0.0252, + "loss": 0.0248, "macro_f1": 0.3333333432674408, "num_tokens": 2613128.0, "repeat_count": 0.0, - "routers_loss": 0.0020994991064071655, + "routers_loss": 0.0019020626787096262, "skip_count": 0.0, "step": 1618, "text_loss": 0.6465088725090027 @@ -15388,13 +15388,13 @@ "f1_execute": 0.9333333373069763, "f1_repeat": 0.0, "f1_skip": 0.7272727489471436, - "grad_norm": 0.10009765625, + "grad_norm": 0.107421875, "learning_rate": 0.0009703091874245956, - "loss": 0.0323, + "loss": 0.032, "macro_f1": 0.5535354018211365, "num_tokens": 2616360.0, "repeat_count": 0.0, - "routers_loss": 0.11748704314231873, + "routers_loss": 0.11837691068649292, "skip_count": 7.0, "step": 1620, "text_loss": 0.2987039089202881 @@ -15407,32 +15407,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.061767578125, + "grad_norm": 0.06689453125, "learning_rate": 0.0009702040270275204, - "loss": 0.018, + "loss": 0.0181, "macro_f1": 0.3333333432674408, "num_tokens": 2619606.0, "repeat_count": 0.0, - "routers_loss": 0.007642311509698629, + "routers_loss": 0.0065958453342318535, "skip_count": 0.0, "step": 1622, "text_loss": 0.6262096166610718 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 7.62459641913707, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.10595703125, + "f1_skip": 1.0, + "grad_norm": 0.103515625, "learning_rate": 0.000970098686447375, - "loss": 0.0258, - "macro_f1": 0.3272727429866791, + "loss": 0.0257, + "macro_f1": 0.6666666865348816, "num_tokens": 2622499.0, "repeat_count": 0.0, - "routers_loss": 0.016890225932002068, + "routers_loss": 0.013632026500999928, "skip_count": 1.0, "step": 1624, "text_loss": 0.2392602562904358 @@ -15445,13 +15445,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1240234375, + "grad_norm": 0.125, "learning_rate": 0.0009699931657245264, - "loss": 0.0242, + "loss": 0.0245, "macro_f1": 0.5492662787437439, "num_tokens": 2626002.0, "repeat_count": 0.0, - "routers_loss": 0.010900186374783516, + "routers_loss": 0.012147823348641396, "skip_count": 2.0, "step": 1626, "text_loss": 0.4742976129055023 @@ -15464,13 +15464,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0810546875, + "grad_norm": 0.0849609375, "learning_rate": 0.0009698874648994098, - "loss": 0.0279, + "loss": 0.0285, "macro_f1": 1.0, "num_tokens": 2629847.0, "repeat_count": 1.0, - "routers_loss": 0.011229799129068851, + "routers_loss": 0.010692884214222431, "skip_count": 3.0, "step": 1628, "text_loss": 0.5090685486793518 @@ -15483,13 +15483,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.1240234375, "learning_rate": 0.0009697815840125304, - "loss": 0.0275, + "loss": 0.0265, "macro_f1": 0.3333333432674408, "num_tokens": 2633529.0, "repeat_count": 0.0, - "routers_loss": 0.0105878422036767, + "routers_loss": 0.011442207731306553, "skip_count": 0.0, "step": 1630, "text_loss": 0.1874329298734665 @@ -15502,13 +15502,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.2119140625, "learning_rate": 0.0009696755231044618, - "loss": 0.0209, + "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 2636321.0, "repeat_count": 0.0, - "routers_loss": 0.002953991526737809, + "routers_loss": 0.0026681360322982073, "skip_count": 0.0, "step": 1632, "text_loss": 0.7650400400161743 @@ -15521,13 +15521,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10888671875, + "grad_norm": 0.10498046875, "learning_rate": 0.0009695692822158466, - "loss": 0.0241, + "loss": 0.0242, "macro_f1": 0.3272727429866791, "num_tokens": 2638840.0, "repeat_count": 1.0, - "routers_loss": 0.04717390984296799, + "routers_loss": 0.033965807408094406, "skip_count": 0.0, "step": 1634, "text_loss": 0.6175784468650818 @@ -15540,13 +15540,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.058349609375, "learning_rate": 0.0009694628613873968, - "loss": 0.0179, + "loss": 0.018, "macro_f1": 0.3333333432674408, "num_tokens": 2641886.0, "repeat_count": 0.0, - "routers_loss": 0.0073657832108438015, + "routers_loss": 0.007568214554339647, "skip_count": 0.0, "step": 1636, "text_loss": 0.43139931559562683 @@ -15559,13 +15559,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.193359375, "learning_rate": 0.0009693562606598929, - "loss": 0.0259, + "loss": 0.025, "macro_f1": 0.3333333432674408, "num_tokens": 2645028.0, "repeat_count": 0.0, - "routers_loss": 0.005212752148509026, + "routers_loss": 0.004973865579813719, "skip_count": 0.0, "step": 1638, "text_loss": 0.6430339217185974 @@ -15578,13 +15578,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.06982421875, "learning_rate": 0.0009692494800741844, - "loss": 0.0304, + "loss": 0.0313, "macro_f1": 0.3272727429866791, "num_tokens": 2648209.0, "repeat_count": 1.0, - "routers_loss": 0.04311618581414223, + "routers_loss": 0.049863800406455994, "skip_count": 0.0, "step": 1640, "text_loss": 0.28138160705566406 @@ -15597,13 +15597,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08251953125, + "grad_norm": 0.08544921875, "learning_rate": 0.0009691425196711901, - "loss": 0.039, + "loss": 0.0398, "macro_f1": 0.3272727429866791, "num_tokens": 2651171.0, "repeat_count": 0.0, - "routers_loss": 0.02027471922338009, + "routers_loss": 0.02112230286002159, "skip_count": 0.0, "step": 1642, "text_loss": 0.3745322525501251 @@ -15616,13 +15616,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.0703125, "learning_rate": 0.0009690353794918971, - "loss": 0.0279, + "loss": 0.0275, "macro_f1": 0.3333333432674408, "num_tokens": 2654093.0, "repeat_count": 0.0, - "routers_loss": 0.003074956126511097, + "routers_loss": 0.0024304776452481747, "skip_count": 0.0, "step": 1644, "text_loss": 0.4275154173374176 @@ -15635,13 +15635,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.0771484375, "learning_rate": 0.000968928059577362, - "loss": 0.0241, + "loss": 0.0244, "macro_f1": 0.6666666865348816, "num_tokens": 2657079.0, "repeat_count": 0.0, - "routers_loss": 0.009374706074595451, + "routers_loss": 0.009320619516074657, "skip_count": 1.0, "step": 1646, "text_loss": 0.46650025248527527 @@ -15654,13 +15654,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1162109375, + "grad_norm": 0.09814453125, "learning_rate": 0.0009688205599687099, - "loss": 0.0218, + "loss": 0.0209, "macro_f1": 0.3272727429866791, "num_tokens": 2660951.0, "repeat_count": 0.0, - "routers_loss": 0.01204691268503666, + "routers_loss": 0.011913162656128407, "skip_count": 0.0, "step": 1648, "text_loss": 0.46644100546836853 @@ -15673,13 +15673,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.1083984375, "learning_rate": 0.0009687128807071347, "loss": 0.0284, "macro_f1": 0.3333333432674408, "num_tokens": 2663823.0, "repeat_count": 0.0, - "routers_loss": 0.01376053225249052, + "routers_loss": 0.013754756189882755, "skip_count": 0.0, "step": 1650, "text_loss": 0.40808847546577454 @@ -15692,13 +15692,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.103515625, "learning_rate": 0.0009686050218338996, - "loss": 0.0285, + "loss": 0.0286, "macro_f1": 0.3333333432674408, "num_tokens": 2667079.0, "repeat_count": 0.0, - "routers_loss": 0.009346984326839447, + "routers_loss": 0.009099726565182209, "skip_count": 0.0, "step": 1652, "text_loss": 0.2389989197254181 @@ -15711,13 +15711,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.08837890625, "learning_rate": 0.0009684969833903359, - "loss": 0.0291, + "loss": 0.0283, "macro_f1": 0.6666666865348816, "num_tokens": 2670162.0, "repeat_count": 0.0, - "routers_loss": 0.002724624238908291, + "routers_loss": 0.0034928603563457727, "skip_count": 1.0, "step": 1654, "text_loss": 0.6930749416351318 @@ -15730,13 +15730,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.123046875, + "grad_norm": 0.10888671875, "learning_rate": 0.0009683887654178445, - "loss": 0.0271, + "loss": 0.0261, "macro_f1": 0.6666666865348816, "num_tokens": 2673031.0, "repeat_count": 0.0, - "routers_loss": 0.00823777075856924, + "routers_loss": 0.008340462110936642, "skip_count": 1.0, "step": 1656, "text_loss": 0.277752548456192 @@ -15749,32 +15749,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07373046875, + "grad_norm": 0.06884765625, "learning_rate": 0.0009682803679578947, - "loss": 0.0262, + "loss": 0.0259, "macro_f1": 0.3333333432674408, "num_tokens": 2676092.0, "repeat_count": 0.0, - "routers_loss": 0.004393119364976883, + "routers_loss": 0.004337446764111519, "skip_count": 0.0, "step": 1658, "text_loss": 0.5176776051521301 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 7.7936601115350745, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.1513671875, + "f1_skip": 0.0, + "grad_norm": 0.169921875, "learning_rate": 0.0009681717910520244, - "loss": 0.024, - "macro_f1": 0.5492662787437439, + "loss": 0.0242, + "macro_f1": 0.32098764181137085, "num_tokens": 2679479.0, "repeat_count": 0.0, - "routers_loss": 0.031827569007873535, + "routers_loss": 0.034611742943525314, "skip_count": 2.0, "step": 1660, "text_loss": 0.21485982835292816 @@ -15789,11 +15789,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.07958984375, "learning_rate": 0.0009680630347418406, - "loss": 0.0216, + "loss": 0.022, "macro_f1": 0.5492662787437439, "num_tokens": 2683289.0, "repeat_count": 0.0, - "routers_loss": 0.03329647704958916, + "routers_loss": 0.03297121450304985, "skip_count": 2.0, "step": 1662, "text_loss": 0.33801013231277466 @@ -15806,13 +15806,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1728515625, "learning_rate": 0.000967954099069019, - "loss": 0.0415, + "loss": 0.0411, "macro_f1": 0.32098764181137085, "num_tokens": 2685879.0, "repeat_count": 1.0, - "routers_loss": 0.047317031770944595, + "routers_loss": 0.04551183059811592, "skip_count": 1.0, "step": 1664, "text_loss": 0.41123488545417786 @@ -15827,11 +15827,11 @@ "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.0009678449840753038, - "loss": 0.0325, + "loss": 0.0324, "macro_f1": 0.32098764181137085, "num_tokens": 2688910.0, "repeat_count": 0.0, - "routers_loss": 0.05649980902671814, + "routers_loss": 0.05866450071334839, "skip_count": 2.0, "step": 1666, "text_loss": 0.1740892380475998 @@ -15844,13 +15844,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.09228515625, "learning_rate": 0.0009677356898025082, - "loss": 0.0229, + "loss": 0.023, "macro_f1": 0.3333333432674408, "num_tokens": 2691680.0, "repeat_count": 0.0, - "routers_loss": 0.01004624180495739, + "routers_loss": 0.009243223816156387, "skip_count": 0.0, "step": 1668, "text_loss": 0.2512350380420685 @@ -15863,13 +15863,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.09619140625, "learning_rate": 0.000967626216292514, - "loss": 0.0194, + "loss": 0.0195, "macro_f1": 0.3333333432674408, "num_tokens": 2694895.0, "repeat_count": 0.0, - "routers_loss": 0.0054973396472632885, + "routers_loss": 0.005576452240347862, "skip_count": 0.0, "step": 1670, "text_loss": 0.43294376134872437 @@ -15882,13 +15882,13 @@ "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.09619140625, + "grad_norm": 0.09130859375, "learning_rate": 0.0009675165635872715, - "loss": 0.031, + "loss": 0.0306, "macro_f1": 0.44705885648727417, "num_tokens": 2697806.0, "repeat_count": 0.0, - "routers_loss": 0.05615650862455368, + "routers_loss": 0.05372785031795502, "skip_count": 3.0, "step": 1672, "text_loss": 0.1614082306623459 @@ -15901,13 +15901,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.11669921875, "learning_rate": 0.0009674067317288, - "loss": 0.0301, + "loss": 0.0296, "macro_f1": 0.6666666865348816, "num_tokens": 2700529.0, "repeat_count": 1.0, - "routers_loss": 0.012819192372262478, + "routers_loss": 0.018131591379642487, "skip_count": 0.0, "step": 1674, "text_loss": 0.2093173861503601 @@ -15920,13 +15920,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.08203125, "learning_rate": 0.0009672967207591869, - "loss": 0.0253, + "loss": 0.0257, "macro_f1": 0.3272727429866791, "num_tokens": 2703650.0, "repeat_count": 0.0, - "routers_loss": 0.07059332728385925, + "routers_loss": 0.0673515796661377, "skip_count": 1.0, "step": 1676, "text_loss": 0.3029400110244751 @@ -15939,13 +15939,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.11669921875, "learning_rate": 0.0009671865307205892, - "loss": 0.0198, + "loss": 0.021, "macro_f1": 0.32098767161369324, "num_tokens": 2707615.0, "repeat_count": 0.0, - "routers_loss": 0.029778441414237022, + "routers_loss": 0.03821169584989548, "skip_count": 1.0, "step": 1678, "text_loss": 0.2262786477804184 @@ -15958,13 +15958,13 @@ "f1_execute": 0.9756097793579102, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, - "grad_norm": 0.1416015625, + "grad_norm": 0.1396484375, "learning_rate": 0.0009670761616552315, - "loss": 0.0474, + "loss": 0.0465, "macro_f1": 0.9615669250488281, "num_tokens": 2710894.0, "repeat_count": 2.0, - "routers_loss": 0.04371272772550583, + "routers_loss": 0.042625464498996735, "skip_count": 6.0, "step": 1680, "text_loss": 0.29623574018478394 @@ -15977,13 +15977,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.169921875, "learning_rate": 0.0009669656136054074, - "loss": 0.0293, + "loss": 0.0289, "macro_f1": 0.3333333432674408, "num_tokens": 2714330.0, "repeat_count": 0.0, - "routers_loss": 0.0033591394312679768, + "routers_loss": 0.0037571541033685207, "skip_count": 0.0, "step": 1682, "text_loss": 0.7510389089584351 @@ -15996,13 +15996,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.072265625, + "grad_norm": 0.07421875, "learning_rate": 0.0009668548866134795, - "loss": 0.0259, + "loss": 0.0256, "macro_f1": 0.3333333432674408, "num_tokens": 2717176.0, "repeat_count": 0.0, - "routers_loss": 0.005085585173219442, + "routers_loss": 0.004142968449741602, "skip_count": 0.0, "step": 1684, "text_loss": 0.3273485600948334 @@ -16015,13 +16015,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0712890625, + "grad_norm": 0.07373046875, "learning_rate": 0.0009667439807218783, - "loss": 0.0243, + "loss": 0.0233, "macro_f1": 0.6666666865348816, "num_tokens": 2720628.0, "repeat_count": 0.0, - "routers_loss": 0.008569681085646152, + "routers_loss": 0.008753842674195766, "skip_count": 2.0, "step": 1686, "text_loss": 0.4314708709716797 @@ -16034,32 +16034,32 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.0732421875, "learning_rate": 0.0009666328959731033, - "loss": 0.022, + "loss": 0.0211, "macro_f1": 0.6603773832321167, "num_tokens": 2723739.0, "repeat_count": 1.0, - "routers_loss": 0.024587804451584816, + "routers_loss": 0.022674910724163055, "skip_count": 1.0, "step": 1688, "text_loss": 0.25734150409698486 }, { "acc_repeat": 0.0, - "acc_skip": 0.3333333432674408, - "avg_layers": 27.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, "epoch": 7.934546521866745, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, - "f1_skip": 0.5, - "grad_norm": 0.169921875, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1552734375, "learning_rate": 0.0009665216324097222, - "loss": 0.0332, - "macro_f1": 0.4871794879436493, + "loss": 0.0324, + "macro_f1": 0.5934640765190125, "num_tokens": 2726644.0, "repeat_count": 0.0, - "routers_loss": 0.037516288459300995, + "routers_loss": 0.03932750225067139, "skip_count": 3.0, "step": 1690, "text_loss": 0.24511034786701202 @@ -16072,13 +16072,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.09765625, "learning_rate": 0.0009664101900743714, - "loss": 0.0262, + "loss": 0.0255, "macro_f1": 0.3272727429866791, "num_tokens": 2729662.0, "repeat_count": 0.0, - "routers_loss": 0.01287431176751852, + "routers_loss": 0.012672754004597664, "skip_count": 1.0, "step": 1692, "text_loss": 0.39431414008140564 @@ -16091,13 +16091,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.07763671875, + "grad_norm": 0.076171875, "learning_rate": 0.000966298569009756, - "loss": 0.0227, + "loss": 0.0231, "macro_f1": 0.5492662787437439, "num_tokens": 2732578.0, "repeat_count": 0.0, - "routers_loss": 0.015499880537390709, + "routers_loss": 0.01548632513731718, "skip_count": 2.0, "step": 1694, "text_loss": 0.12439999729394913 @@ -16110,13 +16110,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.080078125, + "grad_norm": 0.0849609375, "learning_rate": 0.0009661867692586494, - "loss": 0.0144, + "loss": 0.0153, "macro_f1": 0.32098764181137085, "num_tokens": 2735887.0, "repeat_count": 0.0, - "routers_loss": 0.049878787249326706, + "routers_loss": 0.05622401833534241, "skip_count": 2.0, "step": 1696, "text_loss": 0.29024389386177063 @@ -16129,13 +16129,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10009765625, + "grad_norm": 0.087890625, "learning_rate": 0.0009660747908638933, - "loss": 0.0206, + "loss": 0.0205, "macro_f1": 0.3272727429866791, "num_tokens": 2739293.0, "repeat_count": 0.0, - "routers_loss": 0.04108169302344322, + "routers_loss": 0.041060201823711395, "skip_count": 1.0, "step": 1698, "text_loss": 0.39461007714271545 @@ -16148,13 +16148,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.1767578125, "learning_rate": 0.0009659626338683981, - "loss": 0.0367, + "loss": 0.0369, "macro_f1": 0.3333333432674408, "num_tokens": 2742468.0, "repeat_count": 0.0, - "routers_loss": 0.007651917636394501, + "routers_loss": 0.007251353468745947, "skip_count": 0.0, "step": 1700, "text_loss": 0.2751767635345459 @@ -16167,13 +16167,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.07763671875, "learning_rate": 0.0009658502983151427, - "loss": 0.0182, + "loss": 0.0186, "macro_f1": 0.3272727429866791, "num_tokens": 2745123.0, "repeat_count": 0.0, - "routers_loss": 0.015448091551661491, + "routers_loss": 0.012847424484789371, "skip_count": 1.0, "step": 1702, "text_loss": 0.4756404757499695 @@ -16186,13 +16186,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.11767578125, "learning_rate": 0.0009657377842471742, - "loss": 0.0324, + "loss": 0.0313, "macro_f1": 0.6666666865348816, "num_tokens": 2748016.0, "repeat_count": 0.0, - "routers_loss": 0.009139287285506725, + "routers_loss": 0.007060411386191845, "skip_count": 1.0, "step": 1704, "text_loss": 0.9571210145950317 @@ -16205,13 +16205,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0869140625, + "grad_norm": 0.10009765625, "learning_rate": 0.0009656250917076081, - "loss": 0.0191, + "loss": 0.0188, "macro_f1": 0.5492662787437439, "num_tokens": 2750717.0, "repeat_count": 0.0, - "routers_loss": 0.015412120148539543, + "routers_loss": 0.016748681664466858, "skip_count": 2.0, "step": 1706, "text_loss": 0.14542843401432037 @@ -16224,13 +16224,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.060302734375, "learning_rate": 0.0009655122207396285, - "loss": 0.0175, + "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 2753635.0, "repeat_count": 0.0, - "routers_loss": 0.012735052965581417, + "routers_loss": 0.013607042841613293, "skip_count": 0.0, "step": 1708, "text_loss": 0.21836471557617188 @@ -16243,13 +16243,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07177734375, + "grad_norm": 0.0732421875, "learning_rate": 0.0009653991713864878, - "loss": 0.0192, + "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2756643.0, "repeat_count": 0.0, - "routers_loss": 0.00114025070797652, + "routers_loss": 0.0012097888393327594, "skip_count": 0.0, "step": 1710, "text_loss": 0.635187029838562 @@ -16262,13 +16262,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1171875, "learning_rate": 0.0009652859436915066, - "loss": 0.0243, + "loss": 0.0231, "macro_f1": 0.3333333432674408, "num_tokens": 2759432.0, "repeat_count": 0.0, - "routers_loss": 0.006401443853974342, + "routers_loss": 0.006196760106831789, "skip_count": 0.0, "step": 1712, "text_loss": 0.5629420876502991 @@ -16281,13 +16281,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.0615234375, "learning_rate": 0.0009651725376980743, - "loss": 0.0185, + "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 2762538.0, "repeat_count": 0.0, - "routers_loss": 0.004316259175539017, + "routers_loss": 0.0042513771913945675, "skip_count": 0.0, "step": 1714, "text_loss": 0.39522525668144226 @@ -16300,13 +16300,13 @@ "f1_execute": 0.9583333134651184, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.125, + "grad_norm": 0.1494140625, "learning_rate": 0.0009650589534496479, - "loss": 0.0201, + "loss": 0.0194, "macro_f1": 0.8194444179534912, "num_tokens": 2765571.0, "repeat_count": 2.0, - "routers_loss": 0.043461959809064865, + "routers_loss": 0.03596706688404083, "skip_count": 3.0, "step": 1716, "text_loss": 0.6252416968345642 @@ -16319,13 +16319,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.04833984375, "learning_rate": 0.0009649451909897532, "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 2769206.0, "repeat_count": 0.0, - "routers_loss": 0.0024530428927391768, + "routers_loss": 0.0025788163766264915, "skip_count": 0.0, "step": 1718, "text_loss": 0.8851634860038757 @@ -16338,13 +16338,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1015625, + "grad_norm": 0.10791015625, "learning_rate": 0.0009648312503619843, - "loss": 0.026, + "loss": 0.0265, "macro_f1": 0.3333333432674408, "num_tokens": 2772488.0, "repeat_count": 0.0, - "routers_loss": 0.0046626063995063305, + "routers_loss": 0.004443451762199402, "skip_count": 0.0, "step": 1720, "text_loss": 0.8568580746650696 @@ -16357,13 +16357,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.1513671875, + "grad_norm": 0.1552734375, "learning_rate": 0.0009647171316100034, - "loss": 0.0257, + "loss": 0.0265, "macro_f1": 0.9265305995941162, "num_tokens": 2776482.0, "repeat_count": 1.0, - "routers_loss": 0.02480102889239788, + "routers_loss": 0.022948263213038445, "skip_count": 3.0, "step": 1722, "text_loss": 0.13431036472320557 @@ -16376,13 +16376,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.126953125, + "grad_norm": 0.1572265625, "learning_rate": 0.0009646028347775409, - "loss": 0.02, + "loss": 0.0204, "macro_f1": 0.6666666865348816, "num_tokens": 2778966.0, "repeat_count": 0.0, - "routers_loss": 0.012629947625100613, + "routers_loss": 0.011328035034239292, "skip_count": 1.0, "step": 1724, "text_loss": 0.2085491120815277 @@ -16395,13 +16395,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08447265625, + "grad_norm": 0.08984375, "learning_rate": 0.0009644883599083958, "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2781968.0, "repeat_count": 0.0, - "routers_loss": 0.0024127380456775427, + "routers_loss": 0.002208018908277154, "skip_count": 0.0, "step": 1726, "text_loss": 0.4948323965072632 @@ -16414,13 +16414,13 @@ "f1_execute": 0.9411764740943909, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.054443359375, + "grad_norm": 0.062255859375, "learning_rate": 0.0009643737070464349, - "loss": 0.0162, + "loss": 0.0158, "macro_f1": 0.6470588445663452, "num_tokens": 2784666.0, "repeat_count": 1.0, - "routers_loss": 0.0415453165769577, + "routers_loss": 0.04391832649707794, "skip_count": 2.0, "step": 1728, "text_loss": 0.39060094952583313 @@ -16433,13 +16433,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.046630859375, "learning_rate": 0.0009642588762355935, - "loss": 0.0211, + "loss": 0.0212, "macro_f1": 0.6666666865348816, "num_tokens": 2787558.0, "repeat_count": 0.0, - "routers_loss": 0.0056681083515286446, + "routers_loss": 0.004497280344367027, "skip_count": 1.0, "step": 1730, "text_loss": 0.34908708930015564 @@ -16452,13 +16452,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.07275390625, "learning_rate": 0.0009641438675198748, - "loss": 0.0189, + "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 2790474.0, "repeat_count": 0.0, - "routers_loss": 0.006391602102667093, + "routers_loss": 0.00583475548774004, "skip_count": 0.0, "step": 1732, "text_loss": 0.5720033049583435 @@ -16471,13 +16471,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0595703125, + "grad_norm": 0.08154296875, "learning_rate": 0.0009640286809433508, - "loss": 0.0229, + "loss": 0.0235, "macro_f1": 0.3333333432674408, "num_tokens": 2793272.0, "repeat_count": 0.0, - "routers_loss": 0.007466991897672415, + "routers_loss": 0.007826375775039196, "skip_count": 0.0, "step": 1734, "text_loss": 0.32181721925735474 @@ -16490,13 +16490,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.05419921875, "learning_rate": 0.0009639133165501606, - "loss": 0.0197, + "loss": 0.0192, "macro_f1": 0.3333333432674408, "num_tokens": 2797726.0, "repeat_count": 0.0, - "routers_loss": 0.001953453291207552, + "routers_loss": 0.0019055595621466637, "skip_count": 0.0, "step": 1736, "text_loss": 0.620936393737793 @@ -16509,13 +16509,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.134765625, "learning_rate": 0.0009637977743845124, - "loss": 0.0223, + "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2800706.0, "repeat_count": 0.0, - "routers_loss": 0.003612719476222992, + "routers_loss": 0.0028302327264100313, "skip_count": 0.0, "step": 1738, "text_loss": 0.6473138332366943 @@ -16528,13 +16528,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.049072265625, + "grad_norm": 0.0634765625, "learning_rate": 0.0009636820544906823, - "loss": 0.0145, + "loss": 0.0146, "macro_f1": 1.0, "num_tokens": 2803847.0, "repeat_count": 1.0, - "routers_loss": 0.009977150708436966, + "routers_loss": 0.01105099730193615, "skip_count": 2.0, "step": 1740, "text_loss": 0.4401201903820038 @@ -16547,13 +16547,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.10791015625, + "grad_norm": 0.1455078125, "learning_rate": 0.0009635661569130141, "loss": 0.0195, "macro_f1": 0.5934640765190125, "num_tokens": 2807235.0, "repeat_count": 0.0, - "routers_loss": 0.026468059048056602, + "routers_loss": 0.02619045600295067, "skip_count": 3.0, "step": 1742, "text_loss": 0.459264874458313 @@ -16566,13 +16566,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.06396484375, "learning_rate": 0.0009634500816959202, - "loss": 0.0165, + "loss": 0.0162, "macro_f1": 0.6666666865348816, "num_tokens": 2810396.0, "repeat_count": 0.0, - "routers_loss": 0.00849854201078415, + "routers_loss": 0.007915694266557693, "skip_count": 2.0, "step": 1744, "text_loss": 0.5084020495414734 @@ -16585,13 +16585,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.1748046875, "learning_rate": 0.0009633338288838805, - "loss": 0.0275, + "loss": 0.0271, "macro_f1": 0.5492662787437439, "num_tokens": 2813215.0, "repeat_count": 2.0, - "routers_loss": 0.08082596957683563, + "routers_loss": 0.08364596217870712, "skip_count": 0.0, "step": 1746, "text_loss": 0.27681824564933777 @@ -16604,13 +16604,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.046142578125, + "grad_norm": 0.051025390625, "learning_rate": 0.0009632173985214438, - "loss": 0.015, + "loss": 0.0156, "macro_f1": 0.8817967176437378, "num_tokens": 2816452.0, "repeat_count": 3.0, - "routers_loss": 0.029500717297196388, + "routers_loss": 0.028805451467633247, "skip_count": 2.0, "step": 1748, "text_loss": 0.4678419530391693 @@ -16623,13 +16623,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.0625, "learning_rate": 0.000963100790653226, - "loss": 0.0183, + "loss": 0.0188, "macro_f1": 0.3272727429866791, "num_tokens": 2819364.0, "repeat_count": 0.0, - "routers_loss": 0.025238536298274994, + "routers_loss": 0.03056817688047886, "skip_count": 1.0, "step": 1750, "text_loss": 0.3078109920024872 @@ -16642,13 +16642,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0703125, + "grad_norm": 0.06689453125, "learning_rate": 0.0009629840053239116, - "loss": 0.0204, + "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2823469.0, "repeat_count": 0.0, - "routers_loss": 0.002069319598376751, + "routers_loss": 0.0019477814203128219, "skip_count": 0.0, "step": 1752, "text_loss": 0.45501336455345154 @@ -16661,13 +16661,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05224609375, + "grad_norm": 0.057373046875, "learning_rate": 0.000962867042578253, - "loss": 0.0169, + "loss": 0.0173, "macro_f1": 0.3333333432674408, "num_tokens": 2826716.0, "repeat_count": 0.0, - "routers_loss": 0.002853946527466178, + "routers_loss": 0.0032963966950774193, "skip_count": 0.0, "step": 1754, "text_loss": 0.49234694242477417 @@ -16680,13 +16680,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0947265625, "learning_rate": 0.0009627499024610707, - "loss": 0.0236, + "loss": 0.0239, "macro_f1": 0.3272727429866791, "num_tokens": 2829733.0, "repeat_count": 0.0, - "routers_loss": 0.0100983502343297, + "routers_loss": 0.010289114899933338, "skip_count": 1.0, "step": 1756, "text_loss": 0.22335539758205414 @@ -16699,13 +16699,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09228515625, + "grad_norm": 0.0888671875, "learning_rate": 0.0009626325850172527, - "loss": 0.0173, + "loss": 0.0174, "macro_f1": 0.3272727429866791, "num_tokens": 2833350.0, "repeat_count": 0.0, - "routers_loss": 0.031218983232975006, + "routers_loss": 0.03249066323041916, "skip_count": 1.0, "step": 1758, "text_loss": 0.6581931114196777 @@ -16718,13 +16718,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.0703125, "learning_rate": 0.0009625150902917555, - "loss": 0.019, + "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 2836558.0, "repeat_count": 0.0, - "routers_loss": 0.010347879491746426, + "routers_loss": 0.00870000571012497, "skip_count": 0.0, "step": 1760, "text_loss": 0.22938725352287292 @@ -16737,13 +16737,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1455078125, + "grad_norm": 0.1259765625, "learning_rate": 0.0009623974183296031, - "loss": 0.0193, + "loss": 0.0192, "macro_f1": 0.3333333432674408, "num_tokens": 2840560.0, "repeat_count": 0.0, - "routers_loss": 0.007768871728330851, + "routers_loss": 0.007767196744680405, "skip_count": 0.0, "step": 1762, "text_loss": 0.24473799765110016 @@ -16756,13 +16756,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.09228515625, "learning_rate": 0.0009622795691758876, - "loss": 0.0253, + "loss": 0.0244, "macro_f1": 0.3333333432674408, "num_tokens": 2843548.0, "repeat_count": 0.0, - "routers_loss": 0.002887974726036191, + "routers_loss": 0.0021693643648177385, "skip_count": 0.0, "step": 1764, "text_loss": 0.3084608018398285 @@ -16777,11 +16777,11 @@ "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.0009621615428757693, - "loss": 0.0147, + "loss": 0.0149, "macro_f1": 0.3333333432674408, "num_tokens": 2847076.0, "repeat_count": 0.0, - "routers_loss": 0.0027294005267322063, + "routers_loss": 0.0024727333802729845, "skip_count": 0.0, "step": 1766, "text_loss": 0.5251734852790833 @@ -16794,13 +16794,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.0673828125, "learning_rate": 0.000962043339474476, - "loss": 0.0193, + "loss": 0.0194, "macro_f1": 0.3333333432674408, "num_tokens": 2849751.0, "repeat_count": 0.0, - "routers_loss": 0.00543541694059968, + "routers_loss": 0.005174890160560608, "skip_count": 0.0, "step": 1768, "text_loss": 0.4410129189491272 @@ -16813,13 +16813,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.06103515625, "learning_rate": 0.0009619249590173032, - "loss": 0.0167, + "loss": 0.016, "macro_f1": 0.6666666865348816, "num_tokens": 2853916.0, "repeat_count": 0.0, - "routers_loss": 0.006514009553939104, + "routers_loss": 0.006785830482840538, "skip_count": 2.0, "step": 1770, "text_loss": 0.550076425075531 @@ -16832,13 +16832,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.06396484375, + "grad_norm": 0.06591796875, "learning_rate": 0.0009618064015496149, - "loss": 0.019, + "loss": 0.0192, "macro_f1": 0.5934640765190125, "num_tokens": 2857372.0, "repeat_count": 0.0, - "routers_loss": 0.02333846502006054, + "routers_loss": 0.021370256319642067, "skip_count": 3.0, "step": 1772, "text_loss": 0.1988629847764969 @@ -16851,13 +16851,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0732421875, + "grad_norm": 0.072265625, "learning_rate": 0.0009616876671168423, - "loss": 0.0165, + "loss": 0.0162, "macro_f1": 0.6666666865348816, "num_tokens": 2861028.0, "repeat_count": 0.0, - "routers_loss": 0.004471905063837767, + "routers_loss": 0.004313841462135315, "skip_count": 1.0, "step": 1774, "text_loss": 0.42581331729888916 @@ -16870,13 +16870,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.1103515625, "learning_rate": 0.0009615687557644847, - "loss": 0.0261, + "loss": 0.0268, "macro_f1": 0.3333333432674408, "num_tokens": 2864847.0, "repeat_count": 0.0, - "routers_loss": 0.0024362702388316393, + "routers_loss": 0.0025742491707205772, "skip_count": 0.0, "step": 1776, "text_loss": 0.46510905027389526 @@ -16889,13 +16889,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1494140625, "learning_rate": 0.0009614496675381093, - "loss": 0.0116, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 2867392.0, "repeat_count": 0.0, - "routers_loss": 0.0021166049409657717, + "routers_loss": 0.0016813480760902166, "skip_count": 0.0, "step": 1778, "text_loss": 0.5922174453735352 @@ -16908,13 +16908,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0712890625, + "grad_norm": 0.0810546875, "learning_rate": 0.0009613304024833507, "loss": 0.0166, "macro_f1": 0.3333333432674408, "num_tokens": 2871273.0, "repeat_count": 0.0, - "routers_loss": 0.004722296260297298, + "routers_loss": 0.004948933608829975, "skip_count": 0.0, "step": 1780, "text_loss": 0.6776977777481079 @@ -16929,11 +16929,11 @@ "f1_skip": 1.0, "grad_norm": 0.07470703125, "learning_rate": 0.0009612109606459117, - "loss": 0.0199, + "loss": 0.0186, "macro_f1": 1.0, "num_tokens": 2874172.0, "repeat_count": 1.0, - "routers_loss": 0.014188882894814014, + "routers_loss": 0.016950147226452827, "skip_count": 2.0, "step": 1782, "text_loss": 0.48758944869041443 @@ -16946,13 +16946,13 @@ "f1_execute": 0.9599999785423279, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, - "grad_norm": 0.076171875, + "grad_norm": 0.08251953125, "learning_rate": 0.0009610913420715623, - "loss": 0.0241, + "loss": 0.0237, "macro_f1": 0.7644444704055786, "num_tokens": 2877528.0, "repeat_count": 2.0, - "routers_loss": 0.04599560424685478, + "routers_loss": 0.04880943149328232, "skip_count": 1.0, "step": 1784, "text_loss": 0.4404778480529785 @@ -16965,13 +16965,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.06201171875, "learning_rate": 0.0009609715468061411, - "loss": 0.0216, + "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2880627.0, "repeat_count": 0.0, - "routers_loss": 0.004942454397678375, + "routers_loss": 0.004678630735725164, "skip_count": 0.0, "step": 1786, "text_loss": 0.7295402884483337 @@ -16984,13 +16984,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08349609375, + "grad_norm": 0.07958984375, "learning_rate": 0.0009608515748955535, - "loss": 0.021, + "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2883333.0, "repeat_count": 0.0, - "routers_loss": 0.0020542226266115904, + "routers_loss": 0.0026695074047893286, "skip_count": 0.0, "step": 1788, "text_loss": 0.9697831273078918 @@ -17003,13 +17003,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.1171875, + "grad_norm": 0.107421875, "learning_rate": 0.000960731426385773, - "loss": 0.0155, + "loss": 0.0157, "macro_f1": 0.4871794879436493, "num_tokens": 2887444.0, "repeat_count": 0.0, - "routers_loss": 0.0397041030228138, + "routers_loss": 0.029743613675236702, "skip_count": 2.0, "step": 1790, "text_loss": 0.4737568199634552 @@ -17022,13 +17022,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.103515625, + "grad_norm": 0.10107421875, "learning_rate": 0.0009606111013228407, - "loss": 0.0204, + "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 2890221.0, "repeat_count": 0.0, - "routers_loss": 0.0017490010941401124, + "routers_loss": 0.0016153788892552257, "skip_count": 0.0, "step": 1792, "text_loss": 0.6693558096885681 @@ -17041,13 +17041,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08251953125, + "grad_norm": 0.08349609375, "learning_rate": 0.0009604905997528655, - "loss": 0.021, + "loss": 0.02, "macro_f1": 0.3272727429866791, "num_tokens": 2893262.0, "repeat_count": 0.0, - "routers_loss": 0.023590171709656715, + "routers_loss": 0.01965433731675148, "skip_count": 1.0, "step": 1794, "text_loss": 0.45227760076522827 @@ -17060,13 +17060,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.103515625, + "grad_norm": 0.08642578125, "learning_rate": 0.0009603699217220239, - "loss": 0.0125, + "loss": 0.0117, "macro_f1": 0.6601307392120361, "num_tokens": 2896823.0, "repeat_count": 1.0, - "routers_loss": 0.02458076737821102, + "routers_loss": 0.024017298594117165, "skip_count": 2.0, "step": 1796, "text_loss": 0.48865509033203125 @@ -17079,13 +17079,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.08837890625, "learning_rate": 0.0009602490672765597, - "loss": 0.019, + "loss": 0.0182, "macro_f1": 0.3333333432674408, "num_tokens": 2899707.0, "repeat_count": 0.0, - "routers_loss": 0.0014341498026624322, + "routers_loss": 0.0012420224957168102, "skip_count": 0.0, "step": 1798, "text_loss": 0.43292415142059326 @@ -17098,13 +17098,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08056640625, + "grad_norm": 0.07861328125, "learning_rate": 0.0009601280364627848, - "loss": 0.02, + "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 2902795.0, "repeat_count": 0.0, - "routers_loss": 0.00213223067112267, + "routers_loss": 0.0020389219280332327, "skip_count": 0.0, "step": 1800, "text_loss": 0.41021591424942017 @@ -17117,13 +17117,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07275390625, + "grad_norm": 0.06689453125, "learning_rate": 0.0009600068293270783, - "loss": 0.0147, + "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 2905769.0, "repeat_count": 0.0, - "routers_loss": 0.0027340995147824287, + "routers_loss": 0.002006303984671831, "skip_count": 0.0, "step": 1802, "text_loss": 0.46892106533050537 @@ -17136,32 +17136,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.08740234375, "learning_rate": 0.000959885445915887, - "loss": 0.0172, + "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 2909475.0, "repeat_count": 0.0, - "routers_loss": 0.0035587961319833994, + "routers_loss": 0.003734810510650277, "skip_count": 0.0, "step": 1804, "text_loss": 0.45364710688591003 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.5, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 8.479013795127678, - "f1_execute": 0.9615384340286255, - "f1_repeat": 0.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09765625, + "grad_norm": 0.11669921875, "learning_rate": 0.0009597638862757254, - "loss": 0.0187, - "macro_f1": 0.5427350401878357, + "loss": 0.0182, + "macro_f1": 0.8823530077934265, "num_tokens": 2914348.0, "repeat_count": 1.0, - "routers_loss": 0.04446055367588997, + "routers_loss": 0.038971323519945145, "skip_count": 2.0, "step": 1806, "text_loss": 0.42913779616355896 @@ -17174,13 +17174,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08447265625, + "grad_norm": 0.080078125, "learning_rate": 0.0009596421504531751, - "loss": 0.0244, + "loss": 0.0249, "macro_f1": 0.3272727429866791, "num_tokens": 2917467.0, "repeat_count": 1.0, - "routers_loss": 0.05095123499631882, + "routers_loss": 0.04800829663872719, "skip_count": 0.0, "step": 1808, "text_loss": 0.17332297563552856 @@ -17193,13 +17193,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.10693359375, + "grad_norm": 0.1083984375, "learning_rate": 0.0009595202384948858, - "loss": 0.0232, + "loss": 0.0227, "macro_f1": 0.6666666865348816, "num_tokens": 2920223.0, "repeat_count": 1.0, - "routers_loss": 0.008440068922936916, + "routers_loss": 0.009164143353700638, "skip_count": 0.0, "step": 1810, "text_loss": 0.33740702271461487 @@ -17212,13 +17212,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0927734375, + "grad_norm": 0.0947265625, "learning_rate": 0.0009593981504475742, - "loss": 0.0273, + "loss": 0.0275, "macro_f1": 0.6666666865348816, "num_tokens": 2923780.0, "repeat_count": 0.0, - "routers_loss": 0.012230116873979568, + "routers_loss": 0.011236993595957756, "skip_count": 2.0, "step": 1812, "text_loss": 0.1609916388988495 @@ -17231,13 +17231,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1005859375, + "grad_norm": 0.10595703125, "learning_rate": 0.0009592758863580248, - "loss": 0.026, + "loss": 0.0259, "macro_f1": 0.5492662787437439, "num_tokens": 2926259.0, "repeat_count": 0.0, - "routers_loss": 0.017307188361883163, + "routers_loss": 0.019026532769203186, "skip_count": 2.0, "step": 1814, "text_loss": 0.6460903882980347 @@ -17250,13 +17250,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.099609375, + "grad_norm": 0.09814453125, "learning_rate": 0.0009591534462730894, - "loss": 0.0215, + "loss": 0.0206, "macro_f1": 0.5492662787437439, "num_tokens": 2929173.0, "repeat_count": 2.0, - "routers_loss": 0.07191162556409836, + "routers_loss": 0.0608333982527256, "skip_count": 0.0, "step": 1816, "text_loss": 0.476126492023468 @@ -17269,13 +17269,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.06640625, "learning_rate": 0.000959030830239687, - "loss": 0.0182, + "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 2932703.0, "repeat_count": 0.0, - "routers_loss": 0.008753604255616665, + "routers_loss": 0.0093300249427557, "skip_count": 0.0, "step": 1818, "text_loss": 0.5471875667572021 @@ -17288,13 +17288,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19921875, + "grad_norm": 0.2001953125, "learning_rate": 0.0009589080383048048, - "loss": 0.0233, + "loss": 0.0235, "macro_f1": 0.3333333432674408, "num_tokens": 2936195.0, "repeat_count": 0.0, - "routers_loss": 0.008390828967094421, + "routers_loss": 0.010434109717607498, "skip_count": 0.0, "step": 1820, "text_loss": 0.5068115592002869 @@ -17307,13 +17307,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.0986328125, "learning_rate": 0.0009587850705154964, "loss": 0.0291, "macro_f1": 0.3333333432674408, "num_tokens": 2939412.0, "repeat_count": 0.0, - "routers_loss": 0.005617359187453985, + "routers_loss": 0.004347751382738352, "skip_count": 0.0, "step": 1822, "text_loss": 0.4241984784603119 @@ -17326,13 +17326,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.0859375, "learning_rate": 0.0009586619269188836, - "loss": 0.0227, + "loss": 0.0224, "macro_f1": 0.32098767161369324, "num_tokens": 2942318.0, "repeat_count": 0.0, - "routers_loss": 0.0346846878528595, + "routers_loss": 0.034238871186971664, "skip_count": 1.0, "step": 1824, "text_loss": 0.2328975349664688 @@ -17345,32 +17345,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.11181640625, "learning_rate": 0.0009585386075621553, "loss": 0.027, "macro_f1": 0.3333333432674408, "num_tokens": 2945731.0, "repeat_count": 0.0, - "routers_loss": 0.006601692643016577, + "routers_loss": 0.006097695790231228, "skip_count": 0.0, "step": 1826, "text_loss": 0.22816994786262512 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 8.582330496037569, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.08837890625, + "f1_skip": 0.0, + "grad_norm": 0.0908203125, "learning_rate": 0.0009584151124925676, - "loss": 0.0207, - "macro_f1": 0.6666666865348816, + "loss": 0.0208, + "macro_f1": 0.3272727429866791, "num_tokens": 2948944.0, "repeat_count": 0.0, - "routers_loss": 0.0065619745291769505, + "routers_loss": 0.007790776435285807, "skip_count": 1.0, "step": 1828, "text_loss": 0.5009413361549377 @@ -17383,13 +17383,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.07275390625, "learning_rate": 0.0009582914417574438, - "loss": 0.0149, + "loss": 0.0145, "macro_f1": 0.6666666865348816, "num_tokens": 2951723.0, "repeat_count": 0.0, - "routers_loss": 0.011109639890491962, + "routers_loss": 0.009144559502601624, "skip_count": 2.0, "step": 1830, "text_loss": 0.1402502954006195 @@ -17404,11 +17404,11 @@ "f1_skip": 0.0, "grad_norm": 0.06201171875, "learning_rate": 0.0009581675954041751, - "loss": 0.0167, + "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 2954726.0, "repeat_count": 1.0, - "routers_loss": 0.008432094007730484, + "routers_loss": 0.006593191530555487, "skip_count": 0.0, "step": 1832, "text_loss": 0.4871736466884613 @@ -17421,13 +17421,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0859375, + "grad_norm": 0.0869140625, "learning_rate": 0.0009580435734802196, - "loss": 0.0208, + "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 2957853.0, "repeat_count": 0.0, - "routers_loss": 0.011518111452460289, + "routers_loss": 0.01241068821400404, "skip_count": 0.0, "step": 1834, "text_loss": 0.30100154876708984 @@ -17440,13 +17440,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.1298828125, "learning_rate": 0.0009579193760331027, - "loss": 0.0211, + "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 2960783.0, "repeat_count": 0.0, - "routers_loss": 0.0026744187343865633, + "routers_loss": 0.002219218760728836, "skip_count": 0.0, "step": 1836, "text_loss": 0.4961516559123993 @@ -17459,13 +17459,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.12255859375, "learning_rate": 0.0009577950031104169, - "loss": 0.0165, + "loss": 0.0166, "macro_f1": 0.6601307392120361, "num_tokens": 2963328.0, "repeat_count": 1.0, - "routers_loss": 0.028107430785894394, + "routers_loss": 0.029363535344600677, "skip_count": 2.0, "step": 1838, "text_loss": 0.42814353108406067 @@ -17478,13 +17478,13 @@ "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.095703125, + "grad_norm": 0.1044921875, "learning_rate": 0.0009576704547598226, - "loss": 0.0263, + "loss": 0.0257, "macro_f1": 0.7795917987823486, "num_tokens": 2966108.0, "repeat_count": 1.0, - "routers_loss": 0.060007549822330475, + "routers_loss": 0.0579402856528759, "skip_count": 4.0, "step": 1840, "text_loss": 0.20523512363433838 @@ -17497,13 +17497,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.0625, "learning_rate": 0.0009575457310290463, "loss": 0.0121, "macro_f1": 0.3272727429866791, "num_tokens": 2969137.0, "repeat_count": 0.0, - "routers_loss": 0.01074182614684105, + "routers_loss": 0.008810589089989662, "skip_count": 0.0, "step": 1842, "text_loss": 0.6199528574943542 @@ -17516,13 +17516,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0732421875, + "grad_norm": 0.0693359375, "learning_rate": 0.0009574208319658831, - "loss": 0.0213, + "loss": 0.0208, "macro_f1": 0.6666666865348816, "num_tokens": 2972407.0, "repeat_count": 0.0, - "routers_loss": 0.0019638657104223967, + "routers_loss": 0.0012295129708945751, "skip_count": 1.0, "step": 1844, "text_loss": 0.66938316822052 @@ -17535,13 +17535,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.1572265625, + "grad_norm": 0.1474609375, "learning_rate": 0.000957295757618194, - "loss": 0.0156, + "loss": 0.0152, "macro_f1": 0.4871794879436493, "num_tokens": 2976045.0, "repeat_count": 0.0, - "routers_loss": 0.06953249871730804, + "routers_loss": 0.06162935495376587, "skip_count": 2.0, "step": 1846, "text_loss": 0.5381782650947571 @@ -17554,13 +17554,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.0830078125, "learning_rate": 0.0009571705080339079, - "loss": 0.0154, + "loss": 0.0144, "macro_f1": 0.3333333432674408, "num_tokens": 2979025.0, "repeat_count": 0.0, - "routers_loss": 0.003563052974641323, + "routers_loss": 0.003950524143874645, "skip_count": 0.0, "step": 1848, "text_loss": 0.5831671357154846 @@ -17573,13 +17573,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.11376953125, "learning_rate": 0.0009570450832610208, - "loss": 0.0216, + "loss": 0.0209, "macro_f1": 0.3333333432674408, "num_tokens": 2982276.0, "repeat_count": 0.0, - "routers_loss": 0.010409255512058735, + "routers_loss": 0.010354886762797832, "skip_count": 0.0, "step": 1850, "text_loss": 0.27448201179504395 @@ -17592,13 +17592,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0625, + "grad_norm": 0.061279296875, "learning_rate": 0.0009569194833475956, - "loss": 0.0195, + "loss": 0.0199, "macro_f1": 0.3272727429866791, "num_tokens": 2985691.0, "repeat_count": 0.0, - "routers_loss": 0.009769548662006855, + "routers_loss": 0.010167439468204975, "skip_count": 0.0, "step": 1852, "text_loss": 0.5264663696289062 @@ -17611,13 +17611,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1181640625, + "grad_norm": 0.1328125, "learning_rate": 0.0009567937083417624, - "loss": 0.0184, + "loss": 0.0194, "macro_f1": 0.3272727429866791, "num_tokens": 2989126.0, "repeat_count": 0.0, - "routers_loss": 0.036616452038288116, + "routers_loss": 0.0371871180832386, "skip_count": 1.0, "step": 1854, "text_loss": 0.2008018046617508 @@ -17630,13 +17630,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.0673828125, "learning_rate": 0.0009566677582917185, - "loss": 0.0192, + "loss": 0.0184, "macro_f1": 0.3333333432674408, "num_tokens": 2992814.0, "repeat_count": 0.0, - "routers_loss": 0.009581349790096283, + "routers_loss": 0.010190588422119617, "skip_count": 0.0, "step": 1856, "text_loss": 0.749717116355896 @@ -17649,13 +17649,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09814453125, + "grad_norm": 0.080078125, "learning_rate": 0.0009565416332457282, - "loss": 0.0138, + "loss": 0.0132, "macro_f1": 0.6538461446762085, "num_tokens": 2995729.0, "repeat_count": 1.0, - "routers_loss": 0.02330300398170948, + "routers_loss": 0.022285036742687225, "skip_count": 1.0, "step": 1858, "text_loss": 0.5870219469070435 @@ -17668,13 +17668,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0771484375, + "grad_norm": 0.07666015625, "learning_rate": 0.0009564153332521228, - "loss": 0.0226, + "loss": 0.0224, "macro_f1": 0.3272727429866791, "num_tokens": 2998812.0, "repeat_count": 0.0, - "routers_loss": 0.011985735036432743, + "routers_loss": 0.011050296947360039, "skip_count": 1.0, "step": 1860, "text_loss": 0.8444408774375916 @@ -17687,13 +17687,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0634765625, + "grad_norm": 0.06005859375, "learning_rate": 0.0009562888583593005, - "loss": 0.0162, + "loss": 0.0163, "macro_f1": 0.3333333432674408, "num_tokens": 3001799.0, "repeat_count": 0.0, - "routers_loss": 0.005997250322252512, + "routers_loss": 0.007125461008399725, "skip_count": 0.0, "step": 1862, "text_loss": 0.41510361433029175 @@ -17706,13 +17706,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.06884765625, "learning_rate": 0.0009561622086157272, - "loss": 0.0243, + "loss": 0.0236, "macro_f1": 0.3333333432674408, "num_tokens": 3005088.0, "repeat_count": 0.0, - "routers_loss": 0.004814761225134134, + "routers_loss": 0.0049054501578211784, "skip_count": 0.0, "step": 1864, "text_loss": 0.3801248073577881 @@ -17725,13 +17725,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.056884765625, + "grad_norm": 0.054443359375, "learning_rate": 0.000956035384069935, - "loss": 0.0242, + "loss": 0.0238, "macro_f1": 1.0, "num_tokens": 3008178.0, "repeat_count": 1.0, - "routers_loss": 0.004750931169837713, + "routers_loss": 0.005162427201867104, "skip_count": 1.0, "step": 1866, "text_loss": 0.2687684893608093 @@ -17744,13 +17744,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1123046875, + "grad_norm": 0.10400390625, "learning_rate": 0.0009559083847705233, - "loss": 0.0216, + "loss": 0.0214, "macro_f1": 0.3272727429866791, "num_tokens": 3010923.0, "repeat_count": 0.0, - "routers_loss": 0.038251202553510666, + "routers_loss": 0.028984658420085907, "skip_count": 1.0, "step": 1868, "text_loss": 0.6277349591255188 @@ -17763,13 +17763,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.06640625, + "grad_norm": 0.08349609375, "learning_rate": 0.0009557812107661584, - "loss": 0.0204, + "loss": 0.0208, "macro_f1": 1.0, "num_tokens": 3015030.0, "repeat_count": 1.0, - "routers_loss": 0.010951942764222622, + "routers_loss": 0.012200530618429184, "skip_count": 1.0, "step": 1870, "text_loss": 0.6293368339538574 @@ -17782,13 +17782,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.11962890625, "learning_rate": 0.0009556538621055739, - "loss": 0.0265, + "loss": 0.0268, "macro_f1": 0.3272727429866791, "num_tokens": 3019067.0, "repeat_count": 0.0, - "routers_loss": 0.06582094728946686, + "routers_loss": 0.06365182995796204, "skip_count": 1.0, "step": 1872, "text_loss": 0.39046618342399597 @@ -17796,18 +17796,18 @@ { "acc_repeat": 0.0, "acc_skip": 1.0, - "avg_layers": 26.0, + "avg_layers": 27.0, "epoch": 8.798356325212797, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.12353515625, + "f1_skip": 1.0, + "grad_norm": 0.115234375, "learning_rate": 0.0009555263388375699, - "loss": 0.0143, - "macro_f1": 0.5492662787437439, + "loss": 0.014, + "macro_f1": 0.6666666865348816, "num_tokens": 3022166.0, "repeat_count": 0.0, - "routers_loss": 0.008920271880924702, + "routers_loss": 0.0041703456081449986, "skip_count": 1.0, "step": 1874, "text_loss": 0.42232340574264526 @@ -17820,13 +17820,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1220703125, + "grad_norm": 0.11572265625, "learning_rate": 0.0009553986410110134, "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3025865.0, "repeat_count": 0.0, - "routers_loss": 0.006444344762712717, + "routers_loss": 0.005841755773872137, "skip_count": 0.0, "step": 1876, "text_loss": 0.37600573897361755 @@ -17839,13 +17839,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.09228515625, "learning_rate": 0.0009552707686748388, - "loss": 0.022, + "loss": 0.0219, "macro_f1": 0.3272727429866791, "num_tokens": 3029950.0, "repeat_count": 0.0, - "routers_loss": 0.05197767913341522, + "routers_loss": 0.05165952071547508, "skip_count": 1.0, "step": 1878, "text_loss": 0.33717799186706543 @@ -17858,13 +17858,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.0849609375, "learning_rate": 0.0009551427218780467, - "loss": 0.0224, + "loss": 0.0219, "macro_f1": 0.6666666865348816, "num_tokens": 3033649.0, "repeat_count": 0.0, - "routers_loss": 0.017570581287145615, + "routers_loss": 0.020680008456110954, "skip_count": 2.0, "step": 1880, "text_loss": 0.5011783838272095 @@ -17877,13 +17877,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.173828125, + "grad_norm": 0.15625, "learning_rate": 0.0009550145006697048, - "loss": 0.0225, + "loss": 0.0217, "macro_f1": 0.32098764181137085, "num_tokens": 3036847.0, "repeat_count": 0.0, - "routers_loss": 0.07106777280569077, + "routers_loss": 0.07626450061798096, "skip_count": 2.0, "step": 1882, "text_loss": 0.3066408336162567 @@ -17896,13 +17896,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.056396484375, "learning_rate": 0.0009548861050989482, - "loss": 0.0139, + "loss": 0.0136, "macro_f1": 1.0, "num_tokens": 3040353.0, "repeat_count": 1.0, - "routers_loss": 0.009862381964921951, + "routers_loss": 0.010884666815400124, "skip_count": 1.0, "step": 1884, "text_loss": 0.49779415130615234 @@ -17915,13 +17915,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0908203125, "learning_rate": 0.0009547575352149778, - "loss": 0.0209, + "loss": 0.0213, "macro_f1": 0.6666666865348816, "num_tokens": 3043504.0, "repeat_count": 0.0, - "routers_loss": 0.006928981747478247, + "routers_loss": 0.006704333238303661, "skip_count": 2.0, "step": 1886, "text_loss": 0.12284614145755768 @@ -17934,13 +17934,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09423828125, + "grad_norm": 0.11474609375, "learning_rate": 0.0009546287910670621, "loss": 0.0211, "macro_f1": 0.5427350401878357, "num_tokens": 3046422.0, "repeat_count": 1.0, - "routers_loss": 0.04788029566407204, + "routers_loss": 0.04799000173807144, "skip_count": 2.0, "step": 1888, "text_loss": 0.1824081838130951 @@ -17953,13 +17953,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1357421875, + "grad_norm": 0.1484375, "learning_rate": 0.0009544998727045361, - "loss": 0.0299, + "loss": 0.0306, "macro_f1": 0.3333333432674408, "num_tokens": 3049819.0, "repeat_count": 0.0, - "routers_loss": 0.008282946422696114, + "routers_loss": 0.008139612153172493, "skip_count": 0.0, "step": 1890, "text_loss": 0.18929053843021393 @@ -17972,32 +17972,32 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.09716796875, + "grad_norm": 0.09375, "learning_rate": 0.0009543707801768015, - "loss": 0.0181, + "loss": 0.0175, "macro_f1": 0.5934640765190125, "num_tokens": 3052766.0, "repeat_count": 0.0, - "routers_loss": 0.03251546248793602, + "routers_loss": 0.02966771461069584, "skip_count": 3.0, "step": 1892, "text_loss": 0.247748002409935 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 24.0, + "acc_skip": 0.5, + "avg_layers": 25.0, "epoch": 8.892280598767243, - "f1_execute": 0.9600000381469727, + "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.06640625, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.06689453125, "learning_rate": 0.0009542415135333267, - "loss": 0.0195, - "macro_f1": 0.542222261428833, + "loss": 0.0193, + "macro_f1": 0.44705885648727417, "num_tokens": 3056427.0, "repeat_count": 0.0, - "routers_loss": 0.03368280455470085, + "routers_loss": 0.03637036308646202, "skip_count": 2.0, "step": 1894, "text_loss": 0.2583999037742615 @@ -18010,13 +18010,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06640625, + "grad_norm": 0.0595703125, "learning_rate": 0.0009541120728236472, - "loss": 0.0133, + "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 3059497.0, "repeat_count": 0.0, - "routers_loss": 0.0069940583780407906, + "routers_loss": 0.007026574574410915, "skip_count": 0.0, "step": 1896, "text_loss": 0.5222375988960266 @@ -18029,13 +18029,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0810546875, + "grad_norm": 0.076171875, "learning_rate": 0.0009539824580973646, - "loss": 0.0221, + "loss": 0.0219, "macro_f1": 0.3333333432674408, "num_tokens": 3062187.0, "repeat_count": 0.0, - "routers_loss": 0.004268508404493332, + "routers_loss": 0.003449335927143693, "skip_count": 0.0, "step": 1898, "text_loss": 0.5736427307128906 @@ -18048,13 +18048,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.05224609375, "learning_rate": 0.0009538526694041477, - "loss": 0.0159, + "loss": 0.0163, "macro_f1": 0.3333333432674408, "num_tokens": 3066100.0, "repeat_count": 0.0, - "routers_loss": 0.0032616283278912306, + "routers_loss": 0.0035463871899992228, "skip_count": 0.0, "step": 1900, "text_loss": 0.5471583604812622 @@ -18067,13 +18067,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.08056640625, + "grad_norm": 0.080078125, "learning_rate": 0.0009537227067937318, - "loss": 0.023, + "loss": 0.0233, "macro_f1": 1.0, "num_tokens": 3068737.0, "repeat_count": 3.0, - "routers_loss": 0.005389219615608454, + "routers_loss": 0.00597514258697629, "skip_count": 3.0, "step": 1902, "text_loss": 0.36644190549850464 @@ -18086,13 +18086,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.166015625, "learning_rate": 0.0009535925703159186, - "loss": 0.0311, + "loss": 0.0301, "macro_f1": 0.32098764181137085, "num_tokens": 3071686.0, "repeat_count": 0.0, - "routers_loss": 0.024814991280436516, + "routers_loss": 0.025420479476451874, "skip_count": 2.0, "step": 1904, "text_loss": 0.535789966583252 @@ -18107,11 +18107,11 @@ "f1_skip": 0.0, "grad_norm": 0.07568359375, "learning_rate": 0.0009534622600205769, - "loss": 0.0151, + "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 3074954.0, "repeat_count": 0.0, - "routers_loss": 0.013415839523077011, + "routers_loss": 0.014377486892044544, "skip_count": 0.0, "step": 1906, "text_loss": 0.19009549915790558 @@ -18124,13 +18124,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.11083984375, "learning_rate": 0.0009533317759576416, - "loss": 0.019, + "loss": 0.0197, "macro_f1": 0.3333333432674408, "num_tokens": 3077540.0, "repeat_count": 0.0, - "routers_loss": 0.005814475007355213, + "routers_loss": 0.004848944488912821, "skip_count": 0.0, "step": 1908, "text_loss": 0.5022001266479492 @@ -18143,13 +18143,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0732421875, + "grad_norm": 0.07470703125, "learning_rate": 0.0009532011181771148, - "loss": 0.0218, + "loss": 0.0217, "macro_f1": 0.6666666865348816, "num_tokens": 3080445.0, "repeat_count": 0.0, - "routers_loss": 0.007621586322784424, + "routers_loss": 0.009480170905590057, "skip_count": 2.0, "step": 1910, "text_loss": 0.35135936737060547 @@ -18162,13 +18162,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.10400390625, "learning_rate": 0.0009530702867290644, - "loss": 0.0178, + "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 3083657.0, "repeat_count": 0.0, - "routers_loss": 0.0020917020738124847, + "routers_loss": 0.0019353039097040892, "skip_count": 0.0, "step": 1912, "text_loss": 0.5123994946479797 @@ -18181,13 +18181,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.123046875, + "grad_norm": 0.1455078125, "learning_rate": 0.0009529392816636256, - "loss": 0.025, + "loss": 0.0249, "macro_f1": 0.3333333432674408, "num_tokens": 3086837.0, "repeat_count": 0.0, - "routers_loss": 0.0010824954370036721, + "routers_loss": 0.0010921972570940852, "skip_count": 0.0, "step": 1914, "text_loss": 0.44477662444114685 @@ -18200,13 +18200,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.19140625, "learning_rate": 0.0009528081030309995, - "loss": 0.0353, + "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 3089892.0, "repeat_count": 0.0, - "routers_loss": 0.0018075350672006607, + "routers_loss": 0.0018027103506028652, "skip_count": 0.0, "step": 1916, "text_loss": 0.7356183528900146 @@ -18219,13 +18219,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07958984375, + "grad_norm": 0.07568359375, "learning_rate": 0.0009526767508814542, - "loss": 0.0235, + "loss": 0.0236, "macro_f1": 0.3333333432674408, "num_tokens": 3093058.0, "repeat_count": 0.0, - "routers_loss": 0.0032930250745266676, + "routers_loss": 0.003243023296818137, "skip_count": 0.0, "step": 1918, "text_loss": 0.48823556303977966 @@ -18238,13 +18238,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08837890625, + "grad_norm": 0.080078125, "learning_rate": 0.0009525452252653239, - "loss": 0.0184, + "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 3096404.0, "repeat_count": 0.0, - "routers_loss": 0.009042349644005299, + "routers_loss": 0.009360014460980892, "skip_count": 0.0, "step": 1920, "text_loss": 0.21498437225818634 @@ -18257,13 +18257,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.1103515625, + "grad_norm": 0.140625, "learning_rate": 0.0009524135262330098, - "loss": 0.022, + "loss": 0.0224, "macro_f1": 0.9265305995941162, "num_tokens": 3099520.0, "repeat_count": 1.0, - "routers_loss": 0.016776500269770622, + "routers_loss": 0.017444295808672905, "skip_count": 3.0, "step": 1922, "text_loss": 0.27608850598335266 @@ -18276,13 +18276,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05029296875, + "grad_norm": 0.050537109375, "learning_rate": 0.0009522816538349789, - "loss": 0.016, + "loss": 0.0162, "macro_f1": 0.5492662787437439, "num_tokens": 3102956.0, "repeat_count": 0.0, - "routers_loss": 0.06579705327749252, + "routers_loss": 0.06424452364444733, "skip_count": 2.0, "step": 1924, "text_loss": 0.21558666229248047 @@ -18295,13 +18295,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.058349609375, + "grad_norm": 0.05224609375, "learning_rate": 0.0009521496081217651, - "loss": 0.0113, + "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 3106565.0, "repeat_count": 1.0, - "routers_loss": 0.0022786022163927555, + "routers_loss": 0.002270506462082267, "skip_count": 0.0, "step": 1926, "text_loss": 0.5641813278198242 @@ -18314,13 +18314,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.09033203125, + "grad_norm": 0.095703125, "learning_rate": 0.0009520173891439684, "loss": 0.0216, "macro_f1": 0.6666666865348816, "num_tokens": 3109314.0, "repeat_count": 0.0, - "routers_loss": 0.01074281521141529, + "routers_loss": 0.011512448079884052, "skip_count": 1.0, "step": 1928, "text_loss": 0.6351624727249146 @@ -18333,13 +18333,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.0830078125, "learning_rate": 0.0009518849969522556, - "loss": 0.0201, + "loss": 0.0198, "macro_f1": 0.3333333432674408, "num_tokens": 3112956.0, "repeat_count": 0.0, - "routers_loss": 0.0032052614260464907, + "routers_loss": 0.003883908037096262, "skip_count": 0.0, "step": 1930, "text_loss": 0.35160085558891296 @@ -18352,32 +18352,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.10888671875, "learning_rate": 0.0009517524315973595, - "loss": 0.0186, + "loss": 0.019, "macro_f1": 1.0, "num_tokens": 3115593.0, "repeat_count": 1.0, - "routers_loss": 0.008593574166297913, + "routers_loss": 0.009479222819209099, "skip_count": 3.0, "step": 1932, "text_loss": 0.2900560200214386 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 9.079835632521279, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.07373046875, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, "learning_rate": 0.0009516196931300794, - "loss": 0.0152, - "macro_f1": 0.5492662787437439, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, "num_tokens": 3118516.0, "repeat_count": 0.0, - "routers_loss": 0.0201246440410614, + "routers_loss": 0.017834696918725967, "skip_count": 2.0, "step": 1934, "text_loss": 0.20094378292560577 @@ -18390,13 +18390,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1357421875, + "grad_norm": 0.12890625, "learning_rate": 0.0009514867816012809, - "loss": 0.0199, + "loss": 0.02, "macro_f1": 0.3333333432674408, "num_tokens": 3122242.0, "repeat_count": 0.0, - "routers_loss": 0.001721356064081192, + "routers_loss": 0.0017964740982279181, "skip_count": 0.0, "step": 1936, "text_loss": 0.6498590707778931 @@ -18409,13 +18409,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.049072265625, + "grad_norm": 0.048828125, "learning_rate": 0.0009513536970618961, - "loss": 0.0135, + "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 3125645.0, "repeat_count": 0.0, - "routers_loss": 0.010442634113132954, + "routers_loss": 0.007437168620526791, "skip_count": 2.0, "step": 1938, "text_loss": 0.25863033533096313 @@ -18428,13 +18428,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.058349609375, + "grad_norm": 0.0625, "learning_rate": 0.0009512204395629232, - "loss": 0.019, + "loss": 0.0184, "macro_f1": 0.6666666865348816, "num_tokens": 3128740.0, "repeat_count": 0.0, - "routers_loss": 0.0009493798715993762, + "routers_loss": 0.0008759932243265212, "skip_count": 1.0, "step": 1940, "text_loss": 0.5638351440429688 @@ -18447,13 +18447,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05517578125, + "grad_norm": 0.06884765625, "learning_rate": 0.0009510870091554264, - "loss": 0.0149, + "loss": 0.0153, "macro_f1": 0.3272727429866791, "num_tokens": 3131742.0, "repeat_count": 1.0, - "routers_loss": 0.022104881703853607, + "routers_loss": 0.019906625151634216, "skip_count": 0.0, "step": 1942, "text_loss": 0.8410717844963074 @@ -18466,13 +18466,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.12255859375, "learning_rate": 0.0009509534058905369, - "loss": 0.0164, + "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3134407.0, "repeat_count": 0.0, - "routers_loss": 0.0009013625676743686, + "routers_loss": 0.0009229081333614886, "skip_count": 0.0, "step": 1944, "text_loss": 0.47506049275398254 @@ -18485,13 +18485,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06103515625, + "grad_norm": 0.0576171875, "learning_rate": 0.0009508196298194517, - "loss": 0.0121, + "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3137053.0, "repeat_count": 0.0, - "routers_loss": 0.0028069843538105488, + "routers_loss": 0.003630586201325059, "skip_count": 0.0, "step": 1946, "text_loss": 0.32225799560546875 @@ -18504,13 +18504,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.08349609375, "learning_rate": 0.0009506856809934338, - "loss": 0.0116, + "loss": 0.0119, "macro_f1": 0.3333333432674408, "num_tokens": 3140943.0, "repeat_count": 0.0, - "routers_loss": 0.006877045147120953, + "routers_loss": 0.007580445148050785, "skip_count": 0.0, "step": 1948, "text_loss": 0.3120577931404114 @@ -18523,13 +18523,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.050048828125, "learning_rate": 0.0009505515594638127, - "loss": 0.0127, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 3144298.0, "repeat_count": 0.0, - "routers_loss": 0.004543667659163475, + "routers_loss": 0.004471861757338047, "skip_count": 0.0, "step": 1950, "text_loss": 0.22052447497844696 @@ -18542,13 +18542,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.09130859375, "learning_rate": 0.0009504172652819843, - "loss": 0.0232, + "loss": 0.023, "macro_f1": 1.0, "num_tokens": 3147069.0, "repeat_count": 1.0, - "routers_loss": 0.007053609937429428, + "routers_loss": 0.009606664068996906, "skip_count": 1.0, "step": 1952, "text_loss": 0.34773921966552734 @@ -18561,13 +18561,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0537109375, + "grad_norm": 0.0625, "learning_rate": 0.0009502827984994099, - "loss": 0.0146, + "loss": 0.0148, "macro_f1": 0.6666666865348816, "num_tokens": 3149992.0, "repeat_count": 0.0, - "routers_loss": 0.006783280987292528, + "routers_loss": 0.006443799939006567, "skip_count": 1.0, "step": 1954, "text_loss": 0.6442171335220337 @@ -18580,13 +18580,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06640625, + "grad_norm": 0.0673828125, "learning_rate": 0.0009501481591676177, - "loss": 0.0181, + "loss": 0.0188, "macro_f1": 0.3333333432674408, "num_tokens": 3153167.0, "repeat_count": 0.0, - "routers_loss": 0.002531677018851042, + "routers_loss": 0.003219039412215352, "skip_count": 0.0, "step": 1956, "text_loss": 0.43369221687316895 @@ -18599,32 +18599,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.078125, + "grad_norm": 0.07470703125, "learning_rate": 0.000950013347338202, - "loss": 0.0154, + "loss": 0.0152, "macro_f1": 0.3272727429866791, "num_tokens": 3156590.0, "repeat_count": 0.0, - "routers_loss": 0.027040868997573853, + "routers_loss": 0.025551019236445427, "skip_count": 1.0, "step": 1958, "text_loss": 0.294479101896286 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 1.0, - "avg_layers": 26.0, + "avg_layers": 27.0, "epoch": 9.201937188142061, - "f1_execute": 0.9803921580314636, - "f1_repeat": 0.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.1142578125, + "grad_norm": 0.1630859375, "learning_rate": 0.0009498783630628225, - "loss": 0.0154, - "macro_f1": 0.6601307392120361, + "loss": 0.0158, + "macro_f1": 1.0, "num_tokens": 3159451.0, "repeat_count": 1.0, - "routers_loss": 0.01573321223258972, + "routers_loss": 0.013802438974380493, "skip_count": 2.0, "step": 1960, "text_loss": 0.20888492465019226 @@ -18637,13 +18637,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06689453125, + "grad_norm": 0.07666015625, "learning_rate": 0.0009497432063932057, - "loss": 0.0135, + "loss": 0.0137, "macro_f1": 0.6601307392120361, "num_tokens": 3162889.0, "repeat_count": 1.0, - "routers_loss": 0.02442278526723385, + "routers_loss": 0.02852988988161087, "skip_count": 2.0, "step": 1962, "text_loss": 0.5027125477790833 @@ -18656,13 +18656,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.046630859375, + "grad_norm": 0.045166015625, "learning_rate": 0.0009496078773811437, - "loss": 0.0142, + "loss": 0.0136, "macro_f1": 0.6666666865348816, "num_tokens": 3165979.0, "repeat_count": 0.0, - "routers_loss": 0.018267054110765457, + "routers_loss": 0.01784522272646427, "skip_count": 2.0, "step": 1964, "text_loss": 0.1696339100599289 @@ -18675,13 +18675,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0625, + "grad_norm": 0.060302734375, "learning_rate": 0.000949472376078495, - "loss": 0.0162, + "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3168683.0, "repeat_count": 0.0, - "routers_loss": 0.0016024474753066897, + "routers_loss": 0.0017019887454807758, "skip_count": 0.0, "step": 1966, "text_loss": 0.48905447125434875 @@ -18694,13 +18694,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.052978515625, + "grad_norm": 0.051025390625, "learning_rate": 0.000949336702537184, - "loss": 0.011, + "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 3171968.0, "repeat_count": 0.0, - "routers_loss": 0.004668849054723978, + "routers_loss": 0.004817947279661894, "skip_count": 2.0, "step": 1968, "text_loss": 0.20984773337841034 @@ -18713,13 +18713,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.05419921875, "learning_rate": 0.0009492008568092007, - "loss": 0.0098, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 3175947.0, "repeat_count": 0.0, - "routers_loss": 0.0011657609138637781, + "routers_loss": 0.0012963006738573313, "skip_count": 0.0, "step": 1970, "text_loss": 0.5215106010437012 @@ -18732,13 +18732,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.04248046875, + "grad_norm": 0.044921875, "learning_rate": 0.0009490648389466019, - "loss": 0.0133, + "loss": 0.0135, "macro_f1": 0.4871794879436493, "num_tokens": 3179348.0, "repeat_count": 0.0, - "routers_loss": 0.03806794434785843, + "routers_loss": 0.03950481489300728, "skip_count": 2.0, "step": 1972, "text_loss": 0.24640929698944092 @@ -18751,13 +18751,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08837890625, + "grad_norm": 0.09326171875, "learning_rate": 0.0009489286490015097, - "loss": 0.0189, + "loss": 0.0183, "macro_f1": 0.6666666865348816, "num_tokens": 3182640.0, "repeat_count": 0.0, - "routers_loss": 0.005107097327709198, + "routers_loss": 0.0043345349840819836, "skip_count": 2.0, "step": 1974, "text_loss": 0.6362852454185486 @@ -18770,13 +18770,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.078125, + "grad_norm": 0.07958984375, "learning_rate": 0.0009487922870261122, - "loss": 0.0156, + "loss": 0.0155, "macro_f1": 0.3333333432674408, "num_tokens": 3185657.0, "repeat_count": 0.0, - "routers_loss": 0.0013696947135031223, + "routers_loss": 0.0015687479171901941, "skip_count": 0.0, "step": 1976, "text_loss": 0.8977144360542297 @@ -18789,13 +18789,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0634765625, + "grad_norm": 0.061279296875, "learning_rate": 0.0009486557530726638, - "loss": 0.0136, + "loss": 0.0139, "macro_f1": 0.3333333432674408, "num_tokens": 3188772.0, "repeat_count": 0.0, - "routers_loss": 0.0012224154779687524, + "routers_loss": 0.0010977238416671753, "skip_count": 0.0, "step": 1978, "text_loss": 0.38512736558914185 @@ -18808,13 +18808,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09423828125, + "grad_norm": 0.11279296875, "learning_rate": 0.0009485190471934844, "loss": 0.0196, "macro_f1": 0.6666666865348816, "num_tokens": 3193131.0, "repeat_count": 2.0, - "routers_loss": 0.0030119111761450768, + "routers_loss": 0.002264744369313121, "skip_count": 0.0, "step": 1980, "text_loss": 0.4171289801597595 @@ -18827,13 +18827,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.09033203125, "learning_rate": 0.00094838216944096, - "loss": 0.0222, + "loss": 0.0219, "macro_f1": 0.3272727429866791, "num_tokens": 3196668.0, "repeat_count": 0.0, - "routers_loss": 0.04286033287644386, + "routers_loss": 0.042320676147937775, "skip_count": 1.0, "step": 1982, "text_loss": 0.19008000195026398 @@ -18846,32 +18846,32 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.053466796875, + "grad_norm": 0.052490234375, "learning_rate": 0.0009482451198675424, - "loss": 0.0158, + "loss": 0.0151, "macro_f1": 0.32098767161369324, "num_tokens": 3200282.0, "repeat_count": 0.0, - "routers_loss": 0.019988590851426125, + "routers_loss": 0.01796630397439003, "skip_count": 1.0, "step": 1984, "text_loss": 0.5009249448776245 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 9.324038743762841, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.0634765625, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, "learning_rate": 0.0009481078985257494, - "loss": 0.0154, - "macro_f1": 0.3272727429866791, + "loss": 0.0147, + "macro_f1": 0.6666666865348816, "num_tokens": 3204439.0, "repeat_count": 0.0, - "routers_loss": 0.012215938419103622, + "routers_loss": 0.01052347756922245, "skip_count": 1.0, "step": 1986, "text_loss": 0.15319275856018066 @@ -18884,13 +18884,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07861328125, + "grad_norm": 0.0732421875, "learning_rate": 0.0009479705054681644, - "loss": 0.0149, + "loss": 0.015, "macro_f1": 0.3076923191547394, "num_tokens": 3207590.0, "repeat_count": 1.0, - "routers_loss": 0.10747655481100082, + "routers_loss": 0.09640293568372726, "skip_count": 3.0, "step": 1988, "text_loss": 0.3654652535915375 @@ -18903,13 +18903,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.068359375, + "grad_norm": 0.06689453125, "learning_rate": 0.0009478329407474366, - "loss": 0.0186, + "loss": 0.0183, "macro_f1": 0.5492662787437439, "num_tokens": 3211172.0, "repeat_count": 0.0, - "routers_loss": 0.016109853982925415, + "routers_loss": 0.012670112773776054, "skip_count": 1.0, "step": 1990, "text_loss": 0.5817596316337585 @@ -18922,13 +18922,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.05859375, "learning_rate": 0.000947695204416281, - "loss": 0.0116, + "loss": 0.0121, "macro_f1": 0.6666666865348816, "num_tokens": 3214050.0, "repeat_count": 1.0, - "routers_loss": 0.006929324474185705, + "routers_loss": 0.005263707600533962, "skip_count": 0.0, "step": 1992, "text_loss": 0.5985888242721558 @@ -18941,13 +18941,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.0634765625, "learning_rate": 0.0009475572965274787, - "loss": 0.0147, + "loss": 0.0144, "macro_f1": 0.3272727429866791, "num_tokens": 3217318.0, "repeat_count": 1.0, - "routers_loss": 0.0715102106332779, + "routers_loss": 0.0682850033044815, "skip_count": 0.0, "step": 1994, "text_loss": 0.316506564617157 @@ -18960,13 +18960,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.052490234375, + "grad_norm": 0.0595703125, "learning_rate": 0.000947419217133876, - "loss": 0.0187, + "loss": 0.019, "macro_f1": 0.6666666865348816, "num_tokens": 3220012.0, "repeat_count": 0.0, - "routers_loss": 0.008499355986714363, + "routers_loss": 0.008508823812007904, "skip_count": 2.0, "step": 1996, "text_loss": 0.09665893763303757 @@ -18979,13 +18979,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.048583984375, + "grad_norm": 0.053466796875, "learning_rate": 0.0009472809662883852, - "loss": 0.0162, + "loss": 0.0155, "macro_f1": 1.0, "num_tokens": 3223019.0, "repeat_count": 1.0, - "routers_loss": 0.012003371492028236, + "routers_loss": 0.01100847590714693, "skip_count": 2.0, "step": 1998, "text_loss": 0.4938808083534241 @@ -18998,13 +18998,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.0625, + "grad_norm": 0.06396484375, "learning_rate": 0.0009471425440439844, - "loss": 0.0137, + "loss": 0.0135, "macro_f1": 0.8817967176437378, "num_tokens": 3226013.0, "repeat_count": 2.0, - "routers_loss": 0.0529167577624321, + "routers_loss": 0.04953207075595856, "skip_count": 3.0, "step": 2000, "text_loss": 0.22258254885673523 @@ -19017,13 +19017,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.076171875, + "grad_norm": 0.07568359375, "learning_rate": 0.0009470039504537173, - "loss": 0.0185, + "loss": 0.0186, "macro_f1": 0.31446540355682373, "num_tokens": 3230031.0, "repeat_count": 0.0, - "routers_loss": 0.05719539523124695, + "routers_loss": 0.052884332835674286, "skip_count": 2.0, "step": 2002, "text_loss": 0.1741616576910019 @@ -19038,11 +19038,11 @@ "f1_skip": 0.0, "grad_norm": 0.0869140625, "learning_rate": 0.0009468651855706931, - "loss": 0.0205, + "loss": 0.0204, "macro_f1": 0.6666666865348816, "num_tokens": 3232991.0, "repeat_count": 1.0, - "routers_loss": 0.007613501511514187, + "routers_loss": 0.008056716993451118, "skip_count": 0.0, "step": 2004, "text_loss": 0.3173636198043823 @@ -19055,13 +19055,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.0654296875, "learning_rate": 0.0009467262494480868, - "loss": 0.014, + "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 3236390.0, "repeat_count": 0.0, - "routers_loss": 0.005654903594404459, + "routers_loss": 0.0053409393876791, "skip_count": 0.0, "step": 2006, "text_loss": 0.5806330442428589 @@ -19074,13 +19074,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07958984375, + "grad_norm": 0.068359375, "learning_rate": 0.000946587142139139, - "loss": 0.0152, + "loss": 0.0147, "macro_f1": 0.3333333432674408, "num_tokens": 3239267.0, "repeat_count": 0.0, - "routers_loss": 0.001680699409916997, + "routers_loss": 0.0015652200672775507, "skip_count": 0.0, "step": 2008, "text_loss": 0.6214317679405212 @@ -19093,13 +19093,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.11376953125, "learning_rate": 0.000946447863697156, - "loss": 0.0171, + "loss": 0.0151, "macro_f1": 0.6601307392120361, "num_tokens": 3242569.0, "repeat_count": 1.0, - "routers_loss": 0.014179535210132599, + "routers_loss": 0.011673987843096256, "skip_count": 2.0, "step": 2010, "text_loss": 0.532565712928772 @@ -19112,13 +19112,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.04345703125, "learning_rate": 0.0009463084141755093, - "loss": 0.0157, + "loss": 0.0159, "macro_f1": 0.3272727429866791, "num_tokens": 3245669.0, "repeat_count": 0.0, - "routers_loss": 0.026209332048892975, + "routers_loss": 0.028480790555477142, "skip_count": 1.0, "step": 2012, "text_loss": 0.25210800766944885 @@ -19131,13 +19131,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08349609375, + "grad_norm": 0.0869140625, "learning_rate": 0.0009461687936276364, - "loss": 0.0134, + "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3248751.0, "repeat_count": 0.0, - "routers_loss": 0.008315940387547016, + "routers_loss": 0.007234727032482624, "skip_count": 0.0, "step": 2014, "text_loss": 0.35922971367836 @@ -19150,13 +19150,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.06689453125, + "grad_norm": 0.068359375, "learning_rate": 0.0009460290021070402, - "loss": 0.0197, + "loss": 0.0195, "macro_f1": 0.6666666865348816, "num_tokens": 3252614.0, "repeat_count": 1.0, - "routers_loss": 0.01872348040342331, + "routers_loss": 0.014691276475787163, "skip_count": 0.0, "step": 2016, "text_loss": 0.2747853398323059 @@ -19169,13 +19169,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05126953125, + "grad_norm": 0.051513671875, "learning_rate": 0.0009458890396672888, "loss": 0.0186, "macro_f1": 0.3333333432674408, "num_tokens": 3256374.0, "repeat_count": 0.0, - "routers_loss": 0.0024314222391694784, + "routers_loss": 0.002385235857218504, "skip_count": 0.0, "step": 2018, "text_loss": 0.5268719792366028 @@ -19188,13 +19188,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.052978515625, + "grad_norm": 0.04443359375, "learning_rate": 0.0009457489063620164, - "loss": 0.0137, + "loss": 0.0133, "macro_f1": 0.8823530077934265, "num_tokens": 3259792.0, "repeat_count": 1.0, - "routers_loss": 0.04815426841378212, + "routers_loss": 0.047268565744161606, "skip_count": 2.0, "step": 2020, "text_loss": 0.7785539627075195 @@ -19207,13 +19207,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.13671875, + "grad_norm": 0.1494140625, "learning_rate": 0.0009456086022449221, - "loss": 0.0209, + "loss": 0.0218, "macro_f1": 0.3272727429866791, "num_tokens": 3262833.0, "repeat_count": 0.0, - "routers_loss": 0.015121756121516228, + "routers_loss": 0.015878718346357346, "skip_count": 1.0, "step": 2022, "text_loss": 0.42270028591156006 @@ -19226,32 +19226,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10546875, + "grad_norm": 0.08935546875, "learning_rate": 0.0009454681273697711, - "loss": 0.0122, + "loss": 0.0117, "macro_f1": 0.3272727429866791, "num_tokens": 3265718.0, "repeat_count": 1.0, - "routers_loss": 0.030219297856092453, + "routers_loss": 0.030749641358852386, "skip_count": 0.0, "step": 2024, "text_loss": 0.18668225407600403 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 9.511887290871735, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.05419921875, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, "learning_rate": 0.0009453274817903931, - "loss": 0.0132, - "macro_f1": 0.3272727429866791, + "loss": 0.012, + "macro_f1": 0.6666666865348816, "num_tokens": 3268158.0, "repeat_count": 0.0, - "routers_loss": 0.013256299309432507, + "routers_loss": 0.011538166552782059, "skip_count": 1.0, "step": 2026, "text_loss": 0.34090787172317505 @@ -19264,13 +19264,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11572265625, + "grad_norm": 0.099609375, "learning_rate": 0.000945186665560684, - "loss": 0.0232, + "loss": 0.0218, "macro_f1": 0.3333333432674408, "num_tokens": 3271082.0, "repeat_count": 0.0, - "routers_loss": 0.009389489889144897, + "routers_loss": 0.009527760557830334, "skip_count": 0.0, "step": 2028, "text_loss": 0.2110334187746048 @@ -19283,13 +19283,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1142578125, + "grad_norm": 0.119140625, "learning_rate": 0.000945045678734605, - "loss": 0.0178, + "loss": 0.0175, "macro_f1": 0.3144654333591461, "num_tokens": 3273488.0, "repeat_count": 0.0, - "routers_loss": 0.03916877508163452, + "routers_loss": 0.03317151218652725, "skip_count": 3.0, "step": 2030, "text_loss": 0.2233227640390396 @@ -19302,13 +19302,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11474609375, + "grad_norm": 0.12451171875, "learning_rate": 0.0009449045213661822, - "loss": 0.0215, + "loss": 0.0201, "macro_f1": 0.3272727429866791, "num_tokens": 3276646.0, "repeat_count": 0.0, - "routers_loss": 0.019781047478318214, + "routers_loss": 0.018510591238737106, "skip_count": 1.0, "step": 2032, "text_loss": 0.16100332140922546 @@ -19321,13 +19321,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.11474609375, + "grad_norm": 0.1318359375, "learning_rate": 0.0009447631935095077, - "loss": 0.0193, + "loss": 0.0185, "macro_f1": 0.9452888369560242, "num_tokens": 3279441.0, "repeat_count": 1.0, - "routers_loss": 0.02645993046462536, + "routers_loss": 0.028113311156630516, "skip_count": 4.0, "step": 2034, "text_loss": 0.29208317399024963 @@ -19340,13 +19340,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.051025390625, "learning_rate": 0.0009446216952187384, - "loss": 0.0168, + "loss": 0.0164, "macro_f1": 0.3333333432674408, "num_tokens": 3282697.0, "repeat_count": 0.0, - "routers_loss": 0.008575125597417355, + "routers_loss": 0.008379172533750534, "skip_count": 0.0, "step": 2036, "text_loss": 0.16026398539543152 @@ -19359,13 +19359,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.06298828125, "learning_rate": 0.0009444800265480967, - "loss": 0.0184, + "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 3285574.0, "repeat_count": 0.0, - "routers_loss": 0.01042154710739851, + "routers_loss": 0.00941354501992464, "skip_count": 0.0, "step": 2038, "text_loss": 0.29523080587387085 @@ -19378,13 +19378,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.8571428656578064, "f1_skip": 0.800000011920929, - "grad_norm": 0.07568359375, + "grad_norm": 0.076171875, "learning_rate": 0.0009443381875518703, - "loss": 0.0206, + "loss": 0.0197, "macro_f1": 0.8600732684135437, "num_tokens": 3289159.0, "repeat_count": 4.0, - "routers_loss": 0.05496715381741524, + "routers_loss": 0.04974055662751198, "skip_count": 6.0, "step": 2040, "text_loss": 0.23033179342746735 @@ -19397,13 +19397,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0625, + "grad_norm": 0.0537109375, "learning_rate": 0.0009441961782844123, - "loss": 0.0149, + "loss": 0.0146, "macro_f1": 0.3272727429866791, "num_tokens": 3293598.0, "repeat_count": 0.0, - "routers_loss": 0.021722445264458656, + "routers_loss": 0.022241825237870216, "skip_count": 1.0, "step": 2042, "text_loss": 0.8299165368080139 @@ -19416,13 +19416,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.072265625, + "grad_norm": 0.0673828125, "learning_rate": 0.0009440539988001408, - "loss": 0.0161, + "loss": 0.0159, "macro_f1": 0.3333333432674408, "num_tokens": 3296648.0, "repeat_count": 0.0, - "routers_loss": 0.011090370826423168, + "routers_loss": 0.011019332334399223, "skip_count": 0.0, "step": 2044, "text_loss": 0.18207129836082458 @@ -19435,13 +19435,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.041259765625, "learning_rate": 0.0009439116491535394, - "loss": 0.0123, + "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 3300058.0, "repeat_count": 0.0, - "routers_loss": 0.00327755743637681, + "routers_loss": 0.002889640862122178, "skip_count": 0.0, "step": 2046, "text_loss": 0.7051978707313538 @@ -19454,13 +19454,13 @@ "f1_execute": 0.9333333373069763, "f1_repeat": 0.5, "f1_skip": 0.8571428656578064, - "grad_norm": 0.08154296875, + "grad_norm": 0.078125, "learning_rate": 0.0009437691293991563, - "loss": 0.0198, + "loss": 0.0192, "macro_f1": 0.7634921073913574, "num_tokens": 3303296.0, "repeat_count": 3.0, - "routers_loss": 0.0807223841547966, + "routers_loss": 0.07741832733154297, "skip_count": 4.0, "step": 2048, "text_loss": 0.15563532710075378 @@ -19473,13 +19473,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.09521484375, "learning_rate": 0.0009436264395916061, - "loss": 0.0218, + "loss": 0.0209, "macro_f1": 0.6666666865348816, "num_tokens": 3306204.0, "repeat_count": 0.0, - "routers_loss": 0.014681774191558361, + "routers_loss": 0.014225383289158344, "skip_count": 2.0, "step": 2050, "text_loss": 0.18117287755012512 @@ -19492,13 +19492,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09326171875, + "grad_norm": 0.1416015625, "learning_rate": 0.0009434835797855672, - "loss": 0.0166, + "loss": 0.0165, "macro_f1": 0.3333333432674408, "num_tokens": 3309444.0, "repeat_count": 0.0, - "routers_loss": 0.0025602662935853004, + "routers_loss": 0.0023932650219649076, "skip_count": 0.0, "step": 2052, "text_loss": 0.4645874798297882 @@ -19511,13 +19511,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05810546875, + "grad_norm": 0.058349609375, "learning_rate": 0.0009433405500357839, - "loss": 0.0148, + "loss": 0.0153, "macro_f1": 0.3272727429866791, "num_tokens": 3312488.0, "repeat_count": 0.0, - "routers_loss": 0.03283753618597984, + "routers_loss": 0.03193361684679985, "skip_count": 1.0, "step": 2054, "text_loss": 0.5291082859039307 @@ -19530,13 +19530,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.062255859375, + "grad_norm": 0.064453125, "learning_rate": 0.0009431973503970655, - "loss": 0.0138, + "loss": 0.0134, "macro_f1": 0.3333333432674408, "num_tokens": 3315765.0, "repeat_count": 0.0, - "routers_loss": 0.002137230010703206, + "routers_loss": 0.0020529816392809153, "skip_count": 0.0, "step": 2056, "text_loss": 0.5877931118011475 @@ -19549,13 +19549,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08251953125, + "grad_norm": 0.07275390625, "learning_rate": 0.0009430539809242864, - "loss": 0.0199, + "loss": 0.0185, "macro_f1": 0.32098764181137085, "num_tokens": 3318877.0, "repeat_count": 2.0, - "routers_loss": 0.07938452064990997, + "routers_loss": 0.07907948642969131, "skip_count": 0.0, "step": 2058, "text_loss": 0.3836737871170044 @@ -19568,13 +19568,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.095703125, "learning_rate": 0.0009429104416723862, - "loss": 0.0164, + "loss": 0.0163, "macro_f1": 0.6666666865348816, "num_tokens": 3322576.0, "repeat_count": 2.0, - "routers_loss": 0.003832251997664571, + "routers_loss": 0.003006070153787732, "skip_count": 0.0, "step": 2060, "text_loss": 0.3480920195579529 @@ -19587,13 +19587,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.045166015625, "learning_rate": 0.0009427667326963689, - "loss": 0.0131, + "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 3325974.0, "repeat_count": 0.0, - "routers_loss": 0.006192604545503855, + "routers_loss": 0.005013179033994675, "skip_count": 0.0, "step": 2062, "text_loss": 0.931358814239502 @@ -19606,13 +19606,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09375, + "grad_norm": 0.0986328125, "learning_rate": 0.0009426228540513047, "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 3329398.0, "repeat_count": 0.0, - "routers_loss": 0.008115313947200775, + "routers_loss": 0.0059848143719136715, "skip_count": 0.0, "step": 2064, "text_loss": 0.47568953037261963 @@ -19625,13 +19625,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06640625, + "grad_norm": 0.0830078125, "learning_rate": 0.0009424788057923277, - "loss": 0.0127, + "loss": 0.0131, "macro_f1": 0.3333333432674408, "num_tokens": 3332029.0, "repeat_count": 0.0, - "routers_loss": 0.007599714212119579, + "routers_loss": 0.00783882662653923, "skip_count": 0.0, "step": 2066, "text_loss": 0.22887596487998962 @@ -19644,13 +19644,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.07470703125, + "grad_norm": 0.0712890625, "learning_rate": 0.0009423345879746376, - "loss": 0.0126, + "loss": 0.0128, "macro_f1": 0.5492662787437439, "num_tokens": 3334858.0, "repeat_count": 0.0, - "routers_loss": 0.016804348677396774, + "routers_loss": 0.01866884157061577, "skip_count": 2.0, "step": 2068, "text_loss": 0.17724967002868652 @@ -19663,13 +19663,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.072265625, + "grad_norm": 0.06591796875, "learning_rate": 0.000942190200653499, - "loss": 0.0164, + "loss": 0.0162, "macro_f1": 0.32098764181137085, "num_tokens": 3338094.0, "repeat_count": 0.0, - "routers_loss": 0.02686731517314911, + "routers_loss": 0.028636593371629715, "skip_count": 2.0, "step": 2070, "text_loss": 0.34344956278800964 @@ -19682,13 +19682,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0771484375, + "grad_norm": 0.07568359375, "learning_rate": 0.0009420456438842413, - "loss": 0.0172, + "loss": 0.0165, "macro_f1": 0.5492662787437439, "num_tokens": 3340526.0, "repeat_count": 0.0, - "routers_loss": 0.025320913642644882, + "routers_loss": 0.023245645686984062, "skip_count": 2.0, "step": 2072, "text_loss": 0.7276164293289185 @@ -19701,13 +19701,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.11328125, "learning_rate": 0.000941900917722259, - "loss": 0.0145, + "loss": 0.0143, "macro_f1": 0.3272727429866791, "num_tokens": 3343303.0, "repeat_count": 1.0, - "routers_loss": 0.014900023117661476, + "routers_loss": 0.01565689593553543, "skip_count": 0.0, "step": 2074, "text_loss": 0.5665070414543152 @@ -19720,13 +19720,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11474609375, + "grad_norm": 0.1201171875, "learning_rate": 0.0009417560222230115, - "loss": 0.0244, + "loss": 0.0245, "macro_f1": 0.3333333432674408, "num_tokens": 3346409.0, "repeat_count": 0.0, - "routers_loss": 0.003426895011216402, + "routers_loss": 0.0035056080669164658, "skip_count": 0.0, "step": 2076, "text_loss": 0.5112795233726501 @@ -19739,13 +19739,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0712890625, + "grad_norm": 0.06982421875, "learning_rate": 0.0009416109574420229, - "loss": 0.0136, + "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3349220.0, "repeat_count": 0.0, - "routers_loss": 0.0031935563310980797, + "routers_loss": 0.0027565446216613054, "skip_count": 0.0, "step": 2078, "text_loss": 0.5240910053253174 @@ -19758,13 +19758,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.08203125, "learning_rate": 0.0009414657234348823, - "loss": 0.0183, + "loss": 0.0186, "macro_f1": 1.0, "num_tokens": 3352627.0, "repeat_count": 3.0, - "routers_loss": 0.016454946249723434, + "routers_loss": 0.01652451977133751, "skip_count": 2.0, "step": 2080, "text_loss": 1.0217112302780151 @@ -19777,13 +19777,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1630859375, "learning_rate": 0.0009413203202572438, - "loss": 0.0174, + "loss": 0.0179, "macro_f1": 0.32098764181137085, "num_tokens": 3355392.0, "repeat_count": 0.0, - "routers_loss": 0.1056143268942833, + "routers_loss": 0.1012420505285263, "skip_count": 2.0, "step": 2082, "text_loss": 0.4085482358932495 @@ -19796,13 +19796,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07373046875, + "grad_norm": 0.08251953125, "learning_rate": 0.000941174747964826, - "loss": 0.016, + "loss": 0.0154, "macro_f1": 0.3333333432674408, "num_tokens": 3358425.0, "repeat_count": 0.0, - "routers_loss": 0.003626141929998994, + "routers_loss": 0.004962718114256859, "skip_count": 0.0, "step": 2084, "text_loss": 0.5833504796028137 @@ -19810,18 +19810,18 @@ { "acc_repeat": 0.5, "acc_skip": 0.6666666865348816, - "avg_layers": 26.0, + "avg_layers": 27.0, "epoch": 9.793660111535075, - "f1_execute": 0.936170220375061, + "f1_execute": 0.9583333134651184, "f1_repeat": 0.6666666865348816, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.107421875, + "f1_skip": 0.800000011920929, + "grad_norm": 0.11376953125, "learning_rate": 0.0009410290066134124, - "loss": 0.0216, - "macro_f1": 0.7565011978149414, + "loss": 0.0211, + "macro_f1": 0.8083333373069763, "num_tokens": 3361925.0, "repeat_count": 2.0, - "routers_loss": 0.08091846853494644, + "routers_loss": 0.07889176905155182, "skip_count": 3.0, "step": 2086, "text_loss": 0.38126569986343384 @@ -19834,13 +19834,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.056884765625, + "grad_norm": 0.051513671875, "learning_rate": 0.0009408830962588517, - "loss": 0.0197, + "loss": 0.0195, "macro_f1": 0.6601307392120361, "num_tokens": 3365963.0, "repeat_count": 1.0, - "routers_loss": 0.035208042711019516, + "routers_loss": 0.033715736120939255, "skip_count": 2.0, "step": 2088, "text_loss": 0.23213914036750793 @@ -19853,13 +19853,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07958984375, + "grad_norm": 0.0732421875, "learning_rate": 0.0009407370169570567, - "loss": 0.0167, + "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 3369422.0, "repeat_count": 0.0, - "routers_loss": 0.0018934847321361303, + "routers_loss": 0.0014188943896442652, "skip_count": 0.0, "step": 2090, "text_loss": 0.4648318886756897 @@ -19872,13 +19872,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.0712890625, "learning_rate": 0.0009405907687640054, - "loss": 0.0132, + "loss": 0.013, "macro_f1": 0.3272727429866791, "num_tokens": 3372506.0, "repeat_count": 0.0, - "routers_loss": 0.016075141727924347, + "routers_loss": 0.015339684672653675, "skip_count": 1.0, "step": 2092, "text_loss": 0.2563800811767578 @@ -19891,13 +19891,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.054443359375, "learning_rate": 0.0009404443517357404, "loss": 0.0146, "macro_f1": 0.542222261428833, "num_tokens": 3375653.0, "repeat_count": 4.0, - "routers_loss": 0.06333976984024048, + "routers_loss": 0.06562861055135727, "skip_count": 0.0, "step": 2094, "text_loss": 0.797835111618042 @@ -19910,13 +19910,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.060546875, + "grad_norm": 0.062255859375, "learning_rate": 0.000940297765928369, - "loss": 0.0133, + "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 3379018.0, "repeat_count": 0.0, - "routers_loss": 0.005521406419575214, + "routers_loss": 0.005745889153331518, "skip_count": 0.0, "step": 2096, "text_loss": 0.4238114655017853 @@ -19929,13 +19929,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06103515625, + "grad_norm": 0.0712890625, "learning_rate": 0.0009401510113980631, - "loss": 0.0205, + "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 3382855.0, "repeat_count": 0.0, - "routers_loss": 0.0025159218348562717, + "routers_loss": 0.0026634482201188803, "skip_count": 0.0, "step": 2098, "text_loss": 0.4967166483402252 @@ -19948,13 +19948,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08203125, + "grad_norm": 0.0791015625, "learning_rate": 0.0009400040882010592, - "loss": 0.0172, + "loss": 0.0166, "macro_f1": 0.3333333432674408, "num_tokens": 3386386.0, "repeat_count": 0.0, - "routers_loss": 0.0025535966269671917, + "routers_loss": 0.0020642587915062904, "skip_count": 0.0, "step": 2100, "text_loss": 0.44390562176704407 @@ -19967,13 +19967,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.056640625, "learning_rate": 0.0009398569963936589, - "loss": 0.0178, + "loss": 0.017, "macro_f1": 0.3272727429866791, "num_tokens": 3389958.0, "repeat_count": 0.0, - "routers_loss": 0.013569516129791737, + "routers_loss": 0.013722737319767475, "skip_count": 1.0, "step": 2102, "text_loss": 0.7207565903663635 @@ -19986,13 +19986,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0947265625, + "grad_norm": 0.08837890625, "learning_rate": 0.0009397097360322276, - "loss": 0.0175, + "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 3392892.0, "repeat_count": 0.0, - "routers_loss": 0.0044935219921171665, + "routers_loss": 0.002051608171314001, "skip_count": 0.0, "step": 2104, "text_loss": 0.3196398913860321 @@ -20005,13 +20005,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.072265625, + "grad_norm": 0.07470703125, "learning_rate": 0.000939562307173196, - "loss": 0.0223, + "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 3396636.0, "repeat_count": 0.0, - "routers_loss": 0.007407462690025568, + "routers_loss": 0.007085663266479969, "skip_count": 0.0, "step": 2106, "text_loss": 0.5663776397705078 @@ -20024,13 +20024,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.13671875, + "grad_norm": 0.11328125, "learning_rate": 0.0009394147098730592, - "loss": 0.0205, + "loss": 0.02, "macro_f1": 0.5492662787437439, "num_tokens": 3399475.0, "repeat_count": 0.0, - "routers_loss": 0.024386432021856308, + "routers_loss": 0.019473131746053696, "skip_count": 2.0, "step": 2108, "text_loss": 0.7708223462104797 @@ -20043,32 +20043,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037353515625, + "grad_norm": 0.038818359375, "learning_rate": 0.0009392669441883767, - "loss": 0.0135, + "loss": 0.0134, "macro_f1": 0.3333333432674408, "num_tokens": 3402350.0, "repeat_count": 0.0, - "routers_loss": 0.002929724520072341, + "routers_loss": 0.0028328890912234783, "skip_count": 0.0, "step": 2110, "text_loss": 0.5888006091117859 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 9.915761667155856, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.1201171875, + "f1_skip": 1.0, + "grad_norm": 0.10693359375, "learning_rate": 0.0009391190101757724, - "loss": 0.0168, - "macro_f1": 0.5492662787437439, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, "num_tokens": 3405561.0, "repeat_count": 0.0, - "routers_loss": 0.026861928403377533, + "routers_loss": 0.023098422214388847, "skip_count": 2.0, "step": 2112, "text_loss": 0.09865197539329529 @@ -20081,13 +20081,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0966796875, + "grad_norm": 0.10107421875, "learning_rate": 0.000938970907891935, - "loss": 0.0251, + "loss": 0.0247, "macro_f1": 0.3333333432674408, "num_tokens": 3408513.0, "repeat_count": 0.0, - "routers_loss": 0.0025369988288730383, + "routers_loss": 0.002896632067859173, "skip_count": 0.0, "step": 2114, "text_loss": 0.6613234281539917 @@ -20100,51 +20100,51 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09423828125, + "grad_norm": 0.0947265625, "learning_rate": 0.0009388226373936179, - "loss": 0.0209, + "loss": 0.0211, "macro_f1": 0.3333333432674408, "num_tokens": 3411195.0, "repeat_count": 0.0, - "routers_loss": 0.014292459934949875, + "routers_loss": 0.015814457088708878, "skip_count": 0.0, "step": 2116, "text_loss": 0.17363053560256958 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 9.94393894922219, - "f1_execute": 0.9629629850387573, - "f1_repeat": 0.0, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1181640625, + "grad_norm": 0.12451171875, "learning_rate": 0.0009386741987376381, - "loss": 0.0151, - "macro_f1": 0.32098767161369324, + "loss": 0.015, + "macro_f1": 0.6603773832321167, "num_tokens": 3414875.0, "repeat_count": 1.0, - "routers_loss": 0.027571436017751694, + "routers_loss": 0.02676783688366413, "skip_count": 0.0, "step": 2118, "text_loss": 0.674056887626648 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 9.953331376577633, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.08349609375, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, "learning_rate": 0.0009385255919808778, - "loss": 0.0205, - "macro_f1": 0.3272727429866791, + "loss": 0.0203, + "macro_f1": 0.6666666865348816, "num_tokens": 3418410.0, "repeat_count": 0.0, - "routers_loss": 0.011719600297510624, + "routers_loss": 0.01022857241332531, "skip_count": 1.0, "step": 2120, "text_loss": 0.235092431306839 @@ -20157,13 +20157,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09375, + "grad_norm": 0.0888671875, "learning_rate": 0.0009383768171802836, - "loss": 0.0249, + "loss": 0.0244, "macro_f1": 0.5492662787437439, "num_tokens": 3421289.0, "repeat_count": 0.0, - "routers_loss": 0.01207603607326746, + "routers_loss": 0.013572212308645248, "skip_count": 2.0, "step": 2122, "text_loss": 0.5992844104766846 @@ -20176,13 +20176,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.060791015625, + "grad_norm": 0.064453125, "learning_rate": 0.0009382278743928659, - "loss": 0.0206, + "loss": 0.0201, "macro_f1": 0.6666666865348816, "num_tokens": 3424781.0, "repeat_count": 0.0, - "routers_loss": 0.008004254661500454, + "routers_loss": 0.0051873656921088696, "skip_count": 2.0, "step": 2124, "text_loss": 0.29915499687194824 @@ -20195,13 +20195,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.07666015625, + "grad_norm": 0.07421875, "learning_rate": 0.0009380787636757001, - "loss": 0.0156, + "loss": 0.0155, "macro_f1": 0.6122449040412903, "num_tokens": 3427942.0, "repeat_count": 0.0, - "routers_loss": 0.030767880380153656, + "routers_loss": 0.030079292133450508, "skip_count": 4.0, "step": 2126, "text_loss": 0.24181491136550903 @@ -20214,13 +20214,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06201171875, + "grad_norm": 0.058349609375, "learning_rate": 0.0009379294850859256, "loss": 0.0141, "macro_f1": 0.3333333432674408, "num_tokens": 3431314.0, "repeat_count": 0.0, - "routers_loss": 0.002620625076815486, + "routers_loss": 0.002675612922757864, "skip_count": 0.0, "step": 2128, "text_loss": 0.4669873118400574 @@ -20233,13 +20233,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09033203125, + "grad_norm": 0.10595703125, "learning_rate": 0.0009377800386807465, - "loss": 0.0175, + "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 3435020.0, "repeat_count": 0.0, - "routers_loss": 0.009095560759305954, + "routers_loss": 0.009334275498986244, "skip_count": 0.0, "step": 2130, "text_loss": 0.6478219628334045 @@ -20252,13 +20252,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.11865234375, + "grad_norm": 0.134765625, "learning_rate": 0.0009376304245174306, - "loss": 0.0143, + "loss": 0.0137, "macro_f1": 0.6000000238418579, "num_tokens": 3438276.0, "repeat_count": 1.0, - "routers_loss": 0.058448426425457, + "routers_loss": 0.038227908313274384, "skip_count": 2.0, "step": 2132, "text_loss": 0.4401201903820038 @@ -20271,13 +20271,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.046875, + "grad_norm": 0.041748046875, "learning_rate": 0.0009374806426533104, - "loss": 0.0116, + "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 3440938.0, "repeat_count": 0.0, - "routers_loss": 0.007323687430471182, + "routers_loss": 0.006901399698108435, "skip_count": 0.0, "step": 2134, "text_loss": 0.5948942303657532 @@ -20290,13 +20290,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.051513671875, + "grad_norm": 0.051025390625, "learning_rate": 0.0009373306931457827, - "loss": 0.0122, + "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 3444028.0, "repeat_count": 0.0, - "routers_loss": 0.003302243771031499, + "routers_loss": 0.0037061909679323435, "skip_count": 0.0, "step": 2136, "text_loss": 0.5349751114845276 @@ -20309,13 +20309,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047607421875, + "grad_norm": 0.056884765625, "learning_rate": 0.0009371805760523086, - "loss": 0.0113, + "loss": 0.0111, "macro_f1": 0.3333333432674408, "num_tokens": 3448331.0, "repeat_count": 0.0, - "routers_loss": 0.0027974818367511034, + "routers_loss": 0.0025877030566334724, "skip_count": 0.0, "step": 2138, "text_loss": 0.4591051936149597 @@ -20328,13 +20328,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.08642578125, + "grad_norm": 0.07373046875, "learning_rate": 0.0009370302914304129, - "loss": 0.0145, + "loss": 0.0144, "macro_f1": 0.5934640765190125, "num_tokens": 3451434.0, "repeat_count": 0.0, - "routers_loss": 0.01572767272591591, + "routers_loss": 0.018742674961686134, "skip_count": 3.0, "step": 2140, "text_loss": 0.23470863699913025 @@ -20347,13 +20347,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06201171875, + "grad_norm": 0.0634765625, "learning_rate": 0.0009368798393376851, - "loss": 0.0119, + "loss": 0.0122, "macro_f1": 0.3272727429866791, "num_tokens": 3454375.0, "repeat_count": 0.0, - "routers_loss": 0.020721890032291412, + "routers_loss": 0.02382594160735607, "skip_count": 1.0, "step": 2142, "text_loss": 0.6077954769134521 @@ -20366,13 +20366,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05859375, + "grad_norm": 0.05517578125, "learning_rate": 0.0009367292198317787, - "loss": 0.0161, + "loss": 0.0164, "macro_f1": 0.5492662787437439, "num_tokens": 3457591.0, "repeat_count": 0.0, - "routers_loss": 0.03272393345832825, + "routers_loss": 0.03331060707569122, "skip_count": 2.0, "step": 2144, "text_loss": 0.3691073954105377 @@ -20385,13 +20385,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052490234375, + "grad_norm": 0.058349609375, "learning_rate": 0.0009365784329704115, - "loss": 0.0191, + "loss": 0.0186, "macro_f1": 0.3333333432674408, "num_tokens": 3460895.0, "repeat_count": 0.0, - "routers_loss": 0.0017473002662882209, + "routers_loss": 0.0016955457394942641, "skip_count": 0.0, "step": 2146, "text_loss": 0.3947436511516571 @@ -20404,13 +20404,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.05224609375, + "grad_norm": 0.050537109375, "learning_rate": 0.0009364274788113651, - "loss": 0.0094, + "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 3464101.0, "repeat_count": 1.0, - "routers_loss": 0.008070237934589386, + "routers_loss": 0.006169239990413189, "skip_count": 0.0, "step": 2148, "text_loss": 0.3348555266857147 @@ -20423,13 +20423,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.068359375, "learning_rate": 0.0009362763574124858, - "loss": 0.0191, + "loss": 0.019, "macro_f1": 0.9265305995941162, "num_tokens": 3467417.0, "repeat_count": 3.0, - "routers_loss": 0.021709222346544266, + "routers_loss": 0.024033790454268456, "skip_count": 1.0, "step": 2150, "text_loss": 0.496633380651474 @@ -20442,13 +20442,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.046630859375, + "grad_norm": 0.042724609375, "learning_rate": 0.0009361250688316829, - "loss": 0.014, + "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 3470917.0, "repeat_count": 0.0, - "routers_loss": 0.0022237664088606834, + "routers_loss": 0.0024986129719763994, "skip_count": 0.0, "step": 2152, "text_loss": 0.6857671737670898 @@ -20461,13 +20461,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.056640625, + "grad_norm": 0.0546875, "learning_rate": 0.0009359736131269312, "loss": 0.0153, "macro_f1": 0.6666666865348816, "num_tokens": 3473624.0, "repeat_count": 0.0, - "routers_loss": 0.00838750321418047, + "routers_loss": 0.008183322846889496, "skip_count": 1.0, "step": 2154, "text_loss": 0.13883116841316223 @@ -20480,13 +20480,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0576171875, + "grad_norm": 0.06640625, "learning_rate": 0.0009358219903562684, - "loss": 0.01, + "loss": 0.0106, "macro_f1": 0.6666666865348816, "num_tokens": 3476472.0, "repeat_count": 0.0, - "routers_loss": 0.010190514847636223, + "routers_loss": 0.011198793537914753, "skip_count": 3.0, "step": 2156, "text_loss": 0.24243666231632233 @@ -20499,13 +20499,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0419921875, + "grad_norm": 0.04296875, "learning_rate": 0.0009356702005777969, - "loss": 0.0124, + "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 3479688.0, "repeat_count": 0.0, - "routers_loss": 0.002411153633147478, + "routers_loss": 0.002520184963941574, "skip_count": 0.0, "step": 2158, "text_loss": 0.6407818794250488 @@ -20518,13 +20518,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.0791015625, "learning_rate": 0.0009355182438496825, - "loss": 0.0141, + "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 3482598.0, "repeat_count": 0.0, - "routers_loss": 0.001032356172800064, + "routers_loss": 0.0011065017897635698, "skip_count": 0.0, "step": 2160, "text_loss": 0.7214245796203613 @@ -20537,13 +20537,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05908203125, + "grad_norm": 0.0576171875, "learning_rate": 0.0009353661202301557, - "loss": 0.0147, + "loss": 0.0144, "macro_f1": 0.3333333432674408, "num_tokens": 3486271.0, "repeat_count": 0.0, - "routers_loss": 0.0022046815138310194, + "routers_loss": 0.0017824085662141442, "skip_count": 0.0, "step": 2162, "text_loss": 0.5140969157218933 @@ -20556,32 +20556,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.051513671875, + "grad_norm": 0.053466796875, "learning_rate": 0.0009352138297775101, "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 3489206.0, "repeat_count": 0.0, - "routers_loss": 0.0014977266546338797, + "routers_loss": 0.001542879967018962, "skip_count": 0.0, "step": 2164, "text_loss": 0.7956416606903076 }, { "acc_repeat": 0.0, - "acc_skip": 0.6666666865348816, - "avg_layers": 26.0, + "acc_skip": 1.0, + "avg_layers": 25.0, "epoch": 10.169063692398003, - "f1_execute": 0.9803921580314636, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.800000011920929, + "f1_skip": 1.0, "grad_norm": 0.0771484375, "learning_rate": 0.000935061372550104, - "loss": 0.0132, - "macro_f1": 0.5934640765190125, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, "num_tokens": 3492003.0, "repeat_count": 0.0, - "routers_loss": 0.016847684979438782, + "routers_loss": 0.01420794241130352, "skip_count": 3.0, "step": 2166, "text_loss": 0.27489882707595825 @@ -20594,13 +20594,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.06396484375, "learning_rate": 0.0009349087486063594, - "loss": 0.0168, + "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 3494784.0, "repeat_count": 0.0, - "routers_loss": 0.0036806222051382065, + "routers_loss": 0.003614309709519148, "skip_count": 1.0, "step": 2168, "text_loss": 0.2962227761745453 @@ -20613,13 +20613,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.09716796875, + "grad_norm": 0.1259765625, "learning_rate": 0.0009347559580047618, - "loss": 0.0174, + "loss": 0.0175, "macro_f1": 0.8814815282821655, "num_tokens": 3497886.0, "repeat_count": 2.0, - "routers_loss": 0.021412594243884087, + "routers_loss": 0.02122853323817253, "skip_count": 4.0, "step": 2170, "text_loss": 0.5919580459594727 @@ -20627,18 +20627,18 @@ { "acc_repeat": 0.0, "acc_skip": 1.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 10.197240974464338, - "f1_execute": 1.0, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.06591796875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06396484375, "learning_rate": 0.000934603000803861, - "loss": 0.0134, - "macro_f1": 0.6666666865348816, + "loss": 0.0135, + "macro_f1": 0.5492662787437439, "num_tokens": 3500939.0, "repeat_count": 0.0, - "routers_loss": 0.0201424453407526, + "routers_loss": 0.02042219042778015, "skip_count": 1.0, "step": 2172, "text_loss": 0.28722381591796875 @@ -20651,13 +20651,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05419921875, + "grad_norm": 0.0693359375, "learning_rate": 0.0009344498770622704, - "loss": 0.0131, + "loss": 0.013, "macro_f1": 0.3333333432674408, "num_tokens": 3504852.0, "repeat_count": 0.0, - "routers_loss": 0.005059401970356703, + "routers_loss": 0.004345106892287731, "skip_count": 0.0, "step": 2174, "text_loss": 0.603236734867096 @@ -20670,13 +20670,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.091796875, + "grad_norm": 0.1064453125, "learning_rate": 0.0009342965868386673, "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 3508320.0, "repeat_count": 0.0, - "routers_loss": 0.004006600938737392, + "routers_loss": 0.00368050136603415, "skip_count": 0.0, "step": 2176, "text_loss": 0.6020491719245911 @@ -20691,11 +20691,11 @@ "f1_skip": 0.0, "grad_norm": 0.060302734375, "learning_rate": 0.000934143130191793, - "loss": 0.0109, + "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 3511278.0, "repeat_count": 0.0, - "routers_loss": 0.013246738351881504, + "routers_loss": 0.013425769284367561, "skip_count": 0.0, "step": 2178, "text_loss": 0.5954724550247192 @@ -20708,13 +20708,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06005859375, + "grad_norm": 0.060546875, "learning_rate": 0.000933989507180452, - "loss": 0.0151, + "loss": 0.0149, "macro_f1": 0.3333333432674408, "num_tokens": 3514361.0, "repeat_count": 0.0, - "routers_loss": 0.0031937146559357643, + "routers_loss": 0.002896249992772937, "skip_count": 0.0, "step": 2180, "text_loss": 0.39175131916999817 @@ -20727,13 +20727,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0556640625, + "grad_norm": 0.052978515625, "learning_rate": 0.0009338357178635135, - "loss": 0.0151, + "loss": 0.0147, "macro_f1": 0.6603773832321167, "num_tokens": 3517962.0, "repeat_count": 1.0, - "routers_loss": 0.014782631769776344, + "routers_loss": 0.011538350023329258, "skip_count": 1.0, "step": 2182, "text_loss": 0.4482830762863159 @@ -20746,13 +20746,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.0869140625, "learning_rate": 0.0009336817622999093, - "loss": 0.0112, + "loss": 0.011, "macro_f1": 0.3272727429866791, "num_tokens": 3521299.0, "repeat_count": 1.0, - "routers_loss": 0.02318345196545124, + "routers_loss": 0.022787930443882942, "skip_count": 0.0, "step": 2184, "text_loss": 0.35177817940711975 @@ -20765,13 +20765,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.055419921875, + "grad_norm": 0.0634765625, "learning_rate": 0.0009335276405486357, - "loss": 0.0134, + "loss": 0.0139, "macro_f1": 0.3272727429866791, "num_tokens": 3524611.0, "repeat_count": 0.0, - "routers_loss": 0.011735675856471062, + "routers_loss": 0.011597735807299614, "skip_count": 1.0, "step": 2186, "text_loss": 0.24868851900100708 @@ -20784,13 +20784,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0947265625, + "grad_norm": 0.11181640625, "learning_rate": 0.0009333733526687524, - "loss": 0.0198, + "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 3528012.0, "repeat_count": 0.0, - "routers_loss": 0.01558679062873125, + "routers_loss": 0.014253967441618443, "skip_count": 0.0, "step": 2188, "text_loss": 0.3970910310745239 @@ -20803,13 +20803,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056396484375, + "grad_norm": 0.054931640625, "learning_rate": 0.000933218898719383, - "loss": 0.0163, + "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 3530908.0, "repeat_count": 0.0, - "routers_loss": 0.0019149131840094924, + "routers_loss": 0.001659149187617004, "skip_count": 0.0, "step": 2190, "text_loss": 0.7618573307991028 @@ -20822,13 +20822,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07958984375, + "grad_norm": 0.0693359375, "learning_rate": 0.0009330642787597141, - "loss": 0.0161, + "loss": 0.0159, "macro_f1": 0.3333333432674408, "num_tokens": 3533993.0, "repeat_count": 0.0, - "routers_loss": 0.0056966920383274555, + "routers_loss": 0.005574346985667944, "skip_count": 0.0, "step": 2192, "text_loss": 0.16470147669315338 @@ -20841,13 +20841,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07080078125, + "grad_norm": 0.0791015625, "learning_rate": 0.0009329094928489969, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 3537310.0, "repeat_count": 0.0, - "routers_loss": 0.002511024009436369, + "routers_loss": 0.0026400673668831587, "skip_count": 0.0, "step": 2194, "text_loss": 0.3400416374206543 @@ -20860,13 +20860,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08935546875, + "grad_norm": 0.0849609375, "learning_rate": 0.0009327545410465452, - "loss": 0.0126, + "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 3540045.0, "repeat_count": 0.0, - "routers_loss": 0.008584192954003811, + "routers_loss": 0.008448398672044277, "skip_count": 3.0, "step": 2196, "text_loss": 0.3110542297363281 @@ -20879,13 +20879,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.04638671875, "learning_rate": 0.0009325994234117372, - "loss": 0.0129, + "loss": 0.0122, "macro_f1": 0.32098764181137085, "num_tokens": 3544097.0, "repeat_count": 0.0, - "routers_loss": 0.03748156875371933, + "routers_loss": 0.037553198635578156, "skip_count": 2.0, "step": 2198, "text_loss": 0.36126700043678284 @@ -20898,13 +20898,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09814453125, + "grad_norm": 0.09716796875, "learning_rate": 0.000932444140004014, - "loss": 0.0129, + "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 3547054.0, "repeat_count": 1.0, - "routers_loss": 0.006402099970728159, + "routers_loss": 0.006464479025453329, "skip_count": 0.0, "step": 2200, "text_loss": 0.4947047233581543 @@ -20917,13 +20917,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.158203125, + "grad_norm": 0.1015625, "learning_rate": 0.0009322886908828805, - "loss": 0.015, + "loss": 0.0138, "macro_f1": 0.6666666865348816, "num_tokens": 3549903.0, "repeat_count": 1.0, - "routers_loss": 0.0055928584188222885, + "routers_loss": 0.005384812597185373, "skip_count": 0.0, "step": 2202, "text_loss": 0.5923738479614258 @@ -20936,13 +20936,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0625, + "grad_norm": 0.0634765625, "learning_rate": 0.0009321330761079052, "loss": 0.0149, "macro_f1": 0.6666666865348816, "num_tokens": 3553745.0, "repeat_count": 0.0, - "routers_loss": 0.013155708089470863, + "routers_loss": 0.015346619300544262, "skip_count": 2.0, "step": 2204, "text_loss": 0.1904175877571106 @@ -20955,13 +20955,13 @@ "f1_execute": 0.9268292784690857, "f1_repeat": 0.800000011920929, "f1_skip": 0.800000011920929, - "grad_norm": 0.06884765625, + "grad_norm": 0.06494140625, "learning_rate": 0.00093197729573872, - "loss": 0.0206, + "loss": 0.0203, "macro_f1": 0.8422764539718628, "num_tokens": 3557235.0, "repeat_count": 3.0, - "routers_loss": 0.12029488384723663, + "routers_loss": 0.1207597479224205, "skip_count": 6.0, "step": 2206, "text_loss": 0.3904837667942047 @@ -20974,13 +20974,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0771484375, "learning_rate": 0.0009318213498350202, - "loss": 0.011, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3560795.0, "repeat_count": 0.0, - "routers_loss": 0.0037007431965321302, + "routers_loss": 0.003334777895361185, "skip_count": 0.0, "step": 2208, "text_loss": 0.4268290102481842 @@ -20993,13 +20993,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.0537109375, "learning_rate": 0.0009316652384565645, - "loss": 0.0124, + "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3563754.0, "repeat_count": 0.0, - "routers_loss": 0.004071404226124287, + "routers_loss": 0.004230072256177664, "skip_count": 0.0, "step": 2210, "text_loss": 0.40049710869789124 @@ -21012,13 +21012,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.046875, "learning_rate": 0.0009315089616631751, - "loss": 0.0103, + "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 3567173.0, "repeat_count": 0.0, - "routers_loss": 0.0006955390563234687, + "routers_loss": 0.0006645230459980667, "skip_count": 0.0, "step": 2212, "text_loss": 0.42568323016166687 @@ -21031,32 +21031,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0849609375, + "grad_norm": 0.07470703125, "learning_rate": 0.0009313525195147376, - "loss": 0.0128, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 3570831.0, "repeat_count": 0.0, - "routers_loss": 0.010293997824192047, + "routers_loss": 0.0097877848893404, "skip_count": 0.0, "step": 2214, "text_loss": 0.45808279514312744 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 10.40387437628412, - "f1_execute": 0.9583333134651184, - "f1_repeat": 1.0, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, - "grad_norm": 0.07470703125, + "grad_norm": 0.076171875, "learning_rate": 0.000931195912071201, - "loss": 0.0185, - "macro_f1": 0.8194444179534912, + "loss": 0.0187, + "macro_f1": 0.7018141150474548, "num_tokens": 3573745.0, "repeat_count": 2.0, - "routers_loss": 0.06593773514032364, + "routers_loss": 0.07351134717464447, "skip_count": 3.0, "step": 2216, "text_loss": 0.285696804523468 @@ -21069,13 +21069,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.07666015625, "learning_rate": 0.0009310391393925775, - "loss": 0.013, + "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 3576785.0, "repeat_count": 0.0, - "routers_loss": 0.00347105972468853, + "routers_loss": 0.0033160944003611803, "skip_count": 0.0, "step": 2218, "text_loss": 0.17516443133354187 @@ -21088,32 +21088,32 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.04736328125, + "grad_norm": 0.047119140625, "learning_rate": 0.0009308822015389424, - "loss": 0.0244, + "loss": 0.0241, "macro_f1": 0.5427350401878357, "num_tokens": 3580695.0, "repeat_count": 1.0, - "routers_loss": 0.04871147498488426, + "routers_loss": 0.052930232137441635, "skip_count": 1.0, "step": 2220, "text_loss": 0.5918155908584595 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 26.0, + "acc_skip": 0.75, + "avg_layers": 25.0, "epoch": 10.432051658350455, - "f1_execute": 0.9600000381469727, + "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.05517578125, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.072265625, "learning_rate": 0.0009307250985704352, - "loss": 0.012, - "macro_f1": 0.542222261428833, + "loss": 0.0128, + "macro_f1": 0.6122449040412903, "num_tokens": 3583729.0, "repeat_count": 0.0, - "routers_loss": 0.024859672412276268, + "routers_loss": 0.025454653427004814, "skip_count": 4.0, "step": 2222, "text_loss": 0.2652169466018677 @@ -21126,13 +21126,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.052001953125, "learning_rate": 0.0009305678305472575, - "loss": 0.016, + "loss": 0.0158, "macro_f1": 0.3333333432674408, "num_tokens": 3586775.0, "repeat_count": 0.0, - "routers_loss": 0.010990055277943611, + "routers_loss": 0.011279845610260963, "skip_count": 0.0, "step": 2224, "text_loss": 0.3511691987514496 @@ -21145,13 +21145,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.10791015625, "learning_rate": 0.000930410397529675, - "loss": 0.0171, + "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 3589676.0, "repeat_count": 0.0, - "routers_loss": 0.0025031559634953737, + "routers_loss": 0.002700264798477292, "skip_count": 0.0, "step": 2226, "text_loss": 0.24045433104038239 @@ -21164,13 +21164,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.048095703125, "learning_rate": 0.000930252799578016, - "loss": 0.0147, + "loss": 0.0146, "macro_f1": 1.0, "num_tokens": 3593242.0, "repeat_count": 1.0, - "routers_loss": 0.008100497536361217, + "routers_loss": 0.00826631672680378, "skip_count": 2.0, "step": 2228, "text_loss": 0.3777645528316498 @@ -21183,13 +21183,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.061767578125, + "grad_norm": 0.06396484375, "learning_rate": 0.0009300950367526728, - "loss": 0.0128, + "loss": 0.0131, "macro_f1": 0.8820862174034119, "num_tokens": 3596807.0, "repeat_count": 2.0, - "routers_loss": 0.03150207921862602, + "routers_loss": 0.036221496760845184, "skip_count": 2.0, "step": 2230, "text_loss": 0.502962589263916 @@ -21202,13 +21202,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07470703125, + "grad_norm": 0.0703125, "learning_rate": 0.0009299371091141001, - "loss": 0.0132, + "loss": 0.0131, "macro_f1": 0.3333333432674408, "num_tokens": 3600150.0, "repeat_count": 0.0, - "routers_loss": 0.006253884173929691, + "routers_loss": 0.006449893582612276, "skip_count": 0.0, "step": 2232, "text_loss": 0.20256924629211426 @@ -21221,13 +21221,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.046142578125, + "grad_norm": 0.04638671875, "learning_rate": 0.0009297790167228161, - "loss": 0.0119, + "loss": 0.012, "macro_f1": 0.6666666865348816, "num_tokens": 3602988.0, "repeat_count": 0.0, - "routers_loss": 0.007228068076074123, + "routers_loss": 0.007872486487030983, "skip_count": 2.0, "step": 2234, "text_loss": 0.42476826906204224 @@ -21240,13 +21240,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.0576171875, "learning_rate": 0.0009296207596394022, - "loss": 0.0103, + "loss": 0.0101, "macro_f1": 0.32098764181137085, "num_tokens": 3606071.0, "repeat_count": 0.0, - "routers_loss": 0.02524643763899803, + "routers_loss": 0.027397040277719498, "skip_count": 2.0, "step": 2236, "text_loss": 0.23432791233062744 @@ -21259,13 +21259,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06640625, + "grad_norm": 0.0595703125, "learning_rate": 0.0009294623379245028, - "loss": 0.0119, + "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 3609389.0, "repeat_count": 0.0, - "routers_loss": 0.009672109968960285, + "routers_loss": 0.01042645052075386, "skip_count": 0.0, "step": 2238, "text_loss": 0.16665785014629364 @@ -21278,13 +21278,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0498046875, + "grad_norm": 0.052490234375, "learning_rate": 0.0009293037516388252, - "loss": 0.0155, + "loss": 0.0161, "macro_f1": 0.3333333432674408, "num_tokens": 3612105.0, "repeat_count": 0.0, - "routers_loss": 0.0010066524846479297, + "routers_loss": 0.0012458425480872393, "skip_count": 0.0, "step": 2240, "text_loss": 0.59421306848526 @@ -21297,13 +21297,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0732421875, + "grad_norm": 0.0751953125, "learning_rate": 0.0009291450008431404, - "loss": 0.0184, + "loss": 0.0185, "macro_f1": 1.0, "num_tokens": 3615439.0, "repeat_count": 1.0, - "routers_loss": 0.005509128328412771, + "routers_loss": 0.005781981628388166, "skip_count": 1.0, "step": 2242, "text_loss": 0.510798454284668 @@ -21316,13 +21316,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.09423828125, + "grad_norm": 0.0966796875, "learning_rate": 0.0009289860855982814, - "loss": 0.0172, + "loss": 0.0166, "macro_f1": 0.4871794879436493, "num_tokens": 3618842.0, "repeat_count": 0.0, - "routers_loss": 0.030802007764577866, + "routers_loss": 0.031195320188999176, "skip_count": 3.0, "step": 2244, "text_loss": 0.7574363350868225 @@ -21335,13 +21335,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.04931640625, "learning_rate": 0.0009288270059651454, "loss": 0.0133, "macro_f1": 0.3333333432674408, "num_tokens": 3621823.0, "repeat_count": 0.0, - "routers_loss": 0.001686889911070466, + "routers_loss": 0.001746491645462811, "skip_count": 0.0, "step": 2246, "text_loss": 0.5125683546066284 @@ -21354,13 +21354,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1943359375, + "grad_norm": 0.220703125, "learning_rate": 0.0009286677620046918, - "loss": 0.0163, + "loss": 0.0159, "macro_f1": 0.5492662787437439, "num_tokens": 3624502.0, "repeat_count": 0.0, - "routers_loss": 0.03299177065491676, + "routers_loss": 0.03792348504066467, "skip_count": 2.0, "step": 2248, "text_loss": 0.7533677220344543 @@ -21375,11 +21375,11 @@ "f1_skip": 0.0, "grad_norm": 0.07763671875, "learning_rate": 0.0009285083537779429, - "loss": 0.0119, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3627057.0, "repeat_count": 0.0, - "routers_loss": 0.0010354233672842383, + "routers_loss": 0.0009684451506473124, "skip_count": 0.0, "step": 2250, "text_loss": 0.2219279706478119 @@ -21392,13 +21392,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.10205078125, + "grad_norm": 0.11767578125, "learning_rate": 0.0009283487813459845, - "loss": 0.0145, + "loss": 0.0148, "macro_f1": 0.5492662787437439, "num_tokens": 3629720.0, "repeat_count": 0.0, - "routers_loss": 0.02196674607694149, + "routers_loss": 0.022757573053240776, "skip_count": 2.0, "step": 2252, "text_loss": 0.6903313994407654 @@ -21411,13 +21411,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.1376953125, "learning_rate": 0.0009281890447699652, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 3633234.0, "repeat_count": 1.0, - "routers_loss": 0.002239946974441409, + "routers_loss": 0.003613058477640152, "skip_count": 0.0, "step": 2254, "text_loss": 0.6278893351554871 @@ -21430,13 +21430,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.046142578125, + "grad_norm": 0.045654296875, "learning_rate": 0.0009280291441110961, - "loss": 0.0117, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3636289.0, "repeat_count": 0.0, - "routers_loss": 0.0063575254753232, + "routers_loss": 0.006214062683284283, "skip_count": 0.0, "step": 2256, "text_loss": 0.3011114001274109 @@ -21449,13 +21449,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.040283203125, + "grad_norm": 0.041015625, "learning_rate": 0.0009278690794306517, - "loss": 0.0143, + "loss": 0.014, "macro_f1": 0.5492662787437439, "num_tokens": 3640251.0, "repeat_count": 0.0, - "routers_loss": 0.0524379126727581, + "routers_loss": 0.052556321024894714, "skip_count": 2.0, "step": 2258, "text_loss": 0.19894185662269592 @@ -21468,13 +21468,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.8571428656578064, "f1_skip": 1.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.08251953125, "learning_rate": 0.0009277088507899689, - "loss": 0.0156, + "loss": 0.0163, "macro_f1": 0.9452888369560242, "num_tokens": 3643527.0, "repeat_count": 4.0, - "routers_loss": 0.052486274391412735, + "routers_loss": 0.0572301521897316, "skip_count": 1.0, "step": 2260, "text_loss": 0.5593410134315491 @@ -21487,13 +21487,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041748046875, + "grad_norm": 0.050537109375, "learning_rate": 0.0009275484582504475, "loss": 0.0104, "macro_f1": 0.3333333432674408, "num_tokens": 3646959.0, "repeat_count": 0.0, - "routers_loss": 0.006877690553665161, + "routers_loss": 0.008010074496269226, "skip_count": 0.0, "step": 2262, "text_loss": 0.2128177285194397 @@ -21506,13 +21506,13 @@ "f1_execute": 0.95652174949646, "f1_repeat": 0.800000011920929, "f1_skip": 0.800000011920929, - "grad_norm": 0.05322265625, + "grad_norm": 0.05419921875, "learning_rate": 0.0009273879018735505, - "loss": 0.0136, + "loss": 0.0138, "macro_f1": 0.8521739840507507, "num_tokens": 3651298.0, "repeat_count": 3.0, - "routers_loss": 0.03128742054104805, + "routers_loss": 0.035729870200157166, "skip_count": 3.0, "step": 2264, "text_loss": 0.2987811267375946 @@ -21525,13 +21525,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1474609375, "learning_rate": 0.0009272271817208031, - "loss": 0.0188, + "loss": 0.0182, "macro_f1": 0.3333333432674408, "num_tokens": 3655609.0, "repeat_count": 0.0, - "routers_loss": 0.0028425443451851606, + "routers_loss": 0.002379779238253832, "skip_count": 0.0, "step": 2266, "text_loss": 0.6024088263511658 @@ -21544,13 +21544,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06689453125, + "grad_norm": 0.06640625, "learning_rate": 0.0009270662978537939, - "loss": 0.0101, + "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 3658444.0, "repeat_count": 0.0, - "routers_loss": 0.009712206199765205, + "routers_loss": 0.008943650871515274, "skip_count": 0.0, "step": 2268, "text_loss": 0.1741207242012024 @@ -21563,13 +21563,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0634765625, + "grad_norm": 0.053955078125, "learning_rate": 0.0009269052503341736, - "loss": 0.0162, + "loss": 0.0161, "macro_f1": 0.6595745086669922, "num_tokens": 3662282.0, "repeat_count": 1.0, - "routers_loss": 0.03980376198887825, + "routers_loss": 0.030201267451047897, "skip_count": 4.0, "step": 2270, "text_loss": 0.7300035953521729 @@ -21582,13 +21582,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.049072265625, "learning_rate": 0.0009267440392236562, - "loss": 0.0098, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 3665531.0, "repeat_count": 0.0, - "routers_loss": 0.0030603872146457434, + "routers_loss": 0.0026635683607310057, "skip_count": 0.0, "step": 2272, "text_loss": 0.31535038352012634 @@ -21601,13 +21601,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.0615234375, "learning_rate": 0.0009265826645840178, "loss": 0.0151, "macro_f1": 0.3333333432674408, "num_tokens": 3668407.0, "repeat_count": 0.0, - "routers_loss": 0.004795679822564125, + "routers_loss": 0.004258926957845688, "skip_count": 0.0, "step": 2274, "text_loss": 0.7272579073905945 @@ -21620,13 +21620,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.1435546875, + "grad_norm": 0.125, "learning_rate": 0.0009264211264770976, - "loss": 0.0155, + "loss": 0.0154, "macro_f1": 0.6122449040412903, "num_tokens": 3671503.0, "repeat_count": 0.0, - "routers_loss": 0.0340447798371315, + "routers_loss": 0.038987524807453156, "skip_count": 4.0, "step": 2276, "text_loss": 0.7488982677459717 @@ -21639,13 +21639,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.099609375, "learning_rate": 0.0009262594249647975, - "loss": 0.016, + "loss": 0.0164, "macro_f1": 0.6666666865348816, "num_tokens": 3674107.0, "repeat_count": 0.0, - "routers_loss": 0.007436402142047882, + "routers_loss": 0.007211760152131319, "skip_count": 1.0, "step": 2278, "text_loss": 0.1992369294166565 @@ -21658,13 +21658,13 @@ "f1_execute": 0.9767441749572754, "f1_repeat": 0.8571428656578064, "f1_skip": 1.0, - "grad_norm": 0.056396484375, + "grad_norm": 0.0546875, "learning_rate": 0.0009260975601090815, - "loss": 0.0113, + "loss": 0.0112, "macro_f1": 0.9446290731430054, "num_tokens": 3677184.0, "repeat_count": 4.0, - "routers_loss": 0.02465176396071911, + "routers_loss": 0.02538592554628849, "skip_count": 3.0, "step": 2280, "text_loss": 0.46402135491371155 @@ -21677,13 +21677,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07861328125, + "grad_norm": 0.0654296875, "learning_rate": 0.0009259355319719768, - "loss": 0.0167, + "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 3680683.0, "repeat_count": 0.0, - "routers_loss": 0.0037910486571490765, + "routers_loss": 0.0038464947137981653, "skip_count": 0.0, "step": 2282, "text_loss": 0.5804527401924133 @@ -21696,13 +21696,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.1611328125, "learning_rate": 0.0009257733406155726, - "loss": 0.0161, + "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 3683928.0, "repeat_count": 0.0, - "routers_loss": 0.003716849023476243, + "routers_loss": 0.004841136280447245, "skip_count": 0.0, "step": 2284, "text_loss": 0.4834538400173187 @@ -21715,13 +21715,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.050048828125, "learning_rate": 0.0009256109861020212, - "loss": 0.0118, + "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 3687101.0, "repeat_count": 0.0, - "routers_loss": 0.0021690395660698414, + "routers_loss": 0.002191900508478284, "skip_count": 0.0, "step": 2286, "text_loss": 0.8199604749679565 @@ -21734,13 +21734,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.08203125, + "grad_norm": 0.0927734375, "learning_rate": 0.000925448468493537, "loss": 0.0162, "macro_f1": 0.5427350401878357, "num_tokens": 3690490.0, "repeat_count": 1.0, - "routers_loss": 0.034040264785289764, + "routers_loss": 0.03488675877451897, "skip_count": 2.0, "step": 2288, "text_loss": 0.33263635635375977 @@ -21753,32 +21753,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.06640625, "learning_rate": 0.0009252857878523971, - "loss": 0.0133, + "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 3694109.0, "repeat_count": 1.0, - "routers_loss": 0.0027822356205433607, + "routers_loss": 0.002897309372201562, "skip_count": 0.0, "step": 2290, "text_loss": 0.47494807839393616 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 10.760786615791018, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.0634765625, + "f1_skip": 1.0, + "grad_norm": 0.05810546875, "learning_rate": 0.000925122944240941, - "loss": 0.0156, - "macro_f1": 0.5492662787437439, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, "num_tokens": 3697233.0, "repeat_count": 0.0, - "routers_loss": 0.020813947543501854, + "routers_loss": 0.01842675730586052, "skip_count": 2.0, "step": 2292, "text_loss": 0.14693495631217957 @@ -21791,13 +21791,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.042236328125, + "grad_norm": 0.045654296875, "learning_rate": 0.0009249599377215707, - "loss": 0.0145, + "loss": 0.0146, "macro_f1": 0.5866667032241821, "num_tokens": 3700376.0, "repeat_count": 1.0, - "routers_loss": 0.038725610822439194, + "routers_loss": 0.04169808700680733, "skip_count": 3.0, "step": 2294, "text_loss": 0.38051268458366394 @@ -21810,13 +21810,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059326171875, + "grad_norm": 0.05908203125, "learning_rate": 0.0009247967683567507, - "loss": 0.0117, + "loss": 0.0112, "macro_f1": 0.3272727429866791, "num_tokens": 3703212.0, "repeat_count": 0.0, - "routers_loss": 0.01360203418880701, + "routers_loss": 0.012183113023638725, "skip_count": 1.0, "step": 2296, "text_loss": 0.23789077997207642 @@ -21829,13 +21829,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0498046875, + "grad_norm": 0.05712890625, "learning_rate": 0.0009246334362090077, - "loss": 0.0135, + "loss": 0.0137, "macro_f1": 0.8823530077934265, "num_tokens": 3706490.0, "repeat_count": 1.0, - "routers_loss": 0.021909991279244423, + "routers_loss": 0.01880069635808468, "skip_count": 2.0, "step": 2298, "text_loss": 0.29067978262901306 @@ -21848,13 +21848,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.08203125, "learning_rate": 0.000924469941340931, - "loss": 0.0175, + "loss": 0.0173, "macro_f1": 0.3272727429866791, "num_tokens": 3709804.0, "repeat_count": 1.0, - "routers_loss": 0.03153124824166298, + "routers_loss": 0.027359159663319588, "skip_count": 0.0, "step": 2300, "text_loss": 0.67828369140625 @@ -21867,13 +21867,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.07275390625, "learning_rate": 0.000924306283815172, - "loss": 0.0154, + "loss": 0.0153, "macro_f1": 0.3333333432674408, "num_tokens": 3712824.0, "repeat_count": 0.0, - "routers_loss": 0.0034419491421431303, + "routers_loss": 0.003152279881760478, "skip_count": 0.0, "step": 2302, "text_loss": 0.8333184719085693 @@ -21886,13 +21886,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.061767578125, + "grad_norm": 0.0703125, "learning_rate": 0.0009241424636944445, - "loss": 0.0163, + "loss": 0.0159, "macro_f1": 0.5492662787437439, "num_tokens": 3715385.0, "repeat_count": 0.0, - "routers_loss": 0.03655214607715607, + "routers_loss": 0.0442950464785099, "skip_count": 2.0, "step": 2304, "text_loss": 0.41893699765205383 @@ -21905,13 +21905,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0576171875, + "grad_norm": 0.058837890625, "learning_rate": 0.0009239784810415249, - "loss": 0.014, + "loss": 0.0137, "macro_f1": 0.8823530077934265, "num_tokens": 3719080.0, "repeat_count": 1.0, - "routers_loss": 0.015360959805548191, + "routers_loss": 0.015729321166872978, "skip_count": 2.0, "step": 2306, "text_loss": 0.13360483944416046 @@ -21924,13 +21924,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.0537109375, + "grad_norm": 0.06787109375, "learning_rate": 0.0009238143359192514, "loss": 0.0136, "macro_f1": 0.5934640765190125, "num_tokens": 3722439.0, "repeat_count": 0.0, - "routers_loss": 0.027275927364826202, + "routers_loss": 0.028816604986786842, "skip_count": 3.0, "step": 2308, "text_loss": 0.39594101905822754 @@ -21943,13 +21943,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0546875, + "grad_norm": 0.05419921875, "learning_rate": 0.000923650028390525, - "loss": 0.0163, + "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 3725092.0, "repeat_count": 0.0, - "routers_loss": 0.003742894157767296, + "routers_loss": 0.0036455015651881695, "skip_count": 2.0, "step": 2310, "text_loss": 0.6169708371162415 @@ -21962,13 +21962,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0927734375, + "grad_norm": 0.09814453125, "learning_rate": 0.0009234855585183086, - "loss": 0.0135, + "loss": 0.014, "macro_f1": 0.6666666865348816, "num_tokens": 3728412.0, "repeat_count": 0.0, - "routers_loss": 0.009356650523841381, + "routers_loss": 0.007565604057163, "skip_count": 1.0, "step": 2312, "text_loss": 0.21257059276103973 @@ -21983,11 +21983,11 @@ "f1_skip": 0.800000011920929, "grad_norm": 0.0517578125, "learning_rate": 0.0009233209263656273, - "loss": 0.0189, + "loss": 0.0184, "macro_f1": 0.9262410998344421, "num_tokens": 3731467.0, "repeat_count": 2.0, - "routers_loss": 0.02852487564086914, + "routers_loss": 0.02510629966855049, "skip_count": 3.0, "step": 2314, "text_loss": 0.21639840304851532 @@ -22000,13 +22000,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05859375, + "grad_norm": 0.057861328125, "learning_rate": 0.0009231561319955684, - "loss": 0.0151, + "loss": 0.0154, "macro_f1": 0.3333333432674408, "num_tokens": 3734906.0, "repeat_count": 0.0, - "routers_loss": 0.007533316500484943, + "routers_loss": 0.00872227642685175, "skip_count": 0.0, "step": 2316, "text_loss": 0.35639774799346924 @@ -22019,13 +22019,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.08349609375, "learning_rate": 0.0009229911754712815, "loss": 0.0176, "macro_f1": 0.3333333432674408, "num_tokens": 3737943.0, "repeat_count": 0.0, - "routers_loss": 0.004666361026465893, + "routers_loss": 0.004695790819823742, "skip_count": 0.0, "step": 2318, "text_loss": 0.5269573330879211 @@ -22038,32 +22038,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.036376953125, "learning_rate": 0.0009228260568559781, - "loss": 0.0117, + "loss": 0.0115, "macro_f1": 0.3272727429866791, "num_tokens": 3741833.0, "repeat_count": 1.0, - "routers_loss": 0.020992714911699295, + "routers_loss": 0.0217357836663723, "skip_count": 0.0, "step": 2320, "text_loss": 0.5110208988189697 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 10.901673026122689, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.1416015625, + "f1_skip": 0.0, + "grad_norm": 0.1953125, "learning_rate": 0.0009226607762129322, - "loss": 0.0204, - "macro_f1": 0.6603773832321167, + "loss": 0.0201, + "macro_f1": 0.32098764181137085, "num_tokens": 3744642.0, "repeat_count": 1.0, - "routers_loss": 0.047016773372888565, + "routers_loss": 0.05595960095524788, "skip_count": 1.0, "step": 2322, "text_loss": 0.6291998624801636 @@ -22078,11 +22078,11 @@ "f1_skip": 0.0, "grad_norm": 0.056884765625, "learning_rate": 0.0009224953336054796, - "loss": 0.0156, + "loss": 0.0161, "macro_f1": 0.3333333432674408, "num_tokens": 3748127.0, "repeat_count": 0.0, - "routers_loss": 0.006612313445657492, + "routers_loss": 0.0071634589694440365, "skip_count": 0.0, "step": 2324, "text_loss": 0.7404762506484985 @@ -22095,13 +22095,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.050537109375, "learning_rate": 0.000922329729097018, - "loss": 0.0164, + "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 3751373.0, "repeat_count": 0.0, - "routers_loss": 0.0012452995870262384, + "routers_loss": 0.0011676300782710314, "skip_count": 0.0, "step": 2326, "text_loss": 0.2915459871292114 @@ -22114,13 +22114,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.055908203125, + "grad_norm": 0.061279296875, "learning_rate": 0.0009221639627510075, - "loss": 0.0128, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 3754518.0, "repeat_count": 0.0, - "routers_loss": 0.011379311792552471, + "routers_loss": 0.01039792038500309, "skip_count": 0.0, "step": 2328, "text_loss": 0.22066321969032288 @@ -22133,13 +22133,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0751953125, "learning_rate": 0.0009219980346309702, - "loss": 0.0127, + "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 3757621.0, "repeat_count": 0.0, - "routers_loss": 0.002973968628793955, + "routers_loss": 0.0032070958986878395, "skip_count": 0.0, "step": 2330, "text_loss": 0.5558560490608215 @@ -22152,13 +22152,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.076171875, "learning_rate": 0.0009218319448004899, - "loss": 0.012, + "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 3760885.0, "repeat_count": 0.0, - "routers_loss": 0.00768645154312253, + "routers_loss": 0.007085457909852266, "skip_count": 0.0, "step": 2332, "text_loss": 0.4348253607749939 @@ -22171,13 +22171,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1015625, + "grad_norm": 0.1103515625, "learning_rate": 0.0009216656933232129, - "loss": 0.0167, + "loss": 0.016, "macro_f1": 0.6666666865348816, "num_tokens": 3764462.0, "repeat_count": 0.0, - "routers_loss": 0.006761785596609116, + "routers_loss": 0.005504854489117861, "skip_count": 1.0, "step": 2334, "text_loss": 0.35828644037246704 @@ -22190,13 +22190,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0576171875, + "grad_norm": 0.05615234375, "learning_rate": 0.0009214992802628463, - "loss": 0.0129, + "loss": 0.0131, "macro_f1": 0.3333333432674408, "num_tokens": 3767159.0, "repeat_count": 0.0, - "routers_loss": 0.0013711688807234168, + "routers_loss": 0.0013970810687169433, "skip_count": 0.0, "step": 2336, "text_loss": 0.2956557869911194 @@ -22209,13 +22209,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.08203125, "learning_rate": 0.0009213327056831607, - "loss": 0.0174, + "loss": 0.0181, "macro_f1": 0.3272727429866791, "num_tokens": 3770408.0, "repeat_count": 0.0, - "routers_loss": 0.04009406641125679, + "routers_loss": 0.0427570566534996, "skip_count": 1.0, "step": 2338, "text_loss": 0.14883014559745789 @@ -22228,13 +22228,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04443359375, + "grad_norm": 0.041015625, "learning_rate": 0.0009211659696479875, - "loss": 0.0095, + "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 3773474.0, "repeat_count": 0.0, - "routers_loss": 0.0013272224459797144, + "routers_loss": 0.0011273405980318785, "skip_count": 0.0, "step": 2340, "text_loss": 0.26011669635772705 @@ -22249,11 +22249,11 @@ "f1_skip": 0.0, "grad_norm": 0.059814453125, "learning_rate": 0.00092099907222122, - "loss": 0.0145, + "loss": 0.0148, "macro_f1": 0.3333333432674408, "num_tokens": 3776909.0, "repeat_count": 0.0, - "routers_loss": 0.001724833040498197, + "routers_loss": 0.0016178421210497618, "skip_count": 0.0, "step": 2342, "text_loss": 0.49078530073165894 @@ -22266,13 +22266,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05908203125, + "grad_norm": 0.051025390625, "learning_rate": 0.000920832013466814, - "loss": 0.0132, + "loss": 0.0129, "macro_f1": 0.3333333432674408, "num_tokens": 3780741.0, "repeat_count": 0.0, - "routers_loss": 0.005641496740281582, + "routers_loss": 0.005510095041245222, "skip_count": 0.0, "step": 2344, "text_loss": 0.4870249927043915 @@ -22285,13 +22285,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.037109375, "learning_rate": 0.0009206647934487866, - "loss": 0.011, + "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 3784673.0, "repeat_count": 1.0, - "routers_loss": 0.003907595761120319, + "routers_loss": 0.0047357892617583275, "skip_count": 0.0, "step": 2346, "text_loss": 0.3251725733280182 @@ -22304,13 +22304,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.057861328125, + "grad_norm": 0.05615234375, "learning_rate": 0.0009204974122312167, - "loss": 0.0141, + "loss": 0.0142, "macro_f1": 0.6666666865348816, "num_tokens": 3787503.0, "repeat_count": 0.0, - "routers_loss": 0.007570050656795502, + "routers_loss": 0.00795028731226921, "skip_count": 1.0, "step": 2348, "text_loss": 0.18282145261764526 @@ -22323,13 +22323,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.055908203125, + "grad_norm": 0.060546875, "learning_rate": 0.0009203298698782452, - "loss": 0.0079, + "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 3790528.0, "repeat_count": 1.0, - "routers_loss": 0.0009280897793360054, + "routers_loss": 0.0009506374481134117, "skip_count": 0.0, "step": 2350, "text_loss": 0.4093080461025238 @@ -22342,13 +22342,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.045166015625, + "grad_norm": 0.047607421875, "learning_rate": 0.0009201621664540747, "loss": 0.0155, "macro_f1": 0.6666666865348816, "num_tokens": 3794134.0, "repeat_count": 1.0, - "routers_loss": 0.005288597662001848, + "routers_loss": 0.005159572698175907, "skip_count": 0.0, "step": 2352, "text_loss": 0.5451981425285339 @@ -22361,13 +22361,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.07666015625, "learning_rate": 0.0009199943020229694, - "loss": 0.0146, + "loss": 0.0148, "macro_f1": 0.3333333432674408, "num_tokens": 3797414.0, "repeat_count": 0.0, - "routers_loss": 0.002237799344584346, + "routers_loss": 0.002356168581172824, "skip_count": 0.0, "step": 2354, "text_loss": 0.3070453405380249 @@ -22380,13 +22380,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.0810546875, "learning_rate": 0.0009198262766492554, - "loss": 0.0144, + "loss": 0.0141, "macro_f1": 0.6666666865348816, "num_tokens": 3800094.0, "repeat_count": 0.0, - "routers_loss": 0.006226782687008381, + "routers_loss": 0.0051761893555521965, "skip_count": 1.0, "step": 2356, "text_loss": 0.5880904197692871 @@ -22399,13 +22399,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.049072265625, + "grad_norm": 0.049560546875, "learning_rate": 0.00091965809039732, - "loss": 0.0136, + "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3803280.0, "repeat_count": 0.0, - "routers_loss": 0.0027645498048514128, + "routers_loss": 0.0025952060241252184, "skip_count": 0.0, "step": 2358, "text_loss": 0.5210731625556946 @@ -22418,13 +22418,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.06787109375, "learning_rate": 0.0009194897433316127, - "loss": 0.0122, + "loss": 0.0125, "macro_f1": 0.6666666865348816, "num_tokens": 3805866.0, "repeat_count": 0.0, - "routers_loss": 0.0034913592971861362, + "routers_loss": 0.0042560105212032795, "skip_count": 2.0, "step": 2360, "text_loss": 0.6472984552383423 @@ -22437,13 +22437,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08056640625, + "grad_norm": 0.07568359375, "learning_rate": 0.0009193212355166446, - "loss": 0.0112, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3808952.0, "repeat_count": 0.0, - "routers_loss": 0.002706601284444332, + "routers_loss": 0.0026232977397739887, "skip_count": 0.0, "step": 2362, "text_loss": 0.450063556432724 @@ -22456,13 +22456,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.06689453125, "learning_rate": 0.0009191525670169881, - "loss": 0.0108, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3812080.0, "repeat_count": 0.0, - "routers_loss": 0.0032696903217583895, + "routers_loss": 0.0034355956595391035, "skip_count": 0.0, "step": 2364, "text_loss": 0.49727216362953186 @@ -22475,13 +22475,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.061767578125, + "grad_norm": 0.05908203125, "learning_rate": 0.000918983737897277, - "loss": 0.0115, + "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 3815282.0, "repeat_count": 0.0, - "routers_loss": 0.006245410069823265, + "routers_loss": 0.0055653867311775684, "skip_count": 1.0, "step": 2366, "text_loss": 0.6336377859115601 @@ -22496,11 +22496,11 @@ "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0009188147482222071, - "loss": 0.0079, + "loss": 0.008, "macro_f1": 1.0, "num_tokens": 3818106.0, "repeat_count": 2.0, - "routers_loss": 0.011230813339352608, + "routers_loss": 0.011016021482646465, "skip_count": 2.0, "step": 2368, "text_loss": 0.22513329982757568 @@ -22515,11 +22515,11 @@ "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.0009186455980565358, - "loss": 0.0109, + "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 3821228.0, "repeat_count": 1.0, - "routers_loss": 0.014897257089614868, + "routers_loss": 0.014039464294910431, "skip_count": 0.0, "step": 2370, "text_loss": 0.21331638097763062 @@ -22532,13 +22532,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.062255859375, "learning_rate": 0.0009184762874650816, - "loss": 0.0131, + "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 3825048.0, "repeat_count": 0.0, - "routers_loss": 0.0015503648901358247, + "routers_loss": 0.001088051125407219, "skip_count": 0.0, "step": 2372, "text_loss": 0.6031543612480164 @@ -22551,13 +22551,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.091796875, + "grad_norm": 0.095703125, "learning_rate": 0.0009183068165127245, - "loss": 0.0127, + "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 3828781.0, "repeat_count": 0.0, - "routers_loss": 0.00723480898886919, + "routers_loss": 0.006263940595090389, "skip_count": 1.0, "step": 2374, "text_loss": 0.6249601244926453 @@ -22570,13 +22570,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.076171875, + "grad_norm": 0.06982421875, "learning_rate": 0.0009181371852644062, - "loss": 0.0139, + "loss": 0.0133, "macro_f1": 0.6666666865348816, "num_tokens": 3832507.0, "repeat_count": 1.0, - "routers_loss": 0.002053398173302412, + "routers_loss": 0.001987969037145376, "skip_count": 0.0, "step": 2376, "text_loss": 0.37972065806388855 @@ -22589,32 +22589,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06689453125, + "grad_norm": 0.0908203125, "learning_rate": 0.0009179673937851299, "loss": 0.0158, "macro_f1": 0.6666666865348816, "num_tokens": 3835644.0, "repeat_count": 0.0, - "routers_loss": 0.007927518337965012, + "routers_loss": 0.007635094691067934, "skip_count": 1.0, "step": 2378, "text_loss": 0.46319663524627686 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 11.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.06298828125, + "f1_skip": 1.0, + "grad_norm": 0.0830078125, "learning_rate": 0.0009177974421399598, - "loss": 0.0144, - "macro_f1": 0.5555555820465088, + "loss": 0.0137, + "macro_f1": 0.6666666865348816, "num_tokens": 3838700.0, "repeat_count": 0.0, - "routers_loss": 0.01924682781100273, + "routers_loss": 0.01617279462516308, "skip_count": 2.0, "step": 2380, "text_loss": 0.32141056656837463 @@ -22627,13 +22627,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.046875, + "grad_norm": 0.056396484375, "learning_rate": 0.0009176273303940217, - "loss": 0.0106, + "loss": 0.011, "macro_f1": 0.6666666865348816, "num_tokens": 3841953.0, "repeat_count": 0.0, - "routers_loss": 0.0021689811255782843, + "routers_loss": 0.0022273799404501915, "skip_count": 2.0, "step": 2382, "text_loss": 0.5908139944076538 @@ -22646,13 +22646,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.062255859375, + "grad_norm": 0.0615234375, "learning_rate": 0.0009174570586125026, - "loss": 0.0119, + "loss": 0.0122, "macro_f1": 0.32098767161369324, "num_tokens": 3845763.0, "repeat_count": 1.0, - "routers_loss": 0.03431013971567154, + "routers_loss": 0.030915161594748497, "skip_count": 0.0, "step": 2384, "text_loss": 0.41400137543678284 @@ -22665,13 +22665,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.043212890625, + "grad_norm": 0.04248046875, "learning_rate": 0.0009172866268606513, - "loss": 0.0123, + "loss": 0.0122, "macro_f1": 0.6666666865348816, "num_tokens": 3848984.0, "repeat_count": 0.0, - "routers_loss": 0.008275258354842663, + "routers_loss": 0.010480951517820358, "skip_count": 2.0, "step": 2386, "text_loss": 0.2560874819755554 @@ -22684,13 +22684,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04736328125, + "grad_norm": 0.056396484375, "learning_rate": 0.0009171160352037775, - "loss": 0.0121, + "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 3852118.0, "repeat_count": 0.0, - "routers_loss": 0.007780806161463261, + "routers_loss": 0.00809961836785078, "skip_count": 1.0, "step": 2388, "text_loss": 0.28236693143844604 @@ -22709,7 +22709,7 @@ "macro_f1": 1.0, "num_tokens": 3855314.0, "repeat_count": 1.0, - "routers_loss": 0.00553786288946867, + "routers_loss": 0.005569872446358204, "skip_count": 1.0, "step": 2390, "text_loss": 0.4578137695789337 @@ -22722,13 +22722,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08447265625, + "grad_norm": 0.1123046875, "learning_rate": 0.0009167743724365073, - "loss": 0.01, + "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 3858301.0, "repeat_count": 0.0, - "routers_loss": 0.004066115710884333, + "routers_loss": 0.0038610948249697685, "skip_count": 1.0, "step": 2392, "text_loss": 0.14082716405391693 @@ -22741,13 +22741,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0810546875, + "grad_norm": 0.1376953125, "learning_rate": 0.0009166033014570368, - "loss": 0.0104, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3861296.0, "repeat_count": 0.0, - "routers_loss": 0.002403446938842535, + "routers_loss": 0.0017607157351449132, "skip_count": 0.0, "step": 2394, "text_loss": 0.384442001581192 @@ -22760,13 +22760,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.054443359375, + "grad_norm": 0.051025390625, "learning_rate": 0.0009164320708343954, - "loss": 0.0137, + "loss": 0.0131, "macro_f1": 0.6666666865348816, "num_tokens": 3863985.0, "repeat_count": 2.0, - "routers_loss": 0.010212135501205921, + "routers_loss": 0.009627950377762318, "skip_count": 0.0, "step": 2396, "text_loss": 0.6969521045684814 @@ -22779,13 +22779,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07275390625, + "grad_norm": 0.07666015625, "learning_rate": 0.0009162606806341989, "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 3866636.0, "repeat_count": 0.0, - "routers_loss": 0.007781816180795431, + "routers_loss": 0.006915586534887552, "skip_count": 0.0, "step": 2398, "text_loss": 0.48069697618484497 @@ -22798,32 +22798,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.04248046875, "learning_rate": 0.0009160891309221242, - "loss": 0.0151, + "loss": 0.0149, "macro_f1": 0.6666666865348816, "num_tokens": 3870867.0, "repeat_count": 1.0, - "routers_loss": 0.0016227158484980464, + "routers_loss": 0.0013031222624704242, "skip_count": 0.0, "step": 2400, "text_loss": 0.3882075846195221 }, { "acc_repeat": 0.5, - "acc_skip": 1.0, - "avg_layers": 28.0, + "acc_skip": 0.0, + "avg_layers": 29.0, "epoch": 11.277076606985618, - "f1_execute": 0.9803921580314636, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.6666666865348816, - "f1_skip": 1.0, - "grad_norm": 0.06298828125, + "f1_skip": 0.0, + "grad_norm": 0.06640625, "learning_rate": 0.0009159174217639096, - "loss": 0.0114, - "macro_f1": 0.8823530077934265, + "loss": 0.0112, + "macro_f1": 0.5427350401878357, "num_tokens": 3873663.0, "repeat_count": 2.0, - "routers_loss": 0.06490851938724518, + "routers_loss": 0.06621067970991135, "skip_count": 1.0, "step": 2402, "text_loss": 0.5740041136741638 @@ -22836,13 +22836,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.03662109375, "learning_rate": 0.0009157455532253547, - "loss": 0.0075, + "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 3876788.0, "repeat_count": 1.0, - "routers_loss": 0.007105287164449692, + "routers_loss": 0.005957918707281351, "skip_count": 0.0, "step": 2404, "text_loss": 0.26025933027267456 @@ -22855,13 +22855,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.06787109375, + "grad_norm": 0.08642578125, "learning_rate": 0.0009155735253723191, - "loss": 0.0125, + "loss": 0.0126, "macro_f1": 0.9452888369560242, "num_tokens": 3879942.0, "repeat_count": 1.0, - "routers_loss": 0.03736003860831261, + "routers_loss": 0.039429809898138046, "skip_count": 4.0, "step": 2406, "text_loss": 1.1349908113479614 @@ -22874,13 +22874,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.047607421875, "learning_rate": 0.0009154013382707251, - "loss": 0.011, + "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 3882682.0, "repeat_count": 0.0, - "routers_loss": 0.0012925176415592432, + "routers_loss": 0.0012570557883009315, "skip_count": 0.0, "step": 2408, "text_loss": 0.5611135363578796 @@ -22895,11 +22895,11 @@ "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0009152289919865543, - "loss": 0.0124, + "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3886425.0, "repeat_count": 0.0, - "routers_loss": 0.001746711554005742, + "routers_loss": 0.0017455556662753224, "skip_count": 0.0, "step": 2410, "text_loss": 0.7523751854896545 @@ -22912,13 +22912,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04345703125, + "grad_norm": 0.04052734375, "learning_rate": 0.0009150564865858506, - "loss": 0.0112, + "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 3889273.0, "repeat_count": 0.0, - "routers_loss": 0.011005193926393986, + "routers_loss": 0.011178011074662209, "skip_count": 1.0, "step": 2412, "text_loss": 0.26942551136016846 @@ -22931,13 +22931,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.800000011920929, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.07373046875, "learning_rate": 0.0009148838221347182, - "loss": 0.0102, + "loss": 0.0107, "macro_f1": 0.5934640765190125, "num_tokens": 3892199.0, "repeat_count": 3.0, - "routers_loss": 0.017795369029045105, + "routers_loss": 0.019628092646598816, "skip_count": 0.0, "step": 2414, "text_loss": 0.5492315888404846 @@ -22950,13 +22950,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.04541015625, "learning_rate": 0.0009147109986993225, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 3895362.0, "repeat_count": 1.0, - "routers_loss": 0.011693861335515976, + "routers_loss": 0.012255983427166939, "skip_count": 0.0, "step": 2416, "text_loss": 0.23798216879367828 @@ -22969,13 +22969,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.11669921875, "learning_rate": 0.0009145380163458899, - "loss": 0.0177, + "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 3898476.0, "repeat_count": 0.0, - "routers_loss": 0.007135285064578056, + "routers_loss": 0.007018954027444124, "skip_count": 0.0, "step": 2418, "text_loss": 0.1923145055770874 @@ -22988,13 +22988,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03515625, + "grad_norm": 0.03369140625, "learning_rate": 0.0009143648751407074, - "loss": 0.0082, + "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 3901817.0, "repeat_count": 0.0, - "routers_loss": 0.0008607010240666568, + "routers_loss": 0.0008574824314564466, "skip_count": 0.0, "step": 2420, "text_loss": 0.4001806974411011 @@ -23007,13 +23007,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.07861328125, + "grad_norm": 0.11328125, "learning_rate": 0.0009141915751501231, - "loss": 0.0101, + "loss": 0.0102, "macro_f1": 0.5492662787437439, "num_tokens": 3905461.0, "repeat_count": 0.0, - "routers_loss": 0.015359465964138508, + "routers_loss": 0.01572350226342678, "skip_count": 2.0, "step": 2422, "text_loss": 0.19519129395484924 @@ -23026,13 +23026,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0380859375, + "grad_norm": 0.037353515625, "learning_rate": 0.0009140181164405458, - "loss": 0.011, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3908878.0, "repeat_count": 0.0, - "routers_loss": 0.00047823251225054264, + "routers_loss": 0.0005503420252352953, "skip_count": 0.0, "step": 2424, "text_loss": 0.6937088370323181 @@ -23047,11 +23047,11 @@ "f1_skip": 0.0, "grad_norm": 0.068359375, "learning_rate": 0.0009138444990784454, - "loss": 0.0129, + "loss": 0.013, "macro_f1": 0.3333333432674408, "num_tokens": 3912053.0, "repeat_count": 0.0, - "routers_loss": 0.0070601715706288815, + "routers_loss": 0.007556677330285311, "skip_count": 0.0, "step": 2426, "text_loss": 0.35431069135665894 @@ -23064,13 +23064,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0947265625, + "grad_norm": 0.06201171875, "learning_rate": 0.000913670723130352, - "loss": 0.0123, + "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 3915192.0, "repeat_count": 0.0, - "routers_loss": 0.0010537977796047926, + "routers_loss": 0.0013609991874545813, "skip_count": 0.0, "step": 2428, "text_loss": 0.5171207189559937 @@ -23083,13 +23083,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0517578125, + "grad_norm": 0.050048828125, "learning_rate": 0.0009134967886628573, - "loss": 0.0117, + "loss": 0.0115, "macro_f1": 1.0, "num_tokens": 3917927.0, "repeat_count": 2.0, - "routers_loss": 0.012852456420660019, + "routers_loss": 0.010895746760070324, "skip_count": 2.0, "step": 2430, "text_loss": 0.2852934002876282 @@ -23102,13 +23102,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.062255859375, "learning_rate": 0.0009133226957426133, - "loss": 0.0134, + "loss": 0.0132, "macro_f1": 0.5492662787437439, "num_tokens": 3921460.0, "repeat_count": 2.0, - "routers_loss": 0.05307198315858841, + "routers_loss": 0.04196908697485924, "skip_count": 0.0, "step": 2432, "text_loss": 0.4864770770072937 @@ -23121,13 +23121,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1015625, + "grad_norm": 0.1025390625, "learning_rate": 0.0009131484444363324, - "loss": 0.0154, + "loss": 0.0155, "macro_f1": 0.3333333432674408, "num_tokens": 3924662.0, "repeat_count": 0.0, - "routers_loss": 0.004656757228076458, + "routers_loss": 0.004484197124838829, "skip_count": 0.0, "step": 2434, "text_loss": 0.7568684220314026 @@ -23140,13 +23140,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0498046875, + "grad_norm": 0.05078125, "learning_rate": 0.0009129740348107882, - "loss": 0.0113, + "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 3927337.0, "repeat_count": 0.0, - "routers_loss": 0.0042406003922224045, + "routers_loss": 0.004351360257714987, "skip_count": 2.0, "step": 2436, "text_loss": 0.5953161716461182 @@ -23159,13 +23159,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.0517578125, + "grad_norm": 0.04736328125, "learning_rate": 0.0009127994669328151, - "loss": 0.0089, + "loss": 0.0085, "macro_f1": 0.6122449040412903, "num_tokens": 3930407.0, "repeat_count": 0.0, - "routers_loss": 0.018079286441206932, + "routers_loss": 0.01664198748767376, "skip_count": 4.0, "step": 2438, "text_loss": 0.5320524573326111 @@ -23178,13 +23178,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.0595703125, "learning_rate": 0.0009126247408693071, - "loss": 0.0072, + "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 3933184.0, "repeat_count": 0.0, - "routers_loss": 0.002266801195219159, + "routers_loss": 0.0017819046042859554, "skip_count": 1.0, "step": 2440, "text_loss": 0.6051273345947266 @@ -23197,13 +23197,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.06640625, "learning_rate": 0.0009124498566872204, - "loss": 0.01, + "loss": 0.0105, "macro_f1": 0.3333333432674408, "num_tokens": 3936620.0, "repeat_count": 0.0, - "routers_loss": 0.005790423136204481, + "routers_loss": 0.005519696045666933, "skip_count": 0.0, "step": 2442, "text_loss": 0.12987950444221497 @@ -23216,13 +23216,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052734375, + "grad_norm": 0.052490234375, "learning_rate": 0.0009122748144535704, - "loss": 0.011, + "loss": 0.0111, "macro_f1": 0.32098764181137085, "num_tokens": 3940010.0, "repeat_count": 0.0, - "routers_loss": 0.04591076448559761, + "routers_loss": 0.04543351009488106, "skip_count": 2.0, "step": 2444, "text_loss": 0.4642033576965332 @@ -23235,13 +23235,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.04296875, "learning_rate": 0.0009120996142354338, - "loss": 0.0122, + "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 3943135.0, "repeat_count": 0.0, - "routers_loss": 0.004969341680407524, + "routers_loss": 0.00550565542653203, "skip_count": 0.0, "step": 2446, "text_loss": 0.5697627067565918 @@ -23254,13 +23254,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05615234375, + "grad_norm": 0.05029296875, "learning_rate": 0.0009119242560999477, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3946650.0, "repeat_count": 0.0, - "routers_loss": 0.00830315612256527, + "routers_loss": 0.008842485956847668, "skip_count": 0.0, "step": 2448, "text_loss": 0.17046524584293365 @@ -23273,13 +23273,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.078125, + "grad_norm": 0.08154296875, "learning_rate": 0.0009117487401143095, "loss": 0.0154, "macro_f1": 0.6666666865348816, "num_tokens": 3949470.0, "repeat_count": 1.0, - "routers_loss": 0.0059144929982721806, + "routers_loss": 0.005900127813220024, "skip_count": 0.0, "step": 2450, "text_loss": 0.37260866165161133 @@ -23292,13 +23292,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.030029296875, + "grad_norm": 0.035400390625, "learning_rate": 0.0009115730663457773, - "loss": 0.0132, + "loss": 0.0137, "macro_f1": 1.0, "num_tokens": 3952546.0, "repeat_count": 1.0, - "routers_loss": 0.0029762545600533485, + "routers_loss": 0.003409258322790265, "skip_count": 1.0, "step": 2452, "text_loss": 0.5308008193969727 @@ -23311,13 +23311,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.052001953125, + "grad_norm": 0.05224609375, "learning_rate": 0.0009113972348616698, - "loss": 0.0091, + "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 3955817.0, "repeat_count": 0.0, - "routers_loss": 0.011962058953940868, + "routers_loss": 0.010098597034811974, "skip_count": 1.0, "step": 2454, "text_loss": 0.39226648211479187 @@ -23330,13 +23330,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.1640625, "learning_rate": 0.0009112212457293658, - "loss": 0.0101, + "loss": 0.0102, "macro_f1": 0.3272727429866791, "num_tokens": 3958911.0, "repeat_count": 0.0, - "routers_loss": 0.07289884239435196, + "routers_loss": 0.08184818178415298, "skip_count": 0.0, "step": 2456, "text_loss": 0.45411455631256104 @@ -23349,13 +23349,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.041259765625, "learning_rate": 0.0009110450990163047, - "loss": 0.0124, + "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 3962584.0, "repeat_count": 0.0, - "routers_loss": 0.0009638209594413638, + "routers_loss": 0.0009352223132736981, "skip_count": 0.0, "step": 2458, "text_loss": 0.47292324900627136 @@ -23368,13 +23368,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0400390625, + "grad_norm": 0.041748046875, "learning_rate": 0.0009108687947899863, - "loss": 0.0078, + "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 3965597.0, "repeat_count": 1.0, - "routers_loss": 0.008587516844272614, + "routers_loss": 0.008150188252329826, "skip_count": 2.0, "step": 2460, "text_loss": 0.33208340406417847 @@ -23387,13 +23387,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.04150390625, + "grad_norm": 0.043212890625, "learning_rate": 0.0009106923331179707, - "loss": 0.0126, + "loss": 0.0125, "macro_f1": 0.5492662787437439, "num_tokens": 3968664.0, "repeat_count": 0.0, - "routers_loss": 0.05080332234501839, + "routers_loss": 0.050999004393815994, "skip_count": 2.0, "step": 2462, "text_loss": 0.2459995150566101 @@ -23406,13 +23406,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07080078125, + "grad_norm": 0.0693359375, "learning_rate": 0.0009105157140678782, - "loss": 0.0124, + "loss": 0.0126, "macro_f1": 0.6666666865348816, "num_tokens": 3971772.0, "repeat_count": 0.0, - "routers_loss": 0.007348654326051474, + "routers_loss": 0.006196586415171623, "skip_count": 1.0, "step": 2464, "text_loss": 0.23956991732120514 @@ -23425,13 +23425,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06787109375, + "grad_norm": 0.062255859375, "learning_rate": 0.0009103389377073896, - "loss": 0.0099, + "loss": 0.01, "macro_f1": 0.3333333432674408, "num_tokens": 3976224.0, "repeat_count": 0.0, - "routers_loss": 0.007161752786487341, + "routers_loss": 0.008181816898286343, "skip_count": 0.0, "step": 2466, "text_loss": 0.3235875070095062 @@ -23444,13 +23444,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.057373046875, "learning_rate": 0.0009101620041042462, - "loss": 0.0119, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3978876.0, "repeat_count": 0.0, - "routers_loss": 0.0015090530505403876, + "routers_loss": 0.0015451472718268633, "skip_count": 0.0, "step": 2468, "text_loss": 0.4038759469985962 @@ -23463,13 +23463,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07275390625, + "grad_norm": 0.09130859375, "learning_rate": 0.000909984913326249, - "loss": 0.0129, + "loss": 0.0131, "macro_f1": 0.3272727429866791, "num_tokens": 3981992.0, "repeat_count": 0.0, - "routers_loss": 0.021420184522867203, + "routers_loss": 0.021785033866763115, "skip_count": 1.0, "step": 2470, "text_loss": 0.6346460580825806 @@ -23482,13 +23482,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.0712890625, "learning_rate": 0.0009098076654412595, - "loss": 0.0092, + "loss": 0.0094, "macro_f1": 0.3333333432674408, "num_tokens": 3984560.0, "repeat_count": 0.0, - "routers_loss": 0.0010742908343672752, + "routers_loss": 0.0011462471447885036, "skip_count": 0.0, "step": 2472, "text_loss": 0.3449646532535553 @@ -23501,13 +23501,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05078125, + "grad_norm": 0.049560546875, "learning_rate": 0.0009096302605171996, - "loss": 0.011, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 3987548.0, "repeat_count": 0.0, - "routers_loss": 0.0015209210105240345, + "routers_loss": 0.0014367027906700969, "skip_count": 0.0, "step": 2474, "text_loss": 0.5918350219726562 @@ -23520,13 +23520,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044921875, + "grad_norm": 0.0478515625, "learning_rate": 0.0009094526986220513, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 3990727.0, "repeat_count": 0.0, - "routers_loss": 0.0008761848439462483, + "routers_loss": 0.0008977655088528991, "skip_count": 0.0, "step": 2476, "text_loss": 0.463350385427475 @@ -23539,13 +23539,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.049072265625, "learning_rate": 0.0009092749798238563, - "loss": 0.0146, + "loss": 0.015, "macro_f1": 0.3272727429866791, "num_tokens": 3993757.0, "repeat_count": 1.0, - "routers_loss": 0.01623794063925743, + "routers_loss": 0.016712551936507225, "skip_count": 0.0, "step": 2478, "text_loss": 0.5621229410171509 @@ -23558,13 +23558,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07080078125, + "grad_norm": 0.06640625, "learning_rate": 0.000909097104190717, - "loss": 0.0174, + "loss": 0.0172, "macro_f1": 0.32098764181137085, "num_tokens": 3997259.0, "repeat_count": 0.0, - "routers_loss": 0.04170118644833565, + "routers_loss": 0.04134179651737213, "skip_count": 2.0, "step": 2480, "text_loss": 0.375476598739624 @@ -23577,32 +23577,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.046875, + "grad_norm": 0.044677734375, "learning_rate": 0.0009089190717907956, - "loss": 0.0116, + "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 4000563.0, "repeat_count": 0.0, - "routers_loss": 0.003591755870729685, + "routers_loss": 0.003462378401309252, "skip_count": 0.0, "step": 2482, "text_loss": 0.5553798675537109 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 11.66216612855885, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.0693359375, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, "learning_rate": 0.0009087408826923146, - "loss": 0.0185, - "macro_f1": 0.5492662787437439, + "loss": 0.0182, + "macro_f1": 0.6666666865348816, "num_tokens": 4004065.0, "repeat_count": 0.0, - "routers_loss": 0.009214848279953003, + "routers_loss": 0.008057428523898125, "skip_count": 2.0, "step": 2484, "text_loss": 0.4329465329647064 @@ -23615,13 +23615,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.050048828125, "learning_rate": 0.0009085625369635564, - "loss": 0.0111, + "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 4007119.0, "repeat_count": 0.0, - "routers_loss": 0.0059350160881876945, + "routers_loss": 0.005759050603955984, "skip_count": 0.0, "step": 2486, "text_loss": 0.501268744468689 @@ -23634,13 +23634,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10693359375, + "grad_norm": 0.1240234375, "learning_rate": 0.0009083840346728631, - "loss": 0.0118, + "loss": 0.0122, "macro_f1": 0.3272727429866791, "num_tokens": 4010547.0, "repeat_count": 1.0, - "routers_loss": 0.019803427159786224, + "routers_loss": 0.020763102918863297, "skip_count": 0.0, "step": 2488, "text_loss": 0.480196475982666 @@ -23653,13 +23653,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.058349609375, + "grad_norm": 0.05078125, "learning_rate": 0.0009082053758886374, - "loss": 0.0118, + "loss": 0.0117, "macro_f1": 0.6666666865348816, "num_tokens": 4014600.0, "repeat_count": 0.0, - "routers_loss": 0.006243673153221607, + "routers_loss": 0.005801836494356394, "skip_count": 1.0, "step": 2490, "text_loss": 0.18249782919883728 @@ -23672,13 +23672,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.068359375, + "grad_norm": 0.062255859375, "learning_rate": 0.0009080265606793416, - "loss": 0.0132, + "loss": 0.0128, "macro_f1": 1.0, "num_tokens": 4017964.0, "repeat_count": 1.0, - "routers_loss": 0.003960726782679558, + "routers_loss": 0.004226063843816519, "skip_count": 1.0, "step": 2492, "text_loss": 0.6573076248168945 @@ -23691,13 +23691,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0537109375, + "grad_norm": 0.049072265625, "learning_rate": 0.000907847589113498, - "loss": 0.0127, + "loss": 0.0125, "macro_f1": 0.6666666865348816, "num_tokens": 4020694.0, "repeat_count": 0.0, - "routers_loss": 0.004959117621183395, + "routers_loss": 0.004281101748347282, "skip_count": 2.0, "step": 2494, "text_loss": 0.3944586217403412 @@ -23710,13 +23710,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0634765625, + "grad_norm": 0.061279296875, "learning_rate": 0.000907668461259689, - "loss": 0.0157, + "loss": 0.0152, "macro_f1": 0.6666666865348816, "num_tokens": 4023757.0, "repeat_count": 0.0, - "routers_loss": 0.009721433743834496, + "routers_loss": 0.008786370046436787, "skip_count": 1.0, "step": 2496, "text_loss": 0.6452898979187012 @@ -23729,13 +23729,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.0693359375, "learning_rate": 0.0009074891771865566, - "loss": 0.0124, + "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 4026601.0, "repeat_count": 0.0, - "routers_loss": 0.00491701066493988, + "routers_loss": 0.005209595896303654, "skip_count": 0.0, "step": 2498, "text_loss": 0.9633619785308838 @@ -23748,13 +23748,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.03759765625, "learning_rate": 0.0009073097369628028, - "loss": 0.0131, + "loss": 0.013, "macro_f1": 1.0, "num_tokens": 4030321.0, "repeat_count": 3.0, - "routers_loss": 0.009832080453634262, + "routers_loss": 0.00860709697008133, "skip_count": 1.0, "step": 2500, "text_loss": 0.48566827178001404 @@ -23767,13 +23767,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047607421875, + "grad_norm": 0.04443359375, "learning_rate": 0.0009071301406571893, - "loss": 0.0137, + "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 4033234.0, "repeat_count": 0.0, - "routers_loss": 0.003301833290606737, + "routers_loss": 0.0035277456045150757, "skip_count": 0.0, "step": 2502, "text_loss": 0.3771554231643677 @@ -23786,13 +23786,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.044189453125, "learning_rate": 0.000906950388338538, - "loss": 0.0134, + "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 4036417.0, "repeat_count": 0.0, - "routers_loss": 0.001580960932187736, + "routers_loss": 0.0013424850767478347, "skip_count": 0.0, "step": 2504, "text_loss": 0.8962806463241577 @@ -23805,13 +23805,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.09912109375, "learning_rate": 0.0009067704800757301, - "loss": 0.0091, + "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4039564.0, "repeat_count": 0.0, - "routers_loss": 0.0011505817528814077, + "routers_loss": 0.0010423909407109022, "skip_count": 0.0, "step": 2506, "text_loss": 0.43170279264450073 @@ -23824,13 +23824,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.04248046875, "learning_rate": 0.000906590415937707, - "loss": 0.0095, + "loss": 0.0094, "macro_f1": 0.3272727429866791, "num_tokens": 4043212.0, "repeat_count": 0.0, - "routers_loss": 0.023224346339702606, + "routers_loss": 0.021780289709568024, "skip_count": 1.0, "step": 2508, "text_loss": 0.41495826840400696 @@ -23843,13 +23843,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.0341796875, "learning_rate": 0.0009064101959934696, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 4046687.0, "repeat_count": 0.0, - "routers_loss": 0.007955167442560196, + "routers_loss": 0.007261929102241993, "skip_count": 1.0, "step": 2510, "text_loss": 0.21821187436580658 @@ -23862,13 +23862,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.057861328125, "learning_rate": 0.0009062298203120783, - "loss": 0.0101, + "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 4050735.0, "repeat_count": 0.0, - "routers_loss": 0.006164440419524908, + "routers_loss": 0.007447180338203907, "skip_count": 2.0, "step": 2512, "text_loss": 0.1818767935037613 @@ -23881,13 +23881,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.058837890625, + "grad_norm": 0.06494140625, "learning_rate": 0.0009060492889626535, - "loss": 0.014, + "loss": 0.0142, "macro_f1": 0.3272727429866791, "num_tokens": 4054426.0, "repeat_count": 1.0, - "routers_loss": 0.0713663101196289, + "routers_loss": 0.0718490406870842, "skip_count": 0.0, "step": 2514, "text_loss": 0.22798970341682434 @@ -23900,13 +23900,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.099609375, "learning_rate": 0.0009058686020143753, - "loss": 0.0182, + "loss": 0.0183, "macro_f1": 0.3333333432674408, "num_tokens": 4057615.0, "repeat_count": 0.0, - "routers_loss": 0.0052308146841824055, + "routers_loss": 0.0052676633931696415, "skip_count": 0.0, "step": 2516, "text_loss": 0.1712338626384735 @@ -23919,13 +23919,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04052734375, + "grad_norm": 0.0380859375, "learning_rate": 0.0009056877595364832, - "loss": 0.0143, + "loss": 0.0137, "macro_f1": 0.3333333432674408, "num_tokens": 4060338.0, "repeat_count": 0.0, - "routers_loss": 0.0020465939305722713, + "routers_loss": 0.0018052728846669197, "skip_count": 0.0, "step": 2518, "text_loss": 0.6811438798904419 @@ -23938,13 +23938,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.083984375, "learning_rate": 0.0009055067615982761, - "loss": 0.0114, + "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4062887.0, "repeat_count": 0.0, - "routers_loss": 0.0008663221378810704, + "routers_loss": 0.0009029926732182503, "skip_count": 0.0, "step": 2520, "text_loss": 0.5480356812477112 @@ -23957,13 +23957,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.051025390625, "learning_rate": 0.0009053256082691133, - "loss": 0.0104, + "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 4065357.0, "repeat_count": 0.0, - "routers_loss": 0.0026889131404459476, + "routers_loss": 0.0027515271212905645, "skip_count": 0.0, "step": 2522, "text_loss": 0.5234101414680481 @@ -23978,11 +23978,11 @@ "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.0009051442996184127, - "loss": 0.0181, + "loss": 0.0174, "macro_f1": 0.3333333432674408, "num_tokens": 4068111.0, "repeat_count": 0.0, - "routers_loss": 0.002255887258797884, + "routers_loss": 0.002199822571128607, "skip_count": 0.0, "step": 2524, "text_loss": 0.2418575882911682 @@ -23995,13 +23995,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.060546875, + "grad_norm": 0.0625, "learning_rate": 0.0009049628357156521, - "loss": 0.0144, + "loss": 0.0143, "macro_f1": 0.6666666865348816, "num_tokens": 4071284.0, "repeat_count": 0.0, - "routers_loss": 0.005672316066920757, + "routers_loss": 0.006303096655756235, "skip_count": 2.0, "step": 2526, "text_loss": 0.7948065996170044 @@ -24014,13 +24014,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0380859375, + "grad_norm": 0.037841796875, "learning_rate": 0.000904781216630369, - "loss": 0.007, + "loss": 0.0068, "macro_f1": 0.6601307392120361, "num_tokens": 4074750.0, "repeat_count": 1.0, - "routers_loss": 0.017167411744594574, + "routers_loss": 0.01791904680430889, "skip_count": 2.0, "step": 2528, "text_loss": 0.809726357460022 @@ -24033,13 +24033,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.053955078125, + "grad_norm": 0.0576171875, "learning_rate": 0.0009045994424321602, - "loss": 0.0101, + "loss": 0.0102, "macro_f1": 1.0, "num_tokens": 4078617.0, "repeat_count": 2.0, - "routers_loss": 0.019105618819594383, + "routers_loss": 0.016553178429603577, "skip_count": 2.0, "step": 2530, "text_loss": 0.8755000829696655 @@ -24052,13 +24052,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.060791015625, + "grad_norm": 0.061767578125, "learning_rate": 0.0009044175131906817, "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 4080936.0, "repeat_count": 0.0, - "routers_loss": 0.007993129082024097, + "routers_loss": 0.00884837657213211, "skip_count": 0.0, "step": 2532, "text_loss": 0.795871913433075 @@ -24071,13 +24071,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.05029296875, "learning_rate": 0.0009042354289756491, - "loss": 0.0124, + "loss": 0.0122, "macro_f1": 0.3333333432674408, "num_tokens": 4084459.0, "repeat_count": 0.0, - "routers_loss": 0.0024954001419246197, + "routers_loss": 0.0024387789890170097, "skip_count": 0.0, "step": 2534, "text_loss": 0.18875400722026825 @@ -24090,13 +24090,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.0625, "learning_rate": 0.0009040531898568379, - "loss": 0.0169, + "loss": 0.0171, "macro_f1": 0.3333333432674408, "num_tokens": 4088464.0, "repeat_count": 0.0, - "routers_loss": 0.004360117018222809, + "routers_loss": 0.00491489190608263, "skip_count": 0.0, "step": 2536, "text_loss": 0.334369033575058 @@ -24109,13 +24109,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0927734375, + "grad_norm": 0.091796875, "learning_rate": 0.000903870795904082, - "loss": 0.0142, + "loss": 0.0145, "macro_f1": 0.6666666865348816, "num_tokens": 4091659.0, "repeat_count": 0.0, - "routers_loss": 0.00429064966738224, + "routers_loss": 0.004592662677168846, "skip_count": 2.0, "step": 2538, "text_loss": 0.21298295259475708 @@ -24130,11 +24130,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.0458984375, "learning_rate": 0.000903688247187275, - "loss": 0.0136, + "loss": 0.0137, "macro_f1": 0.5492662787437439, "num_tokens": 4095496.0, "repeat_count": 0.0, - "routers_loss": 0.0132954316213727, + "routers_loss": 0.011647242121398449, "skip_count": 2.0, "step": 2540, "text_loss": 0.2985081672668457 @@ -24147,13 +24147,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.04443359375, "learning_rate": 0.0009035055437763704, - "loss": 0.0129, + "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 4098663.0, "repeat_count": 0.0, - "routers_loss": 0.002104961546137929, + "routers_loss": 0.0021238960325717926, "skip_count": 0.0, "step": 2542, "text_loss": 0.35359489917755127 @@ -24166,13 +24166,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.060791015625, + "grad_norm": 0.05859375, "learning_rate": 0.0009033226857413803, - "loss": 0.0167, + "loss": 0.0163, "macro_f1": 0.6666666865348816, "num_tokens": 4101588.0, "repeat_count": 1.0, - "routers_loss": 0.002973714144900441, + "routers_loss": 0.0024701557122170925, "skip_count": 0.0, "step": 2544, "text_loss": 1.1577601432800293 @@ -24185,13 +24185,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.080078125, "learning_rate": 0.000903139673152376, - "loss": 0.0119, + "loss": 0.012, "macro_f1": 0.3333333432674408, "num_tokens": 4104643.0, "repeat_count": 0.0, - "routers_loss": 0.002359170001000166, + "routers_loss": 0.002499542199075222, "skip_count": 0.0, "step": 2546, "text_loss": 1.0173401832580566 @@ -24204,13 +24204,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0615234375, + "grad_norm": 0.059814453125, "learning_rate": 0.0009029565060794885, - "loss": 0.0168, + "loss": 0.0165, "macro_f1": 0.3333333432674408, "num_tokens": 4109247.0, "repeat_count": 0.0, - "routers_loss": 0.0033595687709748745, + "routers_loss": 0.0034200598020106554, "skip_count": 0.0, "step": 2548, "text_loss": 0.5690504312515259 @@ -24223,13 +24223,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.07421875, + "grad_norm": 0.06884765625, "learning_rate": 0.0009027731845929079, "loss": 0.0155, "macro_f1": 0.8823530077934265, "num_tokens": 4112597.0, "repeat_count": 1.0, - "routers_loss": 0.015323673374950886, + "routers_loss": 0.015981333330273628, "skip_count": 1.0, "step": 2550, "text_loss": 0.294549822807312 @@ -24242,13 +24242,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.043212890625, + "grad_norm": 0.06103515625, "learning_rate": 0.0009025897087628829, - "loss": 0.0063, + "loss": 0.0064, "macro_f1": 0.5492662787437439, "num_tokens": 4115844.0, "repeat_count": 0.0, - "routers_loss": 0.02122018299996853, + "routers_loss": 0.02606951631605625, "skip_count": 2.0, "step": 2552, "text_loss": 0.22692419588565826 @@ -24261,13 +24261,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07763671875, + "grad_norm": 0.080078125, "learning_rate": 0.0009024060786597222, "loss": 0.0202, "macro_f1": 0.3333333432674408, "num_tokens": 4118634.0, "repeat_count": 0.0, - "routers_loss": 0.0010765352053567767, + "routers_loss": 0.001026194542646408, "skip_count": 0.0, "step": 2554, "text_loss": 0.6807059645652771 @@ -24280,13 +24280,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.04638671875, "learning_rate": 0.000902222294353793, - "loss": 0.0128, + "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 4122024.0, "repeat_count": 0.0, - "routers_loss": 0.0017301233019679785, + "routers_loss": 0.001974924933165312, "skip_count": 0.0, "step": 2556, "text_loss": 0.7373668551445007 @@ -24299,13 +24299,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.055908203125, + "grad_norm": 0.04833984375, "learning_rate": 0.0009020383559155219, - "loss": 0.0056, + "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 4124803.0, "repeat_count": 1.0, - "routers_loss": 0.004307204391807318, + "routers_loss": 0.004662613850086927, "skip_count": 2.0, "step": 2558, "text_loss": 0.21808166801929474 @@ -24318,13 +24318,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.029541015625, + "grad_norm": 0.0263671875, "learning_rate": 0.0009018542634153943, - "loss": 0.0064, + "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 4127680.0, "repeat_count": 0.0, - "routers_loss": 0.0073805381543934345, + "routers_loss": 0.006881687790155411, "skip_count": 0.0, "step": 2560, "text_loss": 0.25192978978157043 @@ -24339,11 +24339,11 @@ "f1_skip": 1.0, "grad_norm": 0.049560546875, "learning_rate": 0.0009016700169239551, - "loss": 0.0108, + "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 4130431.0, "repeat_count": 1.0, - "routers_loss": 0.005493874195963144, + "routers_loss": 0.005977808032184839, "skip_count": 1.0, "step": 2562, "text_loss": 0.4700816869735718 @@ -24356,13 +24356,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.068359375, "learning_rate": 0.0009014856165118075, - "loss": 0.0154, + "loss": 0.0153, "macro_f1": 0.6666666865348816, "num_tokens": 4133535.0, "repeat_count": 0.0, - "routers_loss": 0.006889877840876579, + "routers_loss": 0.007005698047578335, "skip_count": 1.0, "step": 2564, "text_loss": 0.6558199524879456 @@ -24375,13 +24375,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03125, + "grad_norm": 0.030517578125, "learning_rate": 0.0009013010622496144, - "loss": 0.009, + "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 4136534.0, "repeat_count": 0.0, - "routers_loss": 0.008495541289448738, + "routers_loss": 0.007262171246111393, "skip_count": 0.0, "step": 2566, "text_loss": 0.2565421462059021 @@ -24394,13 +24394,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.044921875, + "grad_norm": 0.043212890625, "learning_rate": 0.0009011163542080971, - "loss": 0.0089, + "loss": 0.0088, "macro_f1": 0.5934640765190125, "num_tokens": 4139762.0, "repeat_count": 0.0, - "routers_loss": 0.05929862707853317, + "routers_loss": 0.05431923270225525, "skip_count": 3.0, "step": 2568, "text_loss": 0.19896510243415833 @@ -24413,13 +24413,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02734375, + "grad_norm": 0.026611328125, "learning_rate": 0.0009009314924580363, - "loss": 0.0086, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 4143398.0, "repeat_count": 0.0, - "routers_loss": 0.0033934004604816437, + "routers_loss": 0.003667369019240141, "skip_count": 0.0, "step": 2570, "text_loss": 0.6581419110298157 @@ -24432,13 +24432,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.054931640625, + "grad_norm": 0.052978515625, "learning_rate": 0.0009007464770702712, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4146248.0, "repeat_count": 0.0, - "routers_loss": 0.0012826769379898906, + "routers_loss": 0.00132099783513695, "skip_count": 0.0, "step": 2572, "text_loss": 0.5316711068153381 @@ -24451,13 +24451,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.038818359375, "learning_rate": 0.0009005613081157002, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 4149455.0, "repeat_count": 0.0, - "routers_loss": 0.0019460092298686504, + "routers_loss": 0.0020061524119228125, "skip_count": 0.0, "step": 2574, "text_loss": 0.5400773882865906 @@ -24470,13 +24470,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.064453125, + "grad_norm": 0.05517578125, "learning_rate": 0.0009003759856652802, - "loss": 0.0112, + "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 4152774.0, "repeat_count": 0.0, - "routers_loss": 0.004493138287216425, + "routers_loss": 0.002621434163302183, "skip_count": 1.0, "step": 2576, "text_loss": 0.3672606945037842 @@ -24489,13 +24489,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.055908203125, + "grad_norm": 0.051513671875, "learning_rate": 0.0009001905097900273, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 4155835.0, "repeat_count": 0.0, - "routers_loss": 0.005607665050774813, + "routers_loss": 0.005290219560265541, "skip_count": 0.0, "step": 2578, "text_loss": 0.8159038424491882 @@ -24508,13 +24508,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04345703125, + "grad_norm": 0.040771484375, "learning_rate": 0.0009000048805610161, - "loss": 0.0123, + "loss": 0.0119, "macro_f1": 0.3333333432674408, "num_tokens": 4158874.0, "repeat_count": 0.0, - "routers_loss": 0.0015080278972163796, + "routers_loss": 0.0013576085912063718, "skip_count": 0.0, "step": 2580, "text_loss": 0.5518951416015625 @@ -24529,11 +24529,11 @@ "f1_skip": 0.0, "grad_norm": 0.138671875, "learning_rate": 0.00089981909804938, - "loss": 0.0142, + "loss": 0.0143, "macro_f1": 0.3333333432674408, "num_tokens": 4162076.0, "repeat_count": 0.0, - "routers_loss": 0.0022276053205132484, + "routers_loss": 0.0021483441814780235, "skip_count": 0.0, "step": 2582, "text_loss": 0.43552228808403015 @@ -24546,13 +24546,13 @@ "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.07421875, + "grad_norm": 0.068359375, "learning_rate": 0.0008996331623263114, - "loss": 0.0116, + "loss": 0.0117, "macro_f1": 0.7795917987823486, "num_tokens": 4165041.0, "repeat_count": 1.0, - "routers_loss": 0.0499282106757164, + "routers_loss": 0.0544300302863121, "skip_count": 4.0, "step": 2584, "text_loss": 0.24812501668930054 @@ -24565,13 +24565,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.047607421875, "learning_rate": 0.0008994470734630611, - "loss": 0.01, + "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 4168290.0, "repeat_count": 0.0, - "routers_loss": 0.0016360745066776872, + "routers_loss": 0.0017150711501017213, "skip_count": 0.0, "step": 2586, "text_loss": 0.6392097473144531 @@ -24584,32 +24584,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05224609375, + "grad_norm": 0.0615234375, "learning_rate": 0.0008992608315309388, - "loss": 0.0149, + "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 4171310.0, "repeat_count": 0.0, - "routers_loss": 0.0037772543728351593, + "routers_loss": 0.0046473173424601555, "skip_count": 2.0, "step": 2588, "text_loss": 0.6534156799316406 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 12.15967126504256, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.943396270275116, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.060791015625, + "f1_skip": 0.0, + "grad_norm": 0.06591796875, "learning_rate": 0.0008990744366013125, - "loss": 0.0104, - "macro_f1": 0.6538461446762085, + "loss": 0.0105, + "macro_f1": 0.3144654333591461, "num_tokens": 4174042.0, "repeat_count": 2.0, - "routers_loss": 0.05992122367024422, + "routers_loss": 0.060913100838661194, "skip_count": 1.0, "step": 2590, "text_loss": 0.5365690588951111 @@ -24622,13 +24622,13 @@ "f1_execute": 0.9583333134651184, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.05859375, + "grad_norm": 0.055419921875, "learning_rate": 0.0008988878887456093, "loss": 0.0118, "macro_f1": 0.6051587462425232, "num_tokens": 4177666.0, "repeat_count": 1.0, - "routers_loss": 0.0679154023528099, + "routers_loss": 0.06268956512212753, "skip_count": 4.0, "step": 2592, "text_loss": 0.226226806640625 @@ -24643,11 +24643,11 @@ "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0008987011880353149, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.32098764181137085, "num_tokens": 4180490.0, "repeat_count": 0.0, - "routers_loss": 0.03284052759408951, + "routers_loss": 0.030141465365886688, "skip_count": 2.0, "step": 2594, "text_loss": 0.2581401765346527 @@ -24660,13 +24660,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.051513671875, + "grad_norm": 0.044677734375, "learning_rate": 0.0008985143345419729, - "loss": 0.0087, + "loss": 0.0082, "macro_f1": 0.5492662787437439, "num_tokens": 4183300.0, "repeat_count": 0.0, - "routers_loss": 0.01971421390771866, + "routers_loss": 0.018745863810181618, "skip_count": 2.0, "step": 2596, "text_loss": 0.7778542637825012 @@ -24679,13 +24679,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0703125, + "grad_norm": 0.064453125, "learning_rate": 0.0008983273283371862, - "loss": 0.0099, + "loss": 0.0096, "macro_f1": 0.5492662787437439, "num_tokens": 4186535.0, "repeat_count": 0.0, - "routers_loss": 0.028065117076039314, + "routers_loss": 0.026792079210281372, "skip_count": 2.0, "step": 2598, "text_loss": 0.34700271487236023 @@ -24698,13 +24698,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.048828125, "learning_rate": 0.0008981401694926159, - "loss": 0.0077, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4189082.0, "repeat_count": 0.0, - "routers_loss": 0.00166845612693578, + "routers_loss": 0.001914160675369203, "skip_count": 0.0, "step": 2600, "text_loss": 0.6879339218139648 @@ -24717,13 +24717,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.06396484375, "learning_rate": 0.0008979528580799815, - "loss": 0.0138, + "loss": 0.0136, "macro_f1": 0.6666666865348816, "num_tokens": 4192330.0, "repeat_count": 0.0, - "routers_loss": 0.007527270819991827, + "routers_loss": 0.007978348061442375, "skip_count": 2.0, "step": 2602, "text_loss": 0.3524550497531891 @@ -24736,13 +24736,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.044189453125, "learning_rate": 0.0008977653941710613, - "loss": 0.0137, + "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 4196117.0, "repeat_count": 2.0, - "routers_loss": 0.00412185862660408, + "routers_loss": 0.0035376469604671, "skip_count": 0.0, "step": 2604, "text_loss": 0.42356348037719727 @@ -24755,13 +24755,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06005859375, + "grad_norm": 0.05810546875, "learning_rate": 0.0008975777778376916, - "loss": 0.0157, + "loss": 0.0156, "macro_f1": 0.6666666865348816, "num_tokens": 4200423.0, "repeat_count": 0.0, - "routers_loss": 0.007787751499563456, + "routers_loss": 0.008262477815151215, "skip_count": 1.0, "step": 2606, "text_loss": 0.5272893905639648 @@ -24774,13 +24774,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.0732421875, "learning_rate": 0.0008973900091517675, "loss": 0.0114, "macro_f1": 0.3272727429866791, "num_tokens": 4203257.0, "repeat_count": 0.0, - "routers_loss": 0.024111779406666756, + "routers_loss": 0.022957922890782356, "skip_count": 1.0, "step": 2608, "text_loss": 0.2713734805583954 @@ -24793,13 +24793,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.045166015625, + "grad_norm": 0.043701171875, "learning_rate": 0.000897202088185242, - "loss": 0.0091, + "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 4206243.0, "repeat_count": 0.0, - "routers_loss": 0.0057326615788042545, + "routers_loss": 0.006623407825827599, "skip_count": 2.0, "step": 2610, "text_loss": 0.5920525789260864 @@ -24812,13 +24812,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04443359375, + "grad_norm": 0.0517578125, "learning_rate": 0.0008970140150101274, - "loss": 0.0118, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 4209264.0, "repeat_count": 0.0, - "routers_loss": 0.0008877563523128629, + "routers_loss": 0.0008602747693657875, "skip_count": 0.0, "step": 2612, "text_loss": 0.33421996235847473 @@ -24831,13 +24831,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.030517578125, "learning_rate": 0.0008968257896984932, - "loss": 0.0067, + "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 4212058.0, "repeat_count": 0.0, - "routers_loss": 0.0039034869987517595, + "routers_loss": 0.0024653903674334288, "skip_count": 1.0, "step": 2614, "text_loss": 0.37923356890678406 @@ -24850,13 +24850,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.06298828125, "learning_rate": 0.0008966374123224677, - "loss": 0.0085, + "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4214929.0, "repeat_count": 0.0, - "routers_loss": 0.01140254084020853, + "routers_loss": 0.010878405533730984, "skip_count": 0.0, "step": 2616, "text_loss": 0.4350503981113434 @@ -24869,13 +24869,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03125, + "grad_norm": 0.0303955078125, "learning_rate": 0.0008964488829542376, "loss": 0.0083, "macro_f1": 0.3272727429866791, "num_tokens": 4219170.0, "repeat_count": 0.0, - "routers_loss": 0.028559349477291107, + "routers_loss": 0.02864212542772293, "skip_count": 1.0, "step": 2618, "text_loss": 0.26250728964805603 @@ -24888,13 +24888,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.061279296875, + "grad_norm": 0.062255859375, "learning_rate": 0.0008962602016660478, - "loss": 0.0097, + "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 4222077.0, "repeat_count": 0.0, - "routers_loss": 0.010525460354983807, + "routers_loss": 0.010444172658026218, "skip_count": 2.0, "step": 2620, "text_loss": 0.4718937575817108 @@ -24907,13 +24907,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.048583984375, + "grad_norm": 0.0478515625, "learning_rate": 0.0008960713685302011, - "loss": 0.0104, + "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 4225383.0, "repeat_count": 0.0, - "routers_loss": 0.005284689832478762, + "routers_loss": 0.006409442983567715, "skip_count": 1.0, "step": 2622, "text_loss": 0.30420538783073425 @@ -24926,13 +24926,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0284423828125, + "grad_norm": 0.02978515625, "learning_rate": 0.0008958823836190588, - "loss": 0.0051, + "loss": 0.005, "macro_f1": 0.3272727429866791, "num_tokens": 4228349.0, "repeat_count": 0.0, - "routers_loss": 0.011040215380489826, + "routers_loss": 0.009996986016631126, "skip_count": 1.0, "step": 2624, "text_loss": 0.5392362475395203 @@ -24945,13 +24945,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.031494140625, "learning_rate": 0.0008956932470050404, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 4232007.0, "repeat_count": 0.0, - "routers_loss": 0.0014406041009351611, + "routers_loss": 0.0014383369125425816, "skip_count": 0.0, "step": 2626, "text_loss": 0.7112401127815247 @@ -24964,13 +24964,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.072265625, + "grad_norm": 0.058349609375, "learning_rate": 0.0008955039587606233, - "loss": 0.0111, + "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 4235122.0, "repeat_count": 0.0, - "routers_loss": 0.007106760982424021, + "routers_loss": 0.00781513936817646, "skip_count": 3.0, "step": 2628, "text_loss": 0.17802883684635162 @@ -24983,13 +24983,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0400390625, + "grad_norm": 0.0439453125, "learning_rate": 0.0008953145189583429, - "loss": 0.0125, + "loss": 0.0126, "macro_f1": 0.542222261428833, "num_tokens": 4238248.0, "repeat_count": 0.0, - "routers_loss": 0.06423533707857132, + "routers_loss": 0.062252625823020935, "skip_count": 4.0, "step": 2630, "text_loss": 0.5551572442054749 @@ -25002,13 +25002,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.062255859375, "learning_rate": 0.0008951249276707933, - "loss": 0.012, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 4241042.0, "repeat_count": 0.0, - "routers_loss": 0.0010294591775164008, + "routers_loss": 0.0011421777307987213, "skip_count": 0.0, "step": 2632, "text_loss": 0.7092233896255493 @@ -25021,13 +25021,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.07177734375, "learning_rate": 0.0008949351849706261, - "loss": 0.0122, + "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 4243939.0, "repeat_count": 0.0, - "routers_loss": 0.0032732547260820866, + "routers_loss": 0.0032689040526747704, "skip_count": 0.0, "step": 2634, "text_loss": 0.19925718009471893 @@ -25040,13 +25040,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0390625, + "grad_norm": 0.033935546875, "learning_rate": 0.0008947452909305509, - "loss": 0.0112, + "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 4247535.0, "repeat_count": 1.0, - "routers_loss": 0.0021109411027282476, + "routers_loss": 0.002066014800220728, "skip_count": 0.0, "step": 2636, "text_loss": 0.5249715447425842 @@ -25059,13 +25059,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.11279296875, + "grad_norm": 0.09326171875, "learning_rate": 0.0008945552456233356, "loss": 0.0169, "macro_f1": 0.8820862174034119, "num_tokens": 4251441.0, "repeat_count": 2.0, - "routers_loss": 0.029545020312070847, + "routers_loss": 0.029332537204027176, "skip_count": 2.0, "step": 2638, "text_loss": 0.19229578971862793 @@ -25078,13 +25078,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.078125, "learning_rate": 0.0008943650491218058, - "loss": 0.0083, + "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4254314.0, "repeat_count": 0.0, - "routers_loss": 0.0075805820524692535, + "routers_loss": 0.0075911120511591434, "skip_count": 0.0, "step": 2640, "text_loss": 0.27059751749038696 @@ -25097,13 +25097,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.044189453125, "learning_rate": 0.0008941747014988453, - "loss": 0.0155, + "loss": 0.0156, "macro_f1": 0.3333333432674408, "num_tokens": 4257442.0, "repeat_count": 0.0, - "routers_loss": 0.008832095190882683, + "routers_loss": 0.009030844084918499, "skip_count": 0.0, "step": 2642, "text_loss": 0.36747801303863525 @@ -25116,13 +25116,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.080078125, + "grad_norm": 0.123046875, "learning_rate": 0.0008939842028273956, - "loss": 0.011, + "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 4260386.0, "repeat_count": 0.0, - "routers_loss": 0.008952614851295948, + "routers_loss": 0.007844001986086369, "skip_count": 1.0, "step": 2644, "text_loss": 0.6397647857666016 @@ -25135,13 +25135,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0250244140625, + "grad_norm": 0.0283203125, "learning_rate": 0.0008937935531804562, - "loss": 0.0075, + "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 4263516.0, "repeat_count": 0.0, - "routers_loss": 0.0017659157747402787, + "routers_loss": 0.0018789108144119382, "skip_count": 0.0, "step": 2646, "text_loss": 0.4795534908771515 @@ -25154,13 +25154,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05419921875, + "grad_norm": 0.06494140625, "learning_rate": 0.0008936027526310844, - "loss": 0.0101, + "loss": 0.0098, "macro_f1": 0.3272727429866791, "num_tokens": 4266744.0, "repeat_count": 0.0, - "routers_loss": 0.03944230079650879, + "routers_loss": 0.0348590686917305, "skip_count": 1.0, "step": 2648, "text_loss": 0.27691999077796936 @@ -25173,13 +25173,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07861328125, + "grad_norm": 0.07275390625, "learning_rate": 0.000893411801252395, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 4269766.0, "repeat_count": 0.0, - "routers_loss": 0.0037144431844353676, + "routers_loss": 0.004543309565633535, "skip_count": 1.0, "step": 2650, "text_loss": 0.18867231905460358 @@ -25192,13 +25192,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.041748046875, "learning_rate": 0.0008932206991175615, - "loss": 0.0143, + "loss": 0.0141, "macro_f1": 0.6666666865348816, "num_tokens": 4273513.0, "repeat_count": 0.0, - "routers_loss": 0.003659905167296529, + "routers_loss": 0.0035277456045150757, "skip_count": 1.0, "step": 2652, "text_loss": 0.45613357424736023 @@ -25211,13 +25211,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.055908203125, "learning_rate": 0.0008930294462998143, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 4276878.0, "repeat_count": 1.0, - "routers_loss": 0.011676746420562267, + "routers_loss": 0.011337592266499996, "skip_count": 0.0, "step": 2654, "text_loss": 0.24733254313468933 @@ -25230,13 +25230,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.0869140625, "learning_rate": 0.0008928380428724419, - "loss": 0.0061, + "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4279915.0, "repeat_count": 0.0, - "routers_loss": 0.000998969655483961, + "routers_loss": 0.0010295971296727657, "skip_count": 1.0, "step": 2656, "text_loss": 0.41722849011421204 @@ -25249,13 +25249,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.053955078125, "learning_rate": 0.0008926464889087903, - "loss": 0.0109, + "loss": 0.0116, "macro_f1": 0.6666666865348816, "num_tokens": 4282888.0, "repeat_count": 0.0, - "routers_loss": 0.0016260759439319372, + "routers_loss": 0.0017198545392602682, "skip_count": 2.0, "step": 2658, "text_loss": 0.738322377204895 @@ -25268,13 +25268,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.068359375, "learning_rate": 0.0008924547844822634, - "loss": 0.0101, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4285805.0, "repeat_count": 0.0, - "routers_loss": 0.0010900370543822646, + "routers_loss": 0.001339946174994111, "skip_count": 0.0, "step": 2660, "text_loss": 0.4802379906177521 @@ -25287,13 +25287,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.05322265625, "learning_rate": 0.000892262929666323, - "loss": 0.0101, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4290282.0, "repeat_count": 0.0, - "routers_loss": 0.002275131642818451, + "routers_loss": 0.0022340165451169014, "skip_count": 0.0, "step": 2662, "text_loss": 0.6503544449806213 @@ -25306,13 +25306,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0419921875, + "grad_norm": 0.03662109375, "learning_rate": 0.0008920709245344878, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 4294106.0, "repeat_count": 0.0, - "routers_loss": 0.00575100164860487, + "routers_loss": 0.005288850050419569, "skip_count": 1.0, "step": 2664, "text_loss": 0.12312037497758865 @@ -25325,13 +25325,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.038330078125, + "grad_norm": 0.041259765625, "learning_rate": 0.0008918787691603347, - "loss": 0.0122, + "loss": 0.0121, "macro_f1": 0.6666666865348816, "num_tokens": 4298013.0, "repeat_count": 0.0, - "routers_loss": 0.004139711149036884, + "routers_loss": 0.004259659443050623, "skip_count": 1.0, "step": 2666, "text_loss": 0.3070000112056732 @@ -25344,13 +25344,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0439453125, + "grad_norm": 0.04052734375, "learning_rate": 0.000891686463617498, - "loss": 0.0072, + "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 4300799.0, "repeat_count": 0.0, - "routers_loss": 0.008856390602886677, + "routers_loss": 0.009489355608820915, "skip_count": 1.0, "step": 2668, "text_loss": 0.18535588681697845 @@ -25363,13 +25363,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0576171875, + "grad_norm": 0.055908203125, "learning_rate": 0.0008914940079796696, - "loss": 0.0116, + "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 4304641.0, "repeat_count": 0.0, - "routers_loss": 0.002438562922179699, + "routers_loss": 0.0025417013093829155, "skip_count": 0.0, "step": 2670, "text_loss": 0.482585072517395 @@ -25382,13 +25382,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.05615234375, "learning_rate": 0.0008913014023205988, "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 4307462.0, "repeat_count": 0.0, - "routers_loss": 0.006435772404074669, + "routers_loss": 0.006371749565005302, "skip_count": 0.0, "step": 2672, "text_loss": 0.7064456939697266 @@ -25401,13 +25401,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.039306640625, "learning_rate": 0.0008911086467140925, - "loss": 0.0069, + "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 4310396.0, "repeat_count": 0.0, - "routers_loss": 0.002773779444396496, + "routers_loss": 0.0027512952219694853, "skip_count": 0.0, "step": 2674, "text_loss": 0.23532851040363312 @@ -25420,13 +25420,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.056640625, + "grad_norm": 0.05712890625, "learning_rate": 0.000890915741234015, - "loss": 0.0135, + "loss": 0.0133, "macro_f1": 0.6666666865348816, "num_tokens": 4314781.0, "repeat_count": 0.0, - "routers_loss": 0.00862761028110981, + "routers_loss": 0.008253013715147972, "skip_count": 1.0, "step": 2676, "text_loss": 0.30950358510017395 @@ -25439,13 +25439,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033203125, + "grad_norm": 0.03173828125, "learning_rate": 0.0008907226859542879, - "loss": 0.0104, + "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 4317988.0, "repeat_count": 0.0, - "routers_loss": 0.005587176885455847, + "routers_loss": 0.005409995559602976, "skip_count": 2.0, "step": 2678, "text_loss": 0.4930732846260071 @@ -25458,13 +25458,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.042236328125, + "grad_norm": 0.060546875, "learning_rate": 0.0008905294809488907, - "loss": 0.0082, + "loss": 0.0084, "macro_f1": 1.0, "num_tokens": 4321014.0, "repeat_count": 1.0, - "routers_loss": 0.0033104203175753355, + "routers_loss": 0.0029942214023321867, "skip_count": 1.0, "step": 2680, "text_loss": 0.6224040389060974 @@ -25477,13 +25477,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08203125, + "grad_norm": 0.06982421875, "learning_rate": 0.0008903361262918595, - "loss": 0.0117, + "loss": 0.0115, "macro_f1": 0.6666666865348816, "num_tokens": 4324268.0, "repeat_count": 0.0, - "routers_loss": 0.008205405436456203, + "routers_loss": 0.008411120623350143, "skip_count": 1.0, "step": 2682, "text_loss": 0.16296671330928802 @@ -25496,13 +25496,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.052734375, + "grad_norm": 0.05126953125, "learning_rate": 0.0008901426220572884, - "loss": 0.0142, + "loss": 0.0138, "macro_f1": 1.0, "num_tokens": 4327494.0, "repeat_count": 2.0, - "routers_loss": 0.007884894497692585, + "routers_loss": 0.01039006095379591, "skip_count": 4.0, "step": 2684, "text_loss": 0.43866512179374695 @@ -25515,13 +25515,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.060791015625, "learning_rate": 0.0008899489683193286, - "loss": 0.011, + "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 4330936.0, "repeat_count": 0.0, - "routers_loss": 0.0009336905204690993, + "routers_loss": 0.0009329111780971289, "skip_count": 0.0, "step": 2686, "text_loss": 0.44250962138175964 @@ -25534,13 +25534,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0810546875, + "grad_norm": 0.07421875, "learning_rate": 0.0008897551651521885, "loss": 0.0111, "macro_f1": 0.3333333432674408, "num_tokens": 4334123.0, "repeat_count": 0.0, - "routers_loss": 0.0033622782211750746, + "routers_loss": 0.003197216661646962, "skip_count": 0.0, "step": 2688, "text_loss": 0.48313501477241516 @@ -25553,13 +25553,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07763671875, + "grad_norm": 0.09716796875, "learning_rate": 0.0008895612126301339, "loss": 0.0157, "macro_f1": 0.3333333432674408, "num_tokens": 4337610.0, "repeat_count": 0.0, - "routers_loss": 0.0034563415683805943, + "routers_loss": 0.0033548236824572086, "skip_count": 0.0, "step": 2690, "text_loss": 0.4715327322483063 @@ -25572,13 +25572,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.051513671875, "learning_rate": 0.0008893671108274877, - "loss": 0.0115, + "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 4341026.0, "repeat_count": 0.0, - "routers_loss": 0.0022277699317783117, + "routers_loss": 0.0024757643695920706, "skip_count": 0.0, "step": 2692, "text_loss": 0.43402785062789917 @@ -25591,13 +25591,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.043212890625, "learning_rate": 0.0008891728598186302, - "loss": 0.011, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 4344422.0, "repeat_count": 0.0, - "routers_loss": 0.003892304375767708, + "routers_loss": 0.003317243419587612, "skip_count": 0.0, "step": 2694, "text_loss": 0.8498559594154358 @@ -25610,13 +25610,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.0380859375, + "grad_norm": 0.0400390625, "learning_rate": 0.0008889784596779986, - "loss": 0.0092, + "loss": 0.009, "macro_f1": 0.5934640765190125, "num_tokens": 4347507.0, "repeat_count": 0.0, - "routers_loss": 0.015058296732604504, + "routers_loss": 0.01577926240861416, "skip_count": 3.0, "step": 2696, "text_loss": 0.5646669864654541 @@ -25629,13 +25629,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10546875, + "grad_norm": 0.11328125, "learning_rate": 0.0008887839104800876, - "loss": 0.0118, + "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 4350414.0, "repeat_count": 0.0, - "routers_loss": 0.0033561652526259422, + "routers_loss": 0.002953822258859873, "skip_count": 0.0, "step": 2698, "text_loss": 0.5145012140274048 @@ -25648,13 +25648,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.05029296875, "learning_rate": 0.0008885892122994486, - "loss": 0.0116, + "loss": 0.0112, "macro_f1": 0.3333333432674408, "num_tokens": 4354110.0, "repeat_count": 0.0, - "routers_loss": 0.0062471418641507626, + "routers_loss": 0.005849295295774937, "skip_count": 0.0, "step": 2700, "text_loss": 0.580982506275177 @@ -25667,13 +25667,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.0419921875, "learning_rate": 0.0008883943652106903, "loss": 0.0086, "macro_f1": 1.0, "num_tokens": 4357323.0, "repeat_count": 1.0, - "routers_loss": 0.011802209541201591, + "routers_loss": 0.012347398325800896, "skip_count": 2.0, "step": 2702, "text_loss": 0.2234988808631897 @@ -25686,13 +25686,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.0673828125, "learning_rate": 0.0008881993692884787, - "loss": 0.0132, + "loss": 0.0128, "macro_f1": 0.6666666865348816, "num_tokens": 4360228.0, "repeat_count": 0.0, - "routers_loss": 0.0041528744623064995, + "routers_loss": 0.003574999049305916, "skip_count": 1.0, "step": 2704, "text_loss": 0.4261806607246399 @@ -25705,13 +25705,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0439453125, + "grad_norm": 0.048828125, "learning_rate": 0.0008880042246075365, - "loss": 0.0094, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4363905.0, "repeat_count": 0.0, - "routers_loss": 0.003151095937937498, + "routers_loss": 0.0031574300955981016, "skip_count": 0.0, "step": 2706, "text_loss": 0.691118061542511 @@ -25724,13 +25724,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.0419921875, "learning_rate": 0.0008878089312426433, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 4366736.0, "repeat_count": 0.0, - "routers_loss": 0.003142676781862974, + "routers_loss": 0.003195564029738307, "skip_count": 0.0, "step": 2708, "text_loss": 0.613926112651825 @@ -25743,13 +25743,13 @@ "f1_execute": 0.9583333134651184, "f1_repeat": 0.0, "f1_skip": 0.75, - "grad_norm": 0.05859375, + "grad_norm": 0.054443359375, "learning_rate": 0.0008876134892686363, "loss": 0.011, "macro_f1": 0.5694444179534912, "num_tokens": 4370146.0, "repeat_count": 0.0, - "routers_loss": 0.032964516431093216, + "routers_loss": 0.038784291595220566, "skip_count": 5.0, "step": 2710, "text_loss": 0.2723451852798462 @@ -25762,13 +25762,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.080078125, + "grad_norm": 0.0830078125, "learning_rate": 0.000887417898760409, - "loss": 0.0123, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 4373653.0, "repeat_count": 0.0, - "routers_loss": 0.0006848900229670107, + "routers_loss": 0.0006457131239585578, "skip_count": 0.0, "step": 2712, "text_loss": 0.31667640805244446 @@ -25781,13 +25781,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07861328125, + "grad_norm": 0.10498046875, "learning_rate": 0.000887222159792912, - "loss": 0.0156, + "loss": 0.0155, "macro_f1": 0.6603773832321167, "num_tokens": 4376993.0, "repeat_count": 1.0, - "routers_loss": 0.04388813674449921, + "routers_loss": 0.045078590512275696, "skip_count": 1.0, "step": 2714, "text_loss": 0.5872798562049866 @@ -25800,13 +25800,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.034912109375, "learning_rate": 0.0008870262724411528, - "loss": 0.0122, + "loss": 0.012, "macro_f1": 0.3333333432674408, "num_tokens": 4380160.0, "repeat_count": 0.0, - "routers_loss": 0.003538437420502305, + "routers_loss": 0.003628545207902789, "skip_count": 0.0, "step": 2716, "text_loss": 0.7468157410621643 @@ -25819,13 +25819,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1328125, + "grad_norm": 0.11181640625, "learning_rate": 0.0008868302367801962, - "loss": 0.0123, + "loss": 0.0118, "macro_f1": 0.6598639488220215, "num_tokens": 4383100.0, "repeat_count": 1.0, - "routers_loss": 0.05479869619011879, + "routers_loss": 0.05404464527964592, "skip_count": 3.0, "step": 2718, "text_loss": 0.2970244884490967 @@ -25838,13 +25838,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.0400390625, "learning_rate": 0.0008866340528851629, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4386700.0, "repeat_count": 0.0, - "routers_loss": 0.0070296903140842915, + "routers_loss": 0.007000274024903774, "skip_count": 0.0, "step": 2720, "text_loss": 0.34521186351776123 @@ -25857,13 +25857,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05810546875, + "grad_norm": 0.052978515625, "learning_rate": 0.0008864377208312313, - "loss": 0.0085, + "loss": 0.0082, "macro_f1": 0.8823530077934265, "num_tokens": 4390299.0, "repeat_count": 1.0, - "routers_loss": 0.02051853947341442, + "routers_loss": 0.02025366574525833, "skip_count": 2.0, "step": 2722, "text_loss": 1.0536936521530151 @@ -25876,13 +25876,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.038818359375, + "grad_norm": 0.04638671875, "learning_rate": 0.000886241240693636, - "loss": 0.0096, + "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 4393353.0, "repeat_count": 0.0, - "routers_loss": 0.002662461483851075, + "routers_loss": 0.00251673418097198, "skip_count": 0.0, "step": 2724, "text_loss": 0.5678093433380127 @@ -25895,13 +25895,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.05615234375, + "grad_norm": 0.052001953125, "learning_rate": 0.0008860446125476686, "loss": 0.0135, "macro_f1": 0.6666666865348816, "num_tokens": 4396446.0, "repeat_count": 1.0, - "routers_loss": 0.009321866557002068, + "routers_loss": 0.009532532654702663, "skip_count": 0.0, "step": 2726, "text_loss": 0.23775041103363037 @@ -25914,13 +25914,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.109375, + "grad_norm": 0.091796875, "learning_rate": 0.0008858478364686776, - "loss": 0.0102, + "loss": 0.0099, "macro_f1": 0.6666666865348816, "num_tokens": 4399977.0, "repeat_count": 1.0, - "routers_loss": 0.01029124017804861, + "routers_loss": 0.008062181062996387, "skip_count": 0.0, "step": 2728, "text_loss": 0.18888695538043976 @@ -25933,13 +25933,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037353515625, + "grad_norm": 0.035888671875, "learning_rate": 0.0008856509125320678, - "loss": 0.0082, + "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 4404406.0, "repeat_count": 0.0, - "routers_loss": 0.0008023424888961017, + "routers_loss": 0.0007731119985692203, "skip_count": 0.0, "step": 2730, "text_loss": 0.47331541776657104 @@ -25952,13 +25952,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0517578125, + "grad_norm": 0.0498046875, "learning_rate": 0.0008854538408133006, - "loss": 0.0115, + "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 4407165.0, "repeat_count": 0.0, - "routers_loss": 0.003058656118810177, + "routers_loss": 0.003115242812782526, "skip_count": 1.0, "step": 2732, "text_loss": 0.491370290517807 @@ -25971,13 +25971,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039794921875, + "grad_norm": 0.041015625, "learning_rate": 0.0008852566213878947, - "loss": 0.0082, + "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4410101.0, "repeat_count": 0.0, - "routers_loss": 0.0010282890871167183, + "routers_loss": 0.0008958528051152825, "skip_count": 0.0, "step": 2734, "text_loss": 0.42188262939453125 @@ -25990,13 +25990,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.07421875, + "grad_norm": 0.07763671875, "learning_rate": 0.0008850592543314246, - "loss": 0.0123, + "loss": 0.0118, "macro_f1": 1.0, "num_tokens": 4413015.0, "repeat_count": 1.0, - "routers_loss": 0.014785367995500565, + "routers_loss": 0.01139112375676632, "skip_count": 1.0, "step": 2736, "text_loss": 0.4716498553752899 @@ -26009,13 +26009,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0654296875, + "grad_norm": 0.0576171875, "learning_rate": 0.0008848617397195218, - "loss": 0.0089, + "loss": 0.0084, "macro_f1": 0.6603773832321167, "num_tokens": 4416404.0, "repeat_count": 1.0, - "routers_loss": 0.017717093229293823, + "routers_loss": 0.01609630137681961, "skip_count": 1.0, "step": 2738, "text_loss": 0.19490821659564972 @@ -26028,13 +26028,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.041015625, "learning_rate": 0.0008846640776278745, - "loss": 0.0067, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 4419408.0, "repeat_count": 0.0, - "routers_loss": 0.0011861984385177493, + "routers_loss": 0.001489170710556209, "skip_count": 0.0, "step": 2740, "text_loss": 0.6443108320236206 @@ -26047,13 +26047,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.0693359375, "learning_rate": 0.0008844662681322269, "loss": 0.0144, "macro_f1": 0.6666666865348816, "num_tokens": 4422067.0, "repeat_count": 1.0, - "routers_loss": 0.0013843412743881345, + "routers_loss": 0.0014755792217329144, "skip_count": 0.0, "step": 2742, "text_loss": 0.9150356650352478 @@ -26068,11 +26068,11 @@ "f1_skip": 1.0, "grad_norm": 0.05078125, "learning_rate": 0.0008842683113083801, - "loss": 0.0154, + "loss": 0.0149, "macro_f1": 0.6666666865348816, "num_tokens": 4425647.0, "repeat_count": 0.0, - "routers_loss": 0.010318896733224392, + "routers_loss": 0.008962674997746944, "skip_count": 1.0, "step": 2744, "text_loss": 0.7103227972984314 @@ -26085,13 +26085,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07861328125, + "grad_norm": 0.0751953125, "learning_rate": 0.0008840702072321915, - "loss": 0.0108, + "loss": 0.0104, "macro_f1": 0.6598639488220215, "num_tokens": 4428855.0, "repeat_count": 1.0, - "routers_loss": 0.029359478503465652, + "routers_loss": 0.02554207295179367, "skip_count": 3.0, "step": 2746, "text_loss": 0.27141591906547546 @@ -26104,13 +26104,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0234375, + "grad_norm": 0.0230712890625, "learning_rate": 0.0008838719559795751, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 4432838.0, "repeat_count": 0.0, - "routers_loss": 0.0014995118835940957, + "routers_loss": 0.0011747616808861494, "skip_count": 0.0, "step": 2748, "text_loss": 0.4007738530635834 @@ -26123,13 +26123,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.03515625, + "grad_norm": 0.03466796875, "learning_rate": 0.0008836735576265009, - "loss": 0.0074, + "loss": 0.0073, "macro_f1": 0.5492662787437439, "num_tokens": 4435793.0, "repeat_count": 0.0, - "routers_loss": 0.017950648441910744, + "routers_loss": 0.017564335837960243, "skip_count": 2.0, "step": 2750, "text_loss": 0.5972410440444946 @@ -26142,13 +26142,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.044921875, "learning_rate": 0.0008834750122489956, - "loss": 0.0083, + "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 4438871.0, "repeat_count": 1.0, - "routers_loss": 0.0069067892618477345, + "routers_loss": 0.007004009559750557, "skip_count": 0.0, "step": 2752, "text_loss": 0.2294853925704956 @@ -26161,13 +26161,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.051513671875, + "grad_norm": 0.06640625, "learning_rate": 0.0008832763199231423, - "loss": 0.0101, + "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 4441846.0, "repeat_count": 0.0, - "routers_loss": 0.0013944554375484586, + "routers_loss": 0.0014562139986082911, "skip_count": 0.0, "step": 2754, "text_loss": 0.722432017326355 @@ -26180,13 +26180,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.0751953125, "learning_rate": 0.0008830774807250802, "loss": 0.013, "macro_f1": 0.3272727429866791, "num_tokens": 4444786.0, "repeat_count": 1.0, - "routers_loss": 0.025158623233437538, + "routers_loss": 0.024773593991994858, "skip_count": 0.0, "step": 2756, "text_loss": 0.507905125617981 @@ -26199,13 +26199,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05419921875, + "grad_norm": 0.049072265625, "learning_rate": 0.0008828784947310049, - "loss": 0.0131, + "loss": 0.0129, "macro_f1": 0.8823530077934265, "num_tokens": 4448442.0, "repeat_count": 1.0, - "routers_loss": 0.05205477401614189, + "routers_loss": 0.04959975928068161, "skip_count": 2.0, "step": 2758, "text_loss": 0.3617522418498993 @@ -26218,13 +26218,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.1025390625, "learning_rate": 0.000882679362017168, "loss": 0.0149, "macro_f1": 1.0, "num_tokens": 4451401.0, "repeat_count": 1.0, - "routers_loss": 0.005898742936551571, + "routers_loss": 0.005783245898783207, "skip_count": 2.0, "step": 2760, "text_loss": 0.49187400937080383 @@ -26237,13 +26237,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.0791015625, "learning_rate": 0.0008824800826598778, - "loss": 0.0129, + "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 4454537.0, "repeat_count": 0.0, - "routers_loss": 0.006758298724889755, + "routers_loss": 0.00656260596588254, "skip_count": 0.0, "step": 2762, "text_loss": 0.6823583245277405 @@ -26256,13 +26256,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.0546875, "learning_rate": 0.0008822806567354983, - "loss": 0.0109, + "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 4457706.0, "repeat_count": 1.0, - "routers_loss": 0.005730919074267149, + "routers_loss": 0.005298966076225042, "skip_count": 0.0, "step": 2764, "text_loss": 0.554322361946106 @@ -26275,13 +26275,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.051025390625, + "grad_norm": 0.046630859375, "learning_rate": 0.0008820810843204501, - "loss": 0.0098, + "loss": 0.0096, "macro_f1": 0.3272727429866791, "num_tokens": 4460710.0, "repeat_count": 0.0, - "routers_loss": 0.03390989825129509, + "routers_loss": 0.03164982795715332, "skip_count": 1.0, "step": 2766, "text_loss": 0.1656961441040039 @@ -26294,13 +26294,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0849609375, + "grad_norm": 0.072265625, "learning_rate": 0.0008818813654912095, - "loss": 0.0165, + "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 4464001.0, "repeat_count": 0.0, - "routers_loss": 0.0007058497285470366, + "routers_loss": 0.000715116853825748, "skip_count": 0.0, "step": 2768, "text_loss": 0.5818144083023071 @@ -26313,13 +26313,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.058837890625, + "grad_norm": 0.056396484375, "learning_rate": 0.0008816815003243093, - "loss": 0.0136, + "loss": 0.0133, "macro_f1": 0.3333333432674408, "num_tokens": 4467364.0, "repeat_count": 0.0, - "routers_loss": 0.0027468691114336252, + "routers_loss": 0.002851625671610236, "skip_count": 0.0, "step": 2770, "text_loss": 0.6068631410598755 @@ -26332,13 +26332,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.033203125, "learning_rate": 0.0008814814888963383, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 4470681.0, "repeat_count": 0.0, - "routers_loss": 0.00443003186956048, + "routers_loss": 0.004729873035103083, "skip_count": 1.0, "step": 2772, "text_loss": 0.5386646389961243 @@ -26351,13 +26351,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0439453125, + "grad_norm": 0.04296875, "learning_rate": 0.000881281331283941, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 4473734.0, "repeat_count": 0.0, - "routers_loss": 0.0031219064258038998, + "routers_loss": 0.0031853127293288708, "skip_count": 1.0, "step": 2774, "text_loss": 0.5695263147354126 @@ -26370,13 +26370,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03369140625, + "grad_norm": 0.033447265625, "learning_rate": 0.0008810810275638182, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 4478404.0, "repeat_count": 0.0, - "routers_loss": 0.000846695271320641, + "routers_loss": 0.0008977465913631022, "skip_count": 0.0, "step": 2776, "text_loss": 0.4750773310661316 @@ -26389,13 +26389,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.0654296875, "learning_rate": 0.0008808805778127269, - "loss": 0.0075, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4481287.0, "repeat_count": 0.0, - "routers_loss": 0.0074167875573039055, + "routers_loss": 0.00469845999032259, "skip_count": 0.0, "step": 2778, "text_loss": 0.14078612625598907 @@ -26408,13 +26408,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.04296875, + "grad_norm": 0.049560546875, "learning_rate": 0.0008806799821074796, - "loss": 0.0078, + "loss": 0.0079, "macro_f1": 0.5492662787437439, "num_tokens": 4483929.0, "repeat_count": 0.0, - "routers_loss": 0.018358726054430008, + "routers_loss": 0.01789761893451214, "skip_count": 2.0, "step": 2780, "text_loss": 0.2167191207408905 @@ -26427,13 +26427,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.056396484375, "learning_rate": 0.0008804792405249451, - "loss": 0.0124, + "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 4487468.0, "repeat_count": 0.0, - "routers_loss": 0.001094152103178203, + "routers_loss": 0.001018838956952095, "skip_count": 0.0, "step": 2782, "text_loss": 0.5424665212631226 @@ -26446,13 +26446,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.0498046875, + "grad_norm": 0.07373046875, "learning_rate": 0.000880278353142048, - "loss": 0.0075, + "loss": 0.0077, "macro_f1": 0.8200000524520874, "num_tokens": 4490942.0, "repeat_count": 1.0, - "routers_loss": 0.03035641834139824, + "routers_loss": 0.03260354697704315, "skip_count": 3.0, "step": 2784, "text_loss": 0.20994654297828674 @@ -26465,13 +26465,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05517578125, + "grad_norm": 0.05322265625, "learning_rate": 0.0008800773200357683, - "loss": 0.0123, + "loss": 0.0122, "macro_f1": 0.3333333432674408, "num_tokens": 4493986.0, "repeat_count": 0.0, - "routers_loss": 0.002394269686192274, + "routers_loss": 0.003019835101440549, "skip_count": 0.0, "step": 2786, "text_loss": 0.5709528923034668 @@ -26484,13 +26484,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.034423828125, "learning_rate": 0.0008798761412831429, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 4498232.0, "repeat_count": 0.0, - "routers_loss": 0.0028274122159928083, + "routers_loss": 0.00285192858427763, "skip_count": 0.0, "step": 2788, "text_loss": 0.5103896260261536 @@ -26503,13 +26503,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0439453125, + "grad_norm": 0.044921875, "learning_rate": 0.0008796748169612634, - "loss": 0.0088, + "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 4501231.0, "repeat_count": 0.0, - "routers_loss": 0.0012642849469557405, + "routers_loss": 0.0012469831854104996, "skip_count": 0.0, "step": 2790, "text_loss": 0.43669697642326355 @@ -26522,13 +26522,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03662109375, + "grad_norm": 0.039794921875, "learning_rate": 0.0008794733471472778, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4504208.0, "repeat_count": 0.0, - "routers_loss": 0.010966303758323193, + "routers_loss": 0.011512776836752892, "skip_count": 1.0, "step": 2792, "text_loss": 0.2299770563840866 @@ -26541,13 +26541,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.03564453125, "learning_rate": 0.0008792717319183899, - "loss": 0.0064, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 4507013.0, "repeat_count": 0.0, - "routers_loss": 0.008194026537239552, + "routers_loss": 0.00834917277097702, "skip_count": 0.0, "step": 2794, "text_loss": 0.2130603939294815 @@ -26560,13 +26560,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0283203125, + "grad_norm": 0.03076171875, "learning_rate": 0.0008790699713518587, - "loss": 0.008, + "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 4510286.0, "repeat_count": 0.0, - "routers_loss": 0.008828429505228996, + "routers_loss": 0.008616939187049866, "skip_count": 2.0, "step": 2796, "text_loss": 0.4377101957798004 @@ -26579,13 +26579,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0274658203125, + "grad_norm": 0.02783203125, "learning_rate": 0.0008788680655249994, - "loss": 0.007, + "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4513762.0, "repeat_count": 0.0, - "routers_loss": 0.0038230866193771362, + "routers_loss": 0.003408568911254406, "skip_count": 0.0, "step": 2798, "text_loss": 0.435138463973999 @@ -26598,13 +26598,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0311279296875, + "grad_norm": 0.03369140625, "learning_rate": 0.0008786660145151826, - "loss": 0.009, + "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 4516696.0, "repeat_count": 1.0, - "routers_loss": 0.0031088131945580244, + "routers_loss": 0.0029398901388049126, "skip_count": 0.0, "step": 2800, "text_loss": 0.3195655047893524 @@ -26617,13 +26617,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.033203125, "learning_rate": 0.0008784638183998348, - "loss": 0.0083, + "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4519760.0, "repeat_count": 0.0, - "routers_loss": 0.0014194221002981067, + "routers_loss": 0.0013777425047010183, "skip_count": 0.0, "step": 2802, "text_loss": 0.8129430413246155 @@ -26636,13 +26636,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.032470703125, "learning_rate": 0.0008782614772564379, - "loss": 0.0099, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4522106.0, "repeat_count": 0.0, - "routers_loss": 0.0031931858975440264, + "routers_loss": 0.0031694830395281315, "skip_count": 0.0, "step": 2804, "text_loss": 0.18083660304546356 @@ -26655,13 +26655,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.064453125, "learning_rate": 0.0008780589911625293, - "loss": 0.0117, + "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 4525743.0, "repeat_count": 0.0, - "routers_loss": 0.0021834284998476505, + "routers_loss": 0.002161208540201187, "skip_count": 0.0, "step": 2806, "text_loss": 0.8228182792663574 @@ -26674,13 +26674,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0703125, + "grad_norm": 0.07177734375, "learning_rate": 0.0008778563601957021, - "loss": 0.0098, + "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 4529573.0, "repeat_count": 0.0, - "routers_loss": 0.0035390176344662905, + "routers_loss": 0.0028444856870919466, "skip_count": 1.0, "step": 2808, "text_loss": 0.3715563118457794 @@ -26693,13 +26693,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04296875, + "grad_norm": 0.044677734375, "learning_rate": 0.0008776535844336049, - "loss": 0.0095, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4532452.0, "repeat_count": 0.0, - "routers_loss": 0.0038604713045060635, + "routers_loss": 0.003807213855907321, "skip_count": 0.0, "step": 2810, "text_loss": 0.6012523174285889 @@ -26712,13 +26712,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.0361328125, "learning_rate": 0.0008774506639539417, - "loss": 0.0072, + "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 4536077.0, "repeat_count": 0.0, - "routers_loss": 0.00669970503076911, + "routers_loss": 0.006698979996144772, "skip_count": 0.0, "step": 2812, "text_loss": 0.27097949385643005 @@ -26731,13 +26731,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.099609375, "learning_rate": 0.0008772475988344722, - "loss": 0.0132, + "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 4539057.0, "repeat_count": 0.0, - "routers_loss": 0.004594485275447369, + "routers_loss": 0.004849409218877554, "skip_count": 1.0, "step": 2814, "text_loss": 1.026973843574524 @@ -26750,13 +26750,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.04638671875, + "grad_norm": 0.041748046875, "learning_rate": 0.0008770443891530109, - "loss": 0.0116, + "loss": 0.0115, "macro_f1": 0.5934640765190125, "num_tokens": 4542253.0, "repeat_count": 0.0, - "routers_loss": 0.01891930215060711, + "routers_loss": 0.019148651510477066, "skip_count": 3.0, "step": 2816, "text_loss": 0.2717585563659668 @@ -26769,13 +26769,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.054931640625, + "grad_norm": 0.052490234375, "learning_rate": 0.0008768410349874286, "loss": 0.0098, "macro_f1": 0.6601307392120361, "num_tokens": 4545047.0, "repeat_count": 1.0, - "routers_loss": 0.0247862096875906, + "routers_loss": 0.02231316640973091, "skip_count": 2.0, "step": 2818, "text_loss": 0.274346262216568 @@ -26788,13 +26788,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.043212890625, "learning_rate": 0.0008766375364156508, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 4548371.0, "repeat_count": 0.0, - "routers_loss": 0.008566800504922867, + "routers_loss": 0.008014129474759102, "skip_count": 2.0, "step": 2820, "text_loss": 0.22850871086120605 @@ -26807,13 +26807,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041748046875, + "grad_norm": 0.044189453125, "learning_rate": 0.0008764338935156586, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4551276.0, "repeat_count": 0.0, - "routers_loss": 0.0013546474510803819, + "routers_loss": 0.0014544493751600385, "skip_count": 0.0, "step": 2822, "text_loss": 0.6308462023735046 @@ -26826,13 +26826,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.0390625, "learning_rate": 0.000876230106365488, - "loss": 0.0122, + "loss": 0.0123, "macro_f1": 0.6666666865348816, "num_tokens": 4554143.0, "repeat_count": 0.0, - "routers_loss": 0.009204468689858913, + "routers_loss": 0.00818584579974413, "skip_count": 3.0, "step": 2824, "text_loss": 0.3484207093715668 @@ -26845,13 +26845,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03271484375, + "grad_norm": 0.0264892578125, "learning_rate": 0.0008760261750432312, - "loss": 0.0067, + "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 4557256.0, "repeat_count": 0.0, - "routers_loss": 0.00787584763020277, + "routers_loss": 0.006275608204305172, "skip_count": 3.0, "step": 2826, "text_loss": 0.1927330046892166 @@ -26864,13 +26864,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.0380859375, "learning_rate": 0.0008758220996270348, - "loss": 0.0102, + "loss": 0.0103, "macro_f1": 1.0, "num_tokens": 4560202.0, "repeat_count": 2.0, - "routers_loss": 0.0057869357988238335, + "routers_loss": 0.0055974251590669155, "skip_count": 2.0, "step": 2828, "text_loss": 0.7796496748924255 @@ -26883,13 +26883,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044921875, + "grad_norm": 0.046142578125, "learning_rate": 0.0008756178801951007, - "loss": 0.0128, + "loss": 0.0129, "macro_f1": 0.3333333432674408, "num_tokens": 4563508.0, "repeat_count": 0.0, - "routers_loss": 0.0018274546600878239, + "routers_loss": 0.0019799957517534494, "skip_count": 0.0, "step": 2830, "text_loss": 0.49633297324180603 @@ -26902,13 +26902,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.0458984375, "learning_rate": 0.0008754135168256865, - "loss": 0.0094, + "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4566776.0, "repeat_count": 0.0, - "routers_loss": 0.004527154844254255, + "routers_loss": 0.004538947716355324, "skip_count": 0.0, "step": 2832, "text_loss": 0.5346745252609253 @@ -26921,13 +26921,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.03857421875, "learning_rate": 0.0008752090095971044, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 4569787.0, "repeat_count": 0.0, - "routers_loss": 0.0018263199599459767, + "routers_loss": 0.001663343166001141, "skip_count": 0.0, "step": 2834, "text_loss": 0.5524004697799683 @@ -26940,13 +26940,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.07373046875, "learning_rate": 0.000875004358587722, - "loss": 0.0088, + "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 4572813.0, "repeat_count": 0.0, - "routers_loss": 0.0022649941965937614, + "routers_loss": 0.0022988212294876575, "skip_count": 0.0, "step": 2836, "text_loss": 0.4232870042324066 @@ -26959,13 +26959,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.038330078125, "learning_rate": 0.000874799563875962, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 4575563.0, "repeat_count": 0.0, - "routers_loss": 0.00791149027645588, + "routers_loss": 0.007781553082168102, "skip_count": 1.0, "step": 2838, "text_loss": 0.19239822030067444 @@ -26978,13 +26978,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0341796875, + "grad_norm": 0.03515625, "learning_rate": 0.0008745946255403021, "loss": 0.0072, "macro_f1": 0.5492662787437439, "num_tokens": 4578117.0, "repeat_count": 0.0, - "routers_loss": 0.016813624650239944, + "routers_loss": 0.01872488670051098, "skip_count": 2.0, "step": 2840, "text_loss": 0.2148810178041458 @@ -26999,11 +26999,11 @@ "f1_skip": 1.0, "grad_norm": 0.04296875, "learning_rate": 0.0008743895436592749, - "loss": 0.0079, + "loss": 0.0078, "macro_f1": 1.0, "num_tokens": 4582330.0, "repeat_count": 1.0, - "routers_loss": 0.004429332446306944, + "routers_loss": 0.005634195636957884, "skip_count": 1.0, "step": 2842, "text_loss": 0.4929640591144562 @@ -27016,13 +27016,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.048583984375, "learning_rate": 0.0008741843183114685, - "loss": 0.0084, + "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4585765.0, "repeat_count": 0.0, - "routers_loss": 0.0007147722644731402, + "routers_loss": 0.0008928569150157273, "skip_count": 0.0, "step": 2844, "text_loss": 0.32702967524528503 @@ -27035,13 +27035,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.044189453125, + "grad_norm": 0.0439453125, "learning_rate": 0.0008739789495755253, - "loss": 0.0092, + "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 4589000.0, "repeat_count": 0.0, - "routers_loss": 0.015438012778759003, + "routers_loss": 0.014715569093823433, "skip_count": 4.0, "step": 2846, "text_loss": 0.25125816464424133 @@ -27054,13 +27054,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.049560546875, "learning_rate": 0.0008737734375301433, - "loss": 0.0138, + "loss": 0.0135, "macro_f1": 0.3333333432674408, "num_tokens": 4592391.0, "repeat_count": 0.0, - "routers_loss": 0.0015892626252025366, + "routers_loss": 0.0017551190685480833, "skip_count": 0.0, "step": 2848, "text_loss": 0.6595172882080078 @@ -27073,13 +27073,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02734375, + "grad_norm": 0.027099609375, "learning_rate": 0.0008735677822540749, - "loss": 0.0086, + "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 4596662.0, "repeat_count": 0.0, - "routers_loss": 0.0006934175617061555, + "routers_loss": 0.0006456313421949744, "skip_count": 0.0, "step": 2850, "text_loss": 0.6290773153305054 @@ -27092,13 +27092,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.036865234375, "learning_rate": 0.0008733619838261276, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 4599682.0, "repeat_count": 0.0, - "routers_loss": 0.006811433006078005, + "routers_loss": 0.00765060493722558, "skip_count": 2.0, "step": 2852, "text_loss": 0.3268161416053772 @@ -27111,13 +27111,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.044921875, + "grad_norm": 0.041015625, "learning_rate": 0.0008731560423251637, - "loss": 0.0104, + "loss": 0.01, "macro_f1": 1.0, "num_tokens": 4603324.0, "repeat_count": 1.0, - "routers_loss": 0.012574959546327591, + "routers_loss": 0.01161442045122385, "skip_count": 2.0, "step": 2854, "text_loss": 0.3029932975769043 @@ -27130,13 +27130,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.888888955116272, - "grad_norm": 0.038818359375, + "grad_norm": 0.0419921875, "learning_rate": 0.0008729499578301005, "loss": 0.0098, "macro_f1": 0.9555556178092957, "num_tokens": 4606975.0, "repeat_count": 1.0, - "routers_loss": 0.01913273334503174, + "routers_loss": 0.02055389992892742, "skip_count": 5.0, "step": 2856, "text_loss": 0.6268532872200012 @@ -27149,13 +27149,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.05078125, "learning_rate": 0.00087274373041991, - "loss": 0.0082, + "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 4609629.0, "repeat_count": 0.0, - "routers_loss": 0.0012737065553665161, + "routers_loss": 0.0013911726418882608, "skip_count": 0.0, "step": 2858, "text_loss": 0.534355640411377 @@ -27168,13 +27168,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.053955078125, "learning_rate": 0.0008725373601736188, - "loss": 0.0079, + "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 4612913.0, "repeat_count": 2.0, - "routers_loss": 0.009088932536542416, + "routers_loss": 0.01010701060295105, "skip_count": 0.0, "step": 2860, "text_loss": 0.3391380310058594 @@ -27187,13 +27187,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0260009765625, + "grad_norm": 0.0255126953125, "learning_rate": 0.0008723308471703085, - "loss": 0.0078, + "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 4616718.0, "repeat_count": 0.0, - "routers_loss": 0.006364458240568638, + "routers_loss": 0.005969462916254997, "skip_count": 1.0, "step": 2862, "text_loss": 0.47250816226005554 @@ -27206,13 +27206,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047607421875, + "grad_norm": 0.046630859375, "learning_rate": 0.0008721241914891152, - "loss": 0.0084, + "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 4619680.0, "repeat_count": 0.0, - "routers_loss": 0.002686808817088604, + "routers_loss": 0.0027780034579336643, "skip_count": 0.0, "step": 2864, "text_loss": 0.3249278664588928 @@ -27225,13 +27225,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.0439453125, "learning_rate": 0.0008719173932092295, - "loss": 0.0047, + "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 4622700.0, "repeat_count": 0.0, - "routers_loss": 0.0018892486114054918, + "routers_loss": 0.0015912104863673449, "skip_count": 0.0, "step": 2866, "text_loss": 0.7789985537528992 @@ -27244,13 +27244,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.049072265625, + "grad_norm": 0.05126953125, "learning_rate": 0.0008717104524098973, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 4626637.0, "repeat_count": 0.0, - "routers_loss": 0.0035258810967206955, + "routers_loss": 0.0036539011634886265, "skip_count": 0.0, "step": 2868, "text_loss": 0.619088351726532 @@ -27263,13 +27263,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.103515625, + "grad_norm": 0.10400390625, "learning_rate": 0.0008715033691704187, - "loss": 0.0121, + "loss": 0.0118, "macro_f1": 0.6666666865348816, "num_tokens": 4629863.0, "repeat_count": 0.0, - "routers_loss": 0.007305602077394724, + "routers_loss": 0.008402476087212563, "skip_count": 1.0, "step": 2870, "text_loss": 0.5550018548965454 @@ -27282,13 +27282,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.06298828125, "learning_rate": 0.0008712961435701479, - "loss": 0.0162, + "loss": 0.0161, "macro_f1": 0.6666666865348816, "num_tokens": 4632657.0, "repeat_count": 0.0, - "routers_loss": 0.012898211367428303, + "routers_loss": 0.01400839351117611, "skip_count": 1.0, "step": 2872, "text_loss": 0.17368625104427338 @@ -27301,13 +27301,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.0419921875, "learning_rate": 0.0008710887756884947, - "loss": 0.0088, + "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 4635885.0, "repeat_count": 0.0, - "routers_loss": 0.0013437134912237525, + "routers_loss": 0.0014573842054232955, "skip_count": 0.0, "step": 2874, "text_loss": 0.5138643383979797 @@ -27320,13 +27320,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.033447265625, "learning_rate": 0.0008708812656049225, - "loss": 0.0091, + "loss": 0.009, "macro_f1": 0.6666666865348816, "num_tokens": 4639341.0, "repeat_count": 0.0, - "routers_loss": 0.002090727211907506, + "routers_loss": 0.002810224425047636, "skip_count": 1.0, "step": 2876, "text_loss": 0.70310378074646 @@ -27341,11 +27341,11 @@ "f1_skip": 0.8571428656578064, "grad_norm": 0.03564453125, "learning_rate": 0.0008706736133989497, - "loss": 0.0107, + "loss": 0.0105, "macro_f1": 0.9449735879898071, "num_tokens": 4642163.0, "repeat_count": 2.0, - "routers_loss": 0.030176319181919098, + "routers_loss": 0.029783209785819054, "skip_count": 4.0, "step": 2878, "text_loss": 0.26898008584976196 @@ -27358,13 +27358,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.04150390625, "learning_rate": 0.0008704658191501491, - "loss": 0.0091, + "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4645858.0, "repeat_count": 0.0, - "routers_loss": 0.0009633690933696926, + "routers_loss": 0.0009193966398015618, "skip_count": 0.0, "step": 2880, "text_loss": 0.6047570705413818 @@ -27377,13 +27377,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.060302734375, + "grad_norm": 0.05908203125, "learning_rate": 0.0008702578829381475, "loss": 0.0131, "macro_f1": 0.8814815282821655, "num_tokens": 4649237.0, "repeat_count": 2.0, - "routers_loss": 0.0568491593003273, + "routers_loss": 0.05698608607053757, "skip_count": 4.0, "step": 2882, "text_loss": 0.10695219784975052 @@ -27396,13 +27396,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0306396484375, + "grad_norm": 0.0311279296875, "learning_rate": 0.0008700498048426269, - "loss": 0.0082, + "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 4652362.0, "repeat_count": 0.0, - "routers_loss": 0.0012279651127755642, + "routers_loss": 0.0011786938412114978, "skip_count": 0.0, "step": 2884, "text_loss": 0.4442957937717438 @@ -27415,13 +27415,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.046142578125, "learning_rate": 0.0008698415849433229, - "loss": 0.0097, + "loss": 0.0092, "macro_f1": 0.5492662787437439, "num_tokens": 4655616.0, "repeat_count": 2.0, - "routers_loss": 0.02166076935827732, + "routers_loss": 0.02142646163702011, "skip_count": 0.0, "step": 2886, "text_loss": 0.5820964574813843 @@ -27434,13 +27434,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.043212890625, "learning_rate": 0.0008696332233200262, - "loss": 0.012, + "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 4659294.0, "repeat_count": 0.0, - "routers_loss": 0.003944257274270058, + "routers_loss": 0.004038636106997728, "skip_count": 0.0, "step": 2888, "text_loss": 0.11847645789384842 @@ -27453,13 +27453,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.0478515625, "learning_rate": 0.0008694247200525806, - "loss": 0.0092, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4662512.0, "repeat_count": 0.0, - "routers_loss": 0.0013393335975706577, + "routers_loss": 0.0013256469974294305, "skip_count": 0.0, "step": 2890, "text_loss": 0.4873582720756531 @@ -27472,13 +27472,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.039306640625, "learning_rate": 0.0008692160752208856, - "loss": 0.0128, + "loss": 0.0129, "macro_f1": 0.3272727429866791, "num_tokens": 4666190.0, "repeat_count": 0.0, - "routers_loss": 0.0443510003387928, + "routers_loss": 0.04477972164750099, "skip_count": 1.0, "step": 2892, "text_loss": 0.44243401288986206 @@ -27491,13 +27491,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.083984375, + "grad_norm": 0.09521484375, "learning_rate": 0.0008690072889048941, - "loss": 0.0125, + "loss": 0.0127, "macro_f1": 1.0, "num_tokens": 4668884.0, "repeat_count": 1.0, - "routers_loss": 0.0047337980940938, + "routers_loss": 0.004407547414302826, "skip_count": 2.0, "step": 2894, "text_loss": 0.6847127079963684 @@ -27510,13 +27510,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.041015625, + "grad_norm": 0.04052734375, "learning_rate": 0.0008687983611846133, - "loss": 0.0082, + "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 4672093.0, "repeat_count": 0.0, - "routers_loss": 0.0055244253017008305, + "routers_loss": 0.005245382897555828, "skip_count": 1.0, "step": 2896, "text_loss": 0.25583332777023315 @@ -27529,13 +27529,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.0458984375, "learning_rate": 0.0008685892921401049, - "loss": 0.011, + "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 4674917.0, "repeat_count": 0.0, - "routers_loss": 0.001250729663297534, + "routers_loss": 0.0010470855049788952, "skip_count": 0.0, "step": 2898, "text_loss": 0.41998377442359924 @@ -27548,13 +27548,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.037841796875, "learning_rate": 0.0008683800818514844, - "loss": 0.0061, + "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4677739.0, "repeat_count": 0.0, - "routers_loss": 0.00974183902144432, + "routers_loss": 0.009026622399687767, "skip_count": 2.0, "step": 2900, "text_loss": 0.303053081035614 @@ -27567,13 +27567,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.078125, + "grad_norm": 0.09619140625, "learning_rate": 0.0008681707303989215, - "loss": 0.0111, + "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 4680721.0, "repeat_count": 0.0, - "routers_loss": 0.004882345907390118, + "routers_loss": 0.004500916693359613, "skip_count": 0.0, "step": 2902, "text_loss": 0.5573288798332214 @@ -27586,13 +27586,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.06982421875, "learning_rate": 0.0008679612378626404, "loss": 0.0098, "macro_f1": 0.6666666865348816, "num_tokens": 4683339.0, "repeat_count": 0.0, - "routers_loss": 0.00568242697045207, + "routers_loss": 0.005047840531915426, "skip_count": 1.0, "step": 2904, "text_loss": 0.321353554725647 @@ -27605,13 +27605,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0306396484375, + "grad_norm": 0.03271484375, "learning_rate": 0.0008677516043229187, - "loss": 0.0082, + "loss": 0.0083, "macro_f1": 0.3272727429866791, "num_tokens": 4686453.0, "repeat_count": 0.0, - "routers_loss": 0.010831202380359173, + "routers_loss": 0.010256914421916008, "skip_count": 1.0, "step": 2906, "text_loss": 0.4300784468650818 @@ -27624,13 +27624,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.05615234375, + "grad_norm": 0.05029296875, "learning_rate": 0.0008675418298600883, - "loss": 0.0087, + "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 4689645.0, "repeat_count": 1.0, - "routers_loss": 0.00235295994207263, + "routers_loss": 0.0022669637110084295, "skip_count": 0.0, "step": 2908, "text_loss": 0.5064885020256042 @@ -27643,13 +27643,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.048828125, "learning_rate": 0.0008673319145545358, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4692320.0, "repeat_count": 0.0, - "routers_loss": 0.0011642680037766695, + "routers_loss": 0.0011188550852239132, "skip_count": 0.0, "step": 2910, "text_loss": 0.7114819884300232 @@ -27662,13 +27662,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.03369140625, "learning_rate": 0.0008671218584867003, - "loss": 0.0104, + "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 4695116.0, "repeat_count": 0.0, - "routers_loss": 0.00278888875618577, + "routers_loss": 0.002966561820358038, "skip_count": 2.0, "step": 2912, "text_loss": 0.5662392973899841 @@ -27681,13 +27681,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.049560546875, + "grad_norm": 0.047607421875, "learning_rate": 0.0008669116617370762, - "loss": 0.008, + "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4698040.0, "repeat_count": 0.0, - "routers_loss": 0.0014630162622779608, + "routers_loss": 0.0012894890969619155, "skip_count": 0.0, "step": 2914, "text_loss": 0.718977689743042 @@ -27700,13 +27700,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.1552734375, "learning_rate": 0.0008667013243862111, - "loss": 0.0159, + "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 4700963.0, "repeat_count": 0.0, - "routers_loss": 0.0011393720051273704, + "routers_loss": 0.0007232456118799746, "skip_count": 0.0, "step": 2916, "text_loss": 0.3447718024253845 @@ -27719,13 +27719,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02978515625, + "grad_norm": 0.0289306640625, "learning_rate": 0.000866490846514707, - "loss": 0.0072, + "loss": 0.0075, "macro_f1": 0.3272727429866791, "num_tokens": 4704471.0, "repeat_count": 1.0, - "routers_loss": 0.014218449592590332, + "routers_loss": 0.015166680328547955, "skip_count": 0.0, "step": 2918, "text_loss": 0.454946368932724 @@ -27738,13 +27738,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.052978515625, + "grad_norm": 0.04736328125, "learning_rate": 0.000866280228203219, "loss": 0.0073, "macro_f1": 1.0, "num_tokens": 4707238.0, "repeat_count": 1.0, - "routers_loss": 0.005367610137909651, + "routers_loss": 0.0061312485486269, "skip_count": 1.0, "step": 2920, "text_loss": 0.721788227558136 @@ -27757,13 +27757,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048828125, + "grad_norm": 0.055908203125, "learning_rate": 0.0008660694695324564, - "loss": 0.0124, + "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 4711323.0, "repeat_count": 0.0, - "routers_loss": 0.0020303199999034405, + "routers_loss": 0.00169933564029634, "skip_count": 0.0, "step": 2922, "text_loss": 0.7562121748924255 @@ -27776,13 +27776,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06201171875, + "grad_norm": 0.0654296875, "learning_rate": 0.0008658585705831829, - "loss": 0.0123, + "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 4714417.0, "repeat_count": 0.0, - "routers_loss": 0.0022230520844459534, + "routers_loss": 0.0022731393110007048, "skip_count": 0.0, "step": 2924, "text_loss": 0.5726147890090942 @@ -27795,13 +27795,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.06787109375, + "grad_norm": 0.068359375, "learning_rate": 0.0008656475314362148, - "loss": 0.0133, + "loss": 0.0131, "macro_f1": 0.8817967176437378, "num_tokens": 4717445.0, "repeat_count": 2.0, - "routers_loss": 0.06414645165205002, + "routers_loss": 0.06477782875299454, "skip_count": 3.0, "step": 2926, "text_loss": 0.4505867660045624 @@ -27814,13 +27814,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.0625, + "grad_norm": 0.06396484375, "learning_rate": 0.0008654363521724229, - "loss": 0.0128, + "loss": 0.0129, "macro_f1": 0.9449735879898071, "num_tokens": 4722253.0, "repeat_count": 2.0, - "routers_loss": 0.022727061063051224, + "routers_loss": 0.027405790984630585, "skip_count": 4.0, "step": 2928, "text_loss": 0.24767601490020752 @@ -27833,13 +27833,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.0537109375, "learning_rate": 0.0008652250328727315, - "loss": 0.0114, + "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 4725465.0, "repeat_count": 0.0, - "routers_loss": 0.006181784905493259, + "routers_loss": 0.006544729229062796, "skip_count": 2.0, "step": 2930, "text_loss": 0.4478724002838135 @@ -27852,13 +27852,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.044921875, + "grad_norm": 0.0517578125, "learning_rate": 0.0008650135736181184, - "loss": 0.0133, + "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 4729213.0, "repeat_count": 1.0, - "routers_loss": 0.005527070257812738, + "routers_loss": 0.0055119614116847515, "skip_count": 0.0, "step": 2932, "text_loss": 0.6749323010444641 @@ -27871,13 +27871,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05517578125, + "grad_norm": 0.045166015625, "learning_rate": 0.0008648019744896154, - "loss": 0.0102, + "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 4732280.0, "repeat_count": 0.0, - "routers_loss": 0.008868738077580929, + "routers_loss": 0.008374541997909546, "skip_count": 0.0, "step": 2934, "text_loss": 0.4647359251976013 @@ -27890,13 +27890,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.057373046875, + "grad_norm": 0.06201171875, "learning_rate": 0.0008645902355683077, - "loss": 0.0089, + "loss": 0.0091, "macro_f1": 0.6595745086669922, "num_tokens": 4736244.0, "repeat_count": 1.0, - "routers_loss": 0.07285884022712708, + "routers_loss": 0.068686343729496, "skip_count": 4.0, "step": 2936, "text_loss": 0.5356017351150513 @@ -27909,13 +27909,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.043212890625, + "grad_norm": 0.042236328125, "learning_rate": 0.0008643783569353339, - "loss": 0.0072, + "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 4739810.0, "repeat_count": 2.0, - "routers_loss": 0.019306030124425888, + "routers_loss": 0.017954571172595024, "skip_count": 0.0, "step": 2938, "text_loss": 0.3145926296710968 @@ -27928,13 +27928,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.054443359375, "learning_rate": 0.0008641663386718863, - "loss": 0.0084, + "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4742720.0, "repeat_count": 0.0, - "routers_loss": 0.00626454409211874, + "routers_loss": 0.006261351052671671, "skip_count": 1.0, "step": 2940, "text_loss": 0.3200613856315613 @@ -27949,11 +27949,11 @@ "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0008639541808592109, - "loss": 0.0091, + "loss": 0.0093, "macro_f1": 1.0, "num_tokens": 4745870.0, "repeat_count": 1.0, - "routers_loss": 0.0019172134343534708, + "routers_loss": 0.0025341357104480267, "skip_count": 1.0, "step": 2942, "text_loss": 0.5020416378974915 @@ -27968,11 +27968,11 @@ "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.0008637418835786067, - "loss": 0.0095, + "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 4748943.0, "repeat_count": 0.0, - "routers_loss": 0.009745351038873196, + "routers_loss": 0.008970048278570175, "skip_count": 2.0, "step": 2944, "text_loss": 0.14517110586166382 @@ -27985,13 +27985,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.043701171875, + "grad_norm": 0.055908203125, "learning_rate": 0.0008635294469114265, - "loss": 0.011, + "loss": 0.0112, "macro_f1": 0.3333333432674408, "num_tokens": 4751360.0, "repeat_count": 0.0, - "routers_loss": 0.0020624736789613962, + "routers_loss": 0.002133632078766823, "skip_count": 0.0, "step": 2946, "text_loss": 0.5367856025695801 @@ -28004,13 +28004,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.091796875, + "grad_norm": 0.08837890625, "learning_rate": 0.0008633168709390766, - "loss": 0.0118, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 4754403.0, "repeat_count": 0.0, - "routers_loss": 0.001082106726244092, + "routers_loss": 0.0011866620043292642, "skip_count": 0.0, "step": 2948, "text_loss": 0.38302522897720337 @@ -28023,13 +28023,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.037109375, "learning_rate": 0.0008631041557430163, - "loss": 0.0061, + "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 4757867.0, "repeat_count": 2.0, - "routers_loss": 0.0026527612935751677, + "routers_loss": 0.0026854004245251417, "skip_count": 0.0, "step": 2950, "text_loss": 0.43433454632759094 @@ -28042,13 +28042,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.05859375, "learning_rate": 0.0008628913014047585, "loss": 0.0102, "macro_f1": 0.3333333432674408, "num_tokens": 4761171.0, "repeat_count": 0.0, - "routers_loss": 0.0027245471719652414, + "routers_loss": 0.002433479530736804, "skip_count": 0.0, "step": 2952, "text_loss": 0.4725971519947052 @@ -28061,13 +28061,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0286865234375, + "grad_norm": 0.028564453125, "learning_rate": 0.0008626783080058696, - "loss": 0.0065, + "loss": 0.0066, "macro_f1": 0.3272727429866791, "num_tokens": 4764752.0, "repeat_count": 1.0, - "routers_loss": 0.01764744706451893, + "routers_loss": 0.017182493582367897, "skip_count": 0.0, "step": 2954, "text_loss": 0.460641473531723 @@ -28080,13 +28080,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0859375, + "grad_norm": 0.12353515625, "learning_rate": 0.0008624651756279687, - "loss": 0.0196, + "loss": 0.0198, "macro_f1": 0.3333333432674408, "num_tokens": 4767453.0, "repeat_count": 0.0, - "routers_loss": 0.0019560824148356915, + "routers_loss": 0.0018134774873033166, "skip_count": 0.0, "step": 2956, "text_loss": 0.4091459810733795 @@ -28099,13 +28099,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, - "grad_norm": 0.051025390625, + "grad_norm": 0.053466796875, "learning_rate": 0.000862251904352729, "loss": 0.0108, "macro_f1": 0.9259259104728699, "num_tokens": 4771110.0, "repeat_count": 3.0, - "routers_loss": 0.03031078353524208, + "routers_loss": 0.0365753099322319, "skip_count": 3.0, "step": 2958, "text_loss": 0.22408585250377655 @@ -28118,13 +28118,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05224609375, + "grad_norm": 0.05029296875, "learning_rate": 0.000862038494261876, "loss": 0.0109, "macro_f1": 0.3272727429866791, "num_tokens": 4774464.0, "repeat_count": 0.0, - "routers_loss": 0.024790454655885696, + "routers_loss": 0.024343067780137062, "skip_count": 1.0, "step": 2960, "text_loss": 0.16483014822006226 @@ -28137,13 +28137,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052490234375, + "grad_norm": 0.0654296875, "learning_rate": 0.0008618249454371891, - "loss": 0.0099, + "loss": 0.01, "macro_f1": 0.3333333432674408, "num_tokens": 4777894.0, "repeat_count": 0.0, - "routers_loss": 0.0008704765350557864, + "routers_loss": 0.0008310087723657489, "skip_count": 0.0, "step": 2962, "text_loss": 0.5573428869247437 @@ -28156,13 +28156,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.043212890625, "learning_rate": 0.0008616112579605006, - "loss": 0.0116, + "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 4781116.0, "repeat_count": 0.0, - "routers_loss": 0.0066874073818326, + "routers_loss": 0.0065494864247739315, "skip_count": 0.0, "step": 2964, "text_loss": 0.18816794455051422 @@ -28175,13 +28175,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.04248046875, "learning_rate": 0.0008613974319136957, - "loss": 0.0091, + "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 4784886.0, "repeat_count": 0.0, - "routers_loss": 0.0021798228845000267, + "routers_loss": 0.0019726944155991077, "skip_count": 0.0, "step": 2966, "text_loss": 0.5097305774688721 @@ -28194,13 +28194,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.076171875, + "grad_norm": 0.0849609375, "learning_rate": 0.0008611834673787134, "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 4787563.0, "repeat_count": 0.0, - "routers_loss": 0.0063707553781569, + "routers_loss": 0.006327496841549873, "skip_count": 0.0, "step": 2968, "text_loss": 0.6953814029693604 @@ -28213,13 +28213,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.5, "f1_skip": 1.0, - "grad_norm": 0.0595703125, + "grad_norm": 0.056884765625, "learning_rate": 0.0008609693644375449, - "loss": 0.0088, + "loss": 0.0086, "macro_f1": 0.8200000524520874, "num_tokens": 4790421.0, "repeat_count": 3.0, - "routers_loss": 0.044509731233119965, + "routers_loss": 0.042896661907434464, "skip_count": 1.0, "step": 2970, "text_loss": 0.2573051154613495 @@ -28227,18 +28227,18 @@ { "acc_repeat": 1.0, "acc_skip": 1.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 13.953331376577633, - "f1_execute": 0.9795917868614197, + "f1_execute": 1.0, "f1_repeat": 1.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.1640625, + "f1_skip": 1.0, + "grad_norm": 0.14453125, "learning_rate": 0.000860755123172235, - "loss": 0.01, - "macro_f1": 0.8820862174034119, + "loss": 0.0096, + "macro_f1": 1.0, "num_tokens": 4793786.0, "repeat_count": 2.0, - "routers_loss": 0.01667599380016327, + "routers_loss": 0.013228793628513813, "skip_count": 1.0, "step": 2972, "text_loss": 0.46614497900009155 @@ -28251,13 +28251,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0274658203125, + "grad_norm": 0.0296630859375, "learning_rate": 0.0008605407436648815, - "loss": 0.0069, + "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 4796864.0, "repeat_count": 0.0, - "routers_loss": 0.008433761075139046, + "routers_loss": 0.007294759154319763, "skip_count": 2.0, "step": 2974, "text_loss": 0.21555091440677643 @@ -28270,13 +28270,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.057861328125, "learning_rate": 0.0008603262259976348, - "loss": 0.0131, + "loss": 0.0129, "macro_f1": 1.0, "num_tokens": 4800080.0, "repeat_count": 1.0, - "routers_loss": 0.002439796691760421, + "routers_loss": 0.0024024227168411016, "skip_count": 5.0, "step": 2976, "text_loss": 0.7855485081672668 @@ -28289,13 +28289,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05126953125, + "grad_norm": 0.07666015625, "learning_rate": 0.0008601115702526987, - "loss": 0.0112, + "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4802899.0, "repeat_count": 0.0, - "routers_loss": 0.0015027766348794103, + "routers_loss": 0.001433031284250319, "skip_count": 0.0, "step": 2978, "text_loss": 0.6777765154838562 @@ -28308,13 +28308,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06103515625, + "grad_norm": 0.04931640625, "learning_rate": 0.0008598967765123293, - "loss": 0.0091, + "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 4805835.0, "repeat_count": 0.0, - "routers_loss": 0.003235677955672145, + "routers_loss": 0.003073975909501314, "skip_count": 0.0, "step": 2980, "text_loss": 0.5926910638809204 @@ -28322,18 +28322,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.5, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 14.0, - "f1_execute": 0.9090908765792847, - "f1_repeat": 0.6666666865348816, + "f1_execute": 0.9333333373069763, + "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.052734375, + "grad_norm": 0.05322265625, "learning_rate": 0.0008596818448588364, - "loss": 0.0141, - "macro_f1": 0.7474747896194458, + "loss": 0.0139, + "macro_f1": 0.8666667342185974, "num_tokens": 4809028.0, "repeat_count": 1.0, - "routers_loss": 0.063179150223732, + "routers_loss": 0.06438573449850082, "skip_count": 6.0, "step": 2982, "text_loss": 0.23975612223148346 @@ -28346,13 +28346,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0299072265625, + "grad_norm": 0.0302734375, "learning_rate": 0.0008594667753745821, - "loss": 0.0055, + "loss": 0.0054, "macro_f1": 0.3272727429866791, "num_tokens": 4812831.0, "repeat_count": 0.0, - "routers_loss": 0.015444152988493443, + "routers_loss": 0.014817612245678902, "skip_count": 1.0, "step": 2984, "text_loss": 0.17292268574237823 @@ -28365,13 +28365,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.060546875, + "grad_norm": 0.07421875, "learning_rate": 0.0008592515681419813, - "loss": 0.0079, + "loss": 0.0078, "macro_f1": 0.5492662787437439, "num_tokens": 4816005.0, "repeat_count": 2.0, - "routers_loss": 0.02485196851193905, + "routers_loss": 0.025407327339053154, "skip_count": 0.0, "step": 2986, "text_loss": 0.6403061151504517 @@ -28384,13 +28384,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04443359375, + "grad_norm": 0.0615234375, "learning_rate": 0.0008590362232435018, - "loss": 0.0102, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4818901.0, "repeat_count": 0.0, - "routers_loss": 0.006175600457936525, + "routers_loss": 0.006826757453382015, "skip_count": 0.0, "step": 2988, "text_loss": 0.2572069466114044 @@ -28403,13 +28403,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041748046875, + "grad_norm": 0.04052734375, "learning_rate": 0.0008588207407616644, - "loss": 0.0085, + "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 4823120.0, "repeat_count": 0.0, - "routers_loss": 0.0008576468680985272, + "routers_loss": 0.0009054148104041815, "skip_count": 0.0, "step": 2990, "text_loss": 0.4827076196670532 @@ -28422,13 +28422,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02392578125, + "grad_norm": 0.0247802734375, "learning_rate": 0.0008586051207790422, - "loss": 0.0059, + "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 4825774.0, "repeat_count": 0.0, - "routers_loss": 0.0011548360344022512, + "routers_loss": 0.0012294676853343844, "skip_count": 0.0, "step": 2992, "text_loss": 0.40157821774482727 @@ -28441,13 +28441,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.056396484375, + "grad_norm": 0.052734375, "learning_rate": 0.0008583893633782612, - "loss": 0.0085, + "loss": 0.0084, "macro_f1": 0.5492662787437439, "num_tokens": 4828841.0, "repeat_count": 0.0, - "routers_loss": 0.01307896338403225, + "routers_loss": 0.011474622413516045, "skip_count": 2.0, "step": 2994, "text_loss": 0.14842072129249573 @@ -28460,13 +28460,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0615234375, + "grad_norm": 0.058837890625, "learning_rate": 0.0008581734686419999, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4831458.0, "repeat_count": 0.0, - "routers_loss": 0.009716883301734924, + "routers_loss": 0.009154081344604492, "skip_count": 2.0, "step": 2996, "text_loss": 0.365400105714798 @@ -28479,13 +28479,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031494140625, + "grad_norm": 0.031982421875, "learning_rate": 0.00085795743665299, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4834609.0, "repeat_count": 0.0, - "routers_loss": 0.0026114562060683966, + "routers_loss": 0.002899336162954569, "skip_count": 0.0, "step": 2998, "text_loss": 0.5574684143066406 @@ -28498,13 +28498,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052001953125, + "grad_norm": 0.0517578125, "learning_rate": 0.0008577412674940152, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4838324.0, "repeat_count": 0.0, - "routers_loss": 0.003787368768826127, + "routers_loss": 0.0034664268605411053, "skip_count": 0.0, "step": 3000, "text_loss": 0.6752855777740479 diff --git a/checkpoint-3000/training_args.bin b/checkpoint-3000/training_args.bin index deeea733277b4031781a5b299881dd8e675e7606..a3d3ae372faf14539639f54454aa52b6ee730c4a 100644 --- a/checkpoint-3000/training_args.bin +++ b/checkpoint-3000/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b3f5975f57762b552c7ee29776bf32a4dbb125781a0658488d3884fb25c5296 +oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8 size 5880 diff --git a/checkpoint-4000/model-00002-of-00002.safetensors b/checkpoint-4000/model-00002-of-00002.safetensors index 538ae52a52776454580f9817a68b94d1d18e395f..3d224309e0a868d86d141bce673aeb8ef8112f3d 100644 --- a/checkpoint-4000/model-00002-of-00002.safetensors +++ b/checkpoint-4000/model-00002-of-00002.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f665e64ef533a14606501a6daab770caac72a570b75afcb29c6765710e6f735 +oid sha256:1cc14c8bc6b81f0324d9ed1ed6b83526c997381eecbc191424a4bcf38ef3bbc2 size 1481790520 diff --git a/checkpoint-4000/optimizer.pt b/checkpoint-4000/optimizer.pt index 4a0a5e373e2f7ccaf38f047fc6ae528ff7d3d9a8..40f996bb17bb179c7ce09890432de5718894f71f 100644 --- a/checkpoint-4000/optimizer.pt +++ b/checkpoint-4000/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32422466c15a2f6655c13d290e30cb4d1f0c57cb2c65fc6c873e0977bd171463 +oid sha256:938be4a6076a4306441bcc5f97aef5747a2cae3994dc47c31ec908bf5cfe80fc size 44191162 diff --git a/checkpoint-4000/trainer_state.json b/checkpoint-4000/trainer_state.json index 556d13e8db3354de1db66eb08234bff04bd4ce19..6780c28acf81cf1fb2f2ab9f6c72f85ed4db76c4 100644 --- a/checkpoint-4000/trainer_state.json +++ b/checkpoint-4000/trainer_state.json @@ -12,18 +12,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 31.0, + "avg_layers": 25.0, "epoch": 0.009392427355444672, - "f1_execute": 0.4864864945411682, + "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 2.40625, + "grad_norm": 2.25, "learning_rate": 2e-06, - "loss": 0.5484, - "macro_f1": 0.1621621698141098, + "loss": 0.4974, + "macro_f1": 0.23255813121795654, "num_tokens": 3175.0, "repeat_count": 0.0, - "routers_loss": 0.503563642501831, + "routers_loss": 0.4339469373226166, "skip_count": 0.0, "step": 2, "text_loss": 0.3330848515033722 @@ -31,18 +31,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 23.0, "epoch": 0.018784854710889344, - "f1_execute": 0.4864864945411682, + "f1_execute": 0.7272726893424988, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.9140625, + "grad_norm": 1.8359375, "learning_rate": 6e-06, - "loss": 0.536, - "macro_f1": 0.1621621698141098, + "loss": 0.4988, + "macro_f1": 0.24242423474788666, "num_tokens": 5816.0, "repeat_count": 0.0, - "routers_loss": 0.4589468538761139, + "routers_loss": 0.4511934816837311, "skip_count": 1.0, "step": 4, "text_loss": 0.4571273922920227 @@ -50,37 +50,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 32.0, + "avg_layers": 28.0, "epoch": 0.02817728206633402, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 2.375, + "grad_norm": 2.234375, "learning_rate": 1e-05, - "loss": 0.5469, - "macro_f1": 0.19999998807907104, + "loss": 0.5113, + "macro_f1": 0.222222238779068, "num_tokens": 9739.0, "repeat_count": 0.0, - "routers_loss": 0.5736724138259888, + "routers_loss": 0.49306994676589966, "skip_count": 0.0, "step": 6, "text_loss": 0.41060560941696167 }, { - "acc_repeat": 1.0, - "acc_skip": 0.5, - "avg_layers": 33.0, + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 0.03756970942177869, - "f1_execute": 0.47058823704719543, - "f1_repeat": 0.1538461595773697, - "f1_skip": 0.222222238779068, - "grad_norm": 1.8515625, + "f1_execute": 0.5641025900840759, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7265625, "learning_rate": 1.4e-05, - "loss": 0.5291, - "macro_f1": 0.28221890330314636, + "loss": 0.4766, + "macro_f1": 0.18803420662879944, "num_tokens": 12869.0, "repeat_count": 1.0, - "routers_loss": 0.49970296025276184, + "routers_loss": 0.48872503638267517, "skip_count": 2.0, "step": 8, "text_loss": 0.36678561568260193 @@ -88,37 +88,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 27.0, "epoch": 0.046962136777223364, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.953125, + "grad_norm": 1.78125, "learning_rate": 1.8e-05, - "loss": 0.5316, - "macro_f1": 0.19999998807907104, + "loss": 0.4806, + "macro_f1": 0.23255813121795654, "num_tokens": 15845.0, "repeat_count": 0.0, - "routers_loss": 0.5153562426567078, + "routers_loss": 0.45077216625213623, "skip_count": 0.0, "step": 10, "text_loss": 0.5597779154777527 }, { - "acc_repeat": 0.0, + "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, - "avg_layers": 34.0, + "avg_layers": 26.0, "epoch": 0.05635456413266804, - "f1_execute": 0.5714285373687744, - "f1_repeat": 0.0, - "f1_skip": 0.25, - "grad_norm": 1.6328125, + "f1_execute": 0.7179487347602844, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.20000000298023224, + "grad_norm": 1.5390625, "learning_rate": 2.2e-05, - "loss": 0.5051, - "macro_f1": 0.2738095223903656, + "loss": 0.4557, + "macro_f1": 0.40122103691101074, "num_tokens": 19353.0, "repeat_count": 2.0, - "routers_loss": 0.46214747428894043, + "routers_loss": 0.4130440056324005, "skip_count": 3.0, "step": 12, "text_loss": 0.2056603729724884 @@ -126,37 +126,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 27.0, "epoch": 0.06574699148811271, - "f1_execute": 0.5263157486915588, + "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 2.671875, + "grad_norm": 2.4375, "learning_rate": 2.6e-05, - "loss": 0.5653, - "macro_f1": 0.17543858289718628, + "loss": 0.5129, + "macro_f1": 0.23255813121795654, "num_tokens": 22675.0, "repeat_count": 0.0, - "routers_loss": 0.5300976634025574, + "routers_loss": 0.4582902193069458, "skip_count": 0.0, "step": 14, "text_loss": 0.32989829778671265 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 34.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 0.07513941884355738, - "f1_execute": 0.6153846383094788, + "f1_execute": 0.6829268336296082, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 1.8828125, + "f1_skip": 0.2222222238779068, + "grad_norm": 1.7421875, "learning_rate": 3e-05, - "loss": 0.5225, - "macro_f1": 0.20512822270393372, + "loss": 0.4729, + "macro_f1": 0.3017163574695587, "num_tokens": 26022.0, "repeat_count": 0.0, - "routers_loss": 0.473240464925766, + "routers_loss": 0.42910993099212646, "skip_count": 1.0, "step": 16, "text_loss": 0.1353905349969864 @@ -164,18 +164,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 38.0, + "avg_layers": 27.0, "epoch": 0.08453184619900206, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.7555555105209351, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.6015625, + "grad_norm": 1.4765625, "learning_rate": 3.4000000000000007e-05, - "loss": 0.4867, - "macro_f1": 0.19999998807907104, + "loss": 0.4274, + "macro_f1": 0.2518518567085266, "num_tokens": 29251.0, "repeat_count": 0.0, - "routers_loss": 0.4795944094657898, + "routers_loss": 0.3990713059902191, "skip_count": 0.0, "step": 18, "text_loss": 0.3806765377521515 @@ -183,18 +183,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 36.0, + "avg_layers": 26.0, "epoch": 0.09392427355444673, - "f1_execute": 0.6153846383094788, - "f1_repeat": 0.1538461595773697, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.2857142984867096, "f1_skip": 0.0, - "grad_norm": 1.3984375, + "grad_norm": 1.3125, "learning_rate": 3.8e-05, - "loss": 0.4718, - "macro_f1": 0.25641027092933655, + "loss": 0.4261, + "macro_f1": 0.3228803873062134, "num_tokens": 32545.0, "repeat_count": 1.0, - "routers_loss": 0.41872408986091614, + "routers_loss": 0.40146592259407043, "skip_count": 0.0, "step": 20, "text_loss": 0.25648367404937744 @@ -202,18 +202,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 26.0, "epoch": 0.1033167009098914, - "f1_execute": 0.6341463327407837, + "f1_execute": 0.7272727489471436, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.7734375, + "grad_norm": 1.625, "learning_rate": 4.2000000000000004e-05, - "loss": 0.4472, - "macro_f1": 0.21138212084770203, + "loss": 0.404, + "macro_f1": 0.24242424964904785, "num_tokens": 36560.0, "repeat_count": 0.0, - "routers_loss": 0.4152105450630188, + "routers_loss": 0.372715026140213, "skip_count": 0.0, "step": 22, "text_loss": 0.2799522578716278 @@ -221,18 +221,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 32.0, + "avg_layers": 27.0, "epoch": 0.11270912826533608, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.7555555105209351, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.8046875, + "grad_norm": 1.6328125, "learning_rate": 4.6e-05, - "loss": 0.4554, - "macro_f1": 0.19999998807907104, + "loss": 0.4218, + "macro_f1": 0.2518518567085266, "num_tokens": 39597.0, "repeat_count": 0.0, - "routers_loss": 0.47541096806526184, + "routers_loss": 0.4504941403865814, "skip_count": 0.0, "step": 24, "text_loss": 0.6635695695877075 @@ -240,18 +240,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 34.0, + "avg_layers": 27.0, "epoch": 0.12210155562078075, - "f1_execute": 0.7826087474822998, + "f1_execute": 0.8085106015205383, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.875, + "grad_norm": 1.7109375, "learning_rate": 5e-05, - "loss": 0.4182, - "macro_f1": 0.2608695924282074, + "loss": 0.3886, + "macro_f1": 0.26950353384017944, "num_tokens": 43080.0, "repeat_count": 0.0, - "routers_loss": 0.37319275736808777, + "routers_loss": 0.3498791456222534, "skip_count": 0.0, "step": 26, "text_loss": 0.7035041451454163 @@ -259,18 +259,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 0.13149398297622542, - "f1_execute": 0.7826087474822998, + "f1_execute": 0.8085106015205383, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.4375, + "grad_norm": 1.34375, "learning_rate": 5.4e-05, - "loss": 0.3991, - "macro_f1": 0.2608695924282074, + "loss": 0.3724, + "macro_f1": 0.26950353384017944, "num_tokens": 46406.0, "repeat_count": 0.0, - "routers_loss": 0.3604123294353485, + "routers_loss": 0.31265875697135925, "skip_count": 0.0, "step": 28, "text_loss": 0.6388277411460876 @@ -280,16 +280,16 @@ "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.1408864103316701, - "f1_execute": 0.8979591727256775, + "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.421875, + "grad_norm": 1.2578125, "learning_rate": 5.800000000000001e-05, - "loss": 0.3827, - "macro_f1": 0.2993197441101074, + "loss": 0.341, + "macro_f1": 0.2857142686843872, "num_tokens": 49966.0, "repeat_count": 0.0, - "routers_loss": 0.35880225896835327, + "routers_loss": 0.3200918138027191, "skip_count": 2.0, "step": 30, "text_loss": 0.17372547090053558 @@ -297,18 +297,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 24.0, + "avg_layers": 25.0, "epoch": 0.15027883768711475, - "f1_execute": 0.9200000166893005, + "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.4609375, + "grad_norm": 1.4140625, "learning_rate": 6.2e-05, - "loss": 0.3452, - "macro_f1": 0.30666667222976685, + "loss": 0.3207, + "macro_f1": 0.2857142686843872, "num_tokens": 53378.0, "repeat_count": 1.0, - "routers_loss": 0.31086465716362, + "routers_loss": 0.32304447889328003, "skip_count": 1.0, "step": 32, "text_loss": 0.18196581304073334 @@ -316,18 +316,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 25.0, "epoch": 0.15967126504255943, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.3671875, + "grad_norm": 1.46875, "learning_rate": 6.6e-05, - "loss": 0.3283, - "macro_f1": 0.3144654333591461, + "loss": 0.3304, + "macro_f1": 0.3006536364555359, "num_tokens": 56933.0, "repeat_count": 0.0, - "routers_loss": 0.2674171030521393, + "routers_loss": 0.24814388155937195, "skip_count": 0.0, "step": 34, "text_loss": 0.28823015093803406 @@ -335,18 +335,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 0.16906369239800412, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.1015625, + "grad_norm": 1.1171875, "learning_rate": 7.000000000000001e-05, - "loss": 0.2849, - "macro_f1": 0.3205128312110901, + "loss": 0.2778, + "macro_f1": 0.3006536066532135, "num_tokens": 60744.0, "repeat_count": 1.0, - "routers_loss": 0.24587315320968628, + "routers_loss": 0.22411039471626282, "skip_count": 0.0, "step": 36, "text_loss": 0.5260357856750488 @@ -354,18 +354,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 31.0, + "avg_layers": 27.0, "epoch": 0.17845611975344877, - "f1_execute": 0.8085106015205383, + "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.3046875, + "grad_norm": 1.484375, "learning_rate": 7.4e-05, - "loss": 0.2616, - "macro_f1": 0.26950353384017944, + "loss": 0.2738, + "macro_f1": 0.2857142984867096, "num_tokens": 64900.0, "repeat_count": 0.0, - "routers_loss": 0.32050269842147827, + "routers_loss": 0.44355395436286926, "skip_count": 0.0, "step": 38, "text_loss": 0.5382097363471985 @@ -373,18 +373,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 0.18784854710889345, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.1796875, + "grad_norm": 1.3828125, "learning_rate": 7.8e-05, - "loss": 0.2084, - "macro_f1": 0.3144654333591461, + "loss": 0.2137, + "macro_f1": 0.3076923191547394, "num_tokens": 68000.0, "repeat_count": 0.0, - "routers_loss": 0.15196125209331512, + "routers_loss": 0.202330082654953, "skip_count": 0.0, "step": 40, "text_loss": 0.5946118831634521 @@ -392,18 +392,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 25.0, "epoch": 0.19724097446433814, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.61328125, + "grad_norm": 0.78125, "learning_rate": 8.2e-05, - "loss": 0.1947, + "loss": 0.21, "macro_f1": 0.3144654333591461, "num_tokens": 70529.0, "repeat_count": 0.0, - "routers_loss": 0.14121046662330627, + "routers_loss": 0.18023855984210968, "skip_count": 0.0, "step": 42, "text_loss": 0.5550904273986816 @@ -416,13 +416,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.50390625, + "grad_norm": 0.609375, "learning_rate": 8.599999999999999e-05, - "loss": 0.1884, + "loss": 0.1918, "macro_f1": 0.32098764181137085, "num_tokens": 73427.0, "repeat_count": 2.0, - "routers_loss": 0.21312278509140015, + "routers_loss": 0.2101590931415558, "skip_count": 0.0, "step": 44, "text_loss": 0.4636923372745514 @@ -435,13 +435,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.45703125, + "grad_norm": 0.53125, "learning_rate": 8.999999999999999e-05, - "loss": 0.166, + "loss": 0.1881, "macro_f1": 0.3333333432674408, "num_tokens": 76472.0, "repeat_count": 0.0, - "routers_loss": 0.1184137836098671, + "routers_loss": 0.11800424009561539, "skip_count": 0.0, "step": 46, "text_loss": 0.4187001883983612 @@ -454,13 +454,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.62890625, + "grad_norm": 0.953125, "learning_rate": 9.400000000000001e-05, - "loss": 0.1313, + "loss": 0.1446, "macro_f1": 0.3272727429866791, "num_tokens": 79124.0, "repeat_count": 1.0, - "routers_loss": 0.10897563397884369, + "routers_loss": 0.11632519960403442, "skip_count": 0.0, "step": 48, "text_loss": 0.2253919243812561 @@ -468,18 +468,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 0.2348106838861168, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.4375, + "grad_norm": 0.58984375, "learning_rate": 9.800000000000001e-05, - "loss": 0.1531, - "macro_f1": 0.3272727429866791, + "loss": 0.1543, + "macro_f1": 0.32098767161369324, "num_tokens": 81980.0, "repeat_count": 1.0, - "routers_loss": 0.09979952871799469, + "routers_loss": 0.09669367223978043, "skip_count": 0.0, "step": 50, "text_loss": 0.6053179502487183 @@ -487,18 +487,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 0.2442031112415615, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.515625, + "grad_norm": 0.8515625, "learning_rate": 0.000102, - "loss": 0.1265, - "macro_f1": 0.3272727429866791, + "loss": 0.1393, + "macro_f1": 0.32098764181137085, "num_tokens": 85236.0, "repeat_count": 0.0, - "routers_loss": 0.05543195456266403, + "routers_loss": 0.12471720576286316, "skip_count": 0.0, "step": 52, "text_loss": 0.6027331948280334 @@ -511,13 +511,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.328125, + "grad_norm": 0.421875, "learning_rate": 0.000106, - "loss": 0.1436, + "loss": 0.1473, "macro_f1": 0.32098764181137085, "num_tokens": 88238.0, "repeat_count": 0.0, - "routers_loss": 0.15049344301223755, + "routers_loss": 0.1376056969165802, "skip_count": 2.0, "step": 54, "text_loss": 0.2861751616001129 @@ -530,13 +530,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.263671875, + "grad_norm": 0.35546875, "learning_rate": 0.00011, - "loss": 0.1021, + "loss": 0.1082, "macro_f1": 0.3333333432674408, "num_tokens": 91056.0, "repeat_count": 0.0, - "routers_loss": 0.07367338240146637, + "routers_loss": 0.07449393719434738, "skip_count": 0.0, "step": 56, "text_loss": 0.48106974363327026 @@ -544,18 +544,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 26.0, "epoch": 0.2723803933078955, - "f1_execute": 1.0, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25, + "grad_norm": 0.271484375, "learning_rate": 0.000114, - "loss": 0.114, - "macro_f1": 0.3333333432674408, + "loss": 0.1123, + "macro_f1": 0.32098764181137085, "num_tokens": 94987.0, "repeat_count": 0.0, - "routers_loss": 0.03782692551612854, + "routers_loss": 0.07064720243215561, "skip_count": 0.0, "step": 58, "text_loss": 0.3554874658584595 @@ -568,13 +568,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.333984375, + "grad_norm": 0.5390625, "learning_rate": 0.000118, - "loss": 0.1197, + "loss": 0.1234, "macro_f1": 0.32098764181137085, "num_tokens": 97909.0, "repeat_count": 0.0, - "routers_loss": 0.14074955880641937, + "routers_loss": 0.16835889220237732, "skip_count": 2.0, "step": 60, "text_loss": 0.5475804805755615 @@ -587,13 +587,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.2353515625, "learning_rate": 0.000122, - "loss": 0.1174, + "loss": 0.1224, "macro_f1": 0.3333333432674408, "num_tokens": 101043.0, "repeat_count": 0.0, - "routers_loss": 0.058013737201690674, + "routers_loss": 0.06127442046999931, "skip_count": 0.0, "step": 62, "text_loss": 0.5966938734054565 @@ -606,13 +606,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.212890625, "learning_rate": 0.000126, - "loss": 0.0911, + "loss": 0.0931, "macro_f1": 0.3333333432674408, "num_tokens": 104103.0, "repeat_count": 0.0, - "routers_loss": 0.04936821386218071, + "routers_loss": 0.047825805842876434, "skip_count": 0.0, "step": 64, "text_loss": 0.5480486750602722 @@ -625,13 +625,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.220703125, + "grad_norm": 0.2294921875, "learning_rate": 0.00013000000000000002, - "loss": 0.1107, + "loss": 0.1088, "macro_f1": 0.3006536364555359, "num_tokens": 107009.0, "repeat_count": 1.0, - "routers_loss": 0.2628525495529175, + "routers_loss": 0.275174081325531, "skip_count": 4.0, "step": 66, "text_loss": 0.41714492440223694 @@ -644,13 +644,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.1923828125, "learning_rate": 0.000134, - "loss": 0.1109, + "loss": 0.1123, "macro_f1": 0.3333333432674408, "num_tokens": 110486.0, "repeat_count": 0.0, - "routers_loss": 0.02859785594046116, + "routers_loss": 0.029025178402662277, "skip_count": 0.0, "step": 68, "text_loss": 0.6775627732276917 @@ -663,13 +663,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.298828125, + "grad_norm": 0.314453125, "learning_rate": 0.00013800000000000002, - "loss": 0.1067, + "loss": 0.1049, "macro_f1": 0.3272727429866791, "num_tokens": 113878.0, "repeat_count": 0.0, - "routers_loss": 0.10459086298942566, + "routers_loss": 0.10141710191965103, "skip_count": 1.0, "step": 70, "text_loss": 0.6678873896598816 @@ -682,13 +682,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2109375, + "grad_norm": 0.248046875, "learning_rate": 0.00014199999999999998, - "loss": 0.1166, + "loss": 0.1119, "macro_f1": 0.3272727429866791, "num_tokens": 116989.0, "repeat_count": 0.0, - "routers_loss": 0.0718551054596901, + "routers_loss": 0.08002066612243652, "skip_count": 1.0, "step": 72, "text_loss": 0.405692994594574 @@ -701,13 +701,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1787109375, "learning_rate": 0.000146, - "loss": 0.1007, + "loss": 0.0944, "macro_f1": 0.3144654333591461, "num_tokens": 119883.0, "repeat_count": 0.0, - "routers_loss": 0.1850946843624115, + "routers_loss": 0.1867009848356247, "skip_count": 3.0, "step": 74, "text_loss": 0.44616150856018066 @@ -720,13 +720,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.34375, + "grad_norm": 0.333984375, "learning_rate": 0.00015, - "loss": 0.1019, + "loss": 0.1003, "macro_f1": 0.32098764181137085, "num_tokens": 123325.0, "repeat_count": 0.0, - "routers_loss": 0.09809529036283493, + "routers_loss": 0.07042168825864792, "skip_count": 2.0, "step": 76, "text_loss": 0.11340200901031494 @@ -739,13 +739,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.259765625, + "grad_norm": 0.26171875, "learning_rate": 0.000154, - "loss": 0.1088, + "loss": 0.1066, "macro_f1": 0.32098764181137085, "num_tokens": 126131.0, "repeat_count": 0.0, - "routers_loss": 0.11277207732200623, + "routers_loss": 0.11535373330116272, "skip_count": 2.0, "step": 78, "text_loss": 0.3269135355949402 @@ -758,13 +758,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2412109375, + "grad_norm": 0.255859375, "learning_rate": 0.000158, - "loss": 0.0866, + "loss": 0.0891, "macro_f1": 0.3272727429866791, "num_tokens": 130349.0, "repeat_count": 0.0, - "routers_loss": 0.09079254418611526, + "routers_loss": 0.09497501701116562, "skip_count": 1.0, "step": 80, "text_loss": 0.15273472666740417 @@ -777,13 +777,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1943359375, "learning_rate": 0.000162, - "loss": 0.0928, + "loss": 0.0929, "macro_f1": 0.3333333432674408, "num_tokens": 133607.0, "repeat_count": 0.0, - "routers_loss": 0.02900076098740101, + "routers_loss": 0.030639523640275, "skip_count": 0.0, "step": 82, "text_loss": 0.282884806394577 @@ -796,13 +796,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.1806640625, "learning_rate": 0.00016600000000000002, - "loss": 0.1251, + "loss": 0.1254, "macro_f1": 0.3272727429866791, "num_tokens": 136694.0, "repeat_count": 0.0, - "routers_loss": 0.0763339251279831, + "routers_loss": 0.07906441390514374, "skip_count": 1.0, "step": 84, "text_loss": 0.459094375371933 @@ -817,11 +817,11 @@ "f1_skip": 0.0, "grad_norm": 0.212890625, "learning_rate": 0.00017, - "loss": 0.1064, + "loss": 0.1071, "macro_f1": 0.3144654333591461, "num_tokens": 139966.0, "repeat_count": 1.0, - "routers_loss": 0.13191410899162292, + "routers_loss": 0.1124570444226265, "skip_count": 2.0, "step": 86, "text_loss": 0.29985448718070984 @@ -834,13 +834,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.25390625, "learning_rate": 0.000174, - "loss": 0.1055, + "loss": 0.1031, "macro_f1": 0.32098764181137085, "num_tokens": 142788.0, "repeat_count": 2.0, - "routers_loss": 0.21200031042099, + "routers_loss": 0.1966402679681778, "skip_count": 0.0, "step": 88, "text_loss": 0.6435291767120361 @@ -853,13 +853,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.318359375, + "grad_norm": 0.349609375, "learning_rate": 0.000178, - "loss": 0.0971, + "loss": 0.0963, "macro_f1": 0.3333333432674408, "num_tokens": 146192.0, "repeat_count": 0.0, - "routers_loss": 0.031911369413137436, + "routers_loss": 0.0325632207095623, "skip_count": 0.0, "step": 90, "text_loss": 0.35170626640319824 @@ -872,13 +872,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.216796875, + "grad_norm": 0.2265625, "learning_rate": 0.000182, - "loss": 0.1056, + "loss": 0.1073, "macro_f1": 0.32098764181137085, "num_tokens": 149792.0, "repeat_count": 1.0, - "routers_loss": 0.14131835103034973, + "routers_loss": 0.15115146338939667, "skip_count": 1.0, "step": 92, "text_loss": 0.83159339427948 @@ -891,13 +891,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.205078125, "learning_rate": 0.000186, - "loss": 0.1059, + "loss": 0.1073, "macro_f1": 0.3333333432674408, "num_tokens": 152766.0, "repeat_count": 0.0, - "routers_loss": 0.04137955233454704, + "routers_loss": 0.043313540518283844, "skip_count": 0.0, "step": 94, "text_loss": 0.49707934260368347 @@ -910,13 +910,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.2138671875, "learning_rate": 0.00019, - "loss": 0.0934, + "loss": 0.0947, "macro_f1": 0.3333333432674408, "num_tokens": 156112.0, "repeat_count": 0.0, - "routers_loss": 0.03163003921508789, + "routers_loss": 0.032021280378103256, "skip_count": 0.0, "step": 96, "text_loss": 0.27608928084373474 @@ -929,13 +929,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.2099609375, "learning_rate": 0.000194, - "loss": 0.0847, + "loss": 0.0846, "macro_f1": 0.3076923191547394, "num_tokens": 159454.0, "repeat_count": 2.0, - "routers_loss": 0.2567490339279175, + "routers_loss": 0.24473154544830322, "skip_count": 2.0, "step": 98, "text_loss": 0.6026689410209656 @@ -948,13 +948,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.30859375, + "grad_norm": 0.271484375, "learning_rate": 0.00019800000000000002, - "loss": 0.1077, + "loss": 0.1028, "macro_f1": 0.32098764181137085, "num_tokens": 163661.0, "repeat_count": 0.0, - "routers_loss": 0.11468870937824249, + "routers_loss": 0.11468276381492615, "skip_count": 2.0, "step": 100, "text_loss": 0.46733155846595764 @@ -967,13 +967,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.17578125, + "grad_norm": 0.1806640625, "learning_rate": 0.000202, - "loss": 0.1131, + "loss": 0.1089, "macro_f1": 0.3333333432674408, "num_tokens": 167134.0, "repeat_count": 0.0, - "routers_loss": 0.02124219387769699, + "routers_loss": 0.021144939586520195, "skip_count": 0.0, "step": 102, "text_loss": 0.6362994909286499 @@ -986,13 +986,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.1943359375, "learning_rate": 0.000206, - "loss": 0.0624, + "loss": 0.0621, "macro_f1": 0.3272727429866791, "num_tokens": 170433.0, "repeat_count": 0.0, - "routers_loss": 0.06983796507120132, + "routers_loss": 0.06594710797071457, "skip_count": 1.0, "step": 104, "text_loss": 0.4515477120876312 @@ -1005,13 +1005,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.1591796875, "learning_rate": 0.00021, - "loss": 0.0951, + "loss": 0.0929, "macro_f1": 0.3333333432674408, "num_tokens": 173387.0, "repeat_count": 0.0, - "routers_loss": 0.03467355668544769, + "routers_loss": 0.032923027873039246, "skip_count": 0.0, "step": 106, "text_loss": 0.6638453006744385 @@ -1024,13 +1024,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.240234375, "learning_rate": 0.000214, - "loss": 0.0881, + "loss": 0.0883, "macro_f1": 0.3272727429866791, "num_tokens": 176170.0, "repeat_count": 1.0, - "routers_loss": 0.08142061531543732, + "routers_loss": 0.08034781366586685, "skip_count": 0.0, "step": 108, "text_loss": 1.186936855316162 @@ -1043,13 +1043,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.267578125, "learning_rate": 0.000218, - "loss": 0.0795, + "loss": 0.0794, "macro_f1": 0.3272727429866791, "num_tokens": 179877.0, "repeat_count": 0.0, - "routers_loss": 0.08327355235815048, + "routers_loss": 0.07814185321331024, "skip_count": 1.0, "step": 110, "text_loss": 0.5488709211349487 @@ -1062,13 +1062,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.2353515625, "learning_rate": 0.000222, - "loss": 0.0943, + "loss": 0.0946, "macro_f1": 0.3333333432674408, "num_tokens": 182726.0, "repeat_count": 0.0, - "routers_loss": 0.019890006631612778, + "routers_loss": 0.01884695515036583, "skip_count": 0.0, "step": 112, "text_loss": 0.5195863842964172 @@ -1081,13 +1081,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2001953125, + "grad_norm": 0.19921875, "learning_rate": 0.00022600000000000002, - "loss": 0.0933, + "loss": 0.0974, "macro_f1": 0.32098764181137085, "num_tokens": 185624.0, "repeat_count": 0.0, - "routers_loss": 0.09992363303899765, + "routers_loss": 0.09657823294401169, "skip_count": 2.0, "step": 114, "text_loss": 0.43858134746551514 @@ -1100,13 +1100,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.3046875, "learning_rate": 0.00023, - "loss": 0.0762, + "loss": 0.0753, "macro_f1": 0.3333333432674408, "num_tokens": 188155.0, "repeat_count": 0.0, - "routers_loss": 0.014119029976427555, + "routers_loss": 0.01463601179420948, "skip_count": 0.0, "step": 116, "text_loss": 0.392981618642807 @@ -1119,13 +1119,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.423828125, + "grad_norm": 0.439453125, "learning_rate": 0.00023400000000000002, - "loss": 0.0842, + "loss": 0.0843, "macro_f1": 0.3333333432674408, "num_tokens": 190970.0, "repeat_count": 0.0, - "routers_loss": 0.03976766765117645, + "routers_loss": 0.03859659656882286, "skip_count": 0.0, "step": 118, "text_loss": 0.309179425239563 @@ -1138,13 +1138,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.2255859375, "learning_rate": 0.00023799999999999998, - "loss": 0.0517, + "loss": 0.053, "macro_f1": 0.3333333432674408, "num_tokens": 193988.0, "repeat_count": 0.0, - "routers_loss": 0.017428619787096977, + "routers_loss": 0.019092386588454247, "skip_count": 0.0, "step": 120, "text_loss": 0.48543134331703186 @@ -1157,13 +1157,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.296875, + "grad_norm": 0.35546875, "learning_rate": 0.000242, - "loss": 0.1134, + "loss": 0.1203, "macro_f1": 0.3272727429866791, "num_tokens": 196475.0, "repeat_count": 0.0, - "routers_loss": 0.06965513527393341, + "routers_loss": 0.0619138665497303, "skip_count": 1.0, "step": 122, "text_loss": 0.4615364074707031 @@ -1176,13 +1176,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1875, "learning_rate": 0.000246, - "loss": 0.0984, + "loss": 0.1002, "macro_f1": 0.3272727429866791, "num_tokens": 200045.0, "repeat_count": 1.0, - "routers_loss": 0.10476501286029816, + "routers_loss": 0.09752107411623001, "skip_count": 0.0, "step": 124, "text_loss": 0.15802054107189178 @@ -1195,13 +1195,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.1728515625, "learning_rate": 0.00025, - "loss": 0.0771, + "loss": 0.0773, "macro_f1": 0.3333333432674408, "num_tokens": 203214.0, "repeat_count": 0.0, - "routers_loss": 0.028317544609308243, + "routers_loss": 0.02896115928888321, "skip_count": 0.0, "step": 126, "text_loss": 0.4543360471725464 @@ -1214,13 +1214,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.390625, + "grad_norm": 0.4296875, "learning_rate": 0.000254, - "loss": 0.0933, + "loss": 0.0973, "macro_f1": 0.3333333432674408, "num_tokens": 206168.0, "repeat_count": 0.0, - "routers_loss": 0.012766432017087936, + "routers_loss": 0.011423567309975624, "skip_count": 0.0, "step": 128, "text_loss": 0.4730179011821747 @@ -1233,13 +1233,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.353515625, + "grad_norm": 0.365234375, "learning_rate": 0.00025800000000000004, - "loss": 0.0989, + "loss": 0.099, "macro_f1": 0.3333333432674408, "num_tokens": 209907.0, "repeat_count": 0.0, - "routers_loss": 0.021400077268481255, + "routers_loss": 0.01957600563764572, "skip_count": 0.0, "step": 130, "text_loss": 0.45122358202934265 @@ -1252,13 +1252,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.2060546875, "learning_rate": 0.000262, - "loss": 0.0873, + "loss": 0.0868, "macro_f1": 0.3272727429866791, "num_tokens": 213521.0, "repeat_count": 0.0, - "routers_loss": 0.05025051161646843, + "routers_loss": 0.04882373288273811, "skip_count": 1.0, "step": 132, "text_loss": 0.4341491758823395 @@ -1271,13 +1271,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1611328125, + "grad_norm": 0.1708984375, "learning_rate": 0.000266, - "loss": 0.085, + "loss": 0.0834, "macro_f1": 0.3333333432674408, "num_tokens": 216484.0, "repeat_count": 0.0, - "routers_loss": 0.017420046031475067, + "routers_loss": 0.016083380207419395, "skip_count": 0.0, "step": 134, "text_loss": 0.46990111470222473 @@ -1290,13 +1290,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2041015625, + "grad_norm": 0.220703125, "learning_rate": 0.00027, - "loss": 0.086, + "loss": 0.0863, "macro_f1": 0.3333333432674408, "num_tokens": 219398.0, "repeat_count": 0.0, - "routers_loss": 0.018217921257019043, + "routers_loss": 0.01733536459505558, "skip_count": 0.0, "step": 136, "text_loss": 0.4455361068248749 @@ -1309,13 +1309,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1806640625, "learning_rate": 0.00027400000000000005, - "loss": 0.0985, + "loss": 0.0997, "macro_f1": 0.3333333432674408, "num_tokens": 222430.0, "repeat_count": 0.0, - "routers_loss": 0.012350660748779774, + "routers_loss": 0.01332803163677454, "skip_count": 0.0, "step": 138, "text_loss": 0.47699397802352905 @@ -1328,13 +1328,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.302734375, + "grad_norm": 0.333984375, "learning_rate": 0.00027800000000000004, "loss": 0.0922, "macro_f1": 0.3144654333591461, "num_tokens": 225458.0, "repeat_count": 1.0, - "routers_loss": 0.14993029832839966, + "routers_loss": 0.14924728870391846, "skip_count": 2.0, "step": 140, "text_loss": 0.5858222842216492 @@ -1347,13 +1347,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.251953125, + "grad_norm": 0.25, "learning_rate": 0.00028199999999999997, - "loss": 0.0791, + "loss": 0.0798, "macro_f1": 0.3144654333591461, "num_tokens": 229365.0, "repeat_count": 1.0, - "routers_loss": 0.17921413481235504, + "routers_loss": 0.1860177218914032, "skip_count": 2.0, "step": 142, "text_loss": 0.5003137588500977 @@ -1366,13 +1366,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.2294921875, "learning_rate": 0.00028599999999999996, - "loss": 0.0535, + "loss": 0.054, "macro_f1": 0.32098764181137085, "num_tokens": 231787.0, "repeat_count": 1.0, - "routers_loss": 0.1420905590057373, + "routers_loss": 0.16498211026191711, "skip_count": 1.0, "step": 144, "text_loss": 0.5026470422744751 @@ -1385,13 +1385,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.29296875, + "grad_norm": 0.306640625, "learning_rate": 0.00029, - "loss": 0.0956, + "loss": 0.0936, "macro_f1": 0.32098764181137085, "num_tokens": 235014.0, "repeat_count": 1.0, - "routers_loss": 0.12468750029802322, + "routers_loss": 0.11801310628652573, "skip_count": 1.0, "step": 146, "text_loss": 0.611888587474823 @@ -1404,13 +1404,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1806640625, "learning_rate": 0.000294, - "loss": 0.0879, + "loss": 0.0878, "macro_f1": 0.3333333432674408, "num_tokens": 238210.0, "repeat_count": 0.0, - "routers_loss": 0.024295611307024956, + "routers_loss": 0.02422776259481907, "skip_count": 0.0, "step": 148, "text_loss": 0.2876914143562317 @@ -1423,13 +1423,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.1728515625, "learning_rate": 0.000298, - "loss": 0.087, + "loss": 0.0858, "macro_f1": 0.32098764181137085, "num_tokens": 241582.0, "repeat_count": 0.0, - "routers_loss": 0.07016433775424957, + "routers_loss": 0.07282499223947525, "skip_count": 2.0, "step": 150, "text_loss": 0.3919292390346527 @@ -1442,13 +1442,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3828125, + "grad_norm": 0.37890625, "learning_rate": 0.000302, - "loss": 0.0782, + "loss": 0.0797, "macro_f1": 0.32098764181137085, "num_tokens": 244621.0, "repeat_count": 1.0, - "routers_loss": 0.18942493200302124, + "routers_loss": 0.20659038424491882, "skip_count": 1.0, "step": 152, "text_loss": 0.4294498860836029 @@ -1461,13 +1461,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1787109375, "learning_rate": 0.000306, - "loss": 0.0713, + "loss": 0.072, "macro_f1": 0.3333333432674408, "num_tokens": 247833.0, "repeat_count": 0.0, - "routers_loss": 0.02319060079753399, + "routers_loss": 0.02428400330245495, "skip_count": 0.0, "step": 154, "text_loss": 0.5930765867233276 @@ -1480,13 +1480,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.15234375, + "grad_norm": 0.1533203125, "learning_rate": 0.00031, - "loss": 0.0778, + "loss": 0.0772, "macro_f1": 0.3333333432674408, "num_tokens": 251349.0, "repeat_count": 0.0, - "routers_loss": 0.01764747127890587, + "routers_loss": 0.0167869683355093, "skip_count": 0.0, "step": 156, "text_loss": 0.41063904762268066 @@ -1499,13 +1499,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1572265625, "learning_rate": 0.000314, - "loss": 0.0829, + "loss": 0.0821, "macro_f1": 0.3333333432674408, "num_tokens": 254886.0, "repeat_count": 0.0, - "routers_loss": 0.02268100716173649, + "routers_loss": 0.02531604655086994, "skip_count": 0.0, "step": 158, "text_loss": 0.6739020347595215 @@ -1518,13 +1518,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.201171875, "learning_rate": 0.00031800000000000003, - "loss": 0.0889, + "loss": 0.09, "macro_f1": 0.3333333432674408, "num_tokens": 258260.0, "repeat_count": 0.0, - "routers_loss": 0.016952091827988625, + "routers_loss": 0.017772775143384933, "skip_count": 0.0, "step": 160, "text_loss": 0.46873849630355835 @@ -1537,13 +1537,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2216796875, + "grad_norm": 0.224609375, "learning_rate": 0.000322, - "loss": 0.0923, + "loss": 0.0893, "macro_f1": 0.3272727429866791, "num_tokens": 261846.0, "repeat_count": 0.0, - "routers_loss": 0.03669808804988861, + "routers_loss": 0.034902360290288925, "skip_count": 1.0, "step": 162, "text_loss": 0.3727971017360687 @@ -1556,13 +1556,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.212890625, "learning_rate": 0.000326, - "loss": 0.0769, + "loss": 0.076, "macro_f1": 0.3333333432674408, "num_tokens": 264348.0, "repeat_count": 0.0, - "routers_loss": 0.012101447209715843, + "routers_loss": 0.013553355820477009, "skip_count": 0.0, "step": 164, "text_loss": 0.5798237323760986 @@ -1575,13 +1575,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.37109375, + "grad_norm": 0.408203125, "learning_rate": 0.00033, - "loss": 0.0897, + "loss": 0.0926, "macro_f1": 0.32098764181137085, "num_tokens": 267479.0, "repeat_count": 1.0, - "routers_loss": 0.1562056541442871, + "routers_loss": 0.13571743667125702, "skip_count": 1.0, "step": 166, "text_loss": 0.8084776997566223 @@ -1594,13 +1594,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.2431640625, "learning_rate": 0.00033400000000000004, - "loss": 0.0829, + "loss": 0.0817, "macro_f1": 0.32098764181137085, "num_tokens": 270268.0, "repeat_count": 2.0, - "routers_loss": 0.20807914435863495, + "routers_loss": 0.19884146749973297, "skip_count": 0.0, "step": 168, "text_loss": 0.7366134524345398 @@ -1613,13 +1613,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2236328125, + "grad_norm": 0.267578125, "learning_rate": 0.00033800000000000003, - "loss": 0.0987, + "loss": 0.1022, "macro_f1": 0.32098764181137085, "num_tokens": 273518.0, "repeat_count": 1.0, - "routers_loss": 0.1530539095401764, + "routers_loss": 0.15469175577163696, "skip_count": 1.0, "step": 170, "text_loss": 0.27204006910324097 @@ -1632,13 +1632,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.17578125, "learning_rate": 0.000342, - "loss": 0.087, + "loss": 0.0865, "macro_f1": 0.32098764181137085, "num_tokens": 277210.0, "repeat_count": 0.0, - "routers_loss": 0.08004544675350189, + "routers_loss": 0.08603330701589584, "skip_count": 2.0, "step": 172, "text_loss": 0.7137667536735535 @@ -1651,13 +1651,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1767578125, + "grad_norm": 0.189453125, "learning_rate": 0.000346, - "loss": 0.0916, + "loss": 0.0902, "macro_f1": 0.3076923191547394, "num_tokens": 280389.0, "repeat_count": 0.0, - "routers_loss": 0.19228078424930573, + "routers_loss": 0.17851492762565613, "skip_count": 4.0, "step": 174, "text_loss": 0.5148105621337891 @@ -1670,13 +1670,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1611328125, + "grad_norm": 0.1494140625, "learning_rate": 0.00035, - "loss": 0.0863, + "loss": 0.0853, "macro_f1": 0.3333333432674408, "num_tokens": 283501.0, "repeat_count": 0.0, - "routers_loss": 0.024507170543074608, + "routers_loss": 0.021331604570150375, "skip_count": 0.0, "step": 176, "text_loss": 0.301013320684433 @@ -1689,13 +1689,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.2158203125, "learning_rate": 0.000354, - "loss": 0.0898, + "loss": 0.0911, "macro_f1": 0.32098764181137085, "num_tokens": 287154.0, "repeat_count": 0.0, - "routers_loss": 0.05055495724081993, + "routers_loss": 0.057273946702480316, "skip_count": 2.0, "step": 178, "text_loss": 0.4740981459617615 @@ -1708,13 +1708,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.240234375, "learning_rate": 0.000358, - "loss": 0.0865, + "loss": 0.0904, "macro_f1": 0.3272727429866791, "num_tokens": 289929.0, "repeat_count": 0.0, - "routers_loss": 0.03999815881252289, + "routers_loss": 0.04116598889231682, "skip_count": 1.0, "step": 180, "text_loss": 0.4838573932647705 @@ -1727,13 +1727,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.14453125, "learning_rate": 0.000362, - "loss": 0.0983, + "loss": 0.0991, "macro_f1": 0.3333333432674408, "num_tokens": 294293.0, "repeat_count": 0.0, - "routers_loss": 0.025158070027828217, + "routers_loss": 0.027111956849694252, "skip_count": 0.0, "step": 182, "text_loss": 0.7495553493499756 @@ -1746,32 +1746,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.158203125, "learning_rate": 0.000366, - "loss": 0.1015, + "loss": 0.1038, "macro_f1": 0.3333333432674408, "num_tokens": 297730.0, "repeat_count": 0.0, - "routers_loss": 0.01825365424156189, + "routers_loss": 0.019166452810168266, "skip_count": 0.0, "step": 184, "text_loss": 0.534831166267395 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 0.8734957440563546, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.2158203125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2236328125, "learning_rate": 0.00037, - "loss": 0.0736, - "macro_f1": 0.3144654333591461, + "loss": 0.0784, + "macro_f1": 0.5427350401878357, "num_tokens": 300593.0, "repeat_count": 1.0, - "routers_loss": 0.22729666531085968, + "routers_loss": 0.2349659502506256, "skip_count": 2.0, "step": 186, "text_loss": 0.3549048602581024 @@ -1784,13 +1784,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.2041015625, "learning_rate": 0.000374, - "loss": 0.0838, + "loss": 0.0827, "macro_f1": 0.3076923191547394, "num_tokens": 303456.0, "repeat_count": 2.0, - "routers_loss": 0.24516475200653076, + "routers_loss": 0.22502389550209045, "skip_count": 2.0, "step": 188, "text_loss": 0.8837642073631287 @@ -1803,13 +1803,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2470703125, + "grad_norm": 0.271484375, "learning_rate": 0.000378, - "loss": 0.1056, + "loss": 0.1085, "macro_f1": 0.3272727429866791, "num_tokens": 306241.0, "repeat_count": 1.0, - "routers_loss": 0.1307530701160431, + "routers_loss": 0.12291611731052399, "skip_count": 0.0, "step": 190, "text_loss": 0.73353511095047 @@ -1822,13 +1822,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.15625, "learning_rate": 0.000382, - "loss": 0.0961, + "loss": 0.0969, "macro_f1": 0.3272727429866791, "num_tokens": 310606.0, "repeat_count": 0.0, - "routers_loss": 0.06541688740253448, + "routers_loss": 0.055988848209381104, "skip_count": 1.0, "step": 192, "text_loss": 0.6261917352676392 @@ -1841,13 +1841,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.333984375, + "grad_norm": 0.34375, "learning_rate": 0.000386, - "loss": 0.1058, + "loss": 0.1055, "macro_f1": 0.3144654333591461, "num_tokens": 313564.0, "repeat_count": 0.0, - "routers_loss": 0.12492545694112778, + "routers_loss": 0.12363404780626297, "skip_count": 3.0, "step": 194, "text_loss": 0.2790874242782593 @@ -1860,13 +1860,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28515625, + "grad_norm": 0.27734375, "learning_rate": 0.00039000000000000005, - "loss": 0.0966, + "loss": 0.0964, "macro_f1": 0.3076923191547394, "num_tokens": 316958.0, "repeat_count": 2.0, - "routers_loss": 0.2838033139705658, + "routers_loss": 0.2718356251716614, "skip_count": 2.0, "step": 196, "text_loss": 0.14428086578845978 @@ -1881,11 +1881,11 @@ "f1_skip": 0.0, "grad_norm": 0.2021484375, "learning_rate": 0.00039400000000000004, - "loss": 0.0929, + "loss": 0.0917, "macro_f1": 0.32098764181137085, "num_tokens": 320103.0, "repeat_count": 0.0, - "routers_loss": 0.07692629098892212, + "routers_loss": 0.07188102602958679, "skip_count": 2.0, "step": 198, "text_loss": 0.27155816555023193 @@ -1898,13 +1898,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.201171875, "learning_rate": 0.000398, "loss": 0.0809, "macro_f1": 0.32098764181137085, "num_tokens": 323566.0, "repeat_count": 1.0, - "routers_loss": 0.18504399061203003, + "routers_loss": 0.18038256466388702, "skip_count": 1.0, "step": 200, "text_loss": 0.8453494310379028 @@ -1917,13 +1917,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.2490234375, "learning_rate": 0.000402, - "loss": 0.078, + "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 326385.0, "repeat_count": 0.0, - "routers_loss": 0.014647359028458595, + "routers_loss": 0.014639763161540031, "skip_count": 0.0, "step": 202, "text_loss": 0.5733131766319275 @@ -1936,13 +1936,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2041015625, + "grad_norm": 0.21875, "learning_rate": 0.00040600000000000006, - "loss": 0.1028, + "loss": 0.104, "macro_f1": 0.3333333432674408, "num_tokens": 329266.0, "repeat_count": 0.0, - "routers_loss": 0.017848484218120575, + "routers_loss": 0.015269627794623375, "skip_count": 0.0, "step": 204, "text_loss": 0.7355639934539795 @@ -1955,13 +1955,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.27734375, "learning_rate": 0.00041, - "loss": 0.0832, + "loss": 0.0833, "macro_f1": 0.3333333432674408, "num_tokens": 332984.0, "repeat_count": 0.0, - "routers_loss": 0.01900508813560009, + "routers_loss": 0.018046971410512924, "skip_count": 0.0, "step": 206, "text_loss": 0.587641179561615 @@ -1974,13 +1974,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.166015625, + "grad_norm": 0.185546875, "learning_rate": 0.000414, "loss": 0.0588, "macro_f1": 0.3272727429866791, "num_tokens": 335739.0, "repeat_count": 1.0, - "routers_loss": 0.13018715381622314, + "routers_loss": 0.12791286408901215, "skip_count": 0.0, "step": 208, "text_loss": 0.6538406610488892 @@ -1993,13 +1993,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.24609375, "learning_rate": 0.00041799999999999997, - "loss": 0.0697, + "loss": 0.0732, "macro_f1": 0.3272727429866791, "num_tokens": 338966.0, "repeat_count": 0.0, - "routers_loss": 0.055288366973400116, + "routers_loss": 0.050490595400333405, "skip_count": 1.0, "step": 210, "text_loss": 0.4188295602798462 @@ -2012,13 +2012,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.271484375, "learning_rate": 0.000422, - "loss": 0.0576, + "loss": 0.0588, "macro_f1": 0.3144654333591461, "num_tokens": 342063.0, "repeat_count": 0.0, - "routers_loss": 0.10952572524547577, + "routers_loss": 0.11652113497257233, "skip_count": 3.0, "step": 212, "text_loss": 0.21822240948677063 @@ -2031,13 +2031,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.197265625, + "grad_norm": 0.2060546875, "learning_rate": 0.000426, - "loss": 0.062, + "loss": 0.0621, "macro_f1": 0.3333333432674408, "num_tokens": 344887.0, "repeat_count": 0.0, - "routers_loss": 0.02415696159005165, + "routers_loss": 0.023898238316178322, "skip_count": 0.0, "step": 214, "text_loss": 0.24692800641059875 @@ -2050,13 +2050,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.353515625, + "grad_norm": 0.3671875, "learning_rate": 0.00043, - "loss": 0.1011, + "loss": 0.1005, "macro_f1": 0.3272727429866791, "num_tokens": 348700.0, "repeat_count": 1.0, - "routers_loss": 0.06956391036510468, + "routers_loss": 0.06414655596017838, "skip_count": 0.0, "step": 216, "text_loss": 0.4744548797607422 @@ -2069,13 +2069,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1904296875, + "grad_norm": 0.1962890625, "learning_rate": 0.00043400000000000003, - "loss": 0.076, + "loss": 0.0753, "macro_f1": 0.32098764181137085, "num_tokens": 351507.0, "repeat_count": 1.0, - "routers_loss": 0.1140352189540863, + "routers_loss": 0.11702914535999298, "skip_count": 1.0, "step": 218, "text_loss": 0.5614864826202393 @@ -2090,11 +2090,11 @@ "f1_skip": 0.0, "grad_norm": 0.189453125, "learning_rate": 0.000438, - "loss": 0.0788, + "loss": 0.0792, "macro_f1": 0.3333333432674408, "num_tokens": 354484.0, "repeat_count": 0.0, - "routers_loss": 0.011621571145951748, + "routers_loss": 0.014991643838584423, "skip_count": 0.0, "step": 220, "text_loss": 0.47209832072257996 @@ -2107,13 +2107,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.240234375, + "grad_norm": 0.251953125, "learning_rate": 0.000442, "loss": 0.106, "macro_f1": 0.3272727429866791, "num_tokens": 357954.0, "repeat_count": 0.0, - "routers_loss": 0.05813701078295708, + "routers_loss": 0.04747112840414047, "skip_count": 1.0, "step": 222, "text_loss": 0.2968728244304657 @@ -2126,13 +2126,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.357421875, + "grad_norm": 0.40234375, "learning_rate": 0.000446, - "loss": 0.0827, + "loss": 0.0853, "macro_f1": 0.32098764181137085, "num_tokens": 360547.0, "repeat_count": 0.0, - "routers_loss": 0.0646885335445404, + "routers_loss": 0.06754162162542343, "skip_count": 2.0, "step": 224, "text_loss": 0.2364148646593094 @@ -2145,13 +2145,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.244140625, + "grad_norm": 0.2412109375, "learning_rate": 0.00045000000000000004, - "loss": 0.1011, + "loss": 0.1016, "macro_f1": 0.3272727429866791, "num_tokens": 364529.0, "repeat_count": 0.0, - "routers_loss": 0.07224348932504654, + "routers_loss": 0.07830183953046799, "skip_count": 1.0, "step": 226, "text_loss": 0.4787476360797882 @@ -2164,13 +2164,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.1953125, "learning_rate": 0.00045400000000000003, - "loss": 0.0781, + "loss": 0.0792, "macro_f1": 0.3333333432674408, "num_tokens": 367683.0, "repeat_count": 0.0, - "routers_loss": 0.015971746295690536, + "routers_loss": 0.015735948458313942, "skip_count": 0.0, "step": 228, "text_loss": 0.37148505449295044 @@ -2183,13 +2183,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.25, "learning_rate": 0.000458, - "loss": 0.099, + "loss": 0.0995, "macro_f1": 0.3333333432674408, "num_tokens": 371402.0, "repeat_count": 0.0, - "routers_loss": 0.017818331718444824, + "routers_loss": 0.013354359194636345, "skip_count": 0.0, "step": 230, "text_loss": 0.7464763522148132 @@ -2202,13 +2202,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1494140625, "learning_rate": 0.000462, - "loss": 0.0757, + "loss": 0.0731, "macro_f1": 0.3333333432674408, "num_tokens": 374587.0, "repeat_count": 0.0, - "routers_loss": 0.01582280732691288, + "routers_loss": 0.013763721100986004, "skip_count": 0.0, "step": 232, "text_loss": 0.8754443526268005 @@ -2221,13 +2221,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.42578125, + "grad_norm": 0.3984375, "learning_rate": 0.00046600000000000005, - "loss": 0.0876, + "loss": 0.0861, "macro_f1": 0.3333333432674408, "num_tokens": 377513.0, "repeat_count": 0.0, - "routers_loss": 0.011417915113270283, + "routers_loss": 0.010075435042381287, "skip_count": 0.0, "step": 234, "text_loss": 0.31534913182258606 @@ -2240,13 +2240,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.17578125, "learning_rate": 0.00047, - "loss": 0.0801, + "loss": 0.0791, "macro_f1": 0.3272727429866791, "num_tokens": 380736.0, "repeat_count": 0.0, - "routers_loss": 0.05787832289934158, + "routers_loss": 0.059825167059898376, "skip_count": 1.0, "step": 236, "text_loss": 0.5936337113380432 @@ -2259,13 +2259,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.236328125, + "grad_norm": 0.267578125, "learning_rate": 0.000474, - "loss": 0.0508, + "loss": 0.0514, "macro_f1": 0.32098764181137085, "num_tokens": 383236.0, "repeat_count": 0.0, - "routers_loss": 0.09476690739393234, + "routers_loss": 0.09134846180677414, "skip_count": 2.0, "step": 238, "text_loss": 0.5976157784461975 @@ -2278,13 +2278,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.208984375, "learning_rate": 0.00047799999999999996, - "loss": 0.0833, + "loss": 0.0858, "macro_f1": 0.32098764181137085, "num_tokens": 385778.0, "repeat_count": 1.0, - "routers_loss": 0.1099705696105957, + "routers_loss": 0.11989791691303253, "skip_count": 1.0, "step": 240, "text_loss": 0.3554210960865021 @@ -2297,13 +2297,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.171875, "learning_rate": 0.000482, - "loss": 0.0745, + "loss": 0.0734, "macro_f1": 0.3333333432674408, "num_tokens": 388777.0, "repeat_count": 0.0, - "routers_loss": 0.01269970741122961, + "routers_loss": 0.013591105118393898, "skip_count": 0.0, "step": 242, "text_loss": 0.4829460382461548 @@ -2316,13 +2316,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11962890625, + "grad_norm": 0.12060546875, "learning_rate": 0.000486, - "loss": 0.061, + "loss": 0.0625, "macro_f1": 0.32098764181137085, "num_tokens": 391797.0, "repeat_count": 0.0, - "routers_loss": 0.08505752682685852, + "routers_loss": 0.0920003354549408, "skip_count": 2.0, "step": 244, "text_loss": 0.3085818886756897 @@ -2335,13 +2335,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1552734375, "learning_rate": 0.00049, - "loss": 0.0504, + "loss": 0.0501, "macro_f1": 0.3333333432674408, "num_tokens": 396485.0, "repeat_count": 0.0, - "routers_loss": 0.012750142253935337, + "routers_loss": 0.0129330949857831, "skip_count": 0.0, "step": 246, "text_loss": 0.42803969979286194 @@ -2354,13 +2354,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.291015625, + "grad_norm": 0.296875, "learning_rate": 0.000494, - "loss": 0.0962, + "loss": 0.0945, "macro_f1": 0.3144654333591461, "num_tokens": 399923.0, "repeat_count": 0.0, - "routers_loss": 0.11287309974431992, + "routers_loss": 0.10677755624055862, "skip_count": 3.0, "step": 248, "text_loss": 0.2908555567264557 @@ -2373,32 +2373,32 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.203125, "learning_rate": 0.000498, - "loss": 0.0821, + "loss": 0.0812, "macro_f1": 0.3144654333591461, "num_tokens": 403647.0, "repeat_count": 0.0, - "routers_loss": 0.1486474722623825, + "routers_loss": 0.1504337340593338, "skip_count": 3.0, "step": 250, "text_loss": 0.333095908164978 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 1.183152333431171, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, + "f1_skip": 0.0, "grad_norm": 0.22265625, "learning_rate": 0.0005020000000000001, - "loss": 0.0832, - "macro_f1": 0.5492662787437439, + "loss": 0.0828, + "macro_f1": 0.32098764181137085, "num_tokens": 409147.0, "repeat_count": 0.0, - "routers_loss": 0.06636594980955124, + "routers_loss": 0.06503184884786606, "skip_count": 2.0, "step": 252, "text_loss": 0.16117942333221436 @@ -2411,13 +2411,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.267578125, + "grad_norm": 0.287109375, "learning_rate": 0.000506, - "loss": 0.1, + "loss": 0.0995, "macro_f1": 0.3333333432674408, "num_tokens": 412072.0, "repeat_count": 0.0, - "routers_loss": 0.015062150545418262, + "routers_loss": 0.016280122101306915, "skip_count": 0.0, "step": 254, "text_loss": 0.4217492640018463 @@ -2430,13 +2430,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2138671875, + "grad_norm": 0.21484375, "learning_rate": 0.00051, - "loss": 0.0808, + "loss": 0.0803, "macro_f1": 0.3144654333591461, "num_tokens": 415052.0, "repeat_count": 2.0, - "routers_loss": 0.2051105946302414, + "routers_loss": 0.2117508500814438, "skip_count": 1.0, "step": 256, "text_loss": 0.5795308947563171 @@ -2449,13 +2449,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2412109375, + "grad_norm": 0.2421875, "learning_rate": 0.000514, - "loss": 0.068, + "loss": 0.0668, "macro_f1": 0.3272727429866791, "num_tokens": 418099.0, "repeat_count": 1.0, - "routers_loss": 0.1467045396566391, + "routers_loss": 0.15002092719078064, "skip_count": 0.0, "step": 258, "text_loss": 0.4840938448905945 @@ -2468,13 +2468,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1533203125, "learning_rate": 0.000518, - "loss": 0.0543, + "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 422526.0, "repeat_count": 0.0, - "routers_loss": 0.013022038154304028, + "routers_loss": 0.012834074907004833, "skip_count": 0.0, "step": 260, "text_loss": 0.36141225695610046 @@ -2487,13 +2487,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.216796875, + "grad_norm": 0.2294921875, "learning_rate": 0.000522, - "loss": 0.0848, + "loss": 0.085, "macro_f1": 0.3076923191547394, "num_tokens": 425765.0, "repeat_count": 2.0, - "routers_loss": 0.2575930058956146, + "routers_loss": 0.23808011412620544, "skip_count": 2.0, "step": 262, "text_loss": 0.27572691440582275 @@ -2506,13 +2506,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.17578125, "learning_rate": 0.000526, - "loss": 0.07, + "loss": 0.0708, "macro_f1": 0.3272727429866791, "num_tokens": 429048.0, "repeat_count": 0.0, - "routers_loss": 0.0558602549135685, + "routers_loss": 0.055687375366687775, "skip_count": 1.0, "step": 264, "text_loss": 0.37020301818847656 @@ -2525,13 +2525,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.2080078125, "learning_rate": 0.0005300000000000001, - "loss": 0.082, + "loss": 0.0839, "macro_f1": 0.3272727429866791, "num_tokens": 431784.0, "repeat_count": 0.0, - "routers_loss": 0.09126655012369156, + "routers_loss": 0.0872957780957222, "skip_count": 1.0, "step": 266, "text_loss": 0.5937283039093018 @@ -2544,13 +2544,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.263671875, "learning_rate": 0.0005340000000000001, - "loss": 0.0764, + "loss": 0.0733, "macro_f1": 0.32098764181137085, "num_tokens": 434297.0, "repeat_count": 2.0, - "routers_loss": 0.24805288016796112, + "routers_loss": 0.23507654666900635, "skip_count": 0.0, "step": 268, "text_loss": 0.3367372453212738 @@ -2563,13 +2563,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.22265625, + "grad_norm": 0.2431640625, "learning_rate": 0.0005380000000000001, - "loss": 0.0686, + "loss": 0.0708, "macro_f1": 0.32098764181137085, "num_tokens": 437586.0, "repeat_count": 0.0, - "routers_loss": 0.13135533034801483, + "routers_loss": 0.12860390543937683, "skip_count": 2.0, "step": 270, "text_loss": 0.7149854302406311 @@ -2582,13 +2582,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.2451171875, "learning_rate": 0.0005420000000000001, - "loss": 0.1083, + "loss": 0.1072, "macro_f1": 0.3272727429866791, "num_tokens": 440649.0, "repeat_count": 0.0, - "routers_loss": 0.04991440102458, + "routers_loss": 0.044308312237262726, "skip_count": 1.0, "step": 272, "text_loss": 0.26778292655944824 @@ -2601,13 +2601,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.455078125, + "grad_norm": 0.44921875, "learning_rate": 0.000546, - "loss": 0.0991, + "loss": 0.0938, "macro_f1": 0.3144654333591461, "num_tokens": 443907.0, "repeat_count": 0.0, - "routers_loss": 0.12236632406711578, + "routers_loss": 0.11514109373092651, "skip_count": 3.0, "step": 274, "text_loss": 0.23578761518001556 @@ -2620,13 +2620,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.25, + "grad_norm": 0.2578125, "learning_rate": 0.00055, - "loss": 0.0936, + "loss": 0.0932, "macro_f1": 0.5492662787437439, "num_tokens": 447147.0, "repeat_count": 0.0, - "routers_loss": 0.053506772965192795, + "routers_loss": 0.055705297738313675, "skip_count": 2.0, "step": 276, "text_loss": 0.2513524889945984 @@ -2639,13 +2639,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.265625, + "grad_norm": 0.29296875, "learning_rate": 0.000554, - "loss": 0.066, + "loss": 0.0667, "macro_f1": 0.32098764181137085, "num_tokens": 450032.0, "repeat_count": 0.0, - "routers_loss": 0.13446088135242462, + "routers_loss": 0.13778971135616302, "skip_count": 2.0, "step": 278, "text_loss": 0.4857243597507477 @@ -2658,32 +2658,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.185546875, "learning_rate": 0.000558, - "loss": 0.0682, + "loss": 0.0672, "macro_f1": 0.3272727429866791, "num_tokens": 453195.0, "repeat_count": 1.0, - "routers_loss": 0.07270720601081848, + "routers_loss": 0.0700262188911438, "skip_count": 0.0, "step": 280, "text_loss": 0.7589789628982544 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 1.3240387437628411, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.943396270275116, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.28125, + "f1_skip": 0.0, + "grad_norm": 0.25, "learning_rate": 0.0005620000000000001, - "loss": 0.0648, - "macro_f1": 0.5427350401878357, + "loss": 0.0603, + "macro_f1": 0.3144654333591461, "num_tokens": 455942.0, "repeat_count": 1.0, - "routers_loss": 0.13866399228572845, + "routers_loss": 0.11706235259771347, "skip_count": 2.0, "step": 282, "text_loss": 0.4783432185649872 @@ -2696,13 +2696,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.236328125, + "grad_norm": 0.265625, "learning_rate": 0.000566, - "loss": 0.0782, + "loss": 0.0793, "macro_f1": 0.3272727429866791, "num_tokens": 458932.0, "repeat_count": 0.0, - "routers_loss": 0.0645354762673378, + "routers_loss": 0.07073967158794403, "skip_count": 1.0, "step": 284, "text_loss": 0.7117193937301636 @@ -2715,13 +2715,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1650390625, "learning_rate": 0.00057, - "loss": 0.0892, + "loss": 0.0915, "macro_f1": 0.3272727429866791, "num_tokens": 462650.0, "repeat_count": 0.0, - "routers_loss": 0.05967628210783005, + "routers_loss": 0.05301115661859512, "skip_count": 1.0, "step": 286, "text_loss": 0.4175460636615753 @@ -2734,13 +2734,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23828125, + "grad_norm": 0.2158203125, "learning_rate": 0.000574, - "loss": 0.0676, + "loss": 0.0675, "macro_f1": 0.3272727429866791, "num_tokens": 466290.0, "repeat_count": 0.0, - "routers_loss": 0.06438407301902771, + "routers_loss": 0.06356479972600937, "skip_count": 1.0, "step": 288, "text_loss": 0.5832946300506592 @@ -2753,13 +2753,13 @@ "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.28515625, "learning_rate": 0.000578, - "loss": 0.0781, + "loss": 0.0805, "macro_f1": 0.3006536066532135, "num_tokens": 469296.0, "repeat_count": 1.0, - "routers_loss": 0.21225209534168243, + "routers_loss": 0.21032999455928802, "skip_count": 3.0, "step": 290, "text_loss": 0.36023473739624023 @@ -2772,13 +2772,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.244140625, + "grad_norm": 0.27734375, "learning_rate": 0.0005819999999999999, - "loss": 0.0664, + "loss": 0.0685, "macro_f1": 0.32098764181137085, "num_tokens": 472272.0, "repeat_count": 1.0, - "routers_loss": 0.08085516840219498, + "routers_loss": 0.08062280714511871, "skip_count": 1.0, "step": 292, "text_loss": 0.37197956442832947 @@ -2791,13 +2791,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.28125, "learning_rate": 0.0005859999999999999, - "loss": 0.0874, + "loss": 0.0878, "macro_f1": 0.32098764181137085, "num_tokens": 475864.0, "repeat_count": 0.0, - "routers_loss": 0.05378658324480057, + "routers_loss": 0.05023600533604622, "skip_count": 2.0, "step": 294, "text_loss": 0.4765273630619049 @@ -2810,13 +2810,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.2177734375, "learning_rate": 0.00059, - "loss": 0.0715, + "loss": 0.0728, "macro_f1": 0.3333333432674408, "num_tokens": 478916.0, "repeat_count": 0.0, - "routers_loss": 0.01145261898636818, + "routers_loss": 0.011689410544931889, "skip_count": 0.0, "step": 296, "text_loss": 0.5878773927688599 @@ -2831,11 +2831,11 @@ "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.000594, - "loss": 0.0737, + "loss": 0.0727, "macro_f1": 0.3333333432674408, "num_tokens": 482369.0, "repeat_count": 0.0, - "routers_loss": 0.009397956542670727, + "routers_loss": 0.010772093199193478, "skip_count": 0.0, "step": 298, "text_loss": 0.4424116313457489 @@ -2848,13 +2848,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.181640625, "learning_rate": 0.000598, - "loss": 0.0802, + "loss": 0.0787, "macro_f1": 0.3076923191547394, "num_tokens": 486049.0, "repeat_count": 2.0, - "routers_loss": 0.2389357089996338, + "routers_loss": 0.23482851684093475, "skip_count": 2.0, "step": 300, "text_loss": 0.21217775344848633 @@ -2862,18 +2862,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 1.417963017317288, - "f1_execute": 0.9019607901573181, + "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.2080078125, "learning_rate": 0.000602, - "loss": 0.0745, - "macro_f1": 0.3006536066532135, + "loss": 0.073, + "macro_f1": 0.3076923191547394, "num_tokens": 488683.0, "repeat_count": 1.0, - "routers_loss": 0.18252353370189667, + "routers_loss": 0.18843084573745728, "skip_count": 3.0, "step": 302, "text_loss": 0.2109498232603073 @@ -2886,13 +2886,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.27734375, + "grad_norm": 0.279296875, "learning_rate": 0.000606, - "loss": 0.0935, + "loss": 0.0945, "macro_f1": 0.3144654333591461, "num_tokens": 492010.0, "repeat_count": 0.0, - "routers_loss": 0.18185268342494965, + "routers_loss": 0.17861786484718323, "skip_count": 3.0, "step": 304, "text_loss": 0.8446305394172668 @@ -2905,13 +2905,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.1943359375, "learning_rate": 0.00061, - "loss": 0.0853, + "loss": 0.0827, "macro_f1": 0.3333333432674408, "num_tokens": 494764.0, "repeat_count": 0.0, - "routers_loss": 0.013210167177021503, + "routers_loss": 0.014124520123004913, "skip_count": 0.0, "step": 306, "text_loss": 0.742735743522644 @@ -2924,13 +2924,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.26953125, "learning_rate": 0.000614, - "loss": 0.1089, + "loss": 0.1071, "macro_f1": 0.3333333432674408, "num_tokens": 497820.0, "repeat_count": 0.0, - "routers_loss": 0.016936838626861572, + "routers_loss": 0.017968112602829933, "skip_count": 0.0, "step": 308, "text_loss": 0.28305482864379883 @@ -2943,13 +2943,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.1689453125, "learning_rate": 0.0006180000000000001, - "loss": 0.077, + "loss": 0.0775, "macro_f1": 0.32098764181137085, "num_tokens": 500694.0, "repeat_count": 0.0, - "routers_loss": 0.08630389720201492, + "routers_loss": 0.08593655377626419, "skip_count": 2.0, "step": 310, "text_loss": 0.3496848940849304 @@ -2962,13 +2962,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.19140625, "learning_rate": 0.000622, - "loss": 0.0602, + "loss": 0.061, "macro_f1": 0.3333333432674408, "num_tokens": 503871.0, "repeat_count": 0.0, - "routers_loss": 0.013665963895618916, + "routers_loss": 0.016449492424726486, "skip_count": 0.0, "step": 312, "text_loss": 0.6691372990608215 @@ -2981,13 +2981,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.205078125, "learning_rate": 0.000626, - "loss": 0.0794, + "loss": 0.0815, "macro_f1": 0.3333333432674408, "num_tokens": 506730.0, "repeat_count": 0.0, - "routers_loss": 0.01584783010184765, + "routers_loss": 0.014532964676618576, "skip_count": 0.0, "step": 314, "text_loss": 0.6118118166923523 @@ -3000,13 +3000,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.224609375, + "grad_norm": 0.2216796875, "learning_rate": 0.00063, - "loss": 0.0762, + "loss": 0.0742, "macro_f1": 0.3333333432674408, "num_tokens": 510323.0, "repeat_count": 0.0, - "routers_loss": 0.01368923019617796, + "routers_loss": 0.013093139044940472, "skip_count": 0.0, "step": 316, "text_loss": 0.38126271963119507 @@ -3019,13 +3019,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.388671875, + "grad_norm": 0.400390625, "learning_rate": 0.000634, - "loss": 0.0908, + "loss": 0.0915, "macro_f1": 0.3333333432674408, "num_tokens": 514075.0, "repeat_count": 0.0, - "routers_loss": 0.009135022759437561, + "routers_loss": 0.008627045899629593, "skip_count": 0.0, "step": 318, "text_loss": 0.5983037948608398 @@ -3038,13 +3038,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.15234375, "learning_rate": 0.000638, - "loss": 0.0949, + "loss": 0.1008, "macro_f1": 0.3272727429866791, "num_tokens": 517418.0, "repeat_count": 0.0, - "routers_loss": 0.046641621738672256, + "routers_loss": 0.04561378434300423, "skip_count": 1.0, "step": 320, "text_loss": 0.767257034778595 @@ -3052,18 +3052,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 1.5118872908717347, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23046875, + "grad_norm": 0.259765625, "learning_rate": 0.000642, - "loss": 0.0925, - "macro_f1": 0.3333333432674408, + "loss": 0.0926, + "macro_f1": 0.3272727429866791, "num_tokens": 520443.0, "repeat_count": 0.0, - "routers_loss": 0.020637936890125275, + "routers_loss": 0.024372953921556473, "skip_count": 0.0, "step": 322, "text_loss": 0.6572105884552002 @@ -3076,13 +3076,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26953125, + "grad_norm": 0.30078125, "learning_rate": 0.000646, "loss": 0.0822, "macro_f1": 0.3272727429866791, "num_tokens": 523317.0, "repeat_count": 1.0, - "routers_loss": 0.08289298415184021, + "routers_loss": 0.08099937438964844, "skip_count": 0.0, "step": 324, "text_loss": 0.205499529838562 @@ -3090,18 +3090,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 1.530672145582624, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23828125, + "grad_norm": 0.2294921875, "learning_rate": 0.0006500000000000001, - "loss": 0.0823, - "macro_f1": 0.3272727429866791, + "loss": 0.0809, + "macro_f1": 0.32098767161369324, "num_tokens": 526355.0, "repeat_count": 0.0, - "routers_loss": 0.06960040330886841, + "routers_loss": 0.0657225176692009, "skip_count": 1.0, "step": 326, "text_loss": 0.2587239742279053 @@ -3114,13 +3114,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1162109375, + "grad_norm": 0.111328125, "learning_rate": 0.0006540000000000001, - "loss": 0.0799, + "loss": 0.0779, "macro_f1": 0.3333333432674408, "num_tokens": 529689.0, "repeat_count": 0.0, - "routers_loss": 0.02087482251226902, + "routers_loss": 0.01849208027124405, "skip_count": 0.0, "step": 328, "text_loss": 0.2172023057937622 @@ -3133,13 +3133,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.1845703125, "learning_rate": 0.0006580000000000001, - "loss": 0.0757, + "loss": 0.0758, "macro_f1": 0.3333333432674408, "num_tokens": 532603.0, "repeat_count": 0.0, - "routers_loss": 0.016592051833868027, + "routers_loss": 0.016184113919734955, "skip_count": 0.0, "step": 330, "text_loss": 0.5980568528175354 @@ -3152,32 +3152,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.22265625, + "grad_norm": 0.220703125, "learning_rate": 0.000662, - "loss": 0.0438, + "loss": 0.0439, "macro_f1": 0.3333333432674408, "num_tokens": 536056.0, "repeat_count": 0.0, - "routers_loss": 0.012950568459928036, + "routers_loss": 0.01303898449987173, "skip_count": 0.0, "step": 332, "text_loss": 0.5421966314315796 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, "epoch": 1.5682418550044028, - "f1_execute": 0.8799999952316284, + "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.310546875, + "f1_skip": 0.5, + "grad_norm": 0.296875, "learning_rate": 0.000666, - "loss": 0.0964, - "macro_f1": 0.29333335161209106, + "loss": 0.0963, + "macro_f1": 0.465986430644989, "num_tokens": 539231.0, "repeat_count": 3.0, - "routers_loss": 0.3373340964317322, + "routers_loss": 0.3075675964355469, "skip_count": 3.0, "step": 334, "text_loss": 0.19719554483890533 @@ -3190,13 +3190,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.171875, + "grad_norm": 0.173828125, "learning_rate": 0.00067, "loss": 0.0706, "macro_f1": 0.3333333432674408, "num_tokens": 542038.0, "repeat_count": 0.0, - "routers_loss": 0.008110735565423965, + "routers_loss": 0.009116224013268948, "skip_count": 0.0, "step": 336, "text_loss": 0.3407036066055298 @@ -3209,13 +3209,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.248046875, + "grad_norm": 0.2421875, "learning_rate": 0.000674, - "loss": 0.0771, + "loss": 0.0768, "macro_f1": 0.3333333432674408, "num_tokens": 545019.0, "repeat_count": 0.0, - "routers_loss": 0.01841609925031662, + "routers_loss": 0.021463042125105858, "skip_count": 0.0, "step": 338, "text_loss": 0.24486012756824493 @@ -3228,13 +3228,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1708984375, "learning_rate": 0.0006780000000000001, - "loss": 0.0894, + "loss": 0.0889, "macro_f1": 0.3333333432674408, "num_tokens": 548036.0, "repeat_count": 0.0, - "routers_loss": 0.01612614095211029, + "routers_loss": 0.01857556402683258, "skip_count": 0.0, "step": 340, "text_loss": 0.28140124678611755 @@ -3247,13 +3247,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.125, + "grad_norm": 0.130859375, "learning_rate": 0.0006820000000000001, - "loss": 0.0611, + "loss": 0.0617, "macro_f1": 0.3006536364555359, "num_tokens": 551419.0, "repeat_count": 2.0, - "routers_loss": 0.26202192902565, + "routers_loss": 0.27090007066726685, "skip_count": 3.0, "step": 342, "text_loss": 0.20690307021141052 @@ -3266,13 +3266,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.3046875, "learning_rate": 0.0006860000000000001, - "loss": 0.1013, + "loss": 0.1047, "macro_f1": 0.32098764181137085, "num_tokens": 554037.0, "repeat_count": 0.0, - "routers_loss": 0.09235779196023941, + "routers_loss": 0.09231195598840714, "skip_count": 2.0, "step": 344, "text_loss": 0.4479128420352936 @@ -3285,13 +3285,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.255859375, "learning_rate": 0.00069, - "loss": 0.0856, + "loss": 0.0883, "macro_f1": 0.3333333432674408, "num_tokens": 556672.0, "repeat_count": 0.0, - "routers_loss": 0.010735333897173405, + "routers_loss": 0.00935924518853426, "skip_count": 0.0, "step": 346, "text_loss": 0.6377320289611816 @@ -3304,13 +3304,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.2138671875, "learning_rate": 0.000694, - "loss": 0.0778, + "loss": 0.0781, "macro_f1": 0.32098764181137085, "num_tokens": 559756.0, "repeat_count": 0.0, - "routers_loss": 0.14742356538772583, + "routers_loss": 0.17641772329807281, "skip_count": 2.0, "step": 348, "text_loss": 0.6097636222839355 @@ -3323,13 +3323,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.30859375, + "grad_norm": 0.30078125, "learning_rate": 0.0006979999999999999, - "loss": 0.0614, + "loss": 0.0616, "macro_f1": 0.5492662787437439, "num_tokens": 563415.0, "repeat_count": 0.0, - "routers_loss": 0.06606879830360413, + "routers_loss": 0.06240406632423401, "skip_count": 2.0, "step": 350, "text_loss": 0.5291631817817688 @@ -3342,13 +3342,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.322265625, + "grad_norm": 0.296875, "learning_rate": 0.0007019999999999999, - "loss": 0.1033, + "loss": 0.1026, "macro_f1": 0.3333333432674408, "num_tokens": 566357.0, "repeat_count": 0.0, - "routers_loss": 0.012873432599008083, + "routers_loss": 0.012269247323274612, "skip_count": 0.0, "step": 352, "text_loss": 0.5170195698738098 @@ -3361,13 +3361,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.1435546875, "learning_rate": 0.0007059999999999999, - "loss": 0.0819, + "loss": 0.0815, "macro_f1": 0.32098764181137085, "num_tokens": 569449.0, "repeat_count": 0.0, - "routers_loss": 0.07853665202856064, + "routers_loss": 0.07515309751033783, "skip_count": 2.0, "step": 354, "text_loss": 0.34507250785827637 @@ -3380,13 +3380,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.251953125, + "grad_norm": 0.263671875, "learning_rate": 0.00071, - "loss": 0.0804, + "loss": 0.0791, "macro_f1": 0.3144654333591461, "num_tokens": 572761.0, "repeat_count": 1.0, - "routers_loss": 0.2216549813747406, + "routers_loss": 0.20768006145954132, "skip_count": 2.0, "step": 356, "text_loss": 0.3158532381057739 @@ -3399,13 +3399,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.185546875, + "grad_norm": 0.1884765625, "learning_rate": 0.000714, - "loss": 0.0675, + "loss": 0.0682, "macro_f1": 0.3333333432674408, "num_tokens": 575909.0, "repeat_count": 0.0, - "routers_loss": 0.02423691377043724, + "routers_loss": 0.025329967960715294, "skip_count": 0.0, "step": 358, "text_loss": 0.21455390751361847 @@ -3413,18 +3413,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 1.6903434106251836, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.21484375, "learning_rate": 0.000718, - "loss": 0.0781, - "macro_f1": 0.3272727429866791, + "loss": 0.0775, + "macro_f1": 0.32098767161369324, "num_tokens": 579186.0, "repeat_count": 1.0, - "routers_loss": 0.07496294379234314, + "routers_loss": 0.07676175981760025, "skip_count": 0.0, "step": 360, "text_loss": 0.61895352602005 @@ -3437,13 +3437,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2138671875, + "grad_norm": 0.197265625, "learning_rate": 0.000722, - "loss": 0.0778, + "loss": 0.0781, "macro_f1": 0.32098767161369324, "num_tokens": 582437.0, "repeat_count": 0.0, - "routers_loss": 0.08181872963905334, + "routers_loss": 0.08070661872625351, "skip_count": 1.0, "step": 362, "text_loss": 0.20557661354541779 @@ -3456,13 +3456,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.2216796875, "learning_rate": 0.000726, - "loss": 0.1112, + "loss": 0.11, "macro_f1": 0.3333333432674408, "num_tokens": 586096.0, "repeat_count": 0.0, - "routers_loss": 0.016959719359874725, + "routers_loss": 0.015891313552856445, "skip_count": 0.0, "step": 364, "text_loss": 0.597991943359375 @@ -3475,13 +3475,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.15625, "learning_rate": 0.00073, - "loss": 0.0577, + "loss": 0.0573, "macro_f1": 0.3076923191547394, "num_tokens": 589520.0, "repeat_count": 1.0, - "routers_loss": 0.13295969367027283, + "routers_loss": 0.12844261527061462, "skip_count": 3.0, "step": 366, "text_loss": 0.2944789230823517 @@ -3494,13 +3494,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1455078125, + "grad_norm": 0.150390625, "learning_rate": 0.000734, - "loss": 0.0986, + "loss": 0.1005, "macro_f1": 0.3333333432674408, "num_tokens": 592691.0, "repeat_count": 0.0, - "routers_loss": 0.02476893551647663, + "routers_loss": 0.02382199838757515, "skip_count": 0.0, "step": 368, "text_loss": 0.23989969491958618 @@ -3513,13 +3513,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1796875, "learning_rate": 0.000738, - "loss": 0.0682, + "loss": 0.0661, "macro_f1": 0.3333333432674408, "num_tokens": 596004.0, "repeat_count": 0.0, - "routers_loss": 0.019863395020365715, + "routers_loss": 0.018812084570527077, "skip_count": 0.0, "step": 370, "text_loss": 0.22111408412456512 @@ -3532,13 +3532,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.2412109375, "learning_rate": 0.000742, - "loss": 0.0663, + "loss": 0.0666, "macro_f1": 0.3272727429866791, "num_tokens": 599087.0, "repeat_count": 0.0, - "routers_loss": 0.07230417430400848, + "routers_loss": 0.08290331065654755, "skip_count": 1.0, "step": 372, "text_loss": 0.2567356526851654 @@ -3551,13 +3551,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.2412109375, "learning_rate": 0.000746, - "loss": 0.0986, + "loss": 0.0941, "macro_f1": 0.32098764181137085, "num_tokens": 602330.0, "repeat_count": 1.0, - "routers_loss": 0.11727793514728546, + "routers_loss": 0.11482042074203491, "skip_count": 1.0, "step": 374, "text_loss": 0.7217292785644531 @@ -3570,13 +3570,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.224609375, + "grad_norm": 0.2265625, "learning_rate": 0.00075, - "loss": 0.0724, + "loss": 0.0728, "macro_f1": 0.3272727429866791, "num_tokens": 605503.0, "repeat_count": 1.0, - "routers_loss": 0.13495951890945435, + "routers_loss": 0.11849870532751083, "skip_count": 0.0, "step": 376, "text_loss": 0.5122153759002686 @@ -3589,13 +3589,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23046875, + "grad_norm": 0.2333984375, "learning_rate": 0.000754, - "loss": 0.0823, + "loss": 0.0835, "macro_f1": 0.32098767161369324, "num_tokens": 608505.0, "repeat_count": 0.0, - "routers_loss": 0.07612533867359161, + "routers_loss": 0.07090992480516434, "skip_count": 1.0, "step": 378, "text_loss": 0.2204965502023697 @@ -3608,13 +3608,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.1826171875, "learning_rate": 0.000758, - "loss": 0.0803, + "loss": 0.0794, "macro_f1": 0.3272727429866791, "num_tokens": 611193.0, "repeat_count": 0.0, - "routers_loss": 0.0484120175242424, + "routers_loss": 0.03812089189887047, "skip_count": 1.0, "step": 380, "text_loss": 0.44909021258354187 @@ -3627,13 +3627,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1689453125, "learning_rate": 0.000762, - "loss": 0.0866, + "loss": 0.0882, "macro_f1": 0.3272727429866791, "num_tokens": 614231.0, "repeat_count": 1.0, - "routers_loss": 0.10939671844244003, + "routers_loss": 0.10270529240369797, "skip_count": 0.0, "step": 382, "text_loss": 0.13624964654445648 @@ -3646,13 +3646,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.326171875, + "grad_norm": 0.330078125, "learning_rate": 0.0007660000000000001, - "loss": 0.1083, + "loss": 0.1107, "macro_f1": 0.32098764181137085, "num_tokens": 617090.0, "repeat_count": 1.0, - "routers_loss": 0.11382336914539337, + "routers_loss": 0.11624004691839218, "skip_count": 1.0, "step": 384, "text_loss": 0.7314052581787109 @@ -3667,11 +3667,11 @@ "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.0007700000000000001, - "loss": 0.0616, + "loss": 0.0628, "macro_f1": 0.32098764181137085, "num_tokens": 620596.0, "repeat_count": 0.0, - "routers_loss": 0.07494530081748962, + "routers_loss": 0.07114322483539581, "skip_count": 2.0, "step": 386, "text_loss": 0.503322958946228 @@ -3684,13 +3684,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.298828125, + "grad_norm": 0.306640625, "learning_rate": 0.0007740000000000001, - "loss": 0.0816, + "loss": 0.0829, "macro_f1": 0.32098764181137085, "num_tokens": 624108.0, "repeat_count": 0.0, - "routers_loss": 0.05718417093157768, + "routers_loss": 0.06061873584985733, "skip_count": 2.0, "step": 388, "text_loss": 0.11481904983520508 @@ -3703,13 +3703,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1982421875, + "grad_norm": 0.2099609375, "learning_rate": 0.000778, - "loss": 0.0783, + "loss": 0.0791, "macro_f1": 0.3006536364555359, "num_tokens": 626895.0, "repeat_count": 1.0, - "routers_loss": 0.2848989963531494, + "routers_loss": 0.2921771705150604, "skip_count": 4.0, "step": 390, "text_loss": 0.3069624602794647 @@ -3722,13 +3722,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.30078125, + "grad_norm": 0.30859375, "learning_rate": 0.000782, - "loss": 0.0608, + "loss": 0.0605, "macro_f1": 0.3076923191547394, "num_tokens": 630204.0, "repeat_count": 0.0, - "routers_loss": 0.2050076276063919, + "routers_loss": 0.202707901597023, "skip_count": 4.0, "step": 392, "text_loss": 0.6022785305976868 @@ -3741,13 +3741,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28125, + "grad_norm": 0.29296875, "learning_rate": 0.000786, - "loss": 0.0863, + "loss": 0.0877, "macro_f1": 0.3333333432674408, "num_tokens": 634373.0, "repeat_count": 0.0, - "routers_loss": 0.020946886390447617, + "routers_loss": 0.0221510399132967, "skip_count": 0.0, "step": 394, "text_loss": 0.26787394285202026 @@ -3760,13 +3760,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.376953125, + "grad_norm": 0.37890625, "learning_rate": 0.00079, - "loss": 0.0798, + "loss": 0.0805, "macro_f1": 0.32098764181137085, "num_tokens": 637442.0, "repeat_count": 2.0, - "routers_loss": 0.1270289123058319, + "routers_loss": 0.12636390328407288, "skip_count": 0.0, "step": 396, "text_loss": 0.2799781560897827 @@ -3779,13 +3779,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.2080078125, "learning_rate": 0.0007940000000000001, - "loss": 0.0701, + "loss": 0.0724, "macro_f1": 0.32098764181137085, "num_tokens": 641231.0, "repeat_count": 0.0, - "routers_loss": 0.08012636005878448, + "routers_loss": 0.07933453470468521, "skip_count": 2.0, "step": 398, "text_loss": 0.2507784366607666 @@ -3798,13 +3798,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.2138671875, "learning_rate": 0.0007980000000000001, - "loss": 0.0901, + "loss": 0.0909, "macro_f1": 0.3272727429866791, "num_tokens": 644560.0, "repeat_count": 1.0, - "routers_loss": 0.09315784275531769, + "routers_loss": 0.10324911028146744, "skip_count": 0.0, "step": 400, "text_loss": 0.7756280303001404 @@ -3817,13 +3817,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2294921875, + "grad_norm": 0.2275390625, "learning_rate": 0.0008020000000000001, - "loss": 0.078, + "loss": 0.0783, "macro_f1": 0.3144654333591461, "num_tokens": 647393.0, "repeat_count": 1.0, - "routers_loss": 0.18492189049720764, + "routers_loss": 0.18546262383460999, "skip_count": 2.0, "step": 402, "text_loss": 0.5013328194618225 @@ -3836,13 +3836,13 @@ "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.283203125, "learning_rate": 0.0008060000000000001, - "loss": 0.0801, + "loss": 0.0787, "macro_f1": 0.2857142984867096, "num_tokens": 650355.0, "repeat_count": 3.0, - "routers_loss": 0.32641324400901794, + "routers_loss": 0.3280293643474579, "skip_count": 4.0, "step": 404, "text_loss": 0.2842077314853668 @@ -3855,13 +3855,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2080078125, + "grad_norm": 0.2138671875, "learning_rate": 0.0008100000000000001, - "loss": 0.0905, + "loss": 0.0901, "macro_f1": 0.3333333432674408, "num_tokens": 654280.0, "repeat_count": 0.0, - "routers_loss": 0.02722037397325039, + "routers_loss": 0.02623247355222702, "skip_count": 0.0, "step": 406, "text_loss": 0.46742817759513855 @@ -3874,13 +3874,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.216796875, "learning_rate": 0.0008139999999999999, - "loss": 0.0958, + "loss": 0.0945, "macro_f1": 0.3333333432674408, "num_tokens": 657568.0, "repeat_count": 0.0, - "routers_loss": 0.010129833593964577, + "routers_loss": 0.009744114242494106, "skip_count": 0.0, "step": 408, "text_loss": 0.7168047428131104 @@ -3893,13 +3893,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2373046875, + "grad_norm": 0.2158203125, "learning_rate": 0.0008179999999999999, - "loss": 0.1084, + "loss": 0.1065, "macro_f1": 0.32098764181137085, "num_tokens": 660593.0, "repeat_count": 0.0, - "routers_loss": 0.07298308610916138, + "routers_loss": 0.07591600716114044, "skip_count": 2.0, "step": 410, "text_loss": 0.449823260307312 @@ -3912,13 +3912,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.15625, + "grad_norm": 0.1396484375, "learning_rate": 0.0008219999999999999, - "loss": 0.0802, + "loss": 0.0795, "macro_f1": 0.3333333432674408, "num_tokens": 663916.0, "repeat_count": 0.0, - "routers_loss": 0.024257874116301537, + "routers_loss": 0.02076602540910244, "skip_count": 0.0, "step": 412, "text_loss": 0.4764713943004608 @@ -3931,13 +3931,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1904296875, + "grad_norm": 0.1650390625, "learning_rate": 0.000826, - "loss": 0.0842, + "loss": 0.0836, "macro_f1": 0.3272727429866791, "num_tokens": 667502.0, "repeat_count": 0.0, - "routers_loss": 0.048864223062992096, + "routers_loss": 0.049170155078172684, "skip_count": 1.0, "step": 414, "text_loss": 0.30333325266838074 @@ -3950,13 +3950,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1513671875, "learning_rate": 0.00083, - "loss": 0.1026, + "loss": 0.1021, "macro_f1": 0.3272727429866791, "num_tokens": 670510.0, "repeat_count": 1.0, - "routers_loss": 0.1592330038547516, + "routers_loss": 0.15554003417491913, "skip_count": 0.0, "step": 416, "text_loss": 0.3691870868206024 @@ -3969,13 +3969,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25390625, + "grad_norm": 0.263671875, "learning_rate": 0.000834, - "loss": 0.0963, + "loss": 0.1013, "macro_f1": 0.3333333432674408, "num_tokens": 674761.0, "repeat_count": 0.0, - "routers_loss": 0.02291976846754551, + "routers_loss": 0.024516675621271133, "skip_count": 0.0, "step": 418, "text_loss": 0.32850381731987 @@ -3988,13 +3988,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.10888671875, "learning_rate": 0.000838, - "loss": 0.0634, + "loss": 0.0649, "macro_f1": 0.3333333432674408, "num_tokens": 678055.0, "repeat_count": 0.0, - "routers_loss": 0.010272650048136711, + "routers_loss": 0.011026890948414803, "skip_count": 0.0, "step": 420, "text_loss": 0.6637290716171265 @@ -4007,13 +4007,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28125, + "grad_norm": 0.263671875, "learning_rate": 0.000842, - "loss": 0.0786, + "loss": 0.0771, "macro_f1": 0.3272727429866791, "num_tokens": 680979.0, "repeat_count": 0.0, - "routers_loss": 0.0692613497376442, + "routers_loss": 0.07451887428760529, "skip_count": 1.0, "step": 422, "text_loss": 0.27131685614585876 @@ -4026,13 +4026,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12890625, + "grad_norm": 0.1318359375, "learning_rate": 0.000846, - "loss": 0.0706, + "loss": 0.0714, "macro_f1": 0.32098764181137085, "num_tokens": 684144.0, "repeat_count": 1.0, - "routers_loss": 0.12713804841041565, + "routers_loss": 0.11341800540685654, "skip_count": 1.0, "step": 424, "text_loss": 0.652126669883728 @@ -4045,13 +4045,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.2158203125, "learning_rate": 0.00085, - "loss": 0.0758, + "loss": 0.0754, "macro_f1": 0.3272727429866791, "num_tokens": 687004.0, "repeat_count": 1.0, - "routers_loss": 0.08670130372047424, + "routers_loss": 0.08985847979784012, "skip_count": 0.0, "step": 426, "text_loss": 0.2589428424835205 @@ -4064,13 +4064,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.240234375, + "grad_norm": 0.23828125, "learning_rate": 0.000854, - "loss": 0.0857, + "loss": 0.0866, "macro_f1": 0.3333333432674408, "num_tokens": 689702.0, "repeat_count": 0.0, - "routers_loss": 0.01053862925618887, + "routers_loss": 0.011355436407029629, "skip_count": 0.0, "step": 428, "text_loss": 0.8909716010093689 @@ -4083,13 +4083,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1435546875, "learning_rate": 0.000858, - "loss": 0.0615, + "loss": 0.0623, "macro_f1": 0.3333333432674408, "num_tokens": 692698.0, "repeat_count": 0.0, - "routers_loss": 0.012946994043886662, + "routers_loss": 0.013788948766887188, "skip_count": 0.0, "step": 430, "text_loss": 0.19141142070293427 @@ -4102,13 +4102,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1552734375, "learning_rate": 0.000862, - "loss": 0.0498, + "loss": 0.0499, "macro_f1": 0.32098764181137085, "num_tokens": 696007.0, "repeat_count": 0.0, - "routers_loss": 0.08222822099924088, + "routers_loss": 0.07998392730951309, "skip_count": 2.0, "step": 432, "text_loss": 0.1611809879541397 @@ -4121,13 +4121,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.173828125, "learning_rate": 0.000866, - "loss": 0.0532, + "loss": 0.0541, "macro_f1": 0.32098764181137085, "num_tokens": 700271.0, "repeat_count": 0.0, - "routers_loss": 0.07086442410945892, + "routers_loss": 0.06988382339477539, "skip_count": 2.0, "step": 434, "text_loss": 0.37254223227500916 @@ -4140,13 +4140,13 @@ "f1_execute": 0.8333333730697632, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.1943359375, "learning_rate": 0.00087, - "loss": 0.0825, + "loss": 0.0834, "macro_f1": 0.2777777910232544, "num_tokens": 703519.0, "repeat_count": 3.0, - "routers_loss": 0.29007306694984436, + "routers_loss": 0.28240787982940674, "skip_count": 5.0, "step": 436, "text_loss": 0.29636648297309875 @@ -4159,13 +4159,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.396484375, + "grad_norm": 0.423828125, "learning_rate": 0.000874, - "loss": 0.0658, + "loss": 0.0657, "macro_f1": 0.3333333432674408, "num_tokens": 706826.0, "repeat_count": 0.0, - "routers_loss": 0.014652491547167301, + "routers_loss": 0.013924967497587204, "skip_count": 0.0, "step": 438, "text_loss": 0.20867908000946045 @@ -4178,13 +4178,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2294921875, + "grad_norm": 0.2353515625, "learning_rate": 0.000878, - "loss": 0.0685, + "loss": 0.0657, "macro_f1": 0.3333333432674408, "num_tokens": 710530.0, "repeat_count": 0.0, - "routers_loss": 0.013720969669520855, + "routers_loss": 0.01170142088085413, "skip_count": 0.0, "step": 440, "text_loss": 0.7273373007774353 @@ -4197,13 +4197,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.173828125, + "grad_norm": 0.171875, "learning_rate": 0.000882, - "loss": 0.0771, + "loss": 0.076, "macro_f1": 0.3333333432674408, "num_tokens": 713503.0, "repeat_count": 0.0, - "routers_loss": 0.011687638238072395, + "routers_loss": 0.011930872686207294, "skip_count": 0.0, "step": 442, "text_loss": 0.39314430952072144 @@ -4216,13 +4216,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.2490234375, "learning_rate": 0.0008860000000000001, - "loss": 0.0604, + "loss": 0.0592, "macro_f1": 0.3333333432674408, "num_tokens": 716582.0, "repeat_count": 0.0, - "routers_loss": 0.007869532331824303, + "routers_loss": 0.008630385622382164, "skip_count": 0.0, "step": 444, "text_loss": 0.5925271511077881 @@ -4230,18 +4230,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 2.0939242735544465, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.23046875, "learning_rate": 0.0008900000000000001, - "loss": 0.0797, - "macro_f1": 0.3076923191547394, + "loss": 0.0811, + "macro_f1": 0.3006536066532135, "num_tokens": 719941.0, "repeat_count": 3.0, - "routers_loss": 0.3034668564796448, + "routers_loss": 0.3015584945678711, "skip_count": 1.0, "step": 446, "text_loss": 0.5059905052185059 @@ -4254,13 +4254,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2314453125, + "grad_norm": 0.203125, "learning_rate": 0.000894, - "loss": 0.0823, + "loss": 0.0822, "macro_f1": 0.31446540355682373, "num_tokens": 723113.0, "repeat_count": 1.0, - "routers_loss": 0.11066079139709473, + "routers_loss": 0.10897493362426758, "skip_count": 1.0, "step": 448, "text_loss": 0.19616436958312988 @@ -4273,13 +4273,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3046875, + "grad_norm": 0.33984375, "learning_rate": 0.000898, - "loss": 0.0773, + "loss": 0.0782, "macro_f1": 0.32098764181137085, "num_tokens": 726193.0, "repeat_count": 0.0, - "routers_loss": 0.0755370482802391, + "routers_loss": 0.07236456125974655, "skip_count": 2.0, "step": 450, "text_loss": 0.1773054152727127 @@ -4292,13 +4292,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28125, + "grad_norm": 0.3203125, "learning_rate": 0.000902, - "loss": 0.0596, + "loss": 0.058, "macro_f1": 0.3272727429866791, "num_tokens": 729275.0, "repeat_count": 1.0, - "routers_loss": 0.08470689505338669, + "routers_loss": 0.08184371143579483, "skip_count": 0.0, "step": 452, "text_loss": 0.4927310049533844 @@ -4311,13 +4311,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19921875, + "grad_norm": 0.1953125, "learning_rate": 0.000906, - "loss": 0.0608, + "loss": 0.0607, "macro_f1": 0.3333333432674408, "num_tokens": 731948.0, "repeat_count": 0.0, - "routers_loss": 0.0130238626152277, + "routers_loss": 0.014033539220690727, "skip_count": 0.0, "step": 454, "text_loss": 0.4745742678642273 @@ -4330,13 +4330,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.154296875, "learning_rate": 0.00091, - "loss": 0.0652, + "loss": 0.0651, "macro_f1": 0.3333333432674408, "num_tokens": 735351.0, "repeat_count": 0.0, - "routers_loss": 0.007108641788363457, + "routers_loss": 0.0071774693205952644, "skip_count": 0.0, "step": 456, "text_loss": 0.18523462116718292 @@ -4351,11 +4351,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.400390625, "learning_rate": 0.0009140000000000001, - "loss": 0.0746, + "loss": 0.0738, "macro_f1": 0.5492662787437439, "num_tokens": 738587.0, "repeat_count": 0.0, - "routers_loss": 0.06834109872579575, + "routers_loss": 0.07781517505645752, "skip_count": 2.0, "step": 458, "text_loss": 0.3459635376930237 @@ -4368,13 +4368,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.28125, "learning_rate": 0.0009180000000000001, - "loss": 0.0733, + "loss": 0.0723, "macro_f1": 0.3076923191547394, "num_tokens": 741779.0, "repeat_count": 0.0, - "routers_loss": 0.10230778902769089, + "routers_loss": 0.09529037028551102, "skip_count": 2.0, "step": 460, "text_loss": 0.20197433233261108 @@ -4387,13 +4387,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.1865234375, "learning_rate": 0.0009220000000000001, - "loss": 0.0528, + "loss": 0.0519, "macro_f1": 0.3333333432674408, "num_tokens": 745355.0, "repeat_count": 0.0, - "routers_loss": 0.009987542405724525, + "routers_loss": 0.009765669703483582, "skip_count": 0.0, "step": 462, "text_loss": 0.7031404376029968 @@ -4406,13 +4406,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.125, + "grad_norm": 0.1298828125, "learning_rate": 0.0009260000000000001, - "loss": 0.0536, + "loss": 0.0527, "macro_f1": 0.3272727429866791, "num_tokens": 748628.0, "repeat_count": 0.0, - "routers_loss": 0.03448869287967682, + "routers_loss": 0.03344850242137909, "skip_count": 1.0, "step": 464, "text_loss": 0.21274663507938385 @@ -4425,13 +4425,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.173828125, "learning_rate": 0.00093, - "loss": 0.053, + "loss": 0.0534, "macro_f1": 0.3076923191547394, "num_tokens": 751472.0, "repeat_count": 2.0, - "routers_loss": 0.13631699979305267, + "routers_loss": 0.1354292333126068, "skip_count": 2.0, "step": 466, "text_loss": 0.5350717306137085 @@ -4444,13 +4444,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.142578125, "learning_rate": 0.000934, - "loss": 0.06, + "loss": 0.0598, "macro_f1": 0.3272727429866791, "num_tokens": 754479.0, "repeat_count": 0.0, - "routers_loss": 0.053951870650053024, + "routers_loss": 0.056420840322971344, "skip_count": 1.0, "step": 468, "text_loss": 0.28153330087661743 @@ -4463,13 +4463,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.228515625, + "grad_norm": 0.234375, "learning_rate": 0.0009379999999999999, - "loss": 0.059, + "loss": 0.0597, "macro_f1": 0.31446540355682373, "num_tokens": 757872.0, "repeat_count": 1.0, - "routers_loss": 0.14479905366897583, + "routers_loss": 0.1622387170791626, "skip_count": 1.0, "step": 470, "text_loss": 0.22956843674182892 @@ -4482,13 +4482,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.44140625, + "grad_norm": 0.5, "learning_rate": 0.000942, - "loss": 0.0913, + "loss": 0.0953, "macro_f1": 0.32098764181137085, "num_tokens": 760468.0, "repeat_count": 0.0, - "routers_loss": 0.056221429258584976, + "routers_loss": 0.05146972835063934, "skip_count": 2.0, "step": 472, "text_loss": 0.4513966739177704 @@ -4501,13 +4501,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1904296875, + "grad_norm": 0.212890625, "learning_rate": 0.000946, - "loss": 0.0591, + "loss": 0.0592, "macro_f1": 0.3272727429866791, "num_tokens": 763519.0, "repeat_count": 1.0, - "routers_loss": 0.09729792177677155, + "routers_loss": 0.09022669494152069, "skip_count": 0.0, "step": 474, "text_loss": 0.25758957862854004 @@ -4520,13 +4520,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12158203125, + "grad_norm": 0.1259765625, "learning_rate": 0.00095, - "loss": 0.0496, + "loss": 0.0498, "macro_f1": 0.3272727429866791, "num_tokens": 767391.0, "repeat_count": 0.0, - "routers_loss": 0.029447713866829872, + "routers_loss": 0.03044828027486801, "skip_count": 1.0, "step": 476, "text_loss": 0.21366681158542633 @@ -4539,13 +4539,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.271484375, + "grad_norm": 0.291015625, "learning_rate": 0.000954, - "loss": 0.0801, + "loss": 0.0802, "macro_f1": 0.3272727429866791, "num_tokens": 770338.0, "repeat_count": 0.0, - "routers_loss": 0.09337342530488968, + "routers_loss": 0.10397060960531235, "skip_count": 1.0, "step": 478, "text_loss": 1.0396177768707275 @@ -4560,11 +4560,11 @@ "f1_skip": 0.0, "grad_norm": 0.267578125, "learning_rate": 0.000958, - "loss": 0.1102, + "loss": 0.1099, "macro_f1": 0.285714328289032, "num_tokens": 773699.0, "repeat_count": 2.0, - "routers_loss": 0.23193210363388062, + "routers_loss": 0.22604143619537354, "skip_count": 4.0, "step": 480, "text_loss": 0.2570283114910126 @@ -4572,18 +4572,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 2.2629879659524508, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.146484375, "learning_rate": 0.000962, - "loss": 0.0669, - "macro_f1": 0.3272727429866791, + "loss": 0.0667, + "macro_f1": 0.32098767161369324, "num_tokens": 777473.0, "repeat_count": 0.0, - "routers_loss": 0.046257760375738144, + "routers_loss": 0.048258859664201736, "skip_count": 1.0, "step": 482, "text_loss": 0.2540103495121002 @@ -4596,13 +4596,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1708984375, + "grad_norm": 0.197265625, "learning_rate": 0.000966, - "loss": 0.0552, + "loss": 0.0592, "macro_f1": 0.3333333432674408, "num_tokens": 780833.0, "repeat_count": 0.0, - "routers_loss": 0.01683143898844719, + "routers_loss": 0.023018671199679375, "skip_count": 0.0, "step": 484, "text_loss": 0.38524550199508667 @@ -4615,13 +4615,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.326171875, + "grad_norm": 0.314453125, "learning_rate": 0.0009699999999999999, - "loss": 0.071, + "loss": 0.0709, "macro_f1": 0.3272727429866791, "num_tokens": 783656.0, "repeat_count": 0.0, - "routers_loss": 0.04129387438297272, + "routers_loss": 0.044845327734947205, "skip_count": 1.0, "step": 486, "text_loss": 0.5859048366546631 @@ -4634,13 +4634,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.263671875, "learning_rate": 0.000974, - "loss": 0.0605, + "loss": 0.0615, "macro_f1": 0.3333333432674408, "num_tokens": 787173.0, "repeat_count": 0.0, - "routers_loss": 0.01262948103249073, + "routers_loss": 0.010898692533373833, "skip_count": 0.0, "step": 488, "text_loss": 0.3456067442893982 @@ -4653,13 +4653,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.263671875, "learning_rate": 0.000978, - "loss": 0.081, + "loss": 0.0796, "macro_f1": 0.32098764181137085, "num_tokens": 790395.0, "repeat_count": 0.0, - "routers_loss": 0.07404553890228271, + "routers_loss": 0.06497956812381744, "skip_count": 2.0, "step": 490, "text_loss": 0.3751123249530792 @@ -4672,13 +4672,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.2158203125, "learning_rate": 0.000982, - "loss": 0.0751, + "loss": 0.0772, "macro_f1": 0.3272727429866791, "num_tokens": 793137.0, "repeat_count": 0.0, - "routers_loss": 0.06795930862426758, + "routers_loss": 0.07763728499412537, "skip_count": 1.0, "step": 492, "text_loss": 0.43296709656715393 @@ -4691,13 +4691,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.1416015625, "learning_rate": 0.0009860000000000001, - "loss": 0.0804, + "loss": 0.0819, "macro_f1": 0.3333333432674408, "num_tokens": 796497.0, "repeat_count": 0.0, - "routers_loss": 0.02233024686574936, + "routers_loss": 0.02127906307578087, "skip_count": 0.0, "step": 494, "text_loss": 0.4841311275959015 @@ -4710,13 +4710,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1953125, + "grad_norm": 0.2138671875, "learning_rate": 0.00099, - "loss": 0.0731, + "loss": 0.073, "macro_f1": 0.3272727429866791, "num_tokens": 799361.0, "repeat_count": 1.0, - "routers_loss": 0.07979031652212143, + "routers_loss": 0.09518691152334213, "skip_count": 0.0, "step": 496, "text_loss": 0.5094487071037292 @@ -4729,13 +4729,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1298828125, + "grad_norm": 0.130859375, "learning_rate": 0.000994, - "loss": 0.0795, + "loss": 0.0789, "macro_f1": 0.5492662787437439, "num_tokens": 802629.0, "repeat_count": 0.0, - "routers_loss": 0.045646365731954575, + "routers_loss": 0.0563947930932045, "skip_count": 2.0, "step": 498, "text_loss": 0.42783617973327637 @@ -4748,13 +4748,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1953125, + "grad_norm": 0.1865234375, "learning_rate": 0.000998, "loss": 0.0476, "macro_f1": 0.3272727429866791, "num_tokens": 805881.0, "repeat_count": 1.0, - "routers_loss": 0.09717849642038345, + "routers_loss": 0.10570426285266876, "skip_count": 0.0, "step": 500, "text_loss": 0.28395503759384155 @@ -4767,13 +4767,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.30078125, + "grad_norm": 0.2275390625, "learning_rate": 0.0009999999760498814, - "loss": 0.0894, + "loss": 0.0849, "macro_f1": 0.5492662787437439, "num_tokens": 809283.0, "repeat_count": 0.0, - "routers_loss": 0.03948225453495979, + "routers_loss": 0.031202208250761032, "skip_count": 2.0, "step": 502, "text_loss": 0.32970911264419556 @@ -4786,13 +4786,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.15625, + "grad_norm": 0.1455078125, "learning_rate": 0.0009999997844489475, - "loss": 0.0557, + "loss": 0.0574, "macro_f1": 0.3272727429866791, "num_tokens": 812440.0, "repeat_count": 0.0, - "routers_loss": 0.0742638111114502, + "routers_loss": 0.07647835463285446, "skip_count": 1.0, "step": 504, "text_loss": 0.4901447296142578 @@ -4805,13 +4805,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.25, "learning_rate": 0.000999999401247153, - "loss": 0.0682, + "loss": 0.0668, "macro_f1": 0.32098764181137085, "num_tokens": 815716.0, "repeat_count": 0.0, - "routers_loss": 0.08293049037456512, + "routers_loss": 0.08515176922082901, "skip_count": 2.0, "step": 506, "text_loss": 0.6157599687576294 @@ -4824,13 +4824,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.25390625, "learning_rate": 0.0009999988264446445, - "loss": 0.0697, + "loss": 0.0686, "macro_f1": 0.3333333432674408, "num_tokens": 819086.0, "repeat_count": 0.0, - "routers_loss": 0.010080376639962196, + "routers_loss": 0.00946938619017601, "skip_count": 0.0, "step": 508, "text_loss": 0.5053519010543823 @@ -4843,13 +4843,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1640625, "learning_rate": 0.0009999980600416424, - "loss": 0.0611, + "loss": 0.0574, "macro_f1": 0.3333333432674408, "num_tokens": 822268.0, "repeat_count": 0.0, - "routers_loss": 0.009179878048598766, + "routers_loss": 0.01058756373822689, "skip_count": 0.0, "step": 510, "text_loss": 0.5570021867752075 @@ -4862,13 +4862,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11083984375, + "grad_norm": 0.1240234375, "learning_rate": 0.000999997102038441, - "loss": 0.0689, + "loss": 0.0678, "macro_f1": 0.3333333432674408, "num_tokens": 825728.0, "repeat_count": 0.0, - "routers_loss": 0.006718529388308525, + "routers_loss": 0.008705209009349346, "skip_count": 0.0, "step": 512, "text_loss": 0.6519040465354919 @@ -4881,13 +4881,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.220703125, "learning_rate": 0.0009999959524354064, - "loss": 0.0826, + "loss": 0.083, "macro_f1": 0.3272727429866791, "num_tokens": 829459.0, "repeat_count": 0.0, - "routers_loss": 0.049344487488269806, + "routers_loss": 0.04024193435907364, "skip_count": 1.0, "step": 514, "text_loss": 0.5290043950080872 @@ -4900,13 +4900,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.259765625, + "grad_norm": 0.25390625, "learning_rate": 0.00099999461123298, - "loss": 0.0739, + "loss": 0.0727, "macro_f1": 0.3333333432674408, "num_tokens": 832291.0, "repeat_count": 0.0, - "routers_loss": 0.013402626849710941, + "routers_loss": 0.015742862597107887, "skip_count": 0.0, "step": 516, "text_loss": 0.7910057902336121 @@ -4919,13 +4919,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.2275390625, "learning_rate": 0.000999993078431675, - "loss": 0.0761, + "loss": 0.0759, "macro_f1": 0.3076923191547394, "num_tokens": 835399.0, "repeat_count": 1.0, - "routers_loss": 0.16964484751224518, + "routers_loss": 0.16753782331943512, "skip_count": 3.0, "step": 518, "text_loss": 0.45196083188056946 @@ -4938,13 +4938,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2236328125, + "grad_norm": 0.236328125, "learning_rate": 0.0009999913540320792, - "loss": 0.095, + "loss": 0.0968, "macro_f1": 0.31446540355682373, "num_tokens": 838993.0, "repeat_count": 0.0, - "routers_loss": 0.08609295636415482, + "routers_loss": 0.09357143193483353, "skip_count": 2.0, "step": 520, "text_loss": 0.5499435663223267 @@ -4957,13 +4957,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.2392578125, + "grad_norm": 0.2451171875, "learning_rate": 0.0009999894380348536, - "loss": 0.0816, + "loss": 0.0821, "macro_f1": 0.5492662787437439, "num_tokens": 842652.0, "repeat_count": 0.0, - "routers_loss": 0.05354784056544304, + "routers_loss": 0.056803856045007706, "skip_count": 2.0, "step": 522, "text_loss": 0.197520449757576 @@ -4976,13 +4976,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.2236328125, + "grad_norm": 0.2333984375, "learning_rate": 0.000999987330440732, - "loss": 0.0715, + "loss": 0.0725, "macro_f1": 0.4871794879436493, "num_tokens": 847061.0, "repeat_count": 0.0, - "routers_loss": 0.09146631509065628, + "routers_loss": 0.08962195366621017, "skip_count": 3.0, "step": 524, "text_loss": 0.27509039640426636 @@ -4995,13 +4995,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.189453125, "learning_rate": 0.000999985031250522, - "loss": 0.0574, + "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 850780.0, "repeat_count": 0.0, - "routers_loss": 0.02344255894422531, + "routers_loss": 0.022930558770895004, "skip_count": 0.0, "step": 526, "text_loss": 0.13291706144809723 @@ -5014,13 +5014,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1982421875, + "grad_norm": 0.197265625, "learning_rate": 0.0009999825404651053, - "loss": 0.0621, + "loss": 0.0614, "macro_f1": 0.3333333432674408, "num_tokens": 853886.0, "repeat_count": 0.0, - "routers_loss": 0.018271517008543015, + "routers_loss": 0.017097990959882736, "skip_count": 0.0, "step": 528, "text_loss": 0.21706295013427734 @@ -5033,13 +5033,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2060546875, + "grad_norm": 0.212890625, "learning_rate": 0.0009999798580854356, - "loss": 0.0717, + "loss": 0.0724, "macro_f1": 0.3333333432674408, "num_tokens": 857364.0, "repeat_count": 0.0, - "routers_loss": 0.026990914717316628, + "routers_loss": 0.02831801027059555, "skip_count": 0.0, "step": 530, "text_loss": 0.9035662412643433 @@ -5052,13 +5052,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16015625, + "grad_norm": 0.1591796875, "learning_rate": 0.000999976984112541, - "loss": 0.0681, + "loss": 0.0674, "macro_f1": 0.3333333432674408, "num_tokens": 860661.0, "repeat_count": 0.0, - "routers_loss": 0.019737249240279198, + "routers_loss": 0.019671892747282982, "skip_count": 0.0, "step": 532, "text_loss": 0.8354863524436951 @@ -5071,13 +5071,13 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.3046875, + "grad_norm": 0.2890625, "learning_rate": 0.0009999739185475231, - "loss": 0.0978, + "loss": 0.0963, "macro_f1": 0.47333335876464844, "num_tokens": 864124.0, "repeat_count": 2.0, - "routers_loss": 0.212640181183815, + "routers_loss": 0.21383361518383026, "skip_count": 3.0, "step": 534, "text_loss": 0.23422949016094208 @@ -5090,13 +5090,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.2490234375, "learning_rate": 0.0009999706613915565, - "loss": 0.0602, + "loss": 0.0598, "macro_f1": 0.32098767161369324, "num_tokens": 866976.0, "repeat_count": 0.0, - "routers_loss": 0.07302755117416382, + "routers_loss": 0.07158871740102768, "skip_count": 1.0, "step": 536, "text_loss": 0.11800774186849594 @@ -5109,13 +5109,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.296875, + "grad_norm": 0.26953125, "learning_rate": 0.0009999672126458894, - "loss": 0.0825, + "loss": 0.0822, "macro_f1": 0.3272727429866791, "num_tokens": 870549.0, "repeat_count": 0.0, - "routers_loss": 0.08667246252298355, + "routers_loss": 0.08185924589633942, "skip_count": 1.0, "step": 538, "text_loss": 0.19232480227947235 @@ -5128,13 +5128,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1318359375, + "grad_norm": 0.1396484375, "learning_rate": 0.000999963572311843, - "loss": 0.0597, + "loss": 0.0604, "macro_f1": 0.3333333432674408, "num_tokens": 873733.0, "repeat_count": 0.0, - "routers_loss": 0.015047167427837849, + "routers_loss": 0.01633382774889469, "skip_count": 0.0, "step": 540, "text_loss": 0.3725031912326813 @@ -5147,13 +5147,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.15234375, "learning_rate": 0.0009999597403908128, - "loss": 0.076, + "loss": 0.0761, "macro_f1": 0.3272727429866791, "num_tokens": 877099.0, "repeat_count": 0.0, - "routers_loss": 0.07481446117162704, + "routers_loss": 0.0782657191157341, "skip_count": 1.0, "step": 542, "text_loss": 0.17589199542999268 @@ -5166,13 +5166,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1943359375, + "grad_norm": 0.2177734375, "learning_rate": 0.0009999557168842669, - "loss": 0.0724, + "loss": 0.0716, "macro_f1": 0.5492662787437439, "num_tokens": 879883.0, "repeat_count": 0.0, - "routers_loss": 0.049495212733745575, + "routers_loss": 0.05275818333029747, "skip_count": 2.0, "step": 544, "text_loss": 0.26448264718055725 @@ -5185,13 +5185,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25390625, + "grad_norm": 0.2490234375, "learning_rate": 0.0009999515017937468, - "loss": 0.0718, + "loss": 0.071, "macro_f1": 0.32098764181137085, "num_tokens": 882223.0, "repeat_count": 0.0, - "routers_loss": 0.08043002337217331, + "routers_loss": 0.09335892647504807, "skip_count": 2.0, "step": 546, "text_loss": 0.208544060587883 @@ -5204,13 +5204,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.34765625, + "grad_norm": 0.376953125, "learning_rate": 0.0009999470951208684, - "loss": 0.086, + "loss": 0.0855, "macro_f1": 0.32098764181137085, "num_tokens": 885241.0, "repeat_count": 2.0, - "routers_loss": 0.22461950778961182, + "routers_loss": 0.22983254492282867, "skip_count": 0.0, "step": 548, "text_loss": 0.6612338423728943 @@ -5223,13 +5223,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.216796875, "learning_rate": 0.00099994249686732, - "loss": 0.0798, + "loss": 0.0786, "macro_f1": 0.3272727429866791, "num_tokens": 887897.0, "repeat_count": 1.0, - "routers_loss": 0.11754962801933289, + "routers_loss": 0.12858282029628754, "skip_count": 0.0, "step": 550, "text_loss": 0.4673548936843872 @@ -5242,13 +5242,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1611328125, + "grad_norm": 0.1591796875, "learning_rate": 0.0009999377070348638, - "loss": 0.0978, + "loss": 0.0944, "macro_f1": 0.3333333432674408, "num_tokens": 891224.0, "repeat_count": 0.0, - "routers_loss": 0.017412789165973663, + "routers_loss": 0.017421770840883255, "skip_count": 0.0, "step": 552, "text_loss": 0.6419258117675781 @@ -5261,13 +5261,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.15625, "learning_rate": 0.000999932725625335, - "loss": 0.0792, + "loss": 0.0791, "macro_f1": 0.32098764181137085, "num_tokens": 894578.0, "repeat_count": 0.0, - "routers_loss": 0.08969525247812271, + "routers_loss": 0.07890026271343231, "skip_count": 2.0, "step": 554, "text_loss": 0.5970752239227295 @@ -5280,13 +5280,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2158203125, + "grad_norm": 0.216796875, "learning_rate": 0.0009999275526406427, - "loss": 0.0803, + "loss": 0.0796, "macro_f1": 0.31446540355682373, "num_tokens": 897145.0, "repeat_count": 1.0, - "routers_loss": 0.09876437485218048, + "routers_loss": 0.09836960583925247, "skip_count": 1.0, "step": 556, "text_loss": 0.752425491809845 @@ -5299,13 +5299,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.1875, "learning_rate": 0.0009999221880827693, - "loss": 0.0887, + "loss": 0.0882, "macro_f1": 0.3333333432674408, "num_tokens": 900565.0, "repeat_count": 0.0, - "routers_loss": 0.019108204171061516, + "routers_loss": 0.017694659531116486, "skip_count": 0.0, "step": 558, "text_loss": 0.195619136095047 @@ -5318,32 +5318,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.2021484375, "learning_rate": 0.0009999166319537703, - "loss": 0.0573, + "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 903506.0, "repeat_count": 0.0, - "routers_loss": 0.019048813730478287, + "routers_loss": 0.019375264644622803, "skip_count": 0.0, "step": 560, "text_loss": 0.4603337347507477 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, "epoch": 2.638685060170238, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1435546875, + "f1_skip": 0.5, + "grad_norm": 0.146484375, "learning_rate": 0.0009999108842557748, - "loss": 0.0947, - "macro_f1": 0.3144654333591461, + "loss": 0.0953, + "macro_f1": 0.4871794879436493, "num_tokens": 906380.0, "repeat_count": 0.0, - "routers_loss": 0.11889495700597763, + "routers_loss": 0.12013207376003265, "skip_count": 3.0, "step": 562, "text_loss": 0.6279402375221252 @@ -5356,13 +5356,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.228515625, + "grad_norm": 0.255859375, "learning_rate": 0.0009999049449909854, - "loss": 0.0771, + "loss": 0.0799, "macro_f1": 0.3272727429866791, "num_tokens": 909116.0, "repeat_count": 0.0, - "routers_loss": 0.06202332302927971, + "routers_loss": 0.06441342830657959, "skip_count": 1.0, "step": 564, "text_loss": 0.23741699755191803 @@ -5375,13 +5375,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1513671875, + "grad_norm": 0.15234375, "learning_rate": 0.0009998988141616781, - "loss": 0.0623, + "loss": 0.064, "macro_f1": 0.32098767161369324, "num_tokens": 912189.0, "repeat_count": 0.0, - "routers_loss": 0.08294244855642319, + "routers_loss": 0.08309414982795715, "skip_count": 1.0, "step": 566, "text_loss": 0.27780941128730774 @@ -5394,13 +5394,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.1962890625, "learning_rate": 0.0009998924917702023, - "loss": 0.0885, + "loss": 0.0876, "macro_f1": 0.3272727429866791, "num_tokens": 916279.0, "repeat_count": 1.0, - "routers_loss": 0.07545182853937149, + "routers_loss": 0.07197169959545135, "skip_count": 0.0, "step": 568, "text_loss": 0.6371755599975586 @@ -5413,13 +5413,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.2255859375, "learning_rate": 0.0009998859778189806, - "loss": 0.0712, + "loss": 0.0706, "macro_f1": 0.3333333432674408, "num_tokens": 919490.0, "repeat_count": 0.0, - "routers_loss": 0.008711219765245914, + "routers_loss": 0.008022273890674114, "skip_count": 0.0, "step": 570, "text_loss": 0.6028938889503479 @@ -5432,13 +5432,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.1650390625, "learning_rate": 0.000999879272310509, - "loss": 0.0837, + "loss": 0.084, "macro_f1": 0.3333333432674408, "num_tokens": 923694.0, "repeat_count": 0.0, - "routers_loss": 0.01639273390173912, + "routers_loss": 0.01634674146771431, "skip_count": 0.0, "step": 572, "text_loss": 0.7177054286003113 @@ -5451,13 +5451,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1669921875, + "grad_norm": 0.17578125, "learning_rate": 0.0009998723752473574, - "loss": 0.0707, + "loss": 0.0716, "macro_f1": 0.3272727429866791, "num_tokens": 926933.0, "repeat_count": 0.0, - "routers_loss": 0.04997137933969498, + "routers_loss": 0.060559045523405075, "skip_count": 1.0, "step": 574, "text_loss": 0.5203254818916321 @@ -5470,13 +5470,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1845703125, + "grad_norm": 0.185546875, "learning_rate": 0.0009998652866321687, - "loss": 0.0799, + "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 929832.0, "repeat_count": 0.0, - "routers_loss": 0.011360209435224533, + "routers_loss": 0.011485611088573933, "skip_count": 0.0, "step": 576, "text_loss": 0.6147452592849731 @@ -5489,13 +5489,13 @@ "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1669921875, + "grad_norm": 0.1552734375, "learning_rate": 0.000999858006467659, - "loss": 0.0658, + "loss": 0.0649, "macro_f1": 0.29333335161209106, "num_tokens": 933266.0, "repeat_count": 2.0, - "routers_loss": 0.31349560618400574, + "routers_loss": 0.2929030954837799, "skip_count": 4.0, "step": 578, "text_loss": 0.1720666140317917 @@ -5508,13 +5508,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.24609375, "learning_rate": 0.0009998505347566186, - "loss": 0.0801, + "loss": 0.0782, "macro_f1": 0.32098764181137085, "num_tokens": 937545.0, "repeat_count": 0.0, - "routers_loss": 0.058660347014665604, + "routers_loss": 0.053780000656843185, "skip_count": 2.0, "step": 580, "text_loss": 0.3258405327796936 @@ -5527,13 +5527,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.1416015625, "learning_rate": 0.00099984287150191, - "loss": 0.0578, + "loss": 0.0582, "macro_f1": 0.3333333432674408, "num_tokens": 941001.0, "repeat_count": 0.0, - "routers_loss": 0.025836754590272903, + "routers_loss": 0.02637636847794056, "skip_count": 0.0, "step": 582, "text_loss": 0.23762771487236023 @@ -5546,13 +5546,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1552734375, "learning_rate": 0.0009998350167064705, - "loss": 0.0683, + "loss": 0.0672, "macro_f1": 0.3333333432674408, "num_tokens": 943989.0, "repeat_count": 0.0, - "routers_loss": 0.016504868865013123, + "routers_loss": 0.01637580618262291, "skip_count": 0.0, "step": 584, "text_loss": 0.7460582852363586 @@ -5565,13 +5565,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1787109375, + "grad_norm": 0.1884765625, "learning_rate": 0.0009998269703733096, - "loss": 0.0685, + "loss": 0.0686, "macro_f1": 0.3272727429866791, "num_tokens": 947245.0, "repeat_count": 1.0, - "routers_loss": 0.1379794180393219, + "routers_loss": 0.13934117555618286, "skip_count": 0.0, "step": 586, "text_loss": 0.5284690260887146 @@ -5584,13 +5584,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.13671875, "learning_rate": 0.0009998187325055106, - "loss": 0.0657, + "loss": 0.0667, "macro_f1": 0.3333333432674408, "num_tokens": 950116.0, "repeat_count": 0.0, - "routers_loss": 0.01802757754921913, + "routers_loss": 0.02138397842645645, "skip_count": 0.0, "step": 588, "text_loss": 0.3920256197452545 @@ -5603,13 +5603,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.1533203125, "learning_rate": 0.0009998103031062305, - "loss": 0.0762, + "loss": 0.0778, "macro_f1": 0.3333333432674408, "num_tokens": 953277.0, "repeat_count": 0.0, - "routers_loss": 0.006902900990098715, + "routers_loss": 0.007098200265318155, "skip_count": 0.0, "step": 590, "text_loss": 0.7472905516624451 @@ -5622,13 +5622,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3046875, + "grad_norm": 0.318359375, "learning_rate": 0.0009998016821786994, - "loss": 0.0912, + "loss": 0.0872, "macro_f1": 0.32098764181137085, "num_tokens": 958229.0, "repeat_count": 1.0, - "routers_loss": 0.08348741382360458, + "routers_loss": 0.07946522533893585, "skip_count": 1.0, "step": 592, "text_loss": 0.5506448745727539 @@ -5641,13 +5641,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1357421875, "learning_rate": 0.000999792869726221, - "loss": 0.0527, + "loss": 0.0523, "macro_f1": 0.3272727429866791, "num_tokens": 961016.0, "repeat_count": 0.0, - "routers_loss": 0.08290062099695206, + "routers_loss": 0.0850791186094284, "skip_count": 1.0, "step": 594, "text_loss": 0.3824431002140045 @@ -5660,13 +5660,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1650390625, "learning_rate": 0.0009997838657521717, - "loss": 0.0643, + "loss": 0.0632, "macro_f1": 0.3333333432674408, "num_tokens": 963847.0, "repeat_count": 0.0, - "routers_loss": 0.018620988354086876, + "routers_loss": 0.016370445489883423, "skip_count": 0.0, "step": 596, "text_loss": 0.2139475792646408 @@ -5679,13 +5679,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.12890625, "learning_rate": 0.0009997746702600026, - "loss": 0.073, + "loss": 0.0702, "macro_f1": 0.307692289352417, "num_tokens": 966619.0, "repeat_count": 0.0, - "routers_loss": 0.1211671382188797, + "routers_loss": 0.1310746818780899, "skip_count": 3.0, "step": 598, "text_loss": 0.3651018440723419 @@ -5698,13 +5698,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.23828125, "learning_rate": 0.0009997652832532372, - "loss": 0.079, + "loss": 0.0792, "macro_f1": 0.3272727429866791, "num_tokens": 970418.0, "repeat_count": 1.0, - "routers_loss": 0.15485027432441711, + "routers_loss": 0.14303378760814667, "skip_count": 0.0, "step": 600, "text_loss": 0.7094736099243164 @@ -5717,13 +5717,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009997557047354722, - "loss": 0.0562, + "loss": 0.0531, "macro_f1": 0.3272727429866791, "num_tokens": 973491.0, "repeat_count": 0.0, - "routers_loss": 0.036684274673461914, + "routers_loss": 0.03334212675690651, "skip_count": 1.0, "step": 602, "text_loss": 0.4812237024307251 @@ -5731,18 +5731,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 2.835926034634576, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.302734375, + "grad_norm": 0.2890625, "learning_rate": 0.0009997459347103783, - "loss": 0.0985, - "macro_f1": 0.3333333432674408, + "loss": 0.0956, + "macro_f1": 0.3272727429866791, "num_tokens": 976672.0, "repeat_count": 0.0, - "routers_loss": 0.026901578530669212, + "routers_loss": 0.02831871062517166, "skip_count": 0.0, "step": 604, "text_loss": 0.21737146377563477 @@ -5755,13 +5755,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12158203125, + "grad_norm": 0.1298828125, "learning_rate": 0.0009997359731816998, - "loss": 0.0632, + "loss": 0.0646, "macro_f1": 0.3333333432674408, "num_tokens": 979898.0, "repeat_count": 0.0, - "routers_loss": 0.01700405217707157, + "routers_loss": 0.017968013882637024, "skip_count": 0.0, "step": 606, "text_loss": 0.5458008050918579 @@ -5774,13 +5774,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2099609375, + "grad_norm": 0.224609375, "learning_rate": 0.0009997258201532536, - "loss": 0.0758, + "loss": 0.0751, "macro_f1": 0.3333333432674408, "num_tokens": 982811.0, "repeat_count": 0.0, - "routers_loss": 0.015013590455055237, + "routers_loss": 0.016256732866168022, "skip_count": 0.0, "step": 608, "text_loss": 0.8643257021903992 @@ -5793,13 +5793,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.2275390625, "learning_rate": 0.0009997154756289303, - "loss": 0.0576, + "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 985245.0, "repeat_count": 0.0, - "routers_loss": 0.02037946693599224, + "routers_loss": 0.021214161068201065, "skip_count": 0.0, "step": 610, "text_loss": 0.2204967886209488 @@ -5812,13 +5812,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.150390625, "learning_rate": 0.000999704939612694, - "loss": 0.0648, + "loss": 0.0636, "macro_f1": 0.3006536364555359, "num_tokens": 988539.0, "repeat_count": 3.0, - "routers_loss": 0.22834022343158722, + "routers_loss": 0.23249399662017822, "skip_count": 2.0, "step": 612, "text_loss": 0.32489025592803955 @@ -5831,13 +5831,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.095703125, "learning_rate": 0.0009996942121085824, - "loss": 0.0449, + "loss": 0.0445, "macro_f1": 0.3333333432674408, "num_tokens": 991660.0, "repeat_count": 0.0, - "routers_loss": 0.009838113561272621, + "routers_loss": 0.010706410743296146, "skip_count": 0.0, "step": 614, "text_loss": 0.4551754891872406 @@ -5850,13 +5850,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.353515625, + "grad_norm": 0.3671875, "learning_rate": 0.000999683293120706, - "loss": 0.1009, + "loss": 0.1016, "macro_f1": 0.3333333432674408, "num_tokens": 994828.0, "repeat_count": 0.0, - "routers_loss": 0.005943270865827799, + "routers_loss": 0.006676184479147196, "skip_count": 0.0, "step": 616, "text_loss": 0.6212068200111389 @@ -5869,13 +5869,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.38671875, + "grad_norm": 0.408203125, "learning_rate": 0.0009996721826532491, - "loss": 0.0941, + "loss": 0.0976, "macro_f1": 0.3076923191547394, "num_tokens": 997951.0, "repeat_count": 2.0, - "routers_loss": 0.21597740054130554, + "routers_loss": 0.2148125320672989, "skip_count": 2.0, "step": 618, "text_loss": 0.26514527201652527 @@ -5888,13 +5888,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.1904296875, "learning_rate": 0.000999660880710469, - "loss": 0.0896, + "loss": 0.0909, "macro_f1": 0.3333333432674408, "num_tokens": 1001139.0, "repeat_count": 0.0, - "routers_loss": 0.023726588115096092, + "routers_loss": 0.022332455962896347, "skip_count": 0.0, "step": 620, "text_loss": 0.26131340861320496 @@ -5907,13 +5907,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.169921875, "learning_rate": 0.0009996493872966971, "loss": 0.0732, "macro_f1": 0.3272727429866791, "num_tokens": 1003678.0, "repeat_count": 1.0, - "routers_loss": 0.08467255532741547, + "routers_loss": 0.08348730951547623, "skip_count": 0.0, "step": 622, "text_loss": 0.19151706993579865 @@ -5926,13 +5926,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.173828125, "learning_rate": 0.0009996377024163374, - "loss": 0.0816, + "loss": 0.0822, "macro_f1": 0.3333333432674408, "num_tokens": 1007082.0, "repeat_count": 0.0, - "routers_loss": 0.029468854889273643, + "routers_loss": 0.028577150776982307, "skip_count": 0.0, "step": 624, "text_loss": 0.305387407541275 @@ -5945,13 +5945,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.11279296875, "learning_rate": 0.0009996258260738676, - "loss": 0.0891, + "loss": 0.0892, "macro_f1": 0.3272727429866791, "num_tokens": 1010064.0, "repeat_count": 1.0, - "routers_loss": 0.09438466280698776, + "routers_loss": 0.08312026411294937, "skip_count": 0.0, "step": 626, "text_loss": 0.49436143040657043 @@ -5964,13 +5964,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1611328125, "learning_rate": 0.0009996137582738388, - "loss": 0.0581, + "loss": 0.0591, "macro_f1": 0.3333333432674408, "num_tokens": 1013462.0, "repeat_count": 0.0, - "routers_loss": 0.013679586350917816, + "routers_loss": 0.013337327167391777, "skip_count": 0.0, "step": 628, "text_loss": 0.6515294313430786 @@ -5983,13 +5983,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.140625, "learning_rate": 0.000999601499020875, - "loss": 0.0528, + "loss": 0.0537, "macro_f1": 0.3333333432674408, "num_tokens": 1016246.0, "repeat_count": 0.0, - "routers_loss": 0.029532987624406815, + "routers_loss": 0.029126765206456184, "skip_count": 0.0, "step": 630, "text_loss": 0.18834827840328217 @@ -6002,13 +6002,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.095703125, "learning_rate": 0.0009995890483196746, - "loss": 0.0601, + "loss": 0.0602, "macro_f1": 0.3272727429866791, "num_tokens": 1019286.0, "repeat_count": 0.0, - "routers_loss": 0.05516733601689339, + "routers_loss": 0.054844800382852554, "skip_count": 1.0, "step": 632, "text_loss": 0.6988179087638855 @@ -6021,13 +6021,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.357421875, + "grad_norm": 0.322265625, "learning_rate": 0.0009995764061750086, - "loss": 0.0785, + "loss": 0.0767, "macro_f1": 0.3333333432674408, "num_tokens": 1022207.0, "repeat_count": 0.0, - "routers_loss": 0.010254866443574429, + "routers_loss": 0.010095693171024323, "skip_count": 0.0, "step": 634, "text_loss": 0.558451771736145 @@ -6040,13 +6040,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.2890625, "learning_rate": 0.000999563572591721, - "loss": 0.0518, + "loss": 0.0521, "macro_f1": 0.32098764181137085, "num_tokens": 1025319.0, "repeat_count": 1.0, - "routers_loss": 0.07528360933065414, + "routers_loss": 0.0698433518409729, "skip_count": 1.0, "step": 636, "text_loss": 0.5961872935295105 @@ -6059,13 +6059,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1064453125, + "grad_norm": 0.11083984375, "learning_rate": 0.0009995505475747302, - "loss": 0.0844, + "loss": 0.0849, "macro_f1": 0.3272727429866791, "num_tokens": 1028362.0, "repeat_count": 0.0, - "routers_loss": 0.04301584139466286, + "routers_loss": 0.040211405605077744, "skip_count": 1.0, "step": 638, "text_loss": 0.546863317489624 @@ -6078,13 +6078,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11572265625, + "grad_norm": 0.119140625, "learning_rate": 0.0009995373311290272, - "loss": 0.0699, + "loss": 0.0709, "macro_f1": 0.3144654333591461, "num_tokens": 1032199.0, "repeat_count": 2.0, - "routers_loss": 0.14521080255508423, + "routers_loss": 0.1457643061876297, "skip_count": 1.0, "step": 640, "text_loss": 0.2137298285961151 @@ -6097,13 +6097,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1328125, + "grad_norm": 0.1279296875, "learning_rate": 0.0009995239232596764, - "loss": 0.0543, + "loss": 0.0545, "macro_f1": 0.3333333432674408, "num_tokens": 1035801.0, "repeat_count": 0.0, - "routers_loss": 0.01074797473847866, + "routers_loss": 0.011394930072128773, "skip_count": 0.0, "step": 642, "text_loss": 0.43054503202438354 @@ -6116,13 +6116,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1005859375, + "grad_norm": 0.1015625, "learning_rate": 0.0009995103239718163, - "loss": 0.0659, + "loss": 0.0665, "macro_f1": 0.3333333432674408, "num_tokens": 1039223.0, "repeat_count": 0.0, - "routers_loss": 0.009271817281842232, + "routers_loss": 0.00997432041913271, "skip_count": 0.0, "step": 644, "text_loss": 0.7749615907669067 @@ -6135,13 +6135,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1953125, + "grad_norm": 0.2275390625, "learning_rate": 0.0009994965332706573, - "loss": 0.0737, + "loss": 0.0755, "macro_f1": 0.3144654333591461, "num_tokens": 1042154.0, "repeat_count": 3.0, - "routers_loss": 0.10257050395011902, + "routers_loss": 0.10589150339365005, "skip_count": 0.0, "step": 646, "text_loss": 0.7812211513519287 @@ -6154,13 +6154,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.1943359375, "learning_rate": 0.0009994825511614846, - "loss": 0.0363, + "loss": 0.0383, "macro_f1": 0.3272727429866791, "num_tokens": 1045250.0, "repeat_count": 0.0, - "routers_loss": 0.07091924548149109, + "routers_loss": 0.0748734176158905, "skip_count": 1.0, "step": 648, "text_loss": 0.844803512096405 @@ -6173,13 +6173,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11572265625, + "grad_norm": 0.1220703125, "learning_rate": 0.0009994683776496562, - "loss": 0.0421, + "loss": 0.0433, "macro_f1": 0.3272727429866791, "num_tokens": 1048446.0, "repeat_count": 0.0, - "routers_loss": 0.034446243196725845, + "routers_loss": 0.03742415830492973, "skip_count": 1.0, "step": 650, "text_loss": 0.2098839282989502 @@ -6192,13 +6192,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.12890625, "learning_rate": 0.0009994540127406034, - "loss": 0.0593, + "loss": 0.0591, "macro_f1": 0.32098764181137085, "num_tokens": 1051840.0, "repeat_count": 0.0, - "routers_loss": 0.06077485531568527, + "routers_loss": 0.06025516986846924, "skip_count": 2.0, "step": 652, "text_loss": 0.27727583050727844 @@ -6211,13 +6211,13 @@ "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.2294921875, + "grad_norm": 0.181640625, "learning_rate": 0.0009994394564398306, - "loss": 0.0537, + "loss": 0.0519, "macro_f1": 0.521541953086853, "num_tokens": 1055142.0, "repeat_count": 4.0, - "routers_loss": 0.2382282167673111, + "routers_loss": 0.22807340323925018, "skip_count": 2.0, "step": 654, "text_loss": 0.9672397971153259 @@ -6230,13 +6230,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.142578125, + "grad_norm": 0.130859375, "learning_rate": 0.0009994247087529158, - "loss": 0.0613, + "loss": 0.0618, "macro_f1": 0.3333333432674408, "num_tokens": 1057698.0, "repeat_count": 0.0, - "routers_loss": 0.011971636675298214, + "routers_loss": 0.01348950993269682, "skip_count": 0.0, "step": 656, "text_loss": 0.6375506520271301 @@ -6249,13 +6249,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.212890625, + "grad_norm": 0.1953125, "learning_rate": 0.0009994097696855106, - "loss": 0.0414, + "loss": 0.0412, "macro_f1": 0.3333333432674408, "num_tokens": 1060624.0, "repeat_count": 0.0, - "routers_loss": 0.010221127420663834, + "routers_loss": 0.009649243205785751, "skip_count": 0.0, "step": 658, "text_loss": 0.5315385460853577 @@ -6268,13 +6268,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2265625, + "grad_norm": 0.2041015625, "learning_rate": 0.0009993946392433395, - "loss": 0.061, + "loss": 0.0609, "macro_f1": 0.307692289352417, "num_tokens": 1065076.0, "repeat_count": 0.0, - "routers_loss": 0.11860335618257523, + "routers_loss": 0.1250980943441391, "skip_count": 3.0, "step": 660, "text_loss": 0.25780341029167175 @@ -6287,13 +6287,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.1640625, "learning_rate": 0.0009993793174322006, - "loss": 0.0485, + "loss": 0.0471, "macro_f1": 0.3333333432674408, "num_tokens": 1068365.0, "repeat_count": 0.0, - "routers_loss": 0.011139829643070698, + "routers_loss": 0.011544390581548214, "skip_count": 0.0, "step": 662, "text_loss": 0.34876301884651184 @@ -6306,13 +6306,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.166015625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009993638042579654, - "loss": 0.0478, + "loss": 0.0473, "macro_f1": 0.3272727429866791, "num_tokens": 1071693.0, "repeat_count": 0.0, - "routers_loss": 0.03978770971298218, + "routers_loss": 0.03777370601892471, "skip_count": 1.0, "step": 664, "text_loss": 0.21811571717262268 @@ -6327,11 +6327,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.203125, "learning_rate": 0.0009993480997265783, - "loss": 0.0481, + "loss": 0.0475, "macro_f1": 0.5492662787437439, "num_tokens": 1074733.0, "repeat_count": 0.0, - "routers_loss": 0.051231011748313904, + "routers_loss": 0.049949806183576584, "skip_count": 2.0, "step": 666, "text_loss": 0.38410288095474243 @@ -6344,13 +6344,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.10302734375, "learning_rate": 0.0009993322038440572, - "loss": 0.0615, + "loss": 0.0605, "macro_f1": 0.3333333432674408, "num_tokens": 1077993.0, "repeat_count": 0.0, - "routers_loss": 0.024917088449001312, + "routers_loss": 0.0247171800583601, "skip_count": 0.0, "step": 668, "text_loss": 0.25576895475387573 @@ -6363,13 +6363,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1982421875, + "grad_norm": 0.216796875, "learning_rate": 0.000999316116616494, - "loss": 0.0627, + "loss": 0.0619, "macro_f1": 0.3333333432674408, "num_tokens": 1080491.0, "repeat_count": 0.0, - "routers_loss": 0.008834881708025932, + "routers_loss": 0.008118715137243271, "skip_count": 0.0, "step": 670, "text_loss": 0.6269792914390564 @@ -6382,13 +6382,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.173828125, "learning_rate": 0.0009992998380500527, "loss": 0.0462, "macro_f1": 0.3272727429866791, "num_tokens": 1083817.0, "repeat_count": 0.0, - "routers_loss": 0.033405229449272156, + "routers_loss": 0.03366057574748993, "skip_count": 1.0, "step": 672, "text_loss": 0.26891493797302246 @@ -6401,13 +6401,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.13671875, + "grad_norm": 0.1474609375, "learning_rate": 0.0009992833681509716, - "loss": 0.0523, + "loss": 0.0529, "macro_f1": 0.3333333432674408, "num_tokens": 1087368.0, "repeat_count": 0.0, - "routers_loss": 0.020753704011440277, + "routers_loss": 0.020552074536681175, "skip_count": 0.0, "step": 674, "text_loss": 0.14421936869621277 @@ -6420,13 +6420,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.18359375, "learning_rate": 0.0009992667069255619, - "loss": 0.0698, + "loss": 0.0696, "macro_f1": 0.31446540355682373, "num_tokens": 1090452.0, "repeat_count": 0.0, - "routers_loss": 0.06932353973388672, + "routers_loss": 0.06937336176633835, "skip_count": 2.0, "step": 676, "text_loss": 0.24999259412288666 @@ -6439,13 +6439,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.08740234375, "learning_rate": 0.0009992498543802085, - "loss": 0.059, + "loss": 0.0588, "macro_f1": 0.3272727429866791, "num_tokens": 1093996.0, "repeat_count": 1.0, - "routers_loss": 0.032903749495744705, + "routers_loss": 0.0380021296441555, "skip_count": 0.0, "step": 678, "text_loss": 0.42473849654197693 @@ -6458,32 +6458,32 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.2099609375, + "grad_norm": 0.2119140625, "learning_rate": 0.0009992328105213688, - "loss": 0.0417, + "loss": 0.0411, "macro_f1": 0.4400000274181366, "num_tokens": 1096837.0, "repeat_count": 1.0, - "routers_loss": 0.19733747839927673, + "routers_loss": 0.20885063707828522, "skip_count": 4.0, "step": 680, "text_loss": 0.3829527199268341 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 26.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 3.2019371881420606, - "f1_execute": 1.0, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.154296875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1474609375, "learning_rate": 0.0009992155753555747, - "loss": 0.0729, - "macro_f1": 0.6666666865348816, + "loss": 0.0722, + "macro_f1": 0.5492662787437439, "num_tokens": 1100320.0, "repeat_count": 0.0, - "routers_loss": 0.013452666811645031, + "routers_loss": 0.018230699002742767, "skip_count": 2.0, "step": 682, "text_loss": 0.6190969944000244 @@ -6496,13 +6496,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.30859375, "learning_rate": 0.0009991981488894303, "loss": 0.0681, "macro_f1": 0.32098767161369324, "num_tokens": 1103682.0, "repeat_count": 0.0, - "routers_loss": 0.05302857980132103, + "routers_loss": 0.05550144240260124, "skip_count": 1.0, "step": 684, "text_loss": 0.44418027997016907 @@ -6515,13 +6515,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.2158203125, "learning_rate": 0.0009991805311296133, - "loss": 0.0527, + "loss": 0.0507, "macro_f1": 0.32098764181137085, "num_tokens": 1106427.0, "repeat_count": 0.0, - "routers_loss": 0.08124994486570358, + "routers_loss": 0.07990608364343643, "skip_count": 2.0, "step": 686, "text_loss": 0.5577231645584106 @@ -6534,13 +6534,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.22265625, + "grad_norm": 0.1962890625, "learning_rate": 0.0009991627220828753, - "loss": 0.0579, + "loss": 0.0568, "macro_f1": 0.32098764181137085, "num_tokens": 1109314.0, "repeat_count": 0.0, - "routers_loss": 0.058633625507354736, + "routers_loss": 0.05167485028505325, "skip_count": 2.0, "step": 688, "text_loss": 0.27325430512428284 @@ -6553,13 +6553,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1142578125, + "grad_norm": 0.10693359375, "learning_rate": 0.0009991447217560408, - "loss": 0.0533, + "loss": 0.0521, "macro_f1": 0.5492662787437439, "num_tokens": 1112748.0, "repeat_count": 0.0, - "routers_loss": 0.04703643172979355, + "routers_loss": 0.04621964320540428, "skip_count": 2.0, "step": 690, "text_loss": 0.5288321375846863 @@ -6572,13 +6572,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.1962890625, "learning_rate": 0.000999126530156007, - "loss": 0.0485, + "loss": 0.0499, "macro_f1": 0.307692289352417, "num_tokens": 1116965.0, "repeat_count": 1.0, - "routers_loss": 0.11615128815174103, + "routers_loss": 0.11950276792049408, "skip_count": 2.0, "step": 692, "text_loss": 0.14215624332427979 @@ -6591,13 +6591,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2314453125, + "grad_norm": 0.2353515625, "learning_rate": 0.0009991081472897454, - "loss": 0.0718, + "loss": 0.0722, "macro_f1": 0.3333333432674408, "num_tokens": 1120570.0, "repeat_count": 0.0, - "routers_loss": 0.017403846606612206, + "routers_loss": 0.01905500330030918, "skip_count": 0.0, "step": 694, "text_loss": 0.41862696409225464 @@ -6610,13 +6610,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1357421875, "learning_rate": 0.0009990895731643002, - "loss": 0.0444, + "loss": 0.0464, "macro_f1": 0.3272727429866791, "num_tokens": 1124009.0, "repeat_count": 1.0, - "routers_loss": 0.07067303359508514, + "routers_loss": 0.06974572688341141, "skip_count": 0.0, "step": 696, "text_loss": 0.41160130500793457 @@ -6629,13 +6629,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1923828125, "learning_rate": 0.000999070807786789, - "loss": 0.0527, + "loss": 0.0531, "macro_f1": 0.3272727429866791, "num_tokens": 1127370.0, "repeat_count": 1.0, - "routers_loss": 0.07131028175354004, + "routers_loss": 0.07055293023586273, "skip_count": 0.0, "step": 698, "text_loss": 0.48068273067474365 @@ -6648,13 +6648,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.18359375, + "grad_norm": 0.197265625, "learning_rate": 0.000999051851164403, - "loss": 0.0629, + "loss": 0.0619, "macro_f1": 0.32098764181137085, "num_tokens": 1130234.0, "repeat_count": 1.0, - "routers_loss": 0.1152748316526413, + "routers_loss": 0.12506946921348572, "skip_count": 1.0, "step": 700, "text_loss": 0.47925490140914917 @@ -6667,13 +6667,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.216796875, + "grad_norm": 0.1943359375, "learning_rate": 0.000999032703304406, - "loss": 0.0663, + "loss": 0.0674, "macro_f1": 0.3333333432674408, "num_tokens": 1132874.0, "repeat_count": 0.0, - "routers_loss": 0.0077212234027683735, + "routers_loss": 0.00809287466108799, "skip_count": 0.0, "step": 702, "text_loss": 0.47433632612228394 @@ -6686,13 +6686,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.099609375, + "grad_norm": 0.1064453125, "learning_rate": 0.0009990133642141358, - "loss": 0.0494, + "loss": 0.0497, "macro_f1": 0.5492662787437439, "num_tokens": 1136011.0, "repeat_count": 0.0, - "routers_loss": 0.02726336568593979, + "routers_loss": 0.0319170281291008, "skip_count": 2.0, "step": 704, "text_loss": 0.6574832201004028 @@ -6705,13 +6705,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.32421875, + "grad_norm": 0.33984375, "learning_rate": 0.000998993833901003, - "loss": 0.0615, + "loss": 0.0619, "macro_f1": 0.32098764181137085, "num_tokens": 1139674.0, "repeat_count": 0.0, - "routers_loss": 0.0958542674779892, + "routers_loss": 0.09850362688302994, "skip_count": 2.0, "step": 706, "text_loss": 0.7660127282142639 @@ -6724,13 +6724,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.12158203125, "learning_rate": 0.0009989741123724919, - "loss": 0.0583, + "loss": 0.0574, "macro_f1": 0.3333333432674408, "num_tokens": 1143558.0, "repeat_count": 0.0, - "routers_loss": 0.007100600749254227, + "routers_loss": 0.006673311349004507, "skip_count": 0.0, "step": 708, "text_loss": 0.5976111888885498 @@ -6743,13 +6743,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.154296875, "learning_rate": 0.0009989541996361594, - "loss": 0.0445, + "loss": 0.045, "macro_f1": 0.3333333432674408, "num_tokens": 1146122.0, "repeat_count": 0.0, - "routers_loss": 0.0047812811098992825, + "routers_loss": 0.004988791421055794, "skip_count": 0.0, "step": 710, "text_loss": 0.5256119966506958 @@ -6762,13 +6762,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1015625, + "grad_norm": 0.1044921875, "learning_rate": 0.0009989340956996367, - "loss": 0.052, + "loss": 0.0528, "macro_f1": 0.3333333432674408, "num_tokens": 1149546.0, "repeat_count": 0.0, - "routers_loss": 0.006643407512456179, + "routers_loss": 0.0067769973538815975, "skip_count": 0.0, "step": 712, "text_loss": 0.5040497779846191 @@ -6781,13 +6781,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2890625, + "grad_norm": 0.26953125, "learning_rate": 0.0009989138005706273, - "loss": 0.0719, + "loss": 0.0735, "macro_f1": 0.32098764181137085, "num_tokens": 1153195.0, "repeat_count": 0.0, - "routers_loss": 0.0910436138510704, + "routers_loss": 0.09899546951055527, "skip_count": 2.0, "step": 714, "text_loss": 0.20803412795066833 @@ -6800,13 +6800,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1484375, + "grad_norm": 0.1396484375, "learning_rate": 0.000998893314256908, - "loss": 0.0649, + "loss": 0.064, "macro_f1": 0.3333333432674408, "num_tokens": 1157081.0, "repeat_count": 0.0, - "routers_loss": 0.010978946462273598, + "routers_loss": 0.010492355562746525, "skip_count": 0.0, "step": 716, "text_loss": 0.23077639937400818 @@ -6819,13 +6819,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.123046875, + "grad_norm": 0.1298828125, "learning_rate": 0.0009988726367663298, - "loss": 0.0543, + "loss": 0.0539, "macro_f1": 0.3333333432674408, "num_tokens": 1160079.0, "repeat_count": 0.0, - "routers_loss": 0.009956461377441883, + "routers_loss": 0.01063773687928915, "skip_count": 0.0, "step": 718, "text_loss": 0.6085864901542664 @@ -6838,13 +6838,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1640625, "learning_rate": 0.0009988517681068163, - "loss": 0.0412, + "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1163249.0, "repeat_count": 1.0, - "routers_loss": 0.057210199534893036, + "routers_loss": 0.05981874838471413, "skip_count": 0.0, "step": 720, "text_loss": 0.4047050476074219 @@ -6857,32 +6857,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.171875, "learning_rate": 0.0009988307082863638, - "loss": 0.0364, + "loss": 0.0361, "macro_f1": 0.3333333432674408, "num_tokens": 1166259.0, "repeat_count": 0.0, - "routers_loss": 0.01035996899008751, + "routers_loss": 0.009750043973326683, "skip_count": 0.0, "step": 722, "text_loss": 0.5306474566459656 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 3.3991781626063986, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.2412109375, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.240234375, "learning_rate": 0.0009988094573130434, - "loss": 0.0661, - "macro_f1": 0.3076923191547394, + "loss": 0.063, + "macro_f1": 0.5359477400779724, "num_tokens": 1168887.0, "repeat_count": 2.0, - "routers_loss": 0.18087820708751678, + "routers_loss": 0.18601104617118835, "skip_count": 2.0, "step": 724, "text_loss": 0.53528892993927 @@ -6895,32 +6895,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.142578125, "learning_rate": 0.0009987880151949974, - "loss": 0.0505, + "loss": 0.0496, "macro_f1": 0.3272727429866791, "num_tokens": 1172625.0, "repeat_count": 0.0, - "routers_loss": 0.04720238968729973, + "routers_loss": 0.02845010720193386, "skip_count": 1.0, "step": 726, "text_loss": 0.4760453701019287 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 26.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 3.417963017317288, - "f1_execute": 1.0, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.2216796875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, "learning_rate": 0.0009987663819404434, - "loss": 0.0603, - "macro_f1": 0.6666666865348816, + "loss": 0.06, + "macro_f1": 0.5492662787437439, "num_tokens": 1176580.0, "repeat_count": 0.0, - "routers_loss": 0.015407778322696686, + "routers_loss": 0.017596980556845665, "skip_count": 2.0, "step": 728, "text_loss": 0.5146099328994751 @@ -6933,13 +6933,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.1318359375, "learning_rate": 0.000998744557557671, - "loss": 0.0489, + "loss": 0.0484, "macro_f1": 0.3272727429866791, "num_tokens": 1179804.0, "repeat_count": 0.0, - "routers_loss": 0.060891781002283096, + "routers_loss": 0.0625474750995636, "skip_count": 1.0, "step": 730, "text_loss": 0.27738022804260254 @@ -6947,18 +6947,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 3.436747872028177, - "f1_execute": 0.943396270275116, + "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.203125, "learning_rate": 0.0009987225420550433, - "loss": 0.0825, - "macro_f1": 0.3144654333591461, + "loss": 0.0796, + "macro_f1": 0.307692289352417, "num_tokens": 1182658.0, "repeat_count": 1.0, - "routers_loss": 0.1661442220211029, + "routers_loss": 0.16188351809978485, "skip_count": 2.0, "step": 732, "text_loss": 0.23231445252895355 @@ -6966,18 +6966,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 3.446140299383622, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.2001953125, "learning_rate": 0.0009987003354409965, - "loss": 0.0634, - "macro_f1": 0.3333333432674408, + "loss": 0.0626, + "macro_f1": 0.3272727429866791, "num_tokens": 1185451.0, "repeat_count": 0.0, - "routers_loss": 0.02108248695731163, + "routers_loss": 0.02391529455780983, "skip_count": 0.0, "step": 734, "text_loss": 0.4496627151966095 @@ -6990,13 +6990,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.248046875, + "grad_norm": 0.234375, "learning_rate": 0.0009986779377240405, - "loss": 0.0534, + "loss": 0.0513, "macro_f1": 0.32098767161369324, "num_tokens": 1188666.0, "repeat_count": 0.0, - "routers_loss": 0.08318125456571579, + "routers_loss": 0.08435963839292526, "skip_count": 1.0, "step": 736, "text_loss": 0.4950787127017975 @@ -7009,13 +7009,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11962890625, + "grad_norm": 0.1220703125, "learning_rate": 0.000998655348912758, - "loss": 0.0514, + "loss": 0.0515, "macro_f1": 0.3333333432674408, "num_tokens": 1193035.0, "repeat_count": 0.0, - "routers_loss": 0.015889234840869904, + "routers_loss": 0.01648722216486931, "skip_count": 0.0, "step": 738, "text_loss": 0.24761848151683807 @@ -7028,13 +7028,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1513671875, "learning_rate": 0.0009986325690158051, "loss": 0.0435, "macro_f1": 0.3333333432674408, "num_tokens": 1196840.0, "repeat_count": 0.0, - "routers_loss": 0.01378484908491373, + "routers_loss": 0.013143910095095634, "skip_count": 0.0, "step": 740, "text_loss": 0.15662719309329987 @@ -7047,13 +7047,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1787109375, + "grad_norm": 0.1611328125, "learning_rate": 0.0009986095980419113, - "loss": 0.076, + "loss": 0.0757, "macro_f1": 0.3333333432674408, "num_tokens": 1200573.0, "repeat_count": 0.0, - "routers_loss": 0.02673683874309063, + "routers_loss": 0.026706280186772346, "skip_count": 0.0, "step": 742, "text_loss": 0.16725164651870728 @@ -7066,13 +7066,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.185546875, + "grad_norm": 0.1982421875, "learning_rate": 0.0009985864359998787, - "loss": 0.0778, + "loss": 0.0795, "macro_f1": 0.3006536364555359, "num_tokens": 1203589.0, "repeat_count": 2.0, - "routers_loss": 0.27776041626930237, + "routers_loss": 0.28607678413391113, "skip_count": 3.0, "step": 744, "text_loss": 0.6350882053375244 @@ -7085,13 +7085,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1513671875, + "grad_norm": 0.1474609375, "learning_rate": 0.0009985630828985835, - "loss": 0.0575, + "loss": 0.0572, "macro_f1": 0.3272727429866791, "num_tokens": 1206422.0, "repeat_count": 0.0, - "routers_loss": 0.0575483962893486, + "routers_loss": 0.05685260891914368, "skip_count": 1.0, "step": 746, "text_loss": 0.33779552578926086 @@ -7104,13 +7104,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1005859375, + "grad_norm": 0.09814453125, "learning_rate": 0.0009985395387469742, - "loss": 0.0478, + "loss": 0.0458, "macro_f1": 0.5492662787437439, "num_tokens": 1211588.0, "repeat_count": 0.0, - "routers_loss": 0.0458797849714756, + "routers_loss": 0.0437830351293087, "skip_count": 2.0, "step": 748, "text_loss": 0.28664472699165344 @@ -7123,13 +7123,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.15625, "learning_rate": 0.0009985158035540735, - "loss": 0.0701, + "loss": 0.0714, "macro_f1": 0.32098764181137085, "num_tokens": 1214580.0, "repeat_count": 2.0, - "routers_loss": 0.07850238680839539, + "routers_loss": 0.07074898481369019, "skip_count": 0.0, "step": 750, "text_loss": 0.3939313292503357 @@ -7142,13 +7142,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.21484375, "learning_rate": 0.0009984918773289762, - "loss": 0.0702, + "loss": 0.0699, "macro_f1": 0.3333333432674408, "num_tokens": 1217388.0, "repeat_count": 0.0, - "routers_loss": 0.009507967159152031, + "routers_loss": 0.009757856838405132, "skip_count": 0.0, "step": 752, "text_loss": 0.37641215324401855 @@ -7161,13 +7161,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1484375, + "grad_norm": 0.140625, "learning_rate": 0.0009984677600808512, - "loss": 0.0543, + "loss": 0.054, "macro_f1": 0.3333333432674408, "num_tokens": 1219960.0, "repeat_count": 0.0, - "routers_loss": 0.02620997279882431, + "routers_loss": 0.02515069581568241, "skip_count": 0.0, "step": 754, "text_loss": 0.155938982963562 @@ -7180,13 +7180,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3359375, + "grad_norm": 0.30078125, "learning_rate": 0.0009984434518189405, - "loss": 0.0791, + "loss": 0.0764, "macro_f1": 0.3333333432674408, "num_tokens": 1223234.0, "repeat_count": 0.0, - "routers_loss": 0.02798631228506565, + "routers_loss": 0.025766927748918533, "skip_count": 0.0, "step": 756, "text_loss": 0.691118061542511 @@ -7201,11 +7201,11 @@ "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.0009984189525525584, - "loss": 0.046, + "loss": 0.0451, "macro_f1": 0.5359477400779724, "num_tokens": 1225764.0, "repeat_count": 2.0, - "routers_loss": 0.16614431142807007, + "routers_loss": 0.1782722771167755, "skip_count": 2.0, "step": 758, "text_loss": 0.3592209219932556 @@ -7218,13 +7218,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.193359375, + "grad_norm": 0.189453125, "learning_rate": 0.0009983942622910935, - "loss": 0.0669, + "loss": 0.0659, "macro_f1": 0.3333333432674408, "num_tokens": 1230097.0, "repeat_count": 0.0, - "routers_loss": 0.008541896007955074, + "routers_loss": 0.00825568474829197, "skip_count": 0.0, "step": 760, "text_loss": 0.4646475315093994 @@ -7237,13 +7237,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.1962890625, "learning_rate": 0.0009983693810440074, - "loss": 0.0478, + "loss": 0.0477, "macro_f1": 0.32098764181137085, "num_tokens": 1233140.0, "repeat_count": 0.0, - "routers_loss": 0.045411624014377594, + "routers_loss": 0.04156976938247681, "skip_count": 2.0, "step": 762, "text_loss": 0.298682302236557 @@ -7256,13 +7256,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.380859375, + "grad_norm": 0.3515625, "learning_rate": 0.000998344308820834, - "loss": 0.0689, + "loss": 0.0666, "macro_f1": 0.3272727429866791, "num_tokens": 1236305.0, "repeat_count": 0.0, - "routers_loss": 0.052299100905656815, + "routers_loss": 0.05697929114103317, "skip_count": 1.0, "step": 764, "text_loss": 0.5249121189117432 @@ -7275,13 +7275,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.18359375, "learning_rate": 0.0009983190456311817, - "loss": 0.0602, + "loss": 0.0592, "macro_f1": 0.3144654333591461, "num_tokens": 1239673.0, "repeat_count": 0.0, - "routers_loss": 0.09140212833881378, + "routers_loss": 0.09547408670186996, "skip_count": 3.0, "step": 766, "text_loss": 0.41277334094047546 @@ -7294,13 +7294,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.201171875, + "grad_norm": 0.185546875, "learning_rate": 0.000998293591484731, - "loss": 0.0475, + "loss": 0.0484, "macro_f1": 0.5492662787437439, "num_tokens": 1242292.0, "repeat_count": 0.0, - "routers_loss": 0.030750583857297897, + "routers_loss": 0.030693158507347107, "skip_count": 2.0, "step": 768, "text_loss": 0.1583656519651413 @@ -7313,13 +7313,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16796875, + "grad_norm": 0.15234375, "learning_rate": 0.000998267946391236, - "loss": 0.052, + "loss": 0.051, "macro_f1": 0.3333333432674408, "num_tokens": 1244661.0, "repeat_count": 0.0, - "routers_loss": 0.010202950797975063, + "routers_loss": 0.01211300864815712, "skip_count": 0.0, "step": 770, "text_loss": 0.4629349112510681 @@ -7332,13 +7332,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.0927734375, "learning_rate": 0.0009982421103605238, - "loss": 0.0434, + "loss": 0.0441, "macro_f1": 0.32098764181137085, "num_tokens": 1248688.0, "repeat_count": 0.0, - "routers_loss": 0.07364192605018616, + "routers_loss": 0.0665968507528305, "skip_count": 2.0, "step": 772, "text_loss": 0.4019293785095215 @@ -7353,11 +7353,11 @@ "f1_skip": 0.0, "grad_norm": 0.2890625, "learning_rate": 0.000998216083402495, - "loss": 0.0606, + "loss": 0.0613, "macro_f1": 0.32098764181137085, "num_tokens": 1251395.0, "repeat_count": 0.0, - "routers_loss": 0.06553081423044205, + "routers_loss": 0.07186859846115112, "skip_count": 2.0, "step": 774, "text_loss": 0.4659276604652405 @@ -7370,13 +7370,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.302734375, "learning_rate": 0.0009981898655271235, - "loss": 0.0475, + "loss": 0.0488, "macro_f1": 0.3333333432674408, "num_tokens": 1254888.0, "repeat_count": 0.0, - "routers_loss": 0.008751659654080868, + "routers_loss": 0.007823926396667957, "skip_count": 0.0, "step": 776, "text_loss": 0.5160359740257263 @@ -7389,13 +7389,13 @@ "f1_execute": 0.9130434989929199, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.12060546875, + "grad_norm": 0.11962890625, "learning_rate": 0.0009981634567444557, - "loss": 0.0777, + "loss": 0.0775, "macro_f1": 0.590062141418457, "num_tokens": 1258250.0, "repeat_count": 3.0, - "routers_loss": 0.24522721767425537, + "routers_loss": 0.24624499678611755, "skip_count": 4.0, "step": 778, "text_loss": 0.29319918155670166 @@ -7408,13 +7408,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.263671875, "learning_rate": 0.0009981368570646115, "loss": 0.0885, "macro_f1": 0.3272727429866791, "num_tokens": 1260916.0, "repeat_count": 0.0, - "routers_loss": 0.03767623379826546, + "routers_loss": 0.030730176717042923, "skip_count": 1.0, "step": 780, "text_loss": 0.624981164932251 @@ -7427,13 +7427,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.142578125, "learning_rate": 0.0009981100664977838, - "loss": 0.0708, + "loss": 0.0699, "macro_f1": 0.3333333432674408, "num_tokens": 1264004.0, "repeat_count": 0.0, - "routers_loss": 0.006098059006035328, + "routers_loss": 0.006829176563769579, "skip_count": 0.0, "step": 782, "text_loss": 0.6137266159057617 @@ -7446,13 +7446,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1748046875, "learning_rate": 0.0009980830850542391, - "loss": 0.0589, + "loss": 0.058, "macro_f1": 0.3333333432674408, "num_tokens": 1267130.0, "repeat_count": 0.0, - "routers_loss": 0.01731623336672783, + "routers_loss": 0.018471000716090202, "skip_count": 0.0, "step": 784, "text_loss": 0.15213175117969513 @@ -7465,13 +7465,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2294921875, + "grad_norm": 0.2353515625, "learning_rate": 0.0009980559127443166, - "loss": 0.0526, + "loss": 0.052, "macro_f1": 0.3333333432674408, "num_tokens": 1271129.0, "repeat_count": 0.0, - "routers_loss": 0.0076471962966024876, + "routers_loss": 0.007903140969574451, "skip_count": 0.0, "step": 786, "text_loss": 0.5768613219261169 @@ -7484,13 +7484,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12353515625, + "grad_norm": 0.130859375, "learning_rate": 0.000998028549578429, - "loss": 0.0745, + "loss": 0.0719, "macro_f1": 0.307692289352417, "num_tokens": 1274232.0, "repeat_count": 0.0, - "routers_loss": 0.0637628585100174, + "routers_loss": 0.06737866252660751, "skip_count": 3.0, "step": 788, "text_loss": 0.2877073585987091 @@ -7503,13 +7503,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1748046875, "learning_rate": 0.0009980009955670615, - "loss": 0.0699, + "loss": 0.0698, "macro_f1": 0.3144654333591461, "num_tokens": 1277193.0, "repeat_count": 0.0, - "routers_loss": 0.10882514715194702, + "routers_loss": 0.10194934904575348, "skip_count": 3.0, "step": 790, "text_loss": 0.11860492825508118 @@ -7522,13 +7522,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1298828125, + "grad_norm": 0.126953125, "learning_rate": 0.000997973250720773, - "loss": 0.056, + "loss": 0.0552, "macro_f1": 0.32098764181137085, "num_tokens": 1280960.0, "repeat_count": 0.0, - "routers_loss": 0.10924118757247925, + "routers_loss": 0.10297708213329315, "skip_count": 2.0, "step": 792, "text_loss": 0.13477706909179688 @@ -7541,13 +7541,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1533203125, + "grad_norm": 0.1611328125, "learning_rate": 0.0009979453150501954, - "loss": 0.0664, + "loss": 0.0663, "macro_f1": 0.32098764181137085, "num_tokens": 1284611.0, "repeat_count": 1.0, - "routers_loss": 0.06571807712316513, + "routers_loss": 0.06122037023305893, "skip_count": 1.0, "step": 794, "text_loss": 0.40569379925727844 @@ -7560,13 +7560,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1181640625, + "grad_norm": 0.1279296875, "learning_rate": 0.000997917188566034, - "loss": 0.0616, + "loss": 0.062, "macro_f1": 0.32098764181137085, "num_tokens": 1287834.0, "repeat_count": 0.0, - "routers_loss": 0.058966971933841705, + "routers_loss": 0.061135001480579376, "skip_count": 2.0, "step": 796, "text_loss": 0.2829287648200989 @@ -7579,32 +7579,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.109375, "learning_rate": 0.0009978888712790664, - "loss": 0.067, + "loss": 0.0654, "macro_f1": 0.3272727429866791, "num_tokens": 1291666.0, "repeat_count": 0.0, - "routers_loss": 0.04844636470079422, + "routers_loss": 0.04841872677206993, "skip_count": 1.0, "step": 798, "text_loss": 1.011757254600525 }, { "acc_repeat": 0.0, - "acc_skip": 0.4000000059604645, - "avg_layers": 26.0, + "acc_skip": 0.20000000298023224, + "avg_layers": 27.0, "epoch": 3.756090402113296, - "f1_execute": 0.9166666865348816, + "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, - "f1_skip": 0.5714285969734192, - "grad_norm": 0.1416015625, + "f1_skip": 0.3333333134651184, + "grad_norm": 0.14453125, "learning_rate": 0.0009978603632001444, - "loss": 0.0634, - "macro_f1": 0.4960317611694336, + "loss": 0.0636, + "macro_f1": 0.4104308485984802, "num_tokens": 1294627.0, "repeat_count": 1.0, - "routers_loss": 0.1591777801513672, + "routers_loss": 0.15698759257793427, "skip_count": 5.0, "step": 800, "text_loss": 0.4457623362541199 @@ -7617,13 +7617,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.283203125, "learning_rate": 0.0009978316643401916, - "loss": 0.0694, + "loss": 0.0688, "macro_f1": 0.3333333432674408, "num_tokens": 1297711.0, "repeat_count": 0.0, - "routers_loss": 0.017735568806529045, + "routers_loss": 0.018952010199427605, "skip_count": 0.0, "step": 802, "text_loss": 0.2069481462240219 @@ -7636,13 +7636,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.14453125, "learning_rate": 0.0009978027747102062, - "loss": 0.0477, + "loss": 0.0479, "macro_f1": 0.3333333432674408, "num_tokens": 1300569.0, "repeat_count": 0.0, - "routers_loss": 0.012401525862514973, + "routers_loss": 0.014538386836647987, "skip_count": 0.0, "step": 804, "text_loss": 0.4983852505683899 @@ -7655,13 +7655,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2080078125, + "grad_norm": 0.2109375, "learning_rate": 0.0009977736943212584, - "loss": 0.0735, + "loss": 0.0721, "macro_f1": 0.32098764181137085, "num_tokens": 1303969.0, "repeat_count": 0.0, - "routers_loss": 0.10736164450645447, + "routers_loss": 0.11164087057113647, "skip_count": 2.0, "step": 806, "text_loss": 0.2910642921924591 @@ -7674,13 +7674,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2001953125, + "grad_norm": 0.1826171875, "learning_rate": 0.000997744423184492, - "loss": 0.0428, + "loss": 0.0424, "macro_f1": 0.3272727429866791, "num_tokens": 1307263.0, "repeat_count": 0.0, - "routers_loss": 0.0595436617732048, + "routers_loss": 0.06073406711220741, "skip_count": 1.0, "step": 808, "text_loss": 0.18831779062747955 @@ -7693,13 +7693,13 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.240234375, + "grad_norm": 0.26171875, "learning_rate": 0.0009977149613111236, - "loss": 0.0494, + "loss": 0.0486, "macro_f1": 0.4400000274181366, "num_tokens": 1309953.0, "repeat_count": 1.0, - "routers_loss": 0.12617000937461853, + "routers_loss": 0.11035524308681488, "skip_count": 4.0, "step": 810, "text_loss": 0.7872759699821472 @@ -7712,13 +7712,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1669921875, + "grad_norm": 0.1650390625, "learning_rate": 0.0009976853087124433, - "loss": 0.0537, + "loss": 0.0536, "macro_f1": 0.3333333432674408, "num_tokens": 1313243.0, "repeat_count": 0.0, - "routers_loss": 0.021242506802082062, + "routers_loss": 0.021804286167025566, "skip_count": 0.0, "step": 812, "text_loss": 0.22349292039871216 @@ -7731,13 +7731,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.318359375, + "grad_norm": 0.28125, "learning_rate": 0.0009976554653998138, - "loss": 0.0617, + "loss": 0.0612, "macro_f1": 0.31446540355682373, "num_tokens": 1316165.0, "repeat_count": 0.0, - "routers_loss": 0.10387415438890457, + "routers_loss": 0.10715524107217789, "skip_count": 2.0, "step": 814, "text_loss": 0.18035532534122467 @@ -7750,13 +7750,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.1279296875, "learning_rate": 0.000997625431384671, - "loss": 0.0565, + "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1319206.0, "repeat_count": 0.0, - "routers_loss": 0.007816939614713192, + "routers_loss": 0.007173649035394192, "skip_count": 0.0, "step": 816, "text_loss": 0.48928648233413696 @@ -7769,13 +7769,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.1357421875, "learning_rate": 0.0009975952066785243, - "loss": 0.0654, + "loss": 0.0655, "macro_f1": 0.3006536364555359, "num_tokens": 1322549.0, "repeat_count": 1.0, - "routers_loss": 0.22526368498802185, + "routers_loss": 0.22308112680912018, "skip_count": 4.0, "step": 818, "text_loss": 0.5211259722709656 @@ -7788,13 +7788,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.1337890625, "learning_rate": 0.0009975647912929557, - "loss": 0.056, + "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1325213.0, "repeat_count": 0.0, - "routers_loss": 0.010998851619660854, + "routers_loss": 0.00998698640614748, "skip_count": 0.0, "step": 820, "text_loss": 0.7117052674293518 @@ -7807,13 +7807,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.15234375, "learning_rate": 0.0009975341852396205, - "loss": 0.0712, + "loss": 0.0723, "macro_f1": 0.32098764181137085, "num_tokens": 1328383.0, "repeat_count": 0.0, - "routers_loss": 0.07115054875612259, + "routers_loss": 0.07454588264226913, "skip_count": 2.0, "step": 822, "text_loss": 0.34539610147476196 @@ -7826,13 +7826,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1630859375, "learning_rate": 0.0009975033885302469, - "loss": 0.0611, + "loss": 0.0604, "macro_f1": 0.3333333432674408, "num_tokens": 1331406.0, "repeat_count": 0.0, - "routers_loss": 0.008062695153057575, + "routers_loss": 0.009157589636743069, "skip_count": 0.0, "step": 824, "text_loss": 0.7484824657440186 @@ -7845,13 +7845,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1923828125, "learning_rate": 0.0009974724011766363, - "loss": 0.0496, + "loss": 0.0474, "macro_f1": 0.3272727429866791, "num_tokens": 1334410.0, "repeat_count": 1.0, - "routers_loss": 0.16666285693645477, + "routers_loss": 0.17149391770362854, "skip_count": 0.0, "step": 826, "text_loss": 0.5913820266723633 @@ -7864,13 +7864,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1708984375, + "grad_norm": 0.1884765625, "learning_rate": 0.0009974412231906632, - "loss": 0.0567, + "loss": 0.058, "macro_f1": 0.32098764181137085, "num_tokens": 1337653.0, "repeat_count": 1.0, - "routers_loss": 0.0908689796924591, + "routers_loss": 0.09743282198905945, "skip_count": 1.0, "step": 828, "text_loss": 0.2505693733692169 @@ -7883,13 +7883,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16015625, + "grad_norm": 0.1533203125, "learning_rate": 0.0009974098545842748, - "loss": 0.0648, + "loss": 0.0638, "macro_f1": 0.3272727429866791, "num_tokens": 1340860.0, "repeat_count": 0.0, - "routers_loss": 0.04364728182554245, + "routers_loss": 0.041490405797958374, "skip_count": 1.0, "step": 830, "text_loss": 0.5585370063781738 @@ -7897,18 +7897,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 3.906369239800411, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2060546875, + "grad_norm": 0.193359375, "learning_rate": 0.0009973782953694918, - "loss": 0.0772, - "macro_f1": 0.3076923191547394, + "loss": 0.0746, + "macro_f1": 0.3006536066532135, "num_tokens": 1344232.0, "repeat_count": 1.0, - "routers_loss": 0.15315109491348267, + "routers_loss": 0.16080693900585175, "skip_count": 3.0, "step": 832, "text_loss": 0.4782734513282776 @@ -7921,13 +7921,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.1298828125, "learning_rate": 0.000997346545558408, - "loss": 0.0527, + "loss": 0.0522, "macro_f1": 0.3333333432674408, "num_tokens": 1347667.0, "repeat_count": 0.0, - "routers_loss": 0.01342768594622612, + "routers_loss": 0.01173500344157219, "skip_count": 0.0, "step": 834, "text_loss": 0.25036177039146423 @@ -7940,13 +7940,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1748046875, + "grad_norm": 0.173828125, "learning_rate": 0.0009973146051631895, - "loss": 0.0513, + "loss": 0.0522, "macro_f1": 0.3333333432674408, "num_tokens": 1350707.0, "repeat_count": 0.0, - "routers_loss": 0.01158806961029768, + "routers_loss": 0.011477196589112282, "skip_count": 0.0, "step": 836, "text_loss": 0.5482863187789917 @@ -7959,13 +7959,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1650390625, "learning_rate": 0.0009972824741960764, - "loss": 0.0549, + "loss": 0.0536, "macro_f1": 0.3333333432674408, "num_tokens": 1353704.0, "repeat_count": 0.0, - "routers_loss": 0.01255605649203062, + "routers_loss": 0.010528896935284138, "skip_count": 0.0, "step": 838, "text_loss": 0.6732596158981323 @@ -7978,13 +7978,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12255859375, + "grad_norm": 0.1181640625, "learning_rate": 0.000997250152669381, - "loss": 0.0578, + "loss": 0.0573, "macro_f1": 0.3333333432674408, "num_tokens": 1356608.0, "repeat_count": 0.0, - "routers_loss": 0.010225459933280945, + "routers_loss": 0.010678744874894619, "skip_count": 0.0, "step": 840, "text_loss": 0.5479338765144348 @@ -7997,13 +7997,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.181640625, "learning_rate": 0.000997217640595489, - "loss": 0.0633, + "loss": 0.0631, "macro_f1": 0.3333333432674408, "num_tokens": 1359809.0, "repeat_count": 0.0, - "routers_loss": 0.007837744429707527, + "routers_loss": 0.00835978239774704, "skip_count": 0.0, "step": 842, "text_loss": 0.42543259263038635 @@ -8016,13 +8016,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.1923828125, "learning_rate": 0.0009971849379868593, - "loss": 0.0674, + "loss": 0.0653, "macro_f1": 0.3333333432674408, "num_tokens": 1362201.0, "repeat_count": 0.0, - "routers_loss": 0.008631376549601555, + "routers_loss": 0.009930923581123352, "skip_count": 0.0, "step": 844, "text_loss": 0.720462441444397 @@ -8035,13 +8035,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.1123046875, "learning_rate": 0.0009971520448560235, - "loss": 0.0612, + "loss": 0.0615, "macro_f1": 0.3272727429866791, "num_tokens": 1365790.0, "repeat_count": 0.0, - "routers_loss": 0.06206027418375015, + "routers_loss": 0.06344373524188995, "skip_count": 1.0, "step": 846, "text_loss": 0.8423607349395752 @@ -8049,18 +8049,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 3.9815086586439685, - "f1_execute": 0.9411765336990356, + "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, - "f1_skip": 0.5, - "grad_norm": 0.16015625, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.16796875, "learning_rate": 0.000997118961215586, - "loss": 0.0678, - "macro_f1": 0.480392187833786, + "loss": 0.0674, + "macro_f1": 0.4533333480358124, "num_tokens": 1368387.0, "repeat_count": 1.0, - "routers_loss": 0.1463794708251953, + "routers_loss": 0.14688406884670258, "skip_count": 3.0, "step": 848, "text_loss": 0.3933577537536621 @@ -8073,13 +8073,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.263671875, "learning_rate": 0.000997085687078225, - "loss": 0.052, + "loss": 0.0518, "macro_f1": 0.3333333432674408, "num_tokens": 1371189.0, "repeat_count": 0.0, - "routers_loss": 0.01140492781996727, + "routers_loss": 0.009953443892300129, "skip_count": 0.0, "step": 850, "text_loss": 0.41469162702560425 @@ -8092,13 +8092,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.15625, "learning_rate": 0.0009970522224566909, - "loss": 0.0563, + "loss": 0.0555, "macro_f1": 0.32098767161369324, "num_tokens": 1374008.0, "repeat_count": 0.0, - "routers_loss": 0.05136030167341232, + "routers_loss": 0.048870690166950226, "skip_count": 1.0, "step": 852, "text_loss": 0.613615870475769 @@ -8111,32 +8111,32 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25390625, + "grad_norm": 0.283203125, "learning_rate": 0.0009970185673638075, - "loss": 0.0627, + "loss": 0.0629, "macro_f1": 0.32098764181137085, "num_tokens": 1376662.0, "repeat_count": 1.0, - "routers_loss": 0.07274381071329117, + "routers_loss": 0.06865929812192917, "skip_count": 1.0, "step": 854, "text_loss": 0.4392736256122589 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 4.01878485471089, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.162109375, "learning_rate": 0.0009969847218124716, - "loss": 0.0503, - "macro_f1": 0.3272727429866791, + "loss": 0.0506, + "macro_f1": 0.5492662787437439, "num_tokens": 1380049.0, "repeat_count": 0.0, - "routers_loss": 0.024335317313671112, + "routers_loss": 0.02382219396531582, "skip_count": 1.0, "step": 856, "text_loss": 0.19115346670150757 @@ -8149,13 +8149,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.240234375, + "grad_norm": 0.1884765625, "learning_rate": 0.0009969506858156527, - "loss": 0.0359, + "loss": 0.0344, "macro_f1": 0.3272727429866791, "num_tokens": 1383008.0, "repeat_count": 0.0, - "routers_loss": 0.046614740043878555, + "routers_loss": 0.03907281160354614, "skip_count": 1.0, "step": 858, "text_loss": 0.34842637181282043 @@ -8168,13 +8168,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11181640625, + "grad_norm": 0.12060546875, "learning_rate": 0.0009969164593863935, - "loss": 0.0372, + "loss": 0.0365, "macro_f1": 0.3333333432674408, "num_tokens": 1387051.0, "repeat_count": 0.0, - "routers_loss": 0.006380240898579359, + "routers_loss": 0.007645803038030863, "skip_count": 0.0, "step": 860, "text_loss": 0.3810436725616455 @@ -8187,13 +8187,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.1484375, "learning_rate": 0.0009968820425378098, - "loss": 0.0473, + "loss": 0.0463, "macro_f1": 0.3272727429866791, "num_tokens": 1390244.0, "repeat_count": 1.0, - "routers_loss": 0.04770716652274132, + "routers_loss": 0.04435238987207413, "skip_count": 0.0, "step": 862, "text_loss": 0.34853485226631165 @@ -8206,32 +8206,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3359375, + "grad_norm": 0.28515625, "learning_rate": 0.00099684743528309, - "loss": 0.0434, + "loss": 0.0424, "macro_f1": 0.3333333432674408, "num_tokens": 1392976.0, "repeat_count": 0.0, - "routers_loss": 0.006983708590269089, + "routers_loss": 0.006071661598980427, "skip_count": 0.0, "step": 864, "text_loss": 0.6395178437232971 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 4.065746991488113, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.080078125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0810546875, "learning_rate": 0.0009968126376354958, - "loss": 0.0476, - "macro_f1": 0.32098764181137085, + "loss": 0.0477, + "macro_f1": 0.5492662787437439, "num_tokens": 1396061.0, "repeat_count": 0.0, - "routers_loss": 0.046313900500535965, + "routers_loss": 0.05011235550045967, "skip_count": 2.0, "step": 866, "text_loss": 0.09103966504335403 @@ -8244,32 +8244,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.154296875, "learning_rate": 0.0009967776496083616, "loss": 0.0509, "macro_f1": 0.3272727429866791, "num_tokens": 1398993.0, "repeat_count": 1.0, - "routers_loss": 0.0401870422065258, + "routers_loss": 0.03979124873876572, "skip_count": 0.0, "step": 868, "text_loss": 0.27257058024406433 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 4.084531846199002, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.14453125, "learning_rate": 0.000996742471215095, - "loss": 0.0505, - "macro_f1": 0.32098764181137085, + "loss": 0.0516, + "macro_f1": 0.5492662787437439, "num_tokens": 1402080.0, "repeat_count": 0.0, - "routers_loss": 0.03313451260328293, + "routers_loss": 0.030823837965726852, "skip_count": 2.0, "step": 870, "text_loss": 0.7047103047370911 @@ -8282,13 +8282,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16796875, + "grad_norm": 0.1611328125, "learning_rate": 0.0009967071024691763, - "loss": 0.0468, + "loss": 0.0461, "macro_f1": 0.3333333432674408, "num_tokens": 1404890.0, "repeat_count": 0.0, - "routers_loss": 0.010118982754647732, + "routers_loss": 0.009721715934574604, "skip_count": 0.0, "step": 872, "text_loss": 0.959106981754303 @@ -8301,13 +8301,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1142578125, "learning_rate": 0.000996671543384159, - "loss": 0.0498, + "loss": 0.05, "macro_f1": 0.3333333432674408, "num_tokens": 1407853.0, "repeat_count": 0.0, - "routers_loss": 0.005856200121343136, + "routers_loss": 0.006025883834809065, "skip_count": 0.0, "step": 874, "text_loss": 0.47571972012519836 @@ -8320,13 +8320,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.09765625, "learning_rate": 0.0009966357939736692, - "loss": 0.0417, + "loss": 0.0416, "macro_f1": 0.3272727429866791, "num_tokens": 1410723.0, "repeat_count": 0.0, - "routers_loss": 0.02768322452902794, + "routers_loss": 0.025964925065636635, "skip_count": 0.0, "step": 876, "text_loss": 0.4964611530303955 @@ -8339,13 +8339,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1025390625, + "grad_norm": 0.09423828125, "learning_rate": 0.0009965998542514065, - "loss": 0.0419, + "loss": 0.0415, "macro_f1": 0.32098764181137085, "num_tokens": 1414008.0, "repeat_count": 0.0, - "routers_loss": 0.09382032603025436, + "routers_loss": 0.09509637206792831, "skip_count": 2.0, "step": 878, "text_loss": 0.621494710445404 @@ -8358,32 +8358,32 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.103515625, + "grad_norm": 0.11083984375, "learning_rate": 0.0009965637242311427, - "loss": 0.0466, + "loss": 0.0472, "macro_f1": 0.542222261428833, "num_tokens": 1417447.0, "repeat_count": 0.0, - "routers_loss": 0.026867631822824478, + "routers_loss": 0.02520318515598774, "skip_count": 4.0, "step": 880, "text_loss": 0.40209758281707764 }, { "acc_repeat": 0.0, - "acc_skip": 0.6666666865348816, - "avg_layers": 24.0, + "acc_skip": 0.5, + "avg_layers": 25.0, "epoch": 4.14088641033167, - "f1_execute": 0.95652174949646, + "f1_execute": 0.936170220375061, "f1_repeat": 0.0, - "f1_skip": 0.800000011920929, - "grad_norm": 0.26171875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.263671875, "learning_rate": 0.000996527403926723, - "loss": 0.0496, - "macro_f1": 0.5855072736740112, + "loss": 0.0495, + "macro_f1": 0.5342789888381958, "num_tokens": 1419905.0, "repeat_count": 0.0, - "routers_loss": 0.12731307744979858, + "routers_loss": 0.13183781504631042, "skip_count": 6.0, "step": 882, "text_loss": 0.642185389995575 @@ -8396,13 +8396,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.1201171875, "learning_rate": 0.0009964908933520655, - "loss": 0.039, + "loss": 0.0375, "macro_f1": 0.3333333432674408, "num_tokens": 1423436.0, "repeat_count": 0.0, - "routers_loss": 0.008483970537781715, + "routers_loss": 0.009429510682821274, "skip_count": 0.0, "step": 884, "text_loss": 0.48232755064964294 @@ -8415,13 +8415,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.18359375, + "grad_norm": 0.1669921875, "learning_rate": 0.0009964541925211613, - "loss": 0.0348, + "loss": 0.0349, "macro_f1": 0.32098764181137085, "num_tokens": 1426842.0, "repeat_count": 0.0, - "routers_loss": 0.07847871631383896, + "routers_loss": 0.07629609107971191, "skip_count": 2.0, "step": 886, "text_loss": 0.16620934009552002 @@ -8434,13 +8434,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09716796875, + "grad_norm": 0.0927734375, "learning_rate": 0.0009964173014480738, - "loss": 0.036, + "loss": 0.0348, "macro_f1": 0.5492662787437439, "num_tokens": 1430430.0, "repeat_count": 0.0, - "routers_loss": 0.04574459046125412, + "routers_loss": 0.036814019083976746, "skip_count": 2.0, "step": 888, "text_loss": 0.4866008758544922 @@ -8453,13 +8453,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10595703125, + "grad_norm": 0.1123046875, "learning_rate": 0.0009963802201469398, - "loss": 0.0485, + "loss": 0.0476, "macro_f1": 0.3333333432674408, "num_tokens": 1433821.0, "repeat_count": 0.0, - "routers_loss": 0.004683624487370253, + "routers_loss": 0.0041250260546803474, "skip_count": 0.0, "step": 890, "text_loss": 0.578216552734375 @@ -8472,13 +8472,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2158203125, + "grad_norm": 0.2373046875, "learning_rate": 0.0009963429486319693, - "loss": 0.0476, + "loss": 0.0463, "macro_f1": 0.32098764181137085, "num_tokens": 1436976.0, "repeat_count": 0.0, - "routers_loss": 0.06499828398227692, + "routers_loss": 0.06213559955358505, "skip_count": 2.0, "step": 892, "text_loss": 0.221701517701149 @@ -8486,18 +8486,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.5, - "avg_layers": 25.0, + "avg_layers": 26.0, "epoch": 4.197240974464338, - "f1_execute": 0.9411764740943909, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.4000000059604645, - "grad_norm": 0.310546875, + "f1_skip": 0.5, + "grad_norm": 0.361328125, "learning_rate": 0.0009963054869174446, - "loss": 0.0326, - "macro_f1": 0.44705885648727417, + "loss": 0.0313, + "macro_f1": 0.4871794879436493, "num_tokens": 1440397.0, "repeat_count": 0.0, - "routers_loss": 0.08285653591156006, + "routers_loss": 0.07532428950071335, "skip_count": 2.0, "step": 894, "text_loss": 0.6922838091850281 @@ -8510,13 +8510,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.154296875, + "grad_norm": 0.1572265625, "learning_rate": 0.0009962678350177209, - "loss": 0.0497, + "loss": 0.0472, "macro_f1": 0.3272727429866791, "num_tokens": 1443604.0, "repeat_count": 0.0, - "routers_loss": 0.04252336546778679, + "routers_loss": 0.0419243648648262, "skip_count": 1.0, "step": 896, "text_loss": 0.22092342376708984 @@ -8524,18 +8524,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 4.216025829175227, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10302734375, + "grad_norm": 0.1015625, "learning_rate": 0.0009962299929472268, - "loss": 0.0349, - "macro_f1": 0.31446540355682373, + "loss": 0.034, + "macro_f1": 0.32098764181137085, "num_tokens": 1446257.0, "repeat_count": 2.0, - "routers_loss": 0.126711905002594, + "routers_loss": 0.10849297791719437, "skip_count": 0.0, "step": 898, "text_loss": 0.26394811272621155 @@ -8548,13 +8548,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10546875, + "grad_norm": 0.10205078125, "learning_rate": 0.000996191960720463, - "loss": 0.0392, + "loss": 0.0394, "macro_f1": 0.3333333432674408, "num_tokens": 1449669.0, "repeat_count": 0.0, - "routers_loss": 0.00955706462264061, + "routers_loss": 0.0092767970636487, "skip_count": 0.0, "step": 900, "text_loss": 0.5338577628135681 @@ -8567,13 +8567,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.154296875, "learning_rate": 0.0009961537383520042, - "loss": 0.0377, + "loss": 0.0354, "macro_f1": 0.3272727429866791, "num_tokens": 1452450.0, "repeat_count": 1.0, - "routers_loss": 0.03127318620681763, + "routers_loss": 0.02985367365181446, "skip_count": 0.0, "step": 902, "text_loss": 0.5875228047370911 @@ -8586,13 +8586,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.10205078125, "learning_rate": 0.0009961153258564966, - "loss": 0.0389, + "loss": 0.0378, "macro_f1": 0.3144654333591461, "num_tokens": 1456909.0, "repeat_count": 0.0, - "routers_loss": 0.06743519753217697, + "routers_loss": 0.06794842332601547, "skip_count": 3.0, "step": 904, "text_loss": 0.40959444642066956 @@ -8605,13 +8605,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009960767232486604, - "loss": 0.0477, + "loss": 0.0476, "macro_f1": 0.3333333432674408, "num_tokens": 1461712.0, "repeat_count": 0.0, - "routers_loss": 0.0025313226506114006, + "routers_loss": 0.0023562447167932987, "skip_count": 0.0, "step": 906, "text_loss": 0.3932875096797943 @@ -8624,13 +8624,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.08203125, "learning_rate": 0.000996037930543288, - "loss": 0.052, + "loss": 0.0505, "macro_f1": 0.3272727429866791, "num_tokens": 1464817.0, "repeat_count": 0.0, - "routers_loss": 0.037147488445043564, + "routers_loss": 0.03880339860916138, "skip_count": 1.0, "step": 908, "text_loss": 0.17482402920722961 @@ -8643,13 +8643,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.2119140625, "learning_rate": 0.000995998947755245, - "loss": 0.0501, + "loss": 0.0479, "macro_f1": 0.3272727429866791, "num_tokens": 1467810.0, "repeat_count": 0.0, - "routers_loss": 0.021232586354017258, + "routers_loss": 0.01736828312277794, "skip_count": 1.0, "step": 910, "text_loss": 0.4140470325946808 @@ -8662,13 +8662,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.169921875, "learning_rate": 0.0009959597748994695, - "loss": 0.0759, + "loss": 0.0752, "macro_f1": 0.3333333432674408, "num_tokens": 1470802.0, "repeat_count": 0.0, - "routers_loss": 0.010563847608864307, + "routers_loss": 0.011824851855635643, "skip_count": 0.0, "step": 912, "text_loss": 0.7153383493423462 @@ -8681,13 +8681,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1455078125, "learning_rate": 0.0009959204119909726, - "loss": 0.0425, + "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1474539.0, "repeat_count": 0.0, - "routers_loss": 0.0267612524330616, + "routers_loss": 0.025456594303250313, "skip_count": 0.0, "step": 914, "text_loss": 0.42812058329582214 @@ -8700,13 +8700,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1533203125, + "grad_norm": 0.142578125, "learning_rate": 0.0009958808590448385, - "loss": 0.0501, + "loss": 0.0489, "macro_f1": 0.3333333432674408, "num_tokens": 1477552.0, "repeat_count": 0.0, - "routers_loss": 0.005838244222104549, + "routers_loss": 0.006795851048082113, "skip_count": 0.0, "step": 916, "text_loss": 0.5402814149856567 @@ -8719,13 +8719,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.1083984375, "learning_rate": 0.0009958411160762234, - "loss": 0.0383, + "loss": 0.039, "macro_f1": 0.3333333432674408, "num_tokens": 1482547.0, "repeat_count": 0.0, - "routers_loss": 0.014642171561717987, + "routers_loss": 0.015615932643413544, "skip_count": 0.0, "step": 918, "text_loss": 0.3836168050765991 @@ -8738,32 +8738,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08984375, "learning_rate": 0.0009958011831003577, - "loss": 0.0457, + "loss": 0.0448, "macro_f1": 0.3272727429866791, "num_tokens": 1485807.0, "repeat_count": 0.0, - "routers_loss": 0.04119620472192764, + "routers_loss": 0.043541423976421356, "skip_count": 1.0, "step": 920, "text_loss": 0.4333936274051666 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 4.328734957440563, - "f1_execute": 0.943396270275116, - "f1_repeat": 0.0, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.154296875, + "grad_norm": 0.1337890625, "learning_rate": 0.000995761060132543, - "loss": 0.0433, - "macro_f1": 0.3144654333591461, + "loss": 0.0418, + "macro_f1": 0.6538461446762085, "num_tokens": 1488941.0, "repeat_count": 1.0, - "routers_loss": 0.06713195145130157, + "routers_loss": 0.05866432189941406, "skip_count": 2.0, "step": 922, "text_loss": 0.4106994867324829 @@ -8776,13 +8776,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1572265625, + "grad_norm": 0.1630859375, "learning_rate": 0.0009957207471881552, - "loss": 0.0533, + "loss": 0.0531, "macro_f1": 0.5492662787437439, "num_tokens": 1492026.0, "repeat_count": 0.0, - "routers_loss": 0.024023180827498436, + "routers_loss": 0.02714901603758335, "skip_count": 2.0, "step": 924, "text_loss": 0.542091429233551 @@ -8795,13 +8795,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.17578125, + "grad_norm": 0.1796875, "learning_rate": 0.0009956802442826415, - "loss": 0.0373, + "loss": 0.0386, "macro_f1": 0.3272727429866791, "num_tokens": 1494543.0, "repeat_count": 1.0, - "routers_loss": 0.05399841442704201, + "routers_loss": 0.0563737191259861, "skip_count": 0.0, "step": 926, "text_loss": 0.47209203243255615 @@ -8814,13 +8814,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1259765625, "learning_rate": 0.0009956395514315235, - "loss": 0.0488, + "loss": 0.0496, "macro_f1": 0.3272727429866791, "num_tokens": 1497831.0, "repeat_count": 1.0, - "routers_loss": 0.0299264844506979, + "routers_loss": 0.03285066783428192, "skip_count": 0.0, "step": 928, "text_loss": 0.6628931164741516 @@ -8833,13 +8833,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.154296875, "learning_rate": 0.0009955986686503943, - "loss": 0.0467, + "loss": 0.0466, "macro_f1": 0.3272727429866791, "num_tokens": 1501375.0, "repeat_count": 0.0, - "routers_loss": 0.023478010669350624, + "routers_loss": 0.024297121912240982, "skip_count": 1.0, "step": 930, "text_loss": 0.495676189661026 @@ -8852,13 +8852,13 @@ "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.1103515625, + "grad_norm": 0.11181640625, "learning_rate": 0.0009955575959549202, - "loss": 0.0447, + "loss": 0.0424, "macro_f1": 0.7795917987823486, "num_tokens": 1504363.0, "repeat_count": 1.0, - "routers_loss": 0.12116194516420364, + "routers_loss": 0.12196464836597443, "skip_count": 4.0, "step": 932, "text_loss": 0.26123273372650146 @@ -8871,13 +8871,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1708984375, "learning_rate": 0.0009955163333608408, - "loss": 0.053, + "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 1507178.0, "repeat_count": 0.0, - "routers_loss": 0.011879723519086838, + "routers_loss": 0.012947078794240952, "skip_count": 0.0, "step": 934, "text_loss": 0.32552677392959595 @@ -8890,13 +8890,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.154296875, "learning_rate": 0.0009954748808839674, - "loss": 0.0373, + "loss": 0.0379, "macro_f1": 0.3333333432674408, "num_tokens": 1509910.0, "repeat_count": 0.0, - "routers_loss": 0.009245929308235645, + "routers_loss": 0.008946365676820278, "skip_count": 0.0, "step": 936, "text_loss": 0.533141016960144 @@ -8909,13 +8909,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.140625, "learning_rate": 0.000995433238540185, - "loss": 0.0461, + "loss": 0.0466, "macro_f1": 0.6538461446762085, "num_tokens": 1512826.0, "repeat_count": 1.0, - "routers_loss": 0.032464127987623215, + "routers_loss": 0.029975678771734238, "skip_count": 1.0, "step": 938, "text_loss": 0.2953577935695648 @@ -8928,13 +8928,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.10888671875, "learning_rate": 0.0009953914063454512, - "loss": 0.0515, + "loss": 0.0497, "macro_f1": 0.3144654333591461, "num_tokens": 1517230.0, "repeat_count": 1.0, - "routers_loss": 0.08835392445325851, + "routers_loss": 0.0889134630560875, "skip_count": 2.0, "step": 940, "text_loss": 0.5368834733963013 @@ -8947,13 +8947,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.193359375, "learning_rate": 0.000995349384315796, - "loss": 0.0405, + "loss": 0.0413, "macro_f1": 0.3333333432674408, "num_tokens": 1519876.0, "repeat_count": 0.0, - "routers_loss": 0.014307246543467045, + "routers_loss": 0.013458753935992718, "skip_count": 0.0, "step": 942, "text_loss": 0.2005518227815628 @@ -8966,13 +8966,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1357421875, "learning_rate": 0.000995307172467322, - "loss": 0.0449, + "loss": 0.0444, "macro_f1": 0.31446540355682373, "num_tokens": 1522998.0, "repeat_count": 1.0, - "routers_loss": 0.10261563211679459, + "routers_loss": 0.08850377053022385, "skip_count": 1.0, "step": 944, "text_loss": 0.227926567196846 @@ -8985,13 +8985,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.1435546875, "learning_rate": 0.0009952647708162054, - "loss": 0.0507, + "loss": 0.0503, "macro_f1": 0.3272727429866791, "num_tokens": 1527100.0, "repeat_count": 0.0, - "routers_loss": 0.03316422924399376, + "routers_loss": 0.03199794515967369, "skip_count": 1.0, "step": 946, "text_loss": 0.4859686493873596 @@ -9004,13 +9004,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1748046875, "learning_rate": 0.0009952221793786942, - "loss": 0.0352, + "loss": 0.0354, "macro_f1": 0.3333333432674408, "num_tokens": 1530028.0, "repeat_count": 0.0, - "routers_loss": 0.00902469176799059, + "routers_loss": 0.006507779937237501, "skip_count": 0.0, "step": 948, "text_loss": 0.6855354905128479 @@ -9023,13 +9023,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.10986328125, "learning_rate": 0.0009951793981711097, - "loss": 0.0581, + "loss": 0.0584, "macro_f1": 0.6538461446762085, "num_tokens": 1533254.0, "repeat_count": 1.0, - "routers_loss": 0.06710167229175568, + "routers_loss": 0.06175103038549423, "skip_count": 1.0, "step": 950, "text_loss": 0.7590400576591492 @@ -9042,13 +9042,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1015625, + "grad_norm": 0.1025390625, "learning_rate": 0.0009951364272098458, - "loss": 0.0294, + "loss": 0.0295, "macro_f1": 0.5492662787437439, "num_tokens": 1536239.0, "repeat_count": 0.0, - "routers_loss": 0.04208769276738167, + "routers_loss": 0.03773383051156998, "skip_count": 2.0, "step": 952, "text_loss": 0.669784665107727 @@ -9061,13 +9061,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.1748046875, "learning_rate": 0.0009950932665113688, - "loss": 0.0505, + "loss": 0.0507, "macro_f1": 0.32098764181137085, "num_tokens": 1539682.0, "repeat_count": 0.0, - "routers_loss": 0.06530380249023438, + "routers_loss": 0.07280613481998444, "skip_count": 2.0, "step": 954, "text_loss": 0.3365570902824402 @@ -9080,13 +9080,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.12255859375, "learning_rate": 0.0009950499160922184, - "loss": 0.0545, + "loss": 0.0541, "macro_f1": 0.3333333432674408, "num_tokens": 1542875.0, "repeat_count": 0.0, - "routers_loss": 0.01803453080356121, + "routers_loss": 0.01770266517996788, "skip_count": 0.0, "step": 956, "text_loss": 0.0921545997262001 @@ -9099,13 +9099,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.09375, "learning_rate": 0.000995006375969006, - "loss": 0.0481, + "loss": 0.0473, "macro_f1": 0.3272727429866791, "num_tokens": 1547135.0, "repeat_count": 1.0, - "routers_loss": 0.08461762219667435, + "routers_loss": 0.07672002166509628, "skip_count": 0.0, "step": 958, "text_loss": 0.5887606739997864 @@ -9120,11 +9120,11 @@ "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0009949626461584165, - "loss": 0.0441, + "loss": 0.043, "macro_f1": 0.3333333432674408, "num_tokens": 1550100.0, "repeat_count": 0.0, - "routers_loss": 0.007111486047506332, + "routers_loss": 0.006247182376682758, "skip_count": 0.0, "step": 960, "text_loss": 0.5777931213378906 @@ -9137,13 +9137,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.11181640625, + "grad_norm": 0.119140625, "learning_rate": 0.0009949187266772076, - "loss": 0.0361, + "loss": 0.0366, "macro_f1": 0.5492662787437439, "num_tokens": 1553192.0, "repeat_count": 0.0, - "routers_loss": 0.029776185750961304, + "routers_loss": 0.030319908633828163, "skip_count": 2.0, "step": 962, "text_loss": 0.2370252162218094 @@ -9156,13 +9156,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.126953125, + "grad_norm": 0.1474609375, "learning_rate": 0.0009948746175422088, - "loss": 0.0506, + "loss": 0.0511, "macro_f1": 0.3333333432674408, "num_tokens": 1556318.0, "repeat_count": 0.0, - "routers_loss": 0.007108999416232109, + "routers_loss": 0.006004320923238993, "skip_count": 0.0, "step": 964, "text_loss": 0.6271032094955444 @@ -9175,13 +9175,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.15234375, "learning_rate": 0.000994830318770323, - "loss": 0.0498, + "loss": 0.0514, "macro_f1": 0.3333333432674408, "num_tokens": 1559195.0, "repeat_count": 0.0, - "routers_loss": 0.01126947533339262, + "routers_loss": 0.011544366367161274, "skip_count": 0.0, "step": 966, "text_loss": 0.47256720066070557 @@ -9194,13 +9194,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.162109375, + "grad_norm": 0.171875, "learning_rate": 0.0009947858303785255, - "loss": 0.0366, + "loss": 0.0374, "macro_f1": 0.6603773832321167, "num_tokens": 1561813.0, "repeat_count": 1.0, - "routers_loss": 0.05142999067902565, + "routers_loss": 0.05258861929178238, "skip_count": 1.0, "step": 968, "text_loss": 0.7703132629394531 @@ -9213,13 +9213,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.1142578125, "learning_rate": 0.0009947411523838648, - "loss": 0.0461, + "loss": 0.0453, "macro_f1": 0.3333333432674408, "num_tokens": 1564634.0, "repeat_count": 0.0, - "routers_loss": 0.010770819149911404, + "routers_loss": 0.011216280050575733, "skip_count": 0.0, "step": 970, "text_loss": 0.4666804075241089 @@ -9232,13 +9232,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.1533203125, "learning_rate": 0.0009946962848034608, - "loss": 0.0692, + "loss": 0.0696, "macro_f1": 0.3333333432674408, "num_tokens": 1567959.0, "repeat_count": 0.0, - "routers_loss": 0.008775795809924603, + "routers_loss": 0.009387624450027943, "skip_count": 0.0, "step": 972, "text_loss": 0.4067264199256897 @@ -9251,13 +9251,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.203125, "learning_rate": 0.0009946512276545075, - "loss": 0.0403, + "loss": 0.0397, "macro_f1": 0.3272727429866791, "num_tokens": 1571221.0, "repeat_count": 1.0, - "routers_loss": 0.05100395902991295, + "routers_loss": 0.041713520884513855, "skip_count": 0.0, "step": 974, "text_loss": 0.5242366194725037 @@ -9270,13 +9270,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, - "grad_norm": 0.25390625, + "grad_norm": 0.228515625, "learning_rate": 0.0009946059809542705, - "loss": 0.0503, + "loss": 0.0487, "macro_f1": 0.7644445300102234, "num_tokens": 1575033.0, "repeat_count": 2.0, - "routers_loss": 0.06653711199760437, + "routers_loss": 0.05748331546783447, "skip_count": 2.0, "step": 976, "text_loss": 0.5704690217971802 @@ -9284,18 +9284,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 4.591722923393014, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1396484375, "learning_rate": 0.0009945605447200887, - "loss": 0.0435, - "macro_f1": 0.3333333432674408, + "loss": 0.0445, + "macro_f1": 0.3272727429866791, "num_tokens": 1579050.0, "repeat_count": 0.0, - "routers_loss": 0.009865665808320045, + "routers_loss": 0.016765203326940536, "skip_count": 0.0, "step": 978, "text_loss": 0.4804173707962036 @@ -9308,13 +9308,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.130859375, + "grad_norm": 0.1337890625, "learning_rate": 0.0009945149189693732, - "loss": 0.0399, + "loss": 0.0406, "macro_f1": 0.5492662787437439, "num_tokens": 1582967.0, "repeat_count": 0.0, - "routers_loss": 0.021175632253289223, + "routers_loss": 0.021518222987651825, "skip_count": 2.0, "step": 980, "text_loss": 0.4138598144054413 @@ -9327,32 +9327,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11181640625, + "grad_norm": 0.11474609375, "learning_rate": 0.0009944691037196078, - "loss": 0.0472, + "loss": 0.0456, "macro_f1": 0.3333333432674408, "num_tokens": 1586282.0, "repeat_count": 0.0, - "routers_loss": 0.011803832836449146, + "routers_loss": 0.012246460653841496, "skip_count": 0.0, "step": 982, "text_loss": 0.22561736404895782 }, { - "acc_repeat": 0.0, + "acc_repeat": 0.5, "acc_skip": 0.800000011920929, - "avg_layers": 23.0, + "avg_layers": 24.0, "epoch": 4.6199002054593485, - "f1_execute": 0.9090908765792847, - "f1_repeat": 0.0, + "f1_execute": 0.930232584476471, + "f1_repeat": 0.6666666865348816, "f1_skip": 0.8000000715255737, - "grad_norm": 0.142578125, + "grad_norm": 0.1455078125, "learning_rate": 0.0009944230989883491, - "loss": 0.0467, - "macro_f1": 0.5696970224380493, + "loss": 0.0456, + "macro_f1": 0.7989664077758789, "num_tokens": 1589279.0, "repeat_count": 2.0, - "routers_loss": 0.08856551349163055, + "routers_loss": 0.09344895929098129, "skip_count": 5.0, "step": 984, "text_loss": 0.4416656494140625 @@ -9365,13 +9365,13 @@ "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1240234375, + "grad_norm": 0.111328125, "learning_rate": 0.0009943769047932264, - "loss": 0.0413, + "loss": 0.0404, "macro_f1": 0.5359477400779724, "num_tokens": 1592398.0, "repeat_count": 2.0, - "routers_loss": 0.08593414723873138, + "routers_loss": 0.08916857838630676, "skip_count": 2.0, "step": 986, "text_loss": 0.5536438822746277 @@ -9384,13 +9384,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.154296875, + "grad_norm": 0.15234375, "learning_rate": 0.000994330521151941, - "loss": 0.0399, + "loss": 0.039, "macro_f1": 0.32098764181137085, "num_tokens": 1596213.0, "repeat_count": 1.0, - "routers_loss": 0.07049509882926941, + "routers_loss": 0.06114347651600838, "skip_count": 1.0, "step": 988, "text_loss": 0.5835405588150024 @@ -9403,13 +9403,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.1953125, "learning_rate": 0.000994283948082267, - "loss": 0.0595, + "loss": 0.0573, "macro_f1": 0.3333333432674408, "num_tokens": 1598827.0, "repeat_count": 0.0, - "routers_loss": 0.0019258069805800915, + "routers_loss": 0.0017335431184619665, "skip_count": 0.0, "step": 990, "text_loss": 0.5857380032539368 @@ -9422,13 +9422,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.10693359375, "learning_rate": 0.0009942371856020522, - "loss": 0.0335, + "loss": 0.0341, "macro_f1": 0.3333333432674408, "num_tokens": 1602915.0, "repeat_count": 0.0, - "routers_loss": 0.014094089157879353, + "routers_loss": 0.014606470242142677, "skip_count": 0.0, "step": 992, "text_loss": 0.6939892768859863 @@ -9436,18 +9436,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 31.0, "epoch": 4.666862342236572, - "f1_execute": 0.9583333134651184, + "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.140625, "learning_rate": 0.0009941902337292155, - "loss": 0.0603, - "macro_f1": 0.6527777910232544, + "loss": 0.06, + "macro_f1": 0.6598639488220215, "num_tokens": 1605776.0, "repeat_count": 3.0, - "routers_loss": 0.06360147893428802, + "routers_loss": 0.06297315657138824, "skip_count": 1.0, "step": 994, "text_loss": 0.37616831064224243 @@ -9460,13 +9460,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.10546875, + "grad_norm": 0.1083984375, "learning_rate": 0.0009941430924817487, - "loss": 0.0573, + "loss": 0.0572, "macro_f1": 0.5492662787437439, "num_tokens": 1609856.0, "repeat_count": 0.0, - "routers_loss": 0.0326208658516407, + "routers_loss": 0.03297794610261917, "skip_count": 2.0, "step": 996, "text_loss": 0.2098303586244583 @@ -9479,13 +9479,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09912109375, + "grad_norm": 0.10107421875, "learning_rate": 0.000994095761877717, - "loss": 0.0502, + "loss": 0.0499, "macro_f1": 0.3333333432674408, "num_tokens": 1612904.0, "repeat_count": 0.0, - "routers_loss": 0.012660752050578594, + "routers_loss": 0.012901155278086662, "skip_count": 0.0, "step": 998, "text_loss": 0.20103533565998077 @@ -9498,13 +9498,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.265625, + "grad_norm": 0.259765625, "learning_rate": 0.000994048241935257, - "loss": 0.0537, + "loss": 0.0535, "macro_f1": 0.3272727429866791, "num_tokens": 1615540.0, "repeat_count": 0.0, - "routers_loss": 0.021756287664175034, + "routers_loss": 0.020434845238924026, "skip_count": 0.0, "step": 1000, "text_loss": 0.32709044218063354 @@ -9512,37 +9512,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 4.70443205165835, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1669921875, "learning_rate": 0.0009940005326725789, - "loss": 0.0447, - "macro_f1": 0.31446540355682373, + "loss": 0.0453, + "macro_f1": 0.32098764181137085, "num_tokens": 1618786.0, "repeat_count": 0.0, - "routers_loss": 0.07292548567056656, + "routers_loss": 0.07831378281116486, "skip_count": 2.0, "step": 1002, "text_loss": 0.5789632797241211 }, { - "acc_repeat": 0.5, + "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 4.713824479013795, - "f1_execute": 0.9811320900917053, - "f1_repeat": 0.6666666865348816, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1787109375, + "grad_norm": 0.21875, "learning_rate": 0.0009939526341079647, - "loss": 0.0505, - "macro_f1": 0.5492662787437439, + "loss": 0.0511, + "macro_f1": 0.32098764181137085, "num_tokens": 1621736.0, "repeat_count": 2.0, - "routers_loss": 0.03397528454661369, + "routers_loss": 0.04863874986767769, "skip_count": 0.0, "step": 1004, "text_loss": 0.6128849387168884 @@ -9555,13 +9555,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.123046875, + "grad_norm": 0.1435546875, "learning_rate": 0.0009939045462597693, - "loss": 0.0544, + "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 1624649.0, "repeat_count": 0.0, - "routers_loss": 0.005987613927572966, + "routers_loss": 0.00677989237010479, "skip_count": 0.0, "step": 1006, "text_loss": 0.6168264150619507 @@ -9574,13 +9574,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.1611328125, "learning_rate": 0.0009938562691464202, - "loss": 0.0522, + "loss": 0.0524, "macro_f1": 0.3333333432674408, "num_tokens": 1627700.0, "repeat_count": 0.0, - "routers_loss": 0.021656684577465057, + "routers_loss": 0.019490402191877365, "skip_count": 0.0, "step": 1008, "text_loss": 0.17463822662830353 @@ -9593,32 +9593,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1298828125, "learning_rate": 0.000993807802786417, - "loss": 0.0487, + "loss": 0.0475, "macro_f1": 0.3333333432674408, "num_tokens": 1630714.0, "repeat_count": 0.0, - "routers_loss": 0.0014992234064266086, + "routers_loss": 0.0019022391643375158, "skip_count": 0.0, "step": 1010, "text_loss": 0.5675593018531799 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.5, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 4.751394188435574, - "f1_execute": 0.9411764740943909, - "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.158203125, + "f1_execute": 0.9599999785423279, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1640625, "learning_rate": 0.0009937591471983322, - "loss": 0.0491, - "macro_f1": 0.5359477400779724, + "loss": 0.0501, + "macro_f1": 0.7644444704055786, "num_tokens": 1633770.0, "repeat_count": 1.0, - "routers_loss": 0.03448791801929474, + "routers_loss": 0.042485643178224564, "skip_count": 2.0, "step": 1012, "text_loss": 0.42387229204177856 @@ -9631,13 +9631,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1357421875, + "grad_norm": 0.1396484375, "learning_rate": 0.0009937103024008109, - "loss": 0.0541, + "loss": 0.0545, "macro_f1": 0.3272727429866791, "num_tokens": 1637120.0, "repeat_count": 0.0, - "routers_loss": 0.08285929262638092, + "routers_loss": 0.09427817165851593, "skip_count": 1.0, "step": 1014, "text_loss": 0.49511051177978516 @@ -9650,13 +9650,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.125, + "grad_norm": 0.12890625, "learning_rate": 0.0009936612684125702, - "loss": 0.0515, + "loss": 0.0503, "macro_f1": 0.3333333432674408, "num_tokens": 1640165.0, "repeat_count": 0.0, - "routers_loss": 0.00486504752188921, + "routers_loss": 0.005106127820909023, "skip_count": 0.0, "step": 1016, "text_loss": 0.5398799180984497 @@ -9669,13 +9669,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.2734375, "learning_rate": 0.0009936120452524004, - "loss": 0.051, + "loss": 0.0506, "macro_f1": 0.3333333432674408, "num_tokens": 1643251.0, "repeat_count": 0.0, - "routers_loss": 0.017805909737944603, + "routers_loss": 0.016914300620555878, "skip_count": 0.0, "step": 1018, "text_loss": 0.20882178843021393 @@ -9688,13 +9688,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1962890625, "learning_rate": 0.0009935626329391637, - "loss": 0.0547, + "loss": 0.0537, "macro_f1": 0.32098764181137085, "num_tokens": 1646560.0, "repeat_count": 0.0, - "routers_loss": 0.12958799302577972, + "routers_loss": 0.13481520116329193, "skip_count": 2.0, "step": 1020, "text_loss": 0.5719883441925049 @@ -9707,13 +9707,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1533203125, + "grad_norm": 0.1552734375, "learning_rate": 0.0009935130314917948, - "loss": 0.0595, + "loss": 0.0602, "macro_f1": 0.5492662787437439, "num_tokens": 1649538.0, "repeat_count": 0.0, - "routers_loss": 0.07447081059217453, + "routers_loss": 0.07700438797473907, "skip_count": 2.0, "step": 1022, "text_loss": 0.1303367167711258 @@ -9726,13 +9726,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1494140625, "learning_rate": 0.0009934632409293015, - "loss": 0.0619, + "loss": 0.0611, "macro_f1": 0.32098764181137085, "num_tokens": 1652397.0, "repeat_count": 1.0, - "routers_loss": 0.12529553472995758, + "routers_loss": 0.11416907608509064, "skip_count": 1.0, "step": 1024, "text_loss": 0.24076920747756958 @@ -9745,13 +9745,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.306640625, "learning_rate": 0.0009934132612707631, - "loss": 0.0491, + "loss": 0.0507, "macro_f1": 0.31446540355682373, "num_tokens": 1654938.0, "repeat_count": 0.0, - "routers_loss": 0.08664281666278839, + "routers_loss": 0.09484589844942093, "skip_count": 2.0, "step": 1026, "text_loss": 0.1652517318725586 @@ -9764,13 +9764,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1435546875, "learning_rate": 0.0009933630925353324, - "loss": 0.0394, + "loss": 0.0395, "macro_f1": 0.3333333432674408, "num_tokens": 1658536.0, "repeat_count": 0.0, - "routers_loss": 0.0067965323105454445, + "routers_loss": 0.00741987070068717, "skip_count": 0.0, "step": 1028, "text_loss": 0.49296700954437256 @@ -9783,13 +9783,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1845703125, "learning_rate": 0.0009933127347422337, - "loss": 0.0607, + "loss": 0.0602, "macro_f1": 0.32098764181137085, "num_tokens": 1661446.0, "repeat_count": 0.0, - "routers_loss": 0.08319470286369324, + "routers_loss": 0.08399344235658646, "skip_count": 2.0, "step": 1030, "text_loss": 0.22363591194152832 @@ -9802,13 +9802,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.158203125, "learning_rate": 0.0009932621879107648, - "loss": 0.0476, + "loss": 0.0475, "macro_f1": 0.3333333432674408, "num_tokens": 1664612.0, "repeat_count": 0.0, - "routers_loss": 0.002826537238433957, + "routers_loss": 0.0031781597062945366, "skip_count": 0.0, "step": 1032, "text_loss": 0.36083245277404785 @@ -9823,11 +9823,11 @@ "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.000993211452060295, - "loss": 0.0431, + "loss": 0.042, "macro_f1": 0.3272727429866791, "num_tokens": 1667467.0, "repeat_count": 0.0, - "routers_loss": 0.03491095453500748, + "routers_loss": 0.03595469892024994, "skip_count": 1.0, "step": 1034, "text_loss": 0.16372856497764587 @@ -9840,13 +9840,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.173828125, + "grad_norm": 0.189453125, "learning_rate": 0.000993160527210266, - "loss": 0.0616, + "loss": 0.061, "macro_f1": 0.3144654333591461, "num_tokens": 1670675.0, "repeat_count": 3.0, - "routers_loss": 0.1828247457742691, + "routers_loss": 0.1597205102443695, "skip_count": 0.0, "step": 1036, "text_loss": 0.6049913763999939 @@ -9859,13 +9859,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2099609375, + "grad_norm": 0.2197265625, "learning_rate": 0.000993109413380193, - "loss": 0.0563, + "loss": 0.0562, "macro_f1": 0.3333333432674408, "num_tokens": 1673477.0, "repeat_count": 0.0, - "routers_loss": 0.010931054130196571, + "routers_loss": 0.009756010957062244, "skip_count": 0.0, "step": 1038, "text_loss": 0.7034620642662048 @@ -9878,13 +9878,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.158203125, + "grad_norm": 0.1806640625, "learning_rate": 0.0009930581105896624, - "loss": 0.0569, + "loss": 0.0559, "macro_f1": 0.3272727429866791, "num_tokens": 1676809.0, "repeat_count": 0.0, - "routers_loss": 0.023222090676426888, + "routers_loss": 0.020718922838568687, "skip_count": 0.0, "step": 1040, "text_loss": 0.2814720571041107 @@ -9897,13 +9897,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.1923828125, "learning_rate": 0.0009930066188583338, - "loss": 0.0453, + "loss": 0.0445, "macro_f1": 0.32098764181137085, "num_tokens": 1679398.0, "repeat_count": 1.0, - "routers_loss": 0.07085686922073364, + "routers_loss": 0.04755603149533272, "skip_count": 1.0, "step": 1042, "text_loss": 0.5445759296417236 @@ -9916,13 +9916,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12353515625, + "grad_norm": 0.126953125, "learning_rate": 0.0009929549382059388, - "loss": 0.0515, + "loss": 0.0509, "macro_f1": 0.3333333432674408, "num_tokens": 1682269.0, "repeat_count": 0.0, - "routers_loss": 0.010158216580748558, + "routers_loss": 0.01040949858725071, "skip_count": 0.0, "step": 1044, "text_loss": 0.2876914143562317 @@ -9935,13 +9935,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.1259765625, "learning_rate": 0.0009929030686522816, - "loss": 0.0372, + "loss": 0.0363, "macro_f1": 0.3333333432674408, "num_tokens": 1685428.0, "repeat_count": 0.0, - "routers_loss": 0.007876895368099213, + "routers_loss": 0.008158888667821884, "skip_count": 0.0, "step": 1046, "text_loss": 0.49053525924682617 @@ -9954,13 +9954,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1630859375, "learning_rate": 0.0009928510102172386, - "loss": 0.0501, + "loss": 0.0498, "macro_f1": 0.3333333432674408, "num_tokens": 1688252.0, "repeat_count": 0.0, - "routers_loss": 0.004859173204749823, + "routers_loss": 0.005102572031319141, "skip_count": 0.0, "step": 1048, "text_loss": 0.5274341106414795 @@ -9973,13 +9973,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.17578125, + "grad_norm": 0.1591796875, "learning_rate": 0.0009927987629207587, - "loss": 0.0582, + "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1691289.0, "repeat_count": 0.0, - "routers_loss": 0.01798083633184433, + "routers_loss": 0.016768503934144974, "skip_count": 0.0, "step": 1050, "text_loss": 0.9935035109519958 @@ -9987,18 +9987,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 4.939242735544467, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1455078125, "learning_rate": 0.0009927463267828634, "loss": 0.0488, - "macro_f1": 0.3272727429866791, + "macro_f1": 0.3333333432674408, "num_tokens": 1694148.0, "repeat_count": 0.0, - "routers_loss": 0.014295363798737526, + "routers_loss": 0.010905829258263111, "skip_count": 0.0, "step": 1052, "text_loss": 0.20895758271217346 @@ -10011,13 +10011,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.1455078125, "learning_rate": 0.000992693701823646, - "loss": 0.0635, + "loss": 0.0624, "macro_f1": 0.3272727429866791, "num_tokens": 1698543.0, "repeat_count": 1.0, - "routers_loss": 0.1038367822766304, + "routers_loss": 0.10533971339464188, "skip_count": 0.0, "step": 1054, "text_loss": 0.5776236653327942 @@ -10030,13 +10030,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.255859375, "learning_rate": 0.0009926408880632726, - "loss": 0.057, + "loss": 0.0556, "macro_f1": 0.3272727429866791, "num_tokens": 1702460.0, "repeat_count": 0.0, - "routers_loss": 0.029780643060803413, + "routers_loss": 0.026313411071896553, "skip_count": 1.0, "step": 1056, "text_loss": 0.34990596771240234 @@ -10049,13 +10049,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10107421875, + "grad_norm": 0.099609375, "learning_rate": 0.0009925878855219818, - "loss": 0.0398, + "loss": 0.0391, "macro_f1": 0.3333333432674408, "num_tokens": 1705686.0, "repeat_count": 0.0, - "routers_loss": 0.008537676185369492, + "routers_loss": 0.007763393223285675, "skip_count": 0.0, "step": 1058, "text_loss": 0.4980163276195526 @@ -10068,13 +10068,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.171875, + "grad_norm": 0.177734375, "learning_rate": 0.000992534694220084, - "loss": 0.0617, + "loss": 0.0613, "macro_f1": 0.3272727429866791, "num_tokens": 1708739.0, "repeat_count": 0.0, - "routers_loss": 0.03966755419969559, + "routers_loss": 0.03998444974422455, "skip_count": 1.0, "step": 1060, "text_loss": 0.29092350602149963 @@ -10087,13 +10087,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1484375, + "grad_norm": 0.1572265625, "learning_rate": 0.000992481314177962, - "loss": 0.0311, + "loss": 0.0312, "macro_f1": 0.32098764181137085, "num_tokens": 1711903.0, "repeat_count": 1.0, - "routers_loss": 0.06651833653450012, + "routers_loss": 0.06966045498847961, "skip_count": 1.0, "step": 1062, "text_loss": 0.6267179250717163 @@ -10106,13 +10106,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2431640625, + "grad_norm": 0.244140625, "learning_rate": 0.0009924277454160717, - "loss": 0.0557, + "loss": 0.0548, "macro_f1": 0.3272727429866791, "num_tokens": 1715974.0, "repeat_count": 0.0, - "routers_loss": 0.05130369961261749, + "routers_loss": 0.05536063387989998, "skip_count": 1.0, "step": 1064, "text_loss": 0.5813798904418945 @@ -10125,13 +10125,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.134765625, "learning_rate": 0.0009923739879549402, - "loss": 0.0435, + "loss": 0.0423, "macro_f1": 0.3333333432674408, "num_tokens": 1718828.0, "repeat_count": 0.0, - "routers_loss": 0.020534176379442215, + "routers_loss": 0.020993782207369804, "skip_count": 0.0, "step": 1066, "text_loss": 0.22665327787399292 @@ -10144,13 +10144,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.0888671875, "learning_rate": 0.0009923200418151677, - "loss": 0.0305, + "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 1722419.0, "repeat_count": 0.0, - "routers_loss": 0.007514918688684702, + "routers_loss": 0.007351701147854328, "skip_count": 0.0, "step": 1068, "text_loss": 0.5796169638633728 @@ -10163,13 +10163,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.142578125, "learning_rate": 0.0009922659070174264, - "loss": 0.0461, + "loss": 0.0452, "macro_f1": 0.3272727429866791, "num_tokens": 1725663.0, "repeat_count": 1.0, - "routers_loss": 0.024598751217126846, + "routers_loss": 0.026033315807580948, "skip_count": 0.0, "step": 1070, "text_loss": 0.25742828845977783 @@ -10182,32 +10182,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.10595703125, "learning_rate": 0.0009922115835824612, - "loss": 0.0408, + "loss": 0.041, "macro_f1": 0.3333333432674408, "num_tokens": 1729239.0, "repeat_count": 0.0, - "routers_loss": 0.011866633780300617, + "routers_loss": 0.0118600158020854, "skip_count": 0.0, "step": 1072, "text_loss": 0.21630282700061798 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 5.042265923099501, - "f1_execute": 0.9818181991577148, - "f1_repeat": 0.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.12158203125, "learning_rate": 0.0009921570715310884, - "loss": 0.036, - "macro_f1": 0.3272727429866791, + "loss": 0.0364, + "macro_f1": 0.6666666865348816, "num_tokens": 1732507.0, "repeat_count": 1.0, - "routers_loss": 0.01755746826529503, + "routers_loss": 0.016118815168738365, "skip_count": 0.0, "step": 1074, "text_loss": 0.5639925003051758 @@ -10220,13 +10220,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.0791015625, "learning_rate": 0.0009921023708841974, - "loss": 0.0415, + "loss": 0.0407, "macro_f1": 0.3333333432674408, "num_tokens": 1736182.0, "repeat_count": 0.0, - "routers_loss": 0.003976983483880758, + "routers_loss": 0.004275390412658453, "skip_count": 0.0, "step": 1076, "text_loss": 0.5758615136146545 @@ -10239,13 +10239,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.103515625, + "grad_norm": 0.1103515625, "learning_rate": 0.0009920474816627496, - "loss": 0.0378, + "loss": 0.037, "macro_f1": 0.3333333432674408, "num_tokens": 1739559.0, "repeat_count": 0.0, - "routers_loss": 0.013548235408961773, + "routers_loss": 0.01299292128533125, "skip_count": 0.0, "step": 1078, "text_loss": 0.18221625685691833 @@ -10258,13 +10258,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1630859375, "learning_rate": 0.0009919924038877788, "loss": 0.0343, "macro_f1": 0.32098764181137085, "num_tokens": 1742890.0, "repeat_count": 0.0, - "routers_loss": 0.03923165053129196, + "routers_loss": 0.038295745849609375, "skip_count": 2.0, "step": 1080, "text_loss": 0.17354349792003632 @@ -10277,13 +10277,13 @@ "f1_execute": 0.9583333134651184, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.1923828125, + "grad_norm": 0.1884765625, "learning_rate": 0.0009919371375803905, - "loss": 0.0464, + "loss": 0.0455, "macro_f1": 0.8194444179534912, "num_tokens": 1746433.0, "repeat_count": 2.0, - "routers_loss": 0.046429626643657684, + "routers_loss": 0.04052971675992012, "skip_count": 3.0, "step": 1082, "text_loss": 0.2250112146139145 @@ -10296,13 +10296,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1025390625, + "grad_norm": 0.10595703125, "learning_rate": 0.0009918816827617632, - "loss": 0.0346, + "loss": 0.0353, "macro_f1": 0.3333333432674408, "num_tokens": 1750802.0, "repeat_count": 0.0, - "routers_loss": 0.008998732082545757, + "routers_loss": 0.009114136919379234, "skip_count": 0.0, "step": 1084, "text_loss": 0.2526719272136688 @@ -10315,13 +10315,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1279296875, "learning_rate": 0.000991826039453147, - "loss": 0.0386, + "loss": 0.0392, "macro_f1": 0.3333333432674408, "num_tokens": 1754272.0, "repeat_count": 0.0, - "routers_loss": 0.005173585377633572, + "routers_loss": 0.004904678091406822, "skip_count": 0.0, "step": 1086, "text_loss": 0.7308789491653442 @@ -10334,13 +10334,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.138671875, "learning_rate": 0.000991770207675865, - "loss": 0.0308, + "loss": 0.0327, "macro_f1": 0.6666666865348816, "num_tokens": 1757231.0, "repeat_count": 0.0, - "routers_loss": 0.024098891764879227, + "routers_loss": 0.02129189297556877, "skip_count": 2.0, "step": 1088, "text_loss": 0.21764220297336578 @@ -10353,13 +10353,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1611328125, "learning_rate": 0.0009917141874513113, "loss": 0.0315, "macro_f1": 0.3333333432674408, "num_tokens": 1760003.0, "repeat_count": 0.0, - "routers_loss": 0.014002764597535133, + "routers_loss": 0.01310618408024311, "skip_count": 0.0, "step": 1090, "text_loss": 0.33892181515693665 @@ -10372,32 +10372,32 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.14453125, + "grad_norm": 0.171875, "learning_rate": 0.0009916579788009537, - "loss": 0.0462, + "loss": 0.0457, "macro_f1": 0.5492662787437439, "num_tokens": 1763052.0, "repeat_count": 0.0, - "routers_loss": 0.017871137708425522, + "routers_loss": 0.02059309557080269, "skip_count": 2.0, "step": 1092, "text_loss": 0.6551769375801086 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 5.136190196653947, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1044921875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10546875, "learning_rate": 0.0009916015817463312, "loss": 0.0385, - "macro_f1": 0.32098764181137085, + "macro_f1": 0.5492662787437439, "num_tokens": 1766655.0, "repeat_count": 0.0, - "routers_loss": 0.033123619854450226, + "routers_loss": 0.0274797435849905, "skip_count": 2.0, "step": 1094, "text_loss": 0.3984372019767761 @@ -10410,13 +10410,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.11181640625, "learning_rate": 0.000991544996309055, - "loss": 0.0267, + "loss": 0.0271, "macro_f1": 0.3333333432674408, "num_tokens": 1769997.0, "repeat_count": 0.0, - "routers_loss": 0.01279227901250124, + "routers_loss": 0.01437368243932724, "skip_count": 0.0, "step": 1096, "text_loss": 0.4203338921070099 @@ -10429,13 +10429,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1103515625, "learning_rate": 0.000991488222510809, - "loss": 0.0295, + "loss": 0.0292, "macro_f1": 0.3333333432674408, "num_tokens": 1773130.0, "repeat_count": 0.0, - "routers_loss": 0.001354650012217462, + "routers_loss": 0.001382062560878694, "skip_count": 0.0, "step": 1098, "text_loss": 0.43132516741752625 @@ -10448,13 +10448,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.123046875, "learning_rate": 0.000991431260373349, - "loss": 0.0326, + "loss": 0.0329, "macro_f1": 0.3144654333591461, "num_tokens": 1775682.0, "repeat_count": 1.0, - "routers_loss": 0.1097714751958847, + "routers_loss": 0.1115434318780899, "skip_count": 2.0, "step": 1100, "text_loss": 0.3218227028846741 @@ -10467,13 +10467,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.111328125, "learning_rate": 0.000991374109918503, - "loss": 0.0187, + "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 1778407.0, "repeat_count": 0.0, - "routers_loss": 0.009649592451751232, + "routers_loss": 0.009529678151011467, "skip_count": 0.0, "step": 1102, "text_loss": 0.17183731496334076 @@ -10486,13 +10486,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.11083984375, + "grad_norm": 0.1142578125, "learning_rate": 0.000991316771168171, - "loss": 0.0447, + "loss": 0.044, "macro_f1": 0.5492662787437439, "num_tokens": 1781518.0, "repeat_count": 0.0, - "routers_loss": 0.020858706906437874, + "routers_loss": 0.018668074160814285, "skip_count": 2.0, "step": 1104, "text_loss": 1.1324785947799683 @@ -10505,13 +10505,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.125, "learning_rate": 0.0009912592441443258, - "loss": 0.0428, + "loss": 0.0411, "macro_f1": 0.3272727429866791, "num_tokens": 1784878.0, "repeat_count": 0.0, - "routers_loss": 0.048101235181093216, + "routers_loss": 0.04145100712776184, "skip_count": 1.0, "step": 1106, "text_loss": 0.6082063317298889 @@ -10524,13 +10524,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.08984375, "learning_rate": 0.0009912015288690112, - "loss": 0.0435, + "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1788978.0, "repeat_count": 0.0, - "routers_loss": 0.02875671721994877, + "routers_loss": 0.021450644358992577, "skip_count": 1.0, "step": 1108, "text_loss": 0.5597621202468872 @@ -10543,13 +10543,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08349609375, + "grad_norm": 0.083984375, "learning_rate": 0.0009911436253643444, - "loss": 0.0247, + "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 1792321.0, "repeat_count": 0.0, - "routers_loss": 0.019005145877599716, + "routers_loss": 0.017405325546860695, "skip_count": 0.0, "step": 1110, "text_loss": 0.2560598850250244 @@ -10562,13 +10562,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.2294921875, "learning_rate": 0.0009910855336525137, - "loss": 0.0393, + "loss": 0.0383, "macro_f1": 0.3333333432674408, "num_tokens": 1795182.0, "repeat_count": 0.0, - "routers_loss": 0.007238700054585934, + "routers_loss": 0.007162237539887428, "skip_count": 0.0, "step": 1112, "text_loss": 0.3438240587711334 @@ -10581,13 +10581,13 @@ "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.125, + "grad_norm": 0.115234375, "learning_rate": 0.00099102725375578, "loss": 0.0326, "macro_f1": 0.480392187833786, "num_tokens": 1798987.0, "repeat_count": 1.0, - "routers_loss": 0.12206140905618668, + "routers_loss": 0.11149197816848755, "skip_count": 3.0, "step": 1114, "text_loss": 0.20455503463745117 @@ -10595,18 +10595,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 5.239506897563839, - "f1_execute": 0.8799999952316284, + "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.10791015625, "learning_rate": 0.0009909687856964767, - "loss": 0.0366, - "macro_f1": 0.29333335161209106, + "loss": 0.035, + "macro_f1": 0.3006536364555359, "num_tokens": 1802064.0, "repeat_count": 2.0, - "routers_loss": 0.15721899271011353, + "routers_loss": 0.12679415941238403, "skip_count": 3.0, "step": 1116, "text_loss": 0.11996729671955109 @@ -10619,32 +10619,32 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.125, + "grad_norm": 0.12451171875, "learning_rate": 0.0009909101294970082, - "loss": 0.0366, + "loss": 0.0365, "macro_f1": 0.5492662787437439, "num_tokens": 1805412.0, "repeat_count": 0.0, - "routers_loss": 0.05058665946125984, + "routers_loss": 0.05108053982257843, "skip_count": 2.0, "step": 1118, "text_loss": 0.13224145770072937 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 5.258291752274729, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "f1_skip": 1.0, + "grad_norm": 0.123046875, "learning_rate": 0.0009908512851798522, - "loss": 0.0454, - "macro_f1": 0.32098764181137085, + "loss": 0.0455, + "macro_f1": 0.6603773832321167, "num_tokens": 1808196.0, "repeat_count": 1.0, - "routers_loss": 0.023021472617983818, + "routers_loss": 0.02131766639649868, "skip_count": 1.0, "step": 1120, "text_loss": 0.7824069261550903 @@ -10657,13 +10657,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1435546875, + "grad_norm": 0.138671875, "learning_rate": 0.0009907922527675576, - "loss": 0.0409, + "loss": 0.0405, "macro_f1": 0.3333333432674408, "num_tokens": 1811622.0, "repeat_count": 0.0, - "routers_loss": 0.006660689599812031, + "routers_loss": 0.006226244382560253, "skip_count": 0.0, "step": 1122, "text_loss": 0.5419743061065674 @@ -10676,13 +10676,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.146484375, + "grad_norm": 0.12890625, "learning_rate": 0.000990733032282746, - "loss": 0.0547, + "loss": 0.0535, "macro_f1": 0.5492662787437439, "num_tokens": 1814628.0, "repeat_count": 0.0, - "routers_loss": 0.031727343797683716, + "routers_loss": 0.03088250942528248, "skip_count": 2.0, "step": 1124, "text_loss": 0.37100958824157715 @@ -10695,13 +10695,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.0810546875, "learning_rate": 0.000990673623748111, - "loss": 0.0351, + "loss": 0.0348, "macro_f1": 0.32098767161369324, "num_tokens": 1817205.0, "repeat_count": 0.0, - "routers_loss": 0.06140992045402527, + "routers_loss": 0.05495348572731018, "skip_count": 1.0, "step": 1126, "text_loss": 0.20241330564022064 @@ -10709,18 +10709,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.5, - "avg_layers": 25.0, + "avg_layers": 26.0, "epoch": 5.295861461696507, - "f1_execute": 0.9411764740943909, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.4000000059604645, - "grad_norm": 0.09814453125, + "f1_skip": 0.5, + "grad_norm": 0.0927734375, "learning_rate": 0.0009906140271864173, - "loss": 0.0436, - "macro_f1": 0.44705885648727417, + "loss": 0.0433, + "macro_f1": 0.4871794879436493, "num_tokens": 1820141.0, "repeat_count": 0.0, - "routers_loss": 0.03872275352478027, + "routers_loss": 0.037809282541275024, "skip_count": 2.0, "step": 1128, "text_loss": 0.32965806126594543 @@ -10728,18 +10728,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 5.305253889051952, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09228515625, + "grad_norm": 0.0908203125, "learning_rate": 0.0009905542426205032, - "loss": 0.0353, - "macro_f1": 0.3272727429866791, + "loss": 0.0348, + "macro_f1": 0.32098767161369324, "num_tokens": 1824011.0, "repeat_count": 0.0, - "routers_loss": 0.031013142317533493, + "routers_loss": 0.03320181369781494, "skip_count": 1.0, "step": 1130, "text_loss": 0.36329755187034607 @@ -10752,13 +10752,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1123046875, + "grad_norm": 0.10595703125, "learning_rate": 0.0009904942700732777, - "loss": 0.0333, + "loss": 0.0335, "macro_f1": 0.3333333432674408, "num_tokens": 1826873.0, "repeat_count": 0.0, - "routers_loss": 0.004357635974884033, + "routers_loss": 0.004102326463907957, "skip_count": 0.0, "step": 1132, "text_loss": 0.6692602038383484 @@ -10771,13 +10771,13 @@ "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11279296875, + "grad_norm": 0.08544921875, "learning_rate": 0.0009904341095677226, "loss": 0.03, "macro_f1": 0.29333335161209106, "num_tokens": 1830103.0, "repeat_count": 2.0, - "routers_loss": 0.2376353144645691, + "routers_loss": 0.2376193106174469, "skip_count": 4.0, "step": 1134, "text_loss": 0.19212862849235535 @@ -10790,13 +10790,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10888671875, + "grad_norm": 0.119140625, "learning_rate": 0.0009903737611268919, - "loss": 0.0446, + "loss": 0.0445, "macro_f1": 0.3333333432674408, "num_tokens": 1833201.0, "repeat_count": 0.0, - "routers_loss": 0.004978097043931484, + "routers_loss": 0.005253395065665245, "skip_count": 0.0, "step": 1136, "text_loss": 0.6773360371589661 @@ -10809,13 +10809,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10546875, + "grad_norm": 0.09814453125, "learning_rate": 0.0009903132247739107, - "loss": 0.0309, + "loss": 0.0305, "macro_f1": 0.3076923191547394, "num_tokens": 1836045.0, "repeat_count": 1.0, - "routers_loss": 0.14195409417152405, + "routers_loss": 0.14382585883140564, "skip_count": 3.0, "step": 1138, "text_loss": 0.2882297933101654 @@ -10828,13 +10828,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.15234375, + "grad_norm": 0.150390625, "learning_rate": 0.0009902525005319766, - "loss": 0.0403, + "loss": 0.04, "macro_f1": 0.5427350401878357, "num_tokens": 1839721.0, "repeat_count": 1.0, - "routers_loss": 0.04005253314971924, + "routers_loss": 0.04033960774540901, "skip_count": 2.0, "step": 1140, "text_loss": 0.7172559499740601 @@ -10847,13 +10847,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.12109375, "learning_rate": 0.0009901915884243597, - "loss": 0.0353, + "loss": 0.0351, "macro_f1": 0.6666666865348816, "num_tokens": 1842614.0, "repeat_count": 1.0, - "routers_loss": 0.006839688867330551, + "routers_loss": 0.005162308923900127, "skip_count": 0.0, "step": 1142, "text_loss": 0.42892804741859436 @@ -10866,13 +10866,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1240234375, "learning_rate": 0.0009901304884744014, - "loss": 0.0396, + "loss": 0.0386, "macro_f1": 0.3144654333591461, "num_tokens": 1845444.0, "repeat_count": 1.0, - "routers_loss": 0.10174567997455597, + "routers_loss": 0.10117656737565994, "skip_count": 2.0, "step": 1144, "text_loss": 0.20806430280208588 @@ -10885,13 +10885,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.130859375, "learning_rate": 0.0009900692007055152, - "loss": 0.0365, + "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 1848558.0, "repeat_count": 0.0, - "routers_loss": 0.014655748382210732, + "routers_loss": 0.014107038266956806, "skip_count": 0.0, "step": 1146, "text_loss": 0.5355974435806274 @@ -10904,13 +10904,13 @@ "f1_execute": 0.9166666865348816, "f1_repeat": 0.4000000059604645, "f1_skip": 0.6666666865348816, - "grad_norm": 0.158203125, + "grad_norm": 0.16015625, "learning_rate": 0.000990007725141187, - "loss": 0.0467, + "loss": 0.0449, "macro_f1": 0.6611111164093018, "num_tokens": 1852723.0, "repeat_count": 4.0, - "routers_loss": 0.16960746049880981, + "routers_loss": 0.15537866950035095, "skip_count": 2.0, "step": 1148, "text_loss": 0.6388513445854187 @@ -10923,32 +10923,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1220703125, + "grad_norm": 0.1181640625, "learning_rate": 0.0009899460618049741, - "loss": 0.0399, + "loss": 0.0397, "macro_f1": 0.3333333432674408, "num_tokens": 1856181.0, "repeat_count": 0.0, - "routers_loss": 0.011591178365051746, + "routers_loss": 0.011800912208855152, "skip_count": 0.0, "step": 1150, "text_loss": 0.6113069653511047 }, { - "acc_repeat": 0.5, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 30.0, "epoch": 5.408570589961843, - "f1_execute": 0.9811320900917053, - "f1_repeat": 0.6666666865348816, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09912109375, + "grad_norm": 0.1005859375, "learning_rate": 0.000989884210720506, - "loss": 0.0332, - "macro_f1": 0.5492662787437439, + "loss": 0.0331, + "macro_f1": 0.6666666865348816, "num_tokens": 1859685.0, "repeat_count": 2.0, - "routers_loss": 0.04036068916320801, + "routers_loss": 0.022900646552443504, "skip_count": 0.0, "step": 1152, "text_loss": 0.25718021392822266 @@ -10961,13 +10961,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12353515625, + "grad_norm": 0.10595703125, "learning_rate": 0.0009898221719114844, - "loss": 0.0366, + "loss": 0.0354, "macro_f1": 0.3272727429866791, "num_tokens": 1862505.0, "repeat_count": 0.0, - "routers_loss": 0.030165785923600197, + "routers_loss": 0.026814989745616913, "skip_count": 1.0, "step": 1154, "text_loss": 0.5426549911499023 @@ -10980,13 +10980,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0966796875, + "grad_norm": 0.1015625, "learning_rate": 0.0009897599454016823, - "loss": 0.0421, + "loss": 0.0401, "macro_f1": 0.3333333432674408, "num_tokens": 1866266.0, "repeat_count": 0.0, - "routers_loss": 0.003615695284679532, + "routers_loss": 0.0032623792067170143, "skip_count": 0.0, "step": 1156, "text_loss": 0.37752896547317505 @@ -10999,13 +10999,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.07080078125, "learning_rate": 0.0009896975312149454, - "loss": 0.0377, + "loss": 0.0369, "macro_f1": 0.3333333432674408, "num_tokens": 1870216.0, "repeat_count": 0.0, - "routers_loss": 0.01679840311408043, + "routers_loss": 0.015617577359080315, "skip_count": 0.0, "step": 1158, "text_loss": 0.18207129836082458 @@ -11018,13 +11018,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.11669921875, "learning_rate": 0.0009896349293751906, - "loss": 0.0422, + "loss": 0.0423, "macro_f1": 0.3272727429866791, "num_tokens": 1873338.0, "repeat_count": 0.0, - "routers_loss": 0.024936161935329437, + "routers_loss": 0.02250153198838234, "skip_count": 1.0, "step": 1160, "text_loss": 0.548884391784668 @@ -11037,13 +11037,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.1484375, "learning_rate": 0.0009895721399064072, - "loss": 0.0407, + "loss": 0.0388, "macro_f1": 0.32098764181137085, "num_tokens": 1876470.0, "repeat_count": 1.0, - "routers_loss": 0.06472968310117722, + "routers_loss": 0.055204521864652634, "skip_count": 1.0, "step": 1162, "text_loss": 0.48052409291267395 @@ -11056,13 +11056,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.07373046875, "learning_rate": 0.0009895091628326564, - "loss": 0.031, + "loss": 0.0293, "macro_f1": 0.3333333432674408, "num_tokens": 1879354.0, "repeat_count": 0.0, - "routers_loss": 0.009633494541049004, + "routers_loss": 0.009093789383769035, "skip_count": 0.0, "step": 1164, "text_loss": 0.3908069431781769 @@ -11075,13 +11075,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.126953125, + "grad_norm": 0.140625, "learning_rate": 0.000989445998178071, "loss": 0.0323, "macro_f1": 0.3272727429866791, "num_tokens": 1881941.0, "repeat_count": 0.0, - "routers_loss": 0.01458993274718523, + "routers_loss": 0.015086972154676914, "skip_count": 1.0, "step": 1166, "text_loss": 0.4884725511074066 @@ -11094,13 +11094,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.134765625, "learning_rate": 0.0009893826459668558, - "loss": 0.0389, + "loss": 0.0386, "macro_f1": 0.3144654333591461, "num_tokens": 1885374.0, "repeat_count": 0.0, - "routers_loss": 0.06636982411146164, + "routers_loss": 0.06587666273117065, "skip_count": 3.0, "step": 1168, "text_loss": 0.12760137021541595 @@ -11113,13 +11113,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1591796875, "learning_rate": 0.0009893191062232873, - "loss": 0.0325, + "loss": 0.0322, "macro_f1": 0.3333333432674408, "num_tokens": 1888612.0, "repeat_count": 0.0, - "routers_loss": 0.005644182674586773, + "routers_loss": 0.006088624242693186, "skip_count": 0.0, "step": 1170, "text_loss": 0.4821319580078125 @@ -11132,13 +11132,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.1279296875, "learning_rate": 0.0009892553789717143, - "loss": 0.0402, + "loss": 0.0389, "macro_f1": 0.3333333432674408, "num_tokens": 1891463.0, "repeat_count": 0.0, - "routers_loss": 0.010273848660290241, + "routers_loss": 0.010113578289747238, "skip_count": 0.0, "step": 1172, "text_loss": 0.3613642454147339 @@ -11151,13 +11151,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.099609375, + "grad_norm": 0.1025390625, "learning_rate": 0.0009891914642365573, - "loss": 0.0415, + "loss": 0.0404, "macro_f1": 0.3333333432674408, "num_tokens": 1894230.0, "repeat_count": 0.0, - "routers_loss": 0.004529652185738087, + "routers_loss": 0.004947459790855646, "skip_count": 0.0, "step": 1174, "text_loss": 0.5037549138069153 @@ -11170,13 +11170,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2236328125, + "grad_norm": 0.1572265625, "learning_rate": 0.0009891273620423083, - "loss": 0.045, + "loss": 0.0428, "macro_f1": 0.3272727429866791, "num_tokens": 1897294.0, "repeat_count": 1.0, - "routers_loss": 0.024671228602528572, + "routers_loss": 0.026075217872858047, "skip_count": 0.0, "step": 1176, "text_loss": 0.32558977603912354 @@ -11189,13 +11189,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.12158203125, "learning_rate": 0.0009890630724135314, - "loss": 0.0354, + "loss": 0.0351, "macro_f1": 0.3272727429866791, "num_tokens": 1901553.0, "repeat_count": 0.0, - "routers_loss": 0.06466450542211533, + "routers_loss": 0.06650999188423157, "skip_count": 1.0, "step": 1178, "text_loss": 0.23473620414733887 @@ -11208,13 +11208,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1767578125, + "grad_norm": 0.1474609375, "learning_rate": 0.0009889985953748625, - "loss": 0.0278, + "loss": 0.0268, "macro_f1": 0.6666666865348816, "num_tokens": 1904556.0, "repeat_count": 0.0, - "routers_loss": 0.010566026903688908, + "routers_loss": 0.010361116379499435, "skip_count": 1.0, "step": 1180, "text_loss": 0.6927042007446289 @@ -11227,13 +11227,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1103515625, + "grad_norm": 0.103515625, "learning_rate": 0.0009889339309510094, - "loss": 0.037, + "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 1908053.0, "repeat_count": 0.0, - "routers_loss": 0.013842248357832432, + "routers_loss": 0.013286533765494823, "skip_count": 0.0, "step": 1182, "text_loss": 0.19977325201034546 @@ -11246,13 +11246,13 @@ "f1_execute": 0.9387754797935486, "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, - "grad_norm": 0.07373046875, + "grad_norm": 0.058837890625, "learning_rate": 0.0009888690791667518, - "loss": 0.0215, + "loss": 0.0204, "macro_f1": 0.7018141150474548, "num_tokens": 1911754.0, "repeat_count": 2.0, - "routers_loss": 0.122759610414505, + "routers_loss": 0.11920545995235443, "skip_count": 3.0, "step": 1184, "text_loss": 0.4072858691215515 @@ -11265,32 +11265,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.11083984375, "learning_rate": 0.0009888040400469408, - "loss": 0.0402, + "loss": 0.0391, "macro_f1": 0.3272727429866791, "num_tokens": 1914862.0, "repeat_count": 0.0, - "routers_loss": 0.035315629094839096, + "routers_loss": 0.03652849420905113, "skip_count": 1.0, "step": 1186, "text_loss": 0.2654043138027191 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 5.577634282359847, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1689453125, "learning_rate": 0.0009887388136164996, - "loss": 0.034, - "macro_f1": 0.32098764181137085, + "loss": 0.0336, + "macro_f1": 0.5492662787437439, "num_tokens": 1918542.0, "repeat_count": 0.0, - "routers_loss": 0.040048226714134216, + "routers_loss": 0.03991910070180893, "skip_count": 2.0, "step": 1188, "text_loss": 0.21130657196044922 @@ -11298,18 +11298,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 5.587026709715292, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1005859375, + "grad_norm": 0.09521484375, "learning_rate": 0.000988673399900423, - "loss": 0.044, - "macro_f1": 0.3333333432674408, + "loss": 0.0429, + "macro_f1": 0.3272727429866791, "num_tokens": 1921589.0, "repeat_count": 0.0, - "routers_loss": 0.012814820744097233, + "routers_loss": 0.014900135807693005, "skip_count": 0.0, "step": 1190, "text_loss": 0.5519335865974426 @@ -11322,13 +11322,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.1884765625, "learning_rate": 0.0009886077989237777, - "loss": 0.0407, + "loss": 0.0405, "macro_f1": 0.3272727429866791, "num_tokens": 1924320.0, "repeat_count": 0.0, - "routers_loss": 0.05977959558367729, + "routers_loss": 0.06271552294492722, "skip_count": 1.0, "step": 1192, "text_loss": 0.213813915848732 @@ -11341,13 +11341,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, - "grad_norm": 0.1533203125, + "grad_norm": 0.1875, "learning_rate": 0.000988542010711702, - "loss": 0.0334, + "loss": 0.0342, "macro_f1": 0.6225374937057495, "num_tokens": 1927178.0, "repeat_count": 0.0, - "routers_loss": 0.031448643654584885, + "routers_loss": 0.03081391751766205, "skip_count": 5.0, "step": 1194, "text_loss": 0.7524349093437195 @@ -11360,13 +11360,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.265625, + "grad_norm": 0.255859375, "learning_rate": 0.0009884760352894064, - "loss": 0.0523, + "loss": 0.0518, "macro_f1": 0.3333333432674408, "num_tokens": 1930216.0, "repeat_count": 0.0, - "routers_loss": 0.008164947852492332, + "routers_loss": 0.008556773886084557, "skip_count": 0.0, "step": 1196, "text_loss": 0.28230375051498413 @@ -11379,32 +11379,32 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.5, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.1064453125, "learning_rate": 0.0009884098726821726, - "loss": 0.0478, + "loss": 0.0472, "macro_f1": 0.4871794879436493, "num_tokens": 1933312.0, "repeat_count": 3.0, - "routers_loss": 0.04045635461807251, + "routers_loss": 0.05344727262854576, "skip_count": 0.0, "step": 1198, "text_loss": 0.5509607195854187 }, { "acc_repeat": 0.0, - "acc_skip": 0.6666666865348816, - "avg_layers": 26.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, "epoch": 5.633988846492516, - "f1_execute": 0.9600000381469727, + "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, - "f1_skip": 0.800000011920929, - "grad_norm": 0.1240234375, + "f1_skip": 0.5, + "grad_norm": 0.1298828125, "learning_rate": 0.000988343522915354, - "loss": 0.0447, - "macro_f1": 0.5866667032241821, + "loss": 0.0441, + "macro_f1": 0.480392187833786, "num_tokens": 1936160.0, "repeat_count": 1.0, - "routers_loss": 0.06872973591089249, + "routers_loss": 0.07324771583080292, "skip_count": 3.0, "step": 1200, "text_loss": 0.30565372109413147 @@ -11412,18 +11412,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, - "avg_layers": 24.0, + "avg_layers": 25.0, "epoch": 5.64338127384796, - "f1_execute": 0.8695651888847351, + "f1_execute": 0.8936169743537903, "f1_repeat": 0.0, - "f1_skip": 0.4000000059604645, - "grad_norm": 0.25390625, + "f1_skip": 0.444444477558136, + "grad_norm": 0.2470703125, "learning_rate": 0.0009882769860143764, - "loss": 0.0331, - "macro_f1": 0.4231884181499481, + "loss": 0.0317, + "macro_f1": 0.4460204839706421, "num_tokens": 1939266.0, "repeat_count": 0.0, - "routers_loss": 0.20964151620864868, + "routers_loss": 0.18620699644088745, "skip_count": 6.0, "step": 1202, "text_loss": 0.976121723651886 @@ -11442,26 +11442,26 @@ "macro_f1": 0.6666666865348816, "num_tokens": 1942173.0, "repeat_count": 0.0, - "routers_loss": 0.00690250750631094, + "routers_loss": 0.007703613489866257, "skip_count": 1.0, "step": 1204, "text_loss": 0.5647401809692383 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 5.66216612855885, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.14453125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1484375, "learning_rate": 0.0009881433509120036, - "loss": 0.0372, - "macro_f1": 0.32098764181137085, + "loss": 0.0376, + "macro_f1": 0.5492662787437439, "num_tokens": 1945071.0, "repeat_count": 0.0, - "routers_loss": 0.022315658628940582, + "routers_loss": 0.02162683941423893, "skip_count": 2.0, "step": 1206, "text_loss": 0.24229218065738678 @@ -11474,13 +11474,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1083984375, + "grad_norm": 0.0966796875, "learning_rate": 0.0009880762527618176, - "loss": 0.0388, + "loss": 0.0383, "macro_f1": 0.3333333432674408, "num_tokens": 1949060.0, "repeat_count": 0.0, - "routers_loss": 0.017015069723129272, + "routers_loss": 0.017667081207036972, "skip_count": 0.0, "step": 1208, "text_loss": 0.4035970866680145 @@ -11493,13 +11493,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.154296875, "learning_rate": 0.0009880089675798908, - "loss": 0.0372, + "loss": 0.0367, "macro_f1": 0.3333333432674408, "num_tokens": 1951698.0, "repeat_count": 0.0, - "routers_loss": 0.006532609928399324, + "routers_loss": 0.006405784282833338, "skip_count": 0.0, "step": 1210, "text_loss": 0.5319879055023193 @@ -11512,13 +11512,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.09814453125, "learning_rate": 0.0009879414953920071, - "loss": 0.0301, + "loss": 0.0294, "macro_f1": 0.3333333432674408, "num_tokens": 1955266.0, "repeat_count": 0.0, - "routers_loss": 0.009720963425934315, + "routers_loss": 0.009859707206487656, "skip_count": 0.0, "step": 1212, "text_loss": 0.6687407493591309 @@ -11531,32 +11531,32 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1240234375, + "grad_norm": 0.130859375, "learning_rate": 0.0009878738362240219, - "loss": 0.046, + "loss": 0.045, "macro_f1": 0.5492662787437439, "num_tokens": 1958538.0, "repeat_count": 0.0, - "routers_loss": 0.03176085278391838, + "routers_loss": 0.030890554189682007, "skip_count": 2.0, "step": 1214, "text_loss": 0.20820017158985138 }, { "acc_repeat": 0.5, - "acc_skip": 0.5, - "avg_layers": 29.0, + "acc_skip": 0.0, + "avg_layers": 30.0, "epoch": 5.709128265336073, - "f1_execute": 0.9387754797935486, + "f1_execute": 0.9200000166893005, "f1_repeat": 0.5, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.2021484375, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, "learning_rate": 0.000987805990101862, - "loss": 0.0323, - "macro_f1": 0.7018141150474548, + "loss": 0.0317, + "macro_f1": 0.47333335876464844, "num_tokens": 1961419.0, "repeat_count": 2.0, - "routers_loss": 0.08626245707273483, + "routers_loss": 0.10383198410272598, "skip_count": 2.0, "step": 1216, "text_loss": 0.8664976358413696 @@ -11569,13 +11569,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1240234375, + "grad_norm": 0.1435546875, "learning_rate": 0.0009877379570515268, - "loss": 0.0374, + "loss": 0.0366, "macro_f1": 0.3333333432674408, "num_tokens": 1964836.0, "repeat_count": 0.0, - "routers_loss": 0.012099343352019787, + "routers_loss": 0.013376163318753242, "skip_count": 0.0, "step": 1218, "text_loss": 0.4223395884037018 @@ -11588,13 +11588,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.0859375, "learning_rate": 0.0009876697370990865, - "loss": 0.0342, + "loss": 0.0343, "macro_f1": 0.3333333432674408, "num_tokens": 1967620.0, "repeat_count": 0.0, - "routers_loss": 0.007713846862316132, + "routers_loss": 0.008577900938689709, "skip_count": 0.0, "step": 1220, "text_loss": 0.4789901375770569 @@ -11607,13 +11607,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.1728515625, "learning_rate": 0.0009876013302706828, - "loss": 0.0499, + "loss": 0.049, "macro_f1": 0.3333333432674408, "num_tokens": 1971100.0, "repeat_count": 0.0, - "routers_loss": 0.004629489034414291, + "routers_loss": 0.004730266984552145, "skip_count": 0.0, "step": 1222, "text_loss": 0.6799837946891785 @@ -11626,13 +11626,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08837890625, + "grad_norm": 0.08349609375, "learning_rate": 0.0009875327365925295, - "loss": 0.035, + "loss": 0.0341, "macro_f1": 0.3333333432674408, "num_tokens": 1974408.0, "repeat_count": 0.0, - "routers_loss": 0.010654795914888382, + "routers_loss": 0.010849526152014732, "skip_count": 0.0, "step": 1224, "text_loss": 0.18967926502227783 @@ -11640,18 +11640,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 26.0, + "avg_layers": 27.0, "epoch": 5.756090402113296, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.169921875, "learning_rate": 0.0009874639560909118, - "loss": 0.0516, - "macro_f1": 0.31446540355682373, + "loss": 0.0498, + "macro_f1": 0.32098767161369324, "num_tokens": 1977046.0, "repeat_count": 0.0, - "routers_loss": 0.05963074415922165, + "routers_loss": 0.04841252416372299, "skip_count": 1.0, "step": 1226, "text_loss": 0.6133310198783875 @@ -11664,13 +11664,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1328125, + "grad_norm": 0.1318359375, "learning_rate": 0.0009873949887921867, - "loss": 0.04, + "loss": 0.0402, "macro_f1": 0.3272727429866791, "num_tokens": 1980330.0, "repeat_count": 0.0, - "routers_loss": 0.028920643031597137, + "routers_loss": 0.029638588428497314, "skip_count": 1.0, "step": 1228, "text_loss": 0.15649555623531342 @@ -11678,18 +11678,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 5.774875256824186, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10595703125, + "grad_norm": 0.1103515625, "learning_rate": 0.0009873258347227823, - "loss": 0.0327, - "macro_f1": 0.3333333432674408, + "loss": 0.0331, + "macro_f1": 0.3272727429866791, "num_tokens": 1983173.0, "repeat_count": 0.0, - "routers_loss": 0.006852717138826847, + "routers_loss": 0.009955910965800285, "skip_count": 0.0, "step": 1230, "text_loss": 0.4741005599498749 @@ -11702,13 +11702,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.0849609375, "learning_rate": 0.0009872564939091989, - "loss": 0.0346, + "loss": 0.0342, "macro_f1": 0.3333333432674408, "num_tokens": 1986825.0, "repeat_count": 0.0, - "routers_loss": 0.010968753136694431, + "routers_loss": 0.010205300524830818, "skip_count": 0.0, "step": 1232, "text_loss": 0.5315462350845337 @@ -11721,13 +11721,13 @@ "f1_execute": 0.9302325248718262, "f1_repeat": 1.0, "f1_skip": 0.7272727489471436, - "grad_norm": 0.1240234375, + "grad_norm": 0.11865234375, "learning_rate": 0.0009871869663780077, - "loss": 0.0344, + "loss": 0.0336, "macro_f1": 0.8858351111412048, "num_tokens": 1990448.0, "repeat_count": 1.0, - "routers_loss": 0.0906950980424881, + "routers_loss": 0.09120134264230728, "skip_count": 7.0, "step": 1234, "text_loss": 0.6187508702278137 @@ -11740,13 +11740,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.125, "learning_rate": 0.0009871172521558522, - "loss": 0.0484, + "loss": 0.0475, "macro_f1": 0.6666666865348816, "num_tokens": 1993474.0, "repeat_count": 0.0, - "routers_loss": 0.016306072473526, + "routers_loss": 0.016188839450478554, "skip_count": 1.0, "step": 1236, "text_loss": 0.20783066749572754 @@ -11759,13 +11759,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.208984375, + "grad_norm": 0.216796875, "learning_rate": 0.0009870473512694465, - "loss": 0.038, + "loss": 0.0373, "macro_f1": 0.5934640765190125, "num_tokens": 1996536.0, "repeat_count": 0.0, - "routers_loss": 0.05804471671581268, + "routers_loss": 0.05046704784035683, "skip_count": 3.0, "step": 1238, "text_loss": 0.247748002409935 @@ -11773,18 +11773,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.5, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 5.821837393601409, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.091796875, + "f1_skip": 0.5, + "grad_norm": 0.09033203125, "learning_rate": 0.0009869772637455772, - "loss": 0.0256, - "macro_f1": 0.5492662787437439, + "loss": 0.0251, + "macro_f1": 0.4871794879436493, "num_tokens": 1999530.0, "repeat_count": 0.0, - "routers_loss": 0.045395996421575546, + "routers_loss": 0.044926248490810394, "skip_count": 2.0, "step": 1240, "text_loss": 0.26001980900764465 @@ -11797,13 +11797,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11767578125, + "grad_norm": 0.1513671875, "learning_rate": 0.000986906989611102, - "loss": 0.0438, + "loss": 0.0446, "macro_f1": 0.3272727429866791, "num_tokens": 2002782.0, "repeat_count": 0.0, - "routers_loss": 0.020834850147366524, + "routers_loss": 0.025911526754498482, "skip_count": 0.0, "step": 1242, "text_loss": 0.9009982943534851 @@ -11816,13 +11816,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1123046875, + "grad_norm": 0.115234375, "learning_rate": 0.0009868365288929492, - "loss": 0.0377, + "loss": 0.0371, "macro_f1": 0.3333333432674408, "num_tokens": 2005331.0, "repeat_count": 0.0, - "routers_loss": 0.005241698585450649, + "routers_loss": 0.0043760035187006, "skip_count": 0.0, "step": 1244, "text_loss": 0.5547386407852173 @@ -11835,13 +11835,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0966796875, + "grad_norm": 0.1005859375, "learning_rate": 0.0009867658816181206, - "loss": 0.038, + "loss": 0.0374, "macro_f1": 0.3333333432674408, "num_tokens": 2008115.0, "repeat_count": 0.0, - "routers_loss": 0.008387803100049496, + "routers_loss": 0.009227181784808636, "skip_count": 0.0, "step": 1246, "text_loss": 1.0067731142044067 @@ -11854,13 +11854,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.126953125, "learning_rate": 0.000986695047813688, - "loss": 0.0256, + "loss": 0.0261, "macro_f1": 0.3272727429866791, "num_tokens": 2011137.0, "repeat_count": 1.0, - "routers_loss": 0.02261745184659958, + "routers_loss": 0.023822437971830368, "skip_count": 0.0, "step": 1248, "text_loss": 0.30058956146240234 @@ -11873,32 +11873,32 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.10693359375, + "grad_norm": 0.1044921875, "learning_rate": 0.0009866240275067948, - "loss": 0.0435, + "loss": 0.044, "macro_f1": 0.47333335876464844, "num_tokens": 2014159.0, "repeat_count": 2.0, - "routers_loss": 0.21678555011749268, + "routers_loss": 0.21523773670196533, "skip_count": 3.0, "step": 1250, "text_loss": 0.39072203636169434 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 5.878191957734077, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1201171875, "learning_rate": 0.0009865528207246563, - "loss": 0.0358, - "macro_f1": 0.32098764181137085, + "loss": 0.0351, + "macro_f1": 0.5492662787437439, "num_tokens": 2017731.0, "repeat_count": 0.0, - "routers_loss": 0.06554054468870163, + "routers_loss": 0.06184682995080948, "skip_count": 2.0, "step": 1252, "text_loss": 0.35751575231552124 @@ -11911,13 +11911,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.166015625, "learning_rate": 0.000986481427494559, - "loss": 0.0337, + "loss": 0.0336, "macro_f1": 0.3333333432674408, "num_tokens": 2020485.0, "repeat_count": 0.0, - "routers_loss": 0.007237187586724758, + "routers_loss": 0.007573372684419155, "skip_count": 0.0, "step": 1254, "text_loss": 0.4061077833175659 @@ -11930,13 +11930,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1845703125, + "grad_norm": 0.1708984375, "learning_rate": 0.000986409847843861, - "loss": 0.0387, + "loss": 0.0382, "macro_f1": 0.3272727429866791, "num_tokens": 2024149.0, "repeat_count": 1.0, - "routers_loss": 0.08003793656826019, + "routers_loss": 0.07447971403598785, "skip_count": 0.0, "step": 1256, "text_loss": 0.41876497864723206 @@ -11949,13 +11949,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.17578125, "learning_rate": 0.000986338081799992, - "loss": 0.0341, + "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 2026545.0, "repeat_count": 0.0, - "routers_loss": 0.006424390245229006, + "routers_loss": 0.006609147880226374, "skip_count": 0.0, "step": 1258, "text_loss": 0.4673794209957123 @@ -11968,13 +11968,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10009765625, + "grad_norm": 0.1123046875, "learning_rate": 0.0009862661293904523, - "loss": 0.0482, + "loss": 0.0498, "macro_f1": 0.32098764181137085, "num_tokens": 2029581.0, "repeat_count": 0.0, - "routers_loss": 0.10797854512929916, + "routers_loss": 0.10624702274799347, "skip_count": 2.0, "step": 1260, "text_loss": 0.3483233153820038 @@ -11987,13 +11987,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.111328125, + "grad_norm": 0.1201171875, "learning_rate": 0.0009861939906428145, - "loss": 0.053, + "loss": 0.0525, "macro_f1": 0.3333333432674408, "num_tokens": 2033936.0, "repeat_count": 0.0, - "routers_loss": 0.006734046153724194, + "routers_loss": 0.007944886572659016, "skip_count": 0.0, "step": 1262, "text_loss": 0.16362667083740234 @@ -12006,13 +12006,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.11669921875, "learning_rate": 0.0009861216655847225, - "loss": 0.0373, + "loss": 0.0376, "macro_f1": 0.6666666865348816, "num_tokens": 2037876.0, "repeat_count": 1.0, - "routers_loss": 0.00564212491735816, + "routers_loss": 0.007004092447459698, "skip_count": 0.0, "step": 1264, "text_loss": 0.43228110671043396 @@ -12025,13 +12025,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1044921875, + "grad_norm": 0.1005859375, "learning_rate": 0.0009860491542438912, - "loss": 0.0472, + "loss": 0.047, "macro_f1": 0.3272727429866791, "num_tokens": 2040842.0, "repeat_count": 0.0, - "routers_loss": 0.026137735694646835, + "routers_loss": 0.026916226372122765, "skip_count": 1.0, "step": 1266, "text_loss": 0.5901188850402832 @@ -12044,13 +12044,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08203125, + "grad_norm": 0.0986328125, "learning_rate": 0.000985976456648107, - "loss": 0.0343, + "loss": 0.0353, "macro_f1": 0.3333333432674408, "num_tokens": 2043890.0, "repeat_count": 0.0, - "routers_loss": 0.0069669694639742374, + "routers_loss": 0.007325216196477413, "skip_count": 0.0, "step": 1268, "text_loss": 0.8780109882354736 @@ -12063,13 +12063,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.1142578125, + "grad_norm": 0.10205078125, "learning_rate": 0.000985903572825228, - "loss": 0.0323, + "loss": 0.0306, "macro_f1": 0.4871794879436493, "num_tokens": 2048848.0, "repeat_count": 0.0, - "routers_loss": 0.05618409812450409, + "routers_loss": 0.05007527023553848, "skip_count": 2.0, "step": 1270, "text_loss": 0.5863722562789917 @@ -12084,11 +12084,11 @@ "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.000985830502803183, - "loss": 0.0391, + "loss": 0.0396, "macro_f1": 0.3272727429866791, "num_tokens": 2051561.0, "repeat_count": 0.0, - "routers_loss": 0.025900620967149734, + "routers_loss": 0.023995524272322655, "skip_count": 0.0, "step": 1272, "text_loss": 0.7460709810256958 @@ -12101,13 +12101,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.10205078125, "learning_rate": 0.0009857572466099732, - "loss": 0.0426, + "loss": 0.0431, "macro_f1": 0.3333333432674408, "num_tokens": 2054752.0, "repeat_count": 0.0, - "routers_loss": 0.006236737594008446, + "routers_loss": 0.006928362417966127, "skip_count": 0.0, "step": 1274, "text_loss": 0.5130293369293213 @@ -12120,13 +12120,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.171875, + "grad_norm": 0.162109375, "learning_rate": 0.0009856838042736698, - "loss": 0.0503, + "loss": 0.0501, "macro_f1": 0.3333333432674408, "num_tokens": 2058151.0, "repeat_count": 0.0, - "routers_loss": 0.006367063149809837, + "routers_loss": 0.006969396956264973, "skip_count": 0.0, "step": 1276, "text_loss": 0.5911393761634827 @@ -12139,13 +12139,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1103515625, + "grad_norm": 0.1357421875, "learning_rate": 0.0009856101758224166, - "loss": 0.0442, + "loss": 0.0441, "macro_f1": 0.3333333432674408, "num_tokens": 2061012.0, "repeat_count": 0.0, - "routers_loss": 0.003392914542928338, + "routers_loss": 0.003499418031424284, "skip_count": 0.0, "step": 1278, "text_loss": 0.25347545742988586 @@ -12158,13 +12158,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0693359375, "learning_rate": 0.000985536361284428, - "loss": 0.0231, + "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2064597.0, "repeat_count": 0.0, - "routers_loss": 0.007376343477517366, + "routers_loss": 0.007856054231524467, "skip_count": 0.0, "step": 1280, "text_loss": 0.7476963400840759 @@ -12177,13 +12177,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.0888671875, "learning_rate": 0.0009854623606879898, - "loss": 0.0243, + "loss": 0.0245, "macro_f1": 0.3272727429866791, "num_tokens": 2067972.0, "repeat_count": 0.0, - "routers_loss": 0.02773376554250717, + "routers_loss": 0.02617792971432209, "skip_count": 1.0, "step": 1282, "text_loss": 0.5775872468948364 @@ -12196,13 +12196,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.09033203125, "learning_rate": 0.000985388174061459, - "loss": 0.0363, + "loss": 0.0356, "macro_f1": 0.32098767161369324, "num_tokens": 2071812.0, "repeat_count": 0.0, - "routers_loss": 0.03535797819495201, + "routers_loss": 0.035979997366666794, "skip_count": 1.0, "step": 1284, "text_loss": 0.2933400869369507 @@ -12215,13 +12215,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08447265625, "learning_rate": 0.0009853138014332646, - "loss": 0.0269, + "loss": 0.0273, "macro_f1": 0.3333333432674408, "num_tokens": 2074868.0, "repeat_count": 0.0, - "routers_loss": 0.004910993855446577, + "routers_loss": 0.005142854526638985, "skip_count": 0.0, "step": 1286, "text_loss": 0.29085102677345276 @@ -12234,13 +12234,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0888671875, + "grad_norm": 0.09033203125, "learning_rate": 0.0009852392428319058, - "loss": 0.0301, + "loss": 0.0306, "macro_f1": 0.3333333432674408, "num_tokens": 2078225.0, "repeat_count": 0.0, - "routers_loss": 0.0032444109674543142, + "routers_loss": 0.0032799106556922197, "skip_count": 0.0, "step": 1288, "text_loss": 0.7293626070022583 @@ -12253,13 +12253,13 @@ "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.0947265625, + "grad_norm": 0.08935546875, "learning_rate": 0.0009851644982859537, - "loss": 0.0272, + "loss": 0.0273, "macro_f1": 0.480392187833786, "num_tokens": 2081495.0, "repeat_count": 1.0, - "routers_loss": 0.12451831251382828, + "routers_loss": 0.12224318832159042, "skip_count": 3.0, "step": 1290, "text_loss": 0.26125892996788025 @@ -12272,13 +12272,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.1435546875, "learning_rate": 0.0009850895678240508, - "loss": 0.0289, + "loss": 0.0283, "macro_f1": 0.6666666865348816, "num_tokens": 2084390.0, "repeat_count": 1.0, - "routers_loss": 0.011074979789555073, + "routers_loss": 0.010662888176739216, "skip_count": 0.0, "step": 1292, "text_loss": 0.3510764539241791 @@ -12291,13 +12291,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1806640625, + "grad_norm": 0.1689453125, "learning_rate": 0.0009850144514749104, - "loss": 0.0336, + "loss": 0.0332, "macro_f1": 0.5492662787437439, "num_tokens": 2087210.0, "repeat_count": 0.0, - "routers_loss": 0.01774786226451397, + "routers_loss": 0.01979079470038414, "skip_count": 2.0, "step": 1294, "text_loss": 0.40202176570892334 @@ -12310,13 +12310,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.11669921875, "learning_rate": 0.000984939149267317, - "loss": 0.0251, + "loss": 0.0253, "macro_f1": 0.6666666865348816, "num_tokens": 2090777.0, "repeat_count": 0.0, - "routers_loss": 0.0052874404937028885, + "routers_loss": 0.005172552540898323, "skip_count": 1.0, "step": 1296, "text_loss": 0.5275651216506958 @@ -12329,13 +12329,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10107421875, + "grad_norm": 0.095703125, "learning_rate": 0.0009848636612301272, - "loss": 0.031, + "loss": 0.0299, "macro_f1": 0.3333333432674408, "num_tokens": 2094248.0, "repeat_count": 0.0, - "routers_loss": 0.0034106262028217316, + "routers_loss": 0.0029599082190543413, "skip_count": 0.0, "step": 1298, "text_loss": 0.4517653286457062 @@ -12348,13 +12348,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2177734375, + "grad_norm": 0.23046875, "learning_rate": 0.0009847879873922675, "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 2097139.0, "repeat_count": 0.0, - "routers_loss": 0.010383229702711105, + "routers_loss": 0.011455860920250416, "skip_count": 0.0, "step": 1300, "text_loss": 0.16888445615768433 @@ -12367,13 +12367,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0966796875, + "grad_norm": 0.09619140625, "learning_rate": 0.0009847121277827366, - "loss": 0.0304, + "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 2100415.0, "repeat_count": 0.0, - "routers_loss": 0.0076674893498420715, + "routers_loss": 0.008091195486485958, "skip_count": 0.0, "step": 1302, "text_loss": 0.40061676502227783 @@ -12386,13 +12386,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.109375, + "grad_norm": 0.1123046875, "learning_rate": 0.000984636082430604, - "loss": 0.0287, + "loss": 0.0285, "macro_f1": 0.3333333432674408, "num_tokens": 2103285.0, "repeat_count": 0.0, - "routers_loss": 0.010486516170203686, + "routers_loss": 0.009593960829079151, "skip_count": 0.0, "step": 1304, "text_loss": 0.7211073637008667 @@ -12405,13 +12405,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1142578125, + "grad_norm": 0.107421875, "learning_rate": 0.0009845598513650103, - "loss": 0.0237, + "loss": 0.0231, "macro_f1": 0.3333333432674408, "num_tokens": 2106255.0, "repeat_count": 0.0, - "routers_loss": 0.0023783023934811354, + "routers_loss": 0.0023068038281053305, "skip_count": 0.0, "step": 1306, "text_loss": 0.7077119946479797 @@ -12424,13 +12424,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.171875, "learning_rate": 0.0009844834346151674, - "loss": 0.044, + "loss": 0.043, "macro_f1": 0.3333333432674408, "num_tokens": 2109305.0, "repeat_count": 0.0, - "routers_loss": 0.006714595016092062, + "routers_loss": 0.007703019306063652, "skip_count": 0.0, "step": 1308, "text_loss": 0.3534316122531891 @@ -12443,13 +12443,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.1025390625, "learning_rate": 0.0009844068322103585, - "loss": 0.0281, + "loss": 0.0287, "macro_f1": 0.3272727429866791, "num_tokens": 2112216.0, "repeat_count": 0.0, - "routers_loss": 0.022373953834176064, + "routers_loss": 0.023549847304821014, "skip_count": 1.0, "step": 1310, "text_loss": 0.6792599558830261 @@ -12462,13 +12462,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1513671875, + "grad_norm": 0.150390625, "learning_rate": 0.0009843300441799378, - "loss": 0.0205, + "loss": 0.0211, "macro_f1": 0.3333333432674408, "num_tokens": 2114925.0, "repeat_count": 0.0, - "routers_loss": 0.007452849764376879, + "routers_loss": 0.007605871185660362, "skip_count": 0.0, "step": 1312, "text_loss": 0.1571389138698578 @@ -12481,13 +12481,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.134765625, "learning_rate": 0.0009842530705533304, - "loss": 0.0251, + "loss": 0.0253, "macro_f1": 0.3272727429866791, "num_tokens": 2117744.0, "repeat_count": 0.0, - "routers_loss": 0.016413308680057526, + "routers_loss": 0.014964760281145573, "skip_count": 0.0, "step": 1314, "text_loss": 0.7840361595153809 @@ -12500,13 +12500,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.10595703125, "learning_rate": 0.000984175911360033, - "loss": 0.0243, + "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2120848.0, "repeat_count": 0.0, - "routers_loss": 0.004676427226513624, + "routers_loss": 0.004663798492401838, "skip_count": 0.0, "step": 1316, "text_loss": 0.536246120929718 @@ -12519,13 +12519,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.123046875, + "grad_norm": 0.1201171875, "learning_rate": 0.000984098566629613, - "loss": 0.0284, + "loss": 0.0288, "macro_f1": 0.5492662787437439, "num_tokens": 2123651.0, "repeat_count": 0.0, - "routers_loss": 0.024454625323414803, + "routers_loss": 0.022852955386042595, "skip_count": 2.0, "step": 1318, "text_loss": 0.43372172117233276 @@ -12538,13 +12538,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.07958984375, "learning_rate": 0.0009840210363917087, - "loss": 0.022, + "loss": 0.0216, "macro_f1": 0.3333333432674408, "num_tokens": 2128011.0, "repeat_count": 0.0, - "routers_loss": 0.013495884835720062, + "routers_loss": 0.012578422203660011, "skip_count": 0.0, "step": 1320, "text_loss": 0.28190380334854126 @@ -12557,13 +12557,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.10986328125, "learning_rate": 0.0009839433206760306, - "loss": 0.0213, + "loss": 0.0204, "macro_f1": 0.3333333432674408, "num_tokens": 2131035.0, "repeat_count": 0.0, - "routers_loss": 0.006397814955562353, + "routers_loss": 0.006863643880933523, "skip_count": 0.0, "step": 1322, "text_loss": 0.6340444087982178 @@ -12576,13 +12576,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1796875, "learning_rate": 0.0009838654195123589, - "loss": 0.0246, + "loss": 0.0243, "macro_f1": 0.3333333432674408, "num_tokens": 2133856.0, "repeat_count": 0.0, - "routers_loss": 0.00503434706479311, + "routers_loss": 0.00468854233622551, "skip_count": 0.0, "step": 1324, "text_loss": 0.5138425827026367 @@ -12595,13 +12595,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1240234375, + "grad_norm": 0.115234375, "learning_rate": 0.0009837873329305458, - "loss": 0.0402, + "loss": 0.0396, "macro_f1": 0.6666666865348816, "num_tokens": 2136451.0, "repeat_count": 1.0, - "routers_loss": 0.005150494631379843, + "routers_loss": 0.005731126759201288, "skip_count": 0.0, "step": 1326, "text_loss": 0.742124617099762 @@ -12614,13 +12614,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1533203125, + "grad_norm": 0.17578125, "learning_rate": 0.000983709060960514, - "loss": 0.041, + "loss": 0.0416, "macro_f1": 0.3333333432674408, "num_tokens": 2139496.0, "repeat_count": 0.0, - "routers_loss": 0.004570818971842527, + "routers_loss": 0.0056343949399888515, "skip_count": 0.0, "step": 1328, "text_loss": 0.7317464351654053 @@ -12633,13 +12633,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09326171875, + "grad_norm": 0.10791015625, "learning_rate": 0.0009836306036322576, - "loss": 0.0314, + "loss": 0.0312, "macro_f1": 0.3333333432674408, "num_tokens": 2143120.0, "repeat_count": 0.0, - "routers_loss": 0.005299333017319441, + "routers_loss": 0.005127966403961182, "skip_count": 0.0, "step": 1330, "text_loss": 0.538652241230011 @@ -12652,13 +12652,13 @@ "f1_execute": 0.9130434989929199, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.111328125, + "grad_norm": 0.11083984375, "learning_rate": 0.0009835519609758415, - "loss": 0.0303, + "loss": 0.0301, "macro_f1": 0.590062141418457, "num_tokens": 2145807.0, "repeat_count": 3.0, - "routers_loss": 0.168672576546669, + "routers_loss": 0.1673707216978073, "skip_count": 4.0, "step": 1332, "text_loss": 0.3498198091983795 @@ -12671,32 +12671,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009834731330214017, - "loss": 0.0302, + "loss": 0.0293, "macro_f1": 0.3272727429866791, "num_tokens": 2148397.0, "repeat_count": 1.0, - "routers_loss": 0.05187409743666649, + "routers_loss": 0.04026653990149498, "skip_count": 0.0, "step": 1334, "text_loss": 0.8153424859046936 }, { "acc_repeat": 1.0, - "acc_skip": 1.0, - "avg_layers": 26.0, + "acc_skip": 0.800000011920929, + "avg_layers": 27.0, "epoch": 6.272380393307896, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.8999999761581421, "f1_repeat": 0.6666666865348816, - "f1_skip": 0.9090909361839294, - "grad_norm": 0.1669921875, + "f1_skip": 0.8000000715255737, + "grad_norm": 0.16015625, "learning_rate": 0.0009833941197991455, - "loss": 0.0339, - "macro_f1": 0.8329448699951172, + "loss": 0.0329, + "macro_f1": 0.7888889312744141, "num_tokens": 2152226.0, "repeat_count": 2.0, - "routers_loss": 0.05786697566509247, + "routers_loss": 0.05481519177556038, "skip_count": 5.0, "step": 1336, "text_loss": 0.7802760004997253 @@ -12709,13 +12709,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16796875, + "grad_norm": 0.1474609375, "learning_rate": 0.0009833149213393506, - "loss": 0.0315, + "loss": 0.0304, "macro_f1": 0.3272727429866791, "num_tokens": 2156023.0, "repeat_count": 0.0, - "routers_loss": 0.017055779695510864, + "routers_loss": 0.01760484278202057, "skip_count": 0.0, "step": 1338, "text_loss": 0.19721226394176483 @@ -12728,13 +12728,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.099609375, + "grad_norm": 0.11474609375, "learning_rate": 0.000983235537672366, - "loss": 0.0249, + "loss": 0.0256, "macro_f1": 0.3333333432674408, "num_tokens": 2160037.0, "repeat_count": 0.0, - "routers_loss": 0.011614206247031689, + "routers_loss": 0.013206037692725658, "skip_count": 0.0, "step": 1340, "text_loss": 0.5003817081451416 @@ -12747,13 +12747,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.1474609375, "learning_rate": 0.000983155968828612, - "loss": 0.033, + "loss": 0.0315, "macro_f1": 0.6666666865348816, "num_tokens": 2163910.0, "repeat_count": 1.0, - "routers_loss": 0.012611300684511662, + "routers_loss": 0.01256406120955944, "skip_count": 0.0, "step": 1342, "text_loss": 0.5996923446655273 @@ -12766,13 +12766,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.11962890625, "learning_rate": 0.0009830762148385793, - "loss": 0.0315, + "loss": 0.0313, "macro_f1": 0.3272727429866791, "num_tokens": 2166921.0, "repeat_count": 0.0, - "routers_loss": 0.018757276237010956, + "routers_loss": 0.015086234547197819, "skip_count": 1.0, "step": 1344, "text_loss": 0.45356282591819763 @@ -12785,13 +12785,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08447265625, "learning_rate": 0.0009829962757328297, - "loss": 0.0229, + "loss": 0.0223, "macro_f1": 0.32098764181137085, "num_tokens": 2170135.0, "repeat_count": 0.0, - "routers_loss": 0.08197146654129028, + "routers_loss": 0.07909081131219864, "skip_count": 2.0, "step": 1346, "text_loss": 0.2874644994735718 @@ -12804,13 +12804,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.068359375, "learning_rate": 0.0009829161515419959, - "loss": 0.0256, + "loss": 0.0246, "macro_f1": 0.6666666865348816, "num_tokens": 2173029.0, "repeat_count": 0.0, - "routers_loss": 0.014122758992016315, + "routers_loss": 0.013569854199886322, "skip_count": 2.0, "step": 1348, "text_loss": 0.25533875823020935 @@ -12823,13 +12823,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06005859375, + "grad_norm": 0.064453125, "learning_rate": 0.0009828358422967823, - "loss": 0.0221, + "loss": 0.0226, "macro_f1": 0.32098764181137085, "num_tokens": 2176605.0, "repeat_count": 1.0, - "routers_loss": 0.08215996623039246, + "routers_loss": 0.08111091703176498, "skip_count": 1.0, "step": 1350, "text_loss": 0.32827726006507874 @@ -12842,13 +12842,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09375, + "grad_norm": 0.091796875, "learning_rate": 0.0009827553480279627, - "loss": 0.0312, + "loss": 0.03, "macro_f1": 0.5427350401878357, "num_tokens": 2179406.0, "repeat_count": 0.0, - "routers_loss": 0.026304977014660835, + "routers_loss": 0.026550088077783585, "skip_count": 2.0, "step": 1352, "text_loss": 0.2966301143169403 @@ -12861,13 +12861,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.0791015625, "learning_rate": 0.0009826746687663832, - "loss": 0.0302, + "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 2182353.0, "repeat_count": 0.0, - "routers_loss": 0.003616038942709565, + "routers_loss": 0.003914554137736559, "skip_count": 0.0, "step": 1354, "text_loss": 0.7596251964569092 @@ -12880,13 +12880,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.0849609375, + "grad_norm": 0.0859375, "learning_rate": 0.0009825938045429602, - "loss": 0.0323, + "loss": 0.0324, "macro_f1": 0.5866667032241821, "num_tokens": 2185786.0, "repeat_count": 1.0, - "routers_loss": 0.060399893671274185, + "routers_loss": 0.059612665325403214, "skip_count": 3.0, "step": 1356, "text_loss": 0.12325898557901382 @@ -12899,13 +12899,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10302734375, + "grad_norm": 0.10009765625, "learning_rate": 0.0009825127553886807, - "loss": 0.0384, + "loss": 0.0375, "macro_f1": 0.3333333432674408, "num_tokens": 2190157.0, "repeat_count": 0.0, - "routers_loss": 0.007164204493165016, + "routers_loss": 0.0071132429875433445, "skip_count": 0.0, "step": 1358, "text_loss": 0.9287898540496826 @@ -12918,13 +12918,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0947265625, + "grad_norm": 0.0986328125, "learning_rate": 0.0009824315213346033, - "loss": 0.0343, + "loss": 0.0348, "macro_f1": 0.3333333432674408, "num_tokens": 2193077.0, "repeat_count": 0.0, - "routers_loss": 0.010965060442686081, + "routers_loss": 0.009611099027097225, "skip_count": 0.0, "step": 1360, "text_loss": 0.20427259802818298 @@ -12937,13 +12937,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.10888671875, "learning_rate": 0.0009823501024118569, - "loss": 0.0276, + "loss": 0.0285, "macro_f1": 0.3333333432674408, "num_tokens": 2196494.0, "repeat_count": 0.0, - "routers_loss": 0.00784136913716793, + "routers_loss": 0.006913455203175545, "skip_count": 0.0, "step": 1362, "text_loss": 0.574759840965271 @@ -12956,13 +12956,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.095703125, + "grad_norm": 0.10595703125, "learning_rate": 0.0009822684986516411, - "loss": 0.0251, + "loss": 0.0245, "macro_f1": 0.3333333432674408, "num_tokens": 2199839.0, "repeat_count": 0.0, - "routers_loss": 0.009101065807044506, + "routers_loss": 0.009208920411765575, "skip_count": 0.0, "step": 1364, "text_loss": 0.42422571778297424 @@ -12970,37 +12970,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 6.413266803639566, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.0927734375, "learning_rate": 0.000982186710085227, - "loss": 0.0206, - "macro_f1": 0.31446540355682373, + "loss": 0.0208, + "macro_f1": 0.32098764181137085, "num_tokens": 2203212.0, "repeat_count": 1.0, - "routers_loss": 0.05967295169830322, + "routers_loss": 0.059975091367959976, "skip_count": 1.0, "step": 1366, "text_loss": 0.29213017225265503 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 26.0, + "acc_skip": 0.25, + "avg_layers": 27.0, "epoch": 6.42265923099501, - "f1_execute": 0.9600000381469727, + "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.1875, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.181640625, "learning_rate": 0.0009821047367439561, - "loss": 0.0356, - "macro_f1": 0.542222261428833, + "loss": 0.0358, + "macro_f1": 0.44705885648727417, "num_tokens": 2206240.0, "repeat_count": 0.0, - "routers_loss": 0.05016552656888962, + "routers_loss": 0.048244867473840714, "skip_count": 4.0, "step": 1368, "text_loss": 0.3072395324707031 @@ -13013,13 +13013,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.11181640625, "learning_rate": 0.0009820225786592405, - "loss": 0.038, + "loss": 0.0375, "macro_f1": 0.3272727429866791, "num_tokens": 2209903.0, "repeat_count": 1.0, - "routers_loss": 0.02483060024678707, + "routers_loss": 0.026068156585097313, "skip_count": 0.0, "step": 1370, "text_loss": 0.5961400270462036 @@ -13032,13 +13032,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.109375, "learning_rate": 0.0009819402358625634, - "loss": 0.0373, + "loss": 0.0366, "macro_f1": 0.3272727429866791, "num_tokens": 2213439.0, "repeat_count": 0.0, - "routers_loss": 0.01982821337878704, + "routers_loss": 0.022615568712353706, "skip_count": 1.0, "step": 1372, "text_loss": 0.19375644624233246 @@ -13051,13 +13051,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1298828125, + "grad_norm": 0.1240234375, "learning_rate": 0.000981857708385479, - "loss": 0.0353, + "loss": 0.0346, "macro_f1": 0.3333333432674408, "num_tokens": 2216457.0, "repeat_count": 0.0, - "routers_loss": 0.004753436427563429, + "routers_loss": 0.005855285096913576, "skip_count": 0.0, "step": 1374, "text_loss": 0.5123368501663208 @@ -13070,13 +13070,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09912109375, + "grad_norm": 0.09423828125, "learning_rate": 0.0009817749962596114, - "loss": 0.0246, + "loss": 0.0249, "macro_f1": 0.3272727429866791, "num_tokens": 2219975.0, "repeat_count": 1.0, - "routers_loss": 0.06541594862937927, + "routers_loss": 0.0651634931564331, "skip_count": 0.0, "step": 1376, "text_loss": 0.5999220609664917 @@ -13089,13 +13089,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.09912109375, "learning_rate": 0.0009816920995166568, - "loss": 0.0376, + "loss": 0.0371, "macro_f1": 0.6666666865348816, "num_tokens": 2222833.0, "repeat_count": 1.0, - "routers_loss": 0.01156456395983696, + "routers_loss": 0.011408994905650616, "skip_count": 0.0, "step": 1378, "text_loss": 0.5323230624198914 @@ -13108,13 +13108,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2392578125, + "grad_norm": 0.205078125, "learning_rate": 0.0009816090181883807, - "loss": 0.033, + "loss": 0.0313, "macro_f1": 0.32098764181137085, "num_tokens": 2225842.0, "repeat_count": 0.0, - "routers_loss": 0.05175521597266197, + "routers_loss": 0.039720915257930756, "skip_count": 2.0, "step": 1380, "text_loss": 0.23363439738750458 @@ -13127,13 +13127,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.12255859375, "learning_rate": 0.0009815257523066204, - "loss": 0.0251, + "loss": 0.0249, "macro_f1": 0.3333333432674408, "num_tokens": 2229430.0, "repeat_count": 0.0, - "routers_loss": 0.002684591803699732, + "routers_loss": 0.002765297656878829, "skip_count": 0.0, "step": 1382, "text_loss": 0.718977689743042 @@ -13146,13 +13146,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.12890625, + "grad_norm": 0.130859375, "learning_rate": 0.0009814423019032835, - "loss": 0.0397, + "loss": 0.0396, "macro_f1": 0.5492662787437439, "num_tokens": 2232594.0, "repeat_count": 2.0, - "routers_loss": 0.054509978741407394, + "routers_loss": 0.05362323671579361, "skip_count": 0.0, "step": 1384, "text_loss": 0.6392166614532471 @@ -13165,13 +13165,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.150390625, "learning_rate": 0.0009813586670103483, "loss": 0.0426, "macro_f1": 0.6603773832321167, "num_tokens": 2236327.0, "repeat_count": 1.0, - "routers_loss": 0.04031623527407646, + "routers_loss": 0.031728316098451614, "skip_count": 1.0, "step": 1386, "text_loss": 0.5951619148254395 @@ -13184,13 +13184,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1142578125, + "grad_norm": 0.126953125, "learning_rate": 0.0009812748476598638, - "loss": 0.0308, + "loss": 0.031, "macro_f1": 0.5492662787437439, "num_tokens": 2239746.0, "repeat_count": 0.0, - "routers_loss": 0.039687711745500565, + "routers_loss": 0.03981253132224083, "skip_count": 2.0, "step": 1388, "text_loss": 0.22756551206111908 @@ -13203,13 +13203,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.12353515625, + "grad_norm": 0.12451171875, "learning_rate": 0.0009811908438839498, - "loss": 0.0329, + "loss": 0.0331, "macro_f1": 0.5492662787437439, "num_tokens": 2242786.0, "repeat_count": 0.0, - "routers_loss": 0.04785723611712456, + "routers_loss": 0.04617162421345711, "skip_count": 2.0, "step": 1390, "text_loss": 0.3233799934387207 @@ -13222,13 +13222,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1298828125, + "grad_norm": 0.154296875, "learning_rate": 0.000981106655714797, - "loss": 0.0359, + "loss": 0.0358, "macro_f1": 0.3272727429866791, "num_tokens": 2245696.0, "repeat_count": 0.0, - "routers_loss": 0.046765491366386414, + "routers_loss": 0.046828847378492355, "skip_count": 1.0, "step": 1392, "text_loss": 0.24273279309272766 @@ -13241,13 +13241,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0771484375, + "grad_norm": 0.07373046875, "learning_rate": 0.0009810222831846656, - "loss": 0.0303, + "loss": 0.0307, "macro_f1": 0.5492662787437439, "num_tokens": 2249326.0, "repeat_count": 0.0, - "routers_loss": 0.015151665546000004, + "routers_loss": 0.010921589098870754, "skip_count": 2.0, "step": 1394, "text_loss": 0.3921460807323456 @@ -13260,13 +13260,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.09423828125, "learning_rate": 0.0009809377263258882, - "loss": 0.0321, + "loss": 0.0315, "macro_f1": 0.32098767161369324, "num_tokens": 2253393.0, "repeat_count": 0.0, - "routers_loss": 0.04431106895208359, + "routers_loss": 0.04564022272825241, "skip_count": 1.0, "step": 1396, "text_loss": 0.582602858543396 @@ -13279,13 +13279,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09814453125, + "grad_norm": 0.103515625, "learning_rate": 0.000980852985170867, - "loss": 0.0317, + "loss": 0.0328, "macro_f1": 0.3272727429866791, "num_tokens": 2256626.0, "repeat_count": 0.0, - "routers_loss": 0.012700649909675121, + "routers_loss": 0.013289985246956348, "skip_count": 0.0, "step": 1398, "text_loss": 0.41031694412231445 @@ -13298,13 +13298,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1728515625, "learning_rate": 0.0009807680597520745, - "loss": 0.0256, + "loss": 0.0264, "macro_f1": 0.3333333432674408, "num_tokens": 2259326.0, "repeat_count": 0.0, - "routers_loss": 0.005919010378420353, + "routers_loss": 0.0065213534981012344, "skip_count": 0.0, "step": 1400, "text_loss": 0.2888098657131195 @@ -13317,13 +13317,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.23046875, "learning_rate": 0.0009806829501020546, - "loss": 0.0372, + "loss": 0.0358, "macro_f1": 0.3272727429866791, "num_tokens": 2262344.0, "repeat_count": 0.0, - "routers_loss": 0.04717765748500824, + "routers_loss": 0.04199840500950813, "skip_count": 1.0, "step": 1402, "text_loss": 0.31973034143447876 @@ -13336,13 +13336,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0771484375, + "grad_norm": 0.08935546875, "learning_rate": 0.0009805976562534215, "loss": 0.0317, "macro_f1": 0.6603773832321167, "num_tokens": 2266354.0, "repeat_count": 1.0, - "routers_loss": 0.015415813773870468, + "routers_loss": 0.015434930101037025, "skip_count": 1.0, "step": 1404, "text_loss": 0.508630633354187 @@ -13355,13 +13355,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.140625, "learning_rate": 0.0009805121782388599, "loss": 0.0339, "macro_f1": 0.6533333659172058, "num_tokens": 2269660.0, "repeat_count": 2.0, - "routers_loss": 0.06812979280948639, + "routers_loss": 0.0720924660563469, "skip_count": 2.0, "step": 1406, "text_loss": 0.40927737951278687 @@ -13374,13 +13374,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05908203125, + "grad_norm": 0.0634765625, "learning_rate": 0.0009804265160911253, - "loss": 0.0265, + "loss": 0.0266, "macro_f1": 0.5492662787437439, "num_tokens": 2273335.0, "repeat_count": 0.0, - "routers_loss": 0.025383235886693, + "routers_loss": 0.02400495670735836, "skip_count": 2.0, "step": 1408, "text_loss": 0.1777762621641159 @@ -13393,13 +13393,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.2314453125, "learning_rate": 0.0009803406698430433, - "loss": 0.0367, + "loss": 0.0371, "macro_f1": 0.3272727429866791, "num_tokens": 2277107.0, "repeat_count": 0.0, - "routers_loss": 0.026493225246667862, + "routers_loss": 0.02560107782483101, "skip_count": 1.0, "step": 1410, "text_loss": 0.17955881357192993 @@ -13412,13 +13412,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.07470703125, "learning_rate": 0.0009802546395275104, - "loss": 0.0342, + "loss": 0.0349, "macro_f1": 0.3333333432674408, "num_tokens": 2281638.0, "repeat_count": 0.0, - "routers_loss": 0.006616846192628145, + "routers_loss": 0.006655813194811344, "skip_count": 0.0, "step": 1412, "text_loss": 0.20882295072078705 @@ -13431,32 +13431,32 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.0888671875, + "grad_norm": 0.08740234375, "learning_rate": 0.000980168425177494, - "loss": 0.0328, + "loss": 0.0342, "macro_f1": 0.8200000524520874, "num_tokens": 2284876.0, "repeat_count": 1.0, - "routers_loss": 0.060631848871707916, + "routers_loss": 0.06325097382068634, "skip_count": 3.0, "step": 1414, "text_loss": 0.26035264134407043 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 6.648077487525683, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.138671875, "learning_rate": 0.000980082026826031, - "loss": 0.0317, - "macro_f1": 0.6666666865348816, + "loss": 0.0315, + "macro_f1": 0.3272727429866791, "num_tokens": 2288938.0, "repeat_count": 1.0, - "routers_loss": 0.011199389584362507, + "routers_loss": 0.013436575420200825, "skip_count": 0.0, "step": 1416, "text_loss": 0.5502325892448425 @@ -13469,13 +13469,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.07177734375, "learning_rate": 0.0009799954445062296, - "loss": 0.0192, + "loss": 0.0193, "macro_f1": 0.6603773832321167, "num_tokens": 2292317.0, "repeat_count": 1.0, - "routers_loss": 0.01120354700833559, + "routers_loss": 0.011264479719102383, "skip_count": 1.0, "step": 1418, "text_loss": 0.48075684905052185 @@ -13488,13 +13488,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.16796875, + "grad_norm": 0.1611328125, "learning_rate": 0.0009799086782512686, - "loss": 0.0294, + "loss": 0.0292, "macro_f1": 0.5492662787437439, "num_tokens": 2295935.0, "repeat_count": 0.0, - "routers_loss": 0.030204148963093758, + "routers_loss": 0.02833271212875843, "skip_count": 2.0, "step": 1420, "text_loss": 0.18221206963062286 @@ -13507,13 +13507,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0888671875, + "grad_norm": 0.09375, "learning_rate": 0.0009798217280943967, - "loss": 0.0348, + "loss": 0.0356, "macro_f1": 0.6666666865348816, "num_tokens": 2298927.0, "repeat_count": 0.0, - "routers_loss": 0.008244800381362438, + "routers_loss": 0.009208574891090393, "skip_count": 1.0, "step": 1422, "text_loss": 0.48686322569847107 @@ -13526,32 +13526,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09716796875, + "grad_norm": 0.09423828125, "learning_rate": 0.0009797345940689335, - "loss": 0.0269, + "loss": 0.0267, "macro_f1": 0.3272727429866791, "num_tokens": 2301541.0, "repeat_count": 0.0, - "routers_loss": 0.015340043231844902, + "routers_loss": 0.015011847950518131, "skip_count": 0.0, "step": 1424, "text_loss": 0.49446266889572144 }, { "acc_repeat": 0.0, - "acc_skip": 0.6000000238418579, - "avg_layers": 25.0, + "acc_skip": 0.4000000059604645, + "avg_layers": 26.0, "epoch": 6.695039624302906, - "f1_execute": 0.9583333134651184, + "f1_execute": 0.9387754797935486, "f1_repeat": 0.0, - "f1_skip": 0.75, - "grad_norm": 0.1318359375, + "f1_skip": 0.5714285969734192, + "grad_norm": 0.1337890625, "learning_rate": 0.0009796472762082687, - "loss": 0.0341, - "macro_f1": 0.5694444179534912, + "loss": 0.0338, + "macro_f1": 0.5034013986587524, "num_tokens": 2304589.0, "repeat_count": 0.0, - "routers_loss": 0.058681465685367584, + "routers_loss": 0.05912091210484505, "skip_count": 5.0, "step": 1426, "text_loss": 0.23945684731006622 @@ -13564,32 +13564,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.095703125, + "grad_norm": 0.09765625, "learning_rate": 0.000979559774545863, - "loss": 0.0423, + "loss": 0.0405, "macro_f1": 0.3272727429866791, "num_tokens": 2307860.0, "repeat_count": 0.0, - "routers_loss": 0.020810559391975403, + "routers_loss": 0.021242303773760796, "skip_count": 1.0, "step": 1428, "text_loss": 0.531273365020752 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 6.713824479013795, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.09033203125, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, "learning_rate": 0.000979472089115247, - "loss": 0.0268, - "macro_f1": 0.5492662787437439, + "loss": 0.0276, + "macro_f1": 0.32098764181137085, "num_tokens": 2311581.0, "repeat_count": 0.0, - "routers_loss": 0.030001837760210037, + "routers_loss": 0.02768544852733612, "skip_count": 2.0, "step": 1430, "text_loss": 0.2497459501028061 @@ -13602,13 +13602,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1318359375, + "grad_norm": 0.12255859375, "learning_rate": 0.000979384219950022, - "loss": 0.034, + "loss": 0.0346, "macro_f1": 0.3333333432674408, "num_tokens": 2314639.0, "repeat_count": 0.0, - "routers_loss": 0.010381575673818588, + "routers_loss": 0.008678150363266468, "skip_count": 0.0, "step": 1432, "text_loss": 0.6579355001449585 @@ -13621,32 +13621,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08056640625, "learning_rate": 0.0009792961670838595, - "loss": 0.0365, + "loss": 0.0362, "macro_f1": 0.3272727429866791, "num_tokens": 2317927.0, "repeat_count": 1.0, - "routers_loss": 0.03234704211354256, + "routers_loss": 0.03325597569346428, "skip_count": 0.0, "step": 1434, "text_loss": 0.5209436416625977 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 6.742001761080129, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.1494140625, "learning_rate": 0.0009792079305505016, - "loss": 0.0303, - "macro_f1": 0.6666666865348816, + "loss": 0.0306, + "macro_f1": 0.3272727429866791, "num_tokens": 2321065.0, "repeat_count": 1.0, - "routers_loss": 0.015481291338801384, + "routers_loss": 0.019228918477892876, "skip_count": 0.0, "step": 1436, "text_loss": 0.41087067127227783 @@ -13659,13 +13659,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1103515625, + "grad_norm": 0.10986328125, "learning_rate": 0.000979119510383761, - "loss": 0.0366, + "loss": 0.0371, "macro_f1": 0.3333333432674408, "num_tokens": 2323714.0, "repeat_count": 0.0, - "routers_loss": 0.018170451745390892, + "routers_loss": 0.017071325331926346, "skip_count": 0.0, "step": 1438, "text_loss": 0.21490029990673065 @@ -13678,13 +13678,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.2060546875, "learning_rate": 0.00097903090661752, - "loss": 0.0306, + "loss": 0.0309, "macro_f1": 0.3333333432674408, "num_tokens": 2326454.0, "repeat_count": 0.0, - "routers_loss": 0.010385681875050068, + "routers_loss": 0.00991755723953247, "skip_count": 0.0, "step": 1440, "text_loss": 0.23847346007823944 @@ -13697,13 +13697,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.232421875, "learning_rate": 0.000978942119285732, - "loss": 0.0407, + "loss": 0.0404, "macro_f1": 0.3272727429866791, "num_tokens": 2329462.0, "repeat_count": 0.0, - "routers_loss": 0.04976538568735123, + "routers_loss": 0.04908733069896698, "skip_count": 1.0, "step": 1442, "text_loss": 0.23343028128147125 @@ -13716,13 +13716,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.091796875, + "grad_norm": 0.1044921875, "learning_rate": 0.0009788531484224204, - "loss": 0.0255, + "loss": 0.0264, "macro_f1": 0.3333333432674408, "num_tokens": 2332146.0, "repeat_count": 0.0, - "routers_loss": 0.0030266831163316965, + "routers_loss": 0.0032628148328512907, "skip_count": 0.0, "step": 1444, "text_loss": 0.47423800826072693 @@ -13730,18 +13730,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 6.788963897857353, - "f1_execute": 0.9600000381469727, - "f1_repeat": 1.0, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, - "grad_norm": 0.107421875, + "grad_norm": 0.10693359375, "learning_rate": 0.0009787639940616788, - "loss": 0.0411, - "macro_f1": 0.8200000524520874, + "loss": 0.0405, + "macro_f1": 0.7018141150474548, "num_tokens": 2335738.0, "repeat_count": 1.0, - "routers_loss": 0.13420957326889038, + "routers_loss": 0.14336998760700226, "skip_count": 3.0, "step": 1446, "text_loss": 0.21837592124938965 @@ -13754,13 +13754,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1953125, + "grad_norm": 0.189453125, "learning_rate": 0.0009786746562376717, - "loss": 0.0251, + "loss": 0.0241, "macro_f1": 0.6666666865348816, "num_tokens": 2338488.0, "repeat_count": 0.0, - "routers_loss": 0.012779864482581615, + "routers_loss": 0.010542908683419228, "skip_count": 1.0, "step": 1448, "text_loss": 1.0614757537841797 @@ -13773,13 +13773,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1728515625, "learning_rate": 0.0009785851349846334, - "loss": 0.0266, + "loss": 0.0268, "macro_f1": 0.3333333432674408, "num_tokens": 2342074.0, "repeat_count": 0.0, - "routers_loss": 0.005545398220419884, + "routers_loss": 0.005998016335070133, "skip_count": 0.0, "step": 1450, "text_loss": 0.4269719421863556 @@ -13792,13 +13792,13 @@ "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.09814453125, + "grad_norm": 0.1083984375, "learning_rate": 0.0009784954303368686, - "loss": 0.0395, + "loss": 0.0384, "macro_f1": 0.44705885648727417, "num_tokens": 2345838.0, "repeat_count": 0.0, - "routers_loss": 0.0899835154414177, + "routers_loss": 0.0959126204252243, "skip_count": 3.0, "step": 1452, "text_loss": 0.3315916955471039 @@ -13811,13 +13811,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09716796875, + "grad_norm": 0.1005859375, "learning_rate": 0.0009784055423287521, "loss": 0.0218, "macro_f1": 0.3333333432674408, "num_tokens": 2348939.0, "repeat_count": 0.0, - "routers_loss": 0.002738836221396923, + "routers_loss": 0.0025467623490840197, "skip_count": 0.0, "step": 1454, "text_loss": 0.6162732839584351 @@ -13830,13 +13830,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.115234375, "learning_rate": 0.0009783154709947293, - "loss": 0.0266, + "loss": 0.0256, "macro_f1": 0.3272727429866791, "num_tokens": 2352232.0, "repeat_count": 0.0, - "routers_loss": 0.020522192120552063, + "routers_loss": 0.01860538125038147, "skip_count": 1.0, "step": 1456, "text_loss": 0.23928768932819366 @@ -13844,18 +13844,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 6.84531846199002, - "f1_execute": 0.9629629850387573, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.09912109375, "learning_rate": 0.0009782252163693158, - "loss": 0.0197, - "macro_f1": 0.32098767161369324, + "loss": 0.0201, + "macro_f1": 0.3272727429866791, "num_tokens": 2355159.0, "repeat_count": 0.0, - "routers_loss": 0.04245268926024437, + "routers_loss": 0.04412713274359703, "skip_count": 1.0, "step": 1458, "text_loss": 0.3371323347091675 @@ -13868,13 +13868,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.224609375, + "grad_norm": 0.21484375, "learning_rate": 0.0009781347784870973, - "loss": 0.0376, + "loss": 0.0379, "macro_f1": 0.3333333432674408, "num_tokens": 2358175.0, "repeat_count": 0.0, - "routers_loss": 0.009142685681581497, + "routers_loss": 0.006809141952544451, "skip_count": 0.0, "step": 1460, "text_loss": 0.547267735004425 @@ -13887,13 +13887,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.095703125, "learning_rate": 0.0009780441573827296, - "loss": 0.0295, + "loss": 0.03, "macro_f1": 0.3076923191547394, "num_tokens": 2360991.0, "repeat_count": 0.0, - "routers_loss": 0.08038893342018127, + "routers_loss": 0.08924390375614166, "skip_count": 4.0, "step": 1462, "text_loss": 0.7026563882827759 @@ -13906,13 +13906,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.1865234375, "learning_rate": 0.000977953353090939, - "loss": 0.027, + "loss": 0.0272, "macro_f1": 0.3333333432674408, "num_tokens": 2363894.0, "repeat_count": 0.0, - "routers_loss": 0.02107175625860691, + "routers_loss": 0.021858472377061844, "skip_count": 0.0, "step": 1464, "text_loss": 0.2718065083026886 @@ -13925,13 +13925,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.11474609375, "learning_rate": 0.0009778623656465219, - "loss": 0.0349, + "loss": 0.0338, "macro_f1": 0.32098764181137085, "num_tokens": 2367265.0, "repeat_count": 0.0, - "routers_loss": 0.042030055075883865, + "routers_loss": 0.044781096279621124, "skip_count": 0.0, "step": 1466, "text_loss": 0.5008095502853394 @@ -13944,13 +13944,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07470703125, + "grad_norm": 0.06689453125, "learning_rate": 0.0009777711950843448, - "loss": 0.022, + "loss": 0.0212, "macro_f1": 0.3333333432674408, "num_tokens": 2370186.0, "repeat_count": 0.0, - "routers_loss": 0.004230673424899578, + "routers_loss": 0.0040459707379341125, "skip_count": 0.0, "step": 1468, "text_loss": 0.5242461562156677 @@ -13963,13 +13963,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.134765625, "learning_rate": 0.0009776798414393446, - "loss": 0.0284, + "loss": 0.0279, "macro_f1": 0.6598639488220215, "num_tokens": 2373314.0, "repeat_count": 1.0, - "routers_loss": 0.06986775249242783, + "routers_loss": 0.0708528608083725, "skip_count": 3.0, "step": 1470, "text_loss": 0.2821732461452484 @@ -13982,13 +13982,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.142578125, + "grad_norm": 0.1328125, "learning_rate": 0.0009775883047465279, - "loss": 0.0431, + "loss": 0.0414, "macro_f1": 0.31446540355682373, "num_tokens": 2376435.0, "repeat_count": 1.0, - "routers_loss": 0.0439564548432827, + "routers_loss": 0.0290578193962574, "skip_count": 1.0, "step": 1472, "text_loss": 0.8438440561294556 @@ -14001,13 +14001,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1123046875, + "grad_norm": 0.10546875, "learning_rate": 0.000977496585040972, - "loss": 0.0376, + "loss": 0.0373, "macro_f1": 0.3333333432674408, "num_tokens": 2380244.0, "repeat_count": 0.0, - "routers_loss": 0.011889892630279064, + "routers_loss": 0.010360375046730042, "skip_count": 0.0, "step": 1474, "text_loss": 0.4356135427951813 @@ -14020,13 +14020,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1015625, + "grad_norm": 0.09912109375, "learning_rate": 0.000977404682357824, - "loss": 0.0295, + "loss": 0.0294, "macro_f1": 0.3272727429866791, "num_tokens": 2383498.0, "repeat_count": 0.0, - "routers_loss": 0.022536326199769974, + "routers_loss": 0.023518972098827362, "skip_count": 0.0, "step": 1476, "text_loss": 0.25195425748825073 @@ -14039,13 +14039,13 @@ "f1_execute": 0.9743589162826538, "f1_repeat": 0.888888955116272, "f1_skip": 1.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.11181640625, "learning_rate": 0.000977312596732301, - "loss": 0.0388, + "loss": 0.0375, "macro_f1": 0.9544159770011902, "num_tokens": 2386414.0, "repeat_count": 5.0, - "routers_loss": 0.07959948480129242, + "routers_loss": 0.08190606534481049, "skip_count": 4.0, "step": 1478, "text_loss": 0.6586798429489136 @@ -14058,13 +14058,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.095703125, + "grad_norm": 0.10546875, "learning_rate": 0.0009772203281996905, - "loss": 0.0341, + "loss": 0.0336, "macro_f1": 1.0, "num_tokens": 2389399.0, "repeat_count": 1.0, - "routers_loss": 0.019112225621938705, + "routers_loss": 0.016441475600004196, "skip_count": 2.0, "step": 1480, "text_loss": 0.3671986758708954 @@ -14077,13 +14077,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0888671875, + "grad_norm": 0.09814453125, "learning_rate": 0.0009771278767953502, - "loss": 0.0345, + "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 2392400.0, "repeat_count": 0.0, - "routers_loss": 0.018750866875052452, + "routers_loss": 0.019211363047361374, "skip_count": 0.0, "step": 1482, "text_loss": 0.27418580651283264 @@ -14096,32 +14096,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09228515625, + "grad_norm": 0.0947265625, "learning_rate": 0.0009770352425547072, - "loss": 0.0291, + "loss": 0.0292, "macro_f1": 0.3333333432674408, "num_tokens": 2395123.0, "repeat_count": 0.0, - "routers_loss": 0.015407348051667213, + "routers_loss": 0.015800386667251587, "skip_count": 0.0, "step": 1484, "text_loss": 0.19896622002124786 }, { - "acc_repeat": 0.6666666865348816, + "acc_repeat": 0.3333333432674408, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 29.0, "epoch": 6.976812444966246, - "f1_execute": 0.9803921580314636, - "f1_repeat": 0.800000011920929, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.5, "f1_skip": 0.0, - "grad_norm": 0.11474609375, + "grad_norm": 0.12890625, "learning_rate": 0.0009769424255132596, - "loss": 0.0258, - "macro_f1": 0.5934640765190125, + "loss": 0.0256, + "macro_f1": 0.4871794879436493, "num_tokens": 2397359.0, "repeat_count": 3.0, - "routers_loss": 0.06514479219913483, + "routers_loss": 0.06670158356428146, "skip_count": 0.0, "step": 1486, "text_loss": 0.4229799509048462 @@ -14134,13 +14134,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.111328125, + "grad_norm": 0.1162109375, "learning_rate": 0.0009768494257065747, - "loss": 0.0217, + "loss": 0.0218, "macro_f1": 0.3272727429866791, "num_tokens": 2400387.0, "repeat_count": 0.0, - "routers_loss": 0.013567833229899406, + "routers_loss": 0.011144762858748436, "skip_count": 1.0, "step": 1488, "text_loss": 0.4264226257801056 @@ -14153,13 +14153,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12255859375, + "grad_norm": 0.12353515625, "learning_rate": 0.0009767562431702904, - "loss": 0.0389, + "loss": 0.0387, "macro_f1": 0.3006536364555359, "num_tokens": 2403241.0, "repeat_count": 2.0, - "routers_loss": 0.13762018084526062, + "routers_loss": 0.12339717149734497, "skip_count": 3.0, "step": 1490, "text_loss": 0.2850193977355957 @@ -14172,13 +14172,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.061767578125, + "grad_norm": 0.07177734375, "learning_rate": 0.0009766628779401142, - "loss": 0.0214, + "loss": 0.0215, "macro_f1": 0.6666666865348816, "num_tokens": 2406087.0, "repeat_count": 0.0, - "routers_loss": 0.008640666492283344, + "routers_loss": 0.008174685761332512, "skip_count": 1.0, "step": 1492, "text_loss": 0.6756544709205627 @@ -14191,13 +14191,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05712890625, + "grad_norm": 0.0673828125, "learning_rate": 0.000976569330051824, - "loss": 0.0182, + "loss": 0.0186, "macro_f1": 0.3333333432674408, "num_tokens": 2409312.0, "repeat_count": 0.0, - "routers_loss": 0.0018257038900628686, + "routers_loss": 0.0021256296895444393, "skip_count": 0.0, "step": 1494, "text_loss": 0.4789894223213196 @@ -14210,13 +14210,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.053955078125, "learning_rate": 0.0009764755995412677, "loss": 0.0193, "macro_f1": 0.3333333432674408, "num_tokens": 2412758.0, "repeat_count": 0.0, - "routers_loss": 0.003656312357634306, + "routers_loss": 0.003944927826523781, "skip_count": 0.0, "step": 1496, "text_loss": 0.5157490968704224 @@ -14229,13 +14229,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1005859375, + "grad_norm": 0.09228515625, "learning_rate": 0.0009763816864443627, - "loss": 0.0246, + "loss": 0.0239, "macro_f1": 0.3272727429866791, "num_tokens": 2416079.0, "repeat_count": 1.0, - "routers_loss": 0.044268425554037094, + "routers_loss": 0.03893325850367546, "skip_count": 0.0, "step": 1498, "text_loss": 0.28045418858528137 @@ -14248,13 +14248,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.1279296875, "learning_rate": 0.0009762875907970968, - "loss": 0.0207, + "loss": 0.0199, "macro_f1": 0.3333333432674408, "num_tokens": 2420340.0, "repeat_count": 0.0, - "routers_loss": 0.0018966116476804018, + "routers_loss": 0.0017725443467497826, "skip_count": 0.0, "step": 1500, "text_loss": 0.35550856590270996 @@ -14267,32 +14267,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.06298828125, "learning_rate": 0.0009761933126355277, - "loss": 0.0249, + "loss": 0.0245, "macro_f1": 0.3272727429866791, "num_tokens": 2424735.0, "repeat_count": 0.0, - "routers_loss": 0.01729201152920723, + "routers_loss": 0.01393749937415123, "skip_count": 1.0, "step": 1502, "text_loss": 0.38840189576148987 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 7.06105077781039, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.11962890625, + "f1_skip": 1.0, + "grad_norm": 0.1630859375, "learning_rate": 0.0009760988519957828, - "loss": 0.0248, - "macro_f1": 0.5492662787437439, + "loss": 0.0249, + "macro_f1": 0.6666666865348816, "num_tokens": 2428132.0, "repeat_count": 0.0, - "routers_loss": 0.01693531684577465, + "routers_loss": 0.01687910407781601, "skip_count": 2.0, "step": 1504, "text_loss": 0.3031681478023529 @@ -14305,13 +14305,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.064453125, "learning_rate": 0.0009760042089140598, - "loss": 0.0197, + "loss": 0.0193, "macro_f1": 0.3144654333591461, "num_tokens": 2431592.0, "repeat_count": 1.0, - "routers_loss": 0.04939094930887222, + "routers_loss": 0.04704280197620392, "skip_count": 2.0, "step": 1506, "text_loss": 0.16355200111865997 @@ -14324,13 +14324,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.0986328125, "learning_rate": 0.0009759093834266259, - "loss": 0.0213, + "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 2434236.0, "repeat_count": 0.0, - "routers_loss": 0.0016892930725589395, + "routers_loss": 0.0016075772000476718, "skip_count": 0.0, "step": 1508, "text_loss": 0.6080073118209839 @@ -14343,13 +14343,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10009765625, + "grad_norm": 0.1025390625, "learning_rate": 0.0009758143755698186, - "loss": 0.0147, + "loss": 0.015, "macro_f1": 0.3333333432674408, "num_tokens": 2437170.0, "repeat_count": 0.0, - "routers_loss": 0.008671467192471027, + "routers_loss": 0.008451299741864204, "skip_count": 0.0, "step": 1510, "text_loss": 0.22100484371185303 @@ -14362,13 +14362,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.068359375, + "grad_norm": 0.06689453125, "learning_rate": 0.0009757191853800449, - "loss": 0.0228, + "loss": 0.0227, "macro_f1": 0.5866667032241821, "num_tokens": 2441187.0, "repeat_count": 1.0, - "routers_loss": 0.042682576924562454, + "routers_loss": 0.046565692871809006, "skip_count": 3.0, "step": 1512, "text_loss": 0.25098952651023865 @@ -14381,13 +14381,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.11279296875, "learning_rate": 0.000975623812893782, - "loss": 0.028, + "loss": 0.0276, "macro_f1": 0.3272727429866791, "num_tokens": 2444664.0, "repeat_count": 0.0, - "routers_loss": 0.02905822917819023, + "routers_loss": 0.02872578240931034, "skip_count": 1.0, "step": 1514, "text_loss": 0.4952253997325897 @@ -14400,13 +14400,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09326171875, + "grad_norm": 0.1142578125, "learning_rate": 0.0009755282581475768, - "loss": 0.0223, + "loss": 0.0233, "macro_f1": 0.3333333432674408, "num_tokens": 2447748.0, "repeat_count": 0.0, - "routers_loss": 0.0018810008186846972, + "routers_loss": 0.002055214950814843, "skip_count": 0.0, "step": 1516, "text_loss": 0.7465500831604004 @@ -14419,13 +14419,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10693359375, + "grad_norm": 0.10302734375, "learning_rate": 0.000975432521178046, - "loss": 0.0219, + "loss": 0.0216, "macro_f1": 0.3272727429866791, "num_tokens": 2450834.0, "repeat_count": 1.0, - "routers_loss": 0.04308714717626572, + "routers_loss": 0.04498551785945892, "skip_count": 0.0, "step": 1518, "text_loss": 0.28144413232803345 @@ -14438,13 +14438,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.095703125, + "grad_norm": 0.09912109375, "learning_rate": 0.0009753366020218763, - "loss": 0.0232, + "loss": 0.0234, "macro_f1": 0.3333333432674408, "num_tokens": 2454233.0, "repeat_count": 0.0, - "routers_loss": 0.003754811594262719, + "routers_loss": 0.003669742727652192, "skip_count": 0.0, "step": 1520, "text_loss": 0.5667551755905151 @@ -14457,32 +14457,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08837890625, + "grad_norm": 0.0830078125, "learning_rate": 0.0009752405007158238, - "loss": 0.0246, + "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2457331.0, "repeat_count": 0.0, - "routers_loss": 0.010853761807084084, + "routers_loss": 0.010455607436597347, "skip_count": 0.0, "step": 1522, "text_loss": 0.19575810432434082 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.5, "acc_skip": 1.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 7.154975051364837, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.0771484375, + "grad_norm": 0.0751953125, "learning_rate": 0.0009751442172967151, - "loss": 0.0196, - "macro_f1": 1.0, + "loss": 0.0193, + "macro_f1": 0.8823530077934265, "num_tokens": 2459935.0, "repeat_count": 2.0, - "routers_loss": 0.015100379474461079, + "routers_loss": 0.025189083069562912, "skip_count": 1.0, "step": 1524, "text_loss": 0.45453405380249023 @@ -14495,13 +14495,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08203125, + "grad_norm": 0.0927734375, "learning_rate": 0.000975047751801446, - "loss": 0.0189, + "loss": 0.0187, "macro_f1": 0.3272727429866791, "num_tokens": 2463008.0, "repeat_count": 0.0, - "routers_loss": 0.011991916224360466, + "routers_loss": 0.012297490611672401, "skip_count": 0.0, "step": 1526, "text_loss": 0.31437572836875916 @@ -14514,32 +14514,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.1044921875, "learning_rate": 0.0009749511042669823, - "loss": 0.0226, + "loss": 0.0233, "macro_f1": 0.3333333432674408, "num_tokens": 2466475.0, "repeat_count": 0.0, - "routers_loss": 0.008201062679290771, + "routers_loss": 0.011026266030967236, "skip_count": 0.0, "step": 1528, "text_loss": 0.46604859828948975 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 7.183152333431171, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.1181640625, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, "learning_rate": 0.0009748542747303595, - "loss": 0.0174, - "macro_f1": 0.6666666865348816, + "loss": 0.0182, + "macro_f1": 0.3272727429866791, "num_tokens": 2469320.0, "repeat_count": 0.0, - "routers_loss": 0.008513177745044231, + "routers_loss": 0.011934996582567692, "skip_count": 1.0, "step": 1530, "text_loss": 0.7764923572540283 @@ -14552,13 +14552,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.091796875, + "grad_norm": 0.0966796875, "learning_rate": 0.0009747572632286827, - "loss": 0.02, + "loss": 0.0203, "macro_f1": 0.3333333432674408, "num_tokens": 2472468.0, "repeat_count": 0.0, - "routers_loss": 0.004850955214351416, + "routers_loss": 0.005786920432001352, "skip_count": 0.0, "step": 1532, "text_loss": 0.3555782437324524 @@ -14571,32 +14571,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.087890625, + "grad_norm": 0.0849609375, "learning_rate": 0.0009746600697991271, - "loss": 0.0206, + "loss": 0.02, "macro_f1": 0.6666666865348816, "num_tokens": 2475736.0, "repeat_count": 1.0, - "routers_loss": 0.0027650354895740747, + "routers_loss": 0.0026990731712430716, "skip_count": 0.0, "step": 1534, "text_loss": 0.49561792612075806 }, { "acc_repeat": 1.0, - "acc_skip": 0.0, - "avg_layers": 29.0, + "acc_skip": 0.5, + "avg_layers": 28.0, "epoch": 7.2113296154975055, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, - "f1_skip": 0.0, - "grad_norm": 0.0615234375, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0556640625, "learning_rate": 0.0009745626944789375, - "loss": 0.0209, - "macro_f1": 0.6538461446762085, + "loss": 0.0204, + "macro_f1": 0.8823530077934265, "num_tokens": 2478887.0, "repeat_count": 1.0, - "routers_loss": 0.023268593475222588, + "routers_loss": 0.020221207290887833, "skip_count": 2.0, "step": 1536, "text_loss": 0.5375416278839111 @@ -14609,13 +14609,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11669921875, + "grad_norm": 0.12158203125, "learning_rate": 0.0009744651373054279, "loss": 0.0286, "macro_f1": 0.3272727429866791, "num_tokens": 2481293.0, "repeat_count": 0.0, - "routers_loss": 0.031235001981258392, + "routers_loss": 0.03131086751818657, "skip_count": 1.0, "step": 1538, "text_loss": 0.5241039395332336 @@ -14628,13 +14628,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.080078125, + "grad_norm": 0.08984375, "learning_rate": 0.0009743673983159828, - "loss": 0.023, + "loss": 0.0241, "macro_f1": 0.6122449040412903, "num_tokens": 2484403.0, "repeat_count": 0.0, - "routers_loss": 0.042398080229759216, + "routers_loss": 0.04448170214891434, "skip_count": 4.0, "step": 1540, "text_loss": 0.7465724349021912 @@ -14647,13 +14647,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.099609375, + "grad_norm": 0.08935546875, "learning_rate": 0.0009742694775480557, - "loss": 0.0268, + "loss": 0.0265, "macro_f1": 0.6666666865348816, "num_tokens": 2487952.0, "repeat_count": 0.0, - "routers_loss": 0.007361465133726597, + "routers_loss": 0.007171491626650095, "skip_count": 1.0, "step": 1542, "text_loss": 0.2877117097377777 @@ -14666,13 +14666,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.07275390625, "learning_rate": 0.0009741713750391703, - "loss": 0.0166, + "loss": 0.0171, "macro_f1": 0.6666666865348816, "num_tokens": 2490815.0, "repeat_count": 1.0, - "routers_loss": 0.0052334014326334, + "routers_loss": 0.004559285007417202, "skip_count": 0.0, "step": 1544, "text_loss": 0.6097800135612488 @@ -14685,13 +14685,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.06787109375, "learning_rate": 0.0009740730908269193, "loss": 0.0174, "macro_f1": 0.3333333432674408, "num_tokens": 2494727.0, "repeat_count": 0.0, - "routers_loss": 0.004993532784283161, + "routers_loss": 0.005271553061902523, "skip_count": 0.0, "step": 1546, "text_loss": 0.5431114435195923 @@ -14704,13 +14704,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0703125, "learning_rate": 0.0009739746249489658, - "loss": 0.0248, + "loss": 0.0239, "macro_f1": 0.3333333432674408, "num_tokens": 2499266.0, "repeat_count": 0.0, - "routers_loss": 0.001611889572814107, + "routers_loss": 0.0015409323386847973, "skip_count": 0.0, "step": 1548, "text_loss": 0.4702678322792053 @@ -14723,13 +14723,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.1171875, "learning_rate": 0.0009738759774430417, - "loss": 0.0209, + "loss": 0.0216, "macro_f1": 0.32098764181137085, "num_tokens": 2502273.0, "repeat_count": 1.0, - "routers_loss": 0.03059260919690132, + "routers_loss": 0.030183158814907074, "skip_count": 1.0, "step": 1550, "text_loss": 0.3239189088344574 @@ -14742,32 +14742,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056396484375, + "grad_norm": 0.0498046875, "learning_rate": 0.0009737771483469493, - "loss": 0.0195, + "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 2507624.0, "repeat_count": 0.0, - "routers_loss": 0.00508903618901968, + "routers_loss": 0.005410848651081324, "skip_count": 0.0, "step": 1552, "text_loss": 0.4014642834663391 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 7.295861461696507, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, + "f1_skip": 1.0, "grad_norm": 0.07763671875, "learning_rate": 0.0009736781376985598, - "loss": 0.0174, - "macro_f1": 0.3272727429866791, + "loss": 0.0168, + "macro_f1": 0.6666666865348816, "num_tokens": 2510366.0, "repeat_count": 0.0, - "routers_loss": 0.007860450074076653, + "routers_loss": 0.0066976165398955345, "skip_count": 1.0, "step": 1554, "text_loss": 0.5924848914146423 @@ -14780,13 +14780,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11669921875, + "grad_norm": 0.13671875, "learning_rate": 0.0009735789455358144, - "loss": 0.0217, + "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 2513317.0, "repeat_count": 0.0, - "routers_loss": 0.0027370608877390623, + "routers_loss": 0.002763477386906743, "skip_count": 0.0, "step": 1556, "text_loss": 0.3222943842411041 @@ -14799,13 +14799,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10302734375, + "grad_norm": 0.11767578125, "learning_rate": 0.0009734795718967237, - "loss": 0.0276, + "loss": 0.0283, "macro_f1": 0.32098764181137085, "num_tokens": 2516628.0, "repeat_count": 0.0, - "routers_loss": 0.061584725975990295, + "routers_loss": 0.061566028743982315, "skip_count": 2.0, "step": 1558, "text_loss": 0.3249334692955017 @@ -14818,13 +14818,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.10693359375, + "grad_norm": 0.095703125, "learning_rate": 0.0009733800168193679, "loss": 0.0228, "macro_f1": 1.0, "num_tokens": 2519424.0, "repeat_count": 2.0, - "routers_loss": 0.01694316789507866, + "routers_loss": 0.017976421862840652, "skip_count": 4.0, "step": 1560, "text_loss": 0.3341919481754303 @@ -14837,13 +14837,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1826171875, "learning_rate": 0.0009732802803418966, - "loss": 0.0234, + "loss": 0.023, "macro_f1": 0.3333333432674408, "num_tokens": 2522922.0, "repeat_count": 0.0, - "routers_loss": 0.0023331891279667616, + "routers_loss": 0.002525332849472761, "skip_count": 0.0, "step": 1562, "text_loss": 0.3176332712173462 @@ -14856,13 +14856,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.07861328125, "learning_rate": 0.0009731803625025292, - "loss": 0.0203, + "loss": 0.0196, "macro_f1": 0.3272727429866791, "num_tokens": 2525811.0, "repeat_count": 0.0, - "routers_loss": 0.021300682798027992, + "routers_loss": 0.015524424612522125, "skip_count": 1.0, "step": 1564, "text_loss": 0.532774031162262 @@ -14875,13 +14875,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.10205078125, "learning_rate": 0.0009730802633395541, - "loss": 0.026, + "loss": 0.0257, "macro_f1": 0.6603773832321167, "num_tokens": 2529157.0, "repeat_count": 1.0, - "routers_loss": 0.08335043489933014, + "routers_loss": 0.08138631284236908, "skip_count": 1.0, "step": 1566, "text_loss": 0.529487133026123 @@ -14894,13 +14894,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.07666015625, "learning_rate": 0.0009729799828913298, - "loss": 0.0224, + "loss": 0.0223, "macro_f1": 0.3333333432674408, "num_tokens": 2532249.0, "repeat_count": 0.0, - "routers_loss": 0.003535634372383356, + "routers_loss": 0.0035867292899638414, "skip_count": 0.0, "step": 1568, "text_loss": 0.503160297870636 @@ -14913,13 +14913,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.06298828125, + "grad_norm": 0.06884765625, "learning_rate": 0.0009728795211962838, "loss": 0.0259, "macro_f1": 0.5492662787437439, "num_tokens": 2535904.0, "repeat_count": 0.0, - "routers_loss": 0.025729363784193993, + "routers_loss": 0.02987455204129219, "skip_count": 2.0, "step": 1570, "text_loss": 0.9170270562171936 @@ -14932,13 +14932,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1357421875, + "grad_norm": 0.11865234375, "learning_rate": 0.0009727788782929131, - "loss": 0.0287, + "loss": 0.0273, "macro_f1": 0.3272727429866791, "num_tokens": 2538943.0, "repeat_count": 1.0, - "routers_loss": 0.059166863560676575, + "routers_loss": 0.04676021635532379, "skip_count": 0.0, "step": 1572, "text_loss": 0.29146310687065125 @@ -14951,13 +14951,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.0654296875, "learning_rate": 0.0009726780542197844, - "loss": 0.0173, + "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 2541805.0, "repeat_count": 0.0, - "routers_loss": 0.002580022206529975, + "routers_loss": 0.002127803163602948, "skip_count": 0.0, "step": 1574, "text_loss": 1.0126502513885498 @@ -14970,13 +14970,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.142578125, "learning_rate": 0.0009725770490155338, - "loss": 0.0257, + "loss": 0.0262, "macro_f1": 0.3333333432674408, "num_tokens": 2546213.0, "repeat_count": 0.0, - "routers_loss": 0.007746981456875801, + "routers_loss": 0.007609677035361528, "skip_count": 0.0, "step": 1576, "text_loss": 0.190168559551239 @@ -14989,13 +14989,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.083984375, "learning_rate": 0.0009724758627188665, - "loss": 0.0344, + "loss": 0.0356, "macro_f1": 0.3272727429866791, "num_tokens": 2549554.0, "repeat_count": 0.0, - "routers_loss": 0.027308562770485878, + "routers_loss": 0.033554721623659134, "skip_count": 1.0, "step": 1578, "text_loss": 0.2977406084537506 @@ -15008,13 +15008,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.140625, "learning_rate": 0.0009723744953685572, - "loss": 0.0277, + "loss": 0.028, "macro_f1": 0.3272727429866791, "num_tokens": 2552785.0, "repeat_count": 1.0, - "routers_loss": 0.029863199219107628, + "routers_loss": 0.027864238247275352, "skip_count": 0.0, "step": 1580, "text_loss": 0.2700682580471039 @@ -15027,13 +15027,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.19921875, "learning_rate": 0.0009722729470034503, - "loss": 0.0218, + "loss": 0.0224, "macro_f1": 0.3333333432674408, "num_tokens": 2556550.0, "repeat_count": 0.0, - "routers_loss": 0.004019706044346094, + "routers_loss": 0.004798175301402807, "skip_count": 0.0, "step": 1582, "text_loss": 0.6559903025627136 @@ -15046,32 +15046,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07177734375, + "grad_norm": 0.078125, "learning_rate": 0.0009721712176624591, - "loss": 0.0239, + "loss": 0.0242, "macro_f1": 0.3333333432674408, "num_tokens": 2559862.0, "repeat_count": 0.0, - "routers_loss": 0.014162382110953331, + "routers_loss": 0.013764148578047752, "skip_count": 0.0, "step": 1584, "text_loss": 0.2257535308599472 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 7.446140299383622, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10986328125, "learning_rate": 0.0009720693073845667, - "loss": 0.0338, - "macro_f1": 0.32098764181137085, + "loss": 0.032, + "macro_f1": 0.5492662787437439, "num_tokens": 2562766.0, "repeat_count": 0.0, - "routers_loss": 0.023485012352466583, + "routers_loss": 0.01937069371342659, "skip_count": 2.0, "step": 1586, "text_loss": 0.178413525223732 @@ -15079,37 +15079,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 7.455532726739067, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.126953125, + "grad_norm": 0.150390625, "learning_rate": 0.0009719672162088252, - "loss": 0.0308, - "macro_f1": 0.3272727429866791, + "loss": 0.0306, + "macro_f1": 0.32098767161369324, "num_tokens": 2566583.0, "repeat_count": 1.0, - "routers_loss": 0.05822715163230896, + "routers_loss": 0.06224144622683525, "skip_count": 0.0, "step": 1588, "text_loss": 0.3992367684841156 }, { - "acc_repeat": 0.5, - "acc_skip": 0.5, + "acc_repeat": 1.0, + "acc_skip": 0.75, "avg_layers": 27.0, "epoch": 7.464925154094511, - "f1_execute": 0.936170220375061, - "f1_repeat": 0.6666666865348816, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.189453125, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.185546875, "learning_rate": 0.0009718649441743559, - "loss": 0.0243, - "macro_f1": 0.7565011978149414, + "loss": 0.0239, + "macro_f1": 0.9449735879898071, "num_tokens": 2569516.0, "repeat_count": 2.0, - "routers_loss": 0.07448136061429977, + "routers_loss": 0.06937911361455917, "skip_count": 4.0, "step": 1590, "text_loss": 0.1945122629404068 @@ -15122,13 +15122,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.0654296875, "learning_rate": 0.00097176249132035, - "loss": 0.0228, + "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2572418.0, "repeat_count": 0.0, - "routers_loss": 0.0038424162194132805, + "routers_loss": 0.0034326619934290648, "skip_count": 0.0, "step": 1592, "text_loss": 0.6259906888008118 @@ -15141,13 +15141,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.08642578125, "learning_rate": 0.0009716598576860676, - "loss": 0.0277, + "loss": 0.0278, "macro_f1": 0.6666666865348816, "num_tokens": 2575235.0, "repeat_count": 1.0, - "routers_loss": 0.005674343090504408, + "routers_loss": 0.004557516425848007, "skip_count": 0.0, "step": 1594, "text_loss": 0.6638736724853516 @@ -15160,13 +15160,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.189453125, + "grad_norm": 0.193359375, "learning_rate": 0.0009715570433108378, - "loss": 0.0209, + "loss": 0.0198, "macro_f1": 1.0, "num_tokens": 2578157.0, "repeat_count": 1.0, - "routers_loss": 0.015544800087809563, + "routers_loss": 0.015363055281341076, "skip_count": 1.0, "step": 1596, "text_loss": 0.6530464887619019 @@ -15179,13 +15179,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1484375, "learning_rate": 0.0009714540482340595, - "loss": 0.0279, + "loss": 0.0268, "macro_f1": 0.6666666865348816, "num_tokens": 2581801.0, "repeat_count": 1.0, - "routers_loss": 0.013199405744671822, + "routers_loss": 0.01257144846022129, "skip_count": 0.0, "step": 1598, "text_loss": 0.5916110277175903 @@ -15198,13 +15198,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059326171875, + "grad_norm": 0.058837890625, "learning_rate": 0.0009713508724952006, - "loss": 0.0178, + "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 2585204.0, "repeat_count": 0.0, - "routers_loss": 0.0032487998250871897, + "routers_loss": 0.003175645601004362, "skip_count": 0.0, "step": 1600, "text_loss": 0.27901601791381836 @@ -15217,13 +15217,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12255859375, + "grad_norm": 0.12353515625, "learning_rate": 0.0009712475161337981, - "loss": 0.0253, + "loss": 0.0261, "macro_f1": 0.3333333432674408, "num_tokens": 2588286.0, "repeat_count": 0.0, - "routers_loss": 0.0041928659193217754, + "routers_loss": 0.004122321493923664, "skip_count": 0.0, "step": 1602, "text_loss": 0.42420244216918945 @@ -15236,13 +15236,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.07470703125, "learning_rate": 0.0009711439791894585, - "loss": 0.0343, + "loss": 0.0341, "macro_f1": 0.6666666865348816, "num_tokens": 2591476.0, "repeat_count": 0.0, - "routers_loss": 0.011576149612665176, + "routers_loss": 0.011215819045901299, "skip_count": 1.0, "step": 1604, "text_loss": 0.5549933910369873 @@ -15255,13 +15255,13 @@ "f1_execute": 0.9599999785423279, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.07568359375, + "grad_norm": 0.0703125, "learning_rate": 0.0009710402617018574, - "loss": 0.0179, + "loss": 0.0172, "macro_f1": 0.8200000524520874, "num_tokens": 2594336.0, "repeat_count": 1.0, - "routers_loss": 0.03026912547647953, + "routers_loss": 0.02916567400097847, "skip_count": 2.0, "step": 1606, "text_loss": 0.3263779282569885 @@ -15276,11 +15276,11 @@ "f1_skip": 1.0, "grad_norm": 0.068359375, "learning_rate": 0.0009709363637107393, - "loss": 0.021, + "loss": 0.0209, "macro_f1": 0.6666666865348816, "num_tokens": 2597462.0, "repeat_count": 0.0, - "routers_loss": 0.014957098290324211, + "routers_loss": 0.015897957608103752, "skip_count": 1.0, "step": 1608, "text_loss": 0.20917139947414398 @@ -15293,13 +15293,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.1611328125, "learning_rate": 0.0009708322852559184, - "loss": 0.0226, + "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2601543.0, "repeat_count": 0.0, - "routers_loss": 0.00254683755338192, + "routers_loss": 0.002211357234045863, "skip_count": 0.0, "step": 1610, "text_loss": 0.450550377368927 @@ -15312,13 +15312,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1748046875, + "grad_norm": 0.1728515625, "learning_rate": 0.0009707280263772776, - "loss": 0.0286, + "loss": 0.0277, "macro_f1": 0.6666666865348816, "num_tokens": 2604462.0, "repeat_count": 0.0, - "routers_loss": 0.018759876489639282, + "routers_loss": 0.01615734025835991, "skip_count": 2.0, "step": 1612, "text_loss": 0.6908381581306458 @@ -15337,7 +15337,7 @@ "macro_f1": 0.5492662787437439, "num_tokens": 2607484.0, "repeat_count": 0.0, - "routers_loss": 0.022694367915391922, + "routers_loss": 0.022048067301511765, "skip_count": 2.0, "step": 1614, "text_loss": 0.36691340804100037 @@ -15350,13 +15350,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.103515625, + "grad_norm": 0.10546875, "learning_rate": 0.0009705189675084138, - "loss": 0.0181, + "loss": 0.0176, "macro_f1": 0.6666666865348816, "num_tokens": 2610204.0, "repeat_count": 0.0, - "routers_loss": 0.010102321393787861, + "routers_loss": 0.008503952994942665, "skip_count": 1.0, "step": 1616, "text_loss": 0.5226598381996155 @@ -15369,13 +15369,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08984375, + "grad_norm": 0.09228515625, "learning_rate": 0.0009704141675983029, - "loss": 0.0252, + "loss": 0.0248, "macro_f1": 0.3333333432674408, "num_tokens": 2613128.0, "repeat_count": 0.0, - "routers_loss": 0.0020994991064071655, + "routers_loss": 0.0019020626787096262, "skip_count": 0.0, "step": 1618, "text_loss": 0.6465088725090027 @@ -15388,13 +15388,13 @@ "f1_execute": 0.9333333373069763, "f1_repeat": 0.0, "f1_skip": 0.7272727489471436, - "grad_norm": 0.10009765625, + "grad_norm": 0.107421875, "learning_rate": 0.0009703091874245956, - "loss": 0.0323, + "loss": 0.032, "macro_f1": 0.5535354018211365, "num_tokens": 2616360.0, "repeat_count": 0.0, - "routers_loss": 0.11748704314231873, + "routers_loss": 0.11837691068649292, "skip_count": 7.0, "step": 1620, "text_loss": 0.2987039089202881 @@ -15407,32 +15407,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.061767578125, + "grad_norm": 0.06689453125, "learning_rate": 0.0009702040270275204, - "loss": 0.018, + "loss": 0.0181, "macro_f1": 0.3333333432674408, "num_tokens": 2619606.0, "repeat_count": 0.0, - "routers_loss": 0.007642311509698629, + "routers_loss": 0.0065958453342318535, "skip_count": 0.0, "step": 1622, "text_loss": 0.6262096166610718 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 7.62459641913707, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.10595703125, + "f1_skip": 1.0, + "grad_norm": 0.103515625, "learning_rate": 0.000970098686447375, - "loss": 0.0258, - "macro_f1": 0.3272727429866791, + "loss": 0.0257, + "macro_f1": 0.6666666865348816, "num_tokens": 2622499.0, "repeat_count": 0.0, - "routers_loss": 0.016890225932002068, + "routers_loss": 0.013632026500999928, "skip_count": 1.0, "step": 1624, "text_loss": 0.2392602562904358 @@ -15445,13 +15445,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1240234375, + "grad_norm": 0.125, "learning_rate": 0.0009699931657245264, - "loss": 0.0242, + "loss": 0.0245, "macro_f1": 0.5492662787437439, "num_tokens": 2626002.0, "repeat_count": 0.0, - "routers_loss": 0.010900186374783516, + "routers_loss": 0.012147823348641396, "skip_count": 2.0, "step": 1626, "text_loss": 0.4742976129055023 @@ -15464,13 +15464,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0810546875, + "grad_norm": 0.0849609375, "learning_rate": 0.0009698874648994098, - "loss": 0.0279, + "loss": 0.0285, "macro_f1": 1.0, "num_tokens": 2629847.0, "repeat_count": 1.0, - "routers_loss": 0.011229799129068851, + "routers_loss": 0.010692884214222431, "skip_count": 3.0, "step": 1628, "text_loss": 0.5090685486793518 @@ -15483,13 +15483,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.1240234375, "learning_rate": 0.0009697815840125304, - "loss": 0.0275, + "loss": 0.0265, "macro_f1": 0.3333333432674408, "num_tokens": 2633529.0, "repeat_count": 0.0, - "routers_loss": 0.0105878422036767, + "routers_loss": 0.011442207731306553, "skip_count": 0.0, "step": 1630, "text_loss": 0.1874329298734665 @@ -15502,13 +15502,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.2119140625, "learning_rate": 0.0009696755231044618, - "loss": 0.0209, + "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 2636321.0, "repeat_count": 0.0, - "routers_loss": 0.002953991526737809, + "routers_loss": 0.0026681360322982073, "skip_count": 0.0, "step": 1632, "text_loss": 0.7650400400161743 @@ -15521,13 +15521,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10888671875, + "grad_norm": 0.10498046875, "learning_rate": 0.0009695692822158466, - "loss": 0.0241, + "loss": 0.0242, "macro_f1": 0.3272727429866791, "num_tokens": 2638840.0, "repeat_count": 1.0, - "routers_loss": 0.04717390984296799, + "routers_loss": 0.033965807408094406, "skip_count": 0.0, "step": 1634, "text_loss": 0.6175784468650818 @@ -15540,13 +15540,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.058349609375, "learning_rate": 0.0009694628613873968, - "loss": 0.0179, + "loss": 0.018, "macro_f1": 0.3333333432674408, "num_tokens": 2641886.0, "repeat_count": 0.0, - "routers_loss": 0.0073657832108438015, + "routers_loss": 0.007568214554339647, "skip_count": 0.0, "step": 1636, "text_loss": 0.43139931559562683 @@ -15559,13 +15559,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.193359375, "learning_rate": 0.0009693562606598929, - "loss": 0.0259, + "loss": 0.025, "macro_f1": 0.3333333432674408, "num_tokens": 2645028.0, "repeat_count": 0.0, - "routers_loss": 0.005212752148509026, + "routers_loss": 0.004973865579813719, "skip_count": 0.0, "step": 1638, "text_loss": 0.6430339217185974 @@ -15578,13 +15578,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.06982421875, "learning_rate": 0.0009692494800741844, - "loss": 0.0304, + "loss": 0.0313, "macro_f1": 0.3272727429866791, "num_tokens": 2648209.0, "repeat_count": 1.0, - "routers_loss": 0.04311618581414223, + "routers_loss": 0.049863800406455994, "skip_count": 0.0, "step": 1640, "text_loss": 0.28138160705566406 @@ -15597,13 +15597,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08251953125, + "grad_norm": 0.08544921875, "learning_rate": 0.0009691425196711901, - "loss": 0.039, + "loss": 0.0398, "macro_f1": 0.3272727429866791, "num_tokens": 2651171.0, "repeat_count": 0.0, - "routers_loss": 0.02027471922338009, + "routers_loss": 0.02112230286002159, "skip_count": 0.0, "step": 1642, "text_loss": 0.3745322525501251 @@ -15616,13 +15616,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.0703125, "learning_rate": 0.0009690353794918971, - "loss": 0.0279, + "loss": 0.0275, "macro_f1": 0.3333333432674408, "num_tokens": 2654093.0, "repeat_count": 0.0, - "routers_loss": 0.003074956126511097, + "routers_loss": 0.0024304776452481747, "skip_count": 0.0, "step": 1644, "text_loss": 0.4275154173374176 @@ -15635,13 +15635,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.0771484375, "learning_rate": 0.000968928059577362, - "loss": 0.0241, + "loss": 0.0244, "macro_f1": 0.6666666865348816, "num_tokens": 2657079.0, "repeat_count": 0.0, - "routers_loss": 0.009374706074595451, + "routers_loss": 0.009320619516074657, "skip_count": 1.0, "step": 1646, "text_loss": 0.46650025248527527 @@ -15654,13 +15654,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1162109375, + "grad_norm": 0.09814453125, "learning_rate": 0.0009688205599687099, - "loss": 0.0218, + "loss": 0.0209, "macro_f1": 0.3272727429866791, "num_tokens": 2660951.0, "repeat_count": 0.0, - "routers_loss": 0.01204691268503666, + "routers_loss": 0.011913162656128407, "skip_count": 0.0, "step": 1648, "text_loss": 0.46644100546836853 @@ -15673,13 +15673,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.1083984375, "learning_rate": 0.0009687128807071347, "loss": 0.0284, "macro_f1": 0.3333333432674408, "num_tokens": 2663823.0, "repeat_count": 0.0, - "routers_loss": 0.01376053225249052, + "routers_loss": 0.013754756189882755, "skip_count": 0.0, "step": 1650, "text_loss": 0.40808847546577454 @@ -15692,13 +15692,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.103515625, "learning_rate": 0.0009686050218338996, - "loss": 0.0285, + "loss": 0.0286, "macro_f1": 0.3333333432674408, "num_tokens": 2667079.0, "repeat_count": 0.0, - "routers_loss": 0.009346984326839447, + "routers_loss": 0.009099726565182209, "skip_count": 0.0, "step": 1652, "text_loss": 0.2389989197254181 @@ -15711,13 +15711,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.08837890625, "learning_rate": 0.0009684969833903359, - "loss": 0.0291, + "loss": 0.0283, "macro_f1": 0.6666666865348816, "num_tokens": 2670162.0, "repeat_count": 0.0, - "routers_loss": 0.002724624238908291, + "routers_loss": 0.0034928603563457727, "skip_count": 1.0, "step": 1654, "text_loss": 0.6930749416351318 @@ -15730,13 +15730,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.123046875, + "grad_norm": 0.10888671875, "learning_rate": 0.0009683887654178445, - "loss": 0.0271, + "loss": 0.0261, "macro_f1": 0.6666666865348816, "num_tokens": 2673031.0, "repeat_count": 0.0, - "routers_loss": 0.00823777075856924, + "routers_loss": 0.008340462110936642, "skip_count": 1.0, "step": 1656, "text_loss": 0.277752548456192 @@ -15749,32 +15749,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07373046875, + "grad_norm": 0.06884765625, "learning_rate": 0.0009682803679578947, - "loss": 0.0262, + "loss": 0.0259, "macro_f1": 0.3333333432674408, "num_tokens": 2676092.0, "repeat_count": 0.0, - "routers_loss": 0.004393119364976883, + "routers_loss": 0.004337446764111519, "skip_count": 0.0, "step": 1658, "text_loss": 0.5176776051521301 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 7.7936601115350745, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.1513671875, + "f1_skip": 0.0, + "grad_norm": 0.169921875, "learning_rate": 0.0009681717910520244, - "loss": 0.024, - "macro_f1": 0.5492662787437439, + "loss": 0.0242, + "macro_f1": 0.32098764181137085, "num_tokens": 2679479.0, "repeat_count": 0.0, - "routers_loss": 0.031827569007873535, + "routers_loss": 0.034611742943525314, "skip_count": 2.0, "step": 1660, "text_loss": 0.21485982835292816 @@ -15789,11 +15789,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.07958984375, "learning_rate": 0.0009680630347418406, - "loss": 0.0216, + "loss": 0.022, "macro_f1": 0.5492662787437439, "num_tokens": 2683289.0, "repeat_count": 0.0, - "routers_loss": 0.03329647704958916, + "routers_loss": 0.03297121450304985, "skip_count": 2.0, "step": 1662, "text_loss": 0.33801013231277466 @@ -15806,13 +15806,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1728515625, "learning_rate": 0.000967954099069019, - "loss": 0.0415, + "loss": 0.0411, "macro_f1": 0.32098764181137085, "num_tokens": 2685879.0, "repeat_count": 1.0, - "routers_loss": 0.047317031770944595, + "routers_loss": 0.04551183059811592, "skip_count": 1.0, "step": 1664, "text_loss": 0.41123488545417786 @@ -15827,11 +15827,11 @@ "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.0009678449840753038, - "loss": 0.0325, + "loss": 0.0324, "macro_f1": 0.32098764181137085, "num_tokens": 2688910.0, "repeat_count": 0.0, - "routers_loss": 0.05649980902671814, + "routers_loss": 0.05866450071334839, "skip_count": 2.0, "step": 1666, "text_loss": 0.1740892380475998 @@ -15844,13 +15844,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.09228515625, "learning_rate": 0.0009677356898025082, - "loss": 0.0229, + "loss": 0.023, "macro_f1": 0.3333333432674408, "num_tokens": 2691680.0, "repeat_count": 0.0, - "routers_loss": 0.01004624180495739, + "routers_loss": 0.009243223816156387, "skip_count": 0.0, "step": 1668, "text_loss": 0.2512350380420685 @@ -15863,13 +15863,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.09619140625, "learning_rate": 0.000967626216292514, - "loss": 0.0194, + "loss": 0.0195, "macro_f1": 0.3333333432674408, "num_tokens": 2694895.0, "repeat_count": 0.0, - "routers_loss": 0.0054973396472632885, + "routers_loss": 0.005576452240347862, "skip_count": 0.0, "step": 1670, "text_loss": 0.43294376134872437 @@ -15882,13 +15882,13 @@ "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.09619140625, + "grad_norm": 0.09130859375, "learning_rate": 0.0009675165635872715, - "loss": 0.031, + "loss": 0.0306, "macro_f1": 0.44705885648727417, "num_tokens": 2697806.0, "repeat_count": 0.0, - "routers_loss": 0.05615650862455368, + "routers_loss": 0.05372785031795502, "skip_count": 3.0, "step": 1672, "text_loss": 0.1614082306623459 @@ -15901,13 +15901,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.11669921875, "learning_rate": 0.0009674067317288, - "loss": 0.0301, + "loss": 0.0296, "macro_f1": 0.6666666865348816, "num_tokens": 2700529.0, "repeat_count": 1.0, - "routers_loss": 0.012819192372262478, + "routers_loss": 0.018131591379642487, "skip_count": 0.0, "step": 1674, "text_loss": 0.2093173861503601 @@ -15920,13 +15920,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.08203125, "learning_rate": 0.0009672967207591869, - "loss": 0.0253, + "loss": 0.0257, "macro_f1": 0.3272727429866791, "num_tokens": 2703650.0, "repeat_count": 0.0, - "routers_loss": 0.07059332728385925, + "routers_loss": 0.0673515796661377, "skip_count": 1.0, "step": 1676, "text_loss": 0.3029400110244751 @@ -15939,13 +15939,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.11669921875, "learning_rate": 0.0009671865307205892, - "loss": 0.0198, + "loss": 0.021, "macro_f1": 0.32098767161369324, "num_tokens": 2707615.0, "repeat_count": 0.0, - "routers_loss": 0.029778441414237022, + "routers_loss": 0.03821169584989548, "skip_count": 1.0, "step": 1678, "text_loss": 0.2262786477804184 @@ -15958,13 +15958,13 @@ "f1_execute": 0.9756097793579102, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, - "grad_norm": 0.1416015625, + "grad_norm": 0.1396484375, "learning_rate": 0.0009670761616552315, - "loss": 0.0474, + "loss": 0.0465, "macro_f1": 0.9615669250488281, "num_tokens": 2710894.0, "repeat_count": 2.0, - "routers_loss": 0.04371272772550583, + "routers_loss": 0.042625464498996735, "skip_count": 6.0, "step": 1680, "text_loss": 0.29623574018478394 @@ -15977,13 +15977,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.169921875, "learning_rate": 0.0009669656136054074, - "loss": 0.0293, + "loss": 0.0289, "macro_f1": 0.3333333432674408, "num_tokens": 2714330.0, "repeat_count": 0.0, - "routers_loss": 0.0033591394312679768, + "routers_loss": 0.0037571541033685207, "skip_count": 0.0, "step": 1682, "text_loss": 0.7510389089584351 @@ -15996,13 +15996,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.072265625, + "grad_norm": 0.07421875, "learning_rate": 0.0009668548866134795, - "loss": 0.0259, + "loss": 0.0256, "macro_f1": 0.3333333432674408, "num_tokens": 2717176.0, "repeat_count": 0.0, - "routers_loss": 0.005085585173219442, + "routers_loss": 0.004142968449741602, "skip_count": 0.0, "step": 1684, "text_loss": 0.3273485600948334 @@ -16015,13 +16015,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0712890625, + "grad_norm": 0.07373046875, "learning_rate": 0.0009667439807218783, - "loss": 0.0243, + "loss": 0.0233, "macro_f1": 0.6666666865348816, "num_tokens": 2720628.0, "repeat_count": 0.0, - "routers_loss": 0.008569681085646152, + "routers_loss": 0.008753842674195766, "skip_count": 2.0, "step": 1686, "text_loss": 0.4314708709716797 @@ -16034,32 +16034,32 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.0732421875, "learning_rate": 0.0009666328959731033, - "loss": 0.022, + "loss": 0.0211, "macro_f1": 0.6603773832321167, "num_tokens": 2723739.0, "repeat_count": 1.0, - "routers_loss": 0.024587804451584816, + "routers_loss": 0.022674910724163055, "skip_count": 1.0, "step": 1688, "text_loss": 0.25734150409698486 }, { "acc_repeat": 0.0, - "acc_skip": 0.3333333432674408, - "avg_layers": 27.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, "epoch": 7.934546521866745, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, - "f1_skip": 0.5, - "grad_norm": 0.169921875, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1552734375, "learning_rate": 0.0009665216324097222, - "loss": 0.0332, - "macro_f1": 0.4871794879436493, + "loss": 0.0324, + "macro_f1": 0.5934640765190125, "num_tokens": 2726644.0, "repeat_count": 0.0, - "routers_loss": 0.037516288459300995, + "routers_loss": 0.03932750225067139, "skip_count": 3.0, "step": 1690, "text_loss": 0.24511034786701202 @@ -16072,13 +16072,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.09765625, "learning_rate": 0.0009664101900743714, - "loss": 0.0262, + "loss": 0.0255, "macro_f1": 0.3272727429866791, "num_tokens": 2729662.0, "repeat_count": 0.0, - "routers_loss": 0.01287431176751852, + "routers_loss": 0.012672754004597664, "skip_count": 1.0, "step": 1692, "text_loss": 0.39431414008140564 @@ -16091,13 +16091,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.07763671875, + "grad_norm": 0.076171875, "learning_rate": 0.000966298569009756, - "loss": 0.0227, + "loss": 0.0231, "macro_f1": 0.5492662787437439, "num_tokens": 2732578.0, "repeat_count": 0.0, - "routers_loss": 0.015499880537390709, + "routers_loss": 0.01548632513731718, "skip_count": 2.0, "step": 1694, "text_loss": 0.12439999729394913 @@ -16110,13 +16110,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.080078125, + "grad_norm": 0.0849609375, "learning_rate": 0.0009661867692586494, - "loss": 0.0144, + "loss": 0.0153, "macro_f1": 0.32098764181137085, "num_tokens": 2735887.0, "repeat_count": 0.0, - "routers_loss": 0.049878787249326706, + "routers_loss": 0.05622401833534241, "skip_count": 2.0, "step": 1696, "text_loss": 0.29024389386177063 @@ -16129,13 +16129,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10009765625, + "grad_norm": 0.087890625, "learning_rate": 0.0009660747908638933, - "loss": 0.0206, + "loss": 0.0205, "macro_f1": 0.3272727429866791, "num_tokens": 2739293.0, "repeat_count": 0.0, - "routers_loss": 0.04108169302344322, + "routers_loss": 0.041060201823711395, "skip_count": 1.0, "step": 1698, "text_loss": 0.39461007714271545 @@ -16148,13 +16148,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.1767578125, "learning_rate": 0.0009659626338683981, - "loss": 0.0367, + "loss": 0.0369, "macro_f1": 0.3333333432674408, "num_tokens": 2742468.0, "repeat_count": 0.0, - "routers_loss": 0.007651917636394501, + "routers_loss": 0.007251353468745947, "skip_count": 0.0, "step": 1700, "text_loss": 0.2751767635345459 @@ -16167,13 +16167,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.07763671875, "learning_rate": 0.0009658502983151427, - "loss": 0.0182, + "loss": 0.0186, "macro_f1": 0.3272727429866791, "num_tokens": 2745123.0, "repeat_count": 0.0, - "routers_loss": 0.015448091551661491, + "routers_loss": 0.012847424484789371, "skip_count": 1.0, "step": 1702, "text_loss": 0.4756404757499695 @@ -16186,13 +16186,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.11767578125, "learning_rate": 0.0009657377842471742, - "loss": 0.0324, + "loss": 0.0313, "macro_f1": 0.6666666865348816, "num_tokens": 2748016.0, "repeat_count": 0.0, - "routers_loss": 0.009139287285506725, + "routers_loss": 0.007060411386191845, "skip_count": 1.0, "step": 1704, "text_loss": 0.9571210145950317 @@ -16205,13 +16205,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0869140625, + "grad_norm": 0.10009765625, "learning_rate": 0.0009656250917076081, - "loss": 0.0191, + "loss": 0.0188, "macro_f1": 0.5492662787437439, "num_tokens": 2750717.0, "repeat_count": 0.0, - "routers_loss": 0.015412120148539543, + "routers_loss": 0.016748681664466858, "skip_count": 2.0, "step": 1706, "text_loss": 0.14542843401432037 @@ -16224,13 +16224,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.060302734375, "learning_rate": 0.0009655122207396285, - "loss": 0.0175, + "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 2753635.0, "repeat_count": 0.0, - "routers_loss": 0.012735052965581417, + "routers_loss": 0.013607042841613293, "skip_count": 0.0, "step": 1708, "text_loss": 0.21836471557617188 @@ -16243,13 +16243,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07177734375, + "grad_norm": 0.0732421875, "learning_rate": 0.0009653991713864878, - "loss": 0.0192, + "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2756643.0, "repeat_count": 0.0, - "routers_loss": 0.00114025070797652, + "routers_loss": 0.0012097888393327594, "skip_count": 0.0, "step": 1710, "text_loss": 0.635187029838562 @@ -16262,13 +16262,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1171875, "learning_rate": 0.0009652859436915066, - "loss": 0.0243, + "loss": 0.0231, "macro_f1": 0.3333333432674408, "num_tokens": 2759432.0, "repeat_count": 0.0, - "routers_loss": 0.006401443853974342, + "routers_loss": 0.006196760106831789, "skip_count": 0.0, "step": 1712, "text_loss": 0.5629420876502991 @@ -16281,13 +16281,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.0615234375, "learning_rate": 0.0009651725376980743, - "loss": 0.0185, + "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 2762538.0, "repeat_count": 0.0, - "routers_loss": 0.004316259175539017, + "routers_loss": 0.0042513771913945675, "skip_count": 0.0, "step": 1714, "text_loss": 0.39522525668144226 @@ -16300,13 +16300,13 @@ "f1_execute": 0.9583333134651184, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.125, + "grad_norm": 0.1494140625, "learning_rate": 0.0009650589534496479, - "loss": 0.0201, + "loss": 0.0194, "macro_f1": 0.8194444179534912, "num_tokens": 2765571.0, "repeat_count": 2.0, - "routers_loss": 0.043461959809064865, + "routers_loss": 0.03596706688404083, "skip_count": 3.0, "step": 1716, "text_loss": 0.6252416968345642 @@ -16319,13 +16319,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.04833984375, "learning_rate": 0.0009649451909897532, "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 2769206.0, "repeat_count": 0.0, - "routers_loss": 0.0024530428927391768, + "routers_loss": 0.0025788163766264915, "skip_count": 0.0, "step": 1718, "text_loss": 0.8851634860038757 @@ -16338,13 +16338,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1015625, + "grad_norm": 0.10791015625, "learning_rate": 0.0009648312503619843, - "loss": 0.026, + "loss": 0.0265, "macro_f1": 0.3333333432674408, "num_tokens": 2772488.0, "repeat_count": 0.0, - "routers_loss": 0.0046626063995063305, + "routers_loss": 0.004443451762199402, "skip_count": 0.0, "step": 1720, "text_loss": 0.8568580746650696 @@ -16357,13 +16357,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.1513671875, + "grad_norm": 0.1552734375, "learning_rate": 0.0009647171316100034, - "loss": 0.0257, + "loss": 0.0265, "macro_f1": 0.9265305995941162, "num_tokens": 2776482.0, "repeat_count": 1.0, - "routers_loss": 0.02480102889239788, + "routers_loss": 0.022948263213038445, "skip_count": 3.0, "step": 1722, "text_loss": 0.13431036472320557 @@ -16376,13 +16376,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.126953125, + "grad_norm": 0.1572265625, "learning_rate": 0.0009646028347775409, - "loss": 0.02, + "loss": 0.0204, "macro_f1": 0.6666666865348816, "num_tokens": 2778966.0, "repeat_count": 0.0, - "routers_loss": 0.012629947625100613, + "routers_loss": 0.011328035034239292, "skip_count": 1.0, "step": 1724, "text_loss": 0.2085491120815277 @@ -16395,13 +16395,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08447265625, + "grad_norm": 0.08984375, "learning_rate": 0.0009644883599083958, "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2781968.0, "repeat_count": 0.0, - "routers_loss": 0.0024127380456775427, + "routers_loss": 0.002208018908277154, "skip_count": 0.0, "step": 1726, "text_loss": 0.4948323965072632 @@ -16414,13 +16414,13 @@ "f1_execute": 0.9411764740943909, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.054443359375, + "grad_norm": 0.062255859375, "learning_rate": 0.0009643737070464349, - "loss": 0.0162, + "loss": 0.0158, "macro_f1": 0.6470588445663452, "num_tokens": 2784666.0, "repeat_count": 1.0, - "routers_loss": 0.0415453165769577, + "routers_loss": 0.04391832649707794, "skip_count": 2.0, "step": 1728, "text_loss": 0.39060094952583313 @@ -16433,13 +16433,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.046630859375, "learning_rate": 0.0009642588762355935, - "loss": 0.0211, + "loss": 0.0212, "macro_f1": 0.6666666865348816, "num_tokens": 2787558.0, "repeat_count": 0.0, - "routers_loss": 0.0056681083515286446, + "routers_loss": 0.004497280344367027, "skip_count": 1.0, "step": 1730, "text_loss": 0.34908708930015564 @@ -16452,13 +16452,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.07275390625, "learning_rate": 0.0009641438675198748, - "loss": 0.0189, + "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 2790474.0, "repeat_count": 0.0, - "routers_loss": 0.006391602102667093, + "routers_loss": 0.00583475548774004, "skip_count": 0.0, "step": 1732, "text_loss": 0.5720033049583435 @@ -16471,13 +16471,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0595703125, + "grad_norm": 0.08154296875, "learning_rate": 0.0009640286809433508, - "loss": 0.0229, + "loss": 0.0235, "macro_f1": 0.3333333432674408, "num_tokens": 2793272.0, "repeat_count": 0.0, - "routers_loss": 0.007466991897672415, + "routers_loss": 0.007826375775039196, "skip_count": 0.0, "step": 1734, "text_loss": 0.32181721925735474 @@ -16490,13 +16490,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.05419921875, "learning_rate": 0.0009639133165501606, - "loss": 0.0197, + "loss": 0.0192, "macro_f1": 0.3333333432674408, "num_tokens": 2797726.0, "repeat_count": 0.0, - "routers_loss": 0.001953453291207552, + "routers_loss": 0.0019055595621466637, "skip_count": 0.0, "step": 1736, "text_loss": 0.620936393737793 @@ -16509,13 +16509,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.134765625, "learning_rate": 0.0009637977743845124, - "loss": 0.0223, + "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2800706.0, "repeat_count": 0.0, - "routers_loss": 0.003612719476222992, + "routers_loss": 0.0028302327264100313, "skip_count": 0.0, "step": 1738, "text_loss": 0.6473138332366943 @@ -16528,13 +16528,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.049072265625, + "grad_norm": 0.0634765625, "learning_rate": 0.0009636820544906823, - "loss": 0.0145, + "loss": 0.0146, "macro_f1": 1.0, "num_tokens": 2803847.0, "repeat_count": 1.0, - "routers_loss": 0.009977150708436966, + "routers_loss": 0.01105099730193615, "skip_count": 2.0, "step": 1740, "text_loss": 0.4401201903820038 @@ -16547,13 +16547,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.10791015625, + "grad_norm": 0.1455078125, "learning_rate": 0.0009635661569130141, "loss": 0.0195, "macro_f1": 0.5934640765190125, "num_tokens": 2807235.0, "repeat_count": 0.0, - "routers_loss": 0.026468059048056602, + "routers_loss": 0.02619045600295067, "skip_count": 3.0, "step": 1742, "text_loss": 0.459264874458313 @@ -16566,13 +16566,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.06396484375, "learning_rate": 0.0009634500816959202, - "loss": 0.0165, + "loss": 0.0162, "macro_f1": 0.6666666865348816, "num_tokens": 2810396.0, "repeat_count": 0.0, - "routers_loss": 0.00849854201078415, + "routers_loss": 0.007915694266557693, "skip_count": 2.0, "step": 1744, "text_loss": 0.5084020495414734 @@ -16585,13 +16585,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.1748046875, "learning_rate": 0.0009633338288838805, - "loss": 0.0275, + "loss": 0.0271, "macro_f1": 0.5492662787437439, "num_tokens": 2813215.0, "repeat_count": 2.0, - "routers_loss": 0.08082596957683563, + "routers_loss": 0.08364596217870712, "skip_count": 0.0, "step": 1746, "text_loss": 0.27681824564933777 @@ -16604,13 +16604,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.046142578125, + "grad_norm": 0.051025390625, "learning_rate": 0.0009632173985214438, - "loss": 0.015, + "loss": 0.0156, "macro_f1": 0.8817967176437378, "num_tokens": 2816452.0, "repeat_count": 3.0, - "routers_loss": 0.029500717297196388, + "routers_loss": 0.028805451467633247, "skip_count": 2.0, "step": 1748, "text_loss": 0.4678419530391693 @@ -16623,13 +16623,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.0625, "learning_rate": 0.000963100790653226, - "loss": 0.0183, + "loss": 0.0188, "macro_f1": 0.3272727429866791, "num_tokens": 2819364.0, "repeat_count": 0.0, - "routers_loss": 0.025238536298274994, + "routers_loss": 0.03056817688047886, "skip_count": 1.0, "step": 1750, "text_loss": 0.3078109920024872 @@ -16642,13 +16642,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0703125, + "grad_norm": 0.06689453125, "learning_rate": 0.0009629840053239116, - "loss": 0.0204, + "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2823469.0, "repeat_count": 0.0, - "routers_loss": 0.002069319598376751, + "routers_loss": 0.0019477814203128219, "skip_count": 0.0, "step": 1752, "text_loss": 0.45501336455345154 @@ -16661,13 +16661,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05224609375, + "grad_norm": 0.057373046875, "learning_rate": 0.000962867042578253, - "loss": 0.0169, + "loss": 0.0173, "macro_f1": 0.3333333432674408, "num_tokens": 2826716.0, "repeat_count": 0.0, - "routers_loss": 0.002853946527466178, + "routers_loss": 0.0032963966950774193, "skip_count": 0.0, "step": 1754, "text_loss": 0.49234694242477417 @@ -16680,13 +16680,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0947265625, "learning_rate": 0.0009627499024610707, - "loss": 0.0236, + "loss": 0.0239, "macro_f1": 0.3272727429866791, "num_tokens": 2829733.0, "repeat_count": 0.0, - "routers_loss": 0.0100983502343297, + "routers_loss": 0.010289114899933338, "skip_count": 1.0, "step": 1756, "text_loss": 0.22335539758205414 @@ -16699,13 +16699,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09228515625, + "grad_norm": 0.0888671875, "learning_rate": 0.0009626325850172527, - "loss": 0.0173, + "loss": 0.0174, "macro_f1": 0.3272727429866791, "num_tokens": 2833350.0, "repeat_count": 0.0, - "routers_loss": 0.031218983232975006, + "routers_loss": 0.03249066323041916, "skip_count": 1.0, "step": 1758, "text_loss": 0.6581931114196777 @@ -16718,13 +16718,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.0703125, "learning_rate": 0.0009625150902917555, - "loss": 0.019, + "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 2836558.0, "repeat_count": 0.0, - "routers_loss": 0.010347879491746426, + "routers_loss": 0.00870000571012497, "skip_count": 0.0, "step": 1760, "text_loss": 0.22938725352287292 @@ -16737,13 +16737,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1455078125, + "grad_norm": 0.1259765625, "learning_rate": 0.0009623974183296031, - "loss": 0.0193, + "loss": 0.0192, "macro_f1": 0.3333333432674408, "num_tokens": 2840560.0, "repeat_count": 0.0, - "routers_loss": 0.007768871728330851, + "routers_loss": 0.007767196744680405, "skip_count": 0.0, "step": 1762, "text_loss": 0.24473799765110016 @@ -16756,13 +16756,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.09228515625, "learning_rate": 0.0009622795691758876, - "loss": 0.0253, + "loss": 0.0244, "macro_f1": 0.3333333432674408, "num_tokens": 2843548.0, "repeat_count": 0.0, - "routers_loss": 0.002887974726036191, + "routers_loss": 0.0021693643648177385, "skip_count": 0.0, "step": 1764, "text_loss": 0.3084608018398285 @@ -16777,11 +16777,11 @@ "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.0009621615428757693, - "loss": 0.0147, + "loss": 0.0149, "macro_f1": 0.3333333432674408, "num_tokens": 2847076.0, "repeat_count": 0.0, - "routers_loss": 0.0027294005267322063, + "routers_loss": 0.0024727333802729845, "skip_count": 0.0, "step": 1766, "text_loss": 0.5251734852790833 @@ -16794,13 +16794,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.0673828125, "learning_rate": 0.000962043339474476, - "loss": 0.0193, + "loss": 0.0194, "macro_f1": 0.3333333432674408, "num_tokens": 2849751.0, "repeat_count": 0.0, - "routers_loss": 0.00543541694059968, + "routers_loss": 0.005174890160560608, "skip_count": 0.0, "step": 1768, "text_loss": 0.4410129189491272 @@ -16813,13 +16813,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.06103515625, "learning_rate": 0.0009619249590173032, - "loss": 0.0167, + "loss": 0.016, "macro_f1": 0.6666666865348816, "num_tokens": 2853916.0, "repeat_count": 0.0, - "routers_loss": 0.006514009553939104, + "routers_loss": 0.006785830482840538, "skip_count": 2.0, "step": 1770, "text_loss": 0.550076425075531 @@ -16832,13 +16832,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.06396484375, + "grad_norm": 0.06591796875, "learning_rate": 0.0009618064015496149, - "loss": 0.019, + "loss": 0.0192, "macro_f1": 0.5934640765190125, "num_tokens": 2857372.0, "repeat_count": 0.0, - "routers_loss": 0.02333846502006054, + "routers_loss": 0.021370256319642067, "skip_count": 3.0, "step": 1772, "text_loss": 0.1988629847764969 @@ -16851,13 +16851,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0732421875, + "grad_norm": 0.072265625, "learning_rate": 0.0009616876671168423, - "loss": 0.0165, + "loss": 0.0162, "macro_f1": 0.6666666865348816, "num_tokens": 2861028.0, "repeat_count": 0.0, - "routers_loss": 0.004471905063837767, + "routers_loss": 0.004313841462135315, "skip_count": 1.0, "step": 1774, "text_loss": 0.42581331729888916 @@ -16870,13 +16870,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.1103515625, "learning_rate": 0.0009615687557644847, - "loss": 0.0261, + "loss": 0.0268, "macro_f1": 0.3333333432674408, "num_tokens": 2864847.0, "repeat_count": 0.0, - "routers_loss": 0.0024362702388316393, + "routers_loss": 0.0025742491707205772, "skip_count": 0.0, "step": 1776, "text_loss": 0.46510905027389526 @@ -16889,13 +16889,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1494140625, "learning_rate": 0.0009614496675381093, - "loss": 0.0116, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 2867392.0, "repeat_count": 0.0, - "routers_loss": 0.0021166049409657717, + "routers_loss": 0.0016813480760902166, "skip_count": 0.0, "step": 1778, "text_loss": 0.5922174453735352 @@ -16908,13 +16908,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0712890625, + "grad_norm": 0.0810546875, "learning_rate": 0.0009613304024833507, "loss": 0.0166, "macro_f1": 0.3333333432674408, "num_tokens": 2871273.0, "repeat_count": 0.0, - "routers_loss": 0.004722296260297298, + "routers_loss": 0.004948933608829975, "skip_count": 0.0, "step": 1780, "text_loss": 0.6776977777481079 @@ -16929,11 +16929,11 @@ "f1_skip": 1.0, "grad_norm": 0.07470703125, "learning_rate": 0.0009612109606459117, - "loss": 0.0199, + "loss": 0.0186, "macro_f1": 1.0, "num_tokens": 2874172.0, "repeat_count": 1.0, - "routers_loss": 0.014188882894814014, + "routers_loss": 0.016950147226452827, "skip_count": 2.0, "step": 1782, "text_loss": 0.48758944869041443 @@ -16946,13 +16946,13 @@ "f1_execute": 0.9599999785423279, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, - "grad_norm": 0.076171875, + "grad_norm": 0.08251953125, "learning_rate": 0.0009610913420715623, - "loss": 0.0241, + "loss": 0.0237, "macro_f1": 0.7644444704055786, "num_tokens": 2877528.0, "repeat_count": 2.0, - "routers_loss": 0.04599560424685478, + "routers_loss": 0.04880943149328232, "skip_count": 1.0, "step": 1784, "text_loss": 0.4404778480529785 @@ -16965,13 +16965,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.06201171875, "learning_rate": 0.0009609715468061411, - "loss": 0.0216, + "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2880627.0, "repeat_count": 0.0, - "routers_loss": 0.004942454397678375, + "routers_loss": 0.004678630735725164, "skip_count": 0.0, "step": 1786, "text_loss": 0.7295402884483337 @@ -16984,13 +16984,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08349609375, + "grad_norm": 0.07958984375, "learning_rate": 0.0009608515748955535, - "loss": 0.021, + "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2883333.0, "repeat_count": 0.0, - "routers_loss": 0.0020542226266115904, + "routers_loss": 0.0026695074047893286, "skip_count": 0.0, "step": 1788, "text_loss": 0.9697831273078918 @@ -17003,13 +17003,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.1171875, + "grad_norm": 0.107421875, "learning_rate": 0.000960731426385773, - "loss": 0.0155, + "loss": 0.0157, "macro_f1": 0.4871794879436493, "num_tokens": 2887444.0, "repeat_count": 0.0, - "routers_loss": 0.0397041030228138, + "routers_loss": 0.029743613675236702, "skip_count": 2.0, "step": 1790, "text_loss": 0.4737568199634552 @@ -17022,13 +17022,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.103515625, + "grad_norm": 0.10107421875, "learning_rate": 0.0009606111013228407, - "loss": 0.0204, + "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 2890221.0, "repeat_count": 0.0, - "routers_loss": 0.0017490010941401124, + "routers_loss": 0.0016153788892552257, "skip_count": 0.0, "step": 1792, "text_loss": 0.6693558096885681 @@ -17041,13 +17041,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08251953125, + "grad_norm": 0.08349609375, "learning_rate": 0.0009604905997528655, - "loss": 0.021, + "loss": 0.02, "macro_f1": 0.3272727429866791, "num_tokens": 2893262.0, "repeat_count": 0.0, - "routers_loss": 0.023590171709656715, + "routers_loss": 0.01965433731675148, "skip_count": 1.0, "step": 1794, "text_loss": 0.45227760076522827 @@ -17060,13 +17060,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.103515625, + "grad_norm": 0.08642578125, "learning_rate": 0.0009603699217220239, - "loss": 0.0125, + "loss": 0.0117, "macro_f1": 0.6601307392120361, "num_tokens": 2896823.0, "repeat_count": 1.0, - "routers_loss": 0.02458076737821102, + "routers_loss": 0.024017298594117165, "skip_count": 2.0, "step": 1796, "text_loss": 0.48865509033203125 @@ -17079,13 +17079,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.08837890625, "learning_rate": 0.0009602490672765597, - "loss": 0.019, + "loss": 0.0182, "macro_f1": 0.3333333432674408, "num_tokens": 2899707.0, "repeat_count": 0.0, - "routers_loss": 0.0014341498026624322, + "routers_loss": 0.0012420224957168102, "skip_count": 0.0, "step": 1798, "text_loss": 0.43292415142059326 @@ -17098,13 +17098,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08056640625, + "grad_norm": 0.07861328125, "learning_rate": 0.0009601280364627848, - "loss": 0.02, + "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 2902795.0, "repeat_count": 0.0, - "routers_loss": 0.00213223067112267, + "routers_loss": 0.0020389219280332327, "skip_count": 0.0, "step": 1800, "text_loss": 0.41021591424942017 @@ -17117,13 +17117,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07275390625, + "grad_norm": 0.06689453125, "learning_rate": 0.0009600068293270783, - "loss": 0.0147, + "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 2905769.0, "repeat_count": 0.0, - "routers_loss": 0.0027340995147824287, + "routers_loss": 0.002006303984671831, "skip_count": 0.0, "step": 1802, "text_loss": 0.46892106533050537 @@ -17136,32 +17136,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.08740234375, "learning_rate": 0.000959885445915887, - "loss": 0.0172, + "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 2909475.0, "repeat_count": 0.0, - "routers_loss": 0.0035587961319833994, + "routers_loss": 0.003734810510650277, "skip_count": 0.0, "step": 1804, "text_loss": 0.45364710688591003 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.5, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 8.479013795127678, - "f1_execute": 0.9615384340286255, - "f1_repeat": 0.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09765625, + "grad_norm": 0.11669921875, "learning_rate": 0.0009597638862757254, - "loss": 0.0187, - "macro_f1": 0.5427350401878357, + "loss": 0.0182, + "macro_f1": 0.8823530077934265, "num_tokens": 2914348.0, "repeat_count": 1.0, - "routers_loss": 0.04446055367588997, + "routers_loss": 0.038971323519945145, "skip_count": 2.0, "step": 1806, "text_loss": 0.42913779616355896 @@ -17174,13 +17174,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08447265625, + "grad_norm": 0.080078125, "learning_rate": 0.0009596421504531751, - "loss": 0.0244, + "loss": 0.0249, "macro_f1": 0.3272727429866791, "num_tokens": 2917467.0, "repeat_count": 1.0, - "routers_loss": 0.05095123499631882, + "routers_loss": 0.04800829663872719, "skip_count": 0.0, "step": 1808, "text_loss": 0.17332297563552856 @@ -17193,13 +17193,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.10693359375, + "grad_norm": 0.1083984375, "learning_rate": 0.0009595202384948858, - "loss": 0.0232, + "loss": 0.0227, "macro_f1": 0.6666666865348816, "num_tokens": 2920223.0, "repeat_count": 1.0, - "routers_loss": 0.008440068922936916, + "routers_loss": 0.009164143353700638, "skip_count": 0.0, "step": 1810, "text_loss": 0.33740702271461487 @@ -17212,13 +17212,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0927734375, + "grad_norm": 0.0947265625, "learning_rate": 0.0009593981504475742, - "loss": 0.0273, + "loss": 0.0275, "macro_f1": 0.6666666865348816, "num_tokens": 2923780.0, "repeat_count": 0.0, - "routers_loss": 0.012230116873979568, + "routers_loss": 0.011236993595957756, "skip_count": 2.0, "step": 1812, "text_loss": 0.1609916388988495 @@ -17231,13 +17231,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1005859375, + "grad_norm": 0.10595703125, "learning_rate": 0.0009592758863580248, - "loss": 0.026, + "loss": 0.0259, "macro_f1": 0.5492662787437439, "num_tokens": 2926259.0, "repeat_count": 0.0, - "routers_loss": 0.017307188361883163, + "routers_loss": 0.019026532769203186, "skip_count": 2.0, "step": 1814, "text_loss": 0.6460903882980347 @@ -17250,13 +17250,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.099609375, + "grad_norm": 0.09814453125, "learning_rate": 0.0009591534462730894, - "loss": 0.0215, + "loss": 0.0206, "macro_f1": 0.5492662787437439, "num_tokens": 2929173.0, "repeat_count": 2.0, - "routers_loss": 0.07191162556409836, + "routers_loss": 0.0608333982527256, "skip_count": 0.0, "step": 1816, "text_loss": 0.476126492023468 @@ -17269,13 +17269,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.06640625, "learning_rate": 0.000959030830239687, - "loss": 0.0182, + "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 2932703.0, "repeat_count": 0.0, - "routers_loss": 0.008753604255616665, + "routers_loss": 0.0093300249427557, "skip_count": 0.0, "step": 1818, "text_loss": 0.5471875667572021 @@ -17288,13 +17288,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19921875, + "grad_norm": 0.2001953125, "learning_rate": 0.0009589080383048048, - "loss": 0.0233, + "loss": 0.0235, "macro_f1": 0.3333333432674408, "num_tokens": 2936195.0, "repeat_count": 0.0, - "routers_loss": 0.008390828967094421, + "routers_loss": 0.010434109717607498, "skip_count": 0.0, "step": 1820, "text_loss": 0.5068115592002869 @@ -17307,13 +17307,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.0986328125, "learning_rate": 0.0009587850705154964, "loss": 0.0291, "macro_f1": 0.3333333432674408, "num_tokens": 2939412.0, "repeat_count": 0.0, - "routers_loss": 0.005617359187453985, + "routers_loss": 0.004347751382738352, "skip_count": 0.0, "step": 1822, "text_loss": 0.4241984784603119 @@ -17326,13 +17326,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.0859375, "learning_rate": 0.0009586619269188836, - "loss": 0.0227, + "loss": 0.0224, "macro_f1": 0.32098767161369324, "num_tokens": 2942318.0, "repeat_count": 0.0, - "routers_loss": 0.0346846878528595, + "routers_loss": 0.034238871186971664, "skip_count": 1.0, "step": 1824, "text_loss": 0.2328975349664688 @@ -17345,32 +17345,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.11181640625, "learning_rate": 0.0009585386075621553, "loss": 0.027, "macro_f1": 0.3333333432674408, "num_tokens": 2945731.0, "repeat_count": 0.0, - "routers_loss": 0.006601692643016577, + "routers_loss": 0.006097695790231228, "skip_count": 0.0, "step": 1826, "text_loss": 0.22816994786262512 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 8.582330496037569, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.08837890625, + "f1_skip": 0.0, + "grad_norm": 0.0908203125, "learning_rate": 0.0009584151124925676, - "loss": 0.0207, - "macro_f1": 0.6666666865348816, + "loss": 0.0208, + "macro_f1": 0.3272727429866791, "num_tokens": 2948944.0, "repeat_count": 0.0, - "routers_loss": 0.0065619745291769505, + "routers_loss": 0.007790776435285807, "skip_count": 1.0, "step": 1828, "text_loss": 0.5009413361549377 @@ -17383,13 +17383,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.07275390625, "learning_rate": 0.0009582914417574438, - "loss": 0.0149, + "loss": 0.0145, "macro_f1": 0.6666666865348816, "num_tokens": 2951723.0, "repeat_count": 0.0, - "routers_loss": 0.011109639890491962, + "routers_loss": 0.009144559502601624, "skip_count": 2.0, "step": 1830, "text_loss": 0.1402502954006195 @@ -17404,11 +17404,11 @@ "f1_skip": 0.0, "grad_norm": 0.06201171875, "learning_rate": 0.0009581675954041751, - "loss": 0.0167, + "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 2954726.0, "repeat_count": 1.0, - "routers_loss": 0.008432094007730484, + "routers_loss": 0.006593191530555487, "skip_count": 0.0, "step": 1832, "text_loss": 0.4871736466884613 @@ -17421,13 +17421,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0859375, + "grad_norm": 0.0869140625, "learning_rate": 0.0009580435734802196, - "loss": 0.0208, + "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 2957853.0, "repeat_count": 0.0, - "routers_loss": 0.011518111452460289, + "routers_loss": 0.01241068821400404, "skip_count": 0.0, "step": 1834, "text_loss": 0.30100154876708984 @@ -17440,13 +17440,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.1298828125, "learning_rate": 0.0009579193760331027, - "loss": 0.0211, + "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 2960783.0, "repeat_count": 0.0, - "routers_loss": 0.0026744187343865633, + "routers_loss": 0.002219218760728836, "skip_count": 0.0, "step": 1836, "text_loss": 0.4961516559123993 @@ -17459,13 +17459,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.12255859375, "learning_rate": 0.0009577950031104169, - "loss": 0.0165, + "loss": 0.0166, "macro_f1": 0.6601307392120361, "num_tokens": 2963328.0, "repeat_count": 1.0, - "routers_loss": 0.028107430785894394, + "routers_loss": 0.029363535344600677, "skip_count": 2.0, "step": 1838, "text_loss": 0.42814353108406067 @@ -17478,13 +17478,13 @@ "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.095703125, + "grad_norm": 0.1044921875, "learning_rate": 0.0009576704547598226, - "loss": 0.0263, + "loss": 0.0257, "macro_f1": 0.7795917987823486, "num_tokens": 2966108.0, "repeat_count": 1.0, - "routers_loss": 0.060007549822330475, + "routers_loss": 0.0579402856528759, "skip_count": 4.0, "step": 1840, "text_loss": 0.20523512363433838 @@ -17497,13 +17497,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.0625, "learning_rate": 0.0009575457310290463, "loss": 0.0121, "macro_f1": 0.3272727429866791, "num_tokens": 2969137.0, "repeat_count": 0.0, - "routers_loss": 0.01074182614684105, + "routers_loss": 0.008810589089989662, "skip_count": 0.0, "step": 1842, "text_loss": 0.6199528574943542 @@ -17516,13 +17516,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0732421875, + "grad_norm": 0.0693359375, "learning_rate": 0.0009574208319658831, - "loss": 0.0213, + "loss": 0.0208, "macro_f1": 0.6666666865348816, "num_tokens": 2972407.0, "repeat_count": 0.0, - "routers_loss": 0.0019638657104223967, + "routers_loss": 0.0012295129708945751, "skip_count": 1.0, "step": 1844, "text_loss": 0.66938316822052 @@ -17535,13 +17535,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.1572265625, + "grad_norm": 0.1474609375, "learning_rate": 0.000957295757618194, - "loss": 0.0156, + "loss": 0.0152, "macro_f1": 0.4871794879436493, "num_tokens": 2976045.0, "repeat_count": 0.0, - "routers_loss": 0.06953249871730804, + "routers_loss": 0.06162935495376587, "skip_count": 2.0, "step": 1846, "text_loss": 0.5381782650947571 @@ -17554,13 +17554,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.0830078125, "learning_rate": 0.0009571705080339079, - "loss": 0.0154, + "loss": 0.0144, "macro_f1": 0.3333333432674408, "num_tokens": 2979025.0, "repeat_count": 0.0, - "routers_loss": 0.003563052974641323, + "routers_loss": 0.003950524143874645, "skip_count": 0.0, "step": 1848, "text_loss": 0.5831671357154846 @@ -17573,13 +17573,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.11376953125, "learning_rate": 0.0009570450832610208, - "loss": 0.0216, + "loss": 0.0209, "macro_f1": 0.3333333432674408, "num_tokens": 2982276.0, "repeat_count": 0.0, - "routers_loss": 0.010409255512058735, + "routers_loss": 0.010354886762797832, "skip_count": 0.0, "step": 1850, "text_loss": 0.27448201179504395 @@ -17592,13 +17592,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0625, + "grad_norm": 0.061279296875, "learning_rate": 0.0009569194833475956, - "loss": 0.0195, + "loss": 0.0199, "macro_f1": 0.3272727429866791, "num_tokens": 2985691.0, "repeat_count": 0.0, - "routers_loss": 0.009769548662006855, + "routers_loss": 0.010167439468204975, "skip_count": 0.0, "step": 1852, "text_loss": 0.5264663696289062 @@ -17611,13 +17611,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1181640625, + "grad_norm": 0.1328125, "learning_rate": 0.0009567937083417624, - "loss": 0.0184, + "loss": 0.0194, "macro_f1": 0.3272727429866791, "num_tokens": 2989126.0, "repeat_count": 0.0, - "routers_loss": 0.036616452038288116, + "routers_loss": 0.0371871180832386, "skip_count": 1.0, "step": 1854, "text_loss": 0.2008018046617508 @@ -17630,13 +17630,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.0673828125, "learning_rate": 0.0009566677582917185, - "loss": 0.0192, + "loss": 0.0184, "macro_f1": 0.3333333432674408, "num_tokens": 2992814.0, "repeat_count": 0.0, - "routers_loss": 0.009581349790096283, + "routers_loss": 0.010190588422119617, "skip_count": 0.0, "step": 1856, "text_loss": 0.749717116355896 @@ -17649,13 +17649,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09814453125, + "grad_norm": 0.080078125, "learning_rate": 0.0009565416332457282, - "loss": 0.0138, + "loss": 0.0132, "macro_f1": 0.6538461446762085, "num_tokens": 2995729.0, "repeat_count": 1.0, - "routers_loss": 0.02330300398170948, + "routers_loss": 0.022285036742687225, "skip_count": 1.0, "step": 1858, "text_loss": 0.5870219469070435 @@ -17668,13 +17668,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0771484375, + "grad_norm": 0.07666015625, "learning_rate": 0.0009564153332521228, - "loss": 0.0226, + "loss": 0.0224, "macro_f1": 0.3272727429866791, "num_tokens": 2998812.0, "repeat_count": 0.0, - "routers_loss": 0.011985735036432743, + "routers_loss": 0.011050296947360039, "skip_count": 1.0, "step": 1860, "text_loss": 0.8444408774375916 @@ -17687,13 +17687,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0634765625, + "grad_norm": 0.06005859375, "learning_rate": 0.0009562888583593005, - "loss": 0.0162, + "loss": 0.0163, "macro_f1": 0.3333333432674408, "num_tokens": 3001799.0, "repeat_count": 0.0, - "routers_loss": 0.005997250322252512, + "routers_loss": 0.007125461008399725, "skip_count": 0.0, "step": 1862, "text_loss": 0.41510361433029175 @@ -17706,13 +17706,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.06884765625, "learning_rate": 0.0009561622086157272, - "loss": 0.0243, + "loss": 0.0236, "macro_f1": 0.3333333432674408, "num_tokens": 3005088.0, "repeat_count": 0.0, - "routers_loss": 0.004814761225134134, + "routers_loss": 0.0049054501578211784, "skip_count": 0.0, "step": 1864, "text_loss": 0.3801248073577881 @@ -17725,13 +17725,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.056884765625, + "grad_norm": 0.054443359375, "learning_rate": 0.000956035384069935, - "loss": 0.0242, + "loss": 0.0238, "macro_f1": 1.0, "num_tokens": 3008178.0, "repeat_count": 1.0, - "routers_loss": 0.004750931169837713, + "routers_loss": 0.005162427201867104, "skip_count": 1.0, "step": 1866, "text_loss": 0.2687684893608093 @@ -17744,13 +17744,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1123046875, + "grad_norm": 0.10400390625, "learning_rate": 0.0009559083847705233, - "loss": 0.0216, + "loss": 0.0214, "macro_f1": 0.3272727429866791, "num_tokens": 3010923.0, "repeat_count": 0.0, - "routers_loss": 0.038251202553510666, + "routers_loss": 0.028984658420085907, "skip_count": 1.0, "step": 1868, "text_loss": 0.6277349591255188 @@ -17763,13 +17763,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.06640625, + "grad_norm": 0.08349609375, "learning_rate": 0.0009557812107661584, - "loss": 0.0204, + "loss": 0.0208, "macro_f1": 1.0, "num_tokens": 3015030.0, "repeat_count": 1.0, - "routers_loss": 0.010951942764222622, + "routers_loss": 0.012200530618429184, "skip_count": 1.0, "step": 1870, "text_loss": 0.6293368339538574 @@ -17782,13 +17782,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.11962890625, "learning_rate": 0.0009556538621055739, - "loss": 0.0265, + "loss": 0.0268, "macro_f1": 0.3272727429866791, "num_tokens": 3019067.0, "repeat_count": 0.0, - "routers_loss": 0.06582094728946686, + "routers_loss": 0.06365182995796204, "skip_count": 1.0, "step": 1872, "text_loss": 0.39046618342399597 @@ -17796,18 +17796,18 @@ { "acc_repeat": 0.0, "acc_skip": 1.0, - "avg_layers": 26.0, + "avg_layers": 27.0, "epoch": 8.798356325212797, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.12353515625, + "f1_skip": 1.0, + "grad_norm": 0.115234375, "learning_rate": 0.0009555263388375699, - "loss": 0.0143, - "macro_f1": 0.5492662787437439, + "loss": 0.014, + "macro_f1": 0.6666666865348816, "num_tokens": 3022166.0, "repeat_count": 0.0, - "routers_loss": 0.008920271880924702, + "routers_loss": 0.0041703456081449986, "skip_count": 1.0, "step": 1874, "text_loss": 0.42232340574264526 @@ -17820,13 +17820,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1220703125, + "grad_norm": 0.11572265625, "learning_rate": 0.0009553986410110134, "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3025865.0, "repeat_count": 0.0, - "routers_loss": 0.006444344762712717, + "routers_loss": 0.005841755773872137, "skip_count": 0.0, "step": 1876, "text_loss": 0.37600573897361755 @@ -17839,13 +17839,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.09228515625, "learning_rate": 0.0009552707686748388, - "loss": 0.022, + "loss": 0.0219, "macro_f1": 0.3272727429866791, "num_tokens": 3029950.0, "repeat_count": 0.0, - "routers_loss": 0.05197767913341522, + "routers_loss": 0.05165952071547508, "skip_count": 1.0, "step": 1878, "text_loss": 0.33717799186706543 @@ -17858,13 +17858,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.0849609375, "learning_rate": 0.0009551427218780467, - "loss": 0.0224, + "loss": 0.0219, "macro_f1": 0.6666666865348816, "num_tokens": 3033649.0, "repeat_count": 0.0, - "routers_loss": 0.017570581287145615, + "routers_loss": 0.020680008456110954, "skip_count": 2.0, "step": 1880, "text_loss": 0.5011783838272095 @@ -17877,13 +17877,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.173828125, + "grad_norm": 0.15625, "learning_rate": 0.0009550145006697048, - "loss": 0.0225, + "loss": 0.0217, "macro_f1": 0.32098764181137085, "num_tokens": 3036847.0, "repeat_count": 0.0, - "routers_loss": 0.07106777280569077, + "routers_loss": 0.07626450061798096, "skip_count": 2.0, "step": 1882, "text_loss": 0.3066408336162567 @@ -17896,13 +17896,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.056396484375, "learning_rate": 0.0009548861050989482, - "loss": 0.0139, + "loss": 0.0136, "macro_f1": 1.0, "num_tokens": 3040353.0, "repeat_count": 1.0, - "routers_loss": 0.009862381964921951, + "routers_loss": 0.010884666815400124, "skip_count": 1.0, "step": 1884, "text_loss": 0.49779415130615234 @@ -17915,13 +17915,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0908203125, "learning_rate": 0.0009547575352149778, - "loss": 0.0209, + "loss": 0.0213, "macro_f1": 0.6666666865348816, "num_tokens": 3043504.0, "repeat_count": 0.0, - "routers_loss": 0.006928981747478247, + "routers_loss": 0.006704333238303661, "skip_count": 2.0, "step": 1886, "text_loss": 0.12284614145755768 @@ -17934,13 +17934,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09423828125, + "grad_norm": 0.11474609375, "learning_rate": 0.0009546287910670621, "loss": 0.0211, "macro_f1": 0.5427350401878357, "num_tokens": 3046422.0, "repeat_count": 1.0, - "routers_loss": 0.04788029566407204, + "routers_loss": 0.04799000173807144, "skip_count": 2.0, "step": 1888, "text_loss": 0.1824081838130951 @@ -17953,13 +17953,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1357421875, + "grad_norm": 0.1484375, "learning_rate": 0.0009544998727045361, - "loss": 0.0299, + "loss": 0.0306, "macro_f1": 0.3333333432674408, "num_tokens": 3049819.0, "repeat_count": 0.0, - "routers_loss": 0.008282946422696114, + "routers_loss": 0.008139612153172493, "skip_count": 0.0, "step": 1890, "text_loss": 0.18929053843021393 @@ -17972,32 +17972,32 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.09716796875, + "grad_norm": 0.09375, "learning_rate": 0.0009543707801768015, - "loss": 0.0181, + "loss": 0.0175, "macro_f1": 0.5934640765190125, "num_tokens": 3052766.0, "repeat_count": 0.0, - "routers_loss": 0.03251546248793602, + "routers_loss": 0.02966771461069584, "skip_count": 3.0, "step": 1892, "text_loss": 0.247748002409935 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 24.0, + "acc_skip": 0.5, + "avg_layers": 25.0, "epoch": 8.892280598767243, - "f1_execute": 0.9600000381469727, + "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.06640625, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.06689453125, "learning_rate": 0.0009542415135333267, - "loss": 0.0195, - "macro_f1": 0.542222261428833, + "loss": 0.0193, + "macro_f1": 0.44705885648727417, "num_tokens": 3056427.0, "repeat_count": 0.0, - "routers_loss": 0.03368280455470085, + "routers_loss": 0.03637036308646202, "skip_count": 2.0, "step": 1894, "text_loss": 0.2583999037742615 @@ -18010,13 +18010,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06640625, + "grad_norm": 0.0595703125, "learning_rate": 0.0009541120728236472, - "loss": 0.0133, + "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 3059497.0, "repeat_count": 0.0, - "routers_loss": 0.0069940583780407906, + "routers_loss": 0.007026574574410915, "skip_count": 0.0, "step": 1896, "text_loss": 0.5222375988960266 @@ -18029,13 +18029,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0810546875, + "grad_norm": 0.076171875, "learning_rate": 0.0009539824580973646, - "loss": 0.0221, + "loss": 0.0219, "macro_f1": 0.3333333432674408, "num_tokens": 3062187.0, "repeat_count": 0.0, - "routers_loss": 0.004268508404493332, + "routers_loss": 0.003449335927143693, "skip_count": 0.0, "step": 1898, "text_loss": 0.5736427307128906 @@ -18048,13 +18048,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.05224609375, "learning_rate": 0.0009538526694041477, - "loss": 0.0159, + "loss": 0.0163, "macro_f1": 0.3333333432674408, "num_tokens": 3066100.0, "repeat_count": 0.0, - "routers_loss": 0.0032616283278912306, + "routers_loss": 0.0035463871899992228, "skip_count": 0.0, "step": 1900, "text_loss": 0.5471583604812622 @@ -18067,13 +18067,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.08056640625, + "grad_norm": 0.080078125, "learning_rate": 0.0009537227067937318, - "loss": 0.023, + "loss": 0.0233, "macro_f1": 1.0, "num_tokens": 3068737.0, "repeat_count": 3.0, - "routers_loss": 0.005389219615608454, + "routers_loss": 0.00597514258697629, "skip_count": 3.0, "step": 1902, "text_loss": 0.36644190549850464 @@ -18086,13 +18086,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.166015625, "learning_rate": 0.0009535925703159186, - "loss": 0.0311, + "loss": 0.0301, "macro_f1": 0.32098764181137085, "num_tokens": 3071686.0, "repeat_count": 0.0, - "routers_loss": 0.024814991280436516, + "routers_loss": 0.025420479476451874, "skip_count": 2.0, "step": 1904, "text_loss": 0.535789966583252 @@ -18107,11 +18107,11 @@ "f1_skip": 0.0, "grad_norm": 0.07568359375, "learning_rate": 0.0009534622600205769, - "loss": 0.0151, + "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 3074954.0, "repeat_count": 0.0, - "routers_loss": 0.013415839523077011, + "routers_loss": 0.014377486892044544, "skip_count": 0.0, "step": 1906, "text_loss": 0.19009549915790558 @@ -18124,13 +18124,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.11083984375, "learning_rate": 0.0009533317759576416, - "loss": 0.019, + "loss": 0.0197, "macro_f1": 0.3333333432674408, "num_tokens": 3077540.0, "repeat_count": 0.0, - "routers_loss": 0.005814475007355213, + "routers_loss": 0.004848944488912821, "skip_count": 0.0, "step": 1908, "text_loss": 0.5022001266479492 @@ -18143,13 +18143,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0732421875, + "grad_norm": 0.07470703125, "learning_rate": 0.0009532011181771148, - "loss": 0.0218, + "loss": 0.0217, "macro_f1": 0.6666666865348816, "num_tokens": 3080445.0, "repeat_count": 0.0, - "routers_loss": 0.007621586322784424, + "routers_loss": 0.009480170905590057, "skip_count": 2.0, "step": 1910, "text_loss": 0.35135936737060547 @@ -18162,13 +18162,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.10400390625, "learning_rate": 0.0009530702867290644, - "loss": 0.0178, + "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 3083657.0, "repeat_count": 0.0, - "routers_loss": 0.0020917020738124847, + "routers_loss": 0.0019353039097040892, "skip_count": 0.0, "step": 1912, "text_loss": 0.5123994946479797 @@ -18181,13 +18181,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.123046875, + "grad_norm": 0.1455078125, "learning_rate": 0.0009529392816636256, - "loss": 0.025, + "loss": 0.0249, "macro_f1": 0.3333333432674408, "num_tokens": 3086837.0, "repeat_count": 0.0, - "routers_loss": 0.0010824954370036721, + "routers_loss": 0.0010921972570940852, "skip_count": 0.0, "step": 1914, "text_loss": 0.44477662444114685 @@ -18200,13 +18200,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.19140625, "learning_rate": 0.0009528081030309995, - "loss": 0.0353, + "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 3089892.0, "repeat_count": 0.0, - "routers_loss": 0.0018075350672006607, + "routers_loss": 0.0018027103506028652, "skip_count": 0.0, "step": 1916, "text_loss": 0.7356183528900146 @@ -18219,13 +18219,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07958984375, + "grad_norm": 0.07568359375, "learning_rate": 0.0009526767508814542, - "loss": 0.0235, + "loss": 0.0236, "macro_f1": 0.3333333432674408, "num_tokens": 3093058.0, "repeat_count": 0.0, - "routers_loss": 0.0032930250745266676, + "routers_loss": 0.003243023296818137, "skip_count": 0.0, "step": 1918, "text_loss": 0.48823556303977966 @@ -18238,13 +18238,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08837890625, + "grad_norm": 0.080078125, "learning_rate": 0.0009525452252653239, - "loss": 0.0184, + "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 3096404.0, "repeat_count": 0.0, - "routers_loss": 0.009042349644005299, + "routers_loss": 0.009360014460980892, "skip_count": 0.0, "step": 1920, "text_loss": 0.21498437225818634 @@ -18257,13 +18257,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.1103515625, + "grad_norm": 0.140625, "learning_rate": 0.0009524135262330098, - "loss": 0.022, + "loss": 0.0224, "macro_f1": 0.9265305995941162, "num_tokens": 3099520.0, "repeat_count": 1.0, - "routers_loss": 0.016776500269770622, + "routers_loss": 0.017444295808672905, "skip_count": 3.0, "step": 1922, "text_loss": 0.27608850598335266 @@ -18276,13 +18276,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05029296875, + "grad_norm": 0.050537109375, "learning_rate": 0.0009522816538349789, - "loss": 0.016, + "loss": 0.0162, "macro_f1": 0.5492662787437439, "num_tokens": 3102956.0, "repeat_count": 0.0, - "routers_loss": 0.06579705327749252, + "routers_loss": 0.06424452364444733, "skip_count": 2.0, "step": 1924, "text_loss": 0.21558666229248047 @@ -18295,13 +18295,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.058349609375, + "grad_norm": 0.05224609375, "learning_rate": 0.0009521496081217651, - "loss": 0.0113, + "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 3106565.0, "repeat_count": 1.0, - "routers_loss": 0.0022786022163927555, + "routers_loss": 0.002270506462082267, "skip_count": 0.0, "step": 1926, "text_loss": 0.5641813278198242 @@ -18314,13 +18314,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.09033203125, + "grad_norm": 0.095703125, "learning_rate": 0.0009520173891439684, "loss": 0.0216, "macro_f1": 0.6666666865348816, "num_tokens": 3109314.0, "repeat_count": 0.0, - "routers_loss": 0.01074281521141529, + "routers_loss": 0.011512448079884052, "skip_count": 1.0, "step": 1928, "text_loss": 0.6351624727249146 @@ -18333,13 +18333,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.0830078125, "learning_rate": 0.0009518849969522556, - "loss": 0.0201, + "loss": 0.0198, "macro_f1": 0.3333333432674408, "num_tokens": 3112956.0, "repeat_count": 0.0, - "routers_loss": 0.0032052614260464907, + "routers_loss": 0.003883908037096262, "skip_count": 0.0, "step": 1930, "text_loss": 0.35160085558891296 @@ -18352,32 +18352,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.10888671875, "learning_rate": 0.0009517524315973595, - "loss": 0.0186, + "loss": 0.019, "macro_f1": 1.0, "num_tokens": 3115593.0, "repeat_count": 1.0, - "routers_loss": 0.008593574166297913, + "routers_loss": 0.009479222819209099, "skip_count": 3.0, "step": 1932, "text_loss": 0.2900560200214386 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 9.079835632521279, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.07373046875, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, "learning_rate": 0.0009516196931300794, - "loss": 0.0152, - "macro_f1": 0.5492662787437439, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, "num_tokens": 3118516.0, "repeat_count": 0.0, - "routers_loss": 0.0201246440410614, + "routers_loss": 0.017834696918725967, "skip_count": 2.0, "step": 1934, "text_loss": 0.20094378292560577 @@ -18390,13 +18390,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1357421875, + "grad_norm": 0.12890625, "learning_rate": 0.0009514867816012809, - "loss": 0.0199, + "loss": 0.02, "macro_f1": 0.3333333432674408, "num_tokens": 3122242.0, "repeat_count": 0.0, - "routers_loss": 0.001721356064081192, + "routers_loss": 0.0017964740982279181, "skip_count": 0.0, "step": 1936, "text_loss": 0.6498590707778931 @@ -18409,13 +18409,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.049072265625, + "grad_norm": 0.048828125, "learning_rate": 0.0009513536970618961, - "loss": 0.0135, + "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 3125645.0, "repeat_count": 0.0, - "routers_loss": 0.010442634113132954, + "routers_loss": 0.007437168620526791, "skip_count": 2.0, "step": 1938, "text_loss": 0.25863033533096313 @@ -18428,13 +18428,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.058349609375, + "grad_norm": 0.0625, "learning_rate": 0.0009512204395629232, - "loss": 0.019, + "loss": 0.0184, "macro_f1": 0.6666666865348816, "num_tokens": 3128740.0, "repeat_count": 0.0, - "routers_loss": 0.0009493798715993762, + "routers_loss": 0.0008759932243265212, "skip_count": 1.0, "step": 1940, "text_loss": 0.5638351440429688 @@ -18447,13 +18447,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05517578125, + "grad_norm": 0.06884765625, "learning_rate": 0.0009510870091554264, - "loss": 0.0149, + "loss": 0.0153, "macro_f1": 0.3272727429866791, "num_tokens": 3131742.0, "repeat_count": 1.0, - "routers_loss": 0.022104881703853607, + "routers_loss": 0.019906625151634216, "skip_count": 0.0, "step": 1942, "text_loss": 0.8410717844963074 @@ -18466,13 +18466,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.12255859375, "learning_rate": 0.0009509534058905369, - "loss": 0.0164, + "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3134407.0, "repeat_count": 0.0, - "routers_loss": 0.0009013625676743686, + "routers_loss": 0.0009229081333614886, "skip_count": 0.0, "step": 1944, "text_loss": 0.47506049275398254 @@ -18485,13 +18485,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06103515625, + "grad_norm": 0.0576171875, "learning_rate": 0.0009508196298194517, - "loss": 0.0121, + "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3137053.0, "repeat_count": 0.0, - "routers_loss": 0.0028069843538105488, + "routers_loss": 0.003630586201325059, "skip_count": 0.0, "step": 1946, "text_loss": 0.32225799560546875 @@ -18504,13 +18504,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.08349609375, "learning_rate": 0.0009506856809934338, - "loss": 0.0116, + "loss": 0.0119, "macro_f1": 0.3333333432674408, "num_tokens": 3140943.0, "repeat_count": 0.0, - "routers_loss": 0.006877045147120953, + "routers_loss": 0.007580445148050785, "skip_count": 0.0, "step": 1948, "text_loss": 0.3120577931404114 @@ -18523,13 +18523,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.050048828125, "learning_rate": 0.0009505515594638127, - "loss": 0.0127, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 3144298.0, "repeat_count": 0.0, - "routers_loss": 0.004543667659163475, + "routers_loss": 0.004471861757338047, "skip_count": 0.0, "step": 1950, "text_loss": 0.22052447497844696 @@ -18542,13 +18542,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.09130859375, "learning_rate": 0.0009504172652819843, - "loss": 0.0232, + "loss": 0.023, "macro_f1": 1.0, "num_tokens": 3147069.0, "repeat_count": 1.0, - "routers_loss": 0.007053609937429428, + "routers_loss": 0.009606664068996906, "skip_count": 1.0, "step": 1952, "text_loss": 0.34773921966552734 @@ -18561,13 +18561,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0537109375, + "grad_norm": 0.0625, "learning_rate": 0.0009502827984994099, - "loss": 0.0146, + "loss": 0.0148, "macro_f1": 0.6666666865348816, "num_tokens": 3149992.0, "repeat_count": 0.0, - "routers_loss": 0.006783280987292528, + "routers_loss": 0.006443799939006567, "skip_count": 1.0, "step": 1954, "text_loss": 0.6442171335220337 @@ -18580,13 +18580,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06640625, + "grad_norm": 0.0673828125, "learning_rate": 0.0009501481591676177, - "loss": 0.0181, + "loss": 0.0188, "macro_f1": 0.3333333432674408, "num_tokens": 3153167.0, "repeat_count": 0.0, - "routers_loss": 0.002531677018851042, + "routers_loss": 0.003219039412215352, "skip_count": 0.0, "step": 1956, "text_loss": 0.43369221687316895 @@ -18599,32 +18599,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.078125, + "grad_norm": 0.07470703125, "learning_rate": 0.000950013347338202, - "loss": 0.0154, + "loss": 0.0152, "macro_f1": 0.3272727429866791, "num_tokens": 3156590.0, "repeat_count": 0.0, - "routers_loss": 0.027040868997573853, + "routers_loss": 0.025551019236445427, "skip_count": 1.0, "step": 1958, "text_loss": 0.294479101896286 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 1.0, - "avg_layers": 26.0, + "avg_layers": 27.0, "epoch": 9.201937188142061, - "f1_execute": 0.9803921580314636, - "f1_repeat": 0.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.1142578125, + "grad_norm": 0.1630859375, "learning_rate": 0.0009498783630628225, - "loss": 0.0154, - "macro_f1": 0.6601307392120361, + "loss": 0.0158, + "macro_f1": 1.0, "num_tokens": 3159451.0, "repeat_count": 1.0, - "routers_loss": 0.01573321223258972, + "routers_loss": 0.013802438974380493, "skip_count": 2.0, "step": 1960, "text_loss": 0.20888492465019226 @@ -18637,13 +18637,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06689453125, + "grad_norm": 0.07666015625, "learning_rate": 0.0009497432063932057, - "loss": 0.0135, + "loss": 0.0137, "macro_f1": 0.6601307392120361, "num_tokens": 3162889.0, "repeat_count": 1.0, - "routers_loss": 0.02442278526723385, + "routers_loss": 0.02852988988161087, "skip_count": 2.0, "step": 1962, "text_loss": 0.5027125477790833 @@ -18656,13 +18656,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.046630859375, + "grad_norm": 0.045166015625, "learning_rate": 0.0009496078773811437, - "loss": 0.0142, + "loss": 0.0136, "macro_f1": 0.6666666865348816, "num_tokens": 3165979.0, "repeat_count": 0.0, - "routers_loss": 0.018267054110765457, + "routers_loss": 0.01784522272646427, "skip_count": 2.0, "step": 1964, "text_loss": 0.1696339100599289 @@ -18675,13 +18675,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0625, + "grad_norm": 0.060302734375, "learning_rate": 0.000949472376078495, - "loss": 0.0162, + "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3168683.0, "repeat_count": 0.0, - "routers_loss": 0.0016024474753066897, + "routers_loss": 0.0017019887454807758, "skip_count": 0.0, "step": 1966, "text_loss": 0.48905447125434875 @@ -18694,13 +18694,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.052978515625, + "grad_norm": 0.051025390625, "learning_rate": 0.000949336702537184, - "loss": 0.011, + "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 3171968.0, "repeat_count": 0.0, - "routers_loss": 0.004668849054723978, + "routers_loss": 0.004817947279661894, "skip_count": 2.0, "step": 1968, "text_loss": 0.20984773337841034 @@ -18713,13 +18713,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.05419921875, "learning_rate": 0.0009492008568092007, - "loss": 0.0098, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 3175947.0, "repeat_count": 0.0, - "routers_loss": 0.0011657609138637781, + "routers_loss": 0.0012963006738573313, "skip_count": 0.0, "step": 1970, "text_loss": 0.5215106010437012 @@ -18732,13 +18732,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.04248046875, + "grad_norm": 0.044921875, "learning_rate": 0.0009490648389466019, - "loss": 0.0133, + "loss": 0.0135, "macro_f1": 0.4871794879436493, "num_tokens": 3179348.0, "repeat_count": 0.0, - "routers_loss": 0.03806794434785843, + "routers_loss": 0.03950481489300728, "skip_count": 2.0, "step": 1972, "text_loss": 0.24640929698944092 @@ -18751,13 +18751,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08837890625, + "grad_norm": 0.09326171875, "learning_rate": 0.0009489286490015097, - "loss": 0.0189, + "loss": 0.0183, "macro_f1": 0.6666666865348816, "num_tokens": 3182640.0, "repeat_count": 0.0, - "routers_loss": 0.005107097327709198, + "routers_loss": 0.0043345349840819836, "skip_count": 2.0, "step": 1974, "text_loss": 0.6362852454185486 @@ -18770,13 +18770,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.078125, + "grad_norm": 0.07958984375, "learning_rate": 0.0009487922870261122, - "loss": 0.0156, + "loss": 0.0155, "macro_f1": 0.3333333432674408, "num_tokens": 3185657.0, "repeat_count": 0.0, - "routers_loss": 0.0013696947135031223, + "routers_loss": 0.0015687479171901941, "skip_count": 0.0, "step": 1976, "text_loss": 0.8977144360542297 @@ -18789,13 +18789,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0634765625, + "grad_norm": 0.061279296875, "learning_rate": 0.0009486557530726638, - "loss": 0.0136, + "loss": 0.0139, "macro_f1": 0.3333333432674408, "num_tokens": 3188772.0, "repeat_count": 0.0, - "routers_loss": 0.0012224154779687524, + "routers_loss": 0.0010977238416671753, "skip_count": 0.0, "step": 1978, "text_loss": 0.38512736558914185 @@ -18808,13 +18808,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09423828125, + "grad_norm": 0.11279296875, "learning_rate": 0.0009485190471934844, "loss": 0.0196, "macro_f1": 0.6666666865348816, "num_tokens": 3193131.0, "repeat_count": 2.0, - "routers_loss": 0.0030119111761450768, + "routers_loss": 0.002264744369313121, "skip_count": 0.0, "step": 1980, "text_loss": 0.4171289801597595 @@ -18827,13 +18827,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.09033203125, "learning_rate": 0.00094838216944096, - "loss": 0.0222, + "loss": 0.0219, "macro_f1": 0.3272727429866791, "num_tokens": 3196668.0, "repeat_count": 0.0, - "routers_loss": 0.04286033287644386, + "routers_loss": 0.042320676147937775, "skip_count": 1.0, "step": 1982, "text_loss": 0.19008000195026398 @@ -18846,32 +18846,32 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.053466796875, + "grad_norm": 0.052490234375, "learning_rate": 0.0009482451198675424, - "loss": 0.0158, + "loss": 0.0151, "macro_f1": 0.32098767161369324, "num_tokens": 3200282.0, "repeat_count": 0.0, - "routers_loss": 0.019988590851426125, + "routers_loss": 0.01796630397439003, "skip_count": 1.0, "step": 1984, "text_loss": 0.5009249448776245 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 9.324038743762841, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.0634765625, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, "learning_rate": 0.0009481078985257494, - "loss": 0.0154, - "macro_f1": 0.3272727429866791, + "loss": 0.0147, + "macro_f1": 0.6666666865348816, "num_tokens": 3204439.0, "repeat_count": 0.0, - "routers_loss": 0.012215938419103622, + "routers_loss": 0.01052347756922245, "skip_count": 1.0, "step": 1986, "text_loss": 0.15319275856018066 @@ -18884,13 +18884,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07861328125, + "grad_norm": 0.0732421875, "learning_rate": 0.0009479705054681644, - "loss": 0.0149, + "loss": 0.015, "macro_f1": 0.3076923191547394, "num_tokens": 3207590.0, "repeat_count": 1.0, - "routers_loss": 0.10747655481100082, + "routers_loss": 0.09640293568372726, "skip_count": 3.0, "step": 1988, "text_loss": 0.3654652535915375 @@ -18903,13 +18903,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.068359375, + "grad_norm": 0.06689453125, "learning_rate": 0.0009478329407474366, - "loss": 0.0186, + "loss": 0.0183, "macro_f1": 0.5492662787437439, "num_tokens": 3211172.0, "repeat_count": 0.0, - "routers_loss": 0.016109853982925415, + "routers_loss": 0.012670112773776054, "skip_count": 1.0, "step": 1990, "text_loss": 0.5817596316337585 @@ -18922,13 +18922,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.05859375, "learning_rate": 0.000947695204416281, - "loss": 0.0116, + "loss": 0.0121, "macro_f1": 0.6666666865348816, "num_tokens": 3214050.0, "repeat_count": 1.0, - "routers_loss": 0.006929324474185705, + "routers_loss": 0.005263707600533962, "skip_count": 0.0, "step": 1992, "text_loss": 0.5985888242721558 @@ -18941,13 +18941,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.0634765625, "learning_rate": 0.0009475572965274787, - "loss": 0.0147, + "loss": 0.0144, "macro_f1": 0.3272727429866791, "num_tokens": 3217318.0, "repeat_count": 1.0, - "routers_loss": 0.0715102106332779, + "routers_loss": 0.0682850033044815, "skip_count": 0.0, "step": 1994, "text_loss": 0.316506564617157 @@ -18960,13 +18960,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.052490234375, + "grad_norm": 0.0595703125, "learning_rate": 0.000947419217133876, - "loss": 0.0187, + "loss": 0.019, "macro_f1": 0.6666666865348816, "num_tokens": 3220012.0, "repeat_count": 0.0, - "routers_loss": 0.008499355986714363, + "routers_loss": 0.008508823812007904, "skip_count": 2.0, "step": 1996, "text_loss": 0.09665893763303757 @@ -18979,13 +18979,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.048583984375, + "grad_norm": 0.053466796875, "learning_rate": 0.0009472809662883852, - "loss": 0.0162, + "loss": 0.0155, "macro_f1": 1.0, "num_tokens": 3223019.0, "repeat_count": 1.0, - "routers_loss": 0.012003371492028236, + "routers_loss": 0.01100847590714693, "skip_count": 2.0, "step": 1998, "text_loss": 0.4938808083534241 @@ -18998,13 +18998,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.0625, + "grad_norm": 0.06396484375, "learning_rate": 0.0009471425440439844, - "loss": 0.0137, + "loss": 0.0135, "macro_f1": 0.8817967176437378, "num_tokens": 3226013.0, "repeat_count": 2.0, - "routers_loss": 0.0529167577624321, + "routers_loss": 0.04953207075595856, "skip_count": 3.0, "step": 2000, "text_loss": 0.22258254885673523 @@ -19017,13 +19017,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.076171875, + "grad_norm": 0.07568359375, "learning_rate": 0.0009470039504537173, - "loss": 0.0185, + "loss": 0.0186, "macro_f1": 0.31446540355682373, "num_tokens": 3230031.0, "repeat_count": 0.0, - "routers_loss": 0.05719539523124695, + "routers_loss": 0.052884332835674286, "skip_count": 2.0, "step": 2002, "text_loss": 0.1741616576910019 @@ -19038,11 +19038,11 @@ "f1_skip": 0.0, "grad_norm": 0.0869140625, "learning_rate": 0.0009468651855706931, - "loss": 0.0205, + "loss": 0.0204, "macro_f1": 0.6666666865348816, "num_tokens": 3232991.0, "repeat_count": 1.0, - "routers_loss": 0.007613501511514187, + "routers_loss": 0.008056716993451118, "skip_count": 0.0, "step": 2004, "text_loss": 0.3173636198043823 @@ -19055,13 +19055,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.0654296875, "learning_rate": 0.0009467262494480868, - "loss": 0.014, + "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 3236390.0, "repeat_count": 0.0, - "routers_loss": 0.005654903594404459, + "routers_loss": 0.0053409393876791, "skip_count": 0.0, "step": 2006, "text_loss": 0.5806330442428589 @@ -19074,13 +19074,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07958984375, + "grad_norm": 0.068359375, "learning_rate": 0.000946587142139139, - "loss": 0.0152, + "loss": 0.0147, "macro_f1": 0.3333333432674408, "num_tokens": 3239267.0, "repeat_count": 0.0, - "routers_loss": 0.001680699409916997, + "routers_loss": 0.0015652200672775507, "skip_count": 0.0, "step": 2008, "text_loss": 0.6214317679405212 @@ -19093,13 +19093,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.11376953125, "learning_rate": 0.000946447863697156, - "loss": 0.0171, + "loss": 0.0151, "macro_f1": 0.6601307392120361, "num_tokens": 3242569.0, "repeat_count": 1.0, - "routers_loss": 0.014179535210132599, + "routers_loss": 0.011673987843096256, "skip_count": 2.0, "step": 2010, "text_loss": 0.532565712928772 @@ -19112,13 +19112,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.04345703125, "learning_rate": 0.0009463084141755093, - "loss": 0.0157, + "loss": 0.0159, "macro_f1": 0.3272727429866791, "num_tokens": 3245669.0, "repeat_count": 0.0, - "routers_loss": 0.026209332048892975, + "routers_loss": 0.028480790555477142, "skip_count": 1.0, "step": 2012, "text_loss": 0.25210800766944885 @@ -19131,13 +19131,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08349609375, + "grad_norm": 0.0869140625, "learning_rate": 0.0009461687936276364, - "loss": 0.0134, + "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3248751.0, "repeat_count": 0.0, - "routers_loss": 0.008315940387547016, + "routers_loss": 0.007234727032482624, "skip_count": 0.0, "step": 2014, "text_loss": 0.35922971367836 @@ -19150,13 +19150,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.06689453125, + "grad_norm": 0.068359375, "learning_rate": 0.0009460290021070402, - "loss": 0.0197, + "loss": 0.0195, "macro_f1": 0.6666666865348816, "num_tokens": 3252614.0, "repeat_count": 1.0, - "routers_loss": 0.01872348040342331, + "routers_loss": 0.014691276475787163, "skip_count": 0.0, "step": 2016, "text_loss": 0.2747853398323059 @@ -19169,13 +19169,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05126953125, + "grad_norm": 0.051513671875, "learning_rate": 0.0009458890396672888, "loss": 0.0186, "macro_f1": 0.3333333432674408, "num_tokens": 3256374.0, "repeat_count": 0.0, - "routers_loss": 0.0024314222391694784, + "routers_loss": 0.002385235857218504, "skip_count": 0.0, "step": 2018, "text_loss": 0.5268719792366028 @@ -19188,13 +19188,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.052978515625, + "grad_norm": 0.04443359375, "learning_rate": 0.0009457489063620164, - "loss": 0.0137, + "loss": 0.0133, "macro_f1": 0.8823530077934265, "num_tokens": 3259792.0, "repeat_count": 1.0, - "routers_loss": 0.04815426841378212, + "routers_loss": 0.047268565744161606, "skip_count": 2.0, "step": 2020, "text_loss": 0.7785539627075195 @@ -19207,13 +19207,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.13671875, + "grad_norm": 0.1494140625, "learning_rate": 0.0009456086022449221, - "loss": 0.0209, + "loss": 0.0218, "macro_f1": 0.3272727429866791, "num_tokens": 3262833.0, "repeat_count": 0.0, - "routers_loss": 0.015121756121516228, + "routers_loss": 0.015878718346357346, "skip_count": 1.0, "step": 2022, "text_loss": 0.42270028591156006 @@ -19226,32 +19226,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10546875, + "grad_norm": 0.08935546875, "learning_rate": 0.0009454681273697711, - "loss": 0.0122, + "loss": 0.0117, "macro_f1": 0.3272727429866791, "num_tokens": 3265718.0, "repeat_count": 1.0, - "routers_loss": 0.030219297856092453, + "routers_loss": 0.030749641358852386, "skip_count": 0.0, "step": 2024, "text_loss": 0.18668225407600403 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 9.511887290871735, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.05419921875, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, "learning_rate": 0.0009453274817903931, - "loss": 0.0132, - "macro_f1": 0.3272727429866791, + "loss": 0.012, + "macro_f1": 0.6666666865348816, "num_tokens": 3268158.0, "repeat_count": 0.0, - "routers_loss": 0.013256299309432507, + "routers_loss": 0.011538166552782059, "skip_count": 1.0, "step": 2026, "text_loss": 0.34090787172317505 @@ -19264,13 +19264,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11572265625, + "grad_norm": 0.099609375, "learning_rate": 0.000945186665560684, - "loss": 0.0232, + "loss": 0.0218, "macro_f1": 0.3333333432674408, "num_tokens": 3271082.0, "repeat_count": 0.0, - "routers_loss": 0.009389489889144897, + "routers_loss": 0.009527760557830334, "skip_count": 0.0, "step": 2028, "text_loss": 0.2110334187746048 @@ -19283,13 +19283,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1142578125, + "grad_norm": 0.119140625, "learning_rate": 0.000945045678734605, - "loss": 0.0178, + "loss": 0.0175, "macro_f1": 0.3144654333591461, "num_tokens": 3273488.0, "repeat_count": 0.0, - "routers_loss": 0.03916877508163452, + "routers_loss": 0.03317151218652725, "skip_count": 3.0, "step": 2030, "text_loss": 0.2233227640390396 @@ -19302,13 +19302,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11474609375, + "grad_norm": 0.12451171875, "learning_rate": 0.0009449045213661822, - "loss": 0.0215, + "loss": 0.0201, "macro_f1": 0.3272727429866791, "num_tokens": 3276646.0, "repeat_count": 0.0, - "routers_loss": 0.019781047478318214, + "routers_loss": 0.018510591238737106, "skip_count": 1.0, "step": 2032, "text_loss": 0.16100332140922546 @@ -19321,13 +19321,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.11474609375, + "grad_norm": 0.1318359375, "learning_rate": 0.0009447631935095077, - "loss": 0.0193, + "loss": 0.0185, "macro_f1": 0.9452888369560242, "num_tokens": 3279441.0, "repeat_count": 1.0, - "routers_loss": 0.02645993046462536, + "routers_loss": 0.028113311156630516, "skip_count": 4.0, "step": 2034, "text_loss": 0.29208317399024963 @@ -19340,13 +19340,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.051025390625, "learning_rate": 0.0009446216952187384, - "loss": 0.0168, + "loss": 0.0164, "macro_f1": 0.3333333432674408, "num_tokens": 3282697.0, "repeat_count": 0.0, - "routers_loss": 0.008575125597417355, + "routers_loss": 0.008379172533750534, "skip_count": 0.0, "step": 2036, "text_loss": 0.16026398539543152 @@ -19359,13 +19359,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.06298828125, "learning_rate": 0.0009444800265480967, - "loss": 0.0184, + "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 3285574.0, "repeat_count": 0.0, - "routers_loss": 0.01042154710739851, + "routers_loss": 0.00941354501992464, "skip_count": 0.0, "step": 2038, "text_loss": 0.29523080587387085 @@ -19378,13 +19378,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.8571428656578064, "f1_skip": 0.800000011920929, - "grad_norm": 0.07568359375, + "grad_norm": 0.076171875, "learning_rate": 0.0009443381875518703, - "loss": 0.0206, + "loss": 0.0197, "macro_f1": 0.8600732684135437, "num_tokens": 3289159.0, "repeat_count": 4.0, - "routers_loss": 0.05496715381741524, + "routers_loss": 0.04974055662751198, "skip_count": 6.0, "step": 2040, "text_loss": 0.23033179342746735 @@ -19397,13 +19397,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0625, + "grad_norm": 0.0537109375, "learning_rate": 0.0009441961782844123, - "loss": 0.0149, + "loss": 0.0146, "macro_f1": 0.3272727429866791, "num_tokens": 3293598.0, "repeat_count": 0.0, - "routers_loss": 0.021722445264458656, + "routers_loss": 0.022241825237870216, "skip_count": 1.0, "step": 2042, "text_loss": 0.8299165368080139 @@ -19416,13 +19416,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.072265625, + "grad_norm": 0.0673828125, "learning_rate": 0.0009440539988001408, - "loss": 0.0161, + "loss": 0.0159, "macro_f1": 0.3333333432674408, "num_tokens": 3296648.0, "repeat_count": 0.0, - "routers_loss": 0.011090370826423168, + "routers_loss": 0.011019332334399223, "skip_count": 0.0, "step": 2044, "text_loss": 0.18207129836082458 @@ -19435,13 +19435,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.041259765625, "learning_rate": 0.0009439116491535394, - "loss": 0.0123, + "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 3300058.0, "repeat_count": 0.0, - "routers_loss": 0.00327755743637681, + "routers_loss": 0.002889640862122178, "skip_count": 0.0, "step": 2046, "text_loss": 0.7051978707313538 @@ -19454,13 +19454,13 @@ "f1_execute": 0.9333333373069763, "f1_repeat": 0.5, "f1_skip": 0.8571428656578064, - "grad_norm": 0.08154296875, + "grad_norm": 0.078125, "learning_rate": 0.0009437691293991563, - "loss": 0.0198, + "loss": 0.0192, "macro_f1": 0.7634921073913574, "num_tokens": 3303296.0, "repeat_count": 3.0, - "routers_loss": 0.0807223841547966, + "routers_loss": 0.07741832733154297, "skip_count": 4.0, "step": 2048, "text_loss": 0.15563532710075378 @@ -19473,13 +19473,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.09521484375, "learning_rate": 0.0009436264395916061, - "loss": 0.0218, + "loss": 0.0209, "macro_f1": 0.6666666865348816, "num_tokens": 3306204.0, "repeat_count": 0.0, - "routers_loss": 0.014681774191558361, + "routers_loss": 0.014225383289158344, "skip_count": 2.0, "step": 2050, "text_loss": 0.18117287755012512 @@ -19492,13 +19492,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09326171875, + "grad_norm": 0.1416015625, "learning_rate": 0.0009434835797855672, - "loss": 0.0166, + "loss": 0.0165, "macro_f1": 0.3333333432674408, "num_tokens": 3309444.0, "repeat_count": 0.0, - "routers_loss": 0.0025602662935853004, + "routers_loss": 0.0023932650219649076, "skip_count": 0.0, "step": 2052, "text_loss": 0.4645874798297882 @@ -19511,13 +19511,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05810546875, + "grad_norm": 0.058349609375, "learning_rate": 0.0009433405500357839, - "loss": 0.0148, + "loss": 0.0153, "macro_f1": 0.3272727429866791, "num_tokens": 3312488.0, "repeat_count": 0.0, - "routers_loss": 0.03283753618597984, + "routers_loss": 0.03193361684679985, "skip_count": 1.0, "step": 2054, "text_loss": 0.5291082859039307 @@ -19530,13 +19530,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.062255859375, + "grad_norm": 0.064453125, "learning_rate": 0.0009431973503970655, - "loss": 0.0138, + "loss": 0.0134, "macro_f1": 0.3333333432674408, "num_tokens": 3315765.0, "repeat_count": 0.0, - "routers_loss": 0.002137230010703206, + "routers_loss": 0.0020529816392809153, "skip_count": 0.0, "step": 2056, "text_loss": 0.5877931118011475 @@ -19549,13 +19549,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08251953125, + "grad_norm": 0.07275390625, "learning_rate": 0.0009430539809242864, - "loss": 0.0199, + "loss": 0.0185, "macro_f1": 0.32098764181137085, "num_tokens": 3318877.0, "repeat_count": 2.0, - "routers_loss": 0.07938452064990997, + "routers_loss": 0.07907948642969131, "skip_count": 0.0, "step": 2058, "text_loss": 0.3836737871170044 @@ -19568,13 +19568,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.095703125, "learning_rate": 0.0009429104416723862, - "loss": 0.0164, + "loss": 0.0163, "macro_f1": 0.6666666865348816, "num_tokens": 3322576.0, "repeat_count": 2.0, - "routers_loss": 0.003832251997664571, + "routers_loss": 0.003006070153787732, "skip_count": 0.0, "step": 2060, "text_loss": 0.3480920195579529 @@ -19587,13 +19587,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.045166015625, "learning_rate": 0.0009427667326963689, - "loss": 0.0131, + "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 3325974.0, "repeat_count": 0.0, - "routers_loss": 0.006192604545503855, + "routers_loss": 0.005013179033994675, "skip_count": 0.0, "step": 2062, "text_loss": 0.931358814239502 @@ -19606,13 +19606,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09375, + "grad_norm": 0.0986328125, "learning_rate": 0.0009426228540513047, "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 3329398.0, "repeat_count": 0.0, - "routers_loss": 0.008115313947200775, + "routers_loss": 0.0059848143719136715, "skip_count": 0.0, "step": 2064, "text_loss": 0.47568953037261963 @@ -19625,13 +19625,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06640625, + "grad_norm": 0.0830078125, "learning_rate": 0.0009424788057923277, - "loss": 0.0127, + "loss": 0.0131, "macro_f1": 0.3333333432674408, "num_tokens": 3332029.0, "repeat_count": 0.0, - "routers_loss": 0.007599714212119579, + "routers_loss": 0.00783882662653923, "skip_count": 0.0, "step": 2066, "text_loss": 0.22887596487998962 @@ -19644,13 +19644,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.07470703125, + "grad_norm": 0.0712890625, "learning_rate": 0.0009423345879746376, - "loss": 0.0126, + "loss": 0.0128, "macro_f1": 0.5492662787437439, "num_tokens": 3334858.0, "repeat_count": 0.0, - "routers_loss": 0.016804348677396774, + "routers_loss": 0.01866884157061577, "skip_count": 2.0, "step": 2068, "text_loss": 0.17724967002868652 @@ -19663,13 +19663,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.072265625, + "grad_norm": 0.06591796875, "learning_rate": 0.000942190200653499, - "loss": 0.0164, + "loss": 0.0162, "macro_f1": 0.32098764181137085, "num_tokens": 3338094.0, "repeat_count": 0.0, - "routers_loss": 0.02686731517314911, + "routers_loss": 0.028636593371629715, "skip_count": 2.0, "step": 2070, "text_loss": 0.34344956278800964 @@ -19682,13 +19682,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0771484375, + "grad_norm": 0.07568359375, "learning_rate": 0.0009420456438842413, - "loss": 0.0172, + "loss": 0.0165, "macro_f1": 0.5492662787437439, "num_tokens": 3340526.0, "repeat_count": 0.0, - "routers_loss": 0.025320913642644882, + "routers_loss": 0.023245645686984062, "skip_count": 2.0, "step": 2072, "text_loss": 0.7276164293289185 @@ -19701,13 +19701,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.11328125, "learning_rate": 0.000941900917722259, - "loss": 0.0145, + "loss": 0.0143, "macro_f1": 0.3272727429866791, "num_tokens": 3343303.0, "repeat_count": 1.0, - "routers_loss": 0.014900023117661476, + "routers_loss": 0.01565689593553543, "skip_count": 0.0, "step": 2074, "text_loss": 0.5665070414543152 @@ -19720,13 +19720,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11474609375, + "grad_norm": 0.1201171875, "learning_rate": 0.0009417560222230115, - "loss": 0.0244, + "loss": 0.0245, "macro_f1": 0.3333333432674408, "num_tokens": 3346409.0, "repeat_count": 0.0, - "routers_loss": 0.003426895011216402, + "routers_loss": 0.0035056080669164658, "skip_count": 0.0, "step": 2076, "text_loss": 0.5112795233726501 @@ -19739,13 +19739,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0712890625, + "grad_norm": 0.06982421875, "learning_rate": 0.0009416109574420229, - "loss": 0.0136, + "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3349220.0, "repeat_count": 0.0, - "routers_loss": 0.0031935563310980797, + "routers_loss": 0.0027565446216613054, "skip_count": 0.0, "step": 2078, "text_loss": 0.5240910053253174 @@ -19758,13 +19758,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.08203125, "learning_rate": 0.0009414657234348823, - "loss": 0.0183, + "loss": 0.0186, "macro_f1": 1.0, "num_tokens": 3352627.0, "repeat_count": 3.0, - "routers_loss": 0.016454946249723434, + "routers_loss": 0.01652451977133751, "skip_count": 2.0, "step": 2080, "text_loss": 1.0217112302780151 @@ -19777,13 +19777,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1630859375, "learning_rate": 0.0009413203202572438, - "loss": 0.0174, + "loss": 0.0179, "macro_f1": 0.32098764181137085, "num_tokens": 3355392.0, "repeat_count": 0.0, - "routers_loss": 0.1056143268942833, + "routers_loss": 0.1012420505285263, "skip_count": 2.0, "step": 2082, "text_loss": 0.4085482358932495 @@ -19796,13 +19796,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07373046875, + "grad_norm": 0.08251953125, "learning_rate": 0.000941174747964826, - "loss": 0.016, + "loss": 0.0154, "macro_f1": 0.3333333432674408, "num_tokens": 3358425.0, "repeat_count": 0.0, - "routers_loss": 0.003626141929998994, + "routers_loss": 0.004962718114256859, "skip_count": 0.0, "step": 2084, "text_loss": 0.5833504796028137 @@ -19810,18 +19810,18 @@ { "acc_repeat": 0.5, "acc_skip": 0.6666666865348816, - "avg_layers": 26.0, + "avg_layers": 27.0, "epoch": 9.793660111535075, - "f1_execute": 0.936170220375061, + "f1_execute": 0.9583333134651184, "f1_repeat": 0.6666666865348816, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.107421875, + "f1_skip": 0.800000011920929, + "grad_norm": 0.11376953125, "learning_rate": 0.0009410290066134124, - "loss": 0.0216, - "macro_f1": 0.7565011978149414, + "loss": 0.0211, + "macro_f1": 0.8083333373069763, "num_tokens": 3361925.0, "repeat_count": 2.0, - "routers_loss": 0.08091846853494644, + "routers_loss": 0.07889176905155182, "skip_count": 3.0, "step": 2086, "text_loss": 0.38126569986343384 @@ -19834,13 +19834,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.056884765625, + "grad_norm": 0.051513671875, "learning_rate": 0.0009408830962588517, - "loss": 0.0197, + "loss": 0.0195, "macro_f1": 0.6601307392120361, "num_tokens": 3365963.0, "repeat_count": 1.0, - "routers_loss": 0.035208042711019516, + "routers_loss": 0.033715736120939255, "skip_count": 2.0, "step": 2088, "text_loss": 0.23213914036750793 @@ -19853,13 +19853,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07958984375, + "grad_norm": 0.0732421875, "learning_rate": 0.0009407370169570567, - "loss": 0.0167, + "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 3369422.0, "repeat_count": 0.0, - "routers_loss": 0.0018934847321361303, + "routers_loss": 0.0014188943896442652, "skip_count": 0.0, "step": 2090, "text_loss": 0.4648318886756897 @@ -19872,13 +19872,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.0712890625, "learning_rate": 0.0009405907687640054, - "loss": 0.0132, + "loss": 0.013, "macro_f1": 0.3272727429866791, "num_tokens": 3372506.0, "repeat_count": 0.0, - "routers_loss": 0.016075141727924347, + "routers_loss": 0.015339684672653675, "skip_count": 1.0, "step": 2092, "text_loss": 0.2563800811767578 @@ -19891,13 +19891,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.054443359375, "learning_rate": 0.0009404443517357404, "loss": 0.0146, "macro_f1": 0.542222261428833, "num_tokens": 3375653.0, "repeat_count": 4.0, - "routers_loss": 0.06333976984024048, + "routers_loss": 0.06562861055135727, "skip_count": 0.0, "step": 2094, "text_loss": 0.797835111618042 @@ -19910,13 +19910,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.060546875, + "grad_norm": 0.062255859375, "learning_rate": 0.000940297765928369, - "loss": 0.0133, + "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 3379018.0, "repeat_count": 0.0, - "routers_loss": 0.005521406419575214, + "routers_loss": 0.005745889153331518, "skip_count": 0.0, "step": 2096, "text_loss": 0.4238114655017853 @@ -19929,13 +19929,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06103515625, + "grad_norm": 0.0712890625, "learning_rate": 0.0009401510113980631, - "loss": 0.0205, + "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 3382855.0, "repeat_count": 0.0, - "routers_loss": 0.0025159218348562717, + "routers_loss": 0.0026634482201188803, "skip_count": 0.0, "step": 2098, "text_loss": 0.4967166483402252 @@ -19948,13 +19948,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08203125, + "grad_norm": 0.0791015625, "learning_rate": 0.0009400040882010592, - "loss": 0.0172, + "loss": 0.0166, "macro_f1": 0.3333333432674408, "num_tokens": 3386386.0, "repeat_count": 0.0, - "routers_loss": 0.0025535966269671917, + "routers_loss": 0.0020642587915062904, "skip_count": 0.0, "step": 2100, "text_loss": 0.44390562176704407 @@ -19967,13 +19967,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.056640625, "learning_rate": 0.0009398569963936589, - "loss": 0.0178, + "loss": 0.017, "macro_f1": 0.3272727429866791, "num_tokens": 3389958.0, "repeat_count": 0.0, - "routers_loss": 0.013569516129791737, + "routers_loss": 0.013722737319767475, "skip_count": 1.0, "step": 2102, "text_loss": 0.7207565903663635 @@ -19986,13 +19986,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0947265625, + "grad_norm": 0.08837890625, "learning_rate": 0.0009397097360322276, - "loss": 0.0175, + "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 3392892.0, "repeat_count": 0.0, - "routers_loss": 0.0044935219921171665, + "routers_loss": 0.002051608171314001, "skip_count": 0.0, "step": 2104, "text_loss": 0.3196398913860321 @@ -20005,13 +20005,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.072265625, + "grad_norm": 0.07470703125, "learning_rate": 0.000939562307173196, - "loss": 0.0223, + "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 3396636.0, "repeat_count": 0.0, - "routers_loss": 0.007407462690025568, + "routers_loss": 0.007085663266479969, "skip_count": 0.0, "step": 2106, "text_loss": 0.5663776397705078 @@ -20024,13 +20024,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.13671875, + "grad_norm": 0.11328125, "learning_rate": 0.0009394147098730592, - "loss": 0.0205, + "loss": 0.02, "macro_f1": 0.5492662787437439, "num_tokens": 3399475.0, "repeat_count": 0.0, - "routers_loss": 0.024386432021856308, + "routers_loss": 0.019473131746053696, "skip_count": 2.0, "step": 2108, "text_loss": 0.7708223462104797 @@ -20043,32 +20043,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037353515625, + "grad_norm": 0.038818359375, "learning_rate": 0.0009392669441883767, - "loss": 0.0135, + "loss": 0.0134, "macro_f1": 0.3333333432674408, "num_tokens": 3402350.0, "repeat_count": 0.0, - "routers_loss": 0.002929724520072341, + "routers_loss": 0.0028328890912234783, "skip_count": 0.0, "step": 2110, "text_loss": 0.5888006091117859 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 9.915761667155856, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.1201171875, + "f1_skip": 1.0, + "grad_norm": 0.10693359375, "learning_rate": 0.0009391190101757724, - "loss": 0.0168, - "macro_f1": 0.5492662787437439, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, "num_tokens": 3405561.0, "repeat_count": 0.0, - "routers_loss": 0.026861928403377533, + "routers_loss": 0.023098422214388847, "skip_count": 2.0, "step": 2112, "text_loss": 0.09865197539329529 @@ -20081,13 +20081,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0966796875, + "grad_norm": 0.10107421875, "learning_rate": 0.000938970907891935, - "loss": 0.0251, + "loss": 0.0247, "macro_f1": 0.3333333432674408, "num_tokens": 3408513.0, "repeat_count": 0.0, - "routers_loss": 0.0025369988288730383, + "routers_loss": 0.002896632067859173, "skip_count": 0.0, "step": 2114, "text_loss": 0.6613234281539917 @@ -20100,51 +20100,51 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09423828125, + "grad_norm": 0.0947265625, "learning_rate": 0.0009388226373936179, - "loss": 0.0209, + "loss": 0.0211, "macro_f1": 0.3333333432674408, "num_tokens": 3411195.0, "repeat_count": 0.0, - "routers_loss": 0.014292459934949875, + "routers_loss": 0.015814457088708878, "skip_count": 0.0, "step": 2116, "text_loss": 0.17363053560256958 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 9.94393894922219, - "f1_execute": 0.9629629850387573, - "f1_repeat": 0.0, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1181640625, + "grad_norm": 0.12451171875, "learning_rate": 0.0009386741987376381, - "loss": 0.0151, - "macro_f1": 0.32098767161369324, + "loss": 0.015, + "macro_f1": 0.6603773832321167, "num_tokens": 3414875.0, "repeat_count": 1.0, - "routers_loss": 0.027571436017751694, + "routers_loss": 0.02676783688366413, "skip_count": 0.0, "step": 2118, "text_loss": 0.674056887626648 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 9.953331376577633, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.08349609375, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, "learning_rate": 0.0009385255919808778, - "loss": 0.0205, - "macro_f1": 0.3272727429866791, + "loss": 0.0203, + "macro_f1": 0.6666666865348816, "num_tokens": 3418410.0, "repeat_count": 0.0, - "routers_loss": 0.011719600297510624, + "routers_loss": 0.01022857241332531, "skip_count": 1.0, "step": 2120, "text_loss": 0.235092431306839 @@ -20157,13 +20157,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09375, + "grad_norm": 0.0888671875, "learning_rate": 0.0009383768171802836, - "loss": 0.0249, + "loss": 0.0244, "macro_f1": 0.5492662787437439, "num_tokens": 3421289.0, "repeat_count": 0.0, - "routers_loss": 0.01207603607326746, + "routers_loss": 0.013572212308645248, "skip_count": 2.0, "step": 2122, "text_loss": 0.5992844104766846 @@ -20176,13 +20176,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.060791015625, + "grad_norm": 0.064453125, "learning_rate": 0.0009382278743928659, - "loss": 0.0206, + "loss": 0.0201, "macro_f1": 0.6666666865348816, "num_tokens": 3424781.0, "repeat_count": 0.0, - "routers_loss": 0.008004254661500454, + "routers_loss": 0.0051873656921088696, "skip_count": 2.0, "step": 2124, "text_loss": 0.29915499687194824 @@ -20195,13 +20195,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.07666015625, + "grad_norm": 0.07421875, "learning_rate": 0.0009380787636757001, - "loss": 0.0156, + "loss": 0.0155, "macro_f1": 0.6122449040412903, "num_tokens": 3427942.0, "repeat_count": 0.0, - "routers_loss": 0.030767880380153656, + "routers_loss": 0.030079292133450508, "skip_count": 4.0, "step": 2126, "text_loss": 0.24181491136550903 @@ -20214,13 +20214,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06201171875, + "grad_norm": 0.058349609375, "learning_rate": 0.0009379294850859256, "loss": 0.0141, "macro_f1": 0.3333333432674408, "num_tokens": 3431314.0, "repeat_count": 0.0, - "routers_loss": 0.002620625076815486, + "routers_loss": 0.002675612922757864, "skip_count": 0.0, "step": 2128, "text_loss": 0.4669873118400574 @@ -20233,13 +20233,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09033203125, + "grad_norm": 0.10595703125, "learning_rate": 0.0009377800386807465, - "loss": 0.0175, + "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 3435020.0, "repeat_count": 0.0, - "routers_loss": 0.009095560759305954, + "routers_loss": 0.009334275498986244, "skip_count": 0.0, "step": 2130, "text_loss": 0.6478219628334045 @@ -20252,13 +20252,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.11865234375, + "grad_norm": 0.134765625, "learning_rate": 0.0009376304245174306, - "loss": 0.0143, + "loss": 0.0137, "macro_f1": 0.6000000238418579, "num_tokens": 3438276.0, "repeat_count": 1.0, - "routers_loss": 0.058448426425457, + "routers_loss": 0.038227908313274384, "skip_count": 2.0, "step": 2132, "text_loss": 0.4401201903820038 @@ -20271,13 +20271,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.046875, + "grad_norm": 0.041748046875, "learning_rate": 0.0009374806426533104, - "loss": 0.0116, + "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 3440938.0, "repeat_count": 0.0, - "routers_loss": 0.007323687430471182, + "routers_loss": 0.006901399698108435, "skip_count": 0.0, "step": 2134, "text_loss": 0.5948942303657532 @@ -20290,13 +20290,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.051513671875, + "grad_norm": 0.051025390625, "learning_rate": 0.0009373306931457827, - "loss": 0.0122, + "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 3444028.0, "repeat_count": 0.0, - "routers_loss": 0.003302243771031499, + "routers_loss": 0.0037061909679323435, "skip_count": 0.0, "step": 2136, "text_loss": 0.5349751114845276 @@ -20309,13 +20309,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047607421875, + "grad_norm": 0.056884765625, "learning_rate": 0.0009371805760523086, - "loss": 0.0113, + "loss": 0.0111, "macro_f1": 0.3333333432674408, "num_tokens": 3448331.0, "repeat_count": 0.0, - "routers_loss": 0.0027974818367511034, + "routers_loss": 0.0025877030566334724, "skip_count": 0.0, "step": 2138, "text_loss": 0.4591051936149597 @@ -20328,13 +20328,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.08642578125, + "grad_norm": 0.07373046875, "learning_rate": 0.0009370302914304129, - "loss": 0.0145, + "loss": 0.0144, "macro_f1": 0.5934640765190125, "num_tokens": 3451434.0, "repeat_count": 0.0, - "routers_loss": 0.01572767272591591, + "routers_loss": 0.018742674961686134, "skip_count": 3.0, "step": 2140, "text_loss": 0.23470863699913025 @@ -20347,13 +20347,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06201171875, + "grad_norm": 0.0634765625, "learning_rate": 0.0009368798393376851, - "loss": 0.0119, + "loss": 0.0122, "macro_f1": 0.3272727429866791, "num_tokens": 3454375.0, "repeat_count": 0.0, - "routers_loss": 0.020721890032291412, + "routers_loss": 0.02382594160735607, "skip_count": 1.0, "step": 2142, "text_loss": 0.6077954769134521 @@ -20366,13 +20366,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05859375, + "grad_norm": 0.05517578125, "learning_rate": 0.0009367292198317787, - "loss": 0.0161, + "loss": 0.0164, "macro_f1": 0.5492662787437439, "num_tokens": 3457591.0, "repeat_count": 0.0, - "routers_loss": 0.03272393345832825, + "routers_loss": 0.03331060707569122, "skip_count": 2.0, "step": 2144, "text_loss": 0.3691073954105377 @@ -20385,13 +20385,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052490234375, + "grad_norm": 0.058349609375, "learning_rate": 0.0009365784329704115, - "loss": 0.0191, + "loss": 0.0186, "macro_f1": 0.3333333432674408, "num_tokens": 3460895.0, "repeat_count": 0.0, - "routers_loss": 0.0017473002662882209, + "routers_loss": 0.0016955457394942641, "skip_count": 0.0, "step": 2146, "text_loss": 0.3947436511516571 @@ -20404,13 +20404,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.05224609375, + "grad_norm": 0.050537109375, "learning_rate": 0.0009364274788113651, - "loss": 0.0094, + "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 3464101.0, "repeat_count": 1.0, - "routers_loss": 0.008070237934589386, + "routers_loss": 0.006169239990413189, "skip_count": 0.0, "step": 2148, "text_loss": 0.3348555266857147 @@ -20423,13 +20423,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.068359375, "learning_rate": 0.0009362763574124858, - "loss": 0.0191, + "loss": 0.019, "macro_f1": 0.9265305995941162, "num_tokens": 3467417.0, "repeat_count": 3.0, - "routers_loss": 0.021709222346544266, + "routers_loss": 0.024033790454268456, "skip_count": 1.0, "step": 2150, "text_loss": 0.496633380651474 @@ -20442,13 +20442,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.046630859375, + "grad_norm": 0.042724609375, "learning_rate": 0.0009361250688316829, - "loss": 0.014, + "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 3470917.0, "repeat_count": 0.0, - "routers_loss": 0.0022237664088606834, + "routers_loss": 0.0024986129719763994, "skip_count": 0.0, "step": 2152, "text_loss": 0.6857671737670898 @@ -20461,13 +20461,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.056640625, + "grad_norm": 0.0546875, "learning_rate": 0.0009359736131269312, "loss": 0.0153, "macro_f1": 0.6666666865348816, "num_tokens": 3473624.0, "repeat_count": 0.0, - "routers_loss": 0.00838750321418047, + "routers_loss": 0.008183322846889496, "skip_count": 1.0, "step": 2154, "text_loss": 0.13883116841316223 @@ -20480,13 +20480,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0576171875, + "grad_norm": 0.06640625, "learning_rate": 0.0009358219903562684, - "loss": 0.01, + "loss": 0.0106, "macro_f1": 0.6666666865348816, "num_tokens": 3476472.0, "repeat_count": 0.0, - "routers_loss": 0.010190514847636223, + "routers_loss": 0.011198793537914753, "skip_count": 3.0, "step": 2156, "text_loss": 0.24243666231632233 @@ -20499,13 +20499,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0419921875, + "grad_norm": 0.04296875, "learning_rate": 0.0009356702005777969, - "loss": 0.0124, + "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 3479688.0, "repeat_count": 0.0, - "routers_loss": 0.002411153633147478, + "routers_loss": 0.002520184963941574, "skip_count": 0.0, "step": 2158, "text_loss": 0.6407818794250488 @@ -20518,13 +20518,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.0791015625, "learning_rate": 0.0009355182438496825, - "loss": 0.0141, + "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 3482598.0, "repeat_count": 0.0, - "routers_loss": 0.001032356172800064, + "routers_loss": 0.0011065017897635698, "skip_count": 0.0, "step": 2160, "text_loss": 0.7214245796203613 @@ -20537,13 +20537,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05908203125, + "grad_norm": 0.0576171875, "learning_rate": 0.0009353661202301557, - "loss": 0.0147, + "loss": 0.0144, "macro_f1": 0.3333333432674408, "num_tokens": 3486271.0, "repeat_count": 0.0, - "routers_loss": 0.0022046815138310194, + "routers_loss": 0.0017824085662141442, "skip_count": 0.0, "step": 2162, "text_loss": 0.5140969157218933 @@ -20556,32 +20556,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.051513671875, + "grad_norm": 0.053466796875, "learning_rate": 0.0009352138297775101, "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 3489206.0, "repeat_count": 0.0, - "routers_loss": 0.0014977266546338797, + "routers_loss": 0.001542879967018962, "skip_count": 0.0, "step": 2164, "text_loss": 0.7956416606903076 }, { "acc_repeat": 0.0, - "acc_skip": 0.6666666865348816, - "avg_layers": 26.0, + "acc_skip": 1.0, + "avg_layers": 25.0, "epoch": 10.169063692398003, - "f1_execute": 0.9803921580314636, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.800000011920929, + "f1_skip": 1.0, "grad_norm": 0.0771484375, "learning_rate": 0.000935061372550104, - "loss": 0.0132, - "macro_f1": 0.5934640765190125, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, "num_tokens": 3492003.0, "repeat_count": 0.0, - "routers_loss": 0.016847684979438782, + "routers_loss": 0.01420794241130352, "skip_count": 3.0, "step": 2166, "text_loss": 0.27489882707595825 @@ -20594,13 +20594,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.06396484375, "learning_rate": 0.0009349087486063594, - "loss": 0.0168, + "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 3494784.0, "repeat_count": 0.0, - "routers_loss": 0.0036806222051382065, + "routers_loss": 0.003614309709519148, "skip_count": 1.0, "step": 2168, "text_loss": 0.2962227761745453 @@ -20613,13 +20613,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.09716796875, + "grad_norm": 0.1259765625, "learning_rate": 0.0009347559580047618, - "loss": 0.0174, + "loss": 0.0175, "macro_f1": 0.8814815282821655, "num_tokens": 3497886.0, "repeat_count": 2.0, - "routers_loss": 0.021412594243884087, + "routers_loss": 0.02122853323817253, "skip_count": 4.0, "step": 2170, "text_loss": 0.5919580459594727 @@ -20627,18 +20627,18 @@ { "acc_repeat": 0.0, "acc_skip": 1.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 10.197240974464338, - "f1_execute": 1.0, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.06591796875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06396484375, "learning_rate": 0.000934603000803861, - "loss": 0.0134, - "macro_f1": 0.6666666865348816, + "loss": 0.0135, + "macro_f1": 0.5492662787437439, "num_tokens": 3500939.0, "repeat_count": 0.0, - "routers_loss": 0.0201424453407526, + "routers_loss": 0.02042219042778015, "skip_count": 1.0, "step": 2172, "text_loss": 0.28722381591796875 @@ -20651,13 +20651,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05419921875, + "grad_norm": 0.0693359375, "learning_rate": 0.0009344498770622704, - "loss": 0.0131, + "loss": 0.013, "macro_f1": 0.3333333432674408, "num_tokens": 3504852.0, "repeat_count": 0.0, - "routers_loss": 0.005059401970356703, + "routers_loss": 0.004345106892287731, "skip_count": 0.0, "step": 2174, "text_loss": 0.603236734867096 @@ -20670,13 +20670,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.091796875, + "grad_norm": 0.1064453125, "learning_rate": 0.0009342965868386673, "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 3508320.0, "repeat_count": 0.0, - "routers_loss": 0.004006600938737392, + "routers_loss": 0.00368050136603415, "skip_count": 0.0, "step": 2176, "text_loss": 0.6020491719245911 @@ -20691,11 +20691,11 @@ "f1_skip": 0.0, "grad_norm": 0.060302734375, "learning_rate": 0.000934143130191793, - "loss": 0.0109, + "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 3511278.0, "repeat_count": 0.0, - "routers_loss": 0.013246738351881504, + "routers_loss": 0.013425769284367561, "skip_count": 0.0, "step": 2178, "text_loss": 0.5954724550247192 @@ -20708,13 +20708,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06005859375, + "grad_norm": 0.060546875, "learning_rate": 0.000933989507180452, - "loss": 0.0151, + "loss": 0.0149, "macro_f1": 0.3333333432674408, "num_tokens": 3514361.0, "repeat_count": 0.0, - "routers_loss": 0.0031937146559357643, + "routers_loss": 0.002896249992772937, "skip_count": 0.0, "step": 2180, "text_loss": 0.39175131916999817 @@ -20727,13 +20727,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0556640625, + "grad_norm": 0.052978515625, "learning_rate": 0.0009338357178635135, - "loss": 0.0151, + "loss": 0.0147, "macro_f1": 0.6603773832321167, "num_tokens": 3517962.0, "repeat_count": 1.0, - "routers_loss": 0.014782631769776344, + "routers_loss": 0.011538350023329258, "skip_count": 1.0, "step": 2182, "text_loss": 0.4482830762863159 @@ -20746,13 +20746,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.0869140625, "learning_rate": 0.0009336817622999093, - "loss": 0.0112, + "loss": 0.011, "macro_f1": 0.3272727429866791, "num_tokens": 3521299.0, "repeat_count": 1.0, - "routers_loss": 0.02318345196545124, + "routers_loss": 0.022787930443882942, "skip_count": 0.0, "step": 2184, "text_loss": 0.35177817940711975 @@ -20765,13 +20765,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.055419921875, + "grad_norm": 0.0634765625, "learning_rate": 0.0009335276405486357, - "loss": 0.0134, + "loss": 0.0139, "macro_f1": 0.3272727429866791, "num_tokens": 3524611.0, "repeat_count": 0.0, - "routers_loss": 0.011735675856471062, + "routers_loss": 0.011597735807299614, "skip_count": 1.0, "step": 2186, "text_loss": 0.24868851900100708 @@ -20784,13 +20784,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0947265625, + "grad_norm": 0.11181640625, "learning_rate": 0.0009333733526687524, - "loss": 0.0198, + "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 3528012.0, "repeat_count": 0.0, - "routers_loss": 0.01558679062873125, + "routers_loss": 0.014253967441618443, "skip_count": 0.0, "step": 2188, "text_loss": 0.3970910310745239 @@ -20803,13 +20803,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056396484375, + "grad_norm": 0.054931640625, "learning_rate": 0.000933218898719383, - "loss": 0.0163, + "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 3530908.0, "repeat_count": 0.0, - "routers_loss": 0.0019149131840094924, + "routers_loss": 0.001659149187617004, "skip_count": 0.0, "step": 2190, "text_loss": 0.7618573307991028 @@ -20822,13 +20822,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07958984375, + "grad_norm": 0.0693359375, "learning_rate": 0.0009330642787597141, - "loss": 0.0161, + "loss": 0.0159, "macro_f1": 0.3333333432674408, "num_tokens": 3533993.0, "repeat_count": 0.0, - "routers_loss": 0.0056966920383274555, + "routers_loss": 0.005574346985667944, "skip_count": 0.0, "step": 2192, "text_loss": 0.16470147669315338 @@ -20841,13 +20841,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07080078125, + "grad_norm": 0.0791015625, "learning_rate": 0.0009329094928489969, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 3537310.0, "repeat_count": 0.0, - "routers_loss": 0.002511024009436369, + "routers_loss": 0.0026400673668831587, "skip_count": 0.0, "step": 2194, "text_loss": 0.3400416374206543 @@ -20860,13 +20860,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08935546875, + "grad_norm": 0.0849609375, "learning_rate": 0.0009327545410465452, - "loss": 0.0126, + "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 3540045.0, "repeat_count": 0.0, - "routers_loss": 0.008584192954003811, + "routers_loss": 0.008448398672044277, "skip_count": 3.0, "step": 2196, "text_loss": 0.3110542297363281 @@ -20879,13 +20879,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.04638671875, "learning_rate": 0.0009325994234117372, - "loss": 0.0129, + "loss": 0.0122, "macro_f1": 0.32098764181137085, "num_tokens": 3544097.0, "repeat_count": 0.0, - "routers_loss": 0.03748156875371933, + "routers_loss": 0.037553198635578156, "skip_count": 2.0, "step": 2198, "text_loss": 0.36126700043678284 @@ -20898,13 +20898,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09814453125, + "grad_norm": 0.09716796875, "learning_rate": 0.000932444140004014, - "loss": 0.0129, + "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 3547054.0, "repeat_count": 1.0, - "routers_loss": 0.006402099970728159, + "routers_loss": 0.006464479025453329, "skip_count": 0.0, "step": 2200, "text_loss": 0.4947047233581543 @@ -20917,13 +20917,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.158203125, + "grad_norm": 0.1015625, "learning_rate": 0.0009322886908828805, - "loss": 0.015, + "loss": 0.0138, "macro_f1": 0.6666666865348816, "num_tokens": 3549903.0, "repeat_count": 1.0, - "routers_loss": 0.0055928584188222885, + "routers_loss": 0.005384812597185373, "skip_count": 0.0, "step": 2202, "text_loss": 0.5923738479614258 @@ -20936,13 +20936,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0625, + "grad_norm": 0.0634765625, "learning_rate": 0.0009321330761079052, "loss": 0.0149, "macro_f1": 0.6666666865348816, "num_tokens": 3553745.0, "repeat_count": 0.0, - "routers_loss": 0.013155708089470863, + "routers_loss": 0.015346619300544262, "skip_count": 2.0, "step": 2204, "text_loss": 0.1904175877571106 @@ -20955,13 +20955,13 @@ "f1_execute": 0.9268292784690857, "f1_repeat": 0.800000011920929, "f1_skip": 0.800000011920929, - "grad_norm": 0.06884765625, + "grad_norm": 0.06494140625, "learning_rate": 0.00093197729573872, - "loss": 0.0206, + "loss": 0.0203, "macro_f1": 0.8422764539718628, "num_tokens": 3557235.0, "repeat_count": 3.0, - "routers_loss": 0.12029488384723663, + "routers_loss": 0.1207597479224205, "skip_count": 6.0, "step": 2206, "text_loss": 0.3904837667942047 @@ -20974,13 +20974,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0771484375, "learning_rate": 0.0009318213498350202, - "loss": 0.011, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3560795.0, "repeat_count": 0.0, - "routers_loss": 0.0037007431965321302, + "routers_loss": 0.003334777895361185, "skip_count": 0.0, "step": 2208, "text_loss": 0.4268290102481842 @@ -20993,13 +20993,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.0537109375, "learning_rate": 0.0009316652384565645, - "loss": 0.0124, + "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3563754.0, "repeat_count": 0.0, - "routers_loss": 0.004071404226124287, + "routers_loss": 0.004230072256177664, "skip_count": 0.0, "step": 2210, "text_loss": 0.40049710869789124 @@ -21012,13 +21012,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.046875, "learning_rate": 0.0009315089616631751, - "loss": 0.0103, + "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 3567173.0, "repeat_count": 0.0, - "routers_loss": 0.0006955390563234687, + "routers_loss": 0.0006645230459980667, "skip_count": 0.0, "step": 2212, "text_loss": 0.42568323016166687 @@ -21031,32 +21031,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0849609375, + "grad_norm": 0.07470703125, "learning_rate": 0.0009313525195147376, - "loss": 0.0128, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 3570831.0, "repeat_count": 0.0, - "routers_loss": 0.010293997824192047, + "routers_loss": 0.0097877848893404, "skip_count": 0.0, "step": 2214, "text_loss": 0.45808279514312744 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 10.40387437628412, - "f1_execute": 0.9583333134651184, - "f1_repeat": 1.0, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, - "grad_norm": 0.07470703125, + "grad_norm": 0.076171875, "learning_rate": 0.000931195912071201, - "loss": 0.0185, - "macro_f1": 0.8194444179534912, + "loss": 0.0187, + "macro_f1": 0.7018141150474548, "num_tokens": 3573745.0, "repeat_count": 2.0, - "routers_loss": 0.06593773514032364, + "routers_loss": 0.07351134717464447, "skip_count": 3.0, "step": 2216, "text_loss": 0.285696804523468 @@ -21069,13 +21069,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.07666015625, "learning_rate": 0.0009310391393925775, - "loss": 0.013, + "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 3576785.0, "repeat_count": 0.0, - "routers_loss": 0.00347105972468853, + "routers_loss": 0.0033160944003611803, "skip_count": 0.0, "step": 2218, "text_loss": 0.17516443133354187 @@ -21088,32 +21088,32 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.04736328125, + "grad_norm": 0.047119140625, "learning_rate": 0.0009308822015389424, - "loss": 0.0244, + "loss": 0.0241, "macro_f1": 0.5427350401878357, "num_tokens": 3580695.0, "repeat_count": 1.0, - "routers_loss": 0.04871147498488426, + "routers_loss": 0.052930232137441635, "skip_count": 1.0, "step": 2220, "text_loss": 0.5918155908584595 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 26.0, + "acc_skip": 0.75, + "avg_layers": 25.0, "epoch": 10.432051658350455, - "f1_execute": 0.9600000381469727, + "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.05517578125, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.072265625, "learning_rate": 0.0009307250985704352, - "loss": 0.012, - "macro_f1": 0.542222261428833, + "loss": 0.0128, + "macro_f1": 0.6122449040412903, "num_tokens": 3583729.0, "repeat_count": 0.0, - "routers_loss": 0.024859672412276268, + "routers_loss": 0.025454653427004814, "skip_count": 4.0, "step": 2222, "text_loss": 0.2652169466018677 @@ -21126,13 +21126,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.052001953125, "learning_rate": 0.0009305678305472575, - "loss": 0.016, + "loss": 0.0158, "macro_f1": 0.3333333432674408, "num_tokens": 3586775.0, "repeat_count": 0.0, - "routers_loss": 0.010990055277943611, + "routers_loss": 0.011279845610260963, "skip_count": 0.0, "step": 2224, "text_loss": 0.3511691987514496 @@ -21145,13 +21145,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.10791015625, "learning_rate": 0.000930410397529675, - "loss": 0.0171, + "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 3589676.0, "repeat_count": 0.0, - "routers_loss": 0.0025031559634953737, + "routers_loss": 0.002700264798477292, "skip_count": 0.0, "step": 2226, "text_loss": 0.24045433104038239 @@ -21164,13 +21164,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.048095703125, "learning_rate": 0.000930252799578016, - "loss": 0.0147, + "loss": 0.0146, "macro_f1": 1.0, "num_tokens": 3593242.0, "repeat_count": 1.0, - "routers_loss": 0.008100497536361217, + "routers_loss": 0.00826631672680378, "skip_count": 2.0, "step": 2228, "text_loss": 0.3777645528316498 @@ -21183,13 +21183,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.061767578125, + "grad_norm": 0.06396484375, "learning_rate": 0.0009300950367526728, - "loss": 0.0128, + "loss": 0.0131, "macro_f1": 0.8820862174034119, "num_tokens": 3596807.0, "repeat_count": 2.0, - "routers_loss": 0.03150207921862602, + "routers_loss": 0.036221496760845184, "skip_count": 2.0, "step": 2230, "text_loss": 0.502962589263916 @@ -21202,13 +21202,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07470703125, + "grad_norm": 0.0703125, "learning_rate": 0.0009299371091141001, - "loss": 0.0132, + "loss": 0.0131, "macro_f1": 0.3333333432674408, "num_tokens": 3600150.0, "repeat_count": 0.0, - "routers_loss": 0.006253884173929691, + "routers_loss": 0.006449893582612276, "skip_count": 0.0, "step": 2232, "text_loss": 0.20256924629211426 @@ -21221,13 +21221,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.046142578125, + "grad_norm": 0.04638671875, "learning_rate": 0.0009297790167228161, - "loss": 0.0119, + "loss": 0.012, "macro_f1": 0.6666666865348816, "num_tokens": 3602988.0, "repeat_count": 0.0, - "routers_loss": 0.007228068076074123, + "routers_loss": 0.007872486487030983, "skip_count": 2.0, "step": 2234, "text_loss": 0.42476826906204224 @@ -21240,13 +21240,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.0576171875, "learning_rate": 0.0009296207596394022, - "loss": 0.0103, + "loss": 0.0101, "macro_f1": 0.32098764181137085, "num_tokens": 3606071.0, "repeat_count": 0.0, - "routers_loss": 0.02524643763899803, + "routers_loss": 0.027397040277719498, "skip_count": 2.0, "step": 2236, "text_loss": 0.23432791233062744 @@ -21259,13 +21259,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06640625, + "grad_norm": 0.0595703125, "learning_rate": 0.0009294623379245028, - "loss": 0.0119, + "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 3609389.0, "repeat_count": 0.0, - "routers_loss": 0.009672109968960285, + "routers_loss": 0.01042645052075386, "skip_count": 0.0, "step": 2238, "text_loss": 0.16665785014629364 @@ -21278,13 +21278,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0498046875, + "grad_norm": 0.052490234375, "learning_rate": 0.0009293037516388252, - "loss": 0.0155, + "loss": 0.0161, "macro_f1": 0.3333333432674408, "num_tokens": 3612105.0, "repeat_count": 0.0, - "routers_loss": 0.0010066524846479297, + "routers_loss": 0.0012458425480872393, "skip_count": 0.0, "step": 2240, "text_loss": 0.59421306848526 @@ -21297,13 +21297,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0732421875, + "grad_norm": 0.0751953125, "learning_rate": 0.0009291450008431404, - "loss": 0.0184, + "loss": 0.0185, "macro_f1": 1.0, "num_tokens": 3615439.0, "repeat_count": 1.0, - "routers_loss": 0.005509128328412771, + "routers_loss": 0.005781981628388166, "skip_count": 1.0, "step": 2242, "text_loss": 0.510798454284668 @@ -21316,13 +21316,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.09423828125, + "grad_norm": 0.0966796875, "learning_rate": 0.0009289860855982814, - "loss": 0.0172, + "loss": 0.0166, "macro_f1": 0.4871794879436493, "num_tokens": 3618842.0, "repeat_count": 0.0, - "routers_loss": 0.030802007764577866, + "routers_loss": 0.031195320188999176, "skip_count": 3.0, "step": 2244, "text_loss": 0.7574363350868225 @@ -21335,13 +21335,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.04931640625, "learning_rate": 0.0009288270059651454, "loss": 0.0133, "macro_f1": 0.3333333432674408, "num_tokens": 3621823.0, "repeat_count": 0.0, - "routers_loss": 0.001686889911070466, + "routers_loss": 0.001746491645462811, "skip_count": 0.0, "step": 2246, "text_loss": 0.5125683546066284 @@ -21354,13 +21354,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1943359375, + "grad_norm": 0.220703125, "learning_rate": 0.0009286677620046918, - "loss": 0.0163, + "loss": 0.0159, "macro_f1": 0.5492662787437439, "num_tokens": 3624502.0, "repeat_count": 0.0, - "routers_loss": 0.03299177065491676, + "routers_loss": 0.03792348504066467, "skip_count": 2.0, "step": 2248, "text_loss": 0.7533677220344543 @@ -21375,11 +21375,11 @@ "f1_skip": 0.0, "grad_norm": 0.07763671875, "learning_rate": 0.0009285083537779429, - "loss": 0.0119, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3627057.0, "repeat_count": 0.0, - "routers_loss": 0.0010354233672842383, + "routers_loss": 0.0009684451506473124, "skip_count": 0.0, "step": 2250, "text_loss": 0.2219279706478119 @@ -21392,13 +21392,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.10205078125, + "grad_norm": 0.11767578125, "learning_rate": 0.0009283487813459845, - "loss": 0.0145, + "loss": 0.0148, "macro_f1": 0.5492662787437439, "num_tokens": 3629720.0, "repeat_count": 0.0, - "routers_loss": 0.02196674607694149, + "routers_loss": 0.022757573053240776, "skip_count": 2.0, "step": 2252, "text_loss": 0.6903313994407654 @@ -21411,13 +21411,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.1376953125, "learning_rate": 0.0009281890447699652, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 3633234.0, "repeat_count": 1.0, - "routers_loss": 0.002239946974441409, + "routers_loss": 0.003613058477640152, "skip_count": 0.0, "step": 2254, "text_loss": 0.6278893351554871 @@ -21430,13 +21430,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.046142578125, + "grad_norm": 0.045654296875, "learning_rate": 0.0009280291441110961, - "loss": 0.0117, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3636289.0, "repeat_count": 0.0, - "routers_loss": 0.0063575254753232, + "routers_loss": 0.006214062683284283, "skip_count": 0.0, "step": 2256, "text_loss": 0.3011114001274109 @@ -21449,13 +21449,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.040283203125, + "grad_norm": 0.041015625, "learning_rate": 0.0009278690794306517, - "loss": 0.0143, + "loss": 0.014, "macro_f1": 0.5492662787437439, "num_tokens": 3640251.0, "repeat_count": 0.0, - "routers_loss": 0.0524379126727581, + "routers_loss": 0.052556321024894714, "skip_count": 2.0, "step": 2258, "text_loss": 0.19894185662269592 @@ -21468,13 +21468,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.8571428656578064, "f1_skip": 1.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.08251953125, "learning_rate": 0.0009277088507899689, - "loss": 0.0156, + "loss": 0.0163, "macro_f1": 0.9452888369560242, "num_tokens": 3643527.0, "repeat_count": 4.0, - "routers_loss": 0.052486274391412735, + "routers_loss": 0.0572301521897316, "skip_count": 1.0, "step": 2260, "text_loss": 0.5593410134315491 @@ -21487,13 +21487,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041748046875, + "grad_norm": 0.050537109375, "learning_rate": 0.0009275484582504475, "loss": 0.0104, "macro_f1": 0.3333333432674408, "num_tokens": 3646959.0, "repeat_count": 0.0, - "routers_loss": 0.006877690553665161, + "routers_loss": 0.008010074496269226, "skip_count": 0.0, "step": 2262, "text_loss": 0.2128177285194397 @@ -21506,13 +21506,13 @@ "f1_execute": 0.95652174949646, "f1_repeat": 0.800000011920929, "f1_skip": 0.800000011920929, - "grad_norm": 0.05322265625, + "grad_norm": 0.05419921875, "learning_rate": 0.0009273879018735505, - "loss": 0.0136, + "loss": 0.0138, "macro_f1": 0.8521739840507507, "num_tokens": 3651298.0, "repeat_count": 3.0, - "routers_loss": 0.03128742054104805, + "routers_loss": 0.035729870200157166, "skip_count": 3.0, "step": 2264, "text_loss": 0.2987811267375946 @@ -21525,13 +21525,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1474609375, "learning_rate": 0.0009272271817208031, - "loss": 0.0188, + "loss": 0.0182, "macro_f1": 0.3333333432674408, "num_tokens": 3655609.0, "repeat_count": 0.0, - "routers_loss": 0.0028425443451851606, + "routers_loss": 0.002379779238253832, "skip_count": 0.0, "step": 2266, "text_loss": 0.6024088263511658 @@ -21544,13 +21544,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06689453125, + "grad_norm": 0.06640625, "learning_rate": 0.0009270662978537939, - "loss": 0.0101, + "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 3658444.0, "repeat_count": 0.0, - "routers_loss": 0.009712206199765205, + "routers_loss": 0.008943650871515274, "skip_count": 0.0, "step": 2268, "text_loss": 0.1741207242012024 @@ -21563,13 +21563,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0634765625, + "grad_norm": 0.053955078125, "learning_rate": 0.0009269052503341736, - "loss": 0.0162, + "loss": 0.0161, "macro_f1": 0.6595745086669922, "num_tokens": 3662282.0, "repeat_count": 1.0, - "routers_loss": 0.03980376198887825, + "routers_loss": 0.030201267451047897, "skip_count": 4.0, "step": 2270, "text_loss": 0.7300035953521729 @@ -21582,13 +21582,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.049072265625, "learning_rate": 0.0009267440392236562, - "loss": 0.0098, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 3665531.0, "repeat_count": 0.0, - "routers_loss": 0.0030603872146457434, + "routers_loss": 0.0026635683607310057, "skip_count": 0.0, "step": 2272, "text_loss": 0.31535038352012634 @@ -21601,13 +21601,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.0615234375, "learning_rate": 0.0009265826645840178, "loss": 0.0151, "macro_f1": 0.3333333432674408, "num_tokens": 3668407.0, "repeat_count": 0.0, - "routers_loss": 0.004795679822564125, + "routers_loss": 0.004258926957845688, "skip_count": 0.0, "step": 2274, "text_loss": 0.7272579073905945 @@ -21620,13 +21620,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.1435546875, + "grad_norm": 0.125, "learning_rate": 0.0009264211264770976, - "loss": 0.0155, + "loss": 0.0154, "macro_f1": 0.6122449040412903, "num_tokens": 3671503.0, "repeat_count": 0.0, - "routers_loss": 0.0340447798371315, + "routers_loss": 0.038987524807453156, "skip_count": 4.0, "step": 2276, "text_loss": 0.7488982677459717 @@ -21639,13 +21639,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.099609375, "learning_rate": 0.0009262594249647975, - "loss": 0.016, + "loss": 0.0164, "macro_f1": 0.6666666865348816, "num_tokens": 3674107.0, "repeat_count": 0.0, - "routers_loss": 0.007436402142047882, + "routers_loss": 0.007211760152131319, "skip_count": 1.0, "step": 2278, "text_loss": 0.1992369294166565 @@ -21658,13 +21658,13 @@ "f1_execute": 0.9767441749572754, "f1_repeat": 0.8571428656578064, "f1_skip": 1.0, - "grad_norm": 0.056396484375, + "grad_norm": 0.0546875, "learning_rate": 0.0009260975601090815, - "loss": 0.0113, + "loss": 0.0112, "macro_f1": 0.9446290731430054, "num_tokens": 3677184.0, "repeat_count": 4.0, - "routers_loss": 0.02465176396071911, + "routers_loss": 0.02538592554628849, "skip_count": 3.0, "step": 2280, "text_loss": 0.46402135491371155 @@ -21677,13 +21677,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07861328125, + "grad_norm": 0.0654296875, "learning_rate": 0.0009259355319719768, - "loss": 0.0167, + "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 3680683.0, "repeat_count": 0.0, - "routers_loss": 0.0037910486571490765, + "routers_loss": 0.0038464947137981653, "skip_count": 0.0, "step": 2282, "text_loss": 0.5804527401924133 @@ -21696,13 +21696,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.1611328125, "learning_rate": 0.0009257733406155726, - "loss": 0.0161, + "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 3683928.0, "repeat_count": 0.0, - "routers_loss": 0.003716849023476243, + "routers_loss": 0.004841136280447245, "skip_count": 0.0, "step": 2284, "text_loss": 0.4834538400173187 @@ -21715,13 +21715,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.050048828125, "learning_rate": 0.0009256109861020212, - "loss": 0.0118, + "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 3687101.0, "repeat_count": 0.0, - "routers_loss": 0.0021690395660698414, + "routers_loss": 0.002191900508478284, "skip_count": 0.0, "step": 2286, "text_loss": 0.8199604749679565 @@ -21734,13 +21734,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.08203125, + "grad_norm": 0.0927734375, "learning_rate": 0.000925448468493537, "loss": 0.0162, "macro_f1": 0.5427350401878357, "num_tokens": 3690490.0, "repeat_count": 1.0, - "routers_loss": 0.034040264785289764, + "routers_loss": 0.03488675877451897, "skip_count": 2.0, "step": 2288, "text_loss": 0.33263635635375977 @@ -21753,32 +21753,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.06640625, "learning_rate": 0.0009252857878523971, - "loss": 0.0133, + "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 3694109.0, "repeat_count": 1.0, - "routers_loss": 0.0027822356205433607, + "routers_loss": 0.002897309372201562, "skip_count": 0.0, "step": 2290, "text_loss": 0.47494807839393616 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 10.760786615791018, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.0634765625, + "f1_skip": 1.0, + "grad_norm": 0.05810546875, "learning_rate": 0.000925122944240941, - "loss": 0.0156, - "macro_f1": 0.5492662787437439, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, "num_tokens": 3697233.0, "repeat_count": 0.0, - "routers_loss": 0.020813947543501854, + "routers_loss": 0.01842675730586052, "skip_count": 2.0, "step": 2292, "text_loss": 0.14693495631217957 @@ -21791,13 +21791,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.042236328125, + "grad_norm": 0.045654296875, "learning_rate": 0.0009249599377215707, - "loss": 0.0145, + "loss": 0.0146, "macro_f1": 0.5866667032241821, "num_tokens": 3700376.0, "repeat_count": 1.0, - "routers_loss": 0.038725610822439194, + "routers_loss": 0.04169808700680733, "skip_count": 3.0, "step": 2294, "text_loss": 0.38051268458366394 @@ -21810,13 +21810,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059326171875, + "grad_norm": 0.05908203125, "learning_rate": 0.0009247967683567507, - "loss": 0.0117, + "loss": 0.0112, "macro_f1": 0.3272727429866791, "num_tokens": 3703212.0, "repeat_count": 0.0, - "routers_loss": 0.01360203418880701, + "routers_loss": 0.012183113023638725, "skip_count": 1.0, "step": 2296, "text_loss": 0.23789077997207642 @@ -21829,13 +21829,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0498046875, + "grad_norm": 0.05712890625, "learning_rate": 0.0009246334362090077, - "loss": 0.0135, + "loss": 0.0137, "macro_f1": 0.8823530077934265, "num_tokens": 3706490.0, "repeat_count": 1.0, - "routers_loss": 0.021909991279244423, + "routers_loss": 0.01880069635808468, "skip_count": 2.0, "step": 2298, "text_loss": 0.29067978262901306 @@ -21848,13 +21848,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.08203125, "learning_rate": 0.000924469941340931, - "loss": 0.0175, + "loss": 0.0173, "macro_f1": 0.3272727429866791, "num_tokens": 3709804.0, "repeat_count": 1.0, - "routers_loss": 0.03153124824166298, + "routers_loss": 0.027359159663319588, "skip_count": 0.0, "step": 2300, "text_loss": 0.67828369140625 @@ -21867,13 +21867,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.07275390625, "learning_rate": 0.000924306283815172, - "loss": 0.0154, + "loss": 0.0153, "macro_f1": 0.3333333432674408, "num_tokens": 3712824.0, "repeat_count": 0.0, - "routers_loss": 0.0034419491421431303, + "routers_loss": 0.003152279881760478, "skip_count": 0.0, "step": 2302, "text_loss": 0.8333184719085693 @@ -21886,13 +21886,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.061767578125, + "grad_norm": 0.0703125, "learning_rate": 0.0009241424636944445, - "loss": 0.0163, + "loss": 0.0159, "macro_f1": 0.5492662787437439, "num_tokens": 3715385.0, "repeat_count": 0.0, - "routers_loss": 0.03655214607715607, + "routers_loss": 0.0442950464785099, "skip_count": 2.0, "step": 2304, "text_loss": 0.41893699765205383 @@ -21905,13 +21905,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0576171875, + "grad_norm": 0.058837890625, "learning_rate": 0.0009239784810415249, - "loss": 0.014, + "loss": 0.0137, "macro_f1": 0.8823530077934265, "num_tokens": 3719080.0, "repeat_count": 1.0, - "routers_loss": 0.015360959805548191, + "routers_loss": 0.015729321166872978, "skip_count": 2.0, "step": 2306, "text_loss": 0.13360483944416046 @@ -21924,13 +21924,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.0537109375, + "grad_norm": 0.06787109375, "learning_rate": 0.0009238143359192514, "loss": 0.0136, "macro_f1": 0.5934640765190125, "num_tokens": 3722439.0, "repeat_count": 0.0, - "routers_loss": 0.027275927364826202, + "routers_loss": 0.028816604986786842, "skip_count": 3.0, "step": 2308, "text_loss": 0.39594101905822754 @@ -21943,13 +21943,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0546875, + "grad_norm": 0.05419921875, "learning_rate": 0.000923650028390525, - "loss": 0.0163, + "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 3725092.0, "repeat_count": 0.0, - "routers_loss": 0.003742894157767296, + "routers_loss": 0.0036455015651881695, "skip_count": 2.0, "step": 2310, "text_loss": 0.6169708371162415 @@ -21962,13 +21962,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0927734375, + "grad_norm": 0.09814453125, "learning_rate": 0.0009234855585183086, - "loss": 0.0135, + "loss": 0.014, "macro_f1": 0.6666666865348816, "num_tokens": 3728412.0, "repeat_count": 0.0, - "routers_loss": 0.009356650523841381, + "routers_loss": 0.007565604057163, "skip_count": 1.0, "step": 2312, "text_loss": 0.21257059276103973 @@ -21983,11 +21983,11 @@ "f1_skip": 0.800000011920929, "grad_norm": 0.0517578125, "learning_rate": 0.0009233209263656273, - "loss": 0.0189, + "loss": 0.0184, "macro_f1": 0.9262410998344421, "num_tokens": 3731467.0, "repeat_count": 2.0, - "routers_loss": 0.02852487564086914, + "routers_loss": 0.02510629966855049, "skip_count": 3.0, "step": 2314, "text_loss": 0.21639840304851532 @@ -22000,13 +22000,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05859375, + "grad_norm": 0.057861328125, "learning_rate": 0.0009231561319955684, - "loss": 0.0151, + "loss": 0.0154, "macro_f1": 0.3333333432674408, "num_tokens": 3734906.0, "repeat_count": 0.0, - "routers_loss": 0.007533316500484943, + "routers_loss": 0.00872227642685175, "skip_count": 0.0, "step": 2316, "text_loss": 0.35639774799346924 @@ -22019,13 +22019,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.08349609375, "learning_rate": 0.0009229911754712815, "loss": 0.0176, "macro_f1": 0.3333333432674408, "num_tokens": 3737943.0, "repeat_count": 0.0, - "routers_loss": 0.004666361026465893, + "routers_loss": 0.004695790819823742, "skip_count": 0.0, "step": 2318, "text_loss": 0.5269573330879211 @@ -22038,32 +22038,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.036376953125, "learning_rate": 0.0009228260568559781, - "loss": 0.0117, + "loss": 0.0115, "macro_f1": 0.3272727429866791, "num_tokens": 3741833.0, "repeat_count": 1.0, - "routers_loss": 0.020992714911699295, + "routers_loss": 0.0217357836663723, "skip_count": 0.0, "step": 2320, "text_loss": 0.5110208988189697 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 10.901673026122689, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.1416015625, + "f1_skip": 0.0, + "grad_norm": 0.1953125, "learning_rate": 0.0009226607762129322, - "loss": 0.0204, - "macro_f1": 0.6603773832321167, + "loss": 0.0201, + "macro_f1": 0.32098764181137085, "num_tokens": 3744642.0, "repeat_count": 1.0, - "routers_loss": 0.047016773372888565, + "routers_loss": 0.05595960095524788, "skip_count": 1.0, "step": 2322, "text_loss": 0.6291998624801636 @@ -22078,11 +22078,11 @@ "f1_skip": 0.0, "grad_norm": 0.056884765625, "learning_rate": 0.0009224953336054796, - "loss": 0.0156, + "loss": 0.0161, "macro_f1": 0.3333333432674408, "num_tokens": 3748127.0, "repeat_count": 0.0, - "routers_loss": 0.006612313445657492, + "routers_loss": 0.0071634589694440365, "skip_count": 0.0, "step": 2324, "text_loss": 0.7404762506484985 @@ -22095,13 +22095,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.050537109375, "learning_rate": 0.000922329729097018, - "loss": 0.0164, + "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 3751373.0, "repeat_count": 0.0, - "routers_loss": 0.0012452995870262384, + "routers_loss": 0.0011676300782710314, "skip_count": 0.0, "step": 2326, "text_loss": 0.2915459871292114 @@ -22114,13 +22114,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.055908203125, + "grad_norm": 0.061279296875, "learning_rate": 0.0009221639627510075, - "loss": 0.0128, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 3754518.0, "repeat_count": 0.0, - "routers_loss": 0.011379311792552471, + "routers_loss": 0.01039792038500309, "skip_count": 0.0, "step": 2328, "text_loss": 0.22066321969032288 @@ -22133,13 +22133,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0751953125, "learning_rate": 0.0009219980346309702, - "loss": 0.0127, + "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 3757621.0, "repeat_count": 0.0, - "routers_loss": 0.002973968628793955, + "routers_loss": 0.0032070958986878395, "skip_count": 0.0, "step": 2330, "text_loss": 0.5558560490608215 @@ -22152,13 +22152,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.076171875, "learning_rate": 0.0009218319448004899, - "loss": 0.012, + "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 3760885.0, "repeat_count": 0.0, - "routers_loss": 0.00768645154312253, + "routers_loss": 0.007085457909852266, "skip_count": 0.0, "step": 2332, "text_loss": 0.4348253607749939 @@ -22171,13 +22171,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1015625, + "grad_norm": 0.1103515625, "learning_rate": 0.0009216656933232129, - "loss": 0.0167, + "loss": 0.016, "macro_f1": 0.6666666865348816, "num_tokens": 3764462.0, "repeat_count": 0.0, - "routers_loss": 0.006761785596609116, + "routers_loss": 0.005504854489117861, "skip_count": 1.0, "step": 2334, "text_loss": 0.35828644037246704 @@ -22190,13 +22190,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0576171875, + "grad_norm": 0.05615234375, "learning_rate": 0.0009214992802628463, - "loss": 0.0129, + "loss": 0.0131, "macro_f1": 0.3333333432674408, "num_tokens": 3767159.0, "repeat_count": 0.0, - "routers_loss": 0.0013711688807234168, + "routers_loss": 0.0013970810687169433, "skip_count": 0.0, "step": 2336, "text_loss": 0.2956557869911194 @@ -22209,13 +22209,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.08203125, "learning_rate": 0.0009213327056831607, - "loss": 0.0174, + "loss": 0.0181, "macro_f1": 0.3272727429866791, "num_tokens": 3770408.0, "repeat_count": 0.0, - "routers_loss": 0.04009406641125679, + "routers_loss": 0.0427570566534996, "skip_count": 1.0, "step": 2338, "text_loss": 0.14883014559745789 @@ -22228,13 +22228,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04443359375, + "grad_norm": 0.041015625, "learning_rate": 0.0009211659696479875, - "loss": 0.0095, + "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 3773474.0, "repeat_count": 0.0, - "routers_loss": 0.0013272224459797144, + "routers_loss": 0.0011273405980318785, "skip_count": 0.0, "step": 2340, "text_loss": 0.26011669635772705 @@ -22249,11 +22249,11 @@ "f1_skip": 0.0, "grad_norm": 0.059814453125, "learning_rate": 0.00092099907222122, - "loss": 0.0145, + "loss": 0.0148, "macro_f1": 0.3333333432674408, "num_tokens": 3776909.0, "repeat_count": 0.0, - "routers_loss": 0.001724833040498197, + "routers_loss": 0.0016178421210497618, "skip_count": 0.0, "step": 2342, "text_loss": 0.49078530073165894 @@ -22266,13 +22266,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05908203125, + "grad_norm": 0.051025390625, "learning_rate": 0.000920832013466814, - "loss": 0.0132, + "loss": 0.0129, "macro_f1": 0.3333333432674408, "num_tokens": 3780741.0, "repeat_count": 0.0, - "routers_loss": 0.005641496740281582, + "routers_loss": 0.005510095041245222, "skip_count": 0.0, "step": 2344, "text_loss": 0.4870249927043915 @@ -22285,13 +22285,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.037109375, "learning_rate": 0.0009206647934487866, - "loss": 0.011, + "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 3784673.0, "repeat_count": 1.0, - "routers_loss": 0.003907595761120319, + "routers_loss": 0.0047357892617583275, "skip_count": 0.0, "step": 2346, "text_loss": 0.3251725733280182 @@ -22304,13 +22304,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.057861328125, + "grad_norm": 0.05615234375, "learning_rate": 0.0009204974122312167, - "loss": 0.0141, + "loss": 0.0142, "macro_f1": 0.6666666865348816, "num_tokens": 3787503.0, "repeat_count": 0.0, - "routers_loss": 0.007570050656795502, + "routers_loss": 0.00795028731226921, "skip_count": 1.0, "step": 2348, "text_loss": 0.18282145261764526 @@ -22323,13 +22323,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.055908203125, + "grad_norm": 0.060546875, "learning_rate": 0.0009203298698782452, - "loss": 0.0079, + "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 3790528.0, "repeat_count": 1.0, - "routers_loss": 0.0009280897793360054, + "routers_loss": 0.0009506374481134117, "skip_count": 0.0, "step": 2350, "text_loss": 0.4093080461025238 @@ -22342,13 +22342,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.045166015625, + "grad_norm": 0.047607421875, "learning_rate": 0.0009201621664540747, "loss": 0.0155, "macro_f1": 0.6666666865348816, "num_tokens": 3794134.0, "repeat_count": 1.0, - "routers_loss": 0.005288597662001848, + "routers_loss": 0.005159572698175907, "skip_count": 0.0, "step": 2352, "text_loss": 0.5451981425285339 @@ -22361,13 +22361,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.07666015625, "learning_rate": 0.0009199943020229694, - "loss": 0.0146, + "loss": 0.0148, "macro_f1": 0.3333333432674408, "num_tokens": 3797414.0, "repeat_count": 0.0, - "routers_loss": 0.002237799344584346, + "routers_loss": 0.002356168581172824, "skip_count": 0.0, "step": 2354, "text_loss": 0.3070453405380249 @@ -22380,13 +22380,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.0810546875, "learning_rate": 0.0009198262766492554, - "loss": 0.0144, + "loss": 0.0141, "macro_f1": 0.6666666865348816, "num_tokens": 3800094.0, "repeat_count": 0.0, - "routers_loss": 0.006226782687008381, + "routers_loss": 0.0051761893555521965, "skip_count": 1.0, "step": 2356, "text_loss": 0.5880904197692871 @@ -22399,13 +22399,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.049072265625, + "grad_norm": 0.049560546875, "learning_rate": 0.00091965809039732, - "loss": 0.0136, + "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3803280.0, "repeat_count": 0.0, - "routers_loss": 0.0027645498048514128, + "routers_loss": 0.0025952060241252184, "skip_count": 0.0, "step": 2358, "text_loss": 0.5210731625556946 @@ -22418,13 +22418,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.06787109375, "learning_rate": 0.0009194897433316127, - "loss": 0.0122, + "loss": 0.0125, "macro_f1": 0.6666666865348816, "num_tokens": 3805866.0, "repeat_count": 0.0, - "routers_loss": 0.0034913592971861362, + "routers_loss": 0.0042560105212032795, "skip_count": 2.0, "step": 2360, "text_loss": 0.6472984552383423 @@ -22437,13 +22437,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08056640625, + "grad_norm": 0.07568359375, "learning_rate": 0.0009193212355166446, - "loss": 0.0112, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3808952.0, "repeat_count": 0.0, - "routers_loss": 0.002706601284444332, + "routers_loss": 0.0026232977397739887, "skip_count": 0.0, "step": 2362, "text_loss": 0.450063556432724 @@ -22456,13 +22456,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.06689453125, "learning_rate": 0.0009191525670169881, - "loss": 0.0108, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3812080.0, "repeat_count": 0.0, - "routers_loss": 0.0032696903217583895, + "routers_loss": 0.0034355956595391035, "skip_count": 0.0, "step": 2364, "text_loss": 0.49727216362953186 @@ -22475,13 +22475,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.061767578125, + "grad_norm": 0.05908203125, "learning_rate": 0.000918983737897277, - "loss": 0.0115, + "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 3815282.0, "repeat_count": 0.0, - "routers_loss": 0.006245410069823265, + "routers_loss": 0.0055653867311775684, "skip_count": 1.0, "step": 2366, "text_loss": 0.6336377859115601 @@ -22496,11 +22496,11 @@ "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0009188147482222071, - "loss": 0.0079, + "loss": 0.008, "macro_f1": 1.0, "num_tokens": 3818106.0, "repeat_count": 2.0, - "routers_loss": 0.011230813339352608, + "routers_loss": 0.011016021482646465, "skip_count": 2.0, "step": 2368, "text_loss": 0.22513329982757568 @@ -22515,11 +22515,11 @@ "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.0009186455980565358, - "loss": 0.0109, + "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 3821228.0, "repeat_count": 1.0, - "routers_loss": 0.014897257089614868, + "routers_loss": 0.014039464294910431, "skip_count": 0.0, "step": 2370, "text_loss": 0.21331638097763062 @@ -22532,13 +22532,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.062255859375, "learning_rate": 0.0009184762874650816, - "loss": 0.0131, + "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 3825048.0, "repeat_count": 0.0, - "routers_loss": 0.0015503648901358247, + "routers_loss": 0.001088051125407219, "skip_count": 0.0, "step": 2372, "text_loss": 0.6031543612480164 @@ -22551,13 +22551,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.091796875, + "grad_norm": 0.095703125, "learning_rate": 0.0009183068165127245, - "loss": 0.0127, + "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 3828781.0, "repeat_count": 0.0, - "routers_loss": 0.00723480898886919, + "routers_loss": 0.006263940595090389, "skip_count": 1.0, "step": 2374, "text_loss": 0.6249601244926453 @@ -22570,13 +22570,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.076171875, + "grad_norm": 0.06982421875, "learning_rate": 0.0009181371852644062, - "loss": 0.0139, + "loss": 0.0133, "macro_f1": 0.6666666865348816, "num_tokens": 3832507.0, "repeat_count": 1.0, - "routers_loss": 0.002053398173302412, + "routers_loss": 0.001987969037145376, "skip_count": 0.0, "step": 2376, "text_loss": 0.37972065806388855 @@ -22589,32 +22589,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06689453125, + "grad_norm": 0.0908203125, "learning_rate": 0.0009179673937851299, "loss": 0.0158, "macro_f1": 0.6666666865348816, "num_tokens": 3835644.0, "repeat_count": 0.0, - "routers_loss": 0.007927518337965012, + "routers_loss": 0.007635094691067934, "skip_count": 1.0, "step": 2378, "text_loss": 0.46319663524627686 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 11.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.06298828125, + "f1_skip": 1.0, + "grad_norm": 0.0830078125, "learning_rate": 0.0009177974421399598, - "loss": 0.0144, - "macro_f1": 0.5555555820465088, + "loss": 0.0137, + "macro_f1": 0.6666666865348816, "num_tokens": 3838700.0, "repeat_count": 0.0, - "routers_loss": 0.01924682781100273, + "routers_loss": 0.01617279462516308, "skip_count": 2.0, "step": 2380, "text_loss": 0.32141056656837463 @@ -22627,13 +22627,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.046875, + "grad_norm": 0.056396484375, "learning_rate": 0.0009176273303940217, - "loss": 0.0106, + "loss": 0.011, "macro_f1": 0.6666666865348816, "num_tokens": 3841953.0, "repeat_count": 0.0, - "routers_loss": 0.0021689811255782843, + "routers_loss": 0.0022273799404501915, "skip_count": 2.0, "step": 2382, "text_loss": 0.5908139944076538 @@ -22646,13 +22646,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.062255859375, + "grad_norm": 0.0615234375, "learning_rate": 0.0009174570586125026, - "loss": 0.0119, + "loss": 0.0122, "macro_f1": 0.32098767161369324, "num_tokens": 3845763.0, "repeat_count": 1.0, - "routers_loss": 0.03431013971567154, + "routers_loss": 0.030915161594748497, "skip_count": 0.0, "step": 2384, "text_loss": 0.41400137543678284 @@ -22665,13 +22665,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.043212890625, + "grad_norm": 0.04248046875, "learning_rate": 0.0009172866268606513, - "loss": 0.0123, + "loss": 0.0122, "macro_f1": 0.6666666865348816, "num_tokens": 3848984.0, "repeat_count": 0.0, - "routers_loss": 0.008275258354842663, + "routers_loss": 0.010480951517820358, "skip_count": 2.0, "step": 2386, "text_loss": 0.2560874819755554 @@ -22684,13 +22684,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04736328125, + "grad_norm": 0.056396484375, "learning_rate": 0.0009171160352037775, - "loss": 0.0121, + "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 3852118.0, "repeat_count": 0.0, - "routers_loss": 0.007780806161463261, + "routers_loss": 0.00809961836785078, "skip_count": 1.0, "step": 2388, "text_loss": 0.28236693143844604 @@ -22709,7 +22709,7 @@ "macro_f1": 1.0, "num_tokens": 3855314.0, "repeat_count": 1.0, - "routers_loss": 0.00553786288946867, + "routers_loss": 0.005569872446358204, "skip_count": 1.0, "step": 2390, "text_loss": 0.4578137695789337 @@ -22722,13 +22722,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08447265625, + "grad_norm": 0.1123046875, "learning_rate": 0.0009167743724365073, - "loss": 0.01, + "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 3858301.0, "repeat_count": 0.0, - "routers_loss": 0.004066115710884333, + "routers_loss": 0.0038610948249697685, "skip_count": 1.0, "step": 2392, "text_loss": 0.14082716405391693 @@ -22741,13 +22741,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0810546875, + "grad_norm": 0.1376953125, "learning_rate": 0.0009166033014570368, - "loss": 0.0104, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3861296.0, "repeat_count": 0.0, - "routers_loss": 0.002403446938842535, + "routers_loss": 0.0017607157351449132, "skip_count": 0.0, "step": 2394, "text_loss": 0.384442001581192 @@ -22760,13 +22760,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.054443359375, + "grad_norm": 0.051025390625, "learning_rate": 0.0009164320708343954, - "loss": 0.0137, + "loss": 0.0131, "macro_f1": 0.6666666865348816, "num_tokens": 3863985.0, "repeat_count": 2.0, - "routers_loss": 0.010212135501205921, + "routers_loss": 0.009627950377762318, "skip_count": 0.0, "step": 2396, "text_loss": 0.6969521045684814 @@ -22779,13 +22779,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07275390625, + "grad_norm": 0.07666015625, "learning_rate": 0.0009162606806341989, "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 3866636.0, "repeat_count": 0.0, - "routers_loss": 0.007781816180795431, + "routers_loss": 0.006915586534887552, "skip_count": 0.0, "step": 2398, "text_loss": 0.48069697618484497 @@ -22798,32 +22798,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.04248046875, "learning_rate": 0.0009160891309221242, - "loss": 0.0151, + "loss": 0.0149, "macro_f1": 0.6666666865348816, "num_tokens": 3870867.0, "repeat_count": 1.0, - "routers_loss": 0.0016227158484980464, + "routers_loss": 0.0013031222624704242, "skip_count": 0.0, "step": 2400, "text_loss": 0.3882075846195221 }, { "acc_repeat": 0.5, - "acc_skip": 1.0, - "avg_layers": 28.0, + "acc_skip": 0.0, + "avg_layers": 29.0, "epoch": 11.277076606985618, - "f1_execute": 0.9803921580314636, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.6666666865348816, - "f1_skip": 1.0, - "grad_norm": 0.06298828125, + "f1_skip": 0.0, + "grad_norm": 0.06640625, "learning_rate": 0.0009159174217639096, - "loss": 0.0114, - "macro_f1": 0.8823530077934265, + "loss": 0.0112, + "macro_f1": 0.5427350401878357, "num_tokens": 3873663.0, "repeat_count": 2.0, - "routers_loss": 0.06490851938724518, + "routers_loss": 0.06621067970991135, "skip_count": 1.0, "step": 2402, "text_loss": 0.5740041136741638 @@ -22836,13 +22836,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.03662109375, "learning_rate": 0.0009157455532253547, - "loss": 0.0075, + "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 3876788.0, "repeat_count": 1.0, - "routers_loss": 0.007105287164449692, + "routers_loss": 0.005957918707281351, "skip_count": 0.0, "step": 2404, "text_loss": 0.26025933027267456 @@ -22855,13 +22855,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.06787109375, + "grad_norm": 0.08642578125, "learning_rate": 0.0009155735253723191, - "loss": 0.0125, + "loss": 0.0126, "macro_f1": 0.9452888369560242, "num_tokens": 3879942.0, "repeat_count": 1.0, - "routers_loss": 0.03736003860831261, + "routers_loss": 0.039429809898138046, "skip_count": 4.0, "step": 2406, "text_loss": 1.1349908113479614 @@ -22874,13 +22874,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.047607421875, "learning_rate": 0.0009154013382707251, - "loss": 0.011, + "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 3882682.0, "repeat_count": 0.0, - "routers_loss": 0.0012925176415592432, + "routers_loss": 0.0012570557883009315, "skip_count": 0.0, "step": 2408, "text_loss": 0.5611135363578796 @@ -22895,11 +22895,11 @@ "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0009152289919865543, - "loss": 0.0124, + "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3886425.0, "repeat_count": 0.0, - "routers_loss": 0.001746711554005742, + "routers_loss": 0.0017455556662753224, "skip_count": 0.0, "step": 2410, "text_loss": 0.7523751854896545 @@ -22912,13 +22912,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04345703125, + "grad_norm": 0.04052734375, "learning_rate": 0.0009150564865858506, - "loss": 0.0112, + "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 3889273.0, "repeat_count": 0.0, - "routers_loss": 0.011005193926393986, + "routers_loss": 0.011178011074662209, "skip_count": 1.0, "step": 2412, "text_loss": 0.26942551136016846 @@ -22931,13 +22931,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.800000011920929, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.07373046875, "learning_rate": 0.0009148838221347182, - "loss": 0.0102, + "loss": 0.0107, "macro_f1": 0.5934640765190125, "num_tokens": 3892199.0, "repeat_count": 3.0, - "routers_loss": 0.017795369029045105, + "routers_loss": 0.019628092646598816, "skip_count": 0.0, "step": 2414, "text_loss": 0.5492315888404846 @@ -22950,13 +22950,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.04541015625, "learning_rate": 0.0009147109986993225, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 3895362.0, "repeat_count": 1.0, - "routers_loss": 0.011693861335515976, + "routers_loss": 0.012255983427166939, "skip_count": 0.0, "step": 2416, "text_loss": 0.23798216879367828 @@ -22969,13 +22969,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.11669921875, "learning_rate": 0.0009145380163458899, - "loss": 0.0177, + "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 3898476.0, "repeat_count": 0.0, - "routers_loss": 0.007135285064578056, + "routers_loss": 0.007018954027444124, "skip_count": 0.0, "step": 2418, "text_loss": 0.1923145055770874 @@ -22988,13 +22988,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03515625, + "grad_norm": 0.03369140625, "learning_rate": 0.0009143648751407074, - "loss": 0.0082, + "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 3901817.0, "repeat_count": 0.0, - "routers_loss": 0.0008607010240666568, + "routers_loss": 0.0008574824314564466, "skip_count": 0.0, "step": 2420, "text_loss": 0.4001806974411011 @@ -23007,13 +23007,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.07861328125, + "grad_norm": 0.11328125, "learning_rate": 0.0009141915751501231, - "loss": 0.0101, + "loss": 0.0102, "macro_f1": 0.5492662787437439, "num_tokens": 3905461.0, "repeat_count": 0.0, - "routers_loss": 0.015359465964138508, + "routers_loss": 0.01572350226342678, "skip_count": 2.0, "step": 2422, "text_loss": 0.19519129395484924 @@ -23026,13 +23026,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0380859375, + "grad_norm": 0.037353515625, "learning_rate": 0.0009140181164405458, - "loss": 0.011, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3908878.0, "repeat_count": 0.0, - "routers_loss": 0.00047823251225054264, + "routers_loss": 0.0005503420252352953, "skip_count": 0.0, "step": 2424, "text_loss": 0.6937088370323181 @@ -23047,11 +23047,11 @@ "f1_skip": 0.0, "grad_norm": 0.068359375, "learning_rate": 0.0009138444990784454, - "loss": 0.0129, + "loss": 0.013, "macro_f1": 0.3333333432674408, "num_tokens": 3912053.0, "repeat_count": 0.0, - "routers_loss": 0.0070601715706288815, + "routers_loss": 0.007556677330285311, "skip_count": 0.0, "step": 2426, "text_loss": 0.35431069135665894 @@ -23064,13 +23064,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0947265625, + "grad_norm": 0.06201171875, "learning_rate": 0.000913670723130352, - "loss": 0.0123, + "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 3915192.0, "repeat_count": 0.0, - "routers_loss": 0.0010537977796047926, + "routers_loss": 0.0013609991874545813, "skip_count": 0.0, "step": 2428, "text_loss": 0.5171207189559937 @@ -23083,13 +23083,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0517578125, + "grad_norm": 0.050048828125, "learning_rate": 0.0009134967886628573, - "loss": 0.0117, + "loss": 0.0115, "macro_f1": 1.0, "num_tokens": 3917927.0, "repeat_count": 2.0, - "routers_loss": 0.012852456420660019, + "routers_loss": 0.010895746760070324, "skip_count": 2.0, "step": 2430, "text_loss": 0.2852934002876282 @@ -23102,13 +23102,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.062255859375, "learning_rate": 0.0009133226957426133, - "loss": 0.0134, + "loss": 0.0132, "macro_f1": 0.5492662787437439, "num_tokens": 3921460.0, "repeat_count": 2.0, - "routers_loss": 0.05307198315858841, + "routers_loss": 0.04196908697485924, "skip_count": 0.0, "step": 2432, "text_loss": 0.4864770770072937 @@ -23121,13 +23121,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1015625, + "grad_norm": 0.1025390625, "learning_rate": 0.0009131484444363324, - "loss": 0.0154, + "loss": 0.0155, "macro_f1": 0.3333333432674408, "num_tokens": 3924662.0, "repeat_count": 0.0, - "routers_loss": 0.004656757228076458, + "routers_loss": 0.004484197124838829, "skip_count": 0.0, "step": 2434, "text_loss": 0.7568684220314026 @@ -23140,13 +23140,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0498046875, + "grad_norm": 0.05078125, "learning_rate": 0.0009129740348107882, - "loss": 0.0113, + "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 3927337.0, "repeat_count": 0.0, - "routers_loss": 0.0042406003922224045, + "routers_loss": 0.004351360257714987, "skip_count": 2.0, "step": 2436, "text_loss": 0.5953161716461182 @@ -23159,13 +23159,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.0517578125, + "grad_norm": 0.04736328125, "learning_rate": 0.0009127994669328151, - "loss": 0.0089, + "loss": 0.0085, "macro_f1": 0.6122449040412903, "num_tokens": 3930407.0, "repeat_count": 0.0, - "routers_loss": 0.018079286441206932, + "routers_loss": 0.01664198748767376, "skip_count": 4.0, "step": 2438, "text_loss": 0.5320524573326111 @@ -23178,13 +23178,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.0595703125, "learning_rate": 0.0009126247408693071, - "loss": 0.0072, + "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 3933184.0, "repeat_count": 0.0, - "routers_loss": 0.002266801195219159, + "routers_loss": 0.0017819046042859554, "skip_count": 1.0, "step": 2440, "text_loss": 0.6051273345947266 @@ -23197,13 +23197,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.06640625, "learning_rate": 0.0009124498566872204, - "loss": 0.01, + "loss": 0.0105, "macro_f1": 0.3333333432674408, "num_tokens": 3936620.0, "repeat_count": 0.0, - "routers_loss": 0.005790423136204481, + "routers_loss": 0.005519696045666933, "skip_count": 0.0, "step": 2442, "text_loss": 0.12987950444221497 @@ -23216,13 +23216,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052734375, + "grad_norm": 0.052490234375, "learning_rate": 0.0009122748144535704, - "loss": 0.011, + "loss": 0.0111, "macro_f1": 0.32098764181137085, "num_tokens": 3940010.0, "repeat_count": 0.0, - "routers_loss": 0.04591076448559761, + "routers_loss": 0.04543351009488106, "skip_count": 2.0, "step": 2444, "text_loss": 0.4642033576965332 @@ -23235,13 +23235,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.04296875, "learning_rate": 0.0009120996142354338, - "loss": 0.0122, + "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 3943135.0, "repeat_count": 0.0, - "routers_loss": 0.004969341680407524, + "routers_loss": 0.00550565542653203, "skip_count": 0.0, "step": 2446, "text_loss": 0.5697627067565918 @@ -23254,13 +23254,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05615234375, + "grad_norm": 0.05029296875, "learning_rate": 0.0009119242560999477, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3946650.0, "repeat_count": 0.0, - "routers_loss": 0.00830315612256527, + "routers_loss": 0.008842485956847668, "skip_count": 0.0, "step": 2448, "text_loss": 0.17046524584293365 @@ -23273,13 +23273,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.078125, + "grad_norm": 0.08154296875, "learning_rate": 0.0009117487401143095, "loss": 0.0154, "macro_f1": 0.6666666865348816, "num_tokens": 3949470.0, "repeat_count": 1.0, - "routers_loss": 0.0059144929982721806, + "routers_loss": 0.005900127813220024, "skip_count": 0.0, "step": 2450, "text_loss": 0.37260866165161133 @@ -23292,13 +23292,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.030029296875, + "grad_norm": 0.035400390625, "learning_rate": 0.0009115730663457773, - "loss": 0.0132, + "loss": 0.0137, "macro_f1": 1.0, "num_tokens": 3952546.0, "repeat_count": 1.0, - "routers_loss": 0.0029762545600533485, + "routers_loss": 0.003409258322790265, "skip_count": 1.0, "step": 2452, "text_loss": 0.5308008193969727 @@ -23311,13 +23311,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.052001953125, + "grad_norm": 0.05224609375, "learning_rate": 0.0009113972348616698, - "loss": 0.0091, + "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 3955817.0, "repeat_count": 0.0, - "routers_loss": 0.011962058953940868, + "routers_loss": 0.010098597034811974, "skip_count": 1.0, "step": 2454, "text_loss": 0.39226648211479187 @@ -23330,13 +23330,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.1640625, "learning_rate": 0.0009112212457293658, - "loss": 0.0101, + "loss": 0.0102, "macro_f1": 0.3272727429866791, "num_tokens": 3958911.0, "repeat_count": 0.0, - "routers_loss": 0.07289884239435196, + "routers_loss": 0.08184818178415298, "skip_count": 0.0, "step": 2456, "text_loss": 0.45411455631256104 @@ -23349,13 +23349,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.041259765625, "learning_rate": 0.0009110450990163047, - "loss": 0.0124, + "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 3962584.0, "repeat_count": 0.0, - "routers_loss": 0.0009638209594413638, + "routers_loss": 0.0009352223132736981, "skip_count": 0.0, "step": 2458, "text_loss": 0.47292324900627136 @@ -23368,13 +23368,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0400390625, + "grad_norm": 0.041748046875, "learning_rate": 0.0009108687947899863, - "loss": 0.0078, + "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 3965597.0, "repeat_count": 1.0, - "routers_loss": 0.008587516844272614, + "routers_loss": 0.008150188252329826, "skip_count": 2.0, "step": 2460, "text_loss": 0.33208340406417847 @@ -23387,13 +23387,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.04150390625, + "grad_norm": 0.043212890625, "learning_rate": 0.0009106923331179707, - "loss": 0.0126, + "loss": 0.0125, "macro_f1": 0.5492662787437439, "num_tokens": 3968664.0, "repeat_count": 0.0, - "routers_loss": 0.05080332234501839, + "routers_loss": 0.050999004393815994, "skip_count": 2.0, "step": 2462, "text_loss": 0.2459995150566101 @@ -23406,13 +23406,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07080078125, + "grad_norm": 0.0693359375, "learning_rate": 0.0009105157140678782, - "loss": 0.0124, + "loss": 0.0126, "macro_f1": 0.6666666865348816, "num_tokens": 3971772.0, "repeat_count": 0.0, - "routers_loss": 0.007348654326051474, + "routers_loss": 0.006196586415171623, "skip_count": 1.0, "step": 2464, "text_loss": 0.23956991732120514 @@ -23425,13 +23425,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06787109375, + "grad_norm": 0.062255859375, "learning_rate": 0.0009103389377073896, - "loss": 0.0099, + "loss": 0.01, "macro_f1": 0.3333333432674408, "num_tokens": 3976224.0, "repeat_count": 0.0, - "routers_loss": 0.007161752786487341, + "routers_loss": 0.008181816898286343, "skip_count": 0.0, "step": 2466, "text_loss": 0.3235875070095062 @@ -23444,13 +23444,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.057373046875, "learning_rate": 0.0009101620041042462, - "loss": 0.0119, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3978876.0, "repeat_count": 0.0, - "routers_loss": 0.0015090530505403876, + "routers_loss": 0.0015451472718268633, "skip_count": 0.0, "step": 2468, "text_loss": 0.4038759469985962 @@ -23463,13 +23463,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07275390625, + "grad_norm": 0.09130859375, "learning_rate": 0.000909984913326249, - "loss": 0.0129, + "loss": 0.0131, "macro_f1": 0.3272727429866791, "num_tokens": 3981992.0, "repeat_count": 0.0, - "routers_loss": 0.021420184522867203, + "routers_loss": 0.021785033866763115, "skip_count": 1.0, "step": 2470, "text_loss": 0.6346460580825806 @@ -23482,13 +23482,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.0712890625, "learning_rate": 0.0009098076654412595, - "loss": 0.0092, + "loss": 0.0094, "macro_f1": 0.3333333432674408, "num_tokens": 3984560.0, "repeat_count": 0.0, - "routers_loss": 0.0010742908343672752, + "routers_loss": 0.0011462471447885036, "skip_count": 0.0, "step": 2472, "text_loss": 0.3449646532535553 @@ -23501,13 +23501,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05078125, + "grad_norm": 0.049560546875, "learning_rate": 0.0009096302605171996, - "loss": 0.011, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 3987548.0, "repeat_count": 0.0, - "routers_loss": 0.0015209210105240345, + "routers_loss": 0.0014367027906700969, "skip_count": 0.0, "step": 2474, "text_loss": 0.5918350219726562 @@ -23520,13 +23520,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044921875, + "grad_norm": 0.0478515625, "learning_rate": 0.0009094526986220513, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 3990727.0, "repeat_count": 0.0, - "routers_loss": 0.0008761848439462483, + "routers_loss": 0.0008977655088528991, "skip_count": 0.0, "step": 2476, "text_loss": 0.463350385427475 @@ -23539,13 +23539,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.049072265625, "learning_rate": 0.0009092749798238563, - "loss": 0.0146, + "loss": 0.015, "macro_f1": 0.3272727429866791, "num_tokens": 3993757.0, "repeat_count": 1.0, - "routers_loss": 0.01623794063925743, + "routers_loss": 0.016712551936507225, "skip_count": 0.0, "step": 2478, "text_loss": 0.5621229410171509 @@ -23558,13 +23558,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07080078125, + "grad_norm": 0.06640625, "learning_rate": 0.000909097104190717, - "loss": 0.0174, + "loss": 0.0172, "macro_f1": 0.32098764181137085, "num_tokens": 3997259.0, "repeat_count": 0.0, - "routers_loss": 0.04170118644833565, + "routers_loss": 0.04134179651737213, "skip_count": 2.0, "step": 2480, "text_loss": 0.375476598739624 @@ -23577,32 +23577,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.046875, + "grad_norm": 0.044677734375, "learning_rate": 0.0009089190717907956, - "loss": 0.0116, + "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 4000563.0, "repeat_count": 0.0, - "routers_loss": 0.003591755870729685, + "routers_loss": 0.003462378401309252, "skip_count": 0.0, "step": 2482, "text_loss": 0.5553798675537109 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 11.66216612855885, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.0693359375, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, "learning_rate": 0.0009087408826923146, - "loss": 0.0185, - "macro_f1": 0.5492662787437439, + "loss": 0.0182, + "macro_f1": 0.6666666865348816, "num_tokens": 4004065.0, "repeat_count": 0.0, - "routers_loss": 0.009214848279953003, + "routers_loss": 0.008057428523898125, "skip_count": 2.0, "step": 2484, "text_loss": 0.4329465329647064 @@ -23615,13 +23615,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.050048828125, "learning_rate": 0.0009085625369635564, - "loss": 0.0111, + "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 4007119.0, "repeat_count": 0.0, - "routers_loss": 0.0059350160881876945, + "routers_loss": 0.005759050603955984, "skip_count": 0.0, "step": 2486, "text_loss": 0.501268744468689 @@ -23634,13 +23634,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10693359375, + "grad_norm": 0.1240234375, "learning_rate": 0.0009083840346728631, - "loss": 0.0118, + "loss": 0.0122, "macro_f1": 0.3272727429866791, "num_tokens": 4010547.0, "repeat_count": 1.0, - "routers_loss": 0.019803427159786224, + "routers_loss": 0.020763102918863297, "skip_count": 0.0, "step": 2488, "text_loss": 0.480196475982666 @@ -23653,13 +23653,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.058349609375, + "grad_norm": 0.05078125, "learning_rate": 0.0009082053758886374, - "loss": 0.0118, + "loss": 0.0117, "macro_f1": 0.6666666865348816, "num_tokens": 4014600.0, "repeat_count": 0.0, - "routers_loss": 0.006243673153221607, + "routers_loss": 0.005801836494356394, "skip_count": 1.0, "step": 2490, "text_loss": 0.18249782919883728 @@ -23672,13 +23672,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.068359375, + "grad_norm": 0.062255859375, "learning_rate": 0.0009080265606793416, - "loss": 0.0132, + "loss": 0.0128, "macro_f1": 1.0, "num_tokens": 4017964.0, "repeat_count": 1.0, - "routers_loss": 0.003960726782679558, + "routers_loss": 0.004226063843816519, "skip_count": 1.0, "step": 2492, "text_loss": 0.6573076248168945 @@ -23691,13 +23691,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0537109375, + "grad_norm": 0.049072265625, "learning_rate": 0.000907847589113498, - "loss": 0.0127, + "loss": 0.0125, "macro_f1": 0.6666666865348816, "num_tokens": 4020694.0, "repeat_count": 0.0, - "routers_loss": 0.004959117621183395, + "routers_loss": 0.004281101748347282, "skip_count": 2.0, "step": 2494, "text_loss": 0.3944586217403412 @@ -23710,13 +23710,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0634765625, + "grad_norm": 0.061279296875, "learning_rate": 0.000907668461259689, - "loss": 0.0157, + "loss": 0.0152, "macro_f1": 0.6666666865348816, "num_tokens": 4023757.0, "repeat_count": 0.0, - "routers_loss": 0.009721433743834496, + "routers_loss": 0.008786370046436787, "skip_count": 1.0, "step": 2496, "text_loss": 0.6452898979187012 @@ -23729,13 +23729,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.0693359375, "learning_rate": 0.0009074891771865566, - "loss": 0.0124, + "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 4026601.0, "repeat_count": 0.0, - "routers_loss": 0.00491701066493988, + "routers_loss": 0.005209595896303654, "skip_count": 0.0, "step": 2498, "text_loss": 0.9633619785308838 @@ -23748,13 +23748,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.03759765625, "learning_rate": 0.0009073097369628028, - "loss": 0.0131, + "loss": 0.013, "macro_f1": 1.0, "num_tokens": 4030321.0, "repeat_count": 3.0, - "routers_loss": 0.009832080453634262, + "routers_loss": 0.00860709697008133, "skip_count": 1.0, "step": 2500, "text_loss": 0.48566827178001404 @@ -23767,13 +23767,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047607421875, + "grad_norm": 0.04443359375, "learning_rate": 0.0009071301406571893, - "loss": 0.0137, + "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 4033234.0, "repeat_count": 0.0, - "routers_loss": 0.003301833290606737, + "routers_loss": 0.0035277456045150757, "skip_count": 0.0, "step": 2502, "text_loss": 0.3771554231643677 @@ -23786,13 +23786,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.044189453125, "learning_rate": 0.000906950388338538, - "loss": 0.0134, + "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 4036417.0, "repeat_count": 0.0, - "routers_loss": 0.001580960932187736, + "routers_loss": 0.0013424850767478347, "skip_count": 0.0, "step": 2504, "text_loss": 0.8962806463241577 @@ -23805,13 +23805,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.09912109375, "learning_rate": 0.0009067704800757301, - "loss": 0.0091, + "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4039564.0, "repeat_count": 0.0, - "routers_loss": 0.0011505817528814077, + "routers_loss": 0.0010423909407109022, "skip_count": 0.0, "step": 2506, "text_loss": 0.43170279264450073 @@ -23824,13 +23824,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.04248046875, "learning_rate": 0.000906590415937707, - "loss": 0.0095, + "loss": 0.0094, "macro_f1": 0.3272727429866791, "num_tokens": 4043212.0, "repeat_count": 0.0, - "routers_loss": 0.023224346339702606, + "routers_loss": 0.021780289709568024, "skip_count": 1.0, "step": 2508, "text_loss": 0.41495826840400696 @@ -23843,13 +23843,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.0341796875, "learning_rate": 0.0009064101959934696, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 4046687.0, "repeat_count": 0.0, - "routers_loss": 0.007955167442560196, + "routers_loss": 0.007261929102241993, "skip_count": 1.0, "step": 2510, "text_loss": 0.21821187436580658 @@ -23862,13 +23862,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.057861328125, "learning_rate": 0.0009062298203120783, - "loss": 0.0101, + "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 4050735.0, "repeat_count": 0.0, - "routers_loss": 0.006164440419524908, + "routers_loss": 0.007447180338203907, "skip_count": 2.0, "step": 2512, "text_loss": 0.1818767935037613 @@ -23881,13 +23881,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.058837890625, + "grad_norm": 0.06494140625, "learning_rate": 0.0009060492889626535, - "loss": 0.014, + "loss": 0.0142, "macro_f1": 0.3272727429866791, "num_tokens": 4054426.0, "repeat_count": 1.0, - "routers_loss": 0.0713663101196289, + "routers_loss": 0.0718490406870842, "skip_count": 0.0, "step": 2514, "text_loss": 0.22798970341682434 @@ -23900,13 +23900,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.099609375, "learning_rate": 0.0009058686020143753, - "loss": 0.0182, + "loss": 0.0183, "macro_f1": 0.3333333432674408, "num_tokens": 4057615.0, "repeat_count": 0.0, - "routers_loss": 0.0052308146841824055, + "routers_loss": 0.0052676633931696415, "skip_count": 0.0, "step": 2516, "text_loss": 0.1712338626384735 @@ -23919,13 +23919,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04052734375, + "grad_norm": 0.0380859375, "learning_rate": 0.0009056877595364832, - "loss": 0.0143, + "loss": 0.0137, "macro_f1": 0.3333333432674408, "num_tokens": 4060338.0, "repeat_count": 0.0, - "routers_loss": 0.0020465939305722713, + "routers_loss": 0.0018052728846669197, "skip_count": 0.0, "step": 2518, "text_loss": 0.6811438798904419 @@ -23938,13 +23938,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.083984375, "learning_rate": 0.0009055067615982761, - "loss": 0.0114, + "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4062887.0, "repeat_count": 0.0, - "routers_loss": 0.0008663221378810704, + "routers_loss": 0.0009029926732182503, "skip_count": 0.0, "step": 2520, "text_loss": 0.5480356812477112 @@ -23957,13 +23957,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.051025390625, "learning_rate": 0.0009053256082691133, - "loss": 0.0104, + "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 4065357.0, "repeat_count": 0.0, - "routers_loss": 0.0026889131404459476, + "routers_loss": 0.0027515271212905645, "skip_count": 0.0, "step": 2522, "text_loss": 0.5234101414680481 @@ -23978,11 +23978,11 @@ "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.0009051442996184127, - "loss": 0.0181, + "loss": 0.0174, "macro_f1": 0.3333333432674408, "num_tokens": 4068111.0, "repeat_count": 0.0, - "routers_loss": 0.002255887258797884, + "routers_loss": 0.002199822571128607, "skip_count": 0.0, "step": 2524, "text_loss": 0.2418575882911682 @@ -23995,13 +23995,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.060546875, + "grad_norm": 0.0625, "learning_rate": 0.0009049628357156521, - "loss": 0.0144, + "loss": 0.0143, "macro_f1": 0.6666666865348816, "num_tokens": 4071284.0, "repeat_count": 0.0, - "routers_loss": 0.005672316066920757, + "routers_loss": 0.006303096655756235, "skip_count": 2.0, "step": 2526, "text_loss": 0.7948065996170044 @@ -24014,13 +24014,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0380859375, + "grad_norm": 0.037841796875, "learning_rate": 0.000904781216630369, - "loss": 0.007, + "loss": 0.0068, "macro_f1": 0.6601307392120361, "num_tokens": 4074750.0, "repeat_count": 1.0, - "routers_loss": 0.017167411744594574, + "routers_loss": 0.01791904680430889, "skip_count": 2.0, "step": 2528, "text_loss": 0.809726357460022 @@ -24033,13 +24033,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.053955078125, + "grad_norm": 0.0576171875, "learning_rate": 0.0009045994424321602, - "loss": 0.0101, + "loss": 0.0102, "macro_f1": 1.0, "num_tokens": 4078617.0, "repeat_count": 2.0, - "routers_loss": 0.019105618819594383, + "routers_loss": 0.016553178429603577, "skip_count": 2.0, "step": 2530, "text_loss": 0.8755000829696655 @@ -24052,13 +24052,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.060791015625, + "grad_norm": 0.061767578125, "learning_rate": 0.0009044175131906817, "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 4080936.0, "repeat_count": 0.0, - "routers_loss": 0.007993129082024097, + "routers_loss": 0.00884837657213211, "skip_count": 0.0, "step": 2532, "text_loss": 0.795871913433075 @@ -24071,13 +24071,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.05029296875, "learning_rate": 0.0009042354289756491, - "loss": 0.0124, + "loss": 0.0122, "macro_f1": 0.3333333432674408, "num_tokens": 4084459.0, "repeat_count": 0.0, - "routers_loss": 0.0024954001419246197, + "routers_loss": 0.0024387789890170097, "skip_count": 0.0, "step": 2534, "text_loss": 0.18875400722026825 @@ -24090,13 +24090,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.0625, "learning_rate": 0.0009040531898568379, - "loss": 0.0169, + "loss": 0.0171, "macro_f1": 0.3333333432674408, "num_tokens": 4088464.0, "repeat_count": 0.0, - "routers_loss": 0.004360117018222809, + "routers_loss": 0.00491489190608263, "skip_count": 0.0, "step": 2536, "text_loss": 0.334369033575058 @@ -24109,13 +24109,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0927734375, + "grad_norm": 0.091796875, "learning_rate": 0.000903870795904082, - "loss": 0.0142, + "loss": 0.0145, "macro_f1": 0.6666666865348816, "num_tokens": 4091659.0, "repeat_count": 0.0, - "routers_loss": 0.00429064966738224, + "routers_loss": 0.004592662677168846, "skip_count": 2.0, "step": 2538, "text_loss": 0.21298295259475708 @@ -24130,11 +24130,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.0458984375, "learning_rate": 0.000903688247187275, - "loss": 0.0136, + "loss": 0.0137, "macro_f1": 0.5492662787437439, "num_tokens": 4095496.0, "repeat_count": 0.0, - "routers_loss": 0.0132954316213727, + "routers_loss": 0.011647242121398449, "skip_count": 2.0, "step": 2540, "text_loss": 0.2985081672668457 @@ -24147,13 +24147,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.04443359375, "learning_rate": 0.0009035055437763704, - "loss": 0.0129, + "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 4098663.0, "repeat_count": 0.0, - "routers_loss": 0.002104961546137929, + "routers_loss": 0.0021238960325717926, "skip_count": 0.0, "step": 2542, "text_loss": 0.35359489917755127 @@ -24166,13 +24166,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.060791015625, + "grad_norm": 0.05859375, "learning_rate": 0.0009033226857413803, - "loss": 0.0167, + "loss": 0.0163, "macro_f1": 0.6666666865348816, "num_tokens": 4101588.0, "repeat_count": 1.0, - "routers_loss": 0.002973714144900441, + "routers_loss": 0.0024701557122170925, "skip_count": 0.0, "step": 2544, "text_loss": 1.1577601432800293 @@ -24185,13 +24185,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.080078125, "learning_rate": 0.000903139673152376, - "loss": 0.0119, + "loss": 0.012, "macro_f1": 0.3333333432674408, "num_tokens": 4104643.0, "repeat_count": 0.0, - "routers_loss": 0.002359170001000166, + "routers_loss": 0.002499542199075222, "skip_count": 0.0, "step": 2546, "text_loss": 1.0173401832580566 @@ -24204,13 +24204,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0615234375, + "grad_norm": 0.059814453125, "learning_rate": 0.0009029565060794885, - "loss": 0.0168, + "loss": 0.0165, "macro_f1": 0.3333333432674408, "num_tokens": 4109247.0, "repeat_count": 0.0, - "routers_loss": 0.0033595687709748745, + "routers_loss": 0.0034200598020106554, "skip_count": 0.0, "step": 2548, "text_loss": 0.5690504312515259 @@ -24223,13 +24223,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.07421875, + "grad_norm": 0.06884765625, "learning_rate": 0.0009027731845929079, "loss": 0.0155, "macro_f1": 0.8823530077934265, "num_tokens": 4112597.0, "repeat_count": 1.0, - "routers_loss": 0.015323673374950886, + "routers_loss": 0.015981333330273628, "skip_count": 1.0, "step": 2550, "text_loss": 0.294549822807312 @@ -24242,13 +24242,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.043212890625, + "grad_norm": 0.06103515625, "learning_rate": 0.0009025897087628829, - "loss": 0.0063, + "loss": 0.0064, "macro_f1": 0.5492662787437439, "num_tokens": 4115844.0, "repeat_count": 0.0, - "routers_loss": 0.02122018299996853, + "routers_loss": 0.02606951631605625, "skip_count": 2.0, "step": 2552, "text_loss": 0.22692419588565826 @@ -24261,13 +24261,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07763671875, + "grad_norm": 0.080078125, "learning_rate": 0.0009024060786597222, "loss": 0.0202, "macro_f1": 0.3333333432674408, "num_tokens": 4118634.0, "repeat_count": 0.0, - "routers_loss": 0.0010765352053567767, + "routers_loss": 0.001026194542646408, "skip_count": 0.0, "step": 2554, "text_loss": 0.6807059645652771 @@ -24280,13 +24280,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.04638671875, "learning_rate": 0.000902222294353793, - "loss": 0.0128, + "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 4122024.0, "repeat_count": 0.0, - "routers_loss": 0.0017301233019679785, + "routers_loss": 0.001974924933165312, "skip_count": 0.0, "step": 2556, "text_loss": 0.7373668551445007 @@ -24299,13 +24299,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.055908203125, + "grad_norm": 0.04833984375, "learning_rate": 0.0009020383559155219, - "loss": 0.0056, + "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 4124803.0, "repeat_count": 1.0, - "routers_loss": 0.004307204391807318, + "routers_loss": 0.004662613850086927, "skip_count": 2.0, "step": 2558, "text_loss": 0.21808166801929474 @@ -24318,13 +24318,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.029541015625, + "grad_norm": 0.0263671875, "learning_rate": 0.0009018542634153943, - "loss": 0.0064, + "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 4127680.0, "repeat_count": 0.0, - "routers_loss": 0.0073805381543934345, + "routers_loss": 0.006881687790155411, "skip_count": 0.0, "step": 2560, "text_loss": 0.25192978978157043 @@ -24339,11 +24339,11 @@ "f1_skip": 1.0, "grad_norm": 0.049560546875, "learning_rate": 0.0009016700169239551, - "loss": 0.0108, + "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 4130431.0, "repeat_count": 1.0, - "routers_loss": 0.005493874195963144, + "routers_loss": 0.005977808032184839, "skip_count": 1.0, "step": 2562, "text_loss": 0.4700816869735718 @@ -24356,13 +24356,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.068359375, "learning_rate": 0.0009014856165118075, - "loss": 0.0154, + "loss": 0.0153, "macro_f1": 0.6666666865348816, "num_tokens": 4133535.0, "repeat_count": 0.0, - "routers_loss": 0.006889877840876579, + "routers_loss": 0.007005698047578335, "skip_count": 1.0, "step": 2564, "text_loss": 0.6558199524879456 @@ -24375,13 +24375,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03125, + "grad_norm": 0.030517578125, "learning_rate": 0.0009013010622496144, - "loss": 0.009, + "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 4136534.0, "repeat_count": 0.0, - "routers_loss": 0.008495541289448738, + "routers_loss": 0.007262171246111393, "skip_count": 0.0, "step": 2566, "text_loss": 0.2565421462059021 @@ -24394,13 +24394,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.044921875, + "grad_norm": 0.043212890625, "learning_rate": 0.0009011163542080971, - "loss": 0.0089, + "loss": 0.0088, "macro_f1": 0.5934640765190125, "num_tokens": 4139762.0, "repeat_count": 0.0, - "routers_loss": 0.05929862707853317, + "routers_loss": 0.05431923270225525, "skip_count": 3.0, "step": 2568, "text_loss": 0.19896510243415833 @@ -24413,13 +24413,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02734375, + "grad_norm": 0.026611328125, "learning_rate": 0.0009009314924580363, - "loss": 0.0086, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 4143398.0, "repeat_count": 0.0, - "routers_loss": 0.0033934004604816437, + "routers_loss": 0.003667369019240141, "skip_count": 0.0, "step": 2570, "text_loss": 0.6581419110298157 @@ -24432,13 +24432,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.054931640625, + "grad_norm": 0.052978515625, "learning_rate": 0.0009007464770702712, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4146248.0, "repeat_count": 0.0, - "routers_loss": 0.0012826769379898906, + "routers_loss": 0.00132099783513695, "skip_count": 0.0, "step": 2572, "text_loss": 0.5316711068153381 @@ -24451,13 +24451,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.038818359375, "learning_rate": 0.0009005613081157002, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 4149455.0, "repeat_count": 0.0, - "routers_loss": 0.0019460092298686504, + "routers_loss": 0.0020061524119228125, "skip_count": 0.0, "step": 2574, "text_loss": 0.5400773882865906 @@ -24470,13 +24470,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.064453125, + "grad_norm": 0.05517578125, "learning_rate": 0.0009003759856652802, - "loss": 0.0112, + "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 4152774.0, "repeat_count": 0.0, - "routers_loss": 0.004493138287216425, + "routers_loss": 0.002621434163302183, "skip_count": 1.0, "step": 2576, "text_loss": 0.3672606945037842 @@ -24489,13 +24489,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.055908203125, + "grad_norm": 0.051513671875, "learning_rate": 0.0009001905097900273, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 4155835.0, "repeat_count": 0.0, - "routers_loss": 0.005607665050774813, + "routers_loss": 0.005290219560265541, "skip_count": 0.0, "step": 2578, "text_loss": 0.8159038424491882 @@ -24508,13 +24508,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04345703125, + "grad_norm": 0.040771484375, "learning_rate": 0.0009000048805610161, - "loss": 0.0123, + "loss": 0.0119, "macro_f1": 0.3333333432674408, "num_tokens": 4158874.0, "repeat_count": 0.0, - "routers_loss": 0.0015080278972163796, + "routers_loss": 0.0013576085912063718, "skip_count": 0.0, "step": 2580, "text_loss": 0.5518951416015625 @@ -24529,11 +24529,11 @@ "f1_skip": 0.0, "grad_norm": 0.138671875, "learning_rate": 0.00089981909804938, - "loss": 0.0142, + "loss": 0.0143, "macro_f1": 0.3333333432674408, "num_tokens": 4162076.0, "repeat_count": 0.0, - "routers_loss": 0.0022276053205132484, + "routers_loss": 0.0021483441814780235, "skip_count": 0.0, "step": 2582, "text_loss": 0.43552228808403015 @@ -24546,13 +24546,13 @@ "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.07421875, + "grad_norm": 0.068359375, "learning_rate": 0.0008996331623263114, - "loss": 0.0116, + "loss": 0.0117, "macro_f1": 0.7795917987823486, "num_tokens": 4165041.0, "repeat_count": 1.0, - "routers_loss": 0.0499282106757164, + "routers_loss": 0.0544300302863121, "skip_count": 4.0, "step": 2584, "text_loss": 0.24812501668930054 @@ -24565,13 +24565,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.047607421875, "learning_rate": 0.0008994470734630611, - "loss": 0.01, + "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 4168290.0, "repeat_count": 0.0, - "routers_loss": 0.0016360745066776872, + "routers_loss": 0.0017150711501017213, "skip_count": 0.0, "step": 2586, "text_loss": 0.6392097473144531 @@ -24584,32 +24584,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05224609375, + "grad_norm": 0.0615234375, "learning_rate": 0.0008992608315309388, - "loss": 0.0149, + "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 4171310.0, "repeat_count": 0.0, - "routers_loss": 0.0037772543728351593, + "routers_loss": 0.0046473173424601555, "skip_count": 2.0, "step": 2588, "text_loss": 0.6534156799316406 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 12.15967126504256, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.943396270275116, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.060791015625, + "f1_skip": 0.0, + "grad_norm": 0.06591796875, "learning_rate": 0.0008990744366013125, - "loss": 0.0104, - "macro_f1": 0.6538461446762085, + "loss": 0.0105, + "macro_f1": 0.3144654333591461, "num_tokens": 4174042.0, "repeat_count": 2.0, - "routers_loss": 0.05992122367024422, + "routers_loss": 0.060913100838661194, "skip_count": 1.0, "step": 2590, "text_loss": 0.5365690588951111 @@ -24622,13 +24622,13 @@ "f1_execute": 0.9583333134651184, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.05859375, + "grad_norm": 0.055419921875, "learning_rate": 0.0008988878887456093, "loss": 0.0118, "macro_f1": 0.6051587462425232, "num_tokens": 4177666.0, "repeat_count": 1.0, - "routers_loss": 0.0679154023528099, + "routers_loss": 0.06268956512212753, "skip_count": 4.0, "step": 2592, "text_loss": 0.226226806640625 @@ -24643,11 +24643,11 @@ "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0008987011880353149, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.32098764181137085, "num_tokens": 4180490.0, "repeat_count": 0.0, - "routers_loss": 0.03284052759408951, + "routers_loss": 0.030141465365886688, "skip_count": 2.0, "step": 2594, "text_loss": 0.2581401765346527 @@ -24660,13 +24660,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.051513671875, + "grad_norm": 0.044677734375, "learning_rate": 0.0008985143345419729, - "loss": 0.0087, + "loss": 0.0082, "macro_f1": 0.5492662787437439, "num_tokens": 4183300.0, "repeat_count": 0.0, - "routers_loss": 0.01971421390771866, + "routers_loss": 0.018745863810181618, "skip_count": 2.0, "step": 2596, "text_loss": 0.7778542637825012 @@ -24679,13 +24679,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0703125, + "grad_norm": 0.064453125, "learning_rate": 0.0008983273283371862, - "loss": 0.0099, + "loss": 0.0096, "macro_f1": 0.5492662787437439, "num_tokens": 4186535.0, "repeat_count": 0.0, - "routers_loss": 0.028065117076039314, + "routers_loss": 0.026792079210281372, "skip_count": 2.0, "step": 2598, "text_loss": 0.34700271487236023 @@ -24698,13 +24698,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.048828125, "learning_rate": 0.0008981401694926159, - "loss": 0.0077, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4189082.0, "repeat_count": 0.0, - "routers_loss": 0.00166845612693578, + "routers_loss": 0.001914160675369203, "skip_count": 0.0, "step": 2600, "text_loss": 0.6879339218139648 @@ -24717,13 +24717,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.06396484375, "learning_rate": 0.0008979528580799815, - "loss": 0.0138, + "loss": 0.0136, "macro_f1": 0.6666666865348816, "num_tokens": 4192330.0, "repeat_count": 0.0, - "routers_loss": 0.007527270819991827, + "routers_loss": 0.007978348061442375, "skip_count": 2.0, "step": 2602, "text_loss": 0.3524550497531891 @@ -24736,13 +24736,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.044189453125, "learning_rate": 0.0008977653941710613, - "loss": 0.0137, + "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 4196117.0, "repeat_count": 2.0, - "routers_loss": 0.00412185862660408, + "routers_loss": 0.0035376469604671, "skip_count": 0.0, "step": 2604, "text_loss": 0.42356348037719727 @@ -24755,13 +24755,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06005859375, + "grad_norm": 0.05810546875, "learning_rate": 0.0008975777778376916, - "loss": 0.0157, + "loss": 0.0156, "macro_f1": 0.6666666865348816, "num_tokens": 4200423.0, "repeat_count": 0.0, - "routers_loss": 0.007787751499563456, + "routers_loss": 0.008262477815151215, "skip_count": 1.0, "step": 2606, "text_loss": 0.5272893905639648 @@ -24774,13 +24774,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.0732421875, "learning_rate": 0.0008973900091517675, "loss": 0.0114, "macro_f1": 0.3272727429866791, "num_tokens": 4203257.0, "repeat_count": 0.0, - "routers_loss": 0.024111779406666756, + "routers_loss": 0.022957922890782356, "skip_count": 1.0, "step": 2608, "text_loss": 0.2713734805583954 @@ -24793,13 +24793,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.045166015625, + "grad_norm": 0.043701171875, "learning_rate": 0.000897202088185242, - "loss": 0.0091, + "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 4206243.0, "repeat_count": 0.0, - "routers_loss": 0.0057326615788042545, + "routers_loss": 0.006623407825827599, "skip_count": 2.0, "step": 2610, "text_loss": 0.5920525789260864 @@ -24812,13 +24812,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04443359375, + "grad_norm": 0.0517578125, "learning_rate": 0.0008970140150101274, - "loss": 0.0118, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 4209264.0, "repeat_count": 0.0, - "routers_loss": 0.0008877563523128629, + "routers_loss": 0.0008602747693657875, "skip_count": 0.0, "step": 2612, "text_loss": 0.33421996235847473 @@ -24831,13 +24831,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.030517578125, "learning_rate": 0.0008968257896984932, - "loss": 0.0067, + "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 4212058.0, "repeat_count": 0.0, - "routers_loss": 0.0039034869987517595, + "routers_loss": 0.0024653903674334288, "skip_count": 1.0, "step": 2614, "text_loss": 0.37923356890678406 @@ -24850,13 +24850,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.06298828125, "learning_rate": 0.0008966374123224677, - "loss": 0.0085, + "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4214929.0, "repeat_count": 0.0, - "routers_loss": 0.01140254084020853, + "routers_loss": 0.010878405533730984, "skip_count": 0.0, "step": 2616, "text_loss": 0.4350503981113434 @@ -24869,13 +24869,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03125, + "grad_norm": 0.0303955078125, "learning_rate": 0.0008964488829542376, "loss": 0.0083, "macro_f1": 0.3272727429866791, "num_tokens": 4219170.0, "repeat_count": 0.0, - "routers_loss": 0.028559349477291107, + "routers_loss": 0.02864212542772293, "skip_count": 1.0, "step": 2618, "text_loss": 0.26250728964805603 @@ -24888,13 +24888,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.061279296875, + "grad_norm": 0.062255859375, "learning_rate": 0.0008962602016660478, - "loss": 0.0097, + "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 4222077.0, "repeat_count": 0.0, - "routers_loss": 0.010525460354983807, + "routers_loss": 0.010444172658026218, "skip_count": 2.0, "step": 2620, "text_loss": 0.4718937575817108 @@ -24907,13 +24907,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.048583984375, + "grad_norm": 0.0478515625, "learning_rate": 0.0008960713685302011, - "loss": 0.0104, + "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 4225383.0, "repeat_count": 0.0, - "routers_loss": 0.005284689832478762, + "routers_loss": 0.006409442983567715, "skip_count": 1.0, "step": 2622, "text_loss": 0.30420538783073425 @@ -24926,13 +24926,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0284423828125, + "grad_norm": 0.02978515625, "learning_rate": 0.0008958823836190588, - "loss": 0.0051, + "loss": 0.005, "macro_f1": 0.3272727429866791, "num_tokens": 4228349.0, "repeat_count": 0.0, - "routers_loss": 0.011040215380489826, + "routers_loss": 0.009996986016631126, "skip_count": 1.0, "step": 2624, "text_loss": 0.5392362475395203 @@ -24945,13 +24945,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.031494140625, "learning_rate": 0.0008956932470050404, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 4232007.0, "repeat_count": 0.0, - "routers_loss": 0.0014406041009351611, + "routers_loss": 0.0014383369125425816, "skip_count": 0.0, "step": 2626, "text_loss": 0.7112401127815247 @@ -24964,13 +24964,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.072265625, + "grad_norm": 0.058349609375, "learning_rate": 0.0008955039587606233, - "loss": 0.0111, + "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 4235122.0, "repeat_count": 0.0, - "routers_loss": 0.007106760982424021, + "routers_loss": 0.00781513936817646, "skip_count": 3.0, "step": 2628, "text_loss": 0.17802883684635162 @@ -24983,13 +24983,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0400390625, + "grad_norm": 0.0439453125, "learning_rate": 0.0008953145189583429, - "loss": 0.0125, + "loss": 0.0126, "macro_f1": 0.542222261428833, "num_tokens": 4238248.0, "repeat_count": 0.0, - "routers_loss": 0.06423533707857132, + "routers_loss": 0.062252625823020935, "skip_count": 4.0, "step": 2630, "text_loss": 0.5551572442054749 @@ -25002,13 +25002,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.062255859375, "learning_rate": 0.0008951249276707933, - "loss": 0.012, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 4241042.0, "repeat_count": 0.0, - "routers_loss": 0.0010294591775164008, + "routers_loss": 0.0011421777307987213, "skip_count": 0.0, "step": 2632, "text_loss": 0.7092233896255493 @@ -25021,13 +25021,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.07177734375, "learning_rate": 0.0008949351849706261, - "loss": 0.0122, + "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 4243939.0, "repeat_count": 0.0, - "routers_loss": 0.0032732547260820866, + "routers_loss": 0.0032689040526747704, "skip_count": 0.0, "step": 2634, "text_loss": 0.19925718009471893 @@ -25040,13 +25040,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0390625, + "grad_norm": 0.033935546875, "learning_rate": 0.0008947452909305509, - "loss": 0.0112, + "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 4247535.0, "repeat_count": 1.0, - "routers_loss": 0.0021109411027282476, + "routers_loss": 0.002066014800220728, "skip_count": 0.0, "step": 2636, "text_loss": 0.5249715447425842 @@ -25059,13 +25059,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.11279296875, + "grad_norm": 0.09326171875, "learning_rate": 0.0008945552456233356, "loss": 0.0169, "macro_f1": 0.8820862174034119, "num_tokens": 4251441.0, "repeat_count": 2.0, - "routers_loss": 0.029545020312070847, + "routers_loss": 0.029332537204027176, "skip_count": 2.0, "step": 2638, "text_loss": 0.19229578971862793 @@ -25078,13 +25078,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.078125, "learning_rate": 0.0008943650491218058, - "loss": 0.0083, + "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4254314.0, "repeat_count": 0.0, - "routers_loss": 0.0075805820524692535, + "routers_loss": 0.0075911120511591434, "skip_count": 0.0, "step": 2640, "text_loss": 0.27059751749038696 @@ -25097,13 +25097,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.044189453125, "learning_rate": 0.0008941747014988453, - "loss": 0.0155, + "loss": 0.0156, "macro_f1": 0.3333333432674408, "num_tokens": 4257442.0, "repeat_count": 0.0, - "routers_loss": 0.008832095190882683, + "routers_loss": 0.009030844084918499, "skip_count": 0.0, "step": 2642, "text_loss": 0.36747801303863525 @@ -25116,13 +25116,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.080078125, + "grad_norm": 0.123046875, "learning_rate": 0.0008939842028273956, - "loss": 0.011, + "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 4260386.0, "repeat_count": 0.0, - "routers_loss": 0.008952614851295948, + "routers_loss": 0.007844001986086369, "skip_count": 1.0, "step": 2644, "text_loss": 0.6397647857666016 @@ -25135,13 +25135,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0250244140625, + "grad_norm": 0.0283203125, "learning_rate": 0.0008937935531804562, - "loss": 0.0075, + "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 4263516.0, "repeat_count": 0.0, - "routers_loss": 0.0017659157747402787, + "routers_loss": 0.0018789108144119382, "skip_count": 0.0, "step": 2646, "text_loss": 0.4795534908771515 @@ -25154,13 +25154,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05419921875, + "grad_norm": 0.06494140625, "learning_rate": 0.0008936027526310844, - "loss": 0.0101, + "loss": 0.0098, "macro_f1": 0.3272727429866791, "num_tokens": 4266744.0, "repeat_count": 0.0, - "routers_loss": 0.03944230079650879, + "routers_loss": 0.0348590686917305, "skip_count": 1.0, "step": 2648, "text_loss": 0.27691999077796936 @@ -25173,13 +25173,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07861328125, + "grad_norm": 0.07275390625, "learning_rate": 0.000893411801252395, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 4269766.0, "repeat_count": 0.0, - "routers_loss": 0.0037144431844353676, + "routers_loss": 0.004543309565633535, "skip_count": 1.0, "step": 2650, "text_loss": 0.18867231905460358 @@ -25192,13 +25192,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.041748046875, "learning_rate": 0.0008932206991175615, - "loss": 0.0143, + "loss": 0.0141, "macro_f1": 0.6666666865348816, "num_tokens": 4273513.0, "repeat_count": 0.0, - "routers_loss": 0.003659905167296529, + "routers_loss": 0.0035277456045150757, "skip_count": 1.0, "step": 2652, "text_loss": 0.45613357424736023 @@ -25211,13 +25211,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.055908203125, "learning_rate": 0.0008930294462998143, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 4276878.0, "repeat_count": 1.0, - "routers_loss": 0.011676746420562267, + "routers_loss": 0.011337592266499996, "skip_count": 0.0, "step": 2654, "text_loss": 0.24733254313468933 @@ -25230,13 +25230,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.0869140625, "learning_rate": 0.0008928380428724419, - "loss": 0.0061, + "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4279915.0, "repeat_count": 0.0, - "routers_loss": 0.000998969655483961, + "routers_loss": 0.0010295971296727657, "skip_count": 1.0, "step": 2656, "text_loss": 0.41722849011421204 @@ -25249,13 +25249,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.053955078125, "learning_rate": 0.0008926464889087903, - "loss": 0.0109, + "loss": 0.0116, "macro_f1": 0.6666666865348816, "num_tokens": 4282888.0, "repeat_count": 0.0, - "routers_loss": 0.0016260759439319372, + "routers_loss": 0.0017198545392602682, "skip_count": 2.0, "step": 2658, "text_loss": 0.738322377204895 @@ -25268,13 +25268,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.068359375, "learning_rate": 0.0008924547844822634, - "loss": 0.0101, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4285805.0, "repeat_count": 0.0, - "routers_loss": 0.0010900370543822646, + "routers_loss": 0.001339946174994111, "skip_count": 0.0, "step": 2660, "text_loss": 0.4802379906177521 @@ -25287,13 +25287,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.05322265625, "learning_rate": 0.000892262929666323, - "loss": 0.0101, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4290282.0, "repeat_count": 0.0, - "routers_loss": 0.002275131642818451, + "routers_loss": 0.0022340165451169014, "skip_count": 0.0, "step": 2662, "text_loss": 0.6503544449806213 @@ -25306,13 +25306,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0419921875, + "grad_norm": 0.03662109375, "learning_rate": 0.0008920709245344878, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 4294106.0, "repeat_count": 0.0, - "routers_loss": 0.00575100164860487, + "routers_loss": 0.005288850050419569, "skip_count": 1.0, "step": 2664, "text_loss": 0.12312037497758865 @@ -25325,13 +25325,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.038330078125, + "grad_norm": 0.041259765625, "learning_rate": 0.0008918787691603347, - "loss": 0.0122, + "loss": 0.0121, "macro_f1": 0.6666666865348816, "num_tokens": 4298013.0, "repeat_count": 0.0, - "routers_loss": 0.004139711149036884, + "routers_loss": 0.004259659443050623, "skip_count": 1.0, "step": 2666, "text_loss": 0.3070000112056732 @@ -25344,13 +25344,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0439453125, + "grad_norm": 0.04052734375, "learning_rate": 0.000891686463617498, - "loss": 0.0072, + "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 4300799.0, "repeat_count": 0.0, - "routers_loss": 0.008856390602886677, + "routers_loss": 0.009489355608820915, "skip_count": 1.0, "step": 2668, "text_loss": 0.18535588681697845 @@ -25363,13 +25363,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0576171875, + "grad_norm": 0.055908203125, "learning_rate": 0.0008914940079796696, - "loss": 0.0116, + "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 4304641.0, "repeat_count": 0.0, - "routers_loss": 0.002438562922179699, + "routers_loss": 0.0025417013093829155, "skip_count": 0.0, "step": 2670, "text_loss": 0.482585072517395 @@ -25382,13 +25382,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.05615234375, "learning_rate": 0.0008913014023205988, "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 4307462.0, "repeat_count": 0.0, - "routers_loss": 0.006435772404074669, + "routers_loss": 0.006371749565005302, "skip_count": 0.0, "step": 2672, "text_loss": 0.7064456939697266 @@ -25401,13 +25401,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.039306640625, "learning_rate": 0.0008911086467140925, - "loss": 0.0069, + "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 4310396.0, "repeat_count": 0.0, - "routers_loss": 0.002773779444396496, + "routers_loss": 0.0027512952219694853, "skip_count": 0.0, "step": 2674, "text_loss": 0.23532851040363312 @@ -25420,13 +25420,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.056640625, + "grad_norm": 0.05712890625, "learning_rate": 0.000890915741234015, - "loss": 0.0135, + "loss": 0.0133, "macro_f1": 0.6666666865348816, "num_tokens": 4314781.0, "repeat_count": 0.0, - "routers_loss": 0.00862761028110981, + "routers_loss": 0.008253013715147972, "skip_count": 1.0, "step": 2676, "text_loss": 0.30950358510017395 @@ -25439,13 +25439,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033203125, + "grad_norm": 0.03173828125, "learning_rate": 0.0008907226859542879, - "loss": 0.0104, + "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 4317988.0, "repeat_count": 0.0, - "routers_loss": 0.005587176885455847, + "routers_loss": 0.005409995559602976, "skip_count": 2.0, "step": 2678, "text_loss": 0.4930732846260071 @@ -25458,13 +25458,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.042236328125, + "grad_norm": 0.060546875, "learning_rate": 0.0008905294809488907, - "loss": 0.0082, + "loss": 0.0084, "macro_f1": 1.0, "num_tokens": 4321014.0, "repeat_count": 1.0, - "routers_loss": 0.0033104203175753355, + "routers_loss": 0.0029942214023321867, "skip_count": 1.0, "step": 2680, "text_loss": 0.6224040389060974 @@ -25477,13 +25477,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08203125, + "grad_norm": 0.06982421875, "learning_rate": 0.0008903361262918595, - "loss": 0.0117, + "loss": 0.0115, "macro_f1": 0.6666666865348816, "num_tokens": 4324268.0, "repeat_count": 0.0, - "routers_loss": 0.008205405436456203, + "routers_loss": 0.008411120623350143, "skip_count": 1.0, "step": 2682, "text_loss": 0.16296671330928802 @@ -25496,13 +25496,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.052734375, + "grad_norm": 0.05126953125, "learning_rate": 0.0008901426220572884, - "loss": 0.0142, + "loss": 0.0138, "macro_f1": 1.0, "num_tokens": 4327494.0, "repeat_count": 2.0, - "routers_loss": 0.007884894497692585, + "routers_loss": 0.01039006095379591, "skip_count": 4.0, "step": 2684, "text_loss": 0.43866512179374695 @@ -25515,13 +25515,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.060791015625, "learning_rate": 0.0008899489683193286, - "loss": 0.011, + "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 4330936.0, "repeat_count": 0.0, - "routers_loss": 0.0009336905204690993, + "routers_loss": 0.0009329111780971289, "skip_count": 0.0, "step": 2686, "text_loss": 0.44250962138175964 @@ -25534,13 +25534,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0810546875, + "grad_norm": 0.07421875, "learning_rate": 0.0008897551651521885, "loss": 0.0111, "macro_f1": 0.3333333432674408, "num_tokens": 4334123.0, "repeat_count": 0.0, - "routers_loss": 0.0033622782211750746, + "routers_loss": 0.003197216661646962, "skip_count": 0.0, "step": 2688, "text_loss": 0.48313501477241516 @@ -25553,13 +25553,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07763671875, + "grad_norm": 0.09716796875, "learning_rate": 0.0008895612126301339, "loss": 0.0157, "macro_f1": 0.3333333432674408, "num_tokens": 4337610.0, "repeat_count": 0.0, - "routers_loss": 0.0034563415683805943, + "routers_loss": 0.0033548236824572086, "skip_count": 0.0, "step": 2690, "text_loss": 0.4715327322483063 @@ -25572,13 +25572,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.051513671875, "learning_rate": 0.0008893671108274877, - "loss": 0.0115, + "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 4341026.0, "repeat_count": 0.0, - "routers_loss": 0.0022277699317783117, + "routers_loss": 0.0024757643695920706, "skip_count": 0.0, "step": 2692, "text_loss": 0.43402785062789917 @@ -25591,13 +25591,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.043212890625, "learning_rate": 0.0008891728598186302, - "loss": 0.011, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 4344422.0, "repeat_count": 0.0, - "routers_loss": 0.003892304375767708, + "routers_loss": 0.003317243419587612, "skip_count": 0.0, "step": 2694, "text_loss": 0.8498559594154358 @@ -25610,13 +25610,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.0380859375, + "grad_norm": 0.0400390625, "learning_rate": 0.0008889784596779986, - "loss": 0.0092, + "loss": 0.009, "macro_f1": 0.5934640765190125, "num_tokens": 4347507.0, "repeat_count": 0.0, - "routers_loss": 0.015058296732604504, + "routers_loss": 0.01577926240861416, "skip_count": 3.0, "step": 2696, "text_loss": 0.5646669864654541 @@ -25629,13 +25629,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10546875, + "grad_norm": 0.11328125, "learning_rate": 0.0008887839104800876, - "loss": 0.0118, + "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 4350414.0, "repeat_count": 0.0, - "routers_loss": 0.0033561652526259422, + "routers_loss": 0.002953822258859873, "skip_count": 0.0, "step": 2698, "text_loss": 0.5145012140274048 @@ -25648,13 +25648,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.05029296875, "learning_rate": 0.0008885892122994486, - "loss": 0.0116, + "loss": 0.0112, "macro_f1": 0.3333333432674408, "num_tokens": 4354110.0, "repeat_count": 0.0, - "routers_loss": 0.0062471418641507626, + "routers_loss": 0.005849295295774937, "skip_count": 0.0, "step": 2700, "text_loss": 0.580982506275177 @@ -25667,13 +25667,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.0419921875, "learning_rate": 0.0008883943652106903, "loss": 0.0086, "macro_f1": 1.0, "num_tokens": 4357323.0, "repeat_count": 1.0, - "routers_loss": 0.011802209541201591, + "routers_loss": 0.012347398325800896, "skip_count": 2.0, "step": 2702, "text_loss": 0.2234988808631897 @@ -25686,13 +25686,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.0673828125, "learning_rate": 0.0008881993692884787, - "loss": 0.0132, + "loss": 0.0128, "macro_f1": 0.6666666865348816, "num_tokens": 4360228.0, "repeat_count": 0.0, - "routers_loss": 0.0041528744623064995, + "routers_loss": 0.003574999049305916, "skip_count": 1.0, "step": 2704, "text_loss": 0.4261806607246399 @@ -25705,13 +25705,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0439453125, + "grad_norm": 0.048828125, "learning_rate": 0.0008880042246075365, - "loss": 0.0094, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4363905.0, "repeat_count": 0.0, - "routers_loss": 0.003151095937937498, + "routers_loss": 0.0031574300955981016, "skip_count": 0.0, "step": 2706, "text_loss": 0.691118061542511 @@ -25724,13 +25724,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.0419921875, "learning_rate": 0.0008878089312426433, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 4366736.0, "repeat_count": 0.0, - "routers_loss": 0.003142676781862974, + "routers_loss": 0.003195564029738307, "skip_count": 0.0, "step": 2708, "text_loss": 0.613926112651825 @@ -25743,13 +25743,13 @@ "f1_execute": 0.9583333134651184, "f1_repeat": 0.0, "f1_skip": 0.75, - "grad_norm": 0.05859375, + "grad_norm": 0.054443359375, "learning_rate": 0.0008876134892686363, "loss": 0.011, "macro_f1": 0.5694444179534912, "num_tokens": 4370146.0, "repeat_count": 0.0, - "routers_loss": 0.032964516431093216, + "routers_loss": 0.038784291595220566, "skip_count": 5.0, "step": 2710, "text_loss": 0.2723451852798462 @@ -25762,13 +25762,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.080078125, + "grad_norm": 0.0830078125, "learning_rate": 0.000887417898760409, - "loss": 0.0123, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 4373653.0, "repeat_count": 0.0, - "routers_loss": 0.0006848900229670107, + "routers_loss": 0.0006457131239585578, "skip_count": 0.0, "step": 2712, "text_loss": 0.31667640805244446 @@ -25781,13 +25781,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07861328125, + "grad_norm": 0.10498046875, "learning_rate": 0.000887222159792912, - "loss": 0.0156, + "loss": 0.0155, "macro_f1": 0.6603773832321167, "num_tokens": 4376993.0, "repeat_count": 1.0, - "routers_loss": 0.04388813674449921, + "routers_loss": 0.045078590512275696, "skip_count": 1.0, "step": 2714, "text_loss": 0.5872798562049866 @@ -25800,13 +25800,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.034912109375, "learning_rate": 0.0008870262724411528, - "loss": 0.0122, + "loss": 0.012, "macro_f1": 0.3333333432674408, "num_tokens": 4380160.0, "repeat_count": 0.0, - "routers_loss": 0.003538437420502305, + "routers_loss": 0.003628545207902789, "skip_count": 0.0, "step": 2716, "text_loss": 0.7468157410621643 @@ -25819,13 +25819,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1328125, + "grad_norm": 0.11181640625, "learning_rate": 0.0008868302367801962, - "loss": 0.0123, + "loss": 0.0118, "macro_f1": 0.6598639488220215, "num_tokens": 4383100.0, "repeat_count": 1.0, - "routers_loss": 0.05479869619011879, + "routers_loss": 0.05404464527964592, "skip_count": 3.0, "step": 2718, "text_loss": 0.2970244884490967 @@ -25838,13 +25838,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.0400390625, "learning_rate": 0.0008866340528851629, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4386700.0, "repeat_count": 0.0, - "routers_loss": 0.0070296903140842915, + "routers_loss": 0.007000274024903774, "skip_count": 0.0, "step": 2720, "text_loss": 0.34521186351776123 @@ -25857,13 +25857,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05810546875, + "grad_norm": 0.052978515625, "learning_rate": 0.0008864377208312313, - "loss": 0.0085, + "loss": 0.0082, "macro_f1": 0.8823530077934265, "num_tokens": 4390299.0, "repeat_count": 1.0, - "routers_loss": 0.02051853947341442, + "routers_loss": 0.02025366574525833, "skip_count": 2.0, "step": 2722, "text_loss": 1.0536936521530151 @@ -25876,13 +25876,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.038818359375, + "grad_norm": 0.04638671875, "learning_rate": 0.000886241240693636, - "loss": 0.0096, + "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 4393353.0, "repeat_count": 0.0, - "routers_loss": 0.002662461483851075, + "routers_loss": 0.00251673418097198, "skip_count": 0.0, "step": 2724, "text_loss": 0.5678093433380127 @@ -25895,13 +25895,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.05615234375, + "grad_norm": 0.052001953125, "learning_rate": 0.0008860446125476686, "loss": 0.0135, "macro_f1": 0.6666666865348816, "num_tokens": 4396446.0, "repeat_count": 1.0, - "routers_loss": 0.009321866557002068, + "routers_loss": 0.009532532654702663, "skip_count": 0.0, "step": 2726, "text_loss": 0.23775041103363037 @@ -25914,13 +25914,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.109375, + "grad_norm": 0.091796875, "learning_rate": 0.0008858478364686776, - "loss": 0.0102, + "loss": 0.0099, "macro_f1": 0.6666666865348816, "num_tokens": 4399977.0, "repeat_count": 1.0, - "routers_loss": 0.01029124017804861, + "routers_loss": 0.008062181062996387, "skip_count": 0.0, "step": 2728, "text_loss": 0.18888695538043976 @@ -25933,13 +25933,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037353515625, + "grad_norm": 0.035888671875, "learning_rate": 0.0008856509125320678, - "loss": 0.0082, + "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 4404406.0, "repeat_count": 0.0, - "routers_loss": 0.0008023424888961017, + "routers_loss": 0.0007731119985692203, "skip_count": 0.0, "step": 2730, "text_loss": 0.47331541776657104 @@ -25952,13 +25952,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0517578125, + "grad_norm": 0.0498046875, "learning_rate": 0.0008854538408133006, - "loss": 0.0115, + "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 4407165.0, "repeat_count": 0.0, - "routers_loss": 0.003058656118810177, + "routers_loss": 0.003115242812782526, "skip_count": 1.0, "step": 2732, "text_loss": 0.491370290517807 @@ -25971,13 +25971,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039794921875, + "grad_norm": 0.041015625, "learning_rate": 0.0008852566213878947, - "loss": 0.0082, + "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4410101.0, "repeat_count": 0.0, - "routers_loss": 0.0010282890871167183, + "routers_loss": 0.0008958528051152825, "skip_count": 0.0, "step": 2734, "text_loss": 0.42188262939453125 @@ -25990,13 +25990,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.07421875, + "grad_norm": 0.07763671875, "learning_rate": 0.0008850592543314246, - "loss": 0.0123, + "loss": 0.0118, "macro_f1": 1.0, "num_tokens": 4413015.0, "repeat_count": 1.0, - "routers_loss": 0.014785367995500565, + "routers_loss": 0.01139112375676632, "skip_count": 1.0, "step": 2736, "text_loss": 0.4716498553752899 @@ -26009,13 +26009,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0654296875, + "grad_norm": 0.0576171875, "learning_rate": 0.0008848617397195218, - "loss": 0.0089, + "loss": 0.0084, "macro_f1": 0.6603773832321167, "num_tokens": 4416404.0, "repeat_count": 1.0, - "routers_loss": 0.017717093229293823, + "routers_loss": 0.01609630137681961, "skip_count": 1.0, "step": 2738, "text_loss": 0.19490821659564972 @@ -26028,13 +26028,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.041015625, "learning_rate": 0.0008846640776278745, - "loss": 0.0067, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 4419408.0, "repeat_count": 0.0, - "routers_loss": 0.0011861984385177493, + "routers_loss": 0.001489170710556209, "skip_count": 0.0, "step": 2740, "text_loss": 0.6443108320236206 @@ -26047,13 +26047,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.0693359375, "learning_rate": 0.0008844662681322269, "loss": 0.0144, "macro_f1": 0.6666666865348816, "num_tokens": 4422067.0, "repeat_count": 1.0, - "routers_loss": 0.0013843412743881345, + "routers_loss": 0.0014755792217329144, "skip_count": 0.0, "step": 2742, "text_loss": 0.9150356650352478 @@ -26068,11 +26068,11 @@ "f1_skip": 1.0, "grad_norm": 0.05078125, "learning_rate": 0.0008842683113083801, - "loss": 0.0154, + "loss": 0.0149, "macro_f1": 0.6666666865348816, "num_tokens": 4425647.0, "repeat_count": 0.0, - "routers_loss": 0.010318896733224392, + "routers_loss": 0.008962674997746944, "skip_count": 1.0, "step": 2744, "text_loss": 0.7103227972984314 @@ -26085,13 +26085,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07861328125, + "grad_norm": 0.0751953125, "learning_rate": 0.0008840702072321915, - "loss": 0.0108, + "loss": 0.0104, "macro_f1": 0.6598639488220215, "num_tokens": 4428855.0, "repeat_count": 1.0, - "routers_loss": 0.029359478503465652, + "routers_loss": 0.02554207295179367, "skip_count": 3.0, "step": 2746, "text_loss": 0.27141591906547546 @@ -26104,13 +26104,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0234375, + "grad_norm": 0.0230712890625, "learning_rate": 0.0008838719559795751, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 4432838.0, "repeat_count": 0.0, - "routers_loss": 0.0014995118835940957, + "routers_loss": 0.0011747616808861494, "skip_count": 0.0, "step": 2748, "text_loss": 0.4007738530635834 @@ -26123,13 +26123,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.03515625, + "grad_norm": 0.03466796875, "learning_rate": 0.0008836735576265009, - "loss": 0.0074, + "loss": 0.0073, "macro_f1": 0.5492662787437439, "num_tokens": 4435793.0, "repeat_count": 0.0, - "routers_loss": 0.017950648441910744, + "routers_loss": 0.017564335837960243, "skip_count": 2.0, "step": 2750, "text_loss": 0.5972410440444946 @@ -26142,13 +26142,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.044921875, "learning_rate": 0.0008834750122489956, - "loss": 0.0083, + "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 4438871.0, "repeat_count": 1.0, - "routers_loss": 0.0069067892618477345, + "routers_loss": 0.007004009559750557, "skip_count": 0.0, "step": 2752, "text_loss": 0.2294853925704956 @@ -26161,13 +26161,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.051513671875, + "grad_norm": 0.06640625, "learning_rate": 0.0008832763199231423, - "loss": 0.0101, + "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 4441846.0, "repeat_count": 0.0, - "routers_loss": 0.0013944554375484586, + "routers_loss": 0.0014562139986082911, "skip_count": 0.0, "step": 2754, "text_loss": 0.722432017326355 @@ -26180,13 +26180,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.0751953125, "learning_rate": 0.0008830774807250802, "loss": 0.013, "macro_f1": 0.3272727429866791, "num_tokens": 4444786.0, "repeat_count": 1.0, - "routers_loss": 0.025158623233437538, + "routers_loss": 0.024773593991994858, "skip_count": 0.0, "step": 2756, "text_loss": 0.507905125617981 @@ -26199,13 +26199,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05419921875, + "grad_norm": 0.049072265625, "learning_rate": 0.0008828784947310049, - "loss": 0.0131, + "loss": 0.0129, "macro_f1": 0.8823530077934265, "num_tokens": 4448442.0, "repeat_count": 1.0, - "routers_loss": 0.05205477401614189, + "routers_loss": 0.04959975928068161, "skip_count": 2.0, "step": 2758, "text_loss": 0.3617522418498993 @@ -26218,13 +26218,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.1025390625, "learning_rate": 0.000882679362017168, "loss": 0.0149, "macro_f1": 1.0, "num_tokens": 4451401.0, "repeat_count": 1.0, - "routers_loss": 0.005898742936551571, + "routers_loss": 0.005783245898783207, "skip_count": 2.0, "step": 2760, "text_loss": 0.49187400937080383 @@ -26237,13 +26237,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.0791015625, "learning_rate": 0.0008824800826598778, - "loss": 0.0129, + "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 4454537.0, "repeat_count": 0.0, - "routers_loss": 0.006758298724889755, + "routers_loss": 0.00656260596588254, "skip_count": 0.0, "step": 2762, "text_loss": 0.6823583245277405 @@ -26256,13 +26256,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.0546875, "learning_rate": 0.0008822806567354983, - "loss": 0.0109, + "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 4457706.0, "repeat_count": 1.0, - "routers_loss": 0.005730919074267149, + "routers_loss": 0.005298966076225042, "skip_count": 0.0, "step": 2764, "text_loss": 0.554322361946106 @@ -26275,13 +26275,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.051025390625, + "grad_norm": 0.046630859375, "learning_rate": 0.0008820810843204501, - "loss": 0.0098, + "loss": 0.0096, "macro_f1": 0.3272727429866791, "num_tokens": 4460710.0, "repeat_count": 0.0, - "routers_loss": 0.03390989825129509, + "routers_loss": 0.03164982795715332, "skip_count": 1.0, "step": 2766, "text_loss": 0.1656961441040039 @@ -26294,13 +26294,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0849609375, + "grad_norm": 0.072265625, "learning_rate": 0.0008818813654912095, - "loss": 0.0165, + "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 4464001.0, "repeat_count": 0.0, - "routers_loss": 0.0007058497285470366, + "routers_loss": 0.000715116853825748, "skip_count": 0.0, "step": 2768, "text_loss": 0.5818144083023071 @@ -26313,13 +26313,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.058837890625, + "grad_norm": 0.056396484375, "learning_rate": 0.0008816815003243093, - "loss": 0.0136, + "loss": 0.0133, "macro_f1": 0.3333333432674408, "num_tokens": 4467364.0, "repeat_count": 0.0, - "routers_loss": 0.0027468691114336252, + "routers_loss": 0.002851625671610236, "skip_count": 0.0, "step": 2770, "text_loss": 0.6068631410598755 @@ -26332,13 +26332,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.033203125, "learning_rate": 0.0008814814888963383, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 4470681.0, "repeat_count": 0.0, - "routers_loss": 0.00443003186956048, + "routers_loss": 0.004729873035103083, "skip_count": 1.0, "step": 2772, "text_loss": 0.5386646389961243 @@ -26351,13 +26351,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0439453125, + "grad_norm": 0.04296875, "learning_rate": 0.000881281331283941, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 4473734.0, "repeat_count": 0.0, - "routers_loss": 0.0031219064258038998, + "routers_loss": 0.0031853127293288708, "skip_count": 1.0, "step": 2774, "text_loss": 0.5695263147354126 @@ -26370,13 +26370,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03369140625, + "grad_norm": 0.033447265625, "learning_rate": 0.0008810810275638182, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 4478404.0, "repeat_count": 0.0, - "routers_loss": 0.000846695271320641, + "routers_loss": 0.0008977465913631022, "skip_count": 0.0, "step": 2776, "text_loss": 0.4750773310661316 @@ -26389,13 +26389,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.0654296875, "learning_rate": 0.0008808805778127269, - "loss": 0.0075, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4481287.0, "repeat_count": 0.0, - "routers_loss": 0.0074167875573039055, + "routers_loss": 0.00469845999032259, "skip_count": 0.0, "step": 2778, "text_loss": 0.14078612625598907 @@ -26408,13 +26408,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.04296875, + "grad_norm": 0.049560546875, "learning_rate": 0.0008806799821074796, - "loss": 0.0078, + "loss": 0.0079, "macro_f1": 0.5492662787437439, "num_tokens": 4483929.0, "repeat_count": 0.0, - "routers_loss": 0.018358726054430008, + "routers_loss": 0.01789761893451214, "skip_count": 2.0, "step": 2780, "text_loss": 0.2167191207408905 @@ -26427,13 +26427,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.056396484375, "learning_rate": 0.0008804792405249451, - "loss": 0.0124, + "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 4487468.0, "repeat_count": 0.0, - "routers_loss": 0.001094152103178203, + "routers_loss": 0.001018838956952095, "skip_count": 0.0, "step": 2782, "text_loss": 0.5424665212631226 @@ -26446,13 +26446,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.0498046875, + "grad_norm": 0.07373046875, "learning_rate": 0.000880278353142048, - "loss": 0.0075, + "loss": 0.0077, "macro_f1": 0.8200000524520874, "num_tokens": 4490942.0, "repeat_count": 1.0, - "routers_loss": 0.03035641834139824, + "routers_loss": 0.03260354697704315, "skip_count": 3.0, "step": 2784, "text_loss": 0.20994654297828674 @@ -26465,13 +26465,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05517578125, + "grad_norm": 0.05322265625, "learning_rate": 0.0008800773200357683, - "loss": 0.0123, + "loss": 0.0122, "macro_f1": 0.3333333432674408, "num_tokens": 4493986.0, "repeat_count": 0.0, - "routers_loss": 0.002394269686192274, + "routers_loss": 0.003019835101440549, "skip_count": 0.0, "step": 2786, "text_loss": 0.5709528923034668 @@ -26484,13 +26484,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.034423828125, "learning_rate": 0.0008798761412831429, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 4498232.0, "repeat_count": 0.0, - "routers_loss": 0.0028274122159928083, + "routers_loss": 0.00285192858427763, "skip_count": 0.0, "step": 2788, "text_loss": 0.5103896260261536 @@ -26503,13 +26503,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0439453125, + "grad_norm": 0.044921875, "learning_rate": 0.0008796748169612634, - "loss": 0.0088, + "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 4501231.0, "repeat_count": 0.0, - "routers_loss": 0.0012642849469557405, + "routers_loss": 0.0012469831854104996, "skip_count": 0.0, "step": 2790, "text_loss": 0.43669697642326355 @@ -26522,13 +26522,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03662109375, + "grad_norm": 0.039794921875, "learning_rate": 0.0008794733471472778, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4504208.0, "repeat_count": 0.0, - "routers_loss": 0.010966303758323193, + "routers_loss": 0.011512776836752892, "skip_count": 1.0, "step": 2792, "text_loss": 0.2299770563840866 @@ -26541,13 +26541,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.03564453125, "learning_rate": 0.0008792717319183899, - "loss": 0.0064, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 4507013.0, "repeat_count": 0.0, - "routers_loss": 0.008194026537239552, + "routers_loss": 0.00834917277097702, "skip_count": 0.0, "step": 2794, "text_loss": 0.2130603939294815 @@ -26560,13 +26560,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0283203125, + "grad_norm": 0.03076171875, "learning_rate": 0.0008790699713518587, - "loss": 0.008, + "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 4510286.0, "repeat_count": 0.0, - "routers_loss": 0.008828429505228996, + "routers_loss": 0.008616939187049866, "skip_count": 2.0, "step": 2796, "text_loss": 0.4377101957798004 @@ -26579,13 +26579,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0274658203125, + "grad_norm": 0.02783203125, "learning_rate": 0.0008788680655249994, - "loss": 0.007, + "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4513762.0, "repeat_count": 0.0, - "routers_loss": 0.0038230866193771362, + "routers_loss": 0.003408568911254406, "skip_count": 0.0, "step": 2798, "text_loss": 0.435138463973999 @@ -26598,13 +26598,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0311279296875, + "grad_norm": 0.03369140625, "learning_rate": 0.0008786660145151826, - "loss": 0.009, + "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 4516696.0, "repeat_count": 1.0, - "routers_loss": 0.0031088131945580244, + "routers_loss": 0.0029398901388049126, "skip_count": 0.0, "step": 2800, "text_loss": 0.3195655047893524 @@ -26617,13 +26617,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.033203125, "learning_rate": 0.0008784638183998348, - "loss": 0.0083, + "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4519760.0, "repeat_count": 0.0, - "routers_loss": 0.0014194221002981067, + "routers_loss": 0.0013777425047010183, "skip_count": 0.0, "step": 2802, "text_loss": 0.8129430413246155 @@ -26636,13 +26636,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.032470703125, "learning_rate": 0.0008782614772564379, - "loss": 0.0099, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4522106.0, "repeat_count": 0.0, - "routers_loss": 0.0031931858975440264, + "routers_loss": 0.0031694830395281315, "skip_count": 0.0, "step": 2804, "text_loss": 0.18083660304546356 @@ -26655,13 +26655,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.064453125, "learning_rate": 0.0008780589911625293, - "loss": 0.0117, + "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 4525743.0, "repeat_count": 0.0, - "routers_loss": 0.0021834284998476505, + "routers_loss": 0.002161208540201187, "skip_count": 0.0, "step": 2806, "text_loss": 0.8228182792663574 @@ -26674,13 +26674,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0703125, + "grad_norm": 0.07177734375, "learning_rate": 0.0008778563601957021, - "loss": 0.0098, + "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 4529573.0, "repeat_count": 0.0, - "routers_loss": 0.0035390176344662905, + "routers_loss": 0.0028444856870919466, "skip_count": 1.0, "step": 2808, "text_loss": 0.3715563118457794 @@ -26693,13 +26693,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04296875, + "grad_norm": 0.044677734375, "learning_rate": 0.0008776535844336049, - "loss": 0.0095, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4532452.0, "repeat_count": 0.0, - "routers_loss": 0.0038604713045060635, + "routers_loss": 0.003807213855907321, "skip_count": 0.0, "step": 2810, "text_loss": 0.6012523174285889 @@ -26712,13 +26712,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.0361328125, "learning_rate": 0.0008774506639539417, - "loss": 0.0072, + "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 4536077.0, "repeat_count": 0.0, - "routers_loss": 0.00669970503076911, + "routers_loss": 0.006698979996144772, "skip_count": 0.0, "step": 2812, "text_loss": 0.27097949385643005 @@ -26731,13 +26731,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.099609375, "learning_rate": 0.0008772475988344722, - "loss": 0.0132, + "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 4539057.0, "repeat_count": 0.0, - "routers_loss": 0.004594485275447369, + "routers_loss": 0.004849409218877554, "skip_count": 1.0, "step": 2814, "text_loss": 1.026973843574524 @@ -26750,13 +26750,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.04638671875, + "grad_norm": 0.041748046875, "learning_rate": 0.0008770443891530109, - "loss": 0.0116, + "loss": 0.0115, "macro_f1": 0.5934640765190125, "num_tokens": 4542253.0, "repeat_count": 0.0, - "routers_loss": 0.01891930215060711, + "routers_loss": 0.019148651510477066, "skip_count": 3.0, "step": 2816, "text_loss": 0.2717585563659668 @@ -26769,13 +26769,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.054931640625, + "grad_norm": 0.052490234375, "learning_rate": 0.0008768410349874286, "loss": 0.0098, "macro_f1": 0.6601307392120361, "num_tokens": 4545047.0, "repeat_count": 1.0, - "routers_loss": 0.0247862096875906, + "routers_loss": 0.02231316640973091, "skip_count": 2.0, "step": 2818, "text_loss": 0.274346262216568 @@ -26788,13 +26788,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.043212890625, "learning_rate": 0.0008766375364156508, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 4548371.0, "repeat_count": 0.0, - "routers_loss": 0.008566800504922867, + "routers_loss": 0.008014129474759102, "skip_count": 2.0, "step": 2820, "text_loss": 0.22850871086120605 @@ -26807,13 +26807,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041748046875, + "grad_norm": 0.044189453125, "learning_rate": 0.0008764338935156586, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4551276.0, "repeat_count": 0.0, - "routers_loss": 0.0013546474510803819, + "routers_loss": 0.0014544493751600385, "skip_count": 0.0, "step": 2822, "text_loss": 0.6308462023735046 @@ -26826,13 +26826,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.0390625, "learning_rate": 0.000876230106365488, - "loss": 0.0122, + "loss": 0.0123, "macro_f1": 0.6666666865348816, "num_tokens": 4554143.0, "repeat_count": 0.0, - "routers_loss": 0.009204468689858913, + "routers_loss": 0.00818584579974413, "skip_count": 3.0, "step": 2824, "text_loss": 0.3484207093715668 @@ -26845,13 +26845,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03271484375, + "grad_norm": 0.0264892578125, "learning_rate": 0.0008760261750432312, - "loss": 0.0067, + "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 4557256.0, "repeat_count": 0.0, - "routers_loss": 0.00787584763020277, + "routers_loss": 0.006275608204305172, "skip_count": 3.0, "step": 2826, "text_loss": 0.1927330046892166 @@ -26864,13 +26864,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.0380859375, "learning_rate": 0.0008758220996270348, - "loss": 0.0102, + "loss": 0.0103, "macro_f1": 1.0, "num_tokens": 4560202.0, "repeat_count": 2.0, - "routers_loss": 0.0057869357988238335, + "routers_loss": 0.0055974251590669155, "skip_count": 2.0, "step": 2828, "text_loss": 0.7796496748924255 @@ -26883,13 +26883,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044921875, + "grad_norm": 0.046142578125, "learning_rate": 0.0008756178801951007, - "loss": 0.0128, + "loss": 0.0129, "macro_f1": 0.3333333432674408, "num_tokens": 4563508.0, "repeat_count": 0.0, - "routers_loss": 0.0018274546600878239, + "routers_loss": 0.0019799957517534494, "skip_count": 0.0, "step": 2830, "text_loss": 0.49633297324180603 @@ -26902,13 +26902,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.0458984375, "learning_rate": 0.0008754135168256865, - "loss": 0.0094, + "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4566776.0, "repeat_count": 0.0, - "routers_loss": 0.004527154844254255, + "routers_loss": 0.004538947716355324, "skip_count": 0.0, "step": 2832, "text_loss": 0.5346745252609253 @@ -26921,13 +26921,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.03857421875, "learning_rate": 0.0008752090095971044, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 4569787.0, "repeat_count": 0.0, - "routers_loss": 0.0018263199599459767, + "routers_loss": 0.001663343166001141, "skip_count": 0.0, "step": 2834, "text_loss": 0.5524004697799683 @@ -26940,13 +26940,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.07373046875, "learning_rate": 0.000875004358587722, - "loss": 0.0088, + "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 4572813.0, "repeat_count": 0.0, - "routers_loss": 0.0022649941965937614, + "routers_loss": 0.0022988212294876575, "skip_count": 0.0, "step": 2836, "text_loss": 0.4232870042324066 @@ -26959,13 +26959,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.038330078125, "learning_rate": 0.000874799563875962, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 4575563.0, "repeat_count": 0.0, - "routers_loss": 0.00791149027645588, + "routers_loss": 0.007781553082168102, "skip_count": 1.0, "step": 2838, "text_loss": 0.19239822030067444 @@ -26978,13 +26978,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0341796875, + "grad_norm": 0.03515625, "learning_rate": 0.0008745946255403021, "loss": 0.0072, "macro_f1": 0.5492662787437439, "num_tokens": 4578117.0, "repeat_count": 0.0, - "routers_loss": 0.016813624650239944, + "routers_loss": 0.01872488670051098, "skip_count": 2.0, "step": 2840, "text_loss": 0.2148810178041458 @@ -26999,11 +26999,11 @@ "f1_skip": 1.0, "grad_norm": 0.04296875, "learning_rate": 0.0008743895436592749, - "loss": 0.0079, + "loss": 0.0078, "macro_f1": 1.0, "num_tokens": 4582330.0, "repeat_count": 1.0, - "routers_loss": 0.004429332446306944, + "routers_loss": 0.005634195636957884, "skip_count": 1.0, "step": 2842, "text_loss": 0.4929640591144562 @@ -27016,13 +27016,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.048583984375, "learning_rate": 0.0008741843183114685, - "loss": 0.0084, + "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4585765.0, "repeat_count": 0.0, - "routers_loss": 0.0007147722644731402, + "routers_loss": 0.0008928569150157273, "skip_count": 0.0, "step": 2844, "text_loss": 0.32702967524528503 @@ -27035,13 +27035,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.044189453125, + "grad_norm": 0.0439453125, "learning_rate": 0.0008739789495755253, - "loss": 0.0092, + "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 4589000.0, "repeat_count": 0.0, - "routers_loss": 0.015438012778759003, + "routers_loss": 0.014715569093823433, "skip_count": 4.0, "step": 2846, "text_loss": 0.25125816464424133 @@ -27054,13 +27054,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.049560546875, "learning_rate": 0.0008737734375301433, - "loss": 0.0138, + "loss": 0.0135, "macro_f1": 0.3333333432674408, "num_tokens": 4592391.0, "repeat_count": 0.0, - "routers_loss": 0.0015892626252025366, + "routers_loss": 0.0017551190685480833, "skip_count": 0.0, "step": 2848, "text_loss": 0.6595172882080078 @@ -27073,13 +27073,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02734375, + "grad_norm": 0.027099609375, "learning_rate": 0.0008735677822540749, - "loss": 0.0086, + "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 4596662.0, "repeat_count": 0.0, - "routers_loss": 0.0006934175617061555, + "routers_loss": 0.0006456313421949744, "skip_count": 0.0, "step": 2850, "text_loss": 0.6290773153305054 @@ -27092,13 +27092,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.036865234375, "learning_rate": 0.0008733619838261276, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 4599682.0, "repeat_count": 0.0, - "routers_loss": 0.006811433006078005, + "routers_loss": 0.00765060493722558, "skip_count": 2.0, "step": 2852, "text_loss": 0.3268161416053772 @@ -27111,13 +27111,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.044921875, + "grad_norm": 0.041015625, "learning_rate": 0.0008731560423251637, - "loss": 0.0104, + "loss": 0.01, "macro_f1": 1.0, "num_tokens": 4603324.0, "repeat_count": 1.0, - "routers_loss": 0.012574959546327591, + "routers_loss": 0.01161442045122385, "skip_count": 2.0, "step": 2854, "text_loss": 0.3029932975769043 @@ -27130,13 +27130,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.888888955116272, - "grad_norm": 0.038818359375, + "grad_norm": 0.0419921875, "learning_rate": 0.0008729499578301005, "loss": 0.0098, "macro_f1": 0.9555556178092957, "num_tokens": 4606975.0, "repeat_count": 1.0, - "routers_loss": 0.01913273334503174, + "routers_loss": 0.02055389992892742, "skip_count": 5.0, "step": 2856, "text_loss": 0.6268532872200012 @@ -27149,13 +27149,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.05078125, "learning_rate": 0.00087274373041991, - "loss": 0.0082, + "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 4609629.0, "repeat_count": 0.0, - "routers_loss": 0.0012737065553665161, + "routers_loss": 0.0013911726418882608, "skip_count": 0.0, "step": 2858, "text_loss": 0.534355640411377 @@ -27168,13 +27168,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.053955078125, "learning_rate": 0.0008725373601736188, - "loss": 0.0079, + "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 4612913.0, "repeat_count": 2.0, - "routers_loss": 0.009088932536542416, + "routers_loss": 0.01010701060295105, "skip_count": 0.0, "step": 2860, "text_loss": 0.3391380310058594 @@ -27187,13 +27187,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0260009765625, + "grad_norm": 0.0255126953125, "learning_rate": 0.0008723308471703085, - "loss": 0.0078, + "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 4616718.0, "repeat_count": 0.0, - "routers_loss": 0.006364458240568638, + "routers_loss": 0.005969462916254997, "skip_count": 1.0, "step": 2862, "text_loss": 0.47250816226005554 @@ -27206,13 +27206,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047607421875, + "grad_norm": 0.046630859375, "learning_rate": 0.0008721241914891152, - "loss": 0.0084, + "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 4619680.0, "repeat_count": 0.0, - "routers_loss": 0.002686808817088604, + "routers_loss": 0.0027780034579336643, "skip_count": 0.0, "step": 2864, "text_loss": 0.3249278664588928 @@ -27225,13 +27225,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.0439453125, "learning_rate": 0.0008719173932092295, - "loss": 0.0047, + "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 4622700.0, "repeat_count": 0.0, - "routers_loss": 0.0018892486114054918, + "routers_loss": 0.0015912104863673449, "skip_count": 0.0, "step": 2866, "text_loss": 0.7789985537528992 @@ -27244,13 +27244,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.049072265625, + "grad_norm": 0.05126953125, "learning_rate": 0.0008717104524098973, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 4626637.0, "repeat_count": 0.0, - "routers_loss": 0.0035258810967206955, + "routers_loss": 0.0036539011634886265, "skip_count": 0.0, "step": 2868, "text_loss": 0.619088351726532 @@ -27263,13 +27263,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.103515625, + "grad_norm": 0.10400390625, "learning_rate": 0.0008715033691704187, - "loss": 0.0121, + "loss": 0.0118, "macro_f1": 0.6666666865348816, "num_tokens": 4629863.0, "repeat_count": 0.0, - "routers_loss": 0.007305602077394724, + "routers_loss": 0.008402476087212563, "skip_count": 1.0, "step": 2870, "text_loss": 0.5550018548965454 @@ -27282,13 +27282,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.06298828125, "learning_rate": 0.0008712961435701479, - "loss": 0.0162, + "loss": 0.0161, "macro_f1": 0.6666666865348816, "num_tokens": 4632657.0, "repeat_count": 0.0, - "routers_loss": 0.012898211367428303, + "routers_loss": 0.01400839351117611, "skip_count": 1.0, "step": 2872, "text_loss": 0.17368625104427338 @@ -27301,13 +27301,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.0419921875, "learning_rate": 0.0008710887756884947, - "loss": 0.0088, + "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 4635885.0, "repeat_count": 0.0, - "routers_loss": 0.0013437134912237525, + "routers_loss": 0.0014573842054232955, "skip_count": 0.0, "step": 2874, "text_loss": 0.5138643383979797 @@ -27320,13 +27320,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.033447265625, "learning_rate": 0.0008708812656049225, - "loss": 0.0091, + "loss": 0.009, "macro_f1": 0.6666666865348816, "num_tokens": 4639341.0, "repeat_count": 0.0, - "routers_loss": 0.002090727211907506, + "routers_loss": 0.002810224425047636, "skip_count": 1.0, "step": 2876, "text_loss": 0.70310378074646 @@ -27341,11 +27341,11 @@ "f1_skip": 0.8571428656578064, "grad_norm": 0.03564453125, "learning_rate": 0.0008706736133989497, - "loss": 0.0107, + "loss": 0.0105, "macro_f1": 0.9449735879898071, "num_tokens": 4642163.0, "repeat_count": 2.0, - "routers_loss": 0.030176319181919098, + "routers_loss": 0.029783209785819054, "skip_count": 4.0, "step": 2878, "text_loss": 0.26898008584976196 @@ -27358,13 +27358,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.04150390625, "learning_rate": 0.0008704658191501491, - "loss": 0.0091, + "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4645858.0, "repeat_count": 0.0, - "routers_loss": 0.0009633690933696926, + "routers_loss": 0.0009193966398015618, "skip_count": 0.0, "step": 2880, "text_loss": 0.6047570705413818 @@ -27377,13 +27377,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.060302734375, + "grad_norm": 0.05908203125, "learning_rate": 0.0008702578829381475, "loss": 0.0131, "macro_f1": 0.8814815282821655, "num_tokens": 4649237.0, "repeat_count": 2.0, - "routers_loss": 0.0568491593003273, + "routers_loss": 0.05698608607053757, "skip_count": 4.0, "step": 2882, "text_loss": 0.10695219784975052 @@ -27396,13 +27396,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0306396484375, + "grad_norm": 0.0311279296875, "learning_rate": 0.0008700498048426269, - "loss": 0.0082, + "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 4652362.0, "repeat_count": 0.0, - "routers_loss": 0.0012279651127755642, + "routers_loss": 0.0011786938412114978, "skip_count": 0.0, "step": 2884, "text_loss": 0.4442957937717438 @@ -27415,13 +27415,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.046142578125, "learning_rate": 0.0008698415849433229, - "loss": 0.0097, + "loss": 0.0092, "macro_f1": 0.5492662787437439, "num_tokens": 4655616.0, "repeat_count": 2.0, - "routers_loss": 0.02166076935827732, + "routers_loss": 0.02142646163702011, "skip_count": 0.0, "step": 2886, "text_loss": 0.5820964574813843 @@ -27434,13 +27434,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.043212890625, "learning_rate": 0.0008696332233200262, - "loss": 0.012, + "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 4659294.0, "repeat_count": 0.0, - "routers_loss": 0.003944257274270058, + "routers_loss": 0.004038636106997728, "skip_count": 0.0, "step": 2888, "text_loss": 0.11847645789384842 @@ -27453,13 +27453,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.0478515625, "learning_rate": 0.0008694247200525806, - "loss": 0.0092, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4662512.0, "repeat_count": 0.0, - "routers_loss": 0.0013393335975706577, + "routers_loss": 0.0013256469974294305, "skip_count": 0.0, "step": 2890, "text_loss": 0.4873582720756531 @@ -27472,13 +27472,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.039306640625, "learning_rate": 0.0008692160752208856, - "loss": 0.0128, + "loss": 0.0129, "macro_f1": 0.3272727429866791, "num_tokens": 4666190.0, "repeat_count": 0.0, - "routers_loss": 0.0443510003387928, + "routers_loss": 0.04477972164750099, "skip_count": 1.0, "step": 2892, "text_loss": 0.44243401288986206 @@ -27491,13 +27491,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.083984375, + "grad_norm": 0.09521484375, "learning_rate": 0.0008690072889048941, - "loss": 0.0125, + "loss": 0.0127, "macro_f1": 1.0, "num_tokens": 4668884.0, "repeat_count": 1.0, - "routers_loss": 0.0047337980940938, + "routers_loss": 0.004407547414302826, "skip_count": 2.0, "step": 2894, "text_loss": 0.6847127079963684 @@ -27510,13 +27510,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.041015625, + "grad_norm": 0.04052734375, "learning_rate": 0.0008687983611846133, - "loss": 0.0082, + "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 4672093.0, "repeat_count": 0.0, - "routers_loss": 0.0055244253017008305, + "routers_loss": 0.005245382897555828, "skip_count": 1.0, "step": 2896, "text_loss": 0.25583332777023315 @@ -27529,13 +27529,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.0458984375, "learning_rate": 0.0008685892921401049, - "loss": 0.011, + "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 4674917.0, "repeat_count": 0.0, - "routers_loss": 0.001250729663297534, + "routers_loss": 0.0010470855049788952, "skip_count": 0.0, "step": 2898, "text_loss": 0.41998377442359924 @@ -27548,13 +27548,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.037841796875, "learning_rate": 0.0008683800818514844, - "loss": 0.0061, + "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4677739.0, "repeat_count": 0.0, - "routers_loss": 0.00974183902144432, + "routers_loss": 0.009026622399687767, "skip_count": 2.0, "step": 2900, "text_loss": 0.303053081035614 @@ -27567,13 +27567,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.078125, + "grad_norm": 0.09619140625, "learning_rate": 0.0008681707303989215, - "loss": 0.0111, + "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 4680721.0, "repeat_count": 0.0, - "routers_loss": 0.004882345907390118, + "routers_loss": 0.004500916693359613, "skip_count": 0.0, "step": 2902, "text_loss": 0.5573288798332214 @@ -27586,13 +27586,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.06982421875, "learning_rate": 0.0008679612378626404, "loss": 0.0098, "macro_f1": 0.6666666865348816, "num_tokens": 4683339.0, "repeat_count": 0.0, - "routers_loss": 0.00568242697045207, + "routers_loss": 0.005047840531915426, "skip_count": 1.0, "step": 2904, "text_loss": 0.321353554725647 @@ -27605,13 +27605,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0306396484375, + "grad_norm": 0.03271484375, "learning_rate": 0.0008677516043229187, - "loss": 0.0082, + "loss": 0.0083, "macro_f1": 0.3272727429866791, "num_tokens": 4686453.0, "repeat_count": 0.0, - "routers_loss": 0.010831202380359173, + "routers_loss": 0.010256914421916008, "skip_count": 1.0, "step": 2906, "text_loss": 0.4300784468650818 @@ -27624,13 +27624,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.05615234375, + "grad_norm": 0.05029296875, "learning_rate": 0.0008675418298600883, - "loss": 0.0087, + "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 4689645.0, "repeat_count": 1.0, - "routers_loss": 0.00235295994207263, + "routers_loss": 0.0022669637110084295, "skip_count": 0.0, "step": 2908, "text_loss": 0.5064885020256042 @@ -27643,13 +27643,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.048828125, "learning_rate": 0.0008673319145545358, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4692320.0, "repeat_count": 0.0, - "routers_loss": 0.0011642680037766695, + "routers_loss": 0.0011188550852239132, "skip_count": 0.0, "step": 2910, "text_loss": 0.7114819884300232 @@ -27662,13 +27662,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.03369140625, "learning_rate": 0.0008671218584867003, - "loss": 0.0104, + "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 4695116.0, "repeat_count": 0.0, - "routers_loss": 0.00278888875618577, + "routers_loss": 0.002966561820358038, "skip_count": 2.0, "step": 2912, "text_loss": 0.5662392973899841 @@ -27681,13 +27681,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.049560546875, + "grad_norm": 0.047607421875, "learning_rate": 0.0008669116617370762, - "loss": 0.008, + "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4698040.0, "repeat_count": 0.0, - "routers_loss": 0.0014630162622779608, + "routers_loss": 0.0012894890969619155, "skip_count": 0.0, "step": 2914, "text_loss": 0.718977689743042 @@ -27700,13 +27700,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.1552734375, "learning_rate": 0.0008667013243862111, - "loss": 0.0159, + "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 4700963.0, "repeat_count": 0.0, - "routers_loss": 0.0011393720051273704, + "routers_loss": 0.0007232456118799746, "skip_count": 0.0, "step": 2916, "text_loss": 0.3447718024253845 @@ -27719,13 +27719,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02978515625, + "grad_norm": 0.0289306640625, "learning_rate": 0.000866490846514707, - "loss": 0.0072, + "loss": 0.0075, "macro_f1": 0.3272727429866791, "num_tokens": 4704471.0, "repeat_count": 1.0, - "routers_loss": 0.014218449592590332, + "routers_loss": 0.015166680328547955, "skip_count": 0.0, "step": 2918, "text_loss": 0.454946368932724 @@ -27738,13 +27738,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.052978515625, + "grad_norm": 0.04736328125, "learning_rate": 0.000866280228203219, "loss": 0.0073, "macro_f1": 1.0, "num_tokens": 4707238.0, "repeat_count": 1.0, - "routers_loss": 0.005367610137909651, + "routers_loss": 0.0061312485486269, "skip_count": 1.0, "step": 2920, "text_loss": 0.721788227558136 @@ -27757,13 +27757,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048828125, + "grad_norm": 0.055908203125, "learning_rate": 0.0008660694695324564, - "loss": 0.0124, + "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 4711323.0, "repeat_count": 0.0, - "routers_loss": 0.0020303199999034405, + "routers_loss": 0.00169933564029634, "skip_count": 0.0, "step": 2922, "text_loss": 0.7562121748924255 @@ -27776,13 +27776,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06201171875, + "grad_norm": 0.0654296875, "learning_rate": 0.0008658585705831829, - "loss": 0.0123, + "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 4714417.0, "repeat_count": 0.0, - "routers_loss": 0.0022230520844459534, + "routers_loss": 0.0022731393110007048, "skip_count": 0.0, "step": 2924, "text_loss": 0.5726147890090942 @@ -27795,13 +27795,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.06787109375, + "grad_norm": 0.068359375, "learning_rate": 0.0008656475314362148, - "loss": 0.0133, + "loss": 0.0131, "macro_f1": 0.8817967176437378, "num_tokens": 4717445.0, "repeat_count": 2.0, - "routers_loss": 0.06414645165205002, + "routers_loss": 0.06477782875299454, "skip_count": 3.0, "step": 2926, "text_loss": 0.4505867660045624 @@ -27814,13 +27814,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.0625, + "grad_norm": 0.06396484375, "learning_rate": 0.0008654363521724229, - "loss": 0.0128, + "loss": 0.0129, "macro_f1": 0.9449735879898071, "num_tokens": 4722253.0, "repeat_count": 2.0, - "routers_loss": 0.022727061063051224, + "routers_loss": 0.027405790984630585, "skip_count": 4.0, "step": 2928, "text_loss": 0.24767601490020752 @@ -27833,13 +27833,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.0537109375, "learning_rate": 0.0008652250328727315, - "loss": 0.0114, + "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 4725465.0, "repeat_count": 0.0, - "routers_loss": 0.006181784905493259, + "routers_loss": 0.006544729229062796, "skip_count": 2.0, "step": 2930, "text_loss": 0.4478724002838135 @@ -27852,13 +27852,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.044921875, + "grad_norm": 0.0517578125, "learning_rate": 0.0008650135736181184, - "loss": 0.0133, + "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 4729213.0, "repeat_count": 1.0, - "routers_loss": 0.005527070257812738, + "routers_loss": 0.0055119614116847515, "skip_count": 0.0, "step": 2932, "text_loss": 0.6749323010444641 @@ -27871,13 +27871,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05517578125, + "grad_norm": 0.045166015625, "learning_rate": 0.0008648019744896154, - "loss": 0.0102, + "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 4732280.0, "repeat_count": 0.0, - "routers_loss": 0.008868738077580929, + "routers_loss": 0.008374541997909546, "skip_count": 0.0, "step": 2934, "text_loss": 0.4647359251976013 @@ -27890,13 +27890,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.057373046875, + "grad_norm": 0.06201171875, "learning_rate": 0.0008645902355683077, - "loss": 0.0089, + "loss": 0.0091, "macro_f1": 0.6595745086669922, "num_tokens": 4736244.0, "repeat_count": 1.0, - "routers_loss": 0.07285884022712708, + "routers_loss": 0.068686343729496, "skip_count": 4.0, "step": 2936, "text_loss": 0.5356017351150513 @@ -27909,13 +27909,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.043212890625, + "grad_norm": 0.042236328125, "learning_rate": 0.0008643783569353339, - "loss": 0.0072, + "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 4739810.0, "repeat_count": 2.0, - "routers_loss": 0.019306030124425888, + "routers_loss": 0.017954571172595024, "skip_count": 0.0, "step": 2938, "text_loss": 0.3145926296710968 @@ -27928,13 +27928,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.054443359375, "learning_rate": 0.0008641663386718863, - "loss": 0.0084, + "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4742720.0, "repeat_count": 0.0, - "routers_loss": 0.00626454409211874, + "routers_loss": 0.006261351052671671, "skip_count": 1.0, "step": 2940, "text_loss": 0.3200613856315613 @@ -27949,11 +27949,11 @@ "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0008639541808592109, - "loss": 0.0091, + "loss": 0.0093, "macro_f1": 1.0, "num_tokens": 4745870.0, "repeat_count": 1.0, - "routers_loss": 0.0019172134343534708, + "routers_loss": 0.0025341357104480267, "skip_count": 1.0, "step": 2942, "text_loss": 0.5020416378974915 @@ -27968,11 +27968,11 @@ "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.0008637418835786067, - "loss": 0.0095, + "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 4748943.0, "repeat_count": 0.0, - "routers_loss": 0.009745351038873196, + "routers_loss": 0.008970048278570175, "skip_count": 2.0, "step": 2944, "text_loss": 0.14517110586166382 @@ -27985,13 +27985,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.043701171875, + "grad_norm": 0.055908203125, "learning_rate": 0.0008635294469114265, - "loss": 0.011, + "loss": 0.0112, "macro_f1": 0.3333333432674408, "num_tokens": 4751360.0, "repeat_count": 0.0, - "routers_loss": 0.0020624736789613962, + "routers_loss": 0.002133632078766823, "skip_count": 0.0, "step": 2946, "text_loss": 0.5367856025695801 @@ -28004,13 +28004,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.091796875, + "grad_norm": 0.08837890625, "learning_rate": 0.0008633168709390766, - "loss": 0.0118, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 4754403.0, "repeat_count": 0.0, - "routers_loss": 0.001082106726244092, + "routers_loss": 0.0011866620043292642, "skip_count": 0.0, "step": 2948, "text_loss": 0.38302522897720337 @@ -28023,13 +28023,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.037109375, "learning_rate": 0.0008631041557430163, - "loss": 0.0061, + "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 4757867.0, "repeat_count": 2.0, - "routers_loss": 0.0026527612935751677, + "routers_loss": 0.0026854004245251417, "skip_count": 0.0, "step": 2950, "text_loss": 0.43433454632759094 @@ -28042,13 +28042,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.05859375, "learning_rate": 0.0008628913014047585, "loss": 0.0102, "macro_f1": 0.3333333432674408, "num_tokens": 4761171.0, "repeat_count": 0.0, - "routers_loss": 0.0027245471719652414, + "routers_loss": 0.002433479530736804, "skip_count": 0.0, "step": 2952, "text_loss": 0.4725971519947052 @@ -28061,13 +28061,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0286865234375, + "grad_norm": 0.028564453125, "learning_rate": 0.0008626783080058696, - "loss": 0.0065, + "loss": 0.0066, "macro_f1": 0.3272727429866791, "num_tokens": 4764752.0, "repeat_count": 1.0, - "routers_loss": 0.01764744706451893, + "routers_loss": 0.017182493582367897, "skip_count": 0.0, "step": 2954, "text_loss": 0.460641473531723 @@ -28080,13 +28080,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0859375, + "grad_norm": 0.12353515625, "learning_rate": 0.0008624651756279687, - "loss": 0.0196, + "loss": 0.0198, "macro_f1": 0.3333333432674408, "num_tokens": 4767453.0, "repeat_count": 0.0, - "routers_loss": 0.0019560824148356915, + "routers_loss": 0.0018134774873033166, "skip_count": 0.0, "step": 2956, "text_loss": 0.4091459810733795 @@ -28099,13 +28099,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, - "grad_norm": 0.051025390625, + "grad_norm": 0.053466796875, "learning_rate": 0.000862251904352729, "loss": 0.0108, "macro_f1": 0.9259259104728699, "num_tokens": 4771110.0, "repeat_count": 3.0, - "routers_loss": 0.03031078353524208, + "routers_loss": 0.0365753099322319, "skip_count": 3.0, "step": 2958, "text_loss": 0.22408585250377655 @@ -28118,13 +28118,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05224609375, + "grad_norm": 0.05029296875, "learning_rate": 0.000862038494261876, "loss": 0.0109, "macro_f1": 0.3272727429866791, "num_tokens": 4774464.0, "repeat_count": 0.0, - "routers_loss": 0.024790454655885696, + "routers_loss": 0.024343067780137062, "skip_count": 1.0, "step": 2960, "text_loss": 0.16483014822006226 @@ -28137,13 +28137,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052490234375, + "grad_norm": 0.0654296875, "learning_rate": 0.0008618249454371891, - "loss": 0.0099, + "loss": 0.01, "macro_f1": 0.3333333432674408, "num_tokens": 4777894.0, "repeat_count": 0.0, - "routers_loss": 0.0008704765350557864, + "routers_loss": 0.0008310087723657489, "skip_count": 0.0, "step": 2962, "text_loss": 0.5573428869247437 @@ -28156,13 +28156,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.043212890625, "learning_rate": 0.0008616112579605006, - "loss": 0.0116, + "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 4781116.0, "repeat_count": 0.0, - "routers_loss": 0.0066874073818326, + "routers_loss": 0.0065494864247739315, "skip_count": 0.0, "step": 2964, "text_loss": 0.18816794455051422 @@ -28175,13 +28175,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.04248046875, "learning_rate": 0.0008613974319136957, - "loss": 0.0091, + "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 4784886.0, "repeat_count": 0.0, - "routers_loss": 0.0021798228845000267, + "routers_loss": 0.0019726944155991077, "skip_count": 0.0, "step": 2966, "text_loss": 0.5097305774688721 @@ -28194,13 +28194,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.076171875, + "grad_norm": 0.0849609375, "learning_rate": 0.0008611834673787134, "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 4787563.0, "repeat_count": 0.0, - "routers_loss": 0.0063707553781569, + "routers_loss": 0.006327496841549873, "skip_count": 0.0, "step": 2968, "text_loss": 0.6953814029693604 @@ -28213,13 +28213,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.5, "f1_skip": 1.0, - "grad_norm": 0.0595703125, + "grad_norm": 0.056884765625, "learning_rate": 0.0008609693644375449, - "loss": 0.0088, + "loss": 0.0086, "macro_f1": 0.8200000524520874, "num_tokens": 4790421.0, "repeat_count": 3.0, - "routers_loss": 0.044509731233119965, + "routers_loss": 0.042896661907434464, "skip_count": 1.0, "step": 2970, "text_loss": 0.2573051154613495 @@ -28227,18 +28227,18 @@ { "acc_repeat": 1.0, "acc_skip": 1.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 13.953331376577633, - "f1_execute": 0.9795917868614197, + "f1_execute": 1.0, "f1_repeat": 1.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.1640625, + "f1_skip": 1.0, + "grad_norm": 0.14453125, "learning_rate": 0.000860755123172235, - "loss": 0.01, - "macro_f1": 0.8820862174034119, + "loss": 0.0096, + "macro_f1": 1.0, "num_tokens": 4793786.0, "repeat_count": 2.0, - "routers_loss": 0.01667599380016327, + "routers_loss": 0.013228793628513813, "skip_count": 1.0, "step": 2972, "text_loss": 0.46614497900009155 @@ -28251,13 +28251,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0274658203125, + "grad_norm": 0.0296630859375, "learning_rate": 0.0008605407436648815, - "loss": 0.0069, + "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 4796864.0, "repeat_count": 0.0, - "routers_loss": 0.008433761075139046, + "routers_loss": 0.007294759154319763, "skip_count": 2.0, "step": 2974, "text_loss": 0.21555091440677643 @@ -28270,13 +28270,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.057861328125, "learning_rate": 0.0008603262259976348, - "loss": 0.0131, + "loss": 0.0129, "macro_f1": 1.0, "num_tokens": 4800080.0, "repeat_count": 1.0, - "routers_loss": 0.002439796691760421, + "routers_loss": 0.0024024227168411016, "skip_count": 5.0, "step": 2976, "text_loss": 0.7855485081672668 @@ -28289,13 +28289,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05126953125, + "grad_norm": 0.07666015625, "learning_rate": 0.0008601115702526987, - "loss": 0.0112, + "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4802899.0, "repeat_count": 0.0, - "routers_loss": 0.0015027766348794103, + "routers_loss": 0.001433031284250319, "skip_count": 0.0, "step": 2978, "text_loss": 0.6777765154838562 @@ -28308,13 +28308,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06103515625, + "grad_norm": 0.04931640625, "learning_rate": 0.0008598967765123293, - "loss": 0.0091, + "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 4805835.0, "repeat_count": 0.0, - "routers_loss": 0.003235677955672145, + "routers_loss": 0.003073975909501314, "skip_count": 0.0, "step": 2980, "text_loss": 0.5926910638809204 @@ -28322,18 +28322,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.5, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 14.0, - "f1_execute": 0.9090908765792847, - "f1_repeat": 0.6666666865348816, + "f1_execute": 0.9333333373069763, + "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.052734375, + "grad_norm": 0.05322265625, "learning_rate": 0.0008596818448588364, - "loss": 0.0141, - "macro_f1": 0.7474747896194458, + "loss": 0.0139, + "macro_f1": 0.8666667342185974, "num_tokens": 4809028.0, "repeat_count": 1.0, - "routers_loss": 0.063179150223732, + "routers_loss": 0.06438573449850082, "skip_count": 6.0, "step": 2982, "text_loss": 0.23975612223148346 @@ -28346,13 +28346,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0299072265625, + "grad_norm": 0.0302734375, "learning_rate": 0.0008594667753745821, - "loss": 0.0055, + "loss": 0.0054, "macro_f1": 0.3272727429866791, "num_tokens": 4812831.0, "repeat_count": 0.0, - "routers_loss": 0.015444152988493443, + "routers_loss": 0.014817612245678902, "skip_count": 1.0, "step": 2984, "text_loss": 0.17292268574237823 @@ -28365,13 +28365,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.060546875, + "grad_norm": 0.07421875, "learning_rate": 0.0008592515681419813, - "loss": 0.0079, + "loss": 0.0078, "macro_f1": 0.5492662787437439, "num_tokens": 4816005.0, "repeat_count": 2.0, - "routers_loss": 0.02485196851193905, + "routers_loss": 0.025407327339053154, "skip_count": 0.0, "step": 2986, "text_loss": 0.6403061151504517 @@ -28384,13 +28384,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04443359375, + "grad_norm": 0.0615234375, "learning_rate": 0.0008590362232435018, - "loss": 0.0102, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4818901.0, "repeat_count": 0.0, - "routers_loss": 0.006175600457936525, + "routers_loss": 0.006826757453382015, "skip_count": 0.0, "step": 2988, "text_loss": 0.2572069466114044 @@ -28403,13 +28403,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041748046875, + "grad_norm": 0.04052734375, "learning_rate": 0.0008588207407616644, - "loss": 0.0085, + "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 4823120.0, "repeat_count": 0.0, - "routers_loss": 0.0008576468680985272, + "routers_loss": 0.0009054148104041815, "skip_count": 0.0, "step": 2990, "text_loss": 0.4827076196670532 @@ -28422,13 +28422,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02392578125, + "grad_norm": 0.0247802734375, "learning_rate": 0.0008586051207790422, - "loss": 0.0059, + "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 4825774.0, "repeat_count": 0.0, - "routers_loss": 0.0011548360344022512, + "routers_loss": 0.0012294676853343844, "skip_count": 0.0, "step": 2992, "text_loss": 0.40157821774482727 @@ -28441,13 +28441,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.056396484375, + "grad_norm": 0.052734375, "learning_rate": 0.0008583893633782612, - "loss": 0.0085, + "loss": 0.0084, "macro_f1": 0.5492662787437439, "num_tokens": 4828841.0, "repeat_count": 0.0, - "routers_loss": 0.01307896338403225, + "routers_loss": 0.011474622413516045, "skip_count": 2.0, "step": 2994, "text_loss": 0.14842072129249573 @@ -28460,13 +28460,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0615234375, + "grad_norm": 0.058837890625, "learning_rate": 0.0008581734686419999, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4831458.0, "repeat_count": 0.0, - "routers_loss": 0.009716883301734924, + "routers_loss": 0.009154081344604492, "skip_count": 2.0, "step": 2996, "text_loss": 0.365400105714798 @@ -28479,13 +28479,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031494140625, + "grad_norm": 0.031982421875, "learning_rate": 0.00085795743665299, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4834609.0, "repeat_count": 0.0, - "routers_loss": 0.0026114562060683966, + "routers_loss": 0.002899336162954569, "skip_count": 0.0, "step": 2998, "text_loss": 0.5574684143066406 @@ -28498,13 +28498,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052001953125, + "grad_norm": 0.0517578125, "learning_rate": 0.0008577412674940152, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4838324.0, "repeat_count": 0.0, - "routers_loss": 0.003787368768826127, + "routers_loss": 0.0034664268605411053, "skip_count": 0.0, "step": 3000, "text_loss": 0.6752855777740479 @@ -28517,13 +28517,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0281982421875, + "grad_norm": 0.03466796875, "learning_rate": 0.0008575249612479117, "loss": 0.0127, "macro_f1": 0.6666666865348816, "num_tokens": 4841877.0, "repeat_count": 0.0, - "routers_loss": 0.004202218260616064, + "routers_loss": 0.0036425739526748657, "skip_count": 2.0, "step": 3002, "text_loss": 0.6332980394363403 @@ -28536,13 +28536,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0517578125, + "grad_norm": 0.048095703125, "learning_rate": 0.0008573085179975685, - "loss": 0.0066, + "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4845840.0, "repeat_count": 0.0, - "routers_loss": 0.0012371218763291836, + "routers_loss": 0.0013783496106043458, "skip_count": 0.0, "step": 3004, "text_loss": 0.4219617545604706 @@ -28555,13 +28555,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.03857421875, "learning_rate": 0.0008570919378259274, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 4848766.0, "repeat_count": 0.0, - "routers_loss": 0.005013706628233194, + "routers_loss": 0.004823608323931694, "skip_count": 1.0, "step": 3006, "text_loss": 0.7987180948257446 @@ -28574,13 +28574,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.029052734375, + "grad_norm": 0.0302734375, "learning_rate": 0.000856875220815982, - "loss": 0.0069, + "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 4852310.0, "repeat_count": 0.0, - "routers_loss": 0.001336073037236929, + "routers_loss": 0.0014760984340682626, "skip_count": 0.0, "step": 3008, "text_loss": 0.35592713952064514 @@ -28593,13 +28593,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.035400390625, "learning_rate": 0.0008566583670507788, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4856146.0, "repeat_count": 0.0, - "routers_loss": 0.003256940981373191, + "routers_loss": 0.0031717263627797365, "skip_count": 1.0, "step": 3010, "text_loss": 0.19379083812236786 @@ -28612,13 +28612,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041748046875, + "grad_norm": 0.0517578125, "learning_rate": 0.0008564413766134164, - "loss": 0.0091, + "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 4859386.0, "repeat_count": 0.0, - "routers_loss": 0.0038389062974601984, + "routers_loss": 0.003361492184922099, "skip_count": 0.0, "step": 3012, "text_loss": 0.39129266142845154 @@ -28631,13 +28631,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052734375, + "grad_norm": 0.048583984375, "learning_rate": 0.0008562242495870463, - "loss": 0.0119, + "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4862661.0, "repeat_count": 0.0, - "routers_loss": 0.0007799214799888432, + "routers_loss": 0.0010563990799710155, "skip_count": 0.0, "step": 3014, "text_loss": 0.5966938734054565 @@ -28650,13 +28650,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0213623046875, + "grad_norm": 0.0234375, "learning_rate": 0.0008560069860548716, - "loss": 0.006, + "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 4865410.0, "repeat_count": 0.0, - "routers_loss": 0.0010348912328481674, + "routers_loss": 0.001233913702890277, "skip_count": 0.0, "step": 3016, "text_loss": 0.3386077880859375 @@ -28669,13 +28669,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056884765625, + "grad_norm": 0.055419921875, "learning_rate": 0.0008557895861001484, - "loss": 0.006, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 4868931.0, "repeat_count": 0.0, - "routers_loss": 0.0018167694797739387, + "routers_loss": 0.0018066301709041, "skip_count": 0.0, "step": 3018, "text_loss": 0.5222050547599792 @@ -28688,13 +28688,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037353515625, + "grad_norm": 0.039306640625, "learning_rate": 0.0008555720498061845, - "loss": 0.0078, + "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4873492.0, "repeat_count": 0.0, - "routers_loss": 0.005788089707493782, + "routers_loss": 0.0050385501235723495, "skip_count": 1.0, "step": 3020, "text_loss": 0.4558849334716797 @@ -28707,13 +28707,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.048828125, "learning_rate": 0.0008553543772563403, - "loss": 0.0092, + "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 4877026.0, "repeat_count": 0.0, - "routers_loss": 0.004194240085780621, + "routers_loss": 0.004828717093914747, "skip_count": 0.0, "step": 3022, "text_loss": 0.36598992347717285 @@ -28726,13 +28726,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.888888955116272, - "grad_norm": 0.05712890625, + "grad_norm": 0.06103515625, "learning_rate": 0.0008551365685340285, "loss": 0.0084, "macro_f1": 0.9555556178092957, "num_tokens": 4879655.0, "repeat_count": 1.0, - "routers_loss": 0.019211066886782646, + "routers_loss": 0.02049369551241398, "skip_count": 5.0, "step": 3024, "text_loss": 0.5069093704223633 @@ -28745,13 +28745,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0478515625, + "grad_norm": 0.043212890625, "learning_rate": 0.0008549186237227138, - "loss": 0.0092, + "loss": 0.0088, "macro_f1": 0.8823530077934265, "num_tokens": 4882606.0, "repeat_count": 1.0, - "routers_loss": 0.041074834764003754, + "routers_loss": 0.03947242721915245, "skip_count": 2.0, "step": 3026, "text_loss": 0.2600715458393097 @@ -28764,13 +28764,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.031982421875, + "grad_norm": 0.030029296875, "learning_rate": 0.0008547005429059128, - "loss": 0.0075, + "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 4885246.0, "repeat_count": 2.0, - "routers_loss": 0.0027008953038603067, + "routers_loss": 0.0026363315992057323, "skip_count": 0.0, "step": 3028, "text_loss": 0.37642326951026917 @@ -28783,13 +28783,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.046630859375, + "grad_norm": 0.048828125, "learning_rate": 0.0008544823261671948, - "loss": 0.0074, + "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 4888109.0, "repeat_count": 0.0, - "routers_loss": 0.00402502017095685, + "routers_loss": 0.003858231008052826, "skip_count": 0.0, "step": 3030, "text_loss": 0.5875385999679565 @@ -28802,13 +28802,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.061279296875, "learning_rate": 0.0008542639735901804, - "loss": 0.007, + "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 4891168.0, "repeat_count": 1.0, - "routers_loss": 0.00628731120377779, + "routers_loss": 0.004789089784026146, "skip_count": 1.0, "step": 3032, "text_loss": 0.6417325139045715 @@ -28821,32 +28821,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.035888671875, "learning_rate": 0.0008540454852585434, - "loss": 0.0117, + "loss": 0.0115, "macro_f1": 0.6666666865348816, "num_tokens": 4894355.0, "repeat_count": 0.0, - "routers_loss": 0.007284072227776051, + "routers_loss": 0.007334680762141943, "skip_count": 2.0, "step": 3034, "text_loss": 0.23697198927402496 }, { "acc_repeat": 0.0, - "acc_skip": 0.6666666865348816, - "avg_layers": 26.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, "epoch": 14.253595538597006, - "f1_execute": 0.9803921580314636, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.800000011920929, - "grad_norm": 0.033203125, + "f1_skip": 0.5, + "grad_norm": 0.034423828125, "learning_rate": 0.0008538268612560084, - "loss": 0.0059, - "macro_f1": 0.5934640765190125, + "loss": 0.0058, + "macro_f1": 0.4871794879436493, "num_tokens": 4897543.0, "repeat_count": 0.0, - "routers_loss": 0.020328659564256668, + "routers_loss": 0.022096361964941025, "skip_count": 3.0, "step": 3036, "text_loss": 0.1989550143480301 @@ -28859,13 +28859,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.047119140625, "learning_rate": 0.0008536081016663527, - "loss": 0.0102, + "loss": 0.0101, "macro_f1": 1.0, "num_tokens": 4900752.0, "repeat_count": 1.0, - "routers_loss": 0.002338571473956108, + "routers_loss": 0.0037680594250559807, "skip_count": 2.0, "step": 3038, "text_loss": 0.5001366138458252 @@ -28878,13 +28878,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.0400390625, "learning_rate": 0.0008533892065734055, - "loss": 0.0083, + "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 4903581.0, "repeat_count": 0.0, - "routers_loss": 0.003033763263374567, + "routers_loss": 0.0032373068388551474, "skip_count": 1.0, "step": 3040, "text_loss": 0.5019411444664001 @@ -28897,13 +28897,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.042724609375, "learning_rate": 0.0008531701760610476, - "loss": 0.012, + "loss": 0.0121, "macro_f1": 1.0, "num_tokens": 4907108.0, "repeat_count": 1.0, - "routers_loss": 0.00831629242748022, + "routers_loss": 0.0078013185411691666, "skip_count": 2.0, "step": 3042, "text_loss": 0.3460627794265747 @@ -28916,13 +28916,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.04736328125, + "grad_norm": 0.04833984375, "learning_rate": 0.000852951010213212, - "loss": 0.0087, + "loss": 0.0089, "macro_f1": 0.8200000524520874, "num_tokens": 4911269.0, "repeat_count": 1.0, - "routers_loss": 0.03200878947973251, + "routers_loss": 0.03576689213514328, "skip_count": 3.0, "step": 3044, "text_loss": 0.268994003534317 @@ -28935,13 +28935,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0283203125, + "grad_norm": 0.02685546875, "learning_rate": 0.0008527317091138835, - "loss": 0.0068, + "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 4914203.0, "repeat_count": 1.0, - "routers_loss": 0.003899211063981056, + "routers_loss": 0.0032140621915459633, "skip_count": 1.0, "step": 3046, "text_loss": 0.9998719692230225 @@ -28954,13 +28954,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.037109375, + "grad_norm": 0.040771484375, "learning_rate": 0.0008525122728470987, "loss": 0.0102, "macro_f1": 1.0, "num_tokens": 4918562.0, "repeat_count": 1.0, - "routers_loss": 0.00883556716144085, + "routers_loss": 0.008559177629649639, "skip_count": 3.0, "step": 3048, "text_loss": 0.3062439560890198 @@ -28973,13 +28973,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03173828125, + "grad_norm": 0.03125, "learning_rate": 0.0008522927014969459, - "loss": 0.0064, + "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 4921940.0, "repeat_count": 0.0, - "routers_loss": 0.009054492227733135, + "routers_loss": 0.008735597133636475, "skip_count": 2.0, "step": 3050, "text_loss": 0.3637430965900421 @@ -28992,13 +28992,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.05517578125, "learning_rate": 0.0008520729951475652, - "loss": 0.0082, + "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 4925416.0, "repeat_count": 0.0, - "routers_loss": 0.0011907420121133327, + "routers_loss": 0.0012709591537714005, "skip_count": 0.0, "step": 3052, "text_loss": 0.542036235332489 @@ -29011,13 +29011,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0634765625, + "grad_norm": 0.06640625, "learning_rate": 0.0008518531538831488, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 4928695.0, "repeat_count": 0.0, - "routers_loss": 0.0013618353987112641, + "routers_loss": 0.0010660928674042225, "skip_count": 1.0, "step": 3054, "text_loss": 0.43144503235816956 @@ -29030,13 +29030,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.060546875, + "grad_norm": 0.059326171875, "learning_rate": 0.00085163317778794, - "loss": 0.0102, + "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 4931504.0, "repeat_count": 0.0, - "routers_loss": 0.004202015232294798, + "routers_loss": 0.004558971151709557, "skip_count": 2.0, "step": 3056, "text_loss": 0.5257010459899902 @@ -29049,32 +29049,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0498046875, + "grad_norm": 0.04931640625, "learning_rate": 0.0008514130669462341, - "loss": 0.0109, + "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 4934935.0, "repeat_count": 0.0, - "routers_loss": 0.01060314942151308, + "routers_loss": 0.010774781927466393, "skip_count": 2.0, "step": 3058, "text_loss": 0.26061776280403137 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.0, "acc_skip": 1.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 14.366304666862343, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.0390625, "learning_rate": 0.0008511928214423782, "loss": 0.0103, - "macro_f1": 1.0, + "macro_f1": 0.6601307392120361, "num_tokens": 4938047.0, "repeat_count": 1.0, - "routers_loss": 0.012400983832776546, + "routers_loss": 0.014763157814741135, "skip_count": 2.0, "step": 3060, "text_loss": 0.2856905460357666 @@ -29087,13 +29087,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.046875, + "grad_norm": 0.050048828125, "learning_rate": 0.0008509724413607705, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 4941041.0, "repeat_count": 1.0, - "routers_loss": 0.004353851079940796, + "routers_loss": 0.004613345488905907, "skip_count": 0.0, "step": 3062, "text_loss": 0.2870287001132965 @@ -29106,13 +29106,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.053955078125, + "grad_norm": 0.06298828125, "learning_rate": 0.0008507519267858612, - "loss": 0.0148, + "loss": 0.015, "macro_f1": 1.0, "num_tokens": 4944708.0, "repeat_count": 1.0, - "routers_loss": 0.009858032688498497, + "routers_loss": 0.008584189228713512, "skip_count": 2.0, "step": 3064, "text_loss": 0.15828095376491547 @@ -29125,13 +29125,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0306396484375, + "grad_norm": 0.029052734375, "learning_rate": 0.0008505312778021519, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 4948295.0, "repeat_count": 0.0, - "routers_loss": 0.0016502789221704006, + "routers_loss": 0.0014670816017314792, "skip_count": 0.0, "step": 3066, "text_loss": 0.36697930097579956 @@ -29144,13 +29144,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.0927734375, "learning_rate": 0.0008503104944941958, - "loss": 0.0108, + "loss": 0.0107, "macro_f1": 0.6666666865348816, "num_tokens": 4951983.0, "repeat_count": 0.0, - "routers_loss": 0.00573746208101511, + "routers_loss": 0.005348859820514917, "skip_count": 2.0, "step": 3068, "text_loss": 0.21612997353076935 @@ -29163,13 +29163,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.0654296875, "learning_rate": 0.0008500895769465972, - "loss": 0.0113, + "loss": 0.0111, "macro_f1": 0.3333333432674408, "num_tokens": 4955023.0, "repeat_count": 0.0, - "routers_loss": 0.0012014979729428887, + "routers_loss": 0.0013203793205320835, "skip_count": 0.0, "step": 3070, "text_loss": 0.9757798314094543 @@ -29182,13 +29182,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.0478515625, "learning_rate": 0.0008498685252440124, - "loss": 0.0067, + "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 4957600.0, "repeat_count": 0.0, - "routers_loss": 0.006400141399353743, + "routers_loss": 0.006907356437295675, "skip_count": 0.0, "step": 3072, "text_loss": 0.356107234954834 @@ -29201,13 +29201,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.046630859375, + "grad_norm": 0.061279296875, "learning_rate": 0.0008496473394711487, - "loss": 0.0117, + "loss": 0.0116, "macro_f1": 0.6666666865348816, "num_tokens": 4960746.0, "repeat_count": 0.0, - "routers_loss": 0.0030972862150520086, + "routers_loss": 0.0027704904787242413, "skip_count": 1.0, "step": 3074, "text_loss": 0.6812908053398132 @@ -29220,13 +29220,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05517578125, + "grad_norm": 0.0576171875, "learning_rate": 0.0008494260197127649, - "loss": 0.0092, + "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 4963845.0, "repeat_count": 0.0, - "routers_loss": 0.004087577573955059, + "routers_loss": 0.0036796489730477333, "skip_count": 2.0, "step": 3076, "text_loss": 0.7215370535850525 @@ -29239,13 +29239,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.0556640625, "learning_rate": 0.0008492045660536712, - "loss": 0.0085, + "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 4966887.0, "repeat_count": 0.0, - "routers_loss": 0.003797230776399374, + "routers_loss": 0.0037137691397219896, "skip_count": 1.0, "step": 3078, "text_loss": 0.8700299859046936 @@ -29258,13 +29258,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.036865234375, + "grad_norm": 0.03857421875, "learning_rate": 0.0008489829785787291, - "loss": 0.0081, + "loss": 0.0078, "macro_f1": 0.8823530077934265, "num_tokens": 4969859.0, "repeat_count": 1.0, - "routers_loss": 0.020377423614263535, + "routers_loss": 0.016492314636707306, "skip_count": 2.0, "step": 3080, "text_loss": 0.6520360112190247 @@ -29277,13 +29277,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.043701171875, "learning_rate": 0.0008487612573728513, - "loss": 0.0096, + "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 4972628.0, "repeat_count": 0.0, - "routers_loss": 0.003695295425131917, + "routers_loss": 0.004022917244583368, "skip_count": 2.0, "step": 3082, "text_loss": 0.17498187720775604 @@ -29296,13 +29296,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.044677734375, "learning_rate": 0.0008485394025210016, - "loss": 0.0078, + "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 4975475.0, "repeat_count": 0.0, - "routers_loss": 0.008704355917870998, + "routers_loss": 0.009141159243881702, "skip_count": 1.0, "step": 3084, "text_loss": 0.5975366234779358 @@ -29315,13 +29315,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037109375, + "grad_norm": 0.045166015625, "learning_rate": 0.0008483174141081956, - "loss": 0.0111, + "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4978858.0, "repeat_count": 0.0, - "routers_loss": 0.0031532018911093473, + "routers_loss": 0.0031561285723000765, "skip_count": 0.0, "step": 3086, "text_loss": 0.18748866021633148 @@ -29334,13 +29334,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.04150390625, "learning_rate": 0.0008480952922194991, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 4982142.0, "repeat_count": 0.0, - "routers_loss": 0.0007620530668646097, + "routers_loss": 0.0007894713780842721, "skip_count": 0.0, "step": 3088, "text_loss": 0.42083197832107544 @@ -29353,13 +29353,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037841796875, + "grad_norm": 0.0419921875, "learning_rate": 0.0008478730369400302, - "loss": 0.0086, + "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 4984872.0, "repeat_count": 0.0, - "routers_loss": 0.000692489615175873, + "routers_loss": 0.0005908289458602667, "skip_count": 0.0, "step": 3090, "text_loss": 0.45337188243865967 @@ -29372,13 +29372,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0240478515625, + "grad_norm": 0.02392578125, "learning_rate": 0.0008476506483549573, - "loss": 0.0103, + "loss": 0.0101, "macro_f1": 1.0, "num_tokens": 4988137.0, "repeat_count": 1.0, - "routers_loss": 0.001856967923231423, + "routers_loss": 0.0016509373672306538, "skip_count": 2.0, "step": 3092, "text_loss": 0.6397262811660767 @@ -29391,13 +29391,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.031982421875, + "grad_norm": 0.036865234375, "learning_rate": 0.0008474281265495002, - "loss": 0.0075, + "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 4991164.0, "repeat_count": 0.0, - "routers_loss": 0.004027622286230326, + "routers_loss": 0.004088304936885834, "skip_count": 1.0, "step": 3094, "text_loss": 0.18352322280406952 @@ -29410,32 +29410,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03857421875, + "grad_norm": 0.0380859375, "learning_rate": 0.0008472054716089295, - "loss": 0.0061, + "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 4993876.0, "repeat_count": 0.0, - "routers_loss": 0.004844399634748697, + "routers_loss": 0.005200014915317297, "skip_count": 0.0, "step": 3096, "text_loss": 0.2776511013507843 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.0, "acc_skip": 1.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 14.544760786615791, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0286865234375, + "grad_norm": 0.0322265625, "learning_rate": 0.0008469826836185673, "loss": 0.01, - "macro_f1": 1.0, + "macro_f1": 0.6601307392120361, "num_tokens": 4997068.0, "repeat_count": 1.0, - "routers_loss": 0.012379852123558521, + "routers_loss": 0.012686059810221195, "skip_count": 2.0, "step": 3098, "text_loss": 0.23209233582019806 @@ -29448,13 +29448,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.055419921875, "learning_rate": 0.0008467597626637858, - "loss": 0.0076, + "loss": 0.0074, "macro_f1": 1.0, "num_tokens": 5000038.0, "repeat_count": 1.0, - "routers_loss": 0.00575951999053359, + "routers_loss": 0.006401528604328632, "skip_count": 2.0, "step": 3100, "text_loss": 0.45936745405197144 @@ -29467,13 +29467,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.05615234375, "learning_rate": 0.0008465367088300093, "loss": 0.0075, "macro_f1": 0.3272727429866791, "num_tokens": 5002870.0, "repeat_count": 0.0, - "routers_loss": 0.013157932087779045, + "routers_loss": 0.016640547662973404, "skip_count": 1.0, "step": 3102, "text_loss": 0.44502779841423035 @@ -29486,13 +29486,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0283203125, + "grad_norm": 0.0272216796875, "learning_rate": 0.0008463135222027124, - "loss": 0.0052, + "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 5006357.0, "repeat_count": 0.0, - "routers_loss": 0.008679390884935856, + "routers_loss": 0.008411331102252007, "skip_count": 2.0, "step": 3104, "text_loss": 0.3414570391178131 @@ -29505,13 +29505,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.03076171875, "learning_rate": 0.0008460902028674204, - "loss": 0.0059, + "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5009059.0, "repeat_count": 0.0, - "routers_loss": 0.001076352084055543, + "routers_loss": 0.0010406570509076118, "skip_count": 0.0, "step": 3106, "text_loss": 0.5931221842765808 @@ -29524,13 +29524,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.030029296875, + "grad_norm": 0.0322265625, "learning_rate": 0.0008458667509097098, - "loss": 0.0112, + "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 5012327.0, "repeat_count": 0.0, - "routers_loss": 0.0021328055299818516, + "routers_loss": 0.001959054498001933, "skip_count": 0.0, "step": 3108, "text_loss": 0.5191171169281006 @@ -29543,13 +29543,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07470703125, + "grad_norm": 0.06640625, "learning_rate": 0.0008456431664152078, - "loss": 0.0129, + "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 5015472.0, "repeat_count": 0.0, - "routers_loss": 0.0010206506121903658, + "routers_loss": 0.000994380097836256, "skip_count": 0.0, "step": 3110, "text_loss": 0.4455361068248749 @@ -29562,13 +29562,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0263671875, + "grad_norm": 0.0264892578125, "learning_rate": 0.0008454194494695923, - "loss": 0.0111, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 5018901.0, "repeat_count": 0.0, - "routers_loss": 0.0041310288943350315, + "routers_loss": 0.0037662344984710217, "skip_count": 0.0, "step": 3112, "text_loss": 0.5335362553596497 @@ -29581,13 +29581,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0240478515625, + "grad_norm": 0.02294921875, "learning_rate": 0.0008451956001585923, - "loss": 0.0066, + "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5022520.0, "repeat_count": 0.0, - "routers_loss": 0.00994859915226698, + "routers_loss": 0.008664715103805065, "skip_count": 3.0, "step": 3114, "text_loss": 0.16230148077011108 @@ -29600,13 +29600,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0419921875, + "grad_norm": 0.0498046875, "learning_rate": 0.000844971618567987, - "loss": 0.0087, + "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 5025505.0, "repeat_count": 0.0, - "routers_loss": 0.0016823343466967344, + "routers_loss": 0.0015904927859082818, "skip_count": 0.0, "step": 3116, "text_loss": 0.6989432573318481 @@ -29619,13 +29619,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03369140625, + "grad_norm": 0.033935546875, "learning_rate": 0.0008447475047836068, - "loss": 0.0061, + "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 5028767.0, "repeat_count": 0.0, - "routers_loss": 0.005725692491978407, + "routers_loss": 0.005853322334587574, "skip_count": 1.0, "step": 3118, "text_loss": 0.31420737504959106 @@ -29638,13 +29638,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05712890625, + "grad_norm": 0.05615234375, "learning_rate": 0.0008445232588913325, - "loss": 0.0116, + "loss": 0.0115, "macro_f1": 0.3272727429866791, "num_tokens": 5032577.0, "repeat_count": 0.0, - "routers_loss": 0.016534095630049706, + "routers_loss": 0.012760105542838573, "skip_count": 0.0, "step": 3120, "text_loss": 0.5534627437591553 @@ -29657,13 +29657,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.049072265625, "learning_rate": 0.0008442988809770953, - "loss": 0.0097, + "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 5035381.0, "repeat_count": 0.0, - "routers_loss": 0.0023590524215251207, + "routers_loss": 0.0022257440723478794, "skip_count": 0.0, "step": 3122, "text_loss": 0.42492759227752686 @@ -29676,13 +29676,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.03955078125, "learning_rate": 0.0008440743711268775, - "loss": 0.0084, + "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 5038743.0, "repeat_count": 0.0, - "routers_loss": 0.004739012103527784, + "routers_loss": 0.004648433532565832, "skip_count": 0.0, "step": 3124, "text_loss": 0.16404685378074646 @@ -29695,13 +29695,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.043212890625, + "grad_norm": 0.03955078125, "learning_rate": 0.0008438497294267117, - "loss": 0.0069, + "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 5041492.0, "repeat_count": 0.0, - "routers_loss": 0.006212939508259296, + "routers_loss": 0.006313877180218697, "skip_count": 0.0, "step": 3126, "text_loss": 0.23191484808921814 @@ -29714,13 +29714,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.07666015625, "learning_rate": 0.0008436249559626807, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 5043955.0, "repeat_count": 1.0, - "routers_loss": 0.0036408400628715754, + "routers_loss": 0.0036270488053560257, "skip_count": 0.0, "step": 3128, "text_loss": 0.5782018303871155 @@ -29733,13 +29733,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.04345703125, "learning_rate": 0.0008434000508209187, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 5047571.0, "repeat_count": 0.0, - "routers_loss": 0.0038875883910804987, + "routers_loss": 0.003809858812019229, "skip_count": 1.0, "step": 3130, "text_loss": 0.7129825949668884 @@ -29752,13 +29752,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.03955078125, "learning_rate": 0.0008431750140876092, - "loss": 0.0129, + "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 5051608.0, "repeat_count": 0.0, - "routers_loss": 0.002172809559851885, + "routers_loss": 0.0022369057405740023, "skip_count": 0.0, "step": 3132, "text_loss": 0.4433445930480957 @@ -29773,11 +29773,11 @@ "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.000842949845848987, - "loss": 0.0134, + "loss": 0.0135, "macro_f1": 0.32098764181137085, "num_tokens": 5054656.0, "repeat_count": 0.0, - "routers_loss": 0.04427836462855339, + "routers_loss": 0.0425117202103138, "skip_count": 2.0, "step": 3134, "text_loss": 0.38721024990081787 @@ -29790,13 +29790,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.0712890625, "learning_rate": 0.0008427245461913368, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 5059108.0, "repeat_count": 0.0, - "routers_loss": 0.0016648605233058333, + "routers_loss": 0.0018077283166348934, "skip_count": 0.0, "step": 3136, "text_loss": 0.7496368885040283 @@ -29809,13 +29809,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.1142578125, + "grad_norm": 0.12109375, "learning_rate": 0.0008424991152009941, - "loss": 0.0113, + "loss": 0.0111, "macro_f1": 1.0, "num_tokens": 5062371.0, "repeat_count": 1.0, - "routers_loss": 0.008457986637949944, + "routers_loss": 0.008801834657788277, "skip_count": 2.0, "step": 3138, "text_loss": 0.5337086319923401 @@ -29828,13 +29828,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04443359375, + "grad_norm": 0.04296875, "learning_rate": 0.0008422735529643444, - "loss": 0.0099, + "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 5065593.0, "repeat_count": 0.0, - "routers_loss": 0.004939604084938765, + "routers_loss": 0.00548676960170269, "skip_count": 3.0, "step": 3140, "text_loss": 0.2561623156070709 @@ -29847,13 +29847,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031982421875, + "grad_norm": 0.032958984375, "learning_rate": 0.0008420478595678233, - "loss": 0.0077, + "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5068271.0, "repeat_count": 0.0, - "routers_loss": 0.006254551466554403, + "routers_loss": 0.006389956455677748, "skip_count": 0.0, "step": 3142, "text_loss": 0.15605193376541138 @@ -29866,13 +29866,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.07958984375, "learning_rate": 0.0008418220350979175, "loss": 0.0128, "macro_f1": 1.0, "num_tokens": 5071358.0, "repeat_count": 1.0, - "routers_loss": 0.01132921315729618, + "routers_loss": 0.012387622147798538, "skip_count": 2.0, "step": 3144, "text_loss": 0.3085838258266449 @@ -29885,13 +29885,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.033447265625, "learning_rate": 0.0008415960796411628, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 5075584.0, "repeat_count": 0.0, - "routers_loss": 0.0026424501556903124, + "routers_loss": 0.00311864772811532, "skip_count": 1.0, "step": 3146, "text_loss": 0.4786977469921112 @@ -29904,13 +29904,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.103515625, + "grad_norm": 0.1591796875, "learning_rate": 0.0008413699932841461, - "loss": 0.0093, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 5078388.0, "repeat_count": 0.0, - "routers_loss": 0.0036633017007261515, + "routers_loss": 0.0030679800547659397, "skip_count": 0.0, "step": 3148, "text_loss": 0.5222916603088379 @@ -29923,13 +29923,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.039794921875, + "grad_norm": 0.0390625, "learning_rate": 0.0008411437761135039, - "loss": 0.0112, + "loss": 0.011, "macro_f1": 1.0, "num_tokens": 5081584.0, "repeat_count": 1.0, - "routers_loss": 0.012777967378497124, + "routers_loss": 0.012907958589494228, "skip_count": 2.0, "step": 3150, "text_loss": 0.5369884371757507 @@ -29942,13 +29942,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.03759765625, "learning_rate": 0.0008409174282159232, - "loss": 0.0074, + "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 5084450.0, "repeat_count": 0.0, - "routers_loss": 0.013694444671273232, + "routers_loss": 0.012314042076468468, "skip_count": 2.0, "step": 3152, "text_loss": 0.25685277581214905 @@ -29961,13 +29961,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.041015625, "learning_rate": 0.000840690949678141, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 5087865.0, "repeat_count": 1.0, - "routers_loss": 0.008412595838308334, + "routers_loss": 0.00899206381291151, "skip_count": 0.0, "step": 3154, "text_loss": 0.1717093288898468 @@ -29980,13 +29980,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.049560546875, + "grad_norm": 0.06103515625, "learning_rate": 0.0008404643405869441, "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 5090857.0, "repeat_count": 0.0, - "routers_loss": 0.0011648585787042975, + "routers_loss": 0.0013312003575265408, "skip_count": 0.0, "step": 3156, "text_loss": 0.27446436882019043 @@ -29999,13 +29999,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1533203125, "learning_rate": 0.0008402376010291695, - "loss": 0.0127, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 5093917.0, "repeat_count": 0.0, - "routers_loss": 0.002915408927947283, + "routers_loss": 0.002653320087119937, "skip_count": 0.0, "step": 3158, "text_loss": 0.4237489402294159 @@ -30018,13 +30018,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0498046875, + "grad_norm": 0.045654296875, "learning_rate": 0.0008400107310917045, - "loss": 0.0096, + "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 5096656.0, "repeat_count": 0.0, - "routers_loss": 0.013139770366251469, + "routers_loss": 0.012976993806660175, "skip_count": 2.0, "step": 3160, "text_loss": 0.42361980676651 @@ -30037,13 +30037,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.054931640625, + "grad_norm": 0.0634765625, "learning_rate": 0.000839783730861486, "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 5099582.0, "repeat_count": 0.0, - "routers_loss": 0.0070426687598228455, + "routers_loss": 0.006936746649444103, "skip_count": 2.0, "step": 3162, "text_loss": 0.26656073331832886 @@ -30056,13 +30056,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04345703125, + "grad_norm": 0.05908203125, "learning_rate": 0.0008395566004255008, "loss": 0.0127, "macro_f1": 0.6666666865348816, "num_tokens": 5102908.0, "repeat_count": 0.0, - "routers_loss": 0.006271707359701395, + "routers_loss": 0.006619359832257032, "skip_count": 1.0, "step": 3164, "text_loss": 0.590774416923523 @@ -30075,13 +30075,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.057373046875, + "grad_norm": 0.06884765625, "learning_rate": 0.0008393293398707858, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 5105829.0, "repeat_count": 0.0, - "routers_loss": 0.010571467690169811, + "routers_loss": 0.010120268911123276, "skip_count": 2.0, "step": 3166, "text_loss": 0.605930507183075 @@ -30094,13 +30094,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03564453125, + "grad_norm": 0.0419921875, "learning_rate": 0.0008391019492844275, "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 5109850.0, "repeat_count": 0.0, - "routers_loss": 0.005877034272998571, + "routers_loss": 0.004940980114042759, "skip_count": 2.0, "step": 3168, "text_loss": 0.12973152101039886 @@ -30115,11 +30115,11 @@ "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 0.0008388744287535627, - "loss": 0.0093, + "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 5113353.0, "repeat_count": 0.0, - "routers_loss": 0.0031909283716231585, + "routers_loss": 0.0031777634285390377, "skip_count": 1.0, "step": 3170, "text_loss": 0.18577200174331665 @@ -30132,13 +30132,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.052734375, "learning_rate": 0.0008386467783653775, - "loss": 0.0104, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 5116421.0, "repeat_count": 0.0, - "routers_loss": 0.005338824819773436, + "routers_loss": 0.005431659985333681, "skip_count": 0.0, "step": 3172, "text_loss": 0.2302747517824173 @@ -30151,13 +30151,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.046142578125, "learning_rate": 0.000838418998207108, - "loss": 0.0073, + "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5119457.0, "repeat_count": 0.0, - "routers_loss": 0.008522412739694118, + "routers_loss": 0.0077286697924137115, "skip_count": 4.0, "step": 3174, "text_loss": 0.19606637954711914 @@ -30170,13 +30170,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.050537109375, "learning_rate": 0.0008381910883660399, - "loss": 0.0068, + "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5123201.0, "repeat_count": 0.0, - "routers_loss": 0.0035330590326339006, + "routers_loss": 0.003982985392212868, "skip_count": 0.0, "step": 3176, "text_loss": 0.716376006603241 @@ -30189,13 +30189,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.09375, + "grad_norm": 0.09423828125, "learning_rate": 0.0008379630489295089, - "loss": 0.0106, + "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 5126035.0, "repeat_count": 0.0, - "routers_loss": 0.006332095246762037, + "routers_loss": 0.005626026075333357, "skip_count": 1.0, "step": 3178, "text_loss": 0.5144625902175903 @@ -30208,13 +30208,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05859375, + "grad_norm": 0.05615234375, "learning_rate": 0.0008377348799849, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 5129179.0, "repeat_count": 0.0, - "routers_loss": 0.017295993864536285, + "routers_loss": 0.015458245761692524, "skip_count": 2.0, "step": 3180, "text_loss": 0.29887503385543823 @@ -30227,13 +30227,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0703125, + "grad_norm": 0.062255859375, "learning_rate": 0.0008375065816196479, - "loss": 0.0088, + "loss": 0.0086, "macro_f1": 0.5492662787437439, "num_tokens": 5132149.0, "repeat_count": 0.0, - "routers_loss": 0.017241213470697403, + "routers_loss": 0.012210468761622906, "skip_count": 2.0, "step": 3182, "text_loss": 0.8981851935386658 @@ -30246,13 +30246,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04345703125, + "grad_norm": 0.044677734375, "learning_rate": 0.0008372781539212371, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 5135287.0, "repeat_count": 0.0, - "routers_loss": 0.00516276340931654, + "routers_loss": 0.0052537876181304455, "skip_count": 0.0, "step": 3184, "text_loss": 0.4245666563510895 @@ -30265,13 +30265,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.022705078125, + "grad_norm": 0.0240478515625, "learning_rate": 0.0008370495969772014, - "loss": 0.0077, + "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 5138589.0, "repeat_count": 0.0, - "routers_loss": 0.012517380528151989, + "routers_loss": 0.012873421423137188, "skip_count": 2.0, "step": 3186, "text_loss": 0.40581050515174866 @@ -30284,13 +30284,13 @@ "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07177734375, + "grad_norm": 0.07470703125, "learning_rate": 0.0008368209108751244, - "loss": 0.0129, + "loss": 0.0127, "macro_f1": 0.6521739363670349, "num_tokens": 5141635.0, "repeat_count": 2.0, - "routers_loss": 0.0810512825846672, + "routers_loss": 0.07720445841550827, "skip_count": 4.0, "step": 3188, "text_loss": 0.3755173981189728 @@ -30303,13 +30303,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.01953125, + "grad_norm": 0.02197265625, "learning_rate": 0.0008365920957026389, - "loss": 0.0076, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5144728.0, "repeat_count": 0.0, - "routers_loss": 0.0014350182609632611, + "routers_loss": 0.001440995605662465, "skip_count": 0.0, "step": 3190, "text_loss": 0.5067034363746643 @@ -30322,13 +30322,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.046142578125, + "grad_norm": 0.041748046875, "learning_rate": 0.0008363631515474275, - "loss": 0.0091, + "loss": 0.0089, "macro_f1": 0.6538461446762085, "num_tokens": 5147963.0, "repeat_count": 1.0, - "routers_loss": 0.018022676929831505, + "routers_loss": 0.018752984702587128, "skip_count": 2.0, "step": 3192, "text_loss": 0.20224551856517792 @@ -30341,13 +30341,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042236328125, + "grad_norm": 0.037353515625, "learning_rate": 0.0008361340784972217, - "loss": 0.0092, + "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 5151184.0, "repeat_count": 0.0, - "routers_loss": 0.0005097229732200503, + "routers_loss": 0.0005360354552976787, "skip_count": 0.0, "step": 3194, "text_loss": 0.4588058292865753 @@ -30360,13 +30360,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03173828125, + "grad_norm": 0.0390625, "learning_rate": 0.0008359048766398031, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 5153889.0, "repeat_count": 0.0, - "routers_loss": 0.0009840037673711777, + "routers_loss": 0.0009184491937048733, "skip_count": 1.0, "step": 3196, "text_loss": 0.2980220317840576 @@ -30379,13 +30379,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02685546875, + "grad_norm": 0.027099609375, "learning_rate": 0.000835675546063002, - "loss": 0.0058, + "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5156758.0, "repeat_count": 0.0, - "routers_loss": 0.001269801170565188, + "routers_loss": 0.001252970308996737, "skip_count": 0.0, "step": 3198, "text_loss": 0.6775755882263184 @@ -30398,13 +30398,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.052490234375, "learning_rate": 0.0008354460868546985, - "loss": 0.0071, + "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 5160247.0, "repeat_count": 0.0, - "routers_loss": 0.0034889329690486193, + "routers_loss": 0.0037315806839615107, "skip_count": 0.0, "step": 3200, "text_loss": 0.35867011547088623 @@ -30417,13 +30417,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.034912109375, "learning_rate": 0.0008352164991028217, - "loss": 0.0091, + "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 5163456.0, "repeat_count": 1.0, - "routers_loss": 0.001520772697404027, + "routers_loss": 0.001497485558502376, "skip_count": 0.0, "step": 3202, "text_loss": 0.690290093421936 @@ -30436,13 +30436,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03662109375, + "grad_norm": 0.04638671875, "learning_rate": 0.0008349867828953501, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 5166139.0, "repeat_count": 0.0, - "routers_loss": 0.0011800233041867614, + "routers_loss": 0.001051135826855898, "skip_count": 0.0, "step": 3204, "text_loss": 0.3340415954589844 @@ -30455,13 +30455,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031494140625, + "grad_norm": 0.03076171875, "learning_rate": 0.0008347569383203113, - "loss": 0.01, + "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 5169009.0, "repeat_count": 0.0, - "routers_loss": 0.001043233904056251, + "routers_loss": 0.0010544003453105688, "skip_count": 0.0, "step": 3206, "text_loss": 0.8584878444671631 @@ -30474,13 +30474,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.03662109375, "learning_rate": 0.0008345269654657823, - "loss": 0.0084, + "loss": 0.0085, "macro_f1": 1.0, "num_tokens": 5172618.0, "repeat_count": 1.0, - "routers_loss": 0.007460868917405605, + "routers_loss": 0.007312417030334473, "skip_count": 1.0, "step": 3208, "text_loss": 0.19500218331813812 @@ -30493,13 +30493,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0361328125, + "grad_norm": 0.03466796875, "learning_rate": 0.0008342968644198892, - "loss": 0.0067, + "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 5175857.0, "repeat_count": 0.0, - "routers_loss": 0.0027419133111834526, + "routers_loss": 0.00276504410430789, "skip_count": 0.0, "step": 3210, "text_loss": 0.5446314215660095 @@ -30512,13 +30512,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.037109375, "learning_rate": 0.0008340666352708068, - "loss": 0.0089, + "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 5178585.0, "repeat_count": 0.0, - "routers_loss": 0.002764733275398612, + "routers_loss": 0.002669303445145488, "skip_count": 0.0, "step": 3212, "text_loss": 0.3687484860420227 @@ -30531,13 +30531,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0284423828125, + "grad_norm": 0.035888671875, "learning_rate": 0.0008338362781067596, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 5181777.0, "repeat_count": 0.0, - "routers_loss": 0.0032288613729178905, + "routers_loss": 0.0031585274264216423, "skip_count": 0.0, "step": 3214, "text_loss": 0.27325859665870667 @@ -30550,13 +30550,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.040283203125, + "grad_norm": 0.04541015625, "learning_rate": 0.000833605793016021, "loss": 0.009, "macro_f1": 0.6666666865348816, "num_tokens": 5184312.0, "repeat_count": 0.0, - "routers_loss": 0.008322423323988914, + "routers_loss": 0.008807534351944923, "skip_count": 2.0, "step": 3216, "text_loss": 0.4466548562049866 @@ -30569,13 +30569,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.040283203125, + "grad_norm": 0.039306640625, "learning_rate": 0.0008333751800869133, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 5187497.0, "repeat_count": 0.0, - "routers_loss": 0.0034384531900286674, + "routers_loss": 0.003171310294419527, "skip_count": 0.0, "step": 3218, "text_loss": 0.5423526763916016 @@ -30588,13 +30588,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0228271484375, + "grad_norm": 0.025634765625, "learning_rate": 0.0008331444394078076, - "loss": 0.0081, + "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 5190982.0, "repeat_count": 0.0, - "routers_loss": 0.0015023534651845694, + "routers_loss": 0.0016481258207932115, "skip_count": 2.0, "step": 3220, "text_loss": 0.48984917998313904 @@ -30607,13 +30607,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.03173828125, + "grad_norm": 0.03271484375, "learning_rate": 0.000832913571067124, - "loss": 0.0108, + "loss": 0.0107, "macro_f1": 1.0, "num_tokens": 5194044.0, "repeat_count": 1.0, - "routers_loss": 0.0043489462696015835, + "routers_loss": 0.003957313951104879, "skip_count": 1.0, "step": 3222, "text_loss": 0.4533331096172333 @@ -30626,13 +30626,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.040283203125, "learning_rate": 0.0008326825751533322, - "loss": 0.0076, + "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 5197092.0, "repeat_count": 0.0, - "routers_loss": 0.0012065734481438994, + "routers_loss": 0.0016904744552448392, "skip_count": 0.0, "step": 3224, "text_loss": 0.5538802742958069 @@ -30645,13 +30645,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06005859375, + "grad_norm": 0.05224609375, "learning_rate": 0.0008324514517549501, - "loss": 0.0084, + "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 5199941.0, "repeat_count": 0.0, - "routers_loss": 0.006849290337413549, + "routers_loss": 0.005608258303254843, "skip_count": 1.0, "step": 3226, "text_loss": 0.416242778301239 @@ -30664,32 +30664,32 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.03857421875, + "grad_norm": 0.040771484375, "learning_rate": 0.0008322202009605444, - "loss": 0.0073, + "loss": 0.0072, "macro_f1": 0.8823530077934265, "num_tokens": 5202618.0, "repeat_count": 1.0, - "routers_loss": 0.020665202289819717, + "routers_loss": 0.020965175703167915, "skip_count": 2.0, "step": 3228, "text_loss": 0.17496295273303986 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 1.0, - "avg_layers": 23.0, + "avg_layers": 24.0, "epoch": 15.164367478720282, - "f1_execute": 0.9777777791023254, - "f1_repeat": 0.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0008319888228587311, "loss": 0.0063, - "macro_f1": 0.6592592597007751, + "macro_f1": 1.0, "num_tokens": 5206414.0, "repeat_count": 1.0, - "routers_loss": 0.026284674182534218, + "routers_loss": 0.021259209141135216, "skip_count": 5.0, "step": 3230, "text_loss": 0.22471418976783752 @@ -30702,13 +30702,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03076171875, + "grad_norm": 0.029541015625, "learning_rate": 0.0008317573175381745, "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 5209768.0, "repeat_count": 0.0, - "routers_loss": 0.0018494570394977927, + "routers_loss": 0.0018647604156285524, "skip_count": 0.0, "step": 3232, "text_loss": 0.4415269196033478 @@ -30721,13 +30721,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.027099609375, + "grad_norm": 0.0283203125, "learning_rate": 0.0008315256850875881, - "loss": 0.0061, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5213257.0, "repeat_count": 0.0, - "routers_loss": 0.002610588213428855, + "routers_loss": 0.002345515415072441, "skip_count": 0.0, "step": 3234, "text_loss": 0.347247838973999 @@ -30740,13 +30740,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.048828125, + "grad_norm": 0.053955078125, "learning_rate": 0.0008312939255957336, - "loss": 0.0084, + "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 5215800.0, "repeat_count": 0.0, - "routers_loss": 0.007061914075165987, + "routers_loss": 0.007112892810255289, "skip_count": 3.0, "step": 3236, "text_loss": 0.31091734766960144 @@ -30759,13 +30759,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.033203125, "learning_rate": 0.0008310620391514219, - "loss": 0.0083, + "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 5219205.0, "repeat_count": 0.0, - "routers_loss": 0.004094691481441259, + "routers_loss": 0.00432228296995163, "skip_count": 0.0, "step": 3238, "text_loss": 0.3421775996685028 @@ -30778,13 +30778,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.024658203125, + "grad_norm": 0.027099609375, "learning_rate": 0.0008308300258435124, "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 5222422.0, "repeat_count": 0.0, - "routers_loss": 0.007662596181035042, + "routers_loss": 0.0076514314860105515, "skip_count": 2.0, "step": 3240, "text_loss": 0.22378318011760712 @@ -30797,13 +30797,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0264892578125, + "grad_norm": 0.028564453125, "learning_rate": 0.0008305978857609128, - "loss": 0.0073, + "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 5225625.0, "repeat_count": 0.0, - "routers_loss": 0.0008108283509500325, + "routers_loss": 0.0007617069641128182, "skip_count": 0.0, "step": 3242, "text_loss": 0.5880323648452759 @@ -30816,13 +30816,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0281982421875, + "grad_norm": 0.02734375, "learning_rate": 0.0008303656189925799, - "loss": 0.0084, + "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5229113.0, "repeat_count": 0.0, - "routers_loss": 0.0018137742299586535, + "routers_loss": 0.0017418119823560119, "skip_count": 0.0, "step": 3244, "text_loss": 0.3302813768386841 @@ -30835,13 +30835,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.042724609375, "learning_rate": 0.0008301332256275183, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5232061.0, "repeat_count": 0.0, - "routers_loss": 0.0025301240384578705, + "routers_loss": 0.0026667986530810595, "skip_count": 0.0, "step": 3246, "text_loss": 0.5679706335067749 @@ -30854,13 +30854,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.052001953125, + "grad_norm": 0.058349609375, "learning_rate": 0.0008299007057547821, - "loss": 0.0101, + "loss": 0.0106, "macro_f1": 1.0, "num_tokens": 5235279.0, "repeat_count": 1.0, - "routers_loss": 0.011231686919927597, + "routers_loss": 0.011016624979674816, "skip_count": 2.0, "step": 3248, "text_loss": 0.5081504583358765 @@ -30873,13 +30873,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.033203125, "learning_rate": 0.0008296680594634731, - "loss": 0.0074, + "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 5239655.0, "repeat_count": 1.0, - "routers_loss": 0.005881415214389563, + "routers_loss": 0.005492044147104025, "skip_count": 0.0, "step": 3250, "text_loss": 0.14675180613994598 @@ -30892,13 +30892,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0277099609375, + "grad_norm": 0.0269775390625, "learning_rate": 0.0008294352868427418, - "loss": 0.0056, + "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 5243579.0, "repeat_count": 0.0, - "routers_loss": 0.004495301283895969, + "routers_loss": 0.00404445780441165, "skip_count": 1.0, "step": 3252, "text_loss": 0.4201085865497589 @@ -30911,13 +30911,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0208740234375, + "grad_norm": 0.0242919921875, "learning_rate": 0.0008292023879817871, - "loss": 0.0052, + "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 5247059.0, "repeat_count": 0.0, - "routers_loss": 0.007394428364932537, + "routers_loss": 0.006886140909045935, "skip_count": 1.0, "step": 3254, "text_loss": 0.2289208322763443 @@ -30930,32 +30930,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06201171875, + "grad_norm": 0.057861328125, "learning_rate": 0.0008289693629698564, - "loss": 0.0077, + "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 5249940.0, "repeat_count": 0.0, - "routers_loss": 0.0006736332434229553, + "routers_loss": 0.0005736657767556608, "skip_count": 0.0, "step": 3256, "text_loss": 0.5670450925827026 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 15.295861461696507, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.0224609375, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, "learning_rate": 0.0008287362118962452, - "loss": 0.0062, - "macro_f1": 0.6666666865348816, + "loss": 0.006, + "macro_f1": 0.3272727429866791, "num_tokens": 5253580.0, "repeat_count": 0.0, - "routers_loss": 0.009847268462181091, + "routers_loss": 0.011349895037710667, "skip_count": 1.0, "step": 3258, "text_loss": 0.5042323470115662 @@ -30968,13 +30968,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.026611328125, + "grad_norm": 0.0267333984375, "learning_rate": 0.0008285029348502973, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5257080.0, "repeat_count": 0.0, - "routers_loss": 0.0013670918997377157, + "routers_loss": 0.0013626761501654983, "skip_count": 0.0, "step": 3260, "text_loss": 0.3227672874927521 @@ -30987,13 +30987,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02587890625, + "grad_norm": 0.0245361328125, "learning_rate": 0.0008282695319214053, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5259951.0, "repeat_count": 0.0, - "routers_loss": 0.004696785472333431, + "routers_loss": 0.00471635302528739, "skip_count": 0.0, "step": 3262, "text_loss": 0.20773714780807495 @@ -31006,13 +31006,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04345703125, + "grad_norm": 0.039306640625, "learning_rate": 0.0008280360031990093, - "loss": 0.0108, + "loss": 0.0107, "macro_f1": 0.6666666865348816, "num_tokens": 5263314.0, "repeat_count": 0.0, - "routers_loss": 0.010588239878416061, + "routers_loss": 0.010472415015101433, "skip_count": 2.0, "step": 3264, "text_loss": 0.34397366642951965 @@ -31025,13 +31025,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.036865234375, "learning_rate": 0.000827802348772598, - "loss": 0.0084, + "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 5267358.0, "repeat_count": 0.0, - "routers_loss": 0.0010326795745640993, + "routers_loss": 0.0007814752752892673, "skip_count": 0.0, "step": 3266, "text_loss": 0.747342586517334 @@ -31044,13 +31044,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.049560546875, + "grad_norm": 0.0498046875, "learning_rate": 0.0008275685687317084, - "loss": 0.0087, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 5270400.0, "repeat_count": 0.0, - "routers_loss": 0.0010199147509410977, + "routers_loss": 0.000902949133887887, "skip_count": 0.0, "step": 3268, "text_loss": 0.43782034516334534 @@ -31063,13 +31063,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03173828125, + "grad_norm": 0.03564453125, "learning_rate": 0.0008273346631659252, - "loss": 0.0069, + "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5273147.0, "repeat_count": 0.0, - "routers_loss": 0.00046372212818823755, + "routers_loss": 0.00043462219764478505, "skip_count": 0.0, "step": 3270, "text_loss": 0.6358205080032349 @@ -31082,13 +31082,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0380859375, + "grad_norm": 0.04052734375, "learning_rate": 0.0008271006321648816, - "loss": 0.0088, + "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 5277638.0, "repeat_count": 0.0, - "routers_loss": 0.0022951713763177395, + "routers_loss": 0.002211218234151602, "skip_count": 0.0, "step": 3272, "text_loss": 0.20220105350017548 @@ -31101,13 +31101,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.044921875, + "grad_norm": 0.04638671875, "learning_rate": 0.0008268664758182589, - "loss": 0.0077, + "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 5280638.0, "repeat_count": 1.0, - "routers_loss": 0.008325734175741673, + "routers_loss": 0.010536720044910908, "skip_count": 0.0, "step": 3274, "text_loss": 0.7579061388969421 @@ -31120,32 +31120,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.0439453125, "learning_rate": 0.0008266321942157859, - "loss": 0.007, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 5283847.0, "repeat_count": 0.0, - "routers_loss": 0.0017014809418469667, + "routers_loss": 0.0017158017726615071, "skip_count": 0.0, "step": 3276, "text_loss": 0.669302761554718 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.800000011920929, "acc_skip": 1.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 15.389785735250953, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.888888955116272, "f1_skip": 1.0, - "grad_norm": 0.06787109375, + "grad_norm": 0.06005859375, "learning_rate": 0.0008263977874472399, - "loss": 0.0089, - "macro_f1": 1.0, + "loss": 0.0088, + "macro_f1": 0.9544159770011902, "num_tokens": 5286627.0, "repeat_count": 5.0, - "routers_loss": 0.009527196176350117, + "routers_loss": 0.011220700107514858, "skip_count": 4.0, "step": 3278, "text_loss": 0.8703984022140503 @@ -31158,13 +31158,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.060546875, + "grad_norm": 0.05615234375, "learning_rate": 0.0008261632556024461, - "loss": 0.01, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 5289766.0, "repeat_count": 0.0, - "routers_loss": 0.0025269081816077232, + "routers_loss": 0.0020442772656679153, "skip_count": 0.0, "step": 3280, "text_loss": 0.5009346008300781 @@ -31177,13 +31177,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11474609375, + "grad_norm": 0.10107421875, "learning_rate": 0.0008259285987712774, - "loss": 0.0108, + "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 5293010.0, "repeat_count": 0.0, - "routers_loss": 0.005710822530090809, + "routers_loss": 0.005645765457302332, "skip_count": 0.0, "step": 3282, "text_loss": 0.2546011209487915 @@ -31196,13 +31196,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0419921875, + "grad_norm": 0.042236328125, "learning_rate": 0.0008256938170436549, - "loss": 0.0114, + "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 5296732.0, "repeat_count": 0.0, - "routers_loss": 0.0028946297243237495, + "routers_loss": 0.0027385836001485586, "skip_count": 2.0, "step": 3284, "text_loss": 0.5244000554084778 @@ -31217,11 +31217,11 @@ "f1_skip": 1.0, "grad_norm": 0.0296630859375, "learning_rate": 0.0008254589105095473, - "loss": 0.0059, + "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 5299926.0, "repeat_count": 1.0, - "routers_loss": 0.007981270551681519, + "routers_loss": 0.007451715879142284, "skip_count": 1.0, "step": 3286, "text_loss": 0.28979742527008057 @@ -31234,13 +31234,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0238037109375, + "grad_norm": 0.0218505859375, "learning_rate": 0.0008252238792589711, - "loss": 0.0085, + "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 5303006.0, "repeat_count": 0.0, - "routers_loss": 0.005524218548089266, + "routers_loss": 0.004805843345820904, "skip_count": 2.0, "step": 3288, "text_loss": 0.5131978392601013 @@ -31253,13 +31253,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03857421875, + "grad_norm": 0.038818359375, "learning_rate": 0.000824988723381991, - "loss": 0.0092, + "loss": 0.0091, "macro_f1": 0.3272727429866791, "num_tokens": 5306953.0, "repeat_count": 0.0, - "routers_loss": 0.01160401664674282, + "routers_loss": 0.010639613494277, "skip_count": 1.0, "step": 3290, "text_loss": 0.4901447296142578 @@ -31272,13 +31272,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.033935546875, + "grad_norm": 0.044189453125, "learning_rate": 0.0008247534429687191, - "loss": 0.0069, + "loss": 0.007, "macro_f1": 0.5492662787437439, "num_tokens": 5310516.0, "repeat_count": 0.0, - "routers_loss": 0.014068983495235443, + "routers_loss": 0.013625577092170715, "skip_count": 2.0, "step": 3292, "text_loss": 0.2124534696340561 @@ -31291,13 +31291,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.041748046875, "learning_rate": 0.0008245180381093152, - "loss": 0.0116, + "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 5313959.0, "repeat_count": 0.0, - "routers_loss": 0.00520911393687129, + "routers_loss": 0.004958513658493757, "skip_count": 1.0, "step": 3294, "text_loss": 0.46682238578796387 @@ -31310,13 +31310,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.038818359375, + "grad_norm": 0.0400390625, "learning_rate": 0.0008242825088939867, - "loss": 0.0085, + "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 5316609.0, "repeat_count": 0.0, - "routers_loss": 0.004490343388170004, + "routers_loss": 0.003962756600230932, "skip_count": 0.0, "step": 3296, "text_loss": 0.7010108232498169 @@ -31329,13 +31329,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.04052734375, "learning_rate": 0.0008240468554129892, - "loss": 0.0078, + "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5319638.0, "repeat_count": 0.0, - "routers_loss": 0.0006864524912089109, + "routers_loss": 0.0006996620795689523, "skip_count": 0.0, "step": 3298, "text_loss": 0.4966355860233307 @@ -31348,13 +31348,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.0341796875, "learning_rate": 0.0008238110777566255, "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 5323019.0, "repeat_count": 0.0, - "routers_loss": 0.0017158432165160775, + "routers_loss": 0.0016031896229833364, "skip_count": 0.0, "step": 3300, "text_loss": 0.38668957352638245 @@ -31367,13 +31367,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.0303955078125, "learning_rate": 0.0008235751760152459, - "loss": 0.0064, + "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 5326099.0, "repeat_count": 2.0, - "routers_loss": 0.0037166383117437363, + "routers_loss": 0.00344281829893589, "skip_count": 2.0, "step": 3302, "text_loss": 0.5330720543861389 @@ -31386,13 +31386,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05126953125, + "grad_norm": 0.06005859375, "learning_rate": 0.0008233391502792484, - "loss": 0.0073, + "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5328993.0, "repeat_count": 0.0, - "routers_loss": 0.008341175504028797, + "routers_loss": 0.007886730134487152, "skip_count": 1.0, "step": 3304, "text_loss": 0.5470269322395325 @@ -31405,13 +31405,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03271484375, + "grad_norm": 0.034423828125, "learning_rate": 0.0008231030006390786, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 5331554.0, "repeat_count": 0.0, - "routers_loss": 0.008380163460969925, + "routers_loss": 0.008180000819265842, "skip_count": 1.0, "step": 3306, "text_loss": 0.4023340344429016 @@ -31424,13 +31424,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0242919921875, + "grad_norm": 0.02587890625, "learning_rate": 0.0008228667271852294, - "loss": 0.0062, + "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 5335712.0, "repeat_count": 0.0, - "routers_loss": 0.00030099941068328917, + "routers_loss": 0.0002942821884062141, "skip_count": 0.0, "step": 3308, "text_loss": 0.5306711792945862 @@ -31443,13 +31443,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0615234375, + "grad_norm": 0.05908203125, "learning_rate": 0.0008226303300082414, - "loss": 0.0095, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 5338701.0, "repeat_count": 0.0, - "routers_loss": 0.0006003376329317689, + "routers_loss": 0.0006134595023468137, "skip_count": 0.0, "step": 3310, "text_loss": 0.5906263589859009 @@ -31462,13 +31462,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02734375, + "grad_norm": 0.02880859375, "learning_rate": 0.0008223938091987022, - "loss": 0.0073, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5342274.0, "repeat_count": 0.0, - "routers_loss": 0.0017984671285375953, + "routers_loss": 0.0016656654188409448, "skip_count": 0.0, "step": 3312, "text_loss": 0.5201764106750488 @@ -31481,13 +31481,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.055419921875, + "grad_norm": 0.052001953125, "learning_rate": 0.0008221571648472472, - "loss": 0.0066, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5345185.0, "repeat_count": 0.0, - "routers_loss": 0.003994898404926062, + "routers_loss": 0.0038612703792750835, "skip_count": 0.0, "step": 3314, "text_loss": 0.36633720993995667 @@ -31500,13 +31500,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.03369140625, "learning_rate": 0.0008219203970445589, "loss": 0.011, "macro_f1": 0.3272727429866791, "num_tokens": 5348804.0, "repeat_count": 0.0, - "routers_loss": 0.009415820240974426, + "routers_loss": 0.009782899171113968, "skip_count": 1.0, "step": 3316, "text_loss": 0.3117460012435913 @@ -31519,13 +31519,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.053955078125, + "grad_norm": 0.055908203125, "learning_rate": 0.0008216835058813672, - "loss": 0.0091, + "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 5351896.0, "repeat_count": 0.0, - "routers_loss": 0.006483082659542561, + "routers_loss": 0.007713229861110449, "skip_count": 0.0, "step": 3318, "text_loss": 0.253496378660202 @@ -31538,13 +31538,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02880859375, + "grad_norm": 0.03173828125, "learning_rate": 0.0008214464914484492, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 5355058.0, "repeat_count": 0.0, - "routers_loss": 0.006275791209191084, + "routers_loss": 0.006227815989404917, "skip_count": 2.0, "step": 3320, "text_loss": 0.32693132758140564 @@ -31557,13 +31557,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.03271484375, "learning_rate": 0.0008212093538366292, "loss": 0.0099, "macro_f1": 0.3333333432674408, "num_tokens": 5358365.0, "repeat_count": 0.0, - "routers_loss": 0.0027182933408766985, + "routers_loss": 0.002601418411359191, "skip_count": 0.0, "step": 3322, "text_loss": 0.40394455194473267 @@ -31576,13 +31576,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.031982421875, "learning_rate": 0.000820972093136779, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 5360981.0, "repeat_count": 0.0, - "routers_loss": 0.005600054748356342, + "routers_loss": 0.005545300897210836, "skip_count": 3.0, "step": 3324, "text_loss": 0.6758295893669128 @@ -31595,13 +31595,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.05078125, "learning_rate": 0.0008207347094398172, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 5364018.0, "repeat_count": 1.0, - "routers_loss": 0.0020965971052646637, + "routers_loss": 0.001924700103700161, "skip_count": 0.0, "step": 3326, "text_loss": 0.5196860432624817 @@ -31614,13 +31614,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0311279296875, + "grad_norm": 0.0299072265625, "learning_rate": 0.0008204972028367097, - "loss": 0.006, + "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 5366986.0, "repeat_count": 0.0, - "routers_loss": 0.011729889549314976, + "routers_loss": 0.012254828587174416, "skip_count": 1.0, "step": 3328, "text_loss": 0.24661913514137268 @@ -31633,13 +31633,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.038818359375, "learning_rate": 0.0008202595734184694, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5371463.0, "repeat_count": 0.0, - "routers_loss": 0.004913534037768841, + "routers_loss": 0.005094083491712809, "skip_count": 0.0, "step": 3330, "text_loss": 0.2525769770145416 @@ -31652,13 +31652,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.039794921875, + "grad_norm": 0.040283203125, "learning_rate": 0.0008200218212761566, - "loss": 0.0111, + "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 5374823.0, "repeat_count": 1.0, - "routers_loss": 0.0028079606126993895, + "routers_loss": 0.0025883198250085115, "skip_count": 0.0, "step": 3332, "text_loss": 0.21849912405014038 @@ -31671,13 +31671,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031982421875, + "grad_norm": 0.030029296875, "learning_rate": 0.000819783946500878, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5377640.0, "repeat_count": 0.0, - "routers_loss": 0.008404970169067383, + "routers_loss": 0.008240507915616035, "skip_count": 0.0, "step": 3334, "text_loss": 0.2662734091281891 @@ -31690,13 +31690,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.048583984375, + "grad_norm": 0.050537109375, "learning_rate": 0.000819545949183788, - "loss": 0.0101, + "loss": 0.01, "macro_f1": 0.5934640765190125, "num_tokens": 5380593.0, "repeat_count": 0.0, - "routers_loss": 0.040179044008255005, + "routers_loss": 0.038378193974494934, "skip_count": 3.0, "step": 3336, "text_loss": 0.2431795746088028 @@ -31709,13 +31709,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.040283203125, "learning_rate": 0.0008193078294160874, - "loss": 0.0096, + "loss": 0.0097, "macro_f1": 1.0, "num_tokens": 5384487.0, "repeat_count": 1.0, - "routers_loss": 0.005122583359479904, + "routers_loss": 0.005926199723035097, "skip_count": 1.0, "step": 3338, "text_loss": 0.5663705468177795 @@ -31728,13 +31728,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.031494140625, + "grad_norm": 0.032470703125, "learning_rate": 0.0008190695872890242, - "loss": 0.0056, + "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 5387511.0, "repeat_count": 0.0, - "routers_loss": 0.012232085689902306, + "routers_loss": 0.010842559859156609, "skip_count": 2.0, "step": 3340, "text_loss": 0.11517292261123657 @@ -31747,13 +31747,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.029296875, + "grad_norm": 0.0283203125, "learning_rate": 0.0008188312228938933, - "loss": 0.009, + "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 5390698.0, "repeat_count": 0.0, - "routers_loss": 0.0011168667115271091, + "routers_loss": 0.001304097007960081, "skip_count": 0.0, "step": 3342, "text_loss": 0.4827076196670532 @@ -31766,13 +31766,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.03515625, + "grad_norm": 0.037841796875, "learning_rate": 0.0008185927363220363, - "loss": 0.0088, + "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 5393778.0, "repeat_count": 1.0, - "routers_loss": 0.005202370695769787, + "routers_loss": 0.005354117136448622, "skip_count": 0.0, "step": 3344, "text_loss": 0.44467049837112427 @@ -31785,13 +31785,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.040771484375, "learning_rate": 0.0008183541276648418, - "loss": 0.0081, + "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 5396925.0, "repeat_count": 0.0, - "routers_loss": 0.005000839475542307, + "routers_loss": 0.004800073802471161, "skip_count": 2.0, "step": 3346, "text_loss": 0.2032834142446518 @@ -31804,13 +31804,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025634765625, + "grad_norm": 0.027587890625, "learning_rate": 0.0008181153970137449, - "loss": 0.0059, + "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 5400522.0, "repeat_count": 0.0, - "routers_loss": 0.0020684092305600643, + "routers_loss": 0.0021674633026123047, "skip_count": 0.0, "step": 3348, "text_loss": 0.4507528841495514 @@ -31823,13 +31823,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.0439453125, + "grad_norm": 0.051513671875, "learning_rate": 0.0008178765444602278, "loss": 0.0117, "macro_f1": 0.8820862174034119, "num_tokens": 5403526.0, "repeat_count": 2.0, - "routers_loss": 0.040753237903118134, + "routers_loss": 0.04263930395245552, "skip_count": 2.0, "step": 3350, "text_loss": 0.3606615960597992 @@ -31842,13 +31842,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.03564453125, + "grad_norm": 0.033447265625, "learning_rate": 0.0008176375700958194, - "loss": 0.0089, + "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 5407127.0, "repeat_count": 1.0, - "routers_loss": 0.007767915725708008, + "routers_loss": 0.006953123956918716, "skip_count": 0.0, "step": 3352, "text_loss": 0.2290353775024414 @@ -31861,13 +31861,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.035400390625, "learning_rate": 0.0008173984740120948, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 5410829.0, "repeat_count": 0.0, - "routers_loss": 0.0016073459992185235, + "routers_loss": 0.0014363783411681652, "skip_count": 0.0, "step": 3354, "text_loss": 0.4220392405986786 @@ -31880,13 +31880,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02880859375, + "grad_norm": 0.031982421875, "learning_rate": 0.0008171592563006762, - "loss": 0.0078, + "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 5414152.0, "repeat_count": 0.0, - "routers_loss": 0.0016132282325997949, + "routers_loss": 0.00202389364130795, "skip_count": 1.0, "step": 3356, "text_loss": 0.37729766964912415 @@ -31899,13 +31899,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037353515625, + "grad_norm": 0.041015625, "learning_rate": 0.0008169199170532323, - "loss": 0.007, + "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 5417312.0, "repeat_count": 0.0, - "routers_loss": 0.007077203597873449, + "routers_loss": 0.006253739818930626, "skip_count": 2.0, "step": 3358, "text_loss": 0.1304289996623993 @@ -31918,13 +31918,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.0703125, "learning_rate": 0.0008166804563614785, - "loss": 0.0088, + "loss": 0.0084, "macro_f1": 1.0, "num_tokens": 5421227.0, "repeat_count": 2.0, - "routers_loss": 0.01628093235194683, + "routers_loss": 0.01622140221297741, "skip_count": 2.0, "step": 3360, "text_loss": 0.298664391040802 @@ -31937,13 +31937,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0250244140625, + "grad_norm": 0.024169921875, "learning_rate": 0.0008164408743171763, - "loss": 0.0064, + "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 5424646.0, "repeat_count": 1.0, - "routers_loss": 0.003795142285525799, + "routers_loss": 0.0037176944315433502, "skip_count": 2.0, "step": 3362, "text_loss": 0.12147632241249084 @@ -31956,13 +31956,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037841796875, + "grad_norm": 0.046630859375, "learning_rate": 0.0008162011710121339, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 5427897.0, "repeat_count": 0.0, - "routers_loss": 0.0024164009373635054, + "routers_loss": 0.0020403533708304167, "skip_count": 1.0, "step": 3364, "text_loss": 0.2656533420085907 @@ -31975,32 +31975,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.041748046875, "learning_rate": 0.0008159613465382066, - "loss": 0.0071, + "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5430474.0, "repeat_count": 0.0, - "routers_loss": 0.002314126119017601, + "routers_loss": 0.0018634048756211996, "skip_count": 0.0, "step": 3366, "text_loss": 0.9133086204528809 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 15.812444966245964, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.058837890625, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, "learning_rate": 0.0008157214009872951, - "loss": 0.008, - "macro_f1": 0.5492662787437439, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, "num_tokens": 5433113.0, "repeat_count": 0.0, - "routers_loss": 0.014630996622145176, + "routers_loss": 0.012944488786160946, "skip_count": 2.0, "step": 3368, "text_loss": 0.24352453649044037 @@ -32013,13 +32013,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.05712890625, "learning_rate": 0.0008154813344513472, - "loss": 0.0141, + "loss": 0.0143, "macro_f1": 0.6666666865348816, "num_tokens": 5436259.0, "repeat_count": 0.0, - "routers_loss": 0.0023453824687749147, + "routers_loss": 0.002347963862121105, "skip_count": 2.0, "step": 3370, "text_loss": 0.7601244449615479 @@ -32032,13 +32032,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0291748046875, + "grad_norm": 0.031494140625, "learning_rate": 0.0008152411470223568, - "loss": 0.0078, + "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 5439126.0, "repeat_count": 0.0, - "routers_loss": 0.0015595925506204367, + "routers_loss": 0.0016609140438959002, "skip_count": 0.0, "step": 3372, "text_loss": 0.5551947355270386 @@ -32051,13 +32051,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.04345703125, "learning_rate": 0.0008150008387923643, - "loss": 0.0067, + "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 5442739.0, "repeat_count": 0.0, - "routers_loss": 0.008187411352992058, + "routers_loss": 0.008321396075189114, "skip_count": 0.0, "step": 3374, "text_loss": 0.25028282403945923 @@ -32070,13 +32070,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.10302734375, + "grad_norm": 0.08544921875, "learning_rate": 0.000814760409853456, - "loss": 0.0109, + "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 5445247.0, "repeat_count": 2.0, - "routers_loss": 0.009705786593258381, + "routers_loss": 0.009738070890307426, "skip_count": 1.0, "step": 3376, "text_loss": 0.37271201610565186 @@ -32089,13 +32089,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0439453125, + "grad_norm": 0.042236328125, "learning_rate": 0.0008145198602977651, - "loss": 0.0084, + "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5449044.0, "repeat_count": 0.0, - "routers_loss": 0.003062802366912365, + "routers_loss": 0.0028421466704458, "skip_count": 0.0, "step": 3378, "text_loss": 0.1458655595779419 @@ -32108,13 +32108,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.095703125, + "grad_norm": 0.11474609375, "learning_rate": 0.0008142791902174701, - "loss": 0.008, + "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 5453063.0, "repeat_count": 0.0, - "routers_loss": 0.001539172139018774, + "routers_loss": 0.0015170135302469134, "skip_count": 0.0, "step": 3380, "text_loss": 0.5548722743988037 @@ -32127,13 +32127,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.031982421875, "learning_rate": 0.0008140383997047966, - "loss": 0.0082, + "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 5455814.0, "repeat_count": 0.0, - "routers_loss": 0.002227923832833767, + "routers_loss": 0.0022444510832428932, "skip_count": 1.0, "step": 3382, "text_loss": 0.8034513592720032 @@ -32146,13 +32146,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037109375, + "grad_norm": 0.03369140625, "learning_rate": 0.000813797488852016, - "loss": 0.0063, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5459392.0, "repeat_count": 0.0, - "routers_loss": 0.0003921810712199658, + "routers_loss": 0.00038578867679461837, "skip_count": 0.0, "step": 3384, "text_loss": 0.6940088868141174 @@ -32165,13 +32165,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0517578125, + "grad_norm": 0.045654296875, "learning_rate": 0.0008135564577514458, - "loss": 0.0116, + "loss": 0.011, "macro_f1": 0.3333333432674408, "num_tokens": 5462413.0, "repeat_count": 0.0, - "routers_loss": 0.001971066929399967, + "routers_loss": 0.0019727381877601147, "skip_count": 0.0, "step": 3386, "text_loss": 0.5124650597572327 @@ -32184,13 +32184,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.099609375, "learning_rate": 0.0008133153064954495, - "loss": 0.0108, + "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 5465552.0, "repeat_count": 0.0, - "routers_loss": 0.0018206594977527857, + "routers_loss": 0.0019896167796105146, "skip_count": 0.0, "step": 3388, "text_loss": 0.4292517900466919 @@ -32203,13 +32203,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.035400390625, "learning_rate": 0.0008130740351764367, - "loss": 0.0068, + "loss": 0.007, "macro_f1": 1.0, "num_tokens": 5468573.0, "repeat_count": 1.0, - "routers_loss": 0.003323496552184224, + "routers_loss": 0.0030118159484118223, "skip_count": 1.0, "step": 3390, "text_loss": 0.48903173208236694 @@ -32222,13 +32222,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.024658203125, + "grad_norm": 0.0216064453125, "learning_rate": 0.000812832643886863, - "loss": 0.0058, + "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 5471547.0, "repeat_count": 0.0, - "routers_loss": 0.006201856769621372, + "routers_loss": 0.005084246397018433, "skip_count": 2.0, "step": 3392, "text_loss": 0.35789889097213745 @@ -32241,13 +32241,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.0390625, "learning_rate": 0.0008125911327192299, - "loss": 0.009, + "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5474331.0, "repeat_count": 0.0, - "routers_loss": 0.0009058464202098548, + "routers_loss": 0.0008874498889781535, "skip_count": 0.0, "step": 3394, "text_loss": 0.6267408728599548 @@ -32260,13 +32260,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.03173828125, "learning_rate": 0.0008123495017660851, - "loss": 0.0059, + "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5477633.0, "repeat_count": 0.0, - "routers_loss": 0.00202162005007267, + "routers_loss": 0.001794386887922883, "skip_count": 0.0, "step": 3396, "text_loss": 0.3701885938644409 @@ -32279,13 +32279,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04296875, + "grad_norm": 0.042724609375, "learning_rate": 0.0008121077511200221, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5481277.0, "repeat_count": 0.0, - "routers_loss": 0.0022049983963370323, + "routers_loss": 0.002140481723472476, "skip_count": 0.0, "step": 3398, "text_loss": 0.6362857818603516 @@ -32298,13 +32298,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05322265625, + "grad_norm": 0.0556640625, "learning_rate": 0.00081186588087368, - "loss": 0.0115, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 5484237.0, "repeat_count": 0.0, - "routers_loss": 0.0008255304419435561, + "routers_loss": 0.000867189432028681, "skip_count": 0.0, "step": 3400, "text_loss": 1.0847382545471191 @@ -32317,13 +32317,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0274658203125, + "grad_norm": 0.0296630859375, "learning_rate": 0.0008116238911197442, - "loss": 0.0067, + "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 5487423.0, "repeat_count": 0.0, - "routers_loss": 0.0029532560147345066, + "routers_loss": 0.0029817656613886356, "skip_count": 0.0, "step": 3402, "text_loss": 0.3813740313053131 @@ -32336,13 +32336,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04443359375, + "grad_norm": 0.049560546875, "learning_rate": 0.0008113817819509454, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 5490155.0, "repeat_count": 0.0, - "routers_loss": 0.0038054194301366806, + "routers_loss": 0.0035141287371516228, "skip_count": 0.0, "step": 3404, "text_loss": 0.2113083451986313 @@ -32355,13 +32355,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042236328125, + "grad_norm": 0.04443359375, "learning_rate": 0.0008111395534600603, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 5493415.0, "repeat_count": 0.0, - "routers_loss": 0.0034561967477202415, + "routers_loss": 0.003317659953609109, "skip_count": 0.0, "step": 3406, "text_loss": 0.5869330167770386 @@ -32374,13 +32374,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.052001953125, "learning_rate": 0.0008108972057399114, - "loss": 0.0131, + "loss": 0.0123, "macro_f1": 0.6666666865348816, "num_tokens": 5496032.0, "repeat_count": 0.0, - "routers_loss": 0.0036799898371100426, + "routers_loss": 0.003833734430372715, "skip_count": 2.0, "step": 3408, "text_loss": 0.2938928008079529 @@ -32393,13 +32393,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.11328125, "learning_rate": 0.0008106547388833669, - "loss": 0.006, + "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 5498890.0, "repeat_count": 0.0, - "routers_loss": 0.0026391225401312113, + "routers_loss": 0.002622978063300252, "skip_count": 1.0, "step": 3410, "text_loss": 0.3130980432033539 @@ -32412,13 +32412,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.03564453125, "learning_rate": 0.0008104121529833402, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 5502010.0, "repeat_count": 1.0, - "routers_loss": 0.00991886481642723, + "routers_loss": 0.007447598036378622, "skip_count": 0.0, "step": 3412, "text_loss": 0.4413072466850281 @@ -32431,13 +32431,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.03076171875, "learning_rate": 0.000810169448132791, - "loss": 0.0096, + "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 5505212.0, "repeat_count": 0.0, - "routers_loss": 0.0031243201810866594, + "routers_loss": 0.0031087708193808794, "skip_count": 1.0, "step": 3414, "text_loss": 0.2910428047180176 @@ -32450,13 +32450,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.04345703125, "learning_rate": 0.0008099266244247243, - "loss": 0.0083, + "loss": 0.0082, "macro_f1": 0.3272727429866791, "num_tokens": 5508755.0, "repeat_count": 0.0, - "routers_loss": 0.02572118304669857, + "routers_loss": 0.02510393038392067, "skip_count": 1.0, "step": 3416, "text_loss": 0.33022749423980713 @@ -32469,13 +32469,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0306396484375, + "grad_norm": 0.03662109375, "learning_rate": 0.0008096836819521903, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 5512034.0, "repeat_count": 0.0, - "routers_loss": 0.001839894917793572, + "routers_loss": 0.0020537273958325386, "skip_count": 1.0, "step": 3418, "text_loss": 0.4731218218803406 @@ -32488,32 +32488,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.031494140625, + "grad_norm": 0.0341796875, "learning_rate": 0.0008094406208082853, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5515707.0, "repeat_count": 0.0, - "routers_loss": 0.0039922320283949375, + "routers_loss": 0.004218162503093481, "skip_count": 2.0, "step": 3420, "text_loss": 0.23429590463638306 }, { "acc_repeat": 1.0, - "acc_skip": 1.0, - "avg_layers": 26.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, "epoch": 16.065746991488112, - "f1_execute": 1.0, + "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, - "f1_skip": 1.0, - "grad_norm": 0.0703125, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0869140625, "learning_rate": 0.0008091974410861507, - "loss": 0.0066, - "macro_f1": 1.0, + "loss": 0.0069, + "macro_f1": 0.9265305995941162, "num_tokens": 5518436.0, "repeat_count": 1.0, - "routers_loss": 0.012939191423356533, + "routers_loss": 0.013488355092704296, "skip_count": 3.0, "step": 3422, "text_loss": 0.45768749713897705 @@ -32526,13 +32526,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037109375, + "grad_norm": 0.03369140625, "learning_rate": 0.0008089541428789733, - "loss": 0.01, + "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 5522368.0, "repeat_count": 0.0, - "routers_loss": 0.001064157928340137, + "routers_loss": 0.0010335417464375496, "skip_count": 1.0, "step": 3424, "text_loss": 0.43423423171043396 @@ -32545,13 +32545,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0299072265625, + "grad_norm": 0.0306396484375, "learning_rate": 0.0008087107262799855, - "loss": 0.0047, + "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 5526061.0, "repeat_count": 0.0, - "routers_loss": 0.0024185231886804104, + "routers_loss": 0.002134323585778475, "skip_count": 0.0, "step": 3426, "text_loss": 0.4031757414340973 @@ -32564,13 +32564,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08203125, + "grad_norm": 0.1318359375, "learning_rate": 0.0008084671913824651, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 5529284.0, "repeat_count": 0.0, - "routers_loss": 0.009645994752645493, + "routers_loss": 0.0097216060385108, "skip_count": 2.0, "step": 3428, "text_loss": 0.2836039960384369 @@ -32583,13 +32583,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.022705078125, + "grad_norm": 0.0220947265625, "learning_rate": 0.000808223538279735, - "loss": 0.0051, + "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 5532159.0, "repeat_count": 0.0, - "routers_loss": 0.0017972104251384735, + "routers_loss": 0.001684269867837429, "skip_count": 0.0, "step": 3430, "text_loss": 0.5804527401924133 @@ -32602,13 +32602,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.04248046875, + "grad_norm": 0.0390625, "learning_rate": 0.0008079797670651637, "loss": 0.008, "macro_f1": 1.0, "num_tokens": 5536050.0, "repeat_count": 1.0, - "routers_loss": 0.015138664282858372, + "routers_loss": 0.013918434269726276, "skip_count": 1.0, "step": 3432, "text_loss": 0.31325826048851013 @@ -32621,13 +32621,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.0400390625, "learning_rate": 0.0008077358778321647, - "loss": 0.0114, + "loss": 0.011, "macro_f1": 0.3333333432674408, "num_tokens": 5538885.0, "repeat_count": 0.0, - "routers_loss": 0.0007666898309253156, + "routers_loss": 0.0007751787197776139, "skip_count": 0.0, "step": 3434, "text_loss": 0.783108115196228 @@ -32640,13 +32640,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.033935546875, "learning_rate": 0.0008074918706741966, "loss": 0.0063, "macro_f1": 0.9262410998344421, "num_tokens": 5541909.0, "repeat_count": 3.0, - "routers_loss": 0.024132754653692245, + "routers_loss": 0.021819550544023514, "skip_count": 2.0, "step": 3436, "text_loss": 0.6558083295822144 @@ -32659,13 +32659,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03173828125, + "grad_norm": 0.02880859375, "learning_rate": 0.0008072477456847638, - "loss": 0.0061, + "loss": 0.0057, "macro_f1": 0.3272727429866791, "num_tokens": 5545101.0, "repeat_count": 1.0, - "routers_loss": 0.03225114569067955, + "routers_loss": 0.03309348225593567, "skip_count": 0.0, "step": 3438, "text_loss": 0.9877075552940369 @@ -32678,13 +32678,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.04931640625, "learning_rate": 0.0008070035029574151, - "loss": 0.0062, + "loss": 0.006, "macro_f1": 1.0, "num_tokens": 5548971.0, "repeat_count": 1.0, - "routers_loss": 0.008569693192839622, + "routers_loss": 0.008696741424500942, "skip_count": 1.0, "step": 3440, "text_loss": 0.24766330420970917 @@ -32697,13 +32697,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.033447265625, "learning_rate": 0.000806759142585745, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 5552174.0, "repeat_count": 0.0, - "routers_loss": 0.004438123665750027, + "routers_loss": 0.004240929149091244, "skip_count": 3.0, "step": 3442, "text_loss": 0.37255001068115234 @@ -32716,13 +32716,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0615234375, + "grad_norm": 0.05322265625, "learning_rate": 0.0008065146646633927, - "loss": 0.0091, + "loss": 0.0088, "macro_f1": 0.6666666865348816, "num_tokens": 5555005.0, "repeat_count": 0.0, - "routers_loss": 0.013728363439440727, + "routers_loss": 0.014345484785735607, "skip_count": 1.0, "step": 3444, "text_loss": 0.26157206296920776 @@ -32735,13 +32735,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.05810546875, + "grad_norm": 0.06005859375, "learning_rate": 0.0008062700692840428, "loss": 0.0083, "macro_f1": 1.0, "num_tokens": 5559127.0, "repeat_count": 1.0, - "routers_loss": 0.008383825421333313, + "routers_loss": 0.008315163664519787, "skip_count": 2.0, "step": 3446, "text_loss": 0.21971040964126587 @@ -32754,13 +32754,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.04443359375, + "grad_norm": 0.056396484375, "learning_rate": 0.0008060253565414246, "loss": 0.009, "macro_f1": 0.5934640765190125, "num_tokens": 5562254.0, "repeat_count": 0.0, - "routers_loss": 0.009948022663593292, + "routers_loss": 0.009582413360476494, "skip_count": 3.0, "step": 3448, "text_loss": 0.6758295893669128 @@ -32773,13 +32773,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0361328125, + "grad_norm": 0.038818359375, "learning_rate": 0.0008057805265293124, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 5565515.0, "repeat_count": 0.0, - "routers_loss": 0.0025822422467172146, + "routers_loss": 0.002429503947496414, "skip_count": 0.0, "step": 3450, "text_loss": 0.696592390537262 @@ -32792,13 +32792,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.041015625, "learning_rate": 0.0008055355793415257, - "loss": 0.0091, + "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5568392.0, "repeat_count": 0.0, - "routers_loss": 0.0008777108159847558, + "routers_loss": 0.0007724192109890282, "skip_count": 0.0, "step": 3452, "text_loss": 0.7092870473861694 @@ -32811,13 +32811,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.033447265625, "learning_rate": 0.0008052905150719285, - "loss": 0.01, + "loss": 0.0099, "macro_f1": 0.3333333432674408, "num_tokens": 5571090.0, "repeat_count": 0.0, - "routers_loss": 0.0009592860005795956, + "routers_loss": 0.0010859938338398933, "skip_count": 0.0, "step": 3454, "text_loss": 0.6593860387802124 @@ -32832,11 +32832,11 @@ "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0008050453338144301, - "loss": 0.0077, + "loss": 0.0072, "macro_f1": 1.0, "num_tokens": 5574552.0, "repeat_count": 1.0, - "routers_loss": 0.0029973683413118124, + "routers_loss": 0.0030258705373853445, "skip_count": 1.0, "step": 3456, "text_loss": 0.3479384481906891 @@ -32849,13 +32849,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.0380859375, "learning_rate": 0.0008048000356629844, - "loss": 0.0068, + "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 5577484.0, "repeat_count": 0.0, - "routers_loss": 0.005223365034908056, + "routers_loss": 0.005052885971963406, "skip_count": 2.0, "step": 3458, "text_loss": 0.21858671307563782 @@ -32868,13 +32868,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.029541015625, "learning_rate": 0.0008045546207115901, - "loss": 0.0074, + "loss": 0.0068, "macro_f1": 1.0, "num_tokens": 5581605.0, "repeat_count": 1.0, - "routers_loss": 0.010660176165401936, + "routers_loss": 0.009976249188184738, "skip_count": 3.0, "step": 3460, "text_loss": 0.16868001222610474 @@ -32887,13 +32887,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.032958984375, "learning_rate": 0.0008043090890542904, - "loss": 0.008, + "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5584994.0, "repeat_count": 0.0, - "routers_loss": 0.003038279013708234, + "routers_loss": 0.00270817126147449, "skip_count": 0.0, "step": 3462, "text_loss": 0.785690426826477 @@ -32906,13 +32906,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03125, + "grad_norm": 0.03173828125, "learning_rate": 0.0008040634407851739, - "loss": 0.0057, + "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 5588067.0, "repeat_count": 0.0, - "routers_loss": 0.001855011098086834, + "routers_loss": 0.0018436965765431523, "skip_count": 0.0, "step": 3464, "text_loss": 0.5006644129753113 @@ -32925,13 +32925,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.030029296875, + "grad_norm": 0.028076171875, "learning_rate": 0.0008038176759983731, - "loss": 0.0064, + "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5590789.0, "repeat_count": 0.0, - "routers_loss": 0.008276397362351418, + "routers_loss": 0.008516279980540276, "skip_count": 2.0, "step": 3466, "text_loss": 0.20963478088378906 @@ -32944,13 +32944,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04052734375, + "grad_norm": 0.0361328125, "learning_rate": 0.0008035717947880659, - "loss": 0.0092, + "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 5593472.0, "repeat_count": 0.0, - "routers_loss": 0.0016371201490983367, + "routers_loss": 0.0016293043736368418, "skip_count": 0.0, "step": 3468, "text_loss": 0.7376078963279724 @@ -32963,13 +32963,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.036376953125, "learning_rate": 0.0008033257972484742, - "loss": 0.0081, + "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5596108.0, "repeat_count": 0.0, - "routers_loss": 0.002605364890769124, + "routers_loss": 0.002364142332226038, "skip_count": 0.0, "step": 3470, "text_loss": 0.5156455039978027 @@ -32982,13 +32982,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.037841796875, "learning_rate": 0.0008030796834738649, - "loss": 0.0083, + "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 5599103.0, "repeat_count": 0.0, - "routers_loss": 0.00892016664147377, + "routers_loss": 0.008872323669493198, "skip_count": 0.0, "step": 3472, "text_loss": 0.2996419668197632 @@ -33001,13 +33001,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037841796875, + "grad_norm": 0.043701171875, "learning_rate": 0.0008028334535585491, - "loss": 0.0089, + "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 5602410.0, "repeat_count": 0.0, - "routers_loss": 0.01095602847635746, + "routers_loss": 0.011508257128298283, "skip_count": 3.0, "step": 3474, "text_loss": 0.25438693165779114 @@ -33020,13 +33020,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.037353515625, + "grad_norm": 0.038330078125, "learning_rate": 0.0008025871075968827, - "loss": 0.0105, + "loss": 0.0106, "macro_f1": 1.0, "num_tokens": 5605424.0, "repeat_count": 2.0, - "routers_loss": 0.016052749007940292, + "routers_loss": 0.017225435003638268, "skip_count": 2.0, "step": 3476, "text_loss": 0.2549574077129364 @@ -33039,13 +33039,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, - "grad_norm": 0.02880859375, + "grad_norm": 0.028564453125, "learning_rate": 0.0008023406456832657, - "loss": 0.0116, + "loss": 0.0111, "macro_f1": 0.9262410998344421, "num_tokens": 5608266.0, "repeat_count": 3.0, - "routers_loss": 0.04047509655356407, + "routers_loss": 0.039165645837783813, "skip_count": 2.0, "step": 3478, "text_loss": 0.1797947734594345 @@ -33058,13 +33058,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0272216796875, + "grad_norm": 0.026123046875, "learning_rate": 0.0008020940679121429, - "loss": 0.0073, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 5611471.0, "repeat_count": 0.0, - "routers_loss": 0.0010115962941199541, + "routers_loss": 0.0009718866203911602, "skip_count": 0.0, "step": 3480, "text_loss": 0.8267702460289001 @@ -33077,13 +33077,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.037841796875, "learning_rate": 0.0008018473743780036, - "loss": 0.0095, + "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 5615046.0, "repeat_count": 0.0, - "routers_loss": 0.006490753497928381, + "routers_loss": 0.006087122485041618, "skip_count": 2.0, "step": 3482, "text_loss": 0.7267677187919617 @@ -33096,13 +33096,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.03369140625, "learning_rate": 0.000801600565175381, - "loss": 0.0088, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 5618350.0, "repeat_count": 0.0, - "routers_loss": 0.0008378152851946652, + "routers_loss": 0.0007539413054473698, "skip_count": 0.0, "step": 3484, "text_loss": 0.5910211801528931 @@ -33115,13 +33115,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048583984375, + "grad_norm": 0.046142578125, "learning_rate": 0.0008013536403988529, - "loss": 0.0087, + "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 5621381.0, "repeat_count": 0.0, - "routers_loss": 0.0007683819276280701, + "routers_loss": 0.0008076327503658831, "skip_count": 0.0, "step": 3486, "text_loss": 0.30616798996925354 @@ -33134,13 +33134,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.047607421875, + "grad_norm": 0.049072265625, "learning_rate": 0.0008011066001430412, "loss": 0.0086, "macro_f1": 0.6122449040412903, "num_tokens": 5624617.0, "repeat_count": 0.0, - "routers_loss": 0.02481125481426716, + "routers_loss": 0.023835813626646996, "skip_count": 4.0, "step": 3488, "text_loss": 0.3376443088054657 @@ -33153,13 +33153,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0311279296875, + "grad_norm": 0.03271484375, "learning_rate": 0.0008008594445026122, - "loss": 0.0082, + "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 5627989.0, "repeat_count": 0.0, - "routers_loss": 0.005174005404114723, + "routers_loss": 0.004226419143378735, "skip_count": 2.0, "step": 3490, "text_loss": 0.8185343146324158 @@ -33172,13 +33172,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.044677734375, "learning_rate": 0.0008006121735722767, "loss": 0.0084, "macro_f1": 0.32098764181137085, "num_tokens": 5632286.0, "repeat_count": 0.0, - "routers_loss": 0.03602224588394165, + "routers_loss": 0.0366671048104763, "skip_count": 2.0, "step": 3492, "text_loss": 0.2209547609090805 @@ -33191,13 +33191,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.03466796875, "learning_rate": 0.0008003647874467892, - "loss": 0.0087, + "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 5635368.0, "repeat_count": 1.0, - "routers_loss": 0.012145630083978176, + "routers_loss": 0.012956378981471062, "skip_count": 0.0, "step": 3494, "text_loss": 0.20468664169311523 @@ -33210,13 +33210,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.057861328125, + "grad_norm": 0.059814453125, "learning_rate": 0.0008001172862209485, "loss": 0.0103, "macro_f1": 0.6666666865348816, "num_tokens": 5638440.0, "repeat_count": 1.0, - "routers_loss": 0.001456267898902297, + "routers_loss": 0.0017375422175973654, "skip_count": 0.0, "step": 3496, "text_loss": 0.6647221446037292 @@ -33229,13 +33229,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0247802734375, + "grad_norm": 0.0244140625, "learning_rate": 0.0007998696699895976, - "loss": 0.0093, + "loss": 0.0091, "macro_f1": 0.6592592597007751, "num_tokens": 5641996.0, "repeat_count": 1.0, - "routers_loss": 0.028984347358345985, + "routers_loss": 0.025240756571292877, "skip_count": 5.0, "step": 3498, "text_loss": 0.23892143368721008 @@ -33248,13 +33248,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02294921875, + "grad_norm": 0.021728515625, "learning_rate": 0.0007996219388476236, - "loss": 0.0077, + "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 5645071.0, "repeat_count": 0.0, - "routers_loss": 0.006859986111521721, + "routers_loss": 0.007436830550432205, "skip_count": 1.0, "step": 3500, "text_loss": 0.7580804228782654 @@ -33267,13 +33267,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.024169921875, + "grad_norm": 0.0242919921875, "learning_rate": 0.0007993740928899571, - "loss": 0.0055, + "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 5648175.0, "repeat_count": 0.0, - "routers_loss": 0.0011989293852820992, + "routers_loss": 0.001126602990552783, "skip_count": 0.0, "step": 3502, "text_loss": 0.5281378626823425 @@ -33286,13 +33286,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031982421875, + "grad_norm": 0.04443359375, "learning_rate": 0.0007991261322115737, - "loss": 0.0056, + "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 5650973.0, "repeat_count": 0.0, - "routers_loss": 0.0007974735926836729, + "routers_loss": 0.0007907263352535665, "skip_count": 0.0, "step": 3504, "text_loss": 0.25220927596092224 @@ -33305,13 +33305,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0274658203125, + "grad_norm": 0.0262451171875, "learning_rate": 0.000798878056907492, - "loss": 0.0049, + "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 5654252.0, "repeat_count": 2.0, - "routers_loss": 0.007121780421584845, + "routers_loss": 0.006263538729399443, "skip_count": 2.0, "step": 3506, "text_loss": 0.46569153666496277 @@ -33324,13 +33324,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.0703125, "learning_rate": 0.0007986298670727752, - "loss": 0.0101, + "loss": 0.0098, "macro_f1": 0.6666666865348816, "num_tokens": 5657229.0, "repeat_count": 0.0, - "routers_loss": 0.00414140522480011, + "routers_loss": 0.004049144219607115, "skip_count": 3.0, "step": 3508, "text_loss": 0.15174436569213867 @@ -33343,13 +33343,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.058837890625, + "grad_norm": 0.0791015625, "learning_rate": 0.0007983815628025301, - "loss": 0.0073, + "loss": 0.0074, "macro_f1": 0.9262410998344421, "num_tokens": 5659974.0, "repeat_count": 2.0, - "routers_loss": 0.04618353769183159, + "routers_loss": 0.0471976138651371, "skip_count": 3.0, "step": 3510, "text_loss": 0.39072203636169434 @@ -33362,13 +33362,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.03369140625, "learning_rate": 0.000798133144191907, - "loss": 0.0084, + "loss": 0.0082, "macro_f1": 0.3272727429866791, "num_tokens": 5662893.0, "repeat_count": 0.0, - "routers_loss": 0.04054548963904381, + "routers_loss": 0.04030488431453705, "skip_count": 1.0, "step": 3512, "text_loss": 0.3562147617340088 @@ -33381,13 +33381,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.0595703125, "learning_rate": 0.0007978846113361009, - "loss": 0.0067, + "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 5666476.0, "repeat_count": 0.0, - "routers_loss": 0.007785080466419458, + "routers_loss": 0.007475079502910376, "skip_count": 1.0, "step": 3514, "text_loss": 0.26518192887306213 @@ -33400,13 +33400,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0400390625, + "grad_norm": 0.044189453125, "learning_rate": 0.0007976359643303497, - "loss": 0.0128, + "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 5669647.0, "repeat_count": 0.0, - "routers_loss": 0.0057366108521819115, + "routers_loss": 0.00558585487306118, "skip_count": 2.0, "step": 3516, "text_loss": 0.29284560680389404 @@ -33419,13 +33419,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0458984375, + "grad_norm": 0.0361328125, "learning_rate": 0.0007973872032699354, - "loss": 0.0088, + "loss": 0.0082, "macro_f1": 1.0, "num_tokens": 5673491.0, "repeat_count": 1.0, - "routers_loss": 0.002753519220277667, + "routers_loss": 0.0026981087867170572, "skip_count": 1.0, "step": 3518, "text_loss": 0.35089045763015747 @@ -33438,32 +33438,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.033203125, "learning_rate": 0.000797138328250184, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 5676529.0, "repeat_count": 1.0, - "routers_loss": 0.0027982397004961967, + "routers_loss": 0.0027328627184033394, "skip_count": 0.0, "step": 3520, "text_loss": 0.41077399253845215 }, { "acc_repeat": 0.0, - "acc_skip": 0.800000011920929, - "avg_layers": 24.0, + "acc_skip": 1.0, + "avg_layers": 23.0, "epoch": 16.535368359260346, - "f1_execute": 0.95652174949646, + "f1_execute": 0.9777777791023254, "f1_repeat": 0.0, - "f1_skip": 0.888888955116272, - "grad_norm": 0.055419921875, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, "learning_rate": 0.0007968893393664646, - "loss": 0.0105, - "macro_f1": 0.6151369214057922, + "loss": 0.01, + "macro_f1": 0.6592592597007751, "num_tokens": 5679987.0, "repeat_count": 1.0, - "routers_loss": 0.03294458985328674, + "routers_loss": 0.02695014327764511, "skip_count": 5.0, "step": 3522, "text_loss": 0.44942837953567505 @@ -33476,13 +33476,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.035400390625, "learning_rate": 0.0007966402367141903, - "loss": 0.0073, + "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 5683185.0, "repeat_count": 0.0, - "routers_loss": 0.007946476340293884, + "routers_loss": 0.00817026849836111, "skip_count": 2.0, "step": 3524, "text_loss": 0.14528048038482666 @@ -33495,13 +33495,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.021240234375, + "grad_norm": 0.0216064453125, "learning_rate": 0.0007963910203888176, - "loss": 0.0043, + "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 5686544.0, "repeat_count": 0.0, - "routers_loss": 0.0021326798014342785, + "routers_loss": 0.0021973433904349804, "skip_count": 0.0, "step": 3526, "text_loss": 0.22358648478984833 @@ -33514,13 +33514,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0556640625, + "grad_norm": 0.050048828125, "learning_rate": 0.0007961416904858469, - "loss": 0.0079, + "loss": 0.0078, "macro_f1": 0.3272727429866791, "num_tokens": 5689579.0, "repeat_count": 0.0, - "routers_loss": 0.03373958170413971, + "routers_loss": 0.033712416887283325, "skip_count": 1.0, "step": 3528, "text_loss": 0.3083649277687073 @@ -33533,13 +33533,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033203125, + "grad_norm": 0.0361328125, "learning_rate": 0.0007958922471008217, - "loss": 0.007, + "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5692869.0, "repeat_count": 0.0, - "routers_loss": 0.010963297449052334, + "routers_loss": 0.011182719841599464, "skip_count": 2.0, "step": 3530, "text_loss": 0.21288011968135834 @@ -33552,13 +33552,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0286865234375, + "grad_norm": 0.0267333984375, "learning_rate": 0.0007956426903293292, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5696007.0, "repeat_count": 0.0, - "routers_loss": 0.0014243065379559994, + "routers_loss": 0.0015808293828740716, "skip_count": 0.0, "step": 3532, "text_loss": 0.6068631410598755 @@ -33571,13 +33571,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.059326171875, + "grad_norm": 0.052734375, "learning_rate": 0.0007953930202670001, - "loss": 0.0066, + "loss": 0.0062, "macro_f1": 0.5492662787437439, "num_tokens": 5699474.0, "repeat_count": 2.0, - "routers_loss": 0.038375116884708405, + "routers_loss": 0.03205178305506706, "skip_count": 0.0, "step": 3534, "text_loss": 0.4317135512828827 @@ -33590,13 +33590,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.062255859375, + "grad_norm": 0.064453125, "learning_rate": 0.0007951432370095084, "loss": 0.0105, "macro_f1": 0.3333333432674408, "num_tokens": 5703483.0, "repeat_count": 0.0, - "routers_loss": 0.0041501945815980434, + "routers_loss": 0.003518853336572647, "skip_count": 0.0, "step": 3536, "text_loss": 0.5432273149490356 @@ -33609,13 +33609,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.08349609375, + "grad_norm": 0.11083984375, "learning_rate": 0.0007948933406525715, "loss": 0.01, "macro_f1": 1.0, "num_tokens": 5707301.0, "repeat_count": 1.0, - "routers_loss": 0.00536845438182354, + "routers_loss": 0.004982157610356808, "skip_count": 1.0, "step": 3538, "text_loss": 0.40061065554618835 @@ -33628,13 +33628,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.0751953125, "learning_rate": 0.0007946433312919502, - "loss": 0.0076, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5710847.0, "repeat_count": 0.0, - "routers_loss": 0.0030090278014540672, + "routers_loss": 0.003067734418436885, "skip_count": 0.0, "step": 3540, "text_loss": 0.5396234393119812 @@ -33647,13 +33647,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.055419921875, + "grad_norm": 0.05224609375, "learning_rate": 0.0007943932090234486, - "loss": 0.0098, + "loss": 0.0097, "macro_f1": 0.5492662787437439, "num_tokens": 5713683.0, "repeat_count": 0.0, - "routers_loss": 0.03756432980298996, + "routers_loss": 0.03728383034467697, "skip_count": 2.0, "step": 3542, "text_loss": 0.18310914933681488 @@ -33666,13 +33666,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.027587890625, + "grad_norm": 0.03271484375, "learning_rate": 0.0007941429739429138, - "loss": 0.0037, + "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 5716397.0, "repeat_count": 0.0, - "routers_loss": 0.002606320893391967, + "routers_loss": 0.0025092530995607376, "skip_count": 3.0, "step": 3544, "text_loss": 0.5806207060813904 @@ -33685,13 +33685,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0361328125, + "grad_norm": 0.040283203125, "learning_rate": 0.0007938926261462366, - "loss": 0.007, + "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 5719984.0, "repeat_count": 0.0, - "routers_loss": 0.0025650030001997948, + "routers_loss": 0.002493767999112606, "skip_count": 0.0, "step": 3546, "text_loss": 0.38606807589530945 @@ -33704,13 +33704,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.044677734375, + "grad_norm": 0.05078125, "learning_rate": 0.0007936421657293507, "loss": 0.0094, "macro_f1": 0.8823530077934265, "num_tokens": 5723571.0, "repeat_count": 1.0, - "routers_loss": 0.013521218672394753, + "routers_loss": 0.014810923486948013, "skip_count": 2.0, "step": 3548, "text_loss": 0.49558472633361816 @@ -33723,13 +33723,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0240478515625, + "grad_norm": 0.0284423828125, "learning_rate": 0.0007933915927882327, - "loss": 0.0071, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 5726405.0, "repeat_count": 0.0, - "routers_loss": 0.0014581449795514345, + "routers_loss": 0.00152928801253438, "skip_count": 0.0, "step": 3550, "text_loss": 0.8674797415733337 @@ -33742,13 +33742,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.0390625, "learning_rate": 0.000793140907418903, - "loss": 0.0077, + "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 5729955.0, "repeat_count": 0.0, - "routers_loss": 0.005775467026978731, + "routers_loss": 0.005522782914340496, "skip_count": 2.0, "step": 3552, "text_loss": 0.3274473249912262 @@ -33761,13 +33761,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.0322265625, "learning_rate": 0.0007928901097174248, - "loss": 0.0083, + "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 5733030.0, "repeat_count": 0.0, - "routers_loss": 0.008668854832649231, + "routers_loss": 0.009207013063132763, "skip_count": 2.0, "step": 3554, "text_loss": 0.18237128853797913 @@ -33780,13 +33780,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056884765625, + "grad_norm": 0.0693359375, "learning_rate": 0.0007926391997799039, - "loss": 0.0068, + "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 5735978.0, "repeat_count": 0.0, - "routers_loss": 0.007210119627416134, + "routers_loss": 0.00695531303063035, "skip_count": 0.0, "step": 3556, "text_loss": 0.3266434967517853 @@ -33799,13 +33799,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.048583984375, + "grad_norm": 0.05419921875, "learning_rate": 0.0007923881777024898, - "loss": 0.0065, + "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 5738901.0, "repeat_count": 0.0, - "routers_loss": 0.00165808224119246, + "routers_loss": 0.002743212040513754, "skip_count": 1.0, "step": 3558, "text_loss": 0.4971913695335388 @@ -33818,13 +33818,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.049560546875, + "grad_norm": 0.04931640625, "learning_rate": 0.0007921370435813741, - "loss": 0.0081, + "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 5741946.0, "repeat_count": 1.0, - "routers_loss": 0.007618873380124569, + "routers_loss": 0.007037297356873751, "skip_count": 0.0, "step": 3560, "text_loss": 0.5645473599433899 @@ -33837,13 +33837,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047607421875, + "grad_norm": 0.05419921875, "learning_rate": 0.0007918857975127924, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5744987.0, "repeat_count": 0.0, - "routers_loss": 0.0031584161333739758, + "routers_loss": 0.0030746585689485073, "skip_count": 0.0, "step": 3562, "text_loss": 0.17717665433883667 @@ -33856,13 +33856,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0537109375, + "grad_norm": 0.058349609375, "learning_rate": 0.0007916344395930224, - "loss": 0.0079, + "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 5747837.0, "repeat_count": 0.0, - "routers_loss": 0.005207436624914408, + "routers_loss": 0.004522138275206089, "skip_count": 0.0, "step": 3564, "text_loss": 0.7676118612289429 @@ -33875,13 +33875,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.036865234375, "learning_rate": 0.000791382969918385, - "loss": 0.0074, + "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 5750716.0, "repeat_count": 0.0, - "routers_loss": 0.0023729163222014904, + "routers_loss": 0.0026240211445838213, "skip_count": 0.0, "step": 3566, "text_loss": 0.4975173771381378 @@ -33894,13 +33894,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.061767578125, + "grad_norm": 0.06396484375, "learning_rate": 0.000791131388585244, - "loss": 0.0115, + "loss": 0.011, "macro_f1": 0.8820862174034119, "num_tokens": 5754368.0, "repeat_count": 2.0, - "routers_loss": 0.021537931635975838, + "routers_loss": 0.021831991150975227, "skip_count": 2.0, "step": 3568, "text_loss": 0.9670342206954956 @@ -33913,13 +33913,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.02734375, + "grad_norm": 0.03369140625, "learning_rate": 0.0007908796956900055, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5757076.0, "repeat_count": 1.0, - "routers_loss": 0.001752255018800497, + "routers_loss": 0.0017586691537871957, "skip_count": 0.0, "step": 3570, "text_loss": 0.3057977259159088 @@ -33932,13 +33932,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.043701171875, + "grad_norm": 0.05224609375, "learning_rate": 0.000790627891329119, - "loss": 0.006, + "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5760613.0, "repeat_count": 0.0, - "routers_loss": 0.00557586969807744, + "routers_loss": 0.005515786819159985, "skip_count": 0.0, "step": 3572, "text_loss": 0.5860086679458618 @@ -33951,13 +33951,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.04296875, "learning_rate": 0.0007903759755990763, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 5763557.0, "repeat_count": 0.0, - "routers_loss": 0.004236271139234304, + "routers_loss": 0.004096484277397394, "skip_count": 0.0, "step": 3574, "text_loss": 0.17175781726837158 @@ -33970,13 +33970,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.04541015625, "learning_rate": 0.000790123948596412, "loss": 0.0119, "macro_f1": 0.6666666865348816, "num_tokens": 5767430.0, "repeat_count": 1.0, - "routers_loss": 0.003505093976855278, + "routers_loss": 0.005216122139245272, "skip_count": 0.0, "step": 3576, "text_loss": 0.7520374059677124 @@ -33989,13 +33989,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06640625, + "grad_norm": 0.07177734375, "learning_rate": 0.0007898718104177031, - "loss": 0.011, + "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 5770175.0, "repeat_count": 0.0, - "routers_loss": 0.0039036881644278765, + "routers_loss": 0.0037980107590556145, "skip_count": 0.0, "step": 3578, "text_loss": 0.18117885291576385 @@ -34008,13 +34008,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.04541015625, "learning_rate": 0.0007896195611595699, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5773032.0, "repeat_count": 0.0, - "routers_loss": 0.00450134975835681, + "routers_loss": 0.003672175807878375, "skip_count": 2.0, "step": 3580, "text_loss": 0.7241058349609375 @@ -34027,13 +34027,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.0615234375, "learning_rate": 0.0007893672009186744, - "loss": 0.0082, + "loss": 0.0083, "macro_f1": 1.0, "num_tokens": 5776077.0, "repeat_count": 1.0, - "routers_loss": 0.01287894882261753, + "routers_loss": 0.01229850109666586, "skip_count": 3.0, "step": 3582, "text_loss": 0.29140418767929077 @@ -34046,13 +34046,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0284423828125, + "grad_norm": 0.03271484375, "learning_rate": 0.0007891147297917216, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5779088.0, "repeat_count": 1.0, - "routers_loss": 0.003500303253531456, + "routers_loss": 0.0035251814406365156, "skip_count": 0.0, "step": 3584, "text_loss": 0.1727485954761505 @@ -34065,13 +34065,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05615234375, + "grad_norm": 0.055908203125, "learning_rate": 0.000788862147875459, - "loss": 0.0093, + "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 5782201.0, "repeat_count": 0.0, - "routers_loss": 0.0042770374566316605, + "routers_loss": 0.004725661128759384, "skip_count": 2.0, "step": 3586, "text_loss": 0.43512848019599915 @@ -34084,13 +34084,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.057861328125, + "grad_norm": 0.06396484375, "learning_rate": 0.0007886094552666765, - "loss": 0.0107, + "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 5785039.0, "repeat_count": 0.0, - "routers_loss": 0.005349197890609503, + "routers_loss": 0.005632172804325819, "skip_count": 0.0, "step": 3588, "text_loss": 0.3534786105155945 @@ -34103,13 +34103,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0654296875, + "grad_norm": 0.0556640625, "learning_rate": 0.0007883566520622062, - "loss": 0.0114, + "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 5788017.0, "repeat_count": 0.0, - "routers_loss": 0.008142824284732342, + "routers_loss": 0.006249965168535709, "skip_count": 1.0, "step": 3590, "text_loss": 0.2089710384607315 @@ -34122,13 +34122,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0302734375, + "grad_norm": 0.02978515625, "learning_rate": 0.0007881037383589229, - "loss": 0.0071, + "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 5791168.0, "repeat_count": 0.0, - "routers_loss": 0.0013415004359558225, + "routers_loss": 0.0013797614956274629, "skip_count": 0.0, "step": 3592, "text_loss": 0.4349329471588135 @@ -34141,13 +34141,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.06982421875, "learning_rate": 0.0007878507142537436, - "loss": 0.0089, + "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 5793927.0, "repeat_count": 0.0, - "routers_loss": 0.0022349755745381117, + "routers_loss": 0.0019719740375876427, "skip_count": 1.0, "step": 3594, "text_loss": 0.6087368726730347 @@ -34160,13 +34160,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.036865234375, "learning_rate": 0.0007875975798436274, - "loss": 0.0058, + "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 5797214.0, "repeat_count": 1.0, - "routers_loss": 0.0037436108104884624, + "routers_loss": 0.0037070370744913816, "skip_count": 0.0, "step": 3596, "text_loss": 0.4258122444152832 @@ -34179,13 +34179,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0458984375, + "grad_norm": 0.048583984375, "learning_rate": 0.0007873443352255764, - "loss": 0.009, + "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5800691.0, "repeat_count": 0.0, - "routers_loss": 0.008491694927215576, + "routers_loss": 0.008431311696767807, "skip_count": 0.0, "step": 3598, "text_loss": 0.6006711721420288 @@ -34198,13 +34198,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052490234375, + "grad_norm": 0.055419921875, "learning_rate": 0.0007870909804966337, - "loss": 0.0075, + "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5804712.0, "repeat_count": 0.0, - "routers_loss": 0.0020895113702863455, + "routers_loss": 0.0017720256000757217, "skip_count": 0.0, "step": 3600, "text_loss": 0.6055042743682861 @@ -34217,13 +34217,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.053955078125, + "grad_norm": 0.0517578125, "learning_rate": 0.0007868375157538861, - "loss": 0.0086, + "loss": 0.0083, "macro_f1": 0.3272727429866791, "num_tokens": 5807670.0, "repeat_count": 1.0, - "routers_loss": 0.01193003449589014, + "routers_loss": 0.010697763413190842, "skip_count": 0.0, "step": 3602, "text_loss": 0.8039056658744812 @@ -34236,13 +34236,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.043212890625, + "grad_norm": 0.04150390625, "learning_rate": 0.0007865839410944611, - "loss": 0.008, + "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 5810880.0, "repeat_count": 1.0, - "routers_loss": 0.003107197815552354, + "routers_loss": 0.0030022128485143185, "skip_count": 0.0, "step": 3604, "text_loss": 0.596110463142395 @@ -34255,13 +34255,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.03173828125, "learning_rate": 0.0007863302566155295, - "loss": 0.0098, + "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 5814171.0, "repeat_count": 0.0, - "routers_loss": 0.0075443098321557045, + "routers_loss": 0.006257854867726564, "skip_count": 2.0, "step": 3606, "text_loss": 0.5700319409370422 @@ -34274,13 +34274,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.02734375, + "grad_norm": 0.0294189453125, "learning_rate": 0.0007860764624143031, - "loss": 0.0053, + "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 5817607.0, "repeat_count": 1.0, - "routers_loss": 0.005313992965966463, + "routers_loss": 0.004838473163545132, "skip_count": 0.0, "step": 3608, "text_loss": 0.8319530487060547 @@ -34293,13 +34293,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09716796875, + "grad_norm": 0.08154296875, "learning_rate": 0.0007858225585880369, - "loss": 0.0069, + "loss": 0.0067, "macro_f1": 0.8823530077934265, "num_tokens": 5821452.0, "repeat_count": 1.0, - "routers_loss": 0.020901991054415703, + "routers_loss": 0.02173662930727005, "skip_count": 2.0, "step": 3610, "text_loss": 0.3738477826118469 @@ -34312,13 +34312,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.035400390625, "learning_rate": 0.0007855685452340269, - "loss": 0.0078, + "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5824683.0, "repeat_count": 0.0, - "routers_loss": 0.002484811469912529, + "routers_loss": 0.0032719180453568697, "skip_count": 0.0, "step": 3612, "text_loss": 0.4054839015007019 @@ -34331,13 +34331,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.0380859375, "learning_rate": 0.0007853144224496118, - "loss": 0.0094, + "loss": 0.0093, "macro_f1": 0.3272727429866791, "num_tokens": 5827860.0, "repeat_count": 1.0, - "routers_loss": 0.032128892838954926, + "routers_loss": 0.032171256840229034, "skip_count": 0.0, "step": 3614, "text_loss": 0.18112395703792572 @@ -34350,13 +34350,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05322265625, + "grad_norm": 0.0458984375, "learning_rate": 0.0007850601903321716, - "loss": 0.0062, + "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 5831651.0, "repeat_count": 0.0, - "routers_loss": 0.0136244622990489, + "routers_loss": 0.013230946846306324, "skip_count": 1.0, "step": 3616, "text_loss": 0.2698844075202942 @@ -34369,13 +34369,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.0361328125, "learning_rate": 0.000784805848979129, - "loss": 0.0057, + "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 5834369.0, "repeat_count": 0.0, - "routers_loss": 0.001705345930531621, + "routers_loss": 0.00162619655020535, "skip_count": 0.0, "step": 3618, "text_loss": 0.2430931180715561 @@ -34388,13 +34388,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0546875, + "grad_norm": 0.0498046875, "learning_rate": 0.0007845513984879477, - "loss": 0.0066, + "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 5838102.0, "repeat_count": 1.0, - "routers_loss": 0.002594438148662448, + "routers_loss": 0.002781603019684553, "skip_count": 0.0, "step": 3620, "text_loss": 0.4968300759792328 @@ -34407,13 +34407,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.031005859375, "learning_rate": 0.0007842968389561337, - "loss": 0.0049, + "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 5841029.0, "repeat_count": 0.0, - "routers_loss": 0.0019142795354127884, + "routers_loss": 0.0023873315658420324, "skip_count": 0.0, "step": 3622, "text_loss": 0.5842974781990051 @@ -34426,13 +34426,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.03955078125, "learning_rate": 0.0007840421704812346, - "loss": 0.0093, + "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 5845158.0, "repeat_count": 0.0, - "routers_loss": 0.004223407246172428, + "routers_loss": 0.00400173757225275, "skip_count": 1.0, "step": 3624, "text_loss": 0.8312450647354126 @@ -34445,13 +34445,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03466796875, + "grad_norm": 0.035888671875, "learning_rate": 0.00078378739316084, - "loss": 0.0092, + "loss": 0.0094, "macro_f1": 0.3333333432674408, "num_tokens": 5849175.0, "repeat_count": 0.0, - "routers_loss": 0.0005486982990987599, + "routers_loss": 0.0004974664188921452, "skip_count": 0.0, "step": 3626, "text_loss": 0.48637253046035767 @@ -34464,13 +34464,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.888888955116272, - "grad_norm": 0.0654296875, + "grad_norm": 0.10693359375, "learning_rate": 0.000783532507092581, - "loss": 0.0077, + "loss": 0.0079, "macro_f1": 0.9555556178092957, "num_tokens": 5852020.0, "repeat_count": 1.0, - "routers_loss": 0.025490080937743187, + "routers_loss": 0.02555239573121071, "skip_count": 5.0, "step": 3628, "text_loss": 0.5407033562660217 @@ -34483,13 +34483,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.041259765625, "learning_rate": 0.0007832775123741306, - "loss": 0.0104, + "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 5854873.0, "repeat_count": 0.0, - "routers_loss": 0.0026199028361588717, + "routers_loss": 0.0025962977670133114, "skip_count": 0.0, "step": 3630, "text_loss": 0.618230938911438 @@ -34502,13 +34502,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0225830078125, + "grad_norm": 0.0234375, "learning_rate": 0.000783022409103203, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5858086.0, "repeat_count": 0.0, - "routers_loss": 0.0028729604091495275, + "routers_loss": 0.0029271875973790884, "skip_count": 0.0, "step": 3632, "text_loss": 0.21259798109531403 @@ -34521,13 +34521,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05419921875, + "grad_norm": 0.064453125, "learning_rate": 0.0007827671973775542, - "loss": 0.0069, + "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 5860886.0, "repeat_count": 0.0, - "routers_loss": 0.004097428172826767, + "routers_loss": 0.004102068953216076, "skip_count": 0.0, "step": 3634, "text_loss": 0.4991208016872406 @@ -34540,13 +34540,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0302734375, + "grad_norm": 0.033203125, "learning_rate": 0.0007825118772949819, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 5864291.0, "repeat_count": 0.0, - "routers_loss": 0.002142589772120118, + "routers_loss": 0.0023497689981013536, "skip_count": 1.0, "step": 3636, "text_loss": 0.3878401517868042 @@ -34559,13 +34559,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0206298828125, + "grad_norm": 0.0216064453125, "learning_rate": 0.0007822564489533255, - "loss": 0.005, + "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 5867155.0, "repeat_count": 0.0, - "routers_loss": 0.006497112102806568, + "routers_loss": 0.007680345326662064, "skip_count": 2.0, "step": 3638, "text_loss": 0.6132124066352844 @@ -34578,13 +34578,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.053466796875, "learning_rate": 0.0007820009124504653, - "loss": 0.0095, + "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5870325.0, "repeat_count": 0.0, - "routers_loss": 0.0008698388119228184, + "routers_loss": 0.0008242831099778414, "skip_count": 0.0, "step": 3640, "text_loss": 0.3552473187446594 @@ -34597,13 +34597,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.04296875, "learning_rate": 0.0007817452678843236, - "loss": 0.0071, + "loss": 0.0073, "macro_f1": 0.6601307392120361, "num_tokens": 5873301.0, "repeat_count": 1.0, - "routers_loss": 0.022245829924941063, + "routers_loss": 0.023831043392419815, "skip_count": 2.0, "step": 3642, "text_loss": 0.18363867700099945 @@ -34616,13 +34616,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.024658203125, + "grad_norm": 0.0260009765625, "learning_rate": 0.0007814895153528635, - "loss": 0.0071, + "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5876225.0, "repeat_count": 0.0, - "routers_loss": 0.0020051905885338783, + "routers_loss": 0.001999989850446582, "skip_count": 0.0, "step": 3644, "text_loss": 0.17581747472286224 @@ -34635,13 +34635,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025146484375, + "grad_norm": 0.028564453125, "learning_rate": 0.0007812336549540903, - "loss": 0.0071, + "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5879501.0, "repeat_count": 0.0, - "routers_loss": 0.0014994015218690038, + "routers_loss": 0.001098626758903265, "skip_count": 0.0, "step": 3646, "text_loss": 0.5040884613990784 @@ -34654,13 +34654,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0294189453125, + "grad_norm": 0.03076171875, "learning_rate": 0.0007809776867860499, - "loss": 0.0051, + "loss": 0.005, "macro_f1": 0.3272727429866791, "num_tokens": 5882608.0, "repeat_count": 0.0, - "routers_loss": 0.010847748257219791, + "routers_loss": 0.012210183776915073, "skip_count": 1.0, "step": 3648, "text_loss": 0.27114811539649963 @@ -34673,13 +34673,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0262451171875, + "grad_norm": 0.032958984375, "learning_rate": 0.00078072161094683, - "loss": 0.006, + "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 5886106.0, "repeat_count": 0.0, - "routers_loss": 0.005927151069045067, + "routers_loss": 0.005191771313548088, "skip_count": 2.0, "step": 3650, "text_loss": 0.5167917609214783 @@ -34692,13 +34692,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.024169921875, + "grad_norm": 0.0235595703125, "learning_rate": 0.0007804654275345591, - "loss": 0.0061, + "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 5889122.0, "repeat_count": 0.0, - "routers_loss": 0.0019531139405444264, + "routers_loss": 0.0016411367105320096, "skip_count": 1.0, "step": 3652, "text_loss": 0.7691274285316467 @@ -34711,13 +34711,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.0277099609375, + "grad_norm": 0.03515625, "learning_rate": 0.0007802091366474074, - "loss": 0.0052, + "loss": 0.005, "macro_f1": 0.8823530077934265, "num_tokens": 5892313.0, "repeat_count": 2.0, - "routers_loss": 0.015216727741062641, + "routers_loss": 0.015627093613147736, "skip_count": 1.0, "step": 3654, "text_loss": 0.4646325409412384 @@ -34730,13 +34730,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0311279296875, + "grad_norm": 0.0341796875, "learning_rate": 0.0007799527383835858, - "loss": 0.0067, + "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 5895577.0, "repeat_count": 0.0, - "routers_loss": 0.0009810501942411065, + "routers_loss": 0.0009879748104140162, "skip_count": 0.0, "step": 3656, "text_loss": 0.5587969422340393 @@ -34749,13 +34749,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.072265625, + "grad_norm": 0.0986328125, "learning_rate": 0.0007796962328413469, - "loss": 0.0093, + "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5898546.0, "repeat_count": 0.0, - "routers_loss": 0.00458681071177125, + "routers_loss": 0.004864919930696487, "skip_count": 0.0, "step": 3658, "text_loss": 0.6981375813484192 @@ -34768,13 +34768,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.029052734375, + "grad_norm": 0.033447265625, "learning_rate": 0.0007794396201189839, - "loss": 0.0076, + "loss": 0.0078, "macro_f1": 1.0, "num_tokens": 5901618.0, "repeat_count": 1.0, - "routers_loss": 0.006519644521176815, + "routers_loss": 0.006617432460188866, "skip_count": 2.0, "step": 3660, "text_loss": 0.22521957755088806 @@ -34787,13 +34787,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.036865234375, "learning_rate": 0.0007791829003148312, - "loss": 0.0097, + "loss": 0.0098, "macro_f1": 0.6601307392120361, "num_tokens": 5904540.0, "repeat_count": 1.0, - "routers_loss": 0.0783558189868927, + "routers_loss": 0.0782252699136734, "skip_count": 2.0, "step": 3662, "text_loss": 0.2649642825126648 @@ -34806,13 +34806,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.06494140625, "learning_rate": 0.0007789260735272647, - "loss": 0.0115, + "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 5907827.0, "repeat_count": 0.0, - "routers_loss": 0.0012588179670274258, + "routers_loss": 0.0012057392159476876, "skip_count": 0.0, "step": 3664, "text_loss": 0.6943771243095398 @@ -34825,13 +34825,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0177001953125, + "grad_norm": 0.018310546875, "learning_rate": 0.0007786691398547005, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 5911163.0, "repeat_count": 0.0, - "routers_loss": 0.0075621698051691055, + "routers_loss": 0.007476957980543375, "skip_count": 2.0, "step": 3666, "text_loss": 0.1502683162689209 @@ -34844,13 +34844,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0303955078125, + "grad_norm": 0.0322265625, "learning_rate": 0.0007784120993955962, - "loss": 0.0056, + "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 5913948.0, "repeat_count": 1.0, - "routers_loss": 0.00408853217959404, + "routers_loss": 0.004082011990249157, "skip_count": 0.0, "step": 3668, "text_loss": 0.4127517640590668 @@ -34863,13 +34863,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, - "grad_norm": 0.038818359375, + "grad_norm": 0.041259765625, "learning_rate": 0.0007781549522484503, - "loss": 0.0067, + "loss": 0.0066, "macro_f1": 0.9265305995941162, "num_tokens": 5917360.0, "repeat_count": 3.0, - "routers_loss": 0.02851647138595581, + "routers_loss": 0.027505695819854736, "skip_count": 1.0, "step": 3670, "text_loss": 0.23892618715763092 @@ -34882,13 +34882,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.031005859375, + "grad_norm": 0.0306396484375, "learning_rate": 0.0007778976985118018, - "loss": 0.0086, + "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 5920524.0, "repeat_count": 0.0, - "routers_loss": 0.0030399872921407223, + "routers_loss": 0.0024977331049740314, "skip_count": 2.0, "step": 3672, "text_loss": 0.5076471567153931 @@ -34901,13 +34901,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05908203125, + "grad_norm": 0.0576171875, "learning_rate": 0.0007776403382842312, - "loss": 0.0061, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5923632.0, "repeat_count": 0.0, - "routers_loss": 0.0014176326803863049, + "routers_loss": 0.0015700991498306394, "skip_count": 0.0, "step": 3674, "text_loss": 0.6287924647331238 @@ -34920,13 +34920,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.05810546875, "learning_rate": 0.0007773828716643591, - "loss": 0.0084, + "loss": 0.0085, "macro_f1": 0.3272727429866791, "num_tokens": 5926438.0, "repeat_count": 1.0, - "routers_loss": 0.0505419559776783, + "routers_loss": 0.05108916014432907, "skip_count": 0.0, "step": 3676, "text_loss": 0.26517006754875183 @@ -34939,13 +34939,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.03857421875, "learning_rate": 0.0007771252987508474, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 5930081.0, "repeat_count": 0.0, - "routers_loss": 0.0034831957891583443, + "routers_loss": 0.003439917229115963, "skip_count": 0.0, "step": 3678, "text_loss": 0.5189079642295837 @@ -34958,13 +34958,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.056884765625, "learning_rate": 0.0007768676196423984, "loss": 0.0064, "macro_f1": 1.0, "num_tokens": 5933463.0, "repeat_count": 1.0, - "routers_loss": 0.0020620382856577635, + "routers_loss": 0.001935846172273159, "skip_count": 1.0, "step": 3680, "text_loss": 0.6703575849533081 @@ -34972,18 +34972,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 17.286469034341064, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037109375, + "grad_norm": 0.035400390625, "learning_rate": 0.0007766098344377553, - "loss": 0.0084, - "macro_f1": 0.32098764181137085, + "loss": 0.0082, + "macro_f1": 0.31446540355682373, "num_tokens": 5937098.0, "repeat_count": 0.0, - "routers_loss": 0.03850153833627701, + "routers_loss": 0.0384826585650444, "skip_count": 2.0, "step": 3682, "text_loss": 0.6424444913864136 @@ -34996,13 +34996,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031982421875, + "grad_norm": 0.0301513671875, "learning_rate": 0.0007763519432357018, - "loss": 0.0065, + "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 5940436.0, "repeat_count": 0.0, - "routers_loss": 0.000853471748996526, + "routers_loss": 0.0008654671837575734, "skip_count": 0.0, "step": 3684, "text_loss": 0.4189988672733307 @@ -35015,13 +35015,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05712890625, + "grad_norm": 0.05908203125, "learning_rate": 0.0007760939461350623, - "loss": 0.0107, + "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 5943731.0, "repeat_count": 0.0, - "routers_loss": 0.007630084175616503, + "routers_loss": 0.007468715775758028, "skip_count": 2.0, "step": 3686, "text_loss": 0.2875453233718872 @@ -35036,11 +35036,11 @@ "f1_skip": 0.0, "grad_norm": 0.041259765625, "learning_rate": 0.0007758358432347019, - "loss": 0.0061, + "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 5946707.0, "repeat_count": 0.0, - "routers_loss": 0.001303135184571147, + "routers_loss": 0.001252831774763763, "skip_count": 0.0, "step": 3688, "text_loss": 0.5093055367469788 @@ -35053,13 +35053,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.03271484375, "learning_rate": 0.0007755776346335259, - "loss": 0.0058, + "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 5949833.0, "repeat_count": 0.0, - "routers_loss": 0.001894078915938735, + "routers_loss": 0.001680848654359579, "skip_count": 0.0, "step": 3690, "text_loss": 0.4031114876270294 @@ -35072,13 +35072,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.025146484375, + "grad_norm": 0.0255126953125, "learning_rate": 0.0007753193204304807, - "loss": 0.0056, + "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 5953095.0, "repeat_count": 0.0, - "routers_loss": 0.005708714015781879, + "routers_loss": 0.0047258250415325165, "skip_count": 2.0, "step": 3692, "text_loss": 0.17632785439491272 @@ -35091,13 +35091,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.03564453125, + "grad_norm": 0.036376953125, "learning_rate": 0.0007750609007245524, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 5955971.0, "repeat_count": 2.0, - "routers_loss": 0.0019924843218177557, + "routers_loss": 0.001980359200388193, "skip_count": 4.0, "step": 3694, "text_loss": 0.3423727750778198 @@ -35110,13 +35110,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0255126953125, + "grad_norm": 0.0238037109375, "learning_rate": 0.0007748023756147679, - "loss": 0.007, + "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 5958948.0, "repeat_count": 0.0, - "routers_loss": 0.005303190555423498, + "routers_loss": 0.00511702848598361, "skip_count": 0.0, "step": 3696, "text_loss": 0.28279972076416016 @@ -35129,13 +35129,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.03662109375, "learning_rate": 0.0007745437452001949, - "loss": 0.0063, + "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 5961819.0, "repeat_count": 0.0, - "routers_loss": 0.0004839526955038309, + "routers_loss": 0.0005220443126745522, "skip_count": 0.0, "step": 3698, "text_loss": 0.4793325662612915 @@ -35148,13 +35148,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.0400390625, "learning_rate": 0.0007742850095799408, - "loss": 0.0083, + "loss": 0.0084, "macro_f1": 0.3272727429866791, "num_tokens": 5964625.0, "repeat_count": 1.0, - "routers_loss": 0.06377380341291428, + "routers_loss": 0.06411020457744598, "skip_count": 0.0, "step": 3700, "text_loss": 0.2825184464454651 @@ -35167,13 +35167,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0654296875, + "grad_norm": 0.0751953125, "learning_rate": 0.0007740261688531536, - "loss": 0.007, + "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 5967134.0, "repeat_count": 0.0, - "routers_loss": 0.00462002120912075, + "routers_loss": 0.004408109001815319, "skip_count": 3.0, "step": 3702, "text_loss": 0.690429151058197 @@ -35186,13 +35186,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0228271484375, + "grad_norm": 0.0279541015625, "learning_rate": 0.0007737672231190215, - "loss": 0.0033, + "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 5969831.0, "repeat_count": 0.0, - "routers_loss": 0.0006775400252081454, + "routers_loss": 0.0006747521692886949, "skip_count": 0.0, "step": 3704, "text_loss": 0.32556024193763733 @@ -35205,13 +35205,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02783203125, + "grad_norm": 0.031005859375, "learning_rate": 0.0007735081724767732, - "loss": 0.0061, + "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 5973015.0, "repeat_count": 0.0, - "routers_loss": 0.001372992410324514, + "routers_loss": 0.0020414739847183228, "skip_count": 0.0, "step": 3706, "text_loss": 0.5876469612121582 @@ -35224,13 +35224,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.072265625, "learning_rate": 0.0007732490170256769, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 5975778.0, "repeat_count": 1.0, - "routers_loss": 0.005310074891895056, + "routers_loss": 0.005610425490885973, "skip_count": 0.0, "step": 3708, "text_loss": 0.2968577444553375 @@ -35243,13 +35243,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05078125, + "grad_norm": 0.05419921875, "learning_rate": 0.0007729897568650422, - "loss": 0.01, + "loss": 0.0097, "macro_f1": 0.3333333432674408, "num_tokens": 5979115.0, "repeat_count": 0.0, - "routers_loss": 0.0012178041506558657, + "routers_loss": 0.001248046406544745, "skip_count": 0.0, "step": 3710, "text_loss": 0.626361608505249 @@ -35262,13 +35262,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0595703125, + "grad_norm": 0.06787109375, "learning_rate": 0.0007727303920942176, - "loss": 0.01, + "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 5982213.0, "repeat_count": 0.0, - "routers_loss": 0.004617640748620033, + "routers_loss": 0.005791695322841406, "skip_count": 2.0, "step": 3712, "text_loss": 0.4133484661579132 @@ -35281,13 +35281,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0791015625, + "grad_norm": 0.08740234375, "learning_rate": 0.0007724709228125922, - "loss": 0.0106, + "loss": 0.0105, "macro_f1": 0.5492662787437439, "num_tokens": 5984930.0, "repeat_count": 0.0, - "routers_loss": 0.020924020558595657, + "routers_loss": 0.02114664763212204, "skip_count": 2.0, "step": 3714, "text_loss": 0.4646461308002472 @@ -35300,13 +35300,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.032958984375, "learning_rate": 0.0007722113491195952, - "loss": 0.0059, + "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 5988017.0, "repeat_count": 2.0, - "routers_loss": 0.0053578754886984825, + "routers_loss": 0.005913930479437113, "skip_count": 5.0, "step": 3716, "text_loss": 0.15474505722522736 @@ -35319,13 +35319,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.026123046875, + "grad_norm": 0.02685546875, "learning_rate": 0.0007719516711146957, - "loss": 0.0075, + "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 5991562.0, "repeat_count": 0.0, - "routers_loss": 0.006991801783442497, + "routers_loss": 0.0075925313867628574, "skip_count": 2.0, "step": 3718, "text_loss": 0.5293686985969543 @@ -35338,13 +35338,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031005859375, + "grad_norm": 0.037353515625, "learning_rate": 0.000771691888897403, - "loss": 0.0054, + "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 5994675.0, "repeat_count": 0.0, - "routers_loss": 0.0011527709430083632, + "routers_loss": 0.0012335237115621567, "skip_count": 0.0, "step": 3720, "text_loss": 0.5210637450218201 @@ -35357,13 +35357,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.0771484375, "learning_rate": 0.0007714320025672657, - "loss": 0.008, + "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 5999070.0, "repeat_count": 0.0, - "routers_loss": 0.011113573797047138, + "routers_loss": 0.010582062415778637, "skip_count": 2.0, "step": 3722, "text_loss": 0.2783571779727936 @@ -35376,13 +35376,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03369140625, + "grad_norm": 0.032958984375, "learning_rate": 0.000771172012223873, - "loss": 0.008, + "loss": 0.0078, "macro_f1": 0.6598639488220215, "num_tokens": 6002702.0, "repeat_count": 1.0, - "routers_loss": 0.014584671705961227, + "routers_loss": 0.015008784830570221, "skip_count": 3.0, "step": 3724, "text_loss": 0.358705073595047 @@ -35395,13 +35395,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05419921875, + "grad_norm": 0.052734375, "learning_rate": 0.0007709119179668538, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6005517.0, "repeat_count": 0.0, - "routers_loss": 0.001164636923931539, + "routers_loss": 0.00111615180503577, "skip_count": 0.0, "step": 3726, "text_loss": 0.45202162861824036 @@ -35414,13 +35414,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.034912109375, "learning_rate": 0.0007706517198958764, - "loss": 0.0098, + "loss": 0.0096, "macro_f1": 0.6595745086669922, "num_tokens": 6009111.0, "repeat_count": 1.0, - "routers_loss": 0.05235295370221138, + "routers_loss": 0.05215252563357353, "skip_count": 4.0, "step": 3728, "text_loss": 0.20360413193702698 @@ -35433,13 +35433,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05859375, + "grad_norm": 0.053955078125, "learning_rate": 0.0007703914181106497, - "loss": 0.0077, + "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 6012989.0, "repeat_count": 0.0, - "routers_loss": 0.01087163109332323, + "routers_loss": 0.010039499960839748, "skip_count": 3.0, "step": 3730, "text_loss": 0.20334361493587494 @@ -35452,13 +35452,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07177734375, + "grad_norm": 0.08203125, "learning_rate": 0.0007701310127109211, - "loss": 0.0063, + "loss": 0.0062, "macro_f1": 0.3272727429866791, "num_tokens": 6016420.0, "repeat_count": 0.0, - "routers_loss": 0.010110805742442608, + "routers_loss": 0.01090205181390047, "skip_count": 1.0, "step": 3732, "text_loss": 0.47959551215171814 @@ -35471,13 +35471,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, - "grad_norm": 0.03564453125, + "grad_norm": 0.0341796875, "learning_rate": 0.0007698705037964791, - "loss": 0.0078, + "loss": 0.0076, "macro_f1": 0.6225374937057495, "num_tokens": 6019551.0, "repeat_count": 0.0, - "routers_loss": 0.026909299194812775, + "routers_loss": 0.02677762135863304, "skip_count": 5.0, "step": 3734, "text_loss": 0.2621438801288605 @@ -35490,13 +35490,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.044921875, + "grad_norm": 0.056640625, "learning_rate": 0.000769609891467151, - "loss": 0.0122, + "loss": 0.0119, "macro_f1": 0.6666666865348816, "num_tokens": 6022262.0, "repeat_count": 1.0, - "routers_loss": 0.003602684009820223, + "routers_loss": 0.00460716662928462, "skip_count": 0.0, "step": 3736, "text_loss": 0.3433022201061249 @@ -35509,13 +35509,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.037109375, "learning_rate": 0.0007693491758228037, - "loss": 0.005, + "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 6025723.0, "repeat_count": 0.0, - "routers_loss": 0.00290105608291924, + "routers_loss": 0.0036111194640398026, "skip_count": 2.0, "step": 3738, "text_loss": 0.38703784346580505 @@ -35528,13 +35528,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.037841796875, "learning_rate": 0.0007690883569633442, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6028652.0, "repeat_count": 0.0, - "routers_loss": 0.0031469720415771008, + "routers_loss": 0.003299296135082841, "skip_count": 0.0, "step": 3740, "text_loss": 0.24203069508075714 @@ -35547,13 +35547,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.027587890625, + "grad_norm": 0.0277099609375, "learning_rate": 0.0007688274349887188, - "loss": 0.0048, + "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 6032280.0, "repeat_count": 0.0, - "routers_loss": 0.0029467069543898106, + "routers_loss": 0.003173880511894822, "skip_count": 0.0, "step": 3742, "text_loss": 0.2827291488647461 @@ -35566,13 +35566,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031494140625, + "grad_norm": 0.0302734375, "learning_rate": 0.0007685664099989131, - "loss": 0.0074, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 6035111.0, "repeat_count": 0.0, - "routers_loss": 0.0009511710377410054, + "routers_loss": 0.0008576177642680705, "skip_count": 0.0, "step": 3744, "text_loss": 0.43613526225090027 @@ -35585,13 +35585,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0252685546875, + "grad_norm": 0.0274658203125, "learning_rate": 0.0007683052820939524, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 6038428.0, "repeat_count": 0.0, - "routers_loss": 0.004079817794263363, + "routers_loss": 0.004335585981607437, "skip_count": 2.0, "step": 3746, "text_loss": 1.0385624170303345 @@ -35604,13 +35604,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.04052734375, "learning_rate": 0.0007680440513739015, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 6041185.0, "repeat_count": 0.0, - "routers_loss": 0.0007996217464096844, + "routers_loss": 0.0008210531086660922, "skip_count": 0.0, "step": 3748, "text_loss": 0.7070431709289551 @@ -35623,13 +35623,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.041015625, + "grad_norm": 0.056640625, "learning_rate": 0.0007677827179388646, - "loss": 0.0088, + "loss": 0.0089, "macro_f1": 1.0, "num_tokens": 6046333.0, "repeat_count": 1.0, - "routers_loss": 0.0047629233449697495, + "routers_loss": 0.003778942162171006, "skip_count": 1.0, "step": 3750, "text_loss": 0.3682238757610321 @@ -35642,13 +35642,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.05908203125, + "grad_norm": 0.08984375, "learning_rate": 0.000767521281888985, - "loss": 0.0087, + "loss": 0.009, "macro_f1": 1.0, "num_tokens": 6049528.0, "repeat_count": 1.0, - "routers_loss": 0.0039178295992314816, + "routers_loss": 0.002767334459349513, "skip_count": 1.0, "step": 3752, "text_loss": 0.7619418501853943 @@ -35661,13 +35661,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03662109375, + "grad_norm": 0.041015625, "learning_rate": 0.0007672597433244455, - "loss": 0.0109, + "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 6053202.0, "repeat_count": 0.0, - "routers_loss": 0.004995788913220167, + "routers_loss": 0.004796457476913929, "skip_count": 2.0, "step": 3754, "text_loss": 0.4157083034515381 @@ -35680,13 +35680,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.062255859375, + "grad_norm": 0.06689453125, "learning_rate": 0.0007669981023454682, - "loss": 0.0125, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 6056609.0, "repeat_count": 0.0, - "routers_loss": 0.0012595724547281861, + "routers_loss": 0.0013067846884950995, "skip_count": 0.0, "step": 3756, "text_loss": 0.4529118537902832 @@ -35699,13 +35699,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0284423828125, + "grad_norm": 0.033447265625, "learning_rate": 0.0007667363590523142, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 6060504.0, "repeat_count": 0.0, - "routers_loss": 0.0012152433628216386, + "routers_loss": 0.0010285493917763233, "skip_count": 0.0, "step": 3758, "text_loss": 0.8363246321678162 @@ -35718,13 +35718,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.055419921875, "learning_rate": 0.0007664745135452844, - "loss": 0.0093, + "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 6063526.0, "repeat_count": 0.0, - "routers_loss": 0.006478998344391584, + "routers_loss": 0.006289863493293524, "skip_count": 3.0, "step": 3760, "text_loss": 0.5313657522201538 @@ -35737,13 +35737,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.05517578125, "learning_rate": 0.0007662125659247183, - "loss": 0.0096, + "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6067147.0, "repeat_count": 0.0, - "routers_loss": 0.003008047351613641, + "routers_loss": 0.0028537956532090902, "skip_count": 0.0, "step": 3762, "text_loss": 0.5668109059333801 @@ -35756,13 +35756,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03369140625, + "grad_norm": 0.039794921875, "learning_rate": 0.0007659505162909949, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 6070350.0, "repeat_count": 0.0, - "routers_loss": 0.002841299632564187, + "routers_loss": 0.0026814753655344248, "skip_count": 0.0, "step": 3764, "text_loss": 0.4983512759208679 @@ -35775,13 +35775,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.055419921875, + "grad_norm": 0.056884765625, "learning_rate": 0.0007656883647445318, - "loss": 0.01, + "loss": 0.0099, "macro_f1": 0.6666666865348816, "num_tokens": 6073091.0, "repeat_count": 0.0, - "routers_loss": 0.006070348434150219, + "routers_loss": 0.005981382913887501, "skip_count": 1.0, "step": 3766, "text_loss": 0.30372318625450134 @@ -35794,13 +35794,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0289306640625, + "grad_norm": 0.028564453125, "learning_rate": 0.0007654261113857863, - "loss": 0.0073, + "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6076244.0, "repeat_count": 0.0, - "routers_loss": 0.0008278369787149131, + "routers_loss": 0.000803640519734472, "skip_count": 0.0, "step": 3768, "text_loss": 0.6100738048553467 @@ -35813,13 +35813,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02392578125, + "grad_norm": 0.027587890625, "learning_rate": 0.0007651637563152539, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 6078936.0, "repeat_count": 0.0, - "routers_loss": 0.001354316365905106, + "routers_loss": 0.0013324898900464177, "skip_count": 0.0, "step": 3770, "text_loss": 0.4733821153640747 @@ -35832,13 +35832,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0242919921875, + "grad_norm": 0.029541015625, "learning_rate": 0.0007649012996334701, - "loss": 0.0051, + "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 6081951.0, "repeat_count": 1.0, - "routers_loss": 0.0019684957806020975, + "routers_loss": 0.0021543330512940884, "skip_count": 0.0, "step": 3772, "text_loss": 0.6794875860214233 @@ -35851,13 +35851,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.04541015625, "learning_rate": 0.0007646387414410085, - "loss": 0.0076, + "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 6085165.0, "repeat_count": 0.0, - "routers_loss": 0.0005270782858133316, + "routers_loss": 0.0005426189745776355, "skip_count": 0.0, "step": 3774, "text_loss": 0.5886107683181763 @@ -35870,13 +35870,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.028076171875, + "grad_norm": 0.0262451171875, "learning_rate": 0.0007643760818384819, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 6088370.0, "repeat_count": 0.0, - "routers_loss": 0.0029050554148852825, + "routers_loss": 0.002537576947361231, "skip_count": 0.0, "step": 3776, "text_loss": 0.23591920733451843 @@ -35889,13 +35889,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.03564453125, "learning_rate": 0.0007641133209265423, - "loss": 0.0064, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 6092319.0, "repeat_count": 0.0, - "routers_loss": 0.0026071348693221807, + "routers_loss": 0.002613696036860347, "skip_count": 0.0, "step": 3778, "text_loss": 0.3217754662036896 @@ -35908,13 +35908,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.051025390625, + "grad_norm": 0.052978515625, "learning_rate": 0.0007638504588058796, - "loss": 0.0101, + "loss": 0.0105, "macro_f1": 0.3333333432674408, "num_tokens": 6095799.0, "repeat_count": 0.0, - "routers_loss": 0.0008351493743248284, + "routers_loss": 0.0007219464750960469, "skip_count": 0.0, "step": 3780, "text_loss": 0.4276983141899109 @@ -35927,13 +35927,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.026611328125, + "grad_norm": 0.0263671875, "learning_rate": 0.0007635874955772234, - "loss": 0.007, + "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 6098789.0, "repeat_count": 0.0, - "routers_loss": 0.005872148554772139, + "routers_loss": 0.005965052172541618, "skip_count": 3.0, "step": 3782, "text_loss": 0.30936646461486816 @@ -35946,13 +35946,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0703125, + "grad_norm": 0.07177734375, "learning_rate": 0.0007633244313413417, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6101631.0, "repeat_count": 0.0, - "routers_loss": 0.0007862916099838912, + "routers_loss": 0.0007469559786841273, "skip_count": 0.0, "step": 3784, "text_loss": 0.44460123777389526 @@ -35965,13 +35965,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.045654296875, "learning_rate": 0.0007630612661990412, - "loss": 0.0098, + "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 6105097.0, "repeat_count": 0.0, - "routers_loss": 0.0037640000227838755, + "routers_loss": 0.004300760570913553, "skip_count": 1.0, "step": 3786, "text_loss": 0.41950157284736633 @@ -35984,13 +35984,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.03857421875, "learning_rate": 0.0007627980002511672, - "loss": 0.0068, + "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 6107847.0, "repeat_count": 0.0, - "routers_loss": 0.0023107193410396576, + "routers_loss": 0.0023050960153341293, "skip_count": 1.0, "step": 3788, "text_loss": 0.48561373353004456 @@ -36003,13 +36003,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03271484375, + "grad_norm": 0.0322265625, "learning_rate": 0.0007625346335986039, - "loss": 0.0066, + "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 6110546.0, "repeat_count": 0.0, - "routers_loss": 0.0017923865234479308, + "routers_loss": 0.0018124044872820377, "skip_count": 0.0, "step": 3790, "text_loss": 0.20882295072078705 @@ -36022,13 +36022,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.0400390625, "learning_rate": 0.0007622711663422735, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6113600.0, "repeat_count": 0.0, - "routers_loss": 0.0007700122077949345, + "routers_loss": 0.0007613401976414025, "skip_count": 0.0, "step": 3792, "text_loss": 0.31751760840415955 @@ -36041,13 +36041,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04248046875, + "grad_norm": 0.0400390625, "learning_rate": 0.0007620075985831375, - "loss": 0.009, + "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 6116916.0, "repeat_count": 0.0, - "routers_loss": 0.004986821208149195, + "routers_loss": 0.005452962126582861, "skip_count": 2.0, "step": 3794, "text_loss": 0.3246645927429199 @@ -36060,13 +36060,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0272216796875, + "grad_norm": 0.0306396484375, "learning_rate": 0.0007617439304221956, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 6120056.0, "repeat_count": 2.0, - "routers_loss": 0.004177430644631386, + "routers_loss": 0.0043787881731987, "skip_count": 0.0, "step": 3796, "text_loss": 0.4859195947647095 @@ -36079,13 +36079,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0228271484375, + "grad_norm": 0.02294921875, "learning_rate": 0.0007614801619604856, - "loss": 0.0065, + "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 6122668.0, "repeat_count": 0.0, - "routers_loss": 0.003494138829410076, + "routers_loss": 0.0033891722559928894, "skip_count": 0.0, "step": 3798, "text_loss": 0.48194369673728943 @@ -36098,13 +36098,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0252685546875, + "grad_norm": 0.02587890625, "learning_rate": 0.0007612162932990845, - "loss": 0.0063, + "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6126792.0, "repeat_count": 0.0, - "routers_loss": 0.001831608940847218, + "routers_loss": 0.001883238204754889, "skip_count": 0.0, "step": 3800, "text_loss": 0.3740062117576599 @@ -36117,13 +36117,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.03076171875, "learning_rate": 0.0007609523245391068, - "loss": 0.0078, + "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 6129801.0, "repeat_count": 0.0, - "routers_loss": 0.010433467105031013, + "routers_loss": 0.00882677361369133, "skip_count": 2.0, "step": 3802, "text_loss": 0.5759486556053162 @@ -36136,13 +36136,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037109375, + "grad_norm": 0.035400390625, "learning_rate": 0.0007606882557817062, - "loss": 0.0057, + "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 6133613.0, "repeat_count": 0.0, - "routers_loss": 0.009141471236944199, + "routers_loss": 0.009537030011415482, "skip_count": 2.0, "step": 3804, "text_loss": 0.3217554986476898 @@ -36155,13 +36155,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0235595703125, + "grad_norm": 0.0220947265625, "learning_rate": 0.0007604240871280742, - "loss": 0.0055, + "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 6137784.0, "repeat_count": 0.0, - "routers_loss": 0.0024337477516382933, + "routers_loss": 0.0023913346230983734, "skip_count": 0.0, "step": 3806, "text_loss": 0.3718445599079132 @@ -36174,13 +36174,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0390625, + "grad_norm": 0.037841796875, "learning_rate": 0.0007601598186794407, - "loss": 0.0083, + "loss": 0.0081, "macro_f1": 0.6603773832321167, "num_tokens": 6141356.0, "repeat_count": 1.0, - "routers_loss": 0.03635421022772789, + "routers_loss": 0.033796411007642746, "skip_count": 1.0, "step": 3808, "text_loss": 0.2717749774456024 @@ -36193,13 +36193,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037109375, + "grad_norm": 0.037841796875, "learning_rate": 0.000759895450537074, - "loss": 0.0101, + "loss": 0.01, "macro_f1": 0.6666666865348816, "num_tokens": 6144448.0, "repeat_count": 0.0, - "routers_loss": 0.002765925833955407, + "routers_loss": 0.0037919918540865183, "skip_count": 2.0, "step": 3810, "text_loss": 0.5935076475143433 @@ -36212,13 +36212,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03369140625, + "grad_norm": 0.03271484375, "learning_rate": 0.0007596309828022803, - "loss": 0.0072, + "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6147526.0, "repeat_count": 0.0, - "routers_loss": 0.0009747639996930957, + "routers_loss": 0.0008182782912626863, "skip_count": 0.0, "step": 3812, "text_loss": 0.449336439371109 @@ -36231,13 +36231,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.03125, "learning_rate": 0.0007593664155764044, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 6150620.0, "repeat_count": 1.0, - "routers_loss": 0.001395601429976523, + "routers_loss": 0.001734903547912836, "skip_count": 0.0, "step": 3814, "text_loss": 0.6647221446037292 @@ -36250,13 +36250,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.037353515625, "learning_rate": 0.0007591017489608286, - "loss": 0.0092, + "loss": 0.0088, "macro_f1": 0.3272727429866791, "num_tokens": 6153714.0, "repeat_count": 1.0, - "routers_loss": 0.048050083220005035, + "routers_loss": 0.04721754416823387, "skip_count": 0.0, "step": 3816, "text_loss": 0.25481200218200684 @@ -36269,13 +36269,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03662109375, + "grad_norm": 0.037841796875, "learning_rate": 0.0007588369830569738, - "loss": 0.0062, + "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6156974.0, "repeat_count": 0.0, - "routers_loss": 0.00022119733330328017, + "routers_loss": 0.0002484306460246444, "skip_count": 0.0, "step": 3818, "text_loss": 0.7195295691490173 @@ -36288,13 +36288,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02783203125, + "grad_norm": 0.031982421875, "learning_rate": 0.0007585721179662988, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 6159660.0, "repeat_count": 0.0, - "routers_loss": 0.005448841955512762, + "routers_loss": 0.0051363613456487656, "skip_count": 2.0, "step": 3820, "text_loss": 0.5073586702346802 @@ -36307,13 +36307,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0458984375, + "grad_norm": 0.052734375, "learning_rate": 0.0007583071537903005, - "loss": 0.0067, + "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 6163146.0, "repeat_count": 0.0, - "routers_loss": 0.007093957159668207, + "routers_loss": 0.006719176657497883, "skip_count": 0.0, "step": 3822, "text_loss": 0.6950558423995972 @@ -36326,13 +36326,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.025634765625, + "grad_norm": 0.0269775390625, "learning_rate": 0.0007580420906305136, - "loss": 0.007, + "loss": 0.0073, "macro_f1": 1.0, "num_tokens": 6166257.0, "repeat_count": 1.0, - "routers_loss": 0.008060536347329617, + "routers_loss": 0.00871267355978489, "skip_count": 3.0, "step": 3824, "text_loss": 0.2549148201942444 @@ -36345,13 +36345,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025146484375, + "grad_norm": 0.022705078125, "learning_rate": 0.0007577769285885109, - "loss": 0.004, + "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 6169624.0, "repeat_count": 0.0, - "routers_loss": 0.001302229124121368, + "routers_loss": 0.0015642556827515364, "skip_count": 0.0, "step": 3826, "text_loss": 0.3720305860042572 @@ -36364,13 +36364,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.038330078125, + "grad_norm": 0.039306640625, "learning_rate": 0.0007575116677659029, - "loss": 0.0076, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6172673.0, "repeat_count": 0.0, - "routers_loss": 0.0010101167717948556, + "routers_loss": 0.0011551049537956715, "skip_count": 0.0, "step": 3828, "text_loss": 0.6819429397583008 @@ -36383,13 +36383,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.040771484375, "learning_rate": 0.0007572463082643377, - "loss": 0.0083, + "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 6175414.0, "repeat_count": 0.0, - "routers_loss": 0.0009081853204406798, + "routers_loss": 0.0008922060951590538, "skip_count": 0.0, "step": 3830, "text_loss": 0.5424665212631226 @@ -36402,13 +36402,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03515625, + "grad_norm": 0.0341796875, "learning_rate": 0.0007569808501855023, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 6178701.0, "repeat_count": 0.0, - "routers_loss": 0.0040206871926784515, + "routers_loss": 0.004167596809566021, "skip_count": 1.0, "step": 3832, "text_loss": 0.4429764151573181 @@ -36421,13 +36421,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.04931640625, "learning_rate": 0.00075671529363112, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 6183036.0, "repeat_count": 0.0, - "routers_loss": 0.0009683453245088458, + "routers_loss": 0.0008732969872653484, "skip_count": 0.0, "step": 3834, "text_loss": 0.8015334010124207 @@ -36440,13 +36440,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0277099609375, + "grad_norm": 0.03271484375, "learning_rate": 0.0007564496387029531, - "loss": 0.0056, + "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 6186325.0, "repeat_count": 0.0, - "routers_loss": 0.0021183546632528305, + "routers_loss": 0.0021374202333390713, "skip_count": 1.0, "step": 3836, "text_loss": 0.4233771562576294 @@ -36459,13 +36459,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.03369140625, "learning_rate": 0.000756183885502801, - "loss": 0.0059, + "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 6189919.0, "repeat_count": 1.0, - "routers_loss": 0.0034987039398401976, + "routers_loss": 0.004017227329313755, "skip_count": 0.0, "step": 3838, "text_loss": 0.33691394329071045 @@ -36478,13 +36478,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.01953125, + "grad_norm": 0.018310546875, "learning_rate": 0.0007559180341325005, - "loss": 0.0048, + "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 6193412.0, "repeat_count": 0.0, - "routers_loss": 0.001348655903711915, + "routers_loss": 0.0013120946241542697, "skip_count": 0.0, "step": 3840, "text_loss": 0.14970099925994873 @@ -36497,13 +36497,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.029541015625, + "grad_norm": 0.031982421875, "learning_rate": 0.0007556520846939265, "loss": 0.0061, "macro_f1": 0.5492662787437439, "num_tokens": 6196588.0, "repeat_count": 0.0, - "routers_loss": 0.011758741922676563, + "routers_loss": 0.011793316341936588, "skip_count": 2.0, "step": 3842, "text_loss": 0.2714047133922577 @@ -36516,13 +36516,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.03466796875, + "grad_norm": 0.031494140625, "learning_rate": 0.0007553860372889914, - "loss": 0.0064, + "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 6200841.0, "repeat_count": 1.0, - "routers_loss": 0.022454025223851204, + "routers_loss": 0.019968654960393906, "skip_count": 4.0, "step": 3844, "text_loss": 0.23680976033210754 @@ -36535,13 +36535,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.049560546875, + "grad_norm": 0.052490234375, "learning_rate": 0.0007551198920196452, "loss": 0.0079, "macro_f1": 0.5492662787437439, "num_tokens": 6203797.0, "repeat_count": 0.0, - "routers_loss": 0.012088865973055363, + "routers_loss": 0.013615630567073822, "skip_count": 2.0, "step": 3846, "text_loss": 0.25839608907699585 @@ -36554,13 +36554,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.057373046875, + "grad_norm": 0.0546875, "learning_rate": 0.000754853648987875, - "loss": 0.0073, + "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 6206790.0, "repeat_count": 0.0, - "routers_loss": 0.0025066444650292397, + "routers_loss": 0.002420815173536539, "skip_count": 1.0, "step": 3848, "text_loss": 0.5358025431632996 @@ -36573,13 +36573,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.033447265625, + "grad_norm": 0.032470703125, "learning_rate": 0.0007545873082957057, - "loss": 0.0073, + "loss": 0.0072, "macro_f1": 0.9265305995941162, "num_tokens": 6209791.0, "repeat_count": 1.0, - "routers_loss": 0.01811581663787365, + "routers_loss": 0.018236197531223297, "skip_count": 3.0, "step": 3850, "text_loss": 0.1463700383901596 @@ -36592,13 +36592,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0390625, + "grad_norm": 0.034423828125, "learning_rate": 0.0007543208700451998, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 6212792.0, "repeat_count": 0.0, - "routers_loss": 0.005889591295272112, + "routers_loss": 0.006242573726922274, "skip_count": 3.0, "step": 3852, "text_loss": 0.9441591501235962 @@ -36611,13 +36611,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0299072265625, + "grad_norm": 0.031982421875, "learning_rate": 0.0007540543343384565, - "loss": 0.0064, + "loss": 0.0062, "macro_f1": 0.3272727429866791, "num_tokens": 6215747.0, "repeat_count": 0.0, - "routers_loss": 0.015324318781495094, + "routers_loss": 0.01451140083372593, "skip_count": 1.0, "step": 3854, "text_loss": 0.41610902547836304 @@ -36630,13 +36630,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.037841796875, "learning_rate": 0.0007537877012776132, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6218593.0, "repeat_count": 0.0, - "routers_loss": 0.0003138817264698446, + "routers_loss": 0.00037674361374229193, "skip_count": 0.0, "step": 3856, "text_loss": 0.6048852205276489 @@ -36649,13 +36649,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0269775390625, + "grad_norm": 0.0255126953125, "learning_rate": 0.0007535209709648439, - "loss": 0.0044, + "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 6221315.0, "repeat_count": 1.0, - "routers_loss": 0.006152884569019079, + "routers_loss": 0.005776284262537956, "skip_count": 3.0, "step": 3858, "text_loss": 0.35627537965774536 @@ -36668,13 +36668,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025634765625, + "grad_norm": 0.0245361328125, "learning_rate": 0.0007532541435023605, - "loss": 0.0048, + "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 6225012.0, "repeat_count": 0.0, - "routers_loss": 0.0009145989897660911, + "routers_loss": 0.0009280376834794879, "skip_count": 0.0, "step": 3860, "text_loss": 0.6440183520317078 @@ -36687,13 +36687,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025146484375, + "grad_norm": 0.0224609375, "learning_rate": 0.0007529872189924114, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 6227650.0, "repeat_count": 0.0, - "routers_loss": 0.0010246031451970339, + "routers_loss": 0.0009876530384644866, "skip_count": 0.0, "step": 3862, "text_loss": 0.35507893562316895 @@ -36706,13 +36706,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.048828125, "learning_rate": 0.0007527201975372827, - "loss": 0.0046, + "loss": 0.0045, "macro_f1": 0.6603773832321167, "num_tokens": 6230557.0, "repeat_count": 1.0, - "routers_loss": 0.011913667432963848, + "routers_loss": 0.013780162669718266, "skip_count": 1.0, "step": 3864, "text_loss": 0.38958442211151123 @@ -36725,13 +36725,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04345703125, + "grad_norm": 0.04638671875, "learning_rate": 0.0007524530792392977, - "loss": 0.0111, + "loss": 0.011, "macro_f1": 0.6666666865348816, "num_tokens": 6233371.0, "repeat_count": 0.0, - "routers_loss": 0.0050127157010138035, + "routers_loss": 0.004849869292229414, "skip_count": 3.0, "step": 3866, "text_loss": 0.3826720714569092 @@ -36744,13 +36744,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0228271484375, + "grad_norm": 0.0191650390625, "learning_rate": 0.0007521858642008163, - "loss": 0.0073, + "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 6236770.0, "repeat_count": 0.0, - "routers_loss": 0.008781078271567822, + "routers_loss": 0.008618295192718506, "skip_count": 1.0, "step": 3868, "text_loss": 0.3596078157424927 @@ -36763,13 +36763,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03076171875, + "grad_norm": 0.029052734375, "learning_rate": 0.0007519185525242363, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 6239661.0, "repeat_count": 0.0, - "routers_loss": 0.0014061459805816412, + "routers_loss": 0.0013421972980722785, "skip_count": 0.0, "step": 3870, "text_loss": 0.5585550665855408 @@ -36782,13 +36782,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.027099609375, + "grad_norm": 0.026611328125, "learning_rate": 0.0007516511443119916, - "loss": 0.0056, + "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 6242459.0, "repeat_count": 0.0, - "routers_loss": 0.0031452353578060865, + "routers_loss": 0.0038009448908269405, "skip_count": 1.0, "step": 3872, "text_loss": 0.4418395757675171 @@ -36801,13 +36801,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.030517578125, + "grad_norm": 0.031982421875, "learning_rate": 0.0007513836396665534, "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 6245489.0, "repeat_count": 1.0, - "routers_loss": 0.0028979210183024406, + "routers_loss": 0.002785376040264964, "skip_count": 2.0, "step": 3874, "text_loss": 0.551510751247406 @@ -36820,13 +36820,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02294921875, + "grad_norm": 0.0234375, "learning_rate": 0.0007511160386904305, - "loss": 0.0051, + "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6249014.0, "repeat_count": 0.0, - "routers_loss": 0.0021069799549877644, + "routers_loss": 0.0021424589212983847, "skip_count": 1.0, "step": 3876, "text_loss": 1.0502676963806152 @@ -36839,13 +36839,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.034423828125, "learning_rate": 0.0007508483414861679, - "loss": 0.0083, + "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 6252357.0, "repeat_count": 0.0, - "routers_loss": 0.0073753902688622475, + "routers_loss": 0.0085759861394763, "skip_count": 1.0, "step": 3878, "text_loss": 0.49212515354156494 @@ -36858,13 +36858,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0390625, + "grad_norm": 0.0361328125, "learning_rate": 0.0007505805481563477, - "loss": 0.0094, + "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6254975.0, "repeat_count": 0.0, - "routers_loss": 0.0010532810119912028, + "routers_loss": 0.0010723904706537724, "skip_count": 0.0, "step": 3880, "text_loss": 0.7022985816001892 @@ -36877,13 +36877,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.05078125, "learning_rate": 0.0007503126588035887, - "loss": 0.0086, + "loss": 0.0081, "macro_f1": 1.0, "num_tokens": 6258001.0, "repeat_count": 1.0, - "routers_loss": 0.012617395259439945, + "routers_loss": 0.012809890322387218, "skip_count": 2.0, "step": 3882, "text_loss": 0.1829151213169098 @@ -36896,13 +36896,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.038818359375, + "grad_norm": 0.0439453125, "learning_rate": 0.0007500446735305466, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 6261795.0, "repeat_count": 0.0, - "routers_loss": 0.002872605575248599, + "routers_loss": 0.0026790346018970013, "skip_count": 1.0, "step": 3884, "text_loss": 0.20436066389083862 @@ -36915,13 +36915,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.02978515625, + "grad_norm": 0.035888671875, "learning_rate": 0.000749776592439914, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 6265585.0, "repeat_count": 1.0, - "routers_loss": 0.0047233253717422485, + "routers_loss": 0.005243788007646799, "skip_count": 2.0, "step": 3886, "text_loss": 0.4479229748249054 @@ -36934,13 +36934,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02294921875, + "grad_norm": 0.024658203125, "learning_rate": 0.00074950841563442, - "loss": 0.0052, + "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 6269039.0, "repeat_count": 0.0, - "routers_loss": 0.007303252816200256, + "routers_loss": 0.007998534478247166, "skip_count": 1.0, "step": 3888, "text_loss": 0.2154676914215088 @@ -36953,13 +36953,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0242919921875, + "grad_norm": 0.0238037109375, "learning_rate": 0.0007492401432168303, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 6272315.0, "repeat_count": 0.0, - "routers_loss": 0.005679785739630461, + "routers_loss": 0.004648822825402021, "skip_count": 1.0, "step": 3890, "text_loss": 0.3375042676925659 @@ -36972,13 +36972,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.045654296875, "learning_rate": 0.0007489717752899477, - "loss": 0.0097, + "loss": 0.0094, "macro_f1": 0.3272727429866791, "num_tokens": 6275342.0, "repeat_count": 0.0, - "routers_loss": 0.013875136151909828, + "routers_loss": 0.012154200114309788, "skip_count": 1.0, "step": 3892, "text_loss": 0.1964082419872284 @@ -36991,13 +36991,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0247802734375, + "grad_norm": 0.0267333984375, "learning_rate": 0.000748703311956611, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 6278700.0, "repeat_count": 1.0, - "routers_loss": 0.004874289035797119, + "routers_loss": 0.004610476549714804, "skip_count": 2.0, "step": 3894, "text_loss": 0.26545581221580505 @@ -37010,13 +37010,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.06201171875, "learning_rate": 0.0007484347533196961, "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 6281864.0, "repeat_count": 0.0, - "routers_loss": 0.008282547816634178, + "routers_loss": 0.0075586591847240925, "skip_count": 2.0, "step": 3896, "text_loss": 0.3106999397277832 @@ -37029,13 +37029,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0208740234375, + "grad_norm": 0.02099609375, "learning_rate": 0.0007481660994821151, - "loss": 0.007, + "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6284676.0, "repeat_count": 0.0, - "routers_loss": 0.00792533066123724, + "routers_loss": 0.007845268584787846, "skip_count": 1.0, "step": 3898, "text_loss": 0.4094304144382477 @@ -37048,13 +37048,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.04052734375, "learning_rate": 0.0007478973505468165, - "loss": 0.0086, + "loss": 0.0081, "macro_f1": 1.0, "num_tokens": 6287470.0, "repeat_count": 1.0, - "routers_loss": 0.012142898514866829, + "routers_loss": 0.011116391979157925, "skip_count": 2.0, "step": 3900, "text_loss": 0.1838909536600113 @@ -37067,13 +37067,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.03515625, + "grad_norm": 0.0361328125, "learning_rate": 0.0007476285066167857, - "loss": 0.0062, + "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 6290432.0, "repeat_count": 1.0, - "routers_loss": 0.004634121898561716, + "routers_loss": 0.004599364474415779, "skip_count": 0.0, "step": 3902, "text_loss": 0.25872838497161865 @@ -37086,13 +37086,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0419921875, + "grad_norm": 0.046142578125, "learning_rate": 0.0007473595677950439, "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 6293557.0, "repeat_count": 0.0, - "routers_loss": 0.001632143510505557, + "routers_loss": 0.0016367282951250672, "skip_count": 1.0, "step": 3904, "text_loss": 0.5272360444068909 @@ -37105,13 +37105,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.032470703125, "learning_rate": 0.0007470905341846492, - "loss": 0.0053, + "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 6295979.0, "repeat_count": 0.0, - "routers_loss": 0.0004961033118888736, + "routers_loss": 0.0004760588926728815, "skip_count": 0.0, "step": 3906, "text_loss": 0.666959822177887 @@ -37124,13 +37124,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037109375, + "grad_norm": 0.035400390625, "learning_rate": 0.0007468214058886956, - "loss": 0.0074, + "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 6299215.0, "repeat_count": 0.0, - "routers_loss": 0.0007425977964885533, + "routers_loss": 0.000524883100297302, "skip_count": 0.0, "step": 3908, "text_loss": 0.5144801139831543 @@ -37143,13 +37143,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037109375, + "grad_norm": 0.031982421875, "learning_rate": 0.0007465521830103137, - "loss": 0.0081, + "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6302320.0, "repeat_count": 0.0, - "routers_loss": 0.0015668199630454183, + "routers_loss": 0.0016085522947832942, "skip_count": 0.0, "step": 3910, "text_loss": 0.14342890679836273 @@ -37162,13 +37162,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037353515625, + "grad_norm": 0.03857421875, "learning_rate": 0.0007462828656526702, - "loss": 0.0065, + "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6305212.0, "repeat_count": 0.0, - "routers_loss": 0.003138904692605138, + "routers_loss": 0.002720315707847476, "skip_count": 2.0, "step": 3912, "text_loss": 0.31109121441841125 @@ -37181,13 +37181,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.052001953125, + "grad_norm": 0.06884765625, "learning_rate": 0.0007460134539189681, - "loss": 0.0117, + "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 6308964.0, "repeat_count": 0.0, - "routers_loss": 0.0012123063206672668, + "routers_loss": 0.0010418406454846263, "skip_count": 1.0, "step": 3914, "text_loss": 0.5662030577659607 @@ -37200,13 +37200,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.052001953125, "learning_rate": 0.0007457439479124459, "loss": 0.0134, "macro_f1": 0.3333333432674408, "num_tokens": 6313195.0, "repeat_count": 0.0, - "routers_loss": 0.0017939694225788116, + "routers_loss": 0.0020303844939917326, "skip_count": 0.0, "step": 3916, "text_loss": 0.6358339190483093 @@ -37219,13 +37219,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0274658203125, + "grad_norm": 0.0289306640625, "learning_rate": 0.0007454743477363797, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 6315949.0, "repeat_count": 0.0, - "routers_loss": 0.0006735047209076583, + "routers_loss": 0.0006592223653569818, "skip_count": 0.0, "step": 3918, "text_loss": 0.35648423433303833 @@ -37238,13 +37238,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.027099609375, + "grad_norm": 0.0262451171875, "learning_rate": 0.0007452046534940803, - "loss": 0.0078, + "loss": 0.0075, "macro_f1": 0.6603773832321167, "num_tokens": 6319024.0, "repeat_count": 1.0, - "routers_loss": 0.025279851630330086, + "routers_loss": 0.024555351585149765, "skip_count": 1.0, "step": 3920, "text_loss": 0.21955153346061707 @@ -37257,13 +37257,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033203125, + "grad_norm": 0.035888671875, "learning_rate": 0.0007449348652888952, - "loss": 0.007, + "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6321633.0, "repeat_count": 0.0, - "routers_loss": 0.002887458074837923, + "routers_loss": 0.003606822807341814, "skip_count": 1.0, "step": 3922, "text_loss": 0.6079489588737488 @@ -37276,13 +37276,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.037841796875, "learning_rate": 0.0007446649832242075, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 6325209.0, "repeat_count": 0.0, - "routers_loss": 0.0034941197372972965, + "routers_loss": 0.0035831446293741465, "skip_count": 1.0, "step": 3924, "text_loss": 0.2774808406829834 @@ -37295,13 +37295,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03173828125, + "grad_norm": 0.0311279296875, "learning_rate": 0.0007443950074034368, - "loss": 0.0067, + "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6327822.0, "repeat_count": 0.0, - "routers_loss": 0.006862608715891838, + "routers_loss": 0.006809544749557972, "skip_count": 2.0, "step": 3926, "text_loss": 0.48236769437789917 @@ -37314,13 +37314,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.049072265625, "learning_rate": 0.0007441249379300381, - "loss": 0.0072, + "loss": 0.007, "macro_f1": 0.6601307392120361, "num_tokens": 6331662.0, "repeat_count": 1.0, - "routers_loss": 0.02176409214735031, + "routers_loss": 0.023832591250538826, "skip_count": 2.0, "step": 3928, "text_loss": 0.7287537455558777 @@ -37333,13 +37333,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.04296875, "learning_rate": 0.0007438547749075028, - "loss": 0.0064, + "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 6335801.0, "repeat_count": 1.0, - "routers_loss": 0.013603253290057182, + "routers_loss": 0.011755098588764668, "skip_count": 3.0, "step": 3930, "text_loss": 0.17253030836582184 @@ -37352,13 +37352,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0267333984375, + "grad_norm": 0.02685546875, "learning_rate": 0.0007435845184393577, - "loss": 0.0052, + "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6338747.0, "repeat_count": 1.0, - "routers_loss": 0.006635789293795824, + "routers_loss": 0.005972472485154867, "skip_count": 0.0, "step": 3932, "text_loss": 0.6400216817855835 @@ -37371,13 +37371,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0361328125, + "grad_norm": 0.033447265625, "learning_rate": 0.0007433141686291657, - "loss": 0.0077, + "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 6342772.0, "repeat_count": 0.0, - "routers_loss": 0.0032724342308938503, + "routers_loss": 0.0030393085908144712, "skip_count": 1.0, "step": 3934, "text_loss": 0.6865074038505554 @@ -37390,13 +37390,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0213623046875, + "grad_norm": 0.020263671875, "learning_rate": 0.0007430437255805252, - "loss": 0.007, + "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6345957.0, "repeat_count": 0.0, - "routers_loss": 0.0007380369352176785, + "routers_loss": 0.0006984061910770833, "skip_count": 0.0, "step": 3936, "text_loss": 0.40398702025413513 @@ -37409,13 +37409,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.078125, + "grad_norm": 0.07275390625, "learning_rate": 0.0007427731893970706, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 6349162.0, "repeat_count": 1.0, - "routers_loss": 0.004635625518858433, + "routers_loss": 0.005219762213528156, "skip_count": 0.0, "step": 3938, "text_loss": 0.5951031446456909 @@ -37428,13 +37428,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.043701171875, + "grad_norm": 0.04541015625, "learning_rate": 0.0007425025601824717, - "loss": 0.0085, + "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 6352655.0, "repeat_count": 0.0, - "routers_loss": 0.014994140714406967, + "routers_loss": 0.015575960278511047, "skip_count": 3.0, "step": 3940, "text_loss": 0.26689088344573975 @@ -37447,13 +37447,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031494140625, + "grad_norm": 0.03662109375, "learning_rate": 0.0007422318380404346, - "loss": 0.0067, + "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6355890.0, "repeat_count": 0.0, - "routers_loss": 0.0011694672284647822, + "routers_loss": 0.0012208883417770267, "skip_count": 0.0, "step": 3942, "text_loss": 0.570725679397583 @@ -37466,13 +37466,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.02587890625, + "grad_norm": 0.0235595703125, "learning_rate": 0.0007419610230746999, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6358891.0, "repeat_count": 1.0, - "routers_loss": 0.003442608518525958, + "routers_loss": 0.0029412026051431894, "skip_count": 0.0, "step": 3944, "text_loss": 0.5521301031112671 @@ -37485,13 +37485,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.033447265625, "learning_rate": 0.0007416901153890448, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6361586.0, "repeat_count": 0.0, - "routers_loss": 0.0009970148093998432, + "routers_loss": 0.0010283910669386387, "skip_count": 0.0, "step": 3946, "text_loss": 0.4046417772769928 @@ -37504,13 +37504,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.043212890625, + "grad_norm": 0.03955078125, "learning_rate": 0.0007414191150872818, - "loss": 0.0078, + "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 6364954.0, "repeat_count": 0.0, - "routers_loss": 0.009517154656350613, + "routers_loss": 0.008222512900829315, "skip_count": 2.0, "step": 3948, "text_loss": 0.2803446352481842 @@ -37523,13 +37523,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.030029296875, + "grad_norm": 0.03564453125, "learning_rate": 0.0007411480222732583, - "loss": 0.0091, + "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6367660.0, "repeat_count": 0.0, - "routers_loss": 0.0012908667558804154, + "routers_loss": 0.001304348581470549, "skip_count": 0.0, "step": 3950, "text_loss": 0.45553359389305115 @@ -37542,13 +37542,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03564453125, + "grad_norm": 0.03759765625, "learning_rate": 0.0007408768370508576, - "loss": 0.0076, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6371585.0, "repeat_count": 0.0, - "routers_loss": 0.0015499353175982833, + "routers_loss": 0.0016345062758773565, "skip_count": 0.0, "step": 3952, "text_loss": 0.25424402952194214 @@ -37561,13 +37561,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.036865234375, "learning_rate": 0.0007406055595239986, - "loss": 0.007, + "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6374365.0, "repeat_count": 0.0, - "routers_loss": 0.0005612325621768832, + "routers_loss": 0.0005097290268167853, "skip_count": 0.0, "step": 3954, "text_loss": 0.5856026411056519 @@ -37580,13 +37580,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07373046875, + "grad_norm": 0.060546875, "learning_rate": 0.0007403341897966356, - "loss": 0.0063, + "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 6377335.0, "repeat_count": 0.0, - "routers_loss": 0.0024961072485893965, + "routers_loss": 0.002482263371348381, "skip_count": 1.0, "step": 3956, "text_loss": 0.5145615339279175 @@ -37599,32 +37599,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0230712890625, + "grad_norm": 0.0245361328125, "learning_rate": 0.0007400627279727574, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 6380799.0, "repeat_count": 0.0, - "routers_loss": 0.0013171056052669883, + "routers_loss": 0.0011743451468646526, "skip_count": 0.0, "step": 3958, "text_loss": 0.31868961453437805 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 18.591722923393014, - "f1_execute": 0.9818181991577148, - "f1_repeat": 0.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.027099609375, + "grad_norm": 0.0286865234375, "learning_rate": 0.0007397911741563892, - "loss": 0.0054, - "macro_f1": 0.3272727429866791, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, "num_tokens": 6383963.0, "repeat_count": 1.0, - "routers_loss": 0.012845510616898537, + "routers_loss": 0.009861881844699383, "skip_count": 0.0, "step": 3960, "text_loss": 0.21192194521427155 @@ -37637,13 +37637,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0390625, + "grad_norm": 0.0380859375, "learning_rate": 0.0007395195284515905, - "loss": 0.0099, + "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 6387410.0, "repeat_count": 1.0, - "routers_loss": 0.003112874459475279, + "routers_loss": 0.004189098719507456, "skip_count": 0.0, "step": 3962, "text_loss": 0.5809708833694458 @@ -37656,13 +37656,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039794921875, + "grad_norm": 0.036376953125, "learning_rate": 0.0007392477909624567, - "loss": 0.0058, + "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 6390670.0, "repeat_count": 0.0, - "routers_loss": 0.0019742189906537533, + "routers_loss": 0.001853612600825727, "skip_count": 0.0, "step": 3964, "text_loss": 0.48985618352890015 @@ -37675,13 +37675,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.028076171875, + "grad_norm": 0.0308837890625, "learning_rate": 0.0007389759617931182, - "loss": 0.0066, + "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 6393609.0, "repeat_count": 1.0, - "routers_loss": 0.003850853070616722, + "routers_loss": 0.003303771372884512, "skip_count": 0.0, "step": 3966, "text_loss": 0.28729453682899475 @@ -37694,13 +37694,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.0634765625, + "grad_norm": 0.10595703125, "learning_rate": 0.0007387040410477404, - "loss": 0.0057, + "loss": 0.0058, "macro_f1": 0.9452888369560242, "num_tokens": 6396608.0, "repeat_count": 1.0, - "routers_loss": 0.020281648263335228, + "routers_loss": 0.01791577786207199, "skip_count": 4.0, "step": 3968, "text_loss": 0.30386820435523987 @@ -37713,13 +37713,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0284423828125, + "grad_norm": 0.029541015625, "learning_rate": 0.0007384320288305235, - "loss": 0.0093, + "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 6399793.0, "repeat_count": 0.0, - "routers_loss": 0.0005419629742391407, + "routers_loss": 0.0005771282012574375, "skip_count": 0.0, "step": 3970, "text_loss": 0.47285011410713196 @@ -37732,13 +37732,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0291748046875, + "grad_norm": 0.032958984375, "learning_rate": 0.0007381599252457037, - "loss": 0.0061, + "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6403365.0, "repeat_count": 0.0, - "routers_loss": 0.003040255280211568, + "routers_loss": 0.003010645741596818, "skip_count": 0.0, "step": 3972, "text_loss": 0.5313063859939575 @@ -37751,32 +37751,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.038818359375, "learning_rate": 0.000737887730397551, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 6406205.0, "repeat_count": 1.0, - "routers_loss": 0.006762589327991009, + "routers_loss": 0.006457438692450523, "skip_count": 0.0, "step": 3974, "text_loss": 0.2323843240737915 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 18.666862342236573, - "f1_execute": 0.9818181991577148, - "f1_repeat": 0.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.038818359375, + "grad_norm": 0.036865234375, "learning_rate": 0.0007376154443903713, - "loss": 0.0086, - "macro_f1": 0.3272727429866791, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, "num_tokens": 6409552.0, "repeat_count": 1.0, - "routers_loss": 0.01173968706279993, + "routers_loss": 0.010693981312215328, "skip_count": 0.0, "step": 3976, "text_loss": 0.6304101943969727 @@ -37789,13 +37789,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.03662109375, "learning_rate": 0.0007373430673285051, "loss": 0.008, "macro_f1": 0.3272727429866791, "num_tokens": 6412386.0, "repeat_count": 1.0, - "routers_loss": 0.028297962620854378, + "routers_loss": 0.03116440214216709, "skip_count": 0.0, "step": 3978, "text_loss": 0.23448467254638672 @@ -37808,13 +37808,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08447265625, + "grad_norm": 0.10009765625, "learning_rate": 0.0007370705993163278, - "loss": 0.011, + "loss": 0.0111, "macro_f1": 0.3272727429866791, "num_tokens": 6416054.0, "repeat_count": 1.0, - "routers_loss": 0.010761309415102005, + "routers_loss": 0.011973714455962181, "skip_count": 0.0, "step": 3980, "text_loss": 0.6371755599975586 @@ -37827,13 +37827,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0458984375, + "grad_norm": 0.05224609375, "learning_rate": 0.0007367980404582497, "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 6419238.0, "repeat_count": 1.0, - "routers_loss": 0.0057355971075594425, + "routers_loss": 0.005117347463965416, "skip_count": 2.0, "step": 3982, "text_loss": 0.19822923839092255 @@ -37846,13 +37846,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0267333984375, + "grad_norm": 0.0296630859375, "learning_rate": 0.0007365253908587158, - "loss": 0.005, + "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 6422122.0, "repeat_count": 0.0, - "routers_loss": 0.0011142889270558953, + "routers_loss": 0.0010648667812347412, "skip_count": 0.0, "step": 3984, "text_loss": 0.566700279712677 @@ -37865,13 +37865,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0263671875, + "grad_norm": 0.025146484375, "learning_rate": 0.0007362526506222058, - "loss": 0.0045, + "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 6425313.0, "repeat_count": 0.0, - "routers_loss": 0.005405326373875141, + "routers_loss": 0.005726494826376438, "skip_count": 0.0, "step": 3986, "text_loss": 0.6568437814712524 @@ -37884,13 +37884,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0289306640625, + "grad_norm": 0.0341796875, "learning_rate": 0.0007359798198532343, - "loss": 0.0043, + "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 6428422.0, "repeat_count": 1.0, - "routers_loss": 0.005449058022350073, + "routers_loss": 0.004504100419580936, "skip_count": 0.0, "step": 3988, "text_loss": 0.598754346370697 @@ -37903,13 +37903,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.0306396484375, "learning_rate": 0.0007357068986563509, - "loss": 0.0083, + "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 6431512.0, "repeat_count": 0.0, - "routers_loss": 0.0020256424322724342, + "routers_loss": 0.0019837068393826485, "skip_count": 1.0, "step": 3990, "text_loss": 0.7152895927429199 @@ -37922,13 +37922,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.032470703125, "learning_rate": 0.0007354338871361393, - "loss": 0.0084, + "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 6434358.0, "repeat_count": 0.0, - "routers_loss": 0.0027240889612585306, + "routers_loss": 0.0026031541638076305, "skip_count": 1.0, "step": 3992, "text_loss": 0.4986513555049896 @@ -37941,13 +37941,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.039794921875, + "grad_norm": 0.039306640625, "learning_rate": 0.000735160785397218, - "loss": 0.0061, + "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 6438175.0, "repeat_count": 0.0, - "routers_loss": 0.0026689881924539804, + "routers_loss": 0.0024831905029714108, "skip_count": 2.0, "step": 3994, "text_loss": 0.4406205713748932 @@ -37960,13 +37960,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.035400390625, "learning_rate": 0.0007348875935442401, - "loss": 0.0067, + "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6441228.0, "repeat_count": 0.0, - "routers_loss": 0.0010014307918027043, + "routers_loss": 0.0008635876583866775, "skip_count": 0.0, "step": 3996, "text_loss": 0.48884135484695435 @@ -37979,13 +37979,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.040283203125, + "grad_norm": 0.03271484375, "learning_rate": 0.0007346143116818932, - "loss": 0.0046, + "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 6444318.0, "repeat_count": 0.0, - "routers_loss": 0.004282998852431774, + "routers_loss": 0.004007008858025074, "skip_count": 0.0, "step": 3998, "text_loss": 0.6669428944587708 @@ -37998,13 +37998,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.08203125, "learning_rate": 0.0007343409399148994, - "loss": 0.0092, + "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6448317.0, "repeat_count": 0.0, - "routers_loss": 0.0031171543523669243, + "routers_loss": 0.0031380734872072935, "skip_count": 0.0, "step": 4000, "text_loss": 0.6468493938446045 diff --git a/checkpoint-4000/training_args.bin b/checkpoint-4000/training_args.bin index deeea733277b4031781a5b299881dd8e675e7606..a3d3ae372faf14539639f54454aa52b6ee730c4a 100644 --- a/checkpoint-4000/training_args.bin +++ b/checkpoint-4000/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b3f5975f57762b552c7ee29776bf32a4dbb125781a0658488d3884fb25c5296 +oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8 size 5880 diff --git a/checkpoint-5000/model-00002-of-00002.safetensors b/checkpoint-5000/model-00002-of-00002.safetensors index e5a18c77927f4cd1f054dbebaaef463f9874696a..4fd8e45d2e43ad1cbdd82dc191486dde3af7361c 100644 --- a/checkpoint-5000/model-00002-of-00002.safetensors +++ b/checkpoint-5000/model-00002-of-00002.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae38e7cc34de9f085dfca6da945b517d6b7201575652933d0a76b280ef98f026 +oid sha256:b1d3406d823631d0e3b7578d45263f036789513b216f560f8fe554a64de4a525 size 1481790520 diff --git a/checkpoint-5000/optimizer.pt b/checkpoint-5000/optimizer.pt index d2981fc2869c3e6703d3563af98c953ca08d3dc3..30db85d1e2490cf65b2bb12085834ba01ef021de 100644 --- a/checkpoint-5000/optimizer.pt +++ b/checkpoint-5000/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3af43e11f60ae0f6ec6be0f47432d6dc5a652d6babd15a6e7d17e30412a5def2 +oid sha256:523375eef8a0adbf6e87d5f4658cef29f76781043100ad34e1f31232264003f7 size 44191162 diff --git a/checkpoint-5000/trainer_state.json b/checkpoint-5000/trainer_state.json index 825058617d82e0beb9ce3322b1a8231a1ec1418f..d29ad111e10aba5f6b374584df732a732758afa1 100644 --- a/checkpoint-5000/trainer_state.json +++ b/checkpoint-5000/trainer_state.json @@ -12,18 +12,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 31.0, + "avg_layers": 25.0, "epoch": 0.009392427355444672, - "f1_execute": 0.4864864945411682, + "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 2.40625, + "grad_norm": 2.25, "learning_rate": 2e-06, - "loss": 0.5484, - "macro_f1": 0.1621621698141098, + "loss": 0.4974, + "macro_f1": 0.23255813121795654, "num_tokens": 3175.0, "repeat_count": 0.0, - "routers_loss": 0.503563642501831, + "routers_loss": 0.4339469373226166, "skip_count": 0.0, "step": 2, "text_loss": 0.3330848515033722 @@ -31,18 +31,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 23.0, "epoch": 0.018784854710889344, - "f1_execute": 0.4864864945411682, + "f1_execute": 0.7272726893424988, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.9140625, + "grad_norm": 1.8359375, "learning_rate": 6e-06, - "loss": 0.536, - "macro_f1": 0.1621621698141098, + "loss": 0.4988, + "macro_f1": 0.24242423474788666, "num_tokens": 5816.0, "repeat_count": 0.0, - "routers_loss": 0.4589468538761139, + "routers_loss": 0.4511934816837311, "skip_count": 1.0, "step": 4, "text_loss": 0.4571273922920227 @@ -50,37 +50,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 32.0, + "avg_layers": 28.0, "epoch": 0.02817728206633402, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 2.375, + "grad_norm": 2.234375, "learning_rate": 1e-05, - "loss": 0.5469, - "macro_f1": 0.19999998807907104, + "loss": 0.5113, + "macro_f1": 0.222222238779068, "num_tokens": 9739.0, "repeat_count": 0.0, - "routers_loss": 0.5736724138259888, + "routers_loss": 0.49306994676589966, "skip_count": 0.0, "step": 6, "text_loss": 0.41060560941696167 }, { - "acc_repeat": 1.0, - "acc_skip": 0.5, - "avg_layers": 33.0, + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 0.03756970942177869, - "f1_execute": 0.47058823704719543, - "f1_repeat": 0.1538461595773697, - "f1_skip": 0.222222238779068, - "grad_norm": 1.8515625, + "f1_execute": 0.5641025900840759, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7265625, "learning_rate": 1.4e-05, - "loss": 0.5291, - "macro_f1": 0.28221890330314636, + "loss": 0.4766, + "macro_f1": 0.18803420662879944, "num_tokens": 12869.0, "repeat_count": 1.0, - "routers_loss": 0.49970296025276184, + "routers_loss": 0.48872503638267517, "skip_count": 2.0, "step": 8, "text_loss": 0.36678561568260193 @@ -88,37 +88,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 27.0, "epoch": 0.046962136777223364, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.953125, + "grad_norm": 1.78125, "learning_rate": 1.8e-05, - "loss": 0.5316, - "macro_f1": 0.19999998807907104, + "loss": 0.4806, + "macro_f1": 0.23255813121795654, "num_tokens": 15845.0, "repeat_count": 0.0, - "routers_loss": 0.5153562426567078, + "routers_loss": 0.45077216625213623, "skip_count": 0.0, "step": 10, "text_loss": 0.5597779154777527 }, { - "acc_repeat": 0.0, + "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, - "avg_layers": 34.0, + "avg_layers": 26.0, "epoch": 0.05635456413266804, - "f1_execute": 0.5714285373687744, - "f1_repeat": 0.0, - "f1_skip": 0.25, - "grad_norm": 1.6328125, + "f1_execute": 0.7179487347602844, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.20000000298023224, + "grad_norm": 1.5390625, "learning_rate": 2.2e-05, - "loss": 0.5051, - "macro_f1": 0.2738095223903656, + "loss": 0.4557, + "macro_f1": 0.40122103691101074, "num_tokens": 19353.0, "repeat_count": 2.0, - "routers_loss": 0.46214747428894043, + "routers_loss": 0.4130440056324005, "skip_count": 3.0, "step": 12, "text_loss": 0.2056603729724884 @@ -126,37 +126,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 27.0, "epoch": 0.06574699148811271, - "f1_execute": 0.5263157486915588, + "f1_execute": 0.6976743936538696, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 2.671875, + "grad_norm": 2.4375, "learning_rate": 2.6e-05, - "loss": 0.5653, - "macro_f1": 0.17543858289718628, + "loss": 0.5129, + "macro_f1": 0.23255813121795654, "num_tokens": 22675.0, "repeat_count": 0.0, - "routers_loss": 0.5300976634025574, + "routers_loss": 0.4582902193069458, "skip_count": 0.0, "step": 14, "text_loss": 0.32989829778671265 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 34.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 0.07513941884355738, - "f1_execute": 0.6153846383094788, + "f1_execute": 0.6829268336296082, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 1.8828125, + "f1_skip": 0.2222222238779068, + "grad_norm": 1.7421875, "learning_rate": 3e-05, - "loss": 0.5225, - "macro_f1": 0.20512822270393372, + "loss": 0.4729, + "macro_f1": 0.3017163574695587, "num_tokens": 26022.0, "repeat_count": 0.0, - "routers_loss": 0.473240464925766, + "routers_loss": 0.42910993099212646, "skip_count": 1.0, "step": 16, "text_loss": 0.1353905349969864 @@ -164,18 +164,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 38.0, + "avg_layers": 27.0, "epoch": 0.08453184619900206, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.7555555105209351, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.6015625, + "grad_norm": 1.4765625, "learning_rate": 3.4000000000000007e-05, - "loss": 0.4867, - "macro_f1": 0.19999998807907104, + "loss": 0.4274, + "macro_f1": 0.2518518567085266, "num_tokens": 29251.0, "repeat_count": 0.0, - "routers_loss": 0.4795944094657898, + "routers_loss": 0.3990713059902191, "skip_count": 0.0, "step": 18, "text_loss": 0.3806765377521515 @@ -183,18 +183,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 36.0, + "avg_layers": 26.0, "epoch": 0.09392427355444673, - "f1_execute": 0.6153846383094788, - "f1_repeat": 0.1538461595773697, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.2857142984867096, "f1_skip": 0.0, - "grad_norm": 1.3984375, + "grad_norm": 1.3125, "learning_rate": 3.8e-05, - "loss": 0.4718, - "macro_f1": 0.25641027092933655, + "loss": 0.4261, + "macro_f1": 0.3228803873062134, "num_tokens": 32545.0, "repeat_count": 1.0, - "routers_loss": 0.41872408986091614, + "routers_loss": 0.40146592259407043, "skip_count": 0.0, "step": 20, "text_loss": 0.25648367404937744 @@ -202,18 +202,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 26.0, "epoch": 0.1033167009098914, - "f1_execute": 0.6341463327407837, + "f1_execute": 0.7272727489471436, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.7734375, + "grad_norm": 1.625, "learning_rate": 4.2000000000000004e-05, - "loss": 0.4472, - "macro_f1": 0.21138212084770203, + "loss": 0.404, + "macro_f1": 0.24242424964904785, "num_tokens": 36560.0, "repeat_count": 0.0, - "routers_loss": 0.4152105450630188, + "routers_loss": 0.372715026140213, "skip_count": 0.0, "step": 22, "text_loss": 0.2799522578716278 @@ -221,18 +221,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 32.0, + "avg_layers": 27.0, "epoch": 0.11270912826533608, - "f1_execute": 0.5999999642372131, + "f1_execute": 0.7555555105209351, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.8046875, + "grad_norm": 1.6328125, "learning_rate": 4.6e-05, - "loss": 0.4554, - "macro_f1": 0.19999998807907104, + "loss": 0.4218, + "macro_f1": 0.2518518567085266, "num_tokens": 39597.0, "repeat_count": 0.0, - "routers_loss": 0.47541096806526184, + "routers_loss": 0.4504941403865814, "skip_count": 0.0, "step": 24, "text_loss": 0.6635695695877075 @@ -240,18 +240,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 34.0, + "avg_layers": 27.0, "epoch": 0.12210155562078075, - "f1_execute": 0.7826087474822998, + "f1_execute": 0.8085106015205383, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.875, + "grad_norm": 1.7109375, "learning_rate": 5e-05, - "loss": 0.4182, - "macro_f1": 0.2608695924282074, + "loss": 0.3886, + "macro_f1": 0.26950353384017944, "num_tokens": 43080.0, "repeat_count": 0.0, - "routers_loss": 0.37319275736808777, + "routers_loss": 0.3498791456222534, "skip_count": 0.0, "step": 26, "text_loss": 0.7035041451454163 @@ -259,18 +259,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 0.13149398297622542, - "f1_execute": 0.7826087474822998, + "f1_execute": 0.8085106015205383, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.4375, + "grad_norm": 1.34375, "learning_rate": 5.4e-05, - "loss": 0.3991, - "macro_f1": 0.2608695924282074, + "loss": 0.3724, + "macro_f1": 0.26950353384017944, "num_tokens": 46406.0, "repeat_count": 0.0, - "routers_loss": 0.3604123294353485, + "routers_loss": 0.31265875697135925, "skip_count": 0.0, "step": 28, "text_loss": 0.6388277411460876 @@ -280,16 +280,16 @@ "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.1408864103316701, - "f1_execute": 0.8979591727256775, + "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.421875, + "grad_norm": 1.2578125, "learning_rate": 5.800000000000001e-05, - "loss": 0.3827, - "macro_f1": 0.2993197441101074, + "loss": 0.341, + "macro_f1": 0.2857142686843872, "num_tokens": 49966.0, "repeat_count": 0.0, - "routers_loss": 0.35880225896835327, + "routers_loss": 0.3200918138027191, "skip_count": 2.0, "step": 30, "text_loss": 0.17372547090053558 @@ -297,18 +297,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 24.0, + "avg_layers": 25.0, "epoch": 0.15027883768711475, - "f1_execute": 0.9200000166893005, + "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.4609375, + "grad_norm": 1.4140625, "learning_rate": 6.2e-05, - "loss": 0.3452, - "macro_f1": 0.30666667222976685, + "loss": 0.3207, + "macro_f1": 0.2857142686843872, "num_tokens": 53378.0, "repeat_count": 1.0, - "routers_loss": 0.31086465716362, + "routers_loss": 0.32304447889328003, "skip_count": 1.0, "step": 32, "text_loss": 0.18196581304073334 @@ -316,18 +316,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 25.0, "epoch": 0.15967126504255943, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.3671875, + "grad_norm": 1.46875, "learning_rate": 6.6e-05, - "loss": 0.3283, - "macro_f1": 0.3144654333591461, + "loss": 0.3304, + "macro_f1": 0.3006536364555359, "num_tokens": 56933.0, "repeat_count": 0.0, - "routers_loss": 0.2674171030521393, + "routers_loss": 0.24814388155937195, "skip_count": 0.0, "step": 34, "text_loss": 0.28823015093803406 @@ -335,18 +335,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 0.16906369239800412, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.1015625, + "grad_norm": 1.1171875, "learning_rate": 7.000000000000001e-05, - "loss": 0.2849, - "macro_f1": 0.3205128312110901, + "loss": 0.2778, + "macro_f1": 0.3006536066532135, "num_tokens": 60744.0, "repeat_count": 1.0, - "routers_loss": 0.24587315320968628, + "routers_loss": 0.22411039471626282, "skip_count": 0.0, "step": 36, "text_loss": 0.5260357856750488 @@ -354,18 +354,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 31.0, + "avg_layers": 27.0, "epoch": 0.17845611975344877, - "f1_execute": 0.8085106015205383, + "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.3046875, + "grad_norm": 1.484375, "learning_rate": 7.4e-05, - "loss": 0.2616, - "macro_f1": 0.26950353384017944, + "loss": 0.2738, + "macro_f1": 0.2857142984867096, "num_tokens": 64900.0, "repeat_count": 0.0, - "routers_loss": 0.32050269842147827, + "routers_loss": 0.44355395436286926, "skip_count": 0.0, "step": 38, "text_loss": 0.5382097363471985 @@ -373,18 +373,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 0.18784854710889345, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 1.1796875, + "grad_norm": 1.3828125, "learning_rate": 7.8e-05, - "loss": 0.2084, - "macro_f1": 0.3144654333591461, + "loss": 0.2137, + "macro_f1": 0.3076923191547394, "num_tokens": 68000.0, "repeat_count": 0.0, - "routers_loss": 0.15196125209331512, + "routers_loss": 0.202330082654953, "skip_count": 0.0, "step": 40, "text_loss": 0.5946118831634521 @@ -392,18 +392,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 25.0, "epoch": 0.19724097446433814, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.61328125, + "grad_norm": 0.78125, "learning_rate": 8.2e-05, - "loss": 0.1947, + "loss": 0.21, "macro_f1": 0.3144654333591461, "num_tokens": 70529.0, "repeat_count": 0.0, - "routers_loss": 0.14121046662330627, + "routers_loss": 0.18023855984210968, "skip_count": 0.0, "step": 42, "text_loss": 0.5550904273986816 @@ -416,13 +416,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.50390625, + "grad_norm": 0.609375, "learning_rate": 8.599999999999999e-05, - "loss": 0.1884, + "loss": 0.1918, "macro_f1": 0.32098764181137085, "num_tokens": 73427.0, "repeat_count": 2.0, - "routers_loss": 0.21312278509140015, + "routers_loss": 0.2101590931415558, "skip_count": 0.0, "step": 44, "text_loss": 0.4636923372745514 @@ -435,13 +435,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.45703125, + "grad_norm": 0.53125, "learning_rate": 8.999999999999999e-05, - "loss": 0.166, + "loss": 0.1881, "macro_f1": 0.3333333432674408, "num_tokens": 76472.0, "repeat_count": 0.0, - "routers_loss": 0.1184137836098671, + "routers_loss": 0.11800424009561539, "skip_count": 0.0, "step": 46, "text_loss": 0.4187001883983612 @@ -454,13 +454,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.62890625, + "grad_norm": 0.953125, "learning_rate": 9.400000000000001e-05, - "loss": 0.1313, + "loss": 0.1446, "macro_f1": 0.3272727429866791, "num_tokens": 79124.0, "repeat_count": 1.0, - "routers_loss": 0.10897563397884369, + "routers_loss": 0.11632519960403442, "skip_count": 0.0, "step": 48, "text_loss": 0.2253919243812561 @@ -468,18 +468,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 0.2348106838861168, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.4375, + "grad_norm": 0.58984375, "learning_rate": 9.800000000000001e-05, - "loss": 0.1531, - "macro_f1": 0.3272727429866791, + "loss": 0.1543, + "macro_f1": 0.32098767161369324, "num_tokens": 81980.0, "repeat_count": 1.0, - "routers_loss": 0.09979952871799469, + "routers_loss": 0.09669367223978043, "skip_count": 0.0, "step": 50, "text_loss": 0.6053179502487183 @@ -487,18 +487,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 0.2442031112415615, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.515625, + "grad_norm": 0.8515625, "learning_rate": 0.000102, - "loss": 0.1265, - "macro_f1": 0.3272727429866791, + "loss": 0.1393, + "macro_f1": 0.32098764181137085, "num_tokens": 85236.0, "repeat_count": 0.0, - "routers_loss": 0.05543195456266403, + "routers_loss": 0.12471720576286316, "skip_count": 0.0, "step": 52, "text_loss": 0.6027331948280334 @@ -511,13 +511,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.328125, + "grad_norm": 0.421875, "learning_rate": 0.000106, - "loss": 0.1436, + "loss": 0.1473, "macro_f1": 0.32098764181137085, "num_tokens": 88238.0, "repeat_count": 0.0, - "routers_loss": 0.15049344301223755, + "routers_loss": 0.1376056969165802, "skip_count": 2.0, "step": 54, "text_loss": 0.2861751616001129 @@ -530,13 +530,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.263671875, + "grad_norm": 0.35546875, "learning_rate": 0.00011, - "loss": 0.1021, + "loss": 0.1082, "macro_f1": 0.3333333432674408, "num_tokens": 91056.0, "repeat_count": 0.0, - "routers_loss": 0.07367338240146637, + "routers_loss": 0.07449393719434738, "skip_count": 0.0, "step": 56, "text_loss": 0.48106974363327026 @@ -544,18 +544,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 26.0, "epoch": 0.2723803933078955, - "f1_execute": 1.0, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25, + "grad_norm": 0.271484375, "learning_rate": 0.000114, - "loss": 0.114, - "macro_f1": 0.3333333432674408, + "loss": 0.1123, + "macro_f1": 0.32098764181137085, "num_tokens": 94987.0, "repeat_count": 0.0, - "routers_loss": 0.03782692551612854, + "routers_loss": 0.07064720243215561, "skip_count": 0.0, "step": 58, "text_loss": 0.3554874658584595 @@ -568,13 +568,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.333984375, + "grad_norm": 0.5390625, "learning_rate": 0.000118, - "loss": 0.1197, + "loss": 0.1234, "macro_f1": 0.32098764181137085, "num_tokens": 97909.0, "repeat_count": 0.0, - "routers_loss": 0.14074955880641937, + "routers_loss": 0.16835889220237732, "skip_count": 2.0, "step": 60, "text_loss": 0.5475804805755615 @@ -587,13 +587,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.2353515625, "learning_rate": 0.000122, - "loss": 0.1174, + "loss": 0.1224, "macro_f1": 0.3333333432674408, "num_tokens": 101043.0, "repeat_count": 0.0, - "routers_loss": 0.058013737201690674, + "routers_loss": 0.06127442046999931, "skip_count": 0.0, "step": 62, "text_loss": 0.5966938734054565 @@ -606,13 +606,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.212890625, "learning_rate": 0.000126, - "loss": 0.0911, + "loss": 0.0931, "macro_f1": 0.3333333432674408, "num_tokens": 104103.0, "repeat_count": 0.0, - "routers_loss": 0.04936821386218071, + "routers_loss": 0.047825805842876434, "skip_count": 0.0, "step": 64, "text_loss": 0.5480486750602722 @@ -625,13 +625,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.220703125, + "grad_norm": 0.2294921875, "learning_rate": 0.00013000000000000002, - "loss": 0.1107, + "loss": 0.1088, "macro_f1": 0.3006536364555359, "num_tokens": 107009.0, "repeat_count": 1.0, - "routers_loss": 0.2628525495529175, + "routers_loss": 0.275174081325531, "skip_count": 4.0, "step": 66, "text_loss": 0.41714492440223694 @@ -644,13 +644,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.1923828125, "learning_rate": 0.000134, - "loss": 0.1109, + "loss": 0.1123, "macro_f1": 0.3333333432674408, "num_tokens": 110486.0, "repeat_count": 0.0, - "routers_loss": 0.02859785594046116, + "routers_loss": 0.029025178402662277, "skip_count": 0.0, "step": 68, "text_loss": 0.6775627732276917 @@ -663,13 +663,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.298828125, + "grad_norm": 0.314453125, "learning_rate": 0.00013800000000000002, - "loss": 0.1067, + "loss": 0.1049, "macro_f1": 0.3272727429866791, "num_tokens": 113878.0, "repeat_count": 0.0, - "routers_loss": 0.10459086298942566, + "routers_loss": 0.10141710191965103, "skip_count": 1.0, "step": 70, "text_loss": 0.6678873896598816 @@ -682,13 +682,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2109375, + "grad_norm": 0.248046875, "learning_rate": 0.00014199999999999998, - "loss": 0.1166, + "loss": 0.1119, "macro_f1": 0.3272727429866791, "num_tokens": 116989.0, "repeat_count": 0.0, - "routers_loss": 0.0718551054596901, + "routers_loss": 0.08002066612243652, "skip_count": 1.0, "step": 72, "text_loss": 0.405692994594574 @@ -701,13 +701,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1787109375, "learning_rate": 0.000146, - "loss": 0.1007, + "loss": 0.0944, "macro_f1": 0.3144654333591461, "num_tokens": 119883.0, "repeat_count": 0.0, - "routers_loss": 0.1850946843624115, + "routers_loss": 0.1867009848356247, "skip_count": 3.0, "step": 74, "text_loss": 0.44616150856018066 @@ -720,13 +720,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.34375, + "grad_norm": 0.333984375, "learning_rate": 0.00015, - "loss": 0.1019, + "loss": 0.1003, "macro_f1": 0.32098764181137085, "num_tokens": 123325.0, "repeat_count": 0.0, - "routers_loss": 0.09809529036283493, + "routers_loss": 0.07042168825864792, "skip_count": 2.0, "step": 76, "text_loss": 0.11340200901031494 @@ -739,13 +739,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.259765625, + "grad_norm": 0.26171875, "learning_rate": 0.000154, - "loss": 0.1088, + "loss": 0.1066, "macro_f1": 0.32098764181137085, "num_tokens": 126131.0, "repeat_count": 0.0, - "routers_loss": 0.11277207732200623, + "routers_loss": 0.11535373330116272, "skip_count": 2.0, "step": 78, "text_loss": 0.3269135355949402 @@ -758,13 +758,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2412109375, + "grad_norm": 0.255859375, "learning_rate": 0.000158, - "loss": 0.0866, + "loss": 0.0891, "macro_f1": 0.3272727429866791, "num_tokens": 130349.0, "repeat_count": 0.0, - "routers_loss": 0.09079254418611526, + "routers_loss": 0.09497501701116562, "skip_count": 1.0, "step": 80, "text_loss": 0.15273472666740417 @@ -777,13 +777,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1943359375, "learning_rate": 0.000162, - "loss": 0.0928, + "loss": 0.0929, "macro_f1": 0.3333333432674408, "num_tokens": 133607.0, "repeat_count": 0.0, - "routers_loss": 0.02900076098740101, + "routers_loss": 0.030639523640275, "skip_count": 0.0, "step": 82, "text_loss": 0.282884806394577 @@ -796,13 +796,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.1806640625, "learning_rate": 0.00016600000000000002, - "loss": 0.1251, + "loss": 0.1254, "macro_f1": 0.3272727429866791, "num_tokens": 136694.0, "repeat_count": 0.0, - "routers_loss": 0.0763339251279831, + "routers_loss": 0.07906441390514374, "skip_count": 1.0, "step": 84, "text_loss": 0.459094375371933 @@ -817,11 +817,11 @@ "f1_skip": 0.0, "grad_norm": 0.212890625, "learning_rate": 0.00017, - "loss": 0.1064, + "loss": 0.1071, "macro_f1": 0.3144654333591461, "num_tokens": 139966.0, "repeat_count": 1.0, - "routers_loss": 0.13191410899162292, + "routers_loss": 0.1124570444226265, "skip_count": 2.0, "step": 86, "text_loss": 0.29985448718070984 @@ -834,13 +834,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.25390625, "learning_rate": 0.000174, - "loss": 0.1055, + "loss": 0.1031, "macro_f1": 0.32098764181137085, "num_tokens": 142788.0, "repeat_count": 2.0, - "routers_loss": 0.21200031042099, + "routers_loss": 0.1966402679681778, "skip_count": 0.0, "step": 88, "text_loss": 0.6435291767120361 @@ -853,13 +853,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.318359375, + "grad_norm": 0.349609375, "learning_rate": 0.000178, - "loss": 0.0971, + "loss": 0.0963, "macro_f1": 0.3333333432674408, "num_tokens": 146192.0, "repeat_count": 0.0, - "routers_loss": 0.031911369413137436, + "routers_loss": 0.0325632207095623, "skip_count": 0.0, "step": 90, "text_loss": 0.35170626640319824 @@ -872,13 +872,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.216796875, + "grad_norm": 0.2265625, "learning_rate": 0.000182, - "loss": 0.1056, + "loss": 0.1073, "macro_f1": 0.32098764181137085, "num_tokens": 149792.0, "repeat_count": 1.0, - "routers_loss": 0.14131835103034973, + "routers_loss": 0.15115146338939667, "skip_count": 1.0, "step": 92, "text_loss": 0.83159339427948 @@ -891,13 +891,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.205078125, "learning_rate": 0.000186, - "loss": 0.1059, + "loss": 0.1073, "macro_f1": 0.3333333432674408, "num_tokens": 152766.0, "repeat_count": 0.0, - "routers_loss": 0.04137955233454704, + "routers_loss": 0.043313540518283844, "skip_count": 0.0, "step": 94, "text_loss": 0.49707934260368347 @@ -910,13 +910,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.2138671875, "learning_rate": 0.00019, - "loss": 0.0934, + "loss": 0.0947, "macro_f1": 0.3333333432674408, "num_tokens": 156112.0, "repeat_count": 0.0, - "routers_loss": 0.03163003921508789, + "routers_loss": 0.032021280378103256, "skip_count": 0.0, "step": 96, "text_loss": 0.27608928084373474 @@ -929,13 +929,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.2099609375, "learning_rate": 0.000194, - "loss": 0.0847, + "loss": 0.0846, "macro_f1": 0.3076923191547394, "num_tokens": 159454.0, "repeat_count": 2.0, - "routers_loss": 0.2567490339279175, + "routers_loss": 0.24473154544830322, "skip_count": 2.0, "step": 98, "text_loss": 0.6026689410209656 @@ -948,13 +948,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.30859375, + "grad_norm": 0.271484375, "learning_rate": 0.00019800000000000002, - "loss": 0.1077, + "loss": 0.1028, "macro_f1": 0.32098764181137085, "num_tokens": 163661.0, "repeat_count": 0.0, - "routers_loss": 0.11468870937824249, + "routers_loss": 0.11468276381492615, "skip_count": 2.0, "step": 100, "text_loss": 0.46733155846595764 @@ -967,13 +967,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.17578125, + "grad_norm": 0.1806640625, "learning_rate": 0.000202, - "loss": 0.1131, + "loss": 0.1089, "macro_f1": 0.3333333432674408, "num_tokens": 167134.0, "repeat_count": 0.0, - "routers_loss": 0.02124219387769699, + "routers_loss": 0.021144939586520195, "skip_count": 0.0, "step": 102, "text_loss": 0.6362994909286499 @@ -986,13 +986,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.1943359375, "learning_rate": 0.000206, - "loss": 0.0624, + "loss": 0.0621, "macro_f1": 0.3272727429866791, "num_tokens": 170433.0, "repeat_count": 0.0, - "routers_loss": 0.06983796507120132, + "routers_loss": 0.06594710797071457, "skip_count": 1.0, "step": 104, "text_loss": 0.4515477120876312 @@ -1005,13 +1005,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.1591796875, "learning_rate": 0.00021, - "loss": 0.0951, + "loss": 0.0929, "macro_f1": 0.3333333432674408, "num_tokens": 173387.0, "repeat_count": 0.0, - "routers_loss": 0.03467355668544769, + "routers_loss": 0.032923027873039246, "skip_count": 0.0, "step": 106, "text_loss": 0.6638453006744385 @@ -1024,13 +1024,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.240234375, "learning_rate": 0.000214, - "loss": 0.0881, + "loss": 0.0883, "macro_f1": 0.3272727429866791, "num_tokens": 176170.0, "repeat_count": 1.0, - "routers_loss": 0.08142061531543732, + "routers_loss": 0.08034781366586685, "skip_count": 0.0, "step": 108, "text_loss": 1.186936855316162 @@ -1043,13 +1043,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.267578125, "learning_rate": 0.000218, - "loss": 0.0795, + "loss": 0.0794, "macro_f1": 0.3272727429866791, "num_tokens": 179877.0, "repeat_count": 0.0, - "routers_loss": 0.08327355235815048, + "routers_loss": 0.07814185321331024, "skip_count": 1.0, "step": 110, "text_loss": 0.5488709211349487 @@ -1062,13 +1062,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.2353515625, "learning_rate": 0.000222, - "loss": 0.0943, + "loss": 0.0946, "macro_f1": 0.3333333432674408, "num_tokens": 182726.0, "repeat_count": 0.0, - "routers_loss": 0.019890006631612778, + "routers_loss": 0.01884695515036583, "skip_count": 0.0, "step": 112, "text_loss": 0.5195863842964172 @@ -1081,13 +1081,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2001953125, + "grad_norm": 0.19921875, "learning_rate": 0.00022600000000000002, - "loss": 0.0933, + "loss": 0.0974, "macro_f1": 0.32098764181137085, "num_tokens": 185624.0, "repeat_count": 0.0, - "routers_loss": 0.09992363303899765, + "routers_loss": 0.09657823294401169, "skip_count": 2.0, "step": 114, "text_loss": 0.43858134746551514 @@ -1100,13 +1100,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.3046875, "learning_rate": 0.00023, - "loss": 0.0762, + "loss": 0.0753, "macro_f1": 0.3333333432674408, "num_tokens": 188155.0, "repeat_count": 0.0, - "routers_loss": 0.014119029976427555, + "routers_loss": 0.01463601179420948, "skip_count": 0.0, "step": 116, "text_loss": 0.392981618642807 @@ -1119,13 +1119,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.423828125, + "grad_norm": 0.439453125, "learning_rate": 0.00023400000000000002, - "loss": 0.0842, + "loss": 0.0843, "macro_f1": 0.3333333432674408, "num_tokens": 190970.0, "repeat_count": 0.0, - "routers_loss": 0.03976766765117645, + "routers_loss": 0.03859659656882286, "skip_count": 0.0, "step": 118, "text_loss": 0.309179425239563 @@ -1138,13 +1138,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.2255859375, "learning_rate": 0.00023799999999999998, - "loss": 0.0517, + "loss": 0.053, "macro_f1": 0.3333333432674408, "num_tokens": 193988.0, "repeat_count": 0.0, - "routers_loss": 0.017428619787096977, + "routers_loss": 0.019092386588454247, "skip_count": 0.0, "step": 120, "text_loss": 0.48543134331703186 @@ -1157,13 +1157,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.296875, + "grad_norm": 0.35546875, "learning_rate": 0.000242, - "loss": 0.1134, + "loss": 0.1203, "macro_f1": 0.3272727429866791, "num_tokens": 196475.0, "repeat_count": 0.0, - "routers_loss": 0.06965513527393341, + "routers_loss": 0.0619138665497303, "skip_count": 1.0, "step": 122, "text_loss": 0.4615364074707031 @@ -1176,13 +1176,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1875, "learning_rate": 0.000246, - "loss": 0.0984, + "loss": 0.1002, "macro_f1": 0.3272727429866791, "num_tokens": 200045.0, "repeat_count": 1.0, - "routers_loss": 0.10476501286029816, + "routers_loss": 0.09752107411623001, "skip_count": 0.0, "step": 124, "text_loss": 0.15802054107189178 @@ -1195,13 +1195,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.1728515625, "learning_rate": 0.00025, - "loss": 0.0771, + "loss": 0.0773, "macro_f1": 0.3333333432674408, "num_tokens": 203214.0, "repeat_count": 0.0, - "routers_loss": 0.028317544609308243, + "routers_loss": 0.02896115928888321, "skip_count": 0.0, "step": 126, "text_loss": 0.4543360471725464 @@ -1214,13 +1214,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.390625, + "grad_norm": 0.4296875, "learning_rate": 0.000254, - "loss": 0.0933, + "loss": 0.0973, "macro_f1": 0.3333333432674408, "num_tokens": 206168.0, "repeat_count": 0.0, - "routers_loss": 0.012766432017087936, + "routers_loss": 0.011423567309975624, "skip_count": 0.0, "step": 128, "text_loss": 0.4730179011821747 @@ -1233,13 +1233,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.353515625, + "grad_norm": 0.365234375, "learning_rate": 0.00025800000000000004, - "loss": 0.0989, + "loss": 0.099, "macro_f1": 0.3333333432674408, "num_tokens": 209907.0, "repeat_count": 0.0, - "routers_loss": 0.021400077268481255, + "routers_loss": 0.01957600563764572, "skip_count": 0.0, "step": 130, "text_loss": 0.45122358202934265 @@ -1252,13 +1252,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.2060546875, "learning_rate": 0.000262, - "loss": 0.0873, + "loss": 0.0868, "macro_f1": 0.3272727429866791, "num_tokens": 213521.0, "repeat_count": 0.0, - "routers_loss": 0.05025051161646843, + "routers_loss": 0.04882373288273811, "skip_count": 1.0, "step": 132, "text_loss": 0.4341491758823395 @@ -1271,13 +1271,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1611328125, + "grad_norm": 0.1708984375, "learning_rate": 0.000266, - "loss": 0.085, + "loss": 0.0834, "macro_f1": 0.3333333432674408, "num_tokens": 216484.0, "repeat_count": 0.0, - "routers_loss": 0.017420046031475067, + "routers_loss": 0.016083380207419395, "skip_count": 0.0, "step": 134, "text_loss": 0.46990111470222473 @@ -1290,13 +1290,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2041015625, + "grad_norm": 0.220703125, "learning_rate": 0.00027, - "loss": 0.086, + "loss": 0.0863, "macro_f1": 0.3333333432674408, "num_tokens": 219398.0, "repeat_count": 0.0, - "routers_loss": 0.018217921257019043, + "routers_loss": 0.01733536459505558, "skip_count": 0.0, "step": 136, "text_loss": 0.4455361068248749 @@ -1309,13 +1309,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1806640625, "learning_rate": 0.00027400000000000005, - "loss": 0.0985, + "loss": 0.0997, "macro_f1": 0.3333333432674408, "num_tokens": 222430.0, "repeat_count": 0.0, - "routers_loss": 0.012350660748779774, + "routers_loss": 0.01332803163677454, "skip_count": 0.0, "step": 138, "text_loss": 0.47699397802352905 @@ -1328,13 +1328,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.302734375, + "grad_norm": 0.333984375, "learning_rate": 0.00027800000000000004, "loss": 0.0922, "macro_f1": 0.3144654333591461, "num_tokens": 225458.0, "repeat_count": 1.0, - "routers_loss": 0.14993029832839966, + "routers_loss": 0.14924728870391846, "skip_count": 2.0, "step": 140, "text_loss": 0.5858222842216492 @@ -1347,13 +1347,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.251953125, + "grad_norm": 0.25, "learning_rate": 0.00028199999999999997, - "loss": 0.0791, + "loss": 0.0798, "macro_f1": 0.3144654333591461, "num_tokens": 229365.0, "repeat_count": 1.0, - "routers_loss": 0.17921413481235504, + "routers_loss": 0.1860177218914032, "skip_count": 2.0, "step": 142, "text_loss": 0.5003137588500977 @@ -1366,13 +1366,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.2294921875, "learning_rate": 0.00028599999999999996, - "loss": 0.0535, + "loss": 0.054, "macro_f1": 0.32098764181137085, "num_tokens": 231787.0, "repeat_count": 1.0, - "routers_loss": 0.1420905590057373, + "routers_loss": 0.16498211026191711, "skip_count": 1.0, "step": 144, "text_loss": 0.5026470422744751 @@ -1385,13 +1385,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.29296875, + "grad_norm": 0.306640625, "learning_rate": 0.00029, - "loss": 0.0956, + "loss": 0.0936, "macro_f1": 0.32098764181137085, "num_tokens": 235014.0, "repeat_count": 1.0, - "routers_loss": 0.12468750029802322, + "routers_loss": 0.11801310628652573, "skip_count": 1.0, "step": 146, "text_loss": 0.611888587474823 @@ -1404,13 +1404,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1806640625, "learning_rate": 0.000294, - "loss": 0.0879, + "loss": 0.0878, "macro_f1": 0.3333333432674408, "num_tokens": 238210.0, "repeat_count": 0.0, - "routers_loss": 0.024295611307024956, + "routers_loss": 0.02422776259481907, "skip_count": 0.0, "step": 148, "text_loss": 0.2876914143562317 @@ -1423,13 +1423,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.1728515625, "learning_rate": 0.000298, - "loss": 0.087, + "loss": 0.0858, "macro_f1": 0.32098764181137085, "num_tokens": 241582.0, "repeat_count": 0.0, - "routers_loss": 0.07016433775424957, + "routers_loss": 0.07282499223947525, "skip_count": 2.0, "step": 150, "text_loss": 0.3919292390346527 @@ -1442,13 +1442,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3828125, + "grad_norm": 0.37890625, "learning_rate": 0.000302, - "loss": 0.0782, + "loss": 0.0797, "macro_f1": 0.32098764181137085, "num_tokens": 244621.0, "repeat_count": 1.0, - "routers_loss": 0.18942493200302124, + "routers_loss": 0.20659038424491882, "skip_count": 1.0, "step": 152, "text_loss": 0.4294498860836029 @@ -1461,13 +1461,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1787109375, "learning_rate": 0.000306, - "loss": 0.0713, + "loss": 0.072, "macro_f1": 0.3333333432674408, "num_tokens": 247833.0, "repeat_count": 0.0, - "routers_loss": 0.02319060079753399, + "routers_loss": 0.02428400330245495, "skip_count": 0.0, "step": 154, "text_loss": 0.5930765867233276 @@ -1480,13 +1480,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.15234375, + "grad_norm": 0.1533203125, "learning_rate": 0.00031, - "loss": 0.0778, + "loss": 0.0772, "macro_f1": 0.3333333432674408, "num_tokens": 251349.0, "repeat_count": 0.0, - "routers_loss": 0.01764747127890587, + "routers_loss": 0.0167869683355093, "skip_count": 0.0, "step": 156, "text_loss": 0.41063904762268066 @@ -1499,13 +1499,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1572265625, "learning_rate": 0.000314, - "loss": 0.0829, + "loss": 0.0821, "macro_f1": 0.3333333432674408, "num_tokens": 254886.0, "repeat_count": 0.0, - "routers_loss": 0.02268100716173649, + "routers_loss": 0.02531604655086994, "skip_count": 0.0, "step": 158, "text_loss": 0.6739020347595215 @@ -1518,13 +1518,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.201171875, "learning_rate": 0.00031800000000000003, - "loss": 0.0889, + "loss": 0.09, "macro_f1": 0.3333333432674408, "num_tokens": 258260.0, "repeat_count": 0.0, - "routers_loss": 0.016952091827988625, + "routers_loss": 0.017772775143384933, "skip_count": 0.0, "step": 160, "text_loss": 0.46873849630355835 @@ -1537,13 +1537,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2216796875, + "grad_norm": 0.224609375, "learning_rate": 0.000322, - "loss": 0.0923, + "loss": 0.0893, "macro_f1": 0.3272727429866791, "num_tokens": 261846.0, "repeat_count": 0.0, - "routers_loss": 0.03669808804988861, + "routers_loss": 0.034902360290288925, "skip_count": 1.0, "step": 162, "text_loss": 0.3727971017360687 @@ -1556,13 +1556,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.212890625, "learning_rate": 0.000326, - "loss": 0.0769, + "loss": 0.076, "macro_f1": 0.3333333432674408, "num_tokens": 264348.0, "repeat_count": 0.0, - "routers_loss": 0.012101447209715843, + "routers_loss": 0.013553355820477009, "skip_count": 0.0, "step": 164, "text_loss": 0.5798237323760986 @@ -1575,13 +1575,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.37109375, + "grad_norm": 0.408203125, "learning_rate": 0.00033, - "loss": 0.0897, + "loss": 0.0926, "macro_f1": 0.32098764181137085, "num_tokens": 267479.0, "repeat_count": 1.0, - "routers_loss": 0.1562056541442871, + "routers_loss": 0.13571743667125702, "skip_count": 1.0, "step": 166, "text_loss": 0.8084776997566223 @@ -1594,13 +1594,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.2431640625, "learning_rate": 0.00033400000000000004, - "loss": 0.0829, + "loss": 0.0817, "macro_f1": 0.32098764181137085, "num_tokens": 270268.0, "repeat_count": 2.0, - "routers_loss": 0.20807914435863495, + "routers_loss": 0.19884146749973297, "skip_count": 0.0, "step": 168, "text_loss": 0.7366134524345398 @@ -1613,13 +1613,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2236328125, + "grad_norm": 0.267578125, "learning_rate": 0.00033800000000000003, - "loss": 0.0987, + "loss": 0.1022, "macro_f1": 0.32098764181137085, "num_tokens": 273518.0, "repeat_count": 1.0, - "routers_loss": 0.1530539095401764, + "routers_loss": 0.15469175577163696, "skip_count": 1.0, "step": 170, "text_loss": 0.27204006910324097 @@ -1632,13 +1632,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.17578125, "learning_rate": 0.000342, - "loss": 0.087, + "loss": 0.0865, "macro_f1": 0.32098764181137085, "num_tokens": 277210.0, "repeat_count": 0.0, - "routers_loss": 0.08004544675350189, + "routers_loss": 0.08603330701589584, "skip_count": 2.0, "step": 172, "text_loss": 0.7137667536735535 @@ -1651,13 +1651,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1767578125, + "grad_norm": 0.189453125, "learning_rate": 0.000346, - "loss": 0.0916, + "loss": 0.0902, "macro_f1": 0.3076923191547394, "num_tokens": 280389.0, "repeat_count": 0.0, - "routers_loss": 0.19228078424930573, + "routers_loss": 0.17851492762565613, "skip_count": 4.0, "step": 174, "text_loss": 0.5148105621337891 @@ -1670,13 +1670,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1611328125, + "grad_norm": 0.1494140625, "learning_rate": 0.00035, - "loss": 0.0863, + "loss": 0.0853, "macro_f1": 0.3333333432674408, "num_tokens": 283501.0, "repeat_count": 0.0, - "routers_loss": 0.024507170543074608, + "routers_loss": 0.021331604570150375, "skip_count": 0.0, "step": 176, "text_loss": 0.301013320684433 @@ -1689,13 +1689,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.2158203125, "learning_rate": 0.000354, - "loss": 0.0898, + "loss": 0.0911, "macro_f1": 0.32098764181137085, "num_tokens": 287154.0, "repeat_count": 0.0, - "routers_loss": 0.05055495724081993, + "routers_loss": 0.057273946702480316, "skip_count": 2.0, "step": 178, "text_loss": 0.4740981459617615 @@ -1708,13 +1708,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.240234375, "learning_rate": 0.000358, - "loss": 0.0865, + "loss": 0.0904, "macro_f1": 0.3272727429866791, "num_tokens": 289929.0, "repeat_count": 0.0, - "routers_loss": 0.03999815881252289, + "routers_loss": 0.04116598889231682, "skip_count": 1.0, "step": 180, "text_loss": 0.4838573932647705 @@ -1727,13 +1727,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.14453125, "learning_rate": 0.000362, - "loss": 0.0983, + "loss": 0.0991, "macro_f1": 0.3333333432674408, "num_tokens": 294293.0, "repeat_count": 0.0, - "routers_loss": 0.025158070027828217, + "routers_loss": 0.027111956849694252, "skip_count": 0.0, "step": 182, "text_loss": 0.7495553493499756 @@ -1746,32 +1746,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.158203125, "learning_rate": 0.000366, - "loss": 0.1015, + "loss": 0.1038, "macro_f1": 0.3333333432674408, "num_tokens": 297730.0, "repeat_count": 0.0, - "routers_loss": 0.01825365424156189, + "routers_loss": 0.019166452810168266, "skip_count": 0.0, "step": 184, "text_loss": 0.534831166267395 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 0.8734957440563546, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.2158203125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2236328125, "learning_rate": 0.00037, - "loss": 0.0736, - "macro_f1": 0.3144654333591461, + "loss": 0.0784, + "macro_f1": 0.5427350401878357, "num_tokens": 300593.0, "repeat_count": 1.0, - "routers_loss": 0.22729666531085968, + "routers_loss": 0.2349659502506256, "skip_count": 2.0, "step": 186, "text_loss": 0.3549048602581024 @@ -1784,13 +1784,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.2041015625, "learning_rate": 0.000374, - "loss": 0.0838, + "loss": 0.0827, "macro_f1": 0.3076923191547394, "num_tokens": 303456.0, "repeat_count": 2.0, - "routers_loss": 0.24516475200653076, + "routers_loss": 0.22502389550209045, "skip_count": 2.0, "step": 188, "text_loss": 0.8837642073631287 @@ -1803,13 +1803,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2470703125, + "grad_norm": 0.271484375, "learning_rate": 0.000378, - "loss": 0.1056, + "loss": 0.1085, "macro_f1": 0.3272727429866791, "num_tokens": 306241.0, "repeat_count": 1.0, - "routers_loss": 0.1307530701160431, + "routers_loss": 0.12291611731052399, "skip_count": 0.0, "step": 190, "text_loss": 0.73353511095047 @@ -1822,13 +1822,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.15625, "learning_rate": 0.000382, - "loss": 0.0961, + "loss": 0.0969, "macro_f1": 0.3272727429866791, "num_tokens": 310606.0, "repeat_count": 0.0, - "routers_loss": 0.06541688740253448, + "routers_loss": 0.055988848209381104, "skip_count": 1.0, "step": 192, "text_loss": 0.6261917352676392 @@ -1841,13 +1841,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.333984375, + "grad_norm": 0.34375, "learning_rate": 0.000386, - "loss": 0.1058, + "loss": 0.1055, "macro_f1": 0.3144654333591461, "num_tokens": 313564.0, "repeat_count": 0.0, - "routers_loss": 0.12492545694112778, + "routers_loss": 0.12363404780626297, "skip_count": 3.0, "step": 194, "text_loss": 0.2790874242782593 @@ -1860,13 +1860,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28515625, + "grad_norm": 0.27734375, "learning_rate": 0.00039000000000000005, - "loss": 0.0966, + "loss": 0.0964, "macro_f1": 0.3076923191547394, "num_tokens": 316958.0, "repeat_count": 2.0, - "routers_loss": 0.2838033139705658, + "routers_loss": 0.2718356251716614, "skip_count": 2.0, "step": 196, "text_loss": 0.14428086578845978 @@ -1881,11 +1881,11 @@ "f1_skip": 0.0, "grad_norm": 0.2021484375, "learning_rate": 0.00039400000000000004, - "loss": 0.0929, + "loss": 0.0917, "macro_f1": 0.32098764181137085, "num_tokens": 320103.0, "repeat_count": 0.0, - "routers_loss": 0.07692629098892212, + "routers_loss": 0.07188102602958679, "skip_count": 2.0, "step": 198, "text_loss": 0.27155816555023193 @@ -1898,13 +1898,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.201171875, "learning_rate": 0.000398, "loss": 0.0809, "macro_f1": 0.32098764181137085, "num_tokens": 323566.0, "repeat_count": 1.0, - "routers_loss": 0.18504399061203003, + "routers_loss": 0.18038256466388702, "skip_count": 1.0, "step": 200, "text_loss": 0.8453494310379028 @@ -1917,13 +1917,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.2490234375, "learning_rate": 0.000402, - "loss": 0.078, + "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 326385.0, "repeat_count": 0.0, - "routers_loss": 0.014647359028458595, + "routers_loss": 0.014639763161540031, "skip_count": 0.0, "step": 202, "text_loss": 0.5733131766319275 @@ -1936,13 +1936,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2041015625, + "grad_norm": 0.21875, "learning_rate": 0.00040600000000000006, - "loss": 0.1028, + "loss": 0.104, "macro_f1": 0.3333333432674408, "num_tokens": 329266.0, "repeat_count": 0.0, - "routers_loss": 0.017848484218120575, + "routers_loss": 0.015269627794623375, "skip_count": 0.0, "step": 204, "text_loss": 0.7355639934539795 @@ -1955,13 +1955,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.27734375, "learning_rate": 0.00041, - "loss": 0.0832, + "loss": 0.0833, "macro_f1": 0.3333333432674408, "num_tokens": 332984.0, "repeat_count": 0.0, - "routers_loss": 0.01900508813560009, + "routers_loss": 0.018046971410512924, "skip_count": 0.0, "step": 206, "text_loss": 0.587641179561615 @@ -1974,13 +1974,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.166015625, + "grad_norm": 0.185546875, "learning_rate": 0.000414, "loss": 0.0588, "macro_f1": 0.3272727429866791, "num_tokens": 335739.0, "repeat_count": 1.0, - "routers_loss": 0.13018715381622314, + "routers_loss": 0.12791286408901215, "skip_count": 0.0, "step": 208, "text_loss": 0.6538406610488892 @@ -1993,13 +1993,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.24609375, "learning_rate": 0.00041799999999999997, - "loss": 0.0697, + "loss": 0.0732, "macro_f1": 0.3272727429866791, "num_tokens": 338966.0, "repeat_count": 0.0, - "routers_loss": 0.055288366973400116, + "routers_loss": 0.050490595400333405, "skip_count": 1.0, "step": 210, "text_loss": 0.4188295602798462 @@ -2012,13 +2012,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.271484375, "learning_rate": 0.000422, - "loss": 0.0576, + "loss": 0.0588, "macro_f1": 0.3144654333591461, "num_tokens": 342063.0, "repeat_count": 0.0, - "routers_loss": 0.10952572524547577, + "routers_loss": 0.11652113497257233, "skip_count": 3.0, "step": 212, "text_loss": 0.21822240948677063 @@ -2031,13 +2031,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.197265625, + "grad_norm": 0.2060546875, "learning_rate": 0.000426, - "loss": 0.062, + "loss": 0.0621, "macro_f1": 0.3333333432674408, "num_tokens": 344887.0, "repeat_count": 0.0, - "routers_loss": 0.02415696159005165, + "routers_loss": 0.023898238316178322, "skip_count": 0.0, "step": 214, "text_loss": 0.24692800641059875 @@ -2050,13 +2050,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.353515625, + "grad_norm": 0.3671875, "learning_rate": 0.00043, - "loss": 0.1011, + "loss": 0.1005, "macro_f1": 0.3272727429866791, "num_tokens": 348700.0, "repeat_count": 1.0, - "routers_loss": 0.06956391036510468, + "routers_loss": 0.06414655596017838, "skip_count": 0.0, "step": 216, "text_loss": 0.4744548797607422 @@ -2069,13 +2069,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1904296875, + "grad_norm": 0.1962890625, "learning_rate": 0.00043400000000000003, - "loss": 0.076, + "loss": 0.0753, "macro_f1": 0.32098764181137085, "num_tokens": 351507.0, "repeat_count": 1.0, - "routers_loss": 0.1140352189540863, + "routers_loss": 0.11702914535999298, "skip_count": 1.0, "step": 218, "text_loss": 0.5614864826202393 @@ -2090,11 +2090,11 @@ "f1_skip": 0.0, "grad_norm": 0.189453125, "learning_rate": 0.000438, - "loss": 0.0788, + "loss": 0.0792, "macro_f1": 0.3333333432674408, "num_tokens": 354484.0, "repeat_count": 0.0, - "routers_loss": 0.011621571145951748, + "routers_loss": 0.014991643838584423, "skip_count": 0.0, "step": 220, "text_loss": 0.47209832072257996 @@ -2107,13 +2107,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.240234375, + "grad_norm": 0.251953125, "learning_rate": 0.000442, "loss": 0.106, "macro_f1": 0.3272727429866791, "num_tokens": 357954.0, "repeat_count": 0.0, - "routers_loss": 0.05813701078295708, + "routers_loss": 0.04747112840414047, "skip_count": 1.0, "step": 222, "text_loss": 0.2968728244304657 @@ -2126,13 +2126,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.357421875, + "grad_norm": 0.40234375, "learning_rate": 0.000446, - "loss": 0.0827, + "loss": 0.0853, "macro_f1": 0.32098764181137085, "num_tokens": 360547.0, "repeat_count": 0.0, - "routers_loss": 0.0646885335445404, + "routers_loss": 0.06754162162542343, "skip_count": 2.0, "step": 224, "text_loss": 0.2364148646593094 @@ -2145,13 +2145,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.244140625, + "grad_norm": 0.2412109375, "learning_rate": 0.00045000000000000004, - "loss": 0.1011, + "loss": 0.1016, "macro_f1": 0.3272727429866791, "num_tokens": 364529.0, "repeat_count": 0.0, - "routers_loss": 0.07224348932504654, + "routers_loss": 0.07830183953046799, "skip_count": 1.0, "step": 226, "text_loss": 0.4787476360797882 @@ -2164,13 +2164,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.1953125, "learning_rate": 0.00045400000000000003, - "loss": 0.0781, + "loss": 0.0792, "macro_f1": 0.3333333432674408, "num_tokens": 367683.0, "repeat_count": 0.0, - "routers_loss": 0.015971746295690536, + "routers_loss": 0.015735948458313942, "skip_count": 0.0, "step": 228, "text_loss": 0.37148505449295044 @@ -2183,13 +2183,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.25, "learning_rate": 0.000458, - "loss": 0.099, + "loss": 0.0995, "macro_f1": 0.3333333432674408, "num_tokens": 371402.0, "repeat_count": 0.0, - "routers_loss": 0.017818331718444824, + "routers_loss": 0.013354359194636345, "skip_count": 0.0, "step": 230, "text_loss": 0.7464763522148132 @@ -2202,13 +2202,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1494140625, "learning_rate": 0.000462, - "loss": 0.0757, + "loss": 0.0731, "macro_f1": 0.3333333432674408, "num_tokens": 374587.0, "repeat_count": 0.0, - "routers_loss": 0.01582280732691288, + "routers_loss": 0.013763721100986004, "skip_count": 0.0, "step": 232, "text_loss": 0.8754443526268005 @@ -2221,13 +2221,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.42578125, + "grad_norm": 0.3984375, "learning_rate": 0.00046600000000000005, - "loss": 0.0876, + "loss": 0.0861, "macro_f1": 0.3333333432674408, "num_tokens": 377513.0, "repeat_count": 0.0, - "routers_loss": 0.011417915113270283, + "routers_loss": 0.010075435042381287, "skip_count": 0.0, "step": 234, "text_loss": 0.31534913182258606 @@ -2240,13 +2240,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.17578125, "learning_rate": 0.00047, - "loss": 0.0801, + "loss": 0.0791, "macro_f1": 0.3272727429866791, "num_tokens": 380736.0, "repeat_count": 0.0, - "routers_loss": 0.05787832289934158, + "routers_loss": 0.059825167059898376, "skip_count": 1.0, "step": 236, "text_loss": 0.5936337113380432 @@ -2259,13 +2259,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.236328125, + "grad_norm": 0.267578125, "learning_rate": 0.000474, - "loss": 0.0508, + "loss": 0.0514, "macro_f1": 0.32098764181137085, "num_tokens": 383236.0, "repeat_count": 0.0, - "routers_loss": 0.09476690739393234, + "routers_loss": 0.09134846180677414, "skip_count": 2.0, "step": 238, "text_loss": 0.5976157784461975 @@ -2278,13 +2278,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.208984375, "learning_rate": 0.00047799999999999996, - "loss": 0.0833, + "loss": 0.0858, "macro_f1": 0.32098764181137085, "num_tokens": 385778.0, "repeat_count": 1.0, - "routers_loss": 0.1099705696105957, + "routers_loss": 0.11989791691303253, "skip_count": 1.0, "step": 240, "text_loss": 0.3554210960865021 @@ -2297,13 +2297,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.171875, "learning_rate": 0.000482, - "loss": 0.0745, + "loss": 0.0734, "macro_f1": 0.3333333432674408, "num_tokens": 388777.0, "repeat_count": 0.0, - "routers_loss": 0.01269970741122961, + "routers_loss": 0.013591105118393898, "skip_count": 0.0, "step": 242, "text_loss": 0.4829460382461548 @@ -2316,13 +2316,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11962890625, + "grad_norm": 0.12060546875, "learning_rate": 0.000486, - "loss": 0.061, + "loss": 0.0625, "macro_f1": 0.32098764181137085, "num_tokens": 391797.0, "repeat_count": 0.0, - "routers_loss": 0.08505752682685852, + "routers_loss": 0.0920003354549408, "skip_count": 2.0, "step": 244, "text_loss": 0.3085818886756897 @@ -2335,13 +2335,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1552734375, "learning_rate": 0.00049, - "loss": 0.0504, + "loss": 0.0501, "macro_f1": 0.3333333432674408, "num_tokens": 396485.0, "repeat_count": 0.0, - "routers_loss": 0.012750142253935337, + "routers_loss": 0.0129330949857831, "skip_count": 0.0, "step": 246, "text_loss": 0.42803969979286194 @@ -2354,13 +2354,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.291015625, + "grad_norm": 0.296875, "learning_rate": 0.000494, - "loss": 0.0962, + "loss": 0.0945, "macro_f1": 0.3144654333591461, "num_tokens": 399923.0, "repeat_count": 0.0, - "routers_loss": 0.11287309974431992, + "routers_loss": 0.10677755624055862, "skip_count": 3.0, "step": 248, "text_loss": 0.2908555567264557 @@ -2373,32 +2373,32 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.203125, "learning_rate": 0.000498, - "loss": 0.0821, + "loss": 0.0812, "macro_f1": 0.3144654333591461, "num_tokens": 403647.0, "repeat_count": 0.0, - "routers_loss": 0.1486474722623825, + "routers_loss": 0.1504337340593338, "skip_count": 3.0, "step": 250, "text_loss": 0.333095908164978 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 1.183152333431171, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, + "f1_skip": 0.0, "grad_norm": 0.22265625, "learning_rate": 0.0005020000000000001, - "loss": 0.0832, - "macro_f1": 0.5492662787437439, + "loss": 0.0828, + "macro_f1": 0.32098764181137085, "num_tokens": 409147.0, "repeat_count": 0.0, - "routers_loss": 0.06636594980955124, + "routers_loss": 0.06503184884786606, "skip_count": 2.0, "step": 252, "text_loss": 0.16117942333221436 @@ -2411,13 +2411,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.267578125, + "grad_norm": 0.287109375, "learning_rate": 0.000506, - "loss": 0.1, + "loss": 0.0995, "macro_f1": 0.3333333432674408, "num_tokens": 412072.0, "repeat_count": 0.0, - "routers_loss": 0.015062150545418262, + "routers_loss": 0.016280122101306915, "skip_count": 0.0, "step": 254, "text_loss": 0.4217492640018463 @@ -2430,13 +2430,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2138671875, + "grad_norm": 0.21484375, "learning_rate": 0.00051, - "loss": 0.0808, + "loss": 0.0803, "macro_f1": 0.3144654333591461, "num_tokens": 415052.0, "repeat_count": 2.0, - "routers_loss": 0.2051105946302414, + "routers_loss": 0.2117508500814438, "skip_count": 1.0, "step": 256, "text_loss": 0.5795308947563171 @@ -2449,13 +2449,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2412109375, + "grad_norm": 0.2421875, "learning_rate": 0.000514, - "loss": 0.068, + "loss": 0.0668, "macro_f1": 0.3272727429866791, "num_tokens": 418099.0, "repeat_count": 1.0, - "routers_loss": 0.1467045396566391, + "routers_loss": 0.15002092719078064, "skip_count": 0.0, "step": 258, "text_loss": 0.4840938448905945 @@ -2468,13 +2468,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1533203125, "learning_rate": 0.000518, - "loss": 0.0543, + "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 422526.0, "repeat_count": 0.0, - "routers_loss": 0.013022038154304028, + "routers_loss": 0.012834074907004833, "skip_count": 0.0, "step": 260, "text_loss": 0.36141225695610046 @@ -2487,13 +2487,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.216796875, + "grad_norm": 0.2294921875, "learning_rate": 0.000522, - "loss": 0.0848, + "loss": 0.085, "macro_f1": 0.3076923191547394, "num_tokens": 425765.0, "repeat_count": 2.0, - "routers_loss": 0.2575930058956146, + "routers_loss": 0.23808011412620544, "skip_count": 2.0, "step": 262, "text_loss": 0.27572691440582275 @@ -2506,13 +2506,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.17578125, "learning_rate": 0.000526, - "loss": 0.07, + "loss": 0.0708, "macro_f1": 0.3272727429866791, "num_tokens": 429048.0, "repeat_count": 0.0, - "routers_loss": 0.0558602549135685, + "routers_loss": 0.055687375366687775, "skip_count": 1.0, "step": 264, "text_loss": 0.37020301818847656 @@ -2525,13 +2525,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.2080078125, "learning_rate": 0.0005300000000000001, - "loss": 0.082, + "loss": 0.0839, "macro_f1": 0.3272727429866791, "num_tokens": 431784.0, "repeat_count": 0.0, - "routers_loss": 0.09126655012369156, + "routers_loss": 0.0872957780957222, "skip_count": 1.0, "step": 266, "text_loss": 0.5937283039093018 @@ -2544,13 +2544,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.263671875, "learning_rate": 0.0005340000000000001, - "loss": 0.0764, + "loss": 0.0733, "macro_f1": 0.32098764181137085, "num_tokens": 434297.0, "repeat_count": 2.0, - "routers_loss": 0.24805288016796112, + "routers_loss": 0.23507654666900635, "skip_count": 0.0, "step": 268, "text_loss": 0.3367372453212738 @@ -2563,13 +2563,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.22265625, + "grad_norm": 0.2431640625, "learning_rate": 0.0005380000000000001, - "loss": 0.0686, + "loss": 0.0708, "macro_f1": 0.32098764181137085, "num_tokens": 437586.0, "repeat_count": 0.0, - "routers_loss": 0.13135533034801483, + "routers_loss": 0.12860390543937683, "skip_count": 2.0, "step": 270, "text_loss": 0.7149854302406311 @@ -2582,13 +2582,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.2451171875, "learning_rate": 0.0005420000000000001, - "loss": 0.1083, + "loss": 0.1072, "macro_f1": 0.3272727429866791, "num_tokens": 440649.0, "repeat_count": 0.0, - "routers_loss": 0.04991440102458, + "routers_loss": 0.044308312237262726, "skip_count": 1.0, "step": 272, "text_loss": 0.26778292655944824 @@ -2601,13 +2601,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.455078125, + "grad_norm": 0.44921875, "learning_rate": 0.000546, - "loss": 0.0991, + "loss": 0.0938, "macro_f1": 0.3144654333591461, "num_tokens": 443907.0, "repeat_count": 0.0, - "routers_loss": 0.12236632406711578, + "routers_loss": 0.11514109373092651, "skip_count": 3.0, "step": 274, "text_loss": 0.23578761518001556 @@ -2620,13 +2620,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.25, + "grad_norm": 0.2578125, "learning_rate": 0.00055, - "loss": 0.0936, + "loss": 0.0932, "macro_f1": 0.5492662787437439, "num_tokens": 447147.0, "repeat_count": 0.0, - "routers_loss": 0.053506772965192795, + "routers_loss": 0.055705297738313675, "skip_count": 2.0, "step": 276, "text_loss": 0.2513524889945984 @@ -2639,13 +2639,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.265625, + "grad_norm": 0.29296875, "learning_rate": 0.000554, - "loss": 0.066, + "loss": 0.0667, "macro_f1": 0.32098764181137085, "num_tokens": 450032.0, "repeat_count": 0.0, - "routers_loss": 0.13446088135242462, + "routers_loss": 0.13778971135616302, "skip_count": 2.0, "step": 278, "text_loss": 0.4857243597507477 @@ -2658,32 +2658,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.185546875, "learning_rate": 0.000558, - "loss": 0.0682, + "loss": 0.0672, "macro_f1": 0.3272727429866791, "num_tokens": 453195.0, "repeat_count": 1.0, - "routers_loss": 0.07270720601081848, + "routers_loss": 0.0700262188911438, "skip_count": 0.0, "step": 280, "text_loss": 0.7589789628982544 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 1.3240387437628411, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.943396270275116, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.28125, + "f1_skip": 0.0, + "grad_norm": 0.25, "learning_rate": 0.0005620000000000001, - "loss": 0.0648, - "macro_f1": 0.5427350401878357, + "loss": 0.0603, + "macro_f1": 0.3144654333591461, "num_tokens": 455942.0, "repeat_count": 1.0, - "routers_loss": 0.13866399228572845, + "routers_loss": 0.11706235259771347, "skip_count": 2.0, "step": 282, "text_loss": 0.4783432185649872 @@ -2696,13 +2696,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.236328125, + "grad_norm": 0.265625, "learning_rate": 0.000566, - "loss": 0.0782, + "loss": 0.0793, "macro_f1": 0.3272727429866791, "num_tokens": 458932.0, "repeat_count": 0.0, - "routers_loss": 0.0645354762673378, + "routers_loss": 0.07073967158794403, "skip_count": 1.0, "step": 284, "text_loss": 0.7117193937301636 @@ -2715,13 +2715,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1650390625, "learning_rate": 0.00057, - "loss": 0.0892, + "loss": 0.0915, "macro_f1": 0.3272727429866791, "num_tokens": 462650.0, "repeat_count": 0.0, - "routers_loss": 0.05967628210783005, + "routers_loss": 0.05301115661859512, "skip_count": 1.0, "step": 286, "text_loss": 0.4175460636615753 @@ -2734,13 +2734,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23828125, + "grad_norm": 0.2158203125, "learning_rate": 0.000574, - "loss": 0.0676, + "loss": 0.0675, "macro_f1": 0.3272727429866791, "num_tokens": 466290.0, "repeat_count": 0.0, - "routers_loss": 0.06438407301902771, + "routers_loss": 0.06356479972600937, "skip_count": 1.0, "step": 288, "text_loss": 0.5832946300506592 @@ -2753,13 +2753,13 @@ "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.28515625, "learning_rate": 0.000578, - "loss": 0.0781, + "loss": 0.0805, "macro_f1": 0.3006536066532135, "num_tokens": 469296.0, "repeat_count": 1.0, - "routers_loss": 0.21225209534168243, + "routers_loss": 0.21032999455928802, "skip_count": 3.0, "step": 290, "text_loss": 0.36023473739624023 @@ -2772,13 +2772,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.244140625, + "grad_norm": 0.27734375, "learning_rate": 0.0005819999999999999, - "loss": 0.0664, + "loss": 0.0685, "macro_f1": 0.32098764181137085, "num_tokens": 472272.0, "repeat_count": 1.0, - "routers_loss": 0.08085516840219498, + "routers_loss": 0.08062280714511871, "skip_count": 1.0, "step": 292, "text_loss": 0.37197956442832947 @@ -2791,13 +2791,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.28125, "learning_rate": 0.0005859999999999999, - "loss": 0.0874, + "loss": 0.0878, "macro_f1": 0.32098764181137085, "num_tokens": 475864.0, "repeat_count": 0.0, - "routers_loss": 0.05378658324480057, + "routers_loss": 0.05023600533604622, "skip_count": 2.0, "step": 294, "text_loss": 0.4765273630619049 @@ -2810,13 +2810,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.2177734375, "learning_rate": 0.00059, - "loss": 0.0715, + "loss": 0.0728, "macro_f1": 0.3333333432674408, "num_tokens": 478916.0, "repeat_count": 0.0, - "routers_loss": 0.01145261898636818, + "routers_loss": 0.011689410544931889, "skip_count": 0.0, "step": 296, "text_loss": 0.5878773927688599 @@ -2831,11 +2831,11 @@ "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.000594, - "loss": 0.0737, + "loss": 0.0727, "macro_f1": 0.3333333432674408, "num_tokens": 482369.0, "repeat_count": 0.0, - "routers_loss": 0.009397956542670727, + "routers_loss": 0.010772093199193478, "skip_count": 0.0, "step": 298, "text_loss": 0.4424116313457489 @@ -2848,13 +2848,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.181640625, "learning_rate": 0.000598, - "loss": 0.0802, + "loss": 0.0787, "macro_f1": 0.3076923191547394, "num_tokens": 486049.0, "repeat_count": 2.0, - "routers_loss": 0.2389357089996338, + "routers_loss": 0.23482851684093475, "skip_count": 2.0, "step": 300, "text_loss": 0.21217775344848633 @@ -2862,18 +2862,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 1.417963017317288, - "f1_execute": 0.9019607901573181, + "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.2080078125, "learning_rate": 0.000602, - "loss": 0.0745, - "macro_f1": 0.3006536066532135, + "loss": 0.073, + "macro_f1": 0.3076923191547394, "num_tokens": 488683.0, "repeat_count": 1.0, - "routers_loss": 0.18252353370189667, + "routers_loss": 0.18843084573745728, "skip_count": 3.0, "step": 302, "text_loss": 0.2109498232603073 @@ -2886,13 +2886,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.27734375, + "grad_norm": 0.279296875, "learning_rate": 0.000606, - "loss": 0.0935, + "loss": 0.0945, "macro_f1": 0.3144654333591461, "num_tokens": 492010.0, "repeat_count": 0.0, - "routers_loss": 0.18185268342494965, + "routers_loss": 0.17861786484718323, "skip_count": 3.0, "step": 304, "text_loss": 0.8446305394172668 @@ -2905,13 +2905,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.1943359375, "learning_rate": 0.00061, - "loss": 0.0853, + "loss": 0.0827, "macro_f1": 0.3333333432674408, "num_tokens": 494764.0, "repeat_count": 0.0, - "routers_loss": 0.013210167177021503, + "routers_loss": 0.014124520123004913, "skip_count": 0.0, "step": 306, "text_loss": 0.742735743522644 @@ -2924,13 +2924,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.26953125, "learning_rate": 0.000614, - "loss": 0.1089, + "loss": 0.1071, "macro_f1": 0.3333333432674408, "num_tokens": 497820.0, "repeat_count": 0.0, - "routers_loss": 0.016936838626861572, + "routers_loss": 0.017968112602829933, "skip_count": 0.0, "step": 308, "text_loss": 0.28305482864379883 @@ -2943,13 +2943,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.1689453125, "learning_rate": 0.0006180000000000001, - "loss": 0.077, + "loss": 0.0775, "macro_f1": 0.32098764181137085, "num_tokens": 500694.0, "repeat_count": 0.0, - "routers_loss": 0.08630389720201492, + "routers_loss": 0.08593655377626419, "skip_count": 2.0, "step": 310, "text_loss": 0.3496848940849304 @@ -2962,13 +2962,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.19140625, "learning_rate": 0.000622, - "loss": 0.0602, + "loss": 0.061, "macro_f1": 0.3333333432674408, "num_tokens": 503871.0, "repeat_count": 0.0, - "routers_loss": 0.013665963895618916, + "routers_loss": 0.016449492424726486, "skip_count": 0.0, "step": 312, "text_loss": 0.6691372990608215 @@ -2981,13 +2981,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.205078125, "learning_rate": 0.000626, - "loss": 0.0794, + "loss": 0.0815, "macro_f1": 0.3333333432674408, "num_tokens": 506730.0, "repeat_count": 0.0, - "routers_loss": 0.01584783010184765, + "routers_loss": 0.014532964676618576, "skip_count": 0.0, "step": 314, "text_loss": 0.6118118166923523 @@ -3000,13 +3000,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.224609375, + "grad_norm": 0.2216796875, "learning_rate": 0.00063, - "loss": 0.0762, + "loss": 0.0742, "macro_f1": 0.3333333432674408, "num_tokens": 510323.0, "repeat_count": 0.0, - "routers_loss": 0.01368923019617796, + "routers_loss": 0.013093139044940472, "skip_count": 0.0, "step": 316, "text_loss": 0.38126271963119507 @@ -3019,13 +3019,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.388671875, + "grad_norm": 0.400390625, "learning_rate": 0.000634, - "loss": 0.0908, + "loss": 0.0915, "macro_f1": 0.3333333432674408, "num_tokens": 514075.0, "repeat_count": 0.0, - "routers_loss": 0.009135022759437561, + "routers_loss": 0.008627045899629593, "skip_count": 0.0, "step": 318, "text_loss": 0.5983037948608398 @@ -3038,13 +3038,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.15234375, "learning_rate": 0.000638, - "loss": 0.0949, + "loss": 0.1008, "macro_f1": 0.3272727429866791, "num_tokens": 517418.0, "repeat_count": 0.0, - "routers_loss": 0.046641621738672256, + "routers_loss": 0.04561378434300423, "skip_count": 1.0, "step": 320, "text_loss": 0.767257034778595 @@ -3052,18 +3052,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 1.5118872908717347, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23046875, + "grad_norm": 0.259765625, "learning_rate": 0.000642, - "loss": 0.0925, - "macro_f1": 0.3333333432674408, + "loss": 0.0926, + "macro_f1": 0.3272727429866791, "num_tokens": 520443.0, "repeat_count": 0.0, - "routers_loss": 0.020637936890125275, + "routers_loss": 0.024372953921556473, "skip_count": 0.0, "step": 322, "text_loss": 0.6572105884552002 @@ -3076,13 +3076,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26953125, + "grad_norm": 0.30078125, "learning_rate": 0.000646, "loss": 0.0822, "macro_f1": 0.3272727429866791, "num_tokens": 523317.0, "repeat_count": 1.0, - "routers_loss": 0.08289298415184021, + "routers_loss": 0.08099937438964844, "skip_count": 0.0, "step": 324, "text_loss": 0.205499529838562 @@ -3090,18 +3090,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 1.530672145582624, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23828125, + "grad_norm": 0.2294921875, "learning_rate": 0.0006500000000000001, - "loss": 0.0823, - "macro_f1": 0.3272727429866791, + "loss": 0.0809, + "macro_f1": 0.32098767161369324, "num_tokens": 526355.0, "repeat_count": 0.0, - "routers_loss": 0.06960040330886841, + "routers_loss": 0.0657225176692009, "skip_count": 1.0, "step": 326, "text_loss": 0.2587239742279053 @@ -3114,13 +3114,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1162109375, + "grad_norm": 0.111328125, "learning_rate": 0.0006540000000000001, - "loss": 0.0799, + "loss": 0.0779, "macro_f1": 0.3333333432674408, "num_tokens": 529689.0, "repeat_count": 0.0, - "routers_loss": 0.02087482251226902, + "routers_loss": 0.01849208027124405, "skip_count": 0.0, "step": 328, "text_loss": 0.2172023057937622 @@ -3133,13 +3133,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.1845703125, "learning_rate": 0.0006580000000000001, - "loss": 0.0757, + "loss": 0.0758, "macro_f1": 0.3333333432674408, "num_tokens": 532603.0, "repeat_count": 0.0, - "routers_loss": 0.016592051833868027, + "routers_loss": 0.016184113919734955, "skip_count": 0.0, "step": 330, "text_loss": 0.5980568528175354 @@ -3152,32 +3152,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.22265625, + "grad_norm": 0.220703125, "learning_rate": 0.000662, - "loss": 0.0438, + "loss": 0.0439, "macro_f1": 0.3333333432674408, "num_tokens": 536056.0, "repeat_count": 0.0, - "routers_loss": 0.012950568459928036, + "routers_loss": 0.01303898449987173, "skip_count": 0.0, "step": 332, "text_loss": 0.5421966314315796 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, "epoch": 1.5682418550044028, - "f1_execute": 0.8799999952316284, + "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.310546875, + "f1_skip": 0.5, + "grad_norm": 0.296875, "learning_rate": 0.000666, - "loss": 0.0964, - "macro_f1": 0.29333335161209106, + "loss": 0.0963, + "macro_f1": 0.465986430644989, "num_tokens": 539231.0, "repeat_count": 3.0, - "routers_loss": 0.3373340964317322, + "routers_loss": 0.3075675964355469, "skip_count": 3.0, "step": 334, "text_loss": 0.19719554483890533 @@ -3190,13 +3190,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.171875, + "grad_norm": 0.173828125, "learning_rate": 0.00067, "loss": 0.0706, "macro_f1": 0.3333333432674408, "num_tokens": 542038.0, "repeat_count": 0.0, - "routers_loss": 0.008110735565423965, + "routers_loss": 0.009116224013268948, "skip_count": 0.0, "step": 336, "text_loss": 0.3407036066055298 @@ -3209,13 +3209,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.248046875, + "grad_norm": 0.2421875, "learning_rate": 0.000674, - "loss": 0.0771, + "loss": 0.0768, "macro_f1": 0.3333333432674408, "num_tokens": 545019.0, "repeat_count": 0.0, - "routers_loss": 0.01841609925031662, + "routers_loss": 0.021463042125105858, "skip_count": 0.0, "step": 338, "text_loss": 0.24486012756824493 @@ -3228,13 +3228,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1708984375, "learning_rate": 0.0006780000000000001, - "loss": 0.0894, + "loss": 0.0889, "macro_f1": 0.3333333432674408, "num_tokens": 548036.0, "repeat_count": 0.0, - "routers_loss": 0.01612614095211029, + "routers_loss": 0.01857556402683258, "skip_count": 0.0, "step": 340, "text_loss": 0.28140124678611755 @@ -3247,13 +3247,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.125, + "grad_norm": 0.130859375, "learning_rate": 0.0006820000000000001, - "loss": 0.0611, + "loss": 0.0617, "macro_f1": 0.3006536364555359, "num_tokens": 551419.0, "repeat_count": 2.0, - "routers_loss": 0.26202192902565, + "routers_loss": 0.27090007066726685, "skip_count": 3.0, "step": 342, "text_loss": 0.20690307021141052 @@ -3266,13 +3266,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.3046875, "learning_rate": 0.0006860000000000001, - "loss": 0.1013, + "loss": 0.1047, "macro_f1": 0.32098764181137085, "num_tokens": 554037.0, "repeat_count": 0.0, - "routers_loss": 0.09235779196023941, + "routers_loss": 0.09231195598840714, "skip_count": 2.0, "step": 344, "text_loss": 0.4479128420352936 @@ -3285,13 +3285,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.255859375, "learning_rate": 0.00069, - "loss": 0.0856, + "loss": 0.0883, "macro_f1": 0.3333333432674408, "num_tokens": 556672.0, "repeat_count": 0.0, - "routers_loss": 0.010735333897173405, + "routers_loss": 0.00935924518853426, "skip_count": 0.0, "step": 346, "text_loss": 0.6377320289611816 @@ -3304,13 +3304,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.2138671875, "learning_rate": 0.000694, - "loss": 0.0778, + "loss": 0.0781, "macro_f1": 0.32098764181137085, "num_tokens": 559756.0, "repeat_count": 0.0, - "routers_loss": 0.14742356538772583, + "routers_loss": 0.17641772329807281, "skip_count": 2.0, "step": 348, "text_loss": 0.6097636222839355 @@ -3323,13 +3323,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.30859375, + "grad_norm": 0.30078125, "learning_rate": 0.0006979999999999999, - "loss": 0.0614, + "loss": 0.0616, "macro_f1": 0.5492662787437439, "num_tokens": 563415.0, "repeat_count": 0.0, - "routers_loss": 0.06606879830360413, + "routers_loss": 0.06240406632423401, "skip_count": 2.0, "step": 350, "text_loss": 0.5291631817817688 @@ -3342,13 +3342,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.322265625, + "grad_norm": 0.296875, "learning_rate": 0.0007019999999999999, - "loss": 0.1033, + "loss": 0.1026, "macro_f1": 0.3333333432674408, "num_tokens": 566357.0, "repeat_count": 0.0, - "routers_loss": 0.012873432599008083, + "routers_loss": 0.012269247323274612, "skip_count": 0.0, "step": 352, "text_loss": 0.5170195698738098 @@ -3361,13 +3361,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.1435546875, "learning_rate": 0.0007059999999999999, - "loss": 0.0819, + "loss": 0.0815, "macro_f1": 0.32098764181137085, "num_tokens": 569449.0, "repeat_count": 0.0, - "routers_loss": 0.07853665202856064, + "routers_loss": 0.07515309751033783, "skip_count": 2.0, "step": 354, "text_loss": 0.34507250785827637 @@ -3380,13 +3380,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.251953125, + "grad_norm": 0.263671875, "learning_rate": 0.00071, - "loss": 0.0804, + "loss": 0.0791, "macro_f1": 0.3144654333591461, "num_tokens": 572761.0, "repeat_count": 1.0, - "routers_loss": 0.2216549813747406, + "routers_loss": 0.20768006145954132, "skip_count": 2.0, "step": 356, "text_loss": 0.3158532381057739 @@ -3399,13 +3399,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.185546875, + "grad_norm": 0.1884765625, "learning_rate": 0.000714, - "loss": 0.0675, + "loss": 0.0682, "macro_f1": 0.3333333432674408, "num_tokens": 575909.0, "repeat_count": 0.0, - "routers_loss": 0.02423691377043724, + "routers_loss": 0.025329967960715294, "skip_count": 0.0, "step": 358, "text_loss": 0.21455390751361847 @@ -3413,18 +3413,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 1.6903434106251836, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.21484375, "learning_rate": 0.000718, - "loss": 0.0781, - "macro_f1": 0.3272727429866791, + "loss": 0.0775, + "macro_f1": 0.32098767161369324, "num_tokens": 579186.0, "repeat_count": 1.0, - "routers_loss": 0.07496294379234314, + "routers_loss": 0.07676175981760025, "skip_count": 0.0, "step": 360, "text_loss": 0.61895352602005 @@ -3437,13 +3437,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2138671875, + "grad_norm": 0.197265625, "learning_rate": 0.000722, - "loss": 0.0778, + "loss": 0.0781, "macro_f1": 0.32098767161369324, "num_tokens": 582437.0, "repeat_count": 0.0, - "routers_loss": 0.08181872963905334, + "routers_loss": 0.08070661872625351, "skip_count": 1.0, "step": 362, "text_loss": 0.20557661354541779 @@ -3456,13 +3456,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.2216796875, "learning_rate": 0.000726, - "loss": 0.1112, + "loss": 0.11, "macro_f1": 0.3333333432674408, "num_tokens": 586096.0, "repeat_count": 0.0, - "routers_loss": 0.016959719359874725, + "routers_loss": 0.015891313552856445, "skip_count": 0.0, "step": 364, "text_loss": 0.597991943359375 @@ -3475,13 +3475,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.15625, "learning_rate": 0.00073, - "loss": 0.0577, + "loss": 0.0573, "macro_f1": 0.3076923191547394, "num_tokens": 589520.0, "repeat_count": 1.0, - "routers_loss": 0.13295969367027283, + "routers_loss": 0.12844261527061462, "skip_count": 3.0, "step": 366, "text_loss": 0.2944789230823517 @@ -3494,13 +3494,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1455078125, + "grad_norm": 0.150390625, "learning_rate": 0.000734, - "loss": 0.0986, + "loss": 0.1005, "macro_f1": 0.3333333432674408, "num_tokens": 592691.0, "repeat_count": 0.0, - "routers_loss": 0.02476893551647663, + "routers_loss": 0.02382199838757515, "skip_count": 0.0, "step": 368, "text_loss": 0.23989969491958618 @@ -3513,13 +3513,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1796875, "learning_rate": 0.000738, - "loss": 0.0682, + "loss": 0.0661, "macro_f1": 0.3333333432674408, "num_tokens": 596004.0, "repeat_count": 0.0, - "routers_loss": 0.019863395020365715, + "routers_loss": 0.018812084570527077, "skip_count": 0.0, "step": 370, "text_loss": 0.22111408412456512 @@ -3532,13 +3532,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.2412109375, "learning_rate": 0.000742, - "loss": 0.0663, + "loss": 0.0666, "macro_f1": 0.3272727429866791, "num_tokens": 599087.0, "repeat_count": 0.0, - "routers_loss": 0.07230417430400848, + "routers_loss": 0.08290331065654755, "skip_count": 1.0, "step": 372, "text_loss": 0.2567356526851654 @@ -3551,13 +3551,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.2412109375, "learning_rate": 0.000746, - "loss": 0.0986, + "loss": 0.0941, "macro_f1": 0.32098764181137085, "num_tokens": 602330.0, "repeat_count": 1.0, - "routers_loss": 0.11727793514728546, + "routers_loss": 0.11482042074203491, "skip_count": 1.0, "step": 374, "text_loss": 0.7217292785644531 @@ -3570,13 +3570,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.224609375, + "grad_norm": 0.2265625, "learning_rate": 0.00075, - "loss": 0.0724, + "loss": 0.0728, "macro_f1": 0.3272727429866791, "num_tokens": 605503.0, "repeat_count": 1.0, - "routers_loss": 0.13495951890945435, + "routers_loss": 0.11849870532751083, "skip_count": 0.0, "step": 376, "text_loss": 0.5122153759002686 @@ -3589,13 +3589,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.23046875, + "grad_norm": 0.2333984375, "learning_rate": 0.000754, - "loss": 0.0823, + "loss": 0.0835, "macro_f1": 0.32098767161369324, "num_tokens": 608505.0, "repeat_count": 0.0, - "routers_loss": 0.07612533867359161, + "routers_loss": 0.07090992480516434, "skip_count": 1.0, "step": 378, "text_loss": 0.2204965502023697 @@ -3608,13 +3608,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.1826171875, "learning_rate": 0.000758, - "loss": 0.0803, + "loss": 0.0794, "macro_f1": 0.3272727429866791, "num_tokens": 611193.0, "repeat_count": 0.0, - "routers_loss": 0.0484120175242424, + "routers_loss": 0.03812089189887047, "skip_count": 1.0, "step": 380, "text_loss": 0.44909021258354187 @@ -3627,13 +3627,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1689453125, "learning_rate": 0.000762, - "loss": 0.0866, + "loss": 0.0882, "macro_f1": 0.3272727429866791, "num_tokens": 614231.0, "repeat_count": 1.0, - "routers_loss": 0.10939671844244003, + "routers_loss": 0.10270529240369797, "skip_count": 0.0, "step": 382, "text_loss": 0.13624964654445648 @@ -3646,13 +3646,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.326171875, + "grad_norm": 0.330078125, "learning_rate": 0.0007660000000000001, - "loss": 0.1083, + "loss": 0.1107, "macro_f1": 0.32098764181137085, "num_tokens": 617090.0, "repeat_count": 1.0, - "routers_loss": 0.11382336914539337, + "routers_loss": 0.11624004691839218, "skip_count": 1.0, "step": 384, "text_loss": 0.7314052581787109 @@ -3667,11 +3667,11 @@ "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.0007700000000000001, - "loss": 0.0616, + "loss": 0.0628, "macro_f1": 0.32098764181137085, "num_tokens": 620596.0, "repeat_count": 0.0, - "routers_loss": 0.07494530081748962, + "routers_loss": 0.07114322483539581, "skip_count": 2.0, "step": 386, "text_loss": 0.503322958946228 @@ -3684,13 +3684,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.298828125, + "grad_norm": 0.306640625, "learning_rate": 0.0007740000000000001, - "loss": 0.0816, + "loss": 0.0829, "macro_f1": 0.32098764181137085, "num_tokens": 624108.0, "repeat_count": 0.0, - "routers_loss": 0.05718417093157768, + "routers_loss": 0.06061873584985733, "skip_count": 2.0, "step": 388, "text_loss": 0.11481904983520508 @@ -3703,13 +3703,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1982421875, + "grad_norm": 0.2099609375, "learning_rate": 0.000778, - "loss": 0.0783, + "loss": 0.0791, "macro_f1": 0.3006536364555359, "num_tokens": 626895.0, "repeat_count": 1.0, - "routers_loss": 0.2848989963531494, + "routers_loss": 0.2921771705150604, "skip_count": 4.0, "step": 390, "text_loss": 0.3069624602794647 @@ -3722,13 +3722,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.30078125, + "grad_norm": 0.30859375, "learning_rate": 0.000782, - "loss": 0.0608, + "loss": 0.0605, "macro_f1": 0.3076923191547394, "num_tokens": 630204.0, "repeat_count": 0.0, - "routers_loss": 0.2050076276063919, + "routers_loss": 0.202707901597023, "skip_count": 4.0, "step": 392, "text_loss": 0.6022785305976868 @@ -3741,13 +3741,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28125, + "grad_norm": 0.29296875, "learning_rate": 0.000786, - "loss": 0.0863, + "loss": 0.0877, "macro_f1": 0.3333333432674408, "num_tokens": 634373.0, "repeat_count": 0.0, - "routers_loss": 0.020946886390447617, + "routers_loss": 0.0221510399132967, "skip_count": 0.0, "step": 394, "text_loss": 0.26787394285202026 @@ -3760,13 +3760,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.376953125, + "grad_norm": 0.37890625, "learning_rate": 0.00079, - "loss": 0.0798, + "loss": 0.0805, "macro_f1": 0.32098764181137085, "num_tokens": 637442.0, "repeat_count": 2.0, - "routers_loss": 0.1270289123058319, + "routers_loss": 0.12636390328407288, "skip_count": 0.0, "step": 396, "text_loss": 0.2799781560897827 @@ -3779,13 +3779,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.2080078125, "learning_rate": 0.0007940000000000001, - "loss": 0.0701, + "loss": 0.0724, "macro_f1": 0.32098764181137085, "num_tokens": 641231.0, "repeat_count": 0.0, - "routers_loss": 0.08012636005878448, + "routers_loss": 0.07933453470468521, "skip_count": 2.0, "step": 398, "text_loss": 0.2507784366607666 @@ -3798,13 +3798,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.2138671875, "learning_rate": 0.0007980000000000001, - "loss": 0.0901, + "loss": 0.0909, "macro_f1": 0.3272727429866791, "num_tokens": 644560.0, "repeat_count": 1.0, - "routers_loss": 0.09315784275531769, + "routers_loss": 0.10324911028146744, "skip_count": 0.0, "step": 400, "text_loss": 0.7756280303001404 @@ -3817,13 +3817,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2294921875, + "grad_norm": 0.2275390625, "learning_rate": 0.0008020000000000001, - "loss": 0.078, + "loss": 0.0783, "macro_f1": 0.3144654333591461, "num_tokens": 647393.0, "repeat_count": 1.0, - "routers_loss": 0.18492189049720764, + "routers_loss": 0.18546262383460999, "skip_count": 2.0, "step": 402, "text_loss": 0.5013328194618225 @@ -3836,13 +3836,13 @@ "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.283203125, "learning_rate": 0.0008060000000000001, - "loss": 0.0801, + "loss": 0.0787, "macro_f1": 0.2857142984867096, "num_tokens": 650355.0, "repeat_count": 3.0, - "routers_loss": 0.32641324400901794, + "routers_loss": 0.3280293643474579, "skip_count": 4.0, "step": 404, "text_loss": 0.2842077314853668 @@ -3855,13 +3855,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2080078125, + "grad_norm": 0.2138671875, "learning_rate": 0.0008100000000000001, - "loss": 0.0905, + "loss": 0.0901, "macro_f1": 0.3333333432674408, "num_tokens": 654280.0, "repeat_count": 0.0, - "routers_loss": 0.02722037397325039, + "routers_loss": 0.02623247355222702, "skip_count": 0.0, "step": 406, "text_loss": 0.46742817759513855 @@ -3874,13 +3874,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.216796875, "learning_rate": 0.0008139999999999999, - "loss": 0.0958, + "loss": 0.0945, "macro_f1": 0.3333333432674408, "num_tokens": 657568.0, "repeat_count": 0.0, - "routers_loss": 0.010129833593964577, + "routers_loss": 0.009744114242494106, "skip_count": 0.0, "step": 408, "text_loss": 0.7168047428131104 @@ -3893,13 +3893,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2373046875, + "grad_norm": 0.2158203125, "learning_rate": 0.0008179999999999999, - "loss": 0.1084, + "loss": 0.1065, "macro_f1": 0.32098764181137085, "num_tokens": 660593.0, "repeat_count": 0.0, - "routers_loss": 0.07298308610916138, + "routers_loss": 0.07591600716114044, "skip_count": 2.0, "step": 410, "text_loss": 0.449823260307312 @@ -3912,13 +3912,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.15625, + "grad_norm": 0.1396484375, "learning_rate": 0.0008219999999999999, - "loss": 0.0802, + "loss": 0.0795, "macro_f1": 0.3333333432674408, "num_tokens": 663916.0, "repeat_count": 0.0, - "routers_loss": 0.024257874116301537, + "routers_loss": 0.02076602540910244, "skip_count": 0.0, "step": 412, "text_loss": 0.4764713943004608 @@ -3931,13 +3931,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1904296875, + "grad_norm": 0.1650390625, "learning_rate": 0.000826, - "loss": 0.0842, + "loss": 0.0836, "macro_f1": 0.3272727429866791, "num_tokens": 667502.0, "repeat_count": 0.0, - "routers_loss": 0.048864223062992096, + "routers_loss": 0.049170155078172684, "skip_count": 1.0, "step": 414, "text_loss": 0.30333325266838074 @@ -3950,13 +3950,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1513671875, "learning_rate": 0.00083, - "loss": 0.1026, + "loss": 0.1021, "macro_f1": 0.3272727429866791, "num_tokens": 670510.0, "repeat_count": 1.0, - "routers_loss": 0.1592330038547516, + "routers_loss": 0.15554003417491913, "skip_count": 0.0, "step": 416, "text_loss": 0.3691870868206024 @@ -3969,13 +3969,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25390625, + "grad_norm": 0.263671875, "learning_rate": 0.000834, - "loss": 0.0963, + "loss": 0.1013, "macro_f1": 0.3333333432674408, "num_tokens": 674761.0, "repeat_count": 0.0, - "routers_loss": 0.02291976846754551, + "routers_loss": 0.024516675621271133, "skip_count": 0.0, "step": 418, "text_loss": 0.32850381731987 @@ -3988,13 +3988,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.10888671875, "learning_rate": 0.000838, - "loss": 0.0634, + "loss": 0.0649, "macro_f1": 0.3333333432674408, "num_tokens": 678055.0, "repeat_count": 0.0, - "routers_loss": 0.010272650048136711, + "routers_loss": 0.011026890948414803, "skip_count": 0.0, "step": 420, "text_loss": 0.6637290716171265 @@ -4007,13 +4007,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28125, + "grad_norm": 0.263671875, "learning_rate": 0.000842, - "loss": 0.0786, + "loss": 0.0771, "macro_f1": 0.3272727429866791, "num_tokens": 680979.0, "repeat_count": 0.0, - "routers_loss": 0.0692613497376442, + "routers_loss": 0.07451887428760529, "skip_count": 1.0, "step": 422, "text_loss": 0.27131685614585876 @@ -4026,13 +4026,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12890625, + "grad_norm": 0.1318359375, "learning_rate": 0.000846, - "loss": 0.0706, + "loss": 0.0714, "macro_f1": 0.32098764181137085, "num_tokens": 684144.0, "repeat_count": 1.0, - "routers_loss": 0.12713804841041565, + "routers_loss": 0.11341800540685654, "skip_count": 1.0, "step": 424, "text_loss": 0.652126669883728 @@ -4045,13 +4045,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.2158203125, "learning_rate": 0.00085, - "loss": 0.0758, + "loss": 0.0754, "macro_f1": 0.3272727429866791, "num_tokens": 687004.0, "repeat_count": 1.0, - "routers_loss": 0.08670130372047424, + "routers_loss": 0.08985847979784012, "skip_count": 0.0, "step": 426, "text_loss": 0.2589428424835205 @@ -4064,13 +4064,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.240234375, + "grad_norm": 0.23828125, "learning_rate": 0.000854, - "loss": 0.0857, + "loss": 0.0866, "macro_f1": 0.3333333432674408, "num_tokens": 689702.0, "repeat_count": 0.0, - "routers_loss": 0.01053862925618887, + "routers_loss": 0.011355436407029629, "skip_count": 0.0, "step": 428, "text_loss": 0.8909716010093689 @@ -4083,13 +4083,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1435546875, "learning_rate": 0.000858, - "loss": 0.0615, + "loss": 0.0623, "macro_f1": 0.3333333432674408, "num_tokens": 692698.0, "repeat_count": 0.0, - "routers_loss": 0.012946994043886662, + "routers_loss": 0.013788948766887188, "skip_count": 0.0, "step": 430, "text_loss": 0.19141142070293427 @@ -4102,13 +4102,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1552734375, "learning_rate": 0.000862, - "loss": 0.0498, + "loss": 0.0499, "macro_f1": 0.32098764181137085, "num_tokens": 696007.0, "repeat_count": 0.0, - "routers_loss": 0.08222822099924088, + "routers_loss": 0.07998392730951309, "skip_count": 2.0, "step": 432, "text_loss": 0.1611809879541397 @@ -4121,13 +4121,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.173828125, "learning_rate": 0.000866, - "loss": 0.0532, + "loss": 0.0541, "macro_f1": 0.32098764181137085, "num_tokens": 700271.0, "repeat_count": 0.0, - "routers_loss": 0.07086442410945892, + "routers_loss": 0.06988382339477539, "skip_count": 2.0, "step": 434, "text_loss": 0.37254223227500916 @@ -4140,13 +4140,13 @@ "f1_execute": 0.8333333730697632, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.1943359375, "learning_rate": 0.00087, - "loss": 0.0825, + "loss": 0.0834, "macro_f1": 0.2777777910232544, "num_tokens": 703519.0, "repeat_count": 3.0, - "routers_loss": 0.29007306694984436, + "routers_loss": 0.28240787982940674, "skip_count": 5.0, "step": 436, "text_loss": 0.29636648297309875 @@ -4159,13 +4159,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.396484375, + "grad_norm": 0.423828125, "learning_rate": 0.000874, - "loss": 0.0658, + "loss": 0.0657, "macro_f1": 0.3333333432674408, "num_tokens": 706826.0, "repeat_count": 0.0, - "routers_loss": 0.014652491547167301, + "routers_loss": 0.013924967497587204, "skip_count": 0.0, "step": 438, "text_loss": 0.20867908000946045 @@ -4178,13 +4178,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2294921875, + "grad_norm": 0.2353515625, "learning_rate": 0.000878, - "loss": 0.0685, + "loss": 0.0657, "macro_f1": 0.3333333432674408, "num_tokens": 710530.0, "repeat_count": 0.0, - "routers_loss": 0.013720969669520855, + "routers_loss": 0.01170142088085413, "skip_count": 0.0, "step": 440, "text_loss": 0.7273373007774353 @@ -4197,13 +4197,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.173828125, + "grad_norm": 0.171875, "learning_rate": 0.000882, - "loss": 0.0771, + "loss": 0.076, "macro_f1": 0.3333333432674408, "num_tokens": 713503.0, "repeat_count": 0.0, - "routers_loss": 0.011687638238072395, + "routers_loss": 0.011930872686207294, "skip_count": 0.0, "step": 442, "text_loss": 0.39314430952072144 @@ -4216,13 +4216,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.2490234375, "learning_rate": 0.0008860000000000001, - "loss": 0.0604, + "loss": 0.0592, "macro_f1": 0.3333333432674408, "num_tokens": 716582.0, "repeat_count": 0.0, - "routers_loss": 0.007869532331824303, + "routers_loss": 0.008630385622382164, "skip_count": 0.0, "step": 444, "text_loss": 0.5925271511077881 @@ -4230,18 +4230,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 2.0939242735544465, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.23046875, "learning_rate": 0.0008900000000000001, - "loss": 0.0797, - "macro_f1": 0.3076923191547394, + "loss": 0.0811, + "macro_f1": 0.3006536066532135, "num_tokens": 719941.0, "repeat_count": 3.0, - "routers_loss": 0.3034668564796448, + "routers_loss": 0.3015584945678711, "skip_count": 1.0, "step": 446, "text_loss": 0.5059905052185059 @@ -4254,13 +4254,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2314453125, + "grad_norm": 0.203125, "learning_rate": 0.000894, - "loss": 0.0823, + "loss": 0.0822, "macro_f1": 0.31446540355682373, "num_tokens": 723113.0, "repeat_count": 1.0, - "routers_loss": 0.11066079139709473, + "routers_loss": 0.10897493362426758, "skip_count": 1.0, "step": 448, "text_loss": 0.19616436958312988 @@ -4273,13 +4273,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3046875, + "grad_norm": 0.33984375, "learning_rate": 0.000898, - "loss": 0.0773, + "loss": 0.0782, "macro_f1": 0.32098764181137085, "num_tokens": 726193.0, "repeat_count": 0.0, - "routers_loss": 0.0755370482802391, + "routers_loss": 0.07236456125974655, "skip_count": 2.0, "step": 450, "text_loss": 0.1773054152727127 @@ -4292,13 +4292,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.28125, + "grad_norm": 0.3203125, "learning_rate": 0.000902, - "loss": 0.0596, + "loss": 0.058, "macro_f1": 0.3272727429866791, "num_tokens": 729275.0, "repeat_count": 1.0, - "routers_loss": 0.08470689505338669, + "routers_loss": 0.08184371143579483, "skip_count": 0.0, "step": 452, "text_loss": 0.4927310049533844 @@ -4311,13 +4311,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19921875, + "grad_norm": 0.1953125, "learning_rate": 0.000906, - "loss": 0.0608, + "loss": 0.0607, "macro_f1": 0.3333333432674408, "num_tokens": 731948.0, "repeat_count": 0.0, - "routers_loss": 0.0130238626152277, + "routers_loss": 0.014033539220690727, "skip_count": 0.0, "step": 454, "text_loss": 0.4745742678642273 @@ -4330,13 +4330,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.154296875, "learning_rate": 0.00091, - "loss": 0.0652, + "loss": 0.0651, "macro_f1": 0.3333333432674408, "num_tokens": 735351.0, "repeat_count": 0.0, - "routers_loss": 0.007108641788363457, + "routers_loss": 0.0071774693205952644, "skip_count": 0.0, "step": 456, "text_loss": 0.18523462116718292 @@ -4351,11 +4351,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.400390625, "learning_rate": 0.0009140000000000001, - "loss": 0.0746, + "loss": 0.0738, "macro_f1": 0.5492662787437439, "num_tokens": 738587.0, "repeat_count": 0.0, - "routers_loss": 0.06834109872579575, + "routers_loss": 0.07781517505645752, "skip_count": 2.0, "step": 458, "text_loss": 0.3459635376930237 @@ -4368,13 +4368,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.28125, "learning_rate": 0.0009180000000000001, - "loss": 0.0733, + "loss": 0.0723, "macro_f1": 0.3076923191547394, "num_tokens": 741779.0, "repeat_count": 0.0, - "routers_loss": 0.10230778902769089, + "routers_loss": 0.09529037028551102, "skip_count": 2.0, "step": 460, "text_loss": 0.20197433233261108 @@ -4387,13 +4387,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.1865234375, "learning_rate": 0.0009220000000000001, - "loss": 0.0528, + "loss": 0.0519, "macro_f1": 0.3333333432674408, "num_tokens": 745355.0, "repeat_count": 0.0, - "routers_loss": 0.009987542405724525, + "routers_loss": 0.009765669703483582, "skip_count": 0.0, "step": 462, "text_loss": 0.7031404376029968 @@ -4406,13 +4406,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.125, + "grad_norm": 0.1298828125, "learning_rate": 0.0009260000000000001, - "loss": 0.0536, + "loss": 0.0527, "macro_f1": 0.3272727429866791, "num_tokens": 748628.0, "repeat_count": 0.0, - "routers_loss": 0.03448869287967682, + "routers_loss": 0.03344850242137909, "skip_count": 1.0, "step": 464, "text_loss": 0.21274663507938385 @@ -4425,13 +4425,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.173828125, "learning_rate": 0.00093, - "loss": 0.053, + "loss": 0.0534, "macro_f1": 0.3076923191547394, "num_tokens": 751472.0, "repeat_count": 2.0, - "routers_loss": 0.13631699979305267, + "routers_loss": 0.1354292333126068, "skip_count": 2.0, "step": 466, "text_loss": 0.5350717306137085 @@ -4444,13 +4444,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.142578125, "learning_rate": 0.000934, - "loss": 0.06, + "loss": 0.0598, "macro_f1": 0.3272727429866791, "num_tokens": 754479.0, "repeat_count": 0.0, - "routers_loss": 0.053951870650053024, + "routers_loss": 0.056420840322971344, "skip_count": 1.0, "step": 468, "text_loss": 0.28153330087661743 @@ -4463,13 +4463,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.228515625, + "grad_norm": 0.234375, "learning_rate": 0.0009379999999999999, - "loss": 0.059, + "loss": 0.0597, "macro_f1": 0.31446540355682373, "num_tokens": 757872.0, "repeat_count": 1.0, - "routers_loss": 0.14479905366897583, + "routers_loss": 0.1622387170791626, "skip_count": 1.0, "step": 470, "text_loss": 0.22956843674182892 @@ -4482,13 +4482,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.44140625, + "grad_norm": 0.5, "learning_rate": 0.000942, - "loss": 0.0913, + "loss": 0.0953, "macro_f1": 0.32098764181137085, "num_tokens": 760468.0, "repeat_count": 0.0, - "routers_loss": 0.056221429258584976, + "routers_loss": 0.05146972835063934, "skip_count": 2.0, "step": 472, "text_loss": 0.4513966739177704 @@ -4501,13 +4501,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1904296875, + "grad_norm": 0.212890625, "learning_rate": 0.000946, - "loss": 0.0591, + "loss": 0.0592, "macro_f1": 0.3272727429866791, "num_tokens": 763519.0, "repeat_count": 1.0, - "routers_loss": 0.09729792177677155, + "routers_loss": 0.09022669494152069, "skip_count": 0.0, "step": 474, "text_loss": 0.25758957862854004 @@ -4520,13 +4520,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12158203125, + "grad_norm": 0.1259765625, "learning_rate": 0.00095, - "loss": 0.0496, + "loss": 0.0498, "macro_f1": 0.3272727429866791, "num_tokens": 767391.0, "repeat_count": 0.0, - "routers_loss": 0.029447713866829872, + "routers_loss": 0.03044828027486801, "skip_count": 1.0, "step": 476, "text_loss": 0.21366681158542633 @@ -4539,13 +4539,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.271484375, + "grad_norm": 0.291015625, "learning_rate": 0.000954, - "loss": 0.0801, + "loss": 0.0802, "macro_f1": 0.3272727429866791, "num_tokens": 770338.0, "repeat_count": 0.0, - "routers_loss": 0.09337342530488968, + "routers_loss": 0.10397060960531235, "skip_count": 1.0, "step": 478, "text_loss": 1.0396177768707275 @@ -4560,11 +4560,11 @@ "f1_skip": 0.0, "grad_norm": 0.267578125, "learning_rate": 0.000958, - "loss": 0.1102, + "loss": 0.1099, "macro_f1": 0.285714328289032, "num_tokens": 773699.0, "repeat_count": 2.0, - "routers_loss": 0.23193210363388062, + "routers_loss": 0.22604143619537354, "skip_count": 4.0, "step": 480, "text_loss": 0.2570283114910126 @@ -4572,18 +4572,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 2.2629879659524508, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.146484375, "learning_rate": 0.000962, - "loss": 0.0669, - "macro_f1": 0.3272727429866791, + "loss": 0.0667, + "macro_f1": 0.32098767161369324, "num_tokens": 777473.0, "repeat_count": 0.0, - "routers_loss": 0.046257760375738144, + "routers_loss": 0.048258859664201736, "skip_count": 1.0, "step": 482, "text_loss": 0.2540103495121002 @@ -4596,13 +4596,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1708984375, + "grad_norm": 0.197265625, "learning_rate": 0.000966, - "loss": 0.0552, + "loss": 0.0592, "macro_f1": 0.3333333432674408, "num_tokens": 780833.0, "repeat_count": 0.0, - "routers_loss": 0.01683143898844719, + "routers_loss": 0.023018671199679375, "skip_count": 0.0, "step": 484, "text_loss": 0.38524550199508667 @@ -4615,13 +4615,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.326171875, + "grad_norm": 0.314453125, "learning_rate": 0.0009699999999999999, - "loss": 0.071, + "loss": 0.0709, "macro_f1": 0.3272727429866791, "num_tokens": 783656.0, "repeat_count": 0.0, - "routers_loss": 0.04129387438297272, + "routers_loss": 0.044845327734947205, "skip_count": 1.0, "step": 486, "text_loss": 0.5859048366546631 @@ -4634,13 +4634,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.263671875, "learning_rate": 0.000974, - "loss": 0.0605, + "loss": 0.0615, "macro_f1": 0.3333333432674408, "num_tokens": 787173.0, "repeat_count": 0.0, - "routers_loss": 0.01262948103249073, + "routers_loss": 0.010898692533373833, "skip_count": 0.0, "step": 488, "text_loss": 0.3456067442893982 @@ -4653,13 +4653,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2578125, + "grad_norm": 0.263671875, "learning_rate": 0.000978, - "loss": 0.081, + "loss": 0.0796, "macro_f1": 0.32098764181137085, "num_tokens": 790395.0, "repeat_count": 0.0, - "routers_loss": 0.07404553890228271, + "routers_loss": 0.06497956812381744, "skip_count": 2.0, "step": 490, "text_loss": 0.3751123249530792 @@ -4672,13 +4672,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.2158203125, "learning_rate": 0.000982, - "loss": 0.0751, + "loss": 0.0772, "macro_f1": 0.3272727429866791, "num_tokens": 793137.0, "repeat_count": 0.0, - "routers_loss": 0.06795930862426758, + "routers_loss": 0.07763728499412537, "skip_count": 1.0, "step": 492, "text_loss": 0.43296709656715393 @@ -4691,13 +4691,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.1416015625, "learning_rate": 0.0009860000000000001, - "loss": 0.0804, + "loss": 0.0819, "macro_f1": 0.3333333432674408, "num_tokens": 796497.0, "repeat_count": 0.0, - "routers_loss": 0.02233024686574936, + "routers_loss": 0.02127906307578087, "skip_count": 0.0, "step": 494, "text_loss": 0.4841311275959015 @@ -4710,13 +4710,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1953125, + "grad_norm": 0.2138671875, "learning_rate": 0.00099, - "loss": 0.0731, + "loss": 0.073, "macro_f1": 0.3272727429866791, "num_tokens": 799361.0, "repeat_count": 1.0, - "routers_loss": 0.07979031652212143, + "routers_loss": 0.09518691152334213, "skip_count": 0.0, "step": 496, "text_loss": 0.5094487071037292 @@ -4729,13 +4729,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1298828125, + "grad_norm": 0.130859375, "learning_rate": 0.000994, - "loss": 0.0795, + "loss": 0.0789, "macro_f1": 0.5492662787437439, "num_tokens": 802629.0, "repeat_count": 0.0, - "routers_loss": 0.045646365731954575, + "routers_loss": 0.0563947930932045, "skip_count": 2.0, "step": 498, "text_loss": 0.42783617973327637 @@ -4748,13 +4748,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1953125, + "grad_norm": 0.1865234375, "learning_rate": 0.000998, "loss": 0.0476, "macro_f1": 0.3272727429866791, "num_tokens": 805881.0, "repeat_count": 1.0, - "routers_loss": 0.09717849642038345, + "routers_loss": 0.10570426285266876, "skip_count": 0.0, "step": 500, "text_loss": 0.28395503759384155 @@ -4767,13 +4767,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.30078125, + "grad_norm": 0.2275390625, "learning_rate": 0.0009999999760498814, - "loss": 0.0894, + "loss": 0.0849, "macro_f1": 0.5492662787437439, "num_tokens": 809283.0, "repeat_count": 0.0, - "routers_loss": 0.03948225453495979, + "routers_loss": 0.031202208250761032, "skip_count": 2.0, "step": 502, "text_loss": 0.32970911264419556 @@ -4786,13 +4786,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.15625, + "grad_norm": 0.1455078125, "learning_rate": 0.0009999997844489475, - "loss": 0.0557, + "loss": 0.0574, "macro_f1": 0.3272727429866791, "num_tokens": 812440.0, "repeat_count": 0.0, - "routers_loss": 0.0742638111114502, + "routers_loss": 0.07647835463285446, "skip_count": 1.0, "step": 504, "text_loss": 0.4901447296142578 @@ -4805,13 +4805,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.25, "learning_rate": 0.000999999401247153, - "loss": 0.0682, + "loss": 0.0668, "macro_f1": 0.32098764181137085, "num_tokens": 815716.0, "repeat_count": 0.0, - "routers_loss": 0.08293049037456512, + "routers_loss": 0.08515176922082901, "skip_count": 2.0, "step": 506, "text_loss": 0.6157599687576294 @@ -4824,13 +4824,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.26171875, + "grad_norm": 0.25390625, "learning_rate": 0.0009999988264446445, - "loss": 0.0697, + "loss": 0.0686, "macro_f1": 0.3333333432674408, "num_tokens": 819086.0, "repeat_count": 0.0, - "routers_loss": 0.010080376639962196, + "routers_loss": 0.00946938619017601, "skip_count": 0.0, "step": 508, "text_loss": 0.5053519010543823 @@ -4843,13 +4843,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1640625, "learning_rate": 0.0009999980600416424, - "loss": 0.0611, + "loss": 0.0574, "macro_f1": 0.3333333432674408, "num_tokens": 822268.0, "repeat_count": 0.0, - "routers_loss": 0.009179878048598766, + "routers_loss": 0.01058756373822689, "skip_count": 0.0, "step": 510, "text_loss": 0.5570021867752075 @@ -4862,13 +4862,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11083984375, + "grad_norm": 0.1240234375, "learning_rate": 0.000999997102038441, - "loss": 0.0689, + "loss": 0.0678, "macro_f1": 0.3333333432674408, "num_tokens": 825728.0, "repeat_count": 0.0, - "routers_loss": 0.006718529388308525, + "routers_loss": 0.008705209009349346, "skip_count": 0.0, "step": 512, "text_loss": 0.6519040465354919 @@ -4881,13 +4881,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.220703125, "learning_rate": 0.0009999959524354064, - "loss": 0.0826, + "loss": 0.083, "macro_f1": 0.3272727429866791, "num_tokens": 829459.0, "repeat_count": 0.0, - "routers_loss": 0.049344487488269806, + "routers_loss": 0.04024193435907364, "skip_count": 1.0, "step": 514, "text_loss": 0.5290043950080872 @@ -4900,13 +4900,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.259765625, + "grad_norm": 0.25390625, "learning_rate": 0.00099999461123298, - "loss": 0.0739, + "loss": 0.0727, "macro_f1": 0.3333333432674408, "num_tokens": 832291.0, "repeat_count": 0.0, - "routers_loss": 0.013402626849710941, + "routers_loss": 0.015742862597107887, "skip_count": 0.0, "step": 516, "text_loss": 0.7910057902336121 @@ -4919,13 +4919,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.232421875, + "grad_norm": 0.2275390625, "learning_rate": 0.000999993078431675, - "loss": 0.0761, + "loss": 0.0759, "macro_f1": 0.3076923191547394, "num_tokens": 835399.0, "repeat_count": 1.0, - "routers_loss": 0.16964484751224518, + "routers_loss": 0.16753782331943512, "skip_count": 3.0, "step": 518, "text_loss": 0.45196083188056946 @@ -4938,13 +4938,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2236328125, + "grad_norm": 0.236328125, "learning_rate": 0.0009999913540320792, - "loss": 0.095, + "loss": 0.0968, "macro_f1": 0.31446540355682373, "num_tokens": 838993.0, "repeat_count": 0.0, - "routers_loss": 0.08609295636415482, + "routers_loss": 0.09357143193483353, "skip_count": 2.0, "step": 520, "text_loss": 0.5499435663223267 @@ -4957,13 +4957,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.2392578125, + "grad_norm": 0.2451171875, "learning_rate": 0.0009999894380348536, - "loss": 0.0816, + "loss": 0.0821, "macro_f1": 0.5492662787437439, "num_tokens": 842652.0, "repeat_count": 0.0, - "routers_loss": 0.05354784056544304, + "routers_loss": 0.056803856045007706, "skip_count": 2.0, "step": 522, "text_loss": 0.197520449757576 @@ -4976,13 +4976,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.2236328125, + "grad_norm": 0.2333984375, "learning_rate": 0.000999987330440732, - "loss": 0.0715, + "loss": 0.0725, "macro_f1": 0.4871794879436493, "num_tokens": 847061.0, "repeat_count": 0.0, - "routers_loss": 0.09146631509065628, + "routers_loss": 0.08962195366621017, "skip_count": 3.0, "step": 524, "text_loss": 0.27509039640426636 @@ -4995,13 +4995,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1875, + "grad_norm": 0.189453125, "learning_rate": 0.000999985031250522, - "loss": 0.0574, + "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 850780.0, "repeat_count": 0.0, - "routers_loss": 0.02344255894422531, + "routers_loss": 0.022930558770895004, "skip_count": 0.0, "step": 526, "text_loss": 0.13291706144809723 @@ -5014,13 +5014,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1982421875, + "grad_norm": 0.197265625, "learning_rate": 0.0009999825404651053, - "loss": 0.0621, + "loss": 0.0614, "macro_f1": 0.3333333432674408, "num_tokens": 853886.0, "repeat_count": 0.0, - "routers_loss": 0.018271517008543015, + "routers_loss": 0.017097990959882736, "skip_count": 0.0, "step": 528, "text_loss": 0.21706295013427734 @@ -5033,13 +5033,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2060546875, + "grad_norm": 0.212890625, "learning_rate": 0.0009999798580854356, - "loss": 0.0717, + "loss": 0.0724, "macro_f1": 0.3333333432674408, "num_tokens": 857364.0, "repeat_count": 0.0, - "routers_loss": 0.026990914717316628, + "routers_loss": 0.02831801027059555, "skip_count": 0.0, "step": 530, "text_loss": 0.9035662412643433 @@ -5052,13 +5052,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16015625, + "grad_norm": 0.1591796875, "learning_rate": 0.000999976984112541, - "loss": 0.0681, + "loss": 0.0674, "macro_f1": 0.3333333432674408, "num_tokens": 860661.0, "repeat_count": 0.0, - "routers_loss": 0.019737249240279198, + "routers_loss": 0.019671892747282982, "skip_count": 0.0, "step": 532, "text_loss": 0.8354863524436951 @@ -5071,13 +5071,13 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.3046875, + "grad_norm": 0.2890625, "learning_rate": 0.0009999739185475231, - "loss": 0.0978, + "loss": 0.0963, "macro_f1": 0.47333335876464844, "num_tokens": 864124.0, "repeat_count": 2.0, - "routers_loss": 0.212640181183815, + "routers_loss": 0.21383361518383026, "skip_count": 3.0, "step": 534, "text_loss": 0.23422949016094208 @@ -5090,13 +5090,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.2490234375, "learning_rate": 0.0009999706613915565, - "loss": 0.0602, + "loss": 0.0598, "macro_f1": 0.32098767161369324, "num_tokens": 866976.0, "repeat_count": 0.0, - "routers_loss": 0.07302755117416382, + "routers_loss": 0.07158871740102768, "skip_count": 1.0, "step": 536, "text_loss": 0.11800774186849594 @@ -5109,13 +5109,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.296875, + "grad_norm": 0.26953125, "learning_rate": 0.0009999672126458894, - "loss": 0.0825, + "loss": 0.0822, "macro_f1": 0.3272727429866791, "num_tokens": 870549.0, "repeat_count": 0.0, - "routers_loss": 0.08667246252298355, + "routers_loss": 0.08185924589633942, "skip_count": 1.0, "step": 538, "text_loss": 0.19232480227947235 @@ -5128,13 +5128,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1318359375, + "grad_norm": 0.1396484375, "learning_rate": 0.000999963572311843, - "loss": 0.0597, + "loss": 0.0604, "macro_f1": 0.3333333432674408, "num_tokens": 873733.0, "repeat_count": 0.0, - "routers_loss": 0.015047167427837849, + "routers_loss": 0.01633382774889469, "skip_count": 0.0, "step": 540, "text_loss": 0.3725031912326813 @@ -5147,13 +5147,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.15234375, "learning_rate": 0.0009999597403908128, - "loss": 0.076, + "loss": 0.0761, "macro_f1": 0.3272727429866791, "num_tokens": 877099.0, "repeat_count": 0.0, - "routers_loss": 0.07481446117162704, + "routers_loss": 0.0782657191157341, "skip_count": 1.0, "step": 542, "text_loss": 0.17589199542999268 @@ -5166,13 +5166,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1943359375, + "grad_norm": 0.2177734375, "learning_rate": 0.0009999557168842669, - "loss": 0.0724, + "loss": 0.0716, "macro_f1": 0.5492662787437439, "num_tokens": 879883.0, "repeat_count": 0.0, - "routers_loss": 0.049495212733745575, + "routers_loss": 0.05275818333029747, "skip_count": 2.0, "step": 544, "text_loss": 0.26448264718055725 @@ -5185,13 +5185,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25390625, + "grad_norm": 0.2490234375, "learning_rate": 0.0009999515017937468, - "loss": 0.0718, + "loss": 0.071, "macro_f1": 0.32098764181137085, "num_tokens": 882223.0, "repeat_count": 0.0, - "routers_loss": 0.08043002337217331, + "routers_loss": 0.09335892647504807, "skip_count": 2.0, "step": 546, "text_loss": 0.208544060587883 @@ -5204,13 +5204,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.34765625, + "grad_norm": 0.376953125, "learning_rate": 0.0009999470951208684, - "loss": 0.086, + "loss": 0.0855, "macro_f1": 0.32098764181137085, "num_tokens": 885241.0, "repeat_count": 2.0, - "routers_loss": 0.22461950778961182, + "routers_loss": 0.22983254492282867, "skip_count": 0.0, "step": 548, "text_loss": 0.6612338423728943 @@ -5223,13 +5223,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.216796875, "learning_rate": 0.00099994249686732, - "loss": 0.0798, + "loss": 0.0786, "macro_f1": 0.3272727429866791, "num_tokens": 887897.0, "repeat_count": 1.0, - "routers_loss": 0.11754962801933289, + "routers_loss": 0.12858282029628754, "skip_count": 0.0, "step": 550, "text_loss": 0.4673548936843872 @@ -5242,13 +5242,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1611328125, + "grad_norm": 0.1591796875, "learning_rate": 0.0009999377070348638, - "loss": 0.0978, + "loss": 0.0944, "macro_f1": 0.3333333432674408, "num_tokens": 891224.0, "repeat_count": 0.0, - "routers_loss": 0.017412789165973663, + "routers_loss": 0.017421770840883255, "skip_count": 0.0, "step": 552, "text_loss": 0.6419258117675781 @@ -5261,13 +5261,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.15625, "learning_rate": 0.000999932725625335, - "loss": 0.0792, + "loss": 0.0791, "macro_f1": 0.32098764181137085, "num_tokens": 894578.0, "repeat_count": 0.0, - "routers_loss": 0.08969525247812271, + "routers_loss": 0.07890026271343231, "skip_count": 2.0, "step": 554, "text_loss": 0.5970752239227295 @@ -5280,13 +5280,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2158203125, + "grad_norm": 0.216796875, "learning_rate": 0.0009999275526406427, - "loss": 0.0803, + "loss": 0.0796, "macro_f1": 0.31446540355682373, "num_tokens": 897145.0, "repeat_count": 1.0, - "routers_loss": 0.09876437485218048, + "routers_loss": 0.09836960583925247, "skip_count": 1.0, "step": 556, "text_loss": 0.752425491809845 @@ -5299,13 +5299,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.1875, "learning_rate": 0.0009999221880827693, - "loss": 0.0887, + "loss": 0.0882, "macro_f1": 0.3333333432674408, "num_tokens": 900565.0, "repeat_count": 0.0, - "routers_loss": 0.019108204171061516, + "routers_loss": 0.017694659531116486, "skip_count": 0.0, "step": 558, "text_loss": 0.195619136095047 @@ -5318,32 +5318,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.2021484375, "learning_rate": 0.0009999166319537703, - "loss": 0.0573, + "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 903506.0, "repeat_count": 0.0, - "routers_loss": 0.019048813730478287, + "routers_loss": 0.019375264644622803, "skip_count": 0.0, "step": 560, "text_loss": 0.4603337347507477 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, "epoch": 2.638685060170238, - "f1_execute": 0.943396270275116, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1435546875, + "f1_skip": 0.5, + "grad_norm": 0.146484375, "learning_rate": 0.0009999108842557748, - "loss": 0.0947, - "macro_f1": 0.3144654333591461, + "loss": 0.0953, + "macro_f1": 0.4871794879436493, "num_tokens": 906380.0, "repeat_count": 0.0, - "routers_loss": 0.11889495700597763, + "routers_loss": 0.12013207376003265, "skip_count": 3.0, "step": 562, "text_loss": 0.6279402375221252 @@ -5356,13 +5356,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.228515625, + "grad_norm": 0.255859375, "learning_rate": 0.0009999049449909854, - "loss": 0.0771, + "loss": 0.0799, "macro_f1": 0.3272727429866791, "num_tokens": 909116.0, "repeat_count": 0.0, - "routers_loss": 0.06202332302927971, + "routers_loss": 0.06441342830657959, "skip_count": 1.0, "step": 564, "text_loss": 0.23741699755191803 @@ -5375,13 +5375,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1513671875, + "grad_norm": 0.15234375, "learning_rate": 0.0009998988141616781, - "loss": 0.0623, + "loss": 0.064, "macro_f1": 0.32098767161369324, "num_tokens": 912189.0, "repeat_count": 0.0, - "routers_loss": 0.08294244855642319, + "routers_loss": 0.08309414982795715, "skip_count": 1.0, "step": 566, "text_loss": 0.27780941128730774 @@ -5394,13 +5394,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.1962890625, "learning_rate": 0.0009998924917702023, - "loss": 0.0885, + "loss": 0.0876, "macro_f1": 0.3272727429866791, "num_tokens": 916279.0, "repeat_count": 1.0, - "routers_loss": 0.07545182853937149, + "routers_loss": 0.07197169959545135, "skip_count": 0.0, "step": 568, "text_loss": 0.6371755599975586 @@ -5413,13 +5413,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.2255859375, "learning_rate": 0.0009998859778189806, - "loss": 0.0712, + "loss": 0.0706, "macro_f1": 0.3333333432674408, "num_tokens": 919490.0, "repeat_count": 0.0, - "routers_loss": 0.008711219765245914, + "routers_loss": 0.008022273890674114, "skip_count": 0.0, "step": 570, "text_loss": 0.6028938889503479 @@ -5432,13 +5432,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.1650390625, "learning_rate": 0.000999879272310509, - "loss": 0.0837, + "loss": 0.084, "macro_f1": 0.3333333432674408, "num_tokens": 923694.0, "repeat_count": 0.0, - "routers_loss": 0.01639273390173912, + "routers_loss": 0.01634674146771431, "skip_count": 0.0, "step": 572, "text_loss": 0.7177054286003113 @@ -5451,13 +5451,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1669921875, + "grad_norm": 0.17578125, "learning_rate": 0.0009998723752473574, - "loss": 0.0707, + "loss": 0.0716, "macro_f1": 0.3272727429866791, "num_tokens": 926933.0, "repeat_count": 0.0, - "routers_loss": 0.04997137933969498, + "routers_loss": 0.060559045523405075, "skip_count": 1.0, "step": 574, "text_loss": 0.5203254818916321 @@ -5470,13 +5470,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1845703125, + "grad_norm": 0.185546875, "learning_rate": 0.0009998652866321687, - "loss": 0.0799, + "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 929832.0, "repeat_count": 0.0, - "routers_loss": 0.011360209435224533, + "routers_loss": 0.011485611088573933, "skip_count": 0.0, "step": 576, "text_loss": 0.6147452592849731 @@ -5489,13 +5489,13 @@ "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1669921875, + "grad_norm": 0.1552734375, "learning_rate": 0.000999858006467659, - "loss": 0.0658, + "loss": 0.0649, "macro_f1": 0.29333335161209106, "num_tokens": 933266.0, "repeat_count": 2.0, - "routers_loss": 0.31349560618400574, + "routers_loss": 0.2929030954837799, "skip_count": 4.0, "step": 578, "text_loss": 0.1720666140317917 @@ -5508,13 +5508,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.24609375, "learning_rate": 0.0009998505347566186, - "loss": 0.0801, + "loss": 0.0782, "macro_f1": 0.32098764181137085, "num_tokens": 937545.0, "repeat_count": 0.0, - "routers_loss": 0.058660347014665604, + "routers_loss": 0.053780000656843185, "skip_count": 2.0, "step": 580, "text_loss": 0.3258405327796936 @@ -5527,13 +5527,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.1416015625, "learning_rate": 0.00099984287150191, - "loss": 0.0578, + "loss": 0.0582, "macro_f1": 0.3333333432674408, "num_tokens": 941001.0, "repeat_count": 0.0, - "routers_loss": 0.025836754590272903, + "routers_loss": 0.02637636847794056, "skip_count": 0.0, "step": 582, "text_loss": 0.23762771487236023 @@ -5546,13 +5546,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1552734375, "learning_rate": 0.0009998350167064705, - "loss": 0.0683, + "loss": 0.0672, "macro_f1": 0.3333333432674408, "num_tokens": 943989.0, "repeat_count": 0.0, - "routers_loss": 0.016504868865013123, + "routers_loss": 0.01637580618262291, "skip_count": 0.0, "step": 584, "text_loss": 0.7460582852363586 @@ -5565,13 +5565,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1787109375, + "grad_norm": 0.1884765625, "learning_rate": 0.0009998269703733096, - "loss": 0.0685, + "loss": 0.0686, "macro_f1": 0.3272727429866791, "num_tokens": 947245.0, "repeat_count": 1.0, - "routers_loss": 0.1379794180393219, + "routers_loss": 0.13934117555618286, "skip_count": 0.0, "step": 586, "text_loss": 0.5284690260887146 @@ -5584,13 +5584,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.13671875, "learning_rate": 0.0009998187325055106, - "loss": 0.0657, + "loss": 0.0667, "macro_f1": 0.3333333432674408, "num_tokens": 950116.0, "repeat_count": 0.0, - "routers_loss": 0.01802757754921913, + "routers_loss": 0.02138397842645645, "skip_count": 0.0, "step": 588, "text_loss": 0.3920256197452545 @@ -5603,13 +5603,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.1533203125, "learning_rate": 0.0009998103031062305, - "loss": 0.0762, + "loss": 0.0778, "macro_f1": 0.3333333432674408, "num_tokens": 953277.0, "repeat_count": 0.0, - "routers_loss": 0.006902900990098715, + "routers_loss": 0.007098200265318155, "skip_count": 0.0, "step": 590, "text_loss": 0.7472905516624451 @@ -5622,13 +5622,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3046875, + "grad_norm": 0.318359375, "learning_rate": 0.0009998016821786994, - "loss": 0.0912, + "loss": 0.0872, "macro_f1": 0.32098764181137085, "num_tokens": 958229.0, "repeat_count": 1.0, - "routers_loss": 0.08348741382360458, + "routers_loss": 0.07946522533893585, "skip_count": 1.0, "step": 592, "text_loss": 0.5506448745727539 @@ -5641,13 +5641,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1357421875, "learning_rate": 0.000999792869726221, - "loss": 0.0527, + "loss": 0.0523, "macro_f1": 0.3272727429866791, "num_tokens": 961016.0, "repeat_count": 0.0, - "routers_loss": 0.08290062099695206, + "routers_loss": 0.0850791186094284, "skip_count": 1.0, "step": 594, "text_loss": 0.3824431002140045 @@ -5660,13 +5660,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1650390625, "learning_rate": 0.0009997838657521717, - "loss": 0.0643, + "loss": 0.0632, "macro_f1": 0.3333333432674408, "num_tokens": 963847.0, "repeat_count": 0.0, - "routers_loss": 0.018620988354086876, + "routers_loss": 0.016370445489883423, "skip_count": 0.0, "step": 596, "text_loss": 0.2139475792646408 @@ -5679,13 +5679,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.12890625, "learning_rate": 0.0009997746702600026, - "loss": 0.073, + "loss": 0.0702, "macro_f1": 0.307692289352417, "num_tokens": 966619.0, "repeat_count": 0.0, - "routers_loss": 0.1211671382188797, + "routers_loss": 0.1310746818780899, "skip_count": 3.0, "step": 598, "text_loss": 0.3651018440723419 @@ -5698,13 +5698,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2353515625, + "grad_norm": 0.23828125, "learning_rate": 0.0009997652832532372, - "loss": 0.079, + "loss": 0.0792, "macro_f1": 0.3272727429866791, "num_tokens": 970418.0, "repeat_count": 1.0, - "routers_loss": 0.15485027432441711, + "routers_loss": 0.14303378760814667, "skip_count": 0.0, "step": 600, "text_loss": 0.7094736099243164 @@ -5717,13 +5717,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009997557047354722, - "loss": 0.0562, + "loss": 0.0531, "macro_f1": 0.3272727429866791, "num_tokens": 973491.0, "repeat_count": 0.0, - "routers_loss": 0.036684274673461914, + "routers_loss": 0.03334212675690651, "skip_count": 1.0, "step": 602, "text_loss": 0.4812237024307251 @@ -5731,18 +5731,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 2.835926034634576, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.302734375, + "grad_norm": 0.2890625, "learning_rate": 0.0009997459347103783, - "loss": 0.0985, - "macro_f1": 0.3333333432674408, + "loss": 0.0956, + "macro_f1": 0.3272727429866791, "num_tokens": 976672.0, "repeat_count": 0.0, - "routers_loss": 0.026901578530669212, + "routers_loss": 0.02831871062517166, "skip_count": 0.0, "step": 604, "text_loss": 0.21737146377563477 @@ -5755,13 +5755,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12158203125, + "grad_norm": 0.1298828125, "learning_rate": 0.0009997359731816998, - "loss": 0.0632, + "loss": 0.0646, "macro_f1": 0.3333333432674408, "num_tokens": 979898.0, "repeat_count": 0.0, - "routers_loss": 0.01700405217707157, + "routers_loss": 0.017968013882637024, "skip_count": 0.0, "step": 606, "text_loss": 0.5458008050918579 @@ -5774,13 +5774,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2099609375, + "grad_norm": 0.224609375, "learning_rate": 0.0009997258201532536, - "loss": 0.0758, + "loss": 0.0751, "macro_f1": 0.3333333432674408, "num_tokens": 982811.0, "repeat_count": 0.0, - "routers_loss": 0.015013590455055237, + "routers_loss": 0.016256732866168022, "skip_count": 0.0, "step": 608, "text_loss": 0.8643257021903992 @@ -5793,13 +5793,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.2275390625, "learning_rate": 0.0009997154756289303, - "loss": 0.0576, + "loss": 0.0561, "macro_f1": 0.3333333432674408, "num_tokens": 985245.0, "repeat_count": 0.0, - "routers_loss": 0.02037946693599224, + "routers_loss": 0.021214161068201065, "skip_count": 0.0, "step": 610, "text_loss": 0.2204967886209488 @@ -5812,13 +5812,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.150390625, "learning_rate": 0.000999704939612694, - "loss": 0.0648, + "loss": 0.0636, "macro_f1": 0.3006536364555359, "num_tokens": 988539.0, "repeat_count": 3.0, - "routers_loss": 0.22834022343158722, + "routers_loss": 0.23249399662017822, "skip_count": 2.0, "step": 612, "text_loss": 0.32489025592803955 @@ -5831,13 +5831,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.095703125, "learning_rate": 0.0009996942121085824, - "loss": 0.0449, + "loss": 0.0445, "macro_f1": 0.3333333432674408, "num_tokens": 991660.0, "repeat_count": 0.0, - "routers_loss": 0.009838113561272621, + "routers_loss": 0.010706410743296146, "skip_count": 0.0, "step": 614, "text_loss": 0.4551754891872406 @@ -5850,13 +5850,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.353515625, + "grad_norm": 0.3671875, "learning_rate": 0.000999683293120706, - "loss": 0.1009, + "loss": 0.1016, "macro_f1": 0.3333333432674408, "num_tokens": 994828.0, "repeat_count": 0.0, - "routers_loss": 0.005943270865827799, + "routers_loss": 0.006676184479147196, "skip_count": 0.0, "step": 616, "text_loss": 0.6212068200111389 @@ -5869,13 +5869,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.38671875, + "grad_norm": 0.408203125, "learning_rate": 0.0009996721826532491, - "loss": 0.0941, + "loss": 0.0976, "macro_f1": 0.3076923191547394, "num_tokens": 997951.0, "repeat_count": 2.0, - "routers_loss": 0.21597740054130554, + "routers_loss": 0.2148125320672989, "skip_count": 2.0, "step": 618, "text_loss": 0.26514527201652527 @@ -5888,13 +5888,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.1904296875, "learning_rate": 0.000999660880710469, - "loss": 0.0896, + "loss": 0.0909, "macro_f1": 0.3333333432674408, "num_tokens": 1001139.0, "repeat_count": 0.0, - "routers_loss": 0.023726588115096092, + "routers_loss": 0.022332455962896347, "skip_count": 0.0, "step": 620, "text_loss": 0.26131340861320496 @@ -5907,13 +5907,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.169921875, "learning_rate": 0.0009996493872966971, "loss": 0.0732, "macro_f1": 0.3272727429866791, "num_tokens": 1003678.0, "repeat_count": 1.0, - "routers_loss": 0.08467255532741547, + "routers_loss": 0.08348730951547623, "skip_count": 0.0, "step": 622, "text_loss": 0.19151706993579865 @@ -5926,13 +5926,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.173828125, "learning_rate": 0.0009996377024163374, - "loss": 0.0816, + "loss": 0.0822, "macro_f1": 0.3333333432674408, "num_tokens": 1007082.0, "repeat_count": 0.0, - "routers_loss": 0.029468854889273643, + "routers_loss": 0.028577150776982307, "skip_count": 0.0, "step": 624, "text_loss": 0.305387407541275 @@ -5945,13 +5945,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.11279296875, "learning_rate": 0.0009996258260738676, - "loss": 0.0891, + "loss": 0.0892, "macro_f1": 0.3272727429866791, "num_tokens": 1010064.0, "repeat_count": 1.0, - "routers_loss": 0.09438466280698776, + "routers_loss": 0.08312026411294937, "skip_count": 0.0, "step": 626, "text_loss": 0.49436143040657043 @@ -5964,13 +5964,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1611328125, "learning_rate": 0.0009996137582738388, - "loss": 0.0581, + "loss": 0.0591, "macro_f1": 0.3333333432674408, "num_tokens": 1013462.0, "repeat_count": 0.0, - "routers_loss": 0.013679586350917816, + "routers_loss": 0.013337327167391777, "skip_count": 0.0, "step": 628, "text_loss": 0.6515294313430786 @@ -5983,13 +5983,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.140625, "learning_rate": 0.000999601499020875, - "loss": 0.0528, + "loss": 0.0537, "macro_f1": 0.3333333432674408, "num_tokens": 1016246.0, "repeat_count": 0.0, - "routers_loss": 0.029532987624406815, + "routers_loss": 0.029126765206456184, "skip_count": 0.0, "step": 630, "text_loss": 0.18834827840328217 @@ -6002,13 +6002,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.095703125, "learning_rate": 0.0009995890483196746, - "loss": 0.0601, + "loss": 0.0602, "macro_f1": 0.3272727429866791, "num_tokens": 1019286.0, "repeat_count": 0.0, - "routers_loss": 0.05516733601689339, + "routers_loss": 0.054844800382852554, "skip_count": 1.0, "step": 632, "text_loss": 0.6988179087638855 @@ -6021,13 +6021,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.357421875, + "grad_norm": 0.322265625, "learning_rate": 0.0009995764061750086, - "loss": 0.0785, + "loss": 0.0767, "macro_f1": 0.3333333432674408, "num_tokens": 1022207.0, "repeat_count": 0.0, - "routers_loss": 0.010254866443574429, + "routers_loss": 0.010095693171024323, "skip_count": 0.0, "step": 634, "text_loss": 0.558451771736145 @@ -6040,13 +6040,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.2890625, "learning_rate": 0.000999563572591721, - "loss": 0.0518, + "loss": 0.0521, "macro_f1": 0.32098764181137085, "num_tokens": 1025319.0, "repeat_count": 1.0, - "routers_loss": 0.07528360933065414, + "routers_loss": 0.0698433518409729, "skip_count": 1.0, "step": 636, "text_loss": 0.5961872935295105 @@ -6059,13 +6059,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1064453125, + "grad_norm": 0.11083984375, "learning_rate": 0.0009995505475747302, - "loss": 0.0844, + "loss": 0.0849, "macro_f1": 0.3272727429866791, "num_tokens": 1028362.0, "repeat_count": 0.0, - "routers_loss": 0.04301584139466286, + "routers_loss": 0.040211405605077744, "skip_count": 1.0, "step": 638, "text_loss": 0.546863317489624 @@ -6078,13 +6078,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11572265625, + "grad_norm": 0.119140625, "learning_rate": 0.0009995373311290272, - "loss": 0.0699, + "loss": 0.0709, "macro_f1": 0.3144654333591461, "num_tokens": 1032199.0, "repeat_count": 2.0, - "routers_loss": 0.14521080255508423, + "routers_loss": 0.1457643061876297, "skip_count": 1.0, "step": 640, "text_loss": 0.2137298285961151 @@ -6097,13 +6097,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1328125, + "grad_norm": 0.1279296875, "learning_rate": 0.0009995239232596764, - "loss": 0.0543, + "loss": 0.0545, "macro_f1": 0.3333333432674408, "num_tokens": 1035801.0, "repeat_count": 0.0, - "routers_loss": 0.01074797473847866, + "routers_loss": 0.011394930072128773, "skip_count": 0.0, "step": 642, "text_loss": 0.43054503202438354 @@ -6116,13 +6116,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1005859375, + "grad_norm": 0.1015625, "learning_rate": 0.0009995103239718163, - "loss": 0.0659, + "loss": 0.0665, "macro_f1": 0.3333333432674408, "num_tokens": 1039223.0, "repeat_count": 0.0, - "routers_loss": 0.009271817281842232, + "routers_loss": 0.00997432041913271, "skip_count": 0.0, "step": 644, "text_loss": 0.7749615907669067 @@ -6135,13 +6135,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1953125, + "grad_norm": 0.2275390625, "learning_rate": 0.0009994965332706573, - "loss": 0.0737, + "loss": 0.0755, "macro_f1": 0.3144654333591461, "num_tokens": 1042154.0, "repeat_count": 3.0, - "routers_loss": 0.10257050395011902, + "routers_loss": 0.10589150339365005, "skip_count": 0.0, "step": 646, "text_loss": 0.7812211513519287 @@ -6154,13 +6154,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.1943359375, "learning_rate": 0.0009994825511614846, - "loss": 0.0363, + "loss": 0.0383, "macro_f1": 0.3272727429866791, "num_tokens": 1045250.0, "repeat_count": 0.0, - "routers_loss": 0.07091924548149109, + "routers_loss": 0.0748734176158905, "skip_count": 1.0, "step": 648, "text_loss": 0.844803512096405 @@ -6173,13 +6173,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11572265625, + "grad_norm": 0.1220703125, "learning_rate": 0.0009994683776496562, - "loss": 0.0421, + "loss": 0.0433, "macro_f1": 0.3272727429866791, "num_tokens": 1048446.0, "repeat_count": 0.0, - "routers_loss": 0.034446243196725845, + "routers_loss": 0.03742415830492973, "skip_count": 1.0, "step": 650, "text_loss": 0.2098839282989502 @@ -6192,13 +6192,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.12890625, "learning_rate": 0.0009994540127406034, - "loss": 0.0593, + "loss": 0.0591, "macro_f1": 0.32098764181137085, "num_tokens": 1051840.0, "repeat_count": 0.0, - "routers_loss": 0.06077485531568527, + "routers_loss": 0.06025516986846924, "skip_count": 2.0, "step": 652, "text_loss": 0.27727583050727844 @@ -6211,13 +6211,13 @@ "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.2294921875, + "grad_norm": 0.181640625, "learning_rate": 0.0009994394564398306, - "loss": 0.0537, + "loss": 0.0519, "macro_f1": 0.521541953086853, "num_tokens": 1055142.0, "repeat_count": 4.0, - "routers_loss": 0.2382282167673111, + "routers_loss": 0.22807340323925018, "skip_count": 2.0, "step": 654, "text_loss": 0.9672397971153259 @@ -6230,13 +6230,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.142578125, + "grad_norm": 0.130859375, "learning_rate": 0.0009994247087529158, - "loss": 0.0613, + "loss": 0.0618, "macro_f1": 0.3333333432674408, "num_tokens": 1057698.0, "repeat_count": 0.0, - "routers_loss": 0.011971636675298214, + "routers_loss": 0.01348950993269682, "skip_count": 0.0, "step": 656, "text_loss": 0.6375506520271301 @@ -6249,13 +6249,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.212890625, + "grad_norm": 0.1953125, "learning_rate": 0.0009994097696855106, - "loss": 0.0414, + "loss": 0.0412, "macro_f1": 0.3333333432674408, "num_tokens": 1060624.0, "repeat_count": 0.0, - "routers_loss": 0.010221127420663834, + "routers_loss": 0.009649243205785751, "skip_count": 0.0, "step": 658, "text_loss": 0.5315385460853577 @@ -6268,13 +6268,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2265625, + "grad_norm": 0.2041015625, "learning_rate": 0.0009993946392433395, - "loss": 0.061, + "loss": 0.0609, "macro_f1": 0.307692289352417, "num_tokens": 1065076.0, "repeat_count": 0.0, - "routers_loss": 0.11860335618257523, + "routers_loss": 0.1250980943441391, "skip_count": 3.0, "step": 660, "text_loss": 0.25780341029167175 @@ -6287,13 +6287,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.1640625, "learning_rate": 0.0009993793174322006, - "loss": 0.0485, + "loss": 0.0471, "macro_f1": 0.3333333432674408, "num_tokens": 1068365.0, "repeat_count": 0.0, - "routers_loss": 0.011139829643070698, + "routers_loss": 0.011544390581548214, "skip_count": 0.0, "step": 662, "text_loss": 0.34876301884651184 @@ -6306,13 +6306,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.166015625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009993638042579654, - "loss": 0.0478, + "loss": 0.0473, "macro_f1": 0.3272727429866791, "num_tokens": 1071693.0, "repeat_count": 0.0, - "routers_loss": 0.03978770971298218, + "routers_loss": 0.03777370601892471, "skip_count": 1.0, "step": 664, "text_loss": 0.21811571717262268 @@ -6327,11 +6327,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.203125, "learning_rate": 0.0009993480997265783, - "loss": 0.0481, + "loss": 0.0475, "macro_f1": 0.5492662787437439, "num_tokens": 1074733.0, "repeat_count": 0.0, - "routers_loss": 0.051231011748313904, + "routers_loss": 0.049949806183576584, "skip_count": 2.0, "step": 666, "text_loss": 0.38410288095474243 @@ -6344,13 +6344,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.10302734375, "learning_rate": 0.0009993322038440572, - "loss": 0.0615, + "loss": 0.0605, "macro_f1": 0.3333333432674408, "num_tokens": 1077993.0, "repeat_count": 0.0, - "routers_loss": 0.024917088449001312, + "routers_loss": 0.0247171800583601, "skip_count": 0.0, "step": 668, "text_loss": 0.25576895475387573 @@ -6363,13 +6363,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1982421875, + "grad_norm": 0.216796875, "learning_rate": 0.000999316116616494, - "loss": 0.0627, + "loss": 0.0619, "macro_f1": 0.3333333432674408, "num_tokens": 1080491.0, "repeat_count": 0.0, - "routers_loss": 0.008834881708025932, + "routers_loss": 0.008118715137243271, "skip_count": 0.0, "step": 670, "text_loss": 0.6269792914390564 @@ -6382,13 +6382,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.21875, + "grad_norm": 0.173828125, "learning_rate": 0.0009992998380500527, "loss": 0.0462, "macro_f1": 0.3272727429866791, "num_tokens": 1083817.0, "repeat_count": 0.0, - "routers_loss": 0.033405229449272156, + "routers_loss": 0.03366057574748993, "skip_count": 1.0, "step": 672, "text_loss": 0.26891493797302246 @@ -6401,13 +6401,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.13671875, + "grad_norm": 0.1474609375, "learning_rate": 0.0009992833681509716, - "loss": 0.0523, + "loss": 0.0529, "macro_f1": 0.3333333432674408, "num_tokens": 1087368.0, "repeat_count": 0.0, - "routers_loss": 0.020753704011440277, + "routers_loss": 0.020552074536681175, "skip_count": 0.0, "step": 674, "text_loss": 0.14421936869621277 @@ -6420,13 +6420,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.18359375, "learning_rate": 0.0009992667069255619, - "loss": 0.0698, + "loss": 0.0696, "macro_f1": 0.31446540355682373, "num_tokens": 1090452.0, "repeat_count": 0.0, - "routers_loss": 0.06932353973388672, + "routers_loss": 0.06937336176633835, "skip_count": 2.0, "step": 676, "text_loss": 0.24999259412288666 @@ -6439,13 +6439,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.08740234375, "learning_rate": 0.0009992498543802085, - "loss": 0.059, + "loss": 0.0588, "macro_f1": 0.3272727429866791, "num_tokens": 1093996.0, "repeat_count": 1.0, - "routers_loss": 0.032903749495744705, + "routers_loss": 0.0380021296441555, "skip_count": 0.0, "step": 678, "text_loss": 0.42473849654197693 @@ -6458,32 +6458,32 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.2099609375, + "grad_norm": 0.2119140625, "learning_rate": 0.0009992328105213688, - "loss": 0.0417, + "loss": 0.0411, "macro_f1": 0.4400000274181366, "num_tokens": 1096837.0, "repeat_count": 1.0, - "routers_loss": 0.19733747839927673, + "routers_loss": 0.20885063707828522, "skip_count": 4.0, "step": 680, "text_loss": 0.3829527199268341 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 26.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 3.2019371881420606, - "f1_execute": 1.0, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.154296875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1474609375, "learning_rate": 0.0009992155753555747, - "loss": 0.0729, - "macro_f1": 0.6666666865348816, + "loss": 0.0722, + "macro_f1": 0.5492662787437439, "num_tokens": 1100320.0, "repeat_count": 0.0, - "routers_loss": 0.013452666811645031, + "routers_loss": 0.018230699002742767, "skip_count": 2.0, "step": 682, "text_loss": 0.6190969944000244 @@ -6496,13 +6496,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.30859375, "learning_rate": 0.0009991981488894303, "loss": 0.0681, "macro_f1": 0.32098767161369324, "num_tokens": 1103682.0, "repeat_count": 0.0, - "routers_loss": 0.05302857980132103, + "routers_loss": 0.05550144240260124, "skip_count": 1.0, "step": 684, "text_loss": 0.44418027997016907 @@ -6515,13 +6515,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.2158203125, "learning_rate": 0.0009991805311296133, - "loss": 0.0527, + "loss": 0.0507, "macro_f1": 0.32098764181137085, "num_tokens": 1106427.0, "repeat_count": 0.0, - "routers_loss": 0.08124994486570358, + "routers_loss": 0.07990608364343643, "skip_count": 2.0, "step": 686, "text_loss": 0.5577231645584106 @@ -6534,13 +6534,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.22265625, + "grad_norm": 0.1962890625, "learning_rate": 0.0009991627220828753, - "loss": 0.0579, + "loss": 0.0568, "macro_f1": 0.32098764181137085, "num_tokens": 1109314.0, "repeat_count": 0.0, - "routers_loss": 0.058633625507354736, + "routers_loss": 0.05167485028505325, "skip_count": 2.0, "step": 688, "text_loss": 0.27325430512428284 @@ -6553,13 +6553,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1142578125, + "grad_norm": 0.10693359375, "learning_rate": 0.0009991447217560408, - "loss": 0.0533, + "loss": 0.0521, "macro_f1": 0.5492662787437439, "num_tokens": 1112748.0, "repeat_count": 0.0, - "routers_loss": 0.04703643172979355, + "routers_loss": 0.04621964320540428, "skip_count": 2.0, "step": 690, "text_loss": 0.5288321375846863 @@ -6572,13 +6572,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.1962890625, "learning_rate": 0.000999126530156007, - "loss": 0.0485, + "loss": 0.0499, "macro_f1": 0.307692289352417, "num_tokens": 1116965.0, "repeat_count": 1.0, - "routers_loss": 0.11615128815174103, + "routers_loss": 0.11950276792049408, "skip_count": 2.0, "step": 692, "text_loss": 0.14215624332427979 @@ -6591,13 +6591,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2314453125, + "grad_norm": 0.2353515625, "learning_rate": 0.0009991081472897454, - "loss": 0.0718, + "loss": 0.0722, "macro_f1": 0.3333333432674408, "num_tokens": 1120570.0, "repeat_count": 0.0, - "routers_loss": 0.017403846606612206, + "routers_loss": 0.01905500330030918, "skip_count": 0.0, "step": 694, "text_loss": 0.41862696409225464 @@ -6610,13 +6610,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1357421875, "learning_rate": 0.0009990895731643002, - "loss": 0.0444, + "loss": 0.0464, "macro_f1": 0.3272727429866791, "num_tokens": 1124009.0, "repeat_count": 1.0, - "routers_loss": 0.07067303359508514, + "routers_loss": 0.06974572688341141, "skip_count": 0.0, "step": 696, "text_loss": 0.41160130500793457 @@ -6629,13 +6629,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1923828125, "learning_rate": 0.000999070807786789, - "loss": 0.0527, + "loss": 0.0531, "macro_f1": 0.3272727429866791, "num_tokens": 1127370.0, "repeat_count": 1.0, - "routers_loss": 0.07131028175354004, + "routers_loss": 0.07055293023586273, "skip_count": 0.0, "step": 698, "text_loss": 0.48068273067474365 @@ -6648,13 +6648,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.18359375, + "grad_norm": 0.197265625, "learning_rate": 0.000999051851164403, - "loss": 0.0629, + "loss": 0.0619, "macro_f1": 0.32098764181137085, "num_tokens": 1130234.0, "repeat_count": 1.0, - "routers_loss": 0.1152748316526413, + "routers_loss": 0.12506946921348572, "skip_count": 1.0, "step": 700, "text_loss": 0.47925490140914917 @@ -6667,13 +6667,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.216796875, + "grad_norm": 0.1943359375, "learning_rate": 0.000999032703304406, - "loss": 0.0663, + "loss": 0.0674, "macro_f1": 0.3333333432674408, "num_tokens": 1132874.0, "repeat_count": 0.0, - "routers_loss": 0.0077212234027683735, + "routers_loss": 0.00809287466108799, "skip_count": 0.0, "step": 702, "text_loss": 0.47433632612228394 @@ -6686,13 +6686,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.099609375, + "grad_norm": 0.1064453125, "learning_rate": 0.0009990133642141358, - "loss": 0.0494, + "loss": 0.0497, "macro_f1": 0.5492662787437439, "num_tokens": 1136011.0, "repeat_count": 0.0, - "routers_loss": 0.02726336568593979, + "routers_loss": 0.0319170281291008, "skip_count": 2.0, "step": 704, "text_loss": 0.6574832201004028 @@ -6705,13 +6705,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.32421875, + "grad_norm": 0.33984375, "learning_rate": 0.000998993833901003, - "loss": 0.0615, + "loss": 0.0619, "macro_f1": 0.32098764181137085, "num_tokens": 1139674.0, "repeat_count": 0.0, - "routers_loss": 0.0958542674779892, + "routers_loss": 0.09850362688302994, "skip_count": 2.0, "step": 706, "text_loss": 0.7660127282142639 @@ -6724,13 +6724,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.12158203125, "learning_rate": 0.0009989741123724919, - "loss": 0.0583, + "loss": 0.0574, "macro_f1": 0.3333333432674408, "num_tokens": 1143558.0, "repeat_count": 0.0, - "routers_loss": 0.007100600749254227, + "routers_loss": 0.006673311349004507, "skip_count": 0.0, "step": 708, "text_loss": 0.5976111888885498 @@ -6743,13 +6743,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.154296875, "learning_rate": 0.0009989541996361594, - "loss": 0.0445, + "loss": 0.045, "macro_f1": 0.3333333432674408, "num_tokens": 1146122.0, "repeat_count": 0.0, - "routers_loss": 0.0047812811098992825, + "routers_loss": 0.004988791421055794, "skip_count": 0.0, "step": 710, "text_loss": 0.5256119966506958 @@ -6762,13 +6762,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1015625, + "grad_norm": 0.1044921875, "learning_rate": 0.0009989340956996367, - "loss": 0.052, + "loss": 0.0528, "macro_f1": 0.3333333432674408, "num_tokens": 1149546.0, "repeat_count": 0.0, - "routers_loss": 0.006643407512456179, + "routers_loss": 0.0067769973538815975, "skip_count": 0.0, "step": 712, "text_loss": 0.5040497779846191 @@ -6781,13 +6781,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2890625, + "grad_norm": 0.26953125, "learning_rate": 0.0009989138005706273, - "loss": 0.0719, + "loss": 0.0735, "macro_f1": 0.32098764181137085, "num_tokens": 1153195.0, "repeat_count": 0.0, - "routers_loss": 0.0910436138510704, + "routers_loss": 0.09899546951055527, "skip_count": 2.0, "step": 714, "text_loss": 0.20803412795066833 @@ -6800,13 +6800,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1484375, + "grad_norm": 0.1396484375, "learning_rate": 0.000998893314256908, - "loss": 0.0649, + "loss": 0.064, "macro_f1": 0.3333333432674408, "num_tokens": 1157081.0, "repeat_count": 0.0, - "routers_loss": 0.010978946462273598, + "routers_loss": 0.010492355562746525, "skip_count": 0.0, "step": 716, "text_loss": 0.23077639937400818 @@ -6819,13 +6819,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.123046875, + "grad_norm": 0.1298828125, "learning_rate": 0.0009988726367663298, - "loss": 0.0543, + "loss": 0.0539, "macro_f1": 0.3333333432674408, "num_tokens": 1160079.0, "repeat_count": 0.0, - "routers_loss": 0.009956461377441883, + "routers_loss": 0.01063773687928915, "skip_count": 0.0, "step": 718, "text_loss": 0.6085864901542664 @@ -6838,13 +6838,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1640625, "learning_rate": 0.0009988517681068163, - "loss": 0.0412, + "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1163249.0, "repeat_count": 1.0, - "routers_loss": 0.057210199534893036, + "routers_loss": 0.05981874838471413, "skip_count": 0.0, "step": 720, "text_loss": 0.4047050476074219 @@ -6857,32 +6857,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.171875, "learning_rate": 0.0009988307082863638, - "loss": 0.0364, + "loss": 0.0361, "macro_f1": 0.3333333432674408, "num_tokens": 1166259.0, "repeat_count": 0.0, - "routers_loss": 0.01035996899008751, + "routers_loss": 0.009750043973326683, "skip_count": 0.0, "step": 722, "text_loss": 0.5306474566459656 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 3.3991781626063986, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.2412109375, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.240234375, "learning_rate": 0.0009988094573130434, - "loss": 0.0661, - "macro_f1": 0.3076923191547394, + "loss": 0.063, + "macro_f1": 0.5359477400779724, "num_tokens": 1168887.0, "repeat_count": 2.0, - "routers_loss": 0.18087820708751678, + "routers_loss": 0.18601104617118835, "skip_count": 2.0, "step": 724, "text_loss": 0.53528892993927 @@ -6895,32 +6895,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.142578125, "learning_rate": 0.0009987880151949974, - "loss": 0.0505, + "loss": 0.0496, "macro_f1": 0.3272727429866791, "num_tokens": 1172625.0, "repeat_count": 0.0, - "routers_loss": 0.04720238968729973, + "routers_loss": 0.02845010720193386, "skip_count": 1.0, "step": 726, "text_loss": 0.4760453701019287 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 26.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 3.417963017317288, - "f1_execute": 1.0, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.2216796875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, "learning_rate": 0.0009987663819404434, - "loss": 0.0603, - "macro_f1": 0.6666666865348816, + "loss": 0.06, + "macro_f1": 0.5492662787437439, "num_tokens": 1176580.0, "repeat_count": 0.0, - "routers_loss": 0.015407778322696686, + "routers_loss": 0.017596980556845665, "skip_count": 2.0, "step": 728, "text_loss": 0.5146099328994751 @@ -6933,13 +6933,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.1318359375, "learning_rate": 0.000998744557557671, - "loss": 0.0489, + "loss": 0.0484, "macro_f1": 0.3272727429866791, "num_tokens": 1179804.0, "repeat_count": 0.0, - "routers_loss": 0.060891781002283096, + "routers_loss": 0.0625474750995636, "skip_count": 1.0, "step": 730, "text_loss": 0.27738022804260254 @@ -6947,18 +6947,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 3.436747872028177, - "f1_execute": 0.943396270275116, + "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.203125, "learning_rate": 0.0009987225420550433, - "loss": 0.0825, - "macro_f1": 0.3144654333591461, + "loss": 0.0796, + "macro_f1": 0.307692289352417, "num_tokens": 1182658.0, "repeat_count": 1.0, - "routers_loss": 0.1661442220211029, + "routers_loss": 0.16188351809978485, "skip_count": 2.0, "step": 732, "text_loss": 0.23231445252895355 @@ -6966,18 +6966,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 3.446140299383622, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.2001953125, "learning_rate": 0.0009987003354409965, - "loss": 0.0634, - "macro_f1": 0.3333333432674408, + "loss": 0.0626, + "macro_f1": 0.3272727429866791, "num_tokens": 1185451.0, "repeat_count": 0.0, - "routers_loss": 0.02108248695731163, + "routers_loss": 0.02391529455780983, "skip_count": 0.0, "step": 734, "text_loss": 0.4496627151966095 @@ -6990,13 +6990,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.248046875, + "grad_norm": 0.234375, "learning_rate": 0.0009986779377240405, - "loss": 0.0534, + "loss": 0.0513, "macro_f1": 0.32098767161369324, "num_tokens": 1188666.0, "repeat_count": 0.0, - "routers_loss": 0.08318125456571579, + "routers_loss": 0.08435963839292526, "skip_count": 1.0, "step": 736, "text_loss": 0.4950787127017975 @@ -7009,13 +7009,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11962890625, + "grad_norm": 0.1220703125, "learning_rate": 0.000998655348912758, - "loss": 0.0514, + "loss": 0.0515, "macro_f1": 0.3333333432674408, "num_tokens": 1193035.0, "repeat_count": 0.0, - "routers_loss": 0.015889234840869904, + "routers_loss": 0.01648722216486931, "skip_count": 0.0, "step": 738, "text_loss": 0.24761848151683807 @@ -7028,13 +7028,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1513671875, "learning_rate": 0.0009986325690158051, "loss": 0.0435, "macro_f1": 0.3333333432674408, "num_tokens": 1196840.0, "repeat_count": 0.0, - "routers_loss": 0.01378484908491373, + "routers_loss": 0.013143910095095634, "skip_count": 0.0, "step": 740, "text_loss": 0.15662719309329987 @@ -7047,13 +7047,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1787109375, + "grad_norm": 0.1611328125, "learning_rate": 0.0009986095980419113, - "loss": 0.076, + "loss": 0.0757, "macro_f1": 0.3333333432674408, "num_tokens": 1200573.0, "repeat_count": 0.0, - "routers_loss": 0.02673683874309063, + "routers_loss": 0.026706280186772346, "skip_count": 0.0, "step": 742, "text_loss": 0.16725164651870728 @@ -7066,13 +7066,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.185546875, + "grad_norm": 0.1982421875, "learning_rate": 0.0009985864359998787, - "loss": 0.0778, + "loss": 0.0795, "macro_f1": 0.3006536364555359, "num_tokens": 1203589.0, "repeat_count": 2.0, - "routers_loss": 0.27776041626930237, + "routers_loss": 0.28607678413391113, "skip_count": 3.0, "step": 744, "text_loss": 0.6350882053375244 @@ -7085,13 +7085,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1513671875, + "grad_norm": 0.1474609375, "learning_rate": 0.0009985630828985835, - "loss": 0.0575, + "loss": 0.0572, "macro_f1": 0.3272727429866791, "num_tokens": 1206422.0, "repeat_count": 0.0, - "routers_loss": 0.0575483962893486, + "routers_loss": 0.05685260891914368, "skip_count": 1.0, "step": 746, "text_loss": 0.33779552578926086 @@ -7104,13 +7104,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1005859375, + "grad_norm": 0.09814453125, "learning_rate": 0.0009985395387469742, - "loss": 0.0478, + "loss": 0.0458, "macro_f1": 0.5492662787437439, "num_tokens": 1211588.0, "repeat_count": 0.0, - "routers_loss": 0.0458797849714756, + "routers_loss": 0.0437830351293087, "skip_count": 2.0, "step": 748, "text_loss": 0.28664472699165344 @@ -7123,13 +7123,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.15625, "learning_rate": 0.0009985158035540735, - "loss": 0.0701, + "loss": 0.0714, "macro_f1": 0.32098764181137085, "num_tokens": 1214580.0, "repeat_count": 2.0, - "routers_loss": 0.07850238680839539, + "routers_loss": 0.07074898481369019, "skip_count": 0.0, "step": 750, "text_loss": 0.3939313292503357 @@ -7142,13 +7142,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.21484375, "learning_rate": 0.0009984918773289762, - "loss": 0.0702, + "loss": 0.0699, "macro_f1": 0.3333333432674408, "num_tokens": 1217388.0, "repeat_count": 0.0, - "routers_loss": 0.009507967159152031, + "routers_loss": 0.009757856838405132, "skip_count": 0.0, "step": 752, "text_loss": 0.37641215324401855 @@ -7161,13 +7161,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1484375, + "grad_norm": 0.140625, "learning_rate": 0.0009984677600808512, - "loss": 0.0543, + "loss": 0.054, "macro_f1": 0.3333333432674408, "num_tokens": 1219960.0, "repeat_count": 0.0, - "routers_loss": 0.02620997279882431, + "routers_loss": 0.02515069581568241, "skip_count": 0.0, "step": 754, "text_loss": 0.155938982963562 @@ -7180,13 +7180,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3359375, + "grad_norm": 0.30078125, "learning_rate": 0.0009984434518189405, - "loss": 0.0791, + "loss": 0.0764, "macro_f1": 0.3333333432674408, "num_tokens": 1223234.0, "repeat_count": 0.0, - "routers_loss": 0.02798631228506565, + "routers_loss": 0.025766927748918533, "skip_count": 0.0, "step": 756, "text_loss": 0.691118061542511 @@ -7201,11 +7201,11 @@ "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.0009984189525525584, - "loss": 0.046, + "loss": 0.0451, "macro_f1": 0.5359477400779724, "num_tokens": 1225764.0, "repeat_count": 2.0, - "routers_loss": 0.16614431142807007, + "routers_loss": 0.1782722771167755, "skip_count": 2.0, "step": 758, "text_loss": 0.3592209219932556 @@ -7218,13 +7218,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.193359375, + "grad_norm": 0.189453125, "learning_rate": 0.0009983942622910935, - "loss": 0.0669, + "loss": 0.0659, "macro_f1": 0.3333333432674408, "num_tokens": 1230097.0, "repeat_count": 0.0, - "routers_loss": 0.008541896007955074, + "routers_loss": 0.00825568474829197, "skip_count": 0.0, "step": 760, "text_loss": 0.4646475315093994 @@ -7237,13 +7237,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.1962890625, "learning_rate": 0.0009983693810440074, - "loss": 0.0478, + "loss": 0.0477, "macro_f1": 0.32098764181137085, "num_tokens": 1233140.0, "repeat_count": 0.0, - "routers_loss": 0.045411624014377594, + "routers_loss": 0.04156976938247681, "skip_count": 2.0, "step": 762, "text_loss": 0.298682302236557 @@ -7256,13 +7256,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.380859375, + "grad_norm": 0.3515625, "learning_rate": 0.000998344308820834, - "loss": 0.0689, + "loss": 0.0666, "macro_f1": 0.3272727429866791, "num_tokens": 1236305.0, "repeat_count": 0.0, - "routers_loss": 0.052299100905656815, + "routers_loss": 0.05697929114103317, "skip_count": 1.0, "step": 764, "text_loss": 0.5249121189117432 @@ -7275,13 +7275,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.18359375, "learning_rate": 0.0009983190456311817, - "loss": 0.0602, + "loss": 0.0592, "macro_f1": 0.3144654333591461, "num_tokens": 1239673.0, "repeat_count": 0.0, - "routers_loss": 0.09140212833881378, + "routers_loss": 0.09547408670186996, "skip_count": 3.0, "step": 766, "text_loss": 0.41277334094047546 @@ -7294,13 +7294,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.201171875, + "grad_norm": 0.185546875, "learning_rate": 0.000998293591484731, - "loss": 0.0475, + "loss": 0.0484, "macro_f1": 0.5492662787437439, "num_tokens": 1242292.0, "repeat_count": 0.0, - "routers_loss": 0.030750583857297897, + "routers_loss": 0.030693158507347107, "skip_count": 2.0, "step": 768, "text_loss": 0.1583656519651413 @@ -7313,13 +7313,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16796875, + "grad_norm": 0.15234375, "learning_rate": 0.000998267946391236, - "loss": 0.052, + "loss": 0.051, "macro_f1": 0.3333333432674408, "num_tokens": 1244661.0, "repeat_count": 0.0, - "routers_loss": 0.010202950797975063, + "routers_loss": 0.01211300864815712, "skip_count": 0.0, "step": 770, "text_loss": 0.4629349112510681 @@ -7332,13 +7332,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.0927734375, "learning_rate": 0.0009982421103605238, - "loss": 0.0434, + "loss": 0.0441, "macro_f1": 0.32098764181137085, "num_tokens": 1248688.0, "repeat_count": 0.0, - "routers_loss": 0.07364192605018616, + "routers_loss": 0.0665968507528305, "skip_count": 2.0, "step": 772, "text_loss": 0.4019293785095215 @@ -7353,11 +7353,11 @@ "f1_skip": 0.0, "grad_norm": 0.2890625, "learning_rate": 0.000998216083402495, - "loss": 0.0606, + "loss": 0.0613, "macro_f1": 0.32098764181137085, "num_tokens": 1251395.0, "repeat_count": 0.0, - "routers_loss": 0.06553081423044205, + "routers_loss": 0.07186859846115112, "skip_count": 2.0, "step": 774, "text_loss": 0.4659276604652405 @@ -7370,13 +7370,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.302734375, "learning_rate": 0.0009981898655271235, - "loss": 0.0475, + "loss": 0.0488, "macro_f1": 0.3333333432674408, "num_tokens": 1254888.0, "repeat_count": 0.0, - "routers_loss": 0.008751659654080868, + "routers_loss": 0.007823926396667957, "skip_count": 0.0, "step": 776, "text_loss": 0.5160359740257263 @@ -7389,13 +7389,13 @@ "f1_execute": 0.9130434989929199, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.12060546875, + "grad_norm": 0.11962890625, "learning_rate": 0.0009981634567444557, - "loss": 0.0777, + "loss": 0.0775, "macro_f1": 0.590062141418457, "num_tokens": 1258250.0, "repeat_count": 3.0, - "routers_loss": 0.24522721767425537, + "routers_loss": 0.24624499678611755, "skip_count": 4.0, "step": 778, "text_loss": 0.29319918155670166 @@ -7408,13 +7408,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.263671875, "learning_rate": 0.0009981368570646115, "loss": 0.0885, "macro_f1": 0.3272727429866791, "num_tokens": 1260916.0, "repeat_count": 0.0, - "routers_loss": 0.03767623379826546, + "routers_loss": 0.030730176717042923, "skip_count": 1.0, "step": 780, "text_loss": 0.624981164932251 @@ -7427,13 +7427,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.142578125, "learning_rate": 0.0009981100664977838, - "loss": 0.0708, + "loss": 0.0699, "macro_f1": 0.3333333432674408, "num_tokens": 1264004.0, "repeat_count": 0.0, - "routers_loss": 0.006098059006035328, + "routers_loss": 0.006829176563769579, "skip_count": 0.0, "step": 782, "text_loss": 0.6137266159057617 @@ -7446,13 +7446,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1748046875, "learning_rate": 0.0009980830850542391, - "loss": 0.0589, + "loss": 0.058, "macro_f1": 0.3333333432674408, "num_tokens": 1267130.0, "repeat_count": 0.0, - "routers_loss": 0.01731623336672783, + "routers_loss": 0.018471000716090202, "skip_count": 0.0, "step": 784, "text_loss": 0.15213175117969513 @@ -7465,13 +7465,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2294921875, + "grad_norm": 0.2353515625, "learning_rate": 0.0009980559127443166, - "loss": 0.0526, + "loss": 0.052, "macro_f1": 0.3333333432674408, "num_tokens": 1271129.0, "repeat_count": 0.0, - "routers_loss": 0.0076471962966024876, + "routers_loss": 0.007903140969574451, "skip_count": 0.0, "step": 786, "text_loss": 0.5768613219261169 @@ -7484,13 +7484,13 @@ "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12353515625, + "grad_norm": 0.130859375, "learning_rate": 0.000998028549578429, - "loss": 0.0745, + "loss": 0.0719, "macro_f1": 0.307692289352417, "num_tokens": 1274232.0, "repeat_count": 0.0, - "routers_loss": 0.0637628585100174, + "routers_loss": 0.06737866252660751, "skip_count": 3.0, "step": 788, "text_loss": 0.2877073585987091 @@ -7503,13 +7503,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1748046875, "learning_rate": 0.0009980009955670615, - "loss": 0.0699, + "loss": 0.0698, "macro_f1": 0.3144654333591461, "num_tokens": 1277193.0, "repeat_count": 0.0, - "routers_loss": 0.10882514715194702, + "routers_loss": 0.10194934904575348, "skip_count": 3.0, "step": 790, "text_loss": 0.11860492825508118 @@ -7522,13 +7522,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1298828125, + "grad_norm": 0.126953125, "learning_rate": 0.000997973250720773, - "loss": 0.056, + "loss": 0.0552, "macro_f1": 0.32098764181137085, "num_tokens": 1280960.0, "repeat_count": 0.0, - "routers_loss": 0.10924118757247925, + "routers_loss": 0.10297708213329315, "skip_count": 2.0, "step": 792, "text_loss": 0.13477706909179688 @@ -7541,13 +7541,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1533203125, + "grad_norm": 0.1611328125, "learning_rate": 0.0009979453150501954, - "loss": 0.0664, + "loss": 0.0663, "macro_f1": 0.32098764181137085, "num_tokens": 1284611.0, "repeat_count": 1.0, - "routers_loss": 0.06571807712316513, + "routers_loss": 0.06122037023305893, "skip_count": 1.0, "step": 794, "text_loss": 0.40569379925727844 @@ -7560,13 +7560,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1181640625, + "grad_norm": 0.1279296875, "learning_rate": 0.000997917188566034, - "loss": 0.0616, + "loss": 0.062, "macro_f1": 0.32098764181137085, "num_tokens": 1287834.0, "repeat_count": 0.0, - "routers_loss": 0.058966971933841705, + "routers_loss": 0.061135001480579376, "skip_count": 2.0, "step": 796, "text_loss": 0.2829287648200989 @@ -7579,32 +7579,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.109375, "learning_rate": 0.0009978888712790664, - "loss": 0.067, + "loss": 0.0654, "macro_f1": 0.3272727429866791, "num_tokens": 1291666.0, "repeat_count": 0.0, - "routers_loss": 0.04844636470079422, + "routers_loss": 0.04841872677206993, "skip_count": 1.0, "step": 798, "text_loss": 1.011757254600525 }, { "acc_repeat": 0.0, - "acc_skip": 0.4000000059604645, - "avg_layers": 26.0, + "acc_skip": 0.20000000298023224, + "avg_layers": 27.0, "epoch": 3.756090402113296, - "f1_execute": 0.9166666865348816, + "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, - "f1_skip": 0.5714285969734192, - "grad_norm": 0.1416015625, + "f1_skip": 0.3333333134651184, + "grad_norm": 0.14453125, "learning_rate": 0.0009978603632001444, - "loss": 0.0634, - "macro_f1": 0.4960317611694336, + "loss": 0.0636, + "macro_f1": 0.4104308485984802, "num_tokens": 1294627.0, "repeat_count": 1.0, - "routers_loss": 0.1591777801513672, + "routers_loss": 0.15698759257793427, "skip_count": 5.0, "step": 800, "text_loss": 0.4457623362541199 @@ -7617,13 +7617,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2734375, + "grad_norm": 0.283203125, "learning_rate": 0.0009978316643401916, - "loss": 0.0694, + "loss": 0.0688, "macro_f1": 0.3333333432674408, "num_tokens": 1297711.0, "repeat_count": 0.0, - "routers_loss": 0.017735568806529045, + "routers_loss": 0.018952010199427605, "skip_count": 0.0, "step": 802, "text_loss": 0.2069481462240219 @@ -7636,13 +7636,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.14453125, "learning_rate": 0.0009978027747102062, - "loss": 0.0477, + "loss": 0.0479, "macro_f1": 0.3333333432674408, "num_tokens": 1300569.0, "repeat_count": 0.0, - "routers_loss": 0.012401525862514973, + "routers_loss": 0.014538386836647987, "skip_count": 0.0, "step": 804, "text_loss": 0.4983852505683899 @@ -7655,13 +7655,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2080078125, + "grad_norm": 0.2109375, "learning_rate": 0.0009977736943212584, - "loss": 0.0735, + "loss": 0.0721, "macro_f1": 0.32098764181137085, "num_tokens": 1303969.0, "repeat_count": 0.0, - "routers_loss": 0.10736164450645447, + "routers_loss": 0.11164087057113647, "skip_count": 2.0, "step": 806, "text_loss": 0.2910642921924591 @@ -7674,13 +7674,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2001953125, + "grad_norm": 0.1826171875, "learning_rate": 0.000997744423184492, - "loss": 0.0428, + "loss": 0.0424, "macro_f1": 0.3272727429866791, "num_tokens": 1307263.0, "repeat_count": 0.0, - "routers_loss": 0.0595436617732048, + "routers_loss": 0.06073406711220741, "skip_count": 1.0, "step": 808, "text_loss": 0.18831779062747955 @@ -7693,13 +7693,13 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.240234375, + "grad_norm": 0.26171875, "learning_rate": 0.0009977149613111236, - "loss": 0.0494, + "loss": 0.0486, "macro_f1": 0.4400000274181366, "num_tokens": 1309953.0, "repeat_count": 1.0, - "routers_loss": 0.12617000937461853, + "routers_loss": 0.11035524308681488, "skip_count": 4.0, "step": 810, "text_loss": 0.7872759699821472 @@ -7712,13 +7712,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1669921875, + "grad_norm": 0.1650390625, "learning_rate": 0.0009976853087124433, - "loss": 0.0537, + "loss": 0.0536, "macro_f1": 0.3333333432674408, "num_tokens": 1313243.0, "repeat_count": 0.0, - "routers_loss": 0.021242506802082062, + "routers_loss": 0.021804286167025566, "skip_count": 0.0, "step": 812, "text_loss": 0.22349292039871216 @@ -7731,13 +7731,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.318359375, + "grad_norm": 0.28125, "learning_rate": 0.0009976554653998138, - "loss": 0.0617, + "loss": 0.0612, "macro_f1": 0.31446540355682373, "num_tokens": 1316165.0, "repeat_count": 0.0, - "routers_loss": 0.10387415438890457, + "routers_loss": 0.10715524107217789, "skip_count": 2.0, "step": 814, "text_loss": 0.18035532534122467 @@ -7750,13 +7750,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.1279296875, "learning_rate": 0.000997625431384671, - "loss": 0.0565, + "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1319206.0, "repeat_count": 0.0, - "routers_loss": 0.007816939614713192, + "routers_loss": 0.007173649035394192, "skip_count": 0.0, "step": 816, "text_loss": 0.48928648233413696 @@ -7769,13 +7769,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.1357421875, "learning_rate": 0.0009975952066785243, - "loss": 0.0654, + "loss": 0.0655, "macro_f1": 0.3006536364555359, "num_tokens": 1322549.0, "repeat_count": 1.0, - "routers_loss": 0.22526368498802185, + "routers_loss": 0.22308112680912018, "skip_count": 4.0, "step": 818, "text_loss": 0.5211259722709656 @@ -7788,13 +7788,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.1337890625, "learning_rate": 0.0009975647912929557, - "loss": 0.056, + "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1325213.0, "repeat_count": 0.0, - "routers_loss": 0.010998851619660854, + "routers_loss": 0.00998698640614748, "skip_count": 0.0, "step": 820, "text_loss": 0.7117052674293518 @@ -7807,13 +7807,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.15234375, "learning_rate": 0.0009975341852396205, - "loss": 0.0712, + "loss": 0.0723, "macro_f1": 0.32098764181137085, "num_tokens": 1328383.0, "repeat_count": 0.0, - "routers_loss": 0.07115054875612259, + "routers_loss": 0.07454588264226913, "skip_count": 2.0, "step": 822, "text_loss": 0.34539610147476196 @@ -7826,13 +7826,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1552734375, + "grad_norm": 0.1630859375, "learning_rate": 0.0009975033885302469, - "loss": 0.0611, + "loss": 0.0604, "macro_f1": 0.3333333432674408, "num_tokens": 1331406.0, "repeat_count": 0.0, - "routers_loss": 0.008062695153057575, + "routers_loss": 0.009157589636743069, "skip_count": 0.0, "step": 824, "text_loss": 0.7484824657440186 @@ -7845,13 +7845,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1923828125, "learning_rate": 0.0009974724011766363, - "loss": 0.0496, + "loss": 0.0474, "macro_f1": 0.3272727429866791, "num_tokens": 1334410.0, "repeat_count": 1.0, - "routers_loss": 0.16666285693645477, + "routers_loss": 0.17149391770362854, "skip_count": 0.0, "step": 826, "text_loss": 0.5913820266723633 @@ -7864,13 +7864,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1708984375, + "grad_norm": 0.1884765625, "learning_rate": 0.0009974412231906632, - "loss": 0.0567, + "loss": 0.058, "macro_f1": 0.32098764181137085, "num_tokens": 1337653.0, "repeat_count": 1.0, - "routers_loss": 0.0908689796924591, + "routers_loss": 0.09743282198905945, "skip_count": 1.0, "step": 828, "text_loss": 0.2505693733692169 @@ -7883,13 +7883,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16015625, + "grad_norm": 0.1533203125, "learning_rate": 0.0009974098545842748, - "loss": 0.0648, + "loss": 0.0638, "macro_f1": 0.3272727429866791, "num_tokens": 1340860.0, "repeat_count": 0.0, - "routers_loss": 0.04364728182554245, + "routers_loss": 0.041490405797958374, "skip_count": 1.0, "step": 830, "text_loss": 0.5585370063781738 @@ -7897,18 +7897,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 3.906369239800411, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2060546875, + "grad_norm": 0.193359375, "learning_rate": 0.0009973782953694918, - "loss": 0.0772, - "macro_f1": 0.3076923191547394, + "loss": 0.0746, + "macro_f1": 0.3006536066532135, "num_tokens": 1344232.0, "repeat_count": 1.0, - "routers_loss": 0.15315109491348267, + "routers_loss": 0.16080693900585175, "skip_count": 3.0, "step": 832, "text_loss": 0.4782734513282776 @@ -7921,13 +7921,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.1298828125, "learning_rate": 0.000997346545558408, - "loss": 0.0527, + "loss": 0.0522, "macro_f1": 0.3333333432674408, "num_tokens": 1347667.0, "repeat_count": 0.0, - "routers_loss": 0.01342768594622612, + "routers_loss": 0.01173500344157219, "skip_count": 0.0, "step": 834, "text_loss": 0.25036177039146423 @@ -7940,13 +7940,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1748046875, + "grad_norm": 0.173828125, "learning_rate": 0.0009973146051631895, - "loss": 0.0513, + "loss": 0.0522, "macro_f1": 0.3333333432674408, "num_tokens": 1350707.0, "repeat_count": 0.0, - "routers_loss": 0.01158806961029768, + "routers_loss": 0.011477196589112282, "skip_count": 0.0, "step": 836, "text_loss": 0.5482863187789917 @@ -7959,13 +7959,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1650390625, "learning_rate": 0.0009972824741960764, - "loss": 0.0549, + "loss": 0.0536, "macro_f1": 0.3333333432674408, "num_tokens": 1353704.0, "repeat_count": 0.0, - "routers_loss": 0.01255605649203062, + "routers_loss": 0.010528896935284138, "skip_count": 0.0, "step": 838, "text_loss": 0.6732596158981323 @@ -7978,13 +7978,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12255859375, + "grad_norm": 0.1181640625, "learning_rate": 0.000997250152669381, - "loss": 0.0578, + "loss": 0.0573, "macro_f1": 0.3333333432674408, "num_tokens": 1356608.0, "repeat_count": 0.0, - "routers_loss": 0.010225459933280945, + "routers_loss": 0.010678744874894619, "skip_count": 0.0, "step": 840, "text_loss": 0.5479338765144348 @@ -7997,13 +7997,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.181640625, "learning_rate": 0.000997217640595489, - "loss": 0.0633, + "loss": 0.0631, "macro_f1": 0.3333333432674408, "num_tokens": 1359809.0, "repeat_count": 0.0, - "routers_loss": 0.007837744429707527, + "routers_loss": 0.00835978239774704, "skip_count": 0.0, "step": 842, "text_loss": 0.42543259263038635 @@ -8016,13 +8016,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.1923828125, "learning_rate": 0.0009971849379868593, - "loss": 0.0674, + "loss": 0.0653, "macro_f1": 0.3333333432674408, "num_tokens": 1362201.0, "repeat_count": 0.0, - "routers_loss": 0.008631376549601555, + "routers_loss": 0.009930923581123352, "skip_count": 0.0, "step": 844, "text_loss": 0.720462441444397 @@ -8035,13 +8035,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.1123046875, "learning_rate": 0.0009971520448560235, - "loss": 0.0612, + "loss": 0.0615, "macro_f1": 0.3272727429866791, "num_tokens": 1365790.0, "repeat_count": 0.0, - "routers_loss": 0.06206027418375015, + "routers_loss": 0.06344373524188995, "skip_count": 1.0, "step": 846, "text_loss": 0.8423607349395752 @@ -8049,18 +8049,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 3.9815086586439685, - "f1_execute": 0.9411765336990356, + "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, - "f1_skip": 0.5, - "grad_norm": 0.16015625, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.16796875, "learning_rate": 0.000997118961215586, - "loss": 0.0678, - "macro_f1": 0.480392187833786, + "loss": 0.0674, + "macro_f1": 0.4533333480358124, "num_tokens": 1368387.0, "repeat_count": 1.0, - "routers_loss": 0.1463794708251953, + "routers_loss": 0.14688406884670258, "skip_count": 3.0, "step": 848, "text_loss": 0.3933577537536621 @@ -8073,13 +8073,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2451171875, + "grad_norm": 0.263671875, "learning_rate": 0.000997085687078225, - "loss": 0.052, + "loss": 0.0518, "macro_f1": 0.3333333432674408, "num_tokens": 1371189.0, "repeat_count": 0.0, - "routers_loss": 0.01140492781996727, + "routers_loss": 0.009953443892300129, "skip_count": 0.0, "step": 850, "text_loss": 0.41469162702560425 @@ -8092,13 +8092,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.15625, "learning_rate": 0.0009970522224566909, - "loss": 0.0563, + "loss": 0.0555, "macro_f1": 0.32098767161369324, "num_tokens": 1374008.0, "repeat_count": 0.0, - "routers_loss": 0.05136030167341232, + "routers_loss": 0.048870690166950226, "skip_count": 1.0, "step": 852, "text_loss": 0.613615870475769 @@ -8111,32 +8111,32 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.25390625, + "grad_norm": 0.283203125, "learning_rate": 0.0009970185673638075, - "loss": 0.0627, + "loss": 0.0629, "macro_f1": 0.32098764181137085, "num_tokens": 1376662.0, "repeat_count": 1.0, - "routers_loss": 0.07274381071329117, + "routers_loss": 0.06865929812192917, "skip_count": 1.0, "step": 854, "text_loss": 0.4392736256122589 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 4.01878485471089, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.162109375, "learning_rate": 0.0009969847218124716, - "loss": 0.0503, - "macro_f1": 0.3272727429866791, + "loss": 0.0506, + "macro_f1": 0.5492662787437439, "num_tokens": 1380049.0, "repeat_count": 0.0, - "routers_loss": 0.024335317313671112, + "routers_loss": 0.02382219396531582, "skip_count": 1.0, "step": 856, "text_loss": 0.19115346670150757 @@ -8149,13 +8149,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.240234375, + "grad_norm": 0.1884765625, "learning_rate": 0.0009969506858156527, - "loss": 0.0359, + "loss": 0.0344, "macro_f1": 0.3272727429866791, "num_tokens": 1383008.0, "repeat_count": 0.0, - "routers_loss": 0.046614740043878555, + "routers_loss": 0.03907281160354614, "skip_count": 1.0, "step": 858, "text_loss": 0.34842637181282043 @@ -8168,13 +8168,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11181640625, + "grad_norm": 0.12060546875, "learning_rate": 0.0009969164593863935, - "loss": 0.0372, + "loss": 0.0365, "macro_f1": 0.3333333432674408, "num_tokens": 1387051.0, "repeat_count": 0.0, - "routers_loss": 0.006380240898579359, + "routers_loss": 0.007645803038030863, "skip_count": 0.0, "step": 860, "text_loss": 0.3810436725616455 @@ -8187,13 +8187,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.1484375, "learning_rate": 0.0009968820425378098, - "loss": 0.0473, + "loss": 0.0463, "macro_f1": 0.3272727429866791, "num_tokens": 1390244.0, "repeat_count": 1.0, - "routers_loss": 0.04770716652274132, + "routers_loss": 0.04435238987207413, "skip_count": 0.0, "step": 862, "text_loss": 0.34853485226631165 @@ -8206,32 +8206,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.3359375, + "grad_norm": 0.28515625, "learning_rate": 0.00099684743528309, - "loss": 0.0434, + "loss": 0.0424, "macro_f1": 0.3333333432674408, "num_tokens": 1392976.0, "repeat_count": 0.0, - "routers_loss": 0.006983708590269089, + "routers_loss": 0.006071661598980427, "skip_count": 0.0, "step": 864, "text_loss": 0.6395178437232971 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 4.065746991488113, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.080078125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0810546875, "learning_rate": 0.0009968126376354958, - "loss": 0.0476, - "macro_f1": 0.32098764181137085, + "loss": 0.0477, + "macro_f1": 0.5492662787437439, "num_tokens": 1396061.0, "repeat_count": 0.0, - "routers_loss": 0.046313900500535965, + "routers_loss": 0.05011235550045967, "skip_count": 2.0, "step": 866, "text_loss": 0.09103966504335403 @@ -8244,32 +8244,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.154296875, "learning_rate": 0.0009967776496083616, "loss": 0.0509, "macro_f1": 0.3272727429866791, "num_tokens": 1398993.0, "repeat_count": 1.0, - "routers_loss": 0.0401870422065258, + "routers_loss": 0.03979124873876572, "skip_count": 0.0, "step": 868, "text_loss": 0.27257058024406433 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 4.084531846199002, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.14453125, "learning_rate": 0.000996742471215095, - "loss": 0.0505, - "macro_f1": 0.32098764181137085, + "loss": 0.0516, + "macro_f1": 0.5492662787437439, "num_tokens": 1402080.0, "repeat_count": 0.0, - "routers_loss": 0.03313451260328293, + "routers_loss": 0.030823837965726852, "skip_count": 2.0, "step": 870, "text_loss": 0.7047103047370911 @@ -8282,13 +8282,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16796875, + "grad_norm": 0.1611328125, "learning_rate": 0.0009967071024691763, - "loss": 0.0468, + "loss": 0.0461, "macro_f1": 0.3333333432674408, "num_tokens": 1404890.0, "repeat_count": 0.0, - "routers_loss": 0.010118982754647732, + "routers_loss": 0.009721715934574604, "skip_count": 0.0, "step": 872, "text_loss": 0.959106981754303 @@ -8301,13 +8301,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1142578125, "learning_rate": 0.000996671543384159, - "loss": 0.0498, + "loss": 0.05, "macro_f1": 0.3333333432674408, "num_tokens": 1407853.0, "repeat_count": 0.0, - "routers_loss": 0.005856200121343136, + "routers_loss": 0.006025883834809065, "skip_count": 0.0, "step": 874, "text_loss": 0.47571972012519836 @@ -8320,13 +8320,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.09765625, "learning_rate": 0.0009966357939736692, - "loss": 0.0417, + "loss": 0.0416, "macro_f1": 0.3272727429866791, "num_tokens": 1410723.0, "repeat_count": 0.0, - "routers_loss": 0.02768322452902794, + "routers_loss": 0.025964925065636635, "skip_count": 0.0, "step": 876, "text_loss": 0.4964611530303955 @@ -8339,13 +8339,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1025390625, + "grad_norm": 0.09423828125, "learning_rate": 0.0009965998542514065, - "loss": 0.0419, + "loss": 0.0415, "macro_f1": 0.32098764181137085, "num_tokens": 1414008.0, "repeat_count": 0.0, - "routers_loss": 0.09382032603025436, + "routers_loss": 0.09509637206792831, "skip_count": 2.0, "step": 878, "text_loss": 0.621494710445404 @@ -8358,32 +8358,32 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.103515625, + "grad_norm": 0.11083984375, "learning_rate": 0.0009965637242311427, - "loss": 0.0466, + "loss": 0.0472, "macro_f1": 0.542222261428833, "num_tokens": 1417447.0, "repeat_count": 0.0, - "routers_loss": 0.026867631822824478, + "routers_loss": 0.02520318515598774, "skip_count": 4.0, "step": 880, "text_loss": 0.40209758281707764 }, { "acc_repeat": 0.0, - "acc_skip": 0.6666666865348816, - "avg_layers": 24.0, + "acc_skip": 0.5, + "avg_layers": 25.0, "epoch": 4.14088641033167, - "f1_execute": 0.95652174949646, + "f1_execute": 0.936170220375061, "f1_repeat": 0.0, - "f1_skip": 0.800000011920929, - "grad_norm": 0.26171875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.263671875, "learning_rate": 0.000996527403926723, - "loss": 0.0496, - "macro_f1": 0.5855072736740112, + "loss": 0.0495, + "macro_f1": 0.5342789888381958, "num_tokens": 1419905.0, "repeat_count": 0.0, - "routers_loss": 0.12731307744979858, + "routers_loss": 0.13183781504631042, "skip_count": 6.0, "step": 882, "text_loss": 0.642185389995575 @@ -8396,13 +8396,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.1201171875, "learning_rate": 0.0009964908933520655, - "loss": 0.039, + "loss": 0.0375, "macro_f1": 0.3333333432674408, "num_tokens": 1423436.0, "repeat_count": 0.0, - "routers_loss": 0.008483970537781715, + "routers_loss": 0.009429510682821274, "skip_count": 0.0, "step": 884, "text_loss": 0.48232755064964294 @@ -8415,13 +8415,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.18359375, + "grad_norm": 0.1669921875, "learning_rate": 0.0009964541925211613, - "loss": 0.0348, + "loss": 0.0349, "macro_f1": 0.32098764181137085, "num_tokens": 1426842.0, "repeat_count": 0.0, - "routers_loss": 0.07847871631383896, + "routers_loss": 0.07629609107971191, "skip_count": 2.0, "step": 886, "text_loss": 0.16620934009552002 @@ -8434,13 +8434,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09716796875, + "grad_norm": 0.0927734375, "learning_rate": 0.0009964173014480738, - "loss": 0.036, + "loss": 0.0348, "macro_f1": 0.5492662787437439, "num_tokens": 1430430.0, "repeat_count": 0.0, - "routers_loss": 0.04574459046125412, + "routers_loss": 0.036814019083976746, "skip_count": 2.0, "step": 888, "text_loss": 0.4866008758544922 @@ -8453,13 +8453,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10595703125, + "grad_norm": 0.1123046875, "learning_rate": 0.0009963802201469398, - "loss": 0.0485, + "loss": 0.0476, "macro_f1": 0.3333333432674408, "num_tokens": 1433821.0, "repeat_count": 0.0, - "routers_loss": 0.004683624487370253, + "routers_loss": 0.0041250260546803474, "skip_count": 0.0, "step": 890, "text_loss": 0.578216552734375 @@ -8472,13 +8472,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2158203125, + "grad_norm": 0.2373046875, "learning_rate": 0.0009963429486319693, - "loss": 0.0476, + "loss": 0.0463, "macro_f1": 0.32098764181137085, "num_tokens": 1436976.0, "repeat_count": 0.0, - "routers_loss": 0.06499828398227692, + "routers_loss": 0.06213559955358505, "skip_count": 2.0, "step": 892, "text_loss": 0.221701517701149 @@ -8486,18 +8486,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.5, - "avg_layers": 25.0, + "avg_layers": 26.0, "epoch": 4.197240974464338, - "f1_execute": 0.9411764740943909, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.4000000059604645, - "grad_norm": 0.310546875, + "f1_skip": 0.5, + "grad_norm": 0.361328125, "learning_rate": 0.0009963054869174446, - "loss": 0.0326, - "macro_f1": 0.44705885648727417, + "loss": 0.0313, + "macro_f1": 0.4871794879436493, "num_tokens": 1440397.0, "repeat_count": 0.0, - "routers_loss": 0.08285653591156006, + "routers_loss": 0.07532428950071335, "skip_count": 2.0, "step": 894, "text_loss": 0.6922838091850281 @@ -8510,13 +8510,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.154296875, + "grad_norm": 0.1572265625, "learning_rate": 0.0009962678350177209, - "loss": 0.0497, + "loss": 0.0472, "macro_f1": 0.3272727429866791, "num_tokens": 1443604.0, "repeat_count": 0.0, - "routers_loss": 0.04252336546778679, + "routers_loss": 0.0419243648648262, "skip_count": 1.0, "step": 896, "text_loss": 0.22092342376708984 @@ -8524,18 +8524,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 4.216025829175227, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10302734375, + "grad_norm": 0.1015625, "learning_rate": 0.0009962299929472268, - "loss": 0.0349, - "macro_f1": 0.31446540355682373, + "loss": 0.034, + "macro_f1": 0.32098764181137085, "num_tokens": 1446257.0, "repeat_count": 2.0, - "routers_loss": 0.126711905002594, + "routers_loss": 0.10849297791719437, "skip_count": 0.0, "step": 898, "text_loss": 0.26394811272621155 @@ -8548,13 +8548,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10546875, + "grad_norm": 0.10205078125, "learning_rate": 0.000996191960720463, - "loss": 0.0392, + "loss": 0.0394, "macro_f1": 0.3333333432674408, "num_tokens": 1449669.0, "repeat_count": 0.0, - "routers_loss": 0.00955706462264061, + "routers_loss": 0.0092767970636487, "skip_count": 0.0, "step": 900, "text_loss": 0.5338577628135681 @@ -8567,13 +8567,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.154296875, "learning_rate": 0.0009961537383520042, - "loss": 0.0377, + "loss": 0.0354, "macro_f1": 0.3272727429866791, "num_tokens": 1452450.0, "repeat_count": 1.0, - "routers_loss": 0.03127318620681763, + "routers_loss": 0.02985367365181446, "skip_count": 0.0, "step": 902, "text_loss": 0.5875228047370911 @@ -8586,13 +8586,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.10205078125, "learning_rate": 0.0009961153258564966, - "loss": 0.0389, + "loss": 0.0378, "macro_f1": 0.3144654333591461, "num_tokens": 1456909.0, "repeat_count": 0.0, - "routers_loss": 0.06743519753217697, + "routers_loss": 0.06794842332601547, "skip_count": 3.0, "step": 904, "text_loss": 0.40959444642066956 @@ -8605,13 +8605,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009960767232486604, - "loss": 0.0477, + "loss": 0.0476, "macro_f1": 0.3333333432674408, "num_tokens": 1461712.0, "repeat_count": 0.0, - "routers_loss": 0.0025313226506114006, + "routers_loss": 0.0023562447167932987, "skip_count": 0.0, "step": 906, "text_loss": 0.3932875096797943 @@ -8624,13 +8624,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.08203125, "learning_rate": 0.000996037930543288, - "loss": 0.052, + "loss": 0.0505, "macro_f1": 0.3272727429866791, "num_tokens": 1464817.0, "repeat_count": 0.0, - "routers_loss": 0.037147488445043564, + "routers_loss": 0.03880339860916138, "skip_count": 1.0, "step": 908, "text_loss": 0.17482402920722961 @@ -8643,13 +8643,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.234375, + "grad_norm": 0.2119140625, "learning_rate": 0.000995998947755245, - "loss": 0.0501, + "loss": 0.0479, "macro_f1": 0.3272727429866791, "num_tokens": 1467810.0, "repeat_count": 0.0, - "routers_loss": 0.021232586354017258, + "routers_loss": 0.01736828312277794, "skip_count": 1.0, "step": 910, "text_loss": 0.4140470325946808 @@ -8662,13 +8662,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.169921875, "learning_rate": 0.0009959597748994695, - "loss": 0.0759, + "loss": 0.0752, "macro_f1": 0.3333333432674408, "num_tokens": 1470802.0, "repeat_count": 0.0, - "routers_loss": 0.010563847608864307, + "routers_loss": 0.011824851855635643, "skip_count": 0.0, "step": 912, "text_loss": 0.7153383493423462 @@ -8681,13 +8681,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1455078125, "learning_rate": 0.0009959204119909726, - "loss": 0.0425, + "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1474539.0, "repeat_count": 0.0, - "routers_loss": 0.0267612524330616, + "routers_loss": 0.025456594303250313, "skip_count": 0.0, "step": 914, "text_loss": 0.42812058329582214 @@ -8700,13 +8700,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1533203125, + "grad_norm": 0.142578125, "learning_rate": 0.0009958808590448385, - "loss": 0.0501, + "loss": 0.0489, "macro_f1": 0.3333333432674408, "num_tokens": 1477552.0, "repeat_count": 0.0, - "routers_loss": 0.005838244222104549, + "routers_loss": 0.006795851048082113, "skip_count": 0.0, "step": 916, "text_loss": 0.5402814149856567 @@ -8719,13 +8719,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.1083984375, "learning_rate": 0.0009958411160762234, - "loss": 0.0383, + "loss": 0.039, "macro_f1": 0.3333333432674408, "num_tokens": 1482547.0, "repeat_count": 0.0, - "routers_loss": 0.014642171561717987, + "routers_loss": 0.015615932643413544, "skip_count": 0.0, "step": 918, "text_loss": 0.3836168050765991 @@ -8738,32 +8738,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08984375, "learning_rate": 0.0009958011831003577, - "loss": 0.0457, + "loss": 0.0448, "macro_f1": 0.3272727429866791, "num_tokens": 1485807.0, "repeat_count": 0.0, - "routers_loss": 0.04119620472192764, + "routers_loss": 0.043541423976421356, "skip_count": 1.0, "step": 920, "text_loss": 0.4333936274051666 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 4.328734957440563, - "f1_execute": 0.943396270275116, - "f1_repeat": 0.0, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.154296875, + "grad_norm": 0.1337890625, "learning_rate": 0.000995761060132543, - "loss": 0.0433, - "macro_f1": 0.3144654333591461, + "loss": 0.0418, + "macro_f1": 0.6538461446762085, "num_tokens": 1488941.0, "repeat_count": 1.0, - "routers_loss": 0.06713195145130157, + "routers_loss": 0.05866432189941406, "skip_count": 2.0, "step": 922, "text_loss": 0.4106994867324829 @@ -8776,13 +8776,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1572265625, + "grad_norm": 0.1630859375, "learning_rate": 0.0009957207471881552, - "loss": 0.0533, + "loss": 0.0531, "macro_f1": 0.5492662787437439, "num_tokens": 1492026.0, "repeat_count": 0.0, - "routers_loss": 0.024023180827498436, + "routers_loss": 0.02714901603758335, "skip_count": 2.0, "step": 924, "text_loss": 0.542091429233551 @@ -8795,13 +8795,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.17578125, + "grad_norm": 0.1796875, "learning_rate": 0.0009956802442826415, - "loss": 0.0373, + "loss": 0.0386, "macro_f1": 0.3272727429866791, "num_tokens": 1494543.0, "repeat_count": 1.0, - "routers_loss": 0.05399841442704201, + "routers_loss": 0.0563737191259861, "skip_count": 0.0, "step": 926, "text_loss": 0.47209203243255615 @@ -8814,13 +8814,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1259765625, "learning_rate": 0.0009956395514315235, - "loss": 0.0488, + "loss": 0.0496, "macro_f1": 0.3272727429866791, "num_tokens": 1497831.0, "repeat_count": 1.0, - "routers_loss": 0.0299264844506979, + "routers_loss": 0.03285066783428192, "skip_count": 0.0, "step": 928, "text_loss": 0.6628931164741516 @@ -8833,13 +8833,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.154296875, "learning_rate": 0.0009955986686503943, - "loss": 0.0467, + "loss": 0.0466, "macro_f1": 0.3272727429866791, "num_tokens": 1501375.0, "repeat_count": 0.0, - "routers_loss": 0.023478010669350624, + "routers_loss": 0.024297121912240982, "skip_count": 1.0, "step": 930, "text_loss": 0.495676189661026 @@ -8852,13 +8852,13 @@ "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.1103515625, + "grad_norm": 0.11181640625, "learning_rate": 0.0009955575959549202, - "loss": 0.0447, + "loss": 0.0424, "macro_f1": 0.7795917987823486, "num_tokens": 1504363.0, "repeat_count": 1.0, - "routers_loss": 0.12116194516420364, + "routers_loss": 0.12196464836597443, "skip_count": 4.0, "step": 932, "text_loss": 0.26123273372650146 @@ -8871,13 +8871,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1708984375, "learning_rate": 0.0009955163333608408, - "loss": 0.053, + "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 1507178.0, "repeat_count": 0.0, - "routers_loss": 0.011879723519086838, + "routers_loss": 0.012947078794240952, "skip_count": 0.0, "step": 934, "text_loss": 0.32552677392959595 @@ -8890,13 +8890,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.154296875, "learning_rate": 0.0009954748808839674, - "loss": 0.0373, + "loss": 0.0379, "macro_f1": 0.3333333432674408, "num_tokens": 1509910.0, "repeat_count": 0.0, - "routers_loss": 0.009245929308235645, + "routers_loss": 0.008946365676820278, "skip_count": 0.0, "step": 936, "text_loss": 0.533141016960144 @@ -8909,13 +8909,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.140625, "learning_rate": 0.000995433238540185, - "loss": 0.0461, + "loss": 0.0466, "macro_f1": 0.6538461446762085, "num_tokens": 1512826.0, "repeat_count": 1.0, - "routers_loss": 0.032464127987623215, + "routers_loss": 0.029975678771734238, "skip_count": 1.0, "step": 938, "text_loss": 0.2953577935695648 @@ -8928,13 +8928,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.10888671875, "learning_rate": 0.0009953914063454512, - "loss": 0.0515, + "loss": 0.0497, "macro_f1": 0.3144654333591461, "num_tokens": 1517230.0, "repeat_count": 1.0, - "routers_loss": 0.08835392445325851, + "routers_loss": 0.0889134630560875, "skip_count": 2.0, "step": 940, "text_loss": 0.5368834733963013 @@ -8947,13 +8947,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.193359375, "learning_rate": 0.000995349384315796, - "loss": 0.0405, + "loss": 0.0413, "macro_f1": 0.3333333432674408, "num_tokens": 1519876.0, "repeat_count": 0.0, - "routers_loss": 0.014307246543467045, + "routers_loss": 0.013458753935992718, "skip_count": 0.0, "step": 942, "text_loss": 0.2005518227815628 @@ -8966,13 +8966,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1357421875, "learning_rate": 0.000995307172467322, - "loss": 0.0449, + "loss": 0.0444, "macro_f1": 0.31446540355682373, "num_tokens": 1522998.0, "repeat_count": 1.0, - "routers_loss": 0.10261563211679459, + "routers_loss": 0.08850377053022385, "skip_count": 1.0, "step": 944, "text_loss": 0.227926567196846 @@ -8985,13 +8985,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.1435546875, "learning_rate": 0.0009952647708162054, - "loss": 0.0507, + "loss": 0.0503, "macro_f1": 0.3272727429866791, "num_tokens": 1527100.0, "repeat_count": 0.0, - "routers_loss": 0.03316422924399376, + "routers_loss": 0.03199794515967369, "skip_count": 1.0, "step": 946, "text_loss": 0.4859686493873596 @@ -9004,13 +9004,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1748046875, "learning_rate": 0.0009952221793786942, - "loss": 0.0352, + "loss": 0.0354, "macro_f1": 0.3333333432674408, "num_tokens": 1530028.0, "repeat_count": 0.0, - "routers_loss": 0.00902469176799059, + "routers_loss": 0.006507779937237501, "skip_count": 0.0, "step": 948, "text_loss": 0.6855354905128479 @@ -9023,13 +9023,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.10986328125, "learning_rate": 0.0009951793981711097, - "loss": 0.0581, + "loss": 0.0584, "macro_f1": 0.6538461446762085, "num_tokens": 1533254.0, "repeat_count": 1.0, - "routers_loss": 0.06710167229175568, + "routers_loss": 0.06175103038549423, "skip_count": 1.0, "step": 950, "text_loss": 0.7590400576591492 @@ -9042,13 +9042,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1015625, + "grad_norm": 0.1025390625, "learning_rate": 0.0009951364272098458, - "loss": 0.0294, + "loss": 0.0295, "macro_f1": 0.5492662787437439, "num_tokens": 1536239.0, "repeat_count": 0.0, - "routers_loss": 0.04208769276738167, + "routers_loss": 0.03773383051156998, "skip_count": 2.0, "step": 952, "text_loss": 0.669784665107727 @@ -9061,13 +9061,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.1748046875, "learning_rate": 0.0009950932665113688, - "loss": 0.0505, + "loss": 0.0507, "macro_f1": 0.32098764181137085, "num_tokens": 1539682.0, "repeat_count": 0.0, - "routers_loss": 0.06530380249023438, + "routers_loss": 0.07280613481998444, "skip_count": 2.0, "step": 954, "text_loss": 0.3365570902824402 @@ -9080,13 +9080,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.12255859375, "learning_rate": 0.0009950499160922184, - "loss": 0.0545, + "loss": 0.0541, "macro_f1": 0.3333333432674408, "num_tokens": 1542875.0, "repeat_count": 0.0, - "routers_loss": 0.01803453080356121, + "routers_loss": 0.01770266517996788, "skip_count": 0.0, "step": 956, "text_loss": 0.0921545997262001 @@ -9099,13 +9099,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.09375, "learning_rate": 0.000995006375969006, - "loss": 0.0481, + "loss": 0.0473, "macro_f1": 0.3272727429866791, "num_tokens": 1547135.0, "repeat_count": 1.0, - "routers_loss": 0.08461762219667435, + "routers_loss": 0.07672002166509628, "skip_count": 0.0, "step": 958, "text_loss": 0.5887606739997864 @@ -9120,11 +9120,11 @@ "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0009949626461584165, - "loss": 0.0441, + "loss": 0.043, "macro_f1": 0.3333333432674408, "num_tokens": 1550100.0, "repeat_count": 0.0, - "routers_loss": 0.007111486047506332, + "routers_loss": 0.006247182376682758, "skip_count": 0.0, "step": 960, "text_loss": 0.5777931213378906 @@ -9137,13 +9137,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.11181640625, + "grad_norm": 0.119140625, "learning_rate": 0.0009949187266772076, - "loss": 0.0361, + "loss": 0.0366, "macro_f1": 0.5492662787437439, "num_tokens": 1553192.0, "repeat_count": 0.0, - "routers_loss": 0.029776185750961304, + "routers_loss": 0.030319908633828163, "skip_count": 2.0, "step": 962, "text_loss": 0.2370252162218094 @@ -9156,13 +9156,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.126953125, + "grad_norm": 0.1474609375, "learning_rate": 0.0009948746175422088, - "loss": 0.0506, + "loss": 0.0511, "macro_f1": 0.3333333432674408, "num_tokens": 1556318.0, "repeat_count": 0.0, - "routers_loss": 0.007108999416232109, + "routers_loss": 0.006004320923238993, "skip_count": 0.0, "step": 964, "text_loss": 0.6271032094955444 @@ -9175,13 +9175,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.15234375, "learning_rate": 0.000994830318770323, - "loss": 0.0498, + "loss": 0.0514, "macro_f1": 0.3333333432674408, "num_tokens": 1559195.0, "repeat_count": 0.0, - "routers_loss": 0.01126947533339262, + "routers_loss": 0.011544366367161274, "skip_count": 0.0, "step": 966, "text_loss": 0.47256720066070557 @@ -9194,13 +9194,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.162109375, + "grad_norm": 0.171875, "learning_rate": 0.0009947858303785255, - "loss": 0.0366, + "loss": 0.0374, "macro_f1": 0.6603773832321167, "num_tokens": 1561813.0, "repeat_count": 1.0, - "routers_loss": 0.05142999067902565, + "routers_loss": 0.05258861929178238, "skip_count": 1.0, "step": 968, "text_loss": 0.7703132629394531 @@ -9213,13 +9213,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.1142578125, "learning_rate": 0.0009947411523838648, - "loss": 0.0461, + "loss": 0.0453, "macro_f1": 0.3333333432674408, "num_tokens": 1564634.0, "repeat_count": 0.0, - "routers_loss": 0.010770819149911404, + "routers_loss": 0.011216280050575733, "skip_count": 0.0, "step": 970, "text_loss": 0.4666804075241089 @@ -9232,13 +9232,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.1533203125, "learning_rate": 0.0009946962848034608, - "loss": 0.0692, + "loss": 0.0696, "macro_f1": 0.3333333432674408, "num_tokens": 1567959.0, "repeat_count": 0.0, - "routers_loss": 0.008775795809924603, + "routers_loss": 0.009387624450027943, "skip_count": 0.0, "step": 972, "text_loss": 0.4067264199256897 @@ -9251,13 +9251,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.203125, "learning_rate": 0.0009946512276545075, - "loss": 0.0403, + "loss": 0.0397, "macro_f1": 0.3272727429866791, "num_tokens": 1571221.0, "repeat_count": 1.0, - "routers_loss": 0.05100395902991295, + "routers_loss": 0.041713520884513855, "skip_count": 0.0, "step": 974, "text_loss": 0.5242366194725037 @@ -9270,13 +9270,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, - "grad_norm": 0.25390625, + "grad_norm": 0.228515625, "learning_rate": 0.0009946059809542705, - "loss": 0.0503, + "loss": 0.0487, "macro_f1": 0.7644445300102234, "num_tokens": 1575033.0, "repeat_count": 2.0, - "routers_loss": 0.06653711199760437, + "routers_loss": 0.05748331546783447, "skip_count": 2.0, "step": 976, "text_loss": 0.5704690217971802 @@ -9284,18 +9284,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 4.591722923393014, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.1396484375, "learning_rate": 0.0009945605447200887, - "loss": 0.0435, - "macro_f1": 0.3333333432674408, + "loss": 0.0445, + "macro_f1": 0.3272727429866791, "num_tokens": 1579050.0, "repeat_count": 0.0, - "routers_loss": 0.009865665808320045, + "routers_loss": 0.016765203326940536, "skip_count": 0.0, "step": 978, "text_loss": 0.4804173707962036 @@ -9308,13 +9308,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.130859375, + "grad_norm": 0.1337890625, "learning_rate": 0.0009945149189693732, - "loss": 0.0399, + "loss": 0.0406, "macro_f1": 0.5492662787437439, "num_tokens": 1582967.0, "repeat_count": 0.0, - "routers_loss": 0.021175632253289223, + "routers_loss": 0.021518222987651825, "skip_count": 2.0, "step": 980, "text_loss": 0.4138598144054413 @@ -9327,32 +9327,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11181640625, + "grad_norm": 0.11474609375, "learning_rate": 0.0009944691037196078, - "loss": 0.0472, + "loss": 0.0456, "macro_f1": 0.3333333432674408, "num_tokens": 1586282.0, "repeat_count": 0.0, - "routers_loss": 0.011803832836449146, + "routers_loss": 0.012246460653841496, "skip_count": 0.0, "step": 982, "text_loss": 0.22561736404895782 }, { - "acc_repeat": 0.0, + "acc_repeat": 0.5, "acc_skip": 0.800000011920929, - "avg_layers": 23.0, + "avg_layers": 24.0, "epoch": 4.6199002054593485, - "f1_execute": 0.9090908765792847, - "f1_repeat": 0.0, + "f1_execute": 0.930232584476471, + "f1_repeat": 0.6666666865348816, "f1_skip": 0.8000000715255737, - "grad_norm": 0.142578125, + "grad_norm": 0.1455078125, "learning_rate": 0.0009944230989883491, - "loss": 0.0467, - "macro_f1": 0.5696970224380493, + "loss": 0.0456, + "macro_f1": 0.7989664077758789, "num_tokens": 1589279.0, "repeat_count": 2.0, - "routers_loss": 0.08856551349163055, + "routers_loss": 0.09344895929098129, "skip_count": 5.0, "step": 984, "text_loss": 0.4416656494140625 @@ -9365,13 +9365,13 @@ "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1240234375, + "grad_norm": 0.111328125, "learning_rate": 0.0009943769047932264, - "loss": 0.0413, + "loss": 0.0404, "macro_f1": 0.5359477400779724, "num_tokens": 1592398.0, "repeat_count": 2.0, - "routers_loss": 0.08593414723873138, + "routers_loss": 0.08916857838630676, "skip_count": 2.0, "step": 986, "text_loss": 0.5536438822746277 @@ -9384,13 +9384,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.154296875, + "grad_norm": 0.15234375, "learning_rate": 0.000994330521151941, - "loss": 0.0399, + "loss": 0.039, "macro_f1": 0.32098764181137085, "num_tokens": 1596213.0, "repeat_count": 1.0, - "routers_loss": 0.07049509882926941, + "routers_loss": 0.06114347651600838, "skip_count": 1.0, "step": 988, "text_loss": 0.5835405588150024 @@ -9403,13 +9403,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.205078125, + "grad_norm": 0.1953125, "learning_rate": 0.000994283948082267, - "loss": 0.0595, + "loss": 0.0573, "macro_f1": 0.3333333432674408, "num_tokens": 1598827.0, "repeat_count": 0.0, - "routers_loss": 0.0019258069805800915, + "routers_loss": 0.0017335431184619665, "skip_count": 0.0, "step": 990, "text_loss": 0.5857380032539368 @@ -9422,13 +9422,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.10693359375, "learning_rate": 0.0009942371856020522, - "loss": 0.0335, + "loss": 0.0341, "macro_f1": 0.3333333432674408, "num_tokens": 1602915.0, "repeat_count": 0.0, - "routers_loss": 0.014094089157879353, + "routers_loss": 0.014606470242142677, "skip_count": 0.0, "step": 992, "text_loss": 0.6939892768859863 @@ -9436,18 +9436,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 31.0, "epoch": 4.666862342236572, - "f1_execute": 0.9583333134651184, + "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.140625, "learning_rate": 0.0009941902337292155, - "loss": 0.0603, - "macro_f1": 0.6527777910232544, + "loss": 0.06, + "macro_f1": 0.6598639488220215, "num_tokens": 1605776.0, "repeat_count": 3.0, - "routers_loss": 0.06360147893428802, + "routers_loss": 0.06297315657138824, "skip_count": 1.0, "step": 994, "text_loss": 0.37616831064224243 @@ -9460,13 +9460,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.10546875, + "grad_norm": 0.1083984375, "learning_rate": 0.0009941430924817487, - "loss": 0.0573, + "loss": 0.0572, "macro_f1": 0.5492662787437439, "num_tokens": 1609856.0, "repeat_count": 0.0, - "routers_loss": 0.0326208658516407, + "routers_loss": 0.03297794610261917, "skip_count": 2.0, "step": 996, "text_loss": 0.2098303586244583 @@ -9479,13 +9479,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09912109375, + "grad_norm": 0.10107421875, "learning_rate": 0.000994095761877717, - "loss": 0.0502, + "loss": 0.0499, "macro_f1": 0.3333333432674408, "num_tokens": 1612904.0, "repeat_count": 0.0, - "routers_loss": 0.012660752050578594, + "routers_loss": 0.012901155278086662, "skip_count": 0.0, "step": 998, "text_loss": 0.20103533565998077 @@ -9498,13 +9498,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.265625, + "grad_norm": 0.259765625, "learning_rate": 0.000994048241935257, - "loss": 0.0537, + "loss": 0.0535, "macro_f1": 0.3272727429866791, "num_tokens": 1615540.0, "repeat_count": 0.0, - "routers_loss": 0.021756287664175034, + "routers_loss": 0.020434845238924026, "skip_count": 0.0, "step": 1000, "text_loss": 0.32709044218063354 @@ -9512,37 +9512,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 4.70443205165835, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1669921875, "learning_rate": 0.0009940005326725789, - "loss": 0.0447, - "macro_f1": 0.31446540355682373, + "loss": 0.0453, + "macro_f1": 0.32098764181137085, "num_tokens": 1618786.0, "repeat_count": 0.0, - "routers_loss": 0.07292548567056656, + "routers_loss": 0.07831378281116486, "skip_count": 2.0, "step": 1002, "text_loss": 0.5789632797241211 }, { - "acc_repeat": 0.5, + "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 4.713824479013795, - "f1_execute": 0.9811320900917053, - "f1_repeat": 0.6666666865348816, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1787109375, + "grad_norm": 0.21875, "learning_rate": 0.0009939526341079647, - "loss": 0.0505, - "macro_f1": 0.5492662787437439, + "loss": 0.0511, + "macro_f1": 0.32098764181137085, "num_tokens": 1621736.0, "repeat_count": 2.0, - "routers_loss": 0.03397528454661369, + "routers_loss": 0.04863874986767769, "skip_count": 0.0, "step": 1004, "text_loss": 0.6128849387168884 @@ -9555,13 +9555,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.123046875, + "grad_norm": 0.1435546875, "learning_rate": 0.0009939045462597693, - "loss": 0.0544, + "loss": 0.0538, "macro_f1": 0.3333333432674408, "num_tokens": 1624649.0, "repeat_count": 0.0, - "routers_loss": 0.005987613927572966, + "routers_loss": 0.00677989237010479, "skip_count": 0.0, "step": 1006, "text_loss": 0.6168264150619507 @@ -9574,13 +9574,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.1611328125, "learning_rate": 0.0009938562691464202, - "loss": 0.0522, + "loss": 0.0524, "macro_f1": 0.3333333432674408, "num_tokens": 1627700.0, "repeat_count": 0.0, - "routers_loss": 0.021656684577465057, + "routers_loss": 0.019490402191877365, "skip_count": 0.0, "step": 1008, "text_loss": 0.17463822662830353 @@ -9593,32 +9593,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1298828125, "learning_rate": 0.000993807802786417, - "loss": 0.0487, + "loss": 0.0475, "macro_f1": 0.3333333432674408, "num_tokens": 1630714.0, "repeat_count": 0.0, - "routers_loss": 0.0014992234064266086, + "routers_loss": 0.0019022391643375158, "skip_count": 0.0, "step": 1010, "text_loss": 0.5675593018531799 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.5, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 4.751394188435574, - "f1_execute": 0.9411764740943909, - "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.158203125, + "f1_execute": 0.9599999785423279, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1640625, "learning_rate": 0.0009937591471983322, - "loss": 0.0491, - "macro_f1": 0.5359477400779724, + "loss": 0.0501, + "macro_f1": 0.7644444704055786, "num_tokens": 1633770.0, "repeat_count": 1.0, - "routers_loss": 0.03448791801929474, + "routers_loss": 0.042485643178224564, "skip_count": 2.0, "step": 1012, "text_loss": 0.42387229204177856 @@ -9631,13 +9631,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1357421875, + "grad_norm": 0.1396484375, "learning_rate": 0.0009937103024008109, - "loss": 0.0541, + "loss": 0.0545, "macro_f1": 0.3272727429866791, "num_tokens": 1637120.0, "repeat_count": 0.0, - "routers_loss": 0.08285929262638092, + "routers_loss": 0.09427817165851593, "skip_count": 1.0, "step": 1014, "text_loss": 0.49511051177978516 @@ -9650,13 +9650,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.125, + "grad_norm": 0.12890625, "learning_rate": 0.0009936612684125702, - "loss": 0.0515, + "loss": 0.0503, "macro_f1": 0.3333333432674408, "num_tokens": 1640165.0, "repeat_count": 0.0, - "routers_loss": 0.00486504752188921, + "routers_loss": 0.005106127820909023, "skip_count": 0.0, "step": 1016, "text_loss": 0.5398799180984497 @@ -9669,13 +9669,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.275390625, + "grad_norm": 0.2734375, "learning_rate": 0.0009936120452524004, - "loss": 0.051, + "loss": 0.0506, "macro_f1": 0.3333333432674408, "num_tokens": 1643251.0, "repeat_count": 0.0, - "routers_loss": 0.017805909737944603, + "routers_loss": 0.016914300620555878, "skip_count": 0.0, "step": 1018, "text_loss": 0.20882178843021393 @@ -9688,13 +9688,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1865234375, + "grad_norm": 0.1962890625, "learning_rate": 0.0009935626329391637, - "loss": 0.0547, + "loss": 0.0537, "macro_f1": 0.32098764181137085, "num_tokens": 1646560.0, "repeat_count": 0.0, - "routers_loss": 0.12958799302577972, + "routers_loss": 0.13481520116329193, "skip_count": 2.0, "step": 1020, "text_loss": 0.5719883441925049 @@ -9707,13 +9707,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1533203125, + "grad_norm": 0.1552734375, "learning_rate": 0.0009935130314917948, - "loss": 0.0595, + "loss": 0.0602, "macro_f1": 0.5492662787437439, "num_tokens": 1649538.0, "repeat_count": 0.0, - "routers_loss": 0.07447081059217453, + "routers_loss": 0.07700438797473907, "skip_count": 2.0, "step": 1022, "text_loss": 0.1303367167711258 @@ -9726,13 +9726,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1494140625, "learning_rate": 0.0009934632409293015, - "loss": 0.0619, + "loss": 0.0611, "macro_f1": 0.32098764181137085, "num_tokens": 1652397.0, "repeat_count": 1.0, - "routers_loss": 0.12529553472995758, + "routers_loss": 0.11416907608509064, "skip_count": 1.0, "step": 1024, "text_loss": 0.24076920747756958 @@ -9745,13 +9745,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.279296875, + "grad_norm": 0.306640625, "learning_rate": 0.0009934132612707631, - "loss": 0.0491, + "loss": 0.0507, "macro_f1": 0.31446540355682373, "num_tokens": 1654938.0, "repeat_count": 0.0, - "routers_loss": 0.08664281666278839, + "routers_loss": 0.09484589844942093, "skip_count": 2.0, "step": 1026, "text_loss": 0.1652517318725586 @@ -9764,13 +9764,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1435546875, "learning_rate": 0.0009933630925353324, - "loss": 0.0394, + "loss": 0.0395, "macro_f1": 0.3333333432674408, "num_tokens": 1658536.0, "repeat_count": 0.0, - "routers_loss": 0.0067965323105454445, + "routers_loss": 0.00741987070068717, "skip_count": 0.0, "step": 1028, "text_loss": 0.49296700954437256 @@ -9783,13 +9783,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1845703125, "learning_rate": 0.0009933127347422337, - "loss": 0.0607, + "loss": 0.0602, "macro_f1": 0.32098764181137085, "num_tokens": 1661446.0, "repeat_count": 0.0, - "routers_loss": 0.08319470286369324, + "routers_loss": 0.08399344235658646, "skip_count": 2.0, "step": 1030, "text_loss": 0.22363591194152832 @@ -9802,13 +9802,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.158203125, "learning_rate": 0.0009932621879107648, - "loss": 0.0476, + "loss": 0.0475, "macro_f1": 0.3333333432674408, "num_tokens": 1664612.0, "repeat_count": 0.0, - "routers_loss": 0.002826537238433957, + "routers_loss": 0.0031781597062945366, "skip_count": 0.0, "step": 1032, "text_loss": 0.36083245277404785 @@ -9823,11 +9823,11 @@ "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.000993211452060295, - "loss": 0.0431, + "loss": 0.042, "macro_f1": 0.3272727429866791, "num_tokens": 1667467.0, "repeat_count": 0.0, - "routers_loss": 0.03491095453500748, + "routers_loss": 0.03595469892024994, "skip_count": 1.0, "step": 1034, "text_loss": 0.16372856497764587 @@ -9840,13 +9840,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.173828125, + "grad_norm": 0.189453125, "learning_rate": 0.000993160527210266, - "loss": 0.0616, + "loss": 0.061, "macro_f1": 0.3144654333591461, "num_tokens": 1670675.0, "repeat_count": 3.0, - "routers_loss": 0.1828247457742691, + "routers_loss": 0.1597205102443695, "skip_count": 0.0, "step": 1036, "text_loss": 0.6049913763999939 @@ -9859,13 +9859,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2099609375, + "grad_norm": 0.2197265625, "learning_rate": 0.000993109413380193, - "loss": 0.0563, + "loss": 0.0562, "macro_f1": 0.3333333432674408, "num_tokens": 1673477.0, "repeat_count": 0.0, - "routers_loss": 0.010931054130196571, + "routers_loss": 0.009756010957062244, "skip_count": 0.0, "step": 1038, "text_loss": 0.7034620642662048 @@ -9878,13 +9878,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.158203125, + "grad_norm": 0.1806640625, "learning_rate": 0.0009930581105896624, - "loss": 0.0569, + "loss": 0.0559, "macro_f1": 0.3272727429866791, "num_tokens": 1676809.0, "repeat_count": 0.0, - "routers_loss": 0.023222090676426888, + "routers_loss": 0.020718922838568687, "skip_count": 0.0, "step": 1040, "text_loss": 0.2814720571041107 @@ -9897,13 +9897,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1962890625, + "grad_norm": 0.1923828125, "learning_rate": 0.0009930066188583338, - "loss": 0.0453, + "loss": 0.0445, "macro_f1": 0.32098764181137085, "num_tokens": 1679398.0, "repeat_count": 1.0, - "routers_loss": 0.07085686922073364, + "routers_loss": 0.04755603149533272, "skip_count": 1.0, "step": 1042, "text_loss": 0.5445759296417236 @@ -9916,13 +9916,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12353515625, + "grad_norm": 0.126953125, "learning_rate": 0.0009929549382059388, - "loss": 0.0515, + "loss": 0.0509, "macro_f1": 0.3333333432674408, "num_tokens": 1682269.0, "repeat_count": 0.0, - "routers_loss": 0.010158216580748558, + "routers_loss": 0.01040949858725071, "skip_count": 0.0, "step": 1044, "text_loss": 0.2876914143562317 @@ -9935,13 +9935,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.1259765625, "learning_rate": 0.0009929030686522816, - "loss": 0.0372, + "loss": 0.0363, "macro_f1": 0.3333333432674408, "num_tokens": 1685428.0, "repeat_count": 0.0, - "routers_loss": 0.007876895368099213, + "routers_loss": 0.008158888667821884, "skip_count": 0.0, "step": 1046, "text_loss": 0.49053525924682617 @@ -9954,13 +9954,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1630859375, "learning_rate": 0.0009928510102172386, - "loss": 0.0501, + "loss": 0.0498, "macro_f1": 0.3333333432674408, "num_tokens": 1688252.0, "repeat_count": 0.0, - "routers_loss": 0.004859173204749823, + "routers_loss": 0.005102572031319141, "skip_count": 0.0, "step": 1048, "text_loss": 0.5274341106414795 @@ -9973,13 +9973,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.17578125, + "grad_norm": 0.1591796875, "learning_rate": 0.0009927987629207587, - "loss": 0.0582, + "loss": 0.0564, "macro_f1": 0.3333333432674408, "num_tokens": 1691289.0, "repeat_count": 0.0, - "routers_loss": 0.01798083633184433, + "routers_loss": 0.016768503934144974, "skip_count": 0.0, "step": 1050, "text_loss": 0.9935035109519958 @@ -9987,18 +9987,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 4.939242735544467, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1376953125, + "grad_norm": 0.1455078125, "learning_rate": 0.0009927463267828634, "loss": 0.0488, - "macro_f1": 0.3272727429866791, + "macro_f1": 0.3333333432674408, "num_tokens": 1694148.0, "repeat_count": 0.0, - "routers_loss": 0.014295363798737526, + "routers_loss": 0.010905829258263111, "skip_count": 0.0, "step": 1052, "text_loss": 0.20895758271217346 @@ -10011,13 +10011,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.1455078125, "learning_rate": 0.000992693701823646, - "loss": 0.0635, + "loss": 0.0624, "macro_f1": 0.3272727429866791, "num_tokens": 1698543.0, "repeat_count": 1.0, - "routers_loss": 0.1038367822766304, + "routers_loss": 0.10533971339464188, "skip_count": 0.0, "step": 1054, "text_loss": 0.5776236653327942 @@ -10030,13 +10030,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2421875, + "grad_norm": 0.255859375, "learning_rate": 0.0009926408880632726, - "loss": 0.057, + "loss": 0.0556, "macro_f1": 0.3272727429866791, "num_tokens": 1702460.0, "repeat_count": 0.0, - "routers_loss": 0.029780643060803413, + "routers_loss": 0.026313411071896553, "skip_count": 1.0, "step": 1056, "text_loss": 0.34990596771240234 @@ -10049,13 +10049,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10107421875, + "grad_norm": 0.099609375, "learning_rate": 0.0009925878855219818, - "loss": 0.0398, + "loss": 0.0391, "macro_f1": 0.3333333432674408, "num_tokens": 1705686.0, "repeat_count": 0.0, - "routers_loss": 0.008537676185369492, + "routers_loss": 0.007763393223285675, "skip_count": 0.0, "step": 1058, "text_loss": 0.4980163276195526 @@ -10068,13 +10068,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.171875, + "grad_norm": 0.177734375, "learning_rate": 0.000992534694220084, - "loss": 0.0617, + "loss": 0.0613, "macro_f1": 0.3272727429866791, "num_tokens": 1708739.0, "repeat_count": 0.0, - "routers_loss": 0.03966755419969559, + "routers_loss": 0.03998444974422455, "skip_count": 1.0, "step": 1060, "text_loss": 0.29092350602149963 @@ -10087,13 +10087,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1484375, + "grad_norm": 0.1572265625, "learning_rate": 0.000992481314177962, - "loss": 0.0311, + "loss": 0.0312, "macro_f1": 0.32098764181137085, "num_tokens": 1711903.0, "repeat_count": 1.0, - "routers_loss": 0.06651833653450012, + "routers_loss": 0.06966045498847961, "skip_count": 1.0, "step": 1062, "text_loss": 0.6267179250717163 @@ -10106,13 +10106,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2431640625, + "grad_norm": 0.244140625, "learning_rate": 0.0009924277454160717, - "loss": 0.0557, + "loss": 0.0548, "macro_f1": 0.3272727429866791, "num_tokens": 1715974.0, "repeat_count": 0.0, - "routers_loss": 0.05130369961261749, + "routers_loss": 0.05536063387989998, "skip_count": 1.0, "step": 1064, "text_loss": 0.5813798904418945 @@ -10125,13 +10125,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1337890625, + "grad_norm": 0.134765625, "learning_rate": 0.0009923739879549402, - "loss": 0.0435, + "loss": 0.0423, "macro_f1": 0.3333333432674408, "num_tokens": 1718828.0, "repeat_count": 0.0, - "routers_loss": 0.020534176379442215, + "routers_loss": 0.020993782207369804, "skip_count": 0.0, "step": 1066, "text_loss": 0.22665327787399292 @@ -10144,13 +10144,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.0888671875, "learning_rate": 0.0009923200418151677, - "loss": 0.0305, + "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 1722419.0, "repeat_count": 0.0, - "routers_loss": 0.007514918688684702, + "routers_loss": 0.007351701147854328, "skip_count": 0.0, "step": 1068, "text_loss": 0.5796169638633728 @@ -10163,13 +10163,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.142578125, "learning_rate": 0.0009922659070174264, - "loss": 0.0461, + "loss": 0.0452, "macro_f1": 0.3272727429866791, "num_tokens": 1725663.0, "repeat_count": 1.0, - "routers_loss": 0.024598751217126846, + "routers_loss": 0.026033315807580948, "skip_count": 0.0, "step": 1070, "text_loss": 0.25742828845977783 @@ -10182,32 +10182,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.10595703125, "learning_rate": 0.0009922115835824612, - "loss": 0.0408, + "loss": 0.041, "macro_f1": 0.3333333432674408, "num_tokens": 1729239.0, "repeat_count": 0.0, - "routers_loss": 0.011866633780300617, + "routers_loss": 0.0118600158020854, "skip_count": 0.0, "step": 1072, "text_loss": 0.21630282700061798 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 5.042265923099501, - "f1_execute": 0.9818181991577148, - "f1_repeat": 0.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.12158203125, "learning_rate": 0.0009921570715310884, - "loss": 0.036, - "macro_f1": 0.3272727429866791, + "loss": 0.0364, + "macro_f1": 0.6666666865348816, "num_tokens": 1732507.0, "repeat_count": 1.0, - "routers_loss": 0.01755746826529503, + "routers_loss": 0.016118815168738365, "skip_count": 0.0, "step": 1074, "text_loss": 0.5639925003051758 @@ -10220,13 +10220,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.0791015625, "learning_rate": 0.0009921023708841974, - "loss": 0.0415, + "loss": 0.0407, "macro_f1": 0.3333333432674408, "num_tokens": 1736182.0, "repeat_count": 0.0, - "routers_loss": 0.003976983483880758, + "routers_loss": 0.004275390412658453, "skip_count": 0.0, "step": 1076, "text_loss": 0.5758615136146545 @@ -10239,13 +10239,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.103515625, + "grad_norm": 0.1103515625, "learning_rate": 0.0009920474816627496, - "loss": 0.0378, + "loss": 0.037, "macro_f1": 0.3333333432674408, "num_tokens": 1739559.0, "repeat_count": 0.0, - "routers_loss": 0.013548235408961773, + "routers_loss": 0.01299292128533125, "skip_count": 0.0, "step": 1078, "text_loss": 0.18221625685691833 @@ -10258,13 +10258,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1630859375, "learning_rate": 0.0009919924038877788, "loss": 0.0343, "macro_f1": 0.32098764181137085, "num_tokens": 1742890.0, "repeat_count": 0.0, - "routers_loss": 0.03923165053129196, + "routers_loss": 0.038295745849609375, "skip_count": 2.0, "step": 1080, "text_loss": 0.17354349792003632 @@ -10277,13 +10277,13 @@ "f1_execute": 0.9583333134651184, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.1923828125, + "grad_norm": 0.1884765625, "learning_rate": 0.0009919371375803905, - "loss": 0.0464, + "loss": 0.0455, "macro_f1": 0.8194444179534912, "num_tokens": 1746433.0, "repeat_count": 2.0, - "routers_loss": 0.046429626643657684, + "routers_loss": 0.04052971675992012, "skip_count": 3.0, "step": 1082, "text_loss": 0.2250112146139145 @@ -10296,13 +10296,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1025390625, + "grad_norm": 0.10595703125, "learning_rate": 0.0009918816827617632, - "loss": 0.0346, + "loss": 0.0353, "macro_f1": 0.3333333432674408, "num_tokens": 1750802.0, "repeat_count": 0.0, - "routers_loss": 0.008998732082545757, + "routers_loss": 0.009114136919379234, "skip_count": 0.0, "step": 1084, "text_loss": 0.2526719272136688 @@ -10315,13 +10315,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1279296875, "learning_rate": 0.000991826039453147, - "loss": 0.0386, + "loss": 0.0392, "macro_f1": 0.3333333432674408, "num_tokens": 1754272.0, "repeat_count": 0.0, - "routers_loss": 0.005173585377633572, + "routers_loss": 0.004904678091406822, "skip_count": 0.0, "step": 1086, "text_loss": 0.7308789491653442 @@ -10334,13 +10334,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.138671875, "learning_rate": 0.000991770207675865, - "loss": 0.0308, + "loss": 0.0327, "macro_f1": 0.6666666865348816, "num_tokens": 1757231.0, "repeat_count": 0.0, - "routers_loss": 0.024098891764879227, + "routers_loss": 0.02129189297556877, "skip_count": 2.0, "step": 1088, "text_loss": 0.21764220297336578 @@ -10353,13 +10353,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1611328125, "learning_rate": 0.0009917141874513113, "loss": 0.0315, "macro_f1": 0.3333333432674408, "num_tokens": 1760003.0, "repeat_count": 0.0, - "routers_loss": 0.014002764597535133, + "routers_loss": 0.01310618408024311, "skip_count": 0.0, "step": 1090, "text_loss": 0.33892181515693665 @@ -10372,32 +10372,32 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.14453125, + "grad_norm": 0.171875, "learning_rate": 0.0009916579788009537, - "loss": 0.0462, + "loss": 0.0457, "macro_f1": 0.5492662787437439, "num_tokens": 1763052.0, "repeat_count": 0.0, - "routers_loss": 0.017871137708425522, + "routers_loss": 0.02059309557080269, "skip_count": 2.0, "step": 1092, "text_loss": 0.6551769375801086 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 5.136190196653947, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1044921875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10546875, "learning_rate": 0.0009916015817463312, "loss": 0.0385, - "macro_f1": 0.32098764181137085, + "macro_f1": 0.5492662787437439, "num_tokens": 1766655.0, "repeat_count": 0.0, - "routers_loss": 0.033123619854450226, + "routers_loss": 0.0274797435849905, "skip_count": 2.0, "step": 1094, "text_loss": 0.3984372019767761 @@ -10410,13 +10410,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.11181640625, "learning_rate": 0.000991544996309055, - "loss": 0.0267, + "loss": 0.0271, "macro_f1": 0.3333333432674408, "num_tokens": 1769997.0, "repeat_count": 0.0, - "routers_loss": 0.01279227901250124, + "routers_loss": 0.01437368243932724, "skip_count": 0.0, "step": 1096, "text_loss": 0.4203338921070099 @@ -10429,13 +10429,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.150390625, + "grad_norm": 0.1103515625, "learning_rate": 0.000991488222510809, - "loss": 0.0295, + "loss": 0.0292, "macro_f1": 0.3333333432674408, "num_tokens": 1773130.0, "repeat_count": 0.0, - "routers_loss": 0.001354650012217462, + "routers_loss": 0.001382062560878694, "skip_count": 0.0, "step": 1098, "text_loss": 0.43132516741752625 @@ -10448,13 +10448,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.123046875, "learning_rate": 0.000991431260373349, - "loss": 0.0326, + "loss": 0.0329, "macro_f1": 0.3144654333591461, "num_tokens": 1775682.0, "repeat_count": 1.0, - "routers_loss": 0.1097714751958847, + "routers_loss": 0.1115434318780899, "skip_count": 2.0, "step": 1100, "text_loss": 0.3218227028846741 @@ -10467,13 +10467,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.111328125, "learning_rate": 0.000991374109918503, - "loss": 0.0187, + "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 1778407.0, "repeat_count": 0.0, - "routers_loss": 0.009649592451751232, + "routers_loss": 0.009529678151011467, "skip_count": 0.0, "step": 1102, "text_loss": 0.17183731496334076 @@ -10486,13 +10486,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.11083984375, + "grad_norm": 0.1142578125, "learning_rate": 0.000991316771168171, - "loss": 0.0447, + "loss": 0.044, "macro_f1": 0.5492662787437439, "num_tokens": 1781518.0, "repeat_count": 0.0, - "routers_loss": 0.020858706906437874, + "routers_loss": 0.018668074160814285, "skip_count": 2.0, "step": 1104, "text_loss": 1.1324785947799683 @@ -10505,13 +10505,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.134765625, + "grad_norm": 0.125, "learning_rate": 0.0009912592441443258, - "loss": 0.0428, + "loss": 0.0411, "macro_f1": 0.3272727429866791, "num_tokens": 1784878.0, "repeat_count": 0.0, - "routers_loss": 0.048101235181093216, + "routers_loss": 0.04145100712776184, "skip_count": 1.0, "step": 1106, "text_loss": 0.6082063317298889 @@ -10524,13 +10524,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.08984375, "learning_rate": 0.0009912015288690112, - "loss": 0.0435, + "loss": 0.0421, "macro_f1": 0.3272727429866791, "num_tokens": 1788978.0, "repeat_count": 0.0, - "routers_loss": 0.02875671721994877, + "routers_loss": 0.021450644358992577, "skip_count": 1.0, "step": 1108, "text_loss": 0.5597621202468872 @@ -10543,13 +10543,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08349609375, + "grad_norm": 0.083984375, "learning_rate": 0.0009911436253643444, - "loss": 0.0247, + "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 1792321.0, "repeat_count": 0.0, - "routers_loss": 0.019005145877599716, + "routers_loss": 0.017405325546860695, "skip_count": 0.0, "step": 1110, "text_loss": 0.2560598850250244 @@ -10562,13 +10562,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.255859375, + "grad_norm": 0.2294921875, "learning_rate": 0.0009910855336525137, - "loss": 0.0393, + "loss": 0.0383, "macro_f1": 0.3333333432674408, "num_tokens": 1795182.0, "repeat_count": 0.0, - "routers_loss": 0.007238700054585934, + "routers_loss": 0.007162237539887428, "skip_count": 0.0, "step": 1112, "text_loss": 0.3438240587711334 @@ -10581,13 +10581,13 @@ "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.125, + "grad_norm": 0.115234375, "learning_rate": 0.00099102725375578, "loss": 0.0326, "macro_f1": 0.480392187833786, "num_tokens": 1798987.0, "repeat_count": 1.0, - "routers_loss": 0.12206140905618668, + "routers_loss": 0.11149197816848755, "skip_count": 3.0, "step": 1114, "text_loss": 0.20455503463745117 @@ -10595,18 +10595,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 5.239506897563839, - "f1_execute": 0.8799999952316284, + "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.10791015625, "learning_rate": 0.0009909687856964767, - "loss": 0.0366, - "macro_f1": 0.29333335161209106, + "loss": 0.035, + "macro_f1": 0.3006536364555359, "num_tokens": 1802064.0, "repeat_count": 2.0, - "routers_loss": 0.15721899271011353, + "routers_loss": 0.12679415941238403, "skip_count": 3.0, "step": 1116, "text_loss": 0.11996729671955109 @@ -10619,32 +10619,32 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.125, + "grad_norm": 0.12451171875, "learning_rate": 0.0009909101294970082, - "loss": 0.0366, + "loss": 0.0365, "macro_f1": 0.5492662787437439, "num_tokens": 1805412.0, "repeat_count": 0.0, - "routers_loss": 0.05058665946125984, + "routers_loss": 0.05108053982257843, "skip_count": 2.0, "step": 1118, "text_loss": 0.13224145770072937 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 5.258291752274729, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "f1_skip": 1.0, + "grad_norm": 0.123046875, "learning_rate": 0.0009908512851798522, - "loss": 0.0454, - "macro_f1": 0.32098764181137085, + "loss": 0.0455, + "macro_f1": 0.6603773832321167, "num_tokens": 1808196.0, "repeat_count": 1.0, - "routers_loss": 0.023021472617983818, + "routers_loss": 0.02131766639649868, "skip_count": 1.0, "step": 1120, "text_loss": 0.7824069261550903 @@ -10657,13 +10657,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1435546875, + "grad_norm": 0.138671875, "learning_rate": 0.0009907922527675576, - "loss": 0.0409, + "loss": 0.0405, "macro_f1": 0.3333333432674408, "num_tokens": 1811622.0, "repeat_count": 0.0, - "routers_loss": 0.006660689599812031, + "routers_loss": 0.006226244382560253, "skip_count": 0.0, "step": 1122, "text_loss": 0.5419743061065674 @@ -10676,13 +10676,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.146484375, + "grad_norm": 0.12890625, "learning_rate": 0.000990733032282746, - "loss": 0.0547, + "loss": 0.0535, "macro_f1": 0.5492662787437439, "num_tokens": 1814628.0, "repeat_count": 0.0, - "routers_loss": 0.031727343797683716, + "routers_loss": 0.03088250942528248, "skip_count": 2.0, "step": 1124, "text_loss": 0.37100958824157715 @@ -10695,13 +10695,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.0810546875, "learning_rate": 0.000990673623748111, - "loss": 0.0351, + "loss": 0.0348, "macro_f1": 0.32098767161369324, "num_tokens": 1817205.0, "repeat_count": 0.0, - "routers_loss": 0.06140992045402527, + "routers_loss": 0.05495348572731018, "skip_count": 1.0, "step": 1126, "text_loss": 0.20241330564022064 @@ -10709,18 +10709,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.5, - "avg_layers": 25.0, + "avg_layers": 26.0, "epoch": 5.295861461696507, - "f1_execute": 0.9411764740943909, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.4000000059604645, - "grad_norm": 0.09814453125, + "f1_skip": 0.5, + "grad_norm": 0.0927734375, "learning_rate": 0.0009906140271864173, - "loss": 0.0436, - "macro_f1": 0.44705885648727417, + "loss": 0.0433, + "macro_f1": 0.4871794879436493, "num_tokens": 1820141.0, "repeat_count": 0.0, - "routers_loss": 0.03872275352478027, + "routers_loss": 0.037809282541275024, "skip_count": 2.0, "step": 1128, "text_loss": 0.32965806126594543 @@ -10728,18 +10728,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 5.305253889051952, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09228515625, + "grad_norm": 0.0908203125, "learning_rate": 0.0009905542426205032, - "loss": 0.0353, - "macro_f1": 0.3272727429866791, + "loss": 0.0348, + "macro_f1": 0.32098767161369324, "num_tokens": 1824011.0, "repeat_count": 0.0, - "routers_loss": 0.031013142317533493, + "routers_loss": 0.03320181369781494, "skip_count": 1.0, "step": 1130, "text_loss": 0.36329755187034607 @@ -10752,13 +10752,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1123046875, + "grad_norm": 0.10595703125, "learning_rate": 0.0009904942700732777, - "loss": 0.0333, + "loss": 0.0335, "macro_f1": 0.3333333432674408, "num_tokens": 1826873.0, "repeat_count": 0.0, - "routers_loss": 0.004357635974884033, + "routers_loss": 0.004102326463907957, "skip_count": 0.0, "step": 1132, "text_loss": 0.6692602038383484 @@ -10771,13 +10771,13 @@ "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11279296875, + "grad_norm": 0.08544921875, "learning_rate": 0.0009904341095677226, "loss": 0.03, "macro_f1": 0.29333335161209106, "num_tokens": 1830103.0, "repeat_count": 2.0, - "routers_loss": 0.2376353144645691, + "routers_loss": 0.2376193106174469, "skip_count": 4.0, "step": 1134, "text_loss": 0.19212862849235535 @@ -10790,13 +10790,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10888671875, + "grad_norm": 0.119140625, "learning_rate": 0.0009903737611268919, - "loss": 0.0446, + "loss": 0.0445, "macro_f1": 0.3333333432674408, "num_tokens": 1833201.0, "repeat_count": 0.0, - "routers_loss": 0.004978097043931484, + "routers_loss": 0.005253395065665245, "skip_count": 0.0, "step": 1136, "text_loss": 0.6773360371589661 @@ -10809,13 +10809,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10546875, + "grad_norm": 0.09814453125, "learning_rate": 0.0009903132247739107, - "loss": 0.0309, + "loss": 0.0305, "macro_f1": 0.3076923191547394, "num_tokens": 1836045.0, "repeat_count": 1.0, - "routers_loss": 0.14195409417152405, + "routers_loss": 0.14382585883140564, "skip_count": 3.0, "step": 1138, "text_loss": 0.2882297933101654 @@ -10828,13 +10828,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.15234375, + "grad_norm": 0.150390625, "learning_rate": 0.0009902525005319766, - "loss": 0.0403, + "loss": 0.04, "macro_f1": 0.5427350401878357, "num_tokens": 1839721.0, "repeat_count": 1.0, - "routers_loss": 0.04005253314971924, + "routers_loss": 0.04033960774540901, "skip_count": 2.0, "step": 1140, "text_loss": 0.7172559499740601 @@ -10847,13 +10847,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.12109375, "learning_rate": 0.0009901915884243597, - "loss": 0.0353, + "loss": 0.0351, "macro_f1": 0.6666666865348816, "num_tokens": 1842614.0, "repeat_count": 1.0, - "routers_loss": 0.006839688867330551, + "routers_loss": 0.005162308923900127, "skip_count": 0.0, "step": 1142, "text_loss": 0.42892804741859436 @@ -10866,13 +10866,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.1240234375, "learning_rate": 0.0009901304884744014, - "loss": 0.0396, + "loss": 0.0386, "macro_f1": 0.3144654333591461, "num_tokens": 1845444.0, "repeat_count": 1.0, - "routers_loss": 0.10174567997455597, + "routers_loss": 0.10117656737565994, "skip_count": 2.0, "step": 1144, "text_loss": 0.20806430280208588 @@ -10885,13 +10885,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.130859375, "learning_rate": 0.0009900692007055152, - "loss": 0.0365, + "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 1848558.0, "repeat_count": 0.0, - "routers_loss": 0.014655748382210732, + "routers_loss": 0.014107038266956806, "skip_count": 0.0, "step": 1146, "text_loss": 0.5355974435806274 @@ -10904,13 +10904,13 @@ "f1_execute": 0.9166666865348816, "f1_repeat": 0.4000000059604645, "f1_skip": 0.6666666865348816, - "grad_norm": 0.158203125, + "grad_norm": 0.16015625, "learning_rate": 0.000990007725141187, - "loss": 0.0467, + "loss": 0.0449, "macro_f1": 0.6611111164093018, "num_tokens": 1852723.0, "repeat_count": 4.0, - "routers_loss": 0.16960746049880981, + "routers_loss": 0.15537866950035095, "skip_count": 2.0, "step": 1148, "text_loss": 0.6388513445854187 @@ -10923,32 +10923,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1220703125, + "grad_norm": 0.1181640625, "learning_rate": 0.0009899460618049741, - "loss": 0.0399, + "loss": 0.0397, "macro_f1": 0.3333333432674408, "num_tokens": 1856181.0, "repeat_count": 0.0, - "routers_loss": 0.011591178365051746, + "routers_loss": 0.011800912208855152, "skip_count": 0.0, "step": 1150, "text_loss": 0.6113069653511047 }, { - "acc_repeat": 0.5, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 30.0, "epoch": 5.408570589961843, - "f1_execute": 0.9811320900917053, - "f1_repeat": 0.6666666865348816, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09912109375, + "grad_norm": 0.1005859375, "learning_rate": 0.000989884210720506, - "loss": 0.0332, - "macro_f1": 0.5492662787437439, + "loss": 0.0331, + "macro_f1": 0.6666666865348816, "num_tokens": 1859685.0, "repeat_count": 2.0, - "routers_loss": 0.04036068916320801, + "routers_loss": 0.022900646552443504, "skip_count": 0.0, "step": 1152, "text_loss": 0.25718021392822266 @@ -10961,13 +10961,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12353515625, + "grad_norm": 0.10595703125, "learning_rate": 0.0009898221719114844, - "loss": 0.0366, + "loss": 0.0354, "macro_f1": 0.3272727429866791, "num_tokens": 1862505.0, "repeat_count": 0.0, - "routers_loss": 0.030165785923600197, + "routers_loss": 0.026814989745616913, "skip_count": 1.0, "step": 1154, "text_loss": 0.5426549911499023 @@ -10980,13 +10980,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0966796875, + "grad_norm": 0.1015625, "learning_rate": 0.0009897599454016823, - "loss": 0.0421, + "loss": 0.0401, "macro_f1": 0.3333333432674408, "num_tokens": 1866266.0, "repeat_count": 0.0, - "routers_loss": 0.003615695284679532, + "routers_loss": 0.0032623792067170143, "skip_count": 0.0, "step": 1156, "text_loss": 0.37752896547317505 @@ -10999,13 +10999,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.07080078125, "learning_rate": 0.0009896975312149454, - "loss": 0.0377, + "loss": 0.0369, "macro_f1": 0.3333333432674408, "num_tokens": 1870216.0, "repeat_count": 0.0, - "routers_loss": 0.01679840311408043, + "routers_loss": 0.015617577359080315, "skip_count": 0.0, "step": 1158, "text_loss": 0.18207129836082458 @@ -11018,13 +11018,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.11669921875, "learning_rate": 0.0009896349293751906, - "loss": 0.0422, + "loss": 0.0423, "macro_f1": 0.3272727429866791, "num_tokens": 1873338.0, "repeat_count": 0.0, - "routers_loss": 0.024936161935329437, + "routers_loss": 0.02250153198838234, "skip_count": 1.0, "step": 1160, "text_loss": 0.548884391784668 @@ -11037,13 +11037,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1923828125, + "grad_norm": 0.1484375, "learning_rate": 0.0009895721399064072, - "loss": 0.0407, + "loss": 0.0388, "macro_f1": 0.32098764181137085, "num_tokens": 1876470.0, "repeat_count": 1.0, - "routers_loss": 0.06472968310117722, + "routers_loss": 0.055204521864652634, "skip_count": 1.0, "step": 1162, "text_loss": 0.48052409291267395 @@ -11056,13 +11056,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.07373046875, "learning_rate": 0.0009895091628326564, - "loss": 0.031, + "loss": 0.0293, "macro_f1": 0.3333333432674408, "num_tokens": 1879354.0, "repeat_count": 0.0, - "routers_loss": 0.009633494541049004, + "routers_loss": 0.009093789383769035, "skip_count": 0.0, "step": 1164, "text_loss": 0.3908069431781769 @@ -11075,13 +11075,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.126953125, + "grad_norm": 0.140625, "learning_rate": 0.000989445998178071, "loss": 0.0323, "macro_f1": 0.3272727429866791, "num_tokens": 1881941.0, "repeat_count": 0.0, - "routers_loss": 0.01458993274718523, + "routers_loss": 0.015086972154676914, "skip_count": 1.0, "step": 1166, "text_loss": 0.4884725511074066 @@ -11094,13 +11094,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.134765625, "learning_rate": 0.0009893826459668558, - "loss": 0.0389, + "loss": 0.0386, "macro_f1": 0.3144654333591461, "num_tokens": 1885374.0, "repeat_count": 0.0, - "routers_loss": 0.06636982411146164, + "routers_loss": 0.06587666273117065, "skip_count": 3.0, "step": 1168, "text_loss": 0.12760137021541595 @@ -11113,13 +11113,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1796875, + "grad_norm": 0.1591796875, "learning_rate": 0.0009893191062232873, - "loss": 0.0325, + "loss": 0.0322, "macro_f1": 0.3333333432674408, "num_tokens": 1888612.0, "repeat_count": 0.0, - "routers_loss": 0.005644182674586773, + "routers_loss": 0.006088624242693186, "skip_count": 0.0, "step": 1170, "text_loss": 0.4821319580078125 @@ -11132,13 +11132,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.1279296875, "learning_rate": 0.0009892553789717143, - "loss": 0.0402, + "loss": 0.0389, "macro_f1": 0.3333333432674408, "num_tokens": 1891463.0, "repeat_count": 0.0, - "routers_loss": 0.010273848660290241, + "routers_loss": 0.010113578289747238, "skip_count": 0.0, "step": 1172, "text_loss": 0.3613642454147339 @@ -11151,13 +11151,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.099609375, + "grad_norm": 0.1025390625, "learning_rate": 0.0009891914642365573, - "loss": 0.0415, + "loss": 0.0404, "macro_f1": 0.3333333432674408, "num_tokens": 1894230.0, "repeat_count": 0.0, - "routers_loss": 0.004529652185738087, + "routers_loss": 0.004947459790855646, "skip_count": 0.0, "step": 1174, "text_loss": 0.5037549138069153 @@ -11170,13 +11170,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2236328125, + "grad_norm": 0.1572265625, "learning_rate": 0.0009891273620423083, - "loss": 0.045, + "loss": 0.0428, "macro_f1": 0.3272727429866791, "num_tokens": 1897294.0, "repeat_count": 1.0, - "routers_loss": 0.024671228602528572, + "routers_loss": 0.026075217872858047, "skip_count": 0.0, "step": 1176, "text_loss": 0.32558977603912354 @@ -11189,13 +11189,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.12158203125, "learning_rate": 0.0009890630724135314, - "loss": 0.0354, + "loss": 0.0351, "macro_f1": 0.3272727429866791, "num_tokens": 1901553.0, "repeat_count": 0.0, - "routers_loss": 0.06466450542211533, + "routers_loss": 0.06650999188423157, "skip_count": 1.0, "step": 1178, "text_loss": 0.23473620414733887 @@ -11208,13 +11208,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1767578125, + "grad_norm": 0.1474609375, "learning_rate": 0.0009889985953748625, - "loss": 0.0278, + "loss": 0.0268, "macro_f1": 0.6666666865348816, "num_tokens": 1904556.0, "repeat_count": 0.0, - "routers_loss": 0.010566026903688908, + "routers_loss": 0.010361116379499435, "skip_count": 1.0, "step": 1180, "text_loss": 0.6927042007446289 @@ -11227,13 +11227,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1103515625, + "grad_norm": 0.103515625, "learning_rate": 0.0009889339309510094, - "loss": 0.037, + "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 1908053.0, "repeat_count": 0.0, - "routers_loss": 0.013842248357832432, + "routers_loss": 0.013286533765494823, "skip_count": 0.0, "step": 1182, "text_loss": 0.19977325201034546 @@ -11246,13 +11246,13 @@ "f1_execute": 0.9387754797935486, "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, - "grad_norm": 0.07373046875, + "grad_norm": 0.058837890625, "learning_rate": 0.0009888690791667518, - "loss": 0.0215, + "loss": 0.0204, "macro_f1": 0.7018141150474548, "num_tokens": 1911754.0, "repeat_count": 2.0, - "routers_loss": 0.122759610414505, + "routers_loss": 0.11920545995235443, "skip_count": 3.0, "step": 1184, "text_loss": 0.4072858691215515 @@ -11265,32 +11265,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.11083984375, "learning_rate": 0.0009888040400469408, - "loss": 0.0402, + "loss": 0.0391, "macro_f1": 0.3272727429866791, "num_tokens": 1914862.0, "repeat_count": 0.0, - "routers_loss": 0.035315629094839096, + "routers_loss": 0.03652849420905113, "skip_count": 1.0, "step": 1186, "text_loss": 0.2654043138027191 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 5.577634282359847, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1689453125, "learning_rate": 0.0009887388136164996, - "loss": 0.034, - "macro_f1": 0.32098764181137085, + "loss": 0.0336, + "macro_f1": 0.5492662787437439, "num_tokens": 1918542.0, "repeat_count": 0.0, - "routers_loss": 0.040048226714134216, + "routers_loss": 0.03991910070180893, "skip_count": 2.0, "step": 1188, "text_loss": 0.21130657196044922 @@ -11298,18 +11298,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 5.587026709715292, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1005859375, + "grad_norm": 0.09521484375, "learning_rate": 0.000988673399900423, - "loss": 0.044, - "macro_f1": 0.3333333432674408, + "loss": 0.0429, + "macro_f1": 0.3272727429866791, "num_tokens": 1921589.0, "repeat_count": 0.0, - "routers_loss": 0.012814820744097233, + "routers_loss": 0.014900135807693005, "skip_count": 0.0, "step": 1190, "text_loss": 0.5519335865974426 @@ -11322,13 +11322,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2119140625, + "grad_norm": 0.1884765625, "learning_rate": 0.0009886077989237777, - "loss": 0.0407, + "loss": 0.0405, "macro_f1": 0.3272727429866791, "num_tokens": 1924320.0, "repeat_count": 0.0, - "routers_loss": 0.05977959558367729, + "routers_loss": 0.06271552294492722, "skip_count": 1.0, "step": 1192, "text_loss": 0.213813915848732 @@ -11341,13 +11341,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, - "grad_norm": 0.1533203125, + "grad_norm": 0.1875, "learning_rate": 0.000988542010711702, - "loss": 0.0334, + "loss": 0.0342, "macro_f1": 0.6225374937057495, "num_tokens": 1927178.0, "repeat_count": 0.0, - "routers_loss": 0.031448643654584885, + "routers_loss": 0.03081391751766205, "skip_count": 5.0, "step": 1194, "text_loss": 0.7524349093437195 @@ -11360,13 +11360,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.265625, + "grad_norm": 0.255859375, "learning_rate": 0.0009884760352894064, - "loss": 0.0523, + "loss": 0.0518, "macro_f1": 0.3333333432674408, "num_tokens": 1930216.0, "repeat_count": 0.0, - "routers_loss": 0.008164947852492332, + "routers_loss": 0.008556773886084557, "skip_count": 0.0, "step": 1196, "text_loss": 0.28230375051498413 @@ -11379,32 +11379,32 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.5, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.1064453125, "learning_rate": 0.0009884098726821726, - "loss": 0.0478, + "loss": 0.0472, "macro_f1": 0.4871794879436493, "num_tokens": 1933312.0, "repeat_count": 3.0, - "routers_loss": 0.04045635461807251, + "routers_loss": 0.05344727262854576, "skip_count": 0.0, "step": 1198, "text_loss": 0.5509607195854187 }, { "acc_repeat": 0.0, - "acc_skip": 0.6666666865348816, - "avg_layers": 26.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, "epoch": 5.633988846492516, - "f1_execute": 0.9600000381469727, + "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, - "f1_skip": 0.800000011920929, - "grad_norm": 0.1240234375, + "f1_skip": 0.5, + "grad_norm": 0.1298828125, "learning_rate": 0.000988343522915354, - "loss": 0.0447, - "macro_f1": 0.5866667032241821, + "loss": 0.0441, + "macro_f1": 0.480392187833786, "num_tokens": 1936160.0, "repeat_count": 1.0, - "routers_loss": 0.06872973591089249, + "routers_loss": 0.07324771583080292, "skip_count": 3.0, "step": 1200, "text_loss": 0.30565372109413147 @@ -11412,18 +11412,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, - "avg_layers": 24.0, + "avg_layers": 25.0, "epoch": 5.64338127384796, - "f1_execute": 0.8695651888847351, + "f1_execute": 0.8936169743537903, "f1_repeat": 0.0, - "f1_skip": 0.4000000059604645, - "grad_norm": 0.25390625, + "f1_skip": 0.444444477558136, + "grad_norm": 0.2470703125, "learning_rate": 0.0009882769860143764, - "loss": 0.0331, - "macro_f1": 0.4231884181499481, + "loss": 0.0317, + "macro_f1": 0.4460204839706421, "num_tokens": 1939266.0, "repeat_count": 0.0, - "routers_loss": 0.20964151620864868, + "routers_loss": 0.18620699644088745, "skip_count": 6.0, "step": 1202, "text_loss": 0.976121723651886 @@ -11442,26 +11442,26 @@ "macro_f1": 0.6666666865348816, "num_tokens": 1942173.0, "repeat_count": 0.0, - "routers_loss": 0.00690250750631094, + "routers_loss": 0.007703613489866257, "skip_count": 1.0, "step": 1204, "text_loss": 0.5647401809692383 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 5.66216612855885, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.14453125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1484375, "learning_rate": 0.0009881433509120036, - "loss": 0.0372, - "macro_f1": 0.32098764181137085, + "loss": 0.0376, + "macro_f1": 0.5492662787437439, "num_tokens": 1945071.0, "repeat_count": 0.0, - "routers_loss": 0.022315658628940582, + "routers_loss": 0.02162683941423893, "skip_count": 2.0, "step": 1206, "text_loss": 0.24229218065738678 @@ -11474,13 +11474,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1083984375, + "grad_norm": 0.0966796875, "learning_rate": 0.0009880762527618176, - "loss": 0.0388, + "loss": 0.0383, "macro_f1": 0.3333333432674408, "num_tokens": 1949060.0, "repeat_count": 0.0, - "routers_loss": 0.017015069723129272, + "routers_loss": 0.017667081207036972, "skip_count": 0.0, "step": 1208, "text_loss": 0.4035970866680145 @@ -11493,13 +11493,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.154296875, "learning_rate": 0.0009880089675798908, - "loss": 0.0372, + "loss": 0.0367, "macro_f1": 0.3333333432674408, "num_tokens": 1951698.0, "repeat_count": 0.0, - "routers_loss": 0.006532609928399324, + "routers_loss": 0.006405784282833338, "skip_count": 0.0, "step": 1210, "text_loss": 0.5319879055023193 @@ -11512,13 +11512,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.09814453125, "learning_rate": 0.0009879414953920071, - "loss": 0.0301, + "loss": 0.0294, "macro_f1": 0.3333333432674408, "num_tokens": 1955266.0, "repeat_count": 0.0, - "routers_loss": 0.009720963425934315, + "routers_loss": 0.009859707206487656, "skip_count": 0.0, "step": 1212, "text_loss": 0.6687407493591309 @@ -11531,32 +11531,32 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1240234375, + "grad_norm": 0.130859375, "learning_rate": 0.0009878738362240219, - "loss": 0.046, + "loss": 0.045, "macro_f1": 0.5492662787437439, "num_tokens": 1958538.0, "repeat_count": 0.0, - "routers_loss": 0.03176085278391838, + "routers_loss": 0.030890554189682007, "skip_count": 2.0, "step": 1214, "text_loss": 0.20820017158985138 }, { "acc_repeat": 0.5, - "acc_skip": 0.5, - "avg_layers": 29.0, + "acc_skip": 0.0, + "avg_layers": 30.0, "epoch": 5.709128265336073, - "f1_execute": 0.9387754797935486, + "f1_execute": 0.9200000166893005, "f1_repeat": 0.5, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.2021484375, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, "learning_rate": 0.000987805990101862, - "loss": 0.0323, - "macro_f1": 0.7018141150474548, + "loss": 0.0317, + "macro_f1": 0.47333335876464844, "num_tokens": 1961419.0, "repeat_count": 2.0, - "routers_loss": 0.08626245707273483, + "routers_loss": 0.10383198410272598, "skip_count": 2.0, "step": 1216, "text_loss": 0.8664976358413696 @@ -11569,13 +11569,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1240234375, + "grad_norm": 0.1435546875, "learning_rate": 0.0009877379570515268, - "loss": 0.0374, + "loss": 0.0366, "macro_f1": 0.3333333432674408, "num_tokens": 1964836.0, "repeat_count": 0.0, - "routers_loss": 0.012099343352019787, + "routers_loss": 0.013376163318753242, "skip_count": 0.0, "step": 1218, "text_loss": 0.4223395884037018 @@ -11588,13 +11588,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.0859375, "learning_rate": 0.0009876697370990865, - "loss": 0.0342, + "loss": 0.0343, "macro_f1": 0.3333333432674408, "num_tokens": 1967620.0, "repeat_count": 0.0, - "routers_loss": 0.007713846862316132, + "routers_loss": 0.008577900938689709, "skip_count": 0.0, "step": 1220, "text_loss": 0.4789901375770569 @@ -11607,13 +11607,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.1728515625, "learning_rate": 0.0009876013302706828, - "loss": 0.0499, + "loss": 0.049, "macro_f1": 0.3333333432674408, "num_tokens": 1971100.0, "repeat_count": 0.0, - "routers_loss": 0.004629489034414291, + "routers_loss": 0.004730266984552145, "skip_count": 0.0, "step": 1222, "text_loss": 0.6799837946891785 @@ -11626,13 +11626,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08837890625, + "grad_norm": 0.08349609375, "learning_rate": 0.0009875327365925295, - "loss": 0.035, + "loss": 0.0341, "macro_f1": 0.3333333432674408, "num_tokens": 1974408.0, "repeat_count": 0.0, - "routers_loss": 0.010654795914888382, + "routers_loss": 0.010849526152014732, "skip_count": 0.0, "step": 1224, "text_loss": 0.18967926502227783 @@ -11640,18 +11640,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 26.0, + "avg_layers": 27.0, "epoch": 5.756090402113296, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19140625, + "grad_norm": 0.169921875, "learning_rate": 0.0009874639560909118, - "loss": 0.0516, - "macro_f1": 0.31446540355682373, + "loss": 0.0498, + "macro_f1": 0.32098767161369324, "num_tokens": 1977046.0, "repeat_count": 0.0, - "routers_loss": 0.05963074415922165, + "routers_loss": 0.04841252416372299, "skip_count": 1.0, "step": 1226, "text_loss": 0.6133310198783875 @@ -11664,13 +11664,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1328125, + "grad_norm": 0.1318359375, "learning_rate": 0.0009873949887921867, - "loss": 0.04, + "loss": 0.0402, "macro_f1": 0.3272727429866791, "num_tokens": 1980330.0, "repeat_count": 0.0, - "routers_loss": 0.028920643031597137, + "routers_loss": 0.029638588428497314, "skip_count": 1.0, "step": 1228, "text_loss": 0.15649555623531342 @@ -11678,18 +11678,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 5.774875256824186, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10595703125, + "grad_norm": 0.1103515625, "learning_rate": 0.0009873258347227823, - "loss": 0.0327, - "macro_f1": 0.3333333432674408, + "loss": 0.0331, + "macro_f1": 0.3272727429866791, "num_tokens": 1983173.0, "repeat_count": 0.0, - "routers_loss": 0.006852717138826847, + "routers_loss": 0.009955910965800285, "skip_count": 0.0, "step": 1230, "text_loss": 0.4741005599498749 @@ -11702,13 +11702,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.0849609375, "learning_rate": 0.0009872564939091989, - "loss": 0.0346, + "loss": 0.0342, "macro_f1": 0.3333333432674408, "num_tokens": 1986825.0, "repeat_count": 0.0, - "routers_loss": 0.010968753136694431, + "routers_loss": 0.010205300524830818, "skip_count": 0.0, "step": 1232, "text_loss": 0.5315462350845337 @@ -11721,13 +11721,13 @@ "f1_execute": 0.9302325248718262, "f1_repeat": 1.0, "f1_skip": 0.7272727489471436, - "grad_norm": 0.1240234375, + "grad_norm": 0.11865234375, "learning_rate": 0.0009871869663780077, - "loss": 0.0344, + "loss": 0.0336, "macro_f1": 0.8858351111412048, "num_tokens": 1990448.0, "repeat_count": 1.0, - "routers_loss": 0.0906950980424881, + "routers_loss": 0.09120134264230728, "skip_count": 7.0, "step": 1234, "text_loss": 0.6187508702278137 @@ -11740,13 +11740,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.125, "learning_rate": 0.0009871172521558522, - "loss": 0.0484, + "loss": 0.0475, "macro_f1": 0.6666666865348816, "num_tokens": 1993474.0, "repeat_count": 0.0, - "routers_loss": 0.016306072473526, + "routers_loss": 0.016188839450478554, "skip_count": 1.0, "step": 1236, "text_loss": 0.20783066749572754 @@ -11759,13 +11759,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.208984375, + "grad_norm": 0.216796875, "learning_rate": 0.0009870473512694465, - "loss": 0.038, + "loss": 0.0373, "macro_f1": 0.5934640765190125, "num_tokens": 1996536.0, "repeat_count": 0.0, - "routers_loss": 0.05804471671581268, + "routers_loss": 0.05046704784035683, "skip_count": 3.0, "step": 1238, "text_loss": 0.247748002409935 @@ -11773,18 +11773,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.5, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 5.821837393601409, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.091796875, + "f1_skip": 0.5, + "grad_norm": 0.09033203125, "learning_rate": 0.0009869772637455772, - "loss": 0.0256, - "macro_f1": 0.5492662787437439, + "loss": 0.0251, + "macro_f1": 0.4871794879436493, "num_tokens": 1999530.0, "repeat_count": 0.0, - "routers_loss": 0.045395996421575546, + "routers_loss": 0.044926248490810394, "skip_count": 2.0, "step": 1240, "text_loss": 0.26001980900764465 @@ -11797,13 +11797,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11767578125, + "grad_norm": 0.1513671875, "learning_rate": 0.000986906989611102, - "loss": 0.0438, + "loss": 0.0446, "macro_f1": 0.3272727429866791, "num_tokens": 2002782.0, "repeat_count": 0.0, - "routers_loss": 0.020834850147366524, + "routers_loss": 0.025911526754498482, "skip_count": 0.0, "step": 1242, "text_loss": 0.9009982943534851 @@ -11816,13 +11816,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1123046875, + "grad_norm": 0.115234375, "learning_rate": 0.0009868365288929492, - "loss": 0.0377, + "loss": 0.0371, "macro_f1": 0.3333333432674408, "num_tokens": 2005331.0, "repeat_count": 0.0, - "routers_loss": 0.005241698585450649, + "routers_loss": 0.0043760035187006, "skip_count": 0.0, "step": 1244, "text_loss": 0.5547386407852173 @@ -11835,13 +11835,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0966796875, + "grad_norm": 0.1005859375, "learning_rate": 0.0009867658816181206, - "loss": 0.038, + "loss": 0.0374, "macro_f1": 0.3333333432674408, "num_tokens": 2008115.0, "repeat_count": 0.0, - "routers_loss": 0.008387803100049496, + "routers_loss": 0.009227181784808636, "skip_count": 0.0, "step": 1246, "text_loss": 1.0067731142044067 @@ -11854,13 +11854,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.126953125, "learning_rate": 0.000986695047813688, - "loss": 0.0256, + "loss": 0.0261, "macro_f1": 0.3272727429866791, "num_tokens": 2011137.0, "repeat_count": 1.0, - "routers_loss": 0.02261745184659958, + "routers_loss": 0.023822437971830368, "skip_count": 0.0, "step": 1248, "text_loss": 0.30058956146240234 @@ -11873,32 +11873,32 @@ "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.10693359375, + "grad_norm": 0.1044921875, "learning_rate": 0.0009866240275067948, - "loss": 0.0435, + "loss": 0.044, "macro_f1": 0.47333335876464844, "num_tokens": 2014159.0, "repeat_count": 2.0, - "routers_loss": 0.21678555011749268, + "routers_loss": 0.21523773670196533, "skip_count": 3.0, "step": 1250, "text_loss": 0.39072203636169434 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 5.878191957734077, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1201171875, "learning_rate": 0.0009865528207246563, - "loss": 0.0358, - "macro_f1": 0.32098764181137085, + "loss": 0.0351, + "macro_f1": 0.5492662787437439, "num_tokens": 2017731.0, "repeat_count": 0.0, - "routers_loss": 0.06554054468870163, + "routers_loss": 0.06184682995080948, "skip_count": 2.0, "step": 1252, "text_loss": 0.35751575231552124 @@ -11911,13 +11911,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.203125, + "grad_norm": 0.166015625, "learning_rate": 0.000986481427494559, - "loss": 0.0337, + "loss": 0.0336, "macro_f1": 0.3333333432674408, "num_tokens": 2020485.0, "repeat_count": 0.0, - "routers_loss": 0.007237187586724758, + "routers_loss": 0.007573372684419155, "skip_count": 0.0, "step": 1254, "text_loss": 0.4061077833175659 @@ -11930,13 +11930,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1845703125, + "grad_norm": 0.1708984375, "learning_rate": 0.000986409847843861, - "loss": 0.0387, + "loss": 0.0382, "macro_f1": 0.3272727429866791, "num_tokens": 2024149.0, "repeat_count": 1.0, - "routers_loss": 0.08003793656826019, + "routers_loss": 0.07447971403598785, "skip_count": 0.0, "step": 1256, "text_loss": 0.41876497864723206 @@ -11949,13 +11949,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.17578125, "learning_rate": 0.000986338081799992, - "loss": 0.0341, + "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 2026545.0, "repeat_count": 0.0, - "routers_loss": 0.006424390245229006, + "routers_loss": 0.006609147880226374, "skip_count": 0.0, "step": 1258, "text_loss": 0.4673794209957123 @@ -11968,13 +11968,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10009765625, + "grad_norm": 0.1123046875, "learning_rate": 0.0009862661293904523, - "loss": 0.0482, + "loss": 0.0498, "macro_f1": 0.32098764181137085, "num_tokens": 2029581.0, "repeat_count": 0.0, - "routers_loss": 0.10797854512929916, + "routers_loss": 0.10624702274799347, "skip_count": 2.0, "step": 1260, "text_loss": 0.3483233153820038 @@ -11987,13 +11987,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.111328125, + "grad_norm": 0.1201171875, "learning_rate": 0.0009861939906428145, - "loss": 0.053, + "loss": 0.0525, "macro_f1": 0.3333333432674408, "num_tokens": 2033936.0, "repeat_count": 0.0, - "routers_loss": 0.006734046153724194, + "routers_loss": 0.007944886572659016, "skip_count": 0.0, "step": 1262, "text_loss": 0.16362667083740234 @@ -12006,13 +12006,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.11669921875, "learning_rate": 0.0009861216655847225, - "loss": 0.0373, + "loss": 0.0376, "macro_f1": 0.6666666865348816, "num_tokens": 2037876.0, "repeat_count": 1.0, - "routers_loss": 0.00564212491735816, + "routers_loss": 0.007004092447459698, "skip_count": 0.0, "step": 1264, "text_loss": 0.43228110671043396 @@ -12025,13 +12025,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1044921875, + "grad_norm": 0.1005859375, "learning_rate": 0.0009860491542438912, - "loss": 0.0472, + "loss": 0.047, "macro_f1": 0.3272727429866791, "num_tokens": 2040842.0, "repeat_count": 0.0, - "routers_loss": 0.026137735694646835, + "routers_loss": 0.026916226372122765, "skip_count": 1.0, "step": 1266, "text_loss": 0.5901188850402832 @@ -12044,13 +12044,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08203125, + "grad_norm": 0.0986328125, "learning_rate": 0.000985976456648107, - "loss": 0.0343, + "loss": 0.0353, "macro_f1": 0.3333333432674408, "num_tokens": 2043890.0, "repeat_count": 0.0, - "routers_loss": 0.0069669694639742374, + "routers_loss": 0.007325216196477413, "skip_count": 0.0, "step": 1268, "text_loss": 0.8780109882354736 @@ -12063,13 +12063,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.1142578125, + "grad_norm": 0.10205078125, "learning_rate": 0.000985903572825228, - "loss": 0.0323, + "loss": 0.0306, "macro_f1": 0.4871794879436493, "num_tokens": 2048848.0, "repeat_count": 0.0, - "routers_loss": 0.05618409812450409, + "routers_loss": 0.05007527023553848, "skip_count": 2.0, "step": 1270, "text_loss": 0.5863722562789917 @@ -12084,11 +12084,11 @@ "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.000985830502803183, - "loss": 0.0391, + "loss": 0.0396, "macro_f1": 0.3272727429866791, "num_tokens": 2051561.0, "repeat_count": 0.0, - "routers_loss": 0.025900620967149734, + "routers_loss": 0.023995524272322655, "skip_count": 0.0, "step": 1272, "text_loss": 0.7460709810256958 @@ -12101,13 +12101,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.10205078125, "learning_rate": 0.0009857572466099732, - "loss": 0.0426, + "loss": 0.0431, "macro_f1": 0.3333333432674408, "num_tokens": 2054752.0, "repeat_count": 0.0, - "routers_loss": 0.006236737594008446, + "routers_loss": 0.006928362417966127, "skip_count": 0.0, "step": 1274, "text_loss": 0.5130293369293213 @@ -12120,13 +12120,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.171875, + "grad_norm": 0.162109375, "learning_rate": 0.0009856838042736698, - "loss": 0.0503, + "loss": 0.0501, "macro_f1": 0.3333333432674408, "num_tokens": 2058151.0, "repeat_count": 0.0, - "routers_loss": 0.006367063149809837, + "routers_loss": 0.006969396956264973, "skip_count": 0.0, "step": 1276, "text_loss": 0.5911393761634827 @@ -12139,13 +12139,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1103515625, + "grad_norm": 0.1357421875, "learning_rate": 0.0009856101758224166, - "loss": 0.0442, + "loss": 0.0441, "macro_f1": 0.3333333432674408, "num_tokens": 2061012.0, "repeat_count": 0.0, - "routers_loss": 0.003392914542928338, + "routers_loss": 0.003499418031424284, "skip_count": 0.0, "step": 1278, "text_loss": 0.25347545742988586 @@ -12158,13 +12158,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0693359375, "learning_rate": 0.000985536361284428, - "loss": 0.0231, + "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2064597.0, "repeat_count": 0.0, - "routers_loss": 0.007376343477517366, + "routers_loss": 0.007856054231524467, "skip_count": 0.0, "step": 1280, "text_loss": 0.7476963400840759 @@ -12177,13 +12177,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.0888671875, "learning_rate": 0.0009854623606879898, - "loss": 0.0243, + "loss": 0.0245, "macro_f1": 0.3272727429866791, "num_tokens": 2067972.0, "repeat_count": 0.0, - "routers_loss": 0.02773376554250717, + "routers_loss": 0.02617792971432209, "skip_count": 1.0, "step": 1282, "text_loss": 0.5775872468948364 @@ -12196,13 +12196,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.09033203125, "learning_rate": 0.000985388174061459, - "loss": 0.0363, + "loss": 0.0356, "macro_f1": 0.32098767161369324, "num_tokens": 2071812.0, "repeat_count": 0.0, - "routers_loss": 0.03535797819495201, + "routers_loss": 0.035979997366666794, "skip_count": 1.0, "step": 1284, "text_loss": 0.2933400869369507 @@ -12215,13 +12215,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08447265625, "learning_rate": 0.0009853138014332646, - "loss": 0.0269, + "loss": 0.0273, "macro_f1": 0.3333333432674408, "num_tokens": 2074868.0, "repeat_count": 0.0, - "routers_loss": 0.004910993855446577, + "routers_loss": 0.005142854526638985, "skip_count": 0.0, "step": 1286, "text_loss": 0.29085102677345276 @@ -12234,13 +12234,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0888671875, + "grad_norm": 0.09033203125, "learning_rate": 0.0009852392428319058, - "loss": 0.0301, + "loss": 0.0306, "macro_f1": 0.3333333432674408, "num_tokens": 2078225.0, "repeat_count": 0.0, - "routers_loss": 0.0032444109674543142, + "routers_loss": 0.0032799106556922197, "skip_count": 0.0, "step": 1288, "text_loss": 0.7293626070022583 @@ -12253,13 +12253,13 @@ "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.0947265625, + "grad_norm": 0.08935546875, "learning_rate": 0.0009851644982859537, - "loss": 0.0272, + "loss": 0.0273, "macro_f1": 0.480392187833786, "num_tokens": 2081495.0, "repeat_count": 1.0, - "routers_loss": 0.12451831251382828, + "routers_loss": 0.12224318832159042, "skip_count": 3.0, "step": 1290, "text_loss": 0.26125892996788025 @@ -12272,13 +12272,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.1435546875, "learning_rate": 0.0009850895678240508, - "loss": 0.0289, + "loss": 0.0283, "macro_f1": 0.6666666865348816, "num_tokens": 2084390.0, "repeat_count": 1.0, - "routers_loss": 0.011074979789555073, + "routers_loss": 0.010662888176739216, "skip_count": 0.0, "step": 1292, "text_loss": 0.3510764539241791 @@ -12291,13 +12291,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1806640625, + "grad_norm": 0.1689453125, "learning_rate": 0.0009850144514749104, - "loss": 0.0336, + "loss": 0.0332, "macro_f1": 0.5492662787437439, "num_tokens": 2087210.0, "repeat_count": 0.0, - "routers_loss": 0.01774786226451397, + "routers_loss": 0.01979079470038414, "skip_count": 2.0, "step": 1294, "text_loss": 0.40202176570892334 @@ -12310,13 +12310,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1416015625, + "grad_norm": 0.11669921875, "learning_rate": 0.000984939149267317, - "loss": 0.0251, + "loss": 0.0253, "macro_f1": 0.6666666865348816, "num_tokens": 2090777.0, "repeat_count": 0.0, - "routers_loss": 0.0052874404937028885, + "routers_loss": 0.005172552540898323, "skip_count": 1.0, "step": 1296, "text_loss": 0.5275651216506958 @@ -12329,13 +12329,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10107421875, + "grad_norm": 0.095703125, "learning_rate": 0.0009848636612301272, - "loss": 0.031, + "loss": 0.0299, "macro_f1": 0.3333333432674408, "num_tokens": 2094248.0, "repeat_count": 0.0, - "routers_loss": 0.0034106262028217316, + "routers_loss": 0.0029599082190543413, "skip_count": 0.0, "step": 1298, "text_loss": 0.4517653286457062 @@ -12348,13 +12348,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2177734375, + "grad_norm": 0.23046875, "learning_rate": 0.0009847879873922675, "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 2097139.0, "repeat_count": 0.0, - "routers_loss": 0.010383229702711105, + "routers_loss": 0.011455860920250416, "skip_count": 0.0, "step": 1300, "text_loss": 0.16888445615768433 @@ -12367,13 +12367,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0966796875, + "grad_norm": 0.09619140625, "learning_rate": 0.0009847121277827366, - "loss": 0.0304, + "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 2100415.0, "repeat_count": 0.0, - "routers_loss": 0.0076674893498420715, + "routers_loss": 0.008091195486485958, "skip_count": 0.0, "step": 1302, "text_loss": 0.40061676502227783 @@ -12386,13 +12386,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.109375, + "grad_norm": 0.1123046875, "learning_rate": 0.000984636082430604, - "loss": 0.0287, + "loss": 0.0285, "macro_f1": 0.3333333432674408, "num_tokens": 2103285.0, "repeat_count": 0.0, - "routers_loss": 0.010486516170203686, + "routers_loss": 0.009593960829079151, "skip_count": 0.0, "step": 1304, "text_loss": 0.7211073637008667 @@ -12405,13 +12405,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1142578125, + "grad_norm": 0.107421875, "learning_rate": 0.0009845598513650103, - "loss": 0.0237, + "loss": 0.0231, "macro_f1": 0.3333333432674408, "num_tokens": 2106255.0, "repeat_count": 0.0, - "routers_loss": 0.0023783023934811354, + "routers_loss": 0.0023068038281053305, "skip_count": 0.0, "step": 1306, "text_loss": 0.7077119946479797 @@ -12424,13 +12424,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.181640625, + "grad_norm": 0.171875, "learning_rate": 0.0009844834346151674, - "loss": 0.044, + "loss": 0.043, "macro_f1": 0.3333333432674408, "num_tokens": 2109305.0, "repeat_count": 0.0, - "routers_loss": 0.006714595016092062, + "routers_loss": 0.007703019306063652, "skip_count": 0.0, "step": 1308, "text_loss": 0.3534316122531891 @@ -12443,13 +12443,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.1025390625, "learning_rate": 0.0009844068322103585, - "loss": 0.0281, + "loss": 0.0287, "macro_f1": 0.3272727429866791, "num_tokens": 2112216.0, "repeat_count": 0.0, - "routers_loss": 0.022373953834176064, + "routers_loss": 0.023549847304821014, "skip_count": 1.0, "step": 1310, "text_loss": 0.6792599558830261 @@ -12462,13 +12462,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1513671875, + "grad_norm": 0.150390625, "learning_rate": 0.0009843300441799378, - "loss": 0.0205, + "loss": 0.0211, "macro_f1": 0.3333333432674408, "num_tokens": 2114925.0, "repeat_count": 0.0, - "routers_loss": 0.007452849764376879, + "routers_loss": 0.007605871185660362, "skip_count": 0.0, "step": 1312, "text_loss": 0.1571389138698578 @@ -12481,13 +12481,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.134765625, "learning_rate": 0.0009842530705533304, - "loss": 0.0251, + "loss": 0.0253, "macro_f1": 0.3272727429866791, "num_tokens": 2117744.0, "repeat_count": 0.0, - "routers_loss": 0.016413308680057526, + "routers_loss": 0.014964760281145573, "skip_count": 0.0, "step": 1314, "text_loss": 0.7840361595153809 @@ -12500,13 +12500,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.10595703125, "learning_rate": 0.000984175911360033, - "loss": 0.0243, + "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2120848.0, "repeat_count": 0.0, - "routers_loss": 0.004676427226513624, + "routers_loss": 0.004663798492401838, "skip_count": 0.0, "step": 1316, "text_loss": 0.536246120929718 @@ -12519,13 +12519,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.123046875, + "grad_norm": 0.1201171875, "learning_rate": 0.000984098566629613, - "loss": 0.0284, + "loss": 0.0288, "macro_f1": 0.5492662787437439, "num_tokens": 2123651.0, "repeat_count": 0.0, - "routers_loss": 0.024454625323414803, + "routers_loss": 0.022852955386042595, "skip_count": 2.0, "step": 1318, "text_loss": 0.43372172117233276 @@ -12538,13 +12538,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.07958984375, "learning_rate": 0.0009840210363917087, - "loss": 0.022, + "loss": 0.0216, "macro_f1": 0.3333333432674408, "num_tokens": 2128011.0, "repeat_count": 0.0, - "routers_loss": 0.013495884835720062, + "routers_loss": 0.012578422203660011, "skip_count": 0.0, "step": 1320, "text_loss": 0.28190380334854126 @@ -12557,13 +12557,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.10986328125, "learning_rate": 0.0009839433206760306, - "loss": 0.0213, + "loss": 0.0204, "macro_f1": 0.3333333432674408, "num_tokens": 2131035.0, "repeat_count": 0.0, - "routers_loss": 0.006397814955562353, + "routers_loss": 0.006863643880933523, "skip_count": 0.0, "step": 1322, "text_loss": 0.6340444087982178 @@ -12576,13 +12576,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1796875, "learning_rate": 0.0009838654195123589, - "loss": 0.0246, + "loss": 0.0243, "macro_f1": 0.3333333432674408, "num_tokens": 2133856.0, "repeat_count": 0.0, - "routers_loss": 0.00503434706479311, + "routers_loss": 0.00468854233622551, "skip_count": 0.0, "step": 1324, "text_loss": 0.5138425827026367 @@ -12595,13 +12595,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1240234375, + "grad_norm": 0.115234375, "learning_rate": 0.0009837873329305458, - "loss": 0.0402, + "loss": 0.0396, "macro_f1": 0.6666666865348816, "num_tokens": 2136451.0, "repeat_count": 1.0, - "routers_loss": 0.005150494631379843, + "routers_loss": 0.005731126759201288, "skip_count": 0.0, "step": 1326, "text_loss": 0.742124617099762 @@ -12614,13 +12614,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1533203125, + "grad_norm": 0.17578125, "learning_rate": 0.000983709060960514, - "loss": 0.041, + "loss": 0.0416, "macro_f1": 0.3333333432674408, "num_tokens": 2139496.0, "repeat_count": 0.0, - "routers_loss": 0.004570818971842527, + "routers_loss": 0.0056343949399888515, "skip_count": 0.0, "step": 1328, "text_loss": 0.7317464351654053 @@ -12633,13 +12633,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09326171875, + "grad_norm": 0.10791015625, "learning_rate": 0.0009836306036322576, - "loss": 0.0314, + "loss": 0.0312, "macro_f1": 0.3333333432674408, "num_tokens": 2143120.0, "repeat_count": 0.0, - "routers_loss": 0.005299333017319441, + "routers_loss": 0.005127966403961182, "skip_count": 0.0, "step": 1330, "text_loss": 0.538652241230011 @@ -12652,13 +12652,13 @@ "f1_execute": 0.9130434989929199, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.111328125, + "grad_norm": 0.11083984375, "learning_rate": 0.0009835519609758415, - "loss": 0.0303, + "loss": 0.0301, "macro_f1": 0.590062141418457, "num_tokens": 2145807.0, "repeat_count": 3.0, - "routers_loss": 0.168672576546669, + "routers_loss": 0.1673707216978073, "skip_count": 4.0, "step": 1332, "text_loss": 0.3498198091983795 @@ -12671,32 +12671,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1552734375, "learning_rate": 0.0009834731330214017, - "loss": 0.0302, + "loss": 0.0293, "macro_f1": 0.3272727429866791, "num_tokens": 2148397.0, "repeat_count": 1.0, - "routers_loss": 0.05187409743666649, + "routers_loss": 0.04026653990149498, "skip_count": 0.0, "step": 1334, "text_loss": 0.8153424859046936 }, { "acc_repeat": 1.0, - "acc_skip": 1.0, - "avg_layers": 26.0, + "acc_skip": 0.800000011920929, + "avg_layers": 27.0, "epoch": 6.272380393307896, - "f1_execute": 0.9230769276618958, + "f1_execute": 0.8999999761581421, "f1_repeat": 0.6666666865348816, - "f1_skip": 0.9090909361839294, - "grad_norm": 0.1669921875, + "f1_skip": 0.8000000715255737, + "grad_norm": 0.16015625, "learning_rate": 0.0009833941197991455, - "loss": 0.0339, - "macro_f1": 0.8329448699951172, + "loss": 0.0329, + "macro_f1": 0.7888889312744141, "num_tokens": 2152226.0, "repeat_count": 2.0, - "routers_loss": 0.05786697566509247, + "routers_loss": 0.05481519177556038, "skip_count": 5.0, "step": 1336, "text_loss": 0.7802760004997253 @@ -12709,13 +12709,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.16796875, + "grad_norm": 0.1474609375, "learning_rate": 0.0009833149213393506, - "loss": 0.0315, + "loss": 0.0304, "macro_f1": 0.3272727429866791, "num_tokens": 2156023.0, "repeat_count": 0.0, - "routers_loss": 0.017055779695510864, + "routers_loss": 0.01760484278202057, "skip_count": 0.0, "step": 1338, "text_loss": 0.19721226394176483 @@ -12728,13 +12728,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.099609375, + "grad_norm": 0.11474609375, "learning_rate": 0.000983235537672366, - "loss": 0.0249, + "loss": 0.0256, "macro_f1": 0.3333333432674408, "num_tokens": 2160037.0, "repeat_count": 0.0, - "routers_loss": 0.011614206247031689, + "routers_loss": 0.013206037692725658, "skip_count": 0.0, "step": 1340, "text_loss": 0.5003817081451416 @@ -12747,13 +12747,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1640625, + "grad_norm": 0.1474609375, "learning_rate": 0.000983155968828612, - "loss": 0.033, + "loss": 0.0315, "macro_f1": 0.6666666865348816, "num_tokens": 2163910.0, "repeat_count": 1.0, - "routers_loss": 0.012611300684511662, + "routers_loss": 0.01256406120955944, "skip_count": 0.0, "step": 1342, "text_loss": 0.5996923446655273 @@ -12766,13 +12766,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.11962890625, "learning_rate": 0.0009830762148385793, - "loss": 0.0315, + "loss": 0.0313, "macro_f1": 0.3272727429866791, "num_tokens": 2166921.0, "repeat_count": 0.0, - "routers_loss": 0.018757276237010956, + "routers_loss": 0.015086234547197819, "skip_count": 1.0, "step": 1344, "text_loss": 0.45356282591819763 @@ -12785,13 +12785,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08447265625, "learning_rate": 0.0009829962757328297, - "loss": 0.0229, + "loss": 0.0223, "macro_f1": 0.32098764181137085, "num_tokens": 2170135.0, "repeat_count": 0.0, - "routers_loss": 0.08197146654129028, + "routers_loss": 0.07909081131219864, "skip_count": 2.0, "step": 1346, "text_loss": 0.2874644994735718 @@ -12804,13 +12804,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.068359375, "learning_rate": 0.0009829161515419959, - "loss": 0.0256, + "loss": 0.0246, "macro_f1": 0.6666666865348816, "num_tokens": 2173029.0, "repeat_count": 0.0, - "routers_loss": 0.014122758992016315, + "routers_loss": 0.013569854199886322, "skip_count": 2.0, "step": 1348, "text_loss": 0.25533875823020935 @@ -12823,13 +12823,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06005859375, + "grad_norm": 0.064453125, "learning_rate": 0.0009828358422967823, - "loss": 0.0221, + "loss": 0.0226, "macro_f1": 0.32098764181137085, "num_tokens": 2176605.0, "repeat_count": 1.0, - "routers_loss": 0.08215996623039246, + "routers_loss": 0.08111091703176498, "skip_count": 1.0, "step": 1350, "text_loss": 0.32827726006507874 @@ -12842,13 +12842,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09375, + "grad_norm": 0.091796875, "learning_rate": 0.0009827553480279627, - "loss": 0.0312, + "loss": 0.03, "macro_f1": 0.5427350401878357, "num_tokens": 2179406.0, "repeat_count": 0.0, - "routers_loss": 0.026304977014660835, + "routers_loss": 0.026550088077783585, "skip_count": 2.0, "step": 1352, "text_loss": 0.2966301143169403 @@ -12861,13 +12861,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.0791015625, "learning_rate": 0.0009826746687663832, - "loss": 0.0302, + "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 2182353.0, "repeat_count": 0.0, - "routers_loss": 0.003616038942709565, + "routers_loss": 0.003914554137736559, "skip_count": 0.0, "step": 1354, "text_loss": 0.7596251964569092 @@ -12880,13 +12880,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.0849609375, + "grad_norm": 0.0859375, "learning_rate": 0.0009825938045429602, - "loss": 0.0323, + "loss": 0.0324, "macro_f1": 0.5866667032241821, "num_tokens": 2185786.0, "repeat_count": 1.0, - "routers_loss": 0.060399893671274185, + "routers_loss": 0.059612665325403214, "skip_count": 3.0, "step": 1356, "text_loss": 0.12325898557901382 @@ -12899,13 +12899,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10302734375, + "grad_norm": 0.10009765625, "learning_rate": 0.0009825127553886807, - "loss": 0.0384, + "loss": 0.0375, "macro_f1": 0.3333333432674408, "num_tokens": 2190157.0, "repeat_count": 0.0, - "routers_loss": 0.007164204493165016, + "routers_loss": 0.0071132429875433445, "skip_count": 0.0, "step": 1358, "text_loss": 0.9287898540496826 @@ -12918,13 +12918,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0947265625, + "grad_norm": 0.0986328125, "learning_rate": 0.0009824315213346033, - "loss": 0.0343, + "loss": 0.0348, "macro_f1": 0.3333333432674408, "num_tokens": 2193077.0, "repeat_count": 0.0, - "routers_loss": 0.010965060442686081, + "routers_loss": 0.009611099027097225, "skip_count": 0.0, "step": 1360, "text_loss": 0.20427259802818298 @@ -12937,13 +12937,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.10888671875, "learning_rate": 0.0009823501024118569, - "loss": 0.0276, + "loss": 0.0285, "macro_f1": 0.3333333432674408, "num_tokens": 2196494.0, "repeat_count": 0.0, - "routers_loss": 0.00784136913716793, + "routers_loss": 0.006913455203175545, "skip_count": 0.0, "step": 1362, "text_loss": 0.574759840965271 @@ -12956,13 +12956,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.095703125, + "grad_norm": 0.10595703125, "learning_rate": 0.0009822684986516411, - "loss": 0.0251, + "loss": 0.0245, "macro_f1": 0.3333333432674408, "num_tokens": 2199839.0, "repeat_count": 0.0, - "routers_loss": 0.009101065807044506, + "routers_loss": 0.009208920411765575, "skip_count": 0.0, "step": 1364, "text_loss": 0.42422571778297424 @@ -12970,37 +12970,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 6.413266803639566, - "f1_execute": 0.9433962106704712, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.0927734375, "learning_rate": 0.000982186710085227, - "loss": 0.0206, - "macro_f1": 0.31446540355682373, + "loss": 0.0208, + "macro_f1": 0.32098764181137085, "num_tokens": 2203212.0, "repeat_count": 1.0, - "routers_loss": 0.05967295169830322, + "routers_loss": 0.059975091367959976, "skip_count": 1.0, "step": 1366, "text_loss": 0.29213017225265503 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 26.0, + "acc_skip": 0.25, + "avg_layers": 27.0, "epoch": 6.42265923099501, - "f1_execute": 0.9600000381469727, + "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.1875, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.181640625, "learning_rate": 0.0009821047367439561, - "loss": 0.0356, - "macro_f1": 0.542222261428833, + "loss": 0.0358, + "macro_f1": 0.44705885648727417, "num_tokens": 2206240.0, "repeat_count": 0.0, - "routers_loss": 0.05016552656888962, + "routers_loss": 0.048244867473840714, "skip_count": 4.0, "step": 1368, "text_loss": 0.3072395324707031 @@ -13013,13 +13013,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.107421875, + "grad_norm": 0.11181640625, "learning_rate": 0.0009820225786592405, - "loss": 0.038, + "loss": 0.0375, "macro_f1": 0.3272727429866791, "num_tokens": 2209903.0, "repeat_count": 1.0, - "routers_loss": 0.02483060024678707, + "routers_loss": 0.026068156585097313, "skip_count": 0.0, "step": 1370, "text_loss": 0.5961400270462036 @@ -13032,13 +13032,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.109375, "learning_rate": 0.0009819402358625634, - "loss": 0.0373, + "loss": 0.0366, "macro_f1": 0.3272727429866791, "num_tokens": 2213439.0, "repeat_count": 0.0, - "routers_loss": 0.01982821337878704, + "routers_loss": 0.022615568712353706, "skip_count": 1.0, "step": 1372, "text_loss": 0.19375644624233246 @@ -13051,13 +13051,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1298828125, + "grad_norm": 0.1240234375, "learning_rate": 0.000981857708385479, - "loss": 0.0353, + "loss": 0.0346, "macro_f1": 0.3333333432674408, "num_tokens": 2216457.0, "repeat_count": 0.0, - "routers_loss": 0.004753436427563429, + "routers_loss": 0.005855285096913576, "skip_count": 0.0, "step": 1374, "text_loss": 0.5123368501663208 @@ -13070,13 +13070,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09912109375, + "grad_norm": 0.09423828125, "learning_rate": 0.0009817749962596114, - "loss": 0.0246, + "loss": 0.0249, "macro_f1": 0.3272727429866791, "num_tokens": 2219975.0, "repeat_count": 1.0, - "routers_loss": 0.06541594862937927, + "routers_loss": 0.0651634931564331, "skip_count": 0.0, "step": 1376, "text_loss": 0.5999220609664917 @@ -13089,13 +13089,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.10498046875, + "grad_norm": 0.09912109375, "learning_rate": 0.0009816920995166568, - "loss": 0.0376, + "loss": 0.0371, "macro_f1": 0.6666666865348816, "num_tokens": 2222833.0, "repeat_count": 1.0, - "routers_loss": 0.01156456395983696, + "routers_loss": 0.011408994905650616, "skip_count": 0.0, "step": 1378, "text_loss": 0.5323230624198914 @@ -13108,13 +13108,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2392578125, + "grad_norm": 0.205078125, "learning_rate": 0.0009816090181883807, - "loss": 0.033, + "loss": 0.0313, "macro_f1": 0.32098764181137085, "num_tokens": 2225842.0, "repeat_count": 0.0, - "routers_loss": 0.05175521597266197, + "routers_loss": 0.039720915257930756, "skip_count": 2.0, "step": 1380, "text_loss": 0.23363439738750458 @@ -13127,13 +13127,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.12255859375, "learning_rate": 0.0009815257523066204, - "loss": 0.0251, + "loss": 0.0249, "macro_f1": 0.3333333432674408, "num_tokens": 2229430.0, "repeat_count": 0.0, - "routers_loss": 0.002684591803699732, + "routers_loss": 0.002765297656878829, "skip_count": 0.0, "step": 1382, "text_loss": 0.718977689743042 @@ -13146,13 +13146,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.12890625, + "grad_norm": 0.130859375, "learning_rate": 0.0009814423019032835, - "loss": 0.0397, + "loss": 0.0396, "macro_f1": 0.5492662787437439, "num_tokens": 2232594.0, "repeat_count": 2.0, - "routers_loss": 0.054509978741407394, + "routers_loss": 0.05362323671579361, "skip_count": 0.0, "step": 1384, "text_loss": 0.6392166614532471 @@ -13165,13 +13165,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.150390625, "learning_rate": 0.0009813586670103483, "loss": 0.0426, "macro_f1": 0.6603773832321167, "num_tokens": 2236327.0, "repeat_count": 1.0, - "routers_loss": 0.04031623527407646, + "routers_loss": 0.031728316098451614, "skip_count": 1.0, "step": 1386, "text_loss": 0.5951619148254395 @@ -13184,13 +13184,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1142578125, + "grad_norm": 0.126953125, "learning_rate": 0.0009812748476598638, - "loss": 0.0308, + "loss": 0.031, "macro_f1": 0.5492662787437439, "num_tokens": 2239746.0, "repeat_count": 0.0, - "routers_loss": 0.039687711745500565, + "routers_loss": 0.03981253132224083, "skip_count": 2.0, "step": 1388, "text_loss": 0.22756551206111908 @@ -13203,13 +13203,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.12353515625, + "grad_norm": 0.12451171875, "learning_rate": 0.0009811908438839498, - "loss": 0.0329, + "loss": 0.0331, "macro_f1": 0.5492662787437439, "num_tokens": 2242786.0, "repeat_count": 0.0, - "routers_loss": 0.04785723611712456, + "routers_loss": 0.04617162421345711, "skip_count": 2.0, "step": 1390, "text_loss": 0.3233799934387207 @@ -13222,13 +13222,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1298828125, + "grad_norm": 0.154296875, "learning_rate": 0.000981106655714797, - "loss": 0.0359, + "loss": 0.0358, "macro_f1": 0.3272727429866791, "num_tokens": 2245696.0, "repeat_count": 0.0, - "routers_loss": 0.046765491366386414, + "routers_loss": 0.046828847378492355, "skip_count": 1.0, "step": 1392, "text_loss": 0.24273279309272766 @@ -13241,13 +13241,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0771484375, + "grad_norm": 0.07373046875, "learning_rate": 0.0009810222831846656, - "loss": 0.0303, + "loss": 0.0307, "macro_f1": 0.5492662787437439, "num_tokens": 2249326.0, "repeat_count": 0.0, - "routers_loss": 0.015151665546000004, + "routers_loss": 0.010921589098870754, "skip_count": 2.0, "step": 1394, "text_loss": 0.3921460807323456 @@ -13260,13 +13260,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.09423828125, "learning_rate": 0.0009809377263258882, - "loss": 0.0321, + "loss": 0.0315, "macro_f1": 0.32098767161369324, "num_tokens": 2253393.0, "repeat_count": 0.0, - "routers_loss": 0.04431106895208359, + "routers_loss": 0.04564022272825241, "skip_count": 1.0, "step": 1396, "text_loss": 0.582602858543396 @@ -13279,13 +13279,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09814453125, + "grad_norm": 0.103515625, "learning_rate": 0.000980852985170867, - "loss": 0.0317, + "loss": 0.0328, "macro_f1": 0.3272727429866791, "num_tokens": 2256626.0, "repeat_count": 0.0, - "routers_loss": 0.012700649909675121, + "routers_loss": 0.013289985246956348, "skip_count": 0.0, "step": 1398, "text_loss": 0.41031694412231445 @@ -13298,13 +13298,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1591796875, + "grad_norm": 0.1728515625, "learning_rate": 0.0009807680597520745, - "loss": 0.0256, + "loss": 0.0264, "macro_f1": 0.3333333432674408, "num_tokens": 2259326.0, "repeat_count": 0.0, - "routers_loss": 0.005919010378420353, + "routers_loss": 0.0065213534981012344, "skip_count": 0.0, "step": 1400, "text_loss": 0.2888098657131195 @@ -13317,13 +13317,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2255859375, + "grad_norm": 0.23046875, "learning_rate": 0.0009806829501020546, - "loss": 0.0372, + "loss": 0.0358, "macro_f1": 0.3272727429866791, "num_tokens": 2262344.0, "repeat_count": 0.0, - "routers_loss": 0.04717765748500824, + "routers_loss": 0.04199840500950813, "skip_count": 1.0, "step": 1402, "text_loss": 0.31973034143447876 @@ -13336,13 +13336,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0771484375, + "grad_norm": 0.08935546875, "learning_rate": 0.0009805976562534215, "loss": 0.0317, "macro_f1": 0.6603773832321167, "num_tokens": 2266354.0, "repeat_count": 1.0, - "routers_loss": 0.015415813773870468, + "routers_loss": 0.015434930101037025, "skip_count": 1.0, "step": 1404, "text_loss": 0.508630633354187 @@ -13355,13 +13355,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.140625, "learning_rate": 0.0009805121782388599, "loss": 0.0339, "macro_f1": 0.6533333659172058, "num_tokens": 2269660.0, "repeat_count": 2.0, - "routers_loss": 0.06812979280948639, + "routers_loss": 0.0720924660563469, "skip_count": 2.0, "step": 1406, "text_loss": 0.40927737951278687 @@ -13374,13 +13374,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05908203125, + "grad_norm": 0.0634765625, "learning_rate": 0.0009804265160911253, - "loss": 0.0265, + "loss": 0.0266, "macro_f1": 0.5492662787437439, "num_tokens": 2273335.0, "repeat_count": 0.0, - "routers_loss": 0.025383235886693, + "routers_loss": 0.02400495670735836, "skip_count": 2.0, "step": 1408, "text_loss": 0.1777762621641159 @@ -13393,13 +13393,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1806640625, + "grad_norm": 0.2314453125, "learning_rate": 0.0009803406698430433, - "loss": 0.0367, + "loss": 0.0371, "macro_f1": 0.3272727429866791, "num_tokens": 2277107.0, "repeat_count": 0.0, - "routers_loss": 0.026493225246667862, + "routers_loss": 0.02560107782483101, "skip_count": 1.0, "step": 1410, "text_loss": 0.17955881357192993 @@ -13412,13 +13412,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.07470703125, "learning_rate": 0.0009802546395275104, - "loss": 0.0342, + "loss": 0.0349, "macro_f1": 0.3333333432674408, "num_tokens": 2281638.0, "repeat_count": 0.0, - "routers_loss": 0.006616846192628145, + "routers_loss": 0.006655813194811344, "skip_count": 0.0, "step": 1412, "text_loss": 0.20882295072078705 @@ -13431,32 +13431,32 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.0888671875, + "grad_norm": 0.08740234375, "learning_rate": 0.000980168425177494, - "loss": 0.0328, + "loss": 0.0342, "macro_f1": 0.8200000524520874, "num_tokens": 2284876.0, "repeat_count": 1.0, - "routers_loss": 0.060631848871707916, + "routers_loss": 0.06325097382068634, "skip_count": 3.0, "step": 1414, "text_loss": 0.26035264134407043 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 6.648077487525683, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1494140625, + "grad_norm": 0.138671875, "learning_rate": 0.000980082026826031, - "loss": 0.0317, - "macro_f1": 0.6666666865348816, + "loss": 0.0315, + "macro_f1": 0.3272727429866791, "num_tokens": 2288938.0, "repeat_count": 1.0, - "routers_loss": 0.011199389584362507, + "routers_loss": 0.013436575420200825, "skip_count": 0.0, "step": 1416, "text_loss": 0.5502325892448425 @@ -13469,13 +13469,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.07177734375, "learning_rate": 0.0009799954445062296, - "loss": 0.0192, + "loss": 0.0193, "macro_f1": 0.6603773832321167, "num_tokens": 2292317.0, "repeat_count": 1.0, - "routers_loss": 0.01120354700833559, + "routers_loss": 0.011264479719102383, "skip_count": 1.0, "step": 1418, "text_loss": 0.48075684905052185 @@ -13488,13 +13488,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.16796875, + "grad_norm": 0.1611328125, "learning_rate": 0.0009799086782512686, - "loss": 0.0294, + "loss": 0.0292, "macro_f1": 0.5492662787437439, "num_tokens": 2295935.0, "repeat_count": 0.0, - "routers_loss": 0.030204148963093758, + "routers_loss": 0.02833271212875843, "skip_count": 2.0, "step": 1420, "text_loss": 0.18221206963062286 @@ -13507,13 +13507,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0888671875, + "grad_norm": 0.09375, "learning_rate": 0.0009798217280943967, - "loss": 0.0348, + "loss": 0.0356, "macro_f1": 0.6666666865348816, "num_tokens": 2298927.0, "repeat_count": 0.0, - "routers_loss": 0.008244800381362438, + "routers_loss": 0.009208574891090393, "skip_count": 1.0, "step": 1422, "text_loss": 0.48686322569847107 @@ -13526,32 +13526,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09716796875, + "grad_norm": 0.09423828125, "learning_rate": 0.0009797345940689335, - "loss": 0.0269, + "loss": 0.0267, "macro_f1": 0.3272727429866791, "num_tokens": 2301541.0, "repeat_count": 0.0, - "routers_loss": 0.015340043231844902, + "routers_loss": 0.015011847950518131, "skip_count": 0.0, "step": 1424, "text_loss": 0.49446266889572144 }, { "acc_repeat": 0.0, - "acc_skip": 0.6000000238418579, - "avg_layers": 25.0, + "acc_skip": 0.4000000059604645, + "avg_layers": 26.0, "epoch": 6.695039624302906, - "f1_execute": 0.9583333134651184, + "f1_execute": 0.9387754797935486, "f1_repeat": 0.0, - "f1_skip": 0.75, - "grad_norm": 0.1318359375, + "f1_skip": 0.5714285969734192, + "grad_norm": 0.1337890625, "learning_rate": 0.0009796472762082687, - "loss": 0.0341, - "macro_f1": 0.5694444179534912, + "loss": 0.0338, + "macro_f1": 0.5034013986587524, "num_tokens": 2304589.0, "repeat_count": 0.0, - "routers_loss": 0.058681465685367584, + "routers_loss": 0.05912091210484505, "skip_count": 5.0, "step": 1426, "text_loss": 0.23945684731006622 @@ -13564,32 +13564,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.095703125, + "grad_norm": 0.09765625, "learning_rate": 0.000979559774545863, - "loss": 0.0423, + "loss": 0.0405, "macro_f1": 0.3272727429866791, "num_tokens": 2307860.0, "repeat_count": 0.0, - "routers_loss": 0.020810559391975403, + "routers_loss": 0.021242303773760796, "skip_count": 1.0, "step": 1428, "text_loss": 0.531273365020752 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 6.713824479013795, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.09033203125, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, "learning_rate": 0.000979472089115247, - "loss": 0.0268, - "macro_f1": 0.5492662787437439, + "loss": 0.0276, + "macro_f1": 0.32098764181137085, "num_tokens": 2311581.0, "repeat_count": 0.0, - "routers_loss": 0.030001837760210037, + "routers_loss": 0.02768544852733612, "skip_count": 2.0, "step": 1430, "text_loss": 0.2497459501028061 @@ -13602,13 +13602,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1318359375, + "grad_norm": 0.12255859375, "learning_rate": 0.000979384219950022, - "loss": 0.034, + "loss": 0.0346, "macro_f1": 0.3333333432674408, "num_tokens": 2314639.0, "repeat_count": 0.0, - "routers_loss": 0.010381575673818588, + "routers_loss": 0.008678150363266468, "skip_count": 0.0, "step": 1432, "text_loss": 0.6579355001449585 @@ -13621,32 +13621,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.08056640625, "learning_rate": 0.0009792961670838595, - "loss": 0.0365, + "loss": 0.0362, "macro_f1": 0.3272727429866791, "num_tokens": 2317927.0, "repeat_count": 1.0, - "routers_loss": 0.03234704211354256, + "routers_loss": 0.03325597569346428, "skip_count": 0.0, "step": 1434, "text_loss": 0.5209436416625977 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 6.742001761080129, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.1494140625, "learning_rate": 0.0009792079305505016, - "loss": 0.0303, - "macro_f1": 0.6666666865348816, + "loss": 0.0306, + "macro_f1": 0.3272727429866791, "num_tokens": 2321065.0, "repeat_count": 1.0, - "routers_loss": 0.015481291338801384, + "routers_loss": 0.019228918477892876, "skip_count": 0.0, "step": 1436, "text_loss": 0.41087067127227783 @@ -13659,13 +13659,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1103515625, + "grad_norm": 0.10986328125, "learning_rate": 0.000979119510383761, - "loss": 0.0366, + "loss": 0.0371, "macro_f1": 0.3333333432674408, "num_tokens": 2323714.0, "repeat_count": 0.0, - "routers_loss": 0.018170451745390892, + "routers_loss": 0.017071325331926346, "skip_count": 0.0, "step": 1438, "text_loss": 0.21490029990673065 @@ -13678,13 +13678,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.162109375, + "grad_norm": 0.2060546875, "learning_rate": 0.00097903090661752, - "loss": 0.0306, + "loss": 0.0309, "macro_f1": 0.3333333432674408, "num_tokens": 2326454.0, "repeat_count": 0.0, - "routers_loss": 0.010385681875050068, + "routers_loss": 0.00991755723953247, "skip_count": 0.0, "step": 1440, "text_loss": 0.23847346007823944 @@ -13697,13 +13697,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.232421875, "learning_rate": 0.000978942119285732, - "loss": 0.0407, + "loss": 0.0404, "macro_f1": 0.3272727429866791, "num_tokens": 2329462.0, "repeat_count": 0.0, - "routers_loss": 0.04976538568735123, + "routers_loss": 0.04908733069896698, "skip_count": 1.0, "step": 1442, "text_loss": 0.23343028128147125 @@ -13716,13 +13716,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.091796875, + "grad_norm": 0.1044921875, "learning_rate": 0.0009788531484224204, - "loss": 0.0255, + "loss": 0.0264, "macro_f1": 0.3333333432674408, "num_tokens": 2332146.0, "repeat_count": 0.0, - "routers_loss": 0.0030266831163316965, + "routers_loss": 0.0032628148328512907, "skip_count": 0.0, "step": 1444, "text_loss": 0.47423800826072693 @@ -13730,18 +13730,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 6.788963897857353, - "f1_execute": 0.9600000381469727, - "f1_repeat": 1.0, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, - "grad_norm": 0.107421875, + "grad_norm": 0.10693359375, "learning_rate": 0.0009787639940616788, - "loss": 0.0411, - "macro_f1": 0.8200000524520874, + "loss": 0.0405, + "macro_f1": 0.7018141150474548, "num_tokens": 2335738.0, "repeat_count": 1.0, - "routers_loss": 0.13420957326889038, + "routers_loss": 0.14336998760700226, "skip_count": 3.0, "step": 1446, "text_loss": 0.21837592124938965 @@ -13754,13 +13754,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1953125, + "grad_norm": 0.189453125, "learning_rate": 0.0009786746562376717, - "loss": 0.0251, + "loss": 0.0241, "macro_f1": 0.6666666865348816, "num_tokens": 2338488.0, "repeat_count": 0.0, - "routers_loss": 0.012779864482581615, + "routers_loss": 0.010542908683419228, "skip_count": 1.0, "step": 1448, "text_loss": 1.0614757537841797 @@ -13773,13 +13773,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1728515625, "learning_rate": 0.0009785851349846334, - "loss": 0.0266, + "loss": 0.0268, "macro_f1": 0.3333333432674408, "num_tokens": 2342074.0, "repeat_count": 0.0, - "routers_loss": 0.005545398220419884, + "routers_loss": 0.005998016335070133, "skip_count": 0.0, "step": 1450, "text_loss": 0.4269719421863556 @@ -13792,13 +13792,13 @@ "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.09814453125, + "grad_norm": 0.1083984375, "learning_rate": 0.0009784954303368686, - "loss": 0.0395, + "loss": 0.0384, "macro_f1": 0.44705885648727417, "num_tokens": 2345838.0, "repeat_count": 0.0, - "routers_loss": 0.0899835154414177, + "routers_loss": 0.0959126204252243, "skip_count": 3.0, "step": 1452, "text_loss": 0.3315916955471039 @@ -13811,13 +13811,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09716796875, + "grad_norm": 0.1005859375, "learning_rate": 0.0009784055423287521, "loss": 0.0218, "macro_f1": 0.3333333432674408, "num_tokens": 2348939.0, "repeat_count": 0.0, - "routers_loss": 0.002738836221396923, + "routers_loss": 0.0025467623490840197, "skip_count": 0.0, "step": 1454, "text_loss": 0.6162732839584351 @@ -13830,13 +13830,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12060546875, + "grad_norm": 0.115234375, "learning_rate": 0.0009783154709947293, - "loss": 0.0266, + "loss": 0.0256, "macro_f1": 0.3272727429866791, "num_tokens": 2352232.0, "repeat_count": 0.0, - "routers_loss": 0.020522192120552063, + "routers_loss": 0.01860538125038147, "skip_count": 1.0, "step": 1456, "text_loss": 0.23928768932819366 @@ -13844,18 +13844,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 6.84531846199002, - "f1_execute": 0.9629629850387573, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.09912109375, "learning_rate": 0.0009782252163693158, - "loss": 0.0197, - "macro_f1": 0.32098767161369324, + "loss": 0.0201, + "macro_f1": 0.3272727429866791, "num_tokens": 2355159.0, "repeat_count": 0.0, - "routers_loss": 0.04245268926024437, + "routers_loss": 0.04412713274359703, "skip_count": 1.0, "step": 1458, "text_loss": 0.3371323347091675 @@ -13868,13 +13868,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.224609375, + "grad_norm": 0.21484375, "learning_rate": 0.0009781347784870973, - "loss": 0.0376, + "loss": 0.0379, "macro_f1": 0.3333333432674408, "num_tokens": 2358175.0, "repeat_count": 0.0, - "routers_loss": 0.009142685681581497, + "routers_loss": 0.006809141952544451, "skip_count": 0.0, "step": 1460, "text_loss": 0.547267735004425 @@ -13887,13 +13887,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.095703125, "learning_rate": 0.0009780441573827296, - "loss": 0.0295, + "loss": 0.03, "macro_f1": 0.3076923191547394, "num_tokens": 2360991.0, "repeat_count": 0.0, - "routers_loss": 0.08038893342018127, + "routers_loss": 0.08924390375614166, "skip_count": 4.0, "step": 1462, "text_loss": 0.7026563882827759 @@ -13906,13 +13906,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.1865234375, "learning_rate": 0.000977953353090939, - "loss": 0.027, + "loss": 0.0272, "macro_f1": 0.3333333432674408, "num_tokens": 2363894.0, "repeat_count": 0.0, - "routers_loss": 0.02107175625860691, + "routers_loss": 0.021858472377061844, "skip_count": 0.0, "step": 1464, "text_loss": 0.2718065083026886 @@ -13925,13 +13925,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.11474609375, "learning_rate": 0.0009778623656465219, - "loss": 0.0349, + "loss": 0.0338, "macro_f1": 0.32098764181137085, "num_tokens": 2367265.0, "repeat_count": 0.0, - "routers_loss": 0.042030055075883865, + "routers_loss": 0.044781096279621124, "skip_count": 0.0, "step": 1466, "text_loss": 0.5008095502853394 @@ -13944,13 +13944,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07470703125, + "grad_norm": 0.06689453125, "learning_rate": 0.0009777711950843448, - "loss": 0.022, + "loss": 0.0212, "macro_f1": 0.3333333432674408, "num_tokens": 2370186.0, "repeat_count": 0.0, - "routers_loss": 0.004230673424899578, + "routers_loss": 0.0040459707379341125, "skip_count": 0.0, "step": 1468, "text_loss": 0.5242461562156677 @@ -13963,13 +13963,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.134765625, "learning_rate": 0.0009776798414393446, - "loss": 0.0284, + "loss": 0.0279, "macro_f1": 0.6598639488220215, "num_tokens": 2373314.0, "repeat_count": 1.0, - "routers_loss": 0.06986775249242783, + "routers_loss": 0.0708528608083725, "skip_count": 3.0, "step": 1470, "text_loss": 0.2821732461452484 @@ -13982,13 +13982,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.142578125, + "grad_norm": 0.1328125, "learning_rate": 0.0009775883047465279, - "loss": 0.0431, + "loss": 0.0414, "macro_f1": 0.31446540355682373, "num_tokens": 2376435.0, "repeat_count": 1.0, - "routers_loss": 0.0439564548432827, + "routers_loss": 0.0290578193962574, "skip_count": 1.0, "step": 1472, "text_loss": 0.8438440561294556 @@ -14001,13 +14001,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1123046875, + "grad_norm": 0.10546875, "learning_rate": 0.000977496585040972, - "loss": 0.0376, + "loss": 0.0373, "macro_f1": 0.3333333432674408, "num_tokens": 2380244.0, "repeat_count": 0.0, - "routers_loss": 0.011889892630279064, + "routers_loss": 0.010360375046730042, "skip_count": 0.0, "step": 1474, "text_loss": 0.4356135427951813 @@ -14020,13 +14020,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1015625, + "grad_norm": 0.09912109375, "learning_rate": 0.000977404682357824, - "loss": 0.0295, + "loss": 0.0294, "macro_f1": 0.3272727429866791, "num_tokens": 2383498.0, "repeat_count": 0.0, - "routers_loss": 0.022536326199769974, + "routers_loss": 0.023518972098827362, "skip_count": 0.0, "step": 1476, "text_loss": 0.25195425748825073 @@ -14039,13 +14039,13 @@ "f1_execute": 0.9743589162826538, "f1_repeat": 0.888888955116272, "f1_skip": 1.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.11181640625, "learning_rate": 0.000977312596732301, - "loss": 0.0388, + "loss": 0.0375, "macro_f1": 0.9544159770011902, "num_tokens": 2386414.0, "repeat_count": 5.0, - "routers_loss": 0.07959948480129242, + "routers_loss": 0.08190606534481049, "skip_count": 4.0, "step": 1478, "text_loss": 0.6586798429489136 @@ -14058,13 +14058,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.095703125, + "grad_norm": 0.10546875, "learning_rate": 0.0009772203281996905, - "loss": 0.0341, + "loss": 0.0336, "macro_f1": 1.0, "num_tokens": 2389399.0, "repeat_count": 1.0, - "routers_loss": 0.019112225621938705, + "routers_loss": 0.016441475600004196, "skip_count": 2.0, "step": 1480, "text_loss": 0.3671986758708954 @@ -14077,13 +14077,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0888671875, + "grad_norm": 0.09814453125, "learning_rate": 0.0009771278767953502, - "loss": 0.0345, + "loss": 0.0357, "macro_f1": 0.3333333432674408, "num_tokens": 2392400.0, "repeat_count": 0.0, - "routers_loss": 0.018750866875052452, + "routers_loss": 0.019211363047361374, "skip_count": 0.0, "step": 1482, "text_loss": 0.27418580651283264 @@ -14096,32 +14096,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09228515625, + "grad_norm": 0.0947265625, "learning_rate": 0.0009770352425547072, - "loss": 0.0291, + "loss": 0.0292, "macro_f1": 0.3333333432674408, "num_tokens": 2395123.0, "repeat_count": 0.0, - "routers_loss": 0.015407348051667213, + "routers_loss": 0.015800386667251587, "skip_count": 0.0, "step": 1484, "text_loss": 0.19896622002124786 }, { - "acc_repeat": 0.6666666865348816, + "acc_repeat": 0.3333333432674408, "acc_skip": 0.0, - "avg_layers": 30.0, + "avg_layers": 29.0, "epoch": 6.976812444966246, - "f1_execute": 0.9803921580314636, - "f1_repeat": 0.800000011920929, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.5, "f1_skip": 0.0, - "grad_norm": 0.11474609375, + "grad_norm": 0.12890625, "learning_rate": 0.0009769424255132596, - "loss": 0.0258, - "macro_f1": 0.5934640765190125, + "loss": 0.0256, + "macro_f1": 0.4871794879436493, "num_tokens": 2397359.0, "repeat_count": 3.0, - "routers_loss": 0.06514479219913483, + "routers_loss": 0.06670158356428146, "skip_count": 0.0, "step": 1486, "text_loss": 0.4229799509048462 @@ -14134,13 +14134,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.111328125, + "grad_norm": 0.1162109375, "learning_rate": 0.0009768494257065747, - "loss": 0.0217, + "loss": 0.0218, "macro_f1": 0.3272727429866791, "num_tokens": 2400387.0, "repeat_count": 0.0, - "routers_loss": 0.013567833229899406, + "routers_loss": 0.011144762858748436, "skip_count": 1.0, "step": 1488, "text_loss": 0.4264226257801056 @@ -14153,13 +14153,13 @@ "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12255859375, + "grad_norm": 0.12353515625, "learning_rate": 0.0009767562431702904, - "loss": 0.0389, + "loss": 0.0387, "macro_f1": 0.3006536364555359, "num_tokens": 2403241.0, "repeat_count": 2.0, - "routers_loss": 0.13762018084526062, + "routers_loss": 0.12339717149734497, "skip_count": 3.0, "step": 1490, "text_loss": 0.2850193977355957 @@ -14172,13 +14172,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.061767578125, + "grad_norm": 0.07177734375, "learning_rate": 0.0009766628779401142, - "loss": 0.0214, + "loss": 0.0215, "macro_f1": 0.6666666865348816, "num_tokens": 2406087.0, "repeat_count": 0.0, - "routers_loss": 0.008640666492283344, + "routers_loss": 0.008174685761332512, "skip_count": 1.0, "step": 1492, "text_loss": 0.6756544709205627 @@ -14191,13 +14191,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05712890625, + "grad_norm": 0.0673828125, "learning_rate": 0.000976569330051824, - "loss": 0.0182, + "loss": 0.0186, "macro_f1": 0.3333333432674408, "num_tokens": 2409312.0, "repeat_count": 0.0, - "routers_loss": 0.0018257038900628686, + "routers_loss": 0.0021256296895444393, "skip_count": 0.0, "step": 1494, "text_loss": 0.4789894223213196 @@ -14210,13 +14210,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.053955078125, "learning_rate": 0.0009764755995412677, "loss": 0.0193, "macro_f1": 0.3333333432674408, "num_tokens": 2412758.0, "repeat_count": 0.0, - "routers_loss": 0.003656312357634306, + "routers_loss": 0.003944927826523781, "skip_count": 0.0, "step": 1496, "text_loss": 0.5157490968704224 @@ -14229,13 +14229,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1005859375, + "grad_norm": 0.09228515625, "learning_rate": 0.0009763816864443627, - "loss": 0.0246, + "loss": 0.0239, "macro_f1": 0.3272727429866791, "num_tokens": 2416079.0, "repeat_count": 1.0, - "routers_loss": 0.044268425554037094, + "routers_loss": 0.03893325850367546, "skip_count": 0.0, "step": 1498, "text_loss": 0.28045418858528137 @@ -14248,13 +14248,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.1279296875, "learning_rate": 0.0009762875907970968, - "loss": 0.0207, + "loss": 0.0199, "macro_f1": 0.3333333432674408, "num_tokens": 2420340.0, "repeat_count": 0.0, - "routers_loss": 0.0018966116476804018, + "routers_loss": 0.0017725443467497826, "skip_count": 0.0, "step": 1500, "text_loss": 0.35550856590270996 @@ -14267,32 +14267,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.06298828125, "learning_rate": 0.0009761933126355277, - "loss": 0.0249, + "loss": 0.0245, "macro_f1": 0.3272727429866791, "num_tokens": 2424735.0, "repeat_count": 0.0, - "routers_loss": 0.01729201152920723, + "routers_loss": 0.01393749937415123, "skip_count": 1.0, "step": 1502, "text_loss": 0.38840189576148987 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 7.06105077781039, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.11962890625, + "f1_skip": 1.0, + "grad_norm": 0.1630859375, "learning_rate": 0.0009760988519957828, - "loss": 0.0248, - "macro_f1": 0.5492662787437439, + "loss": 0.0249, + "macro_f1": 0.6666666865348816, "num_tokens": 2428132.0, "repeat_count": 0.0, - "routers_loss": 0.01693531684577465, + "routers_loss": 0.01687910407781601, "skip_count": 2.0, "step": 1504, "text_loss": 0.3031681478023529 @@ -14305,13 +14305,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.064453125, "learning_rate": 0.0009760042089140598, - "loss": 0.0197, + "loss": 0.0193, "macro_f1": 0.3144654333591461, "num_tokens": 2431592.0, "repeat_count": 1.0, - "routers_loss": 0.04939094930887222, + "routers_loss": 0.04704280197620392, "skip_count": 2.0, "step": 1506, "text_loss": 0.16355200111865997 @@ -14324,13 +14324,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.0986328125, "learning_rate": 0.0009759093834266259, - "loss": 0.0213, + "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 2434236.0, "repeat_count": 0.0, - "routers_loss": 0.0016892930725589395, + "routers_loss": 0.0016075772000476718, "skip_count": 0.0, "step": 1508, "text_loss": 0.6080073118209839 @@ -14343,13 +14343,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10009765625, + "grad_norm": 0.1025390625, "learning_rate": 0.0009758143755698186, - "loss": 0.0147, + "loss": 0.015, "macro_f1": 0.3333333432674408, "num_tokens": 2437170.0, "repeat_count": 0.0, - "routers_loss": 0.008671467192471027, + "routers_loss": 0.008451299741864204, "skip_count": 0.0, "step": 1510, "text_loss": 0.22100484371185303 @@ -14362,13 +14362,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.068359375, + "grad_norm": 0.06689453125, "learning_rate": 0.0009757191853800449, - "loss": 0.0228, + "loss": 0.0227, "macro_f1": 0.5866667032241821, "num_tokens": 2441187.0, "repeat_count": 1.0, - "routers_loss": 0.042682576924562454, + "routers_loss": 0.046565692871809006, "skip_count": 3.0, "step": 1512, "text_loss": 0.25098952651023865 @@ -14381,13 +14381,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.11279296875, "learning_rate": 0.000975623812893782, - "loss": 0.028, + "loss": 0.0276, "macro_f1": 0.3272727429866791, "num_tokens": 2444664.0, "repeat_count": 0.0, - "routers_loss": 0.02905822917819023, + "routers_loss": 0.02872578240931034, "skip_count": 1.0, "step": 1514, "text_loss": 0.4952253997325897 @@ -14400,13 +14400,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09326171875, + "grad_norm": 0.1142578125, "learning_rate": 0.0009755282581475768, - "loss": 0.0223, + "loss": 0.0233, "macro_f1": 0.3333333432674408, "num_tokens": 2447748.0, "repeat_count": 0.0, - "routers_loss": 0.0018810008186846972, + "routers_loss": 0.002055214950814843, "skip_count": 0.0, "step": 1516, "text_loss": 0.7465500831604004 @@ -14419,13 +14419,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10693359375, + "grad_norm": 0.10302734375, "learning_rate": 0.000975432521178046, - "loss": 0.0219, + "loss": 0.0216, "macro_f1": 0.3272727429866791, "num_tokens": 2450834.0, "repeat_count": 1.0, - "routers_loss": 0.04308714717626572, + "routers_loss": 0.04498551785945892, "skip_count": 0.0, "step": 1518, "text_loss": 0.28144413232803345 @@ -14438,13 +14438,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.095703125, + "grad_norm": 0.09912109375, "learning_rate": 0.0009753366020218763, - "loss": 0.0232, + "loss": 0.0234, "macro_f1": 0.3333333432674408, "num_tokens": 2454233.0, "repeat_count": 0.0, - "routers_loss": 0.003754811594262719, + "routers_loss": 0.003669742727652192, "skip_count": 0.0, "step": 1520, "text_loss": 0.5667551755905151 @@ -14457,32 +14457,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08837890625, + "grad_norm": 0.0830078125, "learning_rate": 0.0009752405007158238, - "loss": 0.0246, + "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2457331.0, "repeat_count": 0.0, - "routers_loss": 0.010853761807084084, + "routers_loss": 0.010455607436597347, "skip_count": 0.0, "step": 1522, "text_loss": 0.19575810432434082 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.5, "acc_skip": 1.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 7.154975051364837, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.0771484375, + "grad_norm": 0.0751953125, "learning_rate": 0.0009751442172967151, - "loss": 0.0196, - "macro_f1": 1.0, + "loss": 0.0193, + "macro_f1": 0.8823530077934265, "num_tokens": 2459935.0, "repeat_count": 2.0, - "routers_loss": 0.015100379474461079, + "routers_loss": 0.025189083069562912, "skip_count": 1.0, "step": 1524, "text_loss": 0.45453405380249023 @@ -14495,13 +14495,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08203125, + "grad_norm": 0.0927734375, "learning_rate": 0.000975047751801446, - "loss": 0.0189, + "loss": 0.0187, "macro_f1": 0.3272727429866791, "num_tokens": 2463008.0, "repeat_count": 0.0, - "routers_loss": 0.011991916224360466, + "routers_loss": 0.012297490611672401, "skip_count": 0.0, "step": 1526, "text_loss": 0.31437572836875916 @@ -14514,32 +14514,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.1044921875, "learning_rate": 0.0009749511042669823, - "loss": 0.0226, + "loss": 0.0233, "macro_f1": 0.3333333432674408, "num_tokens": 2466475.0, "repeat_count": 0.0, - "routers_loss": 0.008201062679290771, + "routers_loss": 0.011026266030967236, "skip_count": 0.0, "step": 1528, "text_loss": 0.46604859828948975 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 7.183152333431171, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.1181640625, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, "learning_rate": 0.0009748542747303595, - "loss": 0.0174, - "macro_f1": 0.6666666865348816, + "loss": 0.0182, + "macro_f1": 0.3272727429866791, "num_tokens": 2469320.0, "repeat_count": 0.0, - "routers_loss": 0.008513177745044231, + "routers_loss": 0.011934996582567692, "skip_count": 1.0, "step": 1530, "text_loss": 0.7764923572540283 @@ -14552,13 +14552,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.091796875, + "grad_norm": 0.0966796875, "learning_rate": 0.0009747572632286827, - "loss": 0.02, + "loss": 0.0203, "macro_f1": 0.3333333432674408, "num_tokens": 2472468.0, "repeat_count": 0.0, - "routers_loss": 0.004850955214351416, + "routers_loss": 0.005786920432001352, "skip_count": 0.0, "step": 1532, "text_loss": 0.3555782437324524 @@ -14571,32 +14571,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.087890625, + "grad_norm": 0.0849609375, "learning_rate": 0.0009746600697991271, - "loss": 0.0206, + "loss": 0.02, "macro_f1": 0.6666666865348816, "num_tokens": 2475736.0, "repeat_count": 1.0, - "routers_loss": 0.0027650354895740747, + "routers_loss": 0.0026990731712430716, "skip_count": 0.0, "step": 1534, "text_loss": 0.49561792612075806 }, { "acc_repeat": 1.0, - "acc_skip": 0.0, - "avg_layers": 29.0, + "acc_skip": 0.5, + "avg_layers": 28.0, "epoch": 7.2113296154975055, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, - "f1_skip": 0.0, - "grad_norm": 0.0615234375, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0556640625, "learning_rate": 0.0009745626944789375, - "loss": 0.0209, - "macro_f1": 0.6538461446762085, + "loss": 0.0204, + "macro_f1": 0.8823530077934265, "num_tokens": 2478887.0, "repeat_count": 1.0, - "routers_loss": 0.023268593475222588, + "routers_loss": 0.020221207290887833, "skip_count": 2.0, "step": 1536, "text_loss": 0.5375416278839111 @@ -14609,13 +14609,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11669921875, + "grad_norm": 0.12158203125, "learning_rate": 0.0009744651373054279, "loss": 0.0286, "macro_f1": 0.3272727429866791, "num_tokens": 2481293.0, "repeat_count": 0.0, - "routers_loss": 0.031235001981258392, + "routers_loss": 0.03131086751818657, "skip_count": 1.0, "step": 1538, "text_loss": 0.5241039395332336 @@ -14628,13 +14628,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.080078125, + "grad_norm": 0.08984375, "learning_rate": 0.0009743673983159828, - "loss": 0.023, + "loss": 0.0241, "macro_f1": 0.6122449040412903, "num_tokens": 2484403.0, "repeat_count": 0.0, - "routers_loss": 0.042398080229759216, + "routers_loss": 0.04448170214891434, "skip_count": 4.0, "step": 1540, "text_loss": 0.7465724349021912 @@ -14647,13 +14647,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.099609375, + "grad_norm": 0.08935546875, "learning_rate": 0.0009742694775480557, - "loss": 0.0268, + "loss": 0.0265, "macro_f1": 0.6666666865348816, "num_tokens": 2487952.0, "repeat_count": 0.0, - "routers_loss": 0.007361465133726597, + "routers_loss": 0.007171491626650095, "skip_count": 1.0, "step": 1542, "text_loss": 0.2877117097377777 @@ -14666,13 +14666,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.07275390625, "learning_rate": 0.0009741713750391703, - "loss": 0.0166, + "loss": 0.0171, "macro_f1": 0.6666666865348816, "num_tokens": 2490815.0, "repeat_count": 1.0, - "routers_loss": 0.0052334014326334, + "routers_loss": 0.004559285007417202, "skip_count": 0.0, "step": 1544, "text_loss": 0.6097800135612488 @@ -14685,13 +14685,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.06787109375, "learning_rate": 0.0009740730908269193, "loss": 0.0174, "macro_f1": 0.3333333432674408, "num_tokens": 2494727.0, "repeat_count": 0.0, - "routers_loss": 0.004993532784283161, + "routers_loss": 0.005271553061902523, "skip_count": 0.0, "step": 1546, "text_loss": 0.5431114435195923 @@ -14704,13 +14704,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0703125, "learning_rate": 0.0009739746249489658, - "loss": 0.0248, + "loss": 0.0239, "macro_f1": 0.3333333432674408, "num_tokens": 2499266.0, "repeat_count": 0.0, - "routers_loss": 0.001611889572814107, + "routers_loss": 0.0015409323386847973, "skip_count": 0.0, "step": 1548, "text_loss": 0.4702678322792053 @@ -14723,13 +14723,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11865234375, + "grad_norm": 0.1171875, "learning_rate": 0.0009738759774430417, - "loss": 0.0209, + "loss": 0.0216, "macro_f1": 0.32098764181137085, "num_tokens": 2502273.0, "repeat_count": 1.0, - "routers_loss": 0.03059260919690132, + "routers_loss": 0.030183158814907074, "skip_count": 1.0, "step": 1550, "text_loss": 0.3239189088344574 @@ -14742,32 +14742,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056396484375, + "grad_norm": 0.0498046875, "learning_rate": 0.0009737771483469493, - "loss": 0.0195, + "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 2507624.0, "repeat_count": 0.0, - "routers_loss": 0.00508903618901968, + "routers_loss": 0.005410848651081324, "skip_count": 0.0, "step": 1552, "text_loss": 0.4014642834663391 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 7.295861461696507, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, + "f1_skip": 1.0, "grad_norm": 0.07763671875, "learning_rate": 0.0009736781376985598, - "loss": 0.0174, - "macro_f1": 0.3272727429866791, + "loss": 0.0168, + "macro_f1": 0.6666666865348816, "num_tokens": 2510366.0, "repeat_count": 0.0, - "routers_loss": 0.007860450074076653, + "routers_loss": 0.0066976165398955345, "skip_count": 1.0, "step": 1554, "text_loss": 0.5924848914146423 @@ -14780,13 +14780,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11669921875, + "grad_norm": 0.13671875, "learning_rate": 0.0009735789455358144, - "loss": 0.0217, + "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 2513317.0, "repeat_count": 0.0, - "routers_loss": 0.0027370608877390623, + "routers_loss": 0.002763477386906743, "skip_count": 0.0, "step": 1556, "text_loss": 0.3222943842411041 @@ -14799,13 +14799,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10302734375, + "grad_norm": 0.11767578125, "learning_rate": 0.0009734795718967237, - "loss": 0.0276, + "loss": 0.0283, "macro_f1": 0.32098764181137085, "num_tokens": 2516628.0, "repeat_count": 0.0, - "routers_loss": 0.061584725975990295, + "routers_loss": 0.061566028743982315, "skip_count": 2.0, "step": 1558, "text_loss": 0.3249334692955017 @@ -14818,13 +14818,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.10693359375, + "grad_norm": 0.095703125, "learning_rate": 0.0009733800168193679, "loss": 0.0228, "macro_f1": 1.0, "num_tokens": 2519424.0, "repeat_count": 2.0, - "routers_loss": 0.01694316789507866, + "routers_loss": 0.017976421862840652, "skip_count": 4.0, "step": 1560, "text_loss": 0.3341919481754303 @@ -14837,13 +14837,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1826171875, "learning_rate": 0.0009732802803418966, - "loss": 0.0234, + "loss": 0.023, "macro_f1": 0.3333333432674408, "num_tokens": 2522922.0, "repeat_count": 0.0, - "routers_loss": 0.0023331891279667616, + "routers_loss": 0.002525332849472761, "skip_count": 0.0, "step": 1562, "text_loss": 0.3176332712173462 @@ -14856,13 +14856,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.07861328125, "learning_rate": 0.0009731803625025292, - "loss": 0.0203, + "loss": 0.0196, "macro_f1": 0.3272727429866791, "num_tokens": 2525811.0, "repeat_count": 0.0, - "routers_loss": 0.021300682798027992, + "routers_loss": 0.015524424612522125, "skip_count": 1.0, "step": 1564, "text_loss": 0.532774031162262 @@ -14875,13 +14875,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.10205078125, "learning_rate": 0.0009730802633395541, - "loss": 0.026, + "loss": 0.0257, "macro_f1": 0.6603773832321167, "num_tokens": 2529157.0, "repeat_count": 1.0, - "routers_loss": 0.08335043489933014, + "routers_loss": 0.08138631284236908, "skip_count": 1.0, "step": 1566, "text_loss": 0.529487133026123 @@ -14894,13 +14894,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.07666015625, "learning_rate": 0.0009729799828913298, - "loss": 0.0224, + "loss": 0.0223, "macro_f1": 0.3333333432674408, "num_tokens": 2532249.0, "repeat_count": 0.0, - "routers_loss": 0.003535634372383356, + "routers_loss": 0.0035867292899638414, "skip_count": 0.0, "step": 1568, "text_loss": 0.503160297870636 @@ -14913,13 +14913,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.06298828125, + "grad_norm": 0.06884765625, "learning_rate": 0.0009728795211962838, "loss": 0.0259, "macro_f1": 0.5492662787437439, "num_tokens": 2535904.0, "repeat_count": 0.0, - "routers_loss": 0.025729363784193993, + "routers_loss": 0.02987455204129219, "skip_count": 2.0, "step": 1570, "text_loss": 0.9170270562171936 @@ -14932,13 +14932,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1357421875, + "grad_norm": 0.11865234375, "learning_rate": 0.0009727788782929131, - "loss": 0.0287, + "loss": 0.0273, "macro_f1": 0.3272727429866791, "num_tokens": 2538943.0, "repeat_count": 1.0, - "routers_loss": 0.059166863560676575, + "routers_loss": 0.04676021635532379, "skip_count": 0.0, "step": 1572, "text_loss": 0.29146310687065125 @@ -14951,13 +14951,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.0654296875, "learning_rate": 0.0009726780542197844, - "loss": 0.0173, + "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 2541805.0, "repeat_count": 0.0, - "routers_loss": 0.002580022206529975, + "routers_loss": 0.002127803163602948, "skip_count": 0.0, "step": 1574, "text_loss": 1.0126502513885498 @@ -14970,13 +14970,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.138671875, + "grad_norm": 0.142578125, "learning_rate": 0.0009725770490155338, - "loss": 0.0257, + "loss": 0.0262, "macro_f1": 0.3333333432674408, "num_tokens": 2546213.0, "repeat_count": 0.0, - "routers_loss": 0.007746981456875801, + "routers_loss": 0.007609677035361528, "skip_count": 0.0, "step": 1576, "text_loss": 0.190168559551239 @@ -14989,13 +14989,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.083984375, "learning_rate": 0.0009724758627188665, - "loss": 0.0344, + "loss": 0.0356, "macro_f1": 0.3272727429866791, "num_tokens": 2549554.0, "repeat_count": 0.0, - "routers_loss": 0.027308562770485878, + "routers_loss": 0.033554721623659134, "skip_count": 1.0, "step": 1578, "text_loss": 0.2977406084537506 @@ -15008,13 +15008,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.14453125, + "grad_norm": 0.140625, "learning_rate": 0.0009723744953685572, - "loss": 0.0277, + "loss": 0.028, "macro_f1": 0.3272727429866791, "num_tokens": 2552785.0, "repeat_count": 1.0, - "routers_loss": 0.029863199219107628, + "routers_loss": 0.027864238247275352, "skip_count": 0.0, "step": 1580, "text_loss": 0.2700682580471039 @@ -15027,13 +15027,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.19921875, "learning_rate": 0.0009722729470034503, - "loss": 0.0218, + "loss": 0.0224, "macro_f1": 0.3333333432674408, "num_tokens": 2556550.0, "repeat_count": 0.0, - "routers_loss": 0.004019706044346094, + "routers_loss": 0.004798175301402807, "skip_count": 0.0, "step": 1582, "text_loss": 0.6559903025627136 @@ -15046,32 +15046,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07177734375, + "grad_norm": 0.078125, "learning_rate": 0.0009721712176624591, - "loss": 0.0239, + "loss": 0.0242, "macro_f1": 0.3333333432674408, "num_tokens": 2559862.0, "repeat_count": 0.0, - "routers_loss": 0.014162382110953331, + "routers_loss": 0.013764148578047752, "skip_count": 0.0, "step": 1584, "text_loss": 0.2257535308599472 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 0.5, + "avg_layers": 27.0, "epoch": 7.446140299383622, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10986328125, "learning_rate": 0.0009720693073845667, - "loss": 0.0338, - "macro_f1": 0.32098764181137085, + "loss": 0.032, + "macro_f1": 0.5492662787437439, "num_tokens": 2562766.0, "repeat_count": 0.0, - "routers_loss": 0.023485012352466583, + "routers_loss": 0.01937069371342659, "skip_count": 2.0, "step": 1586, "text_loss": 0.178413525223732 @@ -15079,37 +15079,37 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 7.455532726739067, - "f1_execute": 0.9818181991577148, + "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.126953125, + "grad_norm": 0.150390625, "learning_rate": 0.0009719672162088252, - "loss": 0.0308, - "macro_f1": 0.3272727429866791, + "loss": 0.0306, + "macro_f1": 0.32098767161369324, "num_tokens": 2566583.0, "repeat_count": 1.0, - "routers_loss": 0.05822715163230896, + "routers_loss": 0.06224144622683525, "skip_count": 0.0, "step": 1588, "text_loss": 0.3992367684841156 }, { - "acc_repeat": 0.5, - "acc_skip": 0.5, + "acc_repeat": 1.0, + "acc_skip": 0.75, "avg_layers": 27.0, "epoch": 7.464925154094511, - "f1_execute": 0.936170220375061, - "f1_repeat": 0.6666666865348816, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.189453125, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.185546875, "learning_rate": 0.0009718649441743559, - "loss": 0.0243, - "macro_f1": 0.7565011978149414, + "loss": 0.0239, + "macro_f1": 0.9449735879898071, "num_tokens": 2569516.0, "repeat_count": 2.0, - "routers_loss": 0.07448136061429977, + "routers_loss": 0.06937911361455917, "skip_count": 4.0, "step": 1590, "text_loss": 0.1945122629404068 @@ -15122,13 +15122,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.0654296875, "learning_rate": 0.00097176249132035, - "loss": 0.0228, + "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2572418.0, "repeat_count": 0.0, - "routers_loss": 0.0038424162194132805, + "routers_loss": 0.0034326619934290648, "skip_count": 0.0, "step": 1592, "text_loss": 0.6259906888008118 @@ -15141,13 +15141,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.08642578125, "learning_rate": 0.0009716598576860676, - "loss": 0.0277, + "loss": 0.0278, "macro_f1": 0.6666666865348816, "num_tokens": 2575235.0, "repeat_count": 1.0, - "routers_loss": 0.005674343090504408, + "routers_loss": 0.004557516425848007, "skip_count": 0.0, "step": 1594, "text_loss": 0.6638736724853516 @@ -15160,13 +15160,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.189453125, + "grad_norm": 0.193359375, "learning_rate": 0.0009715570433108378, - "loss": 0.0209, + "loss": 0.0198, "macro_f1": 1.0, "num_tokens": 2578157.0, "repeat_count": 1.0, - "routers_loss": 0.015544800087809563, + "routers_loss": 0.015363055281341076, "skip_count": 1.0, "step": 1596, "text_loss": 0.6530464887619019 @@ -15179,13 +15179,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1572265625, + "grad_norm": 0.1484375, "learning_rate": 0.0009714540482340595, - "loss": 0.0279, + "loss": 0.0268, "macro_f1": 0.6666666865348816, "num_tokens": 2581801.0, "repeat_count": 1.0, - "routers_loss": 0.013199405744671822, + "routers_loss": 0.01257144846022129, "skip_count": 0.0, "step": 1598, "text_loss": 0.5916110277175903 @@ -15198,13 +15198,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059326171875, + "grad_norm": 0.058837890625, "learning_rate": 0.0009713508724952006, - "loss": 0.0178, + "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 2585204.0, "repeat_count": 0.0, - "routers_loss": 0.0032487998250871897, + "routers_loss": 0.003175645601004362, "skip_count": 0.0, "step": 1600, "text_loss": 0.27901601791381836 @@ -15217,13 +15217,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12255859375, + "grad_norm": 0.12353515625, "learning_rate": 0.0009712475161337981, - "loss": 0.0253, + "loss": 0.0261, "macro_f1": 0.3333333432674408, "num_tokens": 2588286.0, "repeat_count": 0.0, - "routers_loss": 0.0041928659193217754, + "routers_loss": 0.004122321493923664, "skip_count": 0.0, "step": 1602, "text_loss": 0.42420244216918945 @@ -15236,13 +15236,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.07470703125, "learning_rate": 0.0009711439791894585, - "loss": 0.0343, + "loss": 0.0341, "macro_f1": 0.6666666865348816, "num_tokens": 2591476.0, "repeat_count": 0.0, - "routers_loss": 0.011576149612665176, + "routers_loss": 0.011215819045901299, "skip_count": 1.0, "step": 1604, "text_loss": 0.5549933910369873 @@ -15255,13 +15255,13 @@ "f1_execute": 0.9599999785423279, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.07568359375, + "grad_norm": 0.0703125, "learning_rate": 0.0009710402617018574, - "loss": 0.0179, + "loss": 0.0172, "macro_f1": 0.8200000524520874, "num_tokens": 2594336.0, "repeat_count": 1.0, - "routers_loss": 0.03026912547647953, + "routers_loss": 0.02916567400097847, "skip_count": 2.0, "step": 1606, "text_loss": 0.3263779282569885 @@ -15276,11 +15276,11 @@ "f1_skip": 1.0, "grad_norm": 0.068359375, "learning_rate": 0.0009709363637107393, - "loss": 0.021, + "loss": 0.0209, "macro_f1": 0.6666666865348816, "num_tokens": 2597462.0, "repeat_count": 0.0, - "routers_loss": 0.014957098290324211, + "routers_loss": 0.015897957608103752, "skip_count": 1.0, "step": 1608, "text_loss": 0.20917139947414398 @@ -15293,13 +15293,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.1611328125, "learning_rate": 0.0009708322852559184, - "loss": 0.0226, + "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2601543.0, "repeat_count": 0.0, - "routers_loss": 0.00254683755338192, + "routers_loss": 0.002211357234045863, "skip_count": 0.0, "step": 1610, "text_loss": 0.450550377368927 @@ -15312,13 +15312,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1748046875, + "grad_norm": 0.1728515625, "learning_rate": 0.0009707280263772776, - "loss": 0.0286, + "loss": 0.0277, "macro_f1": 0.6666666865348816, "num_tokens": 2604462.0, "repeat_count": 0.0, - "routers_loss": 0.018759876489639282, + "routers_loss": 0.01615734025835991, "skip_count": 2.0, "step": 1612, "text_loss": 0.6908381581306458 @@ -15337,7 +15337,7 @@ "macro_f1": 0.5492662787437439, "num_tokens": 2607484.0, "repeat_count": 0.0, - "routers_loss": 0.022694367915391922, + "routers_loss": 0.022048067301511765, "skip_count": 2.0, "step": 1614, "text_loss": 0.36691340804100037 @@ -15350,13 +15350,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.103515625, + "grad_norm": 0.10546875, "learning_rate": 0.0009705189675084138, - "loss": 0.0181, + "loss": 0.0176, "macro_f1": 0.6666666865348816, "num_tokens": 2610204.0, "repeat_count": 0.0, - "routers_loss": 0.010102321393787861, + "routers_loss": 0.008503952994942665, "skip_count": 1.0, "step": 1616, "text_loss": 0.5226598381996155 @@ -15369,13 +15369,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08984375, + "grad_norm": 0.09228515625, "learning_rate": 0.0009704141675983029, - "loss": 0.0252, + "loss": 0.0248, "macro_f1": 0.3333333432674408, "num_tokens": 2613128.0, "repeat_count": 0.0, - "routers_loss": 0.0020994991064071655, + "routers_loss": 0.0019020626787096262, "skip_count": 0.0, "step": 1618, "text_loss": 0.6465088725090027 @@ -15388,13 +15388,13 @@ "f1_execute": 0.9333333373069763, "f1_repeat": 0.0, "f1_skip": 0.7272727489471436, - "grad_norm": 0.10009765625, + "grad_norm": 0.107421875, "learning_rate": 0.0009703091874245956, - "loss": 0.0323, + "loss": 0.032, "macro_f1": 0.5535354018211365, "num_tokens": 2616360.0, "repeat_count": 0.0, - "routers_loss": 0.11748704314231873, + "routers_loss": 0.11837691068649292, "skip_count": 7.0, "step": 1620, "text_loss": 0.2987039089202881 @@ -15407,32 +15407,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.061767578125, + "grad_norm": 0.06689453125, "learning_rate": 0.0009702040270275204, - "loss": 0.018, + "loss": 0.0181, "macro_f1": 0.3333333432674408, "num_tokens": 2619606.0, "repeat_count": 0.0, - "routers_loss": 0.007642311509698629, + "routers_loss": 0.0065958453342318535, "skip_count": 0.0, "step": 1622, "text_loss": 0.6262096166610718 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 7.62459641913707, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.10595703125, + "f1_skip": 1.0, + "grad_norm": 0.103515625, "learning_rate": 0.000970098686447375, - "loss": 0.0258, - "macro_f1": 0.3272727429866791, + "loss": 0.0257, + "macro_f1": 0.6666666865348816, "num_tokens": 2622499.0, "repeat_count": 0.0, - "routers_loss": 0.016890225932002068, + "routers_loss": 0.013632026500999928, "skip_count": 1.0, "step": 1624, "text_loss": 0.2392602562904358 @@ -15445,13 +15445,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1240234375, + "grad_norm": 0.125, "learning_rate": 0.0009699931657245264, - "loss": 0.0242, + "loss": 0.0245, "macro_f1": 0.5492662787437439, "num_tokens": 2626002.0, "repeat_count": 0.0, - "routers_loss": 0.010900186374783516, + "routers_loss": 0.012147823348641396, "skip_count": 2.0, "step": 1626, "text_loss": 0.4742976129055023 @@ -15464,13 +15464,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0810546875, + "grad_norm": 0.0849609375, "learning_rate": 0.0009698874648994098, - "loss": 0.0279, + "loss": 0.0285, "macro_f1": 1.0, "num_tokens": 2629847.0, "repeat_count": 1.0, - "routers_loss": 0.011229799129068851, + "routers_loss": 0.010692884214222431, "skip_count": 3.0, "step": 1628, "text_loss": 0.5090685486793518 @@ -15483,13 +15483,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.1240234375, "learning_rate": 0.0009697815840125304, - "loss": 0.0275, + "loss": 0.0265, "macro_f1": 0.3333333432674408, "num_tokens": 2633529.0, "repeat_count": 0.0, - "routers_loss": 0.0105878422036767, + "routers_loss": 0.011442207731306553, "skip_count": 0.0, "step": 1630, "text_loss": 0.1874329298734665 @@ -15502,13 +15502,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.2021484375, + "grad_norm": 0.2119140625, "learning_rate": 0.0009696755231044618, - "loss": 0.0209, + "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 2636321.0, "repeat_count": 0.0, - "routers_loss": 0.002953991526737809, + "routers_loss": 0.0026681360322982073, "skip_count": 0.0, "step": 1632, "text_loss": 0.7650400400161743 @@ -15521,13 +15521,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10888671875, + "grad_norm": 0.10498046875, "learning_rate": 0.0009695692822158466, - "loss": 0.0241, + "loss": 0.0242, "macro_f1": 0.3272727429866791, "num_tokens": 2638840.0, "repeat_count": 1.0, - "routers_loss": 0.04717390984296799, + "routers_loss": 0.033965807408094406, "skip_count": 0.0, "step": 1634, "text_loss": 0.6175784468650818 @@ -15540,13 +15540,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.058349609375, "learning_rate": 0.0009694628613873968, - "loss": 0.0179, + "loss": 0.018, "macro_f1": 0.3333333432674408, "num_tokens": 2641886.0, "repeat_count": 0.0, - "routers_loss": 0.0073657832108438015, + "routers_loss": 0.007568214554339647, "skip_count": 0.0, "step": 1636, "text_loss": 0.43139931559562683 @@ -15559,13 +15559,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.193359375, "learning_rate": 0.0009693562606598929, - "loss": 0.0259, + "loss": 0.025, "macro_f1": 0.3333333432674408, "num_tokens": 2645028.0, "repeat_count": 0.0, - "routers_loss": 0.005212752148509026, + "routers_loss": 0.004973865579813719, "skip_count": 0.0, "step": 1638, "text_loss": 0.6430339217185974 @@ -15578,13 +15578,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.06982421875, "learning_rate": 0.0009692494800741844, - "loss": 0.0304, + "loss": 0.0313, "macro_f1": 0.3272727429866791, "num_tokens": 2648209.0, "repeat_count": 1.0, - "routers_loss": 0.04311618581414223, + "routers_loss": 0.049863800406455994, "skip_count": 0.0, "step": 1640, "text_loss": 0.28138160705566406 @@ -15597,13 +15597,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08251953125, + "grad_norm": 0.08544921875, "learning_rate": 0.0009691425196711901, - "loss": 0.039, + "loss": 0.0398, "macro_f1": 0.3272727429866791, "num_tokens": 2651171.0, "repeat_count": 0.0, - "routers_loss": 0.02027471922338009, + "routers_loss": 0.02112230286002159, "skip_count": 0.0, "step": 1642, "text_loss": 0.3745322525501251 @@ -15616,13 +15616,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.0703125, "learning_rate": 0.0009690353794918971, - "loss": 0.0279, + "loss": 0.0275, "macro_f1": 0.3333333432674408, "num_tokens": 2654093.0, "repeat_count": 0.0, - "routers_loss": 0.003074956126511097, + "routers_loss": 0.0024304776452481747, "skip_count": 0.0, "step": 1644, "text_loss": 0.4275154173374176 @@ -15635,13 +15635,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.0771484375, "learning_rate": 0.000968928059577362, - "loss": 0.0241, + "loss": 0.0244, "macro_f1": 0.6666666865348816, "num_tokens": 2657079.0, "repeat_count": 0.0, - "routers_loss": 0.009374706074595451, + "routers_loss": 0.009320619516074657, "skip_count": 1.0, "step": 1646, "text_loss": 0.46650025248527527 @@ -15654,13 +15654,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1162109375, + "grad_norm": 0.09814453125, "learning_rate": 0.0009688205599687099, - "loss": 0.0218, + "loss": 0.0209, "macro_f1": 0.3272727429866791, "num_tokens": 2660951.0, "repeat_count": 0.0, - "routers_loss": 0.01204691268503666, + "routers_loss": 0.011913162656128407, "skip_count": 0.0, "step": 1648, "text_loss": 0.46644100546836853 @@ -15673,13 +15673,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.1083984375, "learning_rate": 0.0009687128807071347, "loss": 0.0284, "macro_f1": 0.3333333432674408, "num_tokens": 2663823.0, "repeat_count": 0.0, - "routers_loss": 0.01376053225249052, + "routers_loss": 0.013754756189882755, "skip_count": 0.0, "step": 1650, "text_loss": 0.40808847546577454 @@ -15692,13 +15692,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.103515625, "learning_rate": 0.0009686050218338996, - "loss": 0.0285, + "loss": 0.0286, "macro_f1": 0.3333333432674408, "num_tokens": 2667079.0, "repeat_count": 0.0, - "routers_loss": 0.009346984326839447, + "routers_loss": 0.009099726565182209, "skip_count": 0.0, "step": 1652, "text_loss": 0.2389989197254181 @@ -15711,13 +15711,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.08837890625, "learning_rate": 0.0009684969833903359, - "loss": 0.0291, + "loss": 0.0283, "macro_f1": 0.6666666865348816, "num_tokens": 2670162.0, "repeat_count": 0.0, - "routers_loss": 0.002724624238908291, + "routers_loss": 0.0034928603563457727, "skip_count": 1.0, "step": 1654, "text_loss": 0.6930749416351318 @@ -15730,13 +15730,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.123046875, + "grad_norm": 0.10888671875, "learning_rate": 0.0009683887654178445, - "loss": 0.0271, + "loss": 0.0261, "macro_f1": 0.6666666865348816, "num_tokens": 2673031.0, "repeat_count": 0.0, - "routers_loss": 0.00823777075856924, + "routers_loss": 0.008340462110936642, "skip_count": 1.0, "step": 1656, "text_loss": 0.277752548456192 @@ -15749,32 +15749,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07373046875, + "grad_norm": 0.06884765625, "learning_rate": 0.0009682803679578947, - "loss": 0.0262, + "loss": 0.0259, "macro_f1": 0.3333333432674408, "num_tokens": 2676092.0, "repeat_count": 0.0, - "routers_loss": 0.004393119364976883, + "routers_loss": 0.004337446764111519, "skip_count": 0.0, "step": 1658, "text_loss": 0.5176776051521301 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 7.7936601115350745, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.1513671875, + "f1_skip": 0.0, + "grad_norm": 0.169921875, "learning_rate": 0.0009681717910520244, - "loss": 0.024, - "macro_f1": 0.5492662787437439, + "loss": 0.0242, + "macro_f1": 0.32098764181137085, "num_tokens": 2679479.0, "repeat_count": 0.0, - "routers_loss": 0.031827569007873535, + "routers_loss": 0.034611742943525314, "skip_count": 2.0, "step": 1660, "text_loss": 0.21485982835292816 @@ -15789,11 +15789,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.07958984375, "learning_rate": 0.0009680630347418406, - "loss": 0.0216, + "loss": 0.022, "macro_f1": 0.5492662787437439, "num_tokens": 2683289.0, "repeat_count": 0.0, - "routers_loss": 0.03329647704958916, + "routers_loss": 0.03297121450304985, "skip_count": 2.0, "step": 1662, "text_loss": 0.33801013231277466 @@ -15806,13 +15806,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1826171875, + "grad_norm": 0.1728515625, "learning_rate": 0.000967954099069019, - "loss": 0.0415, + "loss": 0.0411, "macro_f1": 0.32098764181137085, "num_tokens": 2685879.0, "repeat_count": 1.0, - "routers_loss": 0.047317031770944595, + "routers_loss": 0.04551183059811592, "skip_count": 1.0, "step": 1664, "text_loss": 0.41123488545417786 @@ -15827,11 +15827,11 @@ "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.0009678449840753038, - "loss": 0.0325, + "loss": 0.0324, "macro_f1": 0.32098764181137085, "num_tokens": 2688910.0, "repeat_count": 0.0, - "routers_loss": 0.05649980902671814, + "routers_loss": 0.05866450071334839, "skip_count": 2.0, "step": 1666, "text_loss": 0.1740892380475998 @@ -15844,13 +15844,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.09228515625, "learning_rate": 0.0009677356898025082, - "loss": 0.0229, + "loss": 0.023, "macro_f1": 0.3333333432674408, "num_tokens": 2691680.0, "repeat_count": 0.0, - "routers_loss": 0.01004624180495739, + "routers_loss": 0.009243223816156387, "skip_count": 0.0, "step": 1668, "text_loss": 0.2512350380420685 @@ -15863,13 +15863,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.09619140625, "learning_rate": 0.000967626216292514, - "loss": 0.0194, + "loss": 0.0195, "macro_f1": 0.3333333432674408, "num_tokens": 2694895.0, "repeat_count": 0.0, - "routers_loss": 0.0054973396472632885, + "routers_loss": 0.005576452240347862, "skip_count": 0.0, "step": 1670, "text_loss": 0.43294376134872437 @@ -15882,13 +15882,13 @@ "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.09619140625, + "grad_norm": 0.09130859375, "learning_rate": 0.0009675165635872715, - "loss": 0.031, + "loss": 0.0306, "macro_f1": 0.44705885648727417, "num_tokens": 2697806.0, "repeat_count": 0.0, - "routers_loss": 0.05615650862455368, + "routers_loss": 0.05372785031795502, "skip_count": 3.0, "step": 1672, "text_loss": 0.1614082306623459 @@ -15901,13 +15901,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.11669921875, "learning_rate": 0.0009674067317288, - "loss": 0.0301, + "loss": 0.0296, "macro_f1": 0.6666666865348816, "num_tokens": 2700529.0, "repeat_count": 1.0, - "routers_loss": 0.012819192372262478, + "routers_loss": 0.018131591379642487, "skip_count": 0.0, "step": 1674, "text_loss": 0.2093173861503601 @@ -15920,13 +15920,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.08203125, "learning_rate": 0.0009672967207591869, - "loss": 0.0253, + "loss": 0.0257, "macro_f1": 0.3272727429866791, "num_tokens": 2703650.0, "repeat_count": 0.0, - "routers_loss": 0.07059332728385925, + "routers_loss": 0.0673515796661377, "skip_count": 1.0, "step": 1676, "text_loss": 0.3029400110244751 @@ -15939,13 +15939,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.11669921875, "learning_rate": 0.0009671865307205892, - "loss": 0.0198, + "loss": 0.021, "macro_f1": 0.32098767161369324, "num_tokens": 2707615.0, "repeat_count": 0.0, - "routers_loss": 0.029778441414237022, + "routers_loss": 0.03821169584989548, "skip_count": 1.0, "step": 1678, "text_loss": 0.2262786477804184 @@ -15958,13 +15958,13 @@ "f1_execute": 0.9756097793579102, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, - "grad_norm": 0.1416015625, + "grad_norm": 0.1396484375, "learning_rate": 0.0009670761616552315, - "loss": 0.0474, + "loss": 0.0465, "macro_f1": 0.9615669250488281, "num_tokens": 2710894.0, "repeat_count": 2.0, - "routers_loss": 0.04371272772550583, + "routers_loss": 0.042625464498996735, "skip_count": 6.0, "step": 1680, "text_loss": 0.29623574018478394 @@ -15977,13 +15977,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.189453125, + "grad_norm": 0.169921875, "learning_rate": 0.0009669656136054074, - "loss": 0.0293, + "loss": 0.0289, "macro_f1": 0.3333333432674408, "num_tokens": 2714330.0, "repeat_count": 0.0, - "routers_loss": 0.0033591394312679768, + "routers_loss": 0.0037571541033685207, "skip_count": 0.0, "step": 1682, "text_loss": 0.7510389089584351 @@ -15996,13 +15996,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.072265625, + "grad_norm": 0.07421875, "learning_rate": 0.0009668548866134795, - "loss": 0.0259, + "loss": 0.0256, "macro_f1": 0.3333333432674408, "num_tokens": 2717176.0, "repeat_count": 0.0, - "routers_loss": 0.005085585173219442, + "routers_loss": 0.004142968449741602, "skip_count": 0.0, "step": 1684, "text_loss": 0.3273485600948334 @@ -16015,13 +16015,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0712890625, + "grad_norm": 0.07373046875, "learning_rate": 0.0009667439807218783, - "loss": 0.0243, + "loss": 0.0233, "macro_f1": 0.6666666865348816, "num_tokens": 2720628.0, "repeat_count": 0.0, - "routers_loss": 0.008569681085646152, + "routers_loss": 0.008753842674195766, "skip_count": 2.0, "step": 1686, "text_loss": 0.4314708709716797 @@ -16034,32 +16034,32 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.0732421875, "learning_rate": 0.0009666328959731033, - "loss": 0.022, + "loss": 0.0211, "macro_f1": 0.6603773832321167, "num_tokens": 2723739.0, "repeat_count": 1.0, - "routers_loss": 0.024587804451584816, + "routers_loss": 0.022674910724163055, "skip_count": 1.0, "step": 1688, "text_loss": 0.25734150409698486 }, { "acc_repeat": 0.0, - "acc_skip": 0.3333333432674408, - "avg_layers": 27.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, "epoch": 7.934546521866745, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, - "f1_skip": 0.5, - "grad_norm": 0.169921875, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1552734375, "learning_rate": 0.0009665216324097222, - "loss": 0.0332, - "macro_f1": 0.4871794879436493, + "loss": 0.0324, + "macro_f1": 0.5934640765190125, "num_tokens": 2726644.0, "repeat_count": 0.0, - "routers_loss": 0.037516288459300995, + "routers_loss": 0.03932750225067139, "skip_count": 3.0, "step": 1690, "text_loss": 0.24511034786701202 @@ -16072,13 +16072,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.09765625, "learning_rate": 0.0009664101900743714, - "loss": 0.0262, + "loss": 0.0255, "macro_f1": 0.3272727429866791, "num_tokens": 2729662.0, "repeat_count": 0.0, - "routers_loss": 0.01287431176751852, + "routers_loss": 0.012672754004597664, "skip_count": 1.0, "step": 1692, "text_loss": 0.39431414008140564 @@ -16091,13 +16091,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.07763671875, + "grad_norm": 0.076171875, "learning_rate": 0.000966298569009756, - "loss": 0.0227, + "loss": 0.0231, "macro_f1": 0.5492662787437439, "num_tokens": 2732578.0, "repeat_count": 0.0, - "routers_loss": 0.015499880537390709, + "routers_loss": 0.01548632513731718, "skip_count": 2.0, "step": 1694, "text_loss": 0.12439999729394913 @@ -16110,13 +16110,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.080078125, + "grad_norm": 0.0849609375, "learning_rate": 0.0009661867692586494, - "loss": 0.0144, + "loss": 0.0153, "macro_f1": 0.32098764181137085, "num_tokens": 2735887.0, "repeat_count": 0.0, - "routers_loss": 0.049878787249326706, + "routers_loss": 0.05622401833534241, "skip_count": 2.0, "step": 1696, "text_loss": 0.29024389386177063 @@ -16129,13 +16129,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10009765625, + "grad_norm": 0.087890625, "learning_rate": 0.0009660747908638933, - "loss": 0.0206, + "loss": 0.0205, "macro_f1": 0.3272727429866791, "num_tokens": 2739293.0, "repeat_count": 0.0, - "routers_loss": 0.04108169302344322, + "routers_loss": 0.041060201823711395, "skip_count": 1.0, "step": 1698, "text_loss": 0.39461007714271545 @@ -16148,13 +16148,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1728515625, + "grad_norm": 0.1767578125, "learning_rate": 0.0009659626338683981, - "loss": 0.0367, + "loss": 0.0369, "macro_f1": 0.3333333432674408, "num_tokens": 2742468.0, "repeat_count": 0.0, - "routers_loss": 0.007651917636394501, + "routers_loss": 0.007251353468745947, "skip_count": 0.0, "step": 1700, "text_loss": 0.2751767635345459 @@ -16167,13 +16167,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.07763671875, "learning_rate": 0.0009658502983151427, - "loss": 0.0182, + "loss": 0.0186, "macro_f1": 0.3272727429866791, "num_tokens": 2745123.0, "repeat_count": 0.0, - "routers_loss": 0.015448091551661491, + "routers_loss": 0.012847424484789371, "skip_count": 1.0, "step": 1702, "text_loss": 0.4756404757499695 @@ -16186,13 +16186,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1279296875, + "grad_norm": 0.11767578125, "learning_rate": 0.0009657377842471742, - "loss": 0.0324, + "loss": 0.0313, "macro_f1": 0.6666666865348816, "num_tokens": 2748016.0, "repeat_count": 0.0, - "routers_loss": 0.009139287285506725, + "routers_loss": 0.007060411386191845, "skip_count": 1.0, "step": 1704, "text_loss": 0.9571210145950317 @@ -16205,13 +16205,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0869140625, + "grad_norm": 0.10009765625, "learning_rate": 0.0009656250917076081, - "loss": 0.0191, + "loss": 0.0188, "macro_f1": 0.5492662787437439, "num_tokens": 2750717.0, "repeat_count": 0.0, - "routers_loss": 0.015412120148539543, + "routers_loss": 0.016748681664466858, "skip_count": 2.0, "step": 1706, "text_loss": 0.14542843401432037 @@ -16224,13 +16224,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.060302734375, "learning_rate": 0.0009655122207396285, - "loss": 0.0175, + "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 2753635.0, "repeat_count": 0.0, - "routers_loss": 0.012735052965581417, + "routers_loss": 0.013607042841613293, "skip_count": 0.0, "step": 1708, "text_loss": 0.21836471557617188 @@ -16243,13 +16243,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07177734375, + "grad_norm": 0.0732421875, "learning_rate": 0.0009653991713864878, - "loss": 0.0192, + "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2756643.0, "repeat_count": 0.0, - "routers_loss": 0.00114025070797652, + "routers_loss": 0.0012097888393327594, "skip_count": 0.0, "step": 1710, "text_loss": 0.635187029838562 @@ -16262,13 +16262,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1396484375, + "grad_norm": 0.1171875, "learning_rate": 0.0009652859436915066, - "loss": 0.0243, + "loss": 0.0231, "macro_f1": 0.3333333432674408, "num_tokens": 2759432.0, "repeat_count": 0.0, - "routers_loss": 0.006401443853974342, + "routers_loss": 0.006196760106831789, "skip_count": 0.0, "step": 1712, "text_loss": 0.5629420876502991 @@ -16281,13 +16281,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.0615234375, "learning_rate": 0.0009651725376980743, - "loss": 0.0185, + "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 2762538.0, "repeat_count": 0.0, - "routers_loss": 0.004316259175539017, + "routers_loss": 0.0042513771913945675, "skip_count": 0.0, "step": 1714, "text_loss": 0.39522525668144226 @@ -16300,13 +16300,13 @@ "f1_execute": 0.9583333134651184, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.125, + "grad_norm": 0.1494140625, "learning_rate": 0.0009650589534496479, - "loss": 0.0201, + "loss": 0.0194, "macro_f1": 0.8194444179534912, "num_tokens": 2765571.0, "repeat_count": 2.0, - "routers_loss": 0.043461959809064865, + "routers_loss": 0.03596706688404083, "skip_count": 3.0, "step": 1716, "text_loss": 0.6252416968345642 @@ -16319,13 +16319,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.04833984375, "learning_rate": 0.0009649451909897532, "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 2769206.0, "repeat_count": 0.0, - "routers_loss": 0.0024530428927391768, + "routers_loss": 0.0025788163766264915, "skip_count": 0.0, "step": 1718, "text_loss": 0.8851634860038757 @@ -16338,13 +16338,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1015625, + "grad_norm": 0.10791015625, "learning_rate": 0.0009648312503619843, - "loss": 0.026, + "loss": 0.0265, "macro_f1": 0.3333333432674408, "num_tokens": 2772488.0, "repeat_count": 0.0, - "routers_loss": 0.0046626063995063305, + "routers_loss": 0.004443451762199402, "skip_count": 0.0, "step": 1720, "text_loss": 0.8568580746650696 @@ -16357,13 +16357,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.1513671875, + "grad_norm": 0.1552734375, "learning_rate": 0.0009647171316100034, - "loss": 0.0257, + "loss": 0.0265, "macro_f1": 0.9265305995941162, "num_tokens": 2776482.0, "repeat_count": 1.0, - "routers_loss": 0.02480102889239788, + "routers_loss": 0.022948263213038445, "skip_count": 3.0, "step": 1722, "text_loss": 0.13431036472320557 @@ -16376,13 +16376,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.126953125, + "grad_norm": 0.1572265625, "learning_rate": 0.0009646028347775409, - "loss": 0.02, + "loss": 0.0204, "macro_f1": 0.6666666865348816, "num_tokens": 2778966.0, "repeat_count": 0.0, - "routers_loss": 0.012629947625100613, + "routers_loss": 0.011328035034239292, "skip_count": 1.0, "step": 1724, "text_loss": 0.2085491120815277 @@ -16395,13 +16395,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08447265625, + "grad_norm": 0.08984375, "learning_rate": 0.0009644883599083958, "loss": 0.0238, "macro_f1": 0.3333333432674408, "num_tokens": 2781968.0, "repeat_count": 0.0, - "routers_loss": 0.0024127380456775427, + "routers_loss": 0.002208018908277154, "skip_count": 0.0, "step": 1726, "text_loss": 0.4948323965072632 @@ -16414,13 +16414,13 @@ "f1_execute": 0.9411764740943909, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.054443359375, + "grad_norm": 0.062255859375, "learning_rate": 0.0009643737070464349, - "loss": 0.0162, + "loss": 0.0158, "macro_f1": 0.6470588445663452, "num_tokens": 2784666.0, "repeat_count": 1.0, - "routers_loss": 0.0415453165769577, + "routers_loss": 0.04391832649707794, "skip_count": 2.0, "step": 1728, "text_loss": 0.39060094952583313 @@ -16433,13 +16433,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.046630859375, "learning_rate": 0.0009642588762355935, - "loss": 0.0211, + "loss": 0.0212, "macro_f1": 0.6666666865348816, "num_tokens": 2787558.0, "repeat_count": 0.0, - "routers_loss": 0.0056681083515286446, + "routers_loss": 0.004497280344367027, "skip_count": 1.0, "step": 1730, "text_loss": 0.34908708930015564 @@ -16452,13 +16452,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.07275390625, "learning_rate": 0.0009641438675198748, - "loss": 0.0189, + "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 2790474.0, "repeat_count": 0.0, - "routers_loss": 0.006391602102667093, + "routers_loss": 0.00583475548774004, "skip_count": 0.0, "step": 1732, "text_loss": 0.5720033049583435 @@ -16471,13 +16471,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0595703125, + "grad_norm": 0.08154296875, "learning_rate": 0.0009640286809433508, - "loss": 0.0229, + "loss": 0.0235, "macro_f1": 0.3333333432674408, "num_tokens": 2793272.0, "repeat_count": 0.0, - "routers_loss": 0.007466991897672415, + "routers_loss": 0.007826375775039196, "skip_count": 0.0, "step": 1734, "text_loss": 0.32181721925735474 @@ -16490,13 +16490,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.05419921875, "learning_rate": 0.0009639133165501606, - "loss": 0.0197, + "loss": 0.0192, "macro_f1": 0.3333333432674408, "num_tokens": 2797726.0, "repeat_count": 0.0, - "routers_loss": 0.001953453291207552, + "routers_loss": 0.0019055595621466637, "skip_count": 0.0, "step": 1736, "text_loss": 0.620936393737793 @@ -16509,13 +16509,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.134765625, "learning_rate": 0.0009637977743845124, - "loss": 0.0223, + "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2800706.0, "repeat_count": 0.0, - "routers_loss": 0.003612719476222992, + "routers_loss": 0.0028302327264100313, "skip_count": 0.0, "step": 1738, "text_loss": 0.6473138332366943 @@ -16528,13 +16528,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.049072265625, + "grad_norm": 0.0634765625, "learning_rate": 0.0009636820544906823, - "loss": 0.0145, + "loss": 0.0146, "macro_f1": 1.0, "num_tokens": 2803847.0, "repeat_count": 1.0, - "routers_loss": 0.009977150708436966, + "routers_loss": 0.01105099730193615, "skip_count": 2.0, "step": 1740, "text_loss": 0.4401201903820038 @@ -16547,13 +16547,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.10791015625, + "grad_norm": 0.1455078125, "learning_rate": 0.0009635661569130141, "loss": 0.0195, "macro_f1": 0.5934640765190125, "num_tokens": 2807235.0, "repeat_count": 0.0, - "routers_loss": 0.026468059048056602, + "routers_loss": 0.02619045600295067, "skip_count": 3.0, "step": 1742, "text_loss": 0.459264874458313 @@ -16566,13 +16566,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.06396484375, "learning_rate": 0.0009634500816959202, - "loss": 0.0165, + "loss": 0.0162, "macro_f1": 0.6666666865348816, "num_tokens": 2810396.0, "repeat_count": 0.0, - "routers_loss": 0.00849854201078415, + "routers_loss": 0.007915694266557693, "skip_count": 2.0, "step": 1744, "text_loss": 0.5084020495414734 @@ -16585,13 +16585,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.177734375, + "grad_norm": 0.1748046875, "learning_rate": 0.0009633338288838805, - "loss": 0.0275, + "loss": 0.0271, "macro_f1": 0.5492662787437439, "num_tokens": 2813215.0, "repeat_count": 2.0, - "routers_loss": 0.08082596957683563, + "routers_loss": 0.08364596217870712, "skip_count": 0.0, "step": 1746, "text_loss": 0.27681824564933777 @@ -16604,13 +16604,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.046142578125, + "grad_norm": 0.051025390625, "learning_rate": 0.0009632173985214438, - "loss": 0.015, + "loss": 0.0156, "macro_f1": 0.8817967176437378, "num_tokens": 2816452.0, "repeat_count": 3.0, - "routers_loss": 0.029500717297196388, + "routers_loss": 0.028805451467633247, "skip_count": 2.0, "step": 1748, "text_loss": 0.4678419530391693 @@ -16623,13 +16623,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.0625, "learning_rate": 0.000963100790653226, - "loss": 0.0183, + "loss": 0.0188, "macro_f1": 0.3272727429866791, "num_tokens": 2819364.0, "repeat_count": 0.0, - "routers_loss": 0.025238536298274994, + "routers_loss": 0.03056817688047886, "skip_count": 1.0, "step": 1750, "text_loss": 0.3078109920024872 @@ -16642,13 +16642,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0703125, + "grad_norm": 0.06689453125, "learning_rate": 0.0009629840053239116, - "loss": 0.0204, + "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2823469.0, "repeat_count": 0.0, - "routers_loss": 0.002069319598376751, + "routers_loss": 0.0019477814203128219, "skip_count": 0.0, "step": 1752, "text_loss": 0.45501336455345154 @@ -16661,13 +16661,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05224609375, + "grad_norm": 0.057373046875, "learning_rate": 0.000962867042578253, - "loss": 0.0169, + "loss": 0.0173, "macro_f1": 0.3333333432674408, "num_tokens": 2826716.0, "repeat_count": 0.0, - "routers_loss": 0.002853946527466178, + "routers_loss": 0.0032963966950774193, "skip_count": 0.0, "step": 1754, "text_loss": 0.49234694242477417 @@ -16680,13 +16680,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0947265625, "learning_rate": 0.0009627499024610707, - "loss": 0.0236, + "loss": 0.0239, "macro_f1": 0.3272727429866791, "num_tokens": 2829733.0, "repeat_count": 0.0, - "routers_loss": 0.0100983502343297, + "routers_loss": 0.010289114899933338, "skip_count": 1.0, "step": 1756, "text_loss": 0.22335539758205414 @@ -16699,13 +16699,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09228515625, + "grad_norm": 0.0888671875, "learning_rate": 0.0009626325850172527, - "loss": 0.0173, + "loss": 0.0174, "macro_f1": 0.3272727429866791, "num_tokens": 2833350.0, "repeat_count": 0.0, - "routers_loss": 0.031218983232975006, + "routers_loss": 0.03249066323041916, "skip_count": 1.0, "step": 1758, "text_loss": 0.6581931114196777 @@ -16718,13 +16718,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.0703125, "learning_rate": 0.0009625150902917555, - "loss": 0.019, + "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 2836558.0, "repeat_count": 0.0, - "routers_loss": 0.010347879491746426, + "routers_loss": 0.00870000571012497, "skip_count": 0.0, "step": 1760, "text_loss": 0.22938725352287292 @@ -16737,13 +16737,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1455078125, + "grad_norm": 0.1259765625, "learning_rate": 0.0009623974183296031, - "loss": 0.0193, + "loss": 0.0192, "macro_f1": 0.3333333432674408, "num_tokens": 2840560.0, "repeat_count": 0.0, - "routers_loss": 0.007768871728330851, + "routers_loss": 0.007767196744680405, "skip_count": 0.0, "step": 1762, "text_loss": 0.24473799765110016 @@ -16756,13 +16756,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.09228515625, "learning_rate": 0.0009622795691758876, - "loss": 0.0253, + "loss": 0.0244, "macro_f1": 0.3333333432674408, "num_tokens": 2843548.0, "repeat_count": 0.0, - "routers_loss": 0.002887974726036191, + "routers_loss": 0.0021693643648177385, "skip_count": 0.0, "step": 1764, "text_loss": 0.3084608018398285 @@ -16777,11 +16777,11 @@ "f1_skip": 0.0, "grad_norm": 0.0498046875, "learning_rate": 0.0009621615428757693, - "loss": 0.0147, + "loss": 0.0149, "macro_f1": 0.3333333432674408, "num_tokens": 2847076.0, "repeat_count": 0.0, - "routers_loss": 0.0027294005267322063, + "routers_loss": 0.0024727333802729845, "skip_count": 0.0, "step": 1766, "text_loss": 0.5251734852790833 @@ -16794,13 +16794,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.0673828125, "learning_rate": 0.000962043339474476, - "loss": 0.0193, + "loss": 0.0194, "macro_f1": 0.3333333432674408, "num_tokens": 2849751.0, "repeat_count": 0.0, - "routers_loss": 0.00543541694059968, + "routers_loss": 0.005174890160560608, "skip_count": 0.0, "step": 1768, "text_loss": 0.4410129189491272 @@ -16813,13 +16813,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.06103515625, "learning_rate": 0.0009619249590173032, - "loss": 0.0167, + "loss": 0.016, "macro_f1": 0.6666666865348816, "num_tokens": 2853916.0, "repeat_count": 0.0, - "routers_loss": 0.006514009553939104, + "routers_loss": 0.006785830482840538, "skip_count": 2.0, "step": 1770, "text_loss": 0.550076425075531 @@ -16832,13 +16832,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.06396484375, + "grad_norm": 0.06591796875, "learning_rate": 0.0009618064015496149, - "loss": 0.019, + "loss": 0.0192, "macro_f1": 0.5934640765190125, "num_tokens": 2857372.0, "repeat_count": 0.0, - "routers_loss": 0.02333846502006054, + "routers_loss": 0.021370256319642067, "skip_count": 3.0, "step": 1772, "text_loss": 0.1988629847764969 @@ -16851,13 +16851,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0732421875, + "grad_norm": 0.072265625, "learning_rate": 0.0009616876671168423, - "loss": 0.0165, + "loss": 0.0162, "macro_f1": 0.6666666865348816, "num_tokens": 2861028.0, "repeat_count": 0.0, - "routers_loss": 0.004471905063837767, + "routers_loss": 0.004313841462135315, "skip_count": 1.0, "step": 1774, "text_loss": 0.42581331729888916 @@ -16870,13 +16870,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.1103515625, "learning_rate": 0.0009615687557644847, - "loss": 0.0261, + "loss": 0.0268, "macro_f1": 0.3333333432674408, "num_tokens": 2864847.0, "repeat_count": 0.0, - "routers_loss": 0.0024362702388316393, + "routers_loss": 0.0025742491707205772, "skip_count": 0.0, "step": 1776, "text_loss": 0.46510905027389526 @@ -16889,13 +16889,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.140625, + "grad_norm": 0.1494140625, "learning_rate": 0.0009614496675381093, - "loss": 0.0116, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 2867392.0, "repeat_count": 0.0, - "routers_loss": 0.0021166049409657717, + "routers_loss": 0.0016813480760902166, "skip_count": 0.0, "step": 1778, "text_loss": 0.5922174453735352 @@ -16908,13 +16908,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0712890625, + "grad_norm": 0.0810546875, "learning_rate": 0.0009613304024833507, "loss": 0.0166, "macro_f1": 0.3333333432674408, "num_tokens": 2871273.0, "repeat_count": 0.0, - "routers_loss": 0.004722296260297298, + "routers_loss": 0.004948933608829975, "skip_count": 0.0, "step": 1780, "text_loss": 0.6776977777481079 @@ -16929,11 +16929,11 @@ "f1_skip": 1.0, "grad_norm": 0.07470703125, "learning_rate": 0.0009612109606459117, - "loss": 0.0199, + "loss": 0.0186, "macro_f1": 1.0, "num_tokens": 2874172.0, "repeat_count": 1.0, - "routers_loss": 0.014188882894814014, + "routers_loss": 0.016950147226452827, "skip_count": 2.0, "step": 1782, "text_loss": 0.48758944869041443 @@ -16946,13 +16946,13 @@ "f1_execute": 0.9599999785423279, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, - "grad_norm": 0.076171875, + "grad_norm": 0.08251953125, "learning_rate": 0.0009610913420715623, - "loss": 0.0241, + "loss": 0.0237, "macro_f1": 0.7644444704055786, "num_tokens": 2877528.0, "repeat_count": 2.0, - "routers_loss": 0.04599560424685478, + "routers_loss": 0.04880943149328232, "skip_count": 1.0, "step": 1784, "text_loss": 0.4404778480529785 @@ -16965,13 +16965,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.06201171875, "learning_rate": 0.0009609715468061411, - "loss": 0.0216, + "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2880627.0, "repeat_count": 0.0, - "routers_loss": 0.004942454397678375, + "routers_loss": 0.004678630735725164, "skip_count": 0.0, "step": 1786, "text_loss": 0.7295402884483337 @@ -16984,13 +16984,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08349609375, + "grad_norm": 0.07958984375, "learning_rate": 0.0009608515748955535, - "loss": 0.021, + "loss": 0.0205, "macro_f1": 0.3333333432674408, "num_tokens": 2883333.0, "repeat_count": 0.0, - "routers_loss": 0.0020542226266115904, + "routers_loss": 0.0026695074047893286, "skip_count": 0.0, "step": 1788, "text_loss": 0.9697831273078918 @@ -17003,13 +17003,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.1171875, + "grad_norm": 0.107421875, "learning_rate": 0.000960731426385773, - "loss": 0.0155, + "loss": 0.0157, "macro_f1": 0.4871794879436493, "num_tokens": 2887444.0, "repeat_count": 0.0, - "routers_loss": 0.0397041030228138, + "routers_loss": 0.029743613675236702, "skip_count": 2.0, "step": 1790, "text_loss": 0.4737568199634552 @@ -17022,13 +17022,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.103515625, + "grad_norm": 0.10107421875, "learning_rate": 0.0009606111013228407, - "loss": 0.0204, + "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 2890221.0, "repeat_count": 0.0, - "routers_loss": 0.0017490010941401124, + "routers_loss": 0.0016153788892552257, "skip_count": 0.0, "step": 1792, "text_loss": 0.6693558096885681 @@ -17041,13 +17041,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08251953125, + "grad_norm": 0.08349609375, "learning_rate": 0.0009604905997528655, - "loss": 0.021, + "loss": 0.02, "macro_f1": 0.3272727429866791, "num_tokens": 2893262.0, "repeat_count": 0.0, - "routers_loss": 0.023590171709656715, + "routers_loss": 0.01965433731675148, "skip_count": 1.0, "step": 1794, "text_loss": 0.45227760076522827 @@ -17060,13 +17060,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.103515625, + "grad_norm": 0.08642578125, "learning_rate": 0.0009603699217220239, - "loss": 0.0125, + "loss": 0.0117, "macro_f1": 0.6601307392120361, "num_tokens": 2896823.0, "repeat_count": 1.0, - "routers_loss": 0.02458076737821102, + "routers_loss": 0.024017298594117165, "skip_count": 2.0, "step": 1796, "text_loss": 0.48865509033203125 @@ -17079,13 +17079,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.08837890625, "learning_rate": 0.0009602490672765597, - "loss": 0.019, + "loss": 0.0182, "macro_f1": 0.3333333432674408, "num_tokens": 2899707.0, "repeat_count": 0.0, - "routers_loss": 0.0014341498026624322, + "routers_loss": 0.0012420224957168102, "skip_count": 0.0, "step": 1798, "text_loss": 0.43292415142059326 @@ -17098,13 +17098,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08056640625, + "grad_norm": 0.07861328125, "learning_rate": 0.0009601280364627848, - "loss": 0.02, + "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 2902795.0, "repeat_count": 0.0, - "routers_loss": 0.00213223067112267, + "routers_loss": 0.0020389219280332327, "skip_count": 0.0, "step": 1800, "text_loss": 0.41021591424942017 @@ -17117,13 +17117,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07275390625, + "grad_norm": 0.06689453125, "learning_rate": 0.0009600068293270783, - "loss": 0.0147, + "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 2905769.0, "repeat_count": 0.0, - "routers_loss": 0.0027340995147824287, + "routers_loss": 0.002006303984671831, "skip_count": 0.0, "step": 1802, "text_loss": 0.46892106533050537 @@ -17136,32 +17136,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.08740234375, "learning_rate": 0.000959885445915887, - "loss": 0.0172, + "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 2909475.0, "repeat_count": 0.0, - "routers_loss": 0.0035587961319833994, + "routers_loss": 0.003734810510650277, "skip_count": 0.0, "step": 1804, "text_loss": 0.45364710688591003 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.5, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 8.479013795127678, - "f1_execute": 0.9615384340286255, - "f1_repeat": 0.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09765625, + "grad_norm": 0.11669921875, "learning_rate": 0.0009597638862757254, - "loss": 0.0187, - "macro_f1": 0.5427350401878357, + "loss": 0.0182, + "macro_f1": 0.8823530077934265, "num_tokens": 2914348.0, "repeat_count": 1.0, - "routers_loss": 0.04446055367588997, + "routers_loss": 0.038971323519945145, "skip_count": 2.0, "step": 1806, "text_loss": 0.42913779616355896 @@ -17174,13 +17174,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08447265625, + "grad_norm": 0.080078125, "learning_rate": 0.0009596421504531751, - "loss": 0.0244, + "loss": 0.0249, "macro_f1": 0.3272727429866791, "num_tokens": 2917467.0, "repeat_count": 1.0, - "routers_loss": 0.05095123499631882, + "routers_loss": 0.04800829663872719, "skip_count": 0.0, "step": 1808, "text_loss": 0.17332297563552856 @@ -17193,13 +17193,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.10693359375, + "grad_norm": 0.1083984375, "learning_rate": 0.0009595202384948858, - "loss": 0.0232, + "loss": 0.0227, "macro_f1": 0.6666666865348816, "num_tokens": 2920223.0, "repeat_count": 1.0, - "routers_loss": 0.008440068922936916, + "routers_loss": 0.009164143353700638, "skip_count": 0.0, "step": 1810, "text_loss": 0.33740702271461487 @@ -17212,13 +17212,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0927734375, + "grad_norm": 0.0947265625, "learning_rate": 0.0009593981504475742, - "loss": 0.0273, + "loss": 0.0275, "macro_f1": 0.6666666865348816, "num_tokens": 2923780.0, "repeat_count": 0.0, - "routers_loss": 0.012230116873979568, + "routers_loss": 0.011236993595957756, "skip_count": 2.0, "step": 1812, "text_loss": 0.1609916388988495 @@ -17231,13 +17231,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1005859375, + "grad_norm": 0.10595703125, "learning_rate": 0.0009592758863580248, - "loss": 0.026, + "loss": 0.0259, "macro_f1": 0.5492662787437439, "num_tokens": 2926259.0, "repeat_count": 0.0, - "routers_loss": 0.017307188361883163, + "routers_loss": 0.019026532769203186, "skip_count": 2.0, "step": 1814, "text_loss": 0.6460903882980347 @@ -17250,13 +17250,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.099609375, + "grad_norm": 0.09814453125, "learning_rate": 0.0009591534462730894, - "loss": 0.0215, + "loss": 0.0206, "macro_f1": 0.5492662787437439, "num_tokens": 2929173.0, "repeat_count": 2.0, - "routers_loss": 0.07191162556409836, + "routers_loss": 0.0608333982527256, "skip_count": 0.0, "step": 1816, "text_loss": 0.476126492023468 @@ -17269,13 +17269,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.06640625, "learning_rate": 0.000959030830239687, - "loss": 0.0182, + "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 2932703.0, "repeat_count": 0.0, - "routers_loss": 0.008753604255616665, + "routers_loss": 0.0093300249427557, "skip_count": 0.0, "step": 1818, "text_loss": 0.5471875667572021 @@ -17288,13 +17288,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.19921875, + "grad_norm": 0.2001953125, "learning_rate": 0.0009589080383048048, - "loss": 0.0233, + "loss": 0.0235, "macro_f1": 0.3333333432674408, "num_tokens": 2936195.0, "repeat_count": 0.0, - "routers_loss": 0.008390828967094421, + "routers_loss": 0.010434109717607498, "skip_count": 0.0, "step": 1820, "text_loss": 0.5068115592002869 @@ -17307,13 +17307,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.0986328125, "learning_rate": 0.0009587850705154964, "loss": 0.0291, "macro_f1": 0.3333333432674408, "num_tokens": 2939412.0, "repeat_count": 0.0, - "routers_loss": 0.005617359187453985, + "routers_loss": 0.004347751382738352, "skip_count": 0.0, "step": 1822, "text_loss": 0.4241984784603119 @@ -17326,13 +17326,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.0859375, "learning_rate": 0.0009586619269188836, - "loss": 0.0227, + "loss": 0.0224, "macro_f1": 0.32098767161369324, "num_tokens": 2942318.0, "repeat_count": 0.0, - "routers_loss": 0.0346846878528595, + "routers_loss": 0.034238871186971664, "skip_count": 1.0, "step": 1824, "text_loss": 0.2328975349664688 @@ -17345,32 +17345,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.11181640625, "learning_rate": 0.0009585386075621553, "loss": 0.027, "macro_f1": 0.3333333432674408, "num_tokens": 2945731.0, "repeat_count": 0.0, - "routers_loss": 0.006601692643016577, + "routers_loss": 0.006097695790231228, "skip_count": 0.0, "step": 1826, "text_loss": 0.22816994786262512 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 8.582330496037569, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.08837890625, + "f1_skip": 0.0, + "grad_norm": 0.0908203125, "learning_rate": 0.0009584151124925676, - "loss": 0.0207, - "macro_f1": 0.6666666865348816, + "loss": 0.0208, + "macro_f1": 0.3272727429866791, "num_tokens": 2948944.0, "repeat_count": 0.0, - "routers_loss": 0.0065619745291769505, + "routers_loss": 0.007790776435285807, "skip_count": 1.0, "step": 1828, "text_loss": 0.5009413361549377 @@ -17383,13 +17383,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.07275390625, "learning_rate": 0.0009582914417574438, - "loss": 0.0149, + "loss": 0.0145, "macro_f1": 0.6666666865348816, "num_tokens": 2951723.0, "repeat_count": 0.0, - "routers_loss": 0.011109639890491962, + "routers_loss": 0.009144559502601624, "skip_count": 2.0, "step": 1830, "text_loss": 0.1402502954006195 @@ -17404,11 +17404,11 @@ "f1_skip": 0.0, "grad_norm": 0.06201171875, "learning_rate": 0.0009581675954041751, - "loss": 0.0167, + "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 2954726.0, "repeat_count": 1.0, - "routers_loss": 0.008432094007730484, + "routers_loss": 0.006593191530555487, "skip_count": 0.0, "step": 1832, "text_loss": 0.4871736466884613 @@ -17421,13 +17421,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0859375, + "grad_norm": 0.0869140625, "learning_rate": 0.0009580435734802196, - "loss": 0.0208, + "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 2957853.0, "repeat_count": 0.0, - "routers_loss": 0.011518111452460289, + "routers_loss": 0.01241068821400404, "skip_count": 0.0, "step": 1834, "text_loss": 0.30100154876708984 @@ -17440,13 +17440,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.146484375, + "grad_norm": 0.1298828125, "learning_rate": 0.0009579193760331027, - "loss": 0.0211, + "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 2960783.0, "repeat_count": 0.0, - "routers_loss": 0.0026744187343865633, + "routers_loss": 0.002219218760728836, "skip_count": 0.0, "step": 1836, "text_loss": 0.4961516559123993 @@ -17459,13 +17459,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.09619140625, + "grad_norm": 0.12255859375, "learning_rate": 0.0009577950031104169, - "loss": 0.0165, + "loss": 0.0166, "macro_f1": 0.6601307392120361, "num_tokens": 2963328.0, "repeat_count": 1.0, - "routers_loss": 0.028107430785894394, + "routers_loss": 0.029363535344600677, "skip_count": 2.0, "step": 1838, "text_loss": 0.42814353108406067 @@ -17478,13 +17478,13 @@ "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.095703125, + "grad_norm": 0.1044921875, "learning_rate": 0.0009576704547598226, - "loss": 0.0263, + "loss": 0.0257, "macro_f1": 0.7795917987823486, "num_tokens": 2966108.0, "repeat_count": 1.0, - "routers_loss": 0.060007549822330475, + "routers_loss": 0.0579402856528759, "skip_count": 4.0, "step": 1840, "text_loss": 0.20523512363433838 @@ -17497,13 +17497,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.0625, "learning_rate": 0.0009575457310290463, "loss": 0.0121, "macro_f1": 0.3272727429866791, "num_tokens": 2969137.0, "repeat_count": 0.0, - "routers_loss": 0.01074182614684105, + "routers_loss": 0.008810589089989662, "skip_count": 0.0, "step": 1842, "text_loss": 0.6199528574943542 @@ -17516,13 +17516,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0732421875, + "grad_norm": 0.0693359375, "learning_rate": 0.0009574208319658831, - "loss": 0.0213, + "loss": 0.0208, "macro_f1": 0.6666666865348816, "num_tokens": 2972407.0, "repeat_count": 0.0, - "routers_loss": 0.0019638657104223967, + "routers_loss": 0.0012295129708945751, "skip_count": 1.0, "step": 1844, "text_loss": 0.66938316822052 @@ -17535,13 +17535,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.1572265625, + "grad_norm": 0.1474609375, "learning_rate": 0.000957295757618194, - "loss": 0.0156, + "loss": 0.0152, "macro_f1": 0.4871794879436493, "num_tokens": 2976045.0, "repeat_count": 0.0, - "routers_loss": 0.06953249871730804, + "routers_loss": 0.06162935495376587, "skip_count": 2.0, "step": 1846, "text_loss": 0.5381782650947571 @@ -17554,13 +17554,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.0830078125, "learning_rate": 0.0009571705080339079, - "loss": 0.0154, + "loss": 0.0144, "macro_f1": 0.3333333432674408, "num_tokens": 2979025.0, "repeat_count": 0.0, - "routers_loss": 0.003563052974641323, + "routers_loss": 0.003950524143874645, "skip_count": 0.0, "step": 1848, "text_loss": 0.5831671357154846 @@ -17573,13 +17573,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.11376953125, "learning_rate": 0.0009570450832610208, - "loss": 0.0216, + "loss": 0.0209, "macro_f1": 0.3333333432674408, "num_tokens": 2982276.0, "repeat_count": 0.0, - "routers_loss": 0.010409255512058735, + "routers_loss": 0.010354886762797832, "skip_count": 0.0, "step": 1850, "text_loss": 0.27448201179504395 @@ -17592,13 +17592,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0625, + "grad_norm": 0.061279296875, "learning_rate": 0.0009569194833475956, - "loss": 0.0195, + "loss": 0.0199, "macro_f1": 0.3272727429866791, "num_tokens": 2985691.0, "repeat_count": 0.0, - "routers_loss": 0.009769548662006855, + "routers_loss": 0.010167439468204975, "skip_count": 0.0, "step": 1852, "text_loss": 0.5264663696289062 @@ -17611,13 +17611,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1181640625, + "grad_norm": 0.1328125, "learning_rate": 0.0009567937083417624, - "loss": 0.0184, + "loss": 0.0194, "macro_f1": 0.3272727429866791, "num_tokens": 2989126.0, "repeat_count": 0.0, - "routers_loss": 0.036616452038288116, + "routers_loss": 0.0371871180832386, "skip_count": 1.0, "step": 1854, "text_loss": 0.2008018046617508 @@ -17630,13 +17630,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.0673828125, "learning_rate": 0.0009566677582917185, - "loss": 0.0192, + "loss": 0.0184, "macro_f1": 0.3333333432674408, "num_tokens": 2992814.0, "repeat_count": 0.0, - "routers_loss": 0.009581349790096283, + "routers_loss": 0.010190588422119617, "skip_count": 0.0, "step": 1856, "text_loss": 0.749717116355896 @@ -17649,13 +17649,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09814453125, + "grad_norm": 0.080078125, "learning_rate": 0.0009565416332457282, - "loss": 0.0138, + "loss": 0.0132, "macro_f1": 0.6538461446762085, "num_tokens": 2995729.0, "repeat_count": 1.0, - "routers_loss": 0.02330300398170948, + "routers_loss": 0.022285036742687225, "skip_count": 1.0, "step": 1858, "text_loss": 0.5870219469070435 @@ -17668,13 +17668,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0771484375, + "grad_norm": 0.07666015625, "learning_rate": 0.0009564153332521228, - "loss": 0.0226, + "loss": 0.0224, "macro_f1": 0.3272727429866791, "num_tokens": 2998812.0, "repeat_count": 0.0, - "routers_loss": 0.011985735036432743, + "routers_loss": 0.011050296947360039, "skip_count": 1.0, "step": 1860, "text_loss": 0.8444408774375916 @@ -17687,13 +17687,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0634765625, + "grad_norm": 0.06005859375, "learning_rate": 0.0009562888583593005, - "loss": 0.0162, + "loss": 0.0163, "macro_f1": 0.3333333432674408, "num_tokens": 3001799.0, "repeat_count": 0.0, - "routers_loss": 0.005997250322252512, + "routers_loss": 0.007125461008399725, "skip_count": 0.0, "step": 1862, "text_loss": 0.41510361433029175 @@ -17706,13 +17706,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.06884765625, "learning_rate": 0.0009561622086157272, - "loss": 0.0243, + "loss": 0.0236, "macro_f1": 0.3333333432674408, "num_tokens": 3005088.0, "repeat_count": 0.0, - "routers_loss": 0.004814761225134134, + "routers_loss": 0.0049054501578211784, "skip_count": 0.0, "step": 1864, "text_loss": 0.3801248073577881 @@ -17725,13 +17725,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.056884765625, + "grad_norm": 0.054443359375, "learning_rate": 0.000956035384069935, - "loss": 0.0242, + "loss": 0.0238, "macro_f1": 1.0, "num_tokens": 3008178.0, "repeat_count": 1.0, - "routers_loss": 0.004750931169837713, + "routers_loss": 0.005162427201867104, "skip_count": 1.0, "step": 1866, "text_loss": 0.2687684893608093 @@ -17744,13 +17744,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1123046875, + "grad_norm": 0.10400390625, "learning_rate": 0.0009559083847705233, - "loss": 0.0216, + "loss": 0.0214, "macro_f1": 0.3272727429866791, "num_tokens": 3010923.0, "repeat_count": 0.0, - "routers_loss": 0.038251202553510666, + "routers_loss": 0.028984658420085907, "skip_count": 1.0, "step": 1868, "text_loss": 0.6277349591255188 @@ -17763,13 +17763,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.06640625, + "grad_norm": 0.08349609375, "learning_rate": 0.0009557812107661584, - "loss": 0.0204, + "loss": 0.0208, "macro_f1": 1.0, "num_tokens": 3015030.0, "repeat_count": 1.0, - "routers_loss": 0.010951942764222622, + "routers_loss": 0.012200530618429184, "skip_count": 1.0, "step": 1870, "text_loss": 0.6293368339538574 @@ -17782,13 +17782,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.11962890625, "learning_rate": 0.0009556538621055739, - "loss": 0.0265, + "loss": 0.0268, "macro_f1": 0.3272727429866791, "num_tokens": 3019067.0, "repeat_count": 0.0, - "routers_loss": 0.06582094728946686, + "routers_loss": 0.06365182995796204, "skip_count": 1.0, "step": 1872, "text_loss": 0.39046618342399597 @@ -17796,18 +17796,18 @@ { "acc_repeat": 0.0, "acc_skip": 1.0, - "avg_layers": 26.0, + "avg_layers": 27.0, "epoch": 8.798356325212797, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.12353515625, + "f1_skip": 1.0, + "grad_norm": 0.115234375, "learning_rate": 0.0009555263388375699, - "loss": 0.0143, - "macro_f1": 0.5492662787437439, + "loss": 0.014, + "macro_f1": 0.6666666865348816, "num_tokens": 3022166.0, "repeat_count": 0.0, - "routers_loss": 0.008920271880924702, + "routers_loss": 0.0041703456081449986, "skip_count": 1.0, "step": 1874, "text_loss": 0.42232340574264526 @@ -17820,13 +17820,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1220703125, + "grad_norm": 0.11572265625, "learning_rate": 0.0009553986410110134, "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3025865.0, "repeat_count": 0.0, - "routers_loss": 0.006444344762712717, + "routers_loss": 0.005841755773872137, "skip_count": 0.0, "step": 1876, "text_loss": 0.37600573897361755 @@ -17839,13 +17839,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.09228515625, "learning_rate": 0.0009552707686748388, - "loss": 0.022, + "loss": 0.0219, "macro_f1": 0.3272727429866791, "num_tokens": 3029950.0, "repeat_count": 0.0, - "routers_loss": 0.05197767913341522, + "routers_loss": 0.05165952071547508, "skip_count": 1.0, "step": 1878, "text_loss": 0.33717799186706543 @@ -17858,13 +17858,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.0849609375, "learning_rate": 0.0009551427218780467, - "loss": 0.0224, + "loss": 0.0219, "macro_f1": 0.6666666865348816, "num_tokens": 3033649.0, "repeat_count": 0.0, - "routers_loss": 0.017570581287145615, + "routers_loss": 0.020680008456110954, "skip_count": 2.0, "step": 1880, "text_loss": 0.5011783838272095 @@ -17877,13 +17877,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.173828125, + "grad_norm": 0.15625, "learning_rate": 0.0009550145006697048, - "loss": 0.0225, + "loss": 0.0217, "macro_f1": 0.32098764181137085, "num_tokens": 3036847.0, "repeat_count": 0.0, - "routers_loss": 0.07106777280569077, + "routers_loss": 0.07626450061798096, "skip_count": 2.0, "step": 1882, "text_loss": 0.3066408336162567 @@ -17896,13 +17896,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.056396484375, "learning_rate": 0.0009548861050989482, - "loss": 0.0139, + "loss": 0.0136, "macro_f1": 1.0, "num_tokens": 3040353.0, "repeat_count": 1.0, - "routers_loss": 0.009862381964921951, + "routers_loss": 0.010884666815400124, "skip_count": 1.0, "step": 1884, "text_loss": 0.49779415130615234 @@ -17915,13 +17915,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0908203125, "learning_rate": 0.0009547575352149778, - "loss": 0.0209, + "loss": 0.0213, "macro_f1": 0.6666666865348816, "num_tokens": 3043504.0, "repeat_count": 0.0, - "routers_loss": 0.006928981747478247, + "routers_loss": 0.006704333238303661, "skip_count": 2.0, "step": 1886, "text_loss": 0.12284614145755768 @@ -17934,13 +17934,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09423828125, + "grad_norm": 0.11474609375, "learning_rate": 0.0009546287910670621, "loss": 0.0211, "macro_f1": 0.5427350401878357, "num_tokens": 3046422.0, "repeat_count": 1.0, - "routers_loss": 0.04788029566407204, + "routers_loss": 0.04799000173807144, "skip_count": 2.0, "step": 1888, "text_loss": 0.1824081838130951 @@ -17953,13 +17953,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1357421875, + "grad_norm": 0.1484375, "learning_rate": 0.0009544998727045361, - "loss": 0.0299, + "loss": 0.0306, "macro_f1": 0.3333333432674408, "num_tokens": 3049819.0, "repeat_count": 0.0, - "routers_loss": 0.008282946422696114, + "routers_loss": 0.008139612153172493, "skip_count": 0.0, "step": 1890, "text_loss": 0.18929053843021393 @@ -17972,32 +17972,32 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.09716796875, + "grad_norm": 0.09375, "learning_rate": 0.0009543707801768015, - "loss": 0.0181, + "loss": 0.0175, "macro_f1": 0.5934640765190125, "num_tokens": 3052766.0, "repeat_count": 0.0, - "routers_loss": 0.03251546248793602, + "routers_loss": 0.02966771461069584, "skip_count": 3.0, "step": 1892, "text_loss": 0.247748002409935 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 24.0, + "acc_skip": 0.5, + "avg_layers": 25.0, "epoch": 8.892280598767243, - "f1_execute": 0.9600000381469727, + "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.06640625, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.06689453125, "learning_rate": 0.0009542415135333267, - "loss": 0.0195, - "macro_f1": 0.542222261428833, + "loss": 0.0193, + "macro_f1": 0.44705885648727417, "num_tokens": 3056427.0, "repeat_count": 0.0, - "routers_loss": 0.03368280455470085, + "routers_loss": 0.03637036308646202, "skip_count": 2.0, "step": 1894, "text_loss": 0.2583999037742615 @@ -18010,13 +18010,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06640625, + "grad_norm": 0.0595703125, "learning_rate": 0.0009541120728236472, - "loss": 0.0133, + "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 3059497.0, "repeat_count": 0.0, - "routers_loss": 0.0069940583780407906, + "routers_loss": 0.007026574574410915, "skip_count": 0.0, "step": 1896, "text_loss": 0.5222375988960266 @@ -18029,13 +18029,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0810546875, + "grad_norm": 0.076171875, "learning_rate": 0.0009539824580973646, - "loss": 0.0221, + "loss": 0.0219, "macro_f1": 0.3333333432674408, "num_tokens": 3062187.0, "repeat_count": 0.0, - "routers_loss": 0.004268508404493332, + "routers_loss": 0.003449335927143693, "skip_count": 0.0, "step": 1898, "text_loss": 0.5736427307128906 @@ -18048,13 +18048,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.05224609375, "learning_rate": 0.0009538526694041477, - "loss": 0.0159, + "loss": 0.0163, "macro_f1": 0.3333333432674408, "num_tokens": 3066100.0, "repeat_count": 0.0, - "routers_loss": 0.0032616283278912306, + "routers_loss": 0.0035463871899992228, "skip_count": 0.0, "step": 1900, "text_loss": 0.5471583604812622 @@ -18067,13 +18067,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.08056640625, + "grad_norm": 0.080078125, "learning_rate": 0.0009537227067937318, - "loss": 0.023, + "loss": 0.0233, "macro_f1": 1.0, "num_tokens": 3068737.0, "repeat_count": 3.0, - "routers_loss": 0.005389219615608454, + "routers_loss": 0.00597514258697629, "skip_count": 3.0, "step": 1902, "text_loss": 0.36644190549850464 @@ -18086,13 +18086,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.166015625, "learning_rate": 0.0009535925703159186, - "loss": 0.0311, + "loss": 0.0301, "macro_f1": 0.32098764181137085, "num_tokens": 3071686.0, "repeat_count": 0.0, - "routers_loss": 0.024814991280436516, + "routers_loss": 0.025420479476451874, "skip_count": 2.0, "step": 1904, "text_loss": 0.535789966583252 @@ -18107,11 +18107,11 @@ "f1_skip": 0.0, "grad_norm": 0.07568359375, "learning_rate": 0.0009534622600205769, - "loss": 0.0151, + "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 3074954.0, "repeat_count": 0.0, - "routers_loss": 0.013415839523077011, + "routers_loss": 0.014377486892044544, "skip_count": 0.0, "step": 1906, "text_loss": 0.19009549915790558 @@ -18124,13 +18124,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.115234375, + "grad_norm": 0.11083984375, "learning_rate": 0.0009533317759576416, - "loss": 0.019, + "loss": 0.0197, "macro_f1": 0.3333333432674408, "num_tokens": 3077540.0, "repeat_count": 0.0, - "routers_loss": 0.005814475007355213, + "routers_loss": 0.004848944488912821, "skip_count": 0.0, "step": 1908, "text_loss": 0.5022001266479492 @@ -18143,13 +18143,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0732421875, + "grad_norm": 0.07470703125, "learning_rate": 0.0009532011181771148, - "loss": 0.0218, + "loss": 0.0217, "macro_f1": 0.6666666865348816, "num_tokens": 3080445.0, "repeat_count": 0.0, - "routers_loss": 0.007621586322784424, + "routers_loss": 0.009480170905590057, "skip_count": 2.0, "step": 1910, "text_loss": 0.35135936737060547 @@ -18162,13 +18162,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09765625, + "grad_norm": 0.10400390625, "learning_rate": 0.0009530702867290644, - "loss": 0.0178, + "loss": 0.0185, "macro_f1": 0.3333333432674408, "num_tokens": 3083657.0, "repeat_count": 0.0, - "routers_loss": 0.0020917020738124847, + "routers_loss": 0.0019353039097040892, "skip_count": 0.0, "step": 1912, "text_loss": 0.5123994946479797 @@ -18181,13 +18181,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.123046875, + "grad_norm": 0.1455078125, "learning_rate": 0.0009529392816636256, - "loss": 0.025, + "loss": 0.0249, "macro_f1": 0.3333333432674408, "num_tokens": 3086837.0, "repeat_count": 0.0, - "routers_loss": 0.0010824954370036721, + "routers_loss": 0.0010921972570940852, "skip_count": 0.0, "step": 1914, "text_loss": 0.44477662444114685 @@ -18200,13 +18200,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1884765625, + "grad_norm": 0.19140625, "learning_rate": 0.0009528081030309995, - "loss": 0.0353, + "loss": 0.0351, "macro_f1": 0.3333333432674408, "num_tokens": 3089892.0, "repeat_count": 0.0, - "routers_loss": 0.0018075350672006607, + "routers_loss": 0.0018027103506028652, "skip_count": 0.0, "step": 1916, "text_loss": 0.7356183528900146 @@ -18219,13 +18219,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07958984375, + "grad_norm": 0.07568359375, "learning_rate": 0.0009526767508814542, - "loss": 0.0235, + "loss": 0.0236, "macro_f1": 0.3333333432674408, "num_tokens": 3093058.0, "repeat_count": 0.0, - "routers_loss": 0.0032930250745266676, + "routers_loss": 0.003243023296818137, "skip_count": 0.0, "step": 1918, "text_loss": 0.48823556303977966 @@ -18238,13 +18238,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08837890625, + "grad_norm": 0.080078125, "learning_rate": 0.0009525452252653239, - "loss": 0.0184, + "loss": 0.0175, "macro_f1": 0.3333333432674408, "num_tokens": 3096404.0, "repeat_count": 0.0, - "routers_loss": 0.009042349644005299, + "routers_loss": 0.009360014460980892, "skip_count": 0.0, "step": 1920, "text_loss": 0.21498437225818634 @@ -18257,13 +18257,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.1103515625, + "grad_norm": 0.140625, "learning_rate": 0.0009524135262330098, - "loss": 0.022, + "loss": 0.0224, "macro_f1": 0.9265305995941162, "num_tokens": 3099520.0, "repeat_count": 1.0, - "routers_loss": 0.016776500269770622, + "routers_loss": 0.017444295808672905, "skip_count": 3.0, "step": 1922, "text_loss": 0.27608850598335266 @@ -18276,13 +18276,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05029296875, + "grad_norm": 0.050537109375, "learning_rate": 0.0009522816538349789, - "loss": 0.016, + "loss": 0.0162, "macro_f1": 0.5492662787437439, "num_tokens": 3102956.0, "repeat_count": 0.0, - "routers_loss": 0.06579705327749252, + "routers_loss": 0.06424452364444733, "skip_count": 2.0, "step": 1924, "text_loss": 0.21558666229248047 @@ -18295,13 +18295,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.058349609375, + "grad_norm": 0.05224609375, "learning_rate": 0.0009521496081217651, - "loss": 0.0113, + "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 3106565.0, "repeat_count": 1.0, - "routers_loss": 0.0022786022163927555, + "routers_loss": 0.002270506462082267, "skip_count": 0.0, "step": 1926, "text_loss": 0.5641813278198242 @@ -18314,13 +18314,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.09033203125, + "grad_norm": 0.095703125, "learning_rate": 0.0009520173891439684, "loss": 0.0216, "macro_f1": 0.6666666865348816, "num_tokens": 3109314.0, "repeat_count": 0.0, - "routers_loss": 0.01074281521141529, + "routers_loss": 0.011512448079884052, "skip_count": 1.0, "step": 1928, "text_loss": 0.6351624727249146 @@ -18333,13 +18333,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.0830078125, "learning_rate": 0.0009518849969522556, - "loss": 0.0201, + "loss": 0.0198, "macro_f1": 0.3333333432674408, "num_tokens": 3112956.0, "repeat_count": 0.0, - "routers_loss": 0.0032052614260464907, + "routers_loss": 0.003883908037096262, "skip_count": 0.0, "step": 1930, "text_loss": 0.35160085558891296 @@ -18352,32 +18352,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.10205078125, + "grad_norm": 0.10888671875, "learning_rate": 0.0009517524315973595, - "loss": 0.0186, + "loss": 0.019, "macro_f1": 1.0, "num_tokens": 3115593.0, "repeat_count": 1.0, - "routers_loss": 0.008593574166297913, + "routers_loss": 0.009479222819209099, "skip_count": 3.0, "step": 1932, "text_loss": 0.2900560200214386 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 9.079835632521279, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.07373046875, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, "learning_rate": 0.0009516196931300794, - "loss": 0.0152, - "macro_f1": 0.5492662787437439, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, "num_tokens": 3118516.0, "repeat_count": 0.0, - "routers_loss": 0.0201246440410614, + "routers_loss": 0.017834696918725967, "skip_count": 2.0, "step": 1934, "text_loss": 0.20094378292560577 @@ -18390,13 +18390,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1357421875, + "grad_norm": 0.12890625, "learning_rate": 0.0009514867816012809, - "loss": 0.0199, + "loss": 0.02, "macro_f1": 0.3333333432674408, "num_tokens": 3122242.0, "repeat_count": 0.0, - "routers_loss": 0.001721356064081192, + "routers_loss": 0.0017964740982279181, "skip_count": 0.0, "step": 1936, "text_loss": 0.6498590707778931 @@ -18409,13 +18409,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.049072265625, + "grad_norm": 0.048828125, "learning_rate": 0.0009513536970618961, - "loss": 0.0135, + "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 3125645.0, "repeat_count": 0.0, - "routers_loss": 0.010442634113132954, + "routers_loss": 0.007437168620526791, "skip_count": 2.0, "step": 1938, "text_loss": 0.25863033533096313 @@ -18428,13 +18428,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.058349609375, + "grad_norm": 0.0625, "learning_rate": 0.0009512204395629232, - "loss": 0.019, + "loss": 0.0184, "macro_f1": 0.6666666865348816, "num_tokens": 3128740.0, "repeat_count": 0.0, - "routers_loss": 0.0009493798715993762, + "routers_loss": 0.0008759932243265212, "skip_count": 1.0, "step": 1940, "text_loss": 0.5638351440429688 @@ -18447,13 +18447,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05517578125, + "grad_norm": 0.06884765625, "learning_rate": 0.0009510870091554264, - "loss": 0.0149, + "loss": 0.0153, "macro_f1": 0.3272727429866791, "num_tokens": 3131742.0, "repeat_count": 1.0, - "routers_loss": 0.022104881703853607, + "routers_loss": 0.019906625151634216, "skip_count": 0.0, "step": 1942, "text_loss": 0.8410717844963074 @@ -18466,13 +18466,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10400390625, + "grad_norm": 0.12255859375, "learning_rate": 0.0009509534058905369, - "loss": 0.0164, + "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3134407.0, "repeat_count": 0.0, - "routers_loss": 0.0009013625676743686, + "routers_loss": 0.0009229081333614886, "skip_count": 0.0, "step": 1944, "text_loss": 0.47506049275398254 @@ -18485,13 +18485,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06103515625, + "grad_norm": 0.0576171875, "learning_rate": 0.0009508196298194517, - "loss": 0.0121, + "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3137053.0, "repeat_count": 0.0, - "routers_loss": 0.0028069843538105488, + "routers_loss": 0.003630586201325059, "skip_count": 0.0, "step": 1946, "text_loss": 0.32225799560546875 @@ -18504,13 +18504,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.08349609375, "learning_rate": 0.0009506856809934338, - "loss": 0.0116, + "loss": 0.0119, "macro_f1": 0.3333333432674408, "num_tokens": 3140943.0, "repeat_count": 0.0, - "routers_loss": 0.006877045147120953, + "routers_loss": 0.007580445148050785, "skip_count": 0.0, "step": 1948, "text_loss": 0.3120577931404114 @@ -18523,13 +18523,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.050048828125, "learning_rate": 0.0009505515594638127, - "loss": 0.0127, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 3144298.0, "repeat_count": 0.0, - "routers_loss": 0.004543667659163475, + "routers_loss": 0.004471861757338047, "skip_count": 0.0, "step": 1950, "text_loss": 0.22052447497844696 @@ -18542,13 +18542,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.09130859375, "learning_rate": 0.0009504172652819843, - "loss": 0.0232, + "loss": 0.023, "macro_f1": 1.0, "num_tokens": 3147069.0, "repeat_count": 1.0, - "routers_loss": 0.007053609937429428, + "routers_loss": 0.009606664068996906, "skip_count": 1.0, "step": 1952, "text_loss": 0.34773921966552734 @@ -18561,13 +18561,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0537109375, + "grad_norm": 0.0625, "learning_rate": 0.0009502827984994099, - "loss": 0.0146, + "loss": 0.0148, "macro_f1": 0.6666666865348816, "num_tokens": 3149992.0, "repeat_count": 0.0, - "routers_loss": 0.006783280987292528, + "routers_loss": 0.006443799939006567, "skip_count": 1.0, "step": 1954, "text_loss": 0.6442171335220337 @@ -18580,13 +18580,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06640625, + "grad_norm": 0.0673828125, "learning_rate": 0.0009501481591676177, - "loss": 0.0181, + "loss": 0.0188, "macro_f1": 0.3333333432674408, "num_tokens": 3153167.0, "repeat_count": 0.0, - "routers_loss": 0.002531677018851042, + "routers_loss": 0.003219039412215352, "skip_count": 0.0, "step": 1956, "text_loss": 0.43369221687316895 @@ -18599,32 +18599,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.078125, + "grad_norm": 0.07470703125, "learning_rate": 0.000950013347338202, - "loss": 0.0154, + "loss": 0.0152, "macro_f1": 0.3272727429866791, "num_tokens": 3156590.0, "repeat_count": 0.0, - "routers_loss": 0.027040868997573853, + "routers_loss": 0.025551019236445427, "skip_count": 1.0, "step": 1958, "text_loss": 0.294479101896286 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 1.0, - "avg_layers": 26.0, + "avg_layers": 27.0, "epoch": 9.201937188142061, - "f1_execute": 0.9803921580314636, - "f1_repeat": 0.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.1142578125, + "grad_norm": 0.1630859375, "learning_rate": 0.0009498783630628225, - "loss": 0.0154, - "macro_f1": 0.6601307392120361, + "loss": 0.0158, + "macro_f1": 1.0, "num_tokens": 3159451.0, "repeat_count": 1.0, - "routers_loss": 0.01573321223258972, + "routers_loss": 0.013802438974380493, "skip_count": 2.0, "step": 1960, "text_loss": 0.20888492465019226 @@ -18637,13 +18637,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06689453125, + "grad_norm": 0.07666015625, "learning_rate": 0.0009497432063932057, - "loss": 0.0135, + "loss": 0.0137, "macro_f1": 0.6601307392120361, "num_tokens": 3162889.0, "repeat_count": 1.0, - "routers_loss": 0.02442278526723385, + "routers_loss": 0.02852988988161087, "skip_count": 2.0, "step": 1962, "text_loss": 0.5027125477790833 @@ -18656,13 +18656,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.046630859375, + "grad_norm": 0.045166015625, "learning_rate": 0.0009496078773811437, - "loss": 0.0142, + "loss": 0.0136, "macro_f1": 0.6666666865348816, "num_tokens": 3165979.0, "repeat_count": 0.0, - "routers_loss": 0.018267054110765457, + "routers_loss": 0.01784522272646427, "skip_count": 2.0, "step": 1964, "text_loss": 0.1696339100599289 @@ -18675,13 +18675,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0625, + "grad_norm": 0.060302734375, "learning_rate": 0.000949472376078495, - "loss": 0.0162, + "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3168683.0, "repeat_count": 0.0, - "routers_loss": 0.0016024474753066897, + "routers_loss": 0.0017019887454807758, "skip_count": 0.0, "step": 1966, "text_loss": 0.48905447125434875 @@ -18694,13 +18694,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.052978515625, + "grad_norm": 0.051025390625, "learning_rate": 0.000949336702537184, - "loss": 0.011, + "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 3171968.0, "repeat_count": 0.0, - "routers_loss": 0.004668849054723978, + "routers_loss": 0.004817947279661894, "skip_count": 2.0, "step": 1968, "text_loss": 0.20984773337841034 @@ -18713,13 +18713,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.05419921875, "learning_rate": 0.0009492008568092007, - "loss": 0.0098, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 3175947.0, "repeat_count": 0.0, - "routers_loss": 0.0011657609138637781, + "routers_loss": 0.0012963006738573313, "skip_count": 0.0, "step": 1970, "text_loss": 0.5215106010437012 @@ -18732,13 +18732,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.04248046875, + "grad_norm": 0.044921875, "learning_rate": 0.0009490648389466019, - "loss": 0.0133, + "loss": 0.0135, "macro_f1": 0.4871794879436493, "num_tokens": 3179348.0, "repeat_count": 0.0, - "routers_loss": 0.03806794434785843, + "routers_loss": 0.03950481489300728, "skip_count": 2.0, "step": 1972, "text_loss": 0.24640929698944092 @@ -18751,13 +18751,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08837890625, + "grad_norm": 0.09326171875, "learning_rate": 0.0009489286490015097, - "loss": 0.0189, + "loss": 0.0183, "macro_f1": 0.6666666865348816, "num_tokens": 3182640.0, "repeat_count": 0.0, - "routers_loss": 0.005107097327709198, + "routers_loss": 0.0043345349840819836, "skip_count": 2.0, "step": 1974, "text_loss": 0.6362852454185486 @@ -18770,13 +18770,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.078125, + "grad_norm": 0.07958984375, "learning_rate": 0.0009487922870261122, - "loss": 0.0156, + "loss": 0.0155, "macro_f1": 0.3333333432674408, "num_tokens": 3185657.0, "repeat_count": 0.0, - "routers_loss": 0.0013696947135031223, + "routers_loss": 0.0015687479171901941, "skip_count": 0.0, "step": 1976, "text_loss": 0.8977144360542297 @@ -18789,13 +18789,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0634765625, + "grad_norm": 0.061279296875, "learning_rate": 0.0009486557530726638, - "loss": 0.0136, + "loss": 0.0139, "macro_f1": 0.3333333432674408, "num_tokens": 3188772.0, "repeat_count": 0.0, - "routers_loss": 0.0012224154779687524, + "routers_loss": 0.0010977238416671753, "skip_count": 0.0, "step": 1978, "text_loss": 0.38512736558914185 @@ -18808,13 +18808,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09423828125, + "grad_norm": 0.11279296875, "learning_rate": 0.0009485190471934844, "loss": 0.0196, "macro_f1": 0.6666666865348816, "num_tokens": 3193131.0, "repeat_count": 2.0, - "routers_loss": 0.0030119111761450768, + "routers_loss": 0.002264744369313121, "skip_count": 0.0, "step": 1980, "text_loss": 0.4171289801597595 @@ -18827,13 +18827,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.12451171875, + "grad_norm": 0.09033203125, "learning_rate": 0.00094838216944096, - "loss": 0.0222, + "loss": 0.0219, "macro_f1": 0.3272727429866791, "num_tokens": 3196668.0, "repeat_count": 0.0, - "routers_loss": 0.04286033287644386, + "routers_loss": 0.042320676147937775, "skip_count": 1.0, "step": 1982, "text_loss": 0.19008000195026398 @@ -18846,32 +18846,32 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.053466796875, + "grad_norm": 0.052490234375, "learning_rate": 0.0009482451198675424, - "loss": 0.0158, + "loss": 0.0151, "macro_f1": 0.32098767161369324, "num_tokens": 3200282.0, "repeat_count": 0.0, - "routers_loss": 0.019988590851426125, + "routers_loss": 0.01796630397439003, "skip_count": 1.0, "step": 1984, "text_loss": 0.5009249448776245 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 9.324038743762841, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.0634765625, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, "learning_rate": 0.0009481078985257494, - "loss": 0.0154, - "macro_f1": 0.3272727429866791, + "loss": 0.0147, + "macro_f1": 0.6666666865348816, "num_tokens": 3204439.0, "repeat_count": 0.0, - "routers_loss": 0.012215938419103622, + "routers_loss": 0.01052347756922245, "skip_count": 1.0, "step": 1986, "text_loss": 0.15319275856018066 @@ -18884,13 +18884,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07861328125, + "grad_norm": 0.0732421875, "learning_rate": 0.0009479705054681644, - "loss": 0.0149, + "loss": 0.015, "macro_f1": 0.3076923191547394, "num_tokens": 3207590.0, "repeat_count": 1.0, - "routers_loss": 0.10747655481100082, + "routers_loss": 0.09640293568372726, "skip_count": 3.0, "step": 1988, "text_loss": 0.3654652535915375 @@ -18903,13 +18903,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.068359375, + "grad_norm": 0.06689453125, "learning_rate": 0.0009478329407474366, - "loss": 0.0186, + "loss": 0.0183, "macro_f1": 0.5492662787437439, "num_tokens": 3211172.0, "repeat_count": 0.0, - "routers_loss": 0.016109853982925415, + "routers_loss": 0.012670112773776054, "skip_count": 1.0, "step": 1990, "text_loss": 0.5817596316337585 @@ -18922,13 +18922,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.05859375, "learning_rate": 0.000947695204416281, - "loss": 0.0116, + "loss": 0.0121, "macro_f1": 0.6666666865348816, "num_tokens": 3214050.0, "repeat_count": 1.0, - "routers_loss": 0.006929324474185705, + "routers_loss": 0.005263707600533962, "skip_count": 0.0, "step": 1992, "text_loss": 0.5985888242721558 @@ -18941,13 +18941,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.0634765625, "learning_rate": 0.0009475572965274787, - "loss": 0.0147, + "loss": 0.0144, "macro_f1": 0.3272727429866791, "num_tokens": 3217318.0, "repeat_count": 1.0, - "routers_loss": 0.0715102106332779, + "routers_loss": 0.0682850033044815, "skip_count": 0.0, "step": 1994, "text_loss": 0.316506564617157 @@ -18960,13 +18960,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.052490234375, + "grad_norm": 0.0595703125, "learning_rate": 0.000947419217133876, - "loss": 0.0187, + "loss": 0.019, "macro_f1": 0.6666666865348816, "num_tokens": 3220012.0, "repeat_count": 0.0, - "routers_loss": 0.008499355986714363, + "routers_loss": 0.008508823812007904, "skip_count": 2.0, "step": 1996, "text_loss": 0.09665893763303757 @@ -18979,13 +18979,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.048583984375, + "grad_norm": 0.053466796875, "learning_rate": 0.0009472809662883852, - "loss": 0.0162, + "loss": 0.0155, "macro_f1": 1.0, "num_tokens": 3223019.0, "repeat_count": 1.0, - "routers_loss": 0.012003371492028236, + "routers_loss": 0.01100847590714693, "skip_count": 2.0, "step": 1998, "text_loss": 0.4938808083534241 @@ -18998,13 +18998,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.0625, + "grad_norm": 0.06396484375, "learning_rate": 0.0009471425440439844, - "loss": 0.0137, + "loss": 0.0135, "macro_f1": 0.8817967176437378, "num_tokens": 3226013.0, "repeat_count": 2.0, - "routers_loss": 0.0529167577624321, + "routers_loss": 0.04953207075595856, "skip_count": 3.0, "step": 2000, "text_loss": 0.22258254885673523 @@ -19017,13 +19017,13 @@ "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.076171875, + "grad_norm": 0.07568359375, "learning_rate": 0.0009470039504537173, - "loss": 0.0185, + "loss": 0.0186, "macro_f1": 0.31446540355682373, "num_tokens": 3230031.0, "repeat_count": 0.0, - "routers_loss": 0.05719539523124695, + "routers_loss": 0.052884332835674286, "skip_count": 2.0, "step": 2002, "text_loss": 0.1741616576910019 @@ -19038,11 +19038,11 @@ "f1_skip": 0.0, "grad_norm": 0.0869140625, "learning_rate": 0.0009468651855706931, - "loss": 0.0205, + "loss": 0.0204, "macro_f1": 0.6666666865348816, "num_tokens": 3232991.0, "repeat_count": 1.0, - "routers_loss": 0.007613501511514187, + "routers_loss": 0.008056716993451118, "skip_count": 0.0, "step": 2004, "text_loss": 0.3173636198043823 @@ -19055,13 +19055,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.0654296875, "learning_rate": 0.0009467262494480868, - "loss": 0.014, + "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 3236390.0, "repeat_count": 0.0, - "routers_loss": 0.005654903594404459, + "routers_loss": 0.0053409393876791, "skip_count": 0.0, "step": 2006, "text_loss": 0.5806330442428589 @@ -19074,13 +19074,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07958984375, + "grad_norm": 0.068359375, "learning_rate": 0.000946587142139139, - "loss": 0.0152, + "loss": 0.0147, "macro_f1": 0.3333333432674408, "num_tokens": 3239267.0, "repeat_count": 0.0, - "routers_loss": 0.001680699409916997, + "routers_loss": 0.0015652200672775507, "skip_count": 0.0, "step": 2008, "text_loss": 0.6214317679405212 @@ -19093,13 +19093,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1943359375, + "grad_norm": 0.11376953125, "learning_rate": 0.000946447863697156, - "loss": 0.0171, + "loss": 0.0151, "macro_f1": 0.6601307392120361, "num_tokens": 3242569.0, "repeat_count": 1.0, - "routers_loss": 0.014179535210132599, + "routers_loss": 0.011673987843096256, "skip_count": 2.0, "step": 2010, "text_loss": 0.532565712928772 @@ -19112,13 +19112,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.04345703125, "learning_rate": 0.0009463084141755093, - "loss": 0.0157, + "loss": 0.0159, "macro_f1": 0.3272727429866791, "num_tokens": 3245669.0, "repeat_count": 0.0, - "routers_loss": 0.026209332048892975, + "routers_loss": 0.028480790555477142, "skip_count": 1.0, "step": 2012, "text_loss": 0.25210800766944885 @@ -19131,13 +19131,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08349609375, + "grad_norm": 0.0869140625, "learning_rate": 0.0009461687936276364, - "loss": 0.0134, + "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3248751.0, "repeat_count": 0.0, - "routers_loss": 0.008315940387547016, + "routers_loss": 0.007234727032482624, "skip_count": 0.0, "step": 2014, "text_loss": 0.35922971367836 @@ -19150,13 +19150,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.06689453125, + "grad_norm": 0.068359375, "learning_rate": 0.0009460290021070402, - "loss": 0.0197, + "loss": 0.0195, "macro_f1": 0.6666666865348816, "num_tokens": 3252614.0, "repeat_count": 1.0, - "routers_loss": 0.01872348040342331, + "routers_loss": 0.014691276475787163, "skip_count": 0.0, "step": 2016, "text_loss": 0.2747853398323059 @@ -19169,13 +19169,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05126953125, + "grad_norm": 0.051513671875, "learning_rate": 0.0009458890396672888, "loss": 0.0186, "macro_f1": 0.3333333432674408, "num_tokens": 3256374.0, "repeat_count": 0.0, - "routers_loss": 0.0024314222391694784, + "routers_loss": 0.002385235857218504, "skip_count": 0.0, "step": 2018, "text_loss": 0.5268719792366028 @@ -19188,13 +19188,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.052978515625, + "grad_norm": 0.04443359375, "learning_rate": 0.0009457489063620164, - "loss": 0.0137, + "loss": 0.0133, "macro_f1": 0.8823530077934265, "num_tokens": 3259792.0, "repeat_count": 1.0, - "routers_loss": 0.04815426841378212, + "routers_loss": 0.047268565744161606, "skip_count": 2.0, "step": 2020, "text_loss": 0.7785539627075195 @@ -19207,13 +19207,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.13671875, + "grad_norm": 0.1494140625, "learning_rate": 0.0009456086022449221, - "loss": 0.0209, + "loss": 0.0218, "macro_f1": 0.3272727429866791, "num_tokens": 3262833.0, "repeat_count": 0.0, - "routers_loss": 0.015121756121516228, + "routers_loss": 0.015878718346357346, "skip_count": 1.0, "step": 2022, "text_loss": 0.42270028591156006 @@ -19226,32 +19226,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10546875, + "grad_norm": 0.08935546875, "learning_rate": 0.0009454681273697711, - "loss": 0.0122, + "loss": 0.0117, "macro_f1": 0.3272727429866791, "num_tokens": 3265718.0, "repeat_count": 1.0, - "routers_loss": 0.030219297856092453, + "routers_loss": 0.030749641358852386, "skip_count": 0.0, "step": 2024, "text_loss": 0.18668225407600403 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 9.511887290871735, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.05419921875, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, "learning_rate": 0.0009453274817903931, - "loss": 0.0132, - "macro_f1": 0.3272727429866791, + "loss": 0.012, + "macro_f1": 0.6666666865348816, "num_tokens": 3268158.0, "repeat_count": 0.0, - "routers_loss": 0.013256299309432507, + "routers_loss": 0.011538166552782059, "skip_count": 1.0, "step": 2026, "text_loss": 0.34090787172317505 @@ -19264,13 +19264,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11572265625, + "grad_norm": 0.099609375, "learning_rate": 0.000945186665560684, - "loss": 0.0232, + "loss": 0.0218, "macro_f1": 0.3333333432674408, "num_tokens": 3271082.0, "repeat_count": 0.0, - "routers_loss": 0.009389489889144897, + "routers_loss": 0.009527760557830334, "skip_count": 0.0, "step": 2028, "text_loss": 0.2110334187746048 @@ -19283,13 +19283,13 @@ "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1142578125, + "grad_norm": 0.119140625, "learning_rate": 0.000945045678734605, - "loss": 0.0178, + "loss": 0.0175, "macro_f1": 0.3144654333591461, "num_tokens": 3273488.0, "repeat_count": 0.0, - "routers_loss": 0.03916877508163452, + "routers_loss": 0.03317151218652725, "skip_count": 3.0, "step": 2030, "text_loss": 0.2233227640390396 @@ -19302,13 +19302,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11474609375, + "grad_norm": 0.12451171875, "learning_rate": 0.0009449045213661822, - "loss": 0.0215, + "loss": 0.0201, "macro_f1": 0.3272727429866791, "num_tokens": 3276646.0, "repeat_count": 0.0, - "routers_loss": 0.019781047478318214, + "routers_loss": 0.018510591238737106, "skip_count": 1.0, "step": 2032, "text_loss": 0.16100332140922546 @@ -19321,13 +19321,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.11474609375, + "grad_norm": 0.1318359375, "learning_rate": 0.0009447631935095077, - "loss": 0.0193, + "loss": 0.0185, "macro_f1": 0.9452888369560242, "num_tokens": 3279441.0, "repeat_count": 1.0, - "routers_loss": 0.02645993046462536, + "routers_loss": 0.028113311156630516, "skip_count": 4.0, "step": 2034, "text_loss": 0.29208317399024963 @@ -19340,13 +19340,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.051025390625, "learning_rate": 0.0009446216952187384, - "loss": 0.0168, + "loss": 0.0164, "macro_f1": 0.3333333432674408, "num_tokens": 3282697.0, "repeat_count": 0.0, - "routers_loss": 0.008575125597417355, + "routers_loss": 0.008379172533750534, "skip_count": 0.0, "step": 2036, "text_loss": 0.16026398539543152 @@ -19359,13 +19359,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.06298828125, "learning_rate": 0.0009444800265480967, - "loss": 0.0184, + "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 3285574.0, "repeat_count": 0.0, - "routers_loss": 0.01042154710739851, + "routers_loss": 0.00941354501992464, "skip_count": 0.0, "step": 2038, "text_loss": 0.29523080587387085 @@ -19378,13 +19378,13 @@ "f1_execute": 0.9230769276618958, "f1_repeat": 0.8571428656578064, "f1_skip": 0.800000011920929, - "grad_norm": 0.07568359375, + "grad_norm": 0.076171875, "learning_rate": 0.0009443381875518703, - "loss": 0.0206, + "loss": 0.0197, "macro_f1": 0.8600732684135437, "num_tokens": 3289159.0, "repeat_count": 4.0, - "routers_loss": 0.05496715381741524, + "routers_loss": 0.04974055662751198, "skip_count": 6.0, "step": 2040, "text_loss": 0.23033179342746735 @@ -19397,13 +19397,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0625, + "grad_norm": 0.0537109375, "learning_rate": 0.0009441961782844123, - "loss": 0.0149, + "loss": 0.0146, "macro_f1": 0.3272727429866791, "num_tokens": 3293598.0, "repeat_count": 0.0, - "routers_loss": 0.021722445264458656, + "routers_loss": 0.022241825237870216, "skip_count": 1.0, "step": 2042, "text_loss": 0.8299165368080139 @@ -19416,13 +19416,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.072265625, + "grad_norm": 0.0673828125, "learning_rate": 0.0009440539988001408, - "loss": 0.0161, + "loss": 0.0159, "macro_f1": 0.3333333432674408, "num_tokens": 3296648.0, "repeat_count": 0.0, - "routers_loss": 0.011090370826423168, + "routers_loss": 0.011019332334399223, "skip_count": 0.0, "step": 2044, "text_loss": 0.18207129836082458 @@ -19435,13 +19435,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.041259765625, "learning_rate": 0.0009439116491535394, - "loss": 0.0123, + "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 3300058.0, "repeat_count": 0.0, - "routers_loss": 0.00327755743637681, + "routers_loss": 0.002889640862122178, "skip_count": 0.0, "step": 2046, "text_loss": 0.7051978707313538 @@ -19454,13 +19454,13 @@ "f1_execute": 0.9333333373069763, "f1_repeat": 0.5, "f1_skip": 0.8571428656578064, - "grad_norm": 0.08154296875, + "grad_norm": 0.078125, "learning_rate": 0.0009437691293991563, - "loss": 0.0198, + "loss": 0.0192, "macro_f1": 0.7634921073913574, "num_tokens": 3303296.0, "repeat_count": 3.0, - "routers_loss": 0.0807223841547966, + "routers_loss": 0.07741832733154297, "skip_count": 4.0, "step": 2048, "text_loss": 0.15563532710075378 @@ -19473,13 +19473,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.09521484375, "learning_rate": 0.0009436264395916061, - "loss": 0.0218, + "loss": 0.0209, "macro_f1": 0.6666666865348816, "num_tokens": 3306204.0, "repeat_count": 0.0, - "routers_loss": 0.014681774191558361, + "routers_loss": 0.014225383289158344, "skip_count": 2.0, "step": 2050, "text_loss": 0.18117287755012512 @@ -19492,13 +19492,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09326171875, + "grad_norm": 0.1416015625, "learning_rate": 0.0009434835797855672, - "loss": 0.0166, + "loss": 0.0165, "macro_f1": 0.3333333432674408, "num_tokens": 3309444.0, "repeat_count": 0.0, - "routers_loss": 0.0025602662935853004, + "routers_loss": 0.0023932650219649076, "skip_count": 0.0, "step": 2052, "text_loss": 0.4645874798297882 @@ -19511,13 +19511,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05810546875, + "grad_norm": 0.058349609375, "learning_rate": 0.0009433405500357839, - "loss": 0.0148, + "loss": 0.0153, "macro_f1": 0.3272727429866791, "num_tokens": 3312488.0, "repeat_count": 0.0, - "routers_loss": 0.03283753618597984, + "routers_loss": 0.03193361684679985, "skip_count": 1.0, "step": 2054, "text_loss": 0.5291082859039307 @@ -19530,13 +19530,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.062255859375, + "grad_norm": 0.064453125, "learning_rate": 0.0009431973503970655, - "loss": 0.0138, + "loss": 0.0134, "macro_f1": 0.3333333432674408, "num_tokens": 3315765.0, "repeat_count": 0.0, - "routers_loss": 0.002137230010703206, + "routers_loss": 0.0020529816392809153, "skip_count": 0.0, "step": 2056, "text_loss": 0.5877931118011475 @@ -19549,13 +19549,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08251953125, + "grad_norm": 0.07275390625, "learning_rate": 0.0009430539809242864, - "loss": 0.0199, + "loss": 0.0185, "macro_f1": 0.32098764181137085, "num_tokens": 3318877.0, "repeat_count": 2.0, - "routers_loss": 0.07938452064990997, + "routers_loss": 0.07907948642969131, "skip_count": 0.0, "step": 2058, "text_loss": 0.3836737871170044 @@ -19568,13 +19568,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.095703125, "learning_rate": 0.0009429104416723862, - "loss": 0.0164, + "loss": 0.0163, "macro_f1": 0.6666666865348816, "num_tokens": 3322576.0, "repeat_count": 2.0, - "routers_loss": 0.003832251997664571, + "routers_loss": 0.003006070153787732, "skip_count": 0.0, "step": 2060, "text_loss": 0.3480920195579529 @@ -19587,13 +19587,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.045166015625, "learning_rate": 0.0009427667326963689, - "loss": 0.0131, + "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 3325974.0, "repeat_count": 0.0, - "routers_loss": 0.006192604545503855, + "routers_loss": 0.005013179033994675, "skip_count": 0.0, "step": 2062, "text_loss": 0.931358814239502 @@ -19606,13 +19606,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09375, + "grad_norm": 0.0986328125, "learning_rate": 0.0009426228540513047, "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 3329398.0, "repeat_count": 0.0, - "routers_loss": 0.008115313947200775, + "routers_loss": 0.0059848143719136715, "skip_count": 0.0, "step": 2064, "text_loss": 0.47568953037261963 @@ -19625,13 +19625,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06640625, + "grad_norm": 0.0830078125, "learning_rate": 0.0009424788057923277, - "loss": 0.0127, + "loss": 0.0131, "macro_f1": 0.3333333432674408, "num_tokens": 3332029.0, "repeat_count": 0.0, - "routers_loss": 0.007599714212119579, + "routers_loss": 0.00783882662653923, "skip_count": 0.0, "step": 2066, "text_loss": 0.22887596487998962 @@ -19644,13 +19644,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.07470703125, + "grad_norm": 0.0712890625, "learning_rate": 0.0009423345879746376, - "loss": 0.0126, + "loss": 0.0128, "macro_f1": 0.5492662787437439, "num_tokens": 3334858.0, "repeat_count": 0.0, - "routers_loss": 0.016804348677396774, + "routers_loss": 0.01866884157061577, "skip_count": 2.0, "step": 2068, "text_loss": 0.17724967002868652 @@ -19663,13 +19663,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.072265625, + "grad_norm": 0.06591796875, "learning_rate": 0.000942190200653499, - "loss": 0.0164, + "loss": 0.0162, "macro_f1": 0.32098764181137085, "num_tokens": 3338094.0, "repeat_count": 0.0, - "routers_loss": 0.02686731517314911, + "routers_loss": 0.028636593371629715, "skip_count": 2.0, "step": 2070, "text_loss": 0.34344956278800964 @@ -19682,13 +19682,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0771484375, + "grad_norm": 0.07568359375, "learning_rate": 0.0009420456438842413, - "loss": 0.0172, + "loss": 0.0165, "macro_f1": 0.5492662787437439, "num_tokens": 3340526.0, "repeat_count": 0.0, - "routers_loss": 0.025320913642644882, + "routers_loss": 0.023245645686984062, "skip_count": 2.0, "step": 2072, "text_loss": 0.7276164293289185 @@ -19701,13 +19701,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11376953125, + "grad_norm": 0.11328125, "learning_rate": 0.000941900917722259, - "loss": 0.0145, + "loss": 0.0143, "macro_f1": 0.3272727429866791, "num_tokens": 3343303.0, "repeat_count": 1.0, - "routers_loss": 0.014900023117661476, + "routers_loss": 0.01565689593553543, "skip_count": 0.0, "step": 2074, "text_loss": 0.5665070414543152 @@ -19720,13 +19720,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11474609375, + "grad_norm": 0.1201171875, "learning_rate": 0.0009417560222230115, - "loss": 0.0244, + "loss": 0.0245, "macro_f1": 0.3333333432674408, "num_tokens": 3346409.0, "repeat_count": 0.0, - "routers_loss": 0.003426895011216402, + "routers_loss": 0.0035056080669164658, "skip_count": 0.0, "step": 2076, "text_loss": 0.5112795233726501 @@ -19739,13 +19739,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0712890625, + "grad_norm": 0.06982421875, "learning_rate": 0.0009416109574420229, - "loss": 0.0136, + "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3349220.0, "repeat_count": 0.0, - "routers_loss": 0.0031935563310980797, + "routers_loss": 0.0027565446216613054, "skip_count": 0.0, "step": 2078, "text_loss": 0.5240910053253174 @@ -19758,13 +19758,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.08203125, "learning_rate": 0.0009414657234348823, - "loss": 0.0183, + "loss": 0.0186, "macro_f1": 1.0, "num_tokens": 3352627.0, "repeat_count": 3.0, - "routers_loss": 0.016454946249723434, + "routers_loss": 0.01652451977133751, "skip_count": 2.0, "step": 2080, "text_loss": 1.0217112302780151 @@ -19777,13 +19777,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1650390625, + "grad_norm": 0.1630859375, "learning_rate": 0.0009413203202572438, - "loss": 0.0174, + "loss": 0.0179, "macro_f1": 0.32098764181137085, "num_tokens": 3355392.0, "repeat_count": 0.0, - "routers_loss": 0.1056143268942833, + "routers_loss": 0.1012420505285263, "skip_count": 2.0, "step": 2082, "text_loss": 0.4085482358932495 @@ -19796,13 +19796,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07373046875, + "grad_norm": 0.08251953125, "learning_rate": 0.000941174747964826, - "loss": 0.016, + "loss": 0.0154, "macro_f1": 0.3333333432674408, "num_tokens": 3358425.0, "repeat_count": 0.0, - "routers_loss": 0.003626141929998994, + "routers_loss": 0.004962718114256859, "skip_count": 0.0, "step": 2084, "text_loss": 0.5833504796028137 @@ -19810,18 +19810,18 @@ { "acc_repeat": 0.5, "acc_skip": 0.6666666865348816, - "avg_layers": 26.0, + "avg_layers": 27.0, "epoch": 9.793660111535075, - "f1_execute": 0.936170220375061, + "f1_execute": 0.9583333134651184, "f1_repeat": 0.6666666865348816, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.107421875, + "f1_skip": 0.800000011920929, + "grad_norm": 0.11376953125, "learning_rate": 0.0009410290066134124, - "loss": 0.0216, - "macro_f1": 0.7565011978149414, + "loss": 0.0211, + "macro_f1": 0.8083333373069763, "num_tokens": 3361925.0, "repeat_count": 2.0, - "routers_loss": 0.08091846853494644, + "routers_loss": 0.07889176905155182, "skip_count": 3.0, "step": 2086, "text_loss": 0.38126569986343384 @@ -19834,13 +19834,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.056884765625, + "grad_norm": 0.051513671875, "learning_rate": 0.0009408830962588517, - "loss": 0.0197, + "loss": 0.0195, "macro_f1": 0.6601307392120361, "num_tokens": 3365963.0, "repeat_count": 1.0, - "routers_loss": 0.035208042711019516, + "routers_loss": 0.033715736120939255, "skip_count": 2.0, "step": 2088, "text_loss": 0.23213914036750793 @@ -19853,13 +19853,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07958984375, + "grad_norm": 0.0732421875, "learning_rate": 0.0009407370169570567, - "loss": 0.0167, + "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 3369422.0, "repeat_count": 0.0, - "routers_loss": 0.0018934847321361303, + "routers_loss": 0.0014188943896442652, "skip_count": 0.0, "step": 2090, "text_loss": 0.4648318886756897 @@ -19872,13 +19872,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.0712890625, "learning_rate": 0.0009405907687640054, - "loss": 0.0132, + "loss": 0.013, "macro_f1": 0.3272727429866791, "num_tokens": 3372506.0, "repeat_count": 0.0, - "routers_loss": 0.016075141727924347, + "routers_loss": 0.015339684672653675, "skip_count": 1.0, "step": 2092, "text_loss": 0.2563800811767578 @@ -19891,13 +19891,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.054443359375, "learning_rate": 0.0009404443517357404, "loss": 0.0146, "macro_f1": 0.542222261428833, "num_tokens": 3375653.0, "repeat_count": 4.0, - "routers_loss": 0.06333976984024048, + "routers_loss": 0.06562861055135727, "skip_count": 0.0, "step": 2094, "text_loss": 0.797835111618042 @@ -19910,13 +19910,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.060546875, + "grad_norm": 0.062255859375, "learning_rate": 0.000940297765928369, - "loss": 0.0133, + "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 3379018.0, "repeat_count": 0.0, - "routers_loss": 0.005521406419575214, + "routers_loss": 0.005745889153331518, "skip_count": 0.0, "step": 2096, "text_loss": 0.4238114655017853 @@ -19929,13 +19929,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06103515625, + "grad_norm": 0.0712890625, "learning_rate": 0.0009401510113980631, - "loss": 0.0205, + "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 3382855.0, "repeat_count": 0.0, - "routers_loss": 0.0025159218348562717, + "routers_loss": 0.0026634482201188803, "skip_count": 0.0, "step": 2098, "text_loss": 0.4967166483402252 @@ -19948,13 +19948,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08203125, + "grad_norm": 0.0791015625, "learning_rate": 0.0009400040882010592, - "loss": 0.0172, + "loss": 0.0166, "macro_f1": 0.3333333432674408, "num_tokens": 3386386.0, "repeat_count": 0.0, - "routers_loss": 0.0025535966269671917, + "routers_loss": 0.0020642587915062904, "skip_count": 0.0, "step": 2100, "text_loss": 0.44390562176704407 @@ -19967,13 +19967,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.056640625, "learning_rate": 0.0009398569963936589, - "loss": 0.0178, + "loss": 0.017, "macro_f1": 0.3272727429866791, "num_tokens": 3389958.0, "repeat_count": 0.0, - "routers_loss": 0.013569516129791737, + "routers_loss": 0.013722737319767475, "skip_count": 1.0, "step": 2102, "text_loss": 0.7207565903663635 @@ -19986,13 +19986,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0947265625, + "grad_norm": 0.08837890625, "learning_rate": 0.0009397097360322276, - "loss": 0.0175, + "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 3392892.0, "repeat_count": 0.0, - "routers_loss": 0.0044935219921171665, + "routers_loss": 0.002051608171314001, "skip_count": 0.0, "step": 2104, "text_loss": 0.3196398913860321 @@ -20005,13 +20005,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.072265625, + "grad_norm": 0.07470703125, "learning_rate": 0.000939562307173196, - "loss": 0.0223, + "loss": 0.022, "macro_f1": 0.3333333432674408, "num_tokens": 3396636.0, "repeat_count": 0.0, - "routers_loss": 0.007407462690025568, + "routers_loss": 0.007085663266479969, "skip_count": 0.0, "step": 2106, "text_loss": 0.5663776397705078 @@ -20024,13 +20024,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.13671875, + "grad_norm": 0.11328125, "learning_rate": 0.0009394147098730592, - "loss": 0.0205, + "loss": 0.02, "macro_f1": 0.5492662787437439, "num_tokens": 3399475.0, "repeat_count": 0.0, - "routers_loss": 0.024386432021856308, + "routers_loss": 0.019473131746053696, "skip_count": 2.0, "step": 2108, "text_loss": 0.7708223462104797 @@ -20043,32 +20043,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037353515625, + "grad_norm": 0.038818359375, "learning_rate": 0.0009392669441883767, - "loss": 0.0135, + "loss": 0.0134, "macro_f1": 0.3333333432674408, "num_tokens": 3402350.0, "repeat_count": 0.0, - "routers_loss": 0.002929724520072341, + "routers_loss": 0.0028328890912234783, "skip_count": 0.0, "step": 2110, "text_loss": 0.5888006091117859 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 9.915761667155856, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.1201171875, + "f1_skip": 1.0, + "grad_norm": 0.10693359375, "learning_rate": 0.0009391190101757724, - "loss": 0.0168, - "macro_f1": 0.5492662787437439, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, "num_tokens": 3405561.0, "repeat_count": 0.0, - "routers_loss": 0.026861928403377533, + "routers_loss": 0.023098422214388847, "skip_count": 2.0, "step": 2112, "text_loss": 0.09865197539329529 @@ -20081,13 +20081,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0966796875, + "grad_norm": 0.10107421875, "learning_rate": 0.000938970907891935, - "loss": 0.0251, + "loss": 0.0247, "macro_f1": 0.3333333432674408, "num_tokens": 3408513.0, "repeat_count": 0.0, - "routers_loss": 0.0025369988288730383, + "routers_loss": 0.002896632067859173, "skip_count": 0.0, "step": 2114, "text_loss": 0.6613234281539917 @@ -20100,51 +20100,51 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09423828125, + "grad_norm": 0.0947265625, "learning_rate": 0.0009388226373936179, - "loss": 0.0209, + "loss": 0.0211, "macro_f1": 0.3333333432674408, "num_tokens": 3411195.0, "repeat_count": 0.0, - "routers_loss": 0.014292459934949875, + "routers_loss": 0.015814457088708878, "skip_count": 0.0, "step": 2116, "text_loss": 0.17363053560256958 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 27.0, + "avg_layers": 28.0, "epoch": 9.94393894922219, - "f1_execute": 0.9629629850387573, - "f1_repeat": 0.0, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1181640625, + "grad_norm": 0.12451171875, "learning_rate": 0.0009386741987376381, - "loss": 0.0151, - "macro_f1": 0.32098767161369324, + "loss": 0.015, + "macro_f1": 0.6603773832321167, "num_tokens": 3414875.0, "repeat_count": 1.0, - "routers_loss": 0.027571436017751694, + "routers_loss": 0.02676783688366413, "skip_count": 0.0, "step": 2118, "text_loss": 0.674056887626648 }, { "acc_repeat": 0.0, - "acc_skip": 0.0, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 27.0, "epoch": 9.953331376577633, - "f1_execute": 0.9818181991577148, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.0, - "grad_norm": 0.08349609375, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, "learning_rate": 0.0009385255919808778, - "loss": 0.0205, - "macro_f1": 0.3272727429866791, + "loss": 0.0203, + "macro_f1": 0.6666666865348816, "num_tokens": 3418410.0, "repeat_count": 0.0, - "routers_loss": 0.011719600297510624, + "routers_loss": 0.01022857241332531, "skip_count": 1.0, "step": 2120, "text_loss": 0.235092431306839 @@ -20157,13 +20157,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09375, + "grad_norm": 0.0888671875, "learning_rate": 0.0009383768171802836, - "loss": 0.0249, + "loss": 0.0244, "macro_f1": 0.5492662787437439, "num_tokens": 3421289.0, "repeat_count": 0.0, - "routers_loss": 0.01207603607326746, + "routers_loss": 0.013572212308645248, "skip_count": 2.0, "step": 2122, "text_loss": 0.5992844104766846 @@ -20176,13 +20176,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.060791015625, + "grad_norm": 0.064453125, "learning_rate": 0.0009382278743928659, - "loss": 0.0206, + "loss": 0.0201, "macro_f1": 0.6666666865348816, "num_tokens": 3424781.0, "repeat_count": 0.0, - "routers_loss": 0.008004254661500454, + "routers_loss": 0.0051873656921088696, "skip_count": 2.0, "step": 2124, "text_loss": 0.29915499687194824 @@ -20195,13 +20195,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.07666015625, + "grad_norm": 0.07421875, "learning_rate": 0.0009380787636757001, - "loss": 0.0156, + "loss": 0.0155, "macro_f1": 0.6122449040412903, "num_tokens": 3427942.0, "repeat_count": 0.0, - "routers_loss": 0.030767880380153656, + "routers_loss": 0.030079292133450508, "skip_count": 4.0, "step": 2126, "text_loss": 0.24181491136550903 @@ -20214,13 +20214,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06201171875, + "grad_norm": 0.058349609375, "learning_rate": 0.0009379294850859256, "loss": 0.0141, "macro_f1": 0.3333333432674408, "num_tokens": 3431314.0, "repeat_count": 0.0, - "routers_loss": 0.002620625076815486, + "routers_loss": 0.002675612922757864, "skip_count": 0.0, "step": 2128, "text_loss": 0.4669873118400574 @@ -20233,13 +20233,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09033203125, + "grad_norm": 0.10595703125, "learning_rate": 0.0009377800386807465, - "loss": 0.0175, + "loss": 0.0177, "macro_f1": 0.3333333432674408, "num_tokens": 3435020.0, "repeat_count": 0.0, - "routers_loss": 0.009095560759305954, + "routers_loss": 0.009334275498986244, "skip_count": 0.0, "step": 2130, "text_loss": 0.6478219628334045 @@ -20252,13 +20252,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.11865234375, + "grad_norm": 0.134765625, "learning_rate": 0.0009376304245174306, - "loss": 0.0143, + "loss": 0.0137, "macro_f1": 0.6000000238418579, "num_tokens": 3438276.0, "repeat_count": 1.0, - "routers_loss": 0.058448426425457, + "routers_loss": 0.038227908313274384, "skip_count": 2.0, "step": 2132, "text_loss": 0.4401201903820038 @@ -20271,13 +20271,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.046875, + "grad_norm": 0.041748046875, "learning_rate": 0.0009374806426533104, - "loss": 0.0116, + "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 3440938.0, "repeat_count": 0.0, - "routers_loss": 0.007323687430471182, + "routers_loss": 0.006901399698108435, "skip_count": 0.0, "step": 2134, "text_loss": 0.5948942303657532 @@ -20290,13 +20290,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.051513671875, + "grad_norm": 0.051025390625, "learning_rate": 0.0009373306931457827, - "loss": 0.0122, + "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 3444028.0, "repeat_count": 0.0, - "routers_loss": 0.003302243771031499, + "routers_loss": 0.0037061909679323435, "skip_count": 0.0, "step": 2136, "text_loss": 0.5349751114845276 @@ -20309,13 +20309,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047607421875, + "grad_norm": 0.056884765625, "learning_rate": 0.0009371805760523086, - "loss": 0.0113, + "loss": 0.0111, "macro_f1": 0.3333333432674408, "num_tokens": 3448331.0, "repeat_count": 0.0, - "routers_loss": 0.0027974818367511034, + "routers_loss": 0.0025877030566334724, "skip_count": 0.0, "step": 2138, "text_loss": 0.4591051936149597 @@ -20328,13 +20328,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.08642578125, + "grad_norm": 0.07373046875, "learning_rate": 0.0009370302914304129, - "loss": 0.0145, + "loss": 0.0144, "macro_f1": 0.5934640765190125, "num_tokens": 3451434.0, "repeat_count": 0.0, - "routers_loss": 0.01572767272591591, + "routers_loss": 0.018742674961686134, "skip_count": 3.0, "step": 2140, "text_loss": 0.23470863699913025 @@ -20347,13 +20347,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06201171875, + "grad_norm": 0.0634765625, "learning_rate": 0.0009368798393376851, - "loss": 0.0119, + "loss": 0.0122, "macro_f1": 0.3272727429866791, "num_tokens": 3454375.0, "repeat_count": 0.0, - "routers_loss": 0.020721890032291412, + "routers_loss": 0.02382594160735607, "skip_count": 1.0, "step": 2142, "text_loss": 0.6077954769134521 @@ -20366,13 +20366,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05859375, + "grad_norm": 0.05517578125, "learning_rate": 0.0009367292198317787, - "loss": 0.0161, + "loss": 0.0164, "macro_f1": 0.5492662787437439, "num_tokens": 3457591.0, "repeat_count": 0.0, - "routers_loss": 0.03272393345832825, + "routers_loss": 0.03331060707569122, "skip_count": 2.0, "step": 2144, "text_loss": 0.3691073954105377 @@ -20385,13 +20385,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052490234375, + "grad_norm": 0.058349609375, "learning_rate": 0.0009365784329704115, - "loss": 0.0191, + "loss": 0.0186, "macro_f1": 0.3333333432674408, "num_tokens": 3460895.0, "repeat_count": 0.0, - "routers_loss": 0.0017473002662882209, + "routers_loss": 0.0016955457394942641, "skip_count": 0.0, "step": 2146, "text_loss": 0.3947436511516571 @@ -20404,13 +20404,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.05224609375, + "grad_norm": 0.050537109375, "learning_rate": 0.0009364274788113651, - "loss": 0.0094, + "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 3464101.0, "repeat_count": 1.0, - "routers_loss": 0.008070237934589386, + "routers_loss": 0.006169239990413189, "skip_count": 0.0, "step": 2148, "text_loss": 0.3348555266857147 @@ -20423,13 +20423,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.068359375, "learning_rate": 0.0009362763574124858, - "loss": 0.0191, + "loss": 0.019, "macro_f1": 0.9265305995941162, "num_tokens": 3467417.0, "repeat_count": 3.0, - "routers_loss": 0.021709222346544266, + "routers_loss": 0.024033790454268456, "skip_count": 1.0, "step": 2150, "text_loss": 0.496633380651474 @@ -20442,13 +20442,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.046630859375, + "grad_norm": 0.042724609375, "learning_rate": 0.0009361250688316829, - "loss": 0.014, + "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 3470917.0, "repeat_count": 0.0, - "routers_loss": 0.0022237664088606834, + "routers_loss": 0.0024986129719763994, "skip_count": 0.0, "step": 2152, "text_loss": 0.6857671737670898 @@ -20461,13 +20461,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.056640625, + "grad_norm": 0.0546875, "learning_rate": 0.0009359736131269312, "loss": 0.0153, "macro_f1": 0.6666666865348816, "num_tokens": 3473624.0, "repeat_count": 0.0, - "routers_loss": 0.00838750321418047, + "routers_loss": 0.008183322846889496, "skip_count": 1.0, "step": 2154, "text_loss": 0.13883116841316223 @@ -20480,13 +20480,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0576171875, + "grad_norm": 0.06640625, "learning_rate": 0.0009358219903562684, - "loss": 0.01, + "loss": 0.0106, "macro_f1": 0.6666666865348816, "num_tokens": 3476472.0, "repeat_count": 0.0, - "routers_loss": 0.010190514847636223, + "routers_loss": 0.011198793537914753, "skip_count": 3.0, "step": 2156, "text_loss": 0.24243666231632233 @@ -20499,13 +20499,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0419921875, + "grad_norm": 0.04296875, "learning_rate": 0.0009356702005777969, - "loss": 0.0124, + "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 3479688.0, "repeat_count": 0.0, - "routers_loss": 0.002411153633147478, + "routers_loss": 0.002520184963941574, "skip_count": 0.0, "step": 2158, "text_loss": 0.6407818794250488 @@ -20518,13 +20518,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.0791015625, "learning_rate": 0.0009355182438496825, - "loss": 0.0141, + "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 3482598.0, "repeat_count": 0.0, - "routers_loss": 0.001032356172800064, + "routers_loss": 0.0011065017897635698, "skip_count": 0.0, "step": 2160, "text_loss": 0.7214245796203613 @@ -20537,13 +20537,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05908203125, + "grad_norm": 0.0576171875, "learning_rate": 0.0009353661202301557, - "loss": 0.0147, + "loss": 0.0144, "macro_f1": 0.3333333432674408, "num_tokens": 3486271.0, "repeat_count": 0.0, - "routers_loss": 0.0022046815138310194, + "routers_loss": 0.0017824085662141442, "skip_count": 0.0, "step": 2162, "text_loss": 0.5140969157218933 @@ -20556,32 +20556,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.051513671875, + "grad_norm": 0.053466796875, "learning_rate": 0.0009352138297775101, "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 3489206.0, "repeat_count": 0.0, - "routers_loss": 0.0014977266546338797, + "routers_loss": 0.001542879967018962, "skip_count": 0.0, "step": 2164, "text_loss": 0.7956416606903076 }, { "acc_repeat": 0.0, - "acc_skip": 0.6666666865348816, - "avg_layers": 26.0, + "acc_skip": 1.0, + "avg_layers": 25.0, "epoch": 10.169063692398003, - "f1_execute": 0.9803921580314636, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.800000011920929, + "f1_skip": 1.0, "grad_norm": 0.0771484375, "learning_rate": 0.000935061372550104, - "loss": 0.0132, - "macro_f1": 0.5934640765190125, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, "num_tokens": 3492003.0, "repeat_count": 0.0, - "routers_loss": 0.016847684979438782, + "routers_loss": 0.01420794241130352, "skip_count": 3.0, "step": 2166, "text_loss": 0.27489882707595825 @@ -20594,13 +20594,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.06396484375, "learning_rate": 0.0009349087486063594, - "loss": 0.0168, + "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 3494784.0, "repeat_count": 0.0, - "routers_loss": 0.0036806222051382065, + "routers_loss": 0.003614309709519148, "skip_count": 1.0, "step": 2168, "text_loss": 0.2962227761745453 @@ -20613,13 +20613,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.09716796875, + "grad_norm": 0.1259765625, "learning_rate": 0.0009347559580047618, - "loss": 0.0174, + "loss": 0.0175, "macro_f1": 0.8814815282821655, "num_tokens": 3497886.0, "repeat_count": 2.0, - "routers_loss": 0.021412594243884087, + "routers_loss": 0.02122853323817253, "skip_count": 4.0, "step": 2170, "text_loss": 0.5919580459594727 @@ -20627,18 +20627,18 @@ { "acc_repeat": 0.0, "acc_skip": 1.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 10.197240974464338, - "f1_execute": 1.0, + "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.06591796875, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06396484375, "learning_rate": 0.000934603000803861, - "loss": 0.0134, - "macro_f1": 0.6666666865348816, + "loss": 0.0135, + "macro_f1": 0.5492662787437439, "num_tokens": 3500939.0, "repeat_count": 0.0, - "routers_loss": 0.0201424453407526, + "routers_loss": 0.02042219042778015, "skip_count": 1.0, "step": 2172, "text_loss": 0.28722381591796875 @@ -20651,13 +20651,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05419921875, + "grad_norm": 0.0693359375, "learning_rate": 0.0009344498770622704, - "loss": 0.0131, + "loss": 0.013, "macro_f1": 0.3333333432674408, "num_tokens": 3504852.0, "repeat_count": 0.0, - "routers_loss": 0.005059401970356703, + "routers_loss": 0.004345106892287731, "skip_count": 0.0, "step": 2174, "text_loss": 0.603236734867096 @@ -20670,13 +20670,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.091796875, + "grad_norm": 0.1064453125, "learning_rate": 0.0009342965868386673, "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 3508320.0, "repeat_count": 0.0, - "routers_loss": 0.004006600938737392, + "routers_loss": 0.00368050136603415, "skip_count": 0.0, "step": 2176, "text_loss": 0.6020491719245911 @@ -20691,11 +20691,11 @@ "f1_skip": 0.0, "grad_norm": 0.060302734375, "learning_rate": 0.000934143130191793, - "loss": 0.0109, + "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 3511278.0, "repeat_count": 0.0, - "routers_loss": 0.013246738351881504, + "routers_loss": 0.013425769284367561, "skip_count": 0.0, "step": 2178, "text_loss": 0.5954724550247192 @@ -20708,13 +20708,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06005859375, + "grad_norm": 0.060546875, "learning_rate": 0.000933989507180452, - "loss": 0.0151, + "loss": 0.0149, "macro_f1": 0.3333333432674408, "num_tokens": 3514361.0, "repeat_count": 0.0, - "routers_loss": 0.0031937146559357643, + "routers_loss": 0.002896249992772937, "skip_count": 0.0, "step": 2180, "text_loss": 0.39175131916999817 @@ -20727,13 +20727,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0556640625, + "grad_norm": 0.052978515625, "learning_rate": 0.0009338357178635135, - "loss": 0.0151, + "loss": 0.0147, "macro_f1": 0.6603773832321167, "num_tokens": 3517962.0, "repeat_count": 1.0, - "routers_loss": 0.014782631769776344, + "routers_loss": 0.011538350023329258, "skip_count": 1.0, "step": 2182, "text_loss": 0.4482830762863159 @@ -20746,13 +20746,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.0869140625, "learning_rate": 0.0009336817622999093, - "loss": 0.0112, + "loss": 0.011, "macro_f1": 0.3272727429866791, "num_tokens": 3521299.0, "repeat_count": 1.0, - "routers_loss": 0.02318345196545124, + "routers_loss": 0.022787930443882942, "skip_count": 0.0, "step": 2184, "text_loss": 0.35177817940711975 @@ -20765,13 +20765,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.055419921875, + "grad_norm": 0.0634765625, "learning_rate": 0.0009335276405486357, - "loss": 0.0134, + "loss": 0.0139, "macro_f1": 0.3272727429866791, "num_tokens": 3524611.0, "repeat_count": 0.0, - "routers_loss": 0.011735675856471062, + "routers_loss": 0.011597735807299614, "skip_count": 1.0, "step": 2186, "text_loss": 0.24868851900100708 @@ -20784,13 +20784,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0947265625, + "grad_norm": 0.11181640625, "learning_rate": 0.0009333733526687524, - "loss": 0.0198, + "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 3528012.0, "repeat_count": 0.0, - "routers_loss": 0.01558679062873125, + "routers_loss": 0.014253967441618443, "skip_count": 0.0, "step": 2188, "text_loss": 0.3970910310745239 @@ -20803,13 +20803,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056396484375, + "grad_norm": 0.054931640625, "learning_rate": 0.000933218898719383, - "loss": 0.0163, + "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 3530908.0, "repeat_count": 0.0, - "routers_loss": 0.0019149131840094924, + "routers_loss": 0.001659149187617004, "skip_count": 0.0, "step": 2190, "text_loss": 0.7618573307991028 @@ -20822,13 +20822,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07958984375, + "grad_norm": 0.0693359375, "learning_rate": 0.0009330642787597141, - "loss": 0.0161, + "loss": 0.0159, "macro_f1": 0.3333333432674408, "num_tokens": 3533993.0, "repeat_count": 0.0, - "routers_loss": 0.0056966920383274555, + "routers_loss": 0.005574346985667944, "skip_count": 0.0, "step": 2192, "text_loss": 0.16470147669315338 @@ -20841,13 +20841,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07080078125, + "grad_norm": 0.0791015625, "learning_rate": 0.0009329094928489969, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 3537310.0, "repeat_count": 0.0, - "routers_loss": 0.002511024009436369, + "routers_loss": 0.0026400673668831587, "skip_count": 0.0, "step": 2194, "text_loss": 0.3400416374206543 @@ -20860,13 +20860,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08935546875, + "grad_norm": 0.0849609375, "learning_rate": 0.0009327545410465452, - "loss": 0.0126, + "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 3540045.0, "repeat_count": 0.0, - "routers_loss": 0.008584192954003811, + "routers_loss": 0.008448398672044277, "skip_count": 3.0, "step": 2196, "text_loss": 0.3110542297363281 @@ -20879,13 +20879,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.04638671875, "learning_rate": 0.0009325994234117372, - "loss": 0.0129, + "loss": 0.0122, "macro_f1": 0.32098764181137085, "num_tokens": 3544097.0, "repeat_count": 0.0, - "routers_loss": 0.03748156875371933, + "routers_loss": 0.037553198635578156, "skip_count": 2.0, "step": 2198, "text_loss": 0.36126700043678284 @@ -20898,13 +20898,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.09814453125, + "grad_norm": 0.09716796875, "learning_rate": 0.000932444140004014, - "loss": 0.0129, + "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 3547054.0, "repeat_count": 1.0, - "routers_loss": 0.006402099970728159, + "routers_loss": 0.006464479025453329, "skip_count": 0.0, "step": 2200, "text_loss": 0.4947047233581543 @@ -20917,13 +20917,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.158203125, + "grad_norm": 0.1015625, "learning_rate": 0.0009322886908828805, - "loss": 0.015, + "loss": 0.0138, "macro_f1": 0.6666666865348816, "num_tokens": 3549903.0, "repeat_count": 1.0, - "routers_loss": 0.0055928584188222885, + "routers_loss": 0.005384812597185373, "skip_count": 0.0, "step": 2202, "text_loss": 0.5923738479614258 @@ -20936,13 +20936,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0625, + "grad_norm": 0.0634765625, "learning_rate": 0.0009321330761079052, "loss": 0.0149, "macro_f1": 0.6666666865348816, "num_tokens": 3553745.0, "repeat_count": 0.0, - "routers_loss": 0.013155708089470863, + "routers_loss": 0.015346619300544262, "skip_count": 2.0, "step": 2204, "text_loss": 0.1904175877571106 @@ -20955,13 +20955,13 @@ "f1_execute": 0.9268292784690857, "f1_repeat": 0.800000011920929, "f1_skip": 0.800000011920929, - "grad_norm": 0.06884765625, + "grad_norm": 0.06494140625, "learning_rate": 0.00093197729573872, - "loss": 0.0206, + "loss": 0.0203, "macro_f1": 0.8422764539718628, "num_tokens": 3557235.0, "repeat_count": 3.0, - "routers_loss": 0.12029488384723663, + "routers_loss": 0.1207597479224205, "skip_count": 6.0, "step": 2206, "text_loss": 0.3904837667942047 @@ -20974,13 +20974,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0771484375, "learning_rate": 0.0009318213498350202, - "loss": 0.011, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3560795.0, "repeat_count": 0.0, - "routers_loss": 0.0037007431965321302, + "routers_loss": 0.003334777895361185, "skip_count": 0.0, "step": 2208, "text_loss": 0.4268290102481842 @@ -20993,13 +20993,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.0537109375, "learning_rate": 0.0009316652384565645, - "loss": 0.0124, + "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3563754.0, "repeat_count": 0.0, - "routers_loss": 0.004071404226124287, + "routers_loss": 0.004230072256177664, "skip_count": 0.0, "step": 2210, "text_loss": 0.40049710869789124 @@ -21012,13 +21012,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.046875, "learning_rate": 0.0009315089616631751, - "loss": 0.0103, + "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 3567173.0, "repeat_count": 0.0, - "routers_loss": 0.0006955390563234687, + "routers_loss": 0.0006645230459980667, "skip_count": 0.0, "step": 2212, "text_loss": 0.42568323016166687 @@ -21031,32 +21031,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0849609375, + "grad_norm": 0.07470703125, "learning_rate": 0.0009313525195147376, - "loss": 0.0128, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 3570831.0, "repeat_count": 0.0, - "routers_loss": 0.010293997824192047, + "routers_loss": 0.0097877848893404, "skip_count": 0.0, "step": 2214, "text_loss": 0.45808279514312744 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 10.40387437628412, - "f1_execute": 0.9583333134651184, - "f1_repeat": 1.0, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, - "grad_norm": 0.07470703125, + "grad_norm": 0.076171875, "learning_rate": 0.000931195912071201, - "loss": 0.0185, - "macro_f1": 0.8194444179534912, + "loss": 0.0187, + "macro_f1": 0.7018141150474548, "num_tokens": 3573745.0, "repeat_count": 2.0, - "routers_loss": 0.06593773514032364, + "routers_loss": 0.07351134717464447, "skip_count": 3.0, "step": 2216, "text_loss": 0.285696804523468 @@ -21069,13 +21069,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.07666015625, "learning_rate": 0.0009310391393925775, - "loss": 0.013, + "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 3576785.0, "repeat_count": 0.0, - "routers_loss": 0.00347105972468853, + "routers_loss": 0.0033160944003611803, "skip_count": 0.0, "step": 2218, "text_loss": 0.17516443133354187 @@ -21088,32 +21088,32 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.04736328125, + "grad_norm": 0.047119140625, "learning_rate": 0.0009308822015389424, - "loss": 0.0244, + "loss": 0.0241, "macro_f1": 0.5427350401878357, "num_tokens": 3580695.0, "repeat_count": 1.0, - "routers_loss": 0.04871147498488426, + "routers_loss": 0.052930232137441635, "skip_count": 1.0, "step": 2220, "text_loss": 0.5918155908584595 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 26.0, + "acc_skip": 0.75, + "avg_layers": 25.0, "epoch": 10.432051658350455, - "f1_execute": 0.9600000381469727, + "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.05517578125, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.072265625, "learning_rate": 0.0009307250985704352, - "loss": 0.012, - "macro_f1": 0.542222261428833, + "loss": 0.0128, + "macro_f1": 0.6122449040412903, "num_tokens": 3583729.0, "repeat_count": 0.0, - "routers_loss": 0.024859672412276268, + "routers_loss": 0.025454653427004814, "skip_count": 4.0, "step": 2222, "text_loss": 0.2652169466018677 @@ -21126,13 +21126,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.052001953125, "learning_rate": 0.0009305678305472575, - "loss": 0.016, + "loss": 0.0158, "macro_f1": 0.3333333432674408, "num_tokens": 3586775.0, "repeat_count": 0.0, - "routers_loss": 0.010990055277943611, + "routers_loss": 0.011279845610260963, "skip_count": 0.0, "step": 2224, "text_loss": 0.3511691987514496 @@ -21145,13 +21145,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10986328125, + "grad_norm": 0.10791015625, "learning_rate": 0.000930410397529675, - "loss": 0.0171, + "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 3589676.0, "repeat_count": 0.0, - "routers_loss": 0.0025031559634953737, + "routers_loss": 0.002700264798477292, "skip_count": 0.0, "step": 2226, "text_loss": 0.24045433104038239 @@ -21164,13 +21164,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.048095703125, "learning_rate": 0.000930252799578016, - "loss": 0.0147, + "loss": 0.0146, "macro_f1": 1.0, "num_tokens": 3593242.0, "repeat_count": 1.0, - "routers_loss": 0.008100497536361217, + "routers_loss": 0.00826631672680378, "skip_count": 2.0, "step": 2228, "text_loss": 0.3777645528316498 @@ -21183,13 +21183,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.061767578125, + "grad_norm": 0.06396484375, "learning_rate": 0.0009300950367526728, - "loss": 0.0128, + "loss": 0.0131, "macro_f1": 0.8820862174034119, "num_tokens": 3596807.0, "repeat_count": 2.0, - "routers_loss": 0.03150207921862602, + "routers_loss": 0.036221496760845184, "skip_count": 2.0, "step": 2230, "text_loss": 0.502962589263916 @@ -21202,13 +21202,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07470703125, + "grad_norm": 0.0703125, "learning_rate": 0.0009299371091141001, - "loss": 0.0132, + "loss": 0.0131, "macro_f1": 0.3333333432674408, "num_tokens": 3600150.0, "repeat_count": 0.0, - "routers_loss": 0.006253884173929691, + "routers_loss": 0.006449893582612276, "skip_count": 0.0, "step": 2232, "text_loss": 0.20256924629211426 @@ -21221,13 +21221,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.046142578125, + "grad_norm": 0.04638671875, "learning_rate": 0.0009297790167228161, - "loss": 0.0119, + "loss": 0.012, "macro_f1": 0.6666666865348816, "num_tokens": 3602988.0, "repeat_count": 0.0, - "routers_loss": 0.007228068076074123, + "routers_loss": 0.007872486487030983, "skip_count": 2.0, "step": 2234, "text_loss": 0.42476826906204224 @@ -21240,13 +21240,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.0576171875, "learning_rate": 0.0009296207596394022, - "loss": 0.0103, + "loss": 0.0101, "macro_f1": 0.32098764181137085, "num_tokens": 3606071.0, "repeat_count": 0.0, - "routers_loss": 0.02524643763899803, + "routers_loss": 0.027397040277719498, "skip_count": 2.0, "step": 2236, "text_loss": 0.23432791233062744 @@ -21259,13 +21259,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06640625, + "grad_norm": 0.0595703125, "learning_rate": 0.0009294623379245028, - "loss": 0.0119, + "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 3609389.0, "repeat_count": 0.0, - "routers_loss": 0.009672109968960285, + "routers_loss": 0.01042645052075386, "skip_count": 0.0, "step": 2238, "text_loss": 0.16665785014629364 @@ -21278,13 +21278,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0498046875, + "grad_norm": 0.052490234375, "learning_rate": 0.0009293037516388252, - "loss": 0.0155, + "loss": 0.0161, "macro_f1": 0.3333333432674408, "num_tokens": 3612105.0, "repeat_count": 0.0, - "routers_loss": 0.0010066524846479297, + "routers_loss": 0.0012458425480872393, "skip_count": 0.0, "step": 2240, "text_loss": 0.59421306848526 @@ -21297,13 +21297,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0732421875, + "grad_norm": 0.0751953125, "learning_rate": 0.0009291450008431404, - "loss": 0.0184, + "loss": 0.0185, "macro_f1": 1.0, "num_tokens": 3615439.0, "repeat_count": 1.0, - "routers_loss": 0.005509128328412771, + "routers_loss": 0.005781981628388166, "skip_count": 1.0, "step": 2242, "text_loss": 0.510798454284668 @@ -21316,13 +21316,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, - "grad_norm": 0.09423828125, + "grad_norm": 0.0966796875, "learning_rate": 0.0009289860855982814, - "loss": 0.0172, + "loss": 0.0166, "macro_f1": 0.4871794879436493, "num_tokens": 3618842.0, "repeat_count": 0.0, - "routers_loss": 0.030802007764577866, + "routers_loss": 0.031195320188999176, "skip_count": 3.0, "step": 2244, "text_loss": 0.7574363350868225 @@ -21335,13 +21335,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.04931640625, "learning_rate": 0.0009288270059651454, "loss": 0.0133, "macro_f1": 0.3333333432674408, "num_tokens": 3621823.0, "repeat_count": 0.0, - "routers_loss": 0.001686889911070466, + "routers_loss": 0.001746491645462811, "skip_count": 0.0, "step": 2246, "text_loss": 0.5125683546066284 @@ -21354,13 +21354,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.1943359375, + "grad_norm": 0.220703125, "learning_rate": 0.0009286677620046918, - "loss": 0.0163, + "loss": 0.0159, "macro_f1": 0.5492662787437439, "num_tokens": 3624502.0, "repeat_count": 0.0, - "routers_loss": 0.03299177065491676, + "routers_loss": 0.03792348504066467, "skip_count": 2.0, "step": 2248, "text_loss": 0.7533677220344543 @@ -21375,11 +21375,11 @@ "f1_skip": 0.0, "grad_norm": 0.07763671875, "learning_rate": 0.0009285083537779429, - "loss": 0.0119, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3627057.0, "repeat_count": 0.0, - "routers_loss": 0.0010354233672842383, + "routers_loss": 0.0009684451506473124, "skip_count": 0.0, "step": 2250, "text_loss": 0.2219279706478119 @@ -21392,13 +21392,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.10205078125, + "grad_norm": 0.11767578125, "learning_rate": 0.0009283487813459845, - "loss": 0.0145, + "loss": 0.0148, "macro_f1": 0.5492662787437439, "num_tokens": 3629720.0, "repeat_count": 0.0, - "routers_loss": 0.02196674607694149, + "routers_loss": 0.022757573053240776, "skip_count": 2.0, "step": 2252, "text_loss": 0.6903313994407654 @@ -21411,13 +21411,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.1201171875, + "grad_norm": 0.1376953125, "learning_rate": 0.0009281890447699652, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 3633234.0, "repeat_count": 1.0, - "routers_loss": 0.002239946974441409, + "routers_loss": 0.003613058477640152, "skip_count": 0.0, "step": 2254, "text_loss": 0.6278893351554871 @@ -21430,13 +21430,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.046142578125, + "grad_norm": 0.045654296875, "learning_rate": 0.0009280291441110961, - "loss": 0.0117, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3636289.0, "repeat_count": 0.0, - "routers_loss": 0.0063575254753232, + "routers_loss": 0.006214062683284283, "skip_count": 0.0, "step": 2256, "text_loss": 0.3011114001274109 @@ -21449,13 +21449,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.040283203125, + "grad_norm": 0.041015625, "learning_rate": 0.0009278690794306517, - "loss": 0.0143, + "loss": 0.014, "macro_f1": 0.5492662787437439, "num_tokens": 3640251.0, "repeat_count": 0.0, - "routers_loss": 0.0524379126727581, + "routers_loss": 0.052556321024894714, "skip_count": 2.0, "step": 2258, "text_loss": 0.19894185662269592 @@ -21468,13 +21468,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.8571428656578064, "f1_skip": 1.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.08251953125, "learning_rate": 0.0009277088507899689, - "loss": 0.0156, + "loss": 0.0163, "macro_f1": 0.9452888369560242, "num_tokens": 3643527.0, "repeat_count": 4.0, - "routers_loss": 0.052486274391412735, + "routers_loss": 0.0572301521897316, "skip_count": 1.0, "step": 2260, "text_loss": 0.5593410134315491 @@ -21487,13 +21487,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041748046875, + "grad_norm": 0.050537109375, "learning_rate": 0.0009275484582504475, "loss": 0.0104, "macro_f1": 0.3333333432674408, "num_tokens": 3646959.0, "repeat_count": 0.0, - "routers_loss": 0.006877690553665161, + "routers_loss": 0.008010074496269226, "skip_count": 0.0, "step": 2262, "text_loss": 0.2128177285194397 @@ -21506,13 +21506,13 @@ "f1_execute": 0.95652174949646, "f1_repeat": 0.800000011920929, "f1_skip": 0.800000011920929, - "grad_norm": 0.05322265625, + "grad_norm": 0.05419921875, "learning_rate": 0.0009273879018735505, - "loss": 0.0136, + "loss": 0.0138, "macro_f1": 0.8521739840507507, "num_tokens": 3651298.0, "repeat_count": 3.0, - "routers_loss": 0.03128742054104805, + "routers_loss": 0.035729870200157166, "skip_count": 3.0, "step": 2264, "text_loss": 0.2987811267375946 @@ -21525,13 +21525,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1689453125, + "grad_norm": 0.1474609375, "learning_rate": 0.0009272271817208031, - "loss": 0.0188, + "loss": 0.0182, "macro_f1": 0.3333333432674408, "num_tokens": 3655609.0, "repeat_count": 0.0, - "routers_loss": 0.0028425443451851606, + "routers_loss": 0.002379779238253832, "skip_count": 0.0, "step": 2266, "text_loss": 0.6024088263511658 @@ -21544,13 +21544,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06689453125, + "grad_norm": 0.06640625, "learning_rate": 0.0009270662978537939, - "loss": 0.0101, + "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 3658444.0, "repeat_count": 0.0, - "routers_loss": 0.009712206199765205, + "routers_loss": 0.008943650871515274, "skip_count": 0.0, "step": 2268, "text_loss": 0.1741207242012024 @@ -21563,13 +21563,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0634765625, + "grad_norm": 0.053955078125, "learning_rate": 0.0009269052503341736, - "loss": 0.0162, + "loss": 0.0161, "macro_f1": 0.6595745086669922, "num_tokens": 3662282.0, "repeat_count": 1.0, - "routers_loss": 0.03980376198887825, + "routers_loss": 0.030201267451047897, "skip_count": 4.0, "step": 2270, "text_loss": 0.7300035953521729 @@ -21582,13 +21582,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.049072265625, "learning_rate": 0.0009267440392236562, - "loss": 0.0098, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 3665531.0, "repeat_count": 0.0, - "routers_loss": 0.0030603872146457434, + "routers_loss": 0.0026635683607310057, "skip_count": 0.0, "step": 2272, "text_loss": 0.31535038352012634 @@ -21601,13 +21601,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.0615234375, "learning_rate": 0.0009265826645840178, "loss": 0.0151, "macro_f1": 0.3333333432674408, "num_tokens": 3668407.0, "repeat_count": 0.0, - "routers_loss": 0.004795679822564125, + "routers_loss": 0.004258926957845688, "skip_count": 0.0, "step": 2274, "text_loss": 0.7272579073905945 @@ -21620,13 +21620,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.1435546875, + "grad_norm": 0.125, "learning_rate": 0.0009264211264770976, - "loss": 0.0155, + "loss": 0.0154, "macro_f1": 0.6122449040412903, "num_tokens": 3671503.0, "repeat_count": 0.0, - "routers_loss": 0.0340447798371315, + "routers_loss": 0.038987524807453156, "skip_count": 4.0, "step": 2276, "text_loss": 0.7488982677459717 @@ -21639,13 +21639,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.099609375, "learning_rate": 0.0009262594249647975, - "loss": 0.016, + "loss": 0.0164, "macro_f1": 0.6666666865348816, "num_tokens": 3674107.0, "repeat_count": 0.0, - "routers_loss": 0.007436402142047882, + "routers_loss": 0.007211760152131319, "skip_count": 1.0, "step": 2278, "text_loss": 0.1992369294166565 @@ -21658,13 +21658,13 @@ "f1_execute": 0.9767441749572754, "f1_repeat": 0.8571428656578064, "f1_skip": 1.0, - "grad_norm": 0.056396484375, + "grad_norm": 0.0546875, "learning_rate": 0.0009260975601090815, - "loss": 0.0113, + "loss": 0.0112, "macro_f1": 0.9446290731430054, "num_tokens": 3677184.0, "repeat_count": 4.0, - "routers_loss": 0.02465176396071911, + "routers_loss": 0.02538592554628849, "skip_count": 3.0, "step": 2280, "text_loss": 0.46402135491371155 @@ -21677,13 +21677,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07861328125, + "grad_norm": 0.0654296875, "learning_rate": 0.0009259355319719768, - "loss": 0.0167, + "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 3680683.0, "repeat_count": 0.0, - "routers_loss": 0.0037910486571490765, + "routers_loss": 0.0038464947137981653, "skip_count": 0.0, "step": 2282, "text_loss": 0.5804527401924133 @@ -21696,13 +21696,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.130859375, + "grad_norm": 0.1611328125, "learning_rate": 0.0009257733406155726, - "loss": 0.0161, + "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 3683928.0, "repeat_count": 0.0, - "routers_loss": 0.003716849023476243, + "routers_loss": 0.004841136280447245, "skip_count": 0.0, "step": 2284, "text_loss": 0.4834538400173187 @@ -21715,13 +21715,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.050048828125, "learning_rate": 0.0009256109861020212, - "loss": 0.0118, + "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 3687101.0, "repeat_count": 0.0, - "routers_loss": 0.0021690395660698414, + "routers_loss": 0.002191900508478284, "skip_count": 0.0, "step": 2286, "text_loss": 0.8199604749679565 @@ -21734,13 +21734,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.08203125, + "grad_norm": 0.0927734375, "learning_rate": 0.000925448468493537, "loss": 0.0162, "macro_f1": 0.5427350401878357, "num_tokens": 3690490.0, "repeat_count": 1.0, - "routers_loss": 0.034040264785289764, + "routers_loss": 0.03488675877451897, "skip_count": 2.0, "step": 2288, "text_loss": 0.33263635635375977 @@ -21753,32 +21753,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.06640625, "learning_rate": 0.0009252857878523971, - "loss": 0.0133, + "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 3694109.0, "repeat_count": 1.0, - "routers_loss": 0.0027822356205433607, + "routers_loss": 0.002897309372201562, "skip_count": 0.0, "step": 2290, "text_loss": 0.47494807839393616 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 10.760786615791018, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.0634765625, + "f1_skip": 1.0, + "grad_norm": 0.05810546875, "learning_rate": 0.000925122944240941, - "loss": 0.0156, - "macro_f1": 0.5492662787437439, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, "num_tokens": 3697233.0, "repeat_count": 0.0, - "routers_loss": 0.020813947543501854, + "routers_loss": 0.01842675730586052, "skip_count": 2.0, "step": 2292, "text_loss": 0.14693495631217957 @@ -21791,13 +21791,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.042236328125, + "grad_norm": 0.045654296875, "learning_rate": 0.0009249599377215707, - "loss": 0.0145, + "loss": 0.0146, "macro_f1": 0.5866667032241821, "num_tokens": 3700376.0, "repeat_count": 1.0, - "routers_loss": 0.038725610822439194, + "routers_loss": 0.04169808700680733, "skip_count": 3.0, "step": 2294, "text_loss": 0.38051268458366394 @@ -21810,13 +21810,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059326171875, + "grad_norm": 0.05908203125, "learning_rate": 0.0009247967683567507, - "loss": 0.0117, + "loss": 0.0112, "macro_f1": 0.3272727429866791, "num_tokens": 3703212.0, "repeat_count": 0.0, - "routers_loss": 0.01360203418880701, + "routers_loss": 0.012183113023638725, "skip_count": 1.0, "step": 2296, "text_loss": 0.23789077997207642 @@ -21829,13 +21829,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0498046875, + "grad_norm": 0.05712890625, "learning_rate": 0.0009246334362090077, - "loss": 0.0135, + "loss": 0.0137, "macro_f1": 0.8823530077934265, "num_tokens": 3706490.0, "repeat_count": 1.0, - "routers_loss": 0.021909991279244423, + "routers_loss": 0.01880069635808468, "skip_count": 2.0, "step": 2298, "text_loss": 0.29067978262901306 @@ -21848,13 +21848,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0830078125, + "grad_norm": 0.08203125, "learning_rate": 0.000924469941340931, - "loss": 0.0175, + "loss": 0.0173, "macro_f1": 0.3272727429866791, "num_tokens": 3709804.0, "repeat_count": 1.0, - "routers_loss": 0.03153124824166298, + "routers_loss": 0.027359159663319588, "skip_count": 0.0, "step": 2300, "text_loss": 0.67828369140625 @@ -21867,13 +21867,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.07275390625, "learning_rate": 0.000924306283815172, - "loss": 0.0154, + "loss": 0.0153, "macro_f1": 0.3333333432674408, "num_tokens": 3712824.0, "repeat_count": 0.0, - "routers_loss": 0.0034419491421431303, + "routers_loss": 0.003152279881760478, "skip_count": 0.0, "step": 2302, "text_loss": 0.8333184719085693 @@ -21886,13 +21886,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.061767578125, + "grad_norm": 0.0703125, "learning_rate": 0.0009241424636944445, - "loss": 0.0163, + "loss": 0.0159, "macro_f1": 0.5492662787437439, "num_tokens": 3715385.0, "repeat_count": 0.0, - "routers_loss": 0.03655214607715607, + "routers_loss": 0.0442950464785099, "skip_count": 2.0, "step": 2304, "text_loss": 0.41893699765205383 @@ -21905,13 +21905,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0576171875, + "grad_norm": 0.058837890625, "learning_rate": 0.0009239784810415249, - "loss": 0.014, + "loss": 0.0137, "macro_f1": 0.8823530077934265, "num_tokens": 3719080.0, "repeat_count": 1.0, - "routers_loss": 0.015360959805548191, + "routers_loss": 0.015729321166872978, "skip_count": 2.0, "step": 2306, "text_loss": 0.13360483944416046 @@ -21924,13 +21924,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.0537109375, + "grad_norm": 0.06787109375, "learning_rate": 0.0009238143359192514, "loss": 0.0136, "macro_f1": 0.5934640765190125, "num_tokens": 3722439.0, "repeat_count": 0.0, - "routers_loss": 0.027275927364826202, + "routers_loss": 0.028816604986786842, "skip_count": 3.0, "step": 2308, "text_loss": 0.39594101905822754 @@ -21943,13 +21943,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0546875, + "grad_norm": 0.05419921875, "learning_rate": 0.000923650028390525, - "loss": 0.0163, + "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 3725092.0, "repeat_count": 0.0, - "routers_loss": 0.003742894157767296, + "routers_loss": 0.0036455015651881695, "skip_count": 2.0, "step": 2310, "text_loss": 0.6169708371162415 @@ -21962,13 +21962,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0927734375, + "grad_norm": 0.09814453125, "learning_rate": 0.0009234855585183086, - "loss": 0.0135, + "loss": 0.014, "macro_f1": 0.6666666865348816, "num_tokens": 3728412.0, "repeat_count": 0.0, - "routers_loss": 0.009356650523841381, + "routers_loss": 0.007565604057163, "skip_count": 1.0, "step": 2312, "text_loss": 0.21257059276103973 @@ -21983,11 +21983,11 @@ "f1_skip": 0.800000011920929, "grad_norm": 0.0517578125, "learning_rate": 0.0009233209263656273, - "loss": 0.0189, + "loss": 0.0184, "macro_f1": 0.9262410998344421, "num_tokens": 3731467.0, "repeat_count": 2.0, - "routers_loss": 0.02852487564086914, + "routers_loss": 0.02510629966855049, "skip_count": 3.0, "step": 2314, "text_loss": 0.21639840304851532 @@ -22000,13 +22000,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05859375, + "grad_norm": 0.057861328125, "learning_rate": 0.0009231561319955684, - "loss": 0.0151, + "loss": 0.0154, "macro_f1": 0.3333333432674408, "num_tokens": 3734906.0, "repeat_count": 0.0, - "routers_loss": 0.007533316500484943, + "routers_loss": 0.00872227642685175, "skip_count": 0.0, "step": 2316, "text_loss": 0.35639774799346924 @@ -22019,13 +22019,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.09130859375, + "grad_norm": 0.08349609375, "learning_rate": 0.0009229911754712815, "loss": 0.0176, "macro_f1": 0.3333333432674408, "num_tokens": 3737943.0, "repeat_count": 0.0, - "routers_loss": 0.004666361026465893, + "routers_loss": 0.004695790819823742, "skip_count": 0.0, "step": 2318, "text_loss": 0.5269573330879211 @@ -22038,32 +22038,32 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.036376953125, "learning_rate": 0.0009228260568559781, - "loss": 0.0117, + "loss": 0.0115, "macro_f1": 0.3272727429866791, "num_tokens": 3741833.0, "repeat_count": 1.0, - "routers_loss": 0.020992714911699295, + "routers_loss": 0.0217357836663723, "skip_count": 0.0, "step": 2320, "text_loss": 0.5110208988189697 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 10.901673026122689, - "f1_execute": 0.9811320900917053, + "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.1416015625, + "f1_skip": 0.0, + "grad_norm": 0.1953125, "learning_rate": 0.0009226607762129322, - "loss": 0.0204, - "macro_f1": 0.6603773832321167, + "loss": 0.0201, + "macro_f1": 0.32098764181137085, "num_tokens": 3744642.0, "repeat_count": 1.0, - "routers_loss": 0.047016773372888565, + "routers_loss": 0.05595960095524788, "skip_count": 1.0, "step": 2322, "text_loss": 0.6291998624801636 @@ -22078,11 +22078,11 @@ "f1_skip": 0.0, "grad_norm": 0.056884765625, "learning_rate": 0.0009224953336054796, - "loss": 0.0156, + "loss": 0.0161, "macro_f1": 0.3333333432674408, "num_tokens": 3748127.0, "repeat_count": 0.0, - "routers_loss": 0.006612313445657492, + "routers_loss": 0.0071634589694440365, "skip_count": 0.0, "step": 2324, "text_loss": 0.7404762506484985 @@ -22095,13 +22095,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.050537109375, "learning_rate": 0.000922329729097018, - "loss": 0.0164, + "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 3751373.0, "repeat_count": 0.0, - "routers_loss": 0.0012452995870262384, + "routers_loss": 0.0011676300782710314, "skip_count": 0.0, "step": 2326, "text_loss": 0.2915459871292114 @@ -22114,13 +22114,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.055908203125, + "grad_norm": 0.061279296875, "learning_rate": 0.0009221639627510075, - "loss": 0.0128, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 3754518.0, "repeat_count": 0.0, - "routers_loss": 0.011379311792552471, + "routers_loss": 0.01039792038500309, "skip_count": 0.0, "step": 2328, "text_loss": 0.22066321969032288 @@ -22133,13 +22133,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0791015625, + "grad_norm": 0.0751953125, "learning_rate": 0.0009219980346309702, - "loss": 0.0127, + "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 3757621.0, "repeat_count": 0.0, - "routers_loss": 0.002973968628793955, + "routers_loss": 0.0032070958986878395, "skip_count": 0.0, "step": 2330, "text_loss": 0.5558560490608215 @@ -22152,13 +22152,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.076171875, "learning_rate": 0.0009218319448004899, - "loss": 0.012, + "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 3760885.0, "repeat_count": 0.0, - "routers_loss": 0.00768645154312253, + "routers_loss": 0.007085457909852266, "skip_count": 0.0, "step": 2332, "text_loss": 0.4348253607749939 @@ -22171,13 +22171,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1015625, + "grad_norm": 0.1103515625, "learning_rate": 0.0009216656933232129, - "loss": 0.0167, + "loss": 0.016, "macro_f1": 0.6666666865348816, "num_tokens": 3764462.0, "repeat_count": 0.0, - "routers_loss": 0.006761785596609116, + "routers_loss": 0.005504854489117861, "skip_count": 1.0, "step": 2334, "text_loss": 0.35828644037246704 @@ -22190,13 +22190,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0576171875, + "grad_norm": 0.05615234375, "learning_rate": 0.0009214992802628463, - "loss": 0.0129, + "loss": 0.0131, "macro_f1": 0.3333333432674408, "num_tokens": 3767159.0, "repeat_count": 0.0, - "routers_loss": 0.0013711688807234168, + "routers_loss": 0.0013970810687169433, "skip_count": 0.0, "step": 2336, "text_loss": 0.2956557869911194 @@ -22209,13 +22209,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08154296875, + "grad_norm": 0.08203125, "learning_rate": 0.0009213327056831607, - "loss": 0.0174, + "loss": 0.0181, "macro_f1": 0.3272727429866791, "num_tokens": 3770408.0, "repeat_count": 0.0, - "routers_loss": 0.04009406641125679, + "routers_loss": 0.0427570566534996, "skip_count": 1.0, "step": 2338, "text_loss": 0.14883014559745789 @@ -22228,13 +22228,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04443359375, + "grad_norm": 0.041015625, "learning_rate": 0.0009211659696479875, - "loss": 0.0095, + "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 3773474.0, "repeat_count": 0.0, - "routers_loss": 0.0013272224459797144, + "routers_loss": 0.0011273405980318785, "skip_count": 0.0, "step": 2340, "text_loss": 0.26011669635772705 @@ -22249,11 +22249,11 @@ "f1_skip": 0.0, "grad_norm": 0.059814453125, "learning_rate": 0.00092099907222122, - "loss": 0.0145, + "loss": 0.0148, "macro_f1": 0.3333333432674408, "num_tokens": 3776909.0, "repeat_count": 0.0, - "routers_loss": 0.001724833040498197, + "routers_loss": 0.0016178421210497618, "skip_count": 0.0, "step": 2342, "text_loss": 0.49078530073165894 @@ -22266,13 +22266,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05908203125, + "grad_norm": 0.051025390625, "learning_rate": 0.000920832013466814, - "loss": 0.0132, + "loss": 0.0129, "macro_f1": 0.3333333432674408, "num_tokens": 3780741.0, "repeat_count": 0.0, - "routers_loss": 0.005641496740281582, + "routers_loss": 0.005510095041245222, "skip_count": 0.0, "step": 2344, "text_loss": 0.4870249927043915 @@ -22285,13 +22285,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.037109375, "learning_rate": 0.0009206647934487866, - "loss": 0.011, + "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 3784673.0, "repeat_count": 1.0, - "routers_loss": 0.003907595761120319, + "routers_loss": 0.0047357892617583275, "skip_count": 0.0, "step": 2346, "text_loss": 0.3251725733280182 @@ -22304,13 +22304,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.057861328125, + "grad_norm": 0.05615234375, "learning_rate": 0.0009204974122312167, - "loss": 0.0141, + "loss": 0.0142, "macro_f1": 0.6666666865348816, "num_tokens": 3787503.0, "repeat_count": 0.0, - "routers_loss": 0.007570050656795502, + "routers_loss": 0.00795028731226921, "skip_count": 1.0, "step": 2348, "text_loss": 0.18282145261764526 @@ -22323,13 +22323,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.055908203125, + "grad_norm": 0.060546875, "learning_rate": 0.0009203298698782452, - "loss": 0.0079, + "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 3790528.0, "repeat_count": 1.0, - "routers_loss": 0.0009280897793360054, + "routers_loss": 0.0009506374481134117, "skip_count": 0.0, "step": 2350, "text_loss": 0.4093080461025238 @@ -22342,13 +22342,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.045166015625, + "grad_norm": 0.047607421875, "learning_rate": 0.0009201621664540747, "loss": 0.0155, "macro_f1": 0.6666666865348816, "num_tokens": 3794134.0, "repeat_count": 1.0, - "routers_loss": 0.005288597662001848, + "routers_loss": 0.005159572698175907, "skip_count": 0.0, "step": 2352, "text_loss": 0.5451981425285339 @@ -22361,13 +22361,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.07666015625, "learning_rate": 0.0009199943020229694, - "loss": 0.0146, + "loss": 0.0148, "macro_f1": 0.3333333432674408, "num_tokens": 3797414.0, "repeat_count": 0.0, - "routers_loss": 0.002237799344584346, + "routers_loss": 0.002356168581172824, "skip_count": 0.0, "step": 2354, "text_loss": 0.3070453405380249 @@ -22380,13 +22380,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.0810546875, "learning_rate": 0.0009198262766492554, - "loss": 0.0144, + "loss": 0.0141, "macro_f1": 0.6666666865348816, "num_tokens": 3800094.0, "repeat_count": 0.0, - "routers_loss": 0.006226782687008381, + "routers_loss": 0.0051761893555521965, "skip_count": 1.0, "step": 2356, "text_loss": 0.5880904197692871 @@ -22399,13 +22399,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.049072265625, + "grad_norm": 0.049560546875, "learning_rate": 0.00091965809039732, - "loss": 0.0136, + "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3803280.0, "repeat_count": 0.0, - "routers_loss": 0.0027645498048514128, + "routers_loss": 0.0025952060241252184, "skip_count": 0.0, "step": 2358, "text_loss": 0.5210731625556946 @@ -22418,13 +22418,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.06787109375, "learning_rate": 0.0009194897433316127, - "loss": 0.0122, + "loss": 0.0125, "macro_f1": 0.6666666865348816, "num_tokens": 3805866.0, "repeat_count": 0.0, - "routers_loss": 0.0034913592971861362, + "routers_loss": 0.0042560105212032795, "skip_count": 2.0, "step": 2360, "text_loss": 0.6472984552383423 @@ -22437,13 +22437,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08056640625, + "grad_norm": 0.07568359375, "learning_rate": 0.0009193212355166446, - "loss": 0.0112, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3808952.0, "repeat_count": 0.0, - "routers_loss": 0.002706601284444332, + "routers_loss": 0.0026232977397739887, "skip_count": 0.0, "step": 2362, "text_loss": 0.450063556432724 @@ -22456,13 +22456,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.06689453125, "learning_rate": 0.0009191525670169881, - "loss": 0.0108, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3812080.0, "repeat_count": 0.0, - "routers_loss": 0.0032696903217583895, + "routers_loss": 0.0034355956595391035, "skip_count": 0.0, "step": 2364, "text_loss": 0.49727216362953186 @@ -22475,13 +22475,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.061767578125, + "grad_norm": 0.05908203125, "learning_rate": 0.000918983737897277, - "loss": 0.0115, + "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 3815282.0, "repeat_count": 0.0, - "routers_loss": 0.006245410069823265, + "routers_loss": 0.0055653867311775684, "skip_count": 1.0, "step": 2366, "text_loss": 0.6336377859115601 @@ -22496,11 +22496,11 @@ "f1_skip": 1.0, "grad_norm": 0.033447265625, "learning_rate": 0.0009188147482222071, - "loss": 0.0079, + "loss": 0.008, "macro_f1": 1.0, "num_tokens": 3818106.0, "repeat_count": 2.0, - "routers_loss": 0.011230813339352608, + "routers_loss": 0.011016021482646465, "skip_count": 2.0, "step": 2368, "text_loss": 0.22513329982757568 @@ -22515,11 +22515,11 @@ "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.0009186455980565358, - "loss": 0.0109, + "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 3821228.0, "repeat_count": 1.0, - "routers_loss": 0.014897257089614868, + "routers_loss": 0.014039464294910431, "skip_count": 0.0, "step": 2370, "text_loss": 0.21331638097763062 @@ -22532,13 +22532,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.062255859375, "learning_rate": 0.0009184762874650816, - "loss": 0.0131, + "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 3825048.0, "repeat_count": 0.0, - "routers_loss": 0.0015503648901358247, + "routers_loss": 0.001088051125407219, "skip_count": 0.0, "step": 2372, "text_loss": 0.6031543612480164 @@ -22551,13 +22551,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.091796875, + "grad_norm": 0.095703125, "learning_rate": 0.0009183068165127245, - "loss": 0.0127, + "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 3828781.0, "repeat_count": 0.0, - "routers_loss": 0.00723480898886919, + "routers_loss": 0.006263940595090389, "skip_count": 1.0, "step": 2374, "text_loss": 0.6249601244926453 @@ -22570,13 +22570,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.076171875, + "grad_norm": 0.06982421875, "learning_rate": 0.0009181371852644062, - "loss": 0.0139, + "loss": 0.0133, "macro_f1": 0.6666666865348816, "num_tokens": 3832507.0, "repeat_count": 1.0, - "routers_loss": 0.002053398173302412, + "routers_loss": 0.001987969037145376, "skip_count": 0.0, "step": 2376, "text_loss": 0.37972065806388855 @@ -22589,32 +22589,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06689453125, + "grad_norm": 0.0908203125, "learning_rate": 0.0009179673937851299, "loss": 0.0158, "macro_f1": 0.6666666865348816, "num_tokens": 3835644.0, "repeat_count": 0.0, - "routers_loss": 0.007927518337965012, + "routers_loss": 0.007635094691067934, "skip_count": 1.0, "step": 2378, "text_loss": 0.46319663524627686 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 28.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 11.173759906075727, "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.06298828125, + "f1_skip": 1.0, + "grad_norm": 0.0830078125, "learning_rate": 0.0009177974421399598, - "loss": 0.0144, - "macro_f1": 0.5555555820465088, + "loss": 0.0137, + "macro_f1": 0.6666666865348816, "num_tokens": 3838700.0, "repeat_count": 0.0, - "routers_loss": 0.01924682781100273, + "routers_loss": 0.01617279462516308, "skip_count": 2.0, "step": 2380, "text_loss": 0.32141056656837463 @@ -22627,13 +22627,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.046875, + "grad_norm": 0.056396484375, "learning_rate": 0.0009176273303940217, - "loss": 0.0106, + "loss": 0.011, "macro_f1": 0.6666666865348816, "num_tokens": 3841953.0, "repeat_count": 0.0, - "routers_loss": 0.0021689811255782843, + "routers_loss": 0.0022273799404501915, "skip_count": 2.0, "step": 2382, "text_loss": 0.5908139944076538 @@ -22646,13 +22646,13 @@ "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.062255859375, + "grad_norm": 0.0615234375, "learning_rate": 0.0009174570586125026, - "loss": 0.0119, + "loss": 0.0122, "macro_f1": 0.32098767161369324, "num_tokens": 3845763.0, "repeat_count": 1.0, - "routers_loss": 0.03431013971567154, + "routers_loss": 0.030915161594748497, "skip_count": 0.0, "step": 2384, "text_loss": 0.41400137543678284 @@ -22665,13 +22665,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.043212890625, + "grad_norm": 0.04248046875, "learning_rate": 0.0009172866268606513, - "loss": 0.0123, + "loss": 0.0122, "macro_f1": 0.6666666865348816, "num_tokens": 3848984.0, "repeat_count": 0.0, - "routers_loss": 0.008275258354842663, + "routers_loss": 0.010480951517820358, "skip_count": 2.0, "step": 2386, "text_loss": 0.2560874819755554 @@ -22684,13 +22684,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04736328125, + "grad_norm": 0.056396484375, "learning_rate": 0.0009171160352037775, - "loss": 0.0121, + "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 3852118.0, "repeat_count": 0.0, - "routers_loss": 0.007780806161463261, + "routers_loss": 0.00809961836785078, "skip_count": 1.0, "step": 2388, "text_loss": 0.28236693143844604 @@ -22709,7 +22709,7 @@ "macro_f1": 1.0, "num_tokens": 3855314.0, "repeat_count": 1.0, - "routers_loss": 0.00553786288946867, + "routers_loss": 0.005569872446358204, "skip_count": 1.0, "step": 2390, "text_loss": 0.4578137695789337 @@ -22722,13 +22722,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08447265625, + "grad_norm": 0.1123046875, "learning_rate": 0.0009167743724365073, - "loss": 0.01, + "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 3858301.0, "repeat_count": 0.0, - "routers_loss": 0.004066115710884333, + "routers_loss": 0.0038610948249697685, "skip_count": 1.0, "step": 2392, "text_loss": 0.14082716405391693 @@ -22741,13 +22741,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0810546875, + "grad_norm": 0.1376953125, "learning_rate": 0.0009166033014570368, - "loss": 0.0104, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3861296.0, "repeat_count": 0.0, - "routers_loss": 0.002403446938842535, + "routers_loss": 0.0017607157351449132, "skip_count": 0.0, "step": 2394, "text_loss": 0.384442001581192 @@ -22760,13 +22760,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.054443359375, + "grad_norm": 0.051025390625, "learning_rate": 0.0009164320708343954, - "loss": 0.0137, + "loss": 0.0131, "macro_f1": 0.6666666865348816, "num_tokens": 3863985.0, "repeat_count": 2.0, - "routers_loss": 0.010212135501205921, + "routers_loss": 0.009627950377762318, "skip_count": 0.0, "step": 2396, "text_loss": 0.6969521045684814 @@ -22779,13 +22779,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07275390625, + "grad_norm": 0.07666015625, "learning_rate": 0.0009162606806341989, "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 3866636.0, "repeat_count": 0.0, - "routers_loss": 0.007781816180795431, + "routers_loss": 0.006915586534887552, "skip_count": 0.0, "step": 2398, "text_loss": 0.48069697618484497 @@ -22798,32 +22798,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.04248046875, "learning_rate": 0.0009160891309221242, - "loss": 0.0151, + "loss": 0.0149, "macro_f1": 0.6666666865348816, "num_tokens": 3870867.0, "repeat_count": 1.0, - "routers_loss": 0.0016227158484980464, + "routers_loss": 0.0013031222624704242, "skip_count": 0.0, "step": 2400, "text_loss": 0.3882075846195221 }, { "acc_repeat": 0.5, - "acc_skip": 1.0, - "avg_layers": 28.0, + "acc_skip": 0.0, + "avg_layers": 29.0, "epoch": 11.277076606985618, - "f1_execute": 0.9803921580314636, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.6666666865348816, - "f1_skip": 1.0, - "grad_norm": 0.06298828125, + "f1_skip": 0.0, + "grad_norm": 0.06640625, "learning_rate": 0.0009159174217639096, - "loss": 0.0114, - "macro_f1": 0.8823530077934265, + "loss": 0.0112, + "macro_f1": 0.5427350401878357, "num_tokens": 3873663.0, "repeat_count": 2.0, - "routers_loss": 0.06490851938724518, + "routers_loss": 0.06621067970991135, "skip_count": 1.0, "step": 2402, "text_loss": 0.5740041136741638 @@ -22836,13 +22836,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.03662109375, "learning_rate": 0.0009157455532253547, - "loss": 0.0075, + "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 3876788.0, "repeat_count": 1.0, - "routers_loss": 0.007105287164449692, + "routers_loss": 0.005957918707281351, "skip_count": 0.0, "step": 2404, "text_loss": 0.26025933027267456 @@ -22855,13 +22855,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.06787109375, + "grad_norm": 0.08642578125, "learning_rate": 0.0009155735253723191, - "loss": 0.0125, + "loss": 0.0126, "macro_f1": 0.9452888369560242, "num_tokens": 3879942.0, "repeat_count": 1.0, - "routers_loss": 0.03736003860831261, + "routers_loss": 0.039429809898138046, "skip_count": 4.0, "step": 2406, "text_loss": 1.1349908113479614 @@ -22874,13 +22874,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.047607421875, "learning_rate": 0.0009154013382707251, - "loss": 0.011, + "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 3882682.0, "repeat_count": 0.0, - "routers_loss": 0.0012925176415592432, + "routers_loss": 0.0012570557883009315, "skip_count": 0.0, "step": 2408, "text_loss": 0.5611135363578796 @@ -22895,11 +22895,11 @@ "f1_skip": 0.0, "grad_norm": 0.034423828125, "learning_rate": 0.0009152289919865543, - "loss": 0.0124, + "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3886425.0, "repeat_count": 0.0, - "routers_loss": 0.001746711554005742, + "routers_loss": 0.0017455556662753224, "skip_count": 0.0, "step": 2410, "text_loss": 0.7523751854896545 @@ -22912,13 +22912,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04345703125, + "grad_norm": 0.04052734375, "learning_rate": 0.0009150564865858506, - "loss": 0.0112, + "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 3889273.0, "repeat_count": 0.0, - "routers_loss": 0.011005193926393986, + "routers_loss": 0.011178011074662209, "skip_count": 1.0, "step": 2412, "text_loss": 0.26942551136016846 @@ -22931,13 +22931,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.800000011920929, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.07373046875, "learning_rate": 0.0009148838221347182, - "loss": 0.0102, + "loss": 0.0107, "macro_f1": 0.5934640765190125, "num_tokens": 3892199.0, "repeat_count": 3.0, - "routers_loss": 0.017795369029045105, + "routers_loss": 0.019628092646598816, "skip_count": 0.0, "step": 2414, "text_loss": 0.5492315888404846 @@ -22950,13 +22950,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.04541015625, "learning_rate": 0.0009147109986993225, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 3895362.0, "repeat_count": 1.0, - "routers_loss": 0.011693861335515976, + "routers_loss": 0.012255983427166939, "skip_count": 0.0, "step": 2416, "text_loss": 0.23798216879367828 @@ -22969,13 +22969,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1259765625, + "grad_norm": 0.11669921875, "learning_rate": 0.0009145380163458899, - "loss": 0.0177, + "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 3898476.0, "repeat_count": 0.0, - "routers_loss": 0.007135285064578056, + "routers_loss": 0.007018954027444124, "skip_count": 0.0, "step": 2418, "text_loss": 0.1923145055770874 @@ -22988,13 +22988,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03515625, + "grad_norm": 0.03369140625, "learning_rate": 0.0009143648751407074, - "loss": 0.0082, + "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 3901817.0, "repeat_count": 0.0, - "routers_loss": 0.0008607010240666568, + "routers_loss": 0.0008574824314564466, "skip_count": 0.0, "step": 2420, "text_loss": 0.4001806974411011 @@ -23007,13 +23007,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.07861328125, + "grad_norm": 0.11328125, "learning_rate": 0.0009141915751501231, - "loss": 0.0101, + "loss": 0.0102, "macro_f1": 0.5492662787437439, "num_tokens": 3905461.0, "repeat_count": 0.0, - "routers_loss": 0.015359465964138508, + "routers_loss": 0.01572350226342678, "skip_count": 2.0, "step": 2422, "text_loss": 0.19519129395484924 @@ -23026,13 +23026,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0380859375, + "grad_norm": 0.037353515625, "learning_rate": 0.0009140181164405458, - "loss": 0.011, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 3908878.0, "repeat_count": 0.0, - "routers_loss": 0.00047823251225054264, + "routers_loss": 0.0005503420252352953, "skip_count": 0.0, "step": 2424, "text_loss": 0.6937088370323181 @@ -23047,11 +23047,11 @@ "f1_skip": 0.0, "grad_norm": 0.068359375, "learning_rate": 0.0009138444990784454, - "loss": 0.0129, + "loss": 0.013, "macro_f1": 0.3333333432674408, "num_tokens": 3912053.0, "repeat_count": 0.0, - "routers_loss": 0.0070601715706288815, + "routers_loss": 0.007556677330285311, "skip_count": 0.0, "step": 2426, "text_loss": 0.35431069135665894 @@ -23064,13 +23064,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0947265625, + "grad_norm": 0.06201171875, "learning_rate": 0.000913670723130352, - "loss": 0.0123, + "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 3915192.0, "repeat_count": 0.0, - "routers_loss": 0.0010537977796047926, + "routers_loss": 0.0013609991874545813, "skip_count": 0.0, "step": 2428, "text_loss": 0.5171207189559937 @@ -23083,13 +23083,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0517578125, + "grad_norm": 0.050048828125, "learning_rate": 0.0009134967886628573, - "loss": 0.0117, + "loss": 0.0115, "macro_f1": 1.0, "num_tokens": 3917927.0, "repeat_count": 2.0, - "routers_loss": 0.012852456420660019, + "routers_loss": 0.010895746760070324, "skip_count": 2.0, "step": 2430, "text_loss": 0.2852934002876282 @@ -23102,13 +23102,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.062255859375, "learning_rate": 0.0009133226957426133, - "loss": 0.0134, + "loss": 0.0132, "macro_f1": 0.5492662787437439, "num_tokens": 3921460.0, "repeat_count": 2.0, - "routers_loss": 0.05307198315858841, + "routers_loss": 0.04196908697485924, "skip_count": 0.0, "step": 2432, "text_loss": 0.4864770770072937 @@ -23121,13 +23121,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1015625, + "grad_norm": 0.1025390625, "learning_rate": 0.0009131484444363324, - "loss": 0.0154, + "loss": 0.0155, "macro_f1": 0.3333333432674408, "num_tokens": 3924662.0, "repeat_count": 0.0, - "routers_loss": 0.004656757228076458, + "routers_loss": 0.004484197124838829, "skip_count": 0.0, "step": 2434, "text_loss": 0.7568684220314026 @@ -23140,13 +23140,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0498046875, + "grad_norm": 0.05078125, "learning_rate": 0.0009129740348107882, - "loss": 0.0113, + "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 3927337.0, "repeat_count": 0.0, - "routers_loss": 0.0042406003922224045, + "routers_loss": 0.004351360257714987, "skip_count": 2.0, "step": 2436, "text_loss": 0.5953161716461182 @@ -23159,13 +23159,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.0517578125, + "grad_norm": 0.04736328125, "learning_rate": 0.0009127994669328151, - "loss": 0.0089, + "loss": 0.0085, "macro_f1": 0.6122449040412903, "num_tokens": 3930407.0, "repeat_count": 0.0, - "routers_loss": 0.018079286441206932, + "routers_loss": 0.01664198748767376, "skip_count": 4.0, "step": 2438, "text_loss": 0.5320524573326111 @@ -23178,13 +23178,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.0595703125, "learning_rate": 0.0009126247408693071, - "loss": 0.0072, + "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 3933184.0, "repeat_count": 0.0, - "routers_loss": 0.002266801195219159, + "routers_loss": 0.0017819046042859554, "skip_count": 1.0, "step": 2440, "text_loss": 0.6051273345947266 @@ -23197,13 +23197,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.06640625, "learning_rate": 0.0009124498566872204, - "loss": 0.01, + "loss": 0.0105, "macro_f1": 0.3333333432674408, "num_tokens": 3936620.0, "repeat_count": 0.0, - "routers_loss": 0.005790423136204481, + "routers_loss": 0.005519696045666933, "skip_count": 0.0, "step": 2442, "text_loss": 0.12987950444221497 @@ -23216,13 +23216,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052734375, + "grad_norm": 0.052490234375, "learning_rate": 0.0009122748144535704, - "loss": 0.011, + "loss": 0.0111, "macro_f1": 0.32098764181137085, "num_tokens": 3940010.0, "repeat_count": 0.0, - "routers_loss": 0.04591076448559761, + "routers_loss": 0.04543351009488106, "skip_count": 2.0, "step": 2444, "text_loss": 0.4642033576965332 @@ -23235,13 +23235,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.04296875, "learning_rate": 0.0009120996142354338, - "loss": 0.0122, + "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 3943135.0, "repeat_count": 0.0, - "routers_loss": 0.004969341680407524, + "routers_loss": 0.00550565542653203, "skip_count": 0.0, "step": 2446, "text_loss": 0.5697627067565918 @@ -23254,13 +23254,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05615234375, + "grad_norm": 0.05029296875, "learning_rate": 0.0009119242560999477, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3946650.0, "repeat_count": 0.0, - "routers_loss": 0.00830315612256527, + "routers_loss": 0.008842485956847668, "skip_count": 0.0, "step": 2448, "text_loss": 0.17046524584293365 @@ -23273,13 +23273,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.078125, + "grad_norm": 0.08154296875, "learning_rate": 0.0009117487401143095, "loss": 0.0154, "macro_f1": 0.6666666865348816, "num_tokens": 3949470.0, "repeat_count": 1.0, - "routers_loss": 0.0059144929982721806, + "routers_loss": 0.005900127813220024, "skip_count": 0.0, "step": 2450, "text_loss": 0.37260866165161133 @@ -23292,13 +23292,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.030029296875, + "grad_norm": 0.035400390625, "learning_rate": 0.0009115730663457773, - "loss": 0.0132, + "loss": 0.0137, "macro_f1": 1.0, "num_tokens": 3952546.0, "repeat_count": 1.0, - "routers_loss": 0.0029762545600533485, + "routers_loss": 0.003409258322790265, "skip_count": 1.0, "step": 2452, "text_loss": 0.5308008193969727 @@ -23311,13 +23311,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.052001953125, + "grad_norm": 0.05224609375, "learning_rate": 0.0009113972348616698, - "loss": 0.0091, + "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 3955817.0, "repeat_count": 0.0, - "routers_loss": 0.011962058953940868, + "routers_loss": 0.010098597034811974, "skip_count": 1.0, "step": 2454, "text_loss": 0.39226648211479187 @@ -23330,13 +23330,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1474609375, + "grad_norm": 0.1640625, "learning_rate": 0.0009112212457293658, - "loss": 0.0101, + "loss": 0.0102, "macro_f1": 0.3272727429866791, "num_tokens": 3958911.0, "repeat_count": 0.0, - "routers_loss": 0.07289884239435196, + "routers_loss": 0.08184818178415298, "skip_count": 0.0, "step": 2456, "text_loss": 0.45411455631256104 @@ -23349,13 +23349,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.041259765625, "learning_rate": 0.0009110450990163047, - "loss": 0.0124, + "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 3962584.0, "repeat_count": 0.0, - "routers_loss": 0.0009638209594413638, + "routers_loss": 0.0009352223132736981, "skip_count": 0.0, "step": 2458, "text_loss": 0.47292324900627136 @@ -23368,13 +23368,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0400390625, + "grad_norm": 0.041748046875, "learning_rate": 0.0009108687947899863, - "loss": 0.0078, + "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 3965597.0, "repeat_count": 1.0, - "routers_loss": 0.008587516844272614, + "routers_loss": 0.008150188252329826, "skip_count": 2.0, "step": 2460, "text_loss": 0.33208340406417847 @@ -23387,13 +23387,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.04150390625, + "grad_norm": 0.043212890625, "learning_rate": 0.0009106923331179707, - "loss": 0.0126, + "loss": 0.0125, "macro_f1": 0.5492662787437439, "num_tokens": 3968664.0, "repeat_count": 0.0, - "routers_loss": 0.05080332234501839, + "routers_loss": 0.050999004393815994, "skip_count": 2.0, "step": 2462, "text_loss": 0.2459995150566101 @@ -23406,13 +23406,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07080078125, + "grad_norm": 0.0693359375, "learning_rate": 0.0009105157140678782, - "loss": 0.0124, + "loss": 0.0126, "macro_f1": 0.6666666865348816, "num_tokens": 3971772.0, "repeat_count": 0.0, - "routers_loss": 0.007348654326051474, + "routers_loss": 0.006196586415171623, "skip_count": 1.0, "step": 2464, "text_loss": 0.23956991732120514 @@ -23425,13 +23425,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06787109375, + "grad_norm": 0.062255859375, "learning_rate": 0.0009103389377073896, - "loss": 0.0099, + "loss": 0.01, "macro_f1": 0.3333333432674408, "num_tokens": 3976224.0, "repeat_count": 0.0, - "routers_loss": 0.007161752786487341, + "routers_loss": 0.008181816898286343, "skip_count": 0.0, "step": 2466, "text_loss": 0.3235875070095062 @@ -23444,13 +23444,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.057373046875, "learning_rate": 0.0009101620041042462, - "loss": 0.0119, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3978876.0, "repeat_count": 0.0, - "routers_loss": 0.0015090530505403876, + "routers_loss": 0.0015451472718268633, "skip_count": 0.0, "step": 2468, "text_loss": 0.4038759469985962 @@ -23463,13 +23463,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07275390625, + "grad_norm": 0.09130859375, "learning_rate": 0.000909984913326249, - "loss": 0.0129, + "loss": 0.0131, "macro_f1": 0.3272727429866791, "num_tokens": 3981992.0, "repeat_count": 0.0, - "routers_loss": 0.021420184522867203, + "routers_loss": 0.021785033866763115, "skip_count": 1.0, "step": 2470, "text_loss": 0.6346460580825806 @@ -23482,13 +23482,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.0712890625, "learning_rate": 0.0009098076654412595, - "loss": 0.0092, + "loss": 0.0094, "macro_f1": 0.3333333432674408, "num_tokens": 3984560.0, "repeat_count": 0.0, - "routers_loss": 0.0010742908343672752, + "routers_loss": 0.0011462471447885036, "skip_count": 0.0, "step": 2472, "text_loss": 0.3449646532535553 @@ -23501,13 +23501,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05078125, + "grad_norm": 0.049560546875, "learning_rate": 0.0009096302605171996, - "loss": 0.011, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 3987548.0, "repeat_count": 0.0, - "routers_loss": 0.0015209210105240345, + "routers_loss": 0.0014367027906700969, "skip_count": 0.0, "step": 2474, "text_loss": 0.5918350219726562 @@ -23520,13 +23520,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044921875, + "grad_norm": 0.0478515625, "learning_rate": 0.0009094526986220513, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 3990727.0, "repeat_count": 0.0, - "routers_loss": 0.0008761848439462483, + "routers_loss": 0.0008977655088528991, "skip_count": 0.0, "step": 2476, "text_loss": 0.463350385427475 @@ -23539,13 +23539,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.049072265625, "learning_rate": 0.0009092749798238563, - "loss": 0.0146, + "loss": 0.015, "macro_f1": 0.3272727429866791, "num_tokens": 3993757.0, "repeat_count": 1.0, - "routers_loss": 0.01623794063925743, + "routers_loss": 0.016712551936507225, "skip_count": 0.0, "step": 2478, "text_loss": 0.5621229410171509 @@ -23558,13 +23558,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07080078125, + "grad_norm": 0.06640625, "learning_rate": 0.000909097104190717, - "loss": 0.0174, + "loss": 0.0172, "macro_f1": 0.32098764181137085, "num_tokens": 3997259.0, "repeat_count": 0.0, - "routers_loss": 0.04170118644833565, + "routers_loss": 0.04134179651737213, "skip_count": 2.0, "step": 2480, "text_loss": 0.375476598739624 @@ -23577,32 +23577,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.046875, + "grad_norm": 0.044677734375, "learning_rate": 0.0009089190717907956, - "loss": 0.0116, + "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 4000563.0, "repeat_count": 0.0, - "routers_loss": 0.003591755870729685, + "routers_loss": 0.003462378401309252, "skip_count": 0.0, "step": 2482, "text_loss": 0.5553798675537109 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 11.66216612855885, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.0693359375, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, "learning_rate": 0.0009087408826923146, - "loss": 0.0185, - "macro_f1": 0.5492662787437439, + "loss": 0.0182, + "macro_f1": 0.6666666865348816, "num_tokens": 4004065.0, "repeat_count": 0.0, - "routers_loss": 0.009214848279953003, + "routers_loss": 0.008057428523898125, "skip_count": 2.0, "step": 2484, "text_loss": 0.4329465329647064 @@ -23615,13 +23615,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.050048828125, "learning_rate": 0.0009085625369635564, - "loss": 0.0111, + "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 4007119.0, "repeat_count": 0.0, - "routers_loss": 0.0059350160881876945, + "routers_loss": 0.005759050603955984, "skip_count": 0.0, "step": 2486, "text_loss": 0.501268744468689 @@ -23634,13 +23634,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10693359375, + "grad_norm": 0.1240234375, "learning_rate": 0.0009083840346728631, - "loss": 0.0118, + "loss": 0.0122, "macro_f1": 0.3272727429866791, "num_tokens": 4010547.0, "repeat_count": 1.0, - "routers_loss": 0.019803427159786224, + "routers_loss": 0.020763102918863297, "skip_count": 0.0, "step": 2488, "text_loss": 0.480196475982666 @@ -23653,13 +23653,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.058349609375, + "grad_norm": 0.05078125, "learning_rate": 0.0009082053758886374, - "loss": 0.0118, + "loss": 0.0117, "macro_f1": 0.6666666865348816, "num_tokens": 4014600.0, "repeat_count": 0.0, - "routers_loss": 0.006243673153221607, + "routers_loss": 0.005801836494356394, "skip_count": 1.0, "step": 2490, "text_loss": 0.18249782919883728 @@ -23672,13 +23672,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.068359375, + "grad_norm": 0.062255859375, "learning_rate": 0.0009080265606793416, - "loss": 0.0132, + "loss": 0.0128, "macro_f1": 1.0, "num_tokens": 4017964.0, "repeat_count": 1.0, - "routers_loss": 0.003960726782679558, + "routers_loss": 0.004226063843816519, "skip_count": 1.0, "step": 2492, "text_loss": 0.6573076248168945 @@ -23691,13 +23691,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0537109375, + "grad_norm": 0.049072265625, "learning_rate": 0.000907847589113498, - "loss": 0.0127, + "loss": 0.0125, "macro_f1": 0.6666666865348816, "num_tokens": 4020694.0, "repeat_count": 0.0, - "routers_loss": 0.004959117621183395, + "routers_loss": 0.004281101748347282, "skip_count": 2.0, "step": 2494, "text_loss": 0.3944586217403412 @@ -23710,13 +23710,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0634765625, + "grad_norm": 0.061279296875, "learning_rate": 0.000907668461259689, - "loss": 0.0157, + "loss": 0.0152, "macro_f1": 0.6666666865348816, "num_tokens": 4023757.0, "repeat_count": 0.0, - "routers_loss": 0.009721433743834496, + "routers_loss": 0.008786370046436787, "skip_count": 1.0, "step": 2496, "text_loss": 0.6452898979187012 @@ -23729,13 +23729,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.0693359375, "learning_rate": 0.0009074891771865566, - "loss": 0.0124, + "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 4026601.0, "repeat_count": 0.0, - "routers_loss": 0.00491701066493988, + "routers_loss": 0.005209595896303654, "skip_count": 0.0, "step": 2498, "text_loss": 0.9633619785308838 @@ -23748,13 +23748,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.03759765625, "learning_rate": 0.0009073097369628028, - "loss": 0.0131, + "loss": 0.013, "macro_f1": 1.0, "num_tokens": 4030321.0, "repeat_count": 3.0, - "routers_loss": 0.009832080453634262, + "routers_loss": 0.00860709697008133, "skip_count": 1.0, "step": 2500, "text_loss": 0.48566827178001404 @@ -23767,13 +23767,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047607421875, + "grad_norm": 0.04443359375, "learning_rate": 0.0009071301406571893, - "loss": 0.0137, + "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 4033234.0, "repeat_count": 0.0, - "routers_loss": 0.003301833290606737, + "routers_loss": 0.0035277456045150757, "skip_count": 0.0, "step": 2502, "text_loss": 0.3771554231643677 @@ -23786,13 +23786,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.044189453125, "learning_rate": 0.000906950388338538, - "loss": 0.0134, + "loss": 0.0136, "macro_f1": 0.3333333432674408, "num_tokens": 4036417.0, "repeat_count": 0.0, - "routers_loss": 0.001580960932187736, + "routers_loss": 0.0013424850767478347, "skip_count": 0.0, "step": 2504, "text_loss": 0.8962806463241577 @@ -23805,13 +23805,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0908203125, + "grad_norm": 0.09912109375, "learning_rate": 0.0009067704800757301, - "loss": 0.0091, + "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4039564.0, "repeat_count": 0.0, - "routers_loss": 0.0011505817528814077, + "routers_loss": 0.0010423909407109022, "skip_count": 0.0, "step": 2506, "text_loss": 0.43170279264450073 @@ -23824,13 +23824,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.04248046875, "learning_rate": 0.000906590415937707, - "loss": 0.0095, + "loss": 0.0094, "macro_f1": 0.3272727429866791, "num_tokens": 4043212.0, "repeat_count": 0.0, - "routers_loss": 0.023224346339702606, + "routers_loss": 0.021780289709568024, "skip_count": 1.0, "step": 2508, "text_loss": 0.41495826840400696 @@ -23843,13 +23843,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.0341796875, "learning_rate": 0.0009064101959934696, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 4046687.0, "repeat_count": 0.0, - "routers_loss": 0.007955167442560196, + "routers_loss": 0.007261929102241993, "skip_count": 1.0, "step": 2510, "text_loss": 0.21821187436580658 @@ -23862,13 +23862,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.057861328125, "learning_rate": 0.0009062298203120783, - "loss": 0.0101, + "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 4050735.0, "repeat_count": 0.0, - "routers_loss": 0.006164440419524908, + "routers_loss": 0.007447180338203907, "skip_count": 2.0, "step": 2512, "text_loss": 0.1818767935037613 @@ -23881,13 +23881,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.058837890625, + "grad_norm": 0.06494140625, "learning_rate": 0.0009060492889626535, - "loss": 0.014, + "loss": 0.0142, "macro_f1": 0.3272727429866791, "num_tokens": 4054426.0, "repeat_count": 1.0, - "routers_loss": 0.0713663101196289, + "routers_loss": 0.0718490406870842, "skip_count": 0.0, "step": 2514, "text_loss": 0.22798970341682434 @@ -23900,13 +23900,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08544921875, + "grad_norm": 0.099609375, "learning_rate": 0.0009058686020143753, - "loss": 0.0182, + "loss": 0.0183, "macro_f1": 0.3333333432674408, "num_tokens": 4057615.0, "repeat_count": 0.0, - "routers_loss": 0.0052308146841824055, + "routers_loss": 0.0052676633931696415, "skip_count": 0.0, "step": 2516, "text_loss": 0.1712338626384735 @@ -23919,13 +23919,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04052734375, + "grad_norm": 0.0380859375, "learning_rate": 0.0009056877595364832, - "loss": 0.0143, + "loss": 0.0137, "macro_f1": 0.3333333432674408, "num_tokens": 4060338.0, "repeat_count": 0.0, - "routers_loss": 0.0020465939305722713, + "routers_loss": 0.0018052728846669197, "skip_count": 0.0, "step": 2518, "text_loss": 0.6811438798904419 @@ -23938,13 +23938,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.064453125, + "grad_norm": 0.083984375, "learning_rate": 0.0009055067615982761, - "loss": 0.0114, + "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4062887.0, "repeat_count": 0.0, - "routers_loss": 0.0008663221378810704, + "routers_loss": 0.0009029926732182503, "skip_count": 0.0, "step": 2520, "text_loss": 0.5480356812477112 @@ -23957,13 +23957,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.051025390625, "learning_rate": 0.0009053256082691133, - "loss": 0.0104, + "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 4065357.0, "repeat_count": 0.0, - "routers_loss": 0.0026889131404459476, + "routers_loss": 0.0027515271212905645, "skip_count": 0.0, "step": 2522, "text_loss": 0.5234101414680481 @@ -23978,11 +23978,11 @@ "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.0009051442996184127, - "loss": 0.0181, + "loss": 0.0174, "macro_f1": 0.3333333432674408, "num_tokens": 4068111.0, "repeat_count": 0.0, - "routers_loss": 0.002255887258797884, + "routers_loss": 0.002199822571128607, "skip_count": 0.0, "step": 2524, "text_loss": 0.2418575882911682 @@ -23995,13 +23995,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.060546875, + "grad_norm": 0.0625, "learning_rate": 0.0009049628357156521, - "loss": 0.0144, + "loss": 0.0143, "macro_f1": 0.6666666865348816, "num_tokens": 4071284.0, "repeat_count": 0.0, - "routers_loss": 0.005672316066920757, + "routers_loss": 0.006303096655756235, "skip_count": 2.0, "step": 2526, "text_loss": 0.7948065996170044 @@ -24014,13 +24014,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0380859375, + "grad_norm": 0.037841796875, "learning_rate": 0.000904781216630369, - "loss": 0.007, + "loss": 0.0068, "macro_f1": 0.6601307392120361, "num_tokens": 4074750.0, "repeat_count": 1.0, - "routers_loss": 0.017167411744594574, + "routers_loss": 0.01791904680430889, "skip_count": 2.0, "step": 2528, "text_loss": 0.809726357460022 @@ -24033,13 +24033,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.053955078125, + "grad_norm": 0.0576171875, "learning_rate": 0.0009045994424321602, - "loss": 0.0101, + "loss": 0.0102, "macro_f1": 1.0, "num_tokens": 4078617.0, "repeat_count": 2.0, - "routers_loss": 0.019105618819594383, + "routers_loss": 0.016553178429603577, "skip_count": 2.0, "step": 2530, "text_loss": 0.8755000829696655 @@ -24052,13 +24052,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.060791015625, + "grad_norm": 0.061767578125, "learning_rate": 0.0009044175131906817, "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 4080936.0, "repeat_count": 0.0, - "routers_loss": 0.007993129082024097, + "routers_loss": 0.00884837657213211, "skip_count": 0.0, "step": 2532, "text_loss": 0.795871913433075 @@ -24071,13 +24071,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.05029296875, "learning_rate": 0.0009042354289756491, - "loss": 0.0124, + "loss": 0.0122, "macro_f1": 0.3333333432674408, "num_tokens": 4084459.0, "repeat_count": 0.0, - "routers_loss": 0.0024954001419246197, + "routers_loss": 0.0024387789890170097, "skip_count": 0.0, "step": 2534, "text_loss": 0.18875400722026825 @@ -24090,13 +24090,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.0625, "learning_rate": 0.0009040531898568379, - "loss": 0.0169, + "loss": 0.0171, "macro_f1": 0.3333333432674408, "num_tokens": 4088464.0, "repeat_count": 0.0, - "routers_loss": 0.004360117018222809, + "routers_loss": 0.00491489190608263, "skip_count": 0.0, "step": 2536, "text_loss": 0.334369033575058 @@ -24109,13 +24109,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0927734375, + "grad_norm": 0.091796875, "learning_rate": 0.000903870795904082, - "loss": 0.0142, + "loss": 0.0145, "macro_f1": 0.6666666865348816, "num_tokens": 4091659.0, "repeat_count": 0.0, - "routers_loss": 0.00429064966738224, + "routers_loss": 0.004592662677168846, "skip_count": 2.0, "step": 2538, "text_loss": 0.21298295259475708 @@ -24130,11 +24130,11 @@ "f1_skip": 0.6666666865348816, "grad_norm": 0.0458984375, "learning_rate": 0.000903688247187275, - "loss": 0.0136, + "loss": 0.0137, "macro_f1": 0.5492662787437439, "num_tokens": 4095496.0, "repeat_count": 0.0, - "routers_loss": 0.0132954316213727, + "routers_loss": 0.011647242121398449, "skip_count": 2.0, "step": 2540, "text_loss": 0.2985081672668457 @@ -24147,13 +24147,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.04443359375, "learning_rate": 0.0009035055437763704, - "loss": 0.0129, + "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 4098663.0, "repeat_count": 0.0, - "routers_loss": 0.002104961546137929, + "routers_loss": 0.0021238960325717926, "skip_count": 0.0, "step": 2542, "text_loss": 0.35359489917755127 @@ -24166,13 +24166,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.060791015625, + "grad_norm": 0.05859375, "learning_rate": 0.0009033226857413803, - "loss": 0.0167, + "loss": 0.0163, "macro_f1": 0.6666666865348816, "num_tokens": 4101588.0, "repeat_count": 1.0, - "routers_loss": 0.002973714144900441, + "routers_loss": 0.0024701557122170925, "skip_count": 0.0, "step": 2544, "text_loss": 1.1577601432800293 @@ -24185,13 +24185,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.080078125, "learning_rate": 0.000903139673152376, - "loss": 0.0119, + "loss": 0.012, "macro_f1": 0.3333333432674408, "num_tokens": 4104643.0, "repeat_count": 0.0, - "routers_loss": 0.002359170001000166, + "routers_loss": 0.002499542199075222, "skip_count": 0.0, "step": 2546, "text_loss": 1.0173401832580566 @@ -24204,13 +24204,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0615234375, + "grad_norm": 0.059814453125, "learning_rate": 0.0009029565060794885, - "loss": 0.0168, + "loss": 0.0165, "macro_f1": 0.3333333432674408, "num_tokens": 4109247.0, "repeat_count": 0.0, - "routers_loss": 0.0033595687709748745, + "routers_loss": 0.0034200598020106554, "skip_count": 0.0, "step": 2548, "text_loss": 0.5690504312515259 @@ -24223,13 +24223,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.07421875, + "grad_norm": 0.06884765625, "learning_rate": 0.0009027731845929079, "loss": 0.0155, "macro_f1": 0.8823530077934265, "num_tokens": 4112597.0, "repeat_count": 1.0, - "routers_loss": 0.015323673374950886, + "routers_loss": 0.015981333330273628, "skip_count": 1.0, "step": 2550, "text_loss": 0.294549822807312 @@ -24242,13 +24242,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.043212890625, + "grad_norm": 0.06103515625, "learning_rate": 0.0009025897087628829, - "loss": 0.0063, + "loss": 0.0064, "macro_f1": 0.5492662787437439, "num_tokens": 4115844.0, "repeat_count": 0.0, - "routers_loss": 0.02122018299996853, + "routers_loss": 0.02606951631605625, "skip_count": 2.0, "step": 2552, "text_loss": 0.22692419588565826 @@ -24261,13 +24261,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07763671875, + "grad_norm": 0.080078125, "learning_rate": 0.0009024060786597222, "loss": 0.0202, "macro_f1": 0.3333333432674408, "num_tokens": 4118634.0, "repeat_count": 0.0, - "routers_loss": 0.0010765352053567767, + "routers_loss": 0.001026194542646408, "skip_count": 0.0, "step": 2554, "text_loss": 0.6807059645652771 @@ -24280,13 +24280,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.04638671875, "learning_rate": 0.000902222294353793, - "loss": 0.0128, + "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 4122024.0, "repeat_count": 0.0, - "routers_loss": 0.0017301233019679785, + "routers_loss": 0.001974924933165312, "skip_count": 0.0, "step": 2556, "text_loss": 0.7373668551445007 @@ -24299,13 +24299,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.055908203125, + "grad_norm": 0.04833984375, "learning_rate": 0.0009020383559155219, - "loss": 0.0056, + "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 4124803.0, "repeat_count": 1.0, - "routers_loss": 0.004307204391807318, + "routers_loss": 0.004662613850086927, "skip_count": 2.0, "step": 2558, "text_loss": 0.21808166801929474 @@ -24318,13 +24318,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.029541015625, + "grad_norm": 0.0263671875, "learning_rate": 0.0009018542634153943, - "loss": 0.0064, + "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 4127680.0, "repeat_count": 0.0, - "routers_loss": 0.0073805381543934345, + "routers_loss": 0.006881687790155411, "skip_count": 0.0, "step": 2560, "text_loss": 0.25192978978157043 @@ -24339,11 +24339,11 @@ "f1_skip": 1.0, "grad_norm": 0.049560546875, "learning_rate": 0.0009016700169239551, - "loss": 0.0108, + "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 4130431.0, "repeat_count": 1.0, - "routers_loss": 0.005493874195963144, + "routers_loss": 0.005977808032184839, "skip_count": 1.0, "step": 2562, "text_loss": 0.4700816869735718 @@ -24356,13 +24356,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.068359375, "learning_rate": 0.0009014856165118075, - "loss": 0.0154, + "loss": 0.0153, "macro_f1": 0.6666666865348816, "num_tokens": 4133535.0, "repeat_count": 0.0, - "routers_loss": 0.006889877840876579, + "routers_loss": 0.007005698047578335, "skip_count": 1.0, "step": 2564, "text_loss": 0.6558199524879456 @@ -24375,13 +24375,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03125, + "grad_norm": 0.030517578125, "learning_rate": 0.0009013010622496144, - "loss": 0.009, + "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 4136534.0, "repeat_count": 0.0, - "routers_loss": 0.008495541289448738, + "routers_loss": 0.007262171246111393, "skip_count": 0.0, "step": 2566, "text_loss": 0.2565421462059021 @@ -24394,13 +24394,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.044921875, + "grad_norm": 0.043212890625, "learning_rate": 0.0009011163542080971, - "loss": 0.0089, + "loss": 0.0088, "macro_f1": 0.5934640765190125, "num_tokens": 4139762.0, "repeat_count": 0.0, - "routers_loss": 0.05929862707853317, + "routers_loss": 0.05431923270225525, "skip_count": 3.0, "step": 2568, "text_loss": 0.19896510243415833 @@ -24413,13 +24413,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02734375, + "grad_norm": 0.026611328125, "learning_rate": 0.0009009314924580363, - "loss": 0.0086, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 4143398.0, "repeat_count": 0.0, - "routers_loss": 0.0033934004604816437, + "routers_loss": 0.003667369019240141, "skip_count": 0.0, "step": 2570, "text_loss": 0.6581419110298157 @@ -24432,13 +24432,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.054931640625, + "grad_norm": 0.052978515625, "learning_rate": 0.0009007464770702712, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4146248.0, "repeat_count": 0.0, - "routers_loss": 0.0012826769379898906, + "routers_loss": 0.00132099783513695, "skip_count": 0.0, "step": 2572, "text_loss": 0.5316711068153381 @@ -24451,13 +24451,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.038818359375, "learning_rate": 0.0009005613081157002, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 4149455.0, "repeat_count": 0.0, - "routers_loss": 0.0019460092298686504, + "routers_loss": 0.0020061524119228125, "skip_count": 0.0, "step": 2574, "text_loss": 0.5400773882865906 @@ -24470,13 +24470,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.064453125, + "grad_norm": 0.05517578125, "learning_rate": 0.0009003759856652802, - "loss": 0.0112, + "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 4152774.0, "repeat_count": 0.0, - "routers_loss": 0.004493138287216425, + "routers_loss": 0.002621434163302183, "skip_count": 1.0, "step": 2576, "text_loss": 0.3672606945037842 @@ -24489,13 +24489,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.055908203125, + "grad_norm": 0.051513671875, "learning_rate": 0.0009001905097900273, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 4155835.0, "repeat_count": 0.0, - "routers_loss": 0.005607665050774813, + "routers_loss": 0.005290219560265541, "skip_count": 0.0, "step": 2578, "text_loss": 0.8159038424491882 @@ -24508,13 +24508,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04345703125, + "grad_norm": 0.040771484375, "learning_rate": 0.0009000048805610161, - "loss": 0.0123, + "loss": 0.0119, "macro_f1": 0.3333333432674408, "num_tokens": 4158874.0, "repeat_count": 0.0, - "routers_loss": 0.0015080278972163796, + "routers_loss": 0.0013576085912063718, "skip_count": 0.0, "step": 2580, "text_loss": 0.5518951416015625 @@ -24529,11 +24529,11 @@ "f1_skip": 0.0, "grad_norm": 0.138671875, "learning_rate": 0.00089981909804938, - "loss": 0.0142, + "loss": 0.0143, "macro_f1": 0.3333333432674408, "num_tokens": 4162076.0, "repeat_count": 0.0, - "routers_loss": 0.0022276053205132484, + "routers_loss": 0.0021483441814780235, "skip_count": 0.0, "step": 2582, "text_loss": 0.43552228808403015 @@ -24546,13 +24546,13 @@ "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, - "grad_norm": 0.07421875, + "grad_norm": 0.068359375, "learning_rate": 0.0008996331623263114, - "loss": 0.0116, + "loss": 0.0117, "macro_f1": 0.7795917987823486, "num_tokens": 4165041.0, "repeat_count": 1.0, - "routers_loss": 0.0499282106757164, + "routers_loss": 0.0544300302863121, "skip_count": 4.0, "step": 2584, "text_loss": 0.24812501668930054 @@ -24565,13 +24565,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.047607421875, "learning_rate": 0.0008994470734630611, - "loss": 0.01, + "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 4168290.0, "repeat_count": 0.0, - "routers_loss": 0.0016360745066776872, + "routers_loss": 0.0017150711501017213, "skip_count": 0.0, "step": 2586, "text_loss": 0.6392097473144531 @@ -24584,32 +24584,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05224609375, + "grad_norm": 0.0615234375, "learning_rate": 0.0008992608315309388, - "loss": 0.0149, + "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 4171310.0, "repeat_count": 0.0, - "routers_loss": 0.0037772543728351593, + "routers_loss": 0.0046473173424601555, "skip_count": 2.0, "step": 2588, "text_loss": 0.6534156799316406 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 12.15967126504256, - "f1_execute": 0.9615384340286255, + "f1_execute": 0.943396270275116, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.060791015625, + "f1_skip": 0.0, + "grad_norm": 0.06591796875, "learning_rate": 0.0008990744366013125, - "loss": 0.0104, - "macro_f1": 0.6538461446762085, + "loss": 0.0105, + "macro_f1": 0.3144654333591461, "num_tokens": 4174042.0, "repeat_count": 2.0, - "routers_loss": 0.05992122367024422, + "routers_loss": 0.060913100838661194, "skip_count": 1.0, "step": 2590, "text_loss": 0.5365690588951111 @@ -24622,13 +24622,13 @@ "f1_execute": 0.9583333134651184, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.05859375, + "grad_norm": 0.055419921875, "learning_rate": 0.0008988878887456093, "loss": 0.0118, "macro_f1": 0.6051587462425232, "num_tokens": 4177666.0, "repeat_count": 1.0, - "routers_loss": 0.0679154023528099, + "routers_loss": 0.06268956512212753, "skip_count": 4.0, "step": 2592, "text_loss": 0.226226806640625 @@ -24643,11 +24643,11 @@ "f1_skip": 0.0, "grad_norm": 0.03662109375, "learning_rate": 0.0008987011880353149, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.32098764181137085, "num_tokens": 4180490.0, "repeat_count": 0.0, - "routers_loss": 0.03284052759408951, + "routers_loss": 0.030141465365886688, "skip_count": 2.0, "step": 2594, "text_loss": 0.2581401765346527 @@ -24660,13 +24660,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.051513671875, + "grad_norm": 0.044677734375, "learning_rate": 0.0008985143345419729, - "loss": 0.0087, + "loss": 0.0082, "macro_f1": 0.5492662787437439, "num_tokens": 4183300.0, "repeat_count": 0.0, - "routers_loss": 0.01971421390771866, + "routers_loss": 0.018745863810181618, "skip_count": 2.0, "step": 2596, "text_loss": 0.7778542637825012 @@ -24679,13 +24679,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0703125, + "grad_norm": 0.064453125, "learning_rate": 0.0008983273283371862, - "loss": 0.0099, + "loss": 0.0096, "macro_f1": 0.5492662787437439, "num_tokens": 4186535.0, "repeat_count": 0.0, - "routers_loss": 0.028065117076039314, + "routers_loss": 0.026792079210281372, "skip_count": 2.0, "step": 2598, "text_loss": 0.34700271487236023 @@ -24698,13 +24698,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.048828125, "learning_rate": 0.0008981401694926159, - "loss": 0.0077, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4189082.0, "repeat_count": 0.0, - "routers_loss": 0.00166845612693578, + "routers_loss": 0.001914160675369203, "skip_count": 0.0, "step": 2600, "text_loss": 0.6879339218139648 @@ -24717,13 +24717,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.06396484375, "learning_rate": 0.0008979528580799815, - "loss": 0.0138, + "loss": 0.0136, "macro_f1": 0.6666666865348816, "num_tokens": 4192330.0, "repeat_count": 0.0, - "routers_loss": 0.007527270819991827, + "routers_loss": 0.007978348061442375, "skip_count": 2.0, "step": 2602, "text_loss": 0.3524550497531891 @@ -24736,13 +24736,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.044189453125, "learning_rate": 0.0008977653941710613, - "loss": 0.0137, + "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 4196117.0, "repeat_count": 2.0, - "routers_loss": 0.00412185862660408, + "routers_loss": 0.0035376469604671, "skip_count": 0.0, "step": 2604, "text_loss": 0.42356348037719727 @@ -24755,13 +24755,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06005859375, + "grad_norm": 0.05810546875, "learning_rate": 0.0008975777778376916, - "loss": 0.0157, + "loss": 0.0156, "macro_f1": 0.6666666865348816, "num_tokens": 4200423.0, "repeat_count": 0.0, - "routers_loss": 0.007787751499563456, + "routers_loss": 0.008262477815151215, "skip_count": 1.0, "step": 2606, "text_loss": 0.5272893905639648 @@ -24774,13 +24774,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.0732421875, "learning_rate": 0.0008973900091517675, "loss": 0.0114, "macro_f1": 0.3272727429866791, "num_tokens": 4203257.0, "repeat_count": 0.0, - "routers_loss": 0.024111779406666756, + "routers_loss": 0.022957922890782356, "skip_count": 1.0, "step": 2608, "text_loss": 0.2713734805583954 @@ -24793,13 +24793,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.045166015625, + "grad_norm": 0.043701171875, "learning_rate": 0.000897202088185242, - "loss": 0.0091, + "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 4206243.0, "repeat_count": 0.0, - "routers_loss": 0.0057326615788042545, + "routers_loss": 0.006623407825827599, "skip_count": 2.0, "step": 2610, "text_loss": 0.5920525789260864 @@ -24812,13 +24812,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04443359375, + "grad_norm": 0.0517578125, "learning_rate": 0.0008970140150101274, - "loss": 0.0118, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 4209264.0, "repeat_count": 0.0, - "routers_loss": 0.0008877563523128629, + "routers_loss": 0.0008602747693657875, "skip_count": 0.0, "step": 2612, "text_loss": 0.33421996235847473 @@ -24831,13 +24831,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.030517578125, "learning_rate": 0.0008968257896984932, - "loss": 0.0067, + "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 4212058.0, "repeat_count": 0.0, - "routers_loss": 0.0039034869987517595, + "routers_loss": 0.0024653903674334288, "skip_count": 1.0, "step": 2614, "text_loss": 0.37923356890678406 @@ -24850,13 +24850,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.06298828125, "learning_rate": 0.0008966374123224677, - "loss": 0.0085, + "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4214929.0, "repeat_count": 0.0, - "routers_loss": 0.01140254084020853, + "routers_loss": 0.010878405533730984, "skip_count": 0.0, "step": 2616, "text_loss": 0.4350503981113434 @@ -24869,13 +24869,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03125, + "grad_norm": 0.0303955078125, "learning_rate": 0.0008964488829542376, "loss": 0.0083, "macro_f1": 0.3272727429866791, "num_tokens": 4219170.0, "repeat_count": 0.0, - "routers_loss": 0.028559349477291107, + "routers_loss": 0.02864212542772293, "skip_count": 1.0, "step": 2618, "text_loss": 0.26250728964805603 @@ -24888,13 +24888,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.061279296875, + "grad_norm": 0.062255859375, "learning_rate": 0.0008962602016660478, - "loss": 0.0097, + "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 4222077.0, "repeat_count": 0.0, - "routers_loss": 0.010525460354983807, + "routers_loss": 0.010444172658026218, "skip_count": 2.0, "step": 2620, "text_loss": 0.4718937575817108 @@ -24907,13 +24907,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.048583984375, + "grad_norm": 0.0478515625, "learning_rate": 0.0008960713685302011, - "loss": 0.0104, + "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 4225383.0, "repeat_count": 0.0, - "routers_loss": 0.005284689832478762, + "routers_loss": 0.006409442983567715, "skip_count": 1.0, "step": 2622, "text_loss": 0.30420538783073425 @@ -24926,13 +24926,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0284423828125, + "grad_norm": 0.02978515625, "learning_rate": 0.0008958823836190588, - "loss": 0.0051, + "loss": 0.005, "macro_f1": 0.3272727429866791, "num_tokens": 4228349.0, "repeat_count": 0.0, - "routers_loss": 0.011040215380489826, + "routers_loss": 0.009996986016631126, "skip_count": 1.0, "step": 2624, "text_loss": 0.5392362475395203 @@ -24945,13 +24945,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.031494140625, "learning_rate": 0.0008956932470050404, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 4232007.0, "repeat_count": 0.0, - "routers_loss": 0.0014406041009351611, + "routers_loss": 0.0014383369125425816, "skip_count": 0.0, "step": 2626, "text_loss": 0.7112401127815247 @@ -24964,13 +24964,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.072265625, + "grad_norm": 0.058349609375, "learning_rate": 0.0008955039587606233, - "loss": 0.0111, + "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 4235122.0, "repeat_count": 0.0, - "routers_loss": 0.007106760982424021, + "routers_loss": 0.00781513936817646, "skip_count": 3.0, "step": 2628, "text_loss": 0.17802883684635162 @@ -24983,13 +24983,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0400390625, + "grad_norm": 0.0439453125, "learning_rate": 0.0008953145189583429, - "loss": 0.0125, + "loss": 0.0126, "macro_f1": 0.542222261428833, "num_tokens": 4238248.0, "repeat_count": 0.0, - "routers_loss": 0.06423533707857132, + "routers_loss": 0.062252625823020935, "skip_count": 4.0, "step": 2630, "text_loss": 0.5551572442054749 @@ -25002,13 +25002,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.062255859375, "learning_rate": 0.0008951249276707933, - "loss": 0.012, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 4241042.0, "repeat_count": 0.0, - "routers_loss": 0.0010294591775164008, + "routers_loss": 0.0011421777307987213, "skip_count": 0.0, "step": 2632, "text_loss": 0.7092233896255493 @@ -25021,13 +25021,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.07177734375, "learning_rate": 0.0008949351849706261, - "loss": 0.0122, + "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 4243939.0, "repeat_count": 0.0, - "routers_loss": 0.0032732547260820866, + "routers_loss": 0.0032689040526747704, "skip_count": 0.0, "step": 2634, "text_loss": 0.19925718009471893 @@ -25040,13 +25040,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0390625, + "grad_norm": 0.033935546875, "learning_rate": 0.0008947452909305509, - "loss": 0.0112, + "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 4247535.0, "repeat_count": 1.0, - "routers_loss": 0.0021109411027282476, + "routers_loss": 0.002066014800220728, "skip_count": 0.0, "step": 2636, "text_loss": 0.5249715447425842 @@ -25059,13 +25059,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.11279296875, + "grad_norm": 0.09326171875, "learning_rate": 0.0008945552456233356, "loss": 0.0169, "macro_f1": 0.8820862174034119, "num_tokens": 4251441.0, "repeat_count": 2.0, - "routers_loss": 0.029545020312070847, + "routers_loss": 0.029332537204027176, "skip_count": 2.0, "step": 2638, "text_loss": 0.19229578971862793 @@ -25078,13 +25078,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.078125, "learning_rate": 0.0008943650491218058, - "loss": 0.0083, + "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4254314.0, "repeat_count": 0.0, - "routers_loss": 0.0075805820524692535, + "routers_loss": 0.0075911120511591434, "skip_count": 0.0, "step": 2640, "text_loss": 0.27059751749038696 @@ -25097,13 +25097,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.044189453125, "learning_rate": 0.0008941747014988453, - "loss": 0.0155, + "loss": 0.0156, "macro_f1": 0.3333333432674408, "num_tokens": 4257442.0, "repeat_count": 0.0, - "routers_loss": 0.008832095190882683, + "routers_loss": 0.009030844084918499, "skip_count": 0.0, "step": 2642, "text_loss": 0.36747801303863525 @@ -25116,13 +25116,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.080078125, + "grad_norm": 0.123046875, "learning_rate": 0.0008939842028273956, - "loss": 0.011, + "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 4260386.0, "repeat_count": 0.0, - "routers_loss": 0.008952614851295948, + "routers_loss": 0.007844001986086369, "skip_count": 1.0, "step": 2644, "text_loss": 0.6397647857666016 @@ -25135,13 +25135,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0250244140625, + "grad_norm": 0.0283203125, "learning_rate": 0.0008937935531804562, - "loss": 0.0075, + "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 4263516.0, "repeat_count": 0.0, - "routers_loss": 0.0017659157747402787, + "routers_loss": 0.0018789108144119382, "skip_count": 0.0, "step": 2646, "text_loss": 0.4795534908771515 @@ -25154,13 +25154,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05419921875, + "grad_norm": 0.06494140625, "learning_rate": 0.0008936027526310844, - "loss": 0.0101, + "loss": 0.0098, "macro_f1": 0.3272727429866791, "num_tokens": 4266744.0, "repeat_count": 0.0, - "routers_loss": 0.03944230079650879, + "routers_loss": 0.0348590686917305, "skip_count": 1.0, "step": 2648, "text_loss": 0.27691999077796936 @@ -25173,13 +25173,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07861328125, + "grad_norm": 0.07275390625, "learning_rate": 0.000893411801252395, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 4269766.0, "repeat_count": 0.0, - "routers_loss": 0.0037144431844353676, + "routers_loss": 0.004543309565633535, "skip_count": 1.0, "step": 2650, "text_loss": 0.18867231905460358 @@ -25192,13 +25192,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.041748046875, "learning_rate": 0.0008932206991175615, - "loss": 0.0143, + "loss": 0.0141, "macro_f1": 0.6666666865348816, "num_tokens": 4273513.0, "repeat_count": 0.0, - "routers_loss": 0.003659905167296529, + "routers_loss": 0.0035277456045150757, "skip_count": 1.0, "step": 2652, "text_loss": 0.45613357424736023 @@ -25211,13 +25211,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.056640625, + "grad_norm": 0.055908203125, "learning_rate": 0.0008930294462998143, "loss": 0.015, "macro_f1": 0.6666666865348816, "num_tokens": 4276878.0, "repeat_count": 1.0, - "routers_loss": 0.011676746420562267, + "routers_loss": 0.011337592266499996, "skip_count": 0.0, "step": 2654, "text_loss": 0.24733254313468933 @@ -25230,13 +25230,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.0869140625, "learning_rate": 0.0008928380428724419, - "loss": 0.0061, + "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4279915.0, "repeat_count": 0.0, - "routers_loss": 0.000998969655483961, + "routers_loss": 0.0010295971296727657, "skip_count": 1.0, "step": 2656, "text_loss": 0.41722849011421204 @@ -25249,13 +25249,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.053955078125, "learning_rate": 0.0008926464889087903, - "loss": 0.0109, + "loss": 0.0116, "macro_f1": 0.6666666865348816, "num_tokens": 4282888.0, "repeat_count": 0.0, - "routers_loss": 0.0016260759439319372, + "routers_loss": 0.0017198545392602682, "skip_count": 2.0, "step": 2658, "text_loss": 0.738322377204895 @@ -25268,13 +25268,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.068359375, "learning_rate": 0.0008924547844822634, - "loss": 0.0101, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4285805.0, "repeat_count": 0.0, - "routers_loss": 0.0010900370543822646, + "routers_loss": 0.001339946174994111, "skip_count": 0.0, "step": 2660, "text_loss": 0.4802379906177521 @@ -25287,13 +25287,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.05322265625, "learning_rate": 0.000892262929666323, - "loss": 0.0101, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4290282.0, "repeat_count": 0.0, - "routers_loss": 0.002275131642818451, + "routers_loss": 0.0022340165451169014, "skip_count": 0.0, "step": 2662, "text_loss": 0.6503544449806213 @@ -25306,13 +25306,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0419921875, + "grad_norm": 0.03662109375, "learning_rate": 0.0008920709245344878, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 4294106.0, "repeat_count": 0.0, - "routers_loss": 0.00575100164860487, + "routers_loss": 0.005288850050419569, "skip_count": 1.0, "step": 2664, "text_loss": 0.12312037497758865 @@ -25325,13 +25325,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.038330078125, + "grad_norm": 0.041259765625, "learning_rate": 0.0008918787691603347, - "loss": 0.0122, + "loss": 0.0121, "macro_f1": 0.6666666865348816, "num_tokens": 4298013.0, "repeat_count": 0.0, - "routers_loss": 0.004139711149036884, + "routers_loss": 0.004259659443050623, "skip_count": 1.0, "step": 2666, "text_loss": 0.3070000112056732 @@ -25344,13 +25344,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0439453125, + "grad_norm": 0.04052734375, "learning_rate": 0.000891686463617498, - "loss": 0.0072, + "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 4300799.0, "repeat_count": 0.0, - "routers_loss": 0.008856390602886677, + "routers_loss": 0.009489355608820915, "skip_count": 1.0, "step": 2668, "text_loss": 0.18535588681697845 @@ -25363,13 +25363,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0576171875, + "grad_norm": 0.055908203125, "learning_rate": 0.0008914940079796696, - "loss": 0.0116, + "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 4304641.0, "repeat_count": 0.0, - "routers_loss": 0.002438562922179699, + "routers_loss": 0.0025417013093829155, "skip_count": 0.0, "step": 2670, "text_loss": 0.482585072517395 @@ -25382,13 +25382,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.05615234375, "learning_rate": 0.0008913014023205988, "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 4307462.0, "repeat_count": 0.0, - "routers_loss": 0.006435772404074669, + "routers_loss": 0.006371749565005302, "skip_count": 0.0, "step": 2672, "text_loss": 0.7064456939697266 @@ -25401,13 +25401,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.039306640625, "learning_rate": 0.0008911086467140925, - "loss": 0.0069, + "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 4310396.0, "repeat_count": 0.0, - "routers_loss": 0.002773779444396496, + "routers_loss": 0.0027512952219694853, "skip_count": 0.0, "step": 2674, "text_loss": 0.23532851040363312 @@ -25420,13 +25420,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.056640625, + "grad_norm": 0.05712890625, "learning_rate": 0.000890915741234015, - "loss": 0.0135, + "loss": 0.0133, "macro_f1": 0.6666666865348816, "num_tokens": 4314781.0, "repeat_count": 0.0, - "routers_loss": 0.00862761028110981, + "routers_loss": 0.008253013715147972, "skip_count": 1.0, "step": 2676, "text_loss": 0.30950358510017395 @@ -25439,13 +25439,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033203125, + "grad_norm": 0.03173828125, "learning_rate": 0.0008907226859542879, - "loss": 0.0104, + "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 4317988.0, "repeat_count": 0.0, - "routers_loss": 0.005587176885455847, + "routers_loss": 0.005409995559602976, "skip_count": 2.0, "step": 2678, "text_loss": 0.4930732846260071 @@ -25458,13 +25458,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.042236328125, + "grad_norm": 0.060546875, "learning_rate": 0.0008905294809488907, - "loss": 0.0082, + "loss": 0.0084, "macro_f1": 1.0, "num_tokens": 4321014.0, "repeat_count": 1.0, - "routers_loss": 0.0033104203175753355, + "routers_loss": 0.0029942214023321867, "skip_count": 1.0, "step": 2680, "text_loss": 0.6224040389060974 @@ -25477,13 +25477,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08203125, + "grad_norm": 0.06982421875, "learning_rate": 0.0008903361262918595, - "loss": 0.0117, + "loss": 0.0115, "macro_f1": 0.6666666865348816, "num_tokens": 4324268.0, "repeat_count": 0.0, - "routers_loss": 0.008205405436456203, + "routers_loss": 0.008411120623350143, "skip_count": 1.0, "step": 2682, "text_loss": 0.16296671330928802 @@ -25496,13 +25496,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.052734375, + "grad_norm": 0.05126953125, "learning_rate": 0.0008901426220572884, - "loss": 0.0142, + "loss": 0.0138, "macro_f1": 1.0, "num_tokens": 4327494.0, "repeat_count": 2.0, - "routers_loss": 0.007884894497692585, + "routers_loss": 0.01039006095379591, "skip_count": 4.0, "step": 2684, "text_loss": 0.43866512179374695 @@ -25515,13 +25515,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.060791015625, "learning_rate": 0.0008899489683193286, - "loss": 0.011, + "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 4330936.0, "repeat_count": 0.0, - "routers_loss": 0.0009336905204690993, + "routers_loss": 0.0009329111780971289, "skip_count": 0.0, "step": 2686, "text_loss": 0.44250962138175964 @@ -25534,13 +25534,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0810546875, + "grad_norm": 0.07421875, "learning_rate": 0.0008897551651521885, "loss": 0.0111, "macro_f1": 0.3333333432674408, "num_tokens": 4334123.0, "repeat_count": 0.0, - "routers_loss": 0.0033622782211750746, + "routers_loss": 0.003197216661646962, "skip_count": 0.0, "step": 2688, "text_loss": 0.48313501477241516 @@ -25553,13 +25553,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07763671875, + "grad_norm": 0.09716796875, "learning_rate": 0.0008895612126301339, "loss": 0.0157, "macro_f1": 0.3333333432674408, "num_tokens": 4337610.0, "repeat_count": 0.0, - "routers_loss": 0.0034563415683805943, + "routers_loss": 0.0033548236824572086, "skip_count": 0.0, "step": 2690, "text_loss": 0.4715327322483063 @@ -25572,13 +25572,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.051513671875, "learning_rate": 0.0008893671108274877, - "loss": 0.0115, + "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 4341026.0, "repeat_count": 0.0, - "routers_loss": 0.0022277699317783117, + "routers_loss": 0.0024757643695920706, "skip_count": 0.0, "step": 2692, "text_loss": 0.43402785062789917 @@ -25591,13 +25591,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.043212890625, "learning_rate": 0.0008891728598186302, - "loss": 0.011, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 4344422.0, "repeat_count": 0.0, - "routers_loss": 0.003892304375767708, + "routers_loss": 0.003317243419587612, "skip_count": 0.0, "step": 2694, "text_loss": 0.8498559594154358 @@ -25610,13 +25610,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.0380859375, + "grad_norm": 0.0400390625, "learning_rate": 0.0008889784596779986, - "loss": 0.0092, + "loss": 0.009, "macro_f1": 0.5934640765190125, "num_tokens": 4347507.0, "repeat_count": 0.0, - "routers_loss": 0.015058296732604504, + "routers_loss": 0.01577926240861416, "skip_count": 3.0, "step": 2696, "text_loss": 0.5646669864654541 @@ -25629,13 +25629,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.10546875, + "grad_norm": 0.11328125, "learning_rate": 0.0008887839104800876, - "loss": 0.0118, + "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 4350414.0, "repeat_count": 0.0, - "routers_loss": 0.0033561652526259422, + "routers_loss": 0.002953822258859873, "skip_count": 0.0, "step": 2698, "text_loss": 0.5145012140274048 @@ -25648,13 +25648,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.05029296875, "learning_rate": 0.0008885892122994486, - "loss": 0.0116, + "loss": 0.0112, "macro_f1": 0.3333333432674408, "num_tokens": 4354110.0, "repeat_count": 0.0, - "routers_loss": 0.0062471418641507626, + "routers_loss": 0.005849295295774937, "skip_count": 0.0, "step": 2700, "text_loss": 0.580982506275177 @@ -25667,13 +25667,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.0419921875, "learning_rate": 0.0008883943652106903, "loss": 0.0086, "macro_f1": 1.0, "num_tokens": 4357323.0, "repeat_count": 1.0, - "routers_loss": 0.011802209541201591, + "routers_loss": 0.012347398325800896, "skip_count": 2.0, "step": 2702, "text_loss": 0.2234988808631897 @@ -25686,13 +25686,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06982421875, + "grad_norm": 0.0673828125, "learning_rate": 0.0008881993692884787, - "loss": 0.0132, + "loss": 0.0128, "macro_f1": 0.6666666865348816, "num_tokens": 4360228.0, "repeat_count": 0.0, - "routers_loss": 0.0041528744623064995, + "routers_loss": 0.003574999049305916, "skip_count": 1.0, "step": 2704, "text_loss": 0.4261806607246399 @@ -25705,13 +25705,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0439453125, + "grad_norm": 0.048828125, "learning_rate": 0.0008880042246075365, - "loss": 0.0094, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4363905.0, "repeat_count": 0.0, - "routers_loss": 0.003151095937937498, + "routers_loss": 0.0031574300955981016, "skip_count": 0.0, "step": 2706, "text_loss": 0.691118061542511 @@ -25724,13 +25724,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.0419921875, "learning_rate": 0.0008878089312426433, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 4366736.0, "repeat_count": 0.0, - "routers_loss": 0.003142676781862974, + "routers_loss": 0.003195564029738307, "skip_count": 0.0, "step": 2708, "text_loss": 0.613926112651825 @@ -25743,13 +25743,13 @@ "f1_execute": 0.9583333134651184, "f1_repeat": 0.0, "f1_skip": 0.75, - "grad_norm": 0.05859375, + "grad_norm": 0.054443359375, "learning_rate": 0.0008876134892686363, "loss": 0.011, "macro_f1": 0.5694444179534912, "num_tokens": 4370146.0, "repeat_count": 0.0, - "routers_loss": 0.032964516431093216, + "routers_loss": 0.038784291595220566, "skip_count": 5.0, "step": 2710, "text_loss": 0.2723451852798462 @@ -25762,13 +25762,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.080078125, + "grad_norm": 0.0830078125, "learning_rate": 0.000887417898760409, - "loss": 0.0123, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 4373653.0, "repeat_count": 0.0, - "routers_loss": 0.0006848900229670107, + "routers_loss": 0.0006457131239585578, "skip_count": 0.0, "step": 2712, "text_loss": 0.31667640805244446 @@ -25781,13 +25781,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07861328125, + "grad_norm": 0.10498046875, "learning_rate": 0.000887222159792912, - "loss": 0.0156, + "loss": 0.0155, "macro_f1": 0.6603773832321167, "num_tokens": 4376993.0, "repeat_count": 1.0, - "routers_loss": 0.04388813674449921, + "routers_loss": 0.045078590512275696, "skip_count": 1.0, "step": 2714, "text_loss": 0.5872798562049866 @@ -25800,13 +25800,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.034912109375, "learning_rate": 0.0008870262724411528, - "loss": 0.0122, + "loss": 0.012, "macro_f1": 0.3333333432674408, "num_tokens": 4380160.0, "repeat_count": 0.0, - "routers_loss": 0.003538437420502305, + "routers_loss": 0.003628545207902789, "skip_count": 0.0, "step": 2716, "text_loss": 0.7468157410621643 @@ -25819,13 +25819,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.1328125, + "grad_norm": 0.11181640625, "learning_rate": 0.0008868302367801962, - "loss": 0.0123, + "loss": 0.0118, "macro_f1": 0.6598639488220215, "num_tokens": 4383100.0, "repeat_count": 1.0, - "routers_loss": 0.05479869619011879, + "routers_loss": 0.05404464527964592, "skip_count": 3.0, "step": 2718, "text_loss": 0.2970244884490967 @@ -25838,13 +25838,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.0400390625, "learning_rate": 0.0008866340528851629, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4386700.0, "repeat_count": 0.0, - "routers_loss": 0.0070296903140842915, + "routers_loss": 0.007000274024903774, "skip_count": 0.0, "step": 2720, "text_loss": 0.34521186351776123 @@ -25857,13 +25857,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05810546875, + "grad_norm": 0.052978515625, "learning_rate": 0.0008864377208312313, - "loss": 0.0085, + "loss": 0.0082, "macro_f1": 0.8823530077934265, "num_tokens": 4390299.0, "repeat_count": 1.0, - "routers_loss": 0.02051853947341442, + "routers_loss": 0.02025366574525833, "skip_count": 2.0, "step": 2722, "text_loss": 1.0536936521530151 @@ -25876,13 +25876,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.038818359375, + "grad_norm": 0.04638671875, "learning_rate": 0.000886241240693636, - "loss": 0.0096, + "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 4393353.0, "repeat_count": 0.0, - "routers_loss": 0.002662461483851075, + "routers_loss": 0.00251673418097198, "skip_count": 0.0, "step": 2724, "text_loss": 0.5678093433380127 @@ -25895,13 +25895,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.05615234375, + "grad_norm": 0.052001953125, "learning_rate": 0.0008860446125476686, "loss": 0.0135, "macro_f1": 0.6666666865348816, "num_tokens": 4396446.0, "repeat_count": 1.0, - "routers_loss": 0.009321866557002068, + "routers_loss": 0.009532532654702663, "skip_count": 0.0, "step": 2726, "text_loss": 0.23775041103363037 @@ -25914,13 +25914,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.109375, + "grad_norm": 0.091796875, "learning_rate": 0.0008858478364686776, - "loss": 0.0102, + "loss": 0.0099, "macro_f1": 0.6666666865348816, "num_tokens": 4399977.0, "repeat_count": 1.0, - "routers_loss": 0.01029124017804861, + "routers_loss": 0.008062181062996387, "skip_count": 0.0, "step": 2728, "text_loss": 0.18888695538043976 @@ -25933,13 +25933,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037353515625, + "grad_norm": 0.035888671875, "learning_rate": 0.0008856509125320678, - "loss": 0.0082, + "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 4404406.0, "repeat_count": 0.0, - "routers_loss": 0.0008023424888961017, + "routers_loss": 0.0007731119985692203, "skip_count": 0.0, "step": 2730, "text_loss": 0.47331541776657104 @@ -25952,13 +25952,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0517578125, + "grad_norm": 0.0498046875, "learning_rate": 0.0008854538408133006, - "loss": 0.0115, + "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 4407165.0, "repeat_count": 0.0, - "routers_loss": 0.003058656118810177, + "routers_loss": 0.003115242812782526, "skip_count": 1.0, "step": 2732, "text_loss": 0.491370290517807 @@ -25971,13 +25971,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039794921875, + "grad_norm": 0.041015625, "learning_rate": 0.0008852566213878947, - "loss": 0.0082, + "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4410101.0, "repeat_count": 0.0, - "routers_loss": 0.0010282890871167183, + "routers_loss": 0.0008958528051152825, "skip_count": 0.0, "step": 2734, "text_loss": 0.42188262939453125 @@ -25990,13 +25990,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.07421875, + "grad_norm": 0.07763671875, "learning_rate": 0.0008850592543314246, - "loss": 0.0123, + "loss": 0.0118, "macro_f1": 1.0, "num_tokens": 4413015.0, "repeat_count": 1.0, - "routers_loss": 0.014785367995500565, + "routers_loss": 0.01139112375676632, "skip_count": 1.0, "step": 2736, "text_loss": 0.4716498553752899 @@ -26009,13 +26009,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0654296875, + "grad_norm": 0.0576171875, "learning_rate": 0.0008848617397195218, - "loss": 0.0089, + "loss": 0.0084, "macro_f1": 0.6603773832321167, "num_tokens": 4416404.0, "repeat_count": 1.0, - "routers_loss": 0.017717093229293823, + "routers_loss": 0.01609630137681961, "skip_count": 1.0, "step": 2738, "text_loss": 0.19490821659564972 @@ -26028,13 +26028,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.041015625, "learning_rate": 0.0008846640776278745, - "loss": 0.0067, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 4419408.0, "repeat_count": 0.0, - "routers_loss": 0.0011861984385177493, + "routers_loss": 0.001489170710556209, "skip_count": 0.0, "step": 2740, "text_loss": 0.6443108320236206 @@ -26047,13 +26047,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.0693359375, "learning_rate": 0.0008844662681322269, "loss": 0.0144, "macro_f1": 0.6666666865348816, "num_tokens": 4422067.0, "repeat_count": 1.0, - "routers_loss": 0.0013843412743881345, + "routers_loss": 0.0014755792217329144, "skip_count": 0.0, "step": 2742, "text_loss": 0.9150356650352478 @@ -26068,11 +26068,11 @@ "f1_skip": 1.0, "grad_norm": 0.05078125, "learning_rate": 0.0008842683113083801, - "loss": 0.0154, + "loss": 0.0149, "macro_f1": 0.6666666865348816, "num_tokens": 4425647.0, "repeat_count": 0.0, - "routers_loss": 0.010318896733224392, + "routers_loss": 0.008962674997746944, "skip_count": 1.0, "step": 2744, "text_loss": 0.7103227972984314 @@ -26085,13 +26085,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07861328125, + "grad_norm": 0.0751953125, "learning_rate": 0.0008840702072321915, - "loss": 0.0108, + "loss": 0.0104, "macro_f1": 0.6598639488220215, "num_tokens": 4428855.0, "repeat_count": 1.0, - "routers_loss": 0.029359478503465652, + "routers_loss": 0.02554207295179367, "skip_count": 3.0, "step": 2746, "text_loss": 0.27141591906547546 @@ -26104,13 +26104,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0234375, + "grad_norm": 0.0230712890625, "learning_rate": 0.0008838719559795751, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 4432838.0, "repeat_count": 0.0, - "routers_loss": 0.0014995118835940957, + "routers_loss": 0.0011747616808861494, "skip_count": 0.0, "step": 2748, "text_loss": 0.4007738530635834 @@ -26123,13 +26123,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.03515625, + "grad_norm": 0.03466796875, "learning_rate": 0.0008836735576265009, - "loss": 0.0074, + "loss": 0.0073, "macro_f1": 0.5492662787437439, "num_tokens": 4435793.0, "repeat_count": 0.0, - "routers_loss": 0.017950648441910744, + "routers_loss": 0.017564335837960243, "skip_count": 2.0, "step": 2750, "text_loss": 0.5972410440444946 @@ -26142,13 +26142,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.044921875, "learning_rate": 0.0008834750122489956, - "loss": 0.0083, + "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 4438871.0, "repeat_count": 1.0, - "routers_loss": 0.0069067892618477345, + "routers_loss": 0.007004009559750557, "skip_count": 0.0, "step": 2752, "text_loss": 0.2294853925704956 @@ -26161,13 +26161,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.051513671875, + "grad_norm": 0.06640625, "learning_rate": 0.0008832763199231423, - "loss": 0.0101, + "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 4441846.0, "repeat_count": 0.0, - "routers_loss": 0.0013944554375484586, + "routers_loss": 0.0014562139986082911, "skip_count": 0.0, "step": 2754, "text_loss": 0.722432017326355 @@ -26180,13 +26180,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.0751953125, "learning_rate": 0.0008830774807250802, "loss": 0.013, "macro_f1": 0.3272727429866791, "num_tokens": 4444786.0, "repeat_count": 1.0, - "routers_loss": 0.025158623233437538, + "routers_loss": 0.024773593991994858, "skip_count": 0.0, "step": 2756, "text_loss": 0.507905125617981 @@ -26199,13 +26199,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.05419921875, + "grad_norm": 0.049072265625, "learning_rate": 0.0008828784947310049, - "loss": 0.0131, + "loss": 0.0129, "macro_f1": 0.8823530077934265, "num_tokens": 4448442.0, "repeat_count": 1.0, - "routers_loss": 0.05205477401614189, + "routers_loss": 0.04959975928068161, "skip_count": 2.0, "step": 2758, "text_loss": 0.3617522418498993 @@ -26218,13 +26218,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.10791015625, + "grad_norm": 0.1025390625, "learning_rate": 0.000882679362017168, "loss": 0.0149, "macro_f1": 1.0, "num_tokens": 4451401.0, "repeat_count": 1.0, - "routers_loss": 0.005898742936551571, + "routers_loss": 0.005783245898783207, "skip_count": 2.0, "step": 2760, "text_loss": 0.49187400937080383 @@ -26237,13 +26237,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.0791015625, "learning_rate": 0.0008824800826598778, - "loss": 0.0129, + "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 4454537.0, "repeat_count": 0.0, - "routers_loss": 0.006758298724889755, + "routers_loss": 0.00656260596588254, "skip_count": 0.0, "step": 2762, "text_loss": 0.6823583245277405 @@ -26256,13 +26256,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.0546875, "learning_rate": 0.0008822806567354983, - "loss": 0.0109, + "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 4457706.0, "repeat_count": 1.0, - "routers_loss": 0.005730919074267149, + "routers_loss": 0.005298966076225042, "skip_count": 0.0, "step": 2764, "text_loss": 0.554322361946106 @@ -26275,13 +26275,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.051025390625, + "grad_norm": 0.046630859375, "learning_rate": 0.0008820810843204501, - "loss": 0.0098, + "loss": 0.0096, "macro_f1": 0.3272727429866791, "num_tokens": 4460710.0, "repeat_count": 0.0, - "routers_loss": 0.03390989825129509, + "routers_loss": 0.03164982795715332, "skip_count": 1.0, "step": 2766, "text_loss": 0.1656961441040039 @@ -26294,13 +26294,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0849609375, + "grad_norm": 0.072265625, "learning_rate": 0.0008818813654912095, - "loss": 0.0165, + "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 4464001.0, "repeat_count": 0.0, - "routers_loss": 0.0007058497285470366, + "routers_loss": 0.000715116853825748, "skip_count": 0.0, "step": 2768, "text_loss": 0.5818144083023071 @@ -26313,13 +26313,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.058837890625, + "grad_norm": 0.056396484375, "learning_rate": 0.0008816815003243093, - "loss": 0.0136, + "loss": 0.0133, "macro_f1": 0.3333333432674408, "num_tokens": 4467364.0, "repeat_count": 0.0, - "routers_loss": 0.0027468691114336252, + "routers_loss": 0.002851625671610236, "skip_count": 0.0, "step": 2770, "text_loss": 0.6068631410598755 @@ -26332,13 +26332,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.033203125, "learning_rate": 0.0008814814888963383, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 4470681.0, "repeat_count": 0.0, - "routers_loss": 0.00443003186956048, + "routers_loss": 0.004729873035103083, "skip_count": 1.0, "step": 2772, "text_loss": 0.5386646389961243 @@ -26351,13 +26351,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0439453125, + "grad_norm": 0.04296875, "learning_rate": 0.000881281331283941, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 4473734.0, "repeat_count": 0.0, - "routers_loss": 0.0031219064258038998, + "routers_loss": 0.0031853127293288708, "skip_count": 1.0, "step": 2774, "text_loss": 0.5695263147354126 @@ -26370,13 +26370,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03369140625, + "grad_norm": 0.033447265625, "learning_rate": 0.0008810810275638182, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 4478404.0, "repeat_count": 0.0, - "routers_loss": 0.000846695271320641, + "routers_loss": 0.0008977465913631022, "skip_count": 0.0, "step": 2776, "text_loss": 0.4750773310661316 @@ -26389,13 +26389,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.0654296875, "learning_rate": 0.0008808805778127269, - "loss": 0.0075, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4481287.0, "repeat_count": 0.0, - "routers_loss": 0.0074167875573039055, + "routers_loss": 0.00469845999032259, "skip_count": 0.0, "step": 2778, "text_loss": 0.14078612625598907 @@ -26408,13 +26408,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.04296875, + "grad_norm": 0.049560546875, "learning_rate": 0.0008806799821074796, - "loss": 0.0078, + "loss": 0.0079, "macro_f1": 0.5492662787437439, "num_tokens": 4483929.0, "repeat_count": 0.0, - "routers_loss": 0.018358726054430008, + "routers_loss": 0.01789761893451214, "skip_count": 2.0, "step": 2780, "text_loss": 0.2167191207408905 @@ -26427,13 +26427,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.056396484375, "learning_rate": 0.0008804792405249451, - "loss": 0.0124, + "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 4487468.0, "repeat_count": 0.0, - "routers_loss": 0.001094152103178203, + "routers_loss": 0.001018838956952095, "skip_count": 0.0, "step": 2782, "text_loss": 0.5424665212631226 @@ -26446,13 +26446,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.0498046875, + "grad_norm": 0.07373046875, "learning_rate": 0.000880278353142048, - "loss": 0.0075, + "loss": 0.0077, "macro_f1": 0.8200000524520874, "num_tokens": 4490942.0, "repeat_count": 1.0, - "routers_loss": 0.03035641834139824, + "routers_loss": 0.03260354697704315, "skip_count": 3.0, "step": 2784, "text_loss": 0.20994654297828674 @@ -26465,13 +26465,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05517578125, + "grad_norm": 0.05322265625, "learning_rate": 0.0008800773200357683, - "loss": 0.0123, + "loss": 0.0122, "macro_f1": 0.3333333432674408, "num_tokens": 4493986.0, "repeat_count": 0.0, - "routers_loss": 0.002394269686192274, + "routers_loss": 0.003019835101440549, "skip_count": 0.0, "step": 2786, "text_loss": 0.5709528923034668 @@ -26484,13 +26484,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.034423828125, "learning_rate": 0.0008798761412831429, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 4498232.0, "repeat_count": 0.0, - "routers_loss": 0.0028274122159928083, + "routers_loss": 0.00285192858427763, "skip_count": 0.0, "step": 2788, "text_loss": 0.5103896260261536 @@ -26503,13 +26503,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0439453125, + "grad_norm": 0.044921875, "learning_rate": 0.0008796748169612634, - "loss": 0.0088, + "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 4501231.0, "repeat_count": 0.0, - "routers_loss": 0.0012642849469557405, + "routers_loss": 0.0012469831854104996, "skip_count": 0.0, "step": 2790, "text_loss": 0.43669697642326355 @@ -26522,13 +26522,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03662109375, + "grad_norm": 0.039794921875, "learning_rate": 0.0008794733471472778, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4504208.0, "repeat_count": 0.0, - "routers_loss": 0.010966303758323193, + "routers_loss": 0.011512776836752892, "skip_count": 1.0, "step": 2792, "text_loss": 0.2299770563840866 @@ -26541,13 +26541,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.03564453125, "learning_rate": 0.0008792717319183899, - "loss": 0.0064, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 4507013.0, "repeat_count": 0.0, - "routers_loss": 0.008194026537239552, + "routers_loss": 0.00834917277097702, "skip_count": 0.0, "step": 2794, "text_loss": 0.2130603939294815 @@ -26560,13 +26560,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0283203125, + "grad_norm": 0.03076171875, "learning_rate": 0.0008790699713518587, - "loss": 0.008, + "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 4510286.0, "repeat_count": 0.0, - "routers_loss": 0.008828429505228996, + "routers_loss": 0.008616939187049866, "skip_count": 2.0, "step": 2796, "text_loss": 0.4377101957798004 @@ -26579,13 +26579,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0274658203125, + "grad_norm": 0.02783203125, "learning_rate": 0.0008788680655249994, - "loss": 0.007, + "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4513762.0, "repeat_count": 0.0, - "routers_loss": 0.0038230866193771362, + "routers_loss": 0.003408568911254406, "skip_count": 0.0, "step": 2798, "text_loss": 0.435138463973999 @@ -26598,13 +26598,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0311279296875, + "grad_norm": 0.03369140625, "learning_rate": 0.0008786660145151826, - "loss": 0.009, + "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 4516696.0, "repeat_count": 1.0, - "routers_loss": 0.0031088131945580244, + "routers_loss": 0.0029398901388049126, "skip_count": 0.0, "step": 2800, "text_loss": 0.3195655047893524 @@ -26617,13 +26617,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.033203125, "learning_rate": 0.0008784638183998348, - "loss": 0.0083, + "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4519760.0, "repeat_count": 0.0, - "routers_loss": 0.0014194221002981067, + "routers_loss": 0.0013777425047010183, "skip_count": 0.0, "step": 2802, "text_loss": 0.8129430413246155 @@ -26636,13 +26636,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.032470703125, "learning_rate": 0.0008782614772564379, - "loss": 0.0099, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4522106.0, "repeat_count": 0.0, - "routers_loss": 0.0031931858975440264, + "routers_loss": 0.0031694830395281315, "skip_count": 0.0, "step": 2804, "text_loss": 0.18083660304546356 @@ -26655,13 +26655,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.064453125, "learning_rate": 0.0008780589911625293, - "loss": 0.0117, + "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 4525743.0, "repeat_count": 0.0, - "routers_loss": 0.0021834284998476505, + "routers_loss": 0.002161208540201187, "skip_count": 0.0, "step": 2806, "text_loss": 0.8228182792663574 @@ -26674,13 +26674,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0703125, + "grad_norm": 0.07177734375, "learning_rate": 0.0008778563601957021, - "loss": 0.0098, + "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 4529573.0, "repeat_count": 0.0, - "routers_loss": 0.0035390176344662905, + "routers_loss": 0.0028444856870919466, "skip_count": 1.0, "step": 2808, "text_loss": 0.3715563118457794 @@ -26693,13 +26693,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04296875, + "grad_norm": 0.044677734375, "learning_rate": 0.0008776535844336049, - "loss": 0.0095, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4532452.0, "repeat_count": 0.0, - "routers_loss": 0.0038604713045060635, + "routers_loss": 0.003807213855907321, "skip_count": 0.0, "step": 2810, "text_loss": 0.6012523174285889 @@ -26712,13 +26712,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.0361328125, "learning_rate": 0.0008774506639539417, - "loss": 0.0072, + "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 4536077.0, "repeat_count": 0.0, - "routers_loss": 0.00669970503076911, + "routers_loss": 0.006698979996144772, "skip_count": 0.0, "step": 2812, "text_loss": 0.27097949385643005 @@ -26731,13 +26731,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.099609375, "learning_rate": 0.0008772475988344722, - "loss": 0.0132, + "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 4539057.0, "repeat_count": 0.0, - "routers_loss": 0.004594485275447369, + "routers_loss": 0.004849409218877554, "skip_count": 1.0, "step": 2814, "text_loss": 1.026973843574524 @@ -26750,13 +26750,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.04638671875, + "grad_norm": 0.041748046875, "learning_rate": 0.0008770443891530109, - "loss": 0.0116, + "loss": 0.0115, "macro_f1": 0.5934640765190125, "num_tokens": 4542253.0, "repeat_count": 0.0, - "routers_loss": 0.01891930215060711, + "routers_loss": 0.019148651510477066, "skip_count": 3.0, "step": 2816, "text_loss": 0.2717585563659668 @@ -26769,13 +26769,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.054931640625, + "grad_norm": 0.052490234375, "learning_rate": 0.0008768410349874286, "loss": 0.0098, "macro_f1": 0.6601307392120361, "num_tokens": 4545047.0, "repeat_count": 1.0, - "routers_loss": 0.0247862096875906, + "routers_loss": 0.02231316640973091, "skip_count": 2.0, "step": 2818, "text_loss": 0.274346262216568 @@ -26788,13 +26788,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.043212890625, "learning_rate": 0.0008766375364156508, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 4548371.0, "repeat_count": 0.0, - "routers_loss": 0.008566800504922867, + "routers_loss": 0.008014129474759102, "skip_count": 2.0, "step": 2820, "text_loss": 0.22850871086120605 @@ -26807,13 +26807,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041748046875, + "grad_norm": 0.044189453125, "learning_rate": 0.0008764338935156586, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4551276.0, "repeat_count": 0.0, - "routers_loss": 0.0013546474510803819, + "routers_loss": 0.0014544493751600385, "skip_count": 0.0, "step": 2822, "text_loss": 0.6308462023735046 @@ -26826,13 +26826,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.0390625, "learning_rate": 0.000876230106365488, - "loss": 0.0122, + "loss": 0.0123, "macro_f1": 0.6666666865348816, "num_tokens": 4554143.0, "repeat_count": 0.0, - "routers_loss": 0.009204468689858913, + "routers_loss": 0.00818584579974413, "skip_count": 3.0, "step": 2824, "text_loss": 0.3484207093715668 @@ -26845,13 +26845,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03271484375, + "grad_norm": 0.0264892578125, "learning_rate": 0.0008760261750432312, - "loss": 0.0067, + "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 4557256.0, "repeat_count": 0.0, - "routers_loss": 0.00787584763020277, + "routers_loss": 0.006275608204305172, "skip_count": 3.0, "step": 2826, "text_loss": 0.1927330046892166 @@ -26864,13 +26864,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.0380859375, "learning_rate": 0.0008758220996270348, - "loss": 0.0102, + "loss": 0.0103, "macro_f1": 1.0, "num_tokens": 4560202.0, "repeat_count": 2.0, - "routers_loss": 0.0057869357988238335, + "routers_loss": 0.0055974251590669155, "skip_count": 2.0, "step": 2828, "text_loss": 0.7796496748924255 @@ -26883,13 +26883,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044921875, + "grad_norm": 0.046142578125, "learning_rate": 0.0008756178801951007, - "loss": 0.0128, + "loss": 0.0129, "macro_f1": 0.3333333432674408, "num_tokens": 4563508.0, "repeat_count": 0.0, - "routers_loss": 0.0018274546600878239, + "routers_loss": 0.0019799957517534494, "skip_count": 0.0, "step": 2830, "text_loss": 0.49633297324180603 @@ -26902,13 +26902,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.0458984375, "learning_rate": 0.0008754135168256865, - "loss": 0.0094, + "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4566776.0, "repeat_count": 0.0, - "routers_loss": 0.004527154844254255, + "routers_loss": 0.004538947716355324, "skip_count": 0.0, "step": 2832, "text_loss": 0.5346745252609253 @@ -26921,13 +26921,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.03857421875, "learning_rate": 0.0008752090095971044, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 4569787.0, "repeat_count": 0.0, - "routers_loss": 0.0018263199599459767, + "routers_loss": 0.001663343166001141, "skip_count": 0.0, "step": 2834, "text_loss": 0.5524004697799683 @@ -26940,13 +26940,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.07373046875, "learning_rate": 0.000875004358587722, - "loss": 0.0088, + "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 4572813.0, "repeat_count": 0.0, - "routers_loss": 0.0022649941965937614, + "routers_loss": 0.0022988212294876575, "skip_count": 0.0, "step": 2836, "text_loss": 0.4232870042324066 @@ -26959,13 +26959,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.038330078125, "learning_rate": 0.000874799563875962, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 4575563.0, "repeat_count": 0.0, - "routers_loss": 0.00791149027645588, + "routers_loss": 0.007781553082168102, "skip_count": 1.0, "step": 2838, "text_loss": 0.19239822030067444 @@ -26978,13 +26978,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0341796875, + "grad_norm": 0.03515625, "learning_rate": 0.0008745946255403021, "loss": 0.0072, "macro_f1": 0.5492662787437439, "num_tokens": 4578117.0, "repeat_count": 0.0, - "routers_loss": 0.016813624650239944, + "routers_loss": 0.01872488670051098, "skip_count": 2.0, "step": 2840, "text_loss": 0.2148810178041458 @@ -26999,11 +26999,11 @@ "f1_skip": 1.0, "grad_norm": 0.04296875, "learning_rate": 0.0008743895436592749, - "loss": 0.0079, + "loss": 0.0078, "macro_f1": 1.0, "num_tokens": 4582330.0, "repeat_count": 1.0, - "routers_loss": 0.004429332446306944, + "routers_loss": 0.005634195636957884, "skip_count": 1.0, "step": 2842, "text_loss": 0.4929640591144562 @@ -27016,13 +27016,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.048583984375, "learning_rate": 0.0008741843183114685, - "loss": 0.0084, + "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4585765.0, "repeat_count": 0.0, - "routers_loss": 0.0007147722644731402, + "routers_loss": 0.0008928569150157273, "skip_count": 0.0, "step": 2844, "text_loss": 0.32702967524528503 @@ -27035,13 +27035,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.044189453125, + "grad_norm": 0.0439453125, "learning_rate": 0.0008739789495755253, - "loss": 0.0092, + "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 4589000.0, "repeat_count": 0.0, - "routers_loss": 0.015438012778759003, + "routers_loss": 0.014715569093823433, "skip_count": 4.0, "step": 2846, "text_loss": 0.25125816464424133 @@ -27054,13 +27054,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.049560546875, "learning_rate": 0.0008737734375301433, - "loss": 0.0138, + "loss": 0.0135, "macro_f1": 0.3333333432674408, "num_tokens": 4592391.0, "repeat_count": 0.0, - "routers_loss": 0.0015892626252025366, + "routers_loss": 0.0017551190685480833, "skip_count": 0.0, "step": 2848, "text_loss": 0.6595172882080078 @@ -27073,13 +27073,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02734375, + "grad_norm": 0.027099609375, "learning_rate": 0.0008735677822540749, - "loss": 0.0086, + "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 4596662.0, "repeat_count": 0.0, - "routers_loss": 0.0006934175617061555, + "routers_loss": 0.0006456313421949744, "skip_count": 0.0, "step": 2850, "text_loss": 0.6290773153305054 @@ -27092,13 +27092,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.036865234375, "learning_rate": 0.0008733619838261276, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 4599682.0, "repeat_count": 0.0, - "routers_loss": 0.006811433006078005, + "routers_loss": 0.00765060493722558, "skip_count": 2.0, "step": 2852, "text_loss": 0.3268161416053772 @@ -27111,13 +27111,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.044921875, + "grad_norm": 0.041015625, "learning_rate": 0.0008731560423251637, - "loss": 0.0104, + "loss": 0.01, "macro_f1": 1.0, "num_tokens": 4603324.0, "repeat_count": 1.0, - "routers_loss": 0.012574959546327591, + "routers_loss": 0.01161442045122385, "skip_count": 2.0, "step": 2854, "text_loss": 0.3029932975769043 @@ -27130,13 +27130,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.888888955116272, - "grad_norm": 0.038818359375, + "grad_norm": 0.0419921875, "learning_rate": 0.0008729499578301005, "loss": 0.0098, "macro_f1": 0.9555556178092957, "num_tokens": 4606975.0, "repeat_count": 1.0, - "routers_loss": 0.01913273334503174, + "routers_loss": 0.02055389992892742, "skip_count": 5.0, "step": 2856, "text_loss": 0.6268532872200012 @@ -27149,13 +27149,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.05078125, "learning_rate": 0.00087274373041991, - "loss": 0.0082, + "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 4609629.0, "repeat_count": 0.0, - "routers_loss": 0.0012737065553665161, + "routers_loss": 0.0013911726418882608, "skip_count": 0.0, "step": 2858, "text_loss": 0.534355640411377 @@ -27168,13 +27168,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.053955078125, "learning_rate": 0.0008725373601736188, - "loss": 0.0079, + "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 4612913.0, "repeat_count": 2.0, - "routers_loss": 0.009088932536542416, + "routers_loss": 0.01010701060295105, "skip_count": 0.0, "step": 2860, "text_loss": 0.3391380310058594 @@ -27187,13 +27187,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0260009765625, + "grad_norm": 0.0255126953125, "learning_rate": 0.0008723308471703085, - "loss": 0.0078, + "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 4616718.0, "repeat_count": 0.0, - "routers_loss": 0.006364458240568638, + "routers_loss": 0.005969462916254997, "skip_count": 1.0, "step": 2862, "text_loss": 0.47250816226005554 @@ -27206,13 +27206,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047607421875, + "grad_norm": 0.046630859375, "learning_rate": 0.0008721241914891152, - "loss": 0.0084, + "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 4619680.0, "repeat_count": 0.0, - "routers_loss": 0.002686808817088604, + "routers_loss": 0.0027780034579336643, "skip_count": 0.0, "step": 2864, "text_loss": 0.3249278664588928 @@ -27225,13 +27225,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.0439453125, "learning_rate": 0.0008719173932092295, - "loss": 0.0047, + "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 4622700.0, "repeat_count": 0.0, - "routers_loss": 0.0018892486114054918, + "routers_loss": 0.0015912104863673449, "skip_count": 0.0, "step": 2866, "text_loss": 0.7789985537528992 @@ -27244,13 +27244,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.049072265625, + "grad_norm": 0.05126953125, "learning_rate": 0.0008717104524098973, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 4626637.0, "repeat_count": 0.0, - "routers_loss": 0.0035258810967206955, + "routers_loss": 0.0036539011634886265, "skip_count": 0.0, "step": 2868, "text_loss": 0.619088351726532 @@ -27263,13 +27263,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.103515625, + "grad_norm": 0.10400390625, "learning_rate": 0.0008715033691704187, - "loss": 0.0121, + "loss": 0.0118, "macro_f1": 0.6666666865348816, "num_tokens": 4629863.0, "repeat_count": 0.0, - "routers_loss": 0.007305602077394724, + "routers_loss": 0.008402476087212563, "skip_count": 1.0, "step": 2870, "text_loss": 0.5550018548965454 @@ -27282,13 +27282,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.06298828125, "learning_rate": 0.0008712961435701479, - "loss": 0.0162, + "loss": 0.0161, "macro_f1": 0.6666666865348816, "num_tokens": 4632657.0, "repeat_count": 0.0, - "routers_loss": 0.012898211367428303, + "routers_loss": 0.01400839351117611, "skip_count": 1.0, "step": 2872, "text_loss": 0.17368625104427338 @@ -27301,13 +27301,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.0419921875, "learning_rate": 0.0008710887756884947, - "loss": 0.0088, + "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 4635885.0, "repeat_count": 0.0, - "routers_loss": 0.0013437134912237525, + "routers_loss": 0.0014573842054232955, "skip_count": 0.0, "step": 2874, "text_loss": 0.5138643383979797 @@ -27320,13 +27320,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.033447265625, "learning_rate": 0.0008708812656049225, - "loss": 0.0091, + "loss": 0.009, "macro_f1": 0.6666666865348816, "num_tokens": 4639341.0, "repeat_count": 0.0, - "routers_loss": 0.002090727211907506, + "routers_loss": 0.002810224425047636, "skip_count": 1.0, "step": 2876, "text_loss": 0.70310378074646 @@ -27341,11 +27341,11 @@ "f1_skip": 0.8571428656578064, "grad_norm": 0.03564453125, "learning_rate": 0.0008706736133989497, - "loss": 0.0107, + "loss": 0.0105, "macro_f1": 0.9449735879898071, "num_tokens": 4642163.0, "repeat_count": 2.0, - "routers_loss": 0.030176319181919098, + "routers_loss": 0.029783209785819054, "skip_count": 4.0, "step": 2878, "text_loss": 0.26898008584976196 @@ -27358,13 +27358,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.04150390625, "learning_rate": 0.0008704658191501491, - "loss": 0.0091, + "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 4645858.0, "repeat_count": 0.0, - "routers_loss": 0.0009633690933696926, + "routers_loss": 0.0009193966398015618, "skip_count": 0.0, "step": 2880, "text_loss": 0.6047570705413818 @@ -27377,13 +27377,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.060302734375, + "grad_norm": 0.05908203125, "learning_rate": 0.0008702578829381475, "loss": 0.0131, "macro_f1": 0.8814815282821655, "num_tokens": 4649237.0, "repeat_count": 2.0, - "routers_loss": 0.0568491593003273, + "routers_loss": 0.05698608607053757, "skip_count": 4.0, "step": 2882, "text_loss": 0.10695219784975052 @@ -27396,13 +27396,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0306396484375, + "grad_norm": 0.0311279296875, "learning_rate": 0.0008700498048426269, - "loss": 0.0082, + "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 4652362.0, "repeat_count": 0.0, - "routers_loss": 0.0012279651127755642, + "routers_loss": 0.0011786938412114978, "skip_count": 0.0, "step": 2884, "text_loss": 0.4442957937717438 @@ -27415,13 +27415,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.046142578125, "learning_rate": 0.0008698415849433229, - "loss": 0.0097, + "loss": 0.0092, "macro_f1": 0.5492662787437439, "num_tokens": 4655616.0, "repeat_count": 2.0, - "routers_loss": 0.02166076935827732, + "routers_loss": 0.02142646163702011, "skip_count": 0.0, "step": 2886, "text_loss": 0.5820964574813843 @@ -27434,13 +27434,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.043212890625, "learning_rate": 0.0008696332233200262, - "loss": 0.012, + "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 4659294.0, "repeat_count": 0.0, - "routers_loss": 0.003944257274270058, + "routers_loss": 0.004038636106997728, "skip_count": 0.0, "step": 2888, "text_loss": 0.11847645789384842 @@ -27453,13 +27453,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.0478515625, "learning_rate": 0.0008694247200525806, - "loss": 0.0092, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 4662512.0, "repeat_count": 0.0, - "routers_loss": 0.0013393335975706577, + "routers_loss": 0.0013256469974294305, "skip_count": 0.0, "step": 2890, "text_loss": 0.4873582720756531 @@ -27472,13 +27472,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.039306640625, "learning_rate": 0.0008692160752208856, - "loss": 0.0128, + "loss": 0.0129, "macro_f1": 0.3272727429866791, "num_tokens": 4666190.0, "repeat_count": 0.0, - "routers_loss": 0.0443510003387928, + "routers_loss": 0.04477972164750099, "skip_count": 1.0, "step": 2892, "text_loss": 0.44243401288986206 @@ -27491,13 +27491,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.083984375, + "grad_norm": 0.09521484375, "learning_rate": 0.0008690072889048941, - "loss": 0.0125, + "loss": 0.0127, "macro_f1": 1.0, "num_tokens": 4668884.0, "repeat_count": 1.0, - "routers_loss": 0.0047337980940938, + "routers_loss": 0.004407547414302826, "skip_count": 2.0, "step": 2894, "text_loss": 0.6847127079963684 @@ -27510,13 +27510,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.041015625, + "grad_norm": 0.04052734375, "learning_rate": 0.0008687983611846133, - "loss": 0.0082, + "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 4672093.0, "repeat_count": 0.0, - "routers_loss": 0.0055244253017008305, + "routers_loss": 0.005245382897555828, "skip_count": 1.0, "step": 2896, "text_loss": 0.25583332777023315 @@ -27529,13 +27529,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.0458984375, "learning_rate": 0.0008685892921401049, - "loss": 0.011, + "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 4674917.0, "repeat_count": 0.0, - "routers_loss": 0.001250729663297534, + "routers_loss": 0.0010470855049788952, "skip_count": 0.0, "step": 2898, "text_loss": 0.41998377442359924 @@ -27548,13 +27548,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.037841796875, "learning_rate": 0.0008683800818514844, - "loss": 0.0061, + "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4677739.0, "repeat_count": 0.0, - "routers_loss": 0.00974183902144432, + "routers_loss": 0.009026622399687767, "skip_count": 2.0, "step": 2900, "text_loss": 0.303053081035614 @@ -27567,13 +27567,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.078125, + "grad_norm": 0.09619140625, "learning_rate": 0.0008681707303989215, - "loss": 0.0111, + "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 4680721.0, "repeat_count": 0.0, - "routers_loss": 0.004882345907390118, + "routers_loss": 0.004500916693359613, "skip_count": 0.0, "step": 2902, "text_loss": 0.5573288798332214 @@ -27586,13 +27586,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.06982421875, "learning_rate": 0.0008679612378626404, "loss": 0.0098, "macro_f1": 0.6666666865348816, "num_tokens": 4683339.0, "repeat_count": 0.0, - "routers_loss": 0.00568242697045207, + "routers_loss": 0.005047840531915426, "skip_count": 1.0, "step": 2904, "text_loss": 0.321353554725647 @@ -27605,13 +27605,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0306396484375, + "grad_norm": 0.03271484375, "learning_rate": 0.0008677516043229187, - "loss": 0.0082, + "loss": 0.0083, "macro_f1": 0.3272727429866791, "num_tokens": 4686453.0, "repeat_count": 0.0, - "routers_loss": 0.010831202380359173, + "routers_loss": 0.010256914421916008, "skip_count": 1.0, "step": 2906, "text_loss": 0.4300784468650818 @@ -27624,13 +27624,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.05615234375, + "grad_norm": 0.05029296875, "learning_rate": 0.0008675418298600883, - "loss": 0.0087, + "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 4689645.0, "repeat_count": 1.0, - "routers_loss": 0.00235295994207263, + "routers_loss": 0.0022669637110084295, "skip_count": 0.0, "step": 2908, "text_loss": 0.5064885020256042 @@ -27643,13 +27643,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.048828125, "learning_rate": 0.0008673319145545358, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4692320.0, "repeat_count": 0.0, - "routers_loss": 0.0011642680037766695, + "routers_loss": 0.0011188550852239132, "skip_count": 0.0, "step": 2910, "text_loss": 0.7114819884300232 @@ -27662,13 +27662,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.03369140625, "learning_rate": 0.0008671218584867003, - "loss": 0.0104, + "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 4695116.0, "repeat_count": 0.0, - "routers_loss": 0.00278888875618577, + "routers_loss": 0.002966561820358038, "skip_count": 2.0, "step": 2912, "text_loss": 0.5662392973899841 @@ -27681,13 +27681,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.049560546875, + "grad_norm": 0.047607421875, "learning_rate": 0.0008669116617370762, - "loss": 0.008, + "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4698040.0, "repeat_count": 0.0, - "routers_loss": 0.0014630162622779608, + "routers_loss": 0.0012894890969619155, "skip_count": 0.0, "step": 2914, "text_loss": 0.718977689743042 @@ -27700,13 +27700,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0986328125, + "grad_norm": 0.1552734375, "learning_rate": 0.0008667013243862111, - "loss": 0.0159, + "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 4700963.0, "repeat_count": 0.0, - "routers_loss": 0.0011393720051273704, + "routers_loss": 0.0007232456118799746, "skip_count": 0.0, "step": 2916, "text_loss": 0.3447718024253845 @@ -27719,13 +27719,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02978515625, + "grad_norm": 0.0289306640625, "learning_rate": 0.000866490846514707, - "loss": 0.0072, + "loss": 0.0075, "macro_f1": 0.3272727429866791, "num_tokens": 4704471.0, "repeat_count": 1.0, - "routers_loss": 0.014218449592590332, + "routers_loss": 0.015166680328547955, "skip_count": 0.0, "step": 2918, "text_loss": 0.454946368932724 @@ -27738,13 +27738,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.052978515625, + "grad_norm": 0.04736328125, "learning_rate": 0.000866280228203219, "loss": 0.0073, "macro_f1": 1.0, "num_tokens": 4707238.0, "repeat_count": 1.0, - "routers_loss": 0.005367610137909651, + "routers_loss": 0.0061312485486269, "skip_count": 1.0, "step": 2920, "text_loss": 0.721788227558136 @@ -27757,13 +27757,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048828125, + "grad_norm": 0.055908203125, "learning_rate": 0.0008660694695324564, - "loss": 0.0124, + "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 4711323.0, "repeat_count": 0.0, - "routers_loss": 0.0020303199999034405, + "routers_loss": 0.00169933564029634, "skip_count": 0.0, "step": 2922, "text_loss": 0.7562121748924255 @@ -27776,13 +27776,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06201171875, + "grad_norm": 0.0654296875, "learning_rate": 0.0008658585705831829, - "loss": 0.0123, + "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 4714417.0, "repeat_count": 0.0, - "routers_loss": 0.0022230520844459534, + "routers_loss": 0.0022731393110007048, "skip_count": 0.0, "step": 2924, "text_loss": 0.5726147890090942 @@ -27795,13 +27795,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.06787109375, + "grad_norm": 0.068359375, "learning_rate": 0.0008656475314362148, - "loss": 0.0133, + "loss": 0.0131, "macro_f1": 0.8817967176437378, "num_tokens": 4717445.0, "repeat_count": 2.0, - "routers_loss": 0.06414645165205002, + "routers_loss": 0.06477782875299454, "skip_count": 3.0, "step": 2926, "text_loss": 0.4505867660045624 @@ -27814,13 +27814,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.0625, + "grad_norm": 0.06396484375, "learning_rate": 0.0008654363521724229, - "loss": 0.0128, + "loss": 0.0129, "macro_f1": 0.9449735879898071, "num_tokens": 4722253.0, "repeat_count": 2.0, - "routers_loss": 0.022727061063051224, + "routers_loss": 0.027405790984630585, "skip_count": 4.0, "step": 2928, "text_loss": 0.24767601490020752 @@ -27833,13 +27833,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.0537109375, "learning_rate": 0.0008652250328727315, - "loss": 0.0114, + "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 4725465.0, "repeat_count": 0.0, - "routers_loss": 0.006181784905493259, + "routers_loss": 0.006544729229062796, "skip_count": 2.0, "step": 2930, "text_loss": 0.4478724002838135 @@ -27852,13 +27852,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.044921875, + "grad_norm": 0.0517578125, "learning_rate": 0.0008650135736181184, - "loss": 0.0133, + "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 4729213.0, "repeat_count": 1.0, - "routers_loss": 0.005527070257812738, + "routers_loss": 0.0055119614116847515, "skip_count": 0.0, "step": 2932, "text_loss": 0.6749323010444641 @@ -27871,13 +27871,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05517578125, + "grad_norm": 0.045166015625, "learning_rate": 0.0008648019744896154, - "loss": 0.0102, + "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 4732280.0, "repeat_count": 0.0, - "routers_loss": 0.008868738077580929, + "routers_loss": 0.008374541997909546, "skip_count": 0.0, "step": 2934, "text_loss": 0.4647359251976013 @@ -27890,13 +27890,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.057373046875, + "grad_norm": 0.06201171875, "learning_rate": 0.0008645902355683077, - "loss": 0.0089, + "loss": 0.0091, "macro_f1": 0.6595745086669922, "num_tokens": 4736244.0, "repeat_count": 1.0, - "routers_loss": 0.07285884022712708, + "routers_loss": 0.068686343729496, "skip_count": 4.0, "step": 2936, "text_loss": 0.5356017351150513 @@ -27909,13 +27909,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.043212890625, + "grad_norm": 0.042236328125, "learning_rate": 0.0008643783569353339, - "loss": 0.0072, + "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 4739810.0, "repeat_count": 2.0, - "routers_loss": 0.019306030124425888, + "routers_loss": 0.017954571172595024, "skip_count": 0.0, "step": 2938, "text_loss": 0.3145926296710968 @@ -27928,13 +27928,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.054443359375, "learning_rate": 0.0008641663386718863, - "loss": 0.0084, + "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4742720.0, "repeat_count": 0.0, - "routers_loss": 0.00626454409211874, + "routers_loss": 0.006261351052671671, "skip_count": 1.0, "step": 2940, "text_loss": 0.3200613856315613 @@ -27949,11 +27949,11 @@ "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0008639541808592109, - "loss": 0.0091, + "loss": 0.0093, "macro_f1": 1.0, "num_tokens": 4745870.0, "repeat_count": 1.0, - "routers_loss": 0.0019172134343534708, + "routers_loss": 0.0025341357104480267, "skip_count": 1.0, "step": 2942, "text_loss": 0.5020416378974915 @@ -27968,11 +27968,11 @@ "f1_skip": 1.0, "grad_norm": 0.025634765625, "learning_rate": 0.0008637418835786067, - "loss": 0.0095, + "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 4748943.0, "repeat_count": 0.0, - "routers_loss": 0.009745351038873196, + "routers_loss": 0.008970048278570175, "skip_count": 2.0, "step": 2944, "text_loss": 0.14517110586166382 @@ -27985,13 +27985,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.043701171875, + "grad_norm": 0.055908203125, "learning_rate": 0.0008635294469114265, - "loss": 0.011, + "loss": 0.0112, "macro_f1": 0.3333333432674408, "num_tokens": 4751360.0, "repeat_count": 0.0, - "routers_loss": 0.0020624736789613962, + "routers_loss": 0.002133632078766823, "skip_count": 0.0, "step": 2946, "text_loss": 0.5367856025695801 @@ -28004,13 +28004,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.091796875, + "grad_norm": 0.08837890625, "learning_rate": 0.0008633168709390766, - "loss": 0.0118, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 4754403.0, "repeat_count": 0.0, - "routers_loss": 0.001082106726244092, + "routers_loss": 0.0011866620043292642, "skip_count": 0.0, "step": 2948, "text_loss": 0.38302522897720337 @@ -28023,13 +28023,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.037109375, "learning_rate": 0.0008631041557430163, - "loss": 0.0061, + "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 4757867.0, "repeat_count": 2.0, - "routers_loss": 0.0026527612935751677, + "routers_loss": 0.0026854004245251417, "skip_count": 0.0, "step": 2950, "text_loss": 0.43433454632759094 @@ -28042,13 +28042,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.05859375, "learning_rate": 0.0008628913014047585, "loss": 0.0102, "macro_f1": 0.3333333432674408, "num_tokens": 4761171.0, "repeat_count": 0.0, - "routers_loss": 0.0027245471719652414, + "routers_loss": 0.002433479530736804, "skip_count": 0.0, "step": 2952, "text_loss": 0.4725971519947052 @@ -28061,13 +28061,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0286865234375, + "grad_norm": 0.028564453125, "learning_rate": 0.0008626783080058696, - "loss": 0.0065, + "loss": 0.0066, "macro_f1": 0.3272727429866791, "num_tokens": 4764752.0, "repeat_count": 1.0, - "routers_loss": 0.01764744706451893, + "routers_loss": 0.017182493582367897, "skip_count": 0.0, "step": 2954, "text_loss": 0.460641473531723 @@ -28080,13 +28080,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0859375, + "grad_norm": 0.12353515625, "learning_rate": 0.0008624651756279687, - "loss": 0.0196, + "loss": 0.0198, "macro_f1": 0.3333333432674408, "num_tokens": 4767453.0, "repeat_count": 0.0, - "routers_loss": 0.0019560824148356915, + "routers_loss": 0.0018134774873033166, "skip_count": 0.0, "step": 2956, "text_loss": 0.4091459810733795 @@ -28099,13 +28099,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, - "grad_norm": 0.051025390625, + "grad_norm": 0.053466796875, "learning_rate": 0.000862251904352729, "loss": 0.0108, "macro_f1": 0.9259259104728699, "num_tokens": 4771110.0, "repeat_count": 3.0, - "routers_loss": 0.03031078353524208, + "routers_loss": 0.0365753099322319, "skip_count": 3.0, "step": 2958, "text_loss": 0.22408585250377655 @@ -28118,13 +28118,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05224609375, + "grad_norm": 0.05029296875, "learning_rate": 0.000862038494261876, "loss": 0.0109, "macro_f1": 0.3272727429866791, "num_tokens": 4774464.0, "repeat_count": 0.0, - "routers_loss": 0.024790454655885696, + "routers_loss": 0.024343067780137062, "skip_count": 1.0, "step": 2960, "text_loss": 0.16483014822006226 @@ -28137,13 +28137,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052490234375, + "grad_norm": 0.0654296875, "learning_rate": 0.0008618249454371891, - "loss": 0.0099, + "loss": 0.01, "macro_f1": 0.3333333432674408, "num_tokens": 4777894.0, "repeat_count": 0.0, - "routers_loss": 0.0008704765350557864, + "routers_loss": 0.0008310087723657489, "skip_count": 0.0, "step": 2962, "text_loss": 0.5573428869247437 @@ -28156,13 +28156,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.043212890625, "learning_rate": 0.0008616112579605006, - "loss": 0.0116, + "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 4781116.0, "repeat_count": 0.0, - "routers_loss": 0.0066874073818326, + "routers_loss": 0.0065494864247739315, "skip_count": 0.0, "step": 2964, "text_loss": 0.18816794455051422 @@ -28175,13 +28175,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.04248046875, "learning_rate": 0.0008613974319136957, - "loss": 0.0091, + "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 4784886.0, "repeat_count": 0.0, - "routers_loss": 0.0021798228845000267, + "routers_loss": 0.0019726944155991077, "skip_count": 0.0, "step": 2966, "text_loss": 0.5097305774688721 @@ -28194,13 +28194,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.076171875, + "grad_norm": 0.0849609375, "learning_rate": 0.0008611834673787134, "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 4787563.0, "repeat_count": 0.0, - "routers_loss": 0.0063707553781569, + "routers_loss": 0.006327496841549873, "skip_count": 0.0, "step": 2968, "text_loss": 0.6953814029693604 @@ -28213,13 +28213,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 0.5, "f1_skip": 1.0, - "grad_norm": 0.0595703125, + "grad_norm": 0.056884765625, "learning_rate": 0.0008609693644375449, - "loss": 0.0088, + "loss": 0.0086, "macro_f1": 0.8200000524520874, "num_tokens": 4790421.0, "repeat_count": 3.0, - "routers_loss": 0.044509731233119965, + "routers_loss": 0.042896661907434464, "skip_count": 1.0, "step": 2970, "text_loss": 0.2573051154613495 @@ -28227,18 +28227,18 @@ { "acc_repeat": 1.0, "acc_skip": 1.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 13.953331376577633, - "f1_execute": 0.9795917868614197, + "f1_execute": 1.0, "f1_repeat": 1.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.1640625, + "f1_skip": 1.0, + "grad_norm": 0.14453125, "learning_rate": 0.000860755123172235, - "loss": 0.01, - "macro_f1": 0.8820862174034119, + "loss": 0.0096, + "macro_f1": 1.0, "num_tokens": 4793786.0, "repeat_count": 2.0, - "routers_loss": 0.01667599380016327, + "routers_loss": 0.013228793628513813, "skip_count": 1.0, "step": 2972, "text_loss": 0.46614497900009155 @@ -28251,13 +28251,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0274658203125, + "grad_norm": 0.0296630859375, "learning_rate": 0.0008605407436648815, - "loss": 0.0069, + "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 4796864.0, "repeat_count": 0.0, - "routers_loss": 0.008433761075139046, + "routers_loss": 0.007294759154319763, "skip_count": 2.0, "step": 2974, "text_loss": 0.21555091440677643 @@ -28270,13 +28270,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.059814453125, + "grad_norm": 0.057861328125, "learning_rate": 0.0008603262259976348, - "loss": 0.0131, + "loss": 0.0129, "macro_f1": 1.0, "num_tokens": 4800080.0, "repeat_count": 1.0, - "routers_loss": 0.002439796691760421, + "routers_loss": 0.0024024227168411016, "skip_count": 5.0, "step": 2976, "text_loss": 0.7855485081672668 @@ -28289,13 +28289,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05126953125, + "grad_norm": 0.07666015625, "learning_rate": 0.0008601115702526987, - "loss": 0.0112, + "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4802899.0, "repeat_count": 0.0, - "routers_loss": 0.0015027766348794103, + "routers_loss": 0.001433031284250319, "skip_count": 0.0, "step": 2978, "text_loss": 0.6777765154838562 @@ -28308,13 +28308,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06103515625, + "grad_norm": 0.04931640625, "learning_rate": 0.0008598967765123293, - "loss": 0.0091, + "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 4805835.0, "repeat_count": 0.0, - "routers_loss": 0.003235677955672145, + "routers_loss": 0.003073975909501314, "skip_count": 0.0, "step": 2980, "text_loss": 0.5926910638809204 @@ -28322,18 +28322,18 @@ { "acc_repeat": 1.0, "acc_skip": 0.5, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 14.0, - "f1_execute": 0.9090908765792847, - "f1_repeat": 0.6666666865348816, + "f1_execute": 0.9333333373069763, + "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.052734375, + "grad_norm": 0.05322265625, "learning_rate": 0.0008596818448588364, - "loss": 0.0141, - "macro_f1": 0.7474747896194458, + "loss": 0.0139, + "macro_f1": 0.8666667342185974, "num_tokens": 4809028.0, "repeat_count": 1.0, - "routers_loss": 0.063179150223732, + "routers_loss": 0.06438573449850082, "skip_count": 6.0, "step": 2982, "text_loss": 0.23975612223148346 @@ -28346,13 +28346,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0299072265625, + "grad_norm": 0.0302734375, "learning_rate": 0.0008594667753745821, - "loss": 0.0055, + "loss": 0.0054, "macro_f1": 0.3272727429866791, "num_tokens": 4812831.0, "repeat_count": 0.0, - "routers_loss": 0.015444152988493443, + "routers_loss": 0.014817612245678902, "skip_count": 1.0, "step": 2984, "text_loss": 0.17292268574237823 @@ -28365,13 +28365,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.060546875, + "grad_norm": 0.07421875, "learning_rate": 0.0008592515681419813, - "loss": 0.0079, + "loss": 0.0078, "macro_f1": 0.5492662787437439, "num_tokens": 4816005.0, "repeat_count": 2.0, - "routers_loss": 0.02485196851193905, + "routers_loss": 0.025407327339053154, "skip_count": 0.0, "step": 2986, "text_loss": 0.6403061151504517 @@ -28384,13 +28384,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04443359375, + "grad_norm": 0.0615234375, "learning_rate": 0.0008590362232435018, - "loss": 0.0102, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 4818901.0, "repeat_count": 0.0, - "routers_loss": 0.006175600457936525, + "routers_loss": 0.006826757453382015, "skip_count": 0.0, "step": 2988, "text_loss": 0.2572069466114044 @@ -28403,13 +28403,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041748046875, + "grad_norm": 0.04052734375, "learning_rate": 0.0008588207407616644, - "loss": 0.0085, + "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 4823120.0, "repeat_count": 0.0, - "routers_loss": 0.0008576468680985272, + "routers_loss": 0.0009054148104041815, "skip_count": 0.0, "step": 2990, "text_loss": 0.4827076196670532 @@ -28422,13 +28422,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02392578125, + "grad_norm": 0.0247802734375, "learning_rate": 0.0008586051207790422, - "loss": 0.0059, + "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 4825774.0, "repeat_count": 0.0, - "routers_loss": 0.0011548360344022512, + "routers_loss": 0.0012294676853343844, "skip_count": 0.0, "step": 2992, "text_loss": 0.40157821774482727 @@ -28441,13 +28441,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.056396484375, + "grad_norm": 0.052734375, "learning_rate": 0.0008583893633782612, - "loss": 0.0085, + "loss": 0.0084, "macro_f1": 0.5492662787437439, "num_tokens": 4828841.0, "repeat_count": 0.0, - "routers_loss": 0.01307896338403225, + "routers_loss": 0.011474622413516045, "skip_count": 2.0, "step": 2994, "text_loss": 0.14842072129249573 @@ -28460,13 +28460,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0615234375, + "grad_norm": 0.058837890625, "learning_rate": 0.0008581734686419999, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4831458.0, "repeat_count": 0.0, - "routers_loss": 0.009716883301734924, + "routers_loss": 0.009154081344604492, "skip_count": 2.0, "step": 2996, "text_loss": 0.365400105714798 @@ -28479,13 +28479,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031494140625, + "grad_norm": 0.031982421875, "learning_rate": 0.00085795743665299, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4834609.0, "repeat_count": 0.0, - "routers_loss": 0.0026114562060683966, + "routers_loss": 0.002899336162954569, "skip_count": 0.0, "step": 2998, "text_loss": 0.5574684143066406 @@ -28498,13 +28498,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052001953125, + "grad_norm": 0.0517578125, "learning_rate": 0.0008577412674940152, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4838324.0, "repeat_count": 0.0, - "routers_loss": 0.003787368768826127, + "routers_loss": 0.0034664268605411053, "skip_count": 0.0, "step": 3000, "text_loss": 0.6752855777740479 @@ -28517,13 +28517,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0281982421875, + "grad_norm": 0.03466796875, "learning_rate": 0.0008575249612479117, "loss": 0.0127, "macro_f1": 0.6666666865348816, "num_tokens": 4841877.0, "repeat_count": 0.0, - "routers_loss": 0.004202218260616064, + "routers_loss": 0.0036425739526748657, "skip_count": 2.0, "step": 3002, "text_loss": 0.6332980394363403 @@ -28536,13 +28536,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0517578125, + "grad_norm": 0.048095703125, "learning_rate": 0.0008573085179975685, - "loss": 0.0066, + "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4845840.0, "repeat_count": 0.0, - "routers_loss": 0.0012371218763291836, + "routers_loss": 0.0013783496106043458, "skip_count": 0.0, "step": 3004, "text_loss": 0.4219617545604706 @@ -28555,13 +28555,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.03857421875, "learning_rate": 0.0008570919378259274, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 4848766.0, "repeat_count": 0.0, - "routers_loss": 0.005013706628233194, + "routers_loss": 0.004823608323931694, "skip_count": 1.0, "step": 3006, "text_loss": 0.7987180948257446 @@ -28574,13 +28574,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.029052734375, + "grad_norm": 0.0302734375, "learning_rate": 0.000856875220815982, - "loss": 0.0069, + "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 4852310.0, "repeat_count": 0.0, - "routers_loss": 0.001336073037236929, + "routers_loss": 0.0014760984340682626, "skip_count": 0.0, "step": 3008, "text_loss": 0.35592713952064514 @@ -28593,13 +28593,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.035400390625, "learning_rate": 0.0008566583670507788, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4856146.0, "repeat_count": 0.0, - "routers_loss": 0.003256940981373191, + "routers_loss": 0.0031717263627797365, "skip_count": 1.0, "step": 3010, "text_loss": 0.19379083812236786 @@ -28612,13 +28612,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041748046875, + "grad_norm": 0.0517578125, "learning_rate": 0.0008564413766134164, - "loss": 0.0091, + "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 4859386.0, "repeat_count": 0.0, - "routers_loss": 0.0038389062974601984, + "routers_loss": 0.003361492184922099, "skip_count": 0.0, "step": 3012, "text_loss": 0.39129266142845154 @@ -28631,13 +28631,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052734375, + "grad_norm": 0.048583984375, "learning_rate": 0.0008562242495870463, - "loss": 0.0119, + "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4862661.0, "repeat_count": 0.0, - "routers_loss": 0.0007799214799888432, + "routers_loss": 0.0010563990799710155, "skip_count": 0.0, "step": 3014, "text_loss": 0.5966938734054565 @@ -28650,13 +28650,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0213623046875, + "grad_norm": 0.0234375, "learning_rate": 0.0008560069860548716, - "loss": 0.006, + "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 4865410.0, "repeat_count": 0.0, - "routers_loss": 0.0010348912328481674, + "routers_loss": 0.001233913702890277, "skip_count": 0.0, "step": 3016, "text_loss": 0.3386077880859375 @@ -28669,13 +28669,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056884765625, + "grad_norm": 0.055419921875, "learning_rate": 0.0008557895861001484, - "loss": 0.006, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 4868931.0, "repeat_count": 0.0, - "routers_loss": 0.0018167694797739387, + "routers_loss": 0.0018066301709041, "skip_count": 0.0, "step": 3018, "text_loss": 0.5222050547599792 @@ -28688,13 +28688,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037353515625, + "grad_norm": 0.039306640625, "learning_rate": 0.0008555720498061845, - "loss": 0.0078, + "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4873492.0, "repeat_count": 0.0, - "routers_loss": 0.005788089707493782, + "routers_loss": 0.0050385501235723495, "skip_count": 1.0, "step": 3020, "text_loss": 0.4558849334716797 @@ -28707,13 +28707,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.048828125, "learning_rate": 0.0008553543772563403, - "loss": 0.0092, + "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 4877026.0, "repeat_count": 0.0, - "routers_loss": 0.004194240085780621, + "routers_loss": 0.004828717093914747, "skip_count": 0.0, "step": 3022, "text_loss": 0.36598992347717285 @@ -28726,13 +28726,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.888888955116272, - "grad_norm": 0.05712890625, + "grad_norm": 0.06103515625, "learning_rate": 0.0008551365685340285, "loss": 0.0084, "macro_f1": 0.9555556178092957, "num_tokens": 4879655.0, "repeat_count": 1.0, - "routers_loss": 0.019211066886782646, + "routers_loss": 0.02049369551241398, "skip_count": 5.0, "step": 3024, "text_loss": 0.5069093704223633 @@ -28745,13 +28745,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0478515625, + "grad_norm": 0.043212890625, "learning_rate": 0.0008549186237227138, - "loss": 0.0092, + "loss": 0.0088, "macro_f1": 0.8823530077934265, "num_tokens": 4882606.0, "repeat_count": 1.0, - "routers_loss": 0.041074834764003754, + "routers_loss": 0.03947242721915245, "skip_count": 2.0, "step": 3026, "text_loss": 0.2600715458393097 @@ -28764,13 +28764,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.031982421875, + "grad_norm": 0.030029296875, "learning_rate": 0.0008547005429059128, - "loss": 0.0075, + "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 4885246.0, "repeat_count": 2.0, - "routers_loss": 0.0027008953038603067, + "routers_loss": 0.0026363315992057323, "skip_count": 0.0, "step": 3028, "text_loss": 0.37642326951026917 @@ -28783,13 +28783,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.046630859375, + "grad_norm": 0.048828125, "learning_rate": 0.0008544823261671948, - "loss": 0.0074, + "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 4888109.0, "repeat_count": 0.0, - "routers_loss": 0.00402502017095685, + "routers_loss": 0.003858231008052826, "skip_count": 0.0, "step": 3030, "text_loss": 0.5875385999679565 @@ -28802,13 +28802,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.061279296875, "learning_rate": 0.0008542639735901804, - "loss": 0.007, + "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 4891168.0, "repeat_count": 1.0, - "routers_loss": 0.00628731120377779, + "routers_loss": 0.004789089784026146, "skip_count": 1.0, "step": 3032, "text_loss": 0.6417325139045715 @@ -28821,32 +28821,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.035888671875, "learning_rate": 0.0008540454852585434, - "loss": 0.0117, + "loss": 0.0115, "macro_f1": 0.6666666865348816, "num_tokens": 4894355.0, "repeat_count": 0.0, - "routers_loss": 0.007284072227776051, + "routers_loss": 0.007334680762141943, "skip_count": 2.0, "step": 3034, "text_loss": 0.23697198927402496 }, { "acc_repeat": 0.0, - "acc_skip": 0.6666666865348816, - "avg_layers": 26.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, "epoch": 14.253595538597006, - "f1_execute": 0.9803921580314636, + "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, - "f1_skip": 0.800000011920929, - "grad_norm": 0.033203125, + "f1_skip": 0.5, + "grad_norm": 0.034423828125, "learning_rate": 0.0008538268612560084, - "loss": 0.0059, - "macro_f1": 0.5934640765190125, + "loss": 0.0058, + "macro_f1": 0.4871794879436493, "num_tokens": 4897543.0, "repeat_count": 0.0, - "routers_loss": 0.020328659564256668, + "routers_loss": 0.022096361964941025, "skip_count": 3.0, "step": 3036, "text_loss": 0.1989550143480301 @@ -28859,13 +28859,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.047119140625, "learning_rate": 0.0008536081016663527, - "loss": 0.0102, + "loss": 0.0101, "macro_f1": 1.0, "num_tokens": 4900752.0, "repeat_count": 1.0, - "routers_loss": 0.002338571473956108, + "routers_loss": 0.0037680594250559807, "skip_count": 2.0, "step": 3038, "text_loss": 0.5001366138458252 @@ -28878,13 +28878,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.0400390625, "learning_rate": 0.0008533892065734055, - "loss": 0.0083, + "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 4903581.0, "repeat_count": 0.0, - "routers_loss": 0.003033763263374567, + "routers_loss": 0.0032373068388551474, "skip_count": 1.0, "step": 3040, "text_loss": 0.5019411444664001 @@ -28897,13 +28897,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.042724609375, "learning_rate": 0.0008531701760610476, - "loss": 0.012, + "loss": 0.0121, "macro_f1": 1.0, "num_tokens": 4907108.0, "repeat_count": 1.0, - "routers_loss": 0.00831629242748022, + "routers_loss": 0.0078013185411691666, "skip_count": 2.0, "step": 3042, "text_loss": 0.3460627794265747 @@ -28916,13 +28916,13 @@ "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, - "grad_norm": 0.04736328125, + "grad_norm": 0.04833984375, "learning_rate": 0.000852951010213212, - "loss": 0.0087, + "loss": 0.0089, "macro_f1": 0.8200000524520874, "num_tokens": 4911269.0, "repeat_count": 1.0, - "routers_loss": 0.03200878947973251, + "routers_loss": 0.03576689213514328, "skip_count": 3.0, "step": 3044, "text_loss": 0.268994003534317 @@ -28935,13 +28935,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0283203125, + "grad_norm": 0.02685546875, "learning_rate": 0.0008527317091138835, - "loss": 0.0068, + "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 4914203.0, "repeat_count": 1.0, - "routers_loss": 0.003899211063981056, + "routers_loss": 0.0032140621915459633, "skip_count": 1.0, "step": 3046, "text_loss": 0.9998719692230225 @@ -28954,13 +28954,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.037109375, + "grad_norm": 0.040771484375, "learning_rate": 0.0008525122728470987, "loss": 0.0102, "macro_f1": 1.0, "num_tokens": 4918562.0, "repeat_count": 1.0, - "routers_loss": 0.00883556716144085, + "routers_loss": 0.008559177629649639, "skip_count": 3.0, "step": 3048, "text_loss": 0.3062439560890198 @@ -28973,13 +28973,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03173828125, + "grad_norm": 0.03125, "learning_rate": 0.0008522927014969459, - "loss": 0.0064, + "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 4921940.0, "repeat_count": 0.0, - "routers_loss": 0.009054492227733135, + "routers_loss": 0.008735597133636475, "skip_count": 2.0, "step": 3050, "text_loss": 0.3637430965900421 @@ -28992,13 +28992,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.05517578125, "learning_rate": 0.0008520729951475652, - "loss": 0.0082, + "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 4925416.0, "repeat_count": 0.0, - "routers_loss": 0.0011907420121133327, + "routers_loss": 0.0012709591537714005, "skip_count": 0.0, "step": 3052, "text_loss": 0.542036235332489 @@ -29011,13 +29011,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0634765625, + "grad_norm": 0.06640625, "learning_rate": 0.0008518531538831488, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 4928695.0, "repeat_count": 0.0, - "routers_loss": 0.0013618353987112641, + "routers_loss": 0.0010660928674042225, "skip_count": 1.0, "step": 3054, "text_loss": 0.43144503235816956 @@ -29030,13 +29030,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.060546875, + "grad_norm": 0.059326171875, "learning_rate": 0.00085163317778794, - "loss": 0.0102, + "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 4931504.0, "repeat_count": 0.0, - "routers_loss": 0.004202015232294798, + "routers_loss": 0.004558971151709557, "skip_count": 2.0, "step": 3056, "text_loss": 0.5257010459899902 @@ -29049,32 +29049,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0498046875, + "grad_norm": 0.04931640625, "learning_rate": 0.0008514130669462341, - "loss": 0.0109, + "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 4934935.0, "repeat_count": 0.0, - "routers_loss": 0.01060314942151308, + "routers_loss": 0.010774781927466393, "skip_count": 2.0, "step": 3058, "text_loss": 0.26061776280403137 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.0, "acc_skip": 1.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 14.366304666862343, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.0390625, "learning_rate": 0.0008511928214423782, "loss": 0.0103, - "macro_f1": 1.0, + "macro_f1": 0.6601307392120361, "num_tokens": 4938047.0, "repeat_count": 1.0, - "routers_loss": 0.012400983832776546, + "routers_loss": 0.014763157814741135, "skip_count": 2.0, "step": 3060, "text_loss": 0.2856905460357666 @@ -29087,13 +29087,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.046875, + "grad_norm": 0.050048828125, "learning_rate": 0.0008509724413607705, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 4941041.0, "repeat_count": 1.0, - "routers_loss": 0.004353851079940796, + "routers_loss": 0.004613345488905907, "skip_count": 0.0, "step": 3062, "text_loss": 0.2870287001132965 @@ -29106,13 +29106,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.053955078125, + "grad_norm": 0.06298828125, "learning_rate": 0.0008507519267858612, - "loss": 0.0148, + "loss": 0.015, "macro_f1": 1.0, "num_tokens": 4944708.0, "repeat_count": 1.0, - "routers_loss": 0.009858032688498497, + "routers_loss": 0.008584189228713512, "skip_count": 2.0, "step": 3064, "text_loss": 0.15828095376491547 @@ -29125,13 +29125,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0306396484375, + "grad_norm": 0.029052734375, "learning_rate": 0.0008505312778021519, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 4948295.0, "repeat_count": 0.0, - "routers_loss": 0.0016502789221704006, + "routers_loss": 0.0014670816017314792, "skip_count": 0.0, "step": 3066, "text_loss": 0.36697930097579956 @@ -29144,13 +29144,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08642578125, + "grad_norm": 0.0927734375, "learning_rate": 0.0008503104944941958, - "loss": 0.0108, + "loss": 0.0107, "macro_f1": 0.6666666865348816, "num_tokens": 4951983.0, "repeat_count": 0.0, - "routers_loss": 0.00573746208101511, + "routers_loss": 0.005348859820514917, "skip_count": 2.0, "step": 3068, "text_loss": 0.21612997353076935 @@ -29163,13 +29163,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.0654296875, "learning_rate": 0.0008500895769465972, - "loss": 0.0113, + "loss": 0.0111, "macro_f1": 0.3333333432674408, "num_tokens": 4955023.0, "repeat_count": 0.0, - "routers_loss": 0.0012014979729428887, + "routers_loss": 0.0013203793205320835, "skip_count": 0.0, "step": 3070, "text_loss": 0.9757798314094543 @@ -29182,13 +29182,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.0478515625, "learning_rate": 0.0008498685252440124, - "loss": 0.0067, + "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 4957600.0, "repeat_count": 0.0, - "routers_loss": 0.006400141399353743, + "routers_loss": 0.006907356437295675, "skip_count": 0.0, "step": 3072, "text_loss": 0.356107234954834 @@ -29201,13 +29201,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.046630859375, + "grad_norm": 0.061279296875, "learning_rate": 0.0008496473394711487, - "loss": 0.0117, + "loss": 0.0116, "macro_f1": 0.6666666865348816, "num_tokens": 4960746.0, "repeat_count": 0.0, - "routers_loss": 0.0030972862150520086, + "routers_loss": 0.0027704904787242413, "skip_count": 1.0, "step": 3074, "text_loss": 0.6812908053398132 @@ -29220,13 +29220,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05517578125, + "grad_norm": 0.0576171875, "learning_rate": 0.0008494260197127649, - "loss": 0.0092, + "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 4963845.0, "repeat_count": 0.0, - "routers_loss": 0.004087577573955059, + "routers_loss": 0.0036796489730477333, "skip_count": 2.0, "step": 3076, "text_loss": 0.7215370535850525 @@ -29239,13 +29239,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.0556640625, "learning_rate": 0.0008492045660536712, - "loss": 0.0085, + "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 4966887.0, "repeat_count": 0.0, - "routers_loss": 0.003797230776399374, + "routers_loss": 0.0037137691397219896, "skip_count": 1.0, "step": 3078, "text_loss": 0.8700299859046936 @@ -29258,13 +29258,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.036865234375, + "grad_norm": 0.03857421875, "learning_rate": 0.0008489829785787291, - "loss": 0.0081, + "loss": 0.0078, "macro_f1": 0.8823530077934265, "num_tokens": 4969859.0, "repeat_count": 1.0, - "routers_loss": 0.020377423614263535, + "routers_loss": 0.016492314636707306, "skip_count": 2.0, "step": 3080, "text_loss": 0.6520360112190247 @@ -29277,13 +29277,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.043701171875, "learning_rate": 0.0008487612573728513, - "loss": 0.0096, + "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 4972628.0, "repeat_count": 0.0, - "routers_loss": 0.003695295425131917, + "routers_loss": 0.004022917244583368, "skip_count": 2.0, "step": 3082, "text_loss": 0.17498187720775604 @@ -29296,13 +29296,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.044677734375, "learning_rate": 0.0008485394025210016, - "loss": 0.0078, + "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 4975475.0, "repeat_count": 0.0, - "routers_loss": 0.008704355917870998, + "routers_loss": 0.009141159243881702, "skip_count": 1.0, "step": 3084, "text_loss": 0.5975366234779358 @@ -29315,13 +29315,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037109375, + "grad_norm": 0.045166015625, "learning_rate": 0.0008483174141081956, - "loss": 0.0111, + "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 4978858.0, "repeat_count": 0.0, - "routers_loss": 0.0031532018911093473, + "routers_loss": 0.0031561285723000765, "skip_count": 0.0, "step": 3086, "text_loss": 0.18748866021633148 @@ -29334,13 +29334,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.04150390625, "learning_rate": 0.0008480952922194991, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 4982142.0, "repeat_count": 0.0, - "routers_loss": 0.0007620530668646097, + "routers_loss": 0.0007894713780842721, "skip_count": 0.0, "step": 3088, "text_loss": 0.42083197832107544 @@ -29353,13 +29353,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037841796875, + "grad_norm": 0.0419921875, "learning_rate": 0.0008478730369400302, - "loss": 0.0086, + "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 4984872.0, "repeat_count": 0.0, - "routers_loss": 0.000692489615175873, + "routers_loss": 0.0005908289458602667, "skip_count": 0.0, "step": 3090, "text_loss": 0.45337188243865967 @@ -29372,13 +29372,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0240478515625, + "grad_norm": 0.02392578125, "learning_rate": 0.0008476506483549573, - "loss": 0.0103, + "loss": 0.0101, "macro_f1": 1.0, "num_tokens": 4988137.0, "repeat_count": 1.0, - "routers_loss": 0.001856967923231423, + "routers_loss": 0.0016509373672306538, "skip_count": 2.0, "step": 3092, "text_loss": 0.6397262811660767 @@ -29391,13 +29391,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.031982421875, + "grad_norm": 0.036865234375, "learning_rate": 0.0008474281265495002, - "loss": 0.0075, + "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 4991164.0, "repeat_count": 0.0, - "routers_loss": 0.004027622286230326, + "routers_loss": 0.004088304936885834, "skip_count": 1.0, "step": 3094, "text_loss": 0.18352322280406952 @@ -29410,32 +29410,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03857421875, + "grad_norm": 0.0380859375, "learning_rate": 0.0008472054716089295, - "loss": 0.0061, + "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 4993876.0, "repeat_count": 0.0, - "routers_loss": 0.004844399634748697, + "routers_loss": 0.005200014915317297, "skip_count": 0.0, "step": 3096, "text_loss": 0.2776511013507843 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.0, "acc_skip": 1.0, - "avg_layers": 27.0, + "avg_layers": 26.0, "epoch": 14.544760786615791, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0286865234375, + "grad_norm": 0.0322265625, "learning_rate": 0.0008469826836185673, "loss": 0.01, - "macro_f1": 1.0, + "macro_f1": 0.6601307392120361, "num_tokens": 4997068.0, "repeat_count": 1.0, - "routers_loss": 0.012379852123558521, + "routers_loss": 0.012686059810221195, "skip_count": 2.0, "step": 3098, "text_loss": 0.23209233582019806 @@ -29448,13 +29448,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.055419921875, "learning_rate": 0.0008467597626637858, - "loss": 0.0076, + "loss": 0.0074, "macro_f1": 1.0, "num_tokens": 5000038.0, "repeat_count": 1.0, - "routers_loss": 0.00575951999053359, + "routers_loss": 0.006401528604328632, "skip_count": 2.0, "step": 3100, "text_loss": 0.45936745405197144 @@ -29467,13 +29467,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.05615234375, "learning_rate": 0.0008465367088300093, "loss": 0.0075, "macro_f1": 0.3272727429866791, "num_tokens": 5002870.0, "repeat_count": 0.0, - "routers_loss": 0.013157932087779045, + "routers_loss": 0.016640547662973404, "skip_count": 1.0, "step": 3102, "text_loss": 0.44502779841423035 @@ -29486,13 +29486,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0283203125, + "grad_norm": 0.0272216796875, "learning_rate": 0.0008463135222027124, - "loss": 0.0052, + "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 5006357.0, "repeat_count": 0.0, - "routers_loss": 0.008679390884935856, + "routers_loss": 0.008411331102252007, "skip_count": 2.0, "step": 3104, "text_loss": 0.3414570391178131 @@ -29505,13 +29505,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.03076171875, "learning_rate": 0.0008460902028674204, - "loss": 0.0059, + "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5009059.0, "repeat_count": 0.0, - "routers_loss": 0.001076352084055543, + "routers_loss": 0.0010406570509076118, "skip_count": 0.0, "step": 3106, "text_loss": 0.5931221842765808 @@ -29524,13 +29524,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.030029296875, + "grad_norm": 0.0322265625, "learning_rate": 0.0008458667509097098, - "loss": 0.0112, + "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 5012327.0, "repeat_count": 0.0, - "routers_loss": 0.0021328055299818516, + "routers_loss": 0.001959054498001933, "skip_count": 0.0, "step": 3108, "text_loss": 0.5191171169281006 @@ -29543,13 +29543,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07470703125, + "grad_norm": 0.06640625, "learning_rate": 0.0008456431664152078, - "loss": 0.0129, + "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 5015472.0, "repeat_count": 0.0, - "routers_loss": 0.0010206506121903658, + "routers_loss": 0.000994380097836256, "skip_count": 0.0, "step": 3110, "text_loss": 0.4455361068248749 @@ -29562,13 +29562,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0263671875, + "grad_norm": 0.0264892578125, "learning_rate": 0.0008454194494695923, - "loss": 0.0111, + "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 5018901.0, "repeat_count": 0.0, - "routers_loss": 0.0041310288943350315, + "routers_loss": 0.0037662344984710217, "skip_count": 0.0, "step": 3112, "text_loss": 0.5335362553596497 @@ -29581,13 +29581,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0240478515625, + "grad_norm": 0.02294921875, "learning_rate": 0.0008451956001585923, - "loss": 0.0066, + "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5022520.0, "repeat_count": 0.0, - "routers_loss": 0.00994859915226698, + "routers_loss": 0.008664715103805065, "skip_count": 3.0, "step": 3114, "text_loss": 0.16230148077011108 @@ -29600,13 +29600,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0419921875, + "grad_norm": 0.0498046875, "learning_rate": 0.000844971618567987, - "loss": 0.0087, + "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 5025505.0, "repeat_count": 0.0, - "routers_loss": 0.0016823343466967344, + "routers_loss": 0.0015904927859082818, "skip_count": 0.0, "step": 3116, "text_loss": 0.6989432573318481 @@ -29619,13 +29619,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03369140625, + "grad_norm": 0.033935546875, "learning_rate": 0.0008447475047836068, - "loss": 0.0061, + "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 5028767.0, "repeat_count": 0.0, - "routers_loss": 0.005725692491978407, + "routers_loss": 0.005853322334587574, "skip_count": 1.0, "step": 3118, "text_loss": 0.31420737504959106 @@ -29638,13 +29638,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05712890625, + "grad_norm": 0.05615234375, "learning_rate": 0.0008445232588913325, - "loss": 0.0116, + "loss": 0.0115, "macro_f1": 0.3272727429866791, "num_tokens": 5032577.0, "repeat_count": 0.0, - "routers_loss": 0.016534095630049706, + "routers_loss": 0.012760105542838573, "skip_count": 0.0, "step": 3120, "text_loss": 0.5534627437591553 @@ -29657,13 +29657,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.049072265625, "learning_rate": 0.0008442988809770953, - "loss": 0.0097, + "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 5035381.0, "repeat_count": 0.0, - "routers_loss": 0.0023590524215251207, + "routers_loss": 0.0022257440723478794, "skip_count": 0.0, "step": 3122, "text_loss": 0.42492759227752686 @@ -29676,13 +29676,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.03955078125, "learning_rate": 0.0008440743711268775, - "loss": 0.0084, + "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 5038743.0, "repeat_count": 0.0, - "routers_loss": 0.004739012103527784, + "routers_loss": 0.004648433532565832, "skip_count": 0.0, "step": 3124, "text_loss": 0.16404685378074646 @@ -29695,13 +29695,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.043212890625, + "grad_norm": 0.03955078125, "learning_rate": 0.0008438497294267117, - "loss": 0.0069, + "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 5041492.0, "repeat_count": 0.0, - "routers_loss": 0.006212939508259296, + "routers_loss": 0.006313877180218697, "skip_count": 0.0, "step": 3126, "text_loss": 0.23191484808921814 @@ -29714,13 +29714,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.068359375, + "grad_norm": 0.07666015625, "learning_rate": 0.0008436249559626807, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 5043955.0, "repeat_count": 1.0, - "routers_loss": 0.0036408400628715754, + "routers_loss": 0.0036270488053560257, "skip_count": 0.0, "step": 3128, "text_loss": 0.5782018303871155 @@ -29733,13 +29733,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.04345703125, "learning_rate": 0.0008434000508209187, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 5047571.0, "repeat_count": 0.0, - "routers_loss": 0.0038875883910804987, + "routers_loss": 0.003809858812019229, "skip_count": 1.0, "step": 3130, "text_loss": 0.7129825949668884 @@ -29752,13 +29752,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.03955078125, "learning_rate": 0.0008431750140876092, - "loss": 0.0129, + "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 5051608.0, "repeat_count": 0.0, - "routers_loss": 0.002172809559851885, + "routers_loss": 0.0022369057405740023, "skip_count": 0.0, "step": 3132, "text_loss": 0.4433445930480957 @@ -29773,11 +29773,11 @@ "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.000842949845848987, - "loss": 0.0134, + "loss": 0.0135, "macro_f1": 0.32098764181137085, "num_tokens": 5054656.0, "repeat_count": 0.0, - "routers_loss": 0.04427836462855339, + "routers_loss": 0.0425117202103138, "skip_count": 2.0, "step": 3134, "text_loss": 0.38721024990081787 @@ -29790,13 +29790,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.0712890625, "learning_rate": 0.0008427245461913368, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 5059108.0, "repeat_count": 0.0, - "routers_loss": 0.0016648605233058333, + "routers_loss": 0.0018077283166348934, "skip_count": 0.0, "step": 3136, "text_loss": 0.7496368885040283 @@ -29809,13 +29809,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.1142578125, + "grad_norm": 0.12109375, "learning_rate": 0.0008424991152009941, - "loss": 0.0113, + "loss": 0.0111, "macro_f1": 1.0, "num_tokens": 5062371.0, "repeat_count": 1.0, - "routers_loss": 0.008457986637949944, + "routers_loss": 0.008801834657788277, "skip_count": 2.0, "step": 3138, "text_loss": 0.5337086319923401 @@ -29828,13 +29828,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04443359375, + "grad_norm": 0.04296875, "learning_rate": 0.0008422735529643444, - "loss": 0.0099, + "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 5065593.0, "repeat_count": 0.0, - "routers_loss": 0.004939604084938765, + "routers_loss": 0.00548676960170269, "skip_count": 3.0, "step": 3140, "text_loss": 0.2561623156070709 @@ -29847,13 +29847,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031982421875, + "grad_norm": 0.032958984375, "learning_rate": 0.0008420478595678233, - "loss": 0.0077, + "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5068271.0, "repeat_count": 0.0, - "routers_loss": 0.006254551466554403, + "routers_loss": 0.006389956455677748, "skip_count": 0.0, "step": 3142, "text_loss": 0.15605193376541138 @@ -29866,13 +29866,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0693359375, + "grad_norm": 0.07958984375, "learning_rate": 0.0008418220350979175, "loss": 0.0128, "macro_f1": 1.0, "num_tokens": 5071358.0, "repeat_count": 1.0, - "routers_loss": 0.01132921315729618, + "routers_loss": 0.012387622147798538, "skip_count": 2.0, "step": 3144, "text_loss": 0.3085838258266449 @@ -29885,13 +29885,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.033447265625, "learning_rate": 0.0008415960796411628, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 5075584.0, "repeat_count": 0.0, - "routers_loss": 0.0026424501556903124, + "routers_loss": 0.00311864772811532, "skip_count": 1.0, "step": 3146, "text_loss": 0.4786977469921112 @@ -29904,13 +29904,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.103515625, + "grad_norm": 0.1591796875, "learning_rate": 0.0008413699932841461, - "loss": 0.0093, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 5078388.0, "repeat_count": 0.0, - "routers_loss": 0.0036633017007261515, + "routers_loss": 0.0030679800547659397, "skip_count": 0.0, "step": 3148, "text_loss": 0.5222916603088379 @@ -29923,13 +29923,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.039794921875, + "grad_norm": 0.0390625, "learning_rate": 0.0008411437761135039, - "loss": 0.0112, + "loss": 0.011, "macro_f1": 1.0, "num_tokens": 5081584.0, "repeat_count": 1.0, - "routers_loss": 0.012777967378497124, + "routers_loss": 0.012907958589494228, "skip_count": 2.0, "step": 3150, "text_loss": 0.5369884371757507 @@ -29942,13 +29942,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.03759765625, "learning_rate": 0.0008409174282159232, - "loss": 0.0074, + "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 5084450.0, "repeat_count": 0.0, - "routers_loss": 0.013694444671273232, + "routers_loss": 0.012314042076468468, "skip_count": 2.0, "step": 3152, "text_loss": 0.25685277581214905 @@ -29961,13 +29961,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.041015625, "learning_rate": 0.000840690949678141, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 5087865.0, "repeat_count": 1.0, - "routers_loss": 0.008412595838308334, + "routers_loss": 0.00899206381291151, "skip_count": 0.0, "step": 3154, "text_loss": 0.1717093288898468 @@ -29980,13 +29980,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.049560546875, + "grad_norm": 0.06103515625, "learning_rate": 0.0008404643405869441, "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 5090857.0, "repeat_count": 0.0, - "routers_loss": 0.0011648585787042975, + "routers_loss": 0.0013312003575265408, "skip_count": 0.0, "step": 3156, "text_loss": 0.27446436882019043 @@ -29999,13 +29999,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.1630859375, + "grad_norm": 0.1533203125, "learning_rate": 0.0008402376010291695, - "loss": 0.0127, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 5093917.0, "repeat_count": 0.0, - "routers_loss": 0.002915408927947283, + "routers_loss": 0.002653320087119937, "skip_count": 0.0, "step": 3158, "text_loss": 0.4237489402294159 @@ -30018,13 +30018,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0498046875, + "grad_norm": 0.045654296875, "learning_rate": 0.0008400107310917045, - "loss": 0.0096, + "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 5096656.0, "repeat_count": 0.0, - "routers_loss": 0.013139770366251469, + "routers_loss": 0.012976993806660175, "skip_count": 2.0, "step": 3160, "text_loss": 0.42361980676651 @@ -30037,13 +30037,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.054931640625, + "grad_norm": 0.0634765625, "learning_rate": 0.000839783730861486, "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 5099582.0, "repeat_count": 0.0, - "routers_loss": 0.0070426687598228455, + "routers_loss": 0.006936746649444103, "skip_count": 2.0, "step": 3162, "text_loss": 0.26656073331832886 @@ -30056,13 +30056,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04345703125, + "grad_norm": 0.05908203125, "learning_rate": 0.0008395566004255008, "loss": 0.0127, "macro_f1": 0.6666666865348816, "num_tokens": 5102908.0, "repeat_count": 0.0, - "routers_loss": 0.006271707359701395, + "routers_loss": 0.006619359832257032, "skip_count": 1.0, "step": 3164, "text_loss": 0.590774416923523 @@ -30075,13 +30075,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.057373046875, + "grad_norm": 0.06884765625, "learning_rate": 0.0008393293398707858, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 5105829.0, "repeat_count": 0.0, - "routers_loss": 0.010571467690169811, + "routers_loss": 0.010120268911123276, "skip_count": 2.0, "step": 3166, "text_loss": 0.605930507183075 @@ -30094,13 +30094,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03564453125, + "grad_norm": 0.0419921875, "learning_rate": 0.0008391019492844275, "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 5109850.0, "repeat_count": 0.0, - "routers_loss": 0.005877034272998571, + "routers_loss": 0.004940980114042759, "skip_count": 2.0, "step": 3168, "text_loss": 0.12973152101039886 @@ -30115,11 +30115,11 @@ "f1_skip": 1.0, "grad_norm": 0.037353515625, "learning_rate": 0.0008388744287535627, - "loss": 0.0093, + "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 5113353.0, "repeat_count": 0.0, - "routers_loss": 0.0031909283716231585, + "routers_loss": 0.0031777634285390377, "skip_count": 1.0, "step": 3170, "text_loss": 0.18577200174331665 @@ -30132,13 +30132,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.052734375, "learning_rate": 0.0008386467783653775, - "loss": 0.0104, + "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 5116421.0, "repeat_count": 0.0, - "routers_loss": 0.005338824819773436, + "routers_loss": 0.005431659985333681, "skip_count": 0.0, "step": 3172, "text_loss": 0.2302747517824173 @@ -30151,13 +30151,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.046142578125, "learning_rate": 0.000838418998207108, - "loss": 0.0073, + "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5119457.0, "repeat_count": 0.0, - "routers_loss": 0.008522412739694118, + "routers_loss": 0.0077286697924137115, "skip_count": 4.0, "step": 3174, "text_loss": 0.19606637954711914 @@ -30170,13 +30170,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.050537109375, "learning_rate": 0.0008381910883660399, - "loss": 0.0068, + "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5123201.0, "repeat_count": 0.0, - "routers_loss": 0.0035330590326339006, + "routers_loss": 0.003982985392212868, "skip_count": 0.0, "step": 3176, "text_loss": 0.716376006603241 @@ -30189,13 +30189,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.09375, + "grad_norm": 0.09423828125, "learning_rate": 0.0008379630489295089, - "loss": 0.0106, + "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 5126035.0, "repeat_count": 0.0, - "routers_loss": 0.006332095246762037, + "routers_loss": 0.005626026075333357, "skip_count": 1.0, "step": 3178, "text_loss": 0.5144625902175903 @@ -30208,13 +30208,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05859375, + "grad_norm": 0.05615234375, "learning_rate": 0.0008377348799849, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 5129179.0, "repeat_count": 0.0, - "routers_loss": 0.017295993864536285, + "routers_loss": 0.015458245761692524, "skip_count": 2.0, "step": 3180, "text_loss": 0.29887503385543823 @@ -30227,13 +30227,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0703125, + "grad_norm": 0.062255859375, "learning_rate": 0.0008375065816196479, - "loss": 0.0088, + "loss": 0.0086, "macro_f1": 0.5492662787437439, "num_tokens": 5132149.0, "repeat_count": 0.0, - "routers_loss": 0.017241213470697403, + "routers_loss": 0.012210468761622906, "skip_count": 2.0, "step": 3182, "text_loss": 0.8981851935386658 @@ -30246,13 +30246,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04345703125, + "grad_norm": 0.044677734375, "learning_rate": 0.0008372781539212371, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 5135287.0, "repeat_count": 0.0, - "routers_loss": 0.00516276340931654, + "routers_loss": 0.0052537876181304455, "skip_count": 0.0, "step": 3184, "text_loss": 0.4245666563510895 @@ -30265,13 +30265,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.022705078125, + "grad_norm": 0.0240478515625, "learning_rate": 0.0008370495969772014, - "loss": 0.0077, + "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 5138589.0, "repeat_count": 0.0, - "routers_loss": 0.012517380528151989, + "routers_loss": 0.012873421423137188, "skip_count": 2.0, "step": 3186, "text_loss": 0.40581050515174866 @@ -30284,13 +30284,13 @@ "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07177734375, + "grad_norm": 0.07470703125, "learning_rate": 0.0008368209108751244, - "loss": 0.0129, + "loss": 0.0127, "macro_f1": 0.6521739363670349, "num_tokens": 5141635.0, "repeat_count": 2.0, - "routers_loss": 0.0810512825846672, + "routers_loss": 0.07720445841550827, "skip_count": 4.0, "step": 3188, "text_loss": 0.3755173981189728 @@ -30303,13 +30303,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.01953125, + "grad_norm": 0.02197265625, "learning_rate": 0.0008365920957026389, - "loss": 0.0076, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5144728.0, "repeat_count": 0.0, - "routers_loss": 0.0014350182609632611, + "routers_loss": 0.001440995605662465, "skip_count": 0.0, "step": 3190, "text_loss": 0.5067034363746643 @@ -30322,13 +30322,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.046142578125, + "grad_norm": 0.041748046875, "learning_rate": 0.0008363631515474275, - "loss": 0.0091, + "loss": 0.0089, "macro_f1": 0.6538461446762085, "num_tokens": 5147963.0, "repeat_count": 1.0, - "routers_loss": 0.018022676929831505, + "routers_loss": 0.018752984702587128, "skip_count": 2.0, "step": 3192, "text_loss": 0.20224551856517792 @@ -30341,13 +30341,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042236328125, + "grad_norm": 0.037353515625, "learning_rate": 0.0008361340784972217, - "loss": 0.0092, + "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 5151184.0, "repeat_count": 0.0, - "routers_loss": 0.0005097229732200503, + "routers_loss": 0.0005360354552976787, "skip_count": 0.0, "step": 3194, "text_loss": 0.4588058292865753 @@ -30360,13 +30360,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03173828125, + "grad_norm": 0.0390625, "learning_rate": 0.0008359048766398031, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 5153889.0, "repeat_count": 0.0, - "routers_loss": 0.0009840037673711777, + "routers_loss": 0.0009184491937048733, "skip_count": 1.0, "step": 3196, "text_loss": 0.2980220317840576 @@ -30379,13 +30379,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02685546875, + "grad_norm": 0.027099609375, "learning_rate": 0.000835675546063002, - "loss": 0.0058, + "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5156758.0, "repeat_count": 0.0, - "routers_loss": 0.001269801170565188, + "routers_loss": 0.001252970308996737, "skip_count": 0.0, "step": 3198, "text_loss": 0.6775755882263184 @@ -30398,13 +30398,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.052490234375, "learning_rate": 0.0008354460868546985, - "loss": 0.0071, + "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 5160247.0, "repeat_count": 0.0, - "routers_loss": 0.0034889329690486193, + "routers_loss": 0.0037315806839615107, "skip_count": 0.0, "step": 3200, "text_loss": 0.35867011547088623 @@ -30417,13 +30417,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.034912109375, "learning_rate": 0.0008352164991028217, - "loss": 0.0091, + "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 5163456.0, "repeat_count": 1.0, - "routers_loss": 0.001520772697404027, + "routers_loss": 0.001497485558502376, "skip_count": 0.0, "step": 3202, "text_loss": 0.690290093421936 @@ -30436,13 +30436,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03662109375, + "grad_norm": 0.04638671875, "learning_rate": 0.0008349867828953501, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 5166139.0, "repeat_count": 0.0, - "routers_loss": 0.0011800233041867614, + "routers_loss": 0.001051135826855898, "skip_count": 0.0, "step": 3204, "text_loss": 0.3340415954589844 @@ -30455,13 +30455,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031494140625, + "grad_norm": 0.03076171875, "learning_rate": 0.0008347569383203113, - "loss": 0.01, + "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 5169009.0, "repeat_count": 0.0, - "routers_loss": 0.001043233904056251, + "routers_loss": 0.0010544003453105688, "skip_count": 0.0, "step": 3206, "text_loss": 0.8584878444671631 @@ -30474,13 +30474,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.03662109375, "learning_rate": 0.0008345269654657823, - "loss": 0.0084, + "loss": 0.0085, "macro_f1": 1.0, "num_tokens": 5172618.0, "repeat_count": 1.0, - "routers_loss": 0.007460868917405605, + "routers_loss": 0.007312417030334473, "skip_count": 1.0, "step": 3208, "text_loss": 0.19500218331813812 @@ -30493,13 +30493,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0361328125, + "grad_norm": 0.03466796875, "learning_rate": 0.0008342968644198892, - "loss": 0.0067, + "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 5175857.0, "repeat_count": 0.0, - "routers_loss": 0.0027419133111834526, + "routers_loss": 0.00276504410430789, "skip_count": 0.0, "step": 3210, "text_loss": 0.5446314215660095 @@ -30512,13 +30512,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.037109375, "learning_rate": 0.0008340666352708068, - "loss": 0.0089, + "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 5178585.0, "repeat_count": 0.0, - "routers_loss": 0.002764733275398612, + "routers_loss": 0.002669303445145488, "skip_count": 0.0, "step": 3212, "text_loss": 0.3687484860420227 @@ -30531,13 +30531,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0284423828125, + "grad_norm": 0.035888671875, "learning_rate": 0.0008338362781067596, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 5181777.0, "repeat_count": 0.0, - "routers_loss": 0.0032288613729178905, + "routers_loss": 0.0031585274264216423, "skip_count": 0.0, "step": 3214, "text_loss": 0.27325859665870667 @@ -30550,13 +30550,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.040283203125, + "grad_norm": 0.04541015625, "learning_rate": 0.000833605793016021, "loss": 0.009, "macro_f1": 0.6666666865348816, "num_tokens": 5184312.0, "repeat_count": 0.0, - "routers_loss": 0.008322423323988914, + "routers_loss": 0.008807534351944923, "skip_count": 2.0, "step": 3216, "text_loss": 0.4466548562049866 @@ -30569,13 +30569,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.040283203125, + "grad_norm": 0.039306640625, "learning_rate": 0.0008333751800869133, - "loss": 0.0092, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 5187497.0, "repeat_count": 0.0, - "routers_loss": 0.0034384531900286674, + "routers_loss": 0.003171310294419527, "skip_count": 0.0, "step": 3218, "text_loss": 0.5423526763916016 @@ -30588,13 +30588,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0228271484375, + "grad_norm": 0.025634765625, "learning_rate": 0.0008331444394078076, - "loss": 0.0081, + "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 5190982.0, "repeat_count": 0.0, - "routers_loss": 0.0015023534651845694, + "routers_loss": 0.0016481258207932115, "skip_count": 2.0, "step": 3220, "text_loss": 0.48984917998313904 @@ -30607,13 +30607,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.03173828125, + "grad_norm": 0.03271484375, "learning_rate": 0.000832913571067124, - "loss": 0.0108, + "loss": 0.0107, "macro_f1": 1.0, "num_tokens": 5194044.0, "repeat_count": 1.0, - "routers_loss": 0.0043489462696015835, + "routers_loss": 0.003957313951104879, "skip_count": 1.0, "step": 3222, "text_loss": 0.4533331096172333 @@ -30626,13 +30626,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.040283203125, "learning_rate": 0.0008326825751533322, - "loss": 0.0076, + "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 5197092.0, "repeat_count": 0.0, - "routers_loss": 0.0012065734481438994, + "routers_loss": 0.0016904744552448392, "skip_count": 0.0, "step": 3224, "text_loss": 0.5538802742958069 @@ -30645,13 +30645,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06005859375, + "grad_norm": 0.05224609375, "learning_rate": 0.0008324514517549501, - "loss": 0.0084, + "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 5199941.0, "repeat_count": 0.0, - "routers_loss": 0.006849290337413549, + "routers_loss": 0.005608258303254843, "skip_count": 1.0, "step": 3226, "text_loss": 0.416242778301239 @@ -30664,32 +30664,32 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.03857421875, + "grad_norm": 0.040771484375, "learning_rate": 0.0008322202009605444, - "loss": 0.0073, + "loss": 0.0072, "macro_f1": 0.8823530077934265, "num_tokens": 5202618.0, "repeat_count": 1.0, - "routers_loss": 0.020665202289819717, + "routers_loss": 0.020965175703167915, "skip_count": 2.0, "step": 3228, "text_loss": 0.17496295273303986 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 1.0, - "avg_layers": 23.0, + "avg_layers": 24.0, "epoch": 15.164367478720282, - "f1_execute": 0.9777777791023254, - "f1_repeat": 0.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04052734375, "learning_rate": 0.0008319888228587311, "loss": 0.0063, - "macro_f1": 0.6592592597007751, + "macro_f1": 1.0, "num_tokens": 5206414.0, "repeat_count": 1.0, - "routers_loss": 0.026284674182534218, + "routers_loss": 0.021259209141135216, "skip_count": 5.0, "step": 3230, "text_loss": 0.22471418976783752 @@ -30702,13 +30702,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03076171875, + "grad_norm": 0.029541015625, "learning_rate": 0.0008317573175381745, "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 5209768.0, "repeat_count": 0.0, - "routers_loss": 0.0018494570394977927, + "routers_loss": 0.0018647604156285524, "skip_count": 0.0, "step": 3232, "text_loss": 0.4415269196033478 @@ -30721,13 +30721,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.027099609375, + "grad_norm": 0.0283203125, "learning_rate": 0.0008315256850875881, - "loss": 0.0061, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5213257.0, "repeat_count": 0.0, - "routers_loss": 0.002610588213428855, + "routers_loss": 0.002345515415072441, "skip_count": 0.0, "step": 3234, "text_loss": 0.347247838973999 @@ -30740,13 +30740,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.048828125, + "grad_norm": 0.053955078125, "learning_rate": 0.0008312939255957336, - "loss": 0.0084, + "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 5215800.0, "repeat_count": 0.0, - "routers_loss": 0.007061914075165987, + "routers_loss": 0.007112892810255289, "skip_count": 3.0, "step": 3236, "text_loss": 0.31091734766960144 @@ -30759,13 +30759,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.033203125, "learning_rate": 0.0008310620391514219, - "loss": 0.0083, + "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 5219205.0, "repeat_count": 0.0, - "routers_loss": 0.004094691481441259, + "routers_loss": 0.00432228296995163, "skip_count": 0.0, "step": 3238, "text_loss": 0.3421775996685028 @@ -30778,13 +30778,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.024658203125, + "grad_norm": 0.027099609375, "learning_rate": 0.0008308300258435124, "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 5222422.0, "repeat_count": 0.0, - "routers_loss": 0.007662596181035042, + "routers_loss": 0.0076514314860105515, "skip_count": 2.0, "step": 3240, "text_loss": 0.22378318011760712 @@ -30797,13 +30797,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0264892578125, + "grad_norm": 0.028564453125, "learning_rate": 0.0008305978857609128, - "loss": 0.0073, + "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 5225625.0, "repeat_count": 0.0, - "routers_loss": 0.0008108283509500325, + "routers_loss": 0.0007617069641128182, "skip_count": 0.0, "step": 3242, "text_loss": 0.5880323648452759 @@ -30816,13 +30816,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0281982421875, + "grad_norm": 0.02734375, "learning_rate": 0.0008303656189925799, - "loss": 0.0084, + "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5229113.0, "repeat_count": 0.0, - "routers_loss": 0.0018137742299586535, + "routers_loss": 0.0017418119823560119, "skip_count": 0.0, "step": 3244, "text_loss": 0.3302813768386841 @@ -30835,13 +30835,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.042724609375, "learning_rate": 0.0008301332256275183, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5232061.0, "repeat_count": 0.0, - "routers_loss": 0.0025301240384578705, + "routers_loss": 0.0026667986530810595, "skip_count": 0.0, "step": 3246, "text_loss": 0.5679706335067749 @@ -30854,13 +30854,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.052001953125, + "grad_norm": 0.058349609375, "learning_rate": 0.0008299007057547821, - "loss": 0.0101, + "loss": 0.0106, "macro_f1": 1.0, "num_tokens": 5235279.0, "repeat_count": 1.0, - "routers_loss": 0.011231686919927597, + "routers_loss": 0.011016624979674816, "skip_count": 2.0, "step": 3248, "text_loss": 0.5081504583358765 @@ -30873,13 +30873,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.033203125, "learning_rate": 0.0008296680594634731, - "loss": 0.0074, + "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 5239655.0, "repeat_count": 1.0, - "routers_loss": 0.005881415214389563, + "routers_loss": 0.005492044147104025, "skip_count": 0.0, "step": 3250, "text_loss": 0.14675180613994598 @@ -30892,13 +30892,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0277099609375, + "grad_norm": 0.0269775390625, "learning_rate": 0.0008294352868427418, - "loss": 0.0056, + "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 5243579.0, "repeat_count": 0.0, - "routers_loss": 0.004495301283895969, + "routers_loss": 0.00404445780441165, "skip_count": 1.0, "step": 3252, "text_loss": 0.4201085865497589 @@ -30911,13 +30911,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0208740234375, + "grad_norm": 0.0242919921875, "learning_rate": 0.0008292023879817871, - "loss": 0.0052, + "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 5247059.0, "repeat_count": 0.0, - "routers_loss": 0.007394428364932537, + "routers_loss": 0.006886140909045935, "skip_count": 1.0, "step": 3254, "text_loss": 0.2289208322763443 @@ -30930,32 +30930,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06201171875, + "grad_norm": 0.057861328125, "learning_rate": 0.0008289693629698564, - "loss": 0.0077, + "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 5249940.0, "repeat_count": 0.0, - "routers_loss": 0.0006736332434229553, + "routers_loss": 0.0005736657767556608, "skip_count": 0.0, "step": 3256, "text_loss": 0.5670450925827026 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 15.295861461696507, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.0224609375, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, "learning_rate": 0.0008287362118962452, - "loss": 0.0062, - "macro_f1": 0.6666666865348816, + "loss": 0.006, + "macro_f1": 0.3272727429866791, "num_tokens": 5253580.0, "repeat_count": 0.0, - "routers_loss": 0.009847268462181091, + "routers_loss": 0.011349895037710667, "skip_count": 1.0, "step": 3258, "text_loss": 0.5042323470115662 @@ -30968,13 +30968,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.026611328125, + "grad_norm": 0.0267333984375, "learning_rate": 0.0008285029348502973, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5257080.0, "repeat_count": 0.0, - "routers_loss": 0.0013670918997377157, + "routers_loss": 0.0013626761501654983, "skip_count": 0.0, "step": 3260, "text_loss": 0.3227672874927521 @@ -30987,13 +30987,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02587890625, + "grad_norm": 0.0245361328125, "learning_rate": 0.0008282695319214053, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5259951.0, "repeat_count": 0.0, - "routers_loss": 0.004696785472333431, + "routers_loss": 0.00471635302528739, "skip_count": 0.0, "step": 3262, "text_loss": 0.20773714780807495 @@ -31006,13 +31006,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04345703125, + "grad_norm": 0.039306640625, "learning_rate": 0.0008280360031990093, - "loss": 0.0108, + "loss": 0.0107, "macro_f1": 0.6666666865348816, "num_tokens": 5263314.0, "repeat_count": 0.0, - "routers_loss": 0.010588239878416061, + "routers_loss": 0.010472415015101433, "skip_count": 2.0, "step": 3264, "text_loss": 0.34397366642951965 @@ -31025,13 +31025,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.036865234375, "learning_rate": 0.000827802348772598, - "loss": 0.0084, + "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 5267358.0, "repeat_count": 0.0, - "routers_loss": 0.0010326795745640993, + "routers_loss": 0.0007814752752892673, "skip_count": 0.0, "step": 3266, "text_loss": 0.747342586517334 @@ -31044,13 +31044,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.049560546875, + "grad_norm": 0.0498046875, "learning_rate": 0.0008275685687317084, - "loss": 0.0087, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 5270400.0, "repeat_count": 0.0, - "routers_loss": 0.0010199147509410977, + "routers_loss": 0.000902949133887887, "skip_count": 0.0, "step": 3268, "text_loss": 0.43782034516334534 @@ -31063,13 +31063,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03173828125, + "grad_norm": 0.03564453125, "learning_rate": 0.0008273346631659252, - "loss": 0.0069, + "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5273147.0, "repeat_count": 0.0, - "routers_loss": 0.00046372212818823755, + "routers_loss": 0.00043462219764478505, "skip_count": 0.0, "step": 3270, "text_loss": 0.6358205080032349 @@ -31082,13 +31082,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0380859375, + "grad_norm": 0.04052734375, "learning_rate": 0.0008271006321648816, - "loss": 0.0088, + "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 5277638.0, "repeat_count": 0.0, - "routers_loss": 0.0022951713763177395, + "routers_loss": 0.002211218234151602, "skip_count": 0.0, "step": 3272, "text_loss": 0.20220105350017548 @@ -31101,13 +31101,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.044921875, + "grad_norm": 0.04638671875, "learning_rate": 0.0008268664758182589, - "loss": 0.0077, + "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 5280638.0, "repeat_count": 1.0, - "routers_loss": 0.008325734175741673, + "routers_loss": 0.010536720044910908, "skip_count": 0.0, "step": 3274, "text_loss": 0.7579061388969421 @@ -31120,32 +31120,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.0439453125, "learning_rate": 0.0008266321942157859, - "loss": 0.007, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 5283847.0, "repeat_count": 0.0, - "routers_loss": 0.0017014809418469667, + "routers_loss": 0.0017158017726615071, "skip_count": 0.0, "step": 3276, "text_loss": 0.669302761554718 }, { - "acc_repeat": 1.0, + "acc_repeat": 0.800000011920929, "acc_skip": 1.0, - "avg_layers": 29.0, + "avg_layers": 28.0, "epoch": 15.389785735250953, - "f1_execute": 1.0, - "f1_repeat": 1.0, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.888888955116272, "f1_skip": 1.0, - "grad_norm": 0.06787109375, + "grad_norm": 0.06005859375, "learning_rate": 0.0008263977874472399, - "loss": 0.0089, - "macro_f1": 1.0, + "loss": 0.0088, + "macro_f1": 0.9544159770011902, "num_tokens": 5286627.0, "repeat_count": 5.0, - "routers_loss": 0.009527196176350117, + "routers_loss": 0.011220700107514858, "skip_count": 4.0, "step": 3278, "text_loss": 0.8703984022140503 @@ -31158,13 +31158,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.060546875, + "grad_norm": 0.05615234375, "learning_rate": 0.0008261632556024461, - "loss": 0.01, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 5289766.0, "repeat_count": 0.0, - "routers_loss": 0.0025269081816077232, + "routers_loss": 0.0020442772656679153, "skip_count": 0.0, "step": 3280, "text_loss": 0.5009346008300781 @@ -31177,13 +31177,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.11474609375, + "grad_norm": 0.10107421875, "learning_rate": 0.0008259285987712774, - "loss": 0.0108, + "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 5293010.0, "repeat_count": 0.0, - "routers_loss": 0.005710822530090809, + "routers_loss": 0.005645765457302332, "skip_count": 0.0, "step": 3282, "text_loss": 0.2546011209487915 @@ -31196,13 +31196,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0419921875, + "grad_norm": 0.042236328125, "learning_rate": 0.0008256938170436549, - "loss": 0.0114, + "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 5296732.0, "repeat_count": 0.0, - "routers_loss": 0.0028946297243237495, + "routers_loss": 0.0027385836001485586, "skip_count": 2.0, "step": 3284, "text_loss": 0.5244000554084778 @@ -31217,11 +31217,11 @@ "f1_skip": 1.0, "grad_norm": 0.0296630859375, "learning_rate": 0.0008254589105095473, - "loss": 0.0059, + "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 5299926.0, "repeat_count": 1.0, - "routers_loss": 0.007981270551681519, + "routers_loss": 0.007451715879142284, "skip_count": 1.0, "step": 3286, "text_loss": 0.28979742527008057 @@ -31234,13 +31234,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0238037109375, + "grad_norm": 0.0218505859375, "learning_rate": 0.0008252238792589711, - "loss": 0.0085, + "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 5303006.0, "repeat_count": 0.0, - "routers_loss": 0.005524218548089266, + "routers_loss": 0.004805843345820904, "skip_count": 2.0, "step": 3288, "text_loss": 0.5131978392601013 @@ -31253,13 +31253,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03857421875, + "grad_norm": 0.038818359375, "learning_rate": 0.000824988723381991, - "loss": 0.0092, + "loss": 0.0091, "macro_f1": 0.3272727429866791, "num_tokens": 5306953.0, "repeat_count": 0.0, - "routers_loss": 0.01160401664674282, + "routers_loss": 0.010639613494277, "skip_count": 1.0, "step": 3290, "text_loss": 0.4901447296142578 @@ -31272,13 +31272,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.033935546875, + "grad_norm": 0.044189453125, "learning_rate": 0.0008247534429687191, - "loss": 0.0069, + "loss": 0.007, "macro_f1": 0.5492662787437439, "num_tokens": 5310516.0, "repeat_count": 0.0, - "routers_loss": 0.014068983495235443, + "routers_loss": 0.013625577092170715, "skip_count": 2.0, "step": 3292, "text_loss": 0.2124534696340561 @@ -31291,13 +31291,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.041748046875, "learning_rate": 0.0008245180381093152, - "loss": 0.0116, + "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 5313959.0, "repeat_count": 0.0, - "routers_loss": 0.00520911393687129, + "routers_loss": 0.004958513658493757, "skip_count": 1.0, "step": 3294, "text_loss": 0.46682238578796387 @@ -31310,13 +31310,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.038818359375, + "grad_norm": 0.0400390625, "learning_rate": 0.0008242825088939867, - "loss": 0.0085, + "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 5316609.0, "repeat_count": 0.0, - "routers_loss": 0.004490343388170004, + "routers_loss": 0.003962756600230932, "skip_count": 0.0, "step": 3296, "text_loss": 0.7010108232498169 @@ -31329,13 +31329,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.04052734375, "learning_rate": 0.0008240468554129892, - "loss": 0.0078, + "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5319638.0, "repeat_count": 0.0, - "routers_loss": 0.0006864524912089109, + "routers_loss": 0.0006996620795689523, "skip_count": 0.0, "step": 3298, "text_loss": 0.4966355860233307 @@ -31348,13 +31348,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.0341796875, "learning_rate": 0.0008238110777566255, "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 5323019.0, "repeat_count": 0.0, - "routers_loss": 0.0017158432165160775, + "routers_loss": 0.0016031896229833364, "skip_count": 0.0, "step": 3300, "text_loss": 0.38668957352638245 @@ -31367,13 +31367,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.0303955078125, "learning_rate": 0.0008235751760152459, - "loss": 0.0064, + "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 5326099.0, "repeat_count": 2.0, - "routers_loss": 0.0037166383117437363, + "routers_loss": 0.00344281829893589, "skip_count": 2.0, "step": 3302, "text_loss": 0.5330720543861389 @@ -31386,13 +31386,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05126953125, + "grad_norm": 0.06005859375, "learning_rate": 0.0008233391502792484, - "loss": 0.0073, + "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5328993.0, "repeat_count": 0.0, - "routers_loss": 0.008341175504028797, + "routers_loss": 0.007886730134487152, "skip_count": 1.0, "step": 3304, "text_loss": 0.5470269322395325 @@ -31405,13 +31405,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03271484375, + "grad_norm": 0.034423828125, "learning_rate": 0.0008231030006390786, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 5331554.0, "repeat_count": 0.0, - "routers_loss": 0.008380163460969925, + "routers_loss": 0.008180000819265842, "skip_count": 1.0, "step": 3306, "text_loss": 0.4023340344429016 @@ -31424,13 +31424,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0242919921875, + "grad_norm": 0.02587890625, "learning_rate": 0.0008228667271852294, - "loss": 0.0062, + "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 5335712.0, "repeat_count": 0.0, - "routers_loss": 0.00030099941068328917, + "routers_loss": 0.0002942821884062141, "skip_count": 0.0, "step": 3308, "text_loss": 0.5306711792945862 @@ -31443,13 +31443,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0615234375, + "grad_norm": 0.05908203125, "learning_rate": 0.0008226303300082414, - "loss": 0.0095, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 5338701.0, "repeat_count": 0.0, - "routers_loss": 0.0006003376329317689, + "routers_loss": 0.0006134595023468137, "skip_count": 0.0, "step": 3310, "text_loss": 0.5906263589859009 @@ -31462,13 +31462,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02734375, + "grad_norm": 0.02880859375, "learning_rate": 0.0008223938091987022, - "loss": 0.0073, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5342274.0, "repeat_count": 0.0, - "routers_loss": 0.0017984671285375953, + "routers_loss": 0.0016656654188409448, "skip_count": 0.0, "step": 3312, "text_loss": 0.5201764106750488 @@ -31481,13 +31481,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.055419921875, + "grad_norm": 0.052001953125, "learning_rate": 0.0008221571648472472, - "loss": 0.0066, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5345185.0, "repeat_count": 0.0, - "routers_loss": 0.003994898404926062, + "routers_loss": 0.0038612703792750835, "skip_count": 0.0, "step": 3314, "text_loss": 0.36633720993995667 @@ -31500,13 +31500,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.03369140625, "learning_rate": 0.0008219203970445589, "loss": 0.011, "macro_f1": 0.3272727429866791, "num_tokens": 5348804.0, "repeat_count": 0.0, - "routers_loss": 0.009415820240974426, + "routers_loss": 0.009782899171113968, "skip_count": 1.0, "step": 3316, "text_loss": 0.3117460012435913 @@ -31519,13 +31519,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.053955078125, + "grad_norm": 0.055908203125, "learning_rate": 0.0008216835058813672, - "loss": 0.0091, + "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 5351896.0, "repeat_count": 0.0, - "routers_loss": 0.006483082659542561, + "routers_loss": 0.007713229861110449, "skip_count": 0.0, "step": 3318, "text_loss": 0.253496378660202 @@ -31538,13 +31538,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02880859375, + "grad_norm": 0.03173828125, "learning_rate": 0.0008214464914484492, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 5355058.0, "repeat_count": 0.0, - "routers_loss": 0.006275791209191084, + "routers_loss": 0.006227815989404917, "skip_count": 2.0, "step": 3320, "text_loss": 0.32693132758140564 @@ -31557,13 +31557,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.03271484375, "learning_rate": 0.0008212093538366292, "loss": 0.0099, "macro_f1": 0.3333333432674408, "num_tokens": 5358365.0, "repeat_count": 0.0, - "routers_loss": 0.0027182933408766985, + "routers_loss": 0.002601418411359191, "skip_count": 0.0, "step": 3322, "text_loss": 0.40394455194473267 @@ -31576,13 +31576,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.031982421875, "learning_rate": 0.000820972093136779, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 5360981.0, "repeat_count": 0.0, - "routers_loss": 0.005600054748356342, + "routers_loss": 0.005545300897210836, "skip_count": 3.0, "step": 3324, "text_loss": 0.6758295893669128 @@ -31595,13 +31595,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.05078125, "learning_rate": 0.0008207347094398172, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 5364018.0, "repeat_count": 1.0, - "routers_loss": 0.0020965971052646637, + "routers_loss": 0.001924700103700161, "skip_count": 0.0, "step": 3326, "text_loss": 0.5196860432624817 @@ -31614,13 +31614,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0311279296875, + "grad_norm": 0.0299072265625, "learning_rate": 0.0008204972028367097, - "loss": 0.006, + "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 5366986.0, "repeat_count": 0.0, - "routers_loss": 0.011729889549314976, + "routers_loss": 0.012254828587174416, "skip_count": 1.0, "step": 3328, "text_loss": 0.24661913514137268 @@ -31633,13 +31633,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.038818359375, "learning_rate": 0.0008202595734184694, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5371463.0, "repeat_count": 0.0, - "routers_loss": 0.004913534037768841, + "routers_loss": 0.005094083491712809, "skip_count": 0.0, "step": 3330, "text_loss": 0.2525769770145416 @@ -31652,13 +31652,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.039794921875, + "grad_norm": 0.040283203125, "learning_rate": 0.0008200218212761566, - "loss": 0.0111, + "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 5374823.0, "repeat_count": 1.0, - "routers_loss": 0.0028079606126993895, + "routers_loss": 0.0025883198250085115, "skip_count": 0.0, "step": 3332, "text_loss": 0.21849912405014038 @@ -31671,13 +31671,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031982421875, + "grad_norm": 0.030029296875, "learning_rate": 0.000819783946500878, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5377640.0, "repeat_count": 0.0, - "routers_loss": 0.008404970169067383, + "routers_loss": 0.008240507915616035, "skip_count": 0.0, "step": 3334, "text_loss": 0.2662734091281891 @@ -31690,13 +31690,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.048583984375, + "grad_norm": 0.050537109375, "learning_rate": 0.000819545949183788, - "loss": 0.0101, + "loss": 0.01, "macro_f1": 0.5934640765190125, "num_tokens": 5380593.0, "repeat_count": 0.0, - "routers_loss": 0.040179044008255005, + "routers_loss": 0.038378193974494934, "skip_count": 3.0, "step": 3336, "text_loss": 0.2431795746088028 @@ -31709,13 +31709,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.040283203125, "learning_rate": 0.0008193078294160874, - "loss": 0.0096, + "loss": 0.0097, "macro_f1": 1.0, "num_tokens": 5384487.0, "repeat_count": 1.0, - "routers_loss": 0.005122583359479904, + "routers_loss": 0.005926199723035097, "skip_count": 1.0, "step": 3338, "text_loss": 0.5663705468177795 @@ -31728,13 +31728,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.031494140625, + "grad_norm": 0.032470703125, "learning_rate": 0.0008190695872890242, - "loss": 0.0056, + "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 5387511.0, "repeat_count": 0.0, - "routers_loss": 0.012232085689902306, + "routers_loss": 0.010842559859156609, "skip_count": 2.0, "step": 3340, "text_loss": 0.11517292261123657 @@ -31747,13 +31747,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.029296875, + "grad_norm": 0.0283203125, "learning_rate": 0.0008188312228938933, - "loss": 0.009, + "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 5390698.0, "repeat_count": 0.0, - "routers_loss": 0.0011168667115271091, + "routers_loss": 0.001304097007960081, "skip_count": 0.0, "step": 3342, "text_loss": 0.4827076196670532 @@ -31766,13 +31766,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.03515625, + "grad_norm": 0.037841796875, "learning_rate": 0.0008185927363220363, - "loss": 0.0088, + "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 5393778.0, "repeat_count": 1.0, - "routers_loss": 0.005202370695769787, + "routers_loss": 0.005354117136448622, "skip_count": 0.0, "step": 3344, "text_loss": 0.44467049837112427 @@ -31785,13 +31785,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.040771484375, "learning_rate": 0.0008183541276648418, - "loss": 0.0081, + "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 5396925.0, "repeat_count": 0.0, - "routers_loss": 0.005000839475542307, + "routers_loss": 0.004800073802471161, "skip_count": 2.0, "step": 3346, "text_loss": 0.2032834142446518 @@ -31804,13 +31804,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025634765625, + "grad_norm": 0.027587890625, "learning_rate": 0.0008181153970137449, - "loss": 0.0059, + "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 5400522.0, "repeat_count": 0.0, - "routers_loss": 0.0020684092305600643, + "routers_loss": 0.0021674633026123047, "skip_count": 0.0, "step": 3348, "text_loss": 0.4507528841495514 @@ -31823,13 +31823,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.0439453125, + "grad_norm": 0.051513671875, "learning_rate": 0.0008178765444602278, "loss": 0.0117, "macro_f1": 0.8820862174034119, "num_tokens": 5403526.0, "repeat_count": 2.0, - "routers_loss": 0.040753237903118134, + "routers_loss": 0.04263930395245552, "skip_count": 2.0, "step": 3350, "text_loss": 0.3606615960597992 @@ -31842,13 +31842,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.03564453125, + "grad_norm": 0.033447265625, "learning_rate": 0.0008176375700958194, - "loss": 0.0089, + "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 5407127.0, "repeat_count": 1.0, - "routers_loss": 0.007767915725708008, + "routers_loss": 0.006953123956918716, "skip_count": 0.0, "step": 3352, "text_loss": 0.2290353775024414 @@ -31861,13 +31861,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.035400390625, "learning_rate": 0.0008173984740120948, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 5410829.0, "repeat_count": 0.0, - "routers_loss": 0.0016073459992185235, + "routers_loss": 0.0014363783411681652, "skip_count": 0.0, "step": 3354, "text_loss": 0.4220392405986786 @@ -31880,13 +31880,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02880859375, + "grad_norm": 0.031982421875, "learning_rate": 0.0008171592563006762, - "loss": 0.0078, + "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 5414152.0, "repeat_count": 0.0, - "routers_loss": 0.0016132282325997949, + "routers_loss": 0.00202389364130795, "skip_count": 1.0, "step": 3356, "text_loss": 0.37729766964912415 @@ -31899,13 +31899,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037353515625, + "grad_norm": 0.041015625, "learning_rate": 0.0008169199170532323, - "loss": 0.007, + "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 5417312.0, "repeat_count": 0.0, - "routers_loss": 0.007077203597873449, + "routers_loss": 0.006253739818930626, "skip_count": 2.0, "step": 3358, "text_loss": 0.1304289996623993 @@ -31918,13 +31918,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.07568359375, + "grad_norm": 0.0703125, "learning_rate": 0.0008166804563614785, - "loss": 0.0088, + "loss": 0.0084, "macro_f1": 1.0, "num_tokens": 5421227.0, "repeat_count": 2.0, - "routers_loss": 0.01628093235194683, + "routers_loss": 0.01622140221297741, "skip_count": 2.0, "step": 3360, "text_loss": 0.298664391040802 @@ -31937,13 +31937,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0250244140625, + "grad_norm": 0.024169921875, "learning_rate": 0.0008164408743171763, - "loss": 0.0064, + "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 5424646.0, "repeat_count": 1.0, - "routers_loss": 0.003795142285525799, + "routers_loss": 0.0037176944315433502, "skip_count": 2.0, "step": 3362, "text_loss": 0.12147632241249084 @@ -31956,13 +31956,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037841796875, + "grad_norm": 0.046630859375, "learning_rate": 0.0008162011710121339, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 5427897.0, "repeat_count": 0.0, - "routers_loss": 0.0024164009373635054, + "routers_loss": 0.0020403533708304167, "skip_count": 1.0, "step": 3364, "text_loss": 0.2656533420085907 @@ -31975,32 +31975,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.041748046875, "learning_rate": 0.0008159613465382066, - "loss": 0.0071, + "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5430474.0, "repeat_count": 0.0, - "routers_loss": 0.002314126119017601, + "routers_loss": 0.0018634048756211996, "skip_count": 0.0, "step": 3366, "text_loss": 0.9133086204528809 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 15.812444966245964, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.058837890625, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, "learning_rate": 0.0008157214009872951, - "loss": 0.008, - "macro_f1": 0.5492662787437439, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, "num_tokens": 5433113.0, "repeat_count": 0.0, - "routers_loss": 0.014630996622145176, + "routers_loss": 0.012944488786160946, "skip_count": 2.0, "step": 3368, "text_loss": 0.24352453649044037 @@ -32013,13 +32013,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04833984375, + "grad_norm": 0.05712890625, "learning_rate": 0.0008154813344513472, - "loss": 0.0141, + "loss": 0.0143, "macro_f1": 0.6666666865348816, "num_tokens": 5436259.0, "repeat_count": 0.0, - "routers_loss": 0.0023453824687749147, + "routers_loss": 0.002347963862121105, "skip_count": 2.0, "step": 3370, "text_loss": 0.7601244449615479 @@ -32032,13 +32032,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0291748046875, + "grad_norm": 0.031494140625, "learning_rate": 0.0008152411470223568, - "loss": 0.0078, + "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 5439126.0, "repeat_count": 0.0, - "routers_loss": 0.0015595925506204367, + "routers_loss": 0.0016609140438959002, "skip_count": 0.0, "step": 3372, "text_loss": 0.5551947355270386 @@ -32051,13 +32051,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.04345703125, "learning_rate": 0.0008150008387923643, - "loss": 0.0067, + "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 5442739.0, "repeat_count": 0.0, - "routers_loss": 0.008187411352992058, + "routers_loss": 0.008321396075189114, "skip_count": 0.0, "step": 3374, "text_loss": 0.25028282403945923 @@ -32070,13 +32070,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.10302734375, + "grad_norm": 0.08544921875, "learning_rate": 0.000814760409853456, - "loss": 0.0109, + "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 5445247.0, "repeat_count": 2.0, - "routers_loss": 0.009705786593258381, + "routers_loss": 0.009738070890307426, "skip_count": 1.0, "step": 3376, "text_loss": 0.37271201610565186 @@ -32089,13 +32089,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0439453125, + "grad_norm": 0.042236328125, "learning_rate": 0.0008145198602977651, - "loss": 0.0084, + "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5449044.0, "repeat_count": 0.0, - "routers_loss": 0.003062802366912365, + "routers_loss": 0.0028421466704458, "skip_count": 0.0, "step": 3378, "text_loss": 0.1458655595779419 @@ -32108,13 +32108,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.095703125, + "grad_norm": 0.11474609375, "learning_rate": 0.0008142791902174701, - "loss": 0.008, + "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 5453063.0, "repeat_count": 0.0, - "routers_loss": 0.001539172139018774, + "routers_loss": 0.0015170135302469134, "skip_count": 0.0, "step": 3380, "text_loss": 0.5548722743988037 @@ -32127,13 +32127,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.031982421875, "learning_rate": 0.0008140383997047966, - "loss": 0.0082, + "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 5455814.0, "repeat_count": 0.0, - "routers_loss": 0.002227923832833767, + "routers_loss": 0.0022444510832428932, "skip_count": 1.0, "step": 3382, "text_loss": 0.8034513592720032 @@ -32146,13 +32146,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037109375, + "grad_norm": 0.03369140625, "learning_rate": 0.000813797488852016, - "loss": 0.0063, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5459392.0, "repeat_count": 0.0, - "routers_loss": 0.0003921810712199658, + "routers_loss": 0.00038578867679461837, "skip_count": 0.0, "step": 3384, "text_loss": 0.6940088868141174 @@ -32165,13 +32165,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0517578125, + "grad_norm": 0.045654296875, "learning_rate": 0.0008135564577514458, - "loss": 0.0116, + "loss": 0.011, "macro_f1": 0.3333333432674408, "num_tokens": 5462413.0, "repeat_count": 0.0, - "routers_loss": 0.001971066929399967, + "routers_loss": 0.0019727381877601147, "skip_count": 0.0, "step": 3386, "text_loss": 0.5124650597572327 @@ -32184,13 +32184,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0869140625, + "grad_norm": 0.099609375, "learning_rate": 0.0008133153064954495, - "loss": 0.0108, + "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 5465552.0, "repeat_count": 0.0, - "routers_loss": 0.0018206594977527857, + "routers_loss": 0.0019896167796105146, "skip_count": 0.0, "step": 3388, "text_loss": 0.4292517900466919 @@ -32203,13 +32203,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.035400390625, "learning_rate": 0.0008130740351764367, - "loss": 0.0068, + "loss": 0.007, "macro_f1": 1.0, "num_tokens": 5468573.0, "repeat_count": 1.0, - "routers_loss": 0.003323496552184224, + "routers_loss": 0.0030118159484118223, "skip_count": 1.0, "step": 3390, "text_loss": 0.48903173208236694 @@ -32222,13 +32222,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.024658203125, + "grad_norm": 0.0216064453125, "learning_rate": 0.000812832643886863, - "loss": 0.0058, + "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 5471547.0, "repeat_count": 0.0, - "routers_loss": 0.006201856769621372, + "routers_loss": 0.005084246397018433, "skip_count": 2.0, "step": 3392, "text_loss": 0.35789889097213745 @@ -32241,13 +32241,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.0390625, "learning_rate": 0.0008125911327192299, - "loss": 0.009, + "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5474331.0, "repeat_count": 0.0, - "routers_loss": 0.0009058464202098548, + "routers_loss": 0.0008874498889781535, "skip_count": 0.0, "step": 3394, "text_loss": 0.6267408728599548 @@ -32260,13 +32260,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.03173828125, "learning_rate": 0.0008123495017660851, - "loss": 0.0059, + "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5477633.0, "repeat_count": 0.0, - "routers_loss": 0.00202162005007267, + "routers_loss": 0.001794386887922883, "skip_count": 0.0, "step": 3396, "text_loss": 0.3701885938644409 @@ -32279,13 +32279,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04296875, + "grad_norm": 0.042724609375, "learning_rate": 0.0008121077511200221, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5481277.0, "repeat_count": 0.0, - "routers_loss": 0.0022049983963370323, + "routers_loss": 0.002140481723472476, "skip_count": 0.0, "step": 3398, "text_loss": 0.6362857818603516 @@ -32298,13 +32298,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05322265625, + "grad_norm": 0.0556640625, "learning_rate": 0.00081186588087368, - "loss": 0.0115, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 5484237.0, "repeat_count": 0.0, - "routers_loss": 0.0008255304419435561, + "routers_loss": 0.000867189432028681, "skip_count": 0.0, "step": 3400, "text_loss": 1.0847382545471191 @@ -32317,13 +32317,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0274658203125, + "grad_norm": 0.0296630859375, "learning_rate": 0.0008116238911197442, - "loss": 0.0067, + "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 5487423.0, "repeat_count": 0.0, - "routers_loss": 0.0029532560147345066, + "routers_loss": 0.0029817656613886356, "skip_count": 0.0, "step": 3402, "text_loss": 0.3813740313053131 @@ -32336,13 +32336,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04443359375, + "grad_norm": 0.049560546875, "learning_rate": 0.0008113817819509454, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 5490155.0, "repeat_count": 0.0, - "routers_loss": 0.0038054194301366806, + "routers_loss": 0.0035141287371516228, "skip_count": 0.0, "step": 3404, "text_loss": 0.2113083451986313 @@ -32355,13 +32355,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042236328125, + "grad_norm": 0.04443359375, "learning_rate": 0.0008111395534600603, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 5493415.0, "repeat_count": 0.0, - "routers_loss": 0.0034561967477202415, + "routers_loss": 0.003317659953609109, "skip_count": 0.0, "step": 3406, "text_loss": 0.5869330167770386 @@ -32374,13 +32374,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.052001953125, "learning_rate": 0.0008108972057399114, - "loss": 0.0131, + "loss": 0.0123, "macro_f1": 0.6666666865348816, "num_tokens": 5496032.0, "repeat_count": 0.0, - "routers_loss": 0.0036799898371100426, + "routers_loss": 0.003833734430372715, "skip_count": 2.0, "step": 3408, "text_loss": 0.2938928008079529 @@ -32393,13 +32393,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08740234375, + "grad_norm": 0.11328125, "learning_rate": 0.0008106547388833669, - "loss": 0.006, + "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 5498890.0, "repeat_count": 0.0, - "routers_loss": 0.0026391225401312113, + "routers_loss": 0.002622978063300252, "skip_count": 1.0, "step": 3410, "text_loss": 0.3130980432033539 @@ -32412,13 +32412,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.03564453125, "learning_rate": 0.0008104121529833402, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 5502010.0, "repeat_count": 1.0, - "routers_loss": 0.00991886481642723, + "routers_loss": 0.007447598036378622, "skip_count": 0.0, "step": 3412, "text_loss": 0.4413072466850281 @@ -32431,13 +32431,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.03076171875, "learning_rate": 0.000810169448132791, - "loss": 0.0096, + "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 5505212.0, "repeat_count": 0.0, - "routers_loss": 0.0031243201810866594, + "routers_loss": 0.0031087708193808794, "skip_count": 1.0, "step": 3414, "text_loss": 0.2910428047180176 @@ -32450,13 +32450,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.04345703125, "learning_rate": 0.0008099266244247243, - "loss": 0.0083, + "loss": 0.0082, "macro_f1": 0.3272727429866791, "num_tokens": 5508755.0, "repeat_count": 0.0, - "routers_loss": 0.02572118304669857, + "routers_loss": 0.02510393038392067, "skip_count": 1.0, "step": 3416, "text_loss": 0.33022749423980713 @@ -32469,13 +32469,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0306396484375, + "grad_norm": 0.03662109375, "learning_rate": 0.0008096836819521903, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 5512034.0, "repeat_count": 0.0, - "routers_loss": 0.001839894917793572, + "routers_loss": 0.0020537273958325386, "skip_count": 1.0, "step": 3418, "text_loss": 0.4731218218803406 @@ -32488,32 +32488,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.031494140625, + "grad_norm": 0.0341796875, "learning_rate": 0.0008094406208082853, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5515707.0, "repeat_count": 0.0, - "routers_loss": 0.0039922320283949375, + "routers_loss": 0.004218162503093481, "skip_count": 2.0, "step": 3420, "text_loss": 0.23429590463638306 }, { "acc_repeat": 1.0, - "acc_skip": 1.0, - "avg_layers": 26.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, "epoch": 16.065746991488112, - "f1_execute": 1.0, + "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, - "f1_skip": 1.0, - "grad_norm": 0.0703125, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0869140625, "learning_rate": 0.0008091974410861507, - "loss": 0.0066, - "macro_f1": 1.0, + "loss": 0.0069, + "macro_f1": 0.9265305995941162, "num_tokens": 5518436.0, "repeat_count": 1.0, - "routers_loss": 0.012939191423356533, + "routers_loss": 0.013488355092704296, "skip_count": 3.0, "step": 3422, "text_loss": 0.45768749713897705 @@ -32526,13 +32526,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037109375, + "grad_norm": 0.03369140625, "learning_rate": 0.0008089541428789733, - "loss": 0.01, + "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 5522368.0, "repeat_count": 0.0, - "routers_loss": 0.001064157928340137, + "routers_loss": 0.0010335417464375496, "skip_count": 1.0, "step": 3424, "text_loss": 0.43423423171043396 @@ -32545,13 +32545,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0299072265625, + "grad_norm": 0.0306396484375, "learning_rate": 0.0008087107262799855, - "loss": 0.0047, + "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 5526061.0, "repeat_count": 0.0, - "routers_loss": 0.0024185231886804104, + "routers_loss": 0.002134323585778475, "skip_count": 0.0, "step": 3426, "text_loss": 0.4031757414340973 @@ -32564,13 +32564,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08203125, + "grad_norm": 0.1318359375, "learning_rate": 0.0008084671913824651, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 5529284.0, "repeat_count": 0.0, - "routers_loss": 0.009645994752645493, + "routers_loss": 0.0097216060385108, "skip_count": 2.0, "step": 3428, "text_loss": 0.2836039960384369 @@ -32583,13 +32583,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.022705078125, + "grad_norm": 0.0220947265625, "learning_rate": 0.000808223538279735, - "loss": 0.0051, + "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 5532159.0, "repeat_count": 0.0, - "routers_loss": 0.0017972104251384735, + "routers_loss": 0.001684269867837429, "skip_count": 0.0, "step": 3430, "text_loss": 0.5804527401924133 @@ -32602,13 +32602,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.04248046875, + "grad_norm": 0.0390625, "learning_rate": 0.0008079797670651637, "loss": 0.008, "macro_f1": 1.0, "num_tokens": 5536050.0, "repeat_count": 1.0, - "routers_loss": 0.015138664282858372, + "routers_loss": 0.013918434269726276, "skip_count": 1.0, "step": 3432, "text_loss": 0.31325826048851013 @@ -32621,13 +32621,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042724609375, + "grad_norm": 0.0400390625, "learning_rate": 0.0008077358778321647, - "loss": 0.0114, + "loss": 0.011, "macro_f1": 0.3333333432674408, "num_tokens": 5538885.0, "repeat_count": 0.0, - "routers_loss": 0.0007666898309253156, + "routers_loss": 0.0007751787197776139, "skip_count": 0.0, "step": 3434, "text_loss": 0.783108115196228 @@ -32640,13 +32640,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.033935546875, "learning_rate": 0.0008074918706741966, "loss": 0.0063, "macro_f1": 0.9262410998344421, "num_tokens": 5541909.0, "repeat_count": 3.0, - "routers_loss": 0.024132754653692245, + "routers_loss": 0.021819550544023514, "skip_count": 2.0, "step": 3436, "text_loss": 0.6558083295822144 @@ -32659,13 +32659,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03173828125, + "grad_norm": 0.02880859375, "learning_rate": 0.0008072477456847638, - "loss": 0.0061, + "loss": 0.0057, "macro_f1": 0.3272727429866791, "num_tokens": 5545101.0, "repeat_count": 1.0, - "routers_loss": 0.03225114569067955, + "routers_loss": 0.03309348225593567, "skip_count": 0.0, "step": 3438, "text_loss": 0.9877075552940369 @@ -32678,13 +32678,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.04931640625, "learning_rate": 0.0008070035029574151, - "loss": 0.0062, + "loss": 0.006, "macro_f1": 1.0, "num_tokens": 5548971.0, "repeat_count": 1.0, - "routers_loss": 0.008569693192839622, + "routers_loss": 0.008696741424500942, "skip_count": 1.0, "step": 3440, "text_loss": 0.24766330420970917 @@ -32697,13 +32697,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.033447265625, "learning_rate": 0.000806759142585745, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 5552174.0, "repeat_count": 0.0, - "routers_loss": 0.004438123665750027, + "routers_loss": 0.004240929149091244, "skip_count": 3.0, "step": 3442, "text_loss": 0.37255001068115234 @@ -32716,13 +32716,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0615234375, + "grad_norm": 0.05322265625, "learning_rate": 0.0008065146646633927, - "loss": 0.0091, + "loss": 0.0088, "macro_f1": 0.6666666865348816, "num_tokens": 5555005.0, "repeat_count": 0.0, - "routers_loss": 0.013728363439440727, + "routers_loss": 0.014345484785735607, "skip_count": 1.0, "step": 3444, "text_loss": 0.26157206296920776 @@ -32735,13 +32735,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.05810546875, + "grad_norm": 0.06005859375, "learning_rate": 0.0008062700692840428, "loss": 0.0083, "macro_f1": 1.0, "num_tokens": 5559127.0, "repeat_count": 1.0, - "routers_loss": 0.008383825421333313, + "routers_loss": 0.008315163664519787, "skip_count": 2.0, "step": 3446, "text_loss": 0.21971040964126587 @@ -32754,13 +32754,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.04443359375, + "grad_norm": 0.056396484375, "learning_rate": 0.0008060253565414246, "loss": 0.009, "macro_f1": 0.5934640765190125, "num_tokens": 5562254.0, "repeat_count": 0.0, - "routers_loss": 0.009948022663593292, + "routers_loss": 0.009582413360476494, "skip_count": 3.0, "step": 3448, "text_loss": 0.6758295893669128 @@ -32773,13 +32773,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0361328125, + "grad_norm": 0.038818359375, "learning_rate": 0.0008057805265293124, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 5565515.0, "repeat_count": 0.0, - "routers_loss": 0.0025822422467172146, + "routers_loss": 0.002429503947496414, "skip_count": 0.0, "step": 3450, "text_loss": 0.696592390537262 @@ -32792,13 +32792,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.041015625, "learning_rate": 0.0008055355793415257, - "loss": 0.0091, + "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5568392.0, "repeat_count": 0.0, - "routers_loss": 0.0008777108159847558, + "routers_loss": 0.0007724192109890282, "skip_count": 0.0, "step": 3452, "text_loss": 0.7092870473861694 @@ -32811,13 +32811,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.033447265625, "learning_rate": 0.0008052905150719285, - "loss": 0.01, + "loss": 0.0099, "macro_f1": 0.3333333432674408, "num_tokens": 5571090.0, "repeat_count": 0.0, - "routers_loss": 0.0009592860005795956, + "routers_loss": 0.0010859938338398933, "skip_count": 0.0, "step": 3454, "text_loss": 0.6593860387802124 @@ -32832,11 +32832,11 @@ "f1_skip": 1.0, "grad_norm": 0.04150390625, "learning_rate": 0.0008050453338144301, - "loss": 0.0077, + "loss": 0.0072, "macro_f1": 1.0, "num_tokens": 5574552.0, "repeat_count": 1.0, - "routers_loss": 0.0029973683413118124, + "routers_loss": 0.0030258705373853445, "skip_count": 1.0, "step": 3456, "text_loss": 0.3479384481906891 @@ -32849,13 +32849,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.0380859375, "learning_rate": 0.0008048000356629844, - "loss": 0.0068, + "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 5577484.0, "repeat_count": 0.0, - "routers_loss": 0.005223365034908056, + "routers_loss": 0.005052885971963406, "skip_count": 2.0, "step": 3458, "text_loss": 0.21858671307563782 @@ -32868,13 +32868,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.029541015625, "learning_rate": 0.0008045546207115901, - "loss": 0.0074, + "loss": 0.0068, "macro_f1": 1.0, "num_tokens": 5581605.0, "repeat_count": 1.0, - "routers_loss": 0.010660176165401936, + "routers_loss": 0.009976249188184738, "skip_count": 3.0, "step": 3460, "text_loss": 0.16868001222610474 @@ -32887,13 +32887,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.032958984375, "learning_rate": 0.0008043090890542904, - "loss": 0.008, + "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5584994.0, "repeat_count": 0.0, - "routers_loss": 0.003038279013708234, + "routers_loss": 0.00270817126147449, "skip_count": 0.0, "step": 3462, "text_loss": 0.785690426826477 @@ -32906,13 +32906,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03125, + "grad_norm": 0.03173828125, "learning_rate": 0.0008040634407851739, - "loss": 0.0057, + "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 5588067.0, "repeat_count": 0.0, - "routers_loss": 0.001855011098086834, + "routers_loss": 0.0018436965765431523, "skip_count": 0.0, "step": 3464, "text_loss": 0.5006644129753113 @@ -32925,13 +32925,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.030029296875, + "grad_norm": 0.028076171875, "learning_rate": 0.0008038176759983731, - "loss": 0.0064, + "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5590789.0, "repeat_count": 0.0, - "routers_loss": 0.008276397362351418, + "routers_loss": 0.008516279980540276, "skip_count": 2.0, "step": 3466, "text_loss": 0.20963478088378906 @@ -32944,13 +32944,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04052734375, + "grad_norm": 0.0361328125, "learning_rate": 0.0008035717947880659, - "loss": 0.0092, + "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 5593472.0, "repeat_count": 0.0, - "routers_loss": 0.0016371201490983367, + "routers_loss": 0.0016293043736368418, "skip_count": 0.0, "step": 3468, "text_loss": 0.7376078963279724 @@ -32963,13 +32963,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.036376953125, "learning_rate": 0.0008033257972484742, - "loss": 0.0081, + "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5596108.0, "repeat_count": 0.0, - "routers_loss": 0.002605364890769124, + "routers_loss": 0.002364142332226038, "skip_count": 0.0, "step": 3470, "text_loss": 0.5156455039978027 @@ -32982,13 +32982,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.037841796875, "learning_rate": 0.0008030796834738649, - "loss": 0.0083, + "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 5599103.0, "repeat_count": 0.0, - "routers_loss": 0.00892016664147377, + "routers_loss": 0.008872323669493198, "skip_count": 0.0, "step": 3472, "text_loss": 0.2996419668197632 @@ -33001,13 +33001,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037841796875, + "grad_norm": 0.043701171875, "learning_rate": 0.0008028334535585491, - "loss": 0.0089, + "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 5602410.0, "repeat_count": 0.0, - "routers_loss": 0.01095602847635746, + "routers_loss": 0.011508257128298283, "skip_count": 3.0, "step": 3474, "text_loss": 0.25438693165779114 @@ -33020,13 +33020,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.037353515625, + "grad_norm": 0.038330078125, "learning_rate": 0.0008025871075968827, - "loss": 0.0105, + "loss": 0.0106, "macro_f1": 1.0, "num_tokens": 5605424.0, "repeat_count": 2.0, - "routers_loss": 0.016052749007940292, + "routers_loss": 0.017225435003638268, "skip_count": 2.0, "step": 3476, "text_loss": 0.2549574077129364 @@ -33039,13 +33039,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, - "grad_norm": 0.02880859375, + "grad_norm": 0.028564453125, "learning_rate": 0.0008023406456832657, - "loss": 0.0116, + "loss": 0.0111, "macro_f1": 0.9262410998344421, "num_tokens": 5608266.0, "repeat_count": 3.0, - "routers_loss": 0.04047509655356407, + "routers_loss": 0.039165645837783813, "skip_count": 2.0, "step": 3478, "text_loss": 0.1797947734594345 @@ -33058,13 +33058,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0272216796875, + "grad_norm": 0.026123046875, "learning_rate": 0.0008020940679121429, - "loss": 0.0073, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 5611471.0, "repeat_count": 0.0, - "routers_loss": 0.0010115962941199541, + "routers_loss": 0.0009718866203911602, "skip_count": 0.0, "step": 3480, "text_loss": 0.8267702460289001 @@ -33077,13 +33077,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.037841796875, "learning_rate": 0.0008018473743780036, - "loss": 0.0095, + "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 5615046.0, "repeat_count": 0.0, - "routers_loss": 0.006490753497928381, + "routers_loss": 0.006087122485041618, "skip_count": 2.0, "step": 3482, "text_loss": 0.7267677187919617 @@ -33096,13 +33096,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.03369140625, "learning_rate": 0.000801600565175381, - "loss": 0.0088, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 5618350.0, "repeat_count": 0.0, - "routers_loss": 0.0008378152851946652, + "routers_loss": 0.0007539413054473698, "skip_count": 0.0, "step": 3484, "text_loss": 0.5910211801528931 @@ -33115,13 +33115,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.048583984375, + "grad_norm": 0.046142578125, "learning_rate": 0.0008013536403988529, - "loss": 0.0087, + "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 5621381.0, "repeat_count": 0.0, - "routers_loss": 0.0007683819276280701, + "routers_loss": 0.0008076327503658831, "skip_count": 0.0, "step": 3486, "text_loss": 0.30616798996925354 @@ -33134,13 +33134,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.047607421875, + "grad_norm": 0.049072265625, "learning_rate": 0.0008011066001430412, "loss": 0.0086, "macro_f1": 0.6122449040412903, "num_tokens": 5624617.0, "repeat_count": 0.0, - "routers_loss": 0.02481125481426716, + "routers_loss": 0.023835813626646996, "skip_count": 4.0, "step": 3488, "text_loss": 0.3376443088054657 @@ -33153,13 +33153,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0311279296875, + "grad_norm": 0.03271484375, "learning_rate": 0.0008008594445026122, - "loss": 0.0082, + "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 5627989.0, "repeat_count": 0.0, - "routers_loss": 0.005174005404114723, + "routers_loss": 0.004226419143378735, "skip_count": 2.0, "step": 3490, "text_loss": 0.8185343146324158 @@ -33172,13 +33172,13 @@ "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.044677734375, "learning_rate": 0.0008006121735722767, "loss": 0.0084, "macro_f1": 0.32098764181137085, "num_tokens": 5632286.0, "repeat_count": 0.0, - "routers_loss": 0.03602224588394165, + "routers_loss": 0.0366671048104763, "skip_count": 2.0, "step": 3492, "text_loss": 0.2209547609090805 @@ -33191,13 +33191,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.03466796875, "learning_rate": 0.0008003647874467892, - "loss": 0.0087, + "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 5635368.0, "repeat_count": 1.0, - "routers_loss": 0.012145630083978176, + "routers_loss": 0.012956378981471062, "skip_count": 0.0, "step": 3494, "text_loss": 0.20468664169311523 @@ -33210,13 +33210,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.057861328125, + "grad_norm": 0.059814453125, "learning_rate": 0.0008001172862209485, "loss": 0.0103, "macro_f1": 0.6666666865348816, "num_tokens": 5638440.0, "repeat_count": 1.0, - "routers_loss": 0.001456267898902297, + "routers_loss": 0.0017375422175973654, "skip_count": 0.0, "step": 3496, "text_loss": 0.6647221446037292 @@ -33229,13 +33229,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0247802734375, + "grad_norm": 0.0244140625, "learning_rate": 0.0007998696699895976, - "loss": 0.0093, + "loss": 0.0091, "macro_f1": 0.6592592597007751, "num_tokens": 5641996.0, "repeat_count": 1.0, - "routers_loss": 0.028984347358345985, + "routers_loss": 0.025240756571292877, "skip_count": 5.0, "step": 3498, "text_loss": 0.23892143368721008 @@ -33248,13 +33248,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02294921875, + "grad_norm": 0.021728515625, "learning_rate": 0.0007996219388476236, - "loss": 0.0077, + "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 5645071.0, "repeat_count": 0.0, - "routers_loss": 0.006859986111521721, + "routers_loss": 0.007436830550432205, "skip_count": 1.0, "step": 3500, "text_loss": 0.7580804228782654 @@ -33267,13 +33267,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.024169921875, + "grad_norm": 0.0242919921875, "learning_rate": 0.0007993740928899571, - "loss": 0.0055, + "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 5648175.0, "repeat_count": 0.0, - "routers_loss": 0.0011989293852820992, + "routers_loss": 0.001126602990552783, "skip_count": 0.0, "step": 3502, "text_loss": 0.5281378626823425 @@ -33286,13 +33286,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031982421875, + "grad_norm": 0.04443359375, "learning_rate": 0.0007991261322115737, - "loss": 0.0056, + "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 5650973.0, "repeat_count": 0.0, - "routers_loss": 0.0007974735926836729, + "routers_loss": 0.0007907263352535665, "skip_count": 0.0, "step": 3504, "text_loss": 0.25220927596092224 @@ -33305,13 +33305,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0274658203125, + "grad_norm": 0.0262451171875, "learning_rate": 0.000798878056907492, - "loss": 0.0049, + "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 5654252.0, "repeat_count": 2.0, - "routers_loss": 0.007121780421584845, + "routers_loss": 0.006263538729399443, "skip_count": 2.0, "step": 3506, "text_loss": 0.46569153666496277 @@ -33324,13 +33324,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.0703125, "learning_rate": 0.0007986298670727752, - "loss": 0.0101, + "loss": 0.0098, "macro_f1": 0.6666666865348816, "num_tokens": 5657229.0, "repeat_count": 0.0, - "routers_loss": 0.00414140522480011, + "routers_loss": 0.004049144219607115, "skip_count": 3.0, "step": 3508, "text_loss": 0.15174436569213867 @@ -33343,13 +33343,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.058837890625, + "grad_norm": 0.0791015625, "learning_rate": 0.0007983815628025301, - "loss": 0.0073, + "loss": 0.0074, "macro_f1": 0.9262410998344421, "num_tokens": 5659974.0, "repeat_count": 2.0, - "routers_loss": 0.04618353769183159, + "routers_loss": 0.0471976138651371, "skip_count": 3.0, "step": 3510, "text_loss": 0.39072203636169434 @@ -33362,13 +33362,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.03369140625, "learning_rate": 0.000798133144191907, - "loss": 0.0084, + "loss": 0.0082, "macro_f1": 0.3272727429866791, "num_tokens": 5662893.0, "repeat_count": 0.0, - "routers_loss": 0.04054548963904381, + "routers_loss": 0.04030488431453705, "skip_count": 1.0, "step": 3512, "text_loss": 0.3562147617340088 @@ -33381,13 +33381,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.0595703125, "learning_rate": 0.0007978846113361009, - "loss": 0.0067, + "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 5666476.0, "repeat_count": 0.0, - "routers_loss": 0.007785080466419458, + "routers_loss": 0.007475079502910376, "skip_count": 1.0, "step": 3514, "text_loss": 0.26518192887306213 @@ -33400,13 +33400,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0400390625, + "grad_norm": 0.044189453125, "learning_rate": 0.0007976359643303497, - "loss": 0.0128, + "loss": 0.013, "macro_f1": 0.6666666865348816, "num_tokens": 5669647.0, "repeat_count": 0.0, - "routers_loss": 0.0057366108521819115, + "routers_loss": 0.00558585487306118, "skip_count": 2.0, "step": 3516, "text_loss": 0.29284560680389404 @@ -33419,13 +33419,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0458984375, + "grad_norm": 0.0361328125, "learning_rate": 0.0007973872032699354, - "loss": 0.0088, + "loss": 0.0082, "macro_f1": 1.0, "num_tokens": 5673491.0, "repeat_count": 1.0, - "routers_loss": 0.002753519220277667, + "routers_loss": 0.0026981087867170572, "skip_count": 1.0, "step": 3518, "text_loss": 0.35089045763015747 @@ -33438,32 +33438,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.033203125, "learning_rate": 0.000797138328250184, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 5676529.0, "repeat_count": 1.0, - "routers_loss": 0.0027982397004961967, + "routers_loss": 0.0027328627184033394, "skip_count": 0.0, "step": 3520, "text_loss": 0.41077399253845215 }, { "acc_repeat": 0.0, - "acc_skip": 0.800000011920929, - "avg_layers": 24.0, + "acc_skip": 1.0, + "avg_layers": 23.0, "epoch": 16.535368359260346, - "f1_execute": 0.95652174949646, + "f1_execute": 0.9777777791023254, "f1_repeat": 0.0, - "f1_skip": 0.888888955116272, - "grad_norm": 0.055419921875, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, "learning_rate": 0.0007968893393664646, - "loss": 0.0105, - "macro_f1": 0.6151369214057922, + "loss": 0.01, + "macro_f1": 0.6592592597007751, "num_tokens": 5679987.0, "repeat_count": 1.0, - "routers_loss": 0.03294458985328674, + "routers_loss": 0.02695014327764511, "skip_count": 5.0, "step": 3522, "text_loss": 0.44942837953567505 @@ -33476,13 +33476,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.035400390625, "learning_rate": 0.0007966402367141903, - "loss": 0.0073, + "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 5683185.0, "repeat_count": 0.0, - "routers_loss": 0.007946476340293884, + "routers_loss": 0.00817026849836111, "skip_count": 2.0, "step": 3524, "text_loss": 0.14528048038482666 @@ -33495,13 +33495,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.021240234375, + "grad_norm": 0.0216064453125, "learning_rate": 0.0007963910203888176, - "loss": 0.0043, + "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 5686544.0, "repeat_count": 0.0, - "routers_loss": 0.0021326798014342785, + "routers_loss": 0.0021973433904349804, "skip_count": 0.0, "step": 3526, "text_loss": 0.22358648478984833 @@ -33514,13 +33514,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0556640625, + "grad_norm": 0.050048828125, "learning_rate": 0.0007961416904858469, - "loss": 0.0079, + "loss": 0.0078, "macro_f1": 0.3272727429866791, "num_tokens": 5689579.0, "repeat_count": 0.0, - "routers_loss": 0.03373958170413971, + "routers_loss": 0.033712416887283325, "skip_count": 1.0, "step": 3528, "text_loss": 0.3083649277687073 @@ -33533,13 +33533,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033203125, + "grad_norm": 0.0361328125, "learning_rate": 0.0007958922471008217, - "loss": 0.007, + "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5692869.0, "repeat_count": 0.0, - "routers_loss": 0.010963297449052334, + "routers_loss": 0.011182719841599464, "skip_count": 2.0, "step": 3530, "text_loss": 0.21288011968135834 @@ -33552,13 +33552,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0286865234375, + "grad_norm": 0.0267333984375, "learning_rate": 0.0007956426903293292, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5696007.0, "repeat_count": 0.0, - "routers_loss": 0.0014243065379559994, + "routers_loss": 0.0015808293828740716, "skip_count": 0.0, "step": 3532, "text_loss": 0.6068631410598755 @@ -33571,13 +33571,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, - "grad_norm": 0.059326171875, + "grad_norm": 0.052734375, "learning_rate": 0.0007953930202670001, - "loss": 0.0066, + "loss": 0.0062, "macro_f1": 0.5492662787437439, "num_tokens": 5699474.0, "repeat_count": 2.0, - "routers_loss": 0.038375116884708405, + "routers_loss": 0.03205178305506706, "skip_count": 0.0, "step": 3534, "text_loss": 0.4317135512828827 @@ -33590,13 +33590,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.062255859375, + "grad_norm": 0.064453125, "learning_rate": 0.0007951432370095084, "loss": 0.0105, "macro_f1": 0.3333333432674408, "num_tokens": 5703483.0, "repeat_count": 0.0, - "routers_loss": 0.0041501945815980434, + "routers_loss": 0.003518853336572647, "skip_count": 0.0, "step": 3536, "text_loss": 0.5432273149490356 @@ -33609,13 +33609,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.08349609375, + "grad_norm": 0.11083984375, "learning_rate": 0.0007948933406525715, "loss": 0.01, "macro_f1": 1.0, "num_tokens": 5707301.0, "repeat_count": 1.0, - "routers_loss": 0.00536845438182354, + "routers_loss": 0.004982157610356808, "skip_count": 1.0, "step": 3538, "text_loss": 0.40061065554618835 @@ -33628,13 +33628,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.0751953125, "learning_rate": 0.0007946433312919502, - "loss": 0.0076, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5710847.0, "repeat_count": 0.0, - "routers_loss": 0.0030090278014540672, + "routers_loss": 0.003067734418436885, "skip_count": 0.0, "step": 3540, "text_loss": 0.5396234393119812 @@ -33647,13 +33647,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.055419921875, + "grad_norm": 0.05224609375, "learning_rate": 0.0007943932090234486, - "loss": 0.0098, + "loss": 0.0097, "macro_f1": 0.5492662787437439, "num_tokens": 5713683.0, "repeat_count": 0.0, - "routers_loss": 0.03756432980298996, + "routers_loss": 0.03728383034467697, "skip_count": 2.0, "step": 3542, "text_loss": 0.18310914933681488 @@ -33666,13 +33666,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.027587890625, + "grad_norm": 0.03271484375, "learning_rate": 0.0007941429739429138, - "loss": 0.0037, + "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 5716397.0, "repeat_count": 0.0, - "routers_loss": 0.002606320893391967, + "routers_loss": 0.0025092530995607376, "skip_count": 3.0, "step": 3544, "text_loss": 0.5806207060813904 @@ -33685,13 +33685,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0361328125, + "grad_norm": 0.040283203125, "learning_rate": 0.0007938926261462366, - "loss": 0.007, + "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 5719984.0, "repeat_count": 0.0, - "routers_loss": 0.0025650030001997948, + "routers_loss": 0.002493767999112606, "skip_count": 0.0, "step": 3546, "text_loss": 0.38606807589530945 @@ -33704,13 +33704,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.044677734375, + "grad_norm": 0.05078125, "learning_rate": 0.0007936421657293507, "loss": 0.0094, "macro_f1": 0.8823530077934265, "num_tokens": 5723571.0, "repeat_count": 1.0, - "routers_loss": 0.013521218672394753, + "routers_loss": 0.014810923486948013, "skip_count": 2.0, "step": 3548, "text_loss": 0.49558472633361816 @@ -33723,13 +33723,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0240478515625, + "grad_norm": 0.0284423828125, "learning_rate": 0.0007933915927882327, - "loss": 0.0071, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 5726405.0, "repeat_count": 0.0, - "routers_loss": 0.0014581449795514345, + "routers_loss": 0.00152928801253438, "skip_count": 0.0, "step": 3550, "text_loss": 0.8674797415733337 @@ -33742,13 +33742,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.0390625, "learning_rate": 0.000793140907418903, - "loss": 0.0077, + "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 5729955.0, "repeat_count": 0.0, - "routers_loss": 0.005775467026978731, + "routers_loss": 0.005522782914340496, "skip_count": 2.0, "step": 3552, "text_loss": 0.3274473249912262 @@ -33761,13 +33761,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.0322265625, "learning_rate": 0.0007928901097174248, - "loss": 0.0083, + "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 5733030.0, "repeat_count": 0.0, - "routers_loss": 0.008668854832649231, + "routers_loss": 0.009207013063132763, "skip_count": 2.0, "step": 3554, "text_loss": 0.18237128853797913 @@ -33780,13 +33780,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.056884765625, + "grad_norm": 0.0693359375, "learning_rate": 0.0007926391997799039, - "loss": 0.0068, + "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 5735978.0, "repeat_count": 0.0, - "routers_loss": 0.007210119627416134, + "routers_loss": 0.00695531303063035, "skip_count": 0.0, "step": 3556, "text_loss": 0.3266434967517853 @@ -33799,13 +33799,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.048583984375, + "grad_norm": 0.05419921875, "learning_rate": 0.0007923881777024898, - "loss": 0.0065, + "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 5738901.0, "repeat_count": 0.0, - "routers_loss": 0.00165808224119246, + "routers_loss": 0.002743212040513754, "skip_count": 1.0, "step": 3558, "text_loss": 0.4971913695335388 @@ -33818,13 +33818,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.049560546875, + "grad_norm": 0.04931640625, "learning_rate": 0.0007921370435813741, - "loss": 0.0081, + "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 5741946.0, "repeat_count": 1.0, - "routers_loss": 0.007618873380124569, + "routers_loss": 0.007037297356873751, "skip_count": 0.0, "step": 3560, "text_loss": 0.5645473599433899 @@ -33837,13 +33837,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047607421875, + "grad_norm": 0.05419921875, "learning_rate": 0.0007918857975127924, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5744987.0, "repeat_count": 0.0, - "routers_loss": 0.0031584161333739758, + "routers_loss": 0.0030746585689485073, "skip_count": 0.0, "step": 3562, "text_loss": 0.17717665433883667 @@ -33856,13 +33856,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0537109375, + "grad_norm": 0.058349609375, "learning_rate": 0.0007916344395930224, - "loss": 0.0079, + "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 5747837.0, "repeat_count": 0.0, - "routers_loss": 0.005207436624914408, + "routers_loss": 0.004522138275206089, "skip_count": 0.0, "step": 3564, "text_loss": 0.7676118612289429 @@ -33875,13 +33875,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.036865234375, "learning_rate": 0.000791382969918385, - "loss": 0.0074, + "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 5750716.0, "repeat_count": 0.0, - "routers_loss": 0.0023729163222014904, + "routers_loss": 0.0026240211445838213, "skip_count": 0.0, "step": 3566, "text_loss": 0.4975173771381378 @@ -33894,13 +33894,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.061767578125, + "grad_norm": 0.06396484375, "learning_rate": 0.000791131388585244, - "loss": 0.0115, + "loss": 0.011, "macro_f1": 0.8820862174034119, "num_tokens": 5754368.0, "repeat_count": 2.0, - "routers_loss": 0.021537931635975838, + "routers_loss": 0.021831991150975227, "skip_count": 2.0, "step": 3568, "text_loss": 0.9670342206954956 @@ -33913,13 +33913,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.02734375, + "grad_norm": 0.03369140625, "learning_rate": 0.0007908796956900055, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5757076.0, "repeat_count": 1.0, - "routers_loss": 0.001752255018800497, + "routers_loss": 0.0017586691537871957, "skip_count": 0.0, "step": 3570, "text_loss": 0.3057977259159088 @@ -33932,13 +33932,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.043701171875, + "grad_norm": 0.05224609375, "learning_rate": 0.000790627891329119, - "loss": 0.006, + "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5760613.0, "repeat_count": 0.0, - "routers_loss": 0.00557586969807744, + "routers_loss": 0.005515786819159985, "skip_count": 0.0, "step": 3572, "text_loss": 0.5860086679458618 @@ -33951,13 +33951,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.04296875, "learning_rate": 0.0007903759755990763, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 5763557.0, "repeat_count": 0.0, - "routers_loss": 0.004236271139234304, + "routers_loss": 0.004096484277397394, "skip_count": 0.0, "step": 3574, "text_loss": 0.17175781726837158 @@ -33970,13 +33970,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.04541015625, "learning_rate": 0.000790123948596412, "loss": 0.0119, "macro_f1": 0.6666666865348816, "num_tokens": 5767430.0, "repeat_count": 1.0, - "routers_loss": 0.003505093976855278, + "routers_loss": 0.005216122139245272, "skip_count": 0.0, "step": 3576, "text_loss": 0.7520374059677124 @@ -33989,13 +33989,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06640625, + "grad_norm": 0.07177734375, "learning_rate": 0.0007898718104177031, - "loss": 0.011, + "loss": 0.0108, "macro_f1": 0.3333333432674408, "num_tokens": 5770175.0, "repeat_count": 0.0, - "routers_loss": 0.0039036881644278765, + "routers_loss": 0.0037980107590556145, "skip_count": 0.0, "step": 3578, "text_loss": 0.18117885291576385 @@ -34008,13 +34008,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.04541015625, "learning_rate": 0.0007896195611595699, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5773032.0, "repeat_count": 0.0, - "routers_loss": 0.00450134975835681, + "routers_loss": 0.003672175807878375, "skip_count": 2.0, "step": 3580, "text_loss": 0.7241058349609375 @@ -34027,13 +34027,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.0615234375, "learning_rate": 0.0007893672009186744, - "loss": 0.0082, + "loss": 0.0083, "macro_f1": 1.0, "num_tokens": 5776077.0, "repeat_count": 1.0, - "routers_loss": 0.01287894882261753, + "routers_loss": 0.01229850109666586, "skip_count": 3.0, "step": 3582, "text_loss": 0.29140418767929077 @@ -34046,13 +34046,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0284423828125, + "grad_norm": 0.03271484375, "learning_rate": 0.0007891147297917216, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5779088.0, "repeat_count": 1.0, - "routers_loss": 0.003500303253531456, + "routers_loss": 0.0035251814406365156, "skip_count": 0.0, "step": 3584, "text_loss": 0.1727485954761505 @@ -34065,13 +34065,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05615234375, + "grad_norm": 0.055908203125, "learning_rate": 0.000788862147875459, - "loss": 0.0093, + "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 5782201.0, "repeat_count": 0.0, - "routers_loss": 0.0042770374566316605, + "routers_loss": 0.004725661128759384, "skip_count": 2.0, "step": 3586, "text_loss": 0.43512848019599915 @@ -34084,13 +34084,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.057861328125, + "grad_norm": 0.06396484375, "learning_rate": 0.0007886094552666765, - "loss": 0.0107, + "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 5785039.0, "repeat_count": 0.0, - "routers_loss": 0.005349197890609503, + "routers_loss": 0.005632172804325819, "skip_count": 0.0, "step": 3588, "text_loss": 0.3534786105155945 @@ -34103,13 +34103,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0654296875, + "grad_norm": 0.0556640625, "learning_rate": 0.0007883566520622062, - "loss": 0.0114, + "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 5788017.0, "repeat_count": 0.0, - "routers_loss": 0.008142824284732342, + "routers_loss": 0.006249965168535709, "skip_count": 1.0, "step": 3590, "text_loss": 0.2089710384607315 @@ -34122,13 +34122,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0302734375, + "grad_norm": 0.02978515625, "learning_rate": 0.0007881037383589229, - "loss": 0.0071, + "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 5791168.0, "repeat_count": 0.0, - "routers_loss": 0.0013415004359558225, + "routers_loss": 0.0013797614956274629, "skip_count": 0.0, "step": 3592, "text_loss": 0.4349329471588135 @@ -34141,13 +34141,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07666015625, + "grad_norm": 0.06982421875, "learning_rate": 0.0007878507142537436, - "loss": 0.0089, + "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 5793927.0, "repeat_count": 0.0, - "routers_loss": 0.0022349755745381117, + "routers_loss": 0.0019719740375876427, "skip_count": 1.0, "step": 3594, "text_loss": 0.6087368726730347 @@ -34160,13 +34160,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.036865234375, "learning_rate": 0.0007875975798436274, - "loss": 0.0058, + "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 5797214.0, "repeat_count": 1.0, - "routers_loss": 0.0037436108104884624, + "routers_loss": 0.0037070370744913816, "skip_count": 0.0, "step": 3596, "text_loss": 0.4258122444152832 @@ -34179,13 +34179,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0458984375, + "grad_norm": 0.048583984375, "learning_rate": 0.0007873443352255764, - "loss": 0.009, + "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5800691.0, "repeat_count": 0.0, - "routers_loss": 0.008491694927215576, + "routers_loss": 0.008431311696767807, "skip_count": 0.0, "step": 3598, "text_loss": 0.6006711721420288 @@ -34198,13 +34198,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052490234375, + "grad_norm": 0.055419921875, "learning_rate": 0.0007870909804966337, - "loss": 0.0075, + "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5804712.0, "repeat_count": 0.0, - "routers_loss": 0.0020895113702863455, + "routers_loss": 0.0017720256000757217, "skip_count": 0.0, "step": 3600, "text_loss": 0.6055042743682861 @@ -34217,13 +34217,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.053955078125, + "grad_norm": 0.0517578125, "learning_rate": 0.0007868375157538861, - "loss": 0.0086, + "loss": 0.0083, "macro_f1": 0.3272727429866791, "num_tokens": 5807670.0, "repeat_count": 1.0, - "routers_loss": 0.01193003449589014, + "routers_loss": 0.010697763413190842, "skip_count": 0.0, "step": 3602, "text_loss": 0.8039056658744812 @@ -34236,13 +34236,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.043212890625, + "grad_norm": 0.04150390625, "learning_rate": 0.0007865839410944611, - "loss": 0.008, + "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 5810880.0, "repeat_count": 1.0, - "routers_loss": 0.003107197815552354, + "routers_loss": 0.0030022128485143185, "skip_count": 0.0, "step": 3604, "text_loss": 0.596110463142395 @@ -34255,13 +34255,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.03173828125, "learning_rate": 0.0007863302566155295, - "loss": 0.0098, + "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 5814171.0, "repeat_count": 0.0, - "routers_loss": 0.0075443098321557045, + "routers_loss": 0.006257854867726564, "skip_count": 2.0, "step": 3606, "text_loss": 0.5700319409370422 @@ -34274,13 +34274,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.02734375, + "grad_norm": 0.0294189453125, "learning_rate": 0.0007860764624143031, - "loss": 0.0053, + "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 5817607.0, "repeat_count": 1.0, - "routers_loss": 0.005313992965966463, + "routers_loss": 0.004838473163545132, "skip_count": 0.0, "step": 3608, "text_loss": 0.8319530487060547 @@ -34293,13 +34293,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.09716796875, + "grad_norm": 0.08154296875, "learning_rate": 0.0007858225585880369, - "loss": 0.0069, + "loss": 0.0067, "macro_f1": 0.8823530077934265, "num_tokens": 5821452.0, "repeat_count": 1.0, - "routers_loss": 0.020901991054415703, + "routers_loss": 0.02173662930727005, "skip_count": 2.0, "step": 3610, "text_loss": 0.3738477826118469 @@ -34312,13 +34312,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.035400390625, "learning_rate": 0.0007855685452340269, - "loss": 0.0078, + "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5824683.0, "repeat_count": 0.0, - "routers_loss": 0.002484811469912529, + "routers_loss": 0.0032719180453568697, "skip_count": 0.0, "step": 3612, "text_loss": 0.4054839015007019 @@ -34331,13 +34331,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.0380859375, "learning_rate": 0.0007853144224496118, - "loss": 0.0094, + "loss": 0.0093, "macro_f1": 0.3272727429866791, "num_tokens": 5827860.0, "repeat_count": 1.0, - "routers_loss": 0.032128892838954926, + "routers_loss": 0.032171256840229034, "skip_count": 0.0, "step": 3614, "text_loss": 0.18112395703792572 @@ -34350,13 +34350,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05322265625, + "grad_norm": 0.0458984375, "learning_rate": 0.0007850601903321716, - "loss": 0.0062, + "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 5831651.0, "repeat_count": 0.0, - "routers_loss": 0.0136244622990489, + "routers_loss": 0.013230946846306324, "skip_count": 1.0, "step": 3616, "text_loss": 0.2698844075202942 @@ -34369,13 +34369,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.0361328125, "learning_rate": 0.000784805848979129, - "loss": 0.0057, + "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 5834369.0, "repeat_count": 0.0, - "routers_loss": 0.001705345930531621, + "routers_loss": 0.00162619655020535, "skip_count": 0.0, "step": 3618, "text_loss": 0.2430931180715561 @@ -34388,13 +34388,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0546875, + "grad_norm": 0.0498046875, "learning_rate": 0.0007845513984879477, - "loss": 0.0066, + "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 5838102.0, "repeat_count": 1.0, - "routers_loss": 0.002594438148662448, + "routers_loss": 0.002781603019684553, "skip_count": 0.0, "step": 3620, "text_loss": 0.4968300759792328 @@ -34407,13 +34407,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.031005859375, "learning_rate": 0.0007842968389561337, - "loss": 0.0049, + "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 5841029.0, "repeat_count": 0.0, - "routers_loss": 0.0019142795354127884, + "routers_loss": 0.0023873315658420324, "skip_count": 0.0, "step": 3622, "text_loss": 0.5842974781990051 @@ -34426,13 +34426,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.03955078125, "learning_rate": 0.0007840421704812346, - "loss": 0.0093, + "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 5845158.0, "repeat_count": 0.0, - "routers_loss": 0.004223407246172428, + "routers_loss": 0.00400173757225275, "skip_count": 1.0, "step": 3624, "text_loss": 0.8312450647354126 @@ -34445,13 +34445,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03466796875, + "grad_norm": 0.035888671875, "learning_rate": 0.00078378739316084, - "loss": 0.0092, + "loss": 0.0094, "macro_f1": 0.3333333432674408, "num_tokens": 5849175.0, "repeat_count": 0.0, - "routers_loss": 0.0005486982990987599, + "routers_loss": 0.0004974664188921452, "skip_count": 0.0, "step": 3626, "text_loss": 0.48637253046035767 @@ -34464,13 +34464,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.888888955116272, - "grad_norm": 0.0654296875, + "grad_norm": 0.10693359375, "learning_rate": 0.000783532507092581, - "loss": 0.0077, + "loss": 0.0079, "macro_f1": 0.9555556178092957, "num_tokens": 5852020.0, "repeat_count": 1.0, - "routers_loss": 0.025490080937743187, + "routers_loss": 0.02555239573121071, "skip_count": 5.0, "step": 3628, "text_loss": 0.5407033562660217 @@ -34483,13 +34483,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.041259765625, "learning_rate": 0.0007832775123741306, - "loss": 0.0104, + "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 5854873.0, "repeat_count": 0.0, - "routers_loss": 0.0026199028361588717, + "routers_loss": 0.0025962977670133114, "skip_count": 0.0, "step": 3630, "text_loss": 0.618230938911438 @@ -34502,13 +34502,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0225830078125, + "grad_norm": 0.0234375, "learning_rate": 0.000783022409103203, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 5858086.0, "repeat_count": 0.0, - "routers_loss": 0.0028729604091495275, + "routers_loss": 0.0029271875973790884, "skip_count": 0.0, "step": 3632, "text_loss": 0.21259798109531403 @@ -34521,13 +34521,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05419921875, + "grad_norm": 0.064453125, "learning_rate": 0.0007827671973775542, - "loss": 0.0069, + "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 5860886.0, "repeat_count": 0.0, - "routers_loss": 0.004097428172826767, + "routers_loss": 0.004102068953216076, "skip_count": 0.0, "step": 3634, "text_loss": 0.4991208016872406 @@ -34540,13 +34540,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0302734375, + "grad_norm": 0.033203125, "learning_rate": 0.0007825118772949819, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 5864291.0, "repeat_count": 0.0, - "routers_loss": 0.002142589772120118, + "routers_loss": 0.0023497689981013536, "skip_count": 1.0, "step": 3636, "text_loss": 0.3878401517868042 @@ -34559,13 +34559,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0206298828125, + "grad_norm": 0.0216064453125, "learning_rate": 0.0007822564489533255, - "loss": 0.005, + "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 5867155.0, "repeat_count": 0.0, - "routers_loss": 0.006497112102806568, + "routers_loss": 0.007680345326662064, "skip_count": 2.0, "step": 3638, "text_loss": 0.6132124066352844 @@ -34578,13 +34578,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.053466796875, "learning_rate": 0.0007820009124504653, - "loss": 0.0095, + "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5870325.0, "repeat_count": 0.0, - "routers_loss": 0.0008698388119228184, + "routers_loss": 0.0008242831099778414, "skip_count": 0.0, "step": 3640, "text_loss": 0.3552473187446594 @@ -34597,13 +34597,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.04296875, "learning_rate": 0.0007817452678843236, - "loss": 0.0071, + "loss": 0.0073, "macro_f1": 0.6601307392120361, "num_tokens": 5873301.0, "repeat_count": 1.0, - "routers_loss": 0.022245829924941063, + "routers_loss": 0.023831043392419815, "skip_count": 2.0, "step": 3642, "text_loss": 0.18363867700099945 @@ -34616,13 +34616,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.024658203125, + "grad_norm": 0.0260009765625, "learning_rate": 0.0007814895153528635, - "loss": 0.0071, + "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5876225.0, "repeat_count": 0.0, - "routers_loss": 0.0020051905885338783, + "routers_loss": 0.001999989850446582, "skip_count": 0.0, "step": 3644, "text_loss": 0.17581747472286224 @@ -34635,13 +34635,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025146484375, + "grad_norm": 0.028564453125, "learning_rate": 0.0007812336549540903, - "loss": 0.0071, + "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 5879501.0, "repeat_count": 0.0, - "routers_loss": 0.0014994015218690038, + "routers_loss": 0.001098626758903265, "skip_count": 0.0, "step": 3646, "text_loss": 0.5040884613990784 @@ -34654,13 +34654,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0294189453125, + "grad_norm": 0.03076171875, "learning_rate": 0.0007809776867860499, - "loss": 0.0051, + "loss": 0.005, "macro_f1": 0.3272727429866791, "num_tokens": 5882608.0, "repeat_count": 0.0, - "routers_loss": 0.010847748257219791, + "routers_loss": 0.012210183776915073, "skip_count": 1.0, "step": 3648, "text_loss": 0.27114811539649963 @@ -34673,13 +34673,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0262451171875, + "grad_norm": 0.032958984375, "learning_rate": 0.00078072161094683, - "loss": 0.006, + "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 5886106.0, "repeat_count": 0.0, - "routers_loss": 0.005927151069045067, + "routers_loss": 0.005191771313548088, "skip_count": 2.0, "step": 3650, "text_loss": 0.5167917609214783 @@ -34692,13 +34692,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.024169921875, + "grad_norm": 0.0235595703125, "learning_rate": 0.0007804654275345591, - "loss": 0.0061, + "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 5889122.0, "repeat_count": 0.0, - "routers_loss": 0.0019531139405444264, + "routers_loss": 0.0016411367105320096, "skip_count": 1.0, "step": 3652, "text_loss": 0.7691274285316467 @@ -34711,13 +34711,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.0277099609375, + "grad_norm": 0.03515625, "learning_rate": 0.0007802091366474074, - "loss": 0.0052, + "loss": 0.005, "macro_f1": 0.8823530077934265, "num_tokens": 5892313.0, "repeat_count": 2.0, - "routers_loss": 0.015216727741062641, + "routers_loss": 0.015627093613147736, "skip_count": 1.0, "step": 3654, "text_loss": 0.4646325409412384 @@ -34730,13 +34730,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0311279296875, + "grad_norm": 0.0341796875, "learning_rate": 0.0007799527383835858, - "loss": 0.0067, + "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 5895577.0, "repeat_count": 0.0, - "routers_loss": 0.0009810501942411065, + "routers_loss": 0.0009879748104140162, "skip_count": 0.0, "step": 3656, "text_loss": 0.5587969422340393 @@ -34749,13 +34749,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.072265625, + "grad_norm": 0.0986328125, "learning_rate": 0.0007796962328413469, - "loss": 0.0093, + "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 5898546.0, "repeat_count": 0.0, - "routers_loss": 0.00458681071177125, + "routers_loss": 0.004864919930696487, "skip_count": 0.0, "step": 3658, "text_loss": 0.6981375813484192 @@ -34768,13 +34768,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.029052734375, + "grad_norm": 0.033447265625, "learning_rate": 0.0007794396201189839, - "loss": 0.0076, + "loss": 0.0078, "macro_f1": 1.0, "num_tokens": 5901618.0, "repeat_count": 1.0, - "routers_loss": 0.006519644521176815, + "routers_loss": 0.006617432460188866, "skip_count": 2.0, "step": 3660, "text_loss": 0.22521957755088806 @@ -34787,13 +34787,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.036865234375, "learning_rate": 0.0007791829003148312, - "loss": 0.0097, + "loss": 0.0098, "macro_f1": 0.6601307392120361, "num_tokens": 5904540.0, "repeat_count": 1.0, - "routers_loss": 0.0783558189868927, + "routers_loss": 0.0782252699136734, "skip_count": 2.0, "step": 3662, "text_loss": 0.2649642825126648 @@ -34806,13 +34806,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06884765625, + "grad_norm": 0.06494140625, "learning_rate": 0.0007789260735272647, - "loss": 0.0115, + "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 5907827.0, "repeat_count": 0.0, - "routers_loss": 0.0012588179670274258, + "routers_loss": 0.0012057392159476876, "skip_count": 0.0, "step": 3664, "text_loss": 0.6943771243095398 @@ -34825,13 +34825,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0177001953125, + "grad_norm": 0.018310546875, "learning_rate": 0.0007786691398547005, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 5911163.0, "repeat_count": 0.0, - "routers_loss": 0.0075621698051691055, + "routers_loss": 0.007476957980543375, "skip_count": 2.0, "step": 3666, "text_loss": 0.1502683162689209 @@ -34844,13 +34844,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0303955078125, + "grad_norm": 0.0322265625, "learning_rate": 0.0007784120993955962, - "loss": 0.0056, + "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 5913948.0, "repeat_count": 1.0, - "routers_loss": 0.00408853217959404, + "routers_loss": 0.004082011990249157, "skip_count": 0.0, "step": 3668, "text_loss": 0.4127517640590668 @@ -34863,13 +34863,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, - "grad_norm": 0.038818359375, + "grad_norm": 0.041259765625, "learning_rate": 0.0007781549522484503, - "loss": 0.0067, + "loss": 0.0066, "macro_f1": 0.9265305995941162, "num_tokens": 5917360.0, "repeat_count": 3.0, - "routers_loss": 0.02851647138595581, + "routers_loss": 0.027505695819854736, "skip_count": 1.0, "step": 3670, "text_loss": 0.23892618715763092 @@ -34882,13 +34882,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.031005859375, + "grad_norm": 0.0306396484375, "learning_rate": 0.0007778976985118018, - "loss": 0.0086, + "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 5920524.0, "repeat_count": 0.0, - "routers_loss": 0.0030399872921407223, + "routers_loss": 0.0024977331049740314, "skip_count": 2.0, "step": 3672, "text_loss": 0.5076471567153931 @@ -34901,13 +34901,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05908203125, + "grad_norm": 0.0576171875, "learning_rate": 0.0007776403382842312, - "loss": 0.0061, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5923632.0, "repeat_count": 0.0, - "routers_loss": 0.0014176326803863049, + "routers_loss": 0.0015700991498306394, "skip_count": 0.0, "step": 3674, "text_loss": 0.6287924647331238 @@ -34920,13 +34920,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06591796875, + "grad_norm": 0.05810546875, "learning_rate": 0.0007773828716643591, - "loss": 0.0084, + "loss": 0.0085, "macro_f1": 0.3272727429866791, "num_tokens": 5926438.0, "repeat_count": 1.0, - "routers_loss": 0.0505419559776783, + "routers_loss": 0.05108916014432907, "skip_count": 0.0, "step": 3676, "text_loss": 0.26517006754875183 @@ -34939,13 +34939,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.03857421875, "learning_rate": 0.0007771252987508474, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 5930081.0, "repeat_count": 0.0, - "routers_loss": 0.0034831957891583443, + "routers_loss": 0.003439917229115963, "skip_count": 0.0, "step": 3678, "text_loss": 0.5189079642295837 @@ -34958,13 +34958,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.056884765625, "learning_rate": 0.0007768676196423984, "loss": 0.0064, "macro_f1": 1.0, "num_tokens": 5933463.0, "repeat_count": 1.0, - "routers_loss": 0.0020620382856577635, + "routers_loss": 0.001935846172273159, "skip_count": 1.0, "step": 3680, "text_loss": 0.6703575849533081 @@ -34972,18 +34972,18 @@ { "acc_repeat": 0.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 27.0, "epoch": 17.286469034341064, - "f1_execute": 0.9629629254341125, + "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037109375, + "grad_norm": 0.035400390625, "learning_rate": 0.0007766098344377553, - "loss": 0.0084, - "macro_f1": 0.32098764181137085, + "loss": 0.0082, + "macro_f1": 0.31446540355682373, "num_tokens": 5937098.0, "repeat_count": 0.0, - "routers_loss": 0.03850153833627701, + "routers_loss": 0.0384826585650444, "skip_count": 2.0, "step": 3682, "text_loss": 0.6424444913864136 @@ -34996,13 +34996,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031982421875, + "grad_norm": 0.0301513671875, "learning_rate": 0.0007763519432357018, - "loss": 0.0065, + "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 5940436.0, "repeat_count": 0.0, - "routers_loss": 0.000853471748996526, + "routers_loss": 0.0008654671837575734, "skip_count": 0.0, "step": 3684, "text_loss": 0.4189988672733307 @@ -35015,13 +35015,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05712890625, + "grad_norm": 0.05908203125, "learning_rate": 0.0007760939461350623, - "loss": 0.0107, + "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 5943731.0, "repeat_count": 0.0, - "routers_loss": 0.007630084175616503, + "routers_loss": 0.007468715775758028, "skip_count": 2.0, "step": 3686, "text_loss": 0.2875453233718872 @@ -35036,11 +35036,11 @@ "f1_skip": 0.0, "grad_norm": 0.041259765625, "learning_rate": 0.0007758358432347019, - "loss": 0.0061, + "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 5946707.0, "repeat_count": 0.0, - "routers_loss": 0.001303135184571147, + "routers_loss": 0.001252831774763763, "skip_count": 0.0, "step": 3688, "text_loss": 0.5093055367469788 @@ -35053,13 +35053,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.03271484375, "learning_rate": 0.0007755776346335259, - "loss": 0.0058, + "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 5949833.0, "repeat_count": 0.0, - "routers_loss": 0.001894078915938735, + "routers_loss": 0.001680848654359579, "skip_count": 0.0, "step": 3690, "text_loss": 0.4031114876270294 @@ -35072,13 +35072,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.025146484375, + "grad_norm": 0.0255126953125, "learning_rate": 0.0007753193204304807, - "loss": 0.0056, + "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 5953095.0, "repeat_count": 0.0, - "routers_loss": 0.005708714015781879, + "routers_loss": 0.0047258250415325165, "skip_count": 2.0, "step": 3692, "text_loss": 0.17632785439491272 @@ -35091,13 +35091,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.03564453125, + "grad_norm": 0.036376953125, "learning_rate": 0.0007750609007245524, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 5955971.0, "repeat_count": 2.0, - "routers_loss": 0.0019924843218177557, + "routers_loss": 0.001980359200388193, "skip_count": 4.0, "step": 3694, "text_loss": 0.3423727750778198 @@ -35110,13 +35110,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0255126953125, + "grad_norm": 0.0238037109375, "learning_rate": 0.0007748023756147679, - "loss": 0.007, + "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 5958948.0, "repeat_count": 0.0, - "routers_loss": 0.005303190555423498, + "routers_loss": 0.00511702848598361, "skip_count": 0.0, "step": 3696, "text_loss": 0.28279972076416016 @@ -35129,13 +35129,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.03662109375, "learning_rate": 0.0007745437452001949, - "loss": 0.0063, + "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 5961819.0, "repeat_count": 0.0, - "routers_loss": 0.0004839526955038309, + "routers_loss": 0.0005220443126745522, "skip_count": 0.0, "step": 3698, "text_loss": 0.4793325662612915 @@ -35148,13 +35148,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.0400390625, "learning_rate": 0.0007742850095799408, - "loss": 0.0083, + "loss": 0.0084, "macro_f1": 0.3272727429866791, "num_tokens": 5964625.0, "repeat_count": 1.0, - "routers_loss": 0.06377380341291428, + "routers_loss": 0.06411020457744598, "skip_count": 0.0, "step": 3700, "text_loss": 0.2825184464454651 @@ -35167,13 +35167,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0654296875, + "grad_norm": 0.0751953125, "learning_rate": 0.0007740261688531536, - "loss": 0.007, + "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 5967134.0, "repeat_count": 0.0, - "routers_loss": 0.00462002120912075, + "routers_loss": 0.004408109001815319, "skip_count": 3.0, "step": 3702, "text_loss": 0.690429151058197 @@ -35186,13 +35186,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0228271484375, + "grad_norm": 0.0279541015625, "learning_rate": 0.0007737672231190215, - "loss": 0.0033, + "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 5969831.0, "repeat_count": 0.0, - "routers_loss": 0.0006775400252081454, + "routers_loss": 0.0006747521692886949, "skip_count": 0.0, "step": 3704, "text_loss": 0.32556024193763733 @@ -35205,13 +35205,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02783203125, + "grad_norm": 0.031005859375, "learning_rate": 0.0007735081724767732, - "loss": 0.0061, + "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 5973015.0, "repeat_count": 0.0, - "routers_loss": 0.001372992410324514, + "routers_loss": 0.0020414739847183228, "skip_count": 0.0, "step": 3706, "text_loss": 0.5876469612121582 @@ -35224,13 +35224,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.072265625, "learning_rate": 0.0007732490170256769, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 5975778.0, "repeat_count": 1.0, - "routers_loss": 0.005310074891895056, + "routers_loss": 0.005610425490885973, "skip_count": 0.0, "step": 3708, "text_loss": 0.2968577444553375 @@ -35243,13 +35243,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05078125, + "grad_norm": 0.05419921875, "learning_rate": 0.0007729897568650422, - "loss": 0.01, + "loss": 0.0097, "macro_f1": 0.3333333432674408, "num_tokens": 5979115.0, "repeat_count": 0.0, - "routers_loss": 0.0012178041506558657, + "routers_loss": 0.001248046406544745, "skip_count": 0.0, "step": 3710, "text_loss": 0.626361608505249 @@ -35262,13 +35262,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0595703125, + "grad_norm": 0.06787109375, "learning_rate": 0.0007727303920942176, - "loss": 0.01, + "loss": 0.0102, "macro_f1": 0.6666666865348816, "num_tokens": 5982213.0, "repeat_count": 0.0, - "routers_loss": 0.004617640748620033, + "routers_loss": 0.005791695322841406, "skip_count": 2.0, "step": 3712, "text_loss": 0.4133484661579132 @@ -35281,13 +35281,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0791015625, + "grad_norm": 0.08740234375, "learning_rate": 0.0007724709228125922, - "loss": 0.0106, + "loss": 0.0105, "macro_f1": 0.5492662787437439, "num_tokens": 5984930.0, "repeat_count": 0.0, - "routers_loss": 0.020924020558595657, + "routers_loss": 0.02114664763212204, "skip_count": 2.0, "step": 3714, "text_loss": 0.4646461308002472 @@ -35300,13 +35300,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.032958984375, "learning_rate": 0.0007722113491195952, - "loss": 0.0059, + "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 5988017.0, "repeat_count": 2.0, - "routers_loss": 0.0053578754886984825, + "routers_loss": 0.005913930479437113, "skip_count": 5.0, "step": 3716, "text_loss": 0.15474505722522736 @@ -35319,13 +35319,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.026123046875, + "grad_norm": 0.02685546875, "learning_rate": 0.0007719516711146957, - "loss": 0.0075, + "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 5991562.0, "repeat_count": 0.0, - "routers_loss": 0.006991801783442497, + "routers_loss": 0.0075925313867628574, "skip_count": 2.0, "step": 3718, "text_loss": 0.5293686985969543 @@ -35338,13 +35338,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031005859375, + "grad_norm": 0.037353515625, "learning_rate": 0.000771691888897403, - "loss": 0.0054, + "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 5994675.0, "repeat_count": 0.0, - "routers_loss": 0.0011527709430083632, + "routers_loss": 0.0012335237115621567, "skip_count": 0.0, "step": 3720, "text_loss": 0.5210637450218201 @@ -35357,13 +35357,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.09521484375, + "grad_norm": 0.0771484375, "learning_rate": 0.0007714320025672657, - "loss": 0.008, + "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 5999070.0, "repeat_count": 0.0, - "routers_loss": 0.011113573797047138, + "routers_loss": 0.010582062415778637, "skip_count": 2.0, "step": 3722, "text_loss": 0.2783571779727936 @@ -35376,13 +35376,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03369140625, + "grad_norm": 0.032958984375, "learning_rate": 0.000771172012223873, - "loss": 0.008, + "loss": 0.0078, "macro_f1": 0.6598639488220215, "num_tokens": 6002702.0, "repeat_count": 1.0, - "routers_loss": 0.014584671705961227, + "routers_loss": 0.015008784830570221, "skip_count": 3.0, "step": 3724, "text_loss": 0.358705073595047 @@ -35395,13 +35395,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05419921875, + "grad_norm": 0.052734375, "learning_rate": 0.0007709119179668538, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6005517.0, "repeat_count": 0.0, - "routers_loss": 0.001164636923931539, + "routers_loss": 0.00111615180503577, "skip_count": 0.0, "step": 3726, "text_loss": 0.45202162861824036 @@ -35414,13 +35414,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.034912109375, "learning_rate": 0.0007706517198958764, - "loss": 0.0098, + "loss": 0.0096, "macro_f1": 0.6595745086669922, "num_tokens": 6009111.0, "repeat_count": 1.0, - "routers_loss": 0.05235295370221138, + "routers_loss": 0.05215252563357353, "skip_count": 4.0, "step": 3728, "text_loss": 0.20360413193702698 @@ -35433,13 +35433,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05859375, + "grad_norm": 0.053955078125, "learning_rate": 0.0007703914181106497, - "loss": 0.0077, + "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 6012989.0, "repeat_count": 0.0, - "routers_loss": 0.01087163109332323, + "routers_loss": 0.010039499960839748, "skip_count": 3.0, "step": 3730, "text_loss": 0.20334361493587494 @@ -35452,13 +35452,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07177734375, + "grad_norm": 0.08203125, "learning_rate": 0.0007701310127109211, - "loss": 0.0063, + "loss": 0.0062, "macro_f1": 0.3272727429866791, "num_tokens": 6016420.0, "repeat_count": 0.0, - "routers_loss": 0.010110805742442608, + "routers_loss": 0.01090205181390047, "skip_count": 1.0, "step": 3732, "text_loss": 0.47959551215171814 @@ -35471,13 +35471,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, - "grad_norm": 0.03564453125, + "grad_norm": 0.0341796875, "learning_rate": 0.0007698705037964791, - "loss": 0.0078, + "loss": 0.0076, "macro_f1": 0.6225374937057495, "num_tokens": 6019551.0, "repeat_count": 0.0, - "routers_loss": 0.026909299194812775, + "routers_loss": 0.02677762135863304, "skip_count": 5.0, "step": 3734, "text_loss": 0.2621438801288605 @@ -35490,13 +35490,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.044921875, + "grad_norm": 0.056640625, "learning_rate": 0.000769609891467151, - "loss": 0.0122, + "loss": 0.0119, "macro_f1": 0.6666666865348816, "num_tokens": 6022262.0, "repeat_count": 1.0, - "routers_loss": 0.003602684009820223, + "routers_loss": 0.00460716662928462, "skip_count": 0.0, "step": 3736, "text_loss": 0.3433022201061249 @@ -35509,13 +35509,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.037109375, "learning_rate": 0.0007693491758228037, - "loss": 0.005, + "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 6025723.0, "repeat_count": 0.0, - "routers_loss": 0.00290105608291924, + "routers_loss": 0.0036111194640398026, "skip_count": 2.0, "step": 3738, "text_loss": 0.38703784346580505 @@ -35528,13 +35528,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.037841796875, "learning_rate": 0.0007690883569633442, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6028652.0, "repeat_count": 0.0, - "routers_loss": 0.0031469720415771008, + "routers_loss": 0.003299296135082841, "skip_count": 0.0, "step": 3740, "text_loss": 0.24203069508075714 @@ -35547,13 +35547,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.027587890625, + "grad_norm": 0.0277099609375, "learning_rate": 0.0007688274349887188, - "loss": 0.0048, + "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 6032280.0, "repeat_count": 0.0, - "routers_loss": 0.0029467069543898106, + "routers_loss": 0.003173880511894822, "skip_count": 0.0, "step": 3742, "text_loss": 0.2827291488647461 @@ -35566,13 +35566,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031494140625, + "grad_norm": 0.0302734375, "learning_rate": 0.0007685664099989131, - "loss": 0.0074, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 6035111.0, "repeat_count": 0.0, - "routers_loss": 0.0009511710377410054, + "routers_loss": 0.0008576177642680705, "skip_count": 0.0, "step": 3744, "text_loss": 0.43613526225090027 @@ -35585,13 +35585,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0252685546875, + "grad_norm": 0.0274658203125, "learning_rate": 0.0007683052820939524, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 6038428.0, "repeat_count": 0.0, - "routers_loss": 0.004079817794263363, + "routers_loss": 0.004335585981607437, "skip_count": 2.0, "step": 3746, "text_loss": 1.0385624170303345 @@ -35604,13 +35604,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.04052734375, "learning_rate": 0.0007680440513739015, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 6041185.0, "repeat_count": 0.0, - "routers_loss": 0.0007996217464096844, + "routers_loss": 0.0008210531086660922, "skip_count": 0.0, "step": 3748, "text_loss": 0.7070431709289551 @@ -35623,13 +35623,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.041015625, + "grad_norm": 0.056640625, "learning_rate": 0.0007677827179388646, - "loss": 0.0088, + "loss": 0.0089, "macro_f1": 1.0, "num_tokens": 6046333.0, "repeat_count": 1.0, - "routers_loss": 0.0047629233449697495, + "routers_loss": 0.003778942162171006, "skip_count": 1.0, "step": 3750, "text_loss": 0.3682238757610321 @@ -35642,13 +35642,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.05908203125, + "grad_norm": 0.08984375, "learning_rate": 0.000767521281888985, - "loss": 0.0087, + "loss": 0.009, "macro_f1": 1.0, "num_tokens": 6049528.0, "repeat_count": 1.0, - "routers_loss": 0.0039178295992314816, + "routers_loss": 0.002767334459349513, "skip_count": 1.0, "step": 3752, "text_loss": 0.7619418501853943 @@ -35661,13 +35661,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03662109375, + "grad_norm": 0.041015625, "learning_rate": 0.0007672597433244455, - "loss": 0.0109, + "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 6053202.0, "repeat_count": 0.0, - "routers_loss": 0.004995788913220167, + "routers_loss": 0.004796457476913929, "skip_count": 2.0, "step": 3754, "text_loss": 0.4157083034515381 @@ -35680,13 +35680,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.062255859375, + "grad_norm": 0.06689453125, "learning_rate": 0.0007669981023454682, - "loss": 0.0125, + "loss": 0.0126, "macro_f1": 0.3333333432674408, "num_tokens": 6056609.0, "repeat_count": 0.0, - "routers_loss": 0.0012595724547281861, + "routers_loss": 0.0013067846884950995, "skip_count": 0.0, "step": 3756, "text_loss": 0.4529118537902832 @@ -35699,13 +35699,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0284423828125, + "grad_norm": 0.033447265625, "learning_rate": 0.0007667363590523142, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 6060504.0, "repeat_count": 0.0, - "routers_loss": 0.0012152433628216386, + "routers_loss": 0.0010285493917763233, "skip_count": 0.0, "step": 3758, "text_loss": 0.8363246321678162 @@ -35718,13 +35718,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.055419921875, "learning_rate": 0.0007664745135452844, - "loss": 0.0093, + "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 6063526.0, "repeat_count": 0.0, - "routers_loss": 0.006478998344391584, + "routers_loss": 0.006289863493293524, "skip_count": 3.0, "step": 3760, "text_loss": 0.5313657522201538 @@ -35737,13 +35737,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.05517578125, "learning_rate": 0.0007662125659247183, - "loss": 0.0096, + "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6067147.0, "repeat_count": 0.0, - "routers_loss": 0.003008047351613641, + "routers_loss": 0.0028537956532090902, "skip_count": 0.0, "step": 3762, "text_loss": 0.5668109059333801 @@ -35756,13 +35756,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03369140625, + "grad_norm": 0.039794921875, "learning_rate": 0.0007659505162909949, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 6070350.0, "repeat_count": 0.0, - "routers_loss": 0.002841299632564187, + "routers_loss": 0.0026814753655344248, "skip_count": 0.0, "step": 3764, "text_loss": 0.4983512759208679 @@ -35775,13 +35775,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.055419921875, + "grad_norm": 0.056884765625, "learning_rate": 0.0007656883647445318, - "loss": 0.01, + "loss": 0.0099, "macro_f1": 0.6666666865348816, "num_tokens": 6073091.0, "repeat_count": 0.0, - "routers_loss": 0.006070348434150219, + "routers_loss": 0.005981382913887501, "skip_count": 1.0, "step": 3766, "text_loss": 0.30372318625450134 @@ -35794,13 +35794,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0289306640625, + "grad_norm": 0.028564453125, "learning_rate": 0.0007654261113857863, - "loss": 0.0073, + "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6076244.0, "repeat_count": 0.0, - "routers_loss": 0.0008278369787149131, + "routers_loss": 0.000803640519734472, "skip_count": 0.0, "step": 3768, "text_loss": 0.6100738048553467 @@ -35813,13 +35813,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02392578125, + "grad_norm": 0.027587890625, "learning_rate": 0.0007651637563152539, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 6078936.0, "repeat_count": 0.0, - "routers_loss": 0.001354316365905106, + "routers_loss": 0.0013324898900464177, "skip_count": 0.0, "step": 3770, "text_loss": 0.4733821153640747 @@ -35832,13 +35832,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0242919921875, + "grad_norm": 0.029541015625, "learning_rate": 0.0007649012996334701, - "loss": 0.0051, + "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 6081951.0, "repeat_count": 1.0, - "routers_loss": 0.0019684957806020975, + "routers_loss": 0.0021543330512940884, "skip_count": 0.0, "step": 3772, "text_loss": 0.6794875860214233 @@ -35851,13 +35851,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.04541015625, "learning_rate": 0.0007646387414410085, - "loss": 0.0076, + "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 6085165.0, "repeat_count": 0.0, - "routers_loss": 0.0005270782858133316, + "routers_loss": 0.0005426189745776355, "skip_count": 0.0, "step": 3774, "text_loss": 0.5886107683181763 @@ -35870,13 +35870,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.028076171875, + "grad_norm": 0.0262451171875, "learning_rate": 0.0007643760818384819, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 6088370.0, "repeat_count": 0.0, - "routers_loss": 0.0029050554148852825, + "routers_loss": 0.002537576947361231, "skip_count": 0.0, "step": 3776, "text_loss": 0.23591920733451843 @@ -35889,13 +35889,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.03564453125, "learning_rate": 0.0007641133209265423, - "loss": 0.0064, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 6092319.0, "repeat_count": 0.0, - "routers_loss": 0.0026071348693221807, + "routers_loss": 0.002613696036860347, "skip_count": 0.0, "step": 3778, "text_loss": 0.3217754662036896 @@ -35908,13 +35908,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.051025390625, + "grad_norm": 0.052978515625, "learning_rate": 0.0007638504588058796, - "loss": 0.0101, + "loss": 0.0105, "macro_f1": 0.3333333432674408, "num_tokens": 6095799.0, "repeat_count": 0.0, - "routers_loss": 0.0008351493743248284, + "routers_loss": 0.0007219464750960469, "skip_count": 0.0, "step": 3780, "text_loss": 0.4276983141899109 @@ -35927,13 +35927,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.026611328125, + "grad_norm": 0.0263671875, "learning_rate": 0.0007635874955772234, - "loss": 0.007, + "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 6098789.0, "repeat_count": 0.0, - "routers_loss": 0.005872148554772139, + "routers_loss": 0.005965052172541618, "skip_count": 3.0, "step": 3782, "text_loss": 0.30936646461486816 @@ -35946,13 +35946,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0703125, + "grad_norm": 0.07177734375, "learning_rate": 0.0007633244313413417, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6101631.0, "repeat_count": 0.0, - "routers_loss": 0.0007862916099838912, + "routers_loss": 0.0007469559786841273, "skip_count": 0.0, "step": 3784, "text_loss": 0.44460123777389526 @@ -35965,13 +35965,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0478515625, + "grad_norm": 0.045654296875, "learning_rate": 0.0007630612661990412, - "loss": 0.0098, + "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 6105097.0, "repeat_count": 0.0, - "routers_loss": 0.0037640000227838755, + "routers_loss": 0.004300760570913553, "skip_count": 1.0, "step": 3786, "text_loss": 0.41950157284736633 @@ -35984,13 +35984,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.03857421875, "learning_rate": 0.0007627980002511672, - "loss": 0.0068, + "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 6107847.0, "repeat_count": 0.0, - "routers_loss": 0.0023107193410396576, + "routers_loss": 0.0023050960153341293, "skip_count": 1.0, "step": 3788, "text_loss": 0.48561373353004456 @@ -36003,13 +36003,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03271484375, + "grad_norm": 0.0322265625, "learning_rate": 0.0007625346335986039, - "loss": 0.0066, + "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 6110546.0, "repeat_count": 0.0, - "routers_loss": 0.0017923865234479308, + "routers_loss": 0.0018124044872820377, "skip_count": 0.0, "step": 3790, "text_loss": 0.20882295072078705 @@ -36022,13 +36022,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.0400390625, "learning_rate": 0.0007622711663422735, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6113600.0, "repeat_count": 0.0, - "routers_loss": 0.0007700122077949345, + "routers_loss": 0.0007613401976414025, "skip_count": 0.0, "step": 3792, "text_loss": 0.31751760840415955 @@ -36041,13 +36041,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04248046875, + "grad_norm": 0.0400390625, "learning_rate": 0.0007620075985831375, - "loss": 0.009, + "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 6116916.0, "repeat_count": 0.0, - "routers_loss": 0.004986821208149195, + "routers_loss": 0.005452962126582861, "skip_count": 2.0, "step": 3794, "text_loss": 0.3246645927429199 @@ -36060,13 +36060,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0272216796875, + "grad_norm": 0.0306396484375, "learning_rate": 0.0007617439304221956, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 6120056.0, "repeat_count": 2.0, - "routers_loss": 0.004177430644631386, + "routers_loss": 0.0043787881731987, "skip_count": 0.0, "step": 3796, "text_loss": 0.4859195947647095 @@ -36079,13 +36079,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0228271484375, + "grad_norm": 0.02294921875, "learning_rate": 0.0007614801619604856, - "loss": 0.0065, + "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 6122668.0, "repeat_count": 0.0, - "routers_loss": 0.003494138829410076, + "routers_loss": 0.0033891722559928894, "skip_count": 0.0, "step": 3798, "text_loss": 0.48194369673728943 @@ -36098,13 +36098,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0252685546875, + "grad_norm": 0.02587890625, "learning_rate": 0.0007612162932990845, - "loss": 0.0063, + "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6126792.0, "repeat_count": 0.0, - "routers_loss": 0.001831608940847218, + "routers_loss": 0.001883238204754889, "skip_count": 0.0, "step": 3800, "text_loss": 0.3740062117576599 @@ -36117,13 +36117,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.03076171875, "learning_rate": 0.0007609523245391068, - "loss": 0.0078, + "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 6129801.0, "repeat_count": 0.0, - "routers_loss": 0.010433467105031013, + "routers_loss": 0.00882677361369133, "skip_count": 2.0, "step": 3802, "text_loss": 0.5759486556053162 @@ -36136,13 +36136,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037109375, + "grad_norm": 0.035400390625, "learning_rate": 0.0007606882557817062, - "loss": 0.0057, + "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 6133613.0, "repeat_count": 0.0, - "routers_loss": 0.009141471236944199, + "routers_loss": 0.009537030011415482, "skip_count": 2.0, "step": 3804, "text_loss": 0.3217554986476898 @@ -36155,13 +36155,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0235595703125, + "grad_norm": 0.0220947265625, "learning_rate": 0.0007604240871280742, - "loss": 0.0055, + "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 6137784.0, "repeat_count": 0.0, - "routers_loss": 0.0024337477516382933, + "routers_loss": 0.0023913346230983734, "skip_count": 0.0, "step": 3806, "text_loss": 0.3718445599079132 @@ -36174,13 +36174,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0390625, + "grad_norm": 0.037841796875, "learning_rate": 0.0007601598186794407, - "loss": 0.0083, + "loss": 0.0081, "macro_f1": 0.6603773832321167, "num_tokens": 6141356.0, "repeat_count": 1.0, - "routers_loss": 0.03635421022772789, + "routers_loss": 0.033796411007642746, "skip_count": 1.0, "step": 3808, "text_loss": 0.2717749774456024 @@ -36193,13 +36193,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037109375, + "grad_norm": 0.037841796875, "learning_rate": 0.000759895450537074, - "loss": 0.0101, + "loss": 0.01, "macro_f1": 0.6666666865348816, "num_tokens": 6144448.0, "repeat_count": 0.0, - "routers_loss": 0.002765925833955407, + "routers_loss": 0.0037919918540865183, "skip_count": 2.0, "step": 3810, "text_loss": 0.5935076475143433 @@ -36212,13 +36212,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03369140625, + "grad_norm": 0.03271484375, "learning_rate": 0.0007596309828022803, - "loss": 0.0072, + "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6147526.0, "repeat_count": 0.0, - "routers_loss": 0.0009747639996930957, + "routers_loss": 0.0008182782912626863, "skip_count": 0.0, "step": 3812, "text_loss": 0.449336439371109 @@ -36231,13 +36231,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.03125, "learning_rate": 0.0007593664155764044, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 6150620.0, "repeat_count": 1.0, - "routers_loss": 0.001395601429976523, + "routers_loss": 0.001734903547912836, "skip_count": 0.0, "step": 3814, "text_loss": 0.6647221446037292 @@ -36250,13 +36250,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.037353515625, "learning_rate": 0.0007591017489608286, - "loss": 0.0092, + "loss": 0.0088, "macro_f1": 0.3272727429866791, "num_tokens": 6153714.0, "repeat_count": 1.0, - "routers_loss": 0.048050083220005035, + "routers_loss": 0.04721754416823387, "skip_count": 0.0, "step": 3816, "text_loss": 0.25481200218200684 @@ -36269,13 +36269,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03662109375, + "grad_norm": 0.037841796875, "learning_rate": 0.0007588369830569738, - "loss": 0.0062, + "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6156974.0, "repeat_count": 0.0, - "routers_loss": 0.00022119733330328017, + "routers_loss": 0.0002484306460246444, "skip_count": 0.0, "step": 3818, "text_loss": 0.7195295691490173 @@ -36288,13 +36288,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02783203125, + "grad_norm": 0.031982421875, "learning_rate": 0.0007585721179662988, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 6159660.0, "repeat_count": 0.0, - "routers_loss": 0.005448841955512762, + "routers_loss": 0.0051363613456487656, "skip_count": 2.0, "step": 3820, "text_loss": 0.5073586702346802 @@ -36307,13 +36307,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0458984375, + "grad_norm": 0.052734375, "learning_rate": 0.0007583071537903005, - "loss": 0.0067, + "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 6163146.0, "repeat_count": 0.0, - "routers_loss": 0.007093957159668207, + "routers_loss": 0.006719176657497883, "skip_count": 0.0, "step": 3822, "text_loss": 0.6950558423995972 @@ -36326,13 +36326,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.025634765625, + "grad_norm": 0.0269775390625, "learning_rate": 0.0007580420906305136, - "loss": 0.007, + "loss": 0.0073, "macro_f1": 1.0, "num_tokens": 6166257.0, "repeat_count": 1.0, - "routers_loss": 0.008060536347329617, + "routers_loss": 0.00871267355978489, "skip_count": 3.0, "step": 3824, "text_loss": 0.2549148201942444 @@ -36345,13 +36345,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025146484375, + "grad_norm": 0.022705078125, "learning_rate": 0.0007577769285885109, - "loss": 0.004, + "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 6169624.0, "repeat_count": 0.0, - "routers_loss": 0.001302229124121368, + "routers_loss": 0.0015642556827515364, "skip_count": 0.0, "step": 3826, "text_loss": 0.3720305860042572 @@ -36364,13 +36364,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.038330078125, + "grad_norm": 0.039306640625, "learning_rate": 0.0007575116677659029, - "loss": 0.0076, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6172673.0, "repeat_count": 0.0, - "routers_loss": 0.0010101167717948556, + "routers_loss": 0.0011551049537956715, "skip_count": 0.0, "step": 3828, "text_loss": 0.6819429397583008 @@ -36383,13 +36383,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.040771484375, "learning_rate": 0.0007572463082643377, - "loss": 0.0083, + "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 6175414.0, "repeat_count": 0.0, - "routers_loss": 0.0009081853204406798, + "routers_loss": 0.0008922060951590538, "skip_count": 0.0, "step": 3830, "text_loss": 0.5424665212631226 @@ -36402,13 +36402,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03515625, + "grad_norm": 0.0341796875, "learning_rate": 0.0007569808501855023, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 6178701.0, "repeat_count": 0.0, - "routers_loss": 0.0040206871926784515, + "routers_loss": 0.004167596809566021, "skip_count": 1.0, "step": 3832, "text_loss": 0.4429764151573181 @@ -36421,13 +36421,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.04931640625, "learning_rate": 0.00075671529363112, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 6183036.0, "repeat_count": 0.0, - "routers_loss": 0.0009683453245088458, + "routers_loss": 0.0008732969872653484, "skip_count": 0.0, "step": 3834, "text_loss": 0.8015334010124207 @@ -36440,13 +36440,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0277099609375, + "grad_norm": 0.03271484375, "learning_rate": 0.0007564496387029531, - "loss": 0.0056, + "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 6186325.0, "repeat_count": 0.0, - "routers_loss": 0.0021183546632528305, + "routers_loss": 0.0021374202333390713, "skip_count": 1.0, "step": 3836, "text_loss": 0.4233771562576294 @@ -36459,13 +36459,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.03369140625, "learning_rate": 0.000756183885502801, - "loss": 0.0059, + "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 6189919.0, "repeat_count": 1.0, - "routers_loss": 0.0034987039398401976, + "routers_loss": 0.004017227329313755, "skip_count": 0.0, "step": 3838, "text_loss": 0.33691394329071045 @@ -36478,13 +36478,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.01953125, + "grad_norm": 0.018310546875, "learning_rate": 0.0007559180341325005, - "loss": 0.0048, + "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 6193412.0, "repeat_count": 0.0, - "routers_loss": 0.001348655903711915, + "routers_loss": 0.0013120946241542697, "skip_count": 0.0, "step": 3840, "text_loss": 0.14970099925994873 @@ -36497,13 +36497,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.029541015625, + "grad_norm": 0.031982421875, "learning_rate": 0.0007556520846939265, "loss": 0.0061, "macro_f1": 0.5492662787437439, "num_tokens": 6196588.0, "repeat_count": 0.0, - "routers_loss": 0.011758741922676563, + "routers_loss": 0.011793316341936588, "skip_count": 2.0, "step": 3842, "text_loss": 0.2714047133922577 @@ -36516,13 +36516,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.03466796875, + "grad_norm": 0.031494140625, "learning_rate": 0.0007553860372889914, - "loss": 0.0064, + "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 6200841.0, "repeat_count": 1.0, - "routers_loss": 0.022454025223851204, + "routers_loss": 0.019968654960393906, "skip_count": 4.0, "step": 3844, "text_loss": 0.23680976033210754 @@ -36535,13 +36535,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.049560546875, + "grad_norm": 0.052490234375, "learning_rate": 0.0007551198920196452, "loss": 0.0079, "macro_f1": 0.5492662787437439, "num_tokens": 6203797.0, "repeat_count": 0.0, - "routers_loss": 0.012088865973055363, + "routers_loss": 0.013615630567073822, "skip_count": 2.0, "step": 3846, "text_loss": 0.25839608907699585 @@ -36554,13 +36554,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.057373046875, + "grad_norm": 0.0546875, "learning_rate": 0.000754853648987875, - "loss": 0.0073, + "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 6206790.0, "repeat_count": 0.0, - "routers_loss": 0.0025066444650292397, + "routers_loss": 0.002420815173536539, "skip_count": 1.0, "step": 3848, "text_loss": 0.5358025431632996 @@ -36573,13 +36573,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.033447265625, + "grad_norm": 0.032470703125, "learning_rate": 0.0007545873082957057, - "loss": 0.0073, + "loss": 0.0072, "macro_f1": 0.9265305995941162, "num_tokens": 6209791.0, "repeat_count": 1.0, - "routers_loss": 0.01811581663787365, + "routers_loss": 0.018236197531223297, "skip_count": 3.0, "step": 3850, "text_loss": 0.1463700383901596 @@ -36592,13 +36592,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0390625, + "grad_norm": 0.034423828125, "learning_rate": 0.0007543208700451998, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 6212792.0, "repeat_count": 0.0, - "routers_loss": 0.005889591295272112, + "routers_loss": 0.006242573726922274, "skip_count": 3.0, "step": 3852, "text_loss": 0.9441591501235962 @@ -36611,13 +36611,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0299072265625, + "grad_norm": 0.031982421875, "learning_rate": 0.0007540543343384565, - "loss": 0.0064, + "loss": 0.0062, "macro_f1": 0.3272727429866791, "num_tokens": 6215747.0, "repeat_count": 0.0, - "routers_loss": 0.015324318781495094, + "routers_loss": 0.01451140083372593, "skip_count": 1.0, "step": 3854, "text_loss": 0.41610902547836304 @@ -36630,13 +36630,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.037841796875, "learning_rate": 0.0007537877012776132, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6218593.0, "repeat_count": 0.0, - "routers_loss": 0.0003138817264698446, + "routers_loss": 0.00037674361374229193, "skip_count": 0.0, "step": 3856, "text_loss": 0.6048852205276489 @@ -36649,13 +36649,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0269775390625, + "grad_norm": 0.0255126953125, "learning_rate": 0.0007535209709648439, - "loss": 0.0044, + "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 6221315.0, "repeat_count": 1.0, - "routers_loss": 0.006152884569019079, + "routers_loss": 0.005776284262537956, "skip_count": 3.0, "step": 3858, "text_loss": 0.35627537965774536 @@ -36668,13 +36668,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025634765625, + "grad_norm": 0.0245361328125, "learning_rate": 0.0007532541435023605, - "loss": 0.0048, + "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 6225012.0, "repeat_count": 0.0, - "routers_loss": 0.0009145989897660911, + "routers_loss": 0.0009280376834794879, "skip_count": 0.0, "step": 3860, "text_loss": 0.6440183520317078 @@ -36687,13 +36687,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025146484375, + "grad_norm": 0.0224609375, "learning_rate": 0.0007529872189924114, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 6227650.0, "repeat_count": 0.0, - "routers_loss": 0.0010246031451970339, + "routers_loss": 0.0009876530384644866, "skip_count": 0.0, "step": 3862, "text_loss": 0.35507893562316895 @@ -36706,13 +36706,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.048828125, "learning_rate": 0.0007527201975372827, - "loss": 0.0046, + "loss": 0.0045, "macro_f1": 0.6603773832321167, "num_tokens": 6230557.0, "repeat_count": 1.0, - "routers_loss": 0.011913667432963848, + "routers_loss": 0.013780162669718266, "skip_count": 1.0, "step": 3864, "text_loss": 0.38958442211151123 @@ -36725,13 +36725,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04345703125, + "grad_norm": 0.04638671875, "learning_rate": 0.0007524530792392977, - "loss": 0.0111, + "loss": 0.011, "macro_f1": 0.6666666865348816, "num_tokens": 6233371.0, "repeat_count": 0.0, - "routers_loss": 0.0050127157010138035, + "routers_loss": 0.004849869292229414, "skip_count": 3.0, "step": 3866, "text_loss": 0.3826720714569092 @@ -36744,13 +36744,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0228271484375, + "grad_norm": 0.0191650390625, "learning_rate": 0.0007521858642008163, - "loss": 0.0073, + "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 6236770.0, "repeat_count": 0.0, - "routers_loss": 0.008781078271567822, + "routers_loss": 0.008618295192718506, "skip_count": 1.0, "step": 3868, "text_loss": 0.3596078157424927 @@ -36763,13 +36763,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03076171875, + "grad_norm": 0.029052734375, "learning_rate": 0.0007519185525242363, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 6239661.0, "repeat_count": 0.0, - "routers_loss": 0.0014061459805816412, + "routers_loss": 0.0013421972980722785, "skip_count": 0.0, "step": 3870, "text_loss": 0.5585550665855408 @@ -36782,13 +36782,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.027099609375, + "grad_norm": 0.026611328125, "learning_rate": 0.0007516511443119916, - "loss": 0.0056, + "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 6242459.0, "repeat_count": 0.0, - "routers_loss": 0.0031452353578060865, + "routers_loss": 0.0038009448908269405, "skip_count": 1.0, "step": 3872, "text_loss": 0.4418395757675171 @@ -36801,13 +36801,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.030517578125, + "grad_norm": 0.031982421875, "learning_rate": 0.0007513836396665534, "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 6245489.0, "repeat_count": 1.0, - "routers_loss": 0.0028979210183024406, + "routers_loss": 0.002785376040264964, "skip_count": 2.0, "step": 3874, "text_loss": 0.551510751247406 @@ -36820,13 +36820,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02294921875, + "grad_norm": 0.0234375, "learning_rate": 0.0007511160386904305, - "loss": 0.0051, + "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6249014.0, "repeat_count": 0.0, - "routers_loss": 0.0021069799549877644, + "routers_loss": 0.0021424589212983847, "skip_count": 1.0, "step": 3876, "text_loss": 1.0502676963806152 @@ -36839,13 +36839,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.034423828125, "learning_rate": 0.0007508483414861679, - "loss": 0.0083, + "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 6252357.0, "repeat_count": 0.0, - "routers_loss": 0.0073753902688622475, + "routers_loss": 0.0085759861394763, "skip_count": 1.0, "step": 3878, "text_loss": 0.49212515354156494 @@ -36858,13 +36858,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0390625, + "grad_norm": 0.0361328125, "learning_rate": 0.0007505805481563477, - "loss": 0.0094, + "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6254975.0, "repeat_count": 0.0, - "routers_loss": 0.0010532810119912028, + "routers_loss": 0.0010723904706537724, "skip_count": 0.0, "step": 3880, "text_loss": 0.7022985816001892 @@ -36877,13 +36877,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.05078125, "learning_rate": 0.0007503126588035887, - "loss": 0.0086, + "loss": 0.0081, "macro_f1": 1.0, "num_tokens": 6258001.0, "repeat_count": 1.0, - "routers_loss": 0.012617395259439945, + "routers_loss": 0.012809890322387218, "skip_count": 2.0, "step": 3882, "text_loss": 0.1829151213169098 @@ -36896,13 +36896,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.038818359375, + "grad_norm": 0.0439453125, "learning_rate": 0.0007500446735305466, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 6261795.0, "repeat_count": 0.0, - "routers_loss": 0.002872605575248599, + "routers_loss": 0.0026790346018970013, "skip_count": 1.0, "step": 3884, "text_loss": 0.20436066389083862 @@ -36915,13 +36915,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.02978515625, + "grad_norm": 0.035888671875, "learning_rate": 0.000749776592439914, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 6265585.0, "repeat_count": 1.0, - "routers_loss": 0.0047233253717422485, + "routers_loss": 0.005243788007646799, "skip_count": 2.0, "step": 3886, "text_loss": 0.4479229748249054 @@ -36934,13 +36934,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02294921875, + "grad_norm": 0.024658203125, "learning_rate": 0.00074950841563442, - "loss": 0.0052, + "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 6269039.0, "repeat_count": 0.0, - "routers_loss": 0.007303252816200256, + "routers_loss": 0.007998534478247166, "skip_count": 1.0, "step": 3888, "text_loss": 0.2154676914215088 @@ -36953,13 +36953,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0242919921875, + "grad_norm": 0.0238037109375, "learning_rate": 0.0007492401432168303, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 6272315.0, "repeat_count": 0.0, - "routers_loss": 0.005679785739630461, + "routers_loss": 0.004648822825402021, "skip_count": 1.0, "step": 3890, "text_loss": 0.3375042676925659 @@ -36972,13 +36972,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.045654296875, "learning_rate": 0.0007489717752899477, - "loss": 0.0097, + "loss": 0.0094, "macro_f1": 0.3272727429866791, "num_tokens": 6275342.0, "repeat_count": 0.0, - "routers_loss": 0.013875136151909828, + "routers_loss": 0.012154200114309788, "skip_count": 1.0, "step": 3892, "text_loss": 0.1964082419872284 @@ -36991,13 +36991,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0247802734375, + "grad_norm": 0.0267333984375, "learning_rate": 0.000748703311956611, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 6278700.0, "repeat_count": 1.0, - "routers_loss": 0.004874289035797119, + "routers_loss": 0.004610476549714804, "skip_count": 2.0, "step": 3894, "text_loss": 0.26545581221580505 @@ -37010,13 +37010,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06494140625, + "grad_norm": 0.06201171875, "learning_rate": 0.0007484347533196961, "loss": 0.0105, "macro_f1": 0.6666666865348816, "num_tokens": 6281864.0, "repeat_count": 0.0, - "routers_loss": 0.008282547816634178, + "routers_loss": 0.0075586591847240925, "skip_count": 2.0, "step": 3896, "text_loss": 0.3106999397277832 @@ -37029,13 +37029,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0208740234375, + "grad_norm": 0.02099609375, "learning_rate": 0.0007481660994821151, - "loss": 0.007, + "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6284676.0, "repeat_count": 0.0, - "routers_loss": 0.00792533066123724, + "routers_loss": 0.007845268584787846, "skip_count": 1.0, "step": 3898, "text_loss": 0.4094304144382477 @@ -37048,13 +37048,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.04052734375, "learning_rate": 0.0007478973505468165, - "loss": 0.0086, + "loss": 0.0081, "macro_f1": 1.0, "num_tokens": 6287470.0, "repeat_count": 1.0, - "routers_loss": 0.012142898514866829, + "routers_loss": 0.011116391979157925, "skip_count": 2.0, "step": 3900, "text_loss": 0.1838909536600113 @@ -37067,13 +37067,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.03515625, + "grad_norm": 0.0361328125, "learning_rate": 0.0007476285066167857, - "loss": 0.0062, + "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 6290432.0, "repeat_count": 1.0, - "routers_loss": 0.004634121898561716, + "routers_loss": 0.004599364474415779, "skip_count": 0.0, "step": 3902, "text_loss": 0.25872838497161865 @@ -37086,13 +37086,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0419921875, + "grad_norm": 0.046142578125, "learning_rate": 0.0007473595677950439, "loss": 0.0109, "macro_f1": 0.6666666865348816, "num_tokens": 6293557.0, "repeat_count": 0.0, - "routers_loss": 0.001632143510505557, + "routers_loss": 0.0016367282951250672, "skip_count": 1.0, "step": 3904, "text_loss": 0.5272360444068909 @@ -37105,13 +37105,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.032470703125, "learning_rate": 0.0007470905341846492, - "loss": 0.0053, + "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 6295979.0, "repeat_count": 0.0, - "routers_loss": 0.0004961033118888736, + "routers_loss": 0.0004760588926728815, "skip_count": 0.0, "step": 3906, "text_loss": 0.666959822177887 @@ -37124,13 +37124,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037109375, + "grad_norm": 0.035400390625, "learning_rate": 0.0007468214058886956, - "loss": 0.0074, + "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 6299215.0, "repeat_count": 0.0, - "routers_loss": 0.0007425977964885533, + "routers_loss": 0.000524883100297302, "skip_count": 0.0, "step": 3908, "text_loss": 0.5144801139831543 @@ -37143,13 +37143,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037109375, + "grad_norm": 0.031982421875, "learning_rate": 0.0007465521830103137, - "loss": 0.0081, + "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6302320.0, "repeat_count": 0.0, - "routers_loss": 0.0015668199630454183, + "routers_loss": 0.0016085522947832942, "skip_count": 0.0, "step": 3910, "text_loss": 0.14342890679836273 @@ -37162,13 +37162,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037353515625, + "grad_norm": 0.03857421875, "learning_rate": 0.0007462828656526702, - "loss": 0.0065, + "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6305212.0, "repeat_count": 0.0, - "routers_loss": 0.003138904692605138, + "routers_loss": 0.002720315707847476, "skip_count": 2.0, "step": 3912, "text_loss": 0.31109121441841125 @@ -37181,13 +37181,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.052001953125, + "grad_norm": 0.06884765625, "learning_rate": 0.0007460134539189681, - "loss": 0.0117, + "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 6308964.0, "repeat_count": 0.0, - "routers_loss": 0.0012123063206672668, + "routers_loss": 0.0010418406454846263, "skip_count": 1.0, "step": 3914, "text_loss": 0.5662030577659607 @@ -37200,13 +37200,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.047119140625, + "grad_norm": 0.052001953125, "learning_rate": 0.0007457439479124459, "loss": 0.0134, "macro_f1": 0.3333333432674408, "num_tokens": 6313195.0, "repeat_count": 0.0, - "routers_loss": 0.0017939694225788116, + "routers_loss": 0.0020303844939917326, "skip_count": 0.0, "step": 3916, "text_loss": 0.6358339190483093 @@ -37219,13 +37219,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0274658203125, + "grad_norm": 0.0289306640625, "learning_rate": 0.0007454743477363797, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 6315949.0, "repeat_count": 0.0, - "routers_loss": 0.0006735047209076583, + "routers_loss": 0.0006592223653569818, "skip_count": 0.0, "step": 3918, "text_loss": 0.35648423433303833 @@ -37238,13 +37238,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.027099609375, + "grad_norm": 0.0262451171875, "learning_rate": 0.0007452046534940803, - "loss": 0.0078, + "loss": 0.0075, "macro_f1": 0.6603773832321167, "num_tokens": 6319024.0, "repeat_count": 1.0, - "routers_loss": 0.025279851630330086, + "routers_loss": 0.024555351585149765, "skip_count": 1.0, "step": 3920, "text_loss": 0.21955153346061707 @@ -37257,13 +37257,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033203125, + "grad_norm": 0.035888671875, "learning_rate": 0.0007449348652888952, - "loss": 0.007, + "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6321633.0, "repeat_count": 0.0, - "routers_loss": 0.002887458074837923, + "routers_loss": 0.003606822807341814, "skip_count": 1.0, "step": 3922, "text_loss": 0.6079489588737488 @@ -37276,13 +37276,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.037841796875, "learning_rate": 0.0007446649832242075, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 6325209.0, "repeat_count": 0.0, - "routers_loss": 0.0034941197372972965, + "routers_loss": 0.0035831446293741465, "skip_count": 1.0, "step": 3924, "text_loss": 0.2774808406829834 @@ -37295,13 +37295,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03173828125, + "grad_norm": 0.0311279296875, "learning_rate": 0.0007443950074034368, - "loss": 0.0067, + "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6327822.0, "repeat_count": 0.0, - "routers_loss": 0.006862608715891838, + "routers_loss": 0.006809544749557972, "skip_count": 2.0, "step": 3926, "text_loss": 0.48236769437789917 @@ -37314,13 +37314,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.049072265625, "learning_rate": 0.0007441249379300381, - "loss": 0.0072, + "loss": 0.007, "macro_f1": 0.6601307392120361, "num_tokens": 6331662.0, "repeat_count": 1.0, - "routers_loss": 0.02176409214735031, + "routers_loss": 0.023832591250538826, "skip_count": 2.0, "step": 3928, "text_loss": 0.7287537455558777 @@ -37333,13 +37333,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.04296875, "learning_rate": 0.0007438547749075028, - "loss": 0.0064, + "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 6335801.0, "repeat_count": 1.0, - "routers_loss": 0.013603253290057182, + "routers_loss": 0.011755098588764668, "skip_count": 3.0, "step": 3930, "text_loss": 0.17253030836582184 @@ -37352,13 +37352,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0267333984375, + "grad_norm": 0.02685546875, "learning_rate": 0.0007435845184393577, - "loss": 0.0052, + "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6338747.0, "repeat_count": 1.0, - "routers_loss": 0.006635789293795824, + "routers_loss": 0.005972472485154867, "skip_count": 0.0, "step": 3932, "text_loss": 0.6400216817855835 @@ -37371,13 +37371,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0361328125, + "grad_norm": 0.033447265625, "learning_rate": 0.0007433141686291657, - "loss": 0.0077, + "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 6342772.0, "repeat_count": 0.0, - "routers_loss": 0.0032724342308938503, + "routers_loss": 0.0030393085908144712, "skip_count": 1.0, "step": 3934, "text_loss": 0.6865074038505554 @@ -37390,13 +37390,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0213623046875, + "grad_norm": 0.020263671875, "learning_rate": 0.0007430437255805252, - "loss": 0.007, + "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6345957.0, "repeat_count": 0.0, - "routers_loss": 0.0007380369352176785, + "routers_loss": 0.0006984061910770833, "skip_count": 0.0, "step": 3936, "text_loss": 0.40398702025413513 @@ -37409,13 +37409,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.078125, + "grad_norm": 0.07275390625, "learning_rate": 0.0007427731893970706, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 6349162.0, "repeat_count": 1.0, - "routers_loss": 0.004635625518858433, + "routers_loss": 0.005219762213528156, "skip_count": 0.0, "step": 3938, "text_loss": 0.5951031446456909 @@ -37428,13 +37428,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.043701171875, + "grad_norm": 0.04541015625, "learning_rate": 0.0007425025601824717, - "loss": 0.0085, + "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 6352655.0, "repeat_count": 0.0, - "routers_loss": 0.014994140714406967, + "routers_loss": 0.015575960278511047, "skip_count": 3.0, "step": 3940, "text_loss": 0.26689088344573975 @@ -37447,13 +37447,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031494140625, + "grad_norm": 0.03662109375, "learning_rate": 0.0007422318380404346, - "loss": 0.0067, + "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6355890.0, "repeat_count": 0.0, - "routers_loss": 0.0011694672284647822, + "routers_loss": 0.0012208883417770267, "skip_count": 0.0, "step": 3942, "text_loss": 0.570725679397583 @@ -37466,13 +37466,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.02587890625, + "grad_norm": 0.0235595703125, "learning_rate": 0.0007419610230746999, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6358891.0, "repeat_count": 1.0, - "routers_loss": 0.003442608518525958, + "routers_loss": 0.0029412026051431894, "skip_count": 0.0, "step": 3944, "text_loss": 0.5521301031112671 @@ -37485,13 +37485,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.033447265625, "learning_rate": 0.0007416901153890448, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6361586.0, "repeat_count": 0.0, - "routers_loss": 0.0009970148093998432, + "routers_loss": 0.0010283910669386387, "skip_count": 0.0, "step": 3946, "text_loss": 0.4046417772769928 @@ -37504,13 +37504,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.043212890625, + "grad_norm": 0.03955078125, "learning_rate": 0.0007414191150872818, - "loss": 0.0078, + "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 6364954.0, "repeat_count": 0.0, - "routers_loss": 0.009517154656350613, + "routers_loss": 0.008222512900829315, "skip_count": 2.0, "step": 3948, "text_loss": 0.2803446352481842 @@ -37523,13 +37523,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.030029296875, + "grad_norm": 0.03564453125, "learning_rate": 0.0007411480222732583, - "loss": 0.0091, + "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6367660.0, "repeat_count": 0.0, - "routers_loss": 0.0012908667558804154, + "routers_loss": 0.001304348581470549, "skip_count": 0.0, "step": 3950, "text_loss": 0.45553359389305115 @@ -37542,13 +37542,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03564453125, + "grad_norm": 0.03759765625, "learning_rate": 0.0007408768370508576, - "loss": 0.0076, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6371585.0, "repeat_count": 0.0, - "routers_loss": 0.0015499353175982833, + "routers_loss": 0.0016345062758773565, "skip_count": 0.0, "step": 3952, "text_loss": 0.25424402952194214 @@ -37561,13 +37561,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.036865234375, "learning_rate": 0.0007406055595239986, - "loss": 0.007, + "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6374365.0, "repeat_count": 0.0, - "routers_loss": 0.0005612325621768832, + "routers_loss": 0.0005097290268167853, "skip_count": 0.0, "step": 3954, "text_loss": 0.5856026411056519 @@ -37580,13 +37580,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07373046875, + "grad_norm": 0.060546875, "learning_rate": 0.0007403341897966356, - "loss": 0.0063, + "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 6377335.0, "repeat_count": 0.0, - "routers_loss": 0.0024961072485893965, + "routers_loss": 0.002482263371348381, "skip_count": 1.0, "step": 3956, "text_loss": 0.5145615339279175 @@ -37599,32 +37599,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0230712890625, + "grad_norm": 0.0245361328125, "learning_rate": 0.0007400627279727574, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 6380799.0, "repeat_count": 0.0, - "routers_loss": 0.0013171056052669883, + "routers_loss": 0.0011743451468646526, "skip_count": 0.0, "step": 3958, "text_loss": 0.31868961453437805 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 18.591722923393014, - "f1_execute": 0.9818181991577148, - "f1_repeat": 0.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.027099609375, + "grad_norm": 0.0286865234375, "learning_rate": 0.0007397911741563892, - "loss": 0.0054, - "macro_f1": 0.3272727429866791, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, "num_tokens": 6383963.0, "repeat_count": 1.0, - "routers_loss": 0.012845510616898537, + "routers_loss": 0.009861881844699383, "skip_count": 0.0, "step": 3960, "text_loss": 0.21192194521427155 @@ -37637,13 +37637,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0390625, + "grad_norm": 0.0380859375, "learning_rate": 0.0007395195284515905, - "loss": 0.0099, + "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 6387410.0, "repeat_count": 1.0, - "routers_loss": 0.003112874459475279, + "routers_loss": 0.004189098719507456, "skip_count": 0.0, "step": 3962, "text_loss": 0.5809708833694458 @@ -37656,13 +37656,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039794921875, + "grad_norm": 0.036376953125, "learning_rate": 0.0007392477909624567, - "loss": 0.0058, + "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 6390670.0, "repeat_count": 0.0, - "routers_loss": 0.0019742189906537533, + "routers_loss": 0.001853612600825727, "skip_count": 0.0, "step": 3964, "text_loss": 0.48985618352890015 @@ -37675,13 +37675,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.028076171875, + "grad_norm": 0.0308837890625, "learning_rate": 0.0007389759617931182, - "loss": 0.0066, + "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 6393609.0, "repeat_count": 1.0, - "routers_loss": 0.003850853070616722, + "routers_loss": 0.003303771372884512, "skip_count": 0.0, "step": 3966, "text_loss": 0.28729453682899475 @@ -37694,13 +37694,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.0634765625, + "grad_norm": 0.10595703125, "learning_rate": 0.0007387040410477404, - "loss": 0.0057, + "loss": 0.0058, "macro_f1": 0.9452888369560242, "num_tokens": 6396608.0, "repeat_count": 1.0, - "routers_loss": 0.020281648263335228, + "routers_loss": 0.01791577786207199, "skip_count": 4.0, "step": 3968, "text_loss": 0.30386820435523987 @@ -37713,13 +37713,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0284423828125, + "grad_norm": 0.029541015625, "learning_rate": 0.0007384320288305235, - "loss": 0.0093, + "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 6399793.0, "repeat_count": 0.0, - "routers_loss": 0.0005419629742391407, + "routers_loss": 0.0005771282012574375, "skip_count": 0.0, "step": 3970, "text_loss": 0.47285011410713196 @@ -37732,13 +37732,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0291748046875, + "grad_norm": 0.032958984375, "learning_rate": 0.0007381599252457037, - "loss": 0.0061, + "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6403365.0, "repeat_count": 0.0, - "routers_loss": 0.003040255280211568, + "routers_loss": 0.003010645741596818, "skip_count": 0.0, "step": 3972, "text_loss": 0.5313063859939575 @@ -37751,32 +37751,32 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.038818359375, "learning_rate": 0.000737887730397551, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 6406205.0, "repeat_count": 1.0, - "routers_loss": 0.006762589327991009, + "routers_loss": 0.006457438692450523, "skip_count": 0.0, "step": 3974, "text_loss": 0.2323843240737915 }, { - "acc_repeat": 0.0, + "acc_repeat": 1.0, "acc_skip": 0.0, - "avg_layers": 28.0, + "avg_layers": 29.0, "epoch": 18.666862342236573, - "f1_execute": 0.9818181991577148, - "f1_repeat": 0.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.038818359375, + "grad_norm": 0.036865234375, "learning_rate": 0.0007376154443903713, - "loss": 0.0086, - "macro_f1": 0.3272727429866791, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, "num_tokens": 6409552.0, "repeat_count": 1.0, - "routers_loss": 0.01173968706279993, + "routers_loss": 0.010693981312215328, "skip_count": 0.0, "step": 3976, "text_loss": 0.6304101943969727 @@ -37789,13 +37789,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.03662109375, "learning_rate": 0.0007373430673285051, "loss": 0.008, "macro_f1": 0.3272727429866791, "num_tokens": 6412386.0, "repeat_count": 1.0, - "routers_loss": 0.028297962620854378, + "routers_loss": 0.03116440214216709, "skip_count": 0.0, "step": 3978, "text_loss": 0.23448467254638672 @@ -37808,13 +37808,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.08447265625, + "grad_norm": 0.10009765625, "learning_rate": 0.0007370705993163278, - "loss": 0.011, + "loss": 0.0111, "macro_f1": 0.3272727429866791, "num_tokens": 6416054.0, "repeat_count": 1.0, - "routers_loss": 0.010761309415102005, + "routers_loss": 0.011973714455962181, "skip_count": 0.0, "step": 3980, "text_loss": 0.6371755599975586 @@ -37827,13 +37827,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0458984375, + "grad_norm": 0.05224609375, "learning_rate": 0.0007367980404582497, "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 6419238.0, "repeat_count": 1.0, - "routers_loss": 0.0057355971075594425, + "routers_loss": 0.005117347463965416, "skip_count": 2.0, "step": 3982, "text_loss": 0.19822923839092255 @@ -37846,13 +37846,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0267333984375, + "grad_norm": 0.0296630859375, "learning_rate": 0.0007365253908587158, - "loss": 0.005, + "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 6422122.0, "repeat_count": 0.0, - "routers_loss": 0.0011142889270558953, + "routers_loss": 0.0010648667812347412, "skip_count": 0.0, "step": 3984, "text_loss": 0.566700279712677 @@ -37865,13 +37865,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0263671875, + "grad_norm": 0.025146484375, "learning_rate": 0.0007362526506222058, - "loss": 0.0045, + "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 6425313.0, "repeat_count": 0.0, - "routers_loss": 0.005405326373875141, + "routers_loss": 0.005726494826376438, "skip_count": 0.0, "step": 3986, "text_loss": 0.6568437814712524 @@ -37884,13 +37884,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0289306640625, + "grad_norm": 0.0341796875, "learning_rate": 0.0007359798198532343, - "loss": 0.0043, + "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 6428422.0, "repeat_count": 1.0, - "routers_loss": 0.005449058022350073, + "routers_loss": 0.004504100419580936, "skip_count": 0.0, "step": 3988, "text_loss": 0.598754346370697 @@ -37903,13 +37903,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.0306396484375, "learning_rate": 0.0007357068986563509, - "loss": 0.0083, + "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 6431512.0, "repeat_count": 0.0, - "routers_loss": 0.0020256424322724342, + "routers_loss": 0.0019837068393826485, "skip_count": 1.0, "step": 3990, "text_loss": 0.7152895927429199 @@ -37922,13 +37922,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.032470703125, "learning_rate": 0.0007354338871361393, - "loss": 0.0084, + "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 6434358.0, "repeat_count": 0.0, - "routers_loss": 0.0027240889612585306, + "routers_loss": 0.0026031541638076305, "skip_count": 1.0, "step": 3992, "text_loss": 0.4986513555049896 @@ -37941,13 +37941,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.039794921875, + "grad_norm": 0.039306640625, "learning_rate": 0.000735160785397218, - "loss": 0.0061, + "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 6438175.0, "repeat_count": 0.0, - "routers_loss": 0.0026689881924539804, + "routers_loss": 0.0024831905029714108, "skip_count": 2.0, "step": 3994, "text_loss": 0.4406205713748932 @@ -37960,13 +37960,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.035400390625, "learning_rate": 0.0007348875935442401, - "loss": 0.0067, + "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6441228.0, "repeat_count": 0.0, - "routers_loss": 0.0010014307918027043, + "routers_loss": 0.0008635876583866775, "skip_count": 0.0, "step": 3996, "text_loss": 0.48884135484695435 @@ -37979,13 +37979,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.040283203125, + "grad_norm": 0.03271484375, "learning_rate": 0.0007346143116818932, - "loss": 0.0046, + "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 6444318.0, "repeat_count": 0.0, - "routers_loss": 0.004282998852431774, + "routers_loss": 0.004007008858025074, "skip_count": 0.0, "step": 3998, "text_loss": 0.6669428944587708 @@ -37998,13 +37998,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.08203125, "learning_rate": 0.0007343409399148994, - "loss": 0.0092, + "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6448317.0, "repeat_count": 0.0, - "routers_loss": 0.0031171543523669243, + "routers_loss": 0.0031380734872072935, "skip_count": 0.0, "step": 4000, "text_loss": 0.6468493938446045 @@ -38017,13 +38017,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025634765625, + "grad_norm": 0.02392578125, "learning_rate": 0.0007340674783480154, - "loss": 0.0077, + "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 6451673.0, "repeat_count": 0.0, - "routers_loss": 0.005329967010766268, + "routers_loss": 0.004996029660105705, "skip_count": 0.0, "step": 4002, "text_loss": 0.28135430812835693 @@ -38036,13 +38036,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03857421875, + "grad_norm": 0.037841796875, "learning_rate": 0.0007337939270860323, - "loss": 0.0091, + "loss": 0.009, "macro_f1": 0.3272727429866791, "num_tokens": 6456372.0, "repeat_count": 1.0, - "routers_loss": 0.038046106696128845, + "routers_loss": 0.03784399852156639, "skip_count": 0.0, "step": 4004, "text_loss": 0.41668644547462463 @@ -38055,32 +38055,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.040283203125, "learning_rate": 0.0007335202862337753, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6459047.0, "repeat_count": 0.0, - "routers_loss": 0.0013881187187507749, + "routers_loss": 0.0011750755365937948, "skip_count": 0.0, "step": 4006, "text_loss": 0.6853910684585571 }, { "acc_repeat": 1.0, - "acc_skip": 1.0, - "avg_layers": 25.0, + "acc_skip": 0.75, + "avg_layers": 26.0, "epoch": 18.817141179923688, - "f1_execute": 1.0, + "f1_execute": 0.978723406791687, "f1_repeat": 1.0, - "f1_skip": 1.0, - "grad_norm": 0.044189453125, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.05908203125, "learning_rate": 0.000733246555896104, - "loss": 0.0059, - "macro_f1": 1.0, + "loss": 0.0062, + "macro_f1": 0.9452888369560242, "num_tokens": 6462390.0, "repeat_count": 1.0, - "routers_loss": 0.01348043605685234, + "routers_loss": 0.01630394533276558, "skip_count": 4.0, "step": 4008, "text_loss": 0.7110592126846313 @@ -38093,13 +38093,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.049560546875, "learning_rate": 0.0007329727361779124, - "loss": 0.0073, + "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 6466057.0, "repeat_count": 0.0, - "routers_loss": 0.0051529803313314915, + "routers_loss": 0.0052404399029910564, "skip_count": 2.0, "step": 4010, "text_loss": 0.13856995105743408 @@ -38112,13 +38112,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037109375, + "grad_norm": 0.03759765625, "learning_rate": 0.000732698827184129, - "loss": 0.0058, + "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 6468878.0, "repeat_count": 0.0, - "routers_loss": 0.002958883298560977, + "routers_loss": 0.002138581359758973, "skip_count": 0.0, "step": 4012, "text_loss": 0.3999565839767456 @@ -38131,13 +38131,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.036376953125, "learning_rate": 0.000732424829019716, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 6472364.0, "repeat_count": 0.0, - "routers_loss": 0.0038471813313663006, + "routers_loss": 0.0037466560024768114, "skip_count": 0.0, "step": 4014, "text_loss": 0.28161346912384033 @@ -38150,13 +38150,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0286865234375, + "grad_norm": 0.0306396484375, "learning_rate": 0.0007321507417896699, - "loss": 0.0087, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 6475379.0, "repeat_count": 0.0, - "routers_loss": 0.0010916640749201179, + "routers_loss": 0.0010469373082742095, "skip_count": 0.0, "step": 4016, "text_loss": 1.0490952730178833 @@ -38169,13 +38169,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.042236328125, + "grad_norm": 0.06591796875, "learning_rate": 0.0007318765655990218, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 6478585.0, "repeat_count": 0.0, - "routers_loss": 0.00946822389960289, + "routers_loss": 0.009968385100364685, "skip_count": 2.0, "step": 4018, "text_loss": 0.31696680188179016 @@ -38188,13 +38188,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.024169921875, + "grad_norm": 0.0240478515625, "learning_rate": 0.0007316023005528362, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 6484153.0, "repeat_count": 0.0, - "routers_loss": 0.0027165759820491076, + "routers_loss": 0.002349073765799403, "skip_count": 1.0, "step": 4020, "text_loss": 0.30981555581092834 @@ -38207,13 +38207,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.02880859375, + "grad_norm": 0.0299072265625, "learning_rate": 0.0007313279467562124, - "loss": 0.0051, + "loss": 0.0053, "macro_f1": 0.9452888369560242, "num_tokens": 6487029.0, "repeat_count": 1.0, - "routers_loss": 0.012701411731541157, + "routers_loss": 0.011854278855025768, "skip_count": 4.0, "step": 4022, "text_loss": 0.9689550399780273 @@ -38226,13 +38226,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.050537109375, + "grad_norm": 0.04541015625, "learning_rate": 0.0007310535043142829, - "loss": 0.0079, + "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 6490315.0, "repeat_count": 1.0, - "routers_loss": 0.010197490453720093, + "routers_loss": 0.00908346101641655, "skip_count": 3.0, "step": 4024, "text_loss": 0.1705625057220459 @@ -38245,13 +38245,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0400390625, + "grad_norm": 0.039306640625, "learning_rate": 0.0007307789733322146, - "loss": 0.0097, + "loss": 0.0094, "macro_f1": 0.3333333432674408, "num_tokens": 6493921.0, "repeat_count": 0.0, - "routers_loss": 0.0008188873762264848, + "routers_loss": 0.0007360641611739993, "skip_count": 0.0, "step": 4026, "text_loss": 0.6252996325492859 @@ -38264,13 +38264,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06689453125, + "grad_norm": 0.087890625, "learning_rate": 0.0007305043539152083, - "loss": 0.007, + "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6496689.0, "repeat_count": 0.0, - "routers_loss": 0.0018946458585560322, + "routers_loss": 0.0017757206223905087, "skip_count": 0.0, "step": 4028, "text_loss": 0.40533265471458435 @@ -38283,13 +38283,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.03271484375, "learning_rate": 0.000730229646168499, - "loss": 0.0078, + "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 6500090.0, "repeat_count": 0.0, - "routers_loss": 0.0023306645452976227, + "routers_loss": 0.0022657213266938925, "skip_count": 0.0, "step": 4030, "text_loss": 0.25954708456993103 @@ -38302,13 +38302,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.039794921875, "learning_rate": 0.0007299548501973548, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 6503023.0, "repeat_count": 0.0, - "routers_loss": 0.002005136338993907, + "routers_loss": 0.0021747269202023745, "skip_count": 0.0, "step": 4032, "text_loss": 0.6223418712615967 @@ -38321,13 +38321,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.0390625, "learning_rate": 0.0007296799661070782, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 6506382.0, "repeat_count": 0.0, - "routers_loss": 0.00668578315526247, + "routers_loss": 0.006400502752512693, "skip_count": 4.0, "step": 4034, "text_loss": 0.6873653531074524 @@ -38340,13 +38340,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0281982421875, + "grad_norm": 0.028076171875, "learning_rate": 0.0007294049940030055, - "loss": 0.0066, + "loss": 0.0065, "macro_f1": 0.3272727429866791, "num_tokens": 6509194.0, "repeat_count": 0.0, - "routers_loss": 0.021298008039593697, + "routers_loss": 0.0197185929864645, "skip_count": 1.0, "step": 4036, "text_loss": 0.16156800091266632 @@ -38359,13 +38359,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0458984375, + "grad_norm": 0.04345703125, "learning_rate": 0.0007291299339905059, - "loss": 0.0075, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6512271.0, "repeat_count": 0.0, - "routers_loss": 0.001004312071017921, + "routers_loss": 0.0009541353792883456, "skip_count": 0.0, "step": 4038, "text_loss": 0.5038442015647888 @@ -38378,13 +38378,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.039794921875, + "grad_norm": 0.036376953125, "learning_rate": 0.0007288547861749838, - "loss": 0.0065, + "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 6516403.0, "repeat_count": 0.0, - "routers_loss": 0.007993367500603199, + "routers_loss": 0.008226391859352589, "skip_count": 2.0, "step": 4040, "text_loss": 0.3706657588481903 @@ -38397,13 +38397,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03515625, + "grad_norm": 0.0380859375, "learning_rate": 0.0007285795506618758, - "loss": 0.0062, + "loss": 0.0063, "macro_f1": 0.3272727429866791, "num_tokens": 6519310.0, "repeat_count": 0.0, - "routers_loss": 0.015058980323374271, + "routers_loss": 0.017001887783408165, "skip_count": 1.0, "step": 4042, "text_loss": 0.24296723306179047 @@ -38416,13 +38416,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.054443359375, + "grad_norm": 0.0615234375, "learning_rate": 0.0007283042275566528, "loss": 0.0125, "macro_f1": 0.6666666865348816, "num_tokens": 6521979.0, "repeat_count": 0.0, - "routers_loss": 0.016352638602256775, + "routers_loss": 0.01666323095560074, "skip_count": 2.0, "step": 4044, "text_loss": 0.36904850602149963 @@ -38435,13 +38435,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0201416015625, + "grad_norm": 0.019775390625, "learning_rate": 0.0007280288169648192, - "loss": 0.0044, + "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 6524976.0, "repeat_count": 0.0, - "routers_loss": 0.0008094423683360219, + "routers_loss": 0.0007593175978399813, "skip_count": 0.0, "step": 4046, "text_loss": 0.7312731146812439 @@ -38454,13 +38454,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.033203125, + "grad_norm": 0.0390625, "learning_rate": 0.0007277533189919127, - "loss": 0.0061, + "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 6528638.0, "repeat_count": 1.0, - "routers_loss": 0.005490938201546669, + "routers_loss": 0.005652119871228933, "skip_count": 1.0, "step": 4048, "text_loss": 0.23326151072978973 @@ -38473,13 +38473,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.026123046875, + "grad_norm": 0.0286865234375, "learning_rate": 0.0007274777337435046, - "loss": 0.0055, + "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6532193.0, "repeat_count": 0.0, - "routers_loss": 0.009560001082718372, + "routers_loss": 0.010509157553315163, "skip_count": 2.0, "step": 4050, "text_loss": 0.23918013274669647 @@ -38492,13 +38492,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.037841796875, "learning_rate": 0.0007272020613251999, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 6534994.0, "repeat_count": 0.0, - "routers_loss": 0.0023573292419314384, + "routers_loss": 0.002153293928131461, "skip_count": 0.0, "step": 4052, "text_loss": 0.5890526175498962 @@ -38511,13 +38511,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.029296875, + "grad_norm": 0.04150390625, "learning_rate": 0.0007269263018426367, - "loss": 0.0048, + "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 6537469.0, "repeat_count": 1.0, - "routers_loss": 0.0012750910827890038, + "routers_loss": 0.0018494052346795797, "skip_count": 2.0, "step": 4054, "text_loss": 0.36058738827705383 @@ -38530,13 +38530,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0546875, + "grad_norm": 0.0693359375, "learning_rate": 0.0007266504554014866, - "loss": 0.006, + "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6541271.0, "repeat_count": 0.0, - "routers_loss": 0.0006701929378323257, + "routers_loss": 0.0007579320226795971, "skip_count": 0.0, "step": 4056, "text_loss": 0.4089007079601288 @@ -38549,13 +38549,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.040283203125, + "grad_norm": 0.04052734375, "learning_rate": 0.0007263745221074545, - "loss": 0.0085, + "loss": 0.0086, "macro_f1": 0.6601307392120361, "num_tokens": 6544293.0, "repeat_count": 1.0, - "routers_loss": 0.061707694083452225, + "routers_loss": 0.06202420964837074, "skip_count": 2.0, "step": 4058, "text_loss": 0.2226305454969406 @@ -38568,13 +38568,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.027587890625, + "grad_norm": 0.0286865234375, "learning_rate": 0.0007260985020662784, - "loss": 0.005, + "loss": 0.0049, "macro_f1": 0.5934640765190125, "num_tokens": 6547640.0, "repeat_count": 0.0, - "routers_loss": 0.04534700885415077, + "routers_loss": 0.044639844447374344, "skip_count": 3.0, "step": 4060, "text_loss": 0.23004353046417236 @@ -38587,13 +38587,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.017822265625, + "grad_norm": 0.0206298828125, "learning_rate": 0.0007258223953837298, - "loss": 0.0052, + "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 6550840.0, "repeat_count": 1.0, - "routers_loss": 0.004326729103922844, + "routers_loss": 0.004215611144900322, "skip_count": 0.0, "step": 4062, "text_loss": 0.2891770601272583 @@ -38606,13 +38606,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044189453125, + "grad_norm": 0.038818359375, "learning_rate": 0.0007255462021656132, - "loss": 0.0068, + "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 6554122.0, "repeat_count": 0.0, - "routers_loss": 0.0009951743995770812, + "routers_loss": 0.0011056234361603856, "skip_count": 0.0, "step": 4064, "text_loss": 0.7485370635986328 @@ -38625,13 +38625,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.035400390625, "learning_rate": 0.0007252699225177666, - "loss": 0.0082, + "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 6557138.0, "repeat_count": 0.0, - "routers_loss": 0.008738798089325428, + "routers_loss": 0.008258933201432228, "skip_count": 2.0, "step": 4066, "text_loss": 0.25219282507896423 @@ -38644,13 +38644,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.03759765625, "learning_rate": 0.0007249935565460606, - "loss": 0.0044, + "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 6560654.0, "repeat_count": 0.0, - "routers_loss": 0.004576306790113449, + "routers_loss": 0.005102175287902355, "skip_count": 0.0, "step": 4068, "text_loss": 0.5553314089775085 @@ -38663,13 +38663,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02734375, + "grad_norm": 0.03076171875, "learning_rate": 0.0007247171043563994, - "loss": 0.0059, + "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 6563814.0, "repeat_count": 0.0, - "routers_loss": 0.013026291504502296, + "routers_loss": 0.01283820066601038, "skip_count": 2.0, "step": 4070, "text_loss": 0.15729956328868866 @@ -38682,13 +38682,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0206298828125, + "grad_norm": 0.0211181640625, "learning_rate": 0.0007244405660547199, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 6567060.0, "repeat_count": 0.0, - "routers_loss": 0.0010598953813314438, + "routers_loss": 0.0009684927063062787, "skip_count": 0.0, "step": 4072, "text_loss": 0.3725031912326813 @@ -38701,13 +38701,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.021484375, + "grad_norm": 0.01953125, "learning_rate": 0.000724163941746992, - "loss": 0.0061, + "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 6571608.0, "repeat_count": 0.0, - "routers_loss": 0.0008197802817448974, + "routers_loss": 0.0007890827837400138, "skip_count": 0.0, "step": 4074, "text_loss": 0.8438301682472229 @@ -38722,11 +38722,11 @@ "f1_skip": 1.0, "grad_norm": 0.02734375, "learning_rate": 0.0007238872315392189, - "loss": 0.0067, + "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 6575214.0, "repeat_count": 1.0, - "routers_loss": 0.004072689451277256, + "routers_loss": 0.0040600355714559555, "skip_count": 1.0, "step": 4076, "text_loss": 0.5923112034797668 @@ -38739,13 +38739,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0252685546875, + "grad_norm": 0.024169921875, "learning_rate": 0.0007236104355374363, - "loss": 0.004, + "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 6578383.0, "repeat_count": 0.0, - "routers_loss": 0.0024594077840447426, + "routers_loss": 0.0024899677373468876, "skip_count": 2.0, "step": 4078, "text_loss": 0.20302526652812958 @@ -38758,13 +38758,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.057373046875, + "grad_norm": 0.05517578125, "learning_rate": 0.000723333553847713, - "loss": 0.0058, + "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6582175.0, "repeat_count": 0.0, - "routers_loss": 0.0060209049843251705, + "routers_loss": 0.006120906211435795, "skip_count": 2.0, "step": 4080, "text_loss": 0.5400223731994629 @@ -38777,13 +38777,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07958984375, + "grad_norm": 0.06787109375, "learning_rate": 0.0007230565865761504, - "loss": 0.0052, + "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 6585516.0, "repeat_count": 0.0, - "routers_loss": 0.002700155135244131, + "routers_loss": 0.0029941233806312084, "skip_count": 0.0, "step": 4082, "text_loss": 0.19460804760456085 @@ -38796,13 +38796,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.08056640625, + "grad_norm": 0.07373046875, "learning_rate": 0.0007227795338288831, - "loss": 0.0072, + "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 6588266.0, "repeat_count": 0.0, - "routers_loss": 0.009378589689731598, + "routers_loss": 0.009357884526252747, "skip_count": 2.0, "step": 4084, "text_loss": 0.35237613320350647 @@ -38815,13 +38815,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0361328125, + "grad_norm": 0.04052734375, "learning_rate": 0.0007225023957120782, - "loss": 0.0085, + "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 6591009.0, "repeat_count": 0.0, - "routers_loss": 0.0025940060149878263, + "routers_loss": 0.0023083325941115618, "skip_count": 2.0, "step": 4086, "text_loss": 0.4336731433868408 @@ -38834,13 +38834,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0218505859375, + "grad_norm": 0.0211181640625, "learning_rate": 0.0007222251723319356, - "loss": 0.0035, + "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 6594472.0, "repeat_count": 0.0, - "routers_loss": 0.0009030649089254439, + "routers_loss": 0.0008416616474278271, "skip_count": 0.0, "step": 4088, "text_loss": 0.6390535831451416 @@ -38853,13 +38853,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.045166015625, "learning_rate": 0.0007219478637946877, - "loss": 0.0085, + "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 6597477.0, "repeat_count": 0.0, - "routers_loss": 0.005229895934462547, + "routers_loss": 0.004390760324895382, "skip_count": 1.0, "step": 4090, "text_loss": 0.525839626789093 @@ -38872,13 +38872,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0260009765625, + "grad_norm": 0.0272216796875, "learning_rate": 0.0007216704702065997, - "loss": 0.0055, + "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 6600431.0, "repeat_count": 0.0, - "routers_loss": 0.0010594666237011552, + "routers_loss": 0.0010311100631952286, "skip_count": 0.0, "step": 4092, "text_loss": 0.5310423374176025 @@ -38891,13 +38891,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02197265625, + "grad_norm": 0.0228271484375, "learning_rate": 0.0007213929916739695, - "loss": 0.0064, + "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 6603899.0, "repeat_count": 0.0, - "routers_loss": 0.004303699359297752, + "routers_loss": 0.0032497600186616182, "skip_count": 1.0, "step": 4094, "text_loss": 0.2775326073169708 @@ -38910,13 +38910,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.044189453125, "learning_rate": 0.000721115428303127, - "loss": 0.0083, + "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 6606544.0, "repeat_count": 1.0, - "routers_loss": 0.004739399533718824, + "routers_loss": 0.004692315589636564, "skip_count": 3.0, "step": 4096, "text_loss": 0.6667124032974243 @@ -38929,13 +38929,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0250244140625, + "grad_norm": 0.0274658203125, "learning_rate": 0.0007208377802004353, - "loss": 0.0058, + "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6610097.0, "repeat_count": 0.0, - "routers_loss": 0.0007414906867779791, + "routers_loss": 0.0007263485458679497, "skip_count": 0.0, "step": 4098, "text_loss": 0.6916406750679016 @@ -38948,13 +38948,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.026611328125, + "grad_norm": 0.0274658203125, "learning_rate": 0.0007205600474722897, - "loss": 0.0059, + "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 6613836.0, "repeat_count": 0.0, - "routers_loss": 0.001866258797235787, + "routers_loss": 0.0017989488551393151, "skip_count": 0.0, "step": 4100, "text_loss": 0.5257929563522339 @@ -38967,13 +38967,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02880859375, + "grad_norm": 0.0284423828125, "learning_rate": 0.000720282230225118, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6616780.0, "repeat_count": 0.0, - "routers_loss": 0.0013150086160749197, + "routers_loss": 0.0011308686807751656, "skip_count": 1.0, "step": 4102, "text_loss": 0.4410906732082367 @@ -38986,13 +38986,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.030029296875, + "grad_norm": 0.03173828125, "learning_rate": 0.0007200043285653799, - "loss": 0.0064, + "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 6620110.0, "repeat_count": 0.0, - "routers_loss": 0.0021148507948964834, + "routers_loss": 0.002058265497907996, "skip_count": 2.0, "step": 4104, "text_loss": 0.8581191897392273 @@ -39005,13 +39005,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.040283203125, "learning_rate": 0.0007197263425995681, - "loss": 0.0067, + "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 6622585.0, "repeat_count": 1.0, - "routers_loss": 0.0015671581495553255, + "routers_loss": 0.0017528717871755362, "skip_count": 0.0, "step": 4106, "text_loss": 0.5000449419021606 @@ -39024,13 +39024,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0245361328125, + "grad_norm": 0.02587890625, "learning_rate": 0.0007194482724342075, - "loss": 0.0078, + "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6626356.0, "repeat_count": 0.0, - "routers_loss": 0.0020829052664339542, + "routers_loss": 0.0021995846182107925, "skip_count": 0.0, "step": 4108, "text_loss": 0.401346892118454 @@ -39043,13 +39043,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0299072265625, + "grad_norm": 0.0289306640625, "learning_rate": 0.0007191701181758547, - "loss": 0.0073, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 6629738.0, "repeat_count": 0.0, - "routers_loss": 0.0013877892633900046, + "routers_loss": 0.0014869922306388617, "skip_count": 0.0, "step": 4110, "text_loss": 0.9598422050476074 @@ -39062,13 +39062,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025146484375, + "grad_norm": 0.0242919921875, "learning_rate": 0.0007188918799310993, - "loss": 0.0078, + "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 6632807.0, "repeat_count": 0.0, - "routers_loss": 0.0012319361558184028, + "routers_loss": 0.0012853415682911873, "skip_count": 0.0, "step": 4112, "text_loss": 0.3996548354625702 @@ -39081,13 +39081,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.030029296875, + "grad_norm": 0.029296875, "learning_rate": 0.0007186135578065627, - "loss": 0.0075, + "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6636227.0, "repeat_count": 0.0, - "routers_loss": 0.0009015969699248672, + "routers_loss": 0.0009887361666187644, "skip_count": 0.0, "step": 4114, "text_loss": 0.4127283990383148 @@ -39100,13 +39100,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.04541015625, "learning_rate": 0.0007183351519088982, - "loss": 0.0066, + "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6639443.0, "repeat_count": 0.0, - "routers_loss": 0.006493544206023216, + "routers_loss": 0.006282114889472723, "skip_count": 1.0, "step": 4116, "text_loss": 0.20028606057167053 @@ -39119,13 +39119,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.054931640625, + "grad_norm": 0.061767578125, "learning_rate": 0.0007180566623447917, - "loss": 0.0115, + "loss": 0.0114, "macro_f1": 0.6603773832321167, "num_tokens": 6642127.0, "repeat_count": 1.0, - "routers_loss": 0.008949270471930504, + "routers_loss": 0.008101986721158028, "skip_count": 0.0, "step": 4118, "text_loss": 0.763931155204773 @@ -39138,13 +39138,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0277099609375, + "grad_norm": 0.0291748046875, "learning_rate": 0.0007177780892209607, - "loss": 0.006, + "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6645376.0, "repeat_count": 0.0, - "routers_loss": 0.0019743547309190035, + "routers_loss": 0.001953610684722662, "skip_count": 0.0, "step": 4120, "text_loss": 0.42317715287208557 @@ -39157,13 +39157,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.034912109375, "learning_rate": 0.0007174994326441551, - "loss": 0.0066, + "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6648150.0, "repeat_count": 0.0, - "routers_loss": 0.003454099874943495, + "routers_loss": 0.003279355587437749, "skip_count": 0.0, "step": 4122, "text_loss": 0.19656142592430115 @@ -39176,13 +39176,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.031005859375, "learning_rate": 0.0007172206927211567, - "loss": 0.0055, + "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 6650935.0, "repeat_count": 0.0, - "routers_loss": 0.0032328376546502113, + "routers_loss": 0.0032076311763375998, "skip_count": 0.0, "step": 4124, "text_loss": 0.13608409464359283 @@ -39195,13 +39195,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.026611328125, + "grad_norm": 0.0303955078125, "learning_rate": 0.0007169418695587791, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 6654464.0, "repeat_count": 0.0, - "routers_loss": 0.0041675688698887825, + "routers_loss": 0.004065621178597212, "skip_count": 2.0, "step": 4126, "text_loss": 0.4882086217403412 @@ -39214,13 +39214,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02783203125, + "grad_norm": 0.031005859375, "learning_rate": 0.0007166629632638678, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6657749.0, "repeat_count": 0.0, - "routers_loss": 0.000975916744209826, + "routers_loss": 0.0009243001695722342, "skip_count": 0.0, "step": 4128, "text_loss": 0.31632331013679504 @@ -39233,13 +39233,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0264892578125, + "grad_norm": 0.02783203125, "learning_rate": 0.0007163839739433003, - "loss": 0.008, + "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 6660997.0, "repeat_count": 0.0, - "routers_loss": 0.002182615688070655, + "routers_loss": 0.0018459554994478822, "skip_count": 0.0, "step": 4130, "text_loss": 0.6123947501182556 @@ -39252,13 +39252,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.033935546875, "learning_rate": 0.0007161049017039857, - "loss": 0.0074, + "loss": 0.0073, "macro_f1": 0.8820862174034119, "num_tokens": 6663542.0, "repeat_count": 2.0, - "routers_loss": 0.03051452897489071, + "routers_loss": 0.030032536014914513, "skip_count": 2.0, "step": 4132, "text_loss": 0.6985659003257751 @@ -39271,13 +39271,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0206298828125, + "grad_norm": 0.019775390625, "learning_rate": 0.0007158257466528652, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 6666178.0, "repeat_count": 0.0, - "routers_loss": 0.0013405663194134831, + "routers_loss": 0.0013813833938911557, "skip_count": 0.0, "step": 4134, "text_loss": 0.38380664587020874 @@ -39290,13 +39290,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.01953125, + "grad_norm": 0.021484375, "learning_rate": 0.0007155465088969114, - "loss": 0.0079, + "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 6668852.0, "repeat_count": 0.0, - "routers_loss": 0.00536607438698411, + "routers_loss": 0.00513424864038825, "skip_count": 3.0, "step": 4136, "text_loss": 0.49724283814430237 @@ -39309,13 +39309,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.023193359375, + "grad_norm": 0.0228271484375, "learning_rate": 0.0007152671885431288, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 6671430.0, "repeat_count": 0.0, - "routers_loss": 0.0004998469958081841, + "routers_loss": 0.0005165594047866762, "skip_count": 0.0, "step": 4138, "text_loss": 0.666959822177887 @@ -39328,13 +39328,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044189453125, + "grad_norm": 0.047119140625, "learning_rate": 0.0007149877856985535, - "loss": 0.0082, + "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 6675215.0, "repeat_count": 0.0, - "routers_loss": 0.0017356832977384329, + "routers_loss": 0.001685218419879675, "skip_count": 0.0, "step": 4140, "text_loss": 0.3127259612083435 @@ -39347,13 +39347,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0245361328125, + "grad_norm": 0.0277099609375, "learning_rate": 0.000714708300470253, - "loss": 0.0058, + "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 6678505.0, "repeat_count": 0.0, - "routers_loss": 0.003699234686791897, + "routers_loss": 0.004025314934551716, "skip_count": 0.0, "step": 4142, "text_loss": 0.3179470896720886 @@ -39366,13 +39366,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.038818359375, + "grad_norm": 0.043212890625, "learning_rate": 0.0007144287329653269, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 6681127.0, "repeat_count": 1.0, - "routers_loss": 0.005084970500320196, + "routers_loss": 0.005965690594166517, "skip_count": 0.0, "step": 4144, "text_loss": 0.3862907886505127 @@ -39385,13 +39385,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.040771484375, + "grad_norm": 0.039794921875, "learning_rate": 0.0007141490832909058, - "loss": 0.0074, + "loss": 0.0071, "macro_f1": 0.3272727429866791, "num_tokens": 6683968.0, "repeat_count": 0.0, - "routers_loss": 0.013118764385581017, + "routers_loss": 0.012896374799311161, "skip_count": 1.0, "step": 4146, "text_loss": 0.48156118392944336 @@ -39404,13 +39404,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.034912109375, "learning_rate": 0.0007138693515541519, - "loss": 0.005, + "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 6687196.0, "repeat_count": 0.0, - "routers_loss": 0.0006807957543060184, + "routers_loss": 0.0006367767928168178, "skip_count": 1.0, "step": 4148, "text_loss": 0.676702082157135 @@ -39423,13 +39423,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03125, + "grad_norm": 0.030029296875, "learning_rate": 0.0007135895378622592, - "loss": 0.0076, + "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 6689972.0, "repeat_count": 0.0, - "routers_loss": 0.004619150888174772, + "routers_loss": 0.004532640799880028, "skip_count": 3.0, "step": 4150, "text_loss": 0.5865558981895447 @@ -39442,13 +39442,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.040283203125, "learning_rate": 0.0007133096423224526, - "loss": 0.0082, + "loss": 0.0081, "macro_f1": 0.3272727429866791, "num_tokens": 6693568.0, "repeat_count": 1.0, - "routers_loss": 0.0404328815639019, + "routers_loss": 0.0377078577876091, "skip_count": 0.0, "step": 4152, "text_loss": 0.2790502607822418 @@ -39461,13 +39461,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.053466796875, + "grad_norm": 0.056640625, "learning_rate": 0.0007130296650419885, - "loss": 0.0071, + "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6696468.0, "repeat_count": 0.0, - "routers_loss": 0.0048319315537810326, + "routers_loss": 0.004455826710909605, "skip_count": 1.0, "step": 4154, "text_loss": 0.5869500041007996 @@ -39480,13 +39480,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.050048828125, + "grad_norm": 0.0654296875, "learning_rate": 0.0007127496061281551, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6699307.0, "repeat_count": 0.0, - "routers_loss": 0.0022895359434187412, + "routers_loss": 0.001998464809730649, "skip_count": 0.0, "step": 4156, "text_loss": 0.6931945085525513 @@ -39499,13 +39499,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.035400390625, "learning_rate": 0.0007124694656882713, - "loss": 0.0071, + "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 6702647.0, "repeat_count": 3.0, - "routers_loss": 0.004655756987631321, + "routers_loss": 0.004117495380342007, "skip_count": 0.0, "step": 4158, "text_loss": 0.4325876832008362 @@ -39518,13 +39518,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0189208984375, + "grad_norm": 0.0205078125, "learning_rate": 0.0007121892438296874, - "loss": 0.0066, + "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 6705964.0, "repeat_count": 0.0, - "routers_loss": 0.0014499713433906436, + "routers_loss": 0.0014713290147483349, "skip_count": 0.0, "step": 4160, "text_loss": 0.3672060966491699 @@ -39537,13 +39537,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04296875, + "grad_norm": 0.04345703125, "learning_rate": 0.0007119089406597849, - "loss": 0.0075, + "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 6710182.0, "repeat_count": 0.0, - "routers_loss": 0.0039377836510539055, + "routers_loss": 0.0037311650812625885, "skip_count": 1.0, "step": 4162, "text_loss": 0.6643805503845215 @@ -39556,13 +39556,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037109375, + "grad_norm": 0.03662109375, "learning_rate": 0.0007116285562859767, - "loss": 0.0059, + "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 6713410.0, "repeat_count": 0.0, - "routers_loss": 0.006864873692393303, + "routers_loss": 0.006017287727445364, "skip_count": 0.0, "step": 4164, "text_loss": 0.4606415927410126 @@ -39575,13 +39575,13 @@ "f1_execute": 0.9545454382896423, "f1_repeat": 0.5, "f1_skip": 1.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.05419921875, "learning_rate": 0.0007113480908157065, - "loss": 0.0109, + "loss": 0.0108, "macro_f1": 0.8181818723678589, "num_tokens": 6716056.0, "repeat_count": 3.0, - "routers_loss": 0.08587442338466644, + "routers_loss": 0.08640352636575699, "skip_count": 4.0, "step": 4166, "text_loss": 0.3139408528804779 @@ -39594,13 +39594,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0283203125, + "grad_norm": 0.0311279296875, "learning_rate": 0.0007110675443564491, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6719497.0, "repeat_count": 0.0, - "routers_loss": 0.001434682053513825, + "routers_loss": 0.0012731150491163135, "skip_count": 0.0, "step": 4168, "text_loss": 0.7283861637115479 @@ -39613,13 +39613,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0247802734375, + "grad_norm": 0.0262451171875, "learning_rate": 0.0007107869170157108, - "loss": 0.0056, + "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 6722297.0, "repeat_count": 0.0, - "routers_loss": 0.0018422538414597511, + "routers_loss": 0.0021509863436222076, "skip_count": 2.0, "step": 4170, "text_loss": 0.5767703056335449 @@ -39632,13 +39632,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.0380859375, "learning_rate": 0.000710506208901028, - "loss": 0.0083, + "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 6725762.0, "repeat_count": 0.0, - "routers_loss": 0.002943754428997636, + "routers_loss": 0.00257494836114347, "skip_count": 1.0, "step": 4172, "text_loss": 0.33571913838386536 @@ -39651,13 +39651,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.041748046875, "learning_rate": 0.000710225420119969, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 6728436.0, "repeat_count": 1.0, - "routers_loss": 0.00920829363167286, + "routers_loss": 0.00943201594054699, "skip_count": 3.0, "step": 4174, "text_loss": 0.6849368810653687 @@ -39670,13 +39670,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0289306640625, + "grad_norm": 0.030517578125, "learning_rate": 0.0007099445507801323, - "loss": 0.0062, + "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 6731427.0, "repeat_count": 0.0, - "routers_loss": 0.010877607390284538, + "routers_loss": 0.01046718005090952, "skip_count": 2.0, "step": 4176, "text_loss": 0.3346157670021057 @@ -39689,13 +39689,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05517578125, + "grad_norm": 0.05224609375, "learning_rate": 0.0007096636009891477, - "loss": 0.0095, + "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 6734800.0, "repeat_count": 0.0, - "routers_loss": 0.0007745221955701709, + "routers_loss": 0.0007813365664333105, "skip_count": 0.0, "step": 4178, "text_loss": 0.49989959597587585 @@ -39708,13 +39708,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03125, + "grad_norm": 0.032958984375, "learning_rate": 0.000709382570854676, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 6738244.0, "repeat_count": 0.0, - "routers_loss": 0.002755505731329322, + "routers_loss": 0.002825600327923894, "skip_count": 0.0, "step": 4180, "text_loss": 0.15744923055171967 @@ -39727,13 +39727,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.03857421875, "learning_rate": 0.0007091014604844078, - "loss": 0.0078, + "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 6741695.0, "repeat_count": 0.0, - "routers_loss": 0.0018092440441250801, + "routers_loss": 0.0017124463338404894, "skip_count": 0.0, "step": 4182, "text_loss": 0.3752405643463135 @@ -39746,13 +39746,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.02587890625, + "grad_norm": 0.0230712890625, "learning_rate": 0.0007088202699860655, - "loss": 0.0052, + "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 6744882.0, "repeat_count": 1.0, - "routers_loss": 0.005326499231159687, + "routers_loss": 0.005134924780577421, "skip_count": 3.0, "step": 4184, "text_loss": 0.18534569442272186 @@ -39765,13 +39765,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02294921875, + "grad_norm": 0.01904296875, "learning_rate": 0.000708538999467402, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 6747811.0, "repeat_count": 0.0, - "routers_loss": 0.0022658067755401134, + "routers_loss": 0.002371585462242365, "skip_count": 1.0, "step": 4186, "text_loss": 0.6251029968261719 @@ -39784,13 +39784,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.080078125, + "grad_norm": 0.064453125, "learning_rate": 0.0007082576490362004, - "loss": 0.0055, + "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6750765.0, "repeat_count": 0.0, - "routers_loss": 0.0022298030089586973, + "routers_loss": 0.002088436856865883, "skip_count": 0.0, "step": 4188, "text_loss": 0.35471436381340027 @@ -39803,13 +39803,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0238037109375, + "grad_norm": 0.0255126953125, "learning_rate": 0.000707976218800275, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 6754021.0, "repeat_count": 0.0, - "routers_loss": 0.0013450054684653878, + "routers_loss": 0.0012272283202037215, "skip_count": 0.0, "step": 4190, "text_loss": 0.5737302899360657 @@ -39822,13 +39822,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.051025390625, + "grad_norm": 0.07763671875, "learning_rate": 0.0007076947088674701, - "loss": 0.0064, + "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 6756793.0, "repeat_count": 0.0, - "routers_loss": 0.0026260579470545053, + "routers_loss": 0.0026050808373838663, "skip_count": 0.0, "step": 4192, "text_loss": 0.526336669921875 @@ -39841,13 +39841,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.054931640625, "learning_rate": 0.000707413119345661, - "loss": 0.0084, + "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 6760221.0, "repeat_count": 0.0, - "routers_loss": 0.0014576761750504375, + "routers_loss": 0.0013151296880096197, "skip_count": 0.0, "step": 4194, "text_loss": 0.5678895711898804 @@ -39860,13 +39860,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0419921875, + "grad_norm": 0.037353515625, "learning_rate": 0.0007071314503427532, - "loss": 0.0058, + "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 6763721.0, "repeat_count": 0.0, - "routers_loss": 0.00165031966753304, + "routers_loss": 0.001528652966953814, "skip_count": 0.0, "step": 4196, "text_loss": 0.7640175223350525 @@ -39879,13 +39879,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0252685546875, + "grad_norm": 0.0240478515625, "learning_rate": 0.0007068497019666829, - "loss": 0.0047, + "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 6768581.0, "repeat_count": 0.0, - "routers_loss": 0.0017519505927339196, + "routers_loss": 0.0019202446565032005, "skip_count": 0.0, "step": 4198, "text_loss": 0.41878414154052734 @@ -39904,7 +39904,7 @@ "macro_f1": 0.6666666865348816, "num_tokens": 6772758.0, "repeat_count": 0.0, - "routers_loss": 0.005213241558521986, + "routers_loss": 0.004667408298701048, "skip_count": 1.0, "step": 4200, "text_loss": 0.3550313413143158 @@ -39917,13 +39917,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.048828125, + "grad_norm": 0.050537109375, "learning_rate": 0.0007062859675269513, - "loss": 0.0063, + "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6776671.0, "repeat_count": 3.0, - "routers_loss": 0.004372407682240009, + "routers_loss": 0.00568761583417654, "skip_count": 0.0, "step": 4202, "text_loss": 0.1707649976015091 @@ -39936,13 +39936,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033203125, + "grad_norm": 0.03271484375, "learning_rate": 0.0007060039816793141, - "loss": 0.0073, + "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6780284.0, "repeat_count": 0.0, - "routers_loss": 0.003470032475888729, + "routers_loss": 0.0030401297844946384, "skip_count": 0.0, "step": 4204, "text_loss": 0.2686377167701721 @@ -39955,13 +39955,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.03564453125, + "grad_norm": 0.04541015625, "learning_rate": 0.0007057219168905625, - "loss": 0.0067, + "loss": 0.0068, "macro_f1": 1.0, "num_tokens": 6783525.0, "repeat_count": 1.0, - "routers_loss": 0.003391953418031335, + "routers_loss": 0.003353122156113386, "skip_count": 5.0, "step": 4206, "text_loss": 0.5235374569892883 @@ -39974,13 +39974,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0274658203125, + "grad_norm": 0.026123046875, "learning_rate": 0.000705439773268784, - "loss": 0.0052, + "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6787691.0, "repeat_count": 0.0, - "routers_loss": 0.0013759827706962824, + "routers_loss": 0.0016532237641513348, "skip_count": 1.0, "step": 4208, "text_loss": 0.5002681612968445 @@ -39993,13 +39993,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.03662109375, "learning_rate": 0.0007051575509220972, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 6790833.0, "repeat_count": 0.0, - "routers_loss": 0.0011075466172769666, + "routers_loss": 0.0011808308772742748, "skip_count": 0.0, "step": 4210, "text_loss": 0.7251001596450806 @@ -40012,13 +40012,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0380859375, + "grad_norm": 0.04443359375, "learning_rate": 0.0007048752499586497, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 6794260.0, "repeat_count": 0.0, - "routers_loss": 0.0063498299568891525, + "routers_loss": 0.006246297620236874, "skip_count": 2.0, "step": 4212, "text_loss": 0.2430499643087387 @@ -40031,13 +40031,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.0419921875, "learning_rate": 0.00070459287048662, - "loss": 0.0074, + "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 6797413.0, "repeat_count": 0.0, - "routers_loss": 0.001165185822173953, + "routers_loss": 0.0012964420020580292, "skip_count": 0.0, "step": 4214, "text_loss": 0.48889362812042236 @@ -40050,13 +40050,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0283203125, + "grad_norm": 0.031494140625, "learning_rate": 0.0007043104126142163, - "loss": 0.0073, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6800815.0, "repeat_count": 0.0, - "routers_loss": 0.002119335113093257, + "routers_loss": 0.0018109704833477736, "skip_count": 0.0, "step": 4216, "text_loss": 0.5617026686668396 @@ -40069,13 +40069,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0225830078125, + "grad_norm": 0.0250244140625, "learning_rate": 0.0007040278764496771, - "loss": 0.0061, + "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 6803937.0, "repeat_count": 2.0, - "routers_loss": 0.002939696190878749, + "routers_loss": 0.0028699536342173815, "skip_count": 1.0, "step": 4218, "text_loss": 0.548405647277832 @@ -40088,13 +40088,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.03857421875, "learning_rate": 0.0007037452621012708, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 6806946.0, "repeat_count": 0.0, - "routers_loss": 0.0008340062922798097, + "routers_loss": 0.0007951617590151727, "skip_count": 0.0, "step": 4220, "text_loss": 0.5702725648880005 @@ -40107,13 +40107,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.024169921875, + "grad_norm": 0.030517578125, "learning_rate": 0.0007034625696772958, - "loss": 0.0053, + "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6810083.0, "repeat_count": 0.0, - "routers_loss": 0.003032320411875844, + "routers_loss": 0.003436052706092596, "skip_count": 2.0, "step": 4222, "text_loss": 0.3898725211620331 @@ -40126,13 +40126,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03857421875, + "grad_norm": 0.03955078125, "learning_rate": 0.00070317979928608, - "loss": 0.007, + "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6812845.0, "repeat_count": 0.0, - "routers_loss": 0.0005332283908501267, + "routers_loss": 0.0005070401239208877, "skip_count": 0.0, "step": 4224, "text_loss": 0.5244157910346985 @@ -40145,13 +40145,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.0390625, "learning_rate": 0.000702896951035982, - "loss": 0.0103, + "loss": 0.0101, "macro_f1": 0.3272727429866791, "num_tokens": 6815801.0, "repeat_count": 0.0, - "routers_loss": 0.015828115865588188, + "routers_loss": 0.01560303382575512, "skip_count": 1.0, "step": 4226, "text_loss": 0.26503118872642517 @@ -40164,13 +40164,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.03564453125, "learning_rate": 0.0007026140250353896, - "loss": 0.0088, + "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 6819464.0, "repeat_count": 0.0, - "routers_loss": 0.010141439735889435, + "routers_loss": 0.009310240857303143, "skip_count": 2.0, "step": 4228, "text_loss": 0.15597499907016754 @@ -40189,7 +40189,7 @@ "macro_f1": 0.3333333432674408, "num_tokens": 6822657.0, "repeat_count": 0.0, - "routers_loss": 0.004937903955578804, + "routers_loss": 0.005309136584401131, "skip_count": 0.0, "step": 4230, "text_loss": 0.5271651148796082 @@ -40202,13 +40202,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04443359375, + "grad_norm": 0.046875, "learning_rate": 0.0007020479402164226, - "loss": 0.009, + "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 6825661.0, "repeat_count": 0.0, - "routers_loss": 0.005930901039391756, + "routers_loss": 0.005936166271567345, "skip_count": 2.0, "step": 4232, "text_loss": 0.6105108857154846 @@ -40221,13 +40221,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0361328125, + "grad_norm": 0.040283203125, "learning_rate": 0.0007017647816149727, - "loss": 0.0065, + "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 6828688.0, "repeat_count": 0.0, - "routers_loss": 0.0015492573147639632, + "routers_loss": 0.001653556595556438, "skip_count": 0.0, "step": 4234, "text_loss": 0.6966437101364136 @@ -40240,13 +40240,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.032470703125, "learning_rate": 0.000701481545696878, - "loss": 0.0093, + "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 6831850.0, "repeat_count": 0.0, - "routers_loss": 0.001357862027361989, + "routers_loss": 0.0013501866487786174, "skip_count": 0.0, "step": 4236, "text_loss": 1.259678840637207 @@ -40259,13 +40259,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.059326171875, + "grad_norm": 0.059814453125, "learning_rate": 0.0007011982325706747, - "loss": 0.006, + "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 6834862.0, "repeat_count": 0.0, - "routers_loss": 0.00899078231304884, + "routers_loss": 0.008970130234956741, "skip_count": 1.0, "step": 4238, "text_loss": 0.24906545877456665 @@ -40278,13 +40278,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.052490234375, + "grad_norm": 0.043212890625, "learning_rate": 0.0007009148423449292, - "loss": 0.0067, + "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 6838148.0, "repeat_count": 0.0, - "routers_loss": 0.0027724208775907755, + "routers_loss": 0.0026013399474322796, "skip_count": 0.0, "step": 4240, "text_loss": 0.291467547416687 @@ -40297,13 +40297,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.037109375, "learning_rate": 0.0007006313751282371, - "loss": 0.0095, + "loss": 0.0094, "macro_f1": 0.3272727429866791, "num_tokens": 6841142.0, "repeat_count": 0.0, - "routers_loss": 0.0202134158462286, + "routers_loss": 0.021415632218122482, "skip_count": 1.0, "step": 4242, "text_loss": 0.507606029510498 @@ -40316,13 +40316,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0301513671875, + "grad_norm": 0.0289306640625, "learning_rate": 0.0007003478310292236, - "loss": 0.0061, + "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6844042.0, "repeat_count": 0.0, - "routers_loss": 0.00252551375888288, + "routers_loss": 0.0023636550176888704, "skip_count": 0.0, "step": 4244, "text_loss": 0.11626995354890823 @@ -40335,13 +40335,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0303955078125, + "grad_norm": 0.03466796875, "learning_rate": 0.0007000642101565433, - "loss": 0.0081, + "loss": 0.008, "macro_f1": 0.3272727429866791, "num_tokens": 6847359.0, "repeat_count": 1.0, - "routers_loss": 0.022849632427096367, + "routers_loss": 0.025154776871204376, "skip_count": 0.0, "step": 4246, "text_loss": 0.42898693680763245 @@ -40354,13 +40354,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.038330078125, "learning_rate": 0.0006997805126188803, - "loss": 0.0055, + "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6850443.0, "repeat_count": 0.0, - "routers_loss": 0.005312036257237196, + "routers_loss": 0.00540317315608263, "skip_count": 0.0, "step": 4248, "text_loss": 0.18085283041000366 @@ -40373,13 +40373,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037109375, + "grad_norm": 0.038818359375, "learning_rate": 0.000699496738524948, - "loss": 0.0072, + "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 6853495.0, "repeat_count": 0.0, - "routers_loss": 0.0015959764132276177, + "routers_loss": 0.0014433214673772454, "skip_count": 0.0, "step": 4250, "text_loss": 0.5524004697799683 @@ -40392,13 +40392,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.032470703125, "learning_rate": 0.0006992128879834891, - "loss": 0.0056, + "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 6856774.0, "repeat_count": 1.0, - "routers_loss": 0.01500304602086544, + "routers_loss": 0.013381492346525192, "skip_count": 3.0, "step": 4252, "text_loss": 0.19605717062950134 @@ -40411,13 +40411,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.04248046875, "learning_rate": 0.0006989289611032758, - "loss": 0.0096, + "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 6860313.0, "repeat_count": 0.0, - "routers_loss": 0.006884181406348944, + "routers_loss": 0.007140172645449638, "skip_count": 1.0, "step": 4254, "text_loss": 0.3182447552680969 @@ -40430,13 +40430,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.030029296875, + "grad_norm": 0.031982421875, "learning_rate": 0.0006986449579931091, - "loss": 0.0066, + "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6863683.0, "repeat_count": 0.0, - "routers_loss": 0.007357228547334671, + "routers_loss": 0.006486213766038418, "skip_count": 1.0, "step": 4256, "text_loss": 0.19250160455703735 @@ -40449,13 +40449,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.038818359375, + "grad_norm": 0.04248046875, "learning_rate": 0.0006983608787618201, - "loss": 0.0073, + "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 6867609.0, "repeat_count": 0.0, - "routers_loss": 0.0016412866534665227, + "routers_loss": 0.001465818495489657, "skip_count": 0.0, "step": 4258, "text_loss": 0.5912898182868958 @@ -40468,13 +40468,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04443359375, + "grad_norm": 0.04248046875, "learning_rate": 0.000698076723518268, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 6870040.0, "repeat_count": 0.0, - "routers_loss": 0.003204819979146123, + "routers_loss": 0.0031106441747397184, "skip_count": 0.0, "step": 4260, "text_loss": 0.13542121648788452 @@ -40487,13 +40487,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0306396484375, + "grad_norm": 0.02978515625, "learning_rate": 0.0006977924923713418, - "loss": 0.0076, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6873441.0, "repeat_count": 0.0, - "routers_loss": 0.0005213851109147072, + "routers_loss": 0.0005377951893024147, "skip_count": 0.0, "step": 4262, "text_loss": 0.352464497089386 @@ -40506,13 +40506,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.038818359375, + "grad_norm": 0.045654296875, "learning_rate": 0.0006975081854299594, - "loss": 0.0093, + "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 6876637.0, "repeat_count": 0.0, - "routers_loss": 0.0067594959400594234, + "routers_loss": 0.007052485831081867, "skip_count": 0.0, "step": 4264, "text_loss": 0.5023844242095947 @@ -40525,13 +40525,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02197265625, + "grad_norm": 0.02294921875, "learning_rate": 0.0006972238028030678, - "loss": 0.0076, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6879928.0, "repeat_count": 0.0, - "routers_loss": 0.0013809602241963148, + "routers_loss": 0.0013608322478830814, "skip_count": 0.0, "step": 4266, "text_loss": 0.8664718270301819 @@ -40544,13 +40544,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0281982421875, + "grad_norm": 0.0247802734375, "learning_rate": 0.0006969393445996429, - "loss": 0.0064, + "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 6883425.0, "repeat_count": 0.0, - "routers_loss": 0.0009357557282783091, + "routers_loss": 0.0007607188890688121, "skip_count": 0.0, "step": 4268, "text_loss": 0.5131992101669312 @@ -40563,13 +40563,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03857421875, + "grad_norm": 0.0361328125, "learning_rate": 0.0006966548109286897, - "loss": 0.0079, + "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 6886790.0, "repeat_count": 0.0, - "routers_loss": 0.00034129369305446744, + "routers_loss": 0.00035804163780994713, "skip_count": 0.0, "step": 4270, "text_loss": 0.5352054834365845 @@ -40582,13 +40582,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.032470703125, "learning_rate": 0.000696370201899242, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6889747.0, "repeat_count": 0.0, - "routers_loss": 0.004583079367876053, + "routers_loss": 0.004451376851648092, "skip_count": 1.0, "step": 4272, "text_loss": 0.47865036129951477 @@ -40601,13 +40601,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03564453125, + "grad_norm": 0.03271484375, "learning_rate": 0.0006960855176203623, - "loss": 0.007, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 6892604.0, "repeat_count": 0.0, - "routers_loss": 0.0015929298242554069, + "routers_loss": 0.0015342880506068468, "skip_count": 0.0, "step": 4274, "text_loss": 0.36278650164604187 @@ -40620,13 +40620,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0260009765625, + "grad_norm": 0.024169921875, "learning_rate": 0.0006958007582011425, - "loss": 0.0052, + "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6895563.0, "repeat_count": 0.0, - "routers_loss": 0.0021544951014220715, + "routers_loss": 0.0022974940948188305, "skip_count": 2.0, "step": 4276, "text_loss": 0.6695618629455566 @@ -40639,13 +40639,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.030517578125, + "grad_norm": 0.0361328125, "learning_rate": 0.0006955159237507027, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6898591.0, "repeat_count": 0.0, - "routers_loss": 0.008612595498561859, + "routers_loss": 0.00859096460044384, "skip_count": 1.0, "step": 4278, "text_loss": 0.44284722208976746 @@ -40658,13 +40658,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0306396484375, + "grad_norm": 0.033935546875, "learning_rate": 0.0006952310143781921, - "loss": 0.0056, + "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 6903119.0, "repeat_count": 1.0, - "routers_loss": 0.00829319842159748, + "routers_loss": 0.007919861935079098, "skip_count": 3.0, "step": 4280, "text_loss": 0.5006136298179626 @@ -40677,13 +40677,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0238037109375, + "grad_norm": 0.0277099609375, "learning_rate": 0.0006949460301927886, - "loss": 0.0046, + "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 6906394.0, "repeat_count": 0.0, - "routers_loss": 0.0009446305921301246, + "routers_loss": 0.0008476210059598088, "skip_count": 0.0, "step": 4282, "text_loss": 0.8153555989265442 @@ -40696,13 +40696,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04345703125, + "grad_norm": 0.048095703125, "learning_rate": 0.0006946609713036985, - "loss": 0.0082, + "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 6909136.0, "repeat_count": 0.0, - "routers_loss": 0.007239636033773422, + "routers_loss": 0.006711610127240419, "skip_count": 2.0, "step": 4284, "text_loss": 0.43136683106422424 @@ -40715,13 +40715,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0206298828125, + "grad_norm": 0.0185546875, "learning_rate": 0.0006943758378201571, - "loss": 0.0063, + "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 6912734.0, "repeat_count": 0.0, - "routers_loss": 0.003926573321223259, + "routers_loss": 0.0038677838165313005, "skip_count": 0.0, "step": 4286, "text_loss": 0.2693749964237213 @@ -40734,13 +40734,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0245361328125, + "grad_norm": 0.02783203125, "learning_rate": 0.0006940906298514278, - "loss": 0.0044, + "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 6915838.0, "repeat_count": 0.0, - "routers_loss": 0.0012871087528765202, + "routers_loss": 0.0012188015971332788, "skip_count": 0.0, "step": 4288, "text_loss": 0.5809219479560852 @@ -40753,13 +40753,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025390625, + "grad_norm": 0.026123046875, "learning_rate": 0.0006938053475068031, - "loss": 0.0057, + "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6919225.0, "repeat_count": 0.0, - "routers_loss": 0.0018292219610884786, + "routers_loss": 0.001955829095095396, "skip_count": 0.0, "step": 4290, "text_loss": 0.5116089582443237 @@ -40772,13 +40772,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.07275390625, + "grad_norm": 0.11279296875, "learning_rate": 0.0006935199908956037, - "loss": 0.0074, + "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 6922495.0, "repeat_count": 1.0, - "routers_loss": 0.0036494603846222162, + "routers_loss": 0.0035709093790501356, "skip_count": 0.0, "step": 4292, "text_loss": 0.2745901644229889 @@ -40791,13 +40791,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025390625, + "grad_norm": 0.02587890625, "learning_rate": 0.0006932345601271786, - "loss": 0.0051, + "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 6925317.0, "repeat_count": 0.0, - "routers_loss": 0.0005956419045105577, + "routers_loss": 0.0005745319649577141, "skip_count": 0.0, "step": 4294, "text_loss": 0.6039219498634338 @@ -40810,13 +40810,13 @@ "f1_execute": 0.9743589162826538, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, - "grad_norm": 0.052734375, + "grad_norm": 0.0693359375, "learning_rate": 0.0006929490553109056, - "loss": 0.0105, + "loss": 0.0107, "macro_f1": 0.9247862696647644, "num_tokens": 6928054.0, "repeat_count": 3.0, - "routers_loss": 0.05667201802134514, + "routers_loss": 0.061689916998147964, "skip_count": 6.0, "step": 4296, "text_loss": 0.3904837667942047 @@ -40829,13 +40829,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0234375, + "grad_norm": 0.0240478515625, "learning_rate": 0.0006926634765561907, - "loss": 0.0036, + "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 6931348.0, "repeat_count": 0.0, - "routers_loss": 0.0017167082987725735, + "routers_loss": 0.002007248578593135, "skip_count": 0.0, "step": 4298, "text_loss": 0.5170742273330688 @@ -40848,13 +40848,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0299072265625, + "grad_norm": 0.0302734375, "learning_rate": 0.000692377823972468, - "loss": 0.0063, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 6934411.0, "repeat_count": 0.0, - "routers_loss": 0.0005654593114741147, + "routers_loss": 0.0005786226247437298, "skip_count": 0.0, "step": 4300, "text_loss": 0.8032443523406982 @@ -40867,13 +40867,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.029541015625, + "grad_norm": 0.031982421875, "learning_rate": 0.0006920920976692004, - "loss": 0.0072, + "loss": 0.0071, "macro_f1": 0.3272727429866791, "num_tokens": 6938153.0, "repeat_count": 1.0, - "routers_loss": 0.022815195843577385, + "routers_loss": 0.024602646008133888, "skip_count": 0.0, "step": 4302, "text_loss": 0.446534663438797 @@ -40892,7 +40892,7 @@ "macro_f1": 0.6666666865348816, "num_tokens": 6940731.0, "repeat_count": 0.0, - "routers_loss": 0.005607374478131533, + "routers_loss": 0.005759815219789743, "skip_count": 2.0, "step": 4304, "text_loss": 0.15479247272014618 @@ -40905,13 +40905,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.040283203125, + "grad_norm": 0.04150390625, "learning_rate": 0.0006915204243420214, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6943246.0, "repeat_count": 0.0, - "routers_loss": 0.005993676837533712, + "routers_loss": 0.005315347574651241, "skip_count": 1.0, "step": 4306, "text_loss": 0.22127842903137207 @@ -40924,13 +40924,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0225830078125, + "grad_norm": 0.0240478515625, "learning_rate": 0.0006912344775371765, - "loss": 0.0064, + "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 6947197.0, "repeat_count": 0.0, - "routers_loss": 0.0010728619527071714, + "routers_loss": 0.0012061651796102524, "skip_count": 0.0, "step": 4308, "text_loss": 0.7058854103088379 @@ -40943,13 +40943,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0303955078125, + "grad_norm": 0.0361328125, "learning_rate": 0.0006909484574509191, - "loss": 0.0068, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 6951817.0, "repeat_count": 0.0, - "routers_loss": 0.0027683766093105078, + "routers_loss": 0.0029203309677541256, "skip_count": 0.0, "step": 4310, "text_loss": 0.6014000773429871 @@ -40962,13 +40962,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02587890625, + "grad_norm": 0.0245361328125, "learning_rate": 0.0006906623641928525, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 6955094.0, "repeat_count": 0.0, - "routers_loss": 0.006130238063633442, + "routers_loss": 0.005703397560864687, "skip_count": 2.0, "step": 4312, "text_loss": 0.5923848152160645 @@ -40981,13 +40981,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.053466796875, + "grad_norm": 0.08154296875, "learning_rate": 0.0006903761978726084, - "loss": 0.0074, + "loss": 0.0073, "macro_f1": 1.0, "num_tokens": 6958127.0, "repeat_count": 1.0, - "routers_loss": 0.005145471077412367, + "routers_loss": 0.004489895887672901, "skip_count": 2.0, "step": 4314, "text_loss": 0.36911651492118835 @@ -41000,13 +41000,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.023681640625, + "grad_norm": 0.0223388671875, "learning_rate": 0.000690089958599846, - "loss": 0.0047, + "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 6960871.0, "repeat_count": 0.0, - "routers_loss": 0.004196064081043005, + "routers_loss": 0.003871412482112646, "skip_count": 2.0, "step": 4316, "text_loss": 0.442545086145401 @@ -41019,13 +41019,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0294189453125, + "grad_norm": 0.0301513671875, "learning_rate": 0.000689803646484253, - "loss": 0.0059, + "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 6963980.0, "repeat_count": 1.0, - "routers_loss": 0.007919433526694775, + "routers_loss": 0.008667866699397564, "skip_count": 2.0, "step": 4318, "text_loss": 0.1987489014863968 @@ -41038,13 +41038,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0289306640625, + "grad_norm": 0.030517578125, "learning_rate": 0.0006895172616355446, - "loss": 0.0067, + "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 6967132.0, "repeat_count": 1.0, - "routers_loss": 0.008535753935575485, + "routers_loss": 0.00843339879065752, "skip_count": 0.0, "step": 4320, "text_loss": 0.48267918825149536 @@ -41057,13 +41057,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.045654296875, + "grad_norm": 0.042236328125, "learning_rate": 0.0006892308041634639, - "loss": 0.0063, + "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 6969971.0, "repeat_count": 0.0, - "routers_loss": 0.00036565042682923377, + "routers_loss": 0.0004312851815484464, "skip_count": 0.0, "step": 4322, "text_loss": 0.3662732243537903 @@ -41076,13 +41076,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03564453125, + "grad_norm": 0.034912109375, "learning_rate": 0.0006889442741777822, - "loss": 0.006, + "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 6973114.0, "repeat_count": 0.0, - "routers_loss": 0.004728913307189941, + "routers_loss": 0.004588035400956869, "skip_count": 3.0, "step": 4324, "text_loss": 0.6707104444503784 @@ -41095,13 +41095,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.047607421875, "learning_rate": 0.0006886576717882982, "loss": 0.0057, "macro_f1": 0.8817967176437378, "num_tokens": 6976013.0, "repeat_count": 2.0, - "routers_loss": 0.06778892129659653, + "routers_loss": 0.0687296912074089, "skip_count": 3.0, "step": 4326, "text_loss": 0.1662217676639557 @@ -41114,13 +41114,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.034912109375, "learning_rate": 0.0006883709971048384, - "loss": 0.0093, + "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 6979200.0, "repeat_count": 0.0, - "routers_loss": 0.0030250558629631996, + "routers_loss": 0.002950174268335104, "skip_count": 0.0, "step": 4328, "text_loss": 0.21168152987957 @@ -41133,13 +41133,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02294921875, + "grad_norm": 0.031494140625, "learning_rate": 0.0006880842502372572, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 6982640.0, "repeat_count": 0.0, - "routers_loss": 0.0033437241800129414, + "routers_loss": 0.0032158740796148777, "skip_count": 0.0, "step": 4330, "text_loss": 0.26790961623191833 @@ -41152,13 +41152,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.027587890625, + "grad_norm": 0.026611328125, "learning_rate": 0.0006877974312954365, - "loss": 0.0079, + "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 6985917.0, "repeat_count": 0.0, - "routers_loss": 0.0005326211685314775, + "routers_loss": 0.0005083635332994163, "skip_count": 0.0, "step": 4332, "text_loss": 0.9736502170562744 @@ -41177,7 +41177,7 @@ "macro_f1": 0.32098764181137085, "num_tokens": 6988388.0, "repeat_count": 0.0, - "routers_loss": 0.034170545637607574, + "routers_loss": 0.03473830223083496, "skip_count": 2.0, "step": 4334, "text_loss": 0.21662230789661407 @@ -41190,13 +41190,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.03857421875, "learning_rate": 0.0006872235776287425, - "loss": 0.0092, + "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 6991360.0, "repeat_count": 0.0, - "routers_loss": 0.002001045737415552, + "routers_loss": 0.002206524135544896, "skip_count": 0.0, "step": 4336, "text_loss": 0.6026972532272339 @@ -41209,13 +41209,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.030517578125, + "grad_norm": 0.038330078125, "learning_rate": 0.0006869365431237711, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 6995080.0, "repeat_count": 1.0, - "routers_loss": 0.0009856362594291568, + "routers_loss": 0.000969731598161161, "skip_count": 0.0, "step": 4338, "text_loss": 0.5833017230033875 @@ -41228,13 +41228,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.0283203125, + "grad_norm": 0.0281982421875, "learning_rate": 0.0006866494369843635, "loss": 0.0054, "macro_f1": 0.8820862174034119, "num_tokens": 6998526.0, "repeat_count": 2.0, - "routers_loss": 0.013545103371143341, + "routers_loss": 0.013962293043732643, "skip_count": 2.0, "step": 4340, "text_loss": 0.41465985774993896 @@ -41247,13 +41247,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0302734375, + "grad_norm": 0.0294189453125, "learning_rate": 0.0006863622593205397, - "loss": 0.0049, + "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 7001494.0, "repeat_count": 0.0, - "routers_loss": 0.006991719361394644, + "routers_loss": 0.0064964210614562035, "skip_count": 3.0, "step": 4342, "text_loss": 0.3774271011352539 @@ -41266,13 +41266,13 @@ "f1_execute": 0.9767441749572754, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.029052734375, + "grad_norm": 0.03369140625, "learning_rate": 0.0006860750102423464, - "loss": 0.0063, + "loss": 0.0062, "macro_f1": 0.6589147448539734, "num_tokens": 7005544.0, "repeat_count": 1.0, - "routers_loss": 0.02598598413169384, + "routers_loss": 0.023250726982951164, "skip_count": 6.0, "step": 4344, "text_loss": 0.2732464373111725 @@ -41285,13 +41285,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0238037109375, + "grad_norm": 0.0250244140625, "learning_rate": 0.0006857876898598582, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 7008847.0, "repeat_count": 0.0, - "routers_loss": 0.0039848871529102325, + "routers_loss": 0.0038170060142874718, "skip_count": 2.0, "step": 4346, "text_loss": 0.29610875248908997 @@ -41306,11 +41306,11 @@ "f1_skip": 0.0, "grad_norm": 0.0303955078125, "learning_rate": 0.0006855002982831769, - "loss": 0.0074, + "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 7012577.0, "repeat_count": 0.0, - "routers_loss": 0.0012735783820971847, + "routers_loss": 0.0012856025714427233, "skip_count": 0.0, "step": 4348, "text_loss": 0.6098502278327942 @@ -41323,13 +41323,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03857421875, + "grad_norm": 0.061767578125, "learning_rate": 0.0006852128356224314, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 7015650.0, "repeat_count": 0.0, - "routers_loss": 0.00863664597272873, + "routers_loss": 0.008162742480635643, "skip_count": 1.0, "step": 4350, "text_loss": 0.20868146419525146 @@ -41342,13 +41342,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.023681640625, + "grad_norm": 0.023193359375, "learning_rate": 0.0006849253019877778, - "loss": 0.0075, + "loss": 0.0074, "macro_f1": 0.8817967176437378, "num_tokens": 7019925.0, "repeat_count": 2.0, - "routers_loss": 0.023779816925525665, + "routers_loss": 0.023544032126665115, "skip_count": 3.0, "step": 4352, "text_loss": 0.628226101398468 @@ -41361,13 +41361,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0517578125, + "grad_norm": 0.06298828125, "learning_rate": 0.0006846376974893996, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 7023130.0, "repeat_count": 0.0, - "routers_loss": 0.004940718412399292, + "routers_loss": 0.004982319660484791, "skip_count": 2.0, "step": 4354, "text_loss": 0.7037544250488281 @@ -41380,13 +41380,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0673828125, + "grad_norm": 0.0654296875, "learning_rate": 0.0006843500222375074, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 7026422.0, "repeat_count": 1.0, - "routers_loss": 0.004191596060991287, + "routers_loss": 0.004015266429632902, "skip_count": 0.0, "step": 4356, "text_loss": 0.22352729737758636 @@ -41399,13 +41399,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.04150390625, + "grad_norm": 0.042724609375, "learning_rate": 0.0006840622763423391, "loss": 0.0071, "macro_f1": 0.9449735879898071, "num_tokens": 7029077.0, "repeat_count": 2.0, - "routers_loss": 0.019883066415786743, + "routers_loss": 0.021162014454603195, "skip_count": 4.0, "step": 4358, "text_loss": 0.2431403249502182 @@ -41418,13 +41418,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0361328125, + "grad_norm": 0.03662109375, "learning_rate": 0.0006837744599141591, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7032582.0, "repeat_count": 0.0, - "routers_loss": 0.0007547057466581464, + "routers_loss": 0.0007044129306450486, "skip_count": 0.0, "step": 4360, "text_loss": 0.26667487621307373 @@ -41437,13 +41437,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.041748046875, + "grad_norm": 0.04052734375, "learning_rate": 0.0006834865730632594, - "loss": 0.0067, + "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 7035642.0, "repeat_count": 0.0, - "routers_loss": 0.0069348798133432865, + "routers_loss": 0.0067853196524083614, "skip_count": 1.0, "step": 4362, "text_loss": 0.20965275168418884 @@ -41456,13 +41456,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0267333984375, + "grad_norm": 0.0281982421875, "learning_rate": 0.0006831986158999588, - "loss": 0.0064, + "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7038601.0, "repeat_count": 0.0, - "routers_loss": 0.008647902868688107, + "routers_loss": 0.00899333506822586, "skip_count": 2.0, "step": 4364, "text_loss": 0.26860126852989197 @@ -41475,13 +41475,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041748046875, + "grad_norm": 0.039794921875, "learning_rate": 0.000682910588534603, - "loss": 0.0089, + "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 7042274.0, "repeat_count": 0.0, - "routers_loss": 0.0019517095061019063, + "routers_loss": 0.0019194348715245724, "skip_count": 0.0, "step": 4366, "text_loss": 0.14046810567378998 @@ -41494,13 +41494,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.03125, + "grad_norm": 0.034423828125, "learning_rate": 0.0006826224910775647, - "loss": 0.006, + "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 7045268.0, "repeat_count": 1.0, - "routers_loss": 0.007441094610840082, + "routers_loss": 0.006915684789419174, "skip_count": 3.0, "step": 4368, "text_loss": 0.5900366306304932 @@ -41513,13 +41513,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.0400390625, "learning_rate": 0.0006823343236392432, - "loss": 0.0072, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 7049407.0, "repeat_count": 0.0, - "routers_loss": 0.00144639378413558, + "routers_loss": 0.001678116386756301, "skip_count": 0.0, "step": 4370, "text_loss": 0.7868026494979858 @@ -41532,13 +41532,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.024169921875, + "grad_norm": 0.0274658203125, "learning_rate": 0.000682046086330065, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 7052783.0, "repeat_count": 0.0, - "routers_loss": 0.0003659129433799535, + "routers_loss": 0.0003459530707914382, "skip_count": 0.0, "step": 4372, "text_loss": 0.6349637508392334 @@ -41551,13 +41551,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0281982421875, + "grad_norm": 0.0279541015625, "learning_rate": 0.0006817577792604831, - "loss": 0.0052, + "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 7055757.0, "repeat_count": 0.0, - "routers_loss": 0.0012188151013106108, + "routers_loss": 0.0011729507241398096, "skip_count": 0.0, "step": 4374, "text_loss": 0.43258991837501526 @@ -41570,13 +41570,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.03564453125, "learning_rate": 0.0006814694025409773, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 7058684.0, "repeat_count": 0.0, - "routers_loss": 0.0006639147759415209, + "routers_loss": 0.0006664610700681806, "skip_count": 0.0, "step": 4376, "text_loss": 0.5307940244674683 @@ -41589,13 +41589,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.07763671875, + "grad_norm": 0.091796875, "learning_rate": 0.0006811809562820542, - "loss": 0.0081, + "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 7061902.0, "repeat_count": 0.0, - "routers_loss": 0.004041146486997604, + "routers_loss": 0.004595907870680094, "skip_count": 2.0, "step": 4378, "text_loss": 0.5830042362213135 @@ -41608,13 +41608,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.027587890625, + "grad_norm": 0.0274658203125, "learning_rate": 0.0006808924405942467, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 7065100.0, "repeat_count": 0.0, - "routers_loss": 0.0028930313419550657, + "routers_loss": 0.0032026609405875206, "skip_count": 0.0, "step": 4380, "text_loss": 0.20797798037528992 @@ -41627,13 +41627,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0186767578125, + "grad_norm": 0.0184326171875, "learning_rate": 0.0006806038555881148, - "loss": 0.0041, + "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 7068556.0, "repeat_count": 1.0, - "routers_loss": 0.0027319532819092274, + "routers_loss": 0.0024626904632896185, "skip_count": 0.0, "step": 4382, "text_loss": 0.5791074633598328 @@ -41646,13 +41646,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.041015625, + "grad_norm": 0.040283203125, "learning_rate": 0.0006803152013742448, - "loss": 0.0077, + "loss": 0.0075, "macro_f1": 1.0, "num_tokens": 7071284.0, "repeat_count": 1.0, - "routers_loss": 0.011207868345081806, + "routers_loss": 0.010723610408604145, "skip_count": 2.0, "step": 4384, "text_loss": 0.13227243721485138 @@ -41665,13 +41665,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0299072265625, + "grad_norm": 0.029052734375, "learning_rate": 0.0006800264780632495, - "loss": 0.0054, + "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 7074428.0, "repeat_count": 1.0, - "routers_loss": 0.001005658763460815, + "routers_loss": 0.0011231007520109415, "skip_count": 0.0, "step": 4386, "text_loss": 0.4360627233982086 @@ -41684,13 +41684,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0296630859375, + "grad_norm": 0.0291748046875, "learning_rate": 0.0006797376857657681, - "loss": 0.0083, + "loss": 0.0081, "macro_f1": 1.0, "num_tokens": 7078313.0, "repeat_count": 2.0, - "routers_loss": 0.00910002738237381, + "routers_loss": 0.008419238030910492, "skip_count": 1.0, "step": 4388, "text_loss": 0.5183924436569214 @@ -41703,13 +41703,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.04931640625, + "grad_norm": 0.046142578125, "learning_rate": 0.0006794488245924664, - "loss": 0.0085, + "loss": 0.0084, "macro_f1": 1.0, "num_tokens": 7081258.0, "repeat_count": 1.0, - "routers_loss": 0.0076475366950035095, + "routers_loss": 0.006582668516784906, "skip_count": 3.0, "step": 4390, "text_loss": 0.2797473669052124 @@ -41722,13 +41722,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.046630859375, "learning_rate": 0.0006791598946540368, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 7084527.0, "repeat_count": 0.0, - "routers_loss": 0.005813235882669687, + "routers_loss": 0.00557357631623745, "skip_count": 2.0, "step": 4392, "text_loss": 0.39495575428009033 @@ -41741,13 +41741,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0419921875, + "grad_norm": 0.06005859375, "learning_rate": 0.0006788708960611975, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 7087675.0, "repeat_count": 0.0, - "routers_loss": 0.007134446874260902, + "routers_loss": 0.007155992556363344, "skip_count": 0.0, "step": 4394, "text_loss": 0.3785299062728882 @@ -41760,13 +41760,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0172119140625, + "grad_norm": 0.01806640625, "learning_rate": 0.0006785818289246934, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 7090171.0, "repeat_count": 0.0, - "routers_loss": 0.0008882717229425907, + "routers_loss": 0.0009265039698220789, "skip_count": 0.0, "step": 4396, "text_loss": 0.42634522914886475 @@ -41779,13 +41779,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.042236328125, + "grad_norm": 0.046142578125, "learning_rate": 0.0006782926933552955, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 7092529.0, "repeat_count": 1.0, - "routers_loss": 0.008333612233400345, + "routers_loss": 0.008679097518324852, "skip_count": 7.0, "step": 4398, "text_loss": 0.4283660054206848 @@ -41798,13 +41798,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042236328125, + "grad_norm": 0.042724609375, "learning_rate": 0.0006780034894638014, - "loss": 0.006, + "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 7095141.0, "repeat_count": 0.0, - "routers_loss": 0.0026251052040606737, + "routers_loss": 0.002363949315622449, "skip_count": 0.0, "step": 4400, "text_loss": 0.481539249420166 @@ -41817,13 +41817,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.039794921875, + "grad_norm": 0.040283203125, "learning_rate": 0.000677714217361034, - "loss": 0.0055, + "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 7098208.0, "repeat_count": 0.0, - "routers_loss": 0.003755744779482484, + "routers_loss": 0.004005146212875843, "skip_count": 3.0, "step": 4402, "text_loss": 0.6443291902542114 @@ -41836,13 +41836,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0252685546875, + "grad_norm": 0.0306396484375, "learning_rate": 0.0006774248771578435, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 7101681.0, "repeat_count": 0.0, - "routers_loss": 0.0028396346606314182, + "routers_loss": 0.0026864963583648205, "skip_count": 0.0, "step": 4404, "text_loss": 0.16315312683582306 @@ -41855,13 +41855,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.033447265625, + "grad_norm": 0.0322265625, "learning_rate": 0.0006771354689651054, - "loss": 0.0052, + "loss": 0.005, "macro_f1": 0.9449735879898071, "num_tokens": 7104719.0, "repeat_count": 2.0, - "routers_loss": 0.02745615690946579, + "routers_loss": 0.02719845622777939, "skip_count": 4.0, "step": 4406, "text_loss": 0.37855592370033264 @@ -41874,13 +41874,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0289306640625, + "grad_norm": 0.0284423828125, "learning_rate": 0.0006768459928937213, - "loss": 0.005, + "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7108697.0, "repeat_count": 0.0, - "routers_loss": 0.010080067440867424, + "routers_loss": 0.010488593950867653, "skip_count": 0.0, "step": 4408, "text_loss": 0.23133711516857147 @@ -41893,13 +41893,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0247802734375, + "grad_norm": 0.02392578125, "learning_rate": 0.0006765564490546193, - "loss": 0.0052, + "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 7111426.0, "repeat_count": 1.0, - "routers_loss": 0.001801682054065168, + "routers_loss": 0.0013637891970574856, "skip_count": 0.0, "step": 4410, "text_loss": 0.41399383544921875 @@ -41912,13 +41912,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05224609375, + "grad_norm": 0.0732421875, "learning_rate": 0.0006762668375587528, - "loss": 0.0068, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 7114241.0, "repeat_count": 0.0, - "routers_loss": 0.0009764294954948127, + "routers_loss": 0.000900395680218935, "skip_count": 0.0, "step": 4412, "text_loss": 0.6460412740707397 @@ -41931,13 +41931,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03564453125, + "grad_norm": 0.0498046875, "learning_rate": 0.0006759771585171016, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 7117031.0, "repeat_count": 0.0, - "routers_loss": 0.002657619072124362, + "routers_loss": 0.0024001260753721, "skip_count": 0.0, "step": 4414, "text_loss": 0.7645824551582336 @@ -41950,13 +41950,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0303955078125, + "grad_norm": 0.0306396484375, "learning_rate": 0.0006756874120406714, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 7120766.0, "repeat_count": 3.0, - "routers_loss": 0.005801939871162176, + "routers_loss": 0.005034091416746378, "skip_count": 4.0, "step": 4416, "text_loss": 0.31753066182136536 @@ -41969,13 +41969,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.033447265625, "learning_rate": 0.0006753975982404934, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 7125243.0, "repeat_count": 0.0, - "routers_loss": 0.0026111488696187735, + "routers_loss": 0.002483269665390253, "skip_count": 0.0, "step": 4418, "text_loss": 0.5304268002510071 @@ -41988,13 +41988,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0281982421875, + "grad_norm": 0.027099609375, "learning_rate": 0.0006751077172276249, - "loss": 0.0055, + "loss": 0.0052, "macro_f1": 0.3272727429866791, "num_tokens": 7127795.0, "repeat_count": 0.0, - "routers_loss": 0.028494317084550858, + "routers_loss": 0.02676006779074669, "skip_count": 1.0, "step": 4420, "text_loss": 0.22011354565620422 @@ -42007,13 +42007,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.06396484375, + "grad_norm": 0.06201171875, "learning_rate": 0.000674817769113149, - "loss": 0.0061, + "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 7130837.0, "repeat_count": 0.0, - "routers_loss": 0.003031681990250945, + "routers_loss": 0.003267093561589718, "skip_count": 2.0, "step": 4422, "text_loss": 0.2906076908111572 @@ -42026,13 +42026,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.0284423828125, + "grad_norm": 0.027099609375, "learning_rate": 0.000674527754008174, - "loss": 0.0047, + "loss": 0.0045, "macro_f1": 0.5934640765190125, "num_tokens": 7135090.0, "repeat_count": 0.0, - "routers_loss": 0.023750508204102516, + "routers_loss": 0.022510390728712082, "skip_count": 3.0, "step": 4424, "text_loss": 0.2544902563095093 @@ -42045,13 +42045,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.03662109375, "learning_rate": 0.0006742376720238345, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 7138751.0, "repeat_count": 0.0, - "routers_loss": 0.0012101450702175498, + "routers_loss": 0.0011178571730852127, "skip_count": 0.0, "step": 4426, "text_loss": 0.6811438798904419 @@ -42064,13 +42064,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0216064453125, + "grad_norm": 0.0238037109375, "learning_rate": 0.0006739475232712904, - "loss": 0.0035, + "loss": 0.0036, "macro_f1": 1.0, "num_tokens": 7141762.0, "repeat_count": 2.0, - "routers_loss": 0.005393387749791145, + "routers_loss": 0.005595206283032894, "skip_count": 1.0, "step": 4428, "text_loss": 0.38743990659713745 @@ -42083,13 +42083,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03173828125, + "grad_norm": 0.033203125, "learning_rate": 0.0006736573078617272, - "loss": 0.0066, + "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7145235.0, "repeat_count": 0.0, - "routers_loss": 0.0029694747645407915, + "routers_loss": 0.002793942578136921, "skip_count": 2.0, "step": 4430, "text_loss": 0.21894219517707825 @@ -42102,13 +42102,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.048828125, "learning_rate": 0.0006733670259063561, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 7149042.0, "repeat_count": 0.0, - "routers_loss": 0.006469822954386473, + "routers_loss": 0.006146818865090609, "skip_count": 3.0, "step": 4432, "text_loss": 0.17822015285491943 @@ -42121,13 +42121,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.032958984375, + "grad_norm": 0.042236328125, "learning_rate": 0.0006730766775164136, - "loss": 0.0059, + "loss": 0.0061, "macro_f1": 0.5492662787437439, "num_tokens": 7152166.0, "repeat_count": 0.0, - "routers_loss": 0.026202494278550148, + "routers_loss": 0.026045087724924088, "skip_count": 2.0, "step": 4434, "text_loss": 0.2910420000553131 @@ -42140,13 +42140,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0308837890625, + "grad_norm": 0.03466796875, "learning_rate": 0.0006727862628031618, - "loss": 0.0074, + "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 7155506.0, "repeat_count": 2.0, - "routers_loss": 0.002748608123511076, + "routers_loss": 0.0022973387967795134, "skip_count": 0.0, "step": 4436, "text_loss": 0.3502544164657593 @@ -42159,13 +42159,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.021484375, + "grad_norm": 0.022705078125, "learning_rate": 0.0006724957818778882, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 7158739.0, "repeat_count": 0.0, - "routers_loss": 0.002528413198888302, + "routers_loss": 0.002357073128223419, "skip_count": 1.0, "step": 4438, "text_loss": 0.26200664043426514 @@ -42178,13 +42178,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0279541015625, + "grad_norm": 0.0277099609375, "learning_rate": 0.0006722052348519054, - "loss": 0.0095, + "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 7161776.0, "repeat_count": 0.0, - "routers_loss": 0.0005445044371299446, + "routers_loss": 0.0005521026905626059, "skip_count": 0.0, "step": 4440, "text_loss": 0.3922915458679199 @@ -42197,13 +42197,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04296875, + "grad_norm": 0.044189453125, "learning_rate": 0.000671914621836552, - "loss": 0.0108, + "loss": 0.0106, "macro_f1": 0.6666666865348816, "num_tokens": 7164763.0, "repeat_count": 0.0, - "routers_loss": 0.008194289170205593, + "routers_loss": 0.007691344246268272, "skip_count": 2.0, "step": 4442, "text_loss": 0.6021351218223572 @@ -42216,13 +42216,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0308837890625, + "grad_norm": 0.0322265625, "learning_rate": 0.000671623942943191, - "loss": 0.0075, + "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 7167924.0, "repeat_count": 0.0, - "routers_loss": 0.0033410112373530865, + "routers_loss": 0.0032181134447455406, "skip_count": 0.0, "step": 4444, "text_loss": 0.23639555275440216 @@ -42235,13 +42235,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0286865234375, + "grad_norm": 0.030029296875, "learning_rate": 0.0006713331982832113, - "loss": 0.0073, + "loss": 0.0071, "macro_f1": 0.3272727429866791, "num_tokens": 7170743.0, "repeat_count": 1.0, - "routers_loss": 0.024495115503668785, + "routers_loss": 0.024979131296277046, "skip_count": 0.0, "step": 4446, "text_loss": 0.4957772493362427 @@ -42254,13 +42254,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041748046875, + "grad_norm": 0.043212890625, "learning_rate": 0.0006710423879680271, - "loss": 0.0069, + "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 7174660.0, "repeat_count": 0.0, - "routers_loss": 0.0026295294519513845, + "routers_loss": 0.002571308286860585, "skip_count": 0.0, "step": 4448, "text_loss": 0.47968071699142456 @@ -42273,13 +42273,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0308837890625, + "grad_norm": 0.031494140625, "learning_rate": 0.000670751512109077, - "loss": 0.0063, + "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 7177965.0, "repeat_count": 0.0, - "routers_loss": 0.0024814927019178867, + "routers_loss": 0.00212799571454525, "skip_count": 0.0, "step": 4450, "text_loss": 0.6550716161727905 @@ -42292,13 +42292,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.041748046875, "learning_rate": 0.0006704605708178252, - "loss": 0.0105, + "loss": 0.0107, "macro_f1": 0.6666666865348816, "num_tokens": 7181512.0, "repeat_count": 0.0, - "routers_loss": 0.004174043424427509, + "routers_loss": 0.004176430404186249, "skip_count": 1.0, "step": 4452, "text_loss": 0.36959558725357056 @@ -42311,13 +42311,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0267333984375, + "grad_norm": 0.024658203125, "learning_rate": 0.0006701695642057613, - "loss": 0.005, + "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7184555.0, "repeat_count": 0.0, - "routers_loss": 0.001206343644298613, + "routers_loss": 0.0010968588758260012, "skip_count": 0.0, "step": 4454, "text_loss": 0.6686749458312988 @@ -42330,13 +42330,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.037353515625, "learning_rate": 0.0006698784923843993, - "loss": 0.0077, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 7187474.0, "repeat_count": 0.0, - "routers_loss": 0.001408674637787044, + "routers_loss": 0.0014241471653804183, "skip_count": 0.0, "step": 4456, "text_loss": 0.6147221922874451 @@ -42349,13 +42349,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.026611328125, + "grad_norm": 0.0306396484375, "learning_rate": 0.0006695873554652784, - "loss": 0.0071, + "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 7190649.0, "repeat_count": 0.0, - "routers_loss": 0.008512571454048157, + "routers_loss": 0.008801907300949097, "skip_count": 0.0, "step": 4458, "text_loss": 0.26381927728652954 @@ -42368,13 +42368,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.048095703125, + "grad_norm": 0.04638671875, "learning_rate": 0.0006692961535599634, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 7193961.0, "repeat_count": 0.0, - "routers_loss": 0.009439903311431408, + "routers_loss": 0.009027508087456226, "skip_count": 1.0, "step": 4460, "text_loss": 0.1926470547914505 @@ -42387,13 +42387,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03564453125, + "grad_norm": 0.03662109375, "learning_rate": 0.0006690048867800427, - "loss": 0.0088, + "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 7197456.0, "repeat_count": 0.0, - "routers_loss": 0.002294899197295308, + "routers_loss": 0.0022697453387081623, "skip_count": 0.0, "step": 4462, "text_loss": 0.6736721992492676 @@ -42406,13 +42406,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.024658203125, + "grad_norm": 0.0238037109375, "learning_rate": 0.0006687135552371305, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7200290.0, "repeat_count": 0.0, - "routers_loss": 0.006510137114673853, + "routers_loss": 0.006747903767973185, "skip_count": 1.0, "step": 4464, "text_loss": 0.2026437371969223 @@ -42427,11 +42427,11 @@ "f1_skip": 0.0, "grad_norm": 0.032470703125, "learning_rate": 0.0006684221590428657, - "loss": 0.0067, + "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 7203320.0, "repeat_count": 0.0, - "routers_loss": 0.0010735326213762164, + "routers_loss": 0.0011565096210688353, "skip_count": 0.0, "step": 4466, "text_loss": 0.7587730288505554 @@ -42444,13 +42444,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.052734375, + "grad_norm": 0.058349609375, "learning_rate": 0.0006681306983089121, - "loss": 0.0084, + "loss": 0.0083, "macro_f1": 0.8820862174034119, "num_tokens": 7206411.0, "repeat_count": 2.0, - "routers_loss": 0.02467990666627884, + "routers_loss": 0.023645581677556038, "skip_count": 2.0, "step": 4468, "text_loss": 0.8981561660766602 @@ -42463,13 +42463,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0380859375, + "grad_norm": 0.0361328125, "learning_rate": 0.0006678391731469575, - "loss": 0.0071, + "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 7209421.0, "repeat_count": 0.0, - "routers_loss": 0.0035951859317719936, + "routers_loss": 0.0035848666448146105, "skip_count": 0.0, "step": 4470, "text_loss": 0.1522839516401291 @@ -42482,13 +42482,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0311279296875, + "grad_norm": 0.03173828125, "learning_rate": 0.0006675475836687152, - "loss": 0.007, + "loss": 0.0069, "macro_f1": 1.0, "num_tokens": 7212267.0, "repeat_count": 1.0, - "routers_loss": 0.004971543326973915, + "routers_loss": 0.005046425387263298, "skip_count": 1.0, "step": 4472, "text_loss": 0.46007999777793884 @@ -42501,13 +42501,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.028076171875, + "grad_norm": 0.02685546875, "learning_rate": 0.0006672559299859228, - "loss": 0.0062, + "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 7215195.0, "repeat_count": 0.0, - "routers_loss": 0.002104618586599827, + "routers_loss": 0.0019333874806761742, "skip_count": 0.0, "step": 4474, "text_loss": 1.0859547853469849 @@ -42520,13 +42520,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.040283203125, "learning_rate": 0.0006669642122103423, - "loss": 0.0045, + "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 7217941.0, "repeat_count": 0.0, - "routers_loss": 0.0005096147069707513, + "routers_loss": 0.0005401032394729555, "skip_count": 0.0, "step": 4476, "text_loss": 0.9754356145858765 @@ -42545,7 +42545,7 @@ "macro_f1": 0.3272727429866791, "num_tokens": 7222494.0, "repeat_count": 1.0, - "routers_loss": 0.016167305409908295, + "routers_loss": 0.015569722279906273, "skip_count": 0.0, "step": 4478, "text_loss": 0.2896423637866974 @@ -42558,13 +42558,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03271484375, + "grad_norm": 0.0322265625, "learning_rate": 0.0006663805848279898, - "loss": 0.0057, + "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7225292.0, "repeat_count": 0.0, - "routers_loss": 0.0021911219228059053, + "routers_loss": 0.0020135147497057915, "skip_count": 0.0, "step": 4480, "text_loss": 0.8492724299430847 @@ -42577,13 +42577,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0191650390625, + "grad_norm": 0.0194091796875, "learning_rate": 0.0006660886754448648, - "loss": 0.006, + "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 7229184.0, "repeat_count": 1.0, - "routers_loss": 0.002788309706375003, + "routers_loss": 0.002355351345613599, "skip_count": 0.0, "step": 4482, "text_loss": 0.189764603972435 @@ -42596,13 +42596,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0201416015625, + "grad_norm": 0.02099609375, "learning_rate": 0.0006657967024162459, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 7232906.0, "repeat_count": 0.0, - "routers_loss": 0.003091001184657216, + "routers_loss": 0.003044391982257366, "skip_count": 0.0, "step": 4484, "text_loss": 0.4239847660064697 @@ -42615,13 +42615,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0260009765625, + "grad_norm": 0.0269775390625, "learning_rate": 0.0006655046658540179, - "loss": 0.0048, + "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 7235996.0, "repeat_count": 0.0, - "routers_loss": 0.006288980133831501, + "routers_loss": 0.00602696230635047, "skip_count": 2.0, "step": 4486, "text_loss": 0.217103973031044 @@ -42634,13 +42634,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0179443359375, + "grad_norm": 0.0169677734375, "learning_rate": 0.0006652125658700896, - "loss": 0.0032, + "loss": 0.0031, "macro_f1": 0.6666666865348816, "num_tokens": 7238882.0, "repeat_count": 0.0, - "routers_loss": 0.0017161039868369699, + "routers_loss": 0.001470155781134963, "skip_count": 1.0, "step": 4488, "text_loss": 0.6090770363807678 @@ -42653,13 +42653,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.038818359375, + "grad_norm": 0.03759765625, "learning_rate": 0.0006649204025763945, - "loss": 0.0057, + "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 7241815.0, "repeat_count": 1.0, - "routers_loss": 0.008624191395938396, + "routers_loss": 0.008737480267882347, "skip_count": 2.0, "step": 4490, "text_loss": 0.48314425349235535 @@ -42672,13 +42672,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.017333984375, + "grad_norm": 0.0177001953125, "learning_rate": 0.0006646281760848902, - "loss": 0.004, + "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 7244848.0, "repeat_count": 0.0, - "routers_loss": 0.00083601736696437, + "routers_loss": 0.0008257135050371289, "skip_count": 0.0, "step": 4492, "text_loss": 0.5884748101234436 @@ -42693,11 +42693,11 @@ "f1_skip": 0.0, "grad_norm": 0.0228271484375, "learning_rate": 0.0006643358865075581, - "loss": 0.0057, + "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7247930.0, "repeat_count": 0.0, - "routers_loss": 0.0016617088112980127, + "routers_loss": 0.0016262239078059793, "skip_count": 0.0, "step": 4494, "text_loss": 0.21444730460643768 @@ -42710,13 +42710,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.030029296875, + "grad_norm": 0.0299072265625, "learning_rate": 0.0006640435339564042, - "loss": 0.0075, + "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 7251776.0, "repeat_count": 0.0, - "routers_loss": 0.001377894077450037, + "routers_loss": 0.001315156347118318, "skip_count": 0.0, "step": 4496, "text_loss": 0.6890370845794678 @@ -42729,13 +42729,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.03662109375, + "grad_norm": 0.035400390625, "learning_rate": 0.0006637511185434588, "loss": 0.0091, "macro_f1": 1.0, "num_tokens": 7255070.0, "repeat_count": 1.0, - "routers_loss": 0.007681882940232754, + "routers_loss": 0.007614497095346451, "skip_count": 3.0, "step": 4498, "text_loss": 0.516417920589447 @@ -42748,13 +42748,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0245361328125, + "grad_norm": 0.0238037109375, "learning_rate": 0.0006634586403807758, "loss": 0.0041, "macro_f1": 1.0, "num_tokens": 7258115.0, "repeat_count": 3.0, - "routers_loss": 0.0049721370451152325, + "routers_loss": 0.004906686954200268, "skip_count": 2.0, "step": 4500, "text_loss": 0.577463686466217 @@ -42767,13 +42767,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05859375, + "grad_norm": 0.0927734375, "learning_rate": 0.0006631660995804334, "loss": 0.0067, "macro_f1": 0.6601307392120361, "num_tokens": 7260769.0, "repeat_count": 1.0, - "routers_loss": 0.01382436417043209, + "routers_loss": 0.013337121345102787, "skip_count": 2.0, "step": 4502, "text_loss": 0.37124839425086975 @@ -42786,13 +42786,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0419921875, + "grad_norm": 0.05224609375, "learning_rate": 0.0006628734962545339, - "loss": 0.0083, + "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 7263908.0, "repeat_count": 0.0, - "routers_loss": 0.0024676774628460407, + "routers_loss": 0.0023418180644512177, "skip_count": 0.0, "step": 4504, "text_loss": 0.17937727272510529 @@ -42805,13 +42805,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03466796875, + "grad_norm": 0.040771484375, "learning_rate": 0.0006625808305152033, - "loss": 0.0067, + "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 7267391.0, "repeat_count": 0.0, - "routers_loss": 0.0006373177748173475, + "routers_loss": 0.0006556165171787143, "skip_count": 0.0, "step": 4506, "text_loss": 0.45344987511634827 @@ -42824,13 +42824,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025146484375, + "grad_norm": 0.02734375, "learning_rate": 0.0006622881024745919, - "loss": 0.0044, + "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 7271402.0, "repeat_count": 0.0, - "routers_loss": 0.002280580811202526, + "routers_loss": 0.0021988123189657927, "skip_count": 0.0, "step": 4508, "text_loss": 0.5842905640602112 @@ -42843,13 +42843,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02783203125, + "grad_norm": 0.029052734375, "learning_rate": 0.0006619953122448734, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 7274354.0, "repeat_count": 0.0, - "routers_loss": 0.007782169617712498, + "routers_loss": 0.00774174090474844, "skip_count": 2.0, "step": 4510, "text_loss": 0.27159228920936584 @@ -42862,13 +42862,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03955078125, + "grad_norm": 0.038818359375, "learning_rate": 0.0006617024599382456, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 7277378.0, "repeat_count": 0.0, - "routers_loss": 0.0007145124254748225, + "routers_loss": 0.0006942499312572181, "skip_count": 0.0, "step": 4512, "text_loss": 0.4464176297187805 @@ -42881,13 +42881,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.039794921875, "learning_rate": 0.0006614095456669302, - "loss": 0.0049, + "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7280526.0, "repeat_count": 0.0, - "routers_loss": 0.0031456330325454473, + "routers_loss": 0.003003394464030862, "skip_count": 0.0, "step": 4514, "text_loss": 0.31188079714775085 @@ -42900,13 +42900,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.024658203125, + "grad_norm": 0.028076171875, "learning_rate": 0.0006611165695431725, - "loss": 0.0067, + "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7283916.0, "repeat_count": 0.0, - "routers_loss": 0.000815888459328562, + "routers_loss": 0.0006948060472495854, "skip_count": 0.0, "step": 4516, "text_loss": 0.5266574025154114 @@ -42919,13 +42919,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.042236328125, "learning_rate": 0.0006608235316792413, - "loss": 0.0064, + "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 7286843.0, "repeat_count": 0.0, - "routers_loss": 0.0015030937502160668, + "routers_loss": 0.0014080886030569673, "skip_count": 0.0, "step": 4518, "text_loss": 0.5880120396614075 @@ -42938,13 +42938,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0283203125, + "grad_norm": 0.03173828125, "learning_rate": 0.0006605304321874295, - "loss": 0.007, + "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 7289940.0, "repeat_count": 0.0, - "routers_loss": 0.0017453476320952177, + "routers_loss": 0.0016894340515136719, "skip_count": 0.0, "step": 4520, "text_loss": 0.6623797416687012 @@ -42957,13 +42957,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.024169921875, + "grad_norm": 0.0228271484375, "learning_rate": 0.0006602372711800531, - "loss": 0.0045, + "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 7292869.0, "repeat_count": 0.0, - "routers_loss": 0.0035615740343928337, + "routers_loss": 0.003522444050759077, "skip_count": 0.0, "step": 4522, "text_loss": 0.5488807559013367 @@ -42976,13 +42976,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0234375, + "grad_norm": 0.0240478515625, "learning_rate": 0.0006599440487694521, - "loss": 0.0068, + "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 7296618.0, "repeat_count": 0.0, - "routers_loss": 0.001281693228520453, + "routers_loss": 0.0011981099378317595, "skip_count": 0.0, "step": 4524, "text_loss": 0.4128517210483551 @@ -42995,13 +42995,13 @@ "f1_execute": 0.978723406791687, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, - "grad_norm": 0.02587890625, + "grad_norm": 0.0269775390625, "learning_rate": 0.00065965076506799, - "loss": 0.0048, + "loss": 0.0047, "macro_f1": 0.9262410998344421, "num_tokens": 7300481.0, "repeat_count": 3.0, - "routers_loss": 0.011079956777393818, + "routers_loss": 0.010548194870352745, "skip_count": 2.0, "step": 4526, "text_loss": 0.26450902223587036 @@ -43014,13 +43014,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02685546875, + "grad_norm": 0.028076171875, "learning_rate": 0.0006593574201880536, - "loss": 0.0062, + "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7303272.0, "repeat_count": 0.0, - "routers_loss": 0.005837739445269108, + "routers_loss": 0.005642973352223635, "skip_count": 1.0, "step": 4528, "text_loss": 0.35269856452941895 @@ -43033,13 +43033,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032958984375, + "grad_norm": 0.031982421875, "learning_rate": 0.000659064014242053, - "loss": 0.0046, + "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 7306615.0, "repeat_count": 0.0, - "routers_loss": 0.004657972138375044, + "routers_loss": 0.004171932581812143, "skip_count": 1.0, "step": 4530, "text_loss": 0.18814080953598022 @@ -43052,13 +43052,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0286865234375, + "grad_norm": 0.02734375, "learning_rate": 0.0006587705473424223, - "loss": 0.0072, + "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 7310368.0, "repeat_count": 0.0, - "routers_loss": 0.0025366253685206175, + "routers_loss": 0.002289367141202092, "skip_count": 2.0, "step": 4532, "text_loss": 0.7363705635070801 @@ -43071,13 +43071,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.036376953125, "learning_rate": 0.000658477019601618, - "loss": 0.0072, + "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 7313788.0, "repeat_count": 0.0, - "routers_loss": 0.005018982570618391, + "routers_loss": 0.004440625663846731, "skip_count": 1.0, "step": 4534, "text_loss": 0.8126176595687866 @@ -43090,13 +43090,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.029296875, + "grad_norm": 0.031982421875, "learning_rate": 0.0006581834311321211, - "loss": 0.0085, + "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 7317864.0, "repeat_count": 0.0, - "routers_loss": 0.0013490618439391255, + "routers_loss": 0.0013160990783944726, "skip_count": 2.0, "step": 4536, "text_loss": 0.7015916109085083 @@ -43109,32 +43109,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0390625, + "grad_norm": 0.04736328125, "learning_rate": 0.000657889782046435, - "loss": 0.0062, + "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7320693.0, "repeat_count": 0.0, - "routers_loss": 0.0035190414637327194, + "routers_loss": 0.0032275544945150614, "skip_count": 2.0, "step": 4538, "text_loss": 0.6481677293777466 }, { "acc_repeat": 0.0, - "acc_skip": 1.0, - "avg_layers": 27.0, + "acc_skip": 0.0, + "avg_layers": 28.0, "epoch": 21.314646316407398, - "f1_execute": 1.0, + "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, - "f1_skip": 1.0, - "grad_norm": 0.0223388671875, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, "learning_rate": 0.0006575960724570865, - "loss": 0.0055, - "macro_f1": 0.6666666865348816, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, "num_tokens": 7324335.0, "repeat_count": 0.0, - "routers_loss": 0.007447404786944389, + "routers_loss": 0.009769129566848278, "skip_count": 1.0, "step": 4540, "text_loss": 0.22194676101207733 @@ -43147,13 +43147,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.04248046875, + "grad_norm": 0.042724609375, "learning_rate": 0.0006573023024766258, - "loss": 0.0062, + "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 7327431.0, "repeat_count": 2.0, - "routers_loss": 0.0030924465972930193, + "routers_loss": 0.0036973082460463047, "skip_count": 4.0, "step": 4542, "text_loss": 0.475127637386322 @@ -43166,13 +43166,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039794921875, + "grad_norm": 0.0361328125, "learning_rate": 0.000657008472217626, - "loss": 0.0061, + "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 7330262.0, "repeat_count": 0.0, - "routers_loss": 0.000717726768925786, + "routers_loss": 0.0007046440150588751, "skip_count": 0.0, "step": 4544, "text_loss": 0.2649917006492615 @@ -43185,13 +43185,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044921875, + "grad_norm": 0.04443359375, "learning_rate": 0.0006567145817926836, - "loss": 0.0088, + "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 7333110.0, "repeat_count": 0.0, - "routers_loss": 0.0029236951377242804, + "routers_loss": 0.0026714997366070747, "skip_count": 0.0, "step": 4546, "text_loss": 0.5490524768829346 @@ -43204,13 +43204,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.043701171875, + "grad_norm": 0.062255859375, "learning_rate": 0.0006564206313144175, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 7336101.0, "repeat_count": 0.0, - "routers_loss": 0.006141145247966051, + "routers_loss": 0.006552211008965969, "skip_count": 0.0, "step": 4548, "text_loss": 0.14098678529262543 @@ -43223,13 +43223,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0301513671875, + "grad_norm": 0.02978515625, "learning_rate": 0.0006561266208954707, - "loss": 0.0069, + "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 7339435.0, "repeat_count": 0.0, - "routers_loss": 0.0035991708282381296, + "routers_loss": 0.0035560601390898228, "skip_count": 2.0, "step": 4550, "text_loss": 0.20412275195121765 @@ -43242,13 +43242,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0238037109375, + "grad_norm": 0.0269775390625, "learning_rate": 0.0006558325506485081, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 7342609.0, "repeat_count": 0.0, - "routers_loss": 0.0024792153853923082, + "routers_loss": 0.0020106974989175797, "skip_count": 1.0, "step": 4552, "text_loss": 0.6184256076812744 @@ -43263,11 +43263,11 @@ "f1_skip": 0.0, "grad_norm": 0.050537109375, "learning_rate": 0.0006555384206862183, - "loss": 0.0091, + "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 7345614.0, "repeat_count": 0.0, - "routers_loss": 0.0014447715366259217, + "routers_loss": 0.0014235252747312188, "skip_count": 0.0, "step": 4554, "text_loss": 1.0108838081359863 @@ -43280,13 +43280,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031005859375, + "grad_norm": 0.0302734375, "learning_rate": 0.0006552442311213121, - "loss": 0.0043, + "loss": 0.0041, "macro_f1": 0.3272727429866791, "num_tokens": 7348957.0, "repeat_count": 1.0, - "routers_loss": 0.02027573436498642, + "routers_loss": 0.01703745685517788, "skip_count": 0.0, "step": 4556, "text_loss": 0.21315747499465942 @@ -43299,13 +43299,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.0263671875, + "grad_norm": 0.0269775390625, "learning_rate": 0.0006549499820665237, "loss": 0.0077, "macro_f1": 0.5934640765190125, "num_tokens": 7352724.0, "repeat_count": 0.0, - "routers_loss": 0.012388292700052261, + "routers_loss": 0.013315381482243538, "skip_count": 3.0, "step": 4558, "text_loss": 0.34369465708732605 @@ -43318,13 +43318,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0308837890625, + "grad_norm": 0.033935546875, "learning_rate": 0.00065465567363461, - "loss": 0.0074, + "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 7356592.0, "repeat_count": 0.0, - "routers_loss": 0.0011293066199868917, + "routers_loss": 0.0017354936571791768, "skip_count": 0.0, "step": 4560, "text_loss": 0.6267461180686951 @@ -43337,13 +43337,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.037841796875, "learning_rate": 0.0006543613059383503, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 7359774.0, "repeat_count": 0.0, - "routers_loss": 0.011833512224256992, + "routers_loss": 0.011646085418760777, "skip_count": 2.0, "step": 4562, "text_loss": 0.4400193989276886 @@ -43356,13 +43356,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0277099609375, + "grad_norm": 0.03076171875, "learning_rate": 0.0006540668790905471, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 7362765.0, "repeat_count": 0.0, - "routers_loss": 0.002059109043329954, + "routers_loss": 0.0019345436012372375, "skip_count": 0.0, "step": 4564, "text_loss": 0.49204275012016296 @@ -43375,13 +43375,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.023681640625, + "grad_norm": 0.02685546875, "learning_rate": 0.0006537723932040251, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 7366337.0, "repeat_count": 0.0, - "routers_loss": 0.005968277342617512, + "routers_loss": 0.00562885170802474, "skip_count": 1.0, "step": 4566, "text_loss": 0.22566382586956024 @@ -43394,13 +43394,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.037841796875, + "grad_norm": 0.03515625, "learning_rate": 0.0006534778483916319, - "loss": 0.0087, + "loss": 0.0084, "macro_f1": 1.0, "num_tokens": 7369851.0, "repeat_count": 2.0, - "routers_loss": 0.005483719054609537, + "routers_loss": 0.005508176051080227, "skip_count": 2.0, "step": 4568, "text_loss": 0.8057850003242493 @@ -43413,13 +43413,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.030029296875, + "grad_norm": 0.03076171875, "learning_rate": 0.0006531832447662377, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 7373918.0, "repeat_count": 0.0, - "routers_loss": 0.006533551495522261, + "routers_loss": 0.006460923235863447, "skip_count": 2.0, "step": 4570, "text_loss": 0.5141497254371643 @@ -43432,13 +43432,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04345703125, + "grad_norm": 0.042236328125, "learning_rate": 0.0006528885824407351, - "loss": 0.0083, + "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 7376674.0, "repeat_count": 0.0, - "routers_loss": 0.003182383719831705, + "routers_loss": 0.0032120654359459877, "skip_count": 0.0, "step": 4572, "text_loss": 0.1281338930130005 @@ -43451,13 +43451,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.054931640625, + "grad_norm": 0.052490234375, "learning_rate": 0.0006525938615280394, - "loss": 0.0118, + "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 7379791.0, "repeat_count": 0.0, - "routers_loss": 0.00441814586520195, + "routers_loss": 0.00443810923025012, "skip_count": 0.0, "step": 4574, "text_loss": 0.268352210521698 @@ -43470,13 +43470,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0296630859375, + "grad_norm": 0.027587890625, "learning_rate": 0.000652299082141088, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 7382886.0, "repeat_count": 0.0, - "routers_loss": 0.008390357717871666, + "routers_loss": 0.008284369483590126, "skip_count": 2.0, "step": 4576, "text_loss": 0.30193832516670227 @@ -43489,13 +43489,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.03076171875, "learning_rate": 0.0006520042443928411, - "loss": 0.0071, + "loss": 0.0068, "macro_f1": 0.8823530077934265, "num_tokens": 7386036.0, "repeat_count": 2.0, - "routers_loss": 0.03992438316345215, + "routers_loss": 0.03383317217230797, "skip_count": 1.0, "step": 4578, "text_loss": 0.23106542229652405 @@ -43508,13 +43508,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.0419921875, "learning_rate": 0.000651709348396281, - "loss": 0.0053, + "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 7388908.0, "repeat_count": 0.0, - "routers_loss": 0.001781110418960452, + "routers_loss": 0.0017075951909646392, "skip_count": 1.0, "step": 4580, "text_loss": 0.386099249124527 @@ -43527,13 +43527,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0296630859375, + "grad_norm": 0.031494140625, "learning_rate": 0.0006514143942644124, - "loss": 0.0049, + "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 7392004.0, "repeat_count": 0.0, - "routers_loss": 0.009884138591587543, + "routers_loss": 0.009516917169094086, "skip_count": 1.0, "step": 4582, "text_loss": 0.3162059485912323 @@ -43546,13 +43546,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04736328125, + "grad_norm": 0.051513671875, "learning_rate": 0.0006511193821102623, - "loss": 0.0078, + "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 7395538.0, "repeat_count": 0.0, - "routers_loss": 0.0032415634486824274, + "routers_loss": 0.0031392278615385294, "skip_count": 0.0, "step": 4584, "text_loss": 0.5536221861839294 @@ -43565,13 +43565,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.06298828125, + "grad_norm": 0.05224609375, "learning_rate": 0.0006508243120468799, - "loss": 0.0054, + "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 7398461.0, "repeat_count": 0.0, - "routers_loss": 0.0014925460563972592, + "routers_loss": 0.0014138511614874005, "skip_count": 0.0, "step": 4586, "text_loss": 0.7934318780899048 @@ -43584,13 +43584,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0242919921875, + "grad_norm": 0.0224609375, "learning_rate": 0.0006505291841873367, - "loss": 0.0053, + "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 7401611.0, "repeat_count": 0.0, - "routers_loss": 0.0005577150150202215, + "routers_loss": 0.0005265916115604341, "skip_count": 0.0, "step": 4588, "text_loss": 0.4569905698299408 @@ -43609,7 +43609,7 @@ "macro_f1": 0.3333333432674408, "num_tokens": 7404641.0, "repeat_count": 0.0, - "routers_loss": 0.0023784362711012363, + "routers_loss": 0.0024988956283777952, "skip_count": 0.0, "step": 4590, "text_loss": 0.49998772144317627 @@ -43622,13 +43622,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0235595703125, + "grad_norm": 0.025634765625, "learning_rate": 0.0006499387555321636, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7407574.0, "repeat_count": 0.0, - "routers_loss": 0.004376447293907404, + "routers_loss": 0.004110113717615604, "skip_count": 1.0, "step": 4592, "text_loss": 0.5679413676261902 @@ -43641,13 +43641,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03271484375, + "grad_norm": 0.031982421875, "learning_rate": 0.0006496434549627874, - "loss": 0.0069, + "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7410806.0, "repeat_count": 0.0, - "routers_loss": 0.0032524678390473127, + "routers_loss": 0.0032845588866621256, "skip_count": 0.0, "step": 4594, "text_loss": 0.35515281558036804 @@ -43660,13 +43660,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.036865234375, + "grad_norm": 0.03857421875, "learning_rate": 0.0006493480970497568, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 7413402.0, "repeat_count": 0.0, - "routers_loss": 0.009982835501432419, + "routers_loss": 0.010577172972261906, "skip_count": 1.0, "step": 4596, "text_loss": 0.26111698150634766 @@ -43679,13 +43679,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.03662109375, + "grad_norm": 0.035888671875, "learning_rate": 0.0006490526819062537, - "loss": 0.0093, + "loss": 0.0091, "macro_f1": 1.0, "num_tokens": 7417236.0, "repeat_count": 1.0, - "routers_loss": 0.002379048615694046, + "routers_loss": 0.002054794691503048, "skip_count": 2.0, "step": 4598, "text_loss": 0.6480993628501892 @@ -43698,13 +43698,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07763671875, + "grad_norm": 0.07958984375, "learning_rate": 0.0006487572096454818, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 7420278.0, "repeat_count": 0.0, - "routers_loss": 0.0017828276613727212, + "routers_loss": 0.0017989084590226412, "skip_count": 0.0, "step": 4600, "text_loss": 0.4935401678085327 @@ -43717,13 +43717,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.03369140625, "learning_rate": 0.0006484616803806665, - "loss": 0.0058, + "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 7423866.0, "repeat_count": 0.0, - "routers_loss": 0.007584894075989723, + "routers_loss": 0.006671485956758261, "skip_count": 1.0, "step": 4602, "text_loss": 0.15030258893966675 @@ -43736,13 +43736,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.0311279296875, "learning_rate": 0.0006481660942250552, - "loss": 0.0054, + "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 7426884.0, "repeat_count": 0.0, - "routers_loss": 0.008093188516795635, + "routers_loss": 0.008334980346262455, "skip_count": 3.0, "step": 4604, "text_loss": 0.29933279752731323 @@ -43755,13 +43755,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.030029296875, + "grad_norm": 0.03125, "learning_rate": 0.0006478704512919173, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 7431017.0, "repeat_count": 0.0, - "routers_loss": 0.012283207848668098, + "routers_loss": 0.011923984624445438, "skip_count": 3.0, "step": 4606, "text_loss": 0.35141825675964355 @@ -43774,13 +43774,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0267333984375, + "grad_norm": 0.0279541015625, "learning_rate": 0.0006475747516945432, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 7434406.0, "repeat_count": 0.0, - "routers_loss": 0.0035103289410471916, + "routers_loss": 0.0031092462595552206, "skip_count": 3.0, "step": 4608, "text_loss": 0.21021464467048645 @@ -43793,13 +43793,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0279541015625, + "grad_norm": 0.02978515625, "learning_rate": 0.000647278995546246, - "loss": 0.0058, + "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 7437204.0, "repeat_count": 1.0, - "routers_loss": 0.0006666383123956621, + "routers_loss": 0.0006713552866131067, "skip_count": 0.0, "step": 4610, "text_loss": 0.4052635431289673 @@ -43812,13 +43812,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0274658203125, + "grad_norm": 0.03173828125, "learning_rate": 0.0006469831829603598, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7439741.0, "repeat_count": 0.0, - "routers_loss": 0.0028148891869932413, + "routers_loss": 0.0022583482787013054, "skip_count": 2.0, "step": 4612, "text_loss": 0.5443860292434692 @@ -43831,13 +43831,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04296875, + "grad_norm": 0.044677734375, "learning_rate": 0.0006466873140502407, - "loss": 0.0074, + "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 7443619.0, "repeat_count": 0.0, - "routers_loss": 0.0037154473830014467, + "routers_loss": 0.004187075886875391, "skip_count": 2.0, "step": 4614, "text_loss": 0.30709847807884216 @@ -43850,13 +43850,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.019775390625, + "grad_norm": 0.0194091796875, "learning_rate": 0.0006463913889292661, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 7446696.0, "repeat_count": 0.0, - "routers_loss": 0.007844357751309872, + "routers_loss": 0.008314833045005798, "skip_count": 0.0, "step": 4616, "text_loss": 0.22949637472629547 @@ -43869,13 +43869,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0250244140625, + "grad_norm": 0.02685546875, "learning_rate": 0.0006460954077108353, - "loss": 0.0047, + "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7450377.0, "repeat_count": 0.0, - "routers_loss": 0.001379768829792738, + "routers_loss": 0.001277514616958797, "skip_count": 0.0, "step": 4618, "text_loss": 0.37715134024620056 @@ -43888,13 +43888,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0289306640625, + "grad_norm": 0.02734375, "learning_rate": 0.0006457993705083684, - "loss": 0.0051, + "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 7453271.0, "repeat_count": 0.0, - "routers_loss": 0.0019801959861069918, + "routers_loss": 0.0022756033577024937, "skip_count": 2.0, "step": 4620, "text_loss": 0.7373883128166199 @@ -43907,13 +43907,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.019775390625, + "grad_norm": 0.02099609375, "learning_rate": 0.0006455032774353078, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 7456492.0, "repeat_count": 0.0, - "routers_loss": 0.0038891383446753025, + "routers_loss": 0.0039057908579707146, "skip_count": 2.0, "step": 4622, "text_loss": 0.5058769583702087 @@ -43926,13 +43926,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0196533203125, + "grad_norm": 0.0203857421875, "learning_rate": 0.0006452071286051169, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 7459619.0, "repeat_count": 0.0, - "routers_loss": 0.001924185431562364, + "routers_loss": 0.0019458672031760216, "skip_count": 0.0, "step": 4624, "text_loss": 0.5110082030296326 @@ -43945,13 +43945,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.033447265625, "learning_rate": 0.0006449109241312802, - "loss": 0.0059, + "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 7462552.0, "repeat_count": 0.0, - "routers_loss": 0.000527520664036274, + "routers_loss": 0.0002716891176532954, "skip_count": 1.0, "step": 4626, "text_loss": 0.6197522878646851 @@ -43964,13 +43964,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.05126953125, + "grad_norm": 0.045654296875, "learning_rate": 0.0006446146641273042, - "loss": 0.0063, + "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 7466769.0, "repeat_count": 0.0, - "routers_loss": 0.004048905335366726, + "routers_loss": 0.0037578947376459837, "skip_count": 2.0, "step": 4628, "text_loss": 0.1653924286365509 @@ -43983,13 +43983,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0228271484375, + "grad_norm": 0.022705078125, "learning_rate": 0.000644318348706716, - "loss": 0.0074, + "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 7470216.0, "repeat_count": 0.0, - "routers_loss": 0.001336342073045671, + "routers_loss": 0.0012791058979928493, "skip_count": 0.0, "step": 4630, "text_loss": 0.7114694118499756 @@ -44002,13 +44002,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.032958984375, "learning_rate": 0.0006440219779830643, - "loss": 0.0076, + "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 7472975.0, "repeat_count": 0.0, - "routers_loss": 0.007155329454690218, + "routers_loss": 0.00736592011526227, "skip_count": 2.0, "step": 4632, "text_loss": 0.26601463556289673 @@ -44021,13 +44021,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.0322265625, "learning_rate": 0.000643725552069919, - "loss": 0.0071, + "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 7475672.0, "repeat_count": 0.0, - "routers_loss": 0.0004819786408916116, + "routers_loss": 0.00045455715735442936, "skip_count": 0.0, "step": 4634, "text_loss": 0.5028402805328369 @@ -44040,13 +44040,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0225830078125, + "grad_norm": 0.022705078125, "learning_rate": 0.0006434290710808711, - "loss": 0.0055, + "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7478850.0, "repeat_count": 0.0, - "routers_loss": 0.004355283919721842, + "routers_loss": 0.004247233271598816, "skip_count": 2.0, "step": 4636, "text_loss": 0.12746070325374603 @@ -44059,13 +44059,13 @@ "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.04150390625, + "grad_norm": 0.04052734375, "learning_rate": 0.0006431325351295324, "loss": 0.0083, "macro_f1": 0.5427350401878357, "num_tokens": 7481747.0, "repeat_count": 1.0, - "routers_loss": 0.04843593016266823, + "routers_loss": 0.047564394772052765, "skip_count": 2.0, "step": 4638, "text_loss": 0.24056802690029144 @@ -44078,13 +44078,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.0615234375, "learning_rate": 0.0006428359443295362, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7484885.0, "repeat_count": 0.0, - "routers_loss": 0.0010549267753958702, + "routers_loss": 0.0011175100225955248, "skip_count": 0.0, "step": 4640, "text_loss": 0.6265338063240051 @@ -44097,13 +44097,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0361328125, + "grad_norm": 0.035400390625, "learning_rate": 0.0006425392987945369, "loss": 0.0086, "macro_f1": 0.5492662787437439, "num_tokens": 7487973.0, "repeat_count": 0.0, - "routers_loss": 0.016608718782663345, + "routers_loss": 0.016879938542842865, "skip_count": 2.0, "step": 4642, "text_loss": 0.2523447275161743 @@ -44116,13 +44116,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, - "grad_norm": 0.0322265625, + "grad_norm": 0.032958984375, "learning_rate": 0.0006422425986382093, "loss": 0.0055, "macro_f1": 0.5934640765190125, "num_tokens": 7491024.0, "repeat_count": 0.0, - "routers_loss": 0.01848086155951023, + "routers_loss": 0.018616504967212677, "skip_count": 3.0, "step": 4644, "text_loss": 0.38890624046325684 @@ -44135,13 +44135,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.038818359375, + "grad_norm": 0.0400390625, "learning_rate": 0.0006419458439742496, "loss": 0.0056, "macro_f1": 0.3272727429866791, "num_tokens": 7494199.0, "repeat_count": 0.0, - "routers_loss": 0.022435056045651436, + "routers_loss": 0.023129139095544815, "skip_count": 1.0, "step": 4646, "text_loss": 0.4060848355293274 @@ -44154,13 +44154,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.03271484375, "learning_rate": 0.0006416490349163747, - "loss": 0.0083, + "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 7497287.0, "repeat_count": 0.0, - "routers_loss": 0.0018073184182867408, + "routers_loss": 0.0018601802876219153, "skip_count": 0.0, "step": 4648, "text_loss": 0.3387545943260193 @@ -44173,13 +44173,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033935546875, + "grad_norm": 0.03173828125, "learning_rate": 0.0006413521715783225, - "loss": 0.0078, + "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 7500598.0, "repeat_count": 0.0, - "routers_loss": 0.0017208937788382173, + "routers_loss": 0.0017482215771451592, "skip_count": 0.0, "step": 4650, "text_loss": 0.4290996193885803 @@ -44192,13 +44192,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04052734375, + "grad_norm": 0.040771484375, "learning_rate": 0.0006410552540738514, - "loss": 0.0071, + "loss": 0.007, "macro_f1": 0.3272727429866791, "num_tokens": 7503252.0, "repeat_count": 1.0, - "routers_loss": 0.04149872064590454, + "routers_loss": 0.0420118011534214, "skip_count": 0.0, "step": 4652, "text_loss": 0.439496248960495 @@ -44211,13 +44211,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.028076171875, + "grad_norm": 0.026611328125, "learning_rate": 0.000640758282516741, - "loss": 0.0057, + "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 7506382.0, "repeat_count": 1.0, - "routers_loss": 0.002120798220857978, + "routers_loss": 0.0017782216891646385, "skip_count": 1.0, "step": 4654, "text_loss": 0.8513308167457581 @@ -44230,13 +44230,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.0439453125, "learning_rate": 0.0006404612570207911, - "loss": 0.0103, + "loss": 0.0102, "macro_f1": 0.3272727429866791, "num_tokens": 7510423.0, "repeat_count": 0.0, - "routers_loss": 0.009855805896222591, + "routers_loss": 0.010385853238403797, "skip_count": 0.0, "step": 4656, "text_loss": 0.7159742712974548 @@ -44249,13 +44249,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03369140625, + "grad_norm": 0.031982421875, "learning_rate": 0.0006401641776998223, - "loss": 0.0047, + "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 7513394.0, "repeat_count": 0.0, - "routers_loss": 0.0011576786637306213, + "routers_loss": 0.0011917101219296455, "skip_count": 0.0, "step": 4658, "text_loss": 0.6165401339530945 @@ -44268,13 +44268,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.02734375, + "grad_norm": 0.028564453125, "learning_rate": 0.0006398670446676766, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 7516828.0, "repeat_count": 3.0, - "routers_loss": 0.008810436353087425, + "routers_loss": 0.008860073052346706, "skip_count": 4.0, "step": 4660, "text_loss": 0.923275887966156 @@ -44287,13 +44287,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.042236328125, + "grad_norm": 0.041015625, "learning_rate": 0.0006395698580382153, - "loss": 0.0065, + "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 7519764.0, "repeat_count": 0.0, - "routers_loss": 0.0005982713773846626, + "routers_loss": 0.000505418807733804, "skip_count": 0.0, "step": 4662, "text_loss": 0.6143050789833069 @@ -44306,13 +44306,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04150390625, + "grad_norm": 0.0439453125, "learning_rate": 0.0006392726179253212, - "loss": 0.0047, + "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 7522390.0, "repeat_count": 0.0, - "routers_loss": 0.004173434805124998, + "routers_loss": 0.004020806401968002, "skip_count": 1.0, "step": 4664, "text_loss": 0.6935067176818848 @@ -44325,13 +44325,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.04052734375, + "grad_norm": 0.052001953125, "learning_rate": 0.0006389753244428972, - "loss": 0.0078, + "loss": 0.0079, "macro_f1": 1.0, "num_tokens": 7525821.0, "repeat_count": 1.0, - "routers_loss": 0.008930242620408535, + "routers_loss": 0.00957963801920414, "skip_count": 2.0, "step": 4666, "text_loss": 0.3350338637828827 @@ -44344,13 +44344,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.04296875, + "grad_norm": 0.039794921875, "learning_rate": 0.0006386779777048666, - "loss": 0.0066, + "loss": 0.0063, "macro_f1": 0.6601307392120361, "num_tokens": 7529513.0, "repeat_count": 1.0, - "routers_loss": 0.02444119192659855, + "routers_loss": 0.020673364400863647, "skip_count": 2.0, "step": 4668, "text_loss": 0.47800472378730774 @@ -44363,13 +44363,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0255126953125, + "grad_norm": 0.0257568359375, "learning_rate": 0.0006383805778251735, - "loss": 0.005, + "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 7533450.0, "repeat_count": 0.0, - "routers_loss": 0.007665765006095171, + "routers_loss": 0.007217096630483866, "skip_count": 1.0, "step": 4670, "text_loss": 0.4506106972694397 @@ -44382,13 +44382,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0284423828125, + "grad_norm": 0.0257568359375, "learning_rate": 0.0006380831249177817, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 7536287.0, "repeat_count": 1.0, - "routers_loss": 0.008599632419645786, + "routers_loss": 0.007001714315265417, "skip_count": 0.0, "step": 4672, "text_loss": 0.4081715941429138 @@ -44401,13 +44401,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0250244140625, + "grad_norm": 0.027587890625, "learning_rate": 0.0006377856190966762, - "loss": 0.0055, + "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 7539442.0, "repeat_count": 0.0, - "routers_loss": 0.0014951099874451756, + "routers_loss": 0.0015112817054614425, "skip_count": 0.0, "step": 4674, "text_loss": 0.21451139450073242 @@ -44420,13 +44420,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.0341796875, "learning_rate": 0.0006374880604758615, - "loss": 0.0086, + "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 7542594.0, "repeat_count": 0.0, - "routers_loss": 0.00817523431032896, + "routers_loss": 0.007311929017305374, "skip_count": 2.0, "step": 4676, "text_loss": 0.14785248041152954 @@ -44439,13 +44439,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02734375, + "grad_norm": 0.0306396484375, "learning_rate": 0.0006371904491693626, - "loss": 0.0052, + "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 7545780.0, "repeat_count": 0.0, - "routers_loss": 0.007712447550147772, + "routers_loss": 0.007489737123250961, "skip_count": 1.0, "step": 4678, "text_loss": 0.2248108983039856 @@ -44458,13 +44458,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.03466796875, + "grad_norm": 0.031494140625, "learning_rate": 0.0006368927852912247, - "loss": 0.0059, + "loss": 0.0057, "macro_f1": 1.0, "num_tokens": 7548287.0, "repeat_count": 1.0, - "routers_loss": 0.010472464375197887, + "routers_loss": 0.009772555902600288, "skip_count": 1.0, "step": 4680, "text_loss": 0.1566995233297348 @@ -44477,13 +44477,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.0322265625, "learning_rate": 0.0006365950689555133, - "loss": 0.0065, + "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 7551424.0, "repeat_count": 0.0, - "routers_loss": 0.0019706315360963345, + "routers_loss": 0.002134992741048336, "skip_count": 0.0, "step": 4682, "text_loss": 0.7322417497634888 @@ -44496,13 +44496,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0257568359375, + "grad_norm": 0.0240478515625, "learning_rate": 0.0006362973002763139, - "loss": 0.0071, + "loss": 0.007, "macro_f1": 1.0, "num_tokens": 7554182.0, "repeat_count": 1.0, - "routers_loss": 0.0077865333296358585, + "routers_loss": 0.008511497639119625, "skip_count": 4.0, "step": 4684, "text_loss": 0.24387991428375244 @@ -44515,13 +44515,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.046875, + "grad_norm": 0.04931640625, "learning_rate": 0.0006359994793677319, "loss": 0.0095, "macro_f1": 0.6666666865348816, "num_tokens": 7557044.0, "repeat_count": 0.0, - "routers_loss": 0.004420961253345013, + "routers_loss": 0.004151526838541031, "skip_count": 2.0, "step": 4686, "text_loss": 0.6139411330223083 @@ -44534,13 +44534,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0233154296875, + "grad_norm": 0.0228271484375, "learning_rate": 0.0006357016063438928, - "loss": 0.0047, + "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 7560231.0, "repeat_count": 0.0, - "routers_loss": 0.0011308451648801565, + "routers_loss": 0.0009724601986818016, "skip_count": 0.0, "step": 4688, "text_loss": 0.7875718474388123 @@ -44553,13 +44553,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03271484375, + "grad_norm": 0.0308837890625, "learning_rate": 0.0006354036813189421, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 7562953.0, "repeat_count": 0.0, - "routers_loss": 0.0008846965502016246, + "routers_loss": 0.0008926765876822174, "skip_count": 0.0, "step": 4690, "text_loss": 0.5195512771606445 @@ -44572,13 +44572,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.037353515625, + "grad_norm": 0.03759765625, "learning_rate": 0.0006351057044070455, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 7566137.0, "repeat_count": 0.0, - "routers_loss": 0.003313175868242979, + "routers_loss": 0.0031294538639485836, "skip_count": 0.0, "step": 4692, "text_loss": 0.7288873195648193 @@ -44591,13 +44591,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0274658203125, + "grad_norm": 0.026123046875, "learning_rate": 0.0006348076757223877, - "loss": 0.004, + "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 7569073.0, "repeat_count": 0.0, - "routers_loss": 0.0016258886316791177, + "routers_loss": 0.0015065820189192891, "skip_count": 2.0, "step": 4694, "text_loss": 0.7242236137390137 @@ -44610,13 +44610,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0218505859375, + "grad_norm": 0.0235595703125, "learning_rate": 0.0006345095953791746, - "loss": 0.0075, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 7573025.0, "repeat_count": 0.0, - "routers_loss": 0.0005521657876670361, + "routers_loss": 0.0005603441968560219, "skip_count": 0.0, "step": 4696, "text_loss": 0.34443899989128113 @@ -44629,13 +44629,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.023681640625, + "grad_norm": 0.02490234375, "learning_rate": 0.0006342114634916307, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 7576546.0, "repeat_count": 0.0, - "routers_loss": 0.0011082915589213371, + "routers_loss": 0.0011047758162021637, "skip_count": 0.0, "step": 4698, "text_loss": 0.4892682731151581 @@ -44648,13 +44648,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0218505859375, + "grad_norm": 0.02490234375, "learning_rate": 0.0006339132801740008, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 7580711.0, "repeat_count": 0.0, - "routers_loss": 0.001985425828024745, + "routers_loss": 0.0019803126342594624, "skip_count": 2.0, "step": 4700, "text_loss": 0.4479489028453827 @@ -44667,13 +44667,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.037353515625, + "grad_norm": 0.0458984375, "learning_rate": 0.0006336150455405494, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 7583385.0, "repeat_count": 1.0, - "routers_loss": 0.0005365543183870614, + "routers_loss": 0.0005326359532773495, "skip_count": 0.0, "step": 4702, "text_loss": 0.627504825592041 @@ -44686,13 +44686,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.021240234375, + "grad_norm": 0.0194091796875, "learning_rate": 0.0006333167597055604, - "loss": 0.0037, + "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 7586584.0, "repeat_count": 0.0, - "routers_loss": 0.0006241816445253789, + "routers_loss": 0.0005587987834587693, "skip_count": 0.0, "step": 4704, "text_loss": 0.43891432881355286 @@ -44705,13 +44705,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02734375, + "grad_norm": 0.0263671875, "learning_rate": 0.0006330184227833376, - "loss": 0.0045, + "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 7590408.0, "repeat_count": 0.0, - "routers_loss": 0.00726567255333066, + "routers_loss": 0.007053783163428307, "skip_count": 2.0, "step": 4706, "text_loss": 0.19946859776973724 @@ -44724,13 +44724,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0224609375, + "grad_norm": 0.0228271484375, "learning_rate": 0.0006327200348882043, - "loss": 0.0047, + "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 7593857.0, "repeat_count": 1.0, - "routers_loss": 0.0011741123162209988, + "routers_loss": 0.0009479080326855183, "skip_count": 0.0, "step": 4708, "text_loss": 0.7973214387893677 @@ -44743,13 +44743,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07177734375, + "grad_norm": 0.1259765625, "learning_rate": 0.0006324215961345032, - "loss": 0.0057, + "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 7596429.0, "repeat_count": 0.0, - "routers_loss": 0.0012845906894654036, + "routers_loss": 0.0012403312139213085, "skip_count": 0.0, "step": 4710, "text_loss": 0.48477989435195923 @@ -44762,13 +44762,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.032470703125, + "grad_norm": 0.03515625, "learning_rate": 0.0006321231066365966, - "loss": 0.0069, + "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 7599618.0, "repeat_count": 0.0, - "routers_loss": 0.0005659137386828661, + "routers_loss": 0.0005520360427908599, "skip_count": 0.0, "step": 4712, "text_loss": 0.44222453236579895 @@ -44781,13 +44781,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.039306640625, + "grad_norm": 0.04150390625, "learning_rate": 0.0006318245665088665, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 7603180.0, "repeat_count": 0.0, - "routers_loss": 0.0018121730536222458, + "routers_loss": 0.0015553623670712113, "skip_count": 0.0, "step": 4714, "text_loss": 0.5132410526275635 @@ -44800,13 +44800,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02734375, + "grad_norm": 0.027587890625, "learning_rate": 0.0006315259758657138, - "loss": 0.0049, + "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 7606457.0, "repeat_count": 0.0, - "routers_loss": 0.004462256096303463, + "routers_loss": 0.004210884217172861, "skip_count": 1.0, "step": 4716, "text_loss": 0.39850690960884094 @@ -44819,13 +44819,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.021728515625, + "grad_norm": 0.02294921875, "learning_rate": 0.0006312273348215589, - "loss": 0.0069, + "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 7609317.0, "repeat_count": 1.0, - "routers_loss": 0.0011878227815032005, + "routers_loss": 0.001220117206685245, "skip_count": 0.0, "step": 4718, "text_loss": 0.3509018123149872 @@ -44838,13 +44838,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.031494140625, + "grad_norm": 0.03271484375, "learning_rate": 0.0006309286434908419, - "loss": 0.008, + "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 7613076.0, "repeat_count": 0.0, - "routers_loss": 0.008010992780327797, + "routers_loss": 0.007768960203975439, "skip_count": 2.0, "step": 4720, "text_loss": 0.33361560106277466 @@ -44857,32 +44857,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031494140625, + "grad_norm": 0.0322265625, "learning_rate": 0.0006306299019880217, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 7616242.0, "repeat_count": 0.0, - "routers_loss": 0.005931100342422724, + "routers_loss": 0.006226699333637953, "skip_count": 0.0, "step": 4722, "text_loss": 0.23661087453365326 }, { "acc_repeat": 1.0, - "acc_skip": 1.0, - "avg_layers": 28.0, + "acc_skip": 0.0, + "avg_layers": 29.0, "epoch": 22.17845611975345, - "f1_execute": 1.0, + "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, - "f1_skip": 1.0, - "grad_norm": 0.0478515625, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, "learning_rate": 0.0006303311104275766, - "loss": 0.0075, - "macro_f1": 1.0, + "loss": 0.0073, + "macro_f1": 0.6603773832321167, "num_tokens": 7619069.0, "repeat_count": 1.0, - "routers_loss": 0.013775430619716644, + "routers_loss": 0.015590761788189411, "skip_count": 1.0, "step": 4724, "text_loss": 0.23373056948184967 @@ -44895,13 +44895,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0302734375, + "grad_norm": 0.028564453125, "learning_rate": 0.0006300322689240041, - "loss": 0.0077, + "loss": 0.0076, "macro_f1": 1.0, "num_tokens": 7622581.0, "repeat_count": 1.0, - "routers_loss": 0.0069032334722578526, + "routers_loss": 0.006862971931695938, "skip_count": 2.0, "step": 4726, "text_loss": 0.8301828503608704 @@ -44914,13 +44914,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.038818359375, "learning_rate": 0.0006297333775918209, - "loss": 0.0089, + "loss": 0.0086, "macro_f1": 1.0, "num_tokens": 7625566.0, "repeat_count": 1.0, - "routers_loss": 0.006230995524674654, + "routers_loss": 0.006256614346057177, "skip_count": 1.0, "step": 4728, "text_loss": 0.3756707012653351 @@ -44933,13 +44933,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.030517578125, + "grad_norm": 0.0301513671875, "learning_rate": 0.0006294344365455626, - "loss": 0.0078, + "loss": 0.0079, "macro_f1": 1.0, "num_tokens": 7629047.0, "repeat_count": 1.0, - "routers_loss": 0.009772522374987602, + "routers_loss": 0.009151885285973549, "skip_count": 2.0, "step": 4730, "text_loss": 0.33362850546836853 @@ -44952,13 +44952,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.035888671875, + "grad_norm": 0.035400390625, "learning_rate": 0.0006291354458997841, - "loss": 0.007, + "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 7631847.0, "repeat_count": 0.0, - "routers_loss": 0.000902787665836513, + "routers_loss": 0.0009307434665970504, "skip_count": 0.0, "step": 4732, "text_loss": 0.4572524130344391 @@ -44971,13 +44971,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0262451171875, + "grad_norm": 0.0272216796875, "learning_rate": 0.0006288364057690591, - "loss": 0.0072, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 7635181.0, "repeat_count": 0.0, - "routers_loss": 0.0004107247805222869, + "routers_loss": 0.00041220212006010115, "skip_count": 0.0, "step": 4734, "text_loss": 0.40211325883865356 @@ -44990,13 +44990,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03515625, + "grad_norm": 0.03955078125, "learning_rate": 0.0006285373162679804, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 7637752.0, "repeat_count": 0.0, - "routers_loss": 0.0008339153719134629, + "routers_loss": 0.0006696670898236334, "skip_count": 2.0, "step": 4736, "text_loss": 0.7588053345680237 @@ -45009,13 +45009,13 @@ "f1_execute": 0.9777777791023254, "f1_repeat": 0.8571428656578064, "f1_skip": 1.0, - "grad_norm": 0.0390625, + "grad_norm": 0.03759765625, "learning_rate": 0.0006282381775111597, "loss": 0.0081, "macro_f1": 0.9449735879898071, "num_tokens": 7640719.0, "repeat_count": 4.0, - "routers_loss": 0.015601541846990585, + "routers_loss": 0.016283133998513222, "skip_count": 2.0, "step": 4738, "text_loss": 0.5697863101959229 @@ -45028,13 +45028,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.037841796875, + "grad_norm": 0.03955078125, "learning_rate": 0.0006279389896132274, - "loss": 0.0062, + "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 7643524.0, "repeat_count": 0.0, - "routers_loss": 0.00740925082936883, + "routers_loss": 0.00763951288536191, "skip_count": 3.0, "step": 4740, "text_loss": 0.548592209815979 @@ -45049,11 +45049,11 @@ "f1_skip": 1.0, "grad_norm": 0.03857421875, "learning_rate": 0.0006276397526888329, - "loss": 0.0095, + "loss": 0.0094, "macro_f1": 0.925203263759613, "num_tokens": 7646919.0, "repeat_count": 3.0, - "routers_loss": 0.03791050612926483, + "routers_loss": 0.038590483367443085, "skip_count": 5.0, "step": 4742, "text_loss": 0.27226054668426514 @@ -45066,13 +45066,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03564453125, + "grad_norm": 0.037109375, "learning_rate": 0.0006273404668526443, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 7650404.0, "repeat_count": 0.0, - "routers_loss": 0.0013001165352761745, + "routers_loss": 0.0012555639259517193, "skip_count": 0.0, "step": 4744, "text_loss": 0.47892290353775024 @@ -45085,13 +45085,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0235595703125, + "grad_norm": 0.0233154296875, "learning_rate": 0.0006270411322193488, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 7652942.0, "repeat_count": 1.0, - "routers_loss": 0.001371108810417354, + "routers_loss": 0.0015356402145698667, "skip_count": 0.0, "step": 4746, "text_loss": 0.5515767931938171 @@ -45104,13 +45104,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03369140625, + "grad_norm": 0.0390625, "learning_rate": 0.0006267417489036517, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 7656269.0, "repeat_count": 0.0, - "routers_loss": 0.00558467349037528, + "routers_loss": 0.005182140972465277, "skip_count": 0.0, "step": 4748, "text_loss": 0.3496028184890747 @@ -45123,13 +45123,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.062255859375, + "grad_norm": 0.0615234375, "learning_rate": 0.0006264423170202773, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 7658664.0, "repeat_count": 0.0, - "routers_loss": 0.0044899932108819485, + "routers_loss": 0.004144361708313227, "skip_count": 0.0, "step": 4750, "text_loss": 0.2786032557487488 @@ -45142,13 +45142,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0269775390625, + "grad_norm": 0.0267333984375, "learning_rate": 0.0006261428366839685, - "loss": 0.0047, + "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 7661471.0, "repeat_count": 0.0, - "routers_loss": 0.0002782076771836728, + "routers_loss": 0.00035335420398041606, "skip_count": 0.0, "step": 4752, "text_loss": 0.4838487505912781 @@ -45161,32 +45161,32 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02783203125, + "grad_norm": 0.030517578125, "learning_rate": 0.0006258433080094868, - "loss": 0.0044, + "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 7664593.0, "repeat_count": 0.0, - "routers_loss": 0.010121302679181099, + "routers_loss": 0.0103341368958354, "skip_count": 2.0, "step": 4754, "text_loss": 0.24325360357761383 }, { "acc_repeat": 0.0, - "acc_skip": 0.5, - "avg_layers": 27.0, + "acc_skip": 1.0, + "avg_layers": 26.0, "epoch": 22.328734957440563, - "f1_execute": 0.9811320900917053, + "f1_execute": 1.0, "f1_repeat": 0.0, - "f1_skip": 0.6666666865348816, - "grad_norm": 0.035888671875, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, "learning_rate": 0.0006255437311116119, - "loss": 0.0082, - "macro_f1": 0.5492662787437439, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, "num_tokens": 7667573.0, "repeat_count": 0.0, - "routers_loss": 0.015182681381702423, + "routers_loss": 0.014633853919804096, "skip_count": 2.0, "step": 4756, "text_loss": 0.21569855511188507 @@ -45199,13 +45199,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.029052734375, + "grad_norm": 0.0284423828125, "learning_rate": 0.0006252441061051426, - "loss": 0.0056, + "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 7671171.0, "repeat_count": 0.0, - "routers_loss": 0.005404457915574312, + "routers_loss": 0.004900569561868906, "skip_count": 0.0, "step": 4758, "text_loss": 0.12832018733024597 @@ -45218,13 +45218,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.030517578125, + "grad_norm": 0.028564453125, "learning_rate": 0.0006249444331048955, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 7673932.0, "repeat_count": 0.0, - "routers_loss": 0.002476566471159458, + "routers_loss": 0.0020371589343994856, "skip_count": 0.0, "step": 4760, "text_loss": 0.38652482628822327 @@ -45237,13 +45237,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0281982421875, + "grad_norm": 0.025634765625, "learning_rate": 0.000624644712225706, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7677396.0, "repeat_count": 0.0, - "routers_loss": 0.003040580777451396, + "routers_loss": 0.0028059002943336964, "skip_count": 2.0, "step": 4762, "text_loss": 0.7937633395195007 @@ -45256,13 +45256,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0238037109375, + "grad_norm": 0.02587890625, "learning_rate": 0.0006243449435824276, - "loss": 0.005, + "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 7680392.0, "repeat_count": 0.0, - "routers_loss": 0.0007072070729918778, + "routers_loss": 0.0007225095760077238, "skip_count": 0.0, "step": 4764, "text_loss": 0.5690395832061768 @@ -45275,13 +45275,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02783203125, + "grad_norm": 0.0281982421875, "learning_rate": 0.0006240451272899321, - "loss": 0.0063, + "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 7684121.0, "repeat_count": 0.0, - "routers_loss": 0.0024044427555054426, + "routers_loss": 0.002052050782367587, "skip_count": 1.0, "step": 4766, "text_loss": 0.5321336984634399 @@ -45294,13 +45294,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.03515625, "learning_rate": 0.0006237452634631099, - "loss": 0.0071, + "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 7687236.0, "repeat_count": 1.0, - "routers_loss": 0.003375594737008214, + "routers_loss": 0.0039039517287164927, "skip_count": 0.0, "step": 4768, "text_loss": 0.30823320150375366 @@ -45313,13 +45313,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0299072265625, + "grad_norm": 0.0303955078125, "learning_rate": 0.0006234453522168694, - "loss": 0.0087, + "loss": 0.0084, "macro_f1": 0.5492662787437439, "num_tokens": 7690355.0, "repeat_count": 0.0, - "routers_loss": 0.016256459057331085, + "routers_loss": 0.014570238068699837, "skip_count": 2.0, "step": 4770, "text_loss": 0.21501587331295013 @@ -45332,13 +45332,13 @@ "f1_execute": 0.949999988079071, "f1_repeat": 0.800000011920929, "f1_skip": 0.9090909361839294, - "grad_norm": 0.048583984375, + "grad_norm": 0.04541015625, "learning_rate": 0.000623145393666137, - "loss": 0.0071, + "loss": 0.0069, "macro_f1": 0.886363685131073, "num_tokens": 7693559.0, "repeat_count": 3.0, - "routers_loss": 0.06640318781137466, + "routers_loss": 0.061707716435194016, "skip_count": 6.0, "step": 4772, "text_loss": 0.24371100962162018 @@ -45351,13 +45351,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0255126953125, + "grad_norm": 0.0281982421875, "learning_rate": 0.0006228453879258576, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 7696422.0, "repeat_count": 0.0, - "routers_loss": 0.004930639173835516, + "routers_loss": 0.005053870379924774, "skip_count": 2.0, "step": 4774, "text_loss": 0.237778440117836 @@ -45370,13 +45370,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05224609375, + "grad_norm": 0.060302734375, "learning_rate": 0.0006225453351109934, - "loss": 0.0088, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 7700460.0, "repeat_count": 0.0, - "routers_loss": 0.0018267944687977433, + "routers_loss": 0.0017990898340940475, "skip_count": 0.0, "step": 4776, "text_loss": 0.612456738948822 @@ -45389,13 +45389,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.033203125, + "grad_norm": 0.03466796875, "learning_rate": 0.000622245235336526, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7703330.0, "repeat_count": 0.0, - "routers_loss": 0.004756844602525234, + "routers_loss": 0.004507021512836218, "skip_count": 2.0, "step": 4778, "text_loss": 0.36898812651634216 @@ -45408,13 +45408,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.031005859375, + "grad_norm": 0.03076171875, "learning_rate": 0.0006219450887174537, - "loss": 0.0065, + "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 7707243.0, "repeat_count": 0.0, - "routers_loss": 0.00667968625202775, + "routers_loss": 0.006295828148722649, "skip_count": 1.0, "step": 4780, "text_loss": 0.14474599063396454 @@ -45427,13 +45427,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0294189453125, + "grad_norm": 0.03515625, "learning_rate": 0.0006216448953687932, - "loss": 0.0071, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 7711121.0, "repeat_count": 0.0, - "routers_loss": 0.004780827090144157, + "routers_loss": 0.005049831233918667, "skip_count": 0.0, "step": 4782, "text_loss": 0.4696790277957916 @@ -45446,13 +45446,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02783203125, + "grad_norm": 0.028076171875, "learning_rate": 0.0006213446554055795, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7714889.0, "repeat_count": 0.0, - "routers_loss": 0.0006851314683444798, + "routers_loss": 0.0006010758224874735, "skip_count": 0.0, "step": 4784, "text_loss": 0.46253830194473267 @@ -45465,13 +45465,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0277099609375, + "grad_norm": 0.028564453125, "learning_rate": 0.0006210443689428649, - "loss": 0.0062, + "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 7718420.0, "repeat_count": 3.0, - "routers_loss": 0.00759447505697608, + "routers_loss": 0.006691234186291695, "skip_count": 1.0, "step": 4786, "text_loss": 0.579987645149231 @@ -45484,13 +45484,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.033447265625, + "grad_norm": 0.035400390625, "learning_rate": 0.00062074403609572, - "loss": 0.0076, + "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 7721720.0, "repeat_count": 0.0, - "routers_loss": 0.0019895671866834164, + "routers_loss": 0.001864895923063159, "skip_count": 0.0, "step": 4788, "text_loss": 0.325242817401886 @@ -45503,13 +45503,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0284423828125, + "grad_norm": 0.02880859375, "learning_rate": 0.0006204436569792324, - "loss": 0.009, + "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 7724916.0, "repeat_count": 0.0, - "routers_loss": 0.0020269565284252167, + "routers_loss": 0.00202955212444067, "skip_count": 0.0, "step": 4790, "text_loss": 0.49637556076049805 @@ -45522,13 +45522,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.032470703125, "learning_rate": 0.0006201432317085083, - "loss": 0.0086, + "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 7728081.0, "repeat_count": 1.0, - "routers_loss": 0.004511707462370396, + "routers_loss": 0.0037843603640794754, "skip_count": 0.0, "step": 4792, "text_loss": 0.38812628388404846 @@ -45541,13 +45541,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.02880859375, + "grad_norm": 0.0301513671875, "learning_rate": 0.0006198427603986711, - "loss": 0.0065, + "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 7731457.0, "repeat_count": 0.0, - "routers_loss": 0.011496705003082752, + "routers_loss": 0.012036679312586784, "skip_count": 3.0, "step": 4794, "text_loss": 0.2996312379837036 @@ -45560,13 +45560,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.022705078125, + "grad_norm": 0.0247802734375, "learning_rate": 0.0006195422431648623, - "loss": 0.0058, + "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 7734595.0, "repeat_count": 0.0, - "routers_loss": 0.0009816563688218594, + "routers_loss": 0.0008874868508428335, "skip_count": 1.0, "step": 4796, "text_loss": 0.3203189969062805 @@ -45579,13 +45579,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0281982421875, + "grad_norm": 0.0283203125, "learning_rate": 0.0006192416801222403, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 7737565.0, "repeat_count": 1.0, - "routers_loss": 0.0031518745236098766, + "routers_loss": 0.0032894534524530172, "skip_count": 1.0, "step": 4798, "text_loss": 0.3283322751522064 @@ -45598,13 +45598,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.056640625, + "grad_norm": 0.053955078125, "learning_rate": 0.0006189410713859815, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 7740439.0, "repeat_count": 0.0, - "routers_loss": 0.009768245741724968, + "routers_loss": 0.009667043574154377, "skip_count": 2.0, "step": 4800, "text_loss": 0.25219282507896423 @@ -45617,13 +45617,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03662109375, + "grad_norm": 0.03857421875, "learning_rate": 0.0006186404170712797, - "loss": 0.0094, + "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 7743813.0, "repeat_count": 0.0, - "routers_loss": 0.012967129237949848, + "routers_loss": 0.012643060646951199, "skip_count": 4.0, "step": 4802, "text_loss": 0.22567439079284668 @@ -45636,13 +45636,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.031982421875, + "grad_norm": 0.03125, "learning_rate": 0.0006183397172933462, - "loss": 0.006, + "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 7747182.0, "repeat_count": 0.0, - "routers_loss": 0.002813612576574087, + "routers_loss": 0.002678517485037446, "skip_count": 0.0, "step": 4804, "text_loss": 0.19188879430294037 @@ -45655,13 +45655,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0228271484375, + "grad_norm": 0.0233154296875, "learning_rate": 0.0006180389721674101, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 7750735.0, "repeat_count": 0.0, - "routers_loss": 0.0013491564895957708, + "routers_loss": 0.0013385121710598469, "skip_count": 0.0, "step": 4806, "text_loss": 0.5860441327095032 @@ -45674,13 +45674,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.031005859375, + "grad_norm": 0.0284423828125, "learning_rate": 0.000617738181808717, - "loss": 0.0063, + "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 7753843.0, "repeat_count": 0.0, - "routers_loss": 0.0035517180804163218, + "routers_loss": 0.0034869094379246235, "skip_count": 1.0, "step": 4808, "text_loss": 0.4366260766983032 @@ -45693,13 +45693,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.044677734375, + "grad_norm": 0.0478515625, "learning_rate": 0.0006174373463325306, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 7757039.0, "repeat_count": 0.0, - "routers_loss": 0.0014680681051686406, + "routers_loss": 0.0013648992171511054, "skip_count": 0.0, "step": 4810, "text_loss": 0.5217258334159851 @@ -45712,13 +45712,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.025390625, + "grad_norm": 0.0274658203125, "learning_rate": 0.0006171364658541314, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 7760016.0, "repeat_count": 1.0, - "routers_loss": 0.004398355260491371, + "routers_loss": 0.0038017008919268847, "skip_count": 2.0, "step": 4812, "text_loss": 0.8130963444709778 @@ -45731,13 +45731,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.039794921875, + "grad_norm": 0.03466796875, "learning_rate": 0.0006168355404888177, - "loss": 0.008, + "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 7762961.0, "repeat_count": 0.0, - "routers_loss": 0.006870325654745102, + "routers_loss": 0.006867518648505211, "skip_count": 2.0, "step": 4814, "text_loss": 0.17822521924972534 @@ -45750,13 +45750,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0299072265625, + "grad_norm": 0.03076171875, "learning_rate": 0.0006165345703519043, - "loss": 0.0059, + "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 7766399.0, "repeat_count": 0.0, - "routers_loss": 0.0004937525955028832, + "routers_loss": 0.0004653502255678177, "skip_count": 0.0, "step": 4816, "text_loss": 0.5316070914268494 @@ -45769,13 +45769,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.036376953125, + "grad_norm": 0.035888671875, "learning_rate": 0.0006162335555587238, - "loss": 0.0081, + "loss": 0.008, "macro_f1": 1.0, "num_tokens": 7769039.0, "repeat_count": 1.0, - "routers_loss": 0.0014112245989963412, + "routers_loss": 0.0016906452365219593, "skip_count": 1.0, "step": 4818, "text_loss": 0.5680997967720032 @@ -45788,13 +45788,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.051025390625, + "grad_norm": 0.05615234375, "learning_rate": 0.0006159324962246257, "loss": 0.0066, "macro_f1": 0.3333333432674408, "num_tokens": 7772768.0, "repeat_count": 0.0, - "routers_loss": 0.0026105360593646765, + "routers_loss": 0.002541248919442296, "skip_count": 0.0, "step": 4820, "text_loss": 0.6169226169586182 @@ -45807,13 +45807,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.029541015625, + "grad_norm": 0.031494140625, "learning_rate": 0.0006156313924649762, - "loss": 0.0057, + "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 7775545.0, "repeat_count": 0.0, - "routers_loss": 0.008672980591654778, + "routers_loss": 0.008644679561257362, "skip_count": 2.0, "step": 4822, "text_loss": 0.2211475968360901 @@ -45826,13 +45826,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0294189453125, + "grad_norm": 0.02880859375, "learning_rate": 0.0006153302443951589, - "loss": 0.006, + "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 7778837.0, "repeat_count": 0.0, - "routers_loss": 0.0035932520404458046, + "routers_loss": 0.0041346061043441296, "skip_count": 2.0, "step": 4824, "text_loss": 0.5369775891304016 @@ -45851,7 +45851,7 @@ "macro_f1": 0.3333333432674408, "num_tokens": 7782309.0, "repeat_count": 0.0, - "routers_loss": 0.0012299016816541553, + "routers_loss": 0.0012756052892655134, "skip_count": 0.0, "step": 4826, "text_loss": 0.5294989943504333 @@ -45864,13 +45864,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.026123046875, + "grad_norm": 0.02734375, "learning_rate": 0.0006147278157866403, - "loss": 0.0047, + "loss": 0.0046, "macro_f1": 0.3272727429866791, "num_tokens": 7785565.0, "repeat_count": 0.0, - "routers_loss": 0.02901158109307289, + "routers_loss": 0.029718991369009018, "skip_count": 1.0, "step": 4828, "text_loss": 0.6920449733734131 @@ -45883,13 +45883,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03466796875, + "grad_norm": 0.032470703125, "learning_rate": 0.0006144265354787906, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 7788218.0, "repeat_count": 0.0, - "routers_loss": 0.00484448904171586, + "routers_loss": 0.004829924553632736, "skip_count": 0.0, "step": 4830, "text_loss": 0.17072243988513947 @@ -45902,13 +45902,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07421875, + "grad_norm": 0.06689453125, "learning_rate": 0.0006141252113224767, - "loss": 0.0045, + "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 7790788.0, "repeat_count": 0.0, - "routers_loss": 0.002483877120539546, + "routers_loss": 0.00254037044942379, "skip_count": 0.0, "step": 4832, "text_loss": 0.20075996220111847 @@ -45921,13 +45921,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0157470703125, + "grad_norm": 0.01519775390625, "learning_rate": 0.0006138238434331666, - "loss": 0.0046, + "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 7793913.0, "repeat_count": 0.0, - "routers_loss": 0.0004437893512658775, + "routers_loss": 0.0004426188243087381, "skip_count": 0.0, "step": 4834, "text_loss": 0.695742130279541 @@ -45940,13 +45940,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0240478515625, + "grad_norm": 0.022216796875, "learning_rate": 0.000613522431926345, - "loss": 0.0037, + "loss": 0.0036, "macro_f1": 1.0, "num_tokens": 7796932.0, "repeat_count": 1.0, - "routers_loss": 0.005339824128895998, + "routers_loss": 0.005176798906177282, "skip_count": 3.0, "step": 4836, "text_loss": 0.4910822808742523 @@ -45959,13 +45959,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025634765625, + "grad_norm": 0.0262451171875, "learning_rate": 0.0006132209769175132, - "loss": 0.0047, + "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 7800686.0, "repeat_count": 0.0, - "routers_loss": 0.004220465198159218, + "routers_loss": 0.004120545461773872, "skip_count": 0.0, "step": 4838, "text_loss": 0.3701378405094147 @@ -45978,13 +45978,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.022216796875, + "grad_norm": 0.0218505859375, "learning_rate": 0.0006129194785221894, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 7804765.0, "repeat_count": 0.0, - "routers_loss": 0.00431162491440773, + "routers_loss": 0.0043835826218128204, "skip_count": 0.0, "step": 4840, "text_loss": 0.343635618686676 @@ -45997,13 +45997,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.035400390625, + "grad_norm": 0.035888671875, "learning_rate": 0.0006126179368559086, - "loss": 0.0059, + "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 7807498.0, "repeat_count": 0.0, - "routers_loss": 0.0013186183059588075, + "routers_loss": 0.001394893741235137, "skip_count": 1.0, "step": 4842, "text_loss": 0.47756674885749817 @@ -46016,13 +46016,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.041259765625, + "grad_norm": 0.048828125, "learning_rate": 0.000612316352034222, - "loss": 0.0053, + "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7810784.0, "repeat_count": 0.0, - "routers_loss": 0.0030679802875965834, + "routers_loss": 0.0031262130942195654, "skip_count": 2.0, "step": 4844, "text_loss": 0.13077901303768158 @@ -46035,13 +46035,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.0546875, + "grad_norm": 0.058349609375, "learning_rate": 0.0006120147241726972, - "loss": 0.0083, + "loss": 0.0081, "macro_f1": 0.8823530077934265, "num_tokens": 7814754.0, "repeat_count": 2.0, - "routers_loss": 0.02003045752644539, + "routers_loss": 0.016139274463057518, "skip_count": 1.0, "step": 4846, "text_loss": 0.18850074708461761 @@ -46054,13 +46054,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04638671875, + "grad_norm": 0.041015625, "learning_rate": 0.0006117130533869189, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 7818245.0, "repeat_count": 0.0, - "routers_loss": 0.0010096770711243153, + "routers_loss": 0.0009124451316893101, "skip_count": 0.0, "step": 4848, "text_loss": 0.42503559589385986 @@ -46073,13 +46073,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.022705078125, + "grad_norm": 0.0224609375, "learning_rate": 0.0006114113397924878, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 7822214.0, "repeat_count": 0.0, - "routers_loss": 0.0014919894747436047, + "routers_loss": 0.0015132242115214467, "skip_count": 0.0, "step": 4850, "text_loss": 0.16767354309558868 @@ -46092,13 +46092,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.040283203125, + "grad_norm": 0.04150390625, "learning_rate": 0.0006111095835050212, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 7825019.0, "repeat_count": 2.0, - "routers_loss": 0.0065781730227172375, + "routers_loss": 0.006253300234675407, "skip_count": 2.0, "step": 4852, "text_loss": 0.44826745986938477 @@ -46111,13 +46111,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.025146484375, + "grad_norm": 0.024169921875, "learning_rate": 0.0006108077846401524, - "loss": 0.0037, + "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 7828113.0, "repeat_count": 0.0, - "routers_loss": 0.00244692200794816, + "routers_loss": 0.0024391328915953636, "skip_count": 0.0, "step": 4854, "text_loss": 0.2009880244731903 @@ -46130,13 +46130,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.028564453125, + "grad_norm": 0.03369140625, "learning_rate": 0.0006105059433135317, - "loss": 0.0079, + "loss": 0.0078, "macro_f1": 1.0, "num_tokens": 7831177.0, "repeat_count": 1.0, - "routers_loss": 0.002367270179092884, + "routers_loss": 0.0020866121631115675, "skip_count": 1.0, "step": 4856, "text_loss": 0.7082528471946716 @@ -46149,13 +46149,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0272216796875, + "grad_norm": 0.025390625, "learning_rate": 0.0006102040596408251, - "loss": 0.0072, + "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 7834485.0, "repeat_count": 0.0, - "routers_loss": 0.005648438818752766, + "routers_loss": 0.004373365081846714, "skip_count": 1.0, "step": 4858, "text_loss": 0.2541539669036865 @@ -46168,13 +46168,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0291748046875, + "grad_norm": 0.02734375, "learning_rate": 0.0006099021337377148, - "loss": 0.0049, + "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7837749.0, "repeat_count": 0.0, - "routers_loss": 0.0042733135633170605, + "routers_loss": 0.004309024661779404, "skip_count": 0.0, "step": 4860, "text_loss": 0.3163885176181793 @@ -46187,13 +46187,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, - "grad_norm": 0.04296875, + "grad_norm": 0.049072265625, "learning_rate": 0.0006096001657198995, - "loss": 0.0066, + "loss": 0.0065, "macro_f1": 0.6122449040412903, "num_tokens": 7840979.0, "repeat_count": 0.0, - "routers_loss": 0.023403044790029526, + "routers_loss": 0.023044804111123085, "skip_count": 4.0, "step": 4862, "text_loss": 0.49609798192977905 @@ -46206,13 +46206,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.02490234375, + "grad_norm": 0.0250244140625, "learning_rate": 0.0006092981557030941, - "loss": 0.0058, + "loss": 0.0056, "macro_f1": 1.0, "num_tokens": 7844905.0, "repeat_count": 1.0, - "routers_loss": 0.011750902980566025, + "routers_loss": 0.010683654807507992, "skip_count": 3.0, "step": 4864, "text_loss": 0.16866883635520935 @@ -46225,13 +46225,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02099609375, + "grad_norm": 0.0224609375, "learning_rate": 0.0006089961038030291, - "loss": 0.006, + "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 7847800.0, "repeat_count": 0.0, - "routers_loss": 0.0012288664001971483, + "routers_loss": 0.0011224723421037197, "skip_count": 0.0, "step": 4866, "text_loss": 0.5093055367469788 @@ -46244,13 +46244,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.037353515625, "learning_rate": 0.0006086940101354515, - "loss": 0.0051, + "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 7850983.0, "repeat_count": 0.0, - "routers_loss": 0.004861745983362198, + "routers_loss": 0.003944621421396732, "skip_count": 1.0, "step": 4868, "text_loss": 0.5753747224807739 @@ -46263,13 +46263,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.0311279296875, + "grad_norm": 0.0289306640625, "learning_rate": 0.0006083918748161244, "loss": 0.0069, "macro_f1": 0.5492662787437439, "num_tokens": 7855041.0, "repeat_count": 0.0, - "routers_loss": 0.025008518248796463, + "routers_loss": 0.02532145567238331, "skip_count": 2.0, "step": 4870, "text_loss": 0.8082366585731506 @@ -46282,13 +46282,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0286865234375, + "grad_norm": 0.029052734375, "learning_rate": 0.0006080896979608262, - "loss": 0.0054, + "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 7858058.0, "repeat_count": 0.0, - "routers_loss": 0.0007896953029558063, + "routers_loss": 0.0007558314246125519, "skip_count": 0.0, "step": 4872, "text_loss": 0.6476574540138245 @@ -46301,13 +46301,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05029296875, + "grad_norm": 0.0615234375, "learning_rate": 0.000607787479685352, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 7861223.0, "repeat_count": 0.0, - "routers_loss": 0.0008885554852895439, + "routers_loss": 0.0009224560926668346, "skip_count": 0.0, "step": 4874, "text_loss": 0.5012133717536926 @@ -46320,13 +46320,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03662109375, + "grad_norm": 0.03515625, "learning_rate": 0.0006074852201055121, - "loss": 0.0084, + "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 7864180.0, "repeat_count": 0.0, - "routers_loss": 0.0029017175547778606, + "routers_loss": 0.0028308273758739233, "skip_count": 0.0, "step": 4876, "text_loss": 0.7447214722633362 @@ -46339,13 +46339,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.04541015625, + "grad_norm": 0.052734375, "learning_rate": 0.0006071829193371331, - "loss": 0.0058, + "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 7866726.0, "repeat_count": 0.0, - "routers_loss": 0.0021245202515274286, + "routers_loss": 0.0021505290642380714, "skip_count": 0.0, "step": 4878, "text_loss": 0.5444929599761963 @@ -46358,13 +46358,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0751953125, + "grad_norm": 0.11376953125, "learning_rate": 0.0006068805774960573, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 7870166.0, "repeat_count": 0.0, - "routers_loss": 0.0021692372392863035, + "routers_loss": 0.0021109723020344973, "skip_count": 0.0, "step": 4880, "text_loss": 0.3577263355255127 @@ -46377,13 +46377,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02880859375, + "grad_norm": 0.0308837890625, "learning_rate": 0.0006065781946981425, - "loss": 0.0057, + "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7873028.0, "repeat_count": 0.0, - "routers_loss": 0.0026705453637987375, + "routers_loss": 0.0027144821360707283, "skip_count": 0.0, "step": 4882, "text_loss": 0.28464797139167786 @@ -46396,13 +46396,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.05517578125, + "grad_norm": 0.05224609375, "learning_rate": 0.0006062757710592624, - "loss": 0.0058, + "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7876747.0, "repeat_count": 0.0, - "routers_loss": 0.0004111099406145513, + "routers_loss": 0.0004638207610696554, "skip_count": 0.0, "step": 4884, "text_loss": 0.381534606218338 @@ -46415,13 +46415,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0299072265625, + "grad_norm": 0.02685546875, "learning_rate": 0.0006059733066953066, - "loss": 0.0045, + "loss": 0.0043, "macro_f1": 1.0, "num_tokens": 7879524.0, "repeat_count": 1.0, - "routers_loss": 0.0019129335414618254, + "routers_loss": 0.002225410658866167, "skip_count": 2.0, "step": 4886, "text_loss": 0.5167883634567261 @@ -46434,13 +46434,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0279541015625, + "grad_norm": 0.028564453125, "learning_rate": 0.0006056708017221796, - "loss": 0.0043, + "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 7882809.0, "repeat_count": 0.0, - "routers_loss": 0.0046940455213189125, + "routers_loss": 0.00419368501752615, "skip_count": 1.0, "step": 4888, "text_loss": 0.22688335180282593 @@ -46453,13 +46453,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.030517578125, + "grad_norm": 0.036376953125, "learning_rate": 0.000605368256255802, - "loss": 0.0054, + "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 7886310.0, "repeat_count": 0.0, - "routers_loss": 0.0017953033093363047, + "routers_loss": 0.0017340193735435605, "skip_count": 1.0, "step": 4890, "text_loss": 1.0128135681152344 @@ -46472,13 +46472,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.07373046875, + "grad_norm": 0.0712890625, "learning_rate": 0.0006050656704121098, - "loss": 0.0098, + "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 7889483.0, "repeat_count": 0.0, - "routers_loss": 0.0018971457611769438, + "routers_loss": 0.0016647159354761243, "skip_count": 0.0, "step": 4892, "text_loss": 0.2213262915611267 @@ -46491,13 +46491,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.03271484375, "learning_rate": 0.0006047630443070547, - "loss": 0.0063, + "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 7892615.0, "repeat_count": 0.0, - "routers_loss": 0.0040974365547299385, + "routers_loss": 0.0038971947506070137, "skip_count": 3.0, "step": 4894, "text_loss": 0.45751357078552246 @@ -46510,13 +46510,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.034912109375, + "grad_norm": 0.0341796875, "learning_rate": 0.0006044603780566032, - "loss": 0.0053, + "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 7895747.0, "repeat_count": 1.0, - "routers_loss": 0.0035370320547372103, + "routers_loss": 0.0036852145567536354, "skip_count": 1.0, "step": 4896, "text_loss": 0.13489919900894165 @@ -46529,13 +46529,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.024658203125, + "grad_norm": 0.0235595703125, "learning_rate": 0.0006041576717767379, - "loss": 0.0058, + "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 7899155.0, "repeat_count": 0.0, - "routers_loss": 0.008930077776312828, + "routers_loss": 0.007661987561732531, "skip_count": 1.0, "step": 4898, "text_loss": 0.281853586435318 @@ -46548,13 +46548,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.029541015625, + "grad_norm": 0.03125, "learning_rate": 0.0006038549255834563, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 7901667.0, "repeat_count": 2.0, - "routers_loss": 0.014533254317939281, + "routers_loss": 0.01836695335805416, "skip_count": 5.0, "step": 4900, "text_loss": 0.24879895150661469 @@ -46567,13 +46567,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0262451171875, + "grad_norm": 0.02880859375, "learning_rate": 0.000603552139592771, - "loss": 0.0058, + "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7904506.0, "repeat_count": 0.0, - "routers_loss": 0.0012445948086678982, + "routers_loss": 0.0011829182039946318, "skip_count": 0.0, "step": 4902, "text_loss": 0.7550268769264221 @@ -46586,13 +46586,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.037109375, + "grad_norm": 0.03662109375, "learning_rate": 0.0006032493139207106, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 7907316.0, "repeat_count": 1.0, - "routers_loss": 0.0022323536686599255, + "routers_loss": 0.0022891140542924404, "skip_count": 0.0, "step": 4904, "text_loss": 0.37596020102500916 @@ -46605,13 +46605,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0286865234375, + "grad_norm": 0.0289306640625, "learning_rate": 0.0006029464486833186, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 7911283.0, "repeat_count": 0.0, - "routers_loss": 0.00221167947165668, + "routers_loss": 0.001990227960050106, "skip_count": 0.0, "step": 4906, "text_loss": 0.5879577994346619 @@ -46624,13 +46624,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0220947265625, + "grad_norm": 0.0211181640625, "learning_rate": 0.0006026435439966531, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 7913907.0, "repeat_count": 0.0, - "routers_loss": 0.0025787551421672106, + "routers_loss": 0.0026039890944957733, "skip_count": 1.0, "step": 4908, "text_loss": 0.41484713554382324 @@ -46643,13 +46643,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.027099609375, + "grad_norm": 0.0272216796875, "learning_rate": 0.0006023405999767879, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 7916772.0, "repeat_count": 0.0, - "routers_loss": 0.00866663083434105, + "routers_loss": 0.009183229878544807, "skip_count": 1.0, "step": 4910, "text_loss": 0.20732562243938446 @@ -46662,13 +46662,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0274658203125, + "grad_norm": 0.0302734375, "learning_rate": 0.0006020376167398116, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 7919346.0, "repeat_count": 0.0, - "routers_loss": 0.005565170664340258, + "routers_loss": 0.005508727394044399, "skip_count": 1.0, "step": 4912, "text_loss": 0.41416165232658386 @@ -46681,13 +46681,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03271484375, + "grad_norm": 0.033203125, "learning_rate": 0.0006017345944018284, - "loss": 0.0052, + "loss": 0.0051, "macro_f1": 0.3272727429866791, "num_tokens": 7922404.0, "repeat_count": 0.0, - "routers_loss": 0.008860527537763119, + "routers_loss": 0.008651934564113617, "skip_count": 0.0, "step": 4914, "text_loss": 0.4290519952774048 @@ -46700,13 +46700,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.034423828125, + "grad_norm": 0.0299072265625, "learning_rate": 0.0006014315330789563, - "loss": 0.0078, + "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 7925165.0, "repeat_count": 0.0, - "routers_loss": 0.003383385483175516, + "routers_loss": 0.003601635340601206, "skip_count": 1.0, "step": 4916, "text_loss": 0.8447931408882141 @@ -46719,13 +46719,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0341796875, + "grad_norm": 0.034912109375, "learning_rate": 0.0006011284328873296, - "loss": 0.004, + "loss": 0.0041, "macro_f1": 1.0, "num_tokens": 7928146.0, "repeat_count": 1.0, - "routers_loss": 0.0047926693223416805, + "routers_loss": 0.0049415635876357555, "skip_count": 2.0, "step": 4918, "text_loss": 0.32237401604652405 @@ -46738,13 +46738,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.027099609375, + "grad_norm": 0.0291748046875, "learning_rate": 0.0006008252939430967, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 7931163.0, "repeat_count": 0.0, - "routers_loss": 0.002505079610273242, + "routers_loss": 0.0024150956887751818, "skip_count": 0.0, "step": 4920, "text_loss": 0.2251713126897812 @@ -46757,13 +46757,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03125, + "grad_norm": 0.04150390625, "learning_rate": 0.0006005221163624209, - "loss": 0.0056, + "loss": 0.0057, "macro_f1": 0.3272727429866791, "num_tokens": 7934084.0, "repeat_count": 1.0, - "routers_loss": 0.0305723175406456, + "routers_loss": 0.03181030973792076, "skip_count": 0.0, "step": 4922, "text_loss": 0.4962928593158722 @@ -46776,13 +46776,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.046630859375, + "grad_norm": 0.054931640625, "learning_rate": 0.0006002189002614806, - "loss": 0.0091, + "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 7937021.0, "repeat_count": 0.0, - "routers_loss": 0.002330876188352704, + "routers_loss": 0.00227518193423748, "skip_count": 2.0, "step": 4924, "text_loss": 0.34440335631370544 @@ -46795,13 +46795,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0283203125, + "grad_norm": 0.0277099609375, "learning_rate": 0.0005999156457564685, - "loss": 0.0064, + "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 7940205.0, "repeat_count": 0.0, - "routers_loss": 0.0043139951303601265, + "routers_loss": 0.004331593867391348, "skip_count": 1.0, "step": 4926, "text_loss": 0.14114083349704742 @@ -46814,13 +46814,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0296630859375, + "grad_norm": 0.03369140625, "learning_rate": 0.0005996123529635925, - "loss": 0.0066, + "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 7945174.0, "repeat_count": 0.0, - "routers_loss": 0.0005922197597101331, + "routers_loss": 0.000612895586527884, "skip_count": 0.0, "step": 4928, "text_loss": 0.3895469009876251 @@ -46833,13 +46833,13 @@ "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03369140625, + "grad_norm": 0.036376953125, "learning_rate": 0.000599309021999075, - "loss": 0.0061, + "loss": 0.006, "macro_f1": 0.3272727429866791, "num_tokens": 7948716.0, "repeat_count": 0.0, - "routers_loss": 0.022591346874833107, + "routers_loss": 0.02319233864545822, "skip_count": 1.0, "step": 4930, "text_loss": 0.38103172183036804 @@ -46852,13 +46852,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0230712890625, + "grad_norm": 0.0247802734375, "learning_rate": 0.0005990056529791528, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 7952497.0, "repeat_count": 0.0, - "routers_loss": 0.003156521590426564, + "routers_loss": 0.003423231653869152, "skip_count": 0.0, "step": 4932, "text_loss": 0.30447322130203247 @@ -46871,13 +46871,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.017333984375, + "grad_norm": 0.017822265625, "learning_rate": 0.0005987022460200778, - "loss": 0.0061, + "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 7955578.0, "repeat_count": 0.0, - "routers_loss": 0.0006762169650755823, + "routers_loss": 0.0007005351362749934, "skip_count": 0.0, "step": 4934, "text_loss": 0.49621838331222534 @@ -46890,13 +46890,13 @@ "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.024169921875, + "grad_norm": 0.0234375, "learning_rate": 0.0005983988012381159, - "loss": 0.006, + "loss": 0.0061, "macro_f1": 0.8823530077934265, "num_tokens": 7958741.0, "repeat_count": 2.0, - "routers_loss": 0.03957916796207428, + "routers_loss": 0.03962617367506027, "skip_count": 1.0, "step": 4936, "text_loss": 0.1920493096113205 @@ -46909,13 +46909,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0211181640625, + "grad_norm": 0.022216796875, "learning_rate": 0.0005980953187495476, - "loss": 0.0074, + "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 7962236.0, "repeat_count": 0.0, - "routers_loss": 0.0026140862610191107, + "routers_loss": 0.0026006060652434826, "skip_count": 3.0, "step": 4938, "text_loss": 0.5286803841590881 @@ -46928,13 +46928,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0218505859375, + "grad_norm": 0.0224609375, "learning_rate": 0.0005977917986706681, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 7965631.0, "repeat_count": 0.0, - "routers_loss": 0.004825619049370289, + "routers_loss": 0.005010952707380056, "skip_count": 0.0, "step": 4940, "text_loss": 0.3507745563983917 @@ -46947,13 +46947,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0269775390625, + "grad_norm": 0.0291748046875, "learning_rate": 0.0005974882411177871, - "loss": 0.0054, + "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 7968516.0, "repeat_count": 0.0, - "routers_loss": 0.002404073951765895, + "routers_loss": 0.0023964287247508764, "skip_count": 0.0, "step": 4942, "text_loss": 0.9110504388809204 @@ -46966,13 +46966,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03173828125, + "grad_norm": 0.0322265625, "learning_rate": 0.000597184646207228, - "loss": 0.0065, + "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 7971310.0, "repeat_count": 0.0, - "routers_loss": 0.0035465885885059834, + "routers_loss": 0.0026230409275740385, "skip_count": 1.0, "step": 4944, "text_loss": 0.4131232798099518 @@ -46985,13 +46985,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.03857421875, + "grad_norm": 0.0390625, "learning_rate": 0.0005968810140553292, - "loss": 0.0105, + "loss": 0.0102, "macro_f1": 0.3333333432674408, "num_tokens": 7974809.0, "repeat_count": 0.0, - "routers_loss": 0.0006932367105036974, + "routers_loss": 0.0007397596491500735, "skip_count": 0.0, "step": 4946, "text_loss": 0.5130466222763062 @@ -47004,13 +47004,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0257568359375, + "grad_norm": 0.0267333984375, "learning_rate": 0.0005965773447784431, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 7977800.0, "repeat_count": 0.0, - "routers_loss": 0.0009562313207425177, + "routers_loss": 0.0009955473942682147, "skip_count": 0.0, "step": 4948, "text_loss": 0.5366153717041016 @@ -47023,13 +47023,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.01318359375, + "grad_norm": 0.01373291015625, "learning_rate": 0.0005962736384929362, "loss": 0.0026, "macro_f1": 0.3333333432674408, "num_tokens": 7981027.0, "repeat_count": 0.0, - "routers_loss": 0.004678000696003437, + "routers_loss": 0.0049227322451770306, "skip_count": 0.0, "step": 4950, "text_loss": 0.17266370356082916 @@ -47042,13 +47042,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0546875, + "grad_norm": 0.06201171875, "learning_rate": 0.0005959698953151895, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 7983580.0, "repeat_count": 0.0, - "routers_loss": 0.0010157873621210456, + "routers_loss": 0.0009975163266062737, "skip_count": 0.0, "step": 4952, "text_loss": 0.2474549114704132 @@ -47061,13 +47061,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0252685546875, + "grad_norm": 0.0255126953125, "learning_rate": 0.0005956661153615979, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 7986711.0, "repeat_count": 0.0, - "routers_loss": 0.0006747227744199336, + "routers_loss": 0.0006475782720372081, "skip_count": 0.0, "step": 4954, "text_loss": 0.5748327970504761 @@ -47080,13 +47080,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0240478515625, + "grad_norm": 0.02294921875, "learning_rate": 0.0005953622987485703, - "loss": 0.0067, + "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 7990194.0, "repeat_count": 0.0, - "routers_loss": 0.0014360204804688692, + "routers_loss": 0.001449751085601747, "skip_count": 0.0, "step": 4956, "text_loss": 0.5163559317588806 @@ -47099,13 +47099,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0322265625, + "grad_norm": 0.0400390625, "learning_rate": 0.0005950584455925301, - "loss": 0.0045, + "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 7993050.0, "repeat_count": 0.0, - "routers_loss": 0.001549707492813468, + "routers_loss": 0.0017087773885577917, "skip_count": 0.0, "step": 4958, "text_loss": 0.15892620384693146 @@ -47118,13 +47118,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0283203125, + "grad_norm": 0.0289306640625, "learning_rate": 0.0005947545560099142, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 7996383.0, "repeat_count": 0.0, - "routers_loss": 0.0047186254523694515, + "routers_loss": 0.0044417232275009155, "skip_count": 0.0, "step": 4960, "text_loss": 0.48022928833961487 @@ -47137,13 +47137,13 @@ "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, - "grad_norm": 0.033935546875, + "grad_norm": 0.031982421875, "learning_rate": 0.0005944506301171734, - "loss": 0.0068, + "loss": 0.0066, "macro_f1": 0.5492662787437439, "num_tokens": 7999843.0, "repeat_count": 0.0, - "routers_loss": 0.010887560434639454, + "routers_loss": 0.010093312710523605, "skip_count": 2.0, "step": 4962, "text_loss": 0.5050316452980042 @@ -47156,13 +47156,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.029541015625, + "grad_norm": 0.03369140625, "learning_rate": 0.0005941466680307732, - "loss": 0.0051, + "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8003504.0, "repeat_count": 0.0, - "routers_loss": 0.009678485803306103, + "routers_loss": 0.009699694812297821, "skip_count": 0.0, "step": 4964, "text_loss": 0.30474427342414856 @@ -47175,13 +47175,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, - "grad_norm": 0.038330078125, + "grad_norm": 0.040771484375, "learning_rate": 0.0005938426698671922, - "loss": 0.0099, + "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 8007427.0, "repeat_count": 1.0, - "routers_loss": 0.0018421853892505169, + "routers_loss": 0.0016759657301008701, "skip_count": 0.0, "step": 4966, "text_loss": 0.25060293078422546 @@ -47194,13 +47194,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.03759765625, + "grad_norm": 0.04443359375, "learning_rate": 0.0005935386357429232, - "loss": 0.0069, + "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 8010265.0, "repeat_count": 2.0, - "routers_loss": 0.006872798316180706, + "routers_loss": 0.006916914135217667, "skip_count": 3.0, "step": 4968, "text_loss": 0.49084481596946716 @@ -47213,13 +47213,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0245361328125, + "grad_norm": 0.024658203125, "learning_rate": 0.0005932345657744723, - "loss": 0.0051, + "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 8013733.0, "repeat_count": 1.0, - "routers_loss": 0.017219332978129387, + "routers_loss": 0.017182426527142525, "skip_count": 5.0, "step": 4970, "text_loss": 0.2705717980861664 @@ -47232,13 +47232,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.026611328125, + "grad_norm": 0.0272216796875, "learning_rate": 0.00059293046007836, - "loss": 0.0064, + "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 8017068.0, "repeat_count": 0.0, - "routers_loss": 0.008250568993389606, + "routers_loss": 0.008485594764351845, "skip_count": 2.0, "step": 4972, "text_loss": 0.18570218980312347 @@ -47251,13 +47251,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.03662109375, + "grad_norm": 0.03515625, "learning_rate": 0.0005926263187711201, - "loss": 0.008, + "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 8020185.0, "repeat_count": 0.0, - "routers_loss": 0.0022686906158924103, + "routers_loss": 0.0021750847809016705, "skip_count": 2.0, "step": 4974, "text_loss": 0.4457069933414459 @@ -47270,13 +47270,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0306396484375, + "grad_norm": 0.031005859375, "learning_rate": 0.0005923221419693001, - "loss": 0.0087, + "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 8023038.0, "repeat_count": 0.0, - "routers_loss": 0.00217001186683774, + "routers_loss": 0.0020193420350551605, "skip_count": 0.0, "step": 4976, "text_loss": 0.7394505143165588 @@ -47289,13 +47289,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.0537109375, + "grad_norm": 0.054931640625, "learning_rate": 0.0005920179297894613, - "loss": 0.0067, + "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 8026236.0, "repeat_count": 0.0, - "routers_loss": 0.0015752838226035237, + "routers_loss": 0.001450369250960648, "skip_count": 1.0, "step": 4978, "text_loss": 0.5914503335952759 @@ -47308,13 +47308,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0361328125, + "grad_norm": 0.0380859375, "learning_rate": 0.000591713682348178, - "loss": 0.0053, + "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 8028765.0, "repeat_count": 0.0, - "routers_loss": 0.0018897822592407465, + "routers_loss": 0.0017808573320508003, "skip_count": 0.0, "step": 4980, "text_loss": 0.19231407344341278 @@ -47327,13 +47327,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02587890625, + "grad_norm": 0.03173828125, "learning_rate": 0.0005914093997620388, - "loss": 0.0049, + "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 8032043.0, "repeat_count": 0.0, - "routers_loss": 0.0018230826826766133, + "routers_loss": 0.0018225493840873241, "skip_count": 0.0, "step": 4982, "text_loss": 0.3567875325679779 @@ -47346,13 +47346,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.020751953125, + "grad_norm": 0.02197265625, "learning_rate": 0.0005911050821476449, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8035086.0, "repeat_count": 0.0, - "routers_loss": 0.001746289781294763, + "routers_loss": 0.0016285666497424245, "skip_count": 0.0, "step": 4984, "text_loss": 0.34609633684158325 @@ -47365,13 +47365,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0213623046875, + "grad_norm": 0.0220947265625, "learning_rate": 0.0005908007296216119, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 8038193.0, "repeat_count": 0.0, - "routers_loss": 0.001723633729852736, + "routers_loss": 0.0014699801104143262, "skip_count": 0.0, "step": 4986, "text_loss": 0.4492359757423401 @@ -47384,13 +47384,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0250244140625, + "grad_norm": 0.0245361328125, "learning_rate": 0.000590496342300568, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 8041099.0, "repeat_count": 0.0, - "routers_loss": 0.002329434733837843, + "routers_loss": 0.002442725468426943, "skip_count": 0.0, "step": 4988, "text_loss": 0.5162975788116455 @@ -47403,13 +47403,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.031982421875, + "grad_norm": 0.03125, "learning_rate": 0.0005901919203011548, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 8044350.0, "repeat_count": 0.0, - "routers_loss": 0.00884273648262024, + "routers_loss": 0.008624207228422165, "skip_count": 2.0, "step": 4990, "text_loss": 0.2533033490180969 @@ -47422,13 +47422,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.02001953125, + "grad_norm": 0.021728515625, "learning_rate": 0.0005898874637400279, - "loss": 0.0041, + "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 8047467.0, "repeat_count": 0.0, - "routers_loss": 0.0015820686239749193, + "routers_loss": 0.0015421364223584533, "skip_count": 0.0, "step": 4992, "text_loss": 0.4890289306640625 @@ -47441,13 +47441,13 @@ "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, - "grad_norm": 0.0291748046875, + "grad_norm": 0.0279541015625, "learning_rate": 0.0005895829727338552, - "loss": 0.0066, + "loss": 0.0065, "macro_f1": 1.0, "num_tokens": 8050626.0, "repeat_count": 1.0, - "routers_loss": 0.0024422749411314726, + "routers_loss": 0.0024516626726835966, "skip_count": 2.0, "step": 4994, "text_loss": 0.50797039270401 @@ -47460,13 +47460,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, - "grad_norm": 0.029541015625, + "grad_norm": 0.0262451171875, "learning_rate": 0.0005892784473993184, - "loss": 0.0073, + "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 8053386.0, "repeat_count": 0.0, - "routers_loss": 0.0019333140226081014, + "routers_loss": 0.0018553845584392548, "skip_count": 2.0, "step": 4996, "text_loss": 0.628828763961792 @@ -47479,13 +47479,13 @@ "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, - "grad_norm": 0.0244140625, + "grad_norm": 0.0286865234375, "learning_rate": 0.000588973887853112, - "loss": 0.0052, + "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 8055941.0, "repeat_count": 0.0, - "routers_loss": 0.004120452329516411, + "routers_loss": 0.004258487373590469, "skip_count": 0.0, "step": 4998, "text_loss": 0.2643229067325592 @@ -47498,13 +47498,13 @@ "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, - "grad_norm": 0.028564453125, + "grad_norm": 0.02783203125, "learning_rate": 0.0005886692942119441, - "loss": 0.0064, + "loss": 0.0062, "macro_f1": 0.8820862174034119, "num_tokens": 8058638.0, "repeat_count": 2.0, - "routers_loss": 0.018097922205924988, + "routers_loss": 0.019064312800765038, "skip_count": 2.0, "step": 5000, "text_loss": 0.4925006031990051 diff --git a/checkpoint-5000/training_args.bin b/checkpoint-5000/training_args.bin index deeea733277b4031781a5b299881dd8e675e7606..a3d3ae372faf14539639f54454aa52b6ee730c4a 100644 --- a/checkpoint-5000/training_args.bin +++ b/checkpoint-5000/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b3f5975f57762b552c7ee29776bf32a4dbb125781a0658488d3884fb25c5296 +oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8 size 5880 diff --git a/checkpoint-6000/chat_template.jinja b/checkpoint-6000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/checkpoint-6000/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-6000/model-00001-of-00002.safetensors b/checkpoint-6000/model-00001-of-00002.safetensors index 1d33a92b59b4c6310c176588b68e5c1a23416ebc..08a01e1ba553cdcb2222f034a209861d7b54e284 100644 --- a/checkpoint-6000/model-00001-of-00002.safetensors +++ b/checkpoint-6000/model-00001-of-00002.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ddf8aeede6bdf2e78f716cd051b152f13d75d6629866637f946e8c7b6fd5dfe -size 2101248 +oid sha256:13cbd6d16e927a0c5bad54102514e6e18b4a47b3a6eb911e39d678d328d19f55 +size 4965799096 diff --git a/checkpoint-6000/model-00002-of-00002.safetensors b/checkpoint-6000/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7d4b08d62e5a1c45cf9fc4ce734bc52d2f508a3f --- /dev/null +++ b/checkpoint-6000/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89f427de602cf2d49899f4d59a40675907bebc713187d70c900bc708c907b434 +size 1481790520 diff --git a/checkpoint-6000/model.safetensors.index.json b/checkpoint-6000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..21bb567761d75ade0c0eef6495c450697dd3ff18 --- /dev/null +++ b/checkpoint-6000/model.safetensors.index.json @@ -0,0 +1,374 @@ +{ + "metadata": { + "total_parameters": 3223774292, + "total_size": 6447548584 + }, + "weight_map": { + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.routers.0.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.0.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.0.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.0.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.1.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.1.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.1.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.1.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.10.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.10.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.10.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.10.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.11.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.11.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.11.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.11.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.12.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.12.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.12.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.12.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.13.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.13.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.13.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.13.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.14.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.14.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.14.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.14.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.15.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.15.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.15.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.15.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.16.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.16.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.16.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.16.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.17.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.17.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.17.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.17.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.18.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.18.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.18.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.18.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.19.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.19.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.19.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.19.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.2.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.2.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.2.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.2.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.20.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.20.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.20.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.20.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.21.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.21.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.21.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.21.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.22.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.22.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.22.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.22.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.23.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.23.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.23.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.23.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.24.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.24.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.24.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.24.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.25.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.25.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.25.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.25.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.26.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.26.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.26.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.26.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.27.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.27.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.27.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.27.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.3.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.3.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.3.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.3.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.4.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.4.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.4.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.4.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.5.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.5.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.5.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.5.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.6.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.6.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.6.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.6.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.7.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.7.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.7.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.7.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.8.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.8.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.8.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.8.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.9.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.9.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.9.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.9.linear2.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-6000/optimizer.pt b/checkpoint-6000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e4ab382c8497388b7c1be77ced6bbc77cdecba0 --- /dev/null +++ b/checkpoint-6000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14927b2932f622295e2116b67605be95e37a7bb5d8c7ba9c3cc4cfe8a8904d9a +size 44191162 diff --git a/checkpoint-6000/rng_state.pth b/checkpoint-6000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..39733b89e5c76956e6e2c15090922858a3da8be6 --- /dev/null +++ b/checkpoint-6000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52d4940820055e4d79b82c43038fd1599197353607649f88dcebf26376772128 +size 14244 diff --git a/checkpoint-6000/scheduler.pt b/checkpoint-6000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb293ba2b6427f12e737d50fc0ce36432853fafc --- /dev/null +++ b/checkpoint-6000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e30f9ec924312828673433a0b5a4fb7c915f61d146b7694043a9f729a7b67b18 +size 1064 diff --git a/checkpoint-6000/special_tokens_map.json b/checkpoint-6000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..165b36bc2293dda9a2fb3c0daf6577d9eba9df7a --- /dev/null +++ b/checkpoint-6000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|finetune_right_pad_id|>" +} diff --git a/checkpoint-6000/tokenizer.json b/checkpoint-6000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-6000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-6000/tokenizer_config.json b/checkpoint-6000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c68051fe3c4d23234a59316bc52d21f6e3a4182c --- /dev/null +++ b/checkpoint-6000/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-6000/trainer_state.json b/checkpoint-6000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ec63e3cc09d2d31323caf678b9719d98fc524070 --- /dev/null +++ b/checkpoint-6000/trainer_state.json @@ -0,0 +1,57034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 28.169063692398005, + "eval_steps": 500, + "global_step": 6000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.009392427355444672, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.25, + "learning_rate": 2e-06, + "loss": 0.4974, + "macro_f1": 0.23255813121795654, + "num_tokens": 3175.0, + "repeat_count": 0.0, + "routers_loss": 0.4339469373226166, + "skip_count": 0.0, + "step": 2, + "text_loss": 0.3330848515033722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 23.0, + "epoch": 0.018784854710889344, + "f1_execute": 0.7272726893424988, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.8359375, + "learning_rate": 6e-06, + "loss": 0.4988, + "macro_f1": 0.24242423474788666, + "num_tokens": 5816.0, + "repeat_count": 0.0, + "routers_loss": 0.4511934816837311, + "skip_count": 1.0, + "step": 4, + "text_loss": 0.4571273922920227 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.02817728206633402, + "f1_execute": 0.6666666865348816, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.234375, + "learning_rate": 1e-05, + "loss": 0.5113, + "macro_f1": 0.222222238779068, + "num_tokens": 9739.0, + "repeat_count": 0.0, + "routers_loss": 0.49306994676589966, + "skip_count": 0.0, + "step": 6, + "text_loss": 0.41060560941696167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.03756970942177869, + "f1_execute": 0.5641025900840759, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7265625, + "learning_rate": 1.4e-05, + "loss": 0.4766, + "macro_f1": 0.18803420662879944, + "num_tokens": 12869.0, + "repeat_count": 1.0, + "routers_loss": 0.48872503638267517, + "skip_count": 2.0, + "step": 8, + "text_loss": 0.36678561568260193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.046962136777223364, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.78125, + "learning_rate": 1.8e-05, + "loss": 0.4806, + "macro_f1": 0.23255813121795654, + "num_tokens": 15845.0, + "repeat_count": 0.0, + "routers_loss": 0.45077216625213623, + "skip_count": 0.0, + "step": 10, + "text_loss": 0.5597779154777527 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 0.05635456413266804, + "f1_execute": 0.7179487347602844, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.20000000298023224, + "grad_norm": 1.5390625, + "learning_rate": 2.2e-05, + "loss": 0.4557, + "macro_f1": 0.40122103691101074, + "num_tokens": 19353.0, + "repeat_count": 2.0, + "routers_loss": 0.4130440056324005, + "skip_count": 3.0, + "step": 12, + "text_loss": 0.2056603729724884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.06574699148811271, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.4375, + "learning_rate": 2.6e-05, + "loss": 0.5129, + "macro_f1": 0.23255813121795654, + "num_tokens": 22675.0, + "repeat_count": 0.0, + "routers_loss": 0.4582902193069458, + "skip_count": 0.0, + "step": 14, + "text_loss": 0.32989829778671265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 0.07513941884355738, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.0, + "f1_skip": 0.2222222238779068, + "grad_norm": 1.7421875, + "learning_rate": 3e-05, + "loss": 0.4729, + "macro_f1": 0.3017163574695587, + "num_tokens": 26022.0, + "repeat_count": 0.0, + "routers_loss": 0.42910993099212646, + "skip_count": 1.0, + "step": 16, + "text_loss": 0.1353905349969864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.08453184619900206, + "f1_execute": 0.7555555105209351, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.4765625, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.4274, + "macro_f1": 0.2518518567085266, + "num_tokens": 29251.0, + "repeat_count": 0.0, + "routers_loss": 0.3990713059902191, + "skip_count": 0.0, + "step": 18, + "text_loss": 0.3806765377521515 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.09392427355444673, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.0, + "grad_norm": 1.3125, + "learning_rate": 3.8e-05, + "loss": 0.4261, + "macro_f1": 0.3228803873062134, + "num_tokens": 32545.0, + "repeat_count": 1.0, + "routers_loss": 0.40146592259407043, + "skip_count": 0.0, + "step": 20, + "text_loss": 0.25648367404937744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.1033167009098914, + "f1_execute": 0.7272727489471436, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.625, + "learning_rate": 4.2000000000000004e-05, + "loss": 0.404, + "macro_f1": 0.24242424964904785, + "num_tokens": 36560.0, + "repeat_count": 0.0, + "routers_loss": 0.372715026140213, + "skip_count": 0.0, + "step": 22, + "text_loss": 0.2799522578716278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.11270912826533608, + "f1_execute": 0.7555555105209351, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.6328125, + "learning_rate": 4.6e-05, + "loss": 0.4218, + "macro_f1": 0.2518518567085266, + "num_tokens": 39597.0, + "repeat_count": 0.0, + "routers_loss": 0.4504941403865814, + "skip_count": 0.0, + "step": 24, + "text_loss": 0.6635695695877075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.12210155562078075, + "f1_execute": 0.8085106015205383, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7109375, + "learning_rate": 5e-05, + "loss": 0.3886, + "macro_f1": 0.26950353384017944, + "num_tokens": 43080.0, + "repeat_count": 0.0, + "routers_loss": 0.3498791456222534, + "skip_count": 0.0, + "step": 26, + "text_loss": 0.7035041451454163 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.13149398297622542, + "f1_execute": 0.8085106015205383, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.34375, + "learning_rate": 5.4e-05, + "loss": 0.3724, + "macro_f1": 0.26950353384017944, + "num_tokens": 46406.0, + "repeat_count": 0.0, + "routers_loss": 0.31265875697135925, + "skip_count": 0.0, + "step": 28, + "text_loss": 0.6388277411460876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.1408864103316701, + "f1_execute": 0.8571428060531616, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.2578125, + "learning_rate": 5.800000000000001e-05, + "loss": 0.341, + "macro_f1": 0.2857142686843872, + "num_tokens": 49966.0, + "repeat_count": 0.0, + "routers_loss": 0.3200918138027191, + "skip_count": 2.0, + "step": 30, + "text_loss": 0.17372547090053558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.15027883768711475, + "f1_execute": 0.8571428060531616, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.4140625, + "learning_rate": 6.2e-05, + "loss": 0.3207, + "macro_f1": 0.2857142686843872, + "num_tokens": 53378.0, + "repeat_count": 1.0, + "routers_loss": 0.32304447889328003, + "skip_count": 1.0, + "step": 32, + "text_loss": 0.18196581304073334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.15967126504255943, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.46875, + "learning_rate": 6.6e-05, + "loss": 0.3304, + "macro_f1": 0.3006536364555359, + "num_tokens": 56933.0, + "repeat_count": 0.0, + "routers_loss": 0.24814388155937195, + "skip_count": 0.0, + "step": 34, + "text_loss": 0.28823015093803406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.16906369239800412, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.1171875, + "learning_rate": 7.000000000000001e-05, + "loss": 0.2778, + "macro_f1": 0.3006536066532135, + "num_tokens": 60744.0, + "repeat_count": 1.0, + "routers_loss": 0.22411039471626282, + "skip_count": 0.0, + "step": 36, + "text_loss": 0.5260357856750488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.17845611975344877, + "f1_execute": 0.8571428656578064, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.484375, + "learning_rate": 7.4e-05, + "loss": 0.2738, + "macro_f1": 0.2857142984867096, + "num_tokens": 64900.0, + "repeat_count": 0.0, + "routers_loss": 0.44355395436286926, + "skip_count": 0.0, + "step": 38, + "text_loss": 0.5382097363471985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.18784854710889345, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.3828125, + "learning_rate": 7.8e-05, + "loss": 0.2137, + "macro_f1": 0.3076923191547394, + "num_tokens": 68000.0, + "repeat_count": 0.0, + "routers_loss": 0.202330082654953, + "skip_count": 0.0, + "step": 40, + "text_loss": 0.5946118831634521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.19724097446433814, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.78125, + "learning_rate": 8.2e-05, + "loss": 0.21, + "macro_f1": 0.3144654333591461, + "num_tokens": 70529.0, + "repeat_count": 0.0, + "routers_loss": 0.18023855984210968, + "skip_count": 0.0, + "step": 42, + "text_loss": 0.5550904273986816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2066334018197828, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.609375, + "learning_rate": 8.599999999999999e-05, + "loss": 0.1918, + "macro_f1": 0.32098764181137085, + "num_tokens": 73427.0, + "repeat_count": 2.0, + "routers_loss": 0.2101590931415558, + "skip_count": 0.0, + "step": 44, + "text_loss": 0.4636923372745514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.21602582917522747, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.53125, + "learning_rate": 8.999999999999999e-05, + "loss": 0.1881, + "macro_f1": 0.3333333432674408, + "num_tokens": 76472.0, + "repeat_count": 0.0, + "routers_loss": 0.11800424009561539, + "skip_count": 0.0, + "step": 46, + "text_loss": 0.4187001883983612 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.22541825653067216, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.953125, + "learning_rate": 9.400000000000001e-05, + "loss": 0.1446, + "macro_f1": 0.3272727429866791, + "num_tokens": 79124.0, + "repeat_count": 1.0, + "routers_loss": 0.11632519960403442, + "skip_count": 0.0, + "step": 48, + "text_loss": 0.2253919243812561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.2348106838861168, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.58984375, + "learning_rate": 9.800000000000001e-05, + "loss": 0.1543, + "macro_f1": 0.32098767161369324, + "num_tokens": 81980.0, + "repeat_count": 1.0, + "routers_loss": 0.09669367223978043, + "skip_count": 0.0, + "step": 50, + "text_loss": 0.6053179502487183 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.2442031112415615, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.8515625, + "learning_rate": 0.000102, + "loss": 0.1393, + "macro_f1": 0.32098764181137085, + "num_tokens": 85236.0, + "repeat_count": 0.0, + "routers_loss": 0.12471720576286316, + "skip_count": 0.0, + "step": 52, + "text_loss": 0.6027331948280334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2535955385970062, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.421875, + "learning_rate": 0.000106, + "loss": 0.1473, + "macro_f1": 0.32098764181137085, + "num_tokens": 88238.0, + "repeat_count": 0.0, + "routers_loss": 0.1376056969165802, + "skip_count": 2.0, + "step": 54, + "text_loss": 0.2861751616001129 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.26298796595245083, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.35546875, + "learning_rate": 0.00011, + "loss": 0.1082, + "macro_f1": 0.3333333432674408, + "num_tokens": 91056.0, + "repeat_count": 0.0, + "routers_loss": 0.07449393719434738, + "skip_count": 0.0, + "step": 56, + "text_loss": 0.48106974363327026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.2723803933078955, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000114, + "loss": 0.1123, + "macro_f1": 0.32098764181137085, + "num_tokens": 94987.0, + "repeat_count": 0.0, + "routers_loss": 0.07064720243215561, + "skip_count": 0.0, + "step": 58, + "text_loss": 0.3554874658584595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2817728206633402, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.5390625, + "learning_rate": 0.000118, + "loss": 0.1234, + "macro_f1": 0.32098764181137085, + "num_tokens": 97909.0, + "repeat_count": 0.0, + "routers_loss": 0.16835889220237732, + "skip_count": 2.0, + "step": 60, + "text_loss": 0.5475804805755615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.29116524801878485, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000122, + "loss": 0.1224, + "macro_f1": 0.3333333432674408, + "num_tokens": 101043.0, + "repeat_count": 0.0, + "routers_loss": 0.06127442046999931, + "skip_count": 0.0, + "step": 62, + "text_loss": 0.5966938734054565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3005576753742295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000126, + "loss": 0.0931, + "macro_f1": 0.3333333432674408, + "num_tokens": 104103.0, + "repeat_count": 0.0, + "routers_loss": 0.047825805842876434, + "skip_count": 0.0, + "step": 64, + "text_loss": 0.5480486750602722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3099501027296742, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.00013000000000000002, + "loss": 0.1088, + "macro_f1": 0.3006536364555359, + "num_tokens": 107009.0, + "repeat_count": 1.0, + "routers_loss": 0.275174081325531, + "skip_count": 4.0, + "step": 66, + "text_loss": 0.41714492440223694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.31934253008511887, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.000134, + "loss": 0.1123, + "macro_f1": 0.3333333432674408, + "num_tokens": 110486.0, + "repeat_count": 0.0, + "routers_loss": 0.029025178402662277, + "skip_count": 0.0, + "step": 68, + "text_loss": 0.6775627732276917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3287349574405635, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.314453125, + "learning_rate": 0.00013800000000000002, + "loss": 0.1049, + "macro_f1": 0.3272727429866791, + "num_tokens": 113878.0, + "repeat_count": 0.0, + "routers_loss": 0.10141710191965103, + "skip_count": 1.0, + "step": 70, + "text_loss": 0.6678873896598816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.33812738479600823, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.248046875, + "learning_rate": 0.00014199999999999998, + "loss": 0.1119, + "macro_f1": 0.3272727429866791, + "num_tokens": 116989.0, + "repeat_count": 0.0, + "routers_loss": 0.08002066612243652, + "skip_count": 1.0, + "step": 72, + "text_loss": 0.405692994594574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3475198121514529, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1787109375, + "learning_rate": 0.000146, + "loss": 0.0944, + "macro_f1": 0.3144654333591461, + "num_tokens": 119883.0, + "repeat_count": 0.0, + "routers_loss": 0.1867009848356247, + "skip_count": 3.0, + "step": 74, + "text_loss": 0.44616150856018066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.35691223950689754, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.333984375, + "learning_rate": 0.00015, + "loss": 0.1003, + "macro_f1": 0.32098764181137085, + "num_tokens": 123325.0, + "repeat_count": 0.0, + "routers_loss": 0.07042168825864792, + "skip_count": 2.0, + "step": 76, + "text_loss": 0.11340200901031494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.36630466686234225, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26171875, + "learning_rate": 0.000154, + "loss": 0.1066, + "macro_f1": 0.32098764181137085, + "num_tokens": 126131.0, + "repeat_count": 0.0, + "routers_loss": 0.11535373330116272, + "skip_count": 2.0, + "step": 78, + "text_loss": 0.3269135355949402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3756970942177869, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.000158, + "loss": 0.0891, + "macro_f1": 0.3272727429866791, + "num_tokens": 130349.0, + "repeat_count": 0.0, + "routers_loss": 0.09497501701116562, + "skip_count": 1.0, + "step": 80, + "text_loss": 0.15273472666740417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.38508952157323156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000162, + "loss": 0.0929, + "macro_f1": 0.3333333432674408, + "num_tokens": 133607.0, + "repeat_count": 0.0, + "routers_loss": 0.030639523640275, + "skip_count": 0.0, + "step": 82, + "text_loss": 0.282884806394577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3944819489286763, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.00016600000000000002, + "loss": 0.1254, + "macro_f1": 0.3272727429866791, + "num_tokens": 136694.0, + "repeat_count": 0.0, + "routers_loss": 0.07906441390514374, + "skip_count": 1.0, + "step": 84, + "text_loss": 0.459094375371933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.40387437628412093, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.00017, + "loss": 0.1071, + "macro_f1": 0.3144654333591461, + "num_tokens": 139966.0, + "repeat_count": 1.0, + "routers_loss": 0.1124570444226265, + "skip_count": 2.0, + "step": 86, + "text_loss": 0.29985448718070984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4132668036395656, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.000174, + "loss": 0.1031, + "macro_f1": 0.32098764181137085, + "num_tokens": 142788.0, + "repeat_count": 2.0, + "routers_loss": 0.1966402679681778, + "skip_count": 0.0, + "step": 88, + "text_loss": 0.6435291767120361 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4226592309950103, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.349609375, + "learning_rate": 0.000178, + "loss": 0.0963, + "macro_f1": 0.3333333432674408, + "num_tokens": 146192.0, + "repeat_count": 0.0, + "routers_loss": 0.0325632207095623, + "skip_count": 0.0, + "step": 90, + "text_loss": 0.35170626640319824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.43205165835045495, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2265625, + "learning_rate": 0.000182, + "loss": 0.1073, + "macro_f1": 0.32098764181137085, + "num_tokens": 149792.0, + "repeat_count": 1.0, + "routers_loss": 0.15115146338939667, + "skip_count": 1.0, + "step": 92, + "text_loss": 0.83159339427948 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4414440857058996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.000186, + "loss": 0.1073, + "macro_f1": 0.3333333432674408, + "num_tokens": 152766.0, + "repeat_count": 0.0, + "routers_loss": 0.043313540518283844, + "skip_count": 0.0, + "step": 94, + "text_loss": 0.49707934260368347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4508365130613443, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019, + "loss": 0.0947, + "macro_f1": 0.3333333432674408, + "num_tokens": 156112.0, + "repeat_count": 0.0, + "routers_loss": 0.032021280378103256, + "skip_count": 0.0, + "step": 96, + "text_loss": 0.27608928084373474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.46022894041678897, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2099609375, + "learning_rate": 0.000194, + "loss": 0.0846, + "macro_f1": 0.3076923191547394, + "num_tokens": 159454.0, + "repeat_count": 2.0, + "routers_loss": 0.24473154544830322, + "skip_count": 2.0, + "step": 98, + "text_loss": 0.6026689410209656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4696213677722336, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.00019800000000000002, + "loss": 0.1028, + "macro_f1": 0.32098764181137085, + "num_tokens": 163661.0, + "repeat_count": 0.0, + "routers_loss": 0.11468276381492615, + "skip_count": 2.0, + "step": 100, + "text_loss": 0.46733155846595764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.47901379512767833, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000202, + "loss": 0.1089, + "macro_f1": 0.3333333432674408, + "num_tokens": 167134.0, + "repeat_count": 0.0, + "routers_loss": 0.021144939586520195, + "skip_count": 0.0, + "step": 102, + "text_loss": 0.6362994909286499 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.488406222483123, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000206, + "loss": 0.0621, + "macro_f1": 0.3272727429866791, + "num_tokens": 170433.0, + "repeat_count": 0.0, + "routers_loss": 0.06594710797071457, + "skip_count": 1.0, + "step": 104, + "text_loss": 0.4515477120876312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.49779864983856764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.00021, + "loss": 0.0929, + "macro_f1": 0.3333333432674408, + "num_tokens": 173387.0, + "repeat_count": 0.0, + "routers_loss": 0.032923027873039246, + "skip_count": 0.0, + "step": 106, + "text_loss": 0.6638453006744385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5071910771940124, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.240234375, + "learning_rate": 0.000214, + "loss": 0.0883, + "macro_f1": 0.3272727429866791, + "num_tokens": 176170.0, + "repeat_count": 1.0, + "routers_loss": 0.08034781366586685, + "skip_count": 0.0, + "step": 108, + "text_loss": 1.186936855316162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.516583504549457, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000218, + "loss": 0.0794, + "macro_f1": 0.3272727429866791, + "num_tokens": 179877.0, + "repeat_count": 0.0, + "routers_loss": 0.07814185321331024, + "skip_count": 1.0, + "step": 110, + "text_loss": 0.5488709211349487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5259759319049017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000222, + "loss": 0.0946, + "macro_f1": 0.3333333432674408, + "num_tokens": 182726.0, + "repeat_count": 0.0, + "routers_loss": 0.01884695515036583, + "skip_count": 0.0, + "step": 112, + "text_loss": 0.5195863842964172 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5353683592603463, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19921875, + "learning_rate": 0.00022600000000000002, + "loss": 0.0974, + "macro_f1": 0.32098764181137085, + "num_tokens": 185624.0, + "repeat_count": 0.0, + "routers_loss": 0.09657823294401169, + "skip_count": 2.0, + "step": 114, + "text_loss": 0.43858134746551514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3046875, + "learning_rate": 0.00023, + "loss": 0.0753, + "macro_f1": 0.3333333432674408, + "num_tokens": 188155.0, + "repeat_count": 0.0, + "routers_loss": 0.01463601179420948, + "skip_count": 0.0, + "step": 116, + "text_loss": 0.392981618642807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5541532139712357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.439453125, + "learning_rate": 0.00023400000000000002, + "loss": 0.0843, + "macro_f1": 0.3333333432674408, + "num_tokens": 190970.0, + "repeat_count": 0.0, + "routers_loss": 0.03859659656882286, + "skip_count": 0.0, + "step": 118, + "text_loss": 0.309179425239563 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5635456413266804, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2255859375, + "learning_rate": 0.00023799999999999998, + "loss": 0.053, + "macro_f1": 0.3333333432674408, + "num_tokens": 193988.0, + "repeat_count": 0.0, + "routers_loss": 0.019092386588454247, + "skip_count": 0.0, + "step": 120, + "text_loss": 0.48543134331703186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.572938068682125, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.35546875, + "learning_rate": 0.000242, + "loss": 0.1203, + "macro_f1": 0.3272727429866791, + "num_tokens": 196475.0, + "repeat_count": 0.0, + "routers_loss": 0.0619138665497303, + "skip_count": 1.0, + "step": 122, + "text_loss": 0.4615364074707031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5823304960375697, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1875, + "learning_rate": 0.000246, + "loss": 0.1002, + "macro_f1": 0.3272727429866791, + "num_tokens": 200045.0, + "repeat_count": 1.0, + "routers_loss": 0.09752107411623001, + "skip_count": 0.0, + "step": 124, + "text_loss": 0.15802054107189178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5917229233930144, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.00025, + "loss": 0.0773, + "macro_f1": 0.3333333432674408, + "num_tokens": 203214.0, + "repeat_count": 0.0, + "routers_loss": 0.02896115928888321, + "skip_count": 0.0, + "step": 126, + "text_loss": 0.4543360471725464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.601115350748459, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.4296875, + "learning_rate": 0.000254, + "loss": 0.0973, + "macro_f1": 0.3333333432674408, + "num_tokens": 206168.0, + "repeat_count": 0.0, + "routers_loss": 0.011423567309975624, + "skip_count": 0.0, + "step": 128, + "text_loss": 0.4730179011821747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6105077781039038, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.365234375, + "learning_rate": 0.00025800000000000004, + "loss": 0.099, + "macro_f1": 0.3333333432674408, + "num_tokens": 209907.0, + "repeat_count": 0.0, + "routers_loss": 0.01957600563764572, + "skip_count": 0.0, + "step": 130, + "text_loss": 0.45122358202934265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6199002054593484, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.000262, + "loss": 0.0868, + "macro_f1": 0.3272727429866791, + "num_tokens": 213521.0, + "repeat_count": 0.0, + "routers_loss": 0.04882373288273811, + "skip_count": 1.0, + "step": 132, + "text_loss": 0.4341491758823395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6292926328147931, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.000266, + "loss": 0.0834, + "macro_f1": 0.3333333432674408, + "num_tokens": 216484.0, + "repeat_count": 0.0, + "routers_loss": 0.016083380207419395, + "skip_count": 0.0, + "step": 134, + "text_loss": 0.46990111470222473 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6386850601702377, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.00027, + "loss": 0.0863, + "macro_f1": 0.3333333432674408, + "num_tokens": 219398.0, + "repeat_count": 0.0, + "routers_loss": 0.01733536459505558, + "skip_count": 0.0, + "step": 136, + "text_loss": 0.4455361068248749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6480774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.00027400000000000005, + "loss": 0.0997, + "macro_f1": 0.3333333432674408, + "num_tokens": 222430.0, + "repeat_count": 0.0, + "routers_loss": 0.01332803163677454, + "skip_count": 0.0, + "step": 138, + "text_loss": 0.47699397802352905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.657469914881127, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.333984375, + "learning_rate": 0.00027800000000000004, + "loss": 0.0922, + "macro_f1": 0.3144654333591461, + "num_tokens": 225458.0, + "repeat_count": 1.0, + "routers_loss": 0.14924728870391846, + "skip_count": 2.0, + "step": 140, + "text_loss": 0.5858222842216492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6668623422365718, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.00028199999999999997, + "loss": 0.0798, + "macro_f1": 0.3144654333591461, + "num_tokens": 229365.0, + "repeat_count": 1.0, + "routers_loss": 0.1860177218914032, + "skip_count": 2.0, + "step": 142, + "text_loss": 0.5003137588500977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6762547695920165, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.00028599999999999996, + "loss": 0.054, + "macro_f1": 0.32098764181137085, + "num_tokens": 231787.0, + "repeat_count": 1.0, + "routers_loss": 0.16498211026191711, + "skip_count": 1.0, + "step": 144, + "text_loss": 0.5026470422744751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6856471969474611, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.00029, + "loss": 0.0936, + "macro_f1": 0.32098764181137085, + "num_tokens": 235014.0, + "repeat_count": 1.0, + "routers_loss": 0.11801310628652573, + "skip_count": 1.0, + "step": 146, + "text_loss": 0.611888587474823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6950396243029058, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000294, + "loss": 0.0878, + "macro_f1": 0.3333333432674408, + "num_tokens": 238210.0, + "repeat_count": 0.0, + "routers_loss": 0.02422776259481907, + "skip_count": 0.0, + "step": 148, + "text_loss": 0.2876914143562317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7044320516583504, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.000298, + "loss": 0.0858, + "macro_f1": 0.32098764181137085, + "num_tokens": 241582.0, + "repeat_count": 0.0, + "routers_loss": 0.07282499223947525, + "skip_count": 2.0, + "step": 150, + "text_loss": 0.3919292390346527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7138244790137951, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.37890625, + "learning_rate": 0.000302, + "loss": 0.0797, + "macro_f1": 0.32098764181137085, + "num_tokens": 244621.0, + "repeat_count": 1.0, + "routers_loss": 0.20659038424491882, + "skip_count": 1.0, + "step": 152, + "text_loss": 0.4294498860836029 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7232169063692399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1787109375, + "learning_rate": 0.000306, + "loss": 0.072, + "macro_f1": 0.3333333432674408, + "num_tokens": 247833.0, + "repeat_count": 0.0, + "routers_loss": 0.02428400330245495, + "skip_count": 0.0, + "step": 154, + "text_loss": 0.5930765867233276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7326093337246845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.00031, + "loss": 0.0772, + "macro_f1": 0.3333333432674408, + "num_tokens": 251349.0, + "repeat_count": 0.0, + "routers_loss": 0.0167869683355093, + "skip_count": 0.0, + "step": 156, + "text_loss": 0.41063904762268066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7420017610801292, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.000314, + "loss": 0.0821, + "macro_f1": 0.3333333432674408, + "num_tokens": 254886.0, + "repeat_count": 0.0, + "routers_loss": 0.02531604655086994, + "skip_count": 0.0, + "step": 158, + "text_loss": 0.6739020347595215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7513941884355738, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.201171875, + "learning_rate": 0.00031800000000000003, + "loss": 0.09, + "macro_f1": 0.3333333432674408, + "num_tokens": 258260.0, + "repeat_count": 0.0, + "routers_loss": 0.017772775143384933, + "skip_count": 0.0, + "step": 160, + "text_loss": 0.46873849630355835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7607866157910185, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.224609375, + "learning_rate": 0.000322, + "loss": 0.0893, + "macro_f1": 0.3272727429866791, + "num_tokens": 261846.0, + "repeat_count": 0.0, + "routers_loss": 0.034902360290288925, + "skip_count": 1.0, + "step": 162, + "text_loss": 0.3727971017360687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7701790431464631, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000326, + "loss": 0.076, + "macro_f1": 0.3333333432674408, + "num_tokens": 264348.0, + "repeat_count": 0.0, + "routers_loss": 0.013553355820477009, + "skip_count": 0.0, + "step": 164, + "text_loss": 0.5798237323760986 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7795714705019078, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.408203125, + "learning_rate": 0.00033, + "loss": 0.0926, + "macro_f1": 0.32098764181137085, + "num_tokens": 267479.0, + "repeat_count": 1.0, + "routers_loss": 0.13571743667125702, + "skip_count": 1.0, + "step": 166, + "text_loss": 0.8084776997566223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7889638978573525, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2431640625, + "learning_rate": 0.00033400000000000004, + "loss": 0.0817, + "macro_f1": 0.32098764181137085, + "num_tokens": 270268.0, + "repeat_count": 2.0, + "routers_loss": 0.19884146749973297, + "skip_count": 0.0, + "step": 168, + "text_loss": 0.7366134524345398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7983563252127972, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.00033800000000000003, + "loss": 0.1022, + "macro_f1": 0.32098764181137085, + "num_tokens": 273518.0, + "repeat_count": 1.0, + "routers_loss": 0.15469175577163696, + "skip_count": 1.0, + "step": 170, + "text_loss": 0.27204006910324097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8077487525682419, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000342, + "loss": 0.0865, + "macro_f1": 0.32098764181137085, + "num_tokens": 277210.0, + "repeat_count": 0.0, + "routers_loss": 0.08603330701589584, + "skip_count": 2.0, + "step": 172, + "text_loss": 0.7137667536735535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8171411799236865, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000346, + "loss": 0.0902, + "macro_f1": 0.3076923191547394, + "num_tokens": 280389.0, + "repeat_count": 0.0, + "routers_loss": 0.17851492762565613, + "skip_count": 4.0, + "step": 174, + "text_loss": 0.5148105621337891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8265336072791312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.00035, + "loss": 0.0853, + "macro_f1": 0.3333333432674408, + "num_tokens": 283501.0, + "repeat_count": 0.0, + "routers_loss": 0.021331604570150375, + "skip_count": 0.0, + "step": 176, + "text_loss": 0.301013320684433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8359260346345758, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000354, + "loss": 0.0911, + "macro_f1": 0.32098764181137085, + "num_tokens": 287154.0, + "repeat_count": 0.0, + "routers_loss": 0.057273946702480316, + "skip_count": 2.0, + "step": 178, + "text_loss": 0.4740981459617615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8453184619900206, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.240234375, + "learning_rate": 0.000358, + "loss": 0.0904, + "macro_f1": 0.3272727429866791, + "num_tokens": 289929.0, + "repeat_count": 0.0, + "routers_loss": 0.04116598889231682, + "skip_count": 1.0, + "step": 180, + "text_loss": 0.4838573932647705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8547108893454652, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.14453125, + "learning_rate": 0.000362, + "loss": 0.0991, + "macro_f1": 0.3333333432674408, + "num_tokens": 294293.0, + "repeat_count": 0.0, + "routers_loss": 0.027111956849694252, + "skip_count": 0.0, + "step": 182, + "text_loss": 0.7495553493499756 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8641033167009099, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.158203125, + "learning_rate": 0.000366, + "loss": 0.1038, + "macro_f1": 0.3333333432674408, + "num_tokens": 297730.0, + "repeat_count": 0.0, + "routers_loss": 0.019166452810168266, + "skip_count": 0.0, + "step": 184, + "text_loss": 0.534831166267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 0.8734957440563546, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2236328125, + "learning_rate": 0.00037, + "loss": 0.0784, + "macro_f1": 0.5427350401878357, + "num_tokens": 300593.0, + "repeat_count": 1.0, + "routers_loss": 0.2349659502506256, + "skip_count": 2.0, + "step": 186, + "text_loss": 0.3549048602581024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8828881714117992, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2041015625, + "learning_rate": 0.000374, + "loss": 0.0827, + "macro_f1": 0.3076923191547394, + "num_tokens": 303456.0, + "repeat_count": 2.0, + "routers_loss": 0.22502389550209045, + "skip_count": 2.0, + "step": 188, + "text_loss": 0.8837642073631287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8922805987672439, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000378, + "loss": 0.1085, + "macro_f1": 0.3272727429866791, + "num_tokens": 306241.0, + "repeat_count": 1.0, + "routers_loss": 0.12291611731052399, + "skip_count": 0.0, + "step": 190, + "text_loss": 0.73353511095047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9016730261226886, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000382, + "loss": 0.0969, + "macro_f1": 0.3272727429866791, + "num_tokens": 310606.0, + "repeat_count": 0.0, + "routers_loss": 0.055988848209381104, + "skip_count": 1.0, + "step": 192, + "text_loss": 0.6261917352676392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9110654534781333, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.34375, + "learning_rate": 0.000386, + "loss": 0.1055, + "macro_f1": 0.3144654333591461, + "num_tokens": 313564.0, + "repeat_count": 0.0, + "routers_loss": 0.12363404780626297, + "skip_count": 3.0, + "step": 194, + "text_loss": 0.2790874242782593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9204578808335779, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.00039000000000000005, + "loss": 0.0964, + "macro_f1": 0.3076923191547394, + "num_tokens": 316958.0, + "repeat_count": 2.0, + "routers_loss": 0.2718356251716614, + "skip_count": 2.0, + "step": 196, + "text_loss": 0.14428086578845978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9298503081890226, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2021484375, + "learning_rate": 0.00039400000000000004, + "loss": 0.0917, + "macro_f1": 0.32098764181137085, + "num_tokens": 320103.0, + "repeat_count": 0.0, + "routers_loss": 0.07188102602958679, + "skip_count": 2.0, + "step": 198, + "text_loss": 0.27155816555023193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9392427355444672, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.201171875, + "learning_rate": 0.000398, + "loss": 0.0809, + "macro_f1": 0.32098764181137085, + "num_tokens": 323566.0, + "repeat_count": 1.0, + "routers_loss": 0.18038256466388702, + "skip_count": 1.0, + "step": 200, + "text_loss": 0.8453494310379028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9486351628999119, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.000402, + "loss": 0.0801, + "macro_f1": 0.3333333432674408, + "num_tokens": 326385.0, + "repeat_count": 0.0, + "routers_loss": 0.014639763161540031, + "skip_count": 0.0, + "step": 202, + "text_loss": 0.5733131766319275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9580275902553567, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21875, + "learning_rate": 0.00040600000000000006, + "loss": 0.104, + "macro_f1": 0.3333333432674408, + "num_tokens": 329266.0, + "repeat_count": 0.0, + "routers_loss": 0.015269627794623375, + "skip_count": 0.0, + "step": 204, + "text_loss": 0.7355639934539795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9674200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.00041, + "loss": 0.0833, + "macro_f1": 0.3333333432674408, + "num_tokens": 332984.0, + "repeat_count": 0.0, + "routers_loss": 0.018046971410512924, + "skip_count": 0.0, + "step": 206, + "text_loss": 0.587641179561615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.000414, + "loss": 0.0588, + "macro_f1": 0.3272727429866791, + "num_tokens": 335739.0, + "repeat_count": 1.0, + "routers_loss": 0.12791286408901215, + "skip_count": 0.0, + "step": 208, + "text_loss": 0.6538406610488892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9862048723216906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.24609375, + "learning_rate": 0.00041799999999999997, + "loss": 0.0732, + "macro_f1": 0.3272727429866791, + "num_tokens": 338966.0, + "repeat_count": 0.0, + "routers_loss": 0.050490595400333405, + "skip_count": 1.0, + "step": 210, + "text_loss": 0.4188295602798462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9955972996771353, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000422, + "loss": 0.0588, + "macro_f1": 0.3144654333591461, + "num_tokens": 342063.0, + "repeat_count": 0.0, + "routers_loss": 0.11652113497257233, + "skip_count": 3.0, + "step": 212, + "text_loss": 0.21822240948677063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0046962136777224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.000426, + "loss": 0.0621, + "macro_f1": 0.3333333432674408, + "num_tokens": 344887.0, + "repeat_count": 0.0, + "routers_loss": 0.023898238316178322, + "skip_count": 0.0, + "step": 214, + "text_loss": 0.24692800641059875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.014088641033167, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3671875, + "learning_rate": 0.00043, + "loss": 0.1005, + "macro_f1": 0.3272727429866791, + "num_tokens": 348700.0, + "repeat_count": 1.0, + "routers_loss": 0.06414655596017838, + "skip_count": 0.0, + "step": 216, + "text_loss": 0.4744548797607422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0234810683886117, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.00043400000000000003, + "loss": 0.0753, + "macro_f1": 0.32098764181137085, + "num_tokens": 351507.0, + "repeat_count": 1.0, + "routers_loss": 0.11702914535999298, + "skip_count": 1.0, + "step": 218, + "text_loss": 0.5614864826202393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0328734957440564, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000438, + "loss": 0.0792, + "macro_f1": 0.3333333432674408, + "num_tokens": 354484.0, + "repeat_count": 0.0, + "routers_loss": 0.014991643838584423, + "skip_count": 0.0, + "step": 220, + "text_loss": 0.47209832072257996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.042265923099501, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.251953125, + "learning_rate": 0.000442, + "loss": 0.106, + "macro_f1": 0.3272727429866791, + "num_tokens": 357954.0, + "repeat_count": 0.0, + "routers_loss": 0.04747112840414047, + "skip_count": 1.0, + "step": 222, + "text_loss": 0.2968728244304657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0516583504549457, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.40234375, + "learning_rate": 0.000446, + "loss": 0.0853, + "macro_f1": 0.32098764181137085, + "num_tokens": 360547.0, + "repeat_count": 0.0, + "routers_loss": 0.06754162162542343, + "skip_count": 2.0, + "step": 224, + "text_loss": 0.2364148646593094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0610507778103904, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.00045000000000000004, + "loss": 0.1016, + "macro_f1": 0.3272727429866791, + "num_tokens": 364529.0, + "repeat_count": 0.0, + "routers_loss": 0.07830183953046799, + "skip_count": 1.0, + "step": 226, + "text_loss": 0.4787476360797882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.00045400000000000003, + "loss": 0.0792, + "macro_f1": 0.3333333432674408, + "num_tokens": 367683.0, + "repeat_count": 0.0, + "routers_loss": 0.015735948458313942, + "skip_count": 0.0, + "step": 228, + "text_loss": 0.37148505449295044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.000458, + "loss": 0.0995, + "macro_f1": 0.3333333432674408, + "num_tokens": 371402.0, + "repeat_count": 0.0, + "routers_loss": 0.013354359194636345, + "skip_count": 0.0, + "step": 230, + "text_loss": 0.7464763522148132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.000462, + "loss": 0.0731, + "macro_f1": 0.3333333432674408, + "num_tokens": 374587.0, + "repeat_count": 0.0, + "routers_loss": 0.013763721100986004, + "skip_count": 0.0, + "step": 232, + "text_loss": 0.8754443526268005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.098620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3984375, + "learning_rate": 0.00046600000000000005, + "loss": 0.0861, + "macro_f1": 0.3333333432674408, + "num_tokens": 377513.0, + "repeat_count": 0.0, + "routers_loss": 0.010075435042381287, + "skip_count": 0.0, + "step": 234, + "text_loss": 0.31534913182258606 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1080129145876136, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.00047, + "loss": 0.0791, + "macro_f1": 0.3272727429866791, + "num_tokens": 380736.0, + "repeat_count": 0.0, + "routers_loss": 0.059825167059898376, + "skip_count": 1.0, + "step": 236, + "text_loss": 0.5936337113380432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1174053419430585, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000474, + "loss": 0.0514, + "macro_f1": 0.32098764181137085, + "num_tokens": 383236.0, + "repeat_count": 0.0, + "routers_loss": 0.09134846180677414, + "skip_count": 2.0, + "step": 238, + "text_loss": 0.5976157784461975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1267977692985032, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.208984375, + "learning_rate": 0.00047799999999999996, + "loss": 0.0858, + "macro_f1": 0.32098764181137085, + "num_tokens": 385778.0, + "repeat_count": 1.0, + "routers_loss": 0.11989791691303253, + "skip_count": 1.0, + "step": 240, + "text_loss": 0.3554210960865021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1361901966539478, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.000482, + "loss": 0.0734, + "macro_f1": 0.3333333432674408, + "num_tokens": 388777.0, + "repeat_count": 0.0, + "routers_loss": 0.013591105118393898, + "skip_count": 0.0, + "step": 242, + "text_loss": 0.4829460382461548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1455826240093925, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12060546875, + "learning_rate": 0.000486, + "loss": 0.0625, + "macro_f1": 0.32098764181137085, + "num_tokens": 391797.0, + "repeat_count": 0.0, + "routers_loss": 0.0920003354549408, + "skip_count": 2.0, + "step": 244, + "text_loss": 0.3085818886756897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1549750513648371, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.00049, + "loss": 0.0501, + "macro_f1": 0.3333333432674408, + "num_tokens": 396485.0, + "repeat_count": 0.0, + "routers_loss": 0.0129330949857831, + "skip_count": 0.0, + "step": 246, + "text_loss": 0.42803969979286194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1643674787202818, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.296875, + "learning_rate": 0.000494, + "loss": 0.0945, + "macro_f1": 0.3144654333591461, + "num_tokens": 399923.0, + "repeat_count": 0.0, + "routers_loss": 0.10677755624055862, + "skip_count": 3.0, + "step": 248, + "text_loss": 0.2908555567264557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1737599060757264, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.000498, + "loss": 0.0812, + "macro_f1": 0.3144654333591461, + "num_tokens": 403647.0, + "repeat_count": 0.0, + "routers_loss": 0.1504337340593338, + "skip_count": 3.0, + "step": 250, + "text_loss": 0.333095908164978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.183152333431171, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.22265625, + "learning_rate": 0.0005020000000000001, + "loss": 0.0828, + "macro_f1": 0.32098764181137085, + "num_tokens": 409147.0, + "repeat_count": 0.0, + "routers_loss": 0.06503184884786606, + "skip_count": 2.0, + "step": 252, + "text_loss": 0.16117942333221436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1925447607866158, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.287109375, + "learning_rate": 0.000506, + "loss": 0.0995, + "macro_f1": 0.3333333432674408, + "num_tokens": 412072.0, + "repeat_count": 0.0, + "routers_loss": 0.016280122101306915, + "skip_count": 0.0, + "step": 254, + "text_loss": 0.4217492640018463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2019371881420604, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.00051, + "loss": 0.0803, + "macro_f1": 0.3144654333591461, + "num_tokens": 415052.0, + "repeat_count": 2.0, + "routers_loss": 0.2117508500814438, + "skip_count": 1.0, + "step": 256, + "text_loss": 0.5795308947563171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.211329615497505, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2421875, + "learning_rate": 0.000514, + "loss": 0.0668, + "macro_f1": 0.3272727429866791, + "num_tokens": 418099.0, + "repeat_count": 1.0, + "routers_loss": 0.15002092719078064, + "skip_count": 0.0, + "step": 258, + "text_loss": 0.4840938448905945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2207220428529497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.000518, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 422526.0, + "repeat_count": 0.0, + "routers_loss": 0.012834074907004833, + "skip_count": 0.0, + "step": 260, + "text_loss": 0.36141225695610046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2301144702083944, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.000522, + "loss": 0.085, + "macro_f1": 0.3076923191547394, + "num_tokens": 425765.0, + "repeat_count": 2.0, + "routers_loss": 0.23808011412620544, + "skip_count": 2.0, + "step": 262, + "text_loss": 0.27572691440582275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2395068975638392, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000526, + "loss": 0.0708, + "macro_f1": 0.3272727429866791, + "num_tokens": 429048.0, + "repeat_count": 0.0, + "routers_loss": 0.055687375366687775, + "skip_count": 1.0, + "step": 264, + "text_loss": 0.37020301818847656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.248899324919284, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005300000000000001, + "loss": 0.0839, + "macro_f1": 0.3272727429866791, + "num_tokens": 431784.0, + "repeat_count": 0.0, + "routers_loss": 0.0872957780957222, + "skip_count": 1.0, + "step": 266, + "text_loss": 0.5937283039093018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2582917522747286, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.0005340000000000001, + "loss": 0.0733, + "macro_f1": 0.32098764181137085, + "num_tokens": 434297.0, + "repeat_count": 2.0, + "routers_loss": 0.23507654666900635, + "skip_count": 0.0, + "step": 268, + "text_loss": 0.3367372453212738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2676841796301732, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005380000000000001, + "loss": 0.0708, + "macro_f1": 0.32098764181137085, + "num_tokens": 437586.0, + "repeat_count": 0.0, + "routers_loss": 0.12860390543937683, + "skip_count": 2.0, + "step": 270, + "text_loss": 0.7149854302406311 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2770766069856179, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005420000000000001, + "loss": 0.1072, + "macro_f1": 0.3272727429866791, + "num_tokens": 440649.0, + "repeat_count": 0.0, + "routers_loss": 0.044308312237262726, + "skip_count": 1.0, + "step": 272, + "text_loss": 0.26778292655944824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2864690343410625, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.44921875, + "learning_rate": 0.000546, + "loss": 0.0938, + "macro_f1": 0.3144654333591461, + "num_tokens": 443907.0, + "repeat_count": 0.0, + "routers_loss": 0.11514109373092651, + "skip_count": 3.0, + "step": 274, + "text_loss": 0.23578761518001556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 1.2958614616965072, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2578125, + "learning_rate": 0.00055, + "loss": 0.0932, + "macro_f1": 0.5492662787437439, + "num_tokens": 447147.0, + "repeat_count": 0.0, + "routers_loss": 0.055705297738313675, + "skip_count": 2.0, + "step": 276, + "text_loss": 0.2513524889945984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3052538890519518, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.29296875, + "learning_rate": 0.000554, + "loss": 0.0667, + "macro_f1": 0.32098764181137085, + "num_tokens": 450032.0, + "repeat_count": 0.0, + "routers_loss": 0.13778971135616302, + "skip_count": 2.0, + "step": 278, + "text_loss": 0.4857243597507477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3146463164073965, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.000558, + "loss": 0.0672, + "macro_f1": 0.3272727429866791, + "num_tokens": 453195.0, + "repeat_count": 1.0, + "routers_loss": 0.0700262188911438, + "skip_count": 0.0, + "step": 280, + "text_loss": 0.7589789628982544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3240387437628411, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.0005620000000000001, + "loss": 0.0603, + "macro_f1": 0.3144654333591461, + "num_tokens": 455942.0, + "repeat_count": 1.0, + "routers_loss": 0.11706235259771347, + "skip_count": 2.0, + "step": 282, + "text_loss": 0.4783432185649872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3334311711182858, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.265625, + "learning_rate": 0.000566, + "loss": 0.0793, + "macro_f1": 0.3272727429866791, + "num_tokens": 458932.0, + "repeat_count": 0.0, + "routers_loss": 0.07073967158794403, + "skip_count": 1.0, + "step": 284, + "text_loss": 0.7117193937301636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3428235984737307, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.00057, + "loss": 0.0915, + "macro_f1": 0.3272727429866791, + "num_tokens": 462650.0, + "repeat_count": 0.0, + "routers_loss": 0.05301115661859512, + "skip_count": 1.0, + "step": 286, + "text_loss": 0.4175460636615753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.352216025829175, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000574, + "loss": 0.0675, + "macro_f1": 0.3272727429866791, + "num_tokens": 466290.0, + "repeat_count": 0.0, + "routers_loss": 0.06356479972600937, + "skip_count": 1.0, + "step": 288, + "text_loss": 0.5832946300506592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.36160845318462, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28515625, + "learning_rate": 0.000578, + "loss": 0.0805, + "macro_f1": 0.3006536066532135, + "num_tokens": 469296.0, + "repeat_count": 1.0, + "routers_loss": 0.21032999455928802, + "skip_count": 3.0, + "step": 290, + "text_loss": 0.36023473739624023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3710008805400646, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.0005819999999999999, + "loss": 0.0685, + "macro_f1": 0.32098764181137085, + "num_tokens": 472272.0, + "repeat_count": 1.0, + "routers_loss": 0.08062280714511871, + "skip_count": 1.0, + "step": 292, + "text_loss": 0.37197956442832947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3803933078955093, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0005859999999999999, + "loss": 0.0878, + "macro_f1": 0.32098764181137085, + "num_tokens": 475864.0, + "repeat_count": 0.0, + "routers_loss": 0.05023600533604622, + "skip_count": 2.0, + "step": 294, + "text_loss": 0.4765273630619049 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.389785735250954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2177734375, + "learning_rate": 0.00059, + "loss": 0.0728, + "macro_f1": 0.3333333432674408, + "num_tokens": 478916.0, + "repeat_count": 0.0, + "routers_loss": 0.011689410544931889, + "skip_count": 0.0, + "step": 296, + "text_loss": 0.5878773927688599 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3991781626063986, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000594, + "loss": 0.0727, + "macro_f1": 0.3333333432674408, + "num_tokens": 482369.0, + "repeat_count": 0.0, + "routers_loss": 0.010772093199193478, + "skip_count": 0.0, + "step": 298, + "text_loss": 0.4424116313457489 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4085705899618433, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.181640625, + "learning_rate": 0.000598, + "loss": 0.0787, + "macro_f1": 0.3076923191547394, + "num_tokens": 486049.0, + "repeat_count": 2.0, + "routers_loss": 0.23482851684093475, + "skip_count": 2.0, + "step": 300, + "text_loss": 0.21217775344848633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.417963017317288, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.000602, + "loss": 0.073, + "macro_f1": 0.3076923191547394, + "num_tokens": 488683.0, + "repeat_count": 1.0, + "routers_loss": 0.18843084573745728, + "skip_count": 3.0, + "step": 302, + "text_loss": 0.2109498232603073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4273554446727326, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.279296875, + "learning_rate": 0.000606, + "loss": 0.0945, + "macro_f1": 0.3144654333591461, + "num_tokens": 492010.0, + "repeat_count": 0.0, + "routers_loss": 0.17861786484718323, + "skip_count": 3.0, + "step": 304, + "text_loss": 0.8446305394172668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4367478720281772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.00061, + "loss": 0.0827, + "macro_f1": 0.3333333432674408, + "num_tokens": 494764.0, + "repeat_count": 0.0, + "routers_loss": 0.014124520123004913, + "skip_count": 0.0, + "step": 306, + "text_loss": 0.742735743522644 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4461402993836219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.000614, + "loss": 0.1071, + "macro_f1": 0.3333333432674408, + "num_tokens": 497820.0, + "repeat_count": 0.0, + "routers_loss": 0.017968112602829933, + "skip_count": 0.0, + "step": 308, + "text_loss": 0.28305482864379883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4555327267390665, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1689453125, + "learning_rate": 0.0006180000000000001, + "loss": 0.0775, + "macro_f1": 0.32098764181137085, + "num_tokens": 500694.0, + "repeat_count": 0.0, + "routers_loss": 0.08593655377626419, + "skip_count": 2.0, + "step": 310, + "text_loss": 0.3496848940849304 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4649251540945114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19140625, + "learning_rate": 0.000622, + "loss": 0.061, + "macro_f1": 0.3333333432674408, + "num_tokens": 503871.0, + "repeat_count": 0.0, + "routers_loss": 0.016449492424726486, + "skip_count": 0.0, + "step": 312, + "text_loss": 0.6691372990608215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4743175814499558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.000626, + "loss": 0.0815, + "macro_f1": 0.3333333432674408, + "num_tokens": 506730.0, + "repeat_count": 0.0, + "routers_loss": 0.014532964676618576, + "skip_count": 0.0, + "step": 314, + "text_loss": 0.6118118166923523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4837100088054007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2216796875, + "learning_rate": 0.00063, + "loss": 0.0742, + "macro_f1": 0.3333333432674408, + "num_tokens": 510323.0, + "repeat_count": 0.0, + "routers_loss": 0.013093139044940472, + "skip_count": 0.0, + "step": 316, + "text_loss": 0.38126271963119507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4931024361608454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.400390625, + "learning_rate": 0.000634, + "loss": 0.0915, + "macro_f1": 0.3333333432674408, + "num_tokens": 514075.0, + "repeat_count": 0.0, + "routers_loss": 0.008627045899629593, + "skip_count": 0.0, + "step": 318, + "text_loss": 0.5983037948608398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000638, + "loss": 0.1008, + "macro_f1": 0.3272727429866791, + "num_tokens": 517418.0, + "repeat_count": 0.0, + "routers_loss": 0.04561378434300423, + "skip_count": 1.0, + "step": 320, + "text_loss": 0.767257034778595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.5118872908717347, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.259765625, + "learning_rate": 0.000642, + "loss": 0.0926, + "macro_f1": 0.3272727429866791, + "num_tokens": 520443.0, + "repeat_count": 0.0, + "routers_loss": 0.024372953921556473, + "skip_count": 0.0, + "step": 322, + "text_loss": 0.6572105884552002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5212797182271793, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30078125, + "learning_rate": 0.000646, + "loss": 0.0822, + "macro_f1": 0.3272727429866791, + "num_tokens": 523317.0, + "repeat_count": 1.0, + "routers_loss": 0.08099937438964844, + "skip_count": 0.0, + "step": 324, + "text_loss": 0.205499529838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.530672145582624, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.0006500000000000001, + "loss": 0.0809, + "macro_f1": 0.32098767161369324, + "num_tokens": 526355.0, + "repeat_count": 0.0, + "routers_loss": 0.0657225176692009, + "skip_count": 1.0, + "step": 326, + "text_loss": 0.2587239742279053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5400645729380686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.111328125, + "learning_rate": 0.0006540000000000001, + "loss": 0.0779, + "macro_f1": 0.3333333432674408, + "num_tokens": 529689.0, + "repeat_count": 0.0, + "routers_loss": 0.01849208027124405, + "skip_count": 0.0, + "step": 328, + "text_loss": 0.2172023057937622 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5494570002935135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1845703125, + "learning_rate": 0.0006580000000000001, + "loss": 0.0758, + "macro_f1": 0.3333333432674408, + "num_tokens": 532603.0, + "repeat_count": 0.0, + "routers_loss": 0.016184113919734955, + "skip_count": 0.0, + "step": 330, + "text_loss": 0.5980568528175354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.000662, + "loss": 0.0439, + "macro_f1": 0.3333333432674408, + "num_tokens": 536056.0, + "repeat_count": 0.0, + "routers_loss": 0.01303898449987173, + "skip_count": 0.0, + "step": 332, + "text_loss": 0.5421966314315796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 1.5682418550044028, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.296875, + "learning_rate": 0.000666, + "loss": 0.0963, + "macro_f1": 0.465986430644989, + "num_tokens": 539231.0, + "repeat_count": 3.0, + "routers_loss": 0.3075675964355469, + "skip_count": 3.0, + "step": 334, + "text_loss": 0.19719554483890533 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5776342823598473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.00067, + "loss": 0.0706, + "macro_f1": 0.3333333432674408, + "num_tokens": 542038.0, + "repeat_count": 0.0, + "routers_loss": 0.009116224013268948, + "skip_count": 0.0, + "step": 336, + "text_loss": 0.3407036066055298 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5870267097152921, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2421875, + "learning_rate": 0.000674, + "loss": 0.0768, + "macro_f1": 0.3333333432674408, + "num_tokens": 545019.0, + "repeat_count": 0.0, + "routers_loss": 0.021463042125105858, + "skip_count": 0.0, + "step": 338, + "text_loss": 0.24486012756824493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5964191370707366, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.0006780000000000001, + "loss": 0.0889, + "macro_f1": 0.3333333432674408, + "num_tokens": 548036.0, + "repeat_count": 0.0, + "routers_loss": 0.01857556402683258, + "skip_count": 0.0, + "step": 340, + "text_loss": 0.28140124678611755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6058115644261814, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0006820000000000001, + "loss": 0.0617, + "macro_f1": 0.3006536364555359, + "num_tokens": 551419.0, + "repeat_count": 2.0, + "routers_loss": 0.27090007066726685, + "skip_count": 3.0, + "step": 342, + "text_loss": 0.20690307021141052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.615203991781626, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3046875, + "learning_rate": 0.0006860000000000001, + "loss": 0.1047, + "macro_f1": 0.32098764181137085, + "num_tokens": 554037.0, + "repeat_count": 0.0, + "routers_loss": 0.09231195598840714, + "skip_count": 2.0, + "step": 344, + "text_loss": 0.4479128420352936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6245964191370708, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.00069, + "loss": 0.0883, + "macro_f1": 0.3333333432674408, + "num_tokens": 556672.0, + "repeat_count": 0.0, + "routers_loss": 0.00935924518853426, + "skip_count": 0.0, + "step": 346, + "text_loss": 0.6377320289611816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6339888464925154, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.000694, + "loss": 0.0781, + "macro_f1": 0.32098764181137085, + "num_tokens": 559756.0, + "repeat_count": 0.0, + "routers_loss": 0.17641772329807281, + "skip_count": 2.0, + "step": 348, + "text_loss": 0.6097636222839355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 1.64338127384796, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.30078125, + "learning_rate": 0.0006979999999999999, + "loss": 0.0616, + "macro_f1": 0.5492662787437439, + "num_tokens": 563415.0, + "repeat_count": 0.0, + "routers_loss": 0.06240406632423401, + "skip_count": 2.0, + "step": 350, + "text_loss": 0.5291631817817688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6527737012034047, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.296875, + "learning_rate": 0.0007019999999999999, + "loss": 0.1026, + "macro_f1": 0.3333333432674408, + "num_tokens": 566357.0, + "repeat_count": 0.0, + "routers_loss": 0.012269247323274612, + "skip_count": 0.0, + "step": 352, + "text_loss": 0.5170195698738098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6621661285588494, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0007059999999999999, + "loss": 0.0815, + "macro_f1": 0.32098764181137085, + "num_tokens": 569449.0, + "repeat_count": 0.0, + "routers_loss": 0.07515309751033783, + "skip_count": 2.0, + "step": 354, + "text_loss": 0.34507250785827637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6715585559142943, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.00071, + "loss": 0.0791, + "macro_f1": 0.3144654333591461, + "num_tokens": 572761.0, + "repeat_count": 1.0, + "routers_loss": 0.20768006145954132, + "skip_count": 2.0, + "step": 356, + "text_loss": 0.3158532381057739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6809509832697387, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.000714, + "loss": 0.0682, + "macro_f1": 0.3333333432674408, + "num_tokens": 575909.0, + "repeat_count": 0.0, + "routers_loss": 0.025329967960715294, + "skip_count": 0.0, + "step": 358, + "text_loss": 0.21455390751361847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.6903434106251836, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.000718, + "loss": 0.0775, + "macro_f1": 0.32098767161369324, + "num_tokens": 579186.0, + "repeat_count": 1.0, + "routers_loss": 0.07676175981760025, + "skip_count": 0.0, + "step": 360, + "text_loss": 0.61895352602005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.699735837980628, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000722, + "loss": 0.0781, + "macro_f1": 0.32098767161369324, + "num_tokens": 582437.0, + "repeat_count": 0.0, + "routers_loss": 0.08070661872625351, + "skip_count": 1.0, + "step": 362, + "text_loss": 0.20557661354541779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7091282653360729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2216796875, + "learning_rate": 0.000726, + "loss": 0.11, + "macro_f1": 0.3333333432674408, + "num_tokens": 586096.0, + "repeat_count": 0.0, + "routers_loss": 0.015891313552856445, + "skip_count": 0.0, + "step": 364, + "text_loss": 0.597991943359375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7185206926915173, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.00073, + "loss": 0.0573, + "macro_f1": 0.3076923191547394, + "num_tokens": 589520.0, + "repeat_count": 1.0, + "routers_loss": 0.12844261527061462, + "skip_count": 3.0, + "step": 366, + "text_loss": 0.2944789230823517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7279131200469622, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.000734, + "loss": 0.1005, + "macro_f1": 0.3333333432674408, + "num_tokens": 592691.0, + "repeat_count": 0.0, + "routers_loss": 0.02382199838757515, + "skip_count": 0.0, + "step": 368, + "text_loss": 0.23989969491958618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7373055474024068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.000738, + "loss": 0.0661, + "macro_f1": 0.3333333432674408, + "num_tokens": 596004.0, + "repeat_count": 0.0, + "routers_loss": 0.018812084570527077, + "skip_count": 0.0, + "step": 370, + "text_loss": 0.22111408412456512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7466979747578515, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.000742, + "loss": 0.0666, + "macro_f1": 0.3272727429866791, + "num_tokens": 599087.0, + "repeat_count": 0.0, + "routers_loss": 0.08290331065654755, + "skip_count": 1.0, + "step": 372, + "text_loss": 0.2567356526851654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7560904021132961, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.000746, + "loss": 0.0941, + "macro_f1": 0.32098764181137085, + "num_tokens": 602330.0, + "repeat_count": 1.0, + "routers_loss": 0.11482042074203491, + "skip_count": 1.0, + "step": 374, + "text_loss": 0.7217292785644531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7654828294687408, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2265625, + "learning_rate": 0.00075, + "loss": 0.0728, + "macro_f1": 0.3272727429866791, + "num_tokens": 605503.0, + "repeat_count": 1.0, + "routers_loss": 0.11849870532751083, + "skip_count": 0.0, + "step": 376, + "text_loss": 0.5122153759002686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.7748752568241855, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2333984375, + "learning_rate": 0.000754, + "loss": 0.0835, + "macro_f1": 0.32098767161369324, + "num_tokens": 608505.0, + "repeat_count": 0.0, + "routers_loss": 0.07090992480516434, + "skip_count": 1.0, + "step": 378, + "text_loss": 0.2204965502023697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.78426768417963, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.000758, + "loss": 0.0794, + "macro_f1": 0.3272727429866791, + "num_tokens": 611193.0, + "repeat_count": 0.0, + "routers_loss": 0.03812089189887047, + "skip_count": 1.0, + "step": 380, + "text_loss": 0.44909021258354187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.793660111535075, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1689453125, + "learning_rate": 0.000762, + "loss": 0.0882, + "macro_f1": 0.3272727429866791, + "num_tokens": 614231.0, + "repeat_count": 1.0, + "routers_loss": 0.10270529240369797, + "skip_count": 0.0, + "step": 382, + "text_loss": 0.13624964654445648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8030525388905194, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.330078125, + "learning_rate": 0.0007660000000000001, + "loss": 0.1107, + "macro_f1": 0.32098764181137085, + "num_tokens": 617090.0, + "repeat_count": 1.0, + "routers_loss": 0.11624004691839218, + "skip_count": 1.0, + "step": 384, + "text_loss": 0.7314052581787109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8124449662459643, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0007700000000000001, + "loss": 0.0628, + "macro_f1": 0.32098764181137085, + "num_tokens": 620596.0, + "repeat_count": 0.0, + "routers_loss": 0.07114322483539581, + "skip_count": 2.0, + "step": 386, + "text_loss": 0.503322958946228 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8218373936014087, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.0007740000000000001, + "loss": 0.0829, + "macro_f1": 0.32098764181137085, + "num_tokens": 624108.0, + "repeat_count": 0.0, + "routers_loss": 0.06061873584985733, + "skip_count": 2.0, + "step": 388, + "text_loss": 0.11481904983520508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8312298209568536, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2099609375, + "learning_rate": 0.000778, + "loss": 0.0791, + "macro_f1": 0.3006536364555359, + "num_tokens": 626895.0, + "repeat_count": 1.0, + "routers_loss": 0.2921771705150604, + "skip_count": 4.0, + "step": 390, + "text_loss": 0.3069624602794647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8406222483122983, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30859375, + "learning_rate": 0.000782, + "loss": 0.0605, + "macro_f1": 0.3076923191547394, + "num_tokens": 630204.0, + "repeat_count": 0.0, + "routers_loss": 0.202707901597023, + "skip_count": 4.0, + "step": 392, + "text_loss": 0.6022785305976868 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.29296875, + "learning_rate": 0.000786, + "loss": 0.0877, + "macro_f1": 0.3333333432674408, + "num_tokens": 634373.0, + "repeat_count": 0.0, + "routers_loss": 0.0221510399132967, + "skip_count": 0.0, + "step": 394, + "text_loss": 0.26787394285202026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8594071030231876, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.37890625, + "learning_rate": 0.00079, + "loss": 0.0805, + "macro_f1": 0.32098764181137085, + "num_tokens": 637442.0, + "repeat_count": 2.0, + "routers_loss": 0.12636390328407288, + "skip_count": 0.0, + "step": 396, + "text_loss": 0.2799781560897827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8687995303786322, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.0007940000000000001, + "loss": 0.0724, + "macro_f1": 0.32098764181137085, + "num_tokens": 641231.0, + "repeat_count": 0.0, + "routers_loss": 0.07933453470468521, + "skip_count": 2.0, + "step": 398, + "text_loss": 0.2507784366607666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8781919577340769, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.0007980000000000001, + "loss": 0.0909, + "macro_f1": 0.3272727429866791, + "num_tokens": 644560.0, + "repeat_count": 1.0, + "routers_loss": 0.10324911028146744, + "skip_count": 0.0, + "step": 400, + "text_loss": 0.7756280303001404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8875843850895215, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0008020000000000001, + "loss": 0.0783, + "macro_f1": 0.3144654333591461, + "num_tokens": 647393.0, + "repeat_count": 1.0, + "routers_loss": 0.18546262383460999, + "skip_count": 2.0, + "step": 402, + "text_loss": 0.5013328194618225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8969768124449664, + "f1_execute": 0.8571428656578064, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0008060000000000001, + "loss": 0.0787, + "macro_f1": 0.2857142984867096, + "num_tokens": 650355.0, + "repeat_count": 3.0, + "routers_loss": 0.3280293643474579, + "skip_count": 4.0, + "step": 404, + "text_loss": 0.2842077314853668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9063692398004108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.0008100000000000001, + "loss": 0.0901, + "macro_f1": 0.3333333432674408, + "num_tokens": 654280.0, + "repeat_count": 0.0, + "routers_loss": 0.02623247355222702, + "skip_count": 0.0, + "step": 406, + "text_loss": 0.46742817759513855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9157616671558557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.0008139999999999999, + "loss": 0.0945, + "macro_f1": 0.3333333432674408, + "num_tokens": 657568.0, + "repeat_count": 0.0, + "routers_loss": 0.009744114242494106, + "skip_count": 0.0, + "step": 408, + "text_loss": 0.7168047428131104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9251540945113002, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.0008179999999999999, + "loss": 0.1065, + "macro_f1": 0.32098764181137085, + "num_tokens": 660593.0, + "repeat_count": 0.0, + "routers_loss": 0.07591600716114044, + "skip_count": 2.0, + "step": 410, + "text_loss": 0.449823260307312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0008219999999999999, + "loss": 0.0795, + "macro_f1": 0.3333333432674408, + "num_tokens": 663916.0, + "repeat_count": 0.0, + "routers_loss": 0.02076602540910244, + "skip_count": 0.0, + "step": 412, + "text_loss": 0.4764713943004608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9439389492221895, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.000826, + "loss": 0.0836, + "macro_f1": 0.3272727429866791, + "num_tokens": 667502.0, + "repeat_count": 0.0, + "routers_loss": 0.049170155078172684, + "skip_count": 1.0, + "step": 414, + "text_loss": 0.30333325266838074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9533313765776343, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.00083, + "loss": 0.1021, + "macro_f1": 0.3272727429866791, + "num_tokens": 670510.0, + "repeat_count": 1.0, + "routers_loss": 0.15554003417491913, + "skip_count": 0.0, + "step": 416, + "text_loss": 0.3691870868206024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000834, + "loss": 0.1013, + "macro_f1": 0.3333333432674408, + "num_tokens": 674761.0, + "repeat_count": 0.0, + "routers_loss": 0.024516675621271133, + "skip_count": 0.0, + "step": 418, + "text_loss": 0.32850381731987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9721162312885236, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.000838, + "loss": 0.0649, + "macro_f1": 0.3333333432674408, + "num_tokens": 678055.0, + "repeat_count": 0.0, + "routers_loss": 0.011026890948414803, + "skip_count": 0.0, + "step": 420, + "text_loss": 0.6637290716171265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9815086586439683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000842, + "loss": 0.0771, + "macro_f1": 0.3272727429866791, + "num_tokens": 680979.0, + "repeat_count": 0.0, + "routers_loss": 0.07451887428760529, + "skip_count": 1.0, + "step": 422, + "text_loss": 0.27131685614585876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.990901085999413, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000846, + "loss": 0.0714, + "macro_f1": 0.32098764181137085, + "num_tokens": 684144.0, + "repeat_count": 1.0, + "routers_loss": 0.11341800540685654, + "skip_count": 1.0, + "step": 424, + "text_loss": 0.652126669883728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.00085, + "loss": 0.0754, + "macro_f1": 0.3272727429866791, + "num_tokens": 687004.0, + "repeat_count": 1.0, + "routers_loss": 0.08985847979784012, + "skip_count": 0.0, + "step": 426, + "text_loss": 0.2589428424835205 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23828125, + "learning_rate": 0.000854, + "loss": 0.0866, + "macro_f1": 0.3333333432674408, + "num_tokens": 689702.0, + "repeat_count": 0.0, + "routers_loss": 0.011355436407029629, + "skip_count": 0.0, + "step": 428, + "text_loss": 0.8909716010093689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0187848547108893, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.000858, + "loss": 0.0623, + "macro_f1": 0.3333333432674408, + "num_tokens": 692698.0, + "repeat_count": 0.0, + "routers_loss": 0.013788948766887188, + "skip_count": 0.0, + "step": 430, + "text_loss": 0.19141142070293427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.028177282066334, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.000862, + "loss": 0.0499, + "macro_f1": 0.32098764181137085, + "num_tokens": 696007.0, + "repeat_count": 0.0, + "routers_loss": 0.07998392730951309, + "skip_count": 2.0, + "step": 432, + "text_loss": 0.1611809879541397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0375697094217786, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.000866, + "loss": 0.0541, + "macro_f1": 0.32098764181137085, + "num_tokens": 700271.0, + "repeat_count": 0.0, + "routers_loss": 0.06988382339477539, + "skip_count": 2.0, + "step": 434, + "text_loss": 0.37254223227500916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0469621367772235, + "f1_execute": 0.8333333730697632, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.00087, + "loss": 0.0834, + "macro_f1": 0.2777777910232544, + "num_tokens": 703519.0, + "repeat_count": 3.0, + "routers_loss": 0.28240787982940674, + "skip_count": 5.0, + "step": 436, + "text_loss": 0.29636648297309875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.056354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.423828125, + "learning_rate": 0.000874, + "loss": 0.0657, + "macro_f1": 0.3333333432674408, + "num_tokens": 706826.0, + "repeat_count": 0.0, + "routers_loss": 0.013924967497587204, + "skip_count": 0.0, + "step": 438, + "text_loss": 0.20867908000946045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.065746991488113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000878, + "loss": 0.0657, + "macro_f1": 0.3333333432674408, + "num_tokens": 710530.0, + "repeat_count": 0.0, + "routers_loss": 0.01170142088085413, + "skip_count": 0.0, + "step": 440, + "text_loss": 0.7273373007774353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.000882, + "loss": 0.076, + "macro_f1": 0.3333333432674408, + "num_tokens": 713503.0, + "repeat_count": 0.0, + "routers_loss": 0.011930872686207294, + "skip_count": 0.0, + "step": 442, + "text_loss": 0.39314430952072144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.084531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0008860000000000001, + "loss": 0.0592, + "macro_f1": 0.3333333432674408, + "num_tokens": 716582.0, + "repeat_count": 0.0, + "routers_loss": 0.008630385622382164, + "skip_count": 0.0, + "step": 444, + "text_loss": 0.5925271511077881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.0939242735544465, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0008900000000000001, + "loss": 0.0811, + "macro_f1": 0.3006536066532135, + "num_tokens": 719941.0, + "repeat_count": 3.0, + "routers_loss": 0.3015584945678711, + "skip_count": 1.0, + "step": 446, + "text_loss": 0.5059905052185059 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.1033167009098914, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.000894, + "loss": 0.0822, + "macro_f1": 0.31446540355682373, + "num_tokens": 723113.0, + "repeat_count": 1.0, + "routers_loss": 0.10897493362426758, + "skip_count": 1.0, + "step": 448, + "text_loss": 0.19616436958312988 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.112709128265336, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.33984375, + "learning_rate": 0.000898, + "loss": 0.0782, + "macro_f1": 0.32098764181137085, + "num_tokens": 726193.0, + "repeat_count": 0.0, + "routers_loss": 0.07236456125974655, + "skip_count": 2.0, + "step": 450, + "text_loss": 0.1773054152727127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1221015556207807, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3203125, + "learning_rate": 0.000902, + "loss": 0.058, + "macro_f1": 0.3272727429866791, + "num_tokens": 729275.0, + "repeat_count": 1.0, + "routers_loss": 0.08184371143579483, + "skip_count": 0.0, + "step": 452, + "text_loss": 0.4927310049533844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1314939829762256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.000906, + "loss": 0.0607, + "macro_f1": 0.3333333432674408, + "num_tokens": 731948.0, + "repeat_count": 0.0, + "routers_loss": 0.014033539220690727, + "skip_count": 0.0, + "step": 454, + "text_loss": 0.4745742678642273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.00091, + "loss": 0.0651, + "macro_f1": 0.3333333432674408, + "num_tokens": 735351.0, + "repeat_count": 0.0, + "routers_loss": 0.0071774693205952644, + "skip_count": 0.0, + "step": 456, + "text_loss": 0.18523462116718292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.150278837687115, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.400390625, + "learning_rate": 0.0009140000000000001, + "loss": 0.0738, + "macro_f1": 0.5492662787437439, + "num_tokens": 738587.0, + "repeat_count": 0.0, + "routers_loss": 0.07781517505645752, + "skip_count": 2.0, + "step": 458, + "text_loss": 0.3459635376930237 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 2.1596712650425594, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0009180000000000001, + "loss": 0.0723, + "macro_f1": 0.3076923191547394, + "num_tokens": 741779.0, + "repeat_count": 0.0, + "routers_loss": 0.09529037028551102, + "skip_count": 2.0, + "step": 460, + "text_loss": 0.20197433233261108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1690636923980042, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.0009220000000000001, + "loss": 0.0519, + "macro_f1": 0.3333333432674408, + "num_tokens": 745355.0, + "repeat_count": 0.0, + "routers_loss": 0.009765669703483582, + "skip_count": 0.0, + "step": 462, + "text_loss": 0.7031404376029968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1784561197534487, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009260000000000001, + "loss": 0.0527, + "macro_f1": 0.3272727429866791, + "num_tokens": 748628.0, + "repeat_count": 0.0, + "routers_loss": 0.03344850242137909, + "skip_count": 1.0, + "step": 464, + "text_loss": 0.21274663507938385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1878485471088935, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.00093, + "loss": 0.0534, + "macro_f1": 0.3076923191547394, + "num_tokens": 751472.0, + "repeat_count": 2.0, + "routers_loss": 0.1354292333126068, + "skip_count": 2.0, + "step": 466, + "text_loss": 0.5350717306137085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.197240974464338, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.000934, + "loss": 0.0598, + "macro_f1": 0.3272727429866791, + "num_tokens": 754479.0, + "repeat_count": 0.0, + "routers_loss": 0.056420840322971344, + "skip_count": 1.0, + "step": 468, + "text_loss": 0.28153330087661743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.206633401819783, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.234375, + "learning_rate": 0.0009379999999999999, + "loss": 0.0597, + "macro_f1": 0.31446540355682373, + "num_tokens": 757872.0, + "repeat_count": 1.0, + "routers_loss": 0.1622387170791626, + "skip_count": 1.0, + "step": 470, + "text_loss": 0.22956843674182892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2160258291752273, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.5, + "learning_rate": 0.000942, + "loss": 0.0953, + "macro_f1": 0.32098764181137085, + "num_tokens": 760468.0, + "repeat_count": 0.0, + "routers_loss": 0.05146972835063934, + "skip_count": 2.0, + "step": 472, + "text_loss": 0.4513966739177704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.225418256530672, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000946, + "loss": 0.0592, + "macro_f1": 0.3272727429866791, + "num_tokens": 763519.0, + "repeat_count": 1.0, + "routers_loss": 0.09022669494152069, + "skip_count": 0.0, + "step": 474, + "text_loss": 0.25758957862854004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.234810683886117, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.00095, + "loss": 0.0498, + "macro_f1": 0.3272727429866791, + "num_tokens": 767391.0, + "repeat_count": 0.0, + "routers_loss": 0.03044828027486801, + "skip_count": 1.0, + "step": 476, + "text_loss": 0.21366681158542633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2442031112415615, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.291015625, + "learning_rate": 0.000954, + "loss": 0.0802, + "macro_f1": 0.3272727429866791, + "num_tokens": 770338.0, + "repeat_count": 0.0, + "routers_loss": 0.10397060960531235, + "skip_count": 1.0, + "step": 478, + "text_loss": 1.0396177768707275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.2535955385970063, + "f1_execute": 0.8571429252624512, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000958, + "loss": 0.1099, + "macro_f1": 0.285714328289032, + "num_tokens": 773699.0, + "repeat_count": 2.0, + "routers_loss": 0.22604143619537354, + "skip_count": 4.0, + "step": 480, + "text_loss": 0.2570283114910126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.2629879659524508, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.146484375, + "learning_rate": 0.000962, + "loss": 0.0667, + "macro_f1": 0.32098767161369324, + "num_tokens": 777473.0, + "repeat_count": 0.0, + "routers_loss": 0.048258859664201736, + "skip_count": 1.0, + "step": 482, + "text_loss": 0.2540103495121002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2723803933078957, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000966, + "loss": 0.0592, + "macro_f1": 0.3333333432674408, + "num_tokens": 780833.0, + "repeat_count": 0.0, + "routers_loss": 0.023018671199679375, + "skip_count": 0.0, + "step": 484, + "text_loss": 0.38524550199508667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.28177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.314453125, + "learning_rate": 0.0009699999999999999, + "loss": 0.0709, + "macro_f1": 0.3272727429866791, + "num_tokens": 783656.0, + "repeat_count": 0.0, + "routers_loss": 0.044845327734947205, + "skip_count": 1.0, + "step": 486, + "text_loss": 0.5859048366546631 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000974, + "loss": 0.0615, + "macro_f1": 0.3333333432674408, + "num_tokens": 787173.0, + "repeat_count": 0.0, + "routers_loss": 0.010898692533373833, + "skip_count": 0.0, + "step": 488, + "text_loss": 0.3456067442893982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3005576753742294, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000978, + "loss": 0.0796, + "macro_f1": 0.32098764181137085, + "num_tokens": 790395.0, + "repeat_count": 0.0, + "routers_loss": 0.06497956812381744, + "skip_count": 2.0, + "step": 490, + "text_loss": 0.3751123249530792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3099501027296743, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000982, + "loss": 0.0772, + "macro_f1": 0.3272727429866791, + "num_tokens": 793137.0, + "repeat_count": 0.0, + "routers_loss": 0.07763728499412537, + "skip_count": 1.0, + "step": 492, + "text_loss": 0.43296709656715393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3193425300851187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009860000000000001, + "loss": 0.0819, + "macro_f1": 0.3333333432674408, + "num_tokens": 796497.0, + "repeat_count": 0.0, + "routers_loss": 0.02127906307578087, + "skip_count": 0.0, + "step": 494, + "text_loss": 0.4841311275959015 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3287349574405636, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.00099, + "loss": 0.073, + "macro_f1": 0.3272727429866791, + "num_tokens": 799361.0, + "repeat_count": 1.0, + "routers_loss": 0.09518691152334213, + "skip_count": 0.0, + "step": 496, + "text_loss": 0.5094487071037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.3381273847960085, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.130859375, + "learning_rate": 0.000994, + "loss": 0.0789, + "macro_f1": 0.5492662787437439, + "num_tokens": 802629.0, + "repeat_count": 0.0, + "routers_loss": 0.0563947930932045, + "skip_count": 2.0, + "step": 498, + "text_loss": 0.42783617973327637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.347519812151453, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.000998, + "loss": 0.0476, + "macro_f1": 0.3272727429866791, + "num_tokens": 805881.0, + "repeat_count": 1.0, + "routers_loss": 0.10570426285266876, + "skip_count": 0.0, + "step": 500, + "text_loss": 0.28395503759384155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.3569122395068973, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009999999760498814, + "loss": 0.0849, + "macro_f1": 0.5492662787437439, + "num_tokens": 809283.0, + "repeat_count": 0.0, + "routers_loss": 0.031202208250761032, + "skip_count": 2.0, + "step": 502, + "text_loss": 0.32970911264419556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.366304666862342, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009999997844489475, + "loss": 0.0574, + "macro_f1": 0.3272727429866791, + "num_tokens": 812440.0, + "repeat_count": 0.0, + "routers_loss": 0.07647835463285446, + "skip_count": 1.0, + "step": 504, + "text_loss": 0.4901447296142578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.375697094217787, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.000999999401247153, + "loss": 0.0668, + "macro_f1": 0.32098764181137085, + "num_tokens": 815716.0, + "repeat_count": 0.0, + "routers_loss": 0.08515176922082901, + "skip_count": 2.0, + "step": 506, + "text_loss": 0.6157599687576294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.0009999988264446445, + "loss": 0.0686, + "macro_f1": 0.3333333432674408, + "num_tokens": 819086.0, + "repeat_count": 0.0, + "routers_loss": 0.00946938619017601, + "skip_count": 0.0, + "step": 508, + "text_loss": 0.5053519010543823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3944819489286764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009999980600416424, + "loss": 0.0574, + "macro_f1": 0.3333333432674408, + "num_tokens": 822268.0, + "repeat_count": 0.0, + "routers_loss": 0.01058756373822689, + "skip_count": 0.0, + "step": 510, + "text_loss": 0.5570021867752075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.403874376284121, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.000999997102038441, + "loss": 0.0678, + "macro_f1": 0.3333333432674408, + "num_tokens": 825728.0, + "repeat_count": 0.0, + "routers_loss": 0.008705209009349346, + "skip_count": 0.0, + "step": 512, + "text_loss": 0.6519040465354919 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4132668036395657, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.0009999959524354064, + "loss": 0.083, + "macro_f1": 0.3272727429866791, + "num_tokens": 829459.0, + "repeat_count": 0.0, + "routers_loss": 0.04024193435907364, + "skip_count": 1.0, + "step": 514, + "text_loss": 0.5290043950080872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.00099999461123298, + "loss": 0.0727, + "macro_f1": 0.3333333432674408, + "num_tokens": 832291.0, + "repeat_count": 0.0, + "routers_loss": 0.015742862597107887, + "skip_count": 0.0, + "step": 516, + "text_loss": 0.7910057902336121 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.432051658350455, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.000999993078431675, + "loss": 0.0759, + "macro_f1": 0.3076923191547394, + "num_tokens": 835399.0, + "repeat_count": 1.0, + "routers_loss": 0.16753782331943512, + "skip_count": 3.0, + "step": 518, + "text_loss": 0.45196083188056946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.4414440857058994, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.236328125, + "learning_rate": 0.0009999913540320792, + "loss": 0.0968, + "macro_f1": 0.31446540355682373, + "num_tokens": 838993.0, + "repeat_count": 0.0, + "routers_loss": 0.09357143193483353, + "skip_count": 2.0, + "step": 520, + "text_loss": 0.5499435663223267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.4508365130613443, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2451171875, + "learning_rate": 0.0009999894380348536, + "loss": 0.0821, + "macro_f1": 0.5492662787437439, + "num_tokens": 842652.0, + "repeat_count": 0.0, + "routers_loss": 0.056803856045007706, + "skip_count": 2.0, + "step": 522, + "text_loss": 0.197520449757576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.4602289404167887, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.2333984375, + "learning_rate": 0.000999987330440732, + "loss": 0.0725, + "macro_f1": 0.4871794879436493, + "num_tokens": 847061.0, + "repeat_count": 0.0, + "routers_loss": 0.08962195366621017, + "skip_count": 3.0, + "step": 524, + "text_loss": 0.27509039640426636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4696213677722336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000999985031250522, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 850780.0, + "repeat_count": 0.0, + "routers_loss": 0.022930558770895004, + "skip_count": 0.0, + "step": 526, + "text_loss": 0.13291706144809723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4790137951276785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.0009999825404651053, + "loss": 0.0614, + "macro_f1": 0.3333333432674408, + "num_tokens": 853886.0, + "repeat_count": 0.0, + "routers_loss": 0.017097990959882736, + "skip_count": 0.0, + "step": 528, + "text_loss": 0.21706295013427734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.0009999798580854356, + "loss": 0.0724, + "macro_f1": 0.3333333432674408, + "num_tokens": 857364.0, + "repeat_count": 0.0, + "routers_loss": 0.02831801027059555, + "skip_count": 0.0, + "step": 530, + "text_loss": 0.9035662412643433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.000999976984112541, + "loss": 0.0674, + "macro_f1": 0.3333333432674408, + "num_tokens": 860661.0, + "repeat_count": 0.0, + "routers_loss": 0.019671892747282982, + "skip_count": 0.0, + "step": 532, + "text_loss": 0.8354863524436951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.5071910771940122, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.2890625, + "learning_rate": 0.0009999739185475231, + "loss": 0.0963, + "macro_f1": 0.47333335876464844, + "num_tokens": 864124.0, + "repeat_count": 2.0, + "routers_loss": 0.21383361518383026, + "skip_count": 3.0, + "step": 534, + "text_loss": 0.23422949016094208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.516583504549457, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0009999706613915565, + "loss": 0.0598, + "macro_f1": 0.32098767161369324, + "num_tokens": 866976.0, + "repeat_count": 0.0, + "routers_loss": 0.07158871740102768, + "skip_count": 1.0, + "step": 536, + "text_loss": 0.11800774186849594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5259759319049016, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.0009999672126458894, + "loss": 0.0822, + "macro_f1": 0.3272727429866791, + "num_tokens": 870549.0, + "repeat_count": 0.0, + "routers_loss": 0.08185924589633942, + "skip_count": 1.0, + "step": 538, + "text_loss": 0.19232480227947235 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5353683592603464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.000999963572311843, + "loss": 0.0604, + "macro_f1": 0.3333333432674408, + "num_tokens": 873733.0, + "repeat_count": 0.0, + "routers_loss": 0.01633382774889469, + "skip_count": 0.0, + "step": 540, + "text_loss": 0.3725031912326813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.544760786615791, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009999597403908128, + "loss": 0.0761, + "macro_f1": 0.3272727429866791, + "num_tokens": 877099.0, + "repeat_count": 0.0, + "routers_loss": 0.0782657191157341, + "skip_count": 1.0, + "step": 542, + "text_loss": 0.17589199542999268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.5541532139712357, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, + "learning_rate": 0.0009999557168842669, + "loss": 0.0716, + "macro_f1": 0.5492662787437439, + "num_tokens": 879883.0, + "repeat_count": 0.0, + "routers_loss": 0.05275818333029747, + "skip_count": 2.0, + "step": 544, + "text_loss": 0.26448264718055725 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.56354564132668, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0009999515017937468, + "loss": 0.071, + "macro_f1": 0.32098764181137085, + "num_tokens": 882223.0, + "repeat_count": 0.0, + "routers_loss": 0.09335892647504807, + "skip_count": 2.0, + "step": 546, + "text_loss": 0.208544060587883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.572938068682125, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.376953125, + "learning_rate": 0.0009999470951208684, + "loss": 0.0855, + "macro_f1": 0.32098764181137085, + "num_tokens": 885241.0, + "repeat_count": 2.0, + "routers_loss": 0.22983254492282867, + "skip_count": 0.0, + "step": 548, + "text_loss": 0.6612338423728943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.58233049603757, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.00099994249686732, + "loss": 0.0786, + "macro_f1": 0.3272727429866791, + "num_tokens": 887897.0, + "repeat_count": 1.0, + "routers_loss": 0.12858282029628754, + "skip_count": 0.0, + "step": 550, + "text_loss": 0.4673548936843872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5917229233930144, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009999377070348638, + "loss": 0.0944, + "macro_f1": 0.3333333432674408, + "num_tokens": 891224.0, + "repeat_count": 0.0, + "routers_loss": 0.017421770840883255, + "skip_count": 0.0, + "step": 552, + "text_loss": 0.6419258117675781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.601115350748459, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000999932725625335, + "loss": 0.0791, + "macro_f1": 0.32098764181137085, + "num_tokens": 894578.0, + "repeat_count": 0.0, + "routers_loss": 0.07890026271343231, + "skip_count": 2.0, + "step": 554, + "text_loss": 0.5970752239227295 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.6105077781039037, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.0009999275526406427, + "loss": 0.0796, + "macro_f1": 0.31446540355682373, + "num_tokens": 897145.0, + "repeat_count": 1.0, + "routers_loss": 0.09836960583925247, + "skip_count": 1.0, + "step": 556, + "text_loss": 0.752425491809845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6199002054593485, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1875, + "learning_rate": 0.0009999221880827693, + "loss": 0.0882, + "macro_f1": 0.3333333432674408, + "num_tokens": 900565.0, + "repeat_count": 0.0, + "routers_loss": 0.017694659531116486, + "skip_count": 0.0, + "step": 558, + "text_loss": 0.195619136095047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.629292632814793, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2021484375, + "learning_rate": 0.0009999166319537703, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 903506.0, + "repeat_count": 0.0, + "routers_loss": 0.019375264644622803, + "skip_count": 0.0, + "step": 560, + "text_loss": 0.4603337347507477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.638685060170238, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.146484375, + "learning_rate": 0.0009999108842557748, + "loss": 0.0953, + "macro_f1": 0.4871794879436493, + "num_tokens": 906380.0, + "repeat_count": 0.0, + "routers_loss": 0.12013207376003265, + "skip_count": 3.0, + "step": 562, + "text_loss": 0.6279402375221252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6480774875256823, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009999049449909854, + "loss": 0.0799, + "macro_f1": 0.3272727429866791, + "num_tokens": 909116.0, + "repeat_count": 0.0, + "routers_loss": 0.06441342830657959, + "skip_count": 1.0, + "step": 564, + "text_loss": 0.23741699755191803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.657469914881127, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009998988141616781, + "loss": 0.064, + "macro_f1": 0.32098767161369324, + "num_tokens": 912189.0, + "repeat_count": 0.0, + "routers_loss": 0.08309414982795715, + "skip_count": 1.0, + "step": 566, + "text_loss": 0.27780941128730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6668623422365716, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009998924917702023, + "loss": 0.0876, + "macro_f1": 0.3272727429866791, + "num_tokens": 916279.0, + "repeat_count": 1.0, + "routers_loss": 0.07197169959545135, + "skip_count": 0.0, + "step": 568, + "text_loss": 0.6371755599975586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6762547695920165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2255859375, + "learning_rate": 0.0009998859778189806, + "loss": 0.0706, + "macro_f1": 0.3333333432674408, + "num_tokens": 919490.0, + "repeat_count": 0.0, + "routers_loss": 0.008022273890674114, + "skip_count": 0.0, + "step": 570, + "text_loss": 0.6028938889503479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6856471969474613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.000999879272310509, + "loss": 0.084, + "macro_f1": 0.3333333432674408, + "num_tokens": 923694.0, + "repeat_count": 0.0, + "routers_loss": 0.01634674146771431, + "skip_count": 0.0, + "step": 572, + "text_loss": 0.7177054286003113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.0009998723752473574, + "loss": 0.0716, + "macro_f1": 0.3272727429866791, + "num_tokens": 926933.0, + "repeat_count": 0.0, + "routers_loss": 0.060559045523405075, + "skip_count": 1.0, + "step": 574, + "text_loss": 0.5203254818916321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.0009998652866321687, + "loss": 0.0801, + "macro_f1": 0.3333333432674408, + "num_tokens": 929832.0, + "repeat_count": 0.0, + "routers_loss": 0.011485611088573933, + "skip_count": 0.0, + "step": 576, + "text_loss": 0.6147452592849731 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.713824479013795, + "f1_execute": 0.8799999952316284, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.000999858006467659, + "loss": 0.0649, + "macro_f1": 0.29333335161209106, + "num_tokens": 933266.0, + "repeat_count": 2.0, + "routers_loss": 0.2929030954837799, + "skip_count": 4.0, + "step": 578, + "text_loss": 0.1720666140317917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.72321690636924, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.24609375, + "learning_rate": 0.0009998505347566186, + "loss": 0.0782, + "macro_f1": 0.32098764181137085, + "num_tokens": 937545.0, + "repeat_count": 0.0, + "routers_loss": 0.053780000656843185, + "skip_count": 2.0, + "step": 580, + "text_loss": 0.3258405327796936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7326093337246844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.00099984287150191, + "loss": 0.0582, + "macro_f1": 0.3333333432674408, + "num_tokens": 941001.0, + "repeat_count": 0.0, + "routers_loss": 0.02637636847794056, + "skip_count": 0.0, + "step": 582, + "text_loss": 0.23762771487236023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7420017610801293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009998350167064705, + "loss": 0.0672, + "macro_f1": 0.3333333432674408, + "num_tokens": 943989.0, + "repeat_count": 0.0, + "routers_loss": 0.01637580618262291, + "skip_count": 0.0, + "step": 584, + "text_loss": 0.7460582852363586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7513941884355737, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009998269703733096, + "loss": 0.0686, + "macro_f1": 0.3272727429866791, + "num_tokens": 947245.0, + "repeat_count": 1.0, + "routers_loss": 0.13934117555618286, + "skip_count": 0.0, + "step": 586, + "text_loss": 0.5284690260887146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7607866157910186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.13671875, + "learning_rate": 0.0009998187325055106, + "loss": 0.0667, + "macro_f1": 0.3333333432674408, + "num_tokens": 950116.0, + "repeat_count": 0.0, + "routers_loss": 0.02138397842645645, + "skip_count": 0.0, + "step": 588, + "text_loss": 0.3920256197452545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009998103031062305, + "loss": 0.0778, + "macro_f1": 0.3333333432674408, + "num_tokens": 953277.0, + "repeat_count": 0.0, + "routers_loss": 0.007098200265318155, + "skip_count": 0.0, + "step": 590, + "text_loss": 0.7472905516624451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.779571470501908, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.318359375, + "learning_rate": 0.0009998016821786994, + "loss": 0.0872, + "macro_f1": 0.32098764181137085, + "num_tokens": 958229.0, + "repeat_count": 1.0, + "routers_loss": 0.07946522533893585, + "skip_count": 1.0, + "step": 592, + "text_loss": 0.5506448745727539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7889638978573528, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.000999792869726221, + "loss": 0.0523, + "macro_f1": 0.3272727429866791, + "num_tokens": 961016.0, + "repeat_count": 0.0, + "routers_loss": 0.0850791186094284, + "skip_count": 1.0, + "step": 594, + "text_loss": 0.3824431002140045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009997838657521717, + "loss": 0.0632, + "macro_f1": 0.3333333432674408, + "num_tokens": 963847.0, + "repeat_count": 0.0, + "routers_loss": 0.016370445489883423, + "skip_count": 0.0, + "step": 596, + "text_loss": 0.2139475792646408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.8077487525682416, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009997746702600026, + "loss": 0.0702, + "macro_f1": 0.307692289352417, + "num_tokens": 966619.0, + "repeat_count": 0.0, + "routers_loss": 0.1310746818780899, + "skip_count": 3.0, + "step": 598, + "text_loss": 0.3651018440723419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8171411799236865, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23828125, + "learning_rate": 0.0009997652832532372, + "loss": 0.0792, + "macro_f1": 0.3272727429866791, + "num_tokens": 970418.0, + "repeat_count": 1.0, + "routers_loss": 0.14303378760814667, + "skip_count": 0.0, + "step": 600, + "text_loss": 0.7094736099243164 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8265336072791314, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009997557047354722, + "loss": 0.0531, + "macro_f1": 0.3272727429866791, + "num_tokens": 973491.0, + "repeat_count": 0.0, + "routers_loss": 0.03334212675690651, + "skip_count": 1.0, + "step": 602, + "text_loss": 0.4812237024307251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.835926034634576, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.0009997459347103783, + "loss": 0.0956, + "macro_f1": 0.3272727429866791, + "num_tokens": 976672.0, + "repeat_count": 0.0, + "routers_loss": 0.02831871062517166, + "skip_count": 0.0, + "step": 604, + "text_loss": 0.21737146377563477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8453184619900207, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009997359731816998, + "loss": 0.0646, + "macro_f1": 0.3333333432674408, + "num_tokens": 979898.0, + "repeat_count": 0.0, + "routers_loss": 0.017968013882637024, + "skip_count": 0.0, + "step": 606, + "text_loss": 0.5458008050918579 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.854710889345465, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.224609375, + "learning_rate": 0.0009997258201532536, + "loss": 0.0751, + "macro_f1": 0.3333333432674408, + "num_tokens": 982811.0, + "repeat_count": 0.0, + "routers_loss": 0.016256732866168022, + "skip_count": 0.0, + "step": 608, + "text_loss": 0.8643257021903992 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009997154756289303, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 985245.0, + "repeat_count": 0.0, + "routers_loss": 0.021214161068201065, + "skip_count": 0.0, + "step": 610, + "text_loss": 0.2204967886209488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8734957440563544, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.000999704939612694, + "loss": 0.0636, + "macro_f1": 0.3006536364555359, + "num_tokens": 988539.0, + "repeat_count": 3.0, + "routers_loss": 0.23249399662017822, + "skip_count": 2.0, + "step": 612, + "text_loss": 0.32489025592803955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8828881714117993, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009996942121085824, + "loss": 0.0445, + "macro_f1": 0.3333333432674408, + "num_tokens": 991660.0, + "repeat_count": 0.0, + "routers_loss": 0.010706410743296146, + "skip_count": 0.0, + "step": 614, + "text_loss": 0.4551754891872406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8922805987672437, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3671875, + "learning_rate": 0.000999683293120706, + "loss": 0.1016, + "macro_f1": 0.3333333432674408, + "num_tokens": 994828.0, + "repeat_count": 0.0, + "routers_loss": 0.006676184479147196, + "skip_count": 0.0, + "step": 616, + "text_loss": 0.6212068200111389 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9016730261226886, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.408203125, + "learning_rate": 0.0009996721826532491, + "loss": 0.0976, + "macro_f1": 0.3076923191547394, + "num_tokens": 997951.0, + "repeat_count": 2.0, + "routers_loss": 0.2148125320672989, + "skip_count": 2.0, + "step": 618, + "text_loss": 0.26514527201652527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.911065453478133, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1904296875, + "learning_rate": 0.000999660880710469, + "loss": 0.0909, + "macro_f1": 0.3333333432674408, + "num_tokens": 1001139.0, + "repeat_count": 0.0, + "routers_loss": 0.022332455962896347, + "skip_count": 0.0, + "step": 620, + "text_loss": 0.26131340861320496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.920457880833578, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009996493872966971, + "loss": 0.0732, + "macro_f1": 0.3272727429866791, + "num_tokens": 1003678.0, + "repeat_count": 1.0, + "routers_loss": 0.08348730951547623, + "skip_count": 0.0, + "step": 622, + "text_loss": 0.19151706993579865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009996377024163374, + "loss": 0.0822, + "macro_f1": 0.3333333432674408, + "num_tokens": 1007082.0, + "repeat_count": 0.0, + "routers_loss": 0.028577150776982307, + "skip_count": 0.0, + "step": 624, + "text_loss": 0.305387407541275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9392427355444672, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0009996258260738676, + "loss": 0.0892, + "macro_f1": 0.3272727429866791, + "num_tokens": 1010064.0, + "repeat_count": 1.0, + "routers_loss": 0.08312026411294937, + "skip_count": 0.0, + "step": 626, + "text_loss": 0.49436143040657043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9486351628999117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009996137582738388, + "loss": 0.0591, + "macro_f1": 0.3333333432674408, + "num_tokens": 1013462.0, + "repeat_count": 0.0, + "routers_loss": 0.013337327167391777, + "skip_count": 0.0, + "step": 628, + "text_loss": 0.6515294313430786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9580275902553566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000999601499020875, + "loss": 0.0537, + "macro_f1": 0.3333333432674408, + "num_tokens": 1016246.0, + "repeat_count": 0.0, + "routers_loss": 0.029126765206456184, + "skip_count": 0.0, + "step": 630, + "text_loss": 0.18834827840328217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9674200176108014, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009995890483196746, + "loss": 0.0602, + "macro_f1": 0.3272727429866791, + "num_tokens": 1019286.0, + "repeat_count": 0.0, + "routers_loss": 0.054844800382852554, + "skip_count": 1.0, + "step": 632, + "text_loss": 0.6988179087638855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.322265625, + "learning_rate": 0.0009995764061750086, + "loss": 0.0767, + "macro_f1": 0.3333333432674408, + "num_tokens": 1022207.0, + "repeat_count": 0.0, + "routers_loss": 0.010095693171024323, + "skip_count": 0.0, + "step": 634, + "text_loss": 0.558451771736145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9862048723216907, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.000999563572591721, + "loss": 0.0521, + "macro_f1": 0.32098764181137085, + "num_tokens": 1025319.0, + "repeat_count": 1.0, + "routers_loss": 0.0698433518409729, + "skip_count": 1.0, + "step": 636, + "text_loss": 0.5961872935295105 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.995597299677135, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009995505475747302, + "loss": 0.0849, + "macro_f1": 0.3272727429866791, + "num_tokens": 1028362.0, + "repeat_count": 0.0, + "routers_loss": 0.040211405605077744, + "skip_count": 1.0, + "step": 638, + "text_loss": 0.546863317489624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.004696213677722, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.0009995373311290272, + "loss": 0.0709, + "macro_f1": 0.3144654333591461, + "num_tokens": 1032199.0, + "repeat_count": 2.0, + "routers_loss": 0.1457643061876297, + "skip_count": 1.0, + "step": 640, + "text_loss": 0.2137298285961151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009995239232596764, + "loss": 0.0545, + "macro_f1": 0.3333333432674408, + "num_tokens": 1035801.0, + "repeat_count": 0.0, + "routers_loss": 0.011394930072128773, + "skip_count": 0.0, + "step": 642, + "text_loss": 0.43054503202438354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0234810683886115, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009995103239718163, + "loss": 0.0665, + "macro_f1": 0.3333333432674408, + "num_tokens": 1039223.0, + "repeat_count": 0.0, + "routers_loss": 0.00997432041913271, + "skip_count": 0.0, + "step": 644, + "text_loss": 0.7749615907669067 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0328734957440564, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009994965332706573, + "loss": 0.0755, + "macro_f1": 0.3144654333591461, + "num_tokens": 1042154.0, + "repeat_count": 3.0, + "routers_loss": 0.10589150339365005, + "skip_count": 0.0, + "step": 646, + "text_loss": 0.7812211513519287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.042265923099501, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.0009994825511614846, + "loss": 0.0383, + "macro_f1": 0.3272727429866791, + "num_tokens": 1045250.0, + "repeat_count": 0.0, + "routers_loss": 0.0748734176158905, + "skip_count": 1.0, + "step": 648, + "text_loss": 0.844803512096405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0516583504549457, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1220703125, + "learning_rate": 0.0009994683776496562, + "loss": 0.0433, + "macro_f1": 0.3272727429866791, + "num_tokens": 1048446.0, + "repeat_count": 0.0, + "routers_loss": 0.03742415830492973, + "skip_count": 1.0, + "step": 650, + "text_loss": 0.2098839282989502 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0610507778103906, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009994540127406034, + "loss": 0.0591, + "macro_f1": 0.32098764181137085, + "num_tokens": 1051840.0, + "repeat_count": 0.0, + "routers_loss": 0.06025516986846924, + "skip_count": 2.0, + "step": 652, + "text_loss": 0.27727583050727844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.070443205165835, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.181640625, + "learning_rate": 0.0009994394564398306, + "loss": 0.0519, + "macro_f1": 0.521541953086853, + "num_tokens": 1055142.0, + "repeat_count": 4.0, + "routers_loss": 0.22807340323925018, + "skip_count": 2.0, + "step": 654, + "text_loss": 0.9672397971153259 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009994247087529158, + "loss": 0.0618, + "macro_f1": 0.3333333432674408, + "num_tokens": 1057698.0, + "repeat_count": 0.0, + "routers_loss": 0.01348950993269682, + "skip_count": 0.0, + "step": 656, + "text_loss": 0.6375506520271301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.0009994097696855106, + "loss": 0.0412, + "macro_f1": 0.3333333432674408, + "num_tokens": 1060624.0, + "repeat_count": 0.0, + "routers_loss": 0.009649243205785751, + "skip_count": 0.0, + "step": 658, + "text_loss": 0.5315385460853577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.098620487232169, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2041015625, + "learning_rate": 0.0009993946392433395, + "loss": 0.0609, + "macro_f1": 0.307692289352417, + "num_tokens": 1065076.0, + "repeat_count": 0.0, + "routers_loss": 0.1250980943441391, + "skip_count": 3.0, + "step": 660, + "text_loss": 0.25780341029167175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1080129145876136, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009993793174322006, + "loss": 0.0471, + "macro_f1": 0.3333333432674408, + "num_tokens": 1068365.0, + "repeat_count": 0.0, + "routers_loss": 0.011544390581548214, + "skip_count": 0.0, + "step": 662, + "text_loss": 0.34876301884651184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1174053419430585, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009993638042579654, + "loss": 0.0473, + "macro_f1": 0.3272727429866791, + "num_tokens": 1071693.0, + "repeat_count": 0.0, + "routers_loss": 0.03777370601892471, + "skip_count": 1.0, + "step": 664, + "text_loss": 0.21811571717262268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.126797769298503, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.203125, + "learning_rate": 0.0009993480997265783, + "loss": 0.0475, + "macro_f1": 0.5492662787437439, + "num_tokens": 1074733.0, + "repeat_count": 0.0, + "routers_loss": 0.049949806183576584, + "skip_count": 2.0, + "step": 666, + "text_loss": 0.38410288095474243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.136190196653948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10302734375, + "learning_rate": 0.0009993322038440572, + "loss": 0.0605, + "macro_f1": 0.3333333432674408, + "num_tokens": 1077993.0, + "repeat_count": 0.0, + "routers_loss": 0.0247171800583601, + "skip_count": 0.0, + "step": 668, + "text_loss": 0.25576895475387573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1455826240093923, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.000999316116616494, + "loss": 0.0619, + "macro_f1": 0.3333333432674408, + "num_tokens": 1080491.0, + "repeat_count": 0.0, + "routers_loss": 0.008118715137243271, + "skip_count": 0.0, + "step": 670, + "text_loss": 0.6269792914390564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.154975051364837, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009992998380500527, + "loss": 0.0462, + "macro_f1": 0.3272727429866791, + "num_tokens": 1083817.0, + "repeat_count": 0.0, + "routers_loss": 0.03366057574748993, + "skip_count": 1.0, + "step": 672, + "text_loss": 0.26891493797302246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1643674787202816, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009992833681509716, + "loss": 0.0529, + "macro_f1": 0.3333333432674408, + "num_tokens": 1087368.0, + "repeat_count": 0.0, + "routers_loss": 0.020552074536681175, + "skip_count": 0.0, + "step": 674, + "text_loss": 0.14421936869621277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.1737599060757264, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.18359375, + "learning_rate": 0.0009992667069255619, + "loss": 0.0696, + "macro_f1": 0.31446540355682373, + "num_tokens": 1090452.0, + "repeat_count": 0.0, + "routers_loss": 0.06937336176633835, + "skip_count": 2.0, + "step": 676, + "text_loss": 0.24999259412288666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1831523334311713, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08740234375, + "learning_rate": 0.0009992498543802085, + "loss": 0.0588, + "macro_f1": 0.3272727429866791, + "num_tokens": 1093996.0, + "repeat_count": 1.0, + "routers_loss": 0.0380021296441555, + "skip_count": 0.0, + "step": 678, + "text_loss": 0.42473849654197693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 3.1925447607866158, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.2119140625, + "learning_rate": 0.0009992328105213688, + "loss": 0.0411, + "macro_f1": 0.4400000274181366, + "num_tokens": 1096837.0, + "repeat_count": 1.0, + "routers_loss": 0.20885063707828522, + "skip_count": 4.0, + "step": 680, + "text_loss": 0.3829527199268341 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.2019371881420606, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009992155753555747, + "loss": 0.0722, + "macro_f1": 0.5492662787437439, + "num_tokens": 1100320.0, + "repeat_count": 0.0, + "routers_loss": 0.018230699002742767, + "skip_count": 2.0, + "step": 682, + "text_loss": 0.6190969944000244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.211329615497505, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30859375, + "learning_rate": 0.0009991981488894303, + "loss": 0.0681, + "macro_f1": 0.32098767161369324, + "num_tokens": 1103682.0, + "repeat_count": 0.0, + "routers_loss": 0.05550144240260124, + "skip_count": 1.0, + "step": 684, + "text_loss": 0.44418027997016907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.22072204285295, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.0009991805311296133, + "loss": 0.0507, + "macro_f1": 0.32098764181137085, + "num_tokens": 1106427.0, + "repeat_count": 0.0, + "routers_loss": 0.07990608364343643, + "skip_count": 2.0, + "step": 686, + "text_loss": 0.5577231645584106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2301144702083944, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009991627220828753, + "loss": 0.0568, + "macro_f1": 0.32098764181137085, + "num_tokens": 1109314.0, + "repeat_count": 0.0, + "routers_loss": 0.05167485028505325, + "skip_count": 2.0, + "step": 688, + "text_loss": 0.27325430512428284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.2395068975638392, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009991447217560408, + "loss": 0.0521, + "macro_f1": 0.5492662787437439, + "num_tokens": 1112748.0, + "repeat_count": 0.0, + "routers_loss": 0.04621964320540428, + "skip_count": 2.0, + "step": 690, + "text_loss": 0.5288321375846863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.2488993249192837, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.000999126530156007, + "loss": 0.0499, + "macro_f1": 0.307692289352417, + "num_tokens": 1116965.0, + "repeat_count": 1.0, + "routers_loss": 0.11950276792049408, + "skip_count": 2.0, + "step": 692, + "text_loss": 0.14215624332427979 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2582917522747286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.0009991081472897454, + "loss": 0.0722, + "macro_f1": 0.3333333432674408, + "num_tokens": 1120570.0, + "repeat_count": 0.0, + "routers_loss": 0.01905500330030918, + "skip_count": 0.0, + "step": 694, + "text_loss": 0.41862696409225464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.267684179630173, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009990895731643002, + "loss": 0.0464, + "macro_f1": 0.3272727429866791, + "num_tokens": 1124009.0, + "repeat_count": 1.0, + "routers_loss": 0.06974572688341141, + "skip_count": 0.0, + "step": 696, + "text_loss": 0.41160130500793457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.277076606985618, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.000999070807786789, + "loss": 0.0531, + "macro_f1": 0.3272727429866791, + "num_tokens": 1127370.0, + "repeat_count": 1.0, + "routers_loss": 0.07055293023586273, + "skip_count": 0.0, + "step": 698, + "text_loss": 0.48068273067474365 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2864690343410627, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000999051851164403, + "loss": 0.0619, + "macro_f1": 0.32098764181137085, + "num_tokens": 1130234.0, + "repeat_count": 1.0, + "routers_loss": 0.12506946921348572, + "skip_count": 1.0, + "step": 700, + "text_loss": 0.47925490140914917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000999032703304406, + "loss": 0.0674, + "macro_f1": 0.3333333432674408, + "num_tokens": 1132874.0, + "repeat_count": 0.0, + "routers_loss": 0.00809287466108799, + "skip_count": 0.0, + "step": 702, + "text_loss": 0.47433632612228394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.305253889051952, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009990133642141358, + "loss": 0.0497, + "macro_f1": 0.5492662787437439, + "num_tokens": 1136011.0, + "repeat_count": 0.0, + "routers_loss": 0.0319170281291008, + "skip_count": 2.0, + "step": 704, + "text_loss": 0.6574832201004028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3146463164073965, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.33984375, + "learning_rate": 0.000998993833901003, + "loss": 0.0619, + "macro_f1": 0.32098764181137085, + "num_tokens": 1139674.0, + "repeat_count": 0.0, + "routers_loss": 0.09850362688302994, + "skip_count": 2.0, + "step": 706, + "text_loss": 0.7660127282142639 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3240387437628414, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009989741123724919, + "loss": 0.0574, + "macro_f1": 0.3333333432674408, + "num_tokens": 1143558.0, + "repeat_count": 0.0, + "routers_loss": 0.006673311349004507, + "skip_count": 0.0, + "step": 708, + "text_loss": 0.5976111888885498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009989541996361594, + "loss": 0.045, + "macro_f1": 0.3333333432674408, + "num_tokens": 1146122.0, + "repeat_count": 0.0, + "routers_loss": 0.004988791421055794, + "skip_count": 0.0, + "step": 710, + "text_loss": 0.5256119966506958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3428235984737307, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009989340956996367, + "loss": 0.0528, + "macro_f1": 0.3333333432674408, + "num_tokens": 1149546.0, + "repeat_count": 0.0, + "routers_loss": 0.0067769973538815975, + "skip_count": 0.0, + "step": 712, + "text_loss": 0.5040497779846191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.352216025829175, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.0009989138005706273, + "loss": 0.0735, + "macro_f1": 0.32098764181137085, + "num_tokens": 1153195.0, + "repeat_count": 0.0, + "routers_loss": 0.09899546951055527, + "skip_count": 2.0, + "step": 714, + "text_loss": 0.20803412795066833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.000998893314256908, + "loss": 0.064, + "macro_f1": 0.3333333432674408, + "num_tokens": 1157081.0, + "repeat_count": 0.0, + "routers_loss": 0.010492355562746525, + "skip_count": 0.0, + "step": 716, + "text_loss": 0.23077639937400818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3710008805400644, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009988726367663298, + "loss": 0.0539, + "macro_f1": 0.3333333432674408, + "num_tokens": 1160079.0, + "repeat_count": 0.0, + "routers_loss": 0.01063773687928915, + "skip_count": 0.0, + "step": 718, + "text_loss": 0.6085864901542664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3803933078955093, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009988517681068163, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1163249.0, + "repeat_count": 1.0, + "routers_loss": 0.05981874838471413, + "skip_count": 0.0, + "step": 720, + "text_loss": 0.4047050476074219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3897857352509537, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009988307082863638, + "loss": 0.0361, + "macro_f1": 0.3333333432674408, + "num_tokens": 1166259.0, + "repeat_count": 0.0, + "routers_loss": 0.009750043973326683, + "skip_count": 0.0, + "step": 722, + "text_loss": 0.5306474566459656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.3991781626063986, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.240234375, + "learning_rate": 0.0009988094573130434, + "loss": 0.063, + "macro_f1": 0.5359477400779724, + "num_tokens": 1168887.0, + "repeat_count": 2.0, + "routers_loss": 0.18601104617118835, + "skip_count": 2.0, + "step": 724, + "text_loss": 0.53528892993927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.408570589961843, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009987880151949974, + "loss": 0.0496, + "macro_f1": 0.3272727429866791, + "num_tokens": 1172625.0, + "repeat_count": 0.0, + "routers_loss": 0.02845010720193386, + "skip_count": 1.0, + "step": 726, + "text_loss": 0.4760453701019287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.417963017317288, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, + "learning_rate": 0.0009987663819404434, + "loss": 0.06, + "macro_f1": 0.5492662787437439, + "num_tokens": 1176580.0, + "repeat_count": 0.0, + "routers_loss": 0.017596980556845665, + "skip_count": 2.0, + "step": 728, + "text_loss": 0.5146099328994751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.427355444672733, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000998744557557671, + "loss": 0.0484, + "macro_f1": 0.3272727429866791, + "num_tokens": 1179804.0, + "repeat_count": 0.0, + "routers_loss": 0.0625474750995636, + "skip_count": 1.0, + "step": 730, + "text_loss": 0.27738022804260254 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.436747872028177, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.0009987225420550433, + "loss": 0.0796, + "macro_f1": 0.307692289352417, + "num_tokens": 1182658.0, + "repeat_count": 1.0, + "routers_loss": 0.16188351809978485, + "skip_count": 2.0, + "step": 732, + "text_loss": 0.23231445252895355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2001953125, + "learning_rate": 0.0009987003354409965, + "loss": 0.0626, + "macro_f1": 0.3272727429866791, + "num_tokens": 1185451.0, + "repeat_count": 0.0, + "routers_loss": 0.02391529455780983, + "skip_count": 0.0, + "step": 734, + "text_loss": 0.4496627151966095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.4555327267390665, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.234375, + "learning_rate": 0.0009986779377240405, + "loss": 0.0513, + "macro_f1": 0.32098767161369324, + "num_tokens": 1188666.0, + "repeat_count": 0.0, + "routers_loss": 0.08435963839292526, + "skip_count": 1.0, + "step": 736, + "text_loss": 0.4950787127017975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.4649251540945114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1220703125, + "learning_rate": 0.000998655348912758, + "loss": 0.0515, + "macro_f1": 0.3333333432674408, + "num_tokens": 1193035.0, + "repeat_count": 0.0, + "routers_loss": 0.01648722216486931, + "skip_count": 0.0, + "step": 738, + "text_loss": 0.24761848151683807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.0009986325690158051, + "loss": 0.0435, + "macro_f1": 0.3333333432674408, + "num_tokens": 1196840.0, + "repeat_count": 0.0, + "routers_loss": 0.013143910095095634, + "skip_count": 0.0, + "step": 740, + "text_loss": 0.15662719309329987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.4837100088054007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009986095980419113, + "loss": 0.0757, + "macro_f1": 0.3333333432674408, + "num_tokens": 1200573.0, + "repeat_count": 0.0, + "routers_loss": 0.026706280186772346, + "skip_count": 0.0, + "step": 742, + "text_loss": 0.16725164651870728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.493102436160845, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1982421875, + "learning_rate": 0.0009985864359998787, + "loss": 0.0795, + "macro_f1": 0.3006536364555359, + "num_tokens": 1203589.0, + "repeat_count": 2.0, + "routers_loss": 0.28607678413391113, + "skip_count": 3.0, + "step": 744, + "text_loss": 0.6350882053375244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009985630828985835, + "loss": 0.0572, + "macro_f1": 0.3272727429866791, + "num_tokens": 1206422.0, + "repeat_count": 0.0, + "routers_loss": 0.05685260891914368, + "skip_count": 1.0, + "step": 746, + "text_loss": 0.33779552578926086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.5118872908717345, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009985395387469742, + "loss": 0.0458, + "macro_f1": 0.5492662787437439, + "num_tokens": 1211588.0, + "repeat_count": 0.0, + "routers_loss": 0.0437830351293087, + "skip_count": 2.0, + "step": 748, + "text_loss": 0.28664472699165344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5212797182271793, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009985158035540735, + "loss": 0.0714, + "macro_f1": 0.32098764181137085, + "num_tokens": 1214580.0, + "repeat_count": 2.0, + "routers_loss": 0.07074898481369019, + "skip_count": 0.0, + "step": 750, + "text_loss": 0.3939313292503357 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.0009984918773289762, + "loss": 0.0699, + "macro_f1": 0.3333333432674408, + "num_tokens": 1217388.0, + "repeat_count": 0.0, + "routers_loss": 0.009757856838405132, + "skip_count": 0.0, + "step": 752, + "text_loss": 0.37641215324401855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5400645729380686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009984677600808512, + "loss": 0.054, + "macro_f1": 0.3333333432674408, + "num_tokens": 1219960.0, + "repeat_count": 0.0, + "routers_loss": 0.02515069581568241, + "skip_count": 0.0, + "step": 754, + "text_loss": 0.155938982963562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5494570002935135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30078125, + "learning_rate": 0.0009984434518189405, + "loss": 0.0764, + "macro_f1": 0.3333333432674408, + "num_tokens": 1223234.0, + "repeat_count": 0.0, + "routers_loss": 0.025766927748918533, + "skip_count": 0.0, + "step": 756, + "text_loss": 0.691118061542511 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 3.558849427648958, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009984189525525584, + "loss": 0.0451, + "macro_f1": 0.5359477400779724, + "num_tokens": 1225764.0, + "repeat_count": 2.0, + "routers_loss": 0.1782722771167755, + "skip_count": 2.0, + "step": 758, + "text_loss": 0.3592209219932556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.568241855004403, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.0009983942622910935, + "loss": 0.0659, + "macro_f1": 0.3333333432674408, + "num_tokens": 1230097.0, + "repeat_count": 0.0, + "routers_loss": 0.00825568474829197, + "skip_count": 0.0, + "step": 760, + "text_loss": 0.4646475315093994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5776342823598473, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009983693810440074, + "loss": 0.0477, + "macro_f1": 0.32098764181137085, + "num_tokens": 1233140.0, + "repeat_count": 0.0, + "routers_loss": 0.04156976938247681, + "skip_count": 2.0, + "step": 762, + "text_loss": 0.298682302236557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.587026709715292, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3515625, + "learning_rate": 0.000998344308820834, + "loss": 0.0666, + "macro_f1": 0.3272727429866791, + "num_tokens": 1236305.0, + "repeat_count": 0.0, + "routers_loss": 0.05697929114103317, + "skip_count": 1.0, + "step": 764, + "text_loss": 0.5249121189117432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5964191370707366, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.18359375, + "learning_rate": 0.0009983190456311817, + "loss": 0.0592, + "macro_f1": 0.3144654333591461, + "num_tokens": 1239673.0, + "repeat_count": 0.0, + "routers_loss": 0.09547408670186996, + "skip_count": 3.0, + "step": 766, + "text_loss": 0.41277334094047546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.6058115644261814, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.185546875, + "learning_rate": 0.000998293591484731, + "loss": 0.0484, + "macro_f1": 0.5492662787437439, + "num_tokens": 1242292.0, + "repeat_count": 0.0, + "routers_loss": 0.030693158507347107, + "skip_count": 2.0, + "step": 768, + "text_loss": 0.1583656519651413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000998267946391236, + "loss": 0.051, + "macro_f1": 0.3333333432674408, + "num_tokens": 1244661.0, + "repeat_count": 0.0, + "routers_loss": 0.01211300864815712, + "skip_count": 0.0, + "step": 770, + "text_loss": 0.4629349112510681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6245964191370708, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009982421103605238, + "loss": 0.0441, + "macro_f1": 0.32098764181137085, + "num_tokens": 1248688.0, + "repeat_count": 0.0, + "routers_loss": 0.0665968507528305, + "skip_count": 2.0, + "step": 772, + "text_loss": 0.4019293785095215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6339888464925156, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.000998216083402495, + "loss": 0.0613, + "macro_f1": 0.32098764181137085, + "num_tokens": 1251395.0, + "repeat_count": 0.0, + "routers_loss": 0.07186859846115112, + "skip_count": 2.0, + "step": 774, + "text_loss": 0.4659276604652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.302734375, + "learning_rate": 0.0009981898655271235, + "loss": 0.0488, + "macro_f1": 0.3333333432674408, + "num_tokens": 1254888.0, + "repeat_count": 0.0, + "routers_loss": 0.007823926396667957, + "skip_count": 0.0, + "step": 776, + "text_loss": 0.5160359740257263 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 3.6527737012034045, + "f1_execute": 0.9130434989929199, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009981634567444557, + "loss": 0.0775, + "macro_f1": 0.590062141418457, + "num_tokens": 1258250.0, + "repeat_count": 3.0, + "routers_loss": 0.24624499678611755, + "skip_count": 4.0, + "step": 778, + "text_loss": 0.29319918155670166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6621661285588494, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.0009981368570646115, + "loss": 0.0885, + "macro_f1": 0.3272727429866791, + "num_tokens": 1260916.0, + "repeat_count": 0.0, + "routers_loss": 0.030730176717042923, + "skip_count": 1.0, + "step": 780, + "text_loss": 0.624981164932251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6715585559142943, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009981100664977838, + "loss": 0.0699, + "macro_f1": 0.3333333432674408, + "num_tokens": 1264004.0, + "repeat_count": 0.0, + "routers_loss": 0.006829176563769579, + "skip_count": 0.0, + "step": 782, + "text_loss": 0.6137266159057617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6809509832697387, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009980830850542391, + "loss": 0.058, + "macro_f1": 0.3333333432674408, + "num_tokens": 1267130.0, + "repeat_count": 0.0, + "routers_loss": 0.018471000716090202, + "skip_count": 0.0, + "step": 784, + "text_loss": 0.15213175117969513 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6903434106251836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.0009980559127443166, + "loss": 0.052, + "macro_f1": 0.3333333432674408, + "num_tokens": 1271129.0, + "repeat_count": 0.0, + "routers_loss": 0.007903140969574451, + "skip_count": 0.0, + "step": 786, + "text_loss": 0.5768613219261169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.699735837980628, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.000998028549578429, + "loss": 0.0719, + "macro_f1": 0.307692289352417, + "num_tokens": 1274232.0, + "repeat_count": 0.0, + "routers_loss": 0.06737866252660751, + "skip_count": 3.0, + "step": 788, + "text_loss": 0.2877073585987091 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.709128265336073, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009980009955670615, + "loss": 0.0698, + "macro_f1": 0.3144654333591461, + "num_tokens": 1277193.0, + "repeat_count": 0.0, + "routers_loss": 0.10194934904575348, + "skip_count": 3.0, + "step": 790, + "text_loss": 0.11860492825508118 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7185206926915173, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.000997973250720773, + "loss": 0.0552, + "macro_f1": 0.32098764181137085, + "num_tokens": 1280960.0, + "repeat_count": 0.0, + "routers_loss": 0.10297708213329315, + "skip_count": 2.0, + "step": 792, + "text_loss": 0.13477706909179688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.727913120046962, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009979453150501954, + "loss": 0.0663, + "macro_f1": 0.32098764181137085, + "num_tokens": 1284611.0, + "repeat_count": 1.0, + "routers_loss": 0.06122037023305893, + "skip_count": 1.0, + "step": 794, + "text_loss": 0.40569379925727844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.737305547402407, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000997917188566034, + "loss": 0.062, + "macro_f1": 0.32098764181137085, + "num_tokens": 1287834.0, + "repeat_count": 0.0, + "routers_loss": 0.061135001480579376, + "skip_count": 2.0, + "step": 796, + "text_loss": 0.2829287648200989 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7466979747578515, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.109375, + "learning_rate": 0.0009978888712790664, + "loss": 0.0654, + "macro_f1": 0.3272727429866791, + "num_tokens": 1291666.0, + "repeat_count": 0.0, + "routers_loss": 0.04841872677206993, + "skip_count": 1.0, + "step": 798, + "text_loss": 1.011757254600525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.20000000298023224, + "avg_layers": 27.0, + "epoch": 3.756090402113296, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.3333333134651184, + "grad_norm": 0.14453125, + "learning_rate": 0.0009978603632001444, + "loss": 0.0636, + "macro_f1": 0.4104308485984802, + "num_tokens": 1294627.0, + "repeat_count": 1.0, + "routers_loss": 0.15698759257793427, + "skip_count": 5.0, + "step": 800, + "text_loss": 0.4457623362541199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0009978316643401916, + "loss": 0.0688, + "macro_f1": 0.3333333432674408, + "num_tokens": 1297711.0, + "repeat_count": 0.0, + "routers_loss": 0.018952010199427605, + "skip_count": 0.0, + "step": 802, + "text_loss": 0.2069481462240219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7748752568241857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.14453125, + "learning_rate": 0.0009978027747102062, + "loss": 0.0479, + "macro_f1": 0.3333333432674408, + "num_tokens": 1300569.0, + "repeat_count": 0.0, + "routers_loss": 0.014538386836647987, + "skip_count": 0.0, + "step": 804, + "text_loss": 0.4983852505683899 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.78426768417963, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2109375, + "learning_rate": 0.0009977736943212584, + "loss": 0.0721, + "macro_f1": 0.32098764181137085, + "num_tokens": 1303969.0, + "repeat_count": 0.0, + "routers_loss": 0.11164087057113647, + "skip_count": 2.0, + "step": 806, + "text_loss": 0.2910642921924591 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.793660111535075, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.000997744423184492, + "loss": 0.0424, + "macro_f1": 0.3272727429866791, + "num_tokens": 1307263.0, + "repeat_count": 0.0, + "routers_loss": 0.06073406711220741, + "skip_count": 1.0, + "step": 808, + "text_loss": 0.18831779062747955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 3.8030525388905194, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.26171875, + "learning_rate": 0.0009977149613111236, + "loss": 0.0486, + "macro_f1": 0.4400000274181366, + "num_tokens": 1309953.0, + "repeat_count": 1.0, + "routers_loss": 0.11035524308681488, + "skip_count": 4.0, + "step": 810, + "text_loss": 0.7872759699821472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8124449662459643, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009976853087124433, + "loss": 0.0536, + "macro_f1": 0.3333333432674408, + "num_tokens": 1313243.0, + "repeat_count": 0.0, + "routers_loss": 0.021804286167025566, + "skip_count": 0.0, + "step": 812, + "text_loss": 0.22349292039871216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.8218373936014087, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0009976554653998138, + "loss": 0.0612, + "macro_f1": 0.31446540355682373, + "num_tokens": 1316165.0, + "repeat_count": 0.0, + "routers_loss": 0.10715524107217789, + "skip_count": 2.0, + "step": 814, + "text_loss": 0.18035532534122467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8312298209568536, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000997625431384671, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1319206.0, + "repeat_count": 0.0, + "routers_loss": 0.007173649035394192, + "skip_count": 0.0, + "step": 816, + "text_loss": 0.48928648233413696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8406222483122985, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009975952066785243, + "loss": 0.0655, + "macro_f1": 0.3006536364555359, + "num_tokens": 1322549.0, + "repeat_count": 1.0, + "routers_loss": 0.22308112680912018, + "skip_count": 4.0, + "step": 818, + "text_loss": 0.5211259722709656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009975647912929557, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1325213.0, + "repeat_count": 0.0, + "routers_loss": 0.00998698640614748, + "skip_count": 0.0, + "step": 820, + "text_loss": 0.7117052674293518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8594071030231873, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009975341852396205, + "loss": 0.0723, + "macro_f1": 0.32098764181137085, + "num_tokens": 1328383.0, + "repeat_count": 0.0, + "routers_loss": 0.07454588264226913, + "skip_count": 2.0, + "step": 822, + "text_loss": 0.34539610147476196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8687995303786322, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009975033885302469, + "loss": 0.0604, + "macro_f1": 0.3333333432674408, + "num_tokens": 1331406.0, + "repeat_count": 0.0, + "routers_loss": 0.009157589636743069, + "skip_count": 0.0, + "step": 824, + "text_loss": 0.7484824657440186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.878191957734077, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009974724011766363, + "loss": 0.0474, + "macro_f1": 0.3272727429866791, + "num_tokens": 1334410.0, + "repeat_count": 1.0, + "routers_loss": 0.17149391770362854, + "skip_count": 0.0, + "step": 826, + "text_loss": 0.5913820266723633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8875843850895215, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009974412231906632, + "loss": 0.058, + "macro_f1": 0.32098764181137085, + "num_tokens": 1337653.0, + "repeat_count": 1.0, + "routers_loss": 0.09743282198905945, + "skip_count": 1.0, + "step": 828, + "text_loss": 0.2505693733692169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8969768124449664, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009974098545842748, + "loss": 0.0638, + "macro_f1": 0.3272727429866791, + "num_tokens": 1340860.0, + "repeat_count": 0.0, + "routers_loss": 0.041490405797958374, + "skip_count": 1.0, + "step": 830, + "text_loss": 0.5585370063781738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.906369239800411, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009973782953694918, + "loss": 0.0746, + "macro_f1": 0.3006536066532135, + "num_tokens": 1344232.0, + "repeat_count": 1.0, + "routers_loss": 0.16080693900585175, + "skip_count": 3.0, + "step": 832, + "text_loss": 0.4782734513282776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9157616671558557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.000997346545558408, + "loss": 0.0522, + "macro_f1": 0.3333333432674408, + "num_tokens": 1347667.0, + "repeat_count": 0.0, + "routers_loss": 0.01173500344157219, + "skip_count": 0.0, + "step": 834, + "text_loss": 0.25036177039146423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009973146051631895, + "loss": 0.0522, + "macro_f1": 0.3333333432674408, + "num_tokens": 1350707.0, + "repeat_count": 0.0, + "routers_loss": 0.011477196589112282, + "skip_count": 0.0, + "step": 836, + "text_loss": 0.5482863187789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009972824741960764, + "loss": 0.0536, + "macro_f1": 0.3333333432674408, + "num_tokens": 1353704.0, + "repeat_count": 0.0, + "routers_loss": 0.010528896935284138, + "skip_count": 0.0, + "step": 838, + "text_loss": 0.6732596158981323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9439389492221895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1181640625, + "learning_rate": 0.000997250152669381, + "loss": 0.0573, + "macro_f1": 0.3333333432674408, + "num_tokens": 1356608.0, + "repeat_count": 0.0, + "routers_loss": 0.010678744874894619, + "skip_count": 0.0, + "step": 840, + "text_loss": 0.5479338765144348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9533313765776343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.181640625, + "learning_rate": 0.000997217640595489, + "loss": 0.0631, + "macro_f1": 0.3333333432674408, + "num_tokens": 1359809.0, + "repeat_count": 0.0, + "routers_loss": 0.00835978239774704, + "skip_count": 0.0, + "step": 842, + "text_loss": 0.42543259263038635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9627238039330788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009971849379868593, + "loss": 0.0653, + "macro_f1": 0.3333333432674408, + "num_tokens": 1362201.0, + "repeat_count": 0.0, + "routers_loss": 0.009930923581123352, + "skip_count": 0.0, + "step": 844, + "text_loss": 0.720462441444397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9721162312885236, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009971520448560235, + "loss": 0.0615, + "macro_f1": 0.3272727429866791, + "num_tokens": 1365790.0, + "repeat_count": 0.0, + "routers_loss": 0.06344373524188995, + "skip_count": 1.0, + "step": 846, + "text_loss": 0.8423607349395752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 3.9815086586439685, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.16796875, + "learning_rate": 0.000997118961215586, + "loss": 0.0674, + "macro_f1": 0.4533333480358124, + "num_tokens": 1368387.0, + "repeat_count": 1.0, + "routers_loss": 0.14688406884670258, + "skip_count": 3.0, + "step": 848, + "text_loss": 0.3933577537536621 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000997085687078225, + "loss": 0.0518, + "macro_f1": 0.3333333432674408, + "num_tokens": 1371189.0, + "repeat_count": 0.0, + "routers_loss": 0.009953443892300129, + "skip_count": 0.0, + "step": 850, + "text_loss": 0.41469162702560425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.0, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009970522224566909, + "loss": 0.0555, + "macro_f1": 0.32098767161369324, + "num_tokens": 1374008.0, + "repeat_count": 0.0, + "routers_loss": 0.048870690166950226, + "skip_count": 1.0, + "step": 852, + "text_loss": 0.613615870475769 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.009392427355444, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0009970185673638075, + "loss": 0.0629, + "macro_f1": 0.32098764181137085, + "num_tokens": 1376662.0, + "repeat_count": 1.0, + "routers_loss": 0.06865929812192917, + "skip_count": 1.0, + "step": 854, + "text_loss": 0.4392736256122589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 4.01878485471089, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.162109375, + "learning_rate": 0.0009969847218124716, + "loss": 0.0506, + "macro_f1": 0.5492662787437439, + "num_tokens": 1380049.0, + "repeat_count": 0.0, + "routers_loss": 0.02382219396531582, + "skip_count": 1.0, + "step": 856, + "text_loss": 0.19115346670150757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.028177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009969506858156527, + "loss": 0.0344, + "macro_f1": 0.3272727429866791, + "num_tokens": 1383008.0, + "repeat_count": 0.0, + "routers_loss": 0.03907281160354614, + "skip_count": 1.0, + "step": 858, + "text_loss": 0.34842637181282043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.037569709421779, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12060546875, + "learning_rate": 0.0009969164593863935, + "loss": 0.0365, + "macro_f1": 0.3333333432674408, + "num_tokens": 1387051.0, + "repeat_count": 0.0, + "routers_loss": 0.007645803038030863, + "skip_count": 0.0, + "step": 860, + "text_loss": 0.3810436725616455 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.046962136777223, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009968820425378098, + "loss": 0.0463, + "macro_f1": 0.3272727429866791, + "num_tokens": 1390244.0, + "repeat_count": 1.0, + "routers_loss": 0.04435238987207413, + "skip_count": 0.0, + "step": 862, + "text_loss": 0.34853485226631165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.056354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28515625, + "learning_rate": 0.00099684743528309, + "loss": 0.0424, + "macro_f1": 0.3333333432674408, + "num_tokens": 1392976.0, + "repeat_count": 0.0, + "routers_loss": 0.006071661598980427, + "skip_count": 0.0, + "step": 864, + "text_loss": 0.6395178437232971 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.065746991488113, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009968126376354958, + "loss": 0.0477, + "macro_f1": 0.5492662787437439, + "num_tokens": 1396061.0, + "repeat_count": 0.0, + "routers_loss": 0.05011235550045967, + "skip_count": 2.0, + "step": 866, + "text_loss": 0.09103966504335403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.075139418843557, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009967776496083616, + "loss": 0.0509, + "macro_f1": 0.3272727429866791, + "num_tokens": 1398993.0, + "repeat_count": 1.0, + "routers_loss": 0.03979124873876572, + "skip_count": 0.0, + "step": 868, + "text_loss": 0.27257058024406433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.084531846199002, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.14453125, + "learning_rate": 0.000996742471215095, + "loss": 0.0516, + "macro_f1": 0.5492662787437439, + "num_tokens": 1402080.0, + "repeat_count": 0.0, + "routers_loss": 0.030823837965726852, + "skip_count": 2.0, + "step": 870, + "text_loss": 0.7047103047370911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.093924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009967071024691763, + "loss": 0.0461, + "macro_f1": 0.3333333432674408, + "num_tokens": 1404890.0, + "repeat_count": 0.0, + "routers_loss": 0.009721715934574604, + "skip_count": 0.0, + "step": 872, + "text_loss": 0.959106981754303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.000996671543384159, + "loss": 0.05, + "macro_f1": 0.3333333432674408, + "num_tokens": 1407853.0, + "repeat_count": 0.0, + "routers_loss": 0.006025883834809065, + "skip_count": 0.0, + "step": 874, + "text_loss": 0.47571972012519836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.112709128265336, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.0009966357939736692, + "loss": 0.0416, + "macro_f1": 0.3272727429866791, + "num_tokens": 1410723.0, + "repeat_count": 0.0, + "routers_loss": 0.025964925065636635, + "skip_count": 0.0, + "step": 876, + "text_loss": 0.4964611530303955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.122101555620781, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009965998542514065, + "loss": 0.0415, + "macro_f1": 0.32098764181137085, + "num_tokens": 1414008.0, + "repeat_count": 0.0, + "routers_loss": 0.09509637206792831, + "skip_count": 2.0, + "step": 878, + "text_loss": 0.621494710445404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 4.131493982976226, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009965637242311427, + "loss": 0.0472, + "macro_f1": 0.542222261428833, + "num_tokens": 1417447.0, + "repeat_count": 0.0, + "routers_loss": 0.02520318515598774, + "skip_count": 4.0, + "step": 880, + "text_loss": 0.40209758281707764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 25.0, + "epoch": 4.14088641033167, + "f1_execute": 0.936170220375061, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.263671875, + "learning_rate": 0.000996527403926723, + "loss": 0.0495, + "macro_f1": 0.5342789888381958, + "num_tokens": 1419905.0, + "repeat_count": 0.0, + "routers_loss": 0.13183781504631042, + "skip_count": 6.0, + "step": 882, + "text_loss": 0.642185389995575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.1502788376871145, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009964908933520655, + "loss": 0.0375, + "macro_f1": 0.3333333432674408, + "num_tokens": 1423436.0, + "repeat_count": 0.0, + "routers_loss": 0.009429510682821274, + "skip_count": 0.0, + "step": 884, + "text_loss": 0.48232755064964294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.15967126504256, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1669921875, + "learning_rate": 0.0009964541925211613, + "loss": 0.0349, + "macro_f1": 0.32098764181137085, + "num_tokens": 1426842.0, + "repeat_count": 0.0, + "routers_loss": 0.07629609107971191, + "skip_count": 2.0, + "step": 886, + "text_loss": 0.16620934009552002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.169063692398004, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009964173014480738, + "loss": 0.0348, + "macro_f1": 0.5492662787437439, + "num_tokens": 1430430.0, + "repeat_count": 0.0, + "routers_loss": 0.036814019083976746, + "skip_count": 2.0, + "step": 888, + "text_loss": 0.4866008758544922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009963802201469398, + "loss": 0.0476, + "macro_f1": 0.3333333432674408, + "num_tokens": 1433821.0, + "repeat_count": 0.0, + "routers_loss": 0.0041250260546803474, + "skip_count": 0.0, + "step": 890, + "text_loss": 0.578216552734375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.187848547108893, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2373046875, + "learning_rate": 0.0009963429486319693, + "loss": 0.0463, + "macro_f1": 0.32098764181137085, + "num_tokens": 1436976.0, + "repeat_count": 0.0, + "routers_loss": 0.06213559955358505, + "skip_count": 2.0, + "step": 892, + "text_loss": 0.221701517701149 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 4.197240974464338, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.361328125, + "learning_rate": 0.0009963054869174446, + "loss": 0.0313, + "macro_f1": 0.4871794879436493, + "num_tokens": 1440397.0, + "repeat_count": 0.0, + "routers_loss": 0.07532428950071335, + "skip_count": 2.0, + "step": 894, + "text_loss": 0.6922838091850281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.206633401819783, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009962678350177209, + "loss": 0.0472, + "macro_f1": 0.3272727429866791, + "num_tokens": 1443604.0, + "repeat_count": 0.0, + "routers_loss": 0.0419243648648262, + "skip_count": 1.0, + "step": 896, + "text_loss": 0.22092342376708984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.216025829175227, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009962299929472268, + "loss": 0.034, + "macro_f1": 0.32098764181137085, + "num_tokens": 1446257.0, + "repeat_count": 2.0, + "routers_loss": 0.10849297791719437, + "skip_count": 0.0, + "step": 898, + "text_loss": 0.26394811272621155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.000996191960720463, + "loss": 0.0394, + "macro_f1": 0.3333333432674408, + "num_tokens": 1449669.0, + "repeat_count": 0.0, + "routers_loss": 0.0092767970636487, + "skip_count": 0.0, + "step": 900, + "text_loss": 0.5338577628135681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.234810683886117, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009961537383520042, + "loss": 0.0354, + "macro_f1": 0.3272727429866791, + "num_tokens": 1452450.0, + "repeat_count": 1.0, + "routers_loss": 0.02985367365181446, + "skip_count": 0.0, + "step": 902, + "text_loss": 0.5875228047370911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.2442031112415615, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009961153258564966, + "loss": 0.0378, + "macro_f1": 0.3144654333591461, + "num_tokens": 1456909.0, + "repeat_count": 0.0, + "routers_loss": 0.06794842332601547, + "skip_count": 3.0, + "step": 904, + "text_loss": 0.40959444642066956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.253595538597006, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009960767232486604, + "loss": 0.0476, + "macro_f1": 0.3333333432674408, + "num_tokens": 1461712.0, + "repeat_count": 0.0, + "routers_loss": 0.0023562447167932987, + "skip_count": 0.0, + "step": 906, + "text_loss": 0.3932875096797943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.000996037930543288, + "loss": 0.0505, + "macro_f1": 0.3272727429866791, + "num_tokens": 1464817.0, + "repeat_count": 0.0, + "routers_loss": 0.03880339860916138, + "skip_count": 1.0, + "step": 908, + "text_loss": 0.17482402920722961 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.272380393307896, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2119140625, + "learning_rate": 0.000995998947755245, + "loss": 0.0479, + "macro_f1": 0.3272727429866791, + "num_tokens": 1467810.0, + "repeat_count": 0.0, + "routers_loss": 0.01736828312277794, + "skip_count": 1.0, + "step": 910, + "text_loss": 0.4140470325946808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009959597748994695, + "loss": 0.0752, + "macro_f1": 0.3333333432674408, + "num_tokens": 1470802.0, + "repeat_count": 0.0, + "routers_loss": 0.011824851855635643, + "skip_count": 0.0, + "step": 912, + "text_loss": 0.7153383493423462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.2911652480187845, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009959204119909726, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1474539.0, + "repeat_count": 0.0, + "routers_loss": 0.025456594303250313, + "skip_count": 0.0, + "step": 914, + "text_loss": 0.42812058329582214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009958808590448385, + "loss": 0.0489, + "macro_f1": 0.3333333432674408, + "num_tokens": 1477552.0, + "repeat_count": 0.0, + "routers_loss": 0.006795851048082113, + "skip_count": 0.0, + "step": 916, + "text_loss": 0.5402814149856567 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009958411160762234, + "loss": 0.039, + "macro_f1": 0.3333333432674408, + "num_tokens": 1482547.0, + "repeat_count": 0.0, + "routers_loss": 0.015615932643413544, + "skip_count": 0.0, + "step": 918, + "text_loss": 0.3836168050765991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.319342530085119, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009958011831003577, + "loss": 0.0448, + "macro_f1": 0.3272727429866791, + "num_tokens": 1485807.0, + "repeat_count": 0.0, + "routers_loss": 0.043541423976421356, + "skip_count": 1.0, + "step": 920, + "text_loss": 0.4333936274051666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 4.328734957440563, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1337890625, + "learning_rate": 0.000995761060132543, + "loss": 0.0418, + "macro_f1": 0.6538461446762085, + "num_tokens": 1488941.0, + "repeat_count": 1.0, + "routers_loss": 0.05866432189941406, + "skip_count": 2.0, + "step": 922, + "text_loss": 0.4106994867324829 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.3381273847960085, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009957207471881552, + "loss": 0.0531, + "macro_f1": 0.5492662787437439, + "num_tokens": 1492026.0, + "repeat_count": 0.0, + "routers_loss": 0.02714901603758335, + "skip_count": 2.0, + "step": 924, + "text_loss": 0.542091429233551 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.347519812151453, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.0009956802442826415, + "loss": 0.0386, + "macro_f1": 0.3272727429866791, + "num_tokens": 1494543.0, + "repeat_count": 1.0, + "routers_loss": 0.0563737191259861, + "skip_count": 0.0, + "step": 926, + "text_loss": 0.47209203243255615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.356912239506897, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009956395514315235, + "loss": 0.0496, + "macro_f1": 0.3272727429866791, + "num_tokens": 1497831.0, + "repeat_count": 1.0, + "routers_loss": 0.03285066783428192, + "skip_count": 0.0, + "step": 928, + "text_loss": 0.6628931164741516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.366304666862343, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009955986686503943, + "loss": 0.0466, + "macro_f1": 0.3272727429866791, + "num_tokens": 1501375.0, + "repeat_count": 0.0, + "routers_loss": 0.024297121912240982, + "skip_count": 1.0, + "step": 930, + "text_loss": 0.495676189661026 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 4.375697094217787, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009955575959549202, + "loss": 0.0424, + "macro_f1": 0.7795917987823486, + "num_tokens": 1504363.0, + "repeat_count": 1.0, + "routers_loss": 0.12196464836597443, + "skip_count": 4.0, + "step": 932, + "text_loss": 0.26123273372650146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.0009955163333608408, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 1507178.0, + "repeat_count": 0.0, + "routers_loss": 0.012947078794240952, + "skip_count": 0.0, + "step": 934, + "text_loss": 0.32552677392959595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.394481948928676, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009954748808839674, + "loss": 0.0379, + "macro_f1": 0.3333333432674408, + "num_tokens": 1509910.0, + "repeat_count": 0.0, + "routers_loss": 0.008946365676820278, + "skip_count": 0.0, + "step": 936, + "text_loss": 0.533141016960144 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.403874376284121, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000995433238540185, + "loss": 0.0466, + "macro_f1": 0.6538461446762085, + "num_tokens": 1512826.0, + "repeat_count": 1.0, + "routers_loss": 0.029975678771734238, + "skip_count": 1.0, + "step": 938, + "text_loss": 0.2953577935695648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.413266803639566, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009953914063454512, + "loss": 0.0497, + "macro_f1": 0.3144654333591461, + "num_tokens": 1517230.0, + "repeat_count": 1.0, + "routers_loss": 0.0889134630560875, + "skip_count": 2.0, + "step": 940, + "text_loss": 0.5368834733963013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.000995349384315796, + "loss": 0.0413, + "macro_f1": 0.3333333432674408, + "num_tokens": 1519876.0, + "repeat_count": 0.0, + "routers_loss": 0.013458753935992718, + "skip_count": 0.0, + "step": 942, + "text_loss": 0.2005518227815628 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.432051658350455, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.000995307172467322, + "loss": 0.0444, + "macro_f1": 0.31446540355682373, + "num_tokens": 1522998.0, + "repeat_count": 1.0, + "routers_loss": 0.08850377053022385, + "skip_count": 1.0, + "step": 944, + "text_loss": 0.227926567196846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.4414440857059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009952647708162054, + "loss": 0.0503, + "macro_f1": 0.3272727429866791, + "num_tokens": 1527100.0, + "repeat_count": 0.0, + "routers_loss": 0.03199794515967369, + "skip_count": 1.0, + "step": 946, + "text_loss": 0.4859686493873596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.450836513061344, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009952221793786942, + "loss": 0.0354, + "macro_f1": 0.3333333432674408, + "num_tokens": 1530028.0, + "repeat_count": 0.0, + "routers_loss": 0.006507779937237501, + "skip_count": 0.0, + "step": 948, + "text_loss": 0.6855354905128479 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.460228940416789, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009951793981711097, + "loss": 0.0584, + "macro_f1": 0.6538461446762085, + "num_tokens": 1533254.0, + "repeat_count": 1.0, + "routers_loss": 0.06175103038549423, + "skip_count": 1.0, + "step": 950, + "text_loss": 0.7590400576591492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.469621367772234, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009951364272098458, + "loss": 0.0295, + "macro_f1": 0.5492662787437439, + "num_tokens": 1536239.0, + "repeat_count": 0.0, + "routers_loss": 0.03773383051156998, + "skip_count": 2.0, + "step": 952, + "text_loss": 0.669784665107727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.4790137951276785, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009950932665113688, + "loss": 0.0507, + "macro_f1": 0.32098764181137085, + "num_tokens": 1539682.0, + "repeat_count": 0.0, + "routers_loss": 0.07280613481998444, + "skip_count": 2.0, + "step": 954, + "text_loss": 0.3365570902824402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009950499160922184, + "loss": 0.0541, + "macro_f1": 0.3333333432674408, + "num_tokens": 1542875.0, + "repeat_count": 0.0, + "routers_loss": 0.01770266517996788, + "skip_count": 0.0, + "step": 956, + "text_loss": 0.0921545997262001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.497798649838567, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09375, + "learning_rate": 0.000995006375969006, + "loss": 0.0473, + "macro_f1": 0.3272727429866791, + "num_tokens": 1547135.0, + "repeat_count": 1.0, + "routers_loss": 0.07672002166509628, + "skip_count": 0.0, + "step": 958, + "text_loss": 0.5887606739997864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.507191077194013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009949626461584165, + "loss": 0.043, + "macro_f1": 0.3333333432674408, + "num_tokens": 1550100.0, + "repeat_count": 0.0, + "routers_loss": 0.006247182376682758, + "skip_count": 0.0, + "step": 960, + "text_loss": 0.5777931213378906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.119140625, + "learning_rate": 0.0009949187266772076, + "loss": 0.0366, + "macro_f1": 0.5492662787437439, + "num_tokens": 1553192.0, + "repeat_count": 0.0, + "routers_loss": 0.030319908633828163, + "skip_count": 2.0, + "step": 962, + "text_loss": 0.2370252162218094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.5259759319049016, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009948746175422088, + "loss": 0.0511, + "macro_f1": 0.3333333432674408, + "num_tokens": 1556318.0, + "repeat_count": 0.0, + "routers_loss": 0.006004320923238993, + "skip_count": 0.0, + "step": 964, + "text_loss": 0.6271032094955444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000994830318770323, + "loss": 0.0514, + "macro_f1": 0.3333333432674408, + "num_tokens": 1559195.0, + "repeat_count": 0.0, + "routers_loss": 0.011544366367161274, + "skip_count": 0.0, + "step": 966, + "text_loss": 0.47256720066070557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 4.544760786615791, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009947858303785255, + "loss": 0.0374, + "macro_f1": 0.6603773832321167, + "num_tokens": 1561813.0, + "repeat_count": 1.0, + "routers_loss": 0.05258861929178238, + "skip_count": 1.0, + "step": 968, + "text_loss": 0.7703132629394531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.554153213971236, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.0009947411523838648, + "loss": 0.0453, + "macro_f1": 0.3333333432674408, + "num_tokens": 1564634.0, + "repeat_count": 0.0, + "routers_loss": 0.011216280050575733, + "skip_count": 0.0, + "step": 970, + "text_loss": 0.4666804075241089 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009946962848034608, + "loss": 0.0696, + "macro_f1": 0.3333333432674408, + "num_tokens": 1567959.0, + "repeat_count": 0.0, + "routers_loss": 0.009387624450027943, + "skip_count": 0.0, + "step": 972, + "text_loss": 0.4067264199256897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.5729380686821255, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.0009946512276545075, + "loss": 0.0397, + "macro_f1": 0.3272727429866791, + "num_tokens": 1571221.0, + "repeat_count": 1.0, + "routers_loss": 0.041713520884513855, + "skip_count": 0.0, + "step": 974, + "text_loss": 0.5242366194725037 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 4.58233049603757, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.228515625, + "learning_rate": 0.0009946059809542705, + "loss": 0.0487, + "macro_f1": 0.7644445300102234, + "num_tokens": 1575033.0, + "repeat_count": 2.0, + "routers_loss": 0.05748331546783447, + "skip_count": 2.0, + "step": 976, + "text_loss": 0.5704690217971802 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 4.591722923393014, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009945605447200887, + "loss": 0.0445, + "macro_f1": 0.3272727429866791, + "num_tokens": 1579050.0, + "repeat_count": 0.0, + "routers_loss": 0.016765203326940536, + "skip_count": 0.0, + "step": 978, + "text_loss": 0.4804173707962036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.601115350748459, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009945149189693732, + "loss": 0.0406, + "macro_f1": 0.5492662787437439, + "num_tokens": 1582967.0, + "repeat_count": 0.0, + "routers_loss": 0.021518222987651825, + "skip_count": 2.0, + "step": 980, + "text_loss": 0.4138598144054413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.610507778103904, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009944691037196078, + "loss": 0.0456, + "macro_f1": 0.3333333432674408, + "num_tokens": 1586282.0, + "repeat_count": 0.0, + "routers_loss": 0.012246460653841496, + "skip_count": 0.0, + "step": 982, + "text_loss": 0.22561736404895782 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 4.6199002054593485, + "f1_execute": 0.930232584476471, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.8000000715255737, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009944230989883491, + "loss": 0.0456, + "macro_f1": 0.7989664077758789, + "num_tokens": 1589279.0, + "repeat_count": 2.0, + "routers_loss": 0.09344895929098129, + "skip_count": 5.0, + "step": 984, + "text_loss": 0.4416656494140625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.629292632814793, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.111328125, + "learning_rate": 0.0009943769047932264, + "loss": 0.0404, + "macro_f1": 0.5359477400779724, + "num_tokens": 1592398.0, + "repeat_count": 2.0, + "routers_loss": 0.08916857838630676, + "skip_count": 2.0, + "step": 986, + "text_loss": 0.5536438822746277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.638685060170237, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000994330521151941, + "loss": 0.039, + "macro_f1": 0.32098764181137085, + "num_tokens": 1596213.0, + "repeat_count": 1.0, + "routers_loss": 0.06114347651600838, + "skip_count": 1.0, + "step": 988, + "text_loss": 0.5835405588150024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.000994283948082267, + "loss": 0.0573, + "macro_f1": 0.3333333432674408, + "num_tokens": 1598827.0, + "repeat_count": 0.0, + "routers_loss": 0.0017335431184619665, + "skip_count": 0.0, + "step": 990, + "text_loss": 0.5857380032539368 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.657469914881127, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009942371856020522, + "loss": 0.0341, + "macro_f1": 0.3333333432674408, + "num_tokens": 1602915.0, + "repeat_count": 0.0, + "routers_loss": 0.014606470242142677, + "skip_count": 0.0, + "step": 992, + "text_loss": 0.6939892768859863 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 4.666862342236572, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009941902337292155, + "loss": 0.06, + "macro_f1": 0.6598639488220215, + "num_tokens": 1605776.0, + "repeat_count": 3.0, + "routers_loss": 0.06297315657138824, + "skip_count": 1.0, + "step": 994, + "text_loss": 0.37616831064224243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.676254769592017, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009941430924817487, + "loss": 0.0572, + "macro_f1": 0.5492662787437439, + "num_tokens": 1609856.0, + "repeat_count": 0.0, + "routers_loss": 0.03297794610261917, + "skip_count": 2.0, + "step": 996, + "text_loss": 0.2098303586244583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.685647196947461, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.000994095761877717, + "loss": 0.0499, + "macro_f1": 0.3333333432674408, + "num_tokens": 1612904.0, + "repeat_count": 0.0, + "routers_loss": 0.012901155278086662, + "skip_count": 0.0, + "step": 998, + "text_loss": 0.20103533565998077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.259765625, + "learning_rate": 0.000994048241935257, + "loss": 0.0535, + "macro_f1": 0.3272727429866791, + "num_tokens": 1615540.0, + "repeat_count": 0.0, + "routers_loss": 0.020434845238924026, + "skip_count": 0.0, + "step": 1000, + "text_loss": 0.32709044218063354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.70443205165835, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1669921875, + "learning_rate": 0.0009940005326725789, + "loss": 0.0453, + "macro_f1": 0.32098764181137085, + "num_tokens": 1618786.0, + "repeat_count": 0.0, + "routers_loss": 0.07831378281116486, + "skip_count": 2.0, + "step": 1002, + "text_loss": 0.5789632797241211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21875, + "learning_rate": 0.0009939526341079647, + "loss": 0.0511, + "macro_f1": 0.32098764181137085, + "num_tokens": 1621736.0, + "repeat_count": 2.0, + "routers_loss": 0.04863874986767769, + "skip_count": 0.0, + "step": 1004, + "text_loss": 0.6128849387168884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009939045462597693, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 1624649.0, + "repeat_count": 0.0, + "routers_loss": 0.00677989237010479, + "skip_count": 0.0, + "step": 1006, + "text_loss": 0.6168264150619507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.732609333724684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009938562691464202, + "loss": 0.0524, + "macro_f1": 0.3333333432674408, + "num_tokens": 1627700.0, + "repeat_count": 0.0, + "routers_loss": 0.019490402191877365, + "skip_count": 0.0, + "step": 1008, + "text_loss": 0.17463822662830353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.000993807802786417, + "loss": 0.0475, + "macro_f1": 0.3333333432674408, + "num_tokens": 1630714.0, + "repeat_count": 0.0, + "routers_loss": 0.0019022391643375158, + "skip_count": 0.0, + "step": 1010, + "text_loss": 0.5675593018531799 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 4.751394188435574, + "f1_execute": 0.9599999785423279, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1640625, + "learning_rate": 0.0009937591471983322, + "loss": 0.0501, + "macro_f1": 0.7644444704055786, + "num_tokens": 1633770.0, + "repeat_count": 1.0, + "routers_loss": 0.042485643178224564, + "skip_count": 2.0, + "step": 1012, + "text_loss": 0.42387229204177856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.760786615791019, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009937103024008109, + "loss": 0.0545, + "macro_f1": 0.3272727429866791, + "num_tokens": 1637120.0, + "repeat_count": 0.0, + "routers_loss": 0.09427817165851593, + "skip_count": 1.0, + "step": 1014, + "text_loss": 0.49511051177978516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009936612684125702, + "loss": 0.0503, + "macro_f1": 0.3333333432674408, + "num_tokens": 1640165.0, + "repeat_count": 0.0, + "routers_loss": 0.005106127820909023, + "skip_count": 0.0, + "step": 1016, + "text_loss": 0.5398799180984497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.7795714705019074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2734375, + "learning_rate": 0.0009936120452524004, + "loss": 0.0506, + "macro_f1": 0.3333333432674408, + "num_tokens": 1643251.0, + "repeat_count": 0.0, + "routers_loss": 0.016914300620555878, + "skip_count": 0.0, + "step": 1018, + "text_loss": 0.20882178843021393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.788963897857353, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009935626329391637, + "loss": 0.0537, + "macro_f1": 0.32098764181137085, + "num_tokens": 1646560.0, + "repeat_count": 0.0, + "routers_loss": 0.13481520116329193, + "skip_count": 2.0, + "step": 1020, + "text_loss": 0.5719883441925049 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.798356325212797, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009935130314917948, + "loss": 0.0602, + "macro_f1": 0.5492662787437439, + "num_tokens": 1649538.0, + "repeat_count": 0.0, + "routers_loss": 0.07700438797473907, + "skip_count": 2.0, + "step": 1022, + "text_loss": 0.1303367167711258 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.807748752568242, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009934632409293015, + "loss": 0.0611, + "macro_f1": 0.32098764181137085, + "num_tokens": 1652397.0, + "repeat_count": 1.0, + "routers_loss": 0.11416907608509064, + "skip_count": 1.0, + "step": 1024, + "text_loss": 0.24076920747756958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.817141179923686, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.0009934132612707631, + "loss": 0.0507, + "macro_f1": 0.31446540355682373, + "num_tokens": 1654938.0, + "repeat_count": 0.0, + "routers_loss": 0.09484589844942093, + "skip_count": 2.0, + "step": 1026, + "text_loss": 0.1652517318725586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009933630925353324, + "loss": 0.0395, + "macro_f1": 0.3333333432674408, + "num_tokens": 1658536.0, + "repeat_count": 0.0, + "routers_loss": 0.00741987070068717, + "skip_count": 0.0, + "step": 1028, + "text_loss": 0.49296700954437256 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.835926034634576, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1845703125, + "learning_rate": 0.0009933127347422337, + "loss": 0.0602, + "macro_f1": 0.32098764181137085, + "num_tokens": 1661446.0, + "repeat_count": 0.0, + "routers_loss": 0.08399344235658646, + "skip_count": 2.0, + "step": 1030, + "text_loss": 0.22363591194152832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.158203125, + "learning_rate": 0.0009932621879107648, + "loss": 0.0475, + "macro_f1": 0.3333333432674408, + "num_tokens": 1664612.0, + "repeat_count": 0.0, + "routers_loss": 0.0031781597062945366, + "skip_count": 0.0, + "step": 1032, + "text_loss": 0.36083245277404785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.854710889345466, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.000993211452060295, + "loss": 0.042, + "macro_f1": 0.3272727429866791, + "num_tokens": 1667467.0, + "repeat_count": 0.0, + "routers_loss": 0.03595469892024994, + "skip_count": 1.0, + "step": 1034, + "text_loss": 0.16372856497764587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.86410331670091, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000993160527210266, + "loss": 0.061, + "macro_f1": 0.3144654333591461, + "num_tokens": 1670675.0, + "repeat_count": 3.0, + "routers_loss": 0.1597205102443695, + "skip_count": 0.0, + "step": 1036, + "text_loss": 0.6049913763999939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2197265625, + "learning_rate": 0.000993109413380193, + "loss": 0.0562, + "macro_f1": 0.3333333432674408, + "num_tokens": 1673477.0, + "repeat_count": 0.0, + "routers_loss": 0.009756010957062244, + "skip_count": 0.0, + "step": 1038, + "text_loss": 0.7034620642662048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.882888171411799, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.0009930581105896624, + "loss": 0.0559, + "macro_f1": 0.3272727429866791, + "num_tokens": 1676809.0, + "repeat_count": 0.0, + "routers_loss": 0.020718922838568687, + "skip_count": 0.0, + "step": 1040, + "text_loss": 0.2814720571041107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.892280598767244, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009930066188583338, + "loss": 0.0445, + "macro_f1": 0.32098764181137085, + "num_tokens": 1679398.0, + "repeat_count": 1.0, + "routers_loss": 0.04755603149533272, + "skip_count": 1.0, + "step": 1042, + "text_loss": 0.5445759296417236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.0009929549382059388, + "loss": 0.0509, + "macro_f1": 0.3333333432674408, + "num_tokens": 1682269.0, + "repeat_count": 0.0, + "routers_loss": 0.01040949858725071, + "skip_count": 0.0, + "step": 1044, + "text_loss": 0.2876914143562317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.911065453478133, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009929030686522816, + "loss": 0.0363, + "macro_f1": 0.3333333432674408, + "num_tokens": 1685428.0, + "repeat_count": 0.0, + "routers_loss": 0.008158888667821884, + "skip_count": 0.0, + "step": 1046, + "text_loss": 0.49053525924682617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.9204578808335775, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009928510102172386, + "loss": 0.0498, + "macro_f1": 0.3333333432674408, + "num_tokens": 1688252.0, + "repeat_count": 0.0, + "routers_loss": 0.005102572031319141, + "skip_count": 0.0, + "step": 1048, + "text_loss": 0.5274341106414795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009927987629207587, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1691289.0, + "repeat_count": 0.0, + "routers_loss": 0.016768503934144974, + "skip_count": 0.0, + "step": 1050, + "text_loss": 0.9935035109519958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.939242735544467, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009927463267828634, + "loss": 0.0488, + "macro_f1": 0.3333333432674408, + "num_tokens": 1694148.0, + "repeat_count": 0.0, + "routers_loss": 0.010905829258263111, + "skip_count": 0.0, + "step": 1052, + "text_loss": 0.20895758271217346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.948635162899912, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.000992693701823646, + "loss": 0.0624, + "macro_f1": 0.3272727429866791, + "num_tokens": 1698543.0, + "repeat_count": 1.0, + "routers_loss": 0.10533971339464188, + "skip_count": 0.0, + "step": 1054, + "text_loss": 0.5776236653327942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.958027590255357, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009926408880632726, + "loss": 0.0556, + "macro_f1": 0.3272727429866791, + "num_tokens": 1702460.0, + "repeat_count": 0.0, + "routers_loss": 0.026313411071896553, + "skip_count": 1.0, + "step": 1056, + "text_loss": 0.34990596771240234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.967420017610801, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009925878855219818, + "loss": 0.0391, + "macro_f1": 0.3333333432674408, + "num_tokens": 1705686.0, + "repeat_count": 0.0, + "routers_loss": 0.007763393223285675, + "skip_count": 0.0, + "step": 1058, + "text_loss": 0.4980163276195526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.177734375, + "learning_rate": 0.000992534694220084, + "loss": 0.0613, + "macro_f1": 0.3272727429866791, + "num_tokens": 1708739.0, + "repeat_count": 0.0, + "routers_loss": 0.03998444974422455, + "skip_count": 1.0, + "step": 1060, + "text_loss": 0.29092350602149963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.98620487232169, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.000992481314177962, + "loss": 0.0312, + "macro_f1": 0.32098764181137085, + "num_tokens": 1711903.0, + "repeat_count": 1.0, + "routers_loss": 0.06966045498847961, + "skip_count": 1.0, + "step": 1062, + "text_loss": 0.6267179250717163 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.995597299677136, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.244140625, + "learning_rate": 0.0009924277454160717, + "loss": 0.0548, + "macro_f1": 0.3272727429866791, + "num_tokens": 1715974.0, + "repeat_count": 0.0, + "routers_loss": 0.05536063387989998, + "skip_count": 1.0, + "step": 1064, + "text_loss": 0.5813798904418945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.004696213677723, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009923739879549402, + "loss": 0.0423, + "macro_f1": 0.3333333432674408, + "num_tokens": 1718828.0, + "repeat_count": 0.0, + "routers_loss": 0.020993782207369804, + "skip_count": 0.0, + "step": 1066, + "text_loss": 0.22665327787399292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009923200418151677, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 1722419.0, + "repeat_count": 0.0, + "routers_loss": 0.007351701147854328, + "skip_count": 0.0, + "step": 1068, + "text_loss": 0.5796169638633728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.0234810683886115, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009922659070174264, + "loss": 0.0452, + "macro_f1": 0.3272727429866791, + "num_tokens": 1725663.0, + "repeat_count": 1.0, + "routers_loss": 0.026033315807580948, + "skip_count": 0.0, + "step": 1070, + "text_loss": 0.25742828845977783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009922115835824612, + "loss": 0.041, + "macro_f1": 0.3333333432674408, + "num_tokens": 1729239.0, + "repeat_count": 0.0, + "routers_loss": 0.0118600158020854, + "skip_count": 0.0, + "step": 1072, + "text_loss": 0.21630282700061798 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009921570715310884, + "loss": 0.0364, + "macro_f1": 0.6666666865348816, + "num_tokens": 1732507.0, + "repeat_count": 1.0, + "routers_loss": 0.016118815168738365, + "skip_count": 0.0, + "step": 1074, + "text_loss": 0.5639925003051758 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.051658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009921023708841974, + "loss": 0.0407, + "macro_f1": 0.3333333432674408, + "num_tokens": 1736182.0, + "repeat_count": 0.0, + "routers_loss": 0.004275390412658453, + "skip_count": 0.0, + "step": 1076, + "text_loss": 0.5758615136146545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009920474816627496, + "loss": 0.037, + "macro_f1": 0.3333333432674408, + "num_tokens": 1739559.0, + "repeat_count": 0.0, + "routers_loss": 0.01299292128533125, + "skip_count": 0.0, + "step": 1078, + "text_loss": 0.18221625685691833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.0704432051658355, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009919924038877788, + "loss": 0.0343, + "macro_f1": 0.32098764181137085, + "num_tokens": 1742890.0, + "repeat_count": 0.0, + "routers_loss": 0.038295745849609375, + "skip_count": 2.0, + "step": 1080, + "text_loss": 0.17354349792003632 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 5.07983563252128, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009919371375803905, + "loss": 0.0455, + "macro_f1": 0.8194444179534912, + "num_tokens": 1746433.0, + "repeat_count": 2.0, + "routers_loss": 0.04052971675992012, + "skip_count": 3.0, + "step": 1082, + "text_loss": 0.2250112146139145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009918816827617632, + "loss": 0.0353, + "macro_f1": 0.3333333432674408, + "num_tokens": 1750802.0, + "repeat_count": 0.0, + "routers_loss": 0.009114136919379234, + "skip_count": 0.0, + "step": 1084, + "text_loss": 0.2526719272136688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.098620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000991826039453147, + "loss": 0.0392, + "macro_f1": 0.3333333432674408, + "num_tokens": 1754272.0, + "repeat_count": 0.0, + "routers_loss": 0.004904678091406822, + "skip_count": 0.0, + "step": 1086, + "text_loss": 0.7308789491653442 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 5.108012914587614, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.138671875, + "learning_rate": 0.000991770207675865, + "loss": 0.0327, + "macro_f1": 0.6666666865348816, + "num_tokens": 1757231.0, + "repeat_count": 0.0, + "routers_loss": 0.02129189297556877, + "skip_count": 2.0, + "step": 1088, + "text_loss": 0.21764220297336578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.1174053419430585, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009917141874513113, + "loss": 0.0315, + "macro_f1": 0.3333333432674408, + "num_tokens": 1760003.0, + "repeat_count": 0.0, + "routers_loss": 0.01310618408024311, + "skip_count": 0.0, + "step": 1090, + "text_loss": 0.33892181515693665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.126797769298503, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.171875, + "learning_rate": 0.0009916579788009537, + "loss": 0.0457, + "macro_f1": 0.5492662787437439, + "num_tokens": 1763052.0, + "repeat_count": 0.0, + "routers_loss": 0.02059309557080269, + "skip_count": 2.0, + "step": 1092, + "text_loss": 0.6551769375801086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.136190196653947, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10546875, + "learning_rate": 0.0009916015817463312, + "loss": 0.0385, + "macro_f1": 0.5492662787437439, + "num_tokens": 1766655.0, + "repeat_count": 0.0, + "routers_loss": 0.0274797435849905, + "skip_count": 2.0, + "step": 1094, + "text_loss": 0.3984372019767761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.000991544996309055, + "loss": 0.0271, + "macro_f1": 0.3333333432674408, + "num_tokens": 1769997.0, + "repeat_count": 0.0, + "routers_loss": 0.01437368243932724, + "skip_count": 0.0, + "step": 1096, + "text_loss": 0.4203338921070099 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.154975051364837, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.000991488222510809, + "loss": 0.0292, + "macro_f1": 0.3333333432674408, + "num_tokens": 1773130.0, + "repeat_count": 0.0, + "routers_loss": 0.001382062560878694, + "skip_count": 0.0, + "step": 1098, + "text_loss": 0.43132516741752625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.164367478720282, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.123046875, + "learning_rate": 0.000991431260373349, + "loss": 0.0329, + "macro_f1": 0.3144654333591461, + "num_tokens": 1775682.0, + "repeat_count": 1.0, + "routers_loss": 0.1115434318780899, + "skip_count": 2.0, + "step": 1100, + "text_loss": 0.3218227028846741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.111328125, + "learning_rate": 0.000991374109918503, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 1778407.0, + "repeat_count": 0.0, + "routers_loss": 0.009529678151011467, + "skip_count": 0.0, + "step": 1102, + "text_loss": 0.17183731496334076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.183152333431171, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1142578125, + "learning_rate": 0.000991316771168171, + "loss": 0.044, + "macro_f1": 0.5492662787437439, + "num_tokens": 1781518.0, + "repeat_count": 0.0, + "routers_loss": 0.018668074160814285, + "skip_count": 2.0, + "step": 1104, + "text_loss": 1.1324785947799683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.192544760786616, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.125, + "learning_rate": 0.0009912592441443258, + "loss": 0.0411, + "macro_f1": 0.3272727429866791, + "num_tokens": 1784878.0, + "repeat_count": 0.0, + "routers_loss": 0.04145100712776184, + "skip_count": 1.0, + "step": 1106, + "text_loss": 0.6082063317298889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.20193718814206, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009912015288690112, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1788978.0, + "repeat_count": 0.0, + "routers_loss": 0.021450644358992577, + "skip_count": 1.0, + "step": 1108, + "text_loss": 0.5597621202468872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.2113296154975055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009911436253643444, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 1792321.0, + "repeat_count": 0.0, + "routers_loss": 0.017405325546860695, + "skip_count": 0.0, + "step": 1110, + "text_loss": 0.2560598850250244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.0009910855336525137, + "loss": 0.0383, + "macro_f1": 0.3333333432674408, + "num_tokens": 1795182.0, + "repeat_count": 0.0, + "routers_loss": 0.007162237539887428, + "skip_count": 0.0, + "step": 1112, + "text_loss": 0.3438240587711334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.230114470208394, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.115234375, + "learning_rate": 0.00099102725375578, + "loss": 0.0326, + "macro_f1": 0.480392187833786, + "num_tokens": 1798987.0, + "repeat_count": 1.0, + "routers_loss": 0.11149197816848755, + "skip_count": 3.0, + "step": 1114, + "text_loss": 0.20455503463745117 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.239506897563839, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009909687856964767, + "loss": 0.035, + "macro_f1": 0.3006536364555359, + "num_tokens": 1802064.0, + "repeat_count": 2.0, + "routers_loss": 0.12679415941238403, + "skip_count": 3.0, + "step": 1116, + "text_loss": 0.11996729671955109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.248899324919284, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009909101294970082, + "loss": 0.0365, + "macro_f1": 0.5492662787437439, + "num_tokens": 1805412.0, + "repeat_count": 0.0, + "routers_loss": 0.05108053982257843, + "skip_count": 2.0, + "step": 1118, + "text_loss": 0.13224145770072937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.258291752274729, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.123046875, + "learning_rate": 0.0009908512851798522, + "loss": 0.0455, + "macro_f1": 0.6603773832321167, + "num_tokens": 1808196.0, + "repeat_count": 1.0, + "routers_loss": 0.02131766639649868, + "skip_count": 1.0, + "step": 1120, + "text_loss": 0.7824069261550903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.0009907922527675576, + "loss": 0.0405, + "macro_f1": 0.3333333432674408, + "num_tokens": 1811622.0, + "repeat_count": 0.0, + "routers_loss": 0.006226244382560253, + "skip_count": 0.0, + "step": 1122, + "text_loss": 0.5419743061065674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.277076606985618, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12890625, + "learning_rate": 0.000990733032282746, + "loss": 0.0535, + "macro_f1": 0.5492662787437439, + "num_tokens": 1814628.0, + "repeat_count": 0.0, + "routers_loss": 0.03088250942528248, + "skip_count": 2.0, + "step": 1124, + "text_loss": 0.37100958824157715 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.286469034341063, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.000990673623748111, + "loss": 0.0348, + "macro_f1": 0.32098767161369324, + "num_tokens": 1817205.0, + "repeat_count": 0.0, + "routers_loss": 0.05495348572731018, + "skip_count": 1.0, + "step": 1126, + "text_loss": 0.20241330564022064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.295861461696507, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009906140271864173, + "loss": 0.0433, + "macro_f1": 0.4871794879436493, + "num_tokens": 1820141.0, + "repeat_count": 0.0, + "routers_loss": 0.037809282541275024, + "skip_count": 2.0, + "step": 1128, + "text_loss": 0.32965806126594543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.305253889051952, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009905542426205032, + "loss": 0.0348, + "macro_f1": 0.32098767161369324, + "num_tokens": 1824011.0, + "repeat_count": 0.0, + "routers_loss": 0.03320181369781494, + "skip_count": 1.0, + "step": 1130, + "text_loss": 0.36329755187034607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.314646316407397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009904942700732777, + "loss": 0.0335, + "macro_f1": 0.3333333432674408, + "num_tokens": 1826873.0, + "repeat_count": 0.0, + "routers_loss": 0.004102326463907957, + "skip_count": 0.0, + "step": 1132, + "text_loss": 0.6692602038383484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.324038743762841, + "f1_execute": 0.8799999952316284, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.0009904341095677226, + "loss": 0.03, + "macro_f1": 0.29333335161209106, + "num_tokens": 1830103.0, + "repeat_count": 2.0, + "routers_loss": 0.2376193106174469, + "skip_count": 4.0, + "step": 1134, + "text_loss": 0.19212862849235535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.0009903737611268919, + "loss": 0.0445, + "macro_f1": 0.3333333432674408, + "num_tokens": 1833201.0, + "repeat_count": 0.0, + "routers_loss": 0.005253395065665245, + "skip_count": 0.0, + "step": 1136, + "text_loss": 0.6773360371589661 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.34282359847373, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009903132247739107, + "loss": 0.0305, + "macro_f1": 0.3076923191547394, + "num_tokens": 1836045.0, + "repeat_count": 1.0, + "routers_loss": 0.14382585883140564, + "skip_count": 3.0, + "step": 1138, + "text_loss": 0.2882297933101654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.3522160258291755, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.150390625, + "learning_rate": 0.0009902525005319766, + "loss": 0.04, + "macro_f1": 0.5427350401878357, + "num_tokens": 1839721.0, + "repeat_count": 1.0, + "routers_loss": 0.04033960774540901, + "skip_count": 2.0, + "step": 1140, + "text_loss": 0.7172559499740601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12109375, + "learning_rate": 0.0009901915884243597, + "loss": 0.0351, + "macro_f1": 0.6666666865348816, + "num_tokens": 1842614.0, + "repeat_count": 1.0, + "routers_loss": 0.005162308923900127, + "skip_count": 0.0, + "step": 1142, + "text_loss": 0.42892804741859436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.371000880540064, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009901304884744014, + "loss": 0.0386, + "macro_f1": 0.3144654333591461, + "num_tokens": 1845444.0, + "repeat_count": 1.0, + "routers_loss": 0.10117656737565994, + "skip_count": 2.0, + "step": 1144, + "text_loss": 0.20806430280208588 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.380393307895509, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009900692007055152, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 1848558.0, + "repeat_count": 0.0, + "routers_loss": 0.014107038266956806, + "skip_count": 0.0, + "step": 1146, + "text_loss": 0.5355974435806274 + }, + { + "acc_repeat": 0.25, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 5.389785735250954, + "f1_execute": 0.9166666865348816, + "f1_repeat": 0.4000000059604645, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.16015625, + "learning_rate": 0.000990007725141187, + "loss": 0.0449, + "macro_f1": 0.6611111164093018, + "num_tokens": 1852723.0, + "repeat_count": 4.0, + "routers_loss": 0.15537866950035095, + "skip_count": 2.0, + "step": 1148, + "text_loss": 0.6388513445854187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1181640625, + "learning_rate": 0.0009899460618049741, + "loss": 0.0397, + "macro_f1": 0.3333333432674408, + "num_tokens": 1856181.0, + "repeat_count": 0.0, + "routers_loss": 0.011800912208855152, + "skip_count": 0.0, + "step": 1150, + "text_loss": 0.6113069653511047 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 5.408570589961843, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.000989884210720506, + "loss": 0.0331, + "macro_f1": 0.6666666865348816, + "num_tokens": 1859685.0, + "repeat_count": 2.0, + "routers_loss": 0.022900646552443504, + "skip_count": 0.0, + "step": 1152, + "text_loss": 0.25718021392822266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.4179630173172875, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009898221719114844, + "loss": 0.0354, + "macro_f1": 0.3272727429866791, + "num_tokens": 1862505.0, + "repeat_count": 0.0, + "routers_loss": 0.026814989745616913, + "skip_count": 1.0, + "step": 1154, + "text_loss": 0.5426549911499023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009897599454016823, + "loss": 0.0401, + "macro_f1": 0.3333333432674408, + "num_tokens": 1866266.0, + "repeat_count": 0.0, + "routers_loss": 0.0032623792067170143, + "skip_count": 0.0, + "step": 1156, + "text_loss": 0.37752896547317505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.436747872028177, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07080078125, + "learning_rate": 0.0009896975312149454, + "loss": 0.0369, + "macro_f1": 0.3333333432674408, + "num_tokens": 1870216.0, + "repeat_count": 0.0, + "routers_loss": 0.015617577359080315, + "skip_count": 0.0, + "step": 1158, + "text_loss": 0.18207129836082458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009896349293751906, + "loss": 0.0423, + "macro_f1": 0.3272727429866791, + "num_tokens": 1873338.0, + "repeat_count": 0.0, + "routers_loss": 0.02250153198838234, + "skip_count": 1.0, + "step": 1160, + "text_loss": 0.548884391784668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.455532726739067, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009895721399064072, + "loss": 0.0388, + "macro_f1": 0.32098764181137085, + "num_tokens": 1876470.0, + "repeat_count": 1.0, + "routers_loss": 0.055204521864652634, + "skip_count": 1.0, + "step": 1162, + "text_loss": 0.48052409291267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.464925154094511, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009895091628326564, + "loss": 0.0293, + "macro_f1": 0.3333333432674408, + "num_tokens": 1879354.0, + "repeat_count": 0.0, + "routers_loss": 0.009093789383769035, + "skip_count": 0.0, + "step": 1164, + "text_loss": 0.3908069431781769 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.474317581449956, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000989445998178071, + "loss": 0.0323, + "macro_f1": 0.3272727429866791, + "num_tokens": 1881941.0, + "repeat_count": 0.0, + "routers_loss": 0.015086972154676914, + "skip_count": 1.0, + "step": 1166, + "text_loss": 0.4884725511074066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.4837100088054, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009893826459668558, + "loss": 0.0386, + "macro_f1": 0.3144654333591461, + "num_tokens": 1885374.0, + "repeat_count": 0.0, + "routers_loss": 0.06587666273117065, + "skip_count": 3.0, + "step": 1168, + "text_loss": 0.12760137021541595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009893191062232873, + "loss": 0.0322, + "macro_f1": 0.3333333432674408, + "num_tokens": 1888612.0, + "repeat_count": 0.0, + "routers_loss": 0.006088624242693186, + "skip_count": 0.0, + "step": 1170, + "text_loss": 0.4821319580078125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009892553789717143, + "loss": 0.0389, + "macro_f1": 0.3333333432674408, + "num_tokens": 1891463.0, + "repeat_count": 0.0, + "routers_loss": 0.010113578289747238, + "skip_count": 0.0, + "step": 1172, + "text_loss": 0.3613642454147339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.5118872908717345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009891914642365573, + "loss": 0.0404, + "macro_f1": 0.3333333432674408, + "num_tokens": 1894230.0, + "repeat_count": 0.0, + "routers_loss": 0.004947459790855646, + "skip_count": 0.0, + "step": 1174, + "text_loss": 0.5037549138069153 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.521279718227179, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009891273620423083, + "loss": 0.0428, + "macro_f1": 0.3272727429866791, + "num_tokens": 1897294.0, + "repeat_count": 1.0, + "routers_loss": 0.026075217872858047, + "skip_count": 0.0, + "step": 1176, + "text_loss": 0.32558977603912354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.530672145582624, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009890630724135314, + "loss": 0.0351, + "macro_f1": 0.3272727429866791, + "num_tokens": 1901553.0, + "repeat_count": 0.0, + "routers_loss": 0.06650999188423157, + "skip_count": 1.0, + "step": 1178, + "text_loss": 0.23473620414733887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.540064572938069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009889985953748625, + "loss": 0.0268, + "macro_f1": 0.6666666865348816, + "num_tokens": 1904556.0, + "repeat_count": 0.0, + "routers_loss": 0.010361116379499435, + "skip_count": 1.0, + "step": 1180, + "text_loss": 0.6927042007446289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.0009889339309510094, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 1908053.0, + "repeat_count": 0.0, + "routers_loss": 0.013286533765494823, + "skip_count": 0.0, + "step": 1182, + "text_loss": 0.19977325201034546 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 5.558849427648958, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009888690791667518, + "loss": 0.0204, + "macro_f1": 0.7018141150474548, + "num_tokens": 1911754.0, + "repeat_count": 2.0, + "routers_loss": 0.11920545995235443, + "skip_count": 3.0, + "step": 1184, + "text_loss": 0.4072858691215515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.568241855004403, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009888040400469408, + "loss": 0.0391, + "macro_f1": 0.3272727429866791, + "num_tokens": 1914862.0, + "repeat_count": 0.0, + "routers_loss": 0.03652849420905113, + "skip_count": 1.0, + "step": 1186, + "text_loss": 0.2654043138027191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.577634282359847, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1689453125, + "learning_rate": 0.0009887388136164996, + "loss": 0.0336, + "macro_f1": 0.5492662787437439, + "num_tokens": 1918542.0, + "repeat_count": 0.0, + "routers_loss": 0.03991910070180893, + "skip_count": 2.0, + "step": 1188, + "text_loss": 0.21130657196044922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.587026709715292, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.000988673399900423, + "loss": 0.0429, + "macro_f1": 0.3272727429866791, + "num_tokens": 1921589.0, + "repeat_count": 0.0, + "routers_loss": 0.014900135807693005, + "skip_count": 0.0, + "step": 1190, + "text_loss": 0.5519335865974426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.596419137070737, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009886077989237777, + "loss": 0.0405, + "macro_f1": 0.3272727429866791, + "num_tokens": 1924320.0, + "repeat_count": 0.0, + "routers_loss": 0.06271552294492722, + "skip_count": 1.0, + "step": 1192, + "text_loss": 0.213813915848732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 5.6058115644261814, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.1875, + "learning_rate": 0.000988542010711702, + "loss": 0.0342, + "macro_f1": 0.6225374937057495, + "num_tokens": 1927178.0, + "repeat_count": 0.0, + "routers_loss": 0.03081391751766205, + "skip_count": 5.0, + "step": 1194, + "text_loss": 0.7524349093437195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009884760352894064, + "loss": 0.0518, + "macro_f1": 0.3333333432674408, + "num_tokens": 1930216.0, + "repeat_count": 0.0, + "routers_loss": 0.008556773886084557, + "skip_count": 0.0, + "step": 1196, + "text_loss": 0.28230375051498413 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.62459641913707, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009884098726821726, + "loss": 0.0472, + "macro_f1": 0.4871794879436493, + "num_tokens": 1933312.0, + "repeat_count": 3.0, + "routers_loss": 0.05344727262854576, + "skip_count": 0.0, + "step": 1198, + "text_loss": 0.5509607195854187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.633988846492516, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1298828125, + "learning_rate": 0.000988343522915354, + "loss": 0.0441, + "macro_f1": 0.480392187833786, + "num_tokens": 1936160.0, + "repeat_count": 1.0, + "routers_loss": 0.07324771583080292, + "skip_count": 3.0, + "step": 1200, + "text_loss": 0.30565372109413147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 25.0, + "epoch": 5.64338127384796, + "f1_execute": 0.8936169743537903, + "f1_repeat": 0.0, + "f1_skip": 0.444444477558136, + "grad_norm": 0.2470703125, + "learning_rate": 0.0009882769860143764, + "loss": 0.0317, + "macro_f1": 0.4460204839706421, + "num_tokens": 1939266.0, + "repeat_count": 0.0, + "routers_loss": 0.18620699644088745, + "skip_count": 6.0, + "step": 1202, + "text_loss": 0.976121723651886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.6527737012034045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000988210262004737, + "loss": 0.0474, + "macro_f1": 0.6666666865348816, + "num_tokens": 1942173.0, + "repeat_count": 0.0, + "routers_loss": 0.007703613489866257, + "skip_count": 1.0, + "step": 1204, + "text_loss": 0.5647401809692383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.66216612855885, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1484375, + "learning_rate": 0.0009881433509120036, + "loss": 0.0376, + "macro_f1": 0.5492662787437439, + "num_tokens": 1945071.0, + "repeat_count": 0.0, + "routers_loss": 0.02162683941423893, + "skip_count": 2.0, + "step": 1206, + "text_loss": 0.24229218065738678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.671558555914294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009880762527618176, + "loss": 0.0383, + "macro_f1": 0.3333333432674408, + "num_tokens": 1949060.0, + "repeat_count": 0.0, + "routers_loss": 0.017667081207036972, + "skip_count": 0.0, + "step": 1208, + "text_loss": 0.4035970866680145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009880089675798908, + "loss": 0.0367, + "macro_f1": 0.3333333432674408, + "num_tokens": 1951698.0, + "repeat_count": 0.0, + "routers_loss": 0.006405784282833338, + "skip_count": 0.0, + "step": 1210, + "text_loss": 0.5319879055023193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.690343410625183, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009879414953920071, + "loss": 0.0294, + "macro_f1": 0.3333333432674408, + "num_tokens": 1955266.0, + "repeat_count": 0.0, + "routers_loss": 0.009859707206487656, + "skip_count": 0.0, + "step": 1212, + "text_loss": 0.6687407493591309 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.699735837980628, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.130859375, + "learning_rate": 0.0009878738362240219, + "loss": 0.045, + "macro_f1": 0.5492662787437439, + "num_tokens": 1958538.0, + "repeat_count": 0.0, + "routers_loss": 0.030890554189682007, + "skip_count": 2.0, + "step": 1214, + "text_loss": 0.20820017158985138 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 5.709128265336073, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000987805990101862, + "loss": 0.0317, + "macro_f1": 0.47333335876464844, + "num_tokens": 1961419.0, + "repeat_count": 2.0, + "routers_loss": 0.10383198410272598, + "skip_count": 2.0, + "step": 1216, + "text_loss": 0.8664976358413696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.718520692691517, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009877379570515268, + "loss": 0.0366, + "macro_f1": 0.3333333432674408, + "num_tokens": 1964836.0, + "repeat_count": 0.0, + "routers_loss": 0.013376163318753242, + "skip_count": 0.0, + "step": 1218, + "text_loss": 0.4223395884037018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0859375, + "learning_rate": 0.0009876697370990865, + "loss": 0.0343, + "macro_f1": 0.3333333432674408, + "num_tokens": 1967620.0, + "repeat_count": 0.0, + "routers_loss": 0.008577900938689709, + "skip_count": 0.0, + "step": 1220, + "text_loss": 0.4789901375770569 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009876013302706828, + "loss": 0.049, + "macro_f1": 0.3333333432674408, + "num_tokens": 1971100.0, + "repeat_count": 0.0, + "routers_loss": 0.004730266984552145, + "skip_count": 0.0, + "step": 1222, + "text_loss": 0.6799837946891785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.7466979747578515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009875327365925295, + "loss": 0.0341, + "macro_f1": 0.3333333432674408, + "num_tokens": 1974408.0, + "repeat_count": 0.0, + "routers_loss": 0.010849526152014732, + "skip_count": 0.0, + "step": 1224, + "text_loss": 0.18967926502227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.756090402113296, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009874639560909118, + "loss": 0.0498, + "macro_f1": 0.32098767161369324, + "num_tokens": 1977046.0, + "repeat_count": 0.0, + "routers_loss": 0.04841252416372299, + "skip_count": 1.0, + "step": 1226, + "text_loss": 0.6133310198783875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.765482829468741, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.0009873949887921867, + "loss": 0.0402, + "macro_f1": 0.3272727429866791, + "num_tokens": 1980330.0, + "repeat_count": 0.0, + "routers_loss": 0.029638588428497314, + "skip_count": 1.0, + "step": 1228, + "text_loss": 0.15649555623531342 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.774875256824186, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009873258347227823, + "loss": 0.0331, + "macro_f1": 0.3272727429866791, + "num_tokens": 1983173.0, + "repeat_count": 0.0, + "routers_loss": 0.009955910965800285, + "skip_count": 0.0, + "step": 1230, + "text_loss": 0.4741005599498749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009872564939091989, + "loss": 0.0342, + "macro_f1": 0.3333333432674408, + "num_tokens": 1986825.0, + "repeat_count": 0.0, + "routers_loss": 0.010205300524830818, + "skip_count": 0.0, + "step": 1232, + "text_loss": 0.5315462350845337 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5714285969734192, + "avg_layers": 25.0, + "epoch": 5.7936601115350745, + "f1_execute": 0.9302325248718262, + "f1_repeat": 1.0, + "f1_skip": 0.7272727489471436, + "grad_norm": 0.11865234375, + "learning_rate": 0.0009871869663780077, + "loss": 0.0336, + "macro_f1": 0.8858351111412048, + "num_tokens": 1990448.0, + "repeat_count": 1.0, + "routers_loss": 0.09120134264230728, + "skip_count": 7.0, + "step": 1234, + "text_loss": 0.6187508702278137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.125, + "learning_rate": 0.0009871172521558522, + "loss": 0.0475, + "macro_f1": 0.6666666865348816, + "num_tokens": 1993474.0, + "repeat_count": 0.0, + "routers_loss": 0.016188839450478554, + "skip_count": 1.0, + "step": 1236, + "text_loss": 0.20783066749572754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 5.812444966245964, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.216796875, + "learning_rate": 0.0009870473512694465, + "loss": 0.0373, + "macro_f1": 0.5934640765190125, + "num_tokens": 1996536.0, + "repeat_count": 0.0, + "routers_loss": 0.05046704784035683, + "skip_count": 3.0, + "step": 1238, + "text_loss": 0.247748002409935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.821837393601409, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.09033203125, + "learning_rate": 0.0009869772637455772, + "loss": 0.0251, + "macro_f1": 0.4871794879436493, + "num_tokens": 1999530.0, + "repeat_count": 0.0, + "routers_loss": 0.044926248490810394, + "skip_count": 2.0, + "step": 1240, + "text_loss": 0.26001980900764465 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.831229820956853, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.000986906989611102, + "loss": 0.0446, + "macro_f1": 0.3272727429866791, + "num_tokens": 2002782.0, + "repeat_count": 0.0, + "routers_loss": 0.025911526754498482, + "skip_count": 0.0, + "step": 1242, + "text_loss": 0.9009982943534851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.8406222483122985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009868365288929492, + "loss": 0.0371, + "macro_f1": 0.3333333432674408, + "num_tokens": 2005331.0, + "repeat_count": 0.0, + "routers_loss": 0.0043760035187006, + "skip_count": 0.0, + "step": 1244, + "text_loss": 0.5547386407852173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009867658816181206, + "loss": 0.0374, + "macro_f1": 0.3333333432674408, + "num_tokens": 2008115.0, + "repeat_count": 0.0, + "routers_loss": 0.009227181784808636, + "skip_count": 0.0, + "step": 1246, + "text_loss": 1.0067731142044067 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.000986695047813688, + "loss": 0.0261, + "macro_f1": 0.3272727429866791, + "num_tokens": 2011137.0, + "repeat_count": 1.0, + "routers_loss": 0.023822437971830368, + "skip_count": 0.0, + "step": 1248, + "text_loss": 0.30058956146240234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.868799530378633, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009866240275067948, + "loss": 0.044, + "macro_f1": 0.47333335876464844, + "num_tokens": 2014159.0, + "repeat_count": 2.0, + "routers_loss": 0.21523773670196533, + "skip_count": 3.0, + "step": 1250, + "text_loss": 0.39072203636169434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.878191957734077, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009865528207246563, + "loss": 0.0351, + "macro_f1": 0.5492662787437439, + "num_tokens": 2017731.0, + "repeat_count": 0.0, + "routers_loss": 0.06184682995080948, + "skip_count": 2.0, + "step": 1252, + "text_loss": 0.35751575231552124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.8875843850895215, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.166015625, + "learning_rate": 0.000986481427494559, + "loss": 0.0336, + "macro_f1": 0.3333333432674408, + "num_tokens": 2020485.0, + "repeat_count": 0.0, + "routers_loss": 0.007573372684419155, + "skip_count": 0.0, + "step": 1254, + "text_loss": 0.4061077833175659 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.896976812444966, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.000986409847843861, + "loss": 0.0382, + "macro_f1": 0.3272727429866791, + "num_tokens": 2024149.0, + "repeat_count": 1.0, + "routers_loss": 0.07447971403598785, + "skip_count": 0.0, + "step": 1256, + "text_loss": 0.41876497864723206 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.906369239800411, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000986338081799992, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 2026545.0, + "repeat_count": 0.0, + "routers_loss": 0.006609147880226374, + "skip_count": 0.0, + "step": 1258, + "text_loss": 0.4673794209957123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.915761667155856, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009862661293904523, + "loss": 0.0498, + "macro_f1": 0.32098764181137085, + "num_tokens": 2029581.0, + "repeat_count": 0.0, + "routers_loss": 0.10624702274799347, + "skip_count": 2.0, + "step": 1260, + "text_loss": 0.3483233153820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009861939906428145, + "loss": 0.0525, + "macro_f1": 0.3333333432674408, + "num_tokens": 2033936.0, + "repeat_count": 0.0, + "routers_loss": 0.007944886572659016, + "skip_count": 0.0, + "step": 1262, + "text_loss": 0.16362667083740234 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009861216655847225, + "loss": 0.0376, + "macro_f1": 0.6666666865348816, + "num_tokens": 2037876.0, + "repeat_count": 1.0, + "routers_loss": 0.007004092447459698, + "skip_count": 0.0, + "step": 1264, + "text_loss": 0.43228110671043396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.94393894922219, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009860491542438912, + "loss": 0.047, + "macro_f1": 0.3272727429866791, + "num_tokens": 2040842.0, + "repeat_count": 0.0, + "routers_loss": 0.026916226372122765, + "skip_count": 1.0, + "step": 1266, + "text_loss": 0.5901188850402832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.953331376577634, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.000985976456648107, + "loss": 0.0353, + "macro_f1": 0.3333333432674408, + "num_tokens": 2043890.0, + "repeat_count": 0.0, + "routers_loss": 0.007325216196477413, + "skip_count": 0.0, + "step": 1268, + "text_loss": 0.8780109882354736 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.962723803933079, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.10205078125, + "learning_rate": 0.000985903572825228, + "loss": 0.0306, + "macro_f1": 0.4871794879436493, + "num_tokens": 2048848.0, + "repeat_count": 0.0, + "routers_loss": 0.05007527023553848, + "skip_count": 2.0, + "step": 1270, + "text_loss": 0.5863722562789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.972116231288524, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.000985830502803183, + "loss": 0.0396, + "macro_f1": 0.3272727429866791, + "num_tokens": 2051561.0, + "repeat_count": 0.0, + "routers_loss": 0.023995524272322655, + "skip_count": 0.0, + "step": 1272, + "text_loss": 0.7460709810256958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.9815086586439685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009857572466099732, + "loss": 0.0431, + "macro_f1": 0.3333333432674408, + "num_tokens": 2054752.0, + "repeat_count": 0.0, + "routers_loss": 0.006928362417966127, + "skip_count": 0.0, + "step": 1274, + "text_loss": 0.5130293369293213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.162109375, + "learning_rate": 0.0009856838042736698, + "loss": 0.0501, + "macro_f1": 0.3333333432674408, + "num_tokens": 2058151.0, + "repeat_count": 0.0, + "routers_loss": 0.006969396956264973, + "skip_count": 0.0, + "step": 1276, + "text_loss": 0.5911393761634827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009856101758224166, + "loss": 0.0441, + "macro_f1": 0.3333333432674408, + "num_tokens": 2061012.0, + "repeat_count": 0.0, + "routers_loss": 0.003499418031424284, + "skip_count": 0.0, + "step": 1278, + "text_loss": 0.25347545742988586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.000985536361284428, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2064597.0, + "repeat_count": 0.0, + "routers_loss": 0.007856054231524467, + "skip_count": 0.0, + "step": 1280, + "text_loss": 0.7476963400840759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.01878485471089, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009854623606879898, + "loss": 0.0245, + "macro_f1": 0.3272727429866791, + "num_tokens": 2067972.0, + "repeat_count": 0.0, + "routers_loss": 0.02617792971432209, + "skip_count": 1.0, + "step": 1282, + "text_loss": 0.5775872468948364 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.028177282066334, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.000985388174061459, + "loss": 0.0356, + "macro_f1": 0.32098767161369324, + "num_tokens": 2071812.0, + "repeat_count": 0.0, + "routers_loss": 0.035979997366666794, + "skip_count": 1.0, + "step": 1284, + "text_loss": 0.2933400869369507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.037569709421779, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0009853138014332646, + "loss": 0.0273, + "macro_f1": 0.3333333432674408, + "num_tokens": 2074868.0, + "repeat_count": 0.0, + "routers_loss": 0.005142854526638985, + "skip_count": 0.0, + "step": 1286, + "text_loss": 0.29085102677345276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.0009852392428319058, + "loss": 0.0306, + "macro_f1": 0.3333333432674408, + "num_tokens": 2078225.0, + "repeat_count": 0.0, + "routers_loss": 0.0032799106556922197, + "skip_count": 0.0, + "step": 1288, + "text_loss": 0.7293626070022583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 6.056354564132668, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009851644982859537, + "loss": 0.0273, + "macro_f1": 0.480392187833786, + "num_tokens": 2081495.0, + "repeat_count": 1.0, + "routers_loss": 0.12224318832159042, + "skip_count": 3.0, + "step": 1290, + "text_loss": 0.26125892996788025 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.065746991488113, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009850895678240508, + "loss": 0.0283, + "macro_f1": 0.6666666865348816, + "num_tokens": 2084390.0, + "repeat_count": 1.0, + "routers_loss": 0.010662888176739216, + "skip_count": 0.0, + "step": 1292, + "text_loss": 0.3510764539241791 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.075139418843557, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1689453125, + "learning_rate": 0.0009850144514749104, + "loss": 0.0332, + "macro_f1": 0.5492662787437439, + "num_tokens": 2087210.0, + "repeat_count": 0.0, + "routers_loss": 0.01979079470038414, + "skip_count": 2.0, + "step": 1294, + "text_loss": 0.40202176570892334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.084531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.000984939149267317, + "loss": 0.0253, + "macro_f1": 0.6666666865348816, + "num_tokens": 2090777.0, + "repeat_count": 0.0, + "routers_loss": 0.005172552540898323, + "skip_count": 1.0, + "step": 1296, + "text_loss": 0.5275651216506958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.093924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009848636612301272, + "loss": 0.0299, + "macro_f1": 0.3333333432674408, + "num_tokens": 2094248.0, + "repeat_count": 0.0, + "routers_loss": 0.0029599082190543413, + "skip_count": 0.0, + "step": 1298, + "text_loss": 0.4517653286457062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0009847879873922675, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 2097139.0, + "repeat_count": 0.0, + "routers_loss": 0.011455860920250416, + "skip_count": 0.0, + "step": 1300, + "text_loss": 0.16888445615768433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.112709128265336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.0009847121277827366, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 2100415.0, + "repeat_count": 0.0, + "routers_loss": 0.008091195486485958, + "skip_count": 0.0, + "step": 1302, + "text_loss": 0.40061676502227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.122101555620781, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.000984636082430604, + "loss": 0.0285, + "macro_f1": 0.3333333432674408, + "num_tokens": 2103285.0, + "repeat_count": 0.0, + "routers_loss": 0.009593960829079151, + "skip_count": 0.0, + "step": 1304, + "text_loss": 0.7211073637008667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.107421875, + "learning_rate": 0.0009845598513650103, + "loss": 0.0231, + "macro_f1": 0.3333333432674408, + "num_tokens": 2106255.0, + "repeat_count": 0.0, + "routers_loss": 0.0023068038281053305, + "skip_count": 0.0, + "step": 1306, + "text_loss": 0.7077119946479797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009844834346151674, + "loss": 0.043, + "macro_f1": 0.3333333432674408, + "num_tokens": 2109305.0, + "repeat_count": 0.0, + "routers_loss": 0.007703019306063652, + "skip_count": 0.0, + "step": 1308, + "text_loss": 0.3534316122531891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.1502788376871145, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009844068322103585, + "loss": 0.0287, + "macro_f1": 0.3272727429866791, + "num_tokens": 2112216.0, + "repeat_count": 0.0, + "routers_loss": 0.023549847304821014, + "skip_count": 1.0, + "step": 1310, + "text_loss": 0.6792599558830261 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009843300441799378, + "loss": 0.0211, + "macro_f1": 0.3333333432674408, + "num_tokens": 2114925.0, + "repeat_count": 0.0, + "routers_loss": 0.007605871185660362, + "skip_count": 0.0, + "step": 1312, + "text_loss": 0.1571389138698578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.169063692398004, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009842530705533304, + "loss": 0.0253, + "macro_f1": 0.3272727429866791, + "num_tokens": 2117744.0, + "repeat_count": 0.0, + "routers_loss": 0.014964760281145573, + "skip_count": 0.0, + "step": 1314, + "text_loss": 0.7840361595153809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.000984175911360033, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2120848.0, + "repeat_count": 0.0, + "routers_loss": 0.004663798492401838, + "skip_count": 0.0, + "step": 1316, + "text_loss": 0.536246120929718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.187848547108893, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1201171875, + "learning_rate": 0.000984098566629613, + "loss": 0.0288, + "macro_f1": 0.5492662787437439, + "num_tokens": 2123651.0, + "repeat_count": 0.0, + "routers_loss": 0.022852955386042595, + "skip_count": 2.0, + "step": 1318, + "text_loss": 0.43372172117233276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.197240974464338, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009840210363917087, + "loss": 0.0216, + "macro_f1": 0.3333333432674408, + "num_tokens": 2128011.0, + "repeat_count": 0.0, + "routers_loss": 0.012578422203660011, + "skip_count": 0.0, + "step": 1320, + "text_loss": 0.28190380334854126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009839433206760306, + "loss": 0.0204, + "macro_f1": 0.3333333432674408, + "num_tokens": 2131035.0, + "repeat_count": 0.0, + "routers_loss": 0.006863643880933523, + "skip_count": 0.0, + "step": 1322, + "text_loss": 0.6340444087982178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.216025829175227, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.0009838654195123589, + "loss": 0.0243, + "macro_f1": 0.3333333432674408, + "num_tokens": 2133856.0, + "repeat_count": 0.0, + "routers_loss": 0.00468854233622551, + "skip_count": 0.0, + "step": 1324, + "text_loss": 0.5138425827026367 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009837873329305458, + "loss": 0.0396, + "macro_f1": 0.6666666865348816, + "num_tokens": 2136451.0, + "repeat_count": 1.0, + "routers_loss": 0.005731126759201288, + "skip_count": 0.0, + "step": 1326, + "text_loss": 0.742124617099762 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000983709060960514, + "loss": 0.0416, + "macro_f1": 0.3333333432674408, + "num_tokens": 2139496.0, + "repeat_count": 0.0, + "routers_loss": 0.0056343949399888515, + "skip_count": 0.0, + "step": 1328, + "text_loss": 0.7317464351654053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.2442031112415615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009836306036322576, + "loss": 0.0312, + "macro_f1": 0.3333333432674408, + "num_tokens": 2143120.0, + "repeat_count": 0.0, + "routers_loss": 0.005127966403961182, + "skip_count": 0.0, + "step": 1330, + "text_loss": 0.538652241230011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 6.253595538597006, + "f1_execute": 0.9130434989929199, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009835519609758415, + "loss": 0.0301, + "macro_f1": 0.590062141418457, + "num_tokens": 2145807.0, + "repeat_count": 3.0, + "routers_loss": 0.1673707216978073, + "skip_count": 4.0, + "step": 1332, + "text_loss": 0.3498198091983795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009834731330214017, + "loss": 0.0293, + "macro_f1": 0.3272727429866791, + "num_tokens": 2148397.0, + "repeat_count": 1.0, + "routers_loss": 0.04026653990149498, + "skip_count": 0.0, + "step": 1334, + "text_loss": 0.8153424859046936 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 27.0, + "epoch": 6.272380393307896, + "f1_execute": 0.8999999761581421, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.8000000715255737, + "grad_norm": 0.16015625, + "learning_rate": 0.0009833941197991455, + "loss": 0.0329, + "macro_f1": 0.7888889312744141, + "num_tokens": 2152226.0, + "repeat_count": 2.0, + "routers_loss": 0.05481519177556038, + "skip_count": 5.0, + "step": 1336, + "text_loss": 0.7802760004997253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.28177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009833149213393506, + "loss": 0.0304, + "macro_f1": 0.3272727429866791, + "num_tokens": 2156023.0, + "repeat_count": 0.0, + "routers_loss": 0.01760484278202057, + "skip_count": 0.0, + "step": 1338, + "text_loss": 0.19721226394176483 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.2911652480187845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.000983235537672366, + "loss": 0.0256, + "macro_f1": 0.3333333432674408, + "num_tokens": 2160037.0, + "repeat_count": 0.0, + "routers_loss": 0.013206037692725658, + "skip_count": 0.0, + "step": 1340, + "text_loss": 0.5003817081451416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.000983155968828612, + "loss": 0.0315, + "macro_f1": 0.6666666865348816, + "num_tokens": 2163910.0, + "repeat_count": 1.0, + "routers_loss": 0.01256406120955944, + "skip_count": 0.0, + "step": 1342, + "text_loss": 0.5996923446655273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.309950102729674, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009830762148385793, + "loss": 0.0313, + "macro_f1": 0.3272727429866791, + "num_tokens": 2166921.0, + "repeat_count": 0.0, + "routers_loss": 0.015086234547197819, + "skip_count": 1.0, + "step": 1344, + "text_loss": 0.45356282591819763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.319342530085119, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0009829962757328297, + "loss": 0.0223, + "macro_f1": 0.32098764181137085, + "num_tokens": 2170135.0, + "repeat_count": 0.0, + "routers_loss": 0.07909081131219864, + "skip_count": 2.0, + "step": 1346, + "text_loss": 0.2874644994735718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 6.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009829161515419959, + "loss": 0.0246, + "macro_f1": 0.6666666865348816, + "num_tokens": 2173029.0, + "repeat_count": 0.0, + "routers_loss": 0.013569854199886322, + "skip_count": 2.0, + "step": 1348, + "text_loss": 0.25533875823020935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.3381273847960085, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009828358422967823, + "loss": 0.0226, + "macro_f1": 0.32098764181137085, + "num_tokens": 2176605.0, + "repeat_count": 1.0, + "routers_loss": 0.08111091703176498, + "skip_count": 1.0, + "step": 1350, + "text_loss": 0.32827726006507874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 6.347519812151453, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.091796875, + "learning_rate": 0.0009827553480279627, + "loss": 0.03, + "macro_f1": 0.5427350401878357, + "num_tokens": 2179406.0, + "repeat_count": 0.0, + "routers_loss": 0.026550088077783585, + "skip_count": 2.0, + "step": 1352, + "text_loss": 0.2966301143169403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009826746687663832, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 2182353.0, + "repeat_count": 0.0, + "routers_loss": 0.003914554137736559, + "skip_count": 0.0, + "step": 1354, + "text_loss": 0.7596251964569092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 6.366304666862343, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0859375, + "learning_rate": 0.0009825938045429602, + "loss": 0.0324, + "macro_f1": 0.5866667032241821, + "num_tokens": 2185786.0, + "repeat_count": 1.0, + "routers_loss": 0.059612665325403214, + "skip_count": 3.0, + "step": 1356, + "text_loss": 0.12325898557901382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.375697094217787, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10009765625, + "learning_rate": 0.0009825127553886807, + "loss": 0.0375, + "macro_f1": 0.3333333432674408, + "num_tokens": 2190157.0, + "repeat_count": 0.0, + "routers_loss": 0.0071132429875433445, + "skip_count": 0.0, + "step": 1358, + "text_loss": 0.9287898540496826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009824315213346033, + "loss": 0.0348, + "macro_f1": 0.3333333432674408, + "num_tokens": 2193077.0, + "repeat_count": 0.0, + "routers_loss": 0.009611099027097225, + "skip_count": 0.0, + "step": 1360, + "text_loss": 0.20427259802818298 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.394481948928676, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009823501024118569, + "loss": 0.0285, + "macro_f1": 0.3333333432674408, + "num_tokens": 2196494.0, + "repeat_count": 0.0, + "routers_loss": 0.006913455203175545, + "skip_count": 0.0, + "step": 1362, + "text_loss": 0.574759840965271 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.403874376284121, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009822684986516411, + "loss": 0.0245, + "macro_f1": 0.3333333432674408, + "num_tokens": 2199839.0, + "repeat_count": 0.0, + "routers_loss": 0.009208920411765575, + "skip_count": 0.0, + "step": 1364, + "text_loss": 0.42422571778297424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.413266803639566, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.000982186710085227, + "loss": 0.0208, + "macro_f1": 0.32098764181137085, + "num_tokens": 2203212.0, + "repeat_count": 1.0, + "routers_loss": 0.059975091367959976, + "skip_count": 1.0, + "step": 1366, + "text_loss": 0.29213017225265503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 6.42265923099501, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.181640625, + "learning_rate": 0.0009821047367439561, + "loss": 0.0358, + "macro_f1": 0.44705885648727417, + "num_tokens": 2206240.0, + "repeat_count": 0.0, + "routers_loss": 0.048244867473840714, + "skip_count": 4.0, + "step": 1368, + "text_loss": 0.3072395324707031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.432051658350455, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009820225786592405, + "loss": 0.0375, + "macro_f1": 0.3272727429866791, + "num_tokens": 2209903.0, + "repeat_count": 1.0, + "routers_loss": 0.026068156585097313, + "skip_count": 0.0, + "step": 1370, + "text_loss": 0.5961400270462036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.4414440857059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.109375, + "learning_rate": 0.0009819402358625634, + "loss": 0.0366, + "macro_f1": 0.3272727429866791, + "num_tokens": 2213439.0, + "repeat_count": 0.0, + "routers_loss": 0.022615568712353706, + "skip_count": 1.0, + "step": 1372, + "text_loss": 0.19375644624233246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.450836513061344, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.000981857708385479, + "loss": 0.0346, + "macro_f1": 0.3333333432674408, + "num_tokens": 2216457.0, + "repeat_count": 0.0, + "routers_loss": 0.005855285096913576, + "skip_count": 0.0, + "step": 1374, + "text_loss": 0.5123368501663208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.460228940416789, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009817749962596114, + "loss": 0.0249, + "macro_f1": 0.3272727429866791, + "num_tokens": 2219975.0, + "repeat_count": 1.0, + "routers_loss": 0.0651634931564331, + "skip_count": 0.0, + "step": 1376, + "text_loss": 0.5999220609664917 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009816920995166568, + "loss": 0.0371, + "macro_f1": 0.6666666865348816, + "num_tokens": 2222833.0, + "repeat_count": 1.0, + "routers_loss": 0.011408994905650616, + "skip_count": 0.0, + "step": 1378, + "text_loss": 0.5323230624198914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.4790137951276785, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.0009816090181883807, + "loss": 0.0313, + "macro_f1": 0.32098764181137085, + "num_tokens": 2225842.0, + "repeat_count": 0.0, + "routers_loss": 0.039720915257930756, + "skip_count": 2.0, + "step": 1380, + "text_loss": 0.23363439738750458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009815257523066204, + "loss": 0.0249, + "macro_f1": 0.3333333432674408, + "num_tokens": 2229430.0, + "repeat_count": 0.0, + "routers_loss": 0.002765297656878829, + "skip_count": 0.0, + "step": 1382, + "text_loss": 0.718977689743042 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.497798649838567, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009814423019032835, + "loss": 0.0396, + "macro_f1": 0.5492662787437439, + "num_tokens": 2232594.0, + "repeat_count": 2.0, + "routers_loss": 0.05362323671579361, + "skip_count": 0.0, + "step": 1384, + "text_loss": 0.6392166614532471 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.507191077194013, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009813586670103483, + "loss": 0.0426, + "macro_f1": 0.6603773832321167, + "num_tokens": 2236327.0, + "repeat_count": 1.0, + "routers_loss": 0.031728316098451614, + "skip_count": 1.0, + "step": 1386, + "text_loss": 0.5951619148254395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.126953125, + "learning_rate": 0.0009812748476598638, + "loss": 0.031, + "macro_f1": 0.5492662787437439, + "num_tokens": 2239746.0, + "repeat_count": 0.0, + "routers_loss": 0.03981253132224083, + "skip_count": 2.0, + "step": 1388, + "text_loss": 0.22756551206111908 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.5259759319049016, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009811908438839498, + "loss": 0.0331, + "macro_f1": 0.5492662787437439, + "num_tokens": 2242786.0, + "repeat_count": 0.0, + "routers_loss": 0.04617162421345711, + "skip_count": 2.0, + "step": 1390, + "text_loss": 0.3233799934387207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.535368359260346, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.000981106655714797, + "loss": 0.0358, + "macro_f1": 0.3272727429866791, + "num_tokens": 2245696.0, + "repeat_count": 0.0, + "routers_loss": 0.046828847378492355, + "skip_count": 1.0, + "step": 1392, + "text_loss": 0.24273279309272766 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.544760786615791, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009810222831846656, + "loss": 0.0307, + "macro_f1": 0.5492662787437439, + "num_tokens": 2249326.0, + "repeat_count": 0.0, + "routers_loss": 0.010921589098870754, + "skip_count": 2.0, + "step": 1394, + "text_loss": 0.3921460807323456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.554153213971236, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009809377263258882, + "loss": 0.0315, + "macro_f1": 0.32098767161369324, + "num_tokens": 2253393.0, + "repeat_count": 0.0, + "routers_loss": 0.04564022272825241, + "skip_count": 1.0, + "step": 1396, + "text_loss": 0.582602858543396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.000980852985170867, + "loss": 0.0328, + "macro_f1": 0.3272727429866791, + "num_tokens": 2256626.0, + "repeat_count": 0.0, + "routers_loss": 0.013289985246956348, + "skip_count": 0.0, + "step": 1398, + "text_loss": 0.41031694412231445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.5729380686821255, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009807680597520745, + "loss": 0.0264, + "macro_f1": 0.3333333432674408, + "num_tokens": 2259326.0, + "repeat_count": 0.0, + "routers_loss": 0.0065213534981012344, + "skip_count": 0.0, + "step": 1400, + "text_loss": 0.2888098657131195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.58233049603757, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0009806829501020546, + "loss": 0.0358, + "macro_f1": 0.3272727429866791, + "num_tokens": 2262344.0, + "repeat_count": 0.0, + "routers_loss": 0.04199840500950813, + "skip_count": 1.0, + "step": 1402, + "text_loss": 0.31973034143447876 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.591722923393014, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009805976562534215, + "loss": 0.0317, + "macro_f1": 0.6603773832321167, + "num_tokens": 2266354.0, + "repeat_count": 1.0, + "routers_loss": 0.015434930101037025, + "skip_count": 1.0, + "step": 1404, + "text_loss": 0.508630633354187 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 6.601115350748459, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009805121782388599, + "loss": 0.0339, + "macro_f1": 0.6533333659172058, + "num_tokens": 2269660.0, + "repeat_count": 2.0, + "routers_loss": 0.0720924660563469, + "skip_count": 2.0, + "step": 1406, + "text_loss": 0.40927737951278687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.610507778103904, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009804265160911253, + "loss": 0.0266, + "macro_f1": 0.5492662787437439, + "num_tokens": 2273335.0, + "repeat_count": 0.0, + "routers_loss": 0.02400495670735836, + "skip_count": 2.0, + "step": 1408, + "text_loss": 0.1777762621641159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.6199002054593485, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2314453125, + "learning_rate": 0.0009803406698430433, + "loss": 0.0371, + "macro_f1": 0.3272727429866791, + "num_tokens": 2277107.0, + "repeat_count": 0.0, + "routers_loss": 0.02560107782483101, + "skip_count": 1.0, + "step": 1410, + "text_loss": 0.17955881357192993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.629292632814793, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009802546395275104, + "loss": 0.0349, + "macro_f1": 0.3333333432674408, + "num_tokens": 2281638.0, + "repeat_count": 0.0, + "routers_loss": 0.006655813194811344, + "skip_count": 0.0, + "step": 1412, + "text_loss": 0.20882295072078705 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 6.638685060170237, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.08740234375, + "learning_rate": 0.000980168425177494, + "loss": 0.0342, + "macro_f1": 0.8200000524520874, + "num_tokens": 2284876.0, + "repeat_count": 1.0, + "routers_loss": 0.06325097382068634, + "skip_count": 3.0, + "step": 1414, + "text_loss": 0.26035264134407043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.000980082026826031, + "loss": 0.0315, + "macro_f1": 0.3272727429866791, + "num_tokens": 2288938.0, + "repeat_count": 1.0, + "routers_loss": 0.013436575420200825, + "skip_count": 0.0, + "step": 1416, + "text_loss": 0.5502325892448425 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.657469914881127, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0009799954445062296, + "loss": 0.0193, + "macro_f1": 0.6603773832321167, + "num_tokens": 2292317.0, + "repeat_count": 1.0, + "routers_loss": 0.011264479719102383, + "skip_count": 1.0, + "step": 1418, + "text_loss": 0.48075684905052185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.666862342236572, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009799086782512686, + "loss": 0.0292, + "macro_f1": 0.5492662787437439, + "num_tokens": 2295935.0, + "repeat_count": 0.0, + "routers_loss": 0.02833271212875843, + "skip_count": 2.0, + "step": 1420, + "text_loss": 0.18221206963062286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09375, + "learning_rate": 0.0009798217280943967, + "loss": 0.0356, + "macro_f1": 0.6666666865348816, + "num_tokens": 2298927.0, + "repeat_count": 0.0, + "routers_loss": 0.009208574891090393, + "skip_count": 1.0, + "step": 1422, + "text_loss": 0.48686322569847107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.685647196947461, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009797345940689335, + "loss": 0.0267, + "macro_f1": 0.3272727429866791, + "num_tokens": 2301541.0, + "repeat_count": 0.0, + "routers_loss": 0.015011847950518131, + "skip_count": 0.0, + "step": 1424, + "text_loss": 0.49446266889572144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.4000000059604645, + "avg_layers": 26.0, + "epoch": 6.695039624302906, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.0, + "f1_skip": 0.5714285969734192, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009796472762082687, + "loss": 0.0338, + "macro_f1": 0.5034013986587524, + "num_tokens": 2304589.0, + "repeat_count": 0.0, + "routers_loss": 0.05912091210484505, + "skip_count": 5.0, + "step": 1426, + "text_loss": 0.23945684731006622 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.70443205165835, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.000979559774545863, + "loss": 0.0405, + "macro_f1": 0.3272727429866791, + "num_tokens": 2307860.0, + "repeat_count": 0.0, + "routers_loss": 0.021242303773760796, + "skip_count": 1.0, + "step": 1428, + "text_loss": 0.531273365020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.000979472089115247, + "loss": 0.0276, + "macro_f1": 0.32098764181137085, + "num_tokens": 2311581.0, + "repeat_count": 0.0, + "routers_loss": 0.02768544852733612, + "skip_count": 2.0, + "step": 1430, + "text_loss": 0.2497459501028061 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.000979384219950022, + "loss": 0.0346, + "macro_f1": 0.3333333432674408, + "num_tokens": 2314639.0, + "repeat_count": 0.0, + "routers_loss": 0.008678150363266468, + "skip_count": 0.0, + "step": 1432, + "text_loss": 0.6579355001449585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.732609333724684, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08056640625, + "learning_rate": 0.0009792961670838595, + "loss": 0.0362, + "macro_f1": 0.3272727429866791, + "num_tokens": 2317927.0, + "repeat_count": 1.0, + "routers_loss": 0.03325597569346428, + "skip_count": 0.0, + "step": 1434, + "text_loss": 0.5209436416625977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.742001761080129, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009792079305505016, + "loss": 0.0306, + "macro_f1": 0.3272727429866791, + "num_tokens": 2321065.0, + "repeat_count": 1.0, + "routers_loss": 0.019228918477892876, + "skip_count": 0.0, + "step": 1436, + "text_loss": 0.41087067127227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.000979119510383761, + "loss": 0.0371, + "macro_f1": 0.3333333432674408, + "num_tokens": 2323714.0, + "repeat_count": 0.0, + "routers_loss": 0.017071325331926346, + "skip_count": 0.0, + "step": 1438, + "text_loss": 0.21490029990673065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.760786615791019, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.00097903090661752, + "loss": 0.0309, + "macro_f1": 0.3333333432674408, + "num_tokens": 2326454.0, + "repeat_count": 0.0, + "routers_loss": 0.00991755723953247, + "skip_count": 0.0, + "step": 1440, + "text_loss": 0.23847346007823944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.232421875, + "learning_rate": 0.000978942119285732, + "loss": 0.0404, + "macro_f1": 0.3272727429866791, + "num_tokens": 2329462.0, + "repeat_count": 0.0, + "routers_loss": 0.04908733069896698, + "skip_count": 1.0, + "step": 1442, + "text_loss": 0.23343028128147125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.7795714705019074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009788531484224204, + "loss": 0.0264, + "macro_f1": 0.3333333432674408, + "num_tokens": 2332146.0, + "repeat_count": 0.0, + "routers_loss": 0.0032628148328512907, + "skip_count": 0.0, + "step": 1444, + "text_loss": 0.47423800826072693 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 6.788963897857353, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009787639940616788, + "loss": 0.0405, + "macro_f1": 0.7018141150474548, + "num_tokens": 2335738.0, + "repeat_count": 1.0, + "routers_loss": 0.14336998760700226, + "skip_count": 3.0, + "step": 1446, + "text_loss": 0.21837592124938965 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.189453125, + "learning_rate": 0.0009786746562376717, + "loss": 0.0241, + "macro_f1": 0.6666666865348816, + "num_tokens": 2338488.0, + "repeat_count": 0.0, + "routers_loss": 0.010542908683419228, + "skip_count": 1.0, + "step": 1448, + "text_loss": 1.0614757537841797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.807748752568242, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009785851349846334, + "loss": 0.0268, + "macro_f1": 0.3333333432674408, + "num_tokens": 2342074.0, + "repeat_count": 0.0, + "routers_loss": 0.005998016335070133, + "skip_count": 0.0, + "step": 1450, + "text_loss": 0.4269719421863556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 6.817141179923686, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009784954303368686, + "loss": 0.0384, + "macro_f1": 0.44705885648727417, + "num_tokens": 2345838.0, + "repeat_count": 0.0, + "routers_loss": 0.0959126204252243, + "skip_count": 3.0, + "step": 1452, + "text_loss": 0.3315916955471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009784055423287521, + "loss": 0.0218, + "macro_f1": 0.3333333432674408, + "num_tokens": 2348939.0, + "repeat_count": 0.0, + "routers_loss": 0.0025467623490840197, + "skip_count": 0.0, + "step": 1454, + "text_loss": 0.6162732839584351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.835926034634576, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009783154709947293, + "loss": 0.0256, + "macro_f1": 0.3272727429866791, + "num_tokens": 2352232.0, + "repeat_count": 0.0, + "routers_loss": 0.01860538125038147, + "skip_count": 1.0, + "step": 1456, + "text_loss": 0.23928768932819366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.84531846199002, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009782252163693158, + "loss": 0.0201, + "macro_f1": 0.3272727429866791, + "num_tokens": 2355159.0, + "repeat_count": 0.0, + "routers_loss": 0.04412713274359703, + "skip_count": 1.0, + "step": 1458, + "text_loss": 0.3371323347091675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.0009781347784870973, + "loss": 0.0379, + "macro_f1": 0.3333333432674408, + "num_tokens": 2358175.0, + "repeat_count": 0.0, + "routers_loss": 0.006809141952544451, + "skip_count": 0.0, + "step": 1460, + "text_loss": 0.547267735004425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.86410331670091, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009780441573827296, + "loss": 0.03, + "macro_f1": 0.3076923191547394, + "num_tokens": 2360991.0, + "repeat_count": 0.0, + "routers_loss": 0.08924390375614166, + "skip_count": 4.0, + "step": 1462, + "text_loss": 0.7026563882827759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.000977953353090939, + "loss": 0.0272, + "macro_f1": 0.3333333432674408, + "num_tokens": 2363894.0, + "repeat_count": 0.0, + "routers_loss": 0.021858472377061844, + "skip_count": 0.0, + "step": 1464, + "text_loss": 0.2718065083026886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.882888171411799, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009778623656465219, + "loss": 0.0338, + "macro_f1": 0.32098764181137085, + "num_tokens": 2367265.0, + "repeat_count": 0.0, + "routers_loss": 0.044781096279621124, + "skip_count": 0.0, + "step": 1466, + "text_loss": 0.5008095502853394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.892280598767244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009777711950843448, + "loss": 0.0212, + "macro_f1": 0.3333333432674408, + "num_tokens": 2370186.0, + "repeat_count": 0.0, + "routers_loss": 0.0040459707379341125, + "skip_count": 0.0, + "step": 1468, + "text_loss": 0.5242461562156677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 6.901673026122689, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009776798414393446, + "loss": 0.0279, + "macro_f1": 0.6598639488220215, + "num_tokens": 2373314.0, + "repeat_count": 1.0, + "routers_loss": 0.0708528608083725, + "skip_count": 3.0, + "step": 1470, + "text_loss": 0.2821732461452484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.911065453478133, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1328125, + "learning_rate": 0.0009775883047465279, + "loss": 0.0414, + "macro_f1": 0.31446540355682373, + "num_tokens": 2376435.0, + "repeat_count": 1.0, + "routers_loss": 0.0290578193962574, + "skip_count": 1.0, + "step": 1472, + "text_loss": 0.8438440561294556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.9204578808335775, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10546875, + "learning_rate": 0.000977496585040972, + "loss": 0.0373, + "macro_f1": 0.3333333432674408, + "num_tokens": 2380244.0, + "repeat_count": 0.0, + "routers_loss": 0.010360375046730042, + "skip_count": 0.0, + "step": 1474, + "text_loss": 0.4356135427951813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.929850308189023, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.000977404682357824, + "loss": 0.0294, + "macro_f1": 0.3272727429866791, + "num_tokens": 2383498.0, + "repeat_count": 0.0, + "routers_loss": 0.023518972098827362, + "skip_count": 0.0, + "step": 1476, + "text_loss": 0.25195425748825073 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 6.939242735544467, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.000977312596732301, + "loss": 0.0375, + "macro_f1": 0.9544159770011902, + "num_tokens": 2386414.0, + "repeat_count": 5.0, + "routers_loss": 0.08190606534481049, + "skip_count": 4.0, + "step": 1478, + "text_loss": 0.6586798429489136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10546875, + "learning_rate": 0.0009772203281996905, + "loss": 0.0336, + "macro_f1": 1.0, + "num_tokens": 2389399.0, + "repeat_count": 1.0, + "routers_loss": 0.016441475600004196, + "skip_count": 2.0, + "step": 1480, + "text_loss": 0.3671986758708954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009771278767953502, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 2392400.0, + "repeat_count": 0.0, + "routers_loss": 0.019211363047361374, + "skip_count": 0.0, + "step": 1482, + "text_loss": 0.27418580651283264 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.967420017610801, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009770352425547072, + "loss": 0.0292, + "macro_f1": 0.3333333432674408, + "num_tokens": 2395123.0, + "repeat_count": 0.0, + "routers_loss": 0.015800386667251587, + "skip_count": 0.0, + "step": 1484, + "text_loss": 0.19896622002124786 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.976812444966246, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009769424255132596, + "loss": 0.0256, + "macro_f1": 0.4871794879436493, + "num_tokens": 2397359.0, + "repeat_count": 3.0, + "routers_loss": 0.06670158356428146, + "skip_count": 0.0, + "step": 1486, + "text_loss": 0.4229799509048462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.98620487232169, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1162109375, + "learning_rate": 0.0009768494257065747, + "loss": 0.0218, + "macro_f1": 0.3272727429866791, + "num_tokens": 2400387.0, + "repeat_count": 0.0, + "routers_loss": 0.011144762858748436, + "skip_count": 1.0, + "step": 1488, + "text_loss": 0.4264226257801056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.995597299677136, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0009767562431702904, + "loss": 0.0387, + "macro_f1": 0.3006536364555359, + "num_tokens": 2403241.0, + "repeat_count": 2.0, + "routers_loss": 0.12339717149734497, + "skip_count": 3.0, + "step": 1490, + "text_loss": 0.2850193977355957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.004696213677723, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0009766628779401142, + "loss": 0.0215, + "macro_f1": 0.6666666865348816, + "num_tokens": 2406087.0, + "repeat_count": 0.0, + "routers_loss": 0.008174685761332512, + "skip_count": 1.0, + "step": 1492, + "text_loss": 0.6756544709205627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.000976569330051824, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 2409312.0, + "repeat_count": 0.0, + "routers_loss": 0.0021256296895444393, + "skip_count": 0.0, + "step": 1494, + "text_loss": 0.4789894223213196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.0234810683886115, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009764755995412677, + "loss": 0.0193, + "macro_f1": 0.3333333432674408, + "num_tokens": 2412758.0, + "repeat_count": 0.0, + "routers_loss": 0.003944927826523781, + "skip_count": 0.0, + "step": 1496, + "text_loss": 0.5157490968704224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.032873495744056, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009763816864443627, + "loss": 0.0239, + "macro_f1": 0.3272727429866791, + "num_tokens": 2416079.0, + "repeat_count": 1.0, + "routers_loss": 0.03893325850367546, + "skip_count": 0.0, + "step": 1498, + "text_loss": 0.28045418858528137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009762875907970968, + "loss": 0.0199, + "macro_f1": 0.3333333432674408, + "num_tokens": 2420340.0, + "repeat_count": 0.0, + "routers_loss": 0.0017725443467497826, + "skip_count": 0.0, + "step": 1500, + "text_loss": 0.35550856590270996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.051658350454946, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009761933126355277, + "loss": 0.0245, + "macro_f1": 0.3272727429866791, + "num_tokens": 2424735.0, + "repeat_count": 0.0, + "routers_loss": 0.01393749937415123, + "skip_count": 1.0, + "step": 1502, + "text_loss": 0.38840189576148987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009760988519957828, + "loss": 0.0249, + "macro_f1": 0.6666666865348816, + "num_tokens": 2428132.0, + "repeat_count": 0.0, + "routers_loss": 0.01687910407781601, + "skip_count": 2.0, + "step": 1504, + "text_loss": 0.3031681478023529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.0704432051658355, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009760042089140598, + "loss": 0.0193, + "macro_f1": 0.3144654333591461, + "num_tokens": 2431592.0, + "repeat_count": 1.0, + "routers_loss": 0.04704280197620392, + "skip_count": 2.0, + "step": 1506, + "text_loss": 0.16355200111865997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009759093834266259, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 2434236.0, + "repeat_count": 0.0, + "routers_loss": 0.0016075772000476718, + "skip_count": 0.0, + "step": 1508, + "text_loss": 0.6080073118209839 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009758143755698186, + "loss": 0.015, + "macro_f1": 0.3333333432674408, + "num_tokens": 2437170.0, + "repeat_count": 0.0, + "routers_loss": 0.008451299741864204, + "skip_count": 0.0, + "step": 1510, + "text_loss": 0.22100484371185303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 7.098620487232169, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009757191853800449, + "loss": 0.0227, + "macro_f1": 0.5866667032241821, + "num_tokens": 2441187.0, + "repeat_count": 1.0, + "routers_loss": 0.046565692871809006, + "skip_count": 3.0, + "step": 1512, + "text_loss": 0.25098952651023865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.108012914587614, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.000975623812893782, + "loss": 0.0276, + "macro_f1": 0.3272727429866791, + "num_tokens": 2444664.0, + "repeat_count": 0.0, + "routers_loss": 0.02872578240931034, + "skip_count": 1.0, + "step": 1514, + "text_loss": 0.4952253997325897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.1174053419430585, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.0009755282581475768, + "loss": 0.0233, + "macro_f1": 0.3333333432674408, + "num_tokens": 2447748.0, + "repeat_count": 0.0, + "routers_loss": 0.002055214950814843, + "skip_count": 0.0, + "step": 1516, + "text_loss": 0.7465500831604004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.126797769298503, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10302734375, + "learning_rate": 0.000975432521178046, + "loss": 0.0216, + "macro_f1": 0.3272727429866791, + "num_tokens": 2450834.0, + "repeat_count": 1.0, + "routers_loss": 0.04498551785945892, + "skip_count": 0.0, + "step": 1518, + "text_loss": 0.28144413232803345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009753366020218763, + "loss": 0.0234, + "macro_f1": 0.3333333432674408, + "num_tokens": 2454233.0, + "repeat_count": 0.0, + "routers_loss": 0.003669742727652192, + "skip_count": 0.0, + "step": 1520, + "text_loss": 0.5667551755905151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009752405007158238, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2457331.0, + "repeat_count": 0.0, + "routers_loss": 0.010455607436597347, + "skip_count": 0.0, + "step": 1522, + "text_loss": 0.19575810432434082 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 7.154975051364837, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009751442172967151, + "loss": 0.0193, + "macro_f1": 0.8823530077934265, + "num_tokens": 2459935.0, + "repeat_count": 2.0, + "routers_loss": 0.025189083069562912, + "skip_count": 1.0, + "step": 1524, + "text_loss": 0.45453405380249023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.164367478720282, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.000975047751801446, + "loss": 0.0187, + "macro_f1": 0.3272727429866791, + "num_tokens": 2463008.0, + "repeat_count": 0.0, + "routers_loss": 0.012297490611672401, + "skip_count": 0.0, + "step": 1526, + "text_loss": 0.31437572836875916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009749511042669823, + "loss": 0.0233, + "macro_f1": 0.3333333432674408, + "num_tokens": 2466475.0, + "repeat_count": 0.0, + "routers_loss": 0.011026266030967236, + "skip_count": 0.0, + "step": 1528, + "text_loss": 0.46604859828948975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.183152333431171, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009748542747303595, + "loss": 0.0182, + "macro_f1": 0.3272727429866791, + "num_tokens": 2469320.0, + "repeat_count": 0.0, + "routers_loss": 0.011934996582567692, + "skip_count": 1.0, + "step": 1530, + "text_loss": 0.7764923572540283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009747572632286827, + "loss": 0.0203, + "macro_f1": 0.3333333432674408, + "num_tokens": 2472468.0, + "repeat_count": 0.0, + "routers_loss": 0.005786920432001352, + "skip_count": 0.0, + "step": 1532, + "text_loss": 0.3555782437324524 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009746600697991271, + "loss": 0.02, + "macro_f1": 0.6666666865348816, + "num_tokens": 2475736.0, + "repeat_count": 1.0, + "routers_loss": 0.0026990731712430716, + "skip_count": 0.0, + "step": 1534, + "text_loss": 0.49561792612075806 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 7.2113296154975055, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0556640625, + "learning_rate": 0.0009745626944789375, + "loss": 0.0204, + "macro_f1": 0.8823530077934265, + "num_tokens": 2478887.0, + "repeat_count": 1.0, + "routers_loss": 0.020221207290887833, + "skip_count": 2.0, + "step": 1536, + "text_loss": 0.5375416278839111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.22072204285295, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009744651373054279, + "loss": 0.0286, + "macro_f1": 0.3272727429866791, + "num_tokens": 2481293.0, + "repeat_count": 0.0, + "routers_loss": 0.03131086751818657, + "skip_count": 1.0, + "step": 1538, + "text_loss": 0.5241039395332336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 7.230114470208394, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.08984375, + "learning_rate": 0.0009743673983159828, + "loss": 0.0241, + "macro_f1": 0.6122449040412903, + "num_tokens": 2484403.0, + "repeat_count": 0.0, + "routers_loss": 0.04448170214891434, + "skip_count": 4.0, + "step": 1540, + "text_loss": 0.7465724349021912 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009742694775480557, + "loss": 0.0265, + "macro_f1": 0.6666666865348816, + "num_tokens": 2487952.0, + "repeat_count": 0.0, + "routers_loss": 0.007171491626650095, + "skip_count": 1.0, + "step": 1542, + "text_loss": 0.2877117097377777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009741713750391703, + "loss": 0.0171, + "macro_f1": 0.6666666865348816, + "num_tokens": 2490815.0, + "repeat_count": 1.0, + "routers_loss": 0.004559285007417202, + "skip_count": 0.0, + "step": 1544, + "text_loss": 0.6097800135612488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.258291752274729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009740730908269193, + "loss": 0.0174, + "macro_f1": 0.3333333432674408, + "num_tokens": 2494727.0, + "repeat_count": 0.0, + "routers_loss": 0.005271553061902523, + "skip_count": 0.0, + "step": 1546, + "text_loss": 0.5431114435195923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009739746249489658, + "loss": 0.0239, + "macro_f1": 0.3333333432674408, + "num_tokens": 2499266.0, + "repeat_count": 0.0, + "routers_loss": 0.0015409323386847973, + "skip_count": 0.0, + "step": 1548, + "text_loss": 0.4702678322792053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.277076606985618, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1171875, + "learning_rate": 0.0009738759774430417, + "loss": 0.0216, + "macro_f1": 0.32098764181137085, + "num_tokens": 2502273.0, + "repeat_count": 1.0, + "routers_loss": 0.030183158814907074, + "skip_count": 1.0, + "step": 1550, + "text_loss": 0.3239189088344574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.286469034341063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009737771483469493, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 2507624.0, + "repeat_count": 0.0, + "routers_loss": 0.005410848651081324, + "skip_count": 0.0, + "step": 1552, + "text_loss": 0.4014642834663391 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009736781376985598, + "loss": 0.0168, + "macro_f1": 0.6666666865348816, + "num_tokens": 2510366.0, + "repeat_count": 0.0, + "routers_loss": 0.0066976165398955345, + "skip_count": 1.0, + "step": 1554, + "text_loss": 0.5924848914146423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.13671875, + "learning_rate": 0.0009735789455358144, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 2513317.0, + "repeat_count": 0.0, + "routers_loss": 0.002763477386906743, + "skip_count": 0.0, + "step": 1556, + "text_loss": 0.3222943842411041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.314646316407397, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009734795718967237, + "loss": 0.0283, + "macro_f1": 0.32098764181137085, + "num_tokens": 2516628.0, + "repeat_count": 0.0, + "routers_loss": 0.061566028743982315, + "skip_count": 2.0, + "step": 1558, + "text_loss": 0.3249334692955017 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009733800168193679, + "loss": 0.0228, + "macro_f1": 1.0, + "num_tokens": 2519424.0, + "repeat_count": 2.0, + "routers_loss": 0.017976421862840652, + "skip_count": 4.0, + "step": 1560, + "text_loss": 0.3341919481754303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.0009732802803418966, + "loss": 0.023, + "macro_f1": 0.3333333432674408, + "num_tokens": 2522922.0, + "repeat_count": 0.0, + "routers_loss": 0.002525332849472761, + "skip_count": 0.0, + "step": 1562, + "text_loss": 0.3176332712173462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.34282359847373, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07861328125, + "learning_rate": 0.0009731803625025292, + "loss": 0.0196, + "macro_f1": 0.3272727429866791, + "num_tokens": 2525811.0, + "repeat_count": 0.0, + "routers_loss": 0.015524424612522125, + "skip_count": 1.0, + "step": 1564, + "text_loss": 0.532774031162262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.3522160258291755, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009730802633395541, + "loss": 0.0257, + "macro_f1": 0.6603773832321167, + "num_tokens": 2529157.0, + "repeat_count": 1.0, + "routers_loss": 0.08138631284236908, + "skip_count": 1.0, + "step": 1566, + "text_loss": 0.529487133026123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009729799828913298, + "loss": 0.0223, + "macro_f1": 0.3333333432674408, + "num_tokens": 2532249.0, + "repeat_count": 0.0, + "routers_loss": 0.0035867292899638414, + "skip_count": 0.0, + "step": 1568, + "text_loss": 0.503160297870636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.371000880540064, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009728795211962838, + "loss": 0.0259, + "macro_f1": 0.5492662787437439, + "num_tokens": 2535904.0, + "repeat_count": 0.0, + "routers_loss": 0.02987455204129219, + "skip_count": 2.0, + "step": 1570, + "text_loss": 0.9170270562171936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.380393307895509, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11865234375, + "learning_rate": 0.0009727788782929131, + "loss": 0.0273, + "macro_f1": 0.3272727429866791, + "num_tokens": 2538943.0, + "repeat_count": 1.0, + "routers_loss": 0.04676021635532379, + "skip_count": 0.0, + "step": 1572, + "text_loss": 0.29146310687065125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.389785735250954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009726780542197844, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 2541805.0, + "repeat_count": 0.0, + "routers_loss": 0.002127803163602948, + "skip_count": 0.0, + "step": 1574, + "text_loss": 1.0126502513885498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009725770490155338, + "loss": 0.0262, + "macro_f1": 0.3333333432674408, + "num_tokens": 2546213.0, + "repeat_count": 0.0, + "routers_loss": 0.007609677035361528, + "skip_count": 0.0, + "step": 1576, + "text_loss": 0.190168559551239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.408570589961843, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009724758627188665, + "loss": 0.0356, + "macro_f1": 0.3272727429866791, + "num_tokens": 2549554.0, + "repeat_count": 0.0, + "routers_loss": 0.033554721623659134, + "skip_count": 1.0, + "step": 1578, + "text_loss": 0.2977406084537506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.4179630173172875, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009723744953685572, + "loss": 0.028, + "macro_f1": 0.3272727429866791, + "num_tokens": 2552785.0, + "repeat_count": 1.0, + "routers_loss": 0.027864238247275352, + "skip_count": 0.0, + "step": 1580, + "text_loss": 0.2700682580471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19921875, + "learning_rate": 0.0009722729470034503, + "loss": 0.0224, + "macro_f1": 0.3333333432674408, + "num_tokens": 2556550.0, + "repeat_count": 0.0, + "routers_loss": 0.004798175301402807, + "skip_count": 0.0, + "step": 1582, + "text_loss": 0.6559903025627136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.436747872028177, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.078125, + "learning_rate": 0.0009721712176624591, + "loss": 0.0242, + "macro_f1": 0.3333333432674408, + "num_tokens": 2559862.0, + "repeat_count": 0.0, + "routers_loss": 0.013764148578047752, + "skip_count": 0.0, + "step": 1584, + "text_loss": 0.2257535308599472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.446140299383622, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009720693073845667, + "loss": 0.032, + "macro_f1": 0.5492662787437439, + "num_tokens": 2562766.0, + "repeat_count": 0.0, + "routers_loss": 0.01937069371342659, + "skip_count": 2.0, + "step": 1586, + "text_loss": 0.178413525223732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.455532726739067, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009719672162088252, + "loss": 0.0306, + "macro_f1": 0.32098767161369324, + "num_tokens": 2566583.0, + "repeat_count": 1.0, + "routers_loss": 0.06224144622683525, + "skip_count": 0.0, + "step": 1588, + "text_loss": 0.3992367684841156 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 7.464925154094511, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.185546875, + "learning_rate": 0.0009718649441743559, + "loss": 0.0239, + "macro_f1": 0.9449735879898071, + "num_tokens": 2569516.0, + "repeat_count": 2.0, + "routers_loss": 0.06937911361455917, + "skip_count": 4.0, + "step": 1590, + "text_loss": 0.1945122629404068 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.00097176249132035, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2572418.0, + "repeat_count": 0.0, + "routers_loss": 0.0034326619934290648, + "skip_count": 0.0, + "step": 1592, + "text_loss": 0.6259906888008118 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009716598576860676, + "loss": 0.0278, + "macro_f1": 0.6666666865348816, + "num_tokens": 2575235.0, + "repeat_count": 1.0, + "routers_loss": 0.004557516425848007, + "skip_count": 0.0, + "step": 1594, + "text_loss": 0.6638736724853516 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 7.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009715570433108378, + "loss": 0.0198, + "macro_f1": 1.0, + "num_tokens": 2578157.0, + "repeat_count": 1.0, + "routers_loss": 0.015363055281341076, + "skip_count": 1.0, + "step": 1596, + "text_loss": 0.6530464887619019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009714540482340595, + "loss": 0.0268, + "macro_f1": 0.6666666865348816, + "num_tokens": 2581801.0, + "repeat_count": 1.0, + "routers_loss": 0.01257144846022129, + "skip_count": 0.0, + "step": 1598, + "text_loss": 0.5916110277175903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.5118872908717345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009713508724952006, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 2585204.0, + "repeat_count": 0.0, + "routers_loss": 0.003175645601004362, + "skip_count": 0.0, + "step": 1600, + "text_loss": 0.27901601791381836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0009712475161337981, + "loss": 0.0261, + "macro_f1": 0.3333333432674408, + "num_tokens": 2588286.0, + "repeat_count": 0.0, + "routers_loss": 0.004122321493923664, + "skip_count": 0.0, + "step": 1602, + "text_loss": 0.42420244216918945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009711439791894585, + "loss": 0.0341, + "macro_f1": 0.6666666865348816, + "num_tokens": 2591476.0, + "repeat_count": 0.0, + "routers_loss": 0.011215819045901299, + "skip_count": 1.0, + "step": 1604, + "text_loss": 0.5549933910369873 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.540064572938069, + "f1_execute": 0.9599999785423279, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.0703125, + "learning_rate": 0.0009710402617018574, + "loss": 0.0172, + "macro_f1": 0.8200000524520874, + "num_tokens": 2594336.0, + "repeat_count": 1.0, + "routers_loss": 0.02916567400097847, + "skip_count": 2.0, + "step": 1606, + "text_loss": 0.3263779282569885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009709363637107393, + "loss": 0.0209, + "macro_f1": 0.6666666865348816, + "num_tokens": 2597462.0, + "repeat_count": 0.0, + "routers_loss": 0.015897957608103752, + "skip_count": 1.0, + "step": 1608, + "text_loss": 0.20917139947414398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009708322852559184, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2601543.0, + "repeat_count": 0.0, + "routers_loss": 0.002211357234045863, + "skip_count": 0.0, + "step": 1610, + "text_loss": 0.450550377368927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.568241855004403, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009707280263772776, + "loss": 0.0277, + "macro_f1": 0.6666666865348816, + "num_tokens": 2604462.0, + "repeat_count": 0.0, + "routers_loss": 0.01615734025835991, + "skip_count": 2.0, + "step": 1612, + "text_loss": 0.6908381581306458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.577634282359847, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009706235871147688, + "loss": 0.0241, + "macro_f1": 0.5492662787437439, + "num_tokens": 2607484.0, + "repeat_count": 0.0, + "routers_loss": 0.022048067301511765, + "skip_count": 2.0, + "step": 1614, + "text_loss": 0.36691340804100037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.587026709715292, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10546875, + "learning_rate": 0.0009705189675084138, + "loss": 0.0176, + "macro_f1": 0.6666666865348816, + "num_tokens": 2610204.0, + "repeat_count": 0.0, + "routers_loss": 0.008503952994942665, + "skip_count": 1.0, + "step": 1616, + "text_loss": 0.5226598381996155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.596419137070737, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009704141675983029, + "loss": 0.0248, + "macro_f1": 0.3333333432674408, + "num_tokens": 2613128.0, + "repeat_count": 0.0, + "routers_loss": 0.0019020626787096262, + "skip_count": 0.0, + "step": 1618, + "text_loss": 0.6465088725090027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5714285969734192, + "avg_layers": 24.0, + "epoch": 7.6058115644261814, + "f1_execute": 0.9333333373069763, + "f1_repeat": 0.0, + "f1_skip": 0.7272727489471436, + "grad_norm": 0.107421875, + "learning_rate": 0.0009703091874245956, + "loss": 0.032, + "macro_f1": 0.5535354018211365, + "num_tokens": 2616360.0, + "repeat_count": 0.0, + "routers_loss": 0.11837691068649292, + "skip_count": 7.0, + "step": 1620, + "text_loss": 0.2987039089202881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009702040270275204, + "loss": 0.0181, + "macro_f1": 0.3333333432674408, + "num_tokens": 2619606.0, + "repeat_count": 0.0, + "routers_loss": 0.0065958453342318535, + "skip_count": 0.0, + "step": 1622, + "text_loss": 0.6262096166610718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.103515625, + "learning_rate": 0.000970098686447375, + "loss": 0.0257, + "macro_f1": 0.6666666865348816, + "num_tokens": 2622499.0, + "repeat_count": 0.0, + "routers_loss": 0.013632026500999928, + "skip_count": 1.0, + "step": 1624, + "text_loss": 0.2392602562904358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.633988846492516, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.125, + "learning_rate": 0.0009699931657245264, + "loss": 0.0245, + "macro_f1": 0.5492662787437439, + "num_tokens": 2626002.0, + "repeat_count": 0.0, + "routers_loss": 0.012147823348641396, + "skip_count": 2.0, + "step": 1626, + "text_loss": 0.4742976129055023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009698874648994098, + "loss": 0.0285, + "macro_f1": 1.0, + "num_tokens": 2629847.0, + "repeat_count": 1.0, + "routers_loss": 0.010692884214222431, + "skip_count": 3.0, + "step": 1628, + "text_loss": 0.5090685486793518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.6527737012034045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009697815840125304, + "loss": 0.0265, + "macro_f1": 0.3333333432674408, + "num_tokens": 2633529.0, + "repeat_count": 0.0, + "routers_loss": 0.011442207731306553, + "skip_count": 0.0, + "step": 1630, + "text_loss": 0.1874329298734665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2119140625, + "learning_rate": 0.0009696755231044618, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 2636321.0, + "repeat_count": 0.0, + "routers_loss": 0.0026681360322982073, + "skip_count": 0.0, + "step": 1632, + "text_loss": 0.7650400400161743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.671558555914294, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10498046875, + "learning_rate": 0.0009695692822158466, + "loss": 0.0242, + "macro_f1": 0.3272727429866791, + "num_tokens": 2638840.0, + "repeat_count": 1.0, + "routers_loss": 0.033965807408094406, + "skip_count": 0.0, + "step": 1634, + "text_loss": 0.6175784468650818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009694628613873968, + "loss": 0.018, + "macro_f1": 0.3333333432674408, + "num_tokens": 2641886.0, + "repeat_count": 0.0, + "routers_loss": 0.007568214554339647, + "skip_count": 0.0, + "step": 1636, + "text_loss": 0.43139931559562683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.690343410625183, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009693562606598929, + "loss": 0.025, + "macro_f1": 0.3333333432674408, + "num_tokens": 2645028.0, + "repeat_count": 0.0, + "routers_loss": 0.004973865579813719, + "skip_count": 0.0, + "step": 1638, + "text_loss": 0.6430339217185974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.699735837980628, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009692494800741844, + "loss": 0.0313, + "macro_f1": 0.3272727429866791, + "num_tokens": 2648209.0, + "repeat_count": 1.0, + "routers_loss": 0.049863800406455994, + "skip_count": 0.0, + "step": 1640, + "text_loss": 0.28138160705566406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.709128265336073, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.0009691425196711901, + "loss": 0.0398, + "macro_f1": 0.3272727429866791, + "num_tokens": 2651171.0, + "repeat_count": 0.0, + "routers_loss": 0.02112230286002159, + "skip_count": 0.0, + "step": 1642, + "text_loss": 0.3745322525501251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.718520692691517, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009690353794918971, + "loss": 0.0275, + "macro_f1": 0.3333333432674408, + "num_tokens": 2654093.0, + "repeat_count": 0.0, + "routers_loss": 0.0024304776452481747, + "skip_count": 0.0, + "step": 1644, + "text_loss": 0.4275154173374176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.000968928059577362, + "loss": 0.0244, + "macro_f1": 0.6666666865348816, + "num_tokens": 2657079.0, + "repeat_count": 0.0, + "routers_loss": 0.009320619516074657, + "skip_count": 1.0, + "step": 1646, + "text_loss": 0.46650025248527527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.737305547402407, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009688205599687099, + "loss": 0.0209, + "macro_f1": 0.3272727429866791, + "num_tokens": 2660951.0, + "repeat_count": 0.0, + "routers_loss": 0.011913162656128407, + "skip_count": 0.0, + "step": 1648, + "text_loss": 0.46644100546836853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.7466979747578515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009687128807071347, + "loss": 0.0284, + "macro_f1": 0.3333333432674408, + "num_tokens": 2663823.0, + "repeat_count": 0.0, + "routers_loss": 0.013754756189882755, + "skip_count": 0.0, + "step": 1650, + "text_loss": 0.40808847546577454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.0009686050218338996, + "loss": 0.0286, + "macro_f1": 0.3333333432674408, + "num_tokens": 2667079.0, + "repeat_count": 0.0, + "routers_loss": 0.009099726565182209, + "skip_count": 0.0, + "step": 1652, + "text_loss": 0.2389989197254181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009684969833903359, + "loss": 0.0283, + "macro_f1": 0.6666666865348816, + "num_tokens": 2670162.0, + "repeat_count": 0.0, + "routers_loss": 0.0034928603563457727, + "skip_count": 1.0, + "step": 1654, + "text_loss": 0.6930749416351318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.774875256824186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009683887654178445, + "loss": 0.0261, + "macro_f1": 0.6666666865348816, + "num_tokens": 2673031.0, + "repeat_count": 0.0, + "routers_loss": 0.008340462110936642, + "skip_count": 1.0, + "step": 1656, + "text_loss": 0.277752548456192 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009682803679578947, + "loss": 0.0259, + "macro_f1": 0.3333333432674408, + "num_tokens": 2676092.0, + "repeat_count": 0.0, + "routers_loss": 0.004337446764111519, + "skip_count": 0.0, + "step": 1658, + "text_loss": 0.5176776051521301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.7936601115350745, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009681717910520244, + "loss": 0.0242, + "macro_f1": 0.32098764181137085, + "num_tokens": 2679479.0, + "repeat_count": 0.0, + "routers_loss": 0.034611742943525314, + "skip_count": 2.0, + "step": 1660, + "text_loss": 0.21485982835292816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.80305253889052, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009680630347418406, + "loss": 0.022, + "macro_f1": 0.5492662787437439, + "num_tokens": 2683289.0, + "repeat_count": 0.0, + "routers_loss": 0.03297121450304985, + "skip_count": 2.0, + "step": 1662, + "text_loss": 0.33801013231277466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.812444966245964, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.000967954099069019, + "loss": 0.0411, + "macro_f1": 0.32098764181137085, + "num_tokens": 2685879.0, + "repeat_count": 1.0, + "routers_loss": 0.04551183059811592, + "skip_count": 1.0, + "step": 1664, + "text_loss": 0.41123488545417786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.821837393601409, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009678449840753038, + "loss": 0.0324, + "macro_f1": 0.32098764181137085, + "num_tokens": 2688910.0, + "repeat_count": 0.0, + "routers_loss": 0.05866450071334839, + "skip_count": 2.0, + "step": 1666, + "text_loss": 0.1740892380475998 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009677356898025082, + "loss": 0.023, + "macro_f1": 0.3333333432674408, + "num_tokens": 2691680.0, + "repeat_count": 0.0, + "routers_loss": 0.009243223816156387, + "skip_count": 0.0, + "step": 1668, + "text_loss": 0.2512350380420685 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.8406222483122985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.000967626216292514, + "loss": 0.0195, + "macro_f1": 0.3333333432674408, + "num_tokens": 2694895.0, + "repeat_count": 0.0, + "routers_loss": 0.005576452240347862, + "skip_count": 0.0, + "step": 1670, + "text_loss": 0.43294376134872437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 7.850014675667743, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.09130859375, + "learning_rate": 0.0009675165635872715, + "loss": 0.0306, + "macro_f1": 0.44705885648727417, + "num_tokens": 2697806.0, + "repeat_count": 0.0, + "routers_loss": 0.05372785031795502, + "skip_count": 3.0, + "step": 1672, + "text_loss": 0.1614082306623459 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009674067317288, + "loss": 0.0296, + "macro_f1": 0.6666666865348816, + "num_tokens": 2700529.0, + "repeat_count": 1.0, + "routers_loss": 0.018131591379642487, + "skip_count": 0.0, + "step": 1674, + "text_loss": 0.2093173861503601 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009672967207591869, + "loss": 0.0257, + "macro_f1": 0.3272727429866791, + "num_tokens": 2703650.0, + "repeat_count": 0.0, + "routers_loss": 0.0673515796661377, + "skip_count": 1.0, + "step": 1676, + "text_loss": 0.3029400110244751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.878191957734077, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009671865307205892, + "loss": 0.021, + "macro_f1": 0.32098767161369324, + "num_tokens": 2707615.0, + "repeat_count": 0.0, + "routers_loss": 0.03821169584989548, + "skip_count": 1.0, + "step": 1678, + "text_loss": 0.2262786477804184 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 7.8875843850895215, + "f1_execute": 0.9756097793579102, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009670761616552315, + "loss": 0.0465, + "macro_f1": 0.9615669250488281, + "num_tokens": 2710894.0, + "repeat_count": 2.0, + "routers_loss": 0.042625464498996735, + "skip_count": 6.0, + "step": 1680, + "text_loss": 0.29623574018478394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.896976812444966, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009669656136054074, + "loss": 0.0289, + "macro_f1": 0.3333333432674408, + "num_tokens": 2714330.0, + "repeat_count": 0.0, + "routers_loss": 0.0037571541033685207, + "skip_count": 0.0, + "step": 1682, + "text_loss": 0.7510389089584351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.906369239800411, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0009668548866134795, + "loss": 0.0256, + "macro_f1": 0.3333333432674408, + "num_tokens": 2717176.0, + "repeat_count": 0.0, + "routers_loss": 0.004142968449741602, + "skip_count": 0.0, + "step": 1684, + "text_loss": 0.3273485600948334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009667439807218783, + "loss": 0.0233, + "macro_f1": 0.6666666865348816, + "num_tokens": 2720628.0, + "repeat_count": 0.0, + "routers_loss": 0.008753842674195766, + "skip_count": 2.0, + "step": 1686, + "text_loss": 0.4314708709716797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.9251540945113, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009666328959731033, + "loss": 0.0211, + "macro_f1": 0.6603773832321167, + "num_tokens": 2723739.0, + "repeat_count": 1.0, + "routers_loss": 0.022674910724163055, + "skip_count": 1.0, + "step": 1688, + "text_loss": 0.25734150409698486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 7.934546521866745, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009665216324097222, + "loss": 0.0324, + "macro_f1": 0.5934640765190125, + "num_tokens": 2726644.0, + "repeat_count": 0.0, + "routers_loss": 0.03932750225067139, + "skip_count": 3.0, + "step": 1690, + "text_loss": 0.24511034786701202 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.94393894922219, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.0009664101900743714, + "loss": 0.0255, + "macro_f1": 0.3272727429866791, + "num_tokens": 2729662.0, + "repeat_count": 0.0, + "routers_loss": 0.012672754004597664, + "skip_count": 1.0, + "step": 1692, + "text_loss": 0.39431414008140564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.953331376577634, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.076171875, + "learning_rate": 0.000966298569009756, + "loss": 0.0231, + "macro_f1": 0.5492662787437439, + "num_tokens": 2732578.0, + "repeat_count": 0.0, + "routers_loss": 0.01548632513731718, + "skip_count": 2.0, + "step": 1694, + "text_loss": 0.12439999729394913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.962723803933079, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009661867692586494, + "loss": 0.0153, + "macro_f1": 0.32098764181137085, + "num_tokens": 2735887.0, + "repeat_count": 0.0, + "routers_loss": 0.05622401833534241, + "skip_count": 2.0, + "step": 1696, + "text_loss": 0.29024389386177063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.972116231288524, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.087890625, + "learning_rate": 0.0009660747908638933, + "loss": 0.0205, + "macro_f1": 0.3272727429866791, + "num_tokens": 2739293.0, + "repeat_count": 0.0, + "routers_loss": 0.041060201823711395, + "skip_count": 1.0, + "step": 1698, + "text_loss": 0.39461007714271545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.9815086586439685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1767578125, + "learning_rate": 0.0009659626338683981, + "loss": 0.0369, + "macro_f1": 0.3333333432674408, + "num_tokens": 2742468.0, + "repeat_count": 0.0, + "routers_loss": 0.007251353468745947, + "skip_count": 0.0, + "step": 1700, + "text_loss": 0.2751767635345459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.990901085999413, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009658502983151427, + "loss": 0.0186, + "macro_f1": 0.3272727429866791, + "num_tokens": 2745123.0, + "repeat_count": 0.0, + "routers_loss": 0.012847424484789371, + "skip_count": 1.0, + "step": 1702, + "text_loss": 0.4756404757499695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009657377842471742, + "loss": 0.0313, + "macro_f1": 0.6666666865348816, + "num_tokens": 2748016.0, + "repeat_count": 0.0, + "routers_loss": 0.007060411386191845, + "skip_count": 1.0, + "step": 1704, + "text_loss": 0.9571210145950317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.009392427355445, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10009765625, + "learning_rate": 0.0009656250917076081, + "loss": 0.0188, + "macro_f1": 0.5492662787437439, + "num_tokens": 2750717.0, + "repeat_count": 0.0, + "routers_loss": 0.016748681664466858, + "skip_count": 2.0, + "step": 1706, + "text_loss": 0.14542843401432037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0009655122207396285, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 2753635.0, + "repeat_count": 0.0, + "routers_loss": 0.013607042841613293, + "skip_count": 0.0, + "step": 1708, + "text_loss": 0.21836471557617188 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009653991713864878, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2756643.0, + "repeat_count": 0.0, + "routers_loss": 0.0012097888393327594, + "skip_count": 0.0, + "step": 1710, + "text_loss": 0.635187029838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1171875, + "learning_rate": 0.0009652859436915066, + "loss": 0.0231, + "macro_f1": 0.3333333432674408, + "num_tokens": 2759432.0, + "repeat_count": 0.0, + "routers_loss": 0.006196760106831789, + "skip_count": 0.0, + "step": 1712, + "text_loss": 0.5629420876502991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009651725376980743, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 2762538.0, + "repeat_count": 0.0, + "routers_loss": 0.0042513771913945675, + "skip_count": 0.0, + "step": 1714, + "text_loss": 0.39522525668144226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 8.056354564132668, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009650589534496479, + "loss": 0.0194, + "macro_f1": 0.8194444179534912, + "num_tokens": 2765571.0, + "repeat_count": 2.0, + "routers_loss": 0.03596706688404083, + "skip_count": 3.0, + "step": 1716, + "text_loss": 0.6252416968345642 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009649451909897532, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 2769206.0, + "repeat_count": 0.0, + "routers_loss": 0.0025788163766264915, + "skip_count": 0.0, + "step": 1718, + "text_loss": 0.8851634860038757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009648312503619843, + "loss": 0.0265, + "macro_f1": 0.3333333432674408, + "num_tokens": 2772488.0, + "repeat_count": 0.0, + "routers_loss": 0.004443451762199402, + "skip_count": 0.0, + "step": 1720, + "text_loss": 0.8568580746650696 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 8.084531846199003, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009647171316100034, + "loss": 0.0265, + "macro_f1": 0.9265305995941162, + "num_tokens": 2776482.0, + "repeat_count": 1.0, + "routers_loss": 0.022948263213038445, + "skip_count": 3.0, + "step": 1722, + "text_loss": 0.13431036472320557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009646028347775409, + "loss": 0.0204, + "macro_f1": 0.6666666865348816, + "num_tokens": 2778966.0, + "repeat_count": 0.0, + "routers_loss": 0.011328035034239292, + "skip_count": 1.0, + "step": 1724, + "text_loss": 0.2085491120815277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009644883599083958, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2781968.0, + "repeat_count": 0.0, + "routers_loss": 0.002208018908277154, + "skip_count": 0.0, + "step": 1726, + "text_loss": 0.4948323965072632 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.112709128265337, + "f1_execute": 0.9411764740943909, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009643737070464349, + "loss": 0.0158, + "macro_f1": 0.6470588445663452, + "num_tokens": 2784666.0, + "repeat_count": 1.0, + "routers_loss": 0.04391832649707794, + "skip_count": 2.0, + "step": 1728, + "text_loss": 0.39060094952583313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0009642588762355935, + "loss": 0.0212, + "macro_f1": 0.6666666865348816, + "num_tokens": 2787558.0, + "repeat_count": 0.0, + "routers_loss": 0.004497280344367027, + "skip_count": 1.0, + "step": 1730, + "text_loss": 0.34908708930015564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009641438675198748, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 2790474.0, + "repeat_count": 0.0, + "routers_loss": 0.00583475548774004, + "skip_count": 0.0, + "step": 1732, + "text_loss": 0.5720033049583435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0009640286809433508, + "loss": 0.0235, + "macro_f1": 0.3333333432674408, + "num_tokens": 2793272.0, + "repeat_count": 0.0, + "routers_loss": 0.007826375775039196, + "skip_count": 0.0, + "step": 1734, + "text_loss": 0.32181721925735474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009639133165501606, + "loss": 0.0192, + "macro_f1": 0.3333333432674408, + "num_tokens": 2797726.0, + "repeat_count": 0.0, + "routers_loss": 0.0019055595621466637, + "skip_count": 0.0, + "step": 1736, + "text_loss": 0.620936393737793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009637977743845124, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2800706.0, + "repeat_count": 0.0, + "routers_loss": 0.0028302327264100313, + "skip_count": 0.0, + "step": 1738, + "text_loss": 0.6473138332366943 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009636820544906823, + "loss": 0.0146, + "macro_f1": 1.0, + "num_tokens": 2803847.0, + "repeat_count": 1.0, + "routers_loss": 0.01105099730193615, + "skip_count": 2.0, + "step": 1740, + "text_loss": 0.4401201903820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.178456119753449, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009635661569130141, + "loss": 0.0195, + "macro_f1": 0.5934640765190125, + "num_tokens": 2807235.0, + "repeat_count": 0.0, + "routers_loss": 0.02619045600295067, + "skip_count": 3.0, + "step": 1742, + "text_loss": 0.459264874458313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.187848547108894, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009634500816959202, + "loss": 0.0162, + "macro_f1": 0.6666666865348816, + "num_tokens": 2810396.0, + "repeat_count": 0.0, + "routers_loss": 0.007915694266557693, + "skip_count": 2.0, + "step": 1744, + "text_loss": 0.5084020495414734 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009633338288838805, + "loss": 0.0271, + "macro_f1": 0.5492662787437439, + "num_tokens": 2813215.0, + "repeat_count": 2.0, + "routers_loss": 0.08364596217870712, + "skip_count": 0.0, + "step": 1746, + "text_loss": 0.27681824564933777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 30.0, + "epoch": 8.206633401819783, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009632173985214438, + "loss": 0.0156, + "macro_f1": 0.8817967176437378, + "num_tokens": 2816452.0, + "repeat_count": 3.0, + "routers_loss": 0.028805451467633247, + "skip_count": 2.0, + "step": 1748, + "text_loss": 0.4678419530391693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.216025829175228, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.000963100790653226, + "loss": 0.0188, + "macro_f1": 0.3272727429866791, + "num_tokens": 2819364.0, + "repeat_count": 0.0, + "routers_loss": 0.03056817688047886, + "skip_count": 1.0, + "step": 1750, + "text_loss": 0.3078109920024872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009629840053239116, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2823469.0, + "repeat_count": 0.0, + "routers_loss": 0.0019477814203128219, + "skip_count": 0.0, + "step": 1752, + "text_loss": 0.45501336455345154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.000962867042578253, + "loss": 0.0173, + "macro_f1": 0.3333333432674408, + "num_tokens": 2826716.0, + "repeat_count": 0.0, + "routers_loss": 0.0032963966950774193, + "skip_count": 0.0, + "step": 1754, + "text_loss": 0.49234694242477417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.244203111241562, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009627499024610707, + "loss": 0.0239, + "macro_f1": 0.3272727429866791, + "num_tokens": 2829733.0, + "repeat_count": 0.0, + "routers_loss": 0.010289114899933338, + "skip_count": 1.0, + "step": 1756, + "text_loss": 0.22335539758205414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.253595538597006, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009626325850172527, + "loss": 0.0174, + "macro_f1": 0.3272727429866791, + "num_tokens": 2833350.0, + "repeat_count": 0.0, + "routers_loss": 0.03249066323041916, + "skip_count": 1.0, + "step": 1758, + "text_loss": 0.6581931114196777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009625150902917555, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 2836558.0, + "repeat_count": 0.0, + "routers_loss": 0.00870000571012497, + "skip_count": 0.0, + "step": 1760, + "text_loss": 0.22938725352287292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009623974183296031, + "loss": 0.0192, + "macro_f1": 0.3333333432674408, + "num_tokens": 2840560.0, + "repeat_count": 0.0, + "routers_loss": 0.007767196744680405, + "skip_count": 0.0, + "step": 1762, + "text_loss": 0.24473799765110016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009622795691758876, + "loss": 0.0244, + "macro_f1": 0.3333333432674408, + "num_tokens": 2843548.0, + "repeat_count": 0.0, + "routers_loss": 0.0021693643648177385, + "skip_count": 0.0, + "step": 1764, + "text_loss": 0.3084608018398285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009621615428757693, + "loss": 0.0149, + "macro_f1": 0.3333333432674408, + "num_tokens": 2847076.0, + "repeat_count": 0.0, + "routers_loss": 0.0024727333802729845, + "skip_count": 0.0, + "step": 1766, + "text_loss": 0.5251734852790833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.000962043339474476, + "loss": 0.0194, + "macro_f1": 0.3333333432674408, + "num_tokens": 2849751.0, + "repeat_count": 0.0, + "routers_loss": 0.005174890160560608, + "skip_count": 0.0, + "step": 1768, + "text_loss": 0.4410129189491272 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0009619249590173032, + "loss": 0.016, + "macro_f1": 0.6666666865348816, + "num_tokens": 2853916.0, + "repeat_count": 0.0, + "routers_loss": 0.006785830482840538, + "skip_count": 2.0, + "step": 1770, + "text_loss": 0.550076425075531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.31934253008512, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06591796875, + "learning_rate": 0.0009618064015496149, + "loss": 0.0192, + "macro_f1": 0.5934640765190125, + "num_tokens": 2857372.0, + "repeat_count": 0.0, + "routers_loss": 0.021370256319642067, + "skip_count": 3.0, + "step": 1772, + "text_loss": 0.1988629847764969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0009616876671168423, + "loss": 0.0162, + "macro_f1": 0.6666666865348816, + "num_tokens": 2861028.0, + "repeat_count": 0.0, + "routers_loss": 0.004313841462135315, + "skip_count": 1.0, + "step": 1774, + "text_loss": 0.42581331729888916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009615687557644847, + "loss": 0.0268, + "macro_f1": 0.3333333432674408, + "num_tokens": 2864847.0, + "repeat_count": 0.0, + "routers_loss": 0.0025742491707205772, + "skip_count": 0.0, + "step": 1776, + "text_loss": 0.46510905027389526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009614496675381093, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 2867392.0, + "repeat_count": 0.0, + "routers_loss": 0.0016813480760902166, + "skip_count": 0.0, + "step": 1778, + "text_loss": 0.5922174453735352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009613304024833507, + "loss": 0.0166, + "macro_f1": 0.3333333432674408, + "num_tokens": 2871273.0, + "repeat_count": 0.0, + "routers_loss": 0.004948933608829975, + "skip_count": 0.0, + "step": 1780, + "text_loss": 0.6776977777481079 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009612109606459117, + "loss": 0.0186, + "macro_f1": 1.0, + "num_tokens": 2874172.0, + "repeat_count": 1.0, + "routers_loss": 0.016950147226452827, + "skip_count": 2.0, + "step": 1782, + "text_loss": 0.48758944869041443 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.375697094217786, + "f1_execute": 0.9599999785423279, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08251953125, + "learning_rate": 0.0009610913420715623, + "loss": 0.0237, + "macro_f1": 0.7644444704055786, + "num_tokens": 2877528.0, + "repeat_count": 2.0, + "routers_loss": 0.04880943149328232, + "skip_count": 1.0, + "step": 1784, + "text_loss": 0.4404778480529785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0009609715468061411, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2880627.0, + "repeat_count": 0.0, + "routers_loss": 0.004678630735725164, + "skip_count": 0.0, + "step": 1786, + "text_loss": 0.7295402884483337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009608515748955535, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2883333.0, + "repeat_count": 0.0, + "routers_loss": 0.0026695074047893286, + "skip_count": 0.0, + "step": 1788, + "text_loss": 0.9697831273078918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 8.40387437628412, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.107421875, + "learning_rate": 0.000960731426385773, + "loss": 0.0157, + "macro_f1": 0.4871794879436493, + "num_tokens": 2887444.0, + "repeat_count": 0.0, + "routers_loss": 0.029743613675236702, + "skip_count": 2.0, + "step": 1790, + "text_loss": 0.4737568199634552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.0009606111013228407, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 2890221.0, + "repeat_count": 0.0, + "routers_loss": 0.0016153788892552257, + "skip_count": 0.0, + "step": 1792, + "text_loss": 0.6693558096885681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.422659230995011, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009604905997528655, + "loss": 0.02, + "macro_f1": 0.3272727429866791, + "num_tokens": 2893262.0, + "repeat_count": 0.0, + "routers_loss": 0.01965433731675148, + "skip_count": 1.0, + "step": 1794, + "text_loss": 0.45227760076522827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.432051658350455, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009603699217220239, + "loss": 0.0117, + "macro_f1": 0.6601307392120361, + "num_tokens": 2896823.0, + "repeat_count": 1.0, + "routers_loss": 0.024017298594117165, + "skip_count": 2.0, + "step": 1796, + "text_loss": 0.48865509033203125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009602490672765597, + "loss": 0.0182, + "macro_f1": 0.3333333432674408, + "num_tokens": 2899707.0, + "repeat_count": 0.0, + "routers_loss": 0.0012420224957168102, + "skip_count": 0.0, + "step": 1798, + "text_loss": 0.43292415142059326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07861328125, + "learning_rate": 0.0009601280364627848, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 2902795.0, + "repeat_count": 0.0, + "routers_loss": 0.0020389219280332327, + "skip_count": 0.0, + "step": 1800, + "text_loss": 0.41021591424942017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009600068293270783, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 2905769.0, + "repeat_count": 0.0, + "routers_loss": 0.002006303984671831, + "skip_count": 0.0, + "step": 1802, + "text_loss": 0.46892106533050537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08740234375, + "learning_rate": 0.000959885445915887, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 2909475.0, + "repeat_count": 0.0, + "routers_loss": 0.003734810510650277, + "skip_count": 0.0, + "step": 1804, + "text_loss": 0.45364710688591003 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 8.479013795127678, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009597638862757254, + "loss": 0.0182, + "macro_f1": 0.8823530077934265, + "num_tokens": 2914348.0, + "repeat_count": 1.0, + "routers_loss": 0.038971323519945145, + "skip_count": 2.0, + "step": 1806, + "text_loss": 0.42913779616355896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.488406222483123, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009596421504531751, + "loss": 0.0249, + "macro_f1": 0.3272727429866791, + "num_tokens": 2917467.0, + "repeat_count": 1.0, + "routers_loss": 0.04800829663872719, + "skip_count": 0.0, + "step": 1808, + "text_loss": 0.17332297563552856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009595202384948858, + "loss": 0.0227, + "macro_f1": 0.6666666865348816, + "num_tokens": 2920223.0, + "repeat_count": 1.0, + "routers_loss": 0.009164143353700638, + "skip_count": 0.0, + "step": 1810, + "text_loss": 0.33740702271461487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009593981504475742, + "loss": 0.0275, + "macro_f1": 0.6666666865348816, + "num_tokens": 2923780.0, + "repeat_count": 0.0, + "routers_loss": 0.011236993595957756, + "skip_count": 2.0, + "step": 1812, + "text_loss": 0.1609916388988495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009592758863580248, + "loss": 0.0259, + "macro_f1": 0.5492662787437439, + "num_tokens": 2926259.0, + "repeat_count": 0.0, + "routers_loss": 0.019026532769203186, + "skip_count": 2.0, + "step": 1814, + "text_loss": 0.6460903882980347 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.525975931904902, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009591534462730894, + "loss": 0.0206, + "macro_f1": 0.5492662787437439, + "num_tokens": 2929173.0, + "repeat_count": 2.0, + "routers_loss": 0.0608333982527256, + "skip_count": 0.0, + "step": 1816, + "text_loss": 0.476126492023468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.000959030830239687, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 2932703.0, + "repeat_count": 0.0, + "routers_loss": 0.0093300249427557, + "skip_count": 0.0, + "step": 1818, + "text_loss": 0.5471875667572021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2001953125, + "learning_rate": 0.0009589080383048048, + "loss": 0.0235, + "macro_f1": 0.3333333432674408, + "num_tokens": 2936195.0, + "repeat_count": 0.0, + "routers_loss": 0.010434109717607498, + "skip_count": 0.0, + "step": 1820, + "text_loss": 0.5068115592002869 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009587850705154964, + "loss": 0.0291, + "macro_f1": 0.3333333432674408, + "num_tokens": 2939412.0, + "repeat_count": 0.0, + "routers_loss": 0.004347751382738352, + "skip_count": 0.0, + "step": 1822, + "text_loss": 0.4241984784603119 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.56354564132668, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0859375, + "learning_rate": 0.0009586619269188836, + "loss": 0.0224, + "macro_f1": 0.32098767161369324, + "num_tokens": 2942318.0, + "repeat_count": 0.0, + "routers_loss": 0.034238871186971664, + "skip_count": 1.0, + "step": 1824, + "text_loss": 0.2328975349664688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009585386075621553, + "loss": 0.027, + "macro_f1": 0.3333333432674408, + "num_tokens": 2945731.0, + "repeat_count": 0.0, + "routers_loss": 0.006097695790231228, + "skip_count": 0.0, + "step": 1826, + "text_loss": 0.22816994786262512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.582330496037569, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009584151124925676, + "loss": 0.0208, + "macro_f1": 0.3272727429866791, + "num_tokens": 2948944.0, + "repeat_count": 0.0, + "routers_loss": 0.007790776435285807, + "skip_count": 1.0, + "step": 1828, + "text_loss": 0.5009413361549377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009582914417574438, + "loss": 0.0145, + "macro_f1": 0.6666666865348816, + "num_tokens": 2951723.0, + "repeat_count": 0.0, + "routers_loss": 0.009144559502601624, + "skip_count": 2.0, + "step": 1830, + "text_loss": 0.1402502954006195 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0009581675954041751, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 2954726.0, + "repeat_count": 1.0, + "routers_loss": 0.006593191530555487, + "skip_count": 0.0, + "step": 1832, + "text_loss": 0.4871736466884613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009580435734802196, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 2957853.0, + "repeat_count": 0.0, + "routers_loss": 0.01241068821400404, + "skip_count": 0.0, + "step": 1834, + "text_loss": 0.30100154876708984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009579193760331027, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 2960783.0, + "repeat_count": 0.0, + "routers_loss": 0.002219218760728836, + "skip_count": 0.0, + "step": 1836, + "text_loss": 0.4961516559123993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.629292632814794, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009577950031104169, + "loss": 0.0166, + "macro_f1": 0.6601307392120361, + "num_tokens": 2963328.0, + "repeat_count": 1.0, + "routers_loss": 0.029363535344600677, + "skip_count": 2.0, + "step": 1838, + "text_loss": 0.42814353108406067 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 8.638685060170237, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009576704547598226, + "loss": 0.0257, + "macro_f1": 0.7795917987823486, + "num_tokens": 2966108.0, + "repeat_count": 1.0, + "routers_loss": 0.0579402856528759, + "skip_count": 4.0, + "step": 1840, + "text_loss": 0.20523512363433838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009575457310290463, + "loss": 0.0121, + "macro_f1": 0.3272727429866791, + "num_tokens": 2969137.0, + "repeat_count": 0.0, + "routers_loss": 0.008810589089989662, + "skip_count": 0.0, + "step": 1842, + "text_loss": 0.6199528574943542 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009574208319658831, + "loss": 0.0208, + "macro_f1": 0.6666666865348816, + "num_tokens": 2972407.0, + "repeat_count": 0.0, + "routers_loss": 0.0012295129708945751, + "skip_count": 1.0, + "step": 1844, + "text_loss": 0.66938316822052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 8.666862342236572, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1474609375, + "learning_rate": 0.000957295757618194, + "loss": 0.0152, + "macro_f1": 0.4871794879436493, + "num_tokens": 2976045.0, + "repeat_count": 0.0, + "routers_loss": 0.06162935495376587, + "skip_count": 2.0, + "step": 1846, + "text_loss": 0.5381782650947571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009571705080339079, + "loss": 0.0144, + "macro_f1": 0.3333333432674408, + "num_tokens": 2979025.0, + "repeat_count": 0.0, + "routers_loss": 0.003950524143874645, + "skip_count": 0.0, + "step": 1848, + "text_loss": 0.5831671357154846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.0009570450832610208, + "loss": 0.0209, + "macro_f1": 0.3333333432674408, + "num_tokens": 2982276.0, + "repeat_count": 0.0, + "routers_loss": 0.010354886762797832, + "skip_count": 0.0, + "step": 1850, + "text_loss": 0.27448201179504395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009569194833475956, + "loss": 0.0199, + "macro_f1": 0.3272727429866791, + "num_tokens": 2985691.0, + "repeat_count": 0.0, + "routers_loss": 0.010167439468204975, + "skip_count": 0.0, + "step": 1852, + "text_loss": 0.5264663696289062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.704432051658351, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1328125, + "learning_rate": 0.0009567937083417624, + "loss": 0.0194, + "macro_f1": 0.3272727429866791, + "num_tokens": 2989126.0, + "repeat_count": 0.0, + "routers_loss": 0.0371871180832386, + "skip_count": 1.0, + "step": 1854, + "text_loss": 0.2008018046617508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009566677582917185, + "loss": 0.0184, + "macro_f1": 0.3333333432674408, + "num_tokens": 2992814.0, + "repeat_count": 0.0, + "routers_loss": 0.010190588422119617, + "skip_count": 0.0, + "step": 1856, + "text_loss": 0.749717116355896 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.72321690636924, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009565416332457282, + "loss": 0.0132, + "macro_f1": 0.6538461446762085, + "num_tokens": 2995729.0, + "repeat_count": 1.0, + "routers_loss": 0.022285036742687225, + "skip_count": 1.0, + "step": 1858, + "text_loss": 0.5870219469070435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.732609333724685, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009564153332521228, + "loss": 0.0224, + "macro_f1": 0.3272727429866791, + "num_tokens": 2998812.0, + "repeat_count": 0.0, + "routers_loss": 0.011050296947360039, + "skip_count": 1.0, + "step": 1860, + "text_loss": 0.8444408774375916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0009562888583593005, + "loss": 0.0163, + "macro_f1": 0.3333333432674408, + "num_tokens": 3001799.0, + "repeat_count": 0.0, + "routers_loss": 0.007125461008399725, + "skip_count": 0.0, + "step": 1862, + "text_loss": 0.41510361433029175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009561622086157272, + "loss": 0.0236, + "macro_f1": 0.3333333432674408, + "num_tokens": 3005088.0, + "repeat_count": 0.0, + "routers_loss": 0.0049054501578211784, + "skip_count": 0.0, + "step": 1864, + "text_loss": 0.3801248073577881 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.000956035384069935, + "loss": 0.0238, + "macro_f1": 1.0, + "num_tokens": 3008178.0, + "repeat_count": 1.0, + "routers_loss": 0.005162427201867104, + "skip_count": 1.0, + "step": 1866, + "text_loss": 0.2687684893608093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0009559083847705233, + "loss": 0.0214, + "macro_f1": 0.3272727429866791, + "num_tokens": 3010923.0, + "repeat_count": 0.0, + "routers_loss": 0.028984658420085907, + "skip_count": 1.0, + "step": 1868, + "text_loss": 0.6277349591255188 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.779571470501908, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009557812107661584, + "loss": 0.0208, + "macro_f1": 1.0, + "num_tokens": 3015030.0, + "repeat_count": 1.0, + "routers_loss": 0.012200530618429184, + "skip_count": 1.0, + "step": 1870, + "text_loss": 0.6293368339538574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.788963897857352, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009556538621055739, + "loss": 0.0268, + "macro_f1": 0.3272727429866791, + "num_tokens": 3019067.0, + "repeat_count": 0.0, + "routers_loss": 0.06365182995796204, + "skip_count": 1.0, + "step": 1872, + "text_loss": 0.39046618342399597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009555263388375699, + "loss": 0.014, + "macro_f1": 0.6666666865348816, + "num_tokens": 3022166.0, + "repeat_count": 0.0, + "routers_loss": 0.0041703456081449986, + "skip_count": 1.0, + "step": 1874, + "text_loss": 0.42232340574264526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11572265625, + "learning_rate": 0.0009553986410110134, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3025865.0, + "repeat_count": 0.0, + "routers_loss": 0.005841755773872137, + "skip_count": 0.0, + "step": 1876, + "text_loss": 0.37600573897361755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.817141179923686, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009552707686748388, + "loss": 0.0219, + "macro_f1": 0.3272727429866791, + "num_tokens": 3029950.0, + "repeat_count": 0.0, + "routers_loss": 0.05165952071547508, + "skip_count": 1.0, + "step": 1878, + "text_loss": 0.33717799186706543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009551427218780467, + "loss": 0.0219, + "macro_f1": 0.6666666865348816, + "num_tokens": 3033649.0, + "repeat_count": 0.0, + "routers_loss": 0.020680008456110954, + "skip_count": 2.0, + "step": 1880, + "text_loss": 0.5011783838272095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.835926034634575, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009550145006697048, + "loss": 0.0217, + "macro_f1": 0.32098764181137085, + "num_tokens": 3036847.0, + "repeat_count": 0.0, + "routers_loss": 0.07626450061798096, + "skip_count": 2.0, + "step": 1882, + "text_loss": 0.3066408336162567 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009548861050989482, + "loss": 0.0136, + "macro_f1": 1.0, + "num_tokens": 3040353.0, + "repeat_count": 1.0, + "routers_loss": 0.010884666815400124, + "skip_count": 1.0, + "step": 1884, + "text_loss": 0.49779415130615234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009547575352149778, + "loss": 0.0213, + "macro_f1": 0.6666666865348816, + "num_tokens": 3043504.0, + "repeat_count": 0.0, + "routers_loss": 0.006704333238303661, + "skip_count": 2.0, + "step": 1886, + "text_loss": 0.12284614145755768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.86410331670091, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009546287910670621, + "loss": 0.0211, + "macro_f1": 0.5427350401878357, + "num_tokens": 3046422.0, + "repeat_count": 1.0, + "routers_loss": 0.04799000173807144, + "skip_count": 2.0, + "step": 1888, + "text_loss": 0.1824081838130951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009544998727045361, + "loss": 0.0306, + "macro_f1": 0.3333333432674408, + "num_tokens": 3049819.0, + "repeat_count": 0.0, + "routers_loss": 0.008139612153172493, + "skip_count": 0.0, + "step": 1890, + "text_loss": 0.18929053843021393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.8828881714118, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.09375, + "learning_rate": 0.0009543707801768015, + "loss": 0.0175, + "macro_f1": 0.5934640765190125, + "num_tokens": 3052766.0, + "repeat_count": 0.0, + "routers_loss": 0.02966771461069584, + "skip_count": 3.0, + "step": 1892, + "text_loss": 0.247748002409935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 25.0, + "epoch": 8.892280598767243, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009542415135333267, + "loss": 0.0193, + "macro_f1": 0.44705885648727417, + "num_tokens": 3056427.0, + "repeat_count": 0.0, + "routers_loss": 0.03637036308646202, + "skip_count": 2.0, + "step": 1894, + "text_loss": 0.2583999037742615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009541120728236472, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3059497.0, + "repeat_count": 0.0, + "routers_loss": 0.007026574574410915, + "skip_count": 0.0, + "step": 1896, + "text_loss": 0.5222375988960266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.076171875, + "learning_rate": 0.0009539824580973646, + "loss": 0.0219, + "macro_f1": 0.3333333432674408, + "num_tokens": 3062187.0, + "repeat_count": 0.0, + "routers_loss": 0.003449335927143693, + "skip_count": 0.0, + "step": 1898, + "text_loss": 0.5736427307128906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009538526694041477, + "loss": 0.0163, + "macro_f1": 0.3333333432674408, + "num_tokens": 3066100.0, + "repeat_count": 0.0, + "routers_loss": 0.0035463871899992228, + "skip_count": 0.0, + "step": 1900, + "text_loss": 0.5471583604812622 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009537227067937318, + "loss": 0.0233, + "macro_f1": 1.0, + "num_tokens": 3068737.0, + "repeat_count": 3.0, + "routers_loss": 0.00597514258697629, + "skip_count": 3.0, + "step": 1902, + "text_loss": 0.36644190549850464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.939242735544468, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.166015625, + "learning_rate": 0.0009535925703159186, + "loss": 0.0301, + "macro_f1": 0.32098764181137085, + "num_tokens": 3071686.0, + "repeat_count": 0.0, + "routers_loss": 0.025420479476451874, + "skip_count": 2.0, + "step": 1904, + "text_loss": 0.535789966583252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009534622600205769, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 3074954.0, + "repeat_count": 0.0, + "routers_loss": 0.014377486892044544, + "skip_count": 0.0, + "step": 1906, + "text_loss": 0.19009549915790558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009533317759576416, + "loss": 0.0197, + "macro_f1": 0.3333333432674408, + "num_tokens": 3077540.0, + "repeat_count": 0.0, + "routers_loss": 0.004848944488912821, + "skip_count": 0.0, + "step": 1908, + "text_loss": 0.5022001266479492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009532011181771148, + "loss": 0.0217, + "macro_f1": 0.6666666865348816, + "num_tokens": 3080445.0, + "repeat_count": 0.0, + "routers_loss": 0.009480170905590057, + "skip_count": 2.0, + "step": 1910, + "text_loss": 0.35135936737060547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0009530702867290644, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 3083657.0, + "repeat_count": 0.0, + "routers_loss": 0.0019353039097040892, + "skip_count": 0.0, + "step": 1912, + "text_loss": 0.5123994946479797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.986204872321691, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009529392816636256, + "loss": 0.0249, + "macro_f1": 0.3333333432674408, + "num_tokens": 3086837.0, + "repeat_count": 0.0, + "routers_loss": 0.0010921972570940852, + "skip_count": 0.0, + "step": 1914, + "text_loss": 0.44477662444114685 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19140625, + "learning_rate": 0.0009528081030309995, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 3089892.0, + "repeat_count": 0.0, + "routers_loss": 0.0018027103506028652, + "skip_count": 0.0, + "step": 1916, + "text_loss": 0.7356183528900146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009526767508814542, + "loss": 0.0236, + "macro_f1": 0.3333333432674408, + "num_tokens": 3093058.0, + "repeat_count": 0.0, + "routers_loss": 0.003243023296818137, + "skip_count": 0.0, + "step": 1918, + "text_loss": 0.48823556303977966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009525452252653239, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 3096404.0, + "repeat_count": 0.0, + "routers_loss": 0.009360014460980892, + "skip_count": 0.0, + "step": 1920, + "text_loss": 0.21498437225818634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.023481068388612, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.140625, + "learning_rate": 0.0009524135262330098, + "loss": 0.0224, + "macro_f1": 0.9265305995941162, + "num_tokens": 3099520.0, + "repeat_count": 1.0, + "routers_loss": 0.017444295808672905, + "skip_count": 3.0, + "step": 1922, + "text_loss": 0.27608850598335266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.032873495744056, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009522816538349789, + "loss": 0.0162, + "macro_f1": 0.5492662787437439, + "num_tokens": 3102956.0, + "repeat_count": 0.0, + "routers_loss": 0.06424452364444733, + "skip_count": 2.0, + "step": 1924, + "text_loss": 0.21558666229248047 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009521496081217651, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 3106565.0, + "repeat_count": 1.0, + "routers_loss": 0.002270506462082267, + "skip_count": 0.0, + "step": 1926, + "text_loss": 0.5641813278198242 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009520173891439684, + "loss": 0.0216, + "macro_f1": 0.6666666865348816, + "num_tokens": 3109314.0, + "repeat_count": 0.0, + "routers_loss": 0.011512448079884052, + "skip_count": 1.0, + "step": 1928, + "text_loss": 0.6351624727249146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009518849969522556, + "loss": 0.0198, + "macro_f1": 0.3333333432674408, + "num_tokens": 3112956.0, + "repeat_count": 0.0, + "routers_loss": 0.003883908037096262, + "skip_count": 0.0, + "step": 1930, + "text_loss": 0.35160085558891296 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009517524315973595, + "loss": 0.019, + "macro_f1": 1.0, + "num_tokens": 3115593.0, + "repeat_count": 1.0, + "routers_loss": 0.009479222819209099, + "skip_count": 3.0, + "step": 1932, + "text_loss": 0.2900560200214386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0009516196931300794, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3118516.0, + "repeat_count": 0.0, + "routers_loss": 0.017834696918725967, + "skip_count": 2.0, + "step": 1934, + "text_loss": 0.20094378292560577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009514867816012809, + "loss": 0.02, + "macro_f1": 0.3333333432674408, + "num_tokens": 3122242.0, + "repeat_count": 0.0, + "routers_loss": 0.0017964740982279181, + "skip_count": 0.0, + "step": 1936, + "text_loss": 0.6498590707778931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0009513536970618961, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 3125645.0, + "repeat_count": 0.0, + "routers_loss": 0.007437168620526791, + "skip_count": 2.0, + "step": 1938, + "text_loss": 0.25863033533096313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009512204395629232, + "loss": 0.0184, + "macro_f1": 0.6666666865348816, + "num_tokens": 3128740.0, + "repeat_count": 0.0, + "routers_loss": 0.0008759932243265212, + "skip_count": 1.0, + "step": 1940, + "text_loss": 0.5638351440429688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.117405341943059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009510870091554264, + "loss": 0.0153, + "macro_f1": 0.3272727429866791, + "num_tokens": 3131742.0, + "repeat_count": 1.0, + "routers_loss": 0.019906625151634216, + "skip_count": 0.0, + "step": 1942, + "text_loss": 0.8410717844963074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009509534058905369, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3134407.0, + "repeat_count": 0.0, + "routers_loss": 0.0009229081333614886, + "skip_count": 0.0, + "step": 1944, + "text_loss": 0.47506049275398254 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009508196298194517, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3137053.0, + "repeat_count": 0.0, + "routers_loss": 0.003630586201325059, + "skip_count": 0.0, + "step": 1946, + "text_loss": 0.32225799560546875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009506856809934338, + "loss": 0.0119, + "macro_f1": 0.3333333432674408, + "num_tokens": 3140943.0, + "repeat_count": 0.0, + "routers_loss": 0.007580445148050785, + "skip_count": 0.0, + "step": 1948, + "text_loss": 0.3120577931404114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009505515594638127, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3144298.0, + "repeat_count": 0.0, + "routers_loss": 0.004471861757338047, + "skip_count": 0.0, + "step": 1950, + "text_loss": 0.22052447497844696 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 9.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.09130859375, + "learning_rate": 0.0009504172652819843, + "loss": 0.023, + "macro_f1": 1.0, + "num_tokens": 3147069.0, + "repeat_count": 1.0, + "routers_loss": 0.009606664068996906, + "skip_count": 1.0, + "step": 1952, + "text_loss": 0.34773921966552734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009502827984994099, + "loss": 0.0148, + "macro_f1": 0.6666666865348816, + "num_tokens": 3149992.0, + "repeat_count": 0.0, + "routers_loss": 0.006443799939006567, + "skip_count": 1.0, + "step": 1954, + "text_loss": 0.6442171335220337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009501481591676177, + "loss": 0.0188, + "macro_f1": 0.3333333432674408, + "num_tokens": 3153167.0, + "repeat_count": 0.0, + "routers_loss": 0.003219039412215352, + "skip_count": 0.0, + "step": 1956, + "text_loss": 0.43369221687316895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.192544760786616, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.000950013347338202, + "loss": 0.0152, + "macro_f1": 0.3272727429866791, + "num_tokens": 3156590.0, + "repeat_count": 0.0, + "routers_loss": 0.025551019236445427, + "skip_count": 1.0, + "step": 1958, + "text_loss": 0.294479101896286 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009498783630628225, + "loss": 0.0158, + "macro_f1": 1.0, + "num_tokens": 3159451.0, + "repeat_count": 1.0, + "routers_loss": 0.013802438974380493, + "skip_count": 2.0, + "step": 1960, + "text_loss": 0.20888492465019226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.211329615497505, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009497432063932057, + "loss": 0.0137, + "macro_f1": 0.6601307392120361, + "num_tokens": 3162889.0, + "repeat_count": 1.0, + "routers_loss": 0.02852988988161087, + "skip_count": 2.0, + "step": 1962, + "text_loss": 0.5027125477790833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009496078773811437, + "loss": 0.0136, + "macro_f1": 0.6666666865348816, + "num_tokens": 3165979.0, + "repeat_count": 0.0, + "routers_loss": 0.01784522272646427, + "skip_count": 2.0, + "step": 1964, + "text_loss": 0.1696339100599289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.000949472376078495, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3168683.0, + "repeat_count": 0.0, + "routers_loss": 0.0017019887454807758, + "skip_count": 0.0, + "step": 1966, + "text_loss": 0.48905447125434875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.000949336702537184, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 3171968.0, + "repeat_count": 0.0, + "routers_loss": 0.004817947279661894, + "skip_count": 2.0, + "step": 1968, + "text_loss": 0.20984773337841034 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009492008568092007, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 3175947.0, + "repeat_count": 0.0, + "routers_loss": 0.0012963006738573313, + "skip_count": 0.0, + "step": 1970, + "text_loss": 0.5215106010437012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 9.258291752274728, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.044921875, + "learning_rate": 0.0009490648389466019, + "loss": 0.0135, + "macro_f1": 0.4871794879436493, + "num_tokens": 3179348.0, + "repeat_count": 0.0, + "routers_loss": 0.03950481489300728, + "skip_count": 2.0, + "step": 1972, + "text_loss": 0.24640929698944092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09326171875, + "learning_rate": 0.0009489286490015097, + "loss": 0.0183, + "macro_f1": 0.6666666865348816, + "num_tokens": 3182640.0, + "repeat_count": 0.0, + "routers_loss": 0.0043345349840819836, + "skip_count": 2.0, + "step": 1974, + "text_loss": 0.6362852454185486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009487922870261122, + "loss": 0.0155, + "macro_f1": 0.3333333432674408, + "num_tokens": 3185657.0, + "repeat_count": 0.0, + "routers_loss": 0.0015687479171901941, + "skip_count": 0.0, + "step": 1976, + "text_loss": 0.8977144360542297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009486557530726638, + "loss": 0.0139, + "macro_f1": 0.3333333432674408, + "num_tokens": 3188772.0, + "repeat_count": 0.0, + "routers_loss": 0.0010977238416671753, + "skip_count": 0.0, + "step": 1978, + "text_loss": 0.38512736558914185 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0009485190471934844, + "loss": 0.0196, + "macro_f1": 0.6666666865348816, + "num_tokens": 3193131.0, + "repeat_count": 2.0, + "routers_loss": 0.002264744369313121, + "skip_count": 0.0, + "step": 1980, + "text_loss": 0.4171289801597595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.305253889051952, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.00094838216944096, + "loss": 0.0219, + "macro_f1": 0.3272727429866791, + "num_tokens": 3196668.0, + "repeat_count": 0.0, + "routers_loss": 0.042320676147937775, + "skip_count": 1.0, + "step": 1982, + "text_loss": 0.19008000195026398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 9.314646316407396, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009482451198675424, + "loss": 0.0151, + "macro_f1": 0.32098767161369324, + "num_tokens": 3200282.0, + "repeat_count": 0.0, + "routers_loss": 0.01796630397439003, + "skip_count": 1.0, + "step": 1984, + "text_loss": 0.5009249448776245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0009481078985257494, + "loss": 0.0147, + "macro_f1": 0.6666666865348816, + "num_tokens": 3204439.0, + "repeat_count": 0.0, + "routers_loss": 0.01052347756922245, + "skip_count": 1.0, + "step": 1986, + "text_loss": 0.15319275856018066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.333431171118287, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009479705054681644, + "loss": 0.015, + "macro_f1": 0.3076923191547394, + "num_tokens": 3207590.0, + "repeat_count": 1.0, + "routers_loss": 0.09640293568372726, + "skip_count": 3.0, + "step": 1988, + "text_loss": 0.3654652535915375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.34282359847373, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009478329407474366, + "loss": 0.0183, + "macro_f1": 0.5492662787437439, + "num_tokens": 3211172.0, + "repeat_count": 0.0, + "routers_loss": 0.012670112773776054, + "skip_count": 1.0, + "step": 1990, + "text_loss": 0.5817596316337585 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.000947695204416281, + "loss": 0.0121, + "macro_f1": 0.6666666865348816, + "num_tokens": 3214050.0, + "repeat_count": 1.0, + "routers_loss": 0.005263707600533962, + "skip_count": 0.0, + "step": 1992, + "text_loss": 0.5985888242721558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.361608453184619, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009475572965274787, + "loss": 0.0144, + "macro_f1": 0.3272727429866791, + "num_tokens": 3217318.0, + "repeat_count": 1.0, + "routers_loss": 0.0682850033044815, + "skip_count": 0.0, + "step": 1994, + "text_loss": 0.316506564617157 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.000947419217133876, + "loss": 0.019, + "macro_f1": 0.6666666865348816, + "num_tokens": 3220012.0, + "repeat_count": 0.0, + "routers_loss": 0.008508823812007904, + "skip_count": 2.0, + "step": 1996, + "text_loss": 0.09665893763303757 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0009472809662883852, + "loss": 0.0155, + "macro_f1": 1.0, + "num_tokens": 3223019.0, + "repeat_count": 1.0, + "routers_loss": 0.01100847590714693, + "skip_count": 2.0, + "step": 1998, + "text_loss": 0.4938808083534241 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.389785735250953, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009471425440439844, + "loss": 0.0135, + "macro_f1": 0.8817967176437378, + "num_tokens": 3226013.0, + "repeat_count": 2.0, + "routers_loss": 0.04953207075595856, + "skip_count": 3.0, + "step": 2000, + "text_loss": 0.22258254885673523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 9.399178162606399, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009470039504537173, + "loss": 0.0186, + "macro_f1": 0.31446540355682373, + "num_tokens": 3230031.0, + "repeat_count": 0.0, + "routers_loss": 0.052884332835674286, + "skip_count": 2.0, + "step": 2002, + "text_loss": 0.1741616576910019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009468651855706931, + "loss": 0.0204, + "macro_f1": 0.6666666865348816, + "num_tokens": 3232991.0, + "repeat_count": 1.0, + "routers_loss": 0.008056716993451118, + "skip_count": 0.0, + "step": 2004, + "text_loss": 0.3173636198043823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009467262494480868, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3236390.0, + "repeat_count": 0.0, + "routers_loss": 0.0053409393876791, + "skip_count": 0.0, + "step": 2006, + "text_loss": 0.5806330442428589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.000946587142139139, + "loss": 0.0147, + "macro_f1": 0.3333333432674408, + "num_tokens": 3239267.0, + "repeat_count": 0.0, + "routers_loss": 0.0015652200672775507, + "skip_count": 0.0, + "step": 2008, + "text_loss": 0.6214317679405212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.436747872028178, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.000946447863697156, + "loss": 0.0151, + "macro_f1": 0.6601307392120361, + "num_tokens": 3242569.0, + "repeat_count": 1.0, + "routers_loss": 0.011673987843096256, + "skip_count": 2.0, + "step": 2010, + "text_loss": 0.532565712928772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0009463084141755093, + "loss": 0.0159, + "macro_f1": 0.3272727429866791, + "num_tokens": 3245669.0, + "repeat_count": 0.0, + "routers_loss": 0.028480790555477142, + "skip_count": 1.0, + "step": 2012, + "text_loss": 0.25210800766944885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009461687936276364, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3248751.0, + "repeat_count": 0.0, + "routers_loss": 0.007234727032482624, + "skip_count": 0.0, + "step": 2014, + "text_loss": 0.35922971367836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009460290021070402, + "loss": 0.0195, + "macro_f1": 0.6666666865348816, + "num_tokens": 3252614.0, + "repeat_count": 1.0, + "routers_loss": 0.014691276475787163, + "skip_count": 0.0, + "step": 2016, + "text_loss": 0.2747853398323059 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009458890396672888, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 3256374.0, + "repeat_count": 0.0, + "routers_loss": 0.002385235857218504, + "skip_count": 0.0, + "step": 2018, + "text_loss": 0.5268719792366028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 9.483710008805401, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009457489063620164, + "loss": 0.0133, + "macro_f1": 0.8823530077934265, + "num_tokens": 3259792.0, + "repeat_count": 1.0, + "routers_loss": 0.047268565744161606, + "skip_count": 2.0, + "step": 2020, + "text_loss": 0.7785539627075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.493102436160845, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009456086022449221, + "loss": 0.0218, + "macro_f1": 0.3272727429866791, + "num_tokens": 3262833.0, + "repeat_count": 0.0, + "routers_loss": 0.015878718346357346, + "skip_count": 1.0, + "step": 2022, + "text_loss": 0.42270028591156006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009454681273697711, + "loss": 0.0117, + "macro_f1": 0.3272727429866791, + "num_tokens": 3265718.0, + "repeat_count": 1.0, + "routers_loss": 0.030749641358852386, + "skip_count": 0.0, + "step": 2024, + "text_loss": 0.18668225407600403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0009453274817903931, + "loss": 0.012, + "macro_f1": 0.6666666865348816, + "num_tokens": 3268158.0, + "repeat_count": 0.0, + "routers_loss": 0.011538166552782059, + "skip_count": 1.0, + "step": 2026, + "text_loss": 0.34090787172317505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.000945186665560684, + "loss": 0.0218, + "macro_f1": 0.3333333432674408, + "num_tokens": 3271082.0, + "repeat_count": 0.0, + "routers_loss": 0.009527760557830334, + "skip_count": 0.0, + "step": 2028, + "text_loss": 0.2110334187746048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.530672145582624, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.000945045678734605, + "loss": 0.0175, + "macro_f1": 0.3144654333591461, + "num_tokens": 3273488.0, + "repeat_count": 0.0, + "routers_loss": 0.03317151218652725, + "skip_count": 3.0, + "step": 2030, + "text_loss": 0.2233227640390396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.540064572938068, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009449045213661822, + "loss": 0.0201, + "macro_f1": 0.3272727429866791, + "num_tokens": 3276646.0, + "repeat_count": 0.0, + "routers_loss": 0.018510591238737106, + "skip_count": 1.0, + "step": 2032, + "text_loss": 0.16100332140922546 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 9.549457000293513, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.1318359375, + "learning_rate": 0.0009447631935095077, + "loss": 0.0185, + "macro_f1": 0.9452888369560242, + "num_tokens": 3279441.0, + "repeat_count": 1.0, + "routers_loss": 0.028113311156630516, + "skip_count": 4.0, + "step": 2034, + "text_loss": 0.29208317399024963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009446216952187384, + "loss": 0.0164, + "macro_f1": 0.3333333432674408, + "num_tokens": 3282697.0, + "repeat_count": 0.0, + "routers_loss": 0.008379172533750534, + "skip_count": 0.0, + "step": 2036, + "text_loss": 0.16026398539543152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009444800265480967, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 3285574.0, + "repeat_count": 0.0, + "routers_loss": 0.00941354501992464, + "skip_count": 0.0, + "step": 2038, + "text_loss": 0.29523080587387085 + }, + { + "acc_repeat": 0.75, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.577634282359847, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.8571428656578064, + "f1_skip": 0.800000011920929, + "grad_norm": 0.076171875, + "learning_rate": 0.0009443381875518703, + "loss": 0.0197, + "macro_f1": 0.8600732684135437, + "num_tokens": 3289159.0, + "repeat_count": 4.0, + "routers_loss": 0.04974055662751198, + "skip_count": 6.0, + "step": 2040, + "text_loss": 0.23033179342746735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.587026709715293, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0009441961782844123, + "loss": 0.0146, + "macro_f1": 0.3272727429866791, + "num_tokens": 3293598.0, + "repeat_count": 0.0, + "routers_loss": 0.022241825237870216, + "skip_count": 1.0, + "step": 2042, + "text_loss": 0.8299165368080139 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009440539988001408, + "loss": 0.0159, + "macro_f1": 0.3333333432674408, + "num_tokens": 3296648.0, + "repeat_count": 0.0, + "routers_loss": 0.011019332334399223, + "skip_count": 0.0, + "step": 2044, + "text_loss": 0.18207129836082458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0009439116491535394, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 3300058.0, + "repeat_count": 0.0, + "routers_loss": 0.002889640862122178, + "skip_count": 0.0, + "step": 2046, + "text_loss": 0.7051978707313538 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 9.615203991781627, + "f1_execute": 0.9333333373069763, + "f1_repeat": 0.5, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.078125, + "learning_rate": 0.0009437691293991563, + "loss": 0.0192, + "macro_f1": 0.7634921073913574, + "num_tokens": 3303296.0, + "repeat_count": 3.0, + "routers_loss": 0.07741832733154297, + "skip_count": 4.0, + "step": 2048, + "text_loss": 0.15563532710075378 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.0009436264395916061, + "loss": 0.0209, + "macro_f1": 0.6666666865348816, + "num_tokens": 3306204.0, + "repeat_count": 0.0, + "routers_loss": 0.014225383289158344, + "skip_count": 2.0, + "step": 2050, + "text_loss": 0.18117287755012512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009434835797855672, + "loss": 0.0165, + "macro_f1": 0.3333333432674408, + "num_tokens": 3309444.0, + "repeat_count": 0.0, + "routers_loss": 0.0023932650219649076, + "skip_count": 0.0, + "step": 2052, + "text_loss": 0.4645874798297882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.643381273847961, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009433405500357839, + "loss": 0.0153, + "macro_f1": 0.3272727429866791, + "num_tokens": 3312488.0, + "repeat_count": 0.0, + "routers_loss": 0.03193361684679985, + "skip_count": 1.0, + "step": 2054, + "text_loss": 0.5291082859039307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009431973503970655, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 3315765.0, + "repeat_count": 0.0, + "routers_loss": 0.0020529816392809153, + "skip_count": 0.0, + "step": 2056, + "text_loss": 0.5877931118011475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.66216612855885, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009430539809242864, + "loss": 0.0185, + "macro_f1": 0.32098764181137085, + "num_tokens": 3318877.0, + "repeat_count": 2.0, + "routers_loss": 0.07907948642969131, + "skip_count": 0.0, + "step": 2058, + "text_loss": 0.3836737871170044 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009429104416723862, + "loss": 0.0163, + "macro_f1": 0.6666666865348816, + "num_tokens": 3322576.0, + "repeat_count": 2.0, + "routers_loss": 0.003006070153787732, + "skip_count": 0.0, + "step": 2060, + "text_loss": 0.3480920195579529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009427667326963689, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 3325974.0, + "repeat_count": 0.0, + "routers_loss": 0.005013179033994675, + "skip_count": 0.0, + "step": 2062, + "text_loss": 0.931358814239502 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009426228540513047, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 3329398.0, + "repeat_count": 0.0, + "routers_loss": 0.0059848143719136715, + "skip_count": 0.0, + "step": 2064, + "text_loss": 0.47568953037261963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009424788057923277, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3332029.0, + "repeat_count": 0.0, + "routers_loss": 0.00783882662653923, + "skip_count": 0.0, + "step": 2066, + "text_loss": 0.22887596487998962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.709128265336073, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009423345879746376, + "loss": 0.0128, + "macro_f1": 0.5492662787437439, + "num_tokens": 3334858.0, + "repeat_count": 0.0, + "routers_loss": 0.01866884157061577, + "skip_count": 2.0, + "step": 2068, + "text_loss": 0.17724967002868652 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.718520692691518, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.000942190200653499, + "loss": 0.0162, + "macro_f1": 0.32098764181137085, + "num_tokens": 3338094.0, + "repeat_count": 0.0, + "routers_loss": 0.028636593371629715, + "skip_count": 2.0, + "step": 2070, + "text_loss": 0.34344956278800964 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.727913120046962, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009420456438842413, + "loss": 0.0165, + "macro_f1": 0.5492662787437439, + "num_tokens": 3340526.0, + "repeat_count": 0.0, + "routers_loss": 0.023245645686984062, + "skip_count": 2.0, + "step": 2072, + "text_loss": 0.7276164293289185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.737305547402407, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11328125, + "learning_rate": 0.000941900917722259, + "loss": 0.0143, + "macro_f1": 0.3272727429866791, + "num_tokens": 3343303.0, + "repeat_count": 1.0, + "routers_loss": 0.01565689593553543, + "skip_count": 0.0, + "step": 2074, + "text_loss": 0.5665070414543152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009417560222230115, + "loss": 0.0245, + "macro_f1": 0.3333333432674408, + "num_tokens": 3346409.0, + "repeat_count": 0.0, + "routers_loss": 0.0035056080669164658, + "skip_count": 0.0, + "step": 2076, + "text_loss": 0.5112795233726501 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009416109574420229, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3349220.0, + "repeat_count": 0.0, + "routers_loss": 0.0027565446216613054, + "skip_count": 0.0, + "step": 2078, + "text_loss": 0.5240910053253174 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 9.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009414657234348823, + "loss": 0.0186, + "macro_f1": 1.0, + "num_tokens": 3352627.0, + "repeat_count": 3.0, + "routers_loss": 0.01652451977133751, + "skip_count": 2.0, + "step": 2080, + "text_loss": 1.0217112302780151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.774875256824185, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009413203202572438, + "loss": 0.0179, + "macro_f1": 0.32098764181137085, + "num_tokens": 3355392.0, + "repeat_count": 0.0, + "routers_loss": 0.1012420505285263, + "skip_count": 2.0, + "step": 2082, + "text_loss": 0.4085482358932495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08251953125, + "learning_rate": 0.000941174747964826, + "loss": 0.0154, + "macro_f1": 0.3333333432674408, + "num_tokens": 3358425.0, + "repeat_count": 0.0, + "routers_loss": 0.004962718114256859, + "skip_count": 0.0, + "step": 2084, + "text_loss": 0.5833504796028137 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.793660111535075, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.800000011920929, + "grad_norm": 0.11376953125, + "learning_rate": 0.0009410290066134124, + "loss": 0.0211, + "macro_f1": 0.8083333373069763, + "num_tokens": 3361925.0, + "repeat_count": 2.0, + "routers_loss": 0.07889176905155182, + "skip_count": 3.0, + "step": 2086, + "text_loss": 0.38126569986343384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.803052538890519, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009408830962588517, + "loss": 0.0195, + "macro_f1": 0.6601307392120361, + "num_tokens": 3365963.0, + "repeat_count": 1.0, + "routers_loss": 0.033715736120939255, + "skip_count": 2.0, + "step": 2088, + "text_loss": 0.23213914036750793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009407370169570567, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3369422.0, + "repeat_count": 0.0, + "routers_loss": 0.0014188943896442652, + "skip_count": 0.0, + "step": 2090, + "text_loss": 0.4648318886756897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.82183739360141, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009405907687640054, + "loss": 0.013, + "macro_f1": 0.3272727429866791, + "num_tokens": 3372506.0, + "repeat_count": 0.0, + "routers_loss": 0.015339684672653675, + "skip_count": 1.0, + "step": 2092, + "text_loss": 0.2563800811767578 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.831229820956853, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.0009404443517357404, + "loss": 0.0146, + "macro_f1": 0.542222261428833, + "num_tokens": 3375653.0, + "repeat_count": 4.0, + "routers_loss": 0.06562861055135727, + "skip_count": 0.0, + "step": 2094, + "text_loss": 0.797835111618042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.000940297765928369, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3379018.0, + "repeat_count": 0.0, + "routers_loss": 0.005745889153331518, + "skip_count": 0.0, + "step": 2096, + "text_loss": 0.4238114655017853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009401510113980631, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 3382855.0, + "repeat_count": 0.0, + "routers_loss": 0.0026634482201188803, + "skip_count": 0.0, + "step": 2098, + "text_loss": 0.4967166483402252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009400040882010592, + "loss": 0.0166, + "macro_f1": 0.3333333432674408, + "num_tokens": 3386386.0, + "repeat_count": 0.0, + "routers_loss": 0.0020642587915062904, + "skip_count": 0.0, + "step": 2100, + "text_loss": 0.44390562176704407 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0009398569963936589, + "loss": 0.017, + "macro_f1": 0.3272727429866791, + "num_tokens": 3389958.0, + "repeat_count": 0.0, + "routers_loss": 0.013722737319767475, + "skip_count": 1.0, + "step": 2102, + "text_loss": 0.7207565903663635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009397097360322276, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 3392892.0, + "repeat_count": 0.0, + "routers_loss": 0.002051608171314001, + "skip_count": 0.0, + "step": 2104, + "text_loss": 0.3196398913860321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.000939562307173196, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 3396636.0, + "repeat_count": 0.0, + "routers_loss": 0.007085663266479969, + "skip_count": 0.0, + "step": 2106, + "text_loss": 0.5663776397705078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.896976812444967, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11328125, + "learning_rate": 0.0009394147098730592, + "loss": 0.02, + "macro_f1": 0.5492662787437439, + "num_tokens": 3399475.0, + "repeat_count": 0.0, + "routers_loss": 0.019473131746053696, + "skip_count": 2.0, + "step": 2108, + "text_loss": 0.7708223462104797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0009392669441883767, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 3402350.0, + "repeat_count": 0.0, + "routers_loss": 0.0028328890912234783, + "skip_count": 0.0, + "step": 2110, + "text_loss": 0.5888006091117859 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009391190101757724, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3405561.0, + "repeat_count": 0.0, + "routers_loss": 0.023098422214388847, + "skip_count": 2.0, + "step": 2112, + "text_loss": 0.09865197539329529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.000938970907891935, + "loss": 0.0247, + "macro_f1": 0.3333333432674408, + "num_tokens": 3408513.0, + "repeat_count": 0.0, + "routers_loss": 0.002896632067859173, + "skip_count": 0.0, + "step": 2114, + "text_loss": 0.6613234281539917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009388226373936179, + "loss": 0.0211, + "macro_f1": 0.3333333432674408, + "num_tokens": 3411195.0, + "repeat_count": 0.0, + "routers_loss": 0.015814457088708878, + "skip_count": 0.0, + "step": 2116, + "text_loss": 0.17363053560256958 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.94393894922219, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009386741987376381, + "loss": 0.015, + "macro_f1": 0.6603773832321167, + "num_tokens": 3414875.0, + "repeat_count": 1.0, + "routers_loss": 0.02676783688366413, + "skip_count": 0.0, + "step": 2118, + "text_loss": 0.674056887626648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009385255919808778, + "loss": 0.0203, + "macro_f1": 0.6666666865348816, + "num_tokens": 3418410.0, + "repeat_count": 0.0, + "routers_loss": 0.01022857241332531, + "skip_count": 1.0, + "step": 2120, + "text_loss": 0.235092431306839 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.962723803933079, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009383768171802836, + "loss": 0.0244, + "macro_f1": 0.5492662787437439, + "num_tokens": 3421289.0, + "repeat_count": 0.0, + "routers_loss": 0.013572212308645248, + "skip_count": 2.0, + "step": 2122, + "text_loss": 0.5992844104766846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009382278743928659, + "loss": 0.0201, + "macro_f1": 0.6666666865348816, + "num_tokens": 3424781.0, + "repeat_count": 0.0, + "routers_loss": 0.0051873656921088696, + "skip_count": 2.0, + "step": 2124, + "text_loss": 0.29915499687194824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 9.981508658643968, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.07421875, + "learning_rate": 0.0009380787636757001, + "loss": 0.0155, + "macro_f1": 0.6122449040412903, + "num_tokens": 3427942.0, + "repeat_count": 0.0, + "routers_loss": 0.030079292133450508, + "skip_count": 4.0, + "step": 2126, + "text_loss": 0.24181491136550903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009379294850859256, + "loss": 0.0141, + "macro_f1": 0.3333333432674408, + "num_tokens": 3431314.0, + "repeat_count": 0.0, + "routers_loss": 0.002675612922757864, + "skip_count": 0.0, + "step": 2128, + "text_loss": 0.4669873118400574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009377800386807465, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 3435020.0, + "repeat_count": 0.0, + "routers_loss": 0.009334275498986244, + "skip_count": 0.0, + "step": 2130, + "text_loss": 0.6478219628334045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.134765625, + "learning_rate": 0.0009376304245174306, + "loss": 0.0137, + "macro_f1": 0.6000000238418579, + "num_tokens": 3438276.0, + "repeat_count": 1.0, + "routers_loss": 0.038227908313274384, + "skip_count": 2.0, + "step": 2132, + "text_loss": 0.4401201903820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0009374806426533104, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 3440938.0, + "repeat_count": 0.0, + "routers_loss": 0.006901399698108435, + "skip_count": 0.0, + "step": 2134, + "text_loss": 0.5948942303657532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009373306931457827, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3444028.0, + "repeat_count": 0.0, + "routers_loss": 0.0037061909679323435, + "skip_count": 0.0, + "step": 2136, + "text_loss": 0.5349751114845276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0009371805760523086, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 3448331.0, + "repeat_count": 0.0, + "routers_loss": 0.0025877030566334724, + "skip_count": 0.0, + "step": 2138, + "text_loss": 0.4591051936149597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.046962136777223, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009370302914304129, + "loss": 0.0144, + "macro_f1": 0.5934640765190125, + "num_tokens": 3451434.0, + "repeat_count": 0.0, + "routers_loss": 0.018742674961686134, + "skip_count": 3.0, + "step": 2140, + "text_loss": 0.23470863699913025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.056354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009368798393376851, + "loss": 0.0122, + "macro_f1": 0.3272727429866791, + "num_tokens": 3454375.0, + "repeat_count": 0.0, + "routers_loss": 0.02382594160735607, + "skip_count": 1.0, + "step": 2142, + "text_loss": 0.6077954769134521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.065746991488112, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05517578125, + "learning_rate": 0.0009367292198317787, + "loss": 0.0164, + "macro_f1": 0.5492662787437439, + "num_tokens": 3457591.0, + "repeat_count": 0.0, + "routers_loss": 0.03331060707569122, + "skip_count": 2.0, + "step": 2144, + "text_loss": 0.3691073954105377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009365784329704115, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 3460895.0, + "repeat_count": 0.0, + "routers_loss": 0.0016955457394942641, + "skip_count": 0.0, + "step": 2146, + "text_loss": 0.3947436511516571 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009364274788113651, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 3464101.0, + "repeat_count": 1.0, + "routers_loss": 0.006169239990413189, + "skip_count": 0.0, + "step": 2148, + "text_loss": 0.3348555266857147 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 10.093924273554446, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009362763574124858, + "loss": 0.019, + "macro_f1": 0.9265305995941162, + "num_tokens": 3467417.0, + "repeat_count": 3.0, + "routers_loss": 0.024033790454268456, + "skip_count": 1.0, + "step": 2150, + "text_loss": 0.496633380651474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0009361250688316829, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 3470917.0, + "repeat_count": 0.0, + "routers_loss": 0.0024986129719763994, + "skip_count": 0.0, + "step": 2152, + "text_loss": 0.6857671737670898 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0009359736131269312, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3473624.0, + "repeat_count": 0.0, + "routers_loss": 0.008183322846889496, + "skip_count": 1.0, + "step": 2154, + "text_loss": 0.13883116841316223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009358219903562684, + "loss": 0.0106, + "macro_f1": 0.6666666865348816, + "num_tokens": 3476472.0, + "repeat_count": 0.0, + "routers_loss": 0.011198793537914753, + "skip_count": 3.0, + "step": 2156, + "text_loss": 0.24243666231632233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009356702005777969, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 3479688.0, + "repeat_count": 0.0, + "routers_loss": 0.002520184963941574, + "skip_count": 0.0, + "step": 2158, + "text_loss": 0.6407818794250488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009355182438496825, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 3482598.0, + "repeat_count": 0.0, + "routers_loss": 0.0011065017897635698, + "skip_count": 0.0, + "step": 2160, + "text_loss": 0.7214245796203613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009353661202301557, + "loss": 0.0144, + "macro_f1": 0.3333333432674408, + "num_tokens": 3486271.0, + "repeat_count": 0.0, + "routers_loss": 0.0017824085662141442, + "skip_count": 0.0, + "step": 2162, + "text_loss": 0.5140969157218933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0009352138297775101, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 3489206.0, + "repeat_count": 0.0, + "routers_loss": 0.001542879967018962, + "skip_count": 0.0, + "step": 2164, + "text_loss": 0.7956416606903076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.000935061372550104, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 3492003.0, + "repeat_count": 0.0, + "routers_loss": 0.01420794241130352, + "skip_count": 3.0, + "step": 2166, + "text_loss": 0.27489882707595825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009349087486063594, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3494784.0, + "repeat_count": 0.0, + "routers_loss": 0.003614309709519148, + "skip_count": 1.0, + "step": 2168, + "text_loss": 0.2962227761745453 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.187848547108894, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009347559580047618, + "loss": 0.0175, + "macro_f1": 0.8814815282821655, + "num_tokens": 3497886.0, + "repeat_count": 2.0, + "routers_loss": 0.02122853323817253, + "skip_count": 4.0, + "step": 2170, + "text_loss": 0.5919580459594727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06396484375, + "learning_rate": 0.000934603000803861, + "loss": 0.0135, + "macro_f1": 0.5492662787437439, + "num_tokens": 3500939.0, + "repeat_count": 0.0, + "routers_loss": 0.02042219042778015, + "skip_count": 1.0, + "step": 2172, + "text_loss": 0.28722381591796875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009344498770622704, + "loss": 0.013, + "macro_f1": 0.3333333432674408, + "num_tokens": 3504852.0, + "repeat_count": 0.0, + "routers_loss": 0.004345106892287731, + "skip_count": 0.0, + "step": 2174, + "text_loss": 0.603236734867096 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009342965868386673, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 3508320.0, + "repeat_count": 0.0, + "routers_loss": 0.00368050136603415, + "skip_count": 0.0, + "step": 2176, + "text_loss": 0.6020491719245911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.000934143130191793, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 3511278.0, + "repeat_count": 0.0, + "routers_loss": 0.013425769284367561, + "skip_count": 0.0, + "step": 2178, + "text_loss": 0.5954724550247192 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060546875, + "learning_rate": 0.000933989507180452, + "loss": 0.0149, + "macro_f1": 0.3333333432674408, + "num_tokens": 3514361.0, + "repeat_count": 0.0, + "routers_loss": 0.002896249992772937, + "skip_count": 0.0, + "step": 2180, + "text_loss": 0.39175131916999817 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.244203111241562, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0009338357178635135, + "loss": 0.0147, + "macro_f1": 0.6603773832321167, + "num_tokens": 3517962.0, + "repeat_count": 1.0, + "routers_loss": 0.011538350023329258, + "skip_count": 1.0, + "step": 2182, + "text_loss": 0.4482830762863159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.253595538597006, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009336817622999093, + "loss": 0.011, + "macro_f1": 0.3272727429866791, + "num_tokens": 3521299.0, + "repeat_count": 1.0, + "routers_loss": 0.022787930443882942, + "skip_count": 0.0, + "step": 2184, + "text_loss": 0.35177817940711975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009335276405486357, + "loss": 0.0139, + "macro_f1": 0.3272727429866791, + "num_tokens": 3524611.0, + "repeat_count": 0.0, + "routers_loss": 0.011597735807299614, + "skip_count": 1.0, + "step": 2186, + "text_loss": 0.24868851900100708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009333733526687524, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 3528012.0, + "repeat_count": 0.0, + "routers_loss": 0.014253967441618443, + "skip_count": 0.0, + "step": 2188, + "text_loss": 0.3970910310745239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.000933218898719383, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 3530908.0, + "repeat_count": 0.0, + "routers_loss": 0.001659149187617004, + "skip_count": 0.0, + "step": 2190, + "text_loss": 0.7618573307991028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009330642787597141, + "loss": 0.0159, + "macro_f1": 0.3333333432674408, + "num_tokens": 3533993.0, + "repeat_count": 0.0, + "routers_loss": 0.005574346985667944, + "skip_count": 0.0, + "step": 2192, + "text_loss": 0.16470147669315338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009329094928489969, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3537310.0, + "repeat_count": 0.0, + "routers_loss": 0.0026400673668831587, + "skip_count": 0.0, + "step": 2194, + "text_loss": 0.3400416374206543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009327545410465452, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3540045.0, + "repeat_count": 0.0, + "routers_loss": 0.008448398672044277, + "skip_count": 3.0, + "step": 2196, + "text_loss": 0.3110542297363281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.31934253008512, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009325994234117372, + "loss": 0.0122, + "macro_f1": 0.32098764181137085, + "num_tokens": 3544097.0, + "repeat_count": 0.0, + "routers_loss": 0.037553198635578156, + "skip_count": 2.0, + "step": 2198, + "text_loss": 0.36126700043678284 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.09716796875, + "learning_rate": 0.000932444140004014, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3547054.0, + "repeat_count": 1.0, + "routers_loss": 0.006464479025453329, + "skip_count": 0.0, + "step": 2200, + "text_loss": 0.4947047233581543 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009322886908828805, + "loss": 0.0138, + "macro_f1": 0.6666666865348816, + "num_tokens": 3549903.0, + "repeat_count": 1.0, + "routers_loss": 0.005384812597185373, + "skip_count": 0.0, + "step": 2202, + "text_loss": 0.5923738479614258 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009321330761079052, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 3553745.0, + "repeat_count": 0.0, + "routers_loss": 0.015346619300544262, + "skip_count": 2.0, + "step": 2204, + "text_loss": 0.1904175877571106 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.356912239506897, + "f1_execute": 0.9268292784690857, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06494140625, + "learning_rate": 0.00093197729573872, + "loss": 0.0203, + "macro_f1": 0.8422764539718628, + "num_tokens": 3557235.0, + "repeat_count": 3.0, + "routers_loss": 0.1207597479224205, + "skip_count": 6.0, + "step": 2206, + "text_loss": 0.3904837667942047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0009318213498350202, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3560795.0, + "repeat_count": 0.0, + "routers_loss": 0.003334777895361185, + "skip_count": 0.0, + "step": 2208, + "text_loss": 0.4268290102481842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0009316652384565645, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3563754.0, + "repeat_count": 0.0, + "routers_loss": 0.004230072256177664, + "skip_count": 0.0, + "step": 2210, + "text_loss": 0.40049710869789124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046875, + "learning_rate": 0.0009315089616631751, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 3567173.0, + "repeat_count": 0.0, + "routers_loss": 0.0006645230459980667, + "skip_count": 0.0, + "step": 2212, + "text_loss": 0.42568323016166687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009313525195147376, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3570831.0, + "repeat_count": 0.0, + "routers_loss": 0.0097877848893404, + "skip_count": 0.0, + "step": 2214, + "text_loss": 0.45808279514312744 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 10.40387437628412, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.076171875, + "learning_rate": 0.000931195912071201, + "loss": 0.0187, + "macro_f1": 0.7018141150474548, + "num_tokens": 3573745.0, + "repeat_count": 2.0, + "routers_loss": 0.07351134717464447, + "skip_count": 3.0, + "step": 2216, + "text_loss": 0.285696804523468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009310391393925775, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 3576785.0, + "repeat_count": 0.0, + "routers_loss": 0.0033160944003611803, + "skip_count": 0.0, + "step": 2218, + "text_loss": 0.17516443133354187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.422659230995011, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.047119140625, + "learning_rate": 0.0009308822015389424, + "loss": 0.0241, + "macro_f1": 0.5427350401878357, + "num_tokens": 3580695.0, + "repeat_count": 1.0, + "routers_loss": 0.052930232137441635, + "skip_count": 1.0, + "step": 2220, + "text_loss": 0.5918155908584595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 10.432051658350455, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.072265625, + "learning_rate": 0.0009307250985704352, + "loss": 0.0128, + "macro_f1": 0.6122449040412903, + "num_tokens": 3583729.0, + "repeat_count": 0.0, + "routers_loss": 0.025454653427004814, + "skip_count": 4.0, + "step": 2222, + "text_loss": 0.2652169466018677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0009305678305472575, + "loss": 0.0158, + "macro_f1": 0.3333333432674408, + "num_tokens": 3586775.0, + "repeat_count": 0.0, + "routers_loss": 0.011279845610260963, + "skip_count": 0.0, + "step": 2224, + "text_loss": 0.3511691987514496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.000930410397529675, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 3589676.0, + "repeat_count": 0.0, + "routers_loss": 0.002700264798477292, + "skip_count": 0.0, + "step": 2226, + "text_loss": 0.24045433104038239 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.000930252799578016, + "loss": 0.0146, + "macro_f1": 1.0, + "num_tokens": 3593242.0, + "repeat_count": 1.0, + "routers_loss": 0.00826631672680378, + "skip_count": 2.0, + "step": 2228, + "text_loss": 0.3777645528316498 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.469621367772234, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009300950367526728, + "loss": 0.0131, + "macro_f1": 0.8820862174034119, + "num_tokens": 3596807.0, + "repeat_count": 2.0, + "routers_loss": 0.036221496760845184, + "skip_count": 2.0, + "step": 2230, + "text_loss": 0.502962589263916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009299371091141001, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3600150.0, + "repeat_count": 0.0, + "routers_loss": 0.006449893582612276, + "skip_count": 0.0, + "step": 2232, + "text_loss": 0.20256924629211426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009297790167228161, + "loss": 0.012, + "macro_f1": 0.6666666865348816, + "num_tokens": 3602988.0, + "repeat_count": 0.0, + "routers_loss": 0.007872486487030983, + "skip_count": 2.0, + "step": 2234, + "text_loss": 0.42476826906204224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.497798649838568, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009296207596394022, + "loss": 0.0101, + "macro_f1": 0.32098764181137085, + "num_tokens": 3606071.0, + "repeat_count": 0.0, + "routers_loss": 0.027397040277719498, + "skip_count": 2.0, + "step": 2236, + "text_loss": 0.23432791233062744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009294623379245028, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 3609389.0, + "repeat_count": 0.0, + "routers_loss": 0.01042645052075386, + "skip_count": 0.0, + "step": 2238, + "text_loss": 0.16665785014629364 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009293037516388252, + "loss": 0.0161, + "macro_f1": 0.3333333432674408, + "num_tokens": 3612105.0, + "repeat_count": 0.0, + "routers_loss": 0.0012458425480872393, + "skip_count": 0.0, + "step": 2240, + "text_loss": 0.59421306848526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 10.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009291450008431404, + "loss": 0.0185, + "macro_f1": 1.0, + "num_tokens": 3615439.0, + "repeat_count": 1.0, + "routers_loss": 0.005781981628388166, + "skip_count": 1.0, + "step": 2242, + "text_loss": 0.510798454284668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 10.535368359260346, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009289860855982814, + "loss": 0.0166, + "macro_f1": 0.4871794879436493, + "num_tokens": 3618842.0, + "repeat_count": 0.0, + "routers_loss": 0.031195320188999176, + "skip_count": 3.0, + "step": 2244, + "text_loss": 0.7574363350868225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0009288270059651454, + "loss": 0.0133, + "macro_f1": 0.3333333432674408, + "num_tokens": 3621823.0, + "repeat_count": 0.0, + "routers_loss": 0.001746491645462811, + "skip_count": 0.0, + "step": 2246, + "text_loss": 0.5125683546066284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.554153213971237, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.220703125, + "learning_rate": 0.0009286677620046918, + "loss": 0.0159, + "macro_f1": 0.5492662787437439, + "num_tokens": 3624502.0, + "repeat_count": 0.0, + "routers_loss": 0.03792348504066467, + "skip_count": 2.0, + "step": 2248, + "text_loss": 0.7533677220344543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009285083537779429, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3627057.0, + "repeat_count": 0.0, + "routers_loss": 0.0009684451506473124, + "skip_count": 0.0, + "step": 2250, + "text_loss": 0.2219279706478119 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.572938068682125, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009283487813459845, + "loss": 0.0148, + "macro_f1": 0.5492662787437439, + "num_tokens": 3629720.0, + "repeat_count": 0.0, + "routers_loss": 0.022757573053240776, + "skip_count": 2.0, + "step": 2252, + "text_loss": 0.6903313994407654 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009281890447699652, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 3633234.0, + "repeat_count": 1.0, + "routers_loss": 0.003613058477640152, + "skip_count": 0.0, + "step": 2254, + "text_loss": 0.6278893351554871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0009280291441110961, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3636289.0, + "repeat_count": 0.0, + "routers_loss": 0.006214062683284283, + "skip_count": 0.0, + "step": 2256, + "text_loss": 0.3011114001274109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.60111535074846, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.041015625, + "learning_rate": 0.0009278690794306517, + "loss": 0.014, + "macro_f1": 0.5492662787437439, + "num_tokens": 3640251.0, + "repeat_count": 0.0, + "routers_loss": 0.052556321024894714, + "skip_count": 2.0, + "step": 2258, + "text_loss": 0.19894185662269592 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 10.610507778103903, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.08251953125, + "learning_rate": 0.0009277088507899689, + "loss": 0.0163, + "macro_f1": 0.9452888369560242, + "num_tokens": 3643527.0, + "repeat_count": 4.0, + "routers_loss": 0.0572301521897316, + "skip_count": 1.0, + "step": 2260, + "text_loss": 0.5593410134315491 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009275484582504475, + "loss": 0.0104, + "macro_f1": 0.3333333432674408, + "num_tokens": 3646959.0, + "repeat_count": 0.0, + "routers_loss": 0.008010074496269226, + "skip_count": 0.0, + "step": 2262, + "text_loss": 0.2128177285194397 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 10.629292632814794, + "f1_execute": 0.95652174949646, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.800000011920929, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009273879018735505, + "loss": 0.0138, + "macro_f1": 0.8521739840507507, + "num_tokens": 3651298.0, + "repeat_count": 3.0, + "routers_loss": 0.035729870200157166, + "skip_count": 3.0, + "step": 2264, + "text_loss": 0.2987811267375946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009272271817208031, + "loss": 0.0182, + "macro_f1": 0.3333333432674408, + "num_tokens": 3655609.0, + "repeat_count": 0.0, + "routers_loss": 0.002379779238253832, + "skip_count": 0.0, + "step": 2266, + "text_loss": 0.6024088263511658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009270662978537939, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 3658444.0, + "repeat_count": 0.0, + "routers_loss": 0.008943650871515274, + "skip_count": 0.0, + "step": 2268, + "text_loss": 0.1741207242012024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 10.657469914881126, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009269052503341736, + "loss": 0.0161, + "macro_f1": 0.6595745086669922, + "num_tokens": 3662282.0, + "repeat_count": 1.0, + "routers_loss": 0.030201267451047897, + "skip_count": 4.0, + "step": 2270, + "text_loss": 0.7300035953521729 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0009267440392236562, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 3665531.0, + "repeat_count": 0.0, + "routers_loss": 0.0026635683607310057, + "skip_count": 0.0, + "step": 2272, + "text_loss": 0.31535038352012634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009265826645840178, + "loss": 0.0151, + "macro_f1": 0.3333333432674408, + "num_tokens": 3668407.0, + "repeat_count": 0.0, + "routers_loss": 0.004258926957845688, + "skip_count": 0.0, + "step": 2274, + "text_loss": 0.7272579073905945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 10.68564719694746, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.125, + "learning_rate": 0.0009264211264770976, + "loss": 0.0154, + "macro_f1": 0.6122449040412903, + "num_tokens": 3671503.0, + "repeat_count": 0.0, + "routers_loss": 0.038987524807453156, + "skip_count": 4.0, + "step": 2276, + "text_loss": 0.7488982677459717 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009262594249647975, + "loss": 0.0164, + "macro_f1": 0.6666666865348816, + "num_tokens": 3674107.0, + "repeat_count": 0.0, + "routers_loss": 0.007211760152131319, + "skip_count": 1.0, + "step": 2278, + "text_loss": 0.1992369294166565 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 10.704432051658351, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0009260975601090815, + "loss": 0.0112, + "macro_f1": 0.9446290731430054, + "num_tokens": 3677184.0, + "repeat_count": 4.0, + "routers_loss": 0.02538592554628849, + "skip_count": 3.0, + "step": 2280, + "text_loss": 0.46402135491371155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009259355319719768, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 3680683.0, + "repeat_count": 0.0, + "routers_loss": 0.0038464947137981653, + "skip_count": 0.0, + "step": 2282, + "text_loss": 0.5804527401924133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009257733406155726, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3683928.0, + "repeat_count": 0.0, + "routers_loss": 0.004841136280447245, + "skip_count": 0.0, + "step": 2284, + "text_loss": 0.4834538400173187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009256109861020212, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 3687101.0, + "repeat_count": 0.0, + "routers_loss": 0.002191900508478284, + "skip_count": 0.0, + "step": 2286, + "text_loss": 0.8199604749679565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.742001761080129, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0927734375, + "learning_rate": 0.000925448468493537, + "loss": 0.0162, + "macro_f1": 0.5427350401878357, + "num_tokens": 3690490.0, + "repeat_count": 1.0, + "routers_loss": 0.03488675877451897, + "skip_count": 2.0, + "step": 2288, + "text_loss": 0.33263635635375977 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009252857878523971, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 3694109.0, + "repeat_count": 1.0, + "routers_loss": 0.002897309372201562, + "skip_count": 0.0, + "step": 2290, + "text_loss": 0.47494807839393616 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.000925122944240941, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3697233.0, + "repeat_count": 0.0, + "routers_loss": 0.01842675730586052, + "skip_count": 2.0, + "step": 2292, + "text_loss": 0.14693495631217957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.770179043146463, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.045654296875, + "learning_rate": 0.0009249599377215707, + "loss": 0.0146, + "macro_f1": 0.5866667032241821, + "num_tokens": 3700376.0, + "repeat_count": 1.0, + "routers_loss": 0.04169808700680733, + "skip_count": 3.0, + "step": 2294, + "text_loss": 0.38051268458366394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.779571470501908, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0009247967683567507, + "loss": 0.0112, + "macro_f1": 0.3272727429866791, + "num_tokens": 3703212.0, + "repeat_count": 0.0, + "routers_loss": 0.012183113023638725, + "skip_count": 1.0, + "step": 2296, + "text_loss": 0.23789077997207642 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 10.788963897857352, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05712890625, + "learning_rate": 0.0009246334362090077, + "loss": 0.0137, + "macro_f1": 0.8823530077934265, + "num_tokens": 3706490.0, + "repeat_count": 1.0, + "routers_loss": 0.01880069635808468, + "skip_count": 2.0, + "step": 2298, + "text_loss": 0.29067978262901306 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.798356325212797, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.000924469941340931, + "loss": 0.0173, + "macro_f1": 0.3272727429866791, + "num_tokens": 3709804.0, + "repeat_count": 1.0, + "routers_loss": 0.027359159663319588, + "skip_count": 0.0, + "step": 2300, + "text_loss": 0.67828369140625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.000924306283815172, + "loss": 0.0153, + "macro_f1": 0.3333333432674408, + "num_tokens": 3712824.0, + "repeat_count": 0.0, + "routers_loss": 0.003152279881760478, + "skip_count": 0.0, + "step": 2302, + "text_loss": 0.8333184719085693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.817141179923686, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0703125, + "learning_rate": 0.0009241424636944445, + "loss": 0.0159, + "macro_f1": 0.5492662787437439, + "num_tokens": 3715385.0, + "repeat_count": 0.0, + "routers_loss": 0.0442950464785099, + "skip_count": 2.0, + "step": 2304, + "text_loss": 0.41893699765205383 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 10.826533607279131, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009239784810415249, + "loss": 0.0137, + "macro_f1": 0.8823530077934265, + "num_tokens": 3719080.0, + "repeat_count": 1.0, + "routers_loss": 0.015729321166872978, + "skip_count": 2.0, + "step": 2306, + "text_loss": 0.13360483944416046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.835926034634575, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009238143359192514, + "loss": 0.0136, + "macro_f1": 0.5934640765190125, + "num_tokens": 3722439.0, + "repeat_count": 0.0, + "routers_loss": 0.028816604986786842, + "skip_count": 3.0, + "step": 2308, + "text_loss": 0.39594101905822754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.000923650028390525, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3725092.0, + "repeat_count": 0.0, + "routers_loss": 0.0036455015651881695, + "skip_count": 2.0, + "step": 2310, + "text_loss": 0.6169708371162415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009234855585183086, + "loss": 0.014, + "macro_f1": 0.6666666865348816, + "num_tokens": 3728412.0, + "repeat_count": 0.0, + "routers_loss": 0.007565604057163, + "skip_count": 1.0, + "step": 2312, + "text_loss": 0.21257059276103973 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 10.86410331670091, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0517578125, + "learning_rate": 0.0009233209263656273, + "loss": 0.0184, + "macro_f1": 0.9262410998344421, + "num_tokens": 3731467.0, + "repeat_count": 2.0, + "routers_loss": 0.02510629966855049, + "skip_count": 3.0, + "step": 2314, + "text_loss": 0.21639840304851532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0009231561319955684, + "loss": 0.0154, + "macro_f1": 0.3333333432674408, + "num_tokens": 3734906.0, + "repeat_count": 0.0, + "routers_loss": 0.00872227642685175, + "skip_count": 0.0, + "step": 2316, + "text_loss": 0.35639774799346924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009229911754712815, + "loss": 0.0176, + "macro_f1": 0.3333333432674408, + "num_tokens": 3737943.0, + "repeat_count": 0.0, + "routers_loss": 0.004695790819823742, + "skip_count": 0.0, + "step": 2318, + "text_loss": 0.5269573330879211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.892280598767243, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0009228260568559781, + "loss": 0.0115, + "macro_f1": 0.3272727429866791, + "num_tokens": 3741833.0, + "repeat_count": 1.0, + "routers_loss": 0.0217357836663723, + "skip_count": 0.0, + "step": 2320, + "text_loss": 0.5110208988189697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.901673026122689, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.0009226607762129322, + "loss": 0.0201, + "macro_f1": 0.32098764181137085, + "num_tokens": 3744642.0, + "repeat_count": 1.0, + "routers_loss": 0.05595960095524788, + "skip_count": 1.0, + "step": 2322, + "text_loss": 0.6291998624801636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0009224953336054796, + "loss": 0.0161, + "macro_f1": 0.3333333432674408, + "num_tokens": 3748127.0, + "repeat_count": 0.0, + "routers_loss": 0.0071634589694440365, + "skip_count": 0.0, + "step": 2324, + "text_loss": 0.7404762506484985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.000922329729097018, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3751373.0, + "repeat_count": 0.0, + "routers_loss": 0.0011676300782710314, + "skip_count": 0.0, + "step": 2326, + "text_loss": 0.2915459871292114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009221639627510075, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3754518.0, + "repeat_count": 0.0, + "routers_loss": 0.01039792038500309, + "skip_count": 0.0, + "step": 2328, + "text_loss": 0.22066321969032288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009219980346309702, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 3757621.0, + "repeat_count": 0.0, + "routers_loss": 0.0032070958986878395, + "skip_count": 0.0, + "step": 2330, + "text_loss": 0.5558560490608215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.076171875, + "learning_rate": 0.0009218319448004899, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 3760885.0, + "repeat_count": 0.0, + "routers_loss": 0.007085457909852266, + "skip_count": 0.0, + "step": 2332, + "text_loss": 0.4348253607749939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009216656933232129, + "loss": 0.016, + "macro_f1": 0.6666666865348816, + "num_tokens": 3764462.0, + "repeat_count": 0.0, + "routers_loss": 0.005504854489117861, + "skip_count": 1.0, + "step": 2334, + "text_loss": 0.35828644037246704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0009214992802628463, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3767159.0, + "repeat_count": 0.0, + "routers_loss": 0.0013970810687169433, + "skip_count": 0.0, + "step": 2336, + "text_loss": 0.2956557869911194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009213327056831607, + "loss": 0.0181, + "macro_f1": 0.3272727429866791, + "num_tokens": 3770408.0, + "repeat_count": 0.0, + "routers_loss": 0.0427570566534996, + "skip_count": 1.0, + "step": 2338, + "text_loss": 0.14883014559745789 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.986204872321691, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0009211659696479875, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 3773474.0, + "repeat_count": 0.0, + "routers_loss": 0.0011273405980318785, + "skip_count": 0.0, + "step": 2340, + "text_loss": 0.26011669635772705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.00092099907222122, + "loss": 0.0148, + "macro_f1": 0.3333333432674408, + "num_tokens": 3776909.0, + "repeat_count": 0.0, + "routers_loss": 0.0016178421210497618, + "skip_count": 0.0, + "step": 2342, + "text_loss": 0.49078530073165894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.000920832013466814, + "loss": 0.0129, + "macro_f1": 0.3333333432674408, + "num_tokens": 3780741.0, + "repeat_count": 0.0, + "routers_loss": 0.005510095041245222, + "skip_count": 0.0, + "step": 2344, + "text_loss": 0.4870249927043915 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0009206647934487866, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3784673.0, + "repeat_count": 1.0, + "routers_loss": 0.0047357892617583275, + "skip_count": 0.0, + "step": 2346, + "text_loss": 0.3251725733280182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0009204974122312167, + "loss": 0.0142, + "macro_f1": 0.6666666865348816, + "num_tokens": 3787503.0, + "repeat_count": 0.0, + "routers_loss": 0.00795028731226921, + "skip_count": 1.0, + "step": 2348, + "text_loss": 0.18282145261764526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0009203298698782452, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 3790528.0, + "repeat_count": 1.0, + "routers_loss": 0.0009506374481134117, + "skip_count": 0.0, + "step": 2350, + "text_loss": 0.4093080461025238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009201621664540747, + "loss": 0.0155, + "macro_f1": 0.6666666865348816, + "num_tokens": 3794134.0, + "repeat_count": 1.0, + "routers_loss": 0.005159572698175907, + "skip_count": 0.0, + "step": 2352, + "text_loss": 0.5451981425285339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009199943020229694, + "loss": 0.0148, + "macro_f1": 0.3333333432674408, + "num_tokens": 3797414.0, + "repeat_count": 0.0, + "routers_loss": 0.002356168581172824, + "skip_count": 0.0, + "step": 2354, + "text_loss": 0.3070453405380249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009198262766492554, + "loss": 0.0141, + "macro_f1": 0.6666666865348816, + "num_tokens": 3800094.0, + "repeat_count": 0.0, + "routers_loss": 0.0051761893555521965, + "skip_count": 1.0, + "step": 2356, + "text_loss": 0.5880904197692871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.00091965809039732, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3803280.0, + "repeat_count": 0.0, + "routers_loss": 0.0025952060241252184, + "skip_count": 0.0, + "step": 2358, + "text_loss": 0.5210731625556946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009194897433316127, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 3805866.0, + "repeat_count": 0.0, + "routers_loss": 0.0042560105212032795, + "skip_count": 2.0, + "step": 2360, + "text_loss": 0.6472984552383423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009193212355166446, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3808952.0, + "repeat_count": 0.0, + "routers_loss": 0.0026232977397739887, + "skip_count": 0.0, + "step": 2362, + "text_loss": 0.450063556432724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009191525670169881, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3812080.0, + "repeat_count": 0.0, + "routers_loss": 0.0034355956595391035, + "skip_count": 0.0, + "step": 2364, + "text_loss": 0.49727216362953186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.000918983737897277, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 3815282.0, + "repeat_count": 0.0, + "routers_loss": 0.0055653867311775684, + "skip_count": 1.0, + "step": 2366, + "text_loss": 0.6336377859115601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0009188147482222071, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 3818106.0, + "repeat_count": 2.0, + "routers_loss": 0.011016021482646465, + "skip_count": 2.0, + "step": 2368, + "text_loss": 0.22513329982757568 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009186455980565358, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 3821228.0, + "repeat_count": 1.0, + "routers_loss": 0.014039464294910431, + "skip_count": 0.0, + "step": 2370, + "text_loss": 0.21331638097763062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009184762874650816, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 3825048.0, + "repeat_count": 0.0, + "routers_loss": 0.001088051125407219, + "skip_count": 0.0, + "step": 2372, + "text_loss": 0.6031543612480164 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009183068165127245, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 3828781.0, + "repeat_count": 0.0, + "routers_loss": 0.006263940595090389, + "skip_count": 1.0, + "step": 2374, + "text_loss": 0.6249601244926453 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009181371852644062, + "loss": 0.0133, + "macro_f1": 0.6666666865348816, + "num_tokens": 3832507.0, + "repeat_count": 1.0, + "routers_loss": 0.001987969037145376, + "skip_count": 0.0, + "step": 2376, + "text_loss": 0.37972065806388855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009179673937851299, + "loss": 0.0158, + "macro_f1": 0.6666666865348816, + "num_tokens": 3835644.0, + "repeat_count": 0.0, + "routers_loss": 0.007635094691067934, + "skip_count": 1.0, + "step": 2378, + "text_loss": 0.46319663524627686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009177974421399598, + "loss": 0.0137, + "macro_f1": 0.6666666865348816, + "num_tokens": 3838700.0, + "repeat_count": 0.0, + "routers_loss": 0.01617279462516308, + "skip_count": 2.0, + "step": 2380, + "text_loss": 0.32141056656837463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009176273303940217, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 3841953.0, + "repeat_count": 0.0, + "routers_loss": 0.0022273799404501915, + "skip_count": 2.0, + "step": 2382, + "text_loss": 0.5908139944076538 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.192544760786616, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009174570586125026, + "loss": 0.0122, + "macro_f1": 0.32098767161369324, + "num_tokens": 3845763.0, + "repeat_count": 1.0, + "routers_loss": 0.030915161594748497, + "skip_count": 0.0, + "step": 2384, + "text_loss": 0.41400137543678284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0009172866268606513, + "loss": 0.0122, + "macro_f1": 0.6666666865348816, + "num_tokens": 3848984.0, + "repeat_count": 0.0, + "routers_loss": 0.010480951517820358, + "skip_count": 2.0, + "step": 2386, + "text_loss": 0.2560874819755554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009171160352037775, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3852118.0, + "repeat_count": 0.0, + "routers_loss": 0.00809961836785078, + "skip_count": 1.0, + "step": 2388, + "text_loss": 0.28236693143844604 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009169452837072521, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 3855314.0, + "repeat_count": 1.0, + "routers_loss": 0.005569872446358204, + "skip_count": 1.0, + "step": 2390, + "text_loss": 0.4578137695789337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009167743724365073, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 3858301.0, + "repeat_count": 0.0, + "routers_loss": 0.0038610948249697685, + "skip_count": 1.0, + "step": 2392, + "text_loss": 0.14082716405391693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009166033014570368, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3861296.0, + "repeat_count": 0.0, + "routers_loss": 0.0017607157351449132, + "skip_count": 0.0, + "step": 2394, + "text_loss": 0.384442001581192 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 11.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009164320708343954, + "loss": 0.0131, + "macro_f1": 0.6666666865348816, + "num_tokens": 3863985.0, + "repeat_count": 2.0, + "routers_loss": 0.009627950377762318, + "skip_count": 0.0, + "step": 2396, + "text_loss": 0.6969521045684814 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009162606806341989, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 3866636.0, + "repeat_count": 0.0, + "routers_loss": 0.006915586534887552, + "skip_count": 0.0, + "step": 2398, + "text_loss": 0.48069697618484497 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0009160891309221242, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 3870867.0, + "repeat_count": 1.0, + "routers_loss": 0.0013031222624704242, + "skip_count": 0.0, + "step": 2400, + "text_loss": 0.3882075846195221 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.277076606985618, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009159174217639096, + "loss": 0.0112, + "macro_f1": 0.5427350401878357, + "num_tokens": 3873663.0, + "repeat_count": 2.0, + "routers_loss": 0.06621067970991135, + "skip_count": 1.0, + "step": 2402, + "text_loss": 0.5740041136741638 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0009157455532253547, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 3876788.0, + "repeat_count": 1.0, + "routers_loss": 0.005957918707281351, + "skip_count": 0.0, + "step": 2404, + "text_loss": 0.26025933027267456 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 11.295861461696507, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009155735253723191, + "loss": 0.0126, + "macro_f1": 0.9452888369560242, + "num_tokens": 3879942.0, + "repeat_count": 1.0, + "routers_loss": 0.039429809898138046, + "skip_count": 4.0, + "step": 2406, + "text_loss": 1.1349908113479614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009154013382707251, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 3882682.0, + "repeat_count": 0.0, + "routers_loss": 0.0012570557883009315, + "skip_count": 0.0, + "step": 2408, + "text_loss": 0.5611135363578796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0009152289919865543, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3886425.0, + "repeat_count": 0.0, + "routers_loss": 0.0017455556662753224, + "skip_count": 0.0, + "step": 2410, + "text_loss": 0.7523751854896545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0009150564865858506, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3889273.0, + "repeat_count": 0.0, + "routers_loss": 0.011178011074662209, + "skip_count": 1.0, + "step": 2412, + "text_loss": 0.26942551136016846 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 11.333431171118287, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009148838221347182, + "loss": 0.0107, + "macro_f1": 0.5934640765190125, + "num_tokens": 3892199.0, + "repeat_count": 3.0, + "routers_loss": 0.019628092646598816, + "skip_count": 0.0, + "step": 2414, + "text_loss": 0.5492315888404846 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0009147109986993225, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 3895362.0, + "repeat_count": 1.0, + "routers_loss": 0.012255983427166939, + "skip_count": 0.0, + "step": 2416, + "text_loss": 0.23798216879367828 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009145380163458899, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 3898476.0, + "repeat_count": 0.0, + "routers_loss": 0.007018954027444124, + "skip_count": 0.0, + "step": 2418, + "text_loss": 0.1923145055770874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0009143648751407074, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 3901817.0, + "repeat_count": 0.0, + "routers_loss": 0.0008574824314564466, + "skip_count": 0.0, + "step": 2420, + "text_loss": 0.4001806974411011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.371000880540064, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11328125, + "learning_rate": 0.0009141915751501231, + "loss": 0.0102, + "macro_f1": 0.5492662787437439, + "num_tokens": 3905461.0, + "repeat_count": 0.0, + "routers_loss": 0.01572350226342678, + "skip_count": 2.0, + "step": 2422, + "text_loss": 0.19519129395484924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0009140181164405458, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3908878.0, + "repeat_count": 0.0, + "routers_loss": 0.0005503420252352953, + "skip_count": 0.0, + "step": 2424, + "text_loss": 0.6937088370323181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009138444990784454, + "loss": 0.013, + "macro_f1": 0.3333333432674408, + "num_tokens": 3912053.0, + "repeat_count": 0.0, + "routers_loss": 0.007556677330285311, + "skip_count": 0.0, + "step": 2426, + "text_loss": 0.35431069135665894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.000913670723130352, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 3915192.0, + "repeat_count": 0.0, + "routers_loss": 0.0013609991874545813, + "skip_count": 0.0, + "step": 2428, + "text_loss": 0.5171207189559937 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009134967886628573, + "loss": 0.0115, + "macro_f1": 1.0, + "num_tokens": 3917927.0, + "repeat_count": 2.0, + "routers_loss": 0.010895746760070324, + "skip_count": 2.0, + "step": 2430, + "text_loss": 0.2852934002876282 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.417963017317287, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009133226957426133, + "loss": 0.0132, + "macro_f1": 0.5492662787437439, + "num_tokens": 3921460.0, + "repeat_count": 2.0, + "routers_loss": 0.04196908697485924, + "skip_count": 0.0, + "step": 2432, + "text_loss": 0.4864770770072937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009131484444363324, + "loss": 0.0155, + "macro_f1": 0.3333333432674408, + "num_tokens": 3924662.0, + "repeat_count": 0.0, + "routers_loss": 0.004484197124838829, + "skip_count": 0.0, + "step": 2434, + "text_loss": 0.7568684220314026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0009129740348107882, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3927337.0, + "repeat_count": 0.0, + "routers_loss": 0.004351360257714987, + "skip_count": 2.0, + "step": 2436, + "text_loss": 0.5953161716461182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 11.446140299383622, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.04736328125, + "learning_rate": 0.0009127994669328151, + "loss": 0.0085, + "macro_f1": 0.6122449040412903, + "num_tokens": 3930407.0, + "repeat_count": 0.0, + "routers_loss": 0.01664198748767376, + "skip_count": 4.0, + "step": 2438, + "text_loss": 0.5320524573326111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009126247408693071, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 3933184.0, + "repeat_count": 0.0, + "routers_loss": 0.0017819046042859554, + "skip_count": 1.0, + "step": 2440, + "text_loss": 0.6051273345947266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009124498566872204, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 3936620.0, + "repeat_count": 0.0, + "routers_loss": 0.005519696045666933, + "skip_count": 0.0, + "step": 2442, + "text_loss": 0.12987950444221497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.474317581449956, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009122748144535704, + "loss": 0.0111, + "macro_f1": 0.32098764181137085, + "num_tokens": 3940010.0, + "repeat_count": 0.0, + "routers_loss": 0.04543351009488106, + "skip_count": 2.0, + "step": 2444, + "text_loss": 0.4642033576965332 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009120996142354338, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3943135.0, + "repeat_count": 0.0, + "routers_loss": 0.00550565542653203, + "skip_count": 0.0, + "step": 2446, + "text_loss": 0.5697627067565918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009119242560999477, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3946650.0, + "repeat_count": 0.0, + "routers_loss": 0.008842485956847668, + "skip_count": 0.0, + "step": 2448, + "text_loss": 0.17046524584293365 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0009117487401143095, + "loss": 0.0154, + "macro_f1": 0.6666666865348816, + "num_tokens": 3949470.0, + "repeat_count": 1.0, + "routers_loss": 0.005900127813220024, + "skip_count": 0.0, + "step": 2450, + "text_loss": 0.37260866165161133 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0009115730663457773, + "loss": 0.0137, + "macro_f1": 1.0, + "num_tokens": 3952546.0, + "repeat_count": 1.0, + "routers_loss": 0.003409258322790265, + "skip_count": 1.0, + "step": 2452, + "text_loss": 0.5308008193969727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009113972348616698, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 3955817.0, + "repeat_count": 0.0, + "routers_loss": 0.010098597034811974, + "skip_count": 1.0, + "step": 2454, + "text_loss": 0.39226648211479187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 11.530672145582624, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009112212457293658, + "loss": 0.0102, + "macro_f1": 0.3272727429866791, + "num_tokens": 3958911.0, + "repeat_count": 0.0, + "routers_loss": 0.08184818178415298, + "skip_count": 0.0, + "step": 2456, + "text_loss": 0.45411455631256104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0009110450990163047, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 3962584.0, + "repeat_count": 0.0, + "routers_loss": 0.0009352223132736981, + "skip_count": 0.0, + "step": 2458, + "text_loss": 0.47292324900627136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0009108687947899863, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 3965597.0, + "repeat_count": 1.0, + "routers_loss": 0.008150188252329826, + "skip_count": 2.0, + "step": 2460, + "text_loss": 0.33208340406417847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.558849427648958, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009106923331179707, + "loss": 0.0125, + "macro_f1": 0.5492662787437439, + "num_tokens": 3968664.0, + "repeat_count": 0.0, + "routers_loss": 0.050999004393815994, + "skip_count": 2.0, + "step": 2462, + "text_loss": 0.2459995150566101 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009105157140678782, + "loss": 0.0126, + "macro_f1": 0.6666666865348816, + "num_tokens": 3971772.0, + "repeat_count": 0.0, + "routers_loss": 0.006196586415171623, + "skip_count": 1.0, + "step": 2464, + "text_loss": 0.23956991732120514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.577634282359847, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009103389377073896, + "loss": 0.01, + "macro_f1": 0.3333333432674408, + "num_tokens": 3976224.0, + "repeat_count": 0.0, + "routers_loss": 0.008181816898286343, + "skip_count": 0.0, + "step": 2466, + "text_loss": 0.3235875070095062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.0009101620041042462, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3978876.0, + "repeat_count": 0.0, + "routers_loss": 0.0015451472718268633, + "skip_count": 0.0, + "step": 2468, + "text_loss": 0.4038759469985962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.596419137070736, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09130859375, + "learning_rate": 0.000909984913326249, + "loss": 0.0131, + "macro_f1": 0.3272727429866791, + "num_tokens": 3981992.0, + "repeat_count": 0.0, + "routers_loss": 0.021785033866763115, + "skip_count": 1.0, + "step": 2470, + "text_loss": 0.6346460580825806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009098076654412595, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 3984560.0, + "repeat_count": 0.0, + "routers_loss": 0.0011462471447885036, + "skip_count": 0.0, + "step": 2472, + "text_loss": 0.3449646532535553 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009096302605171996, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 3987548.0, + "repeat_count": 0.0, + "routers_loss": 0.0014367027906700969, + "skip_count": 0.0, + "step": 2474, + "text_loss": 0.5918350219726562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0009094526986220513, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 3990727.0, + "repeat_count": 0.0, + "routers_loss": 0.0008977655088528991, + "skip_count": 0.0, + "step": 2476, + "text_loss": 0.463350385427475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.633988846492516, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0009092749798238563, + "loss": 0.015, + "macro_f1": 0.3272727429866791, + "num_tokens": 3993757.0, + "repeat_count": 1.0, + "routers_loss": 0.016712551936507225, + "skip_count": 0.0, + "step": 2478, + "text_loss": 0.5621229410171509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.643381273847961, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.000909097104190717, + "loss": 0.0172, + "macro_f1": 0.32098764181137085, + "num_tokens": 3997259.0, + "repeat_count": 0.0, + "routers_loss": 0.04134179651737213, + "skip_count": 2.0, + "step": 2480, + "text_loss": 0.375476598739624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0009089190717907956, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4000563.0, + "repeat_count": 0.0, + "routers_loss": 0.003462378401309252, + "skip_count": 0.0, + "step": 2482, + "text_loss": 0.5553798675537109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009087408826923146, + "loss": 0.0182, + "macro_f1": 0.6666666865348816, + "num_tokens": 4004065.0, + "repeat_count": 0.0, + "routers_loss": 0.008057428523898125, + "skip_count": 2.0, + "step": 2484, + "text_loss": 0.4329465329647064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009085625369635564, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4007119.0, + "repeat_count": 0.0, + "routers_loss": 0.005759050603955984, + "skip_count": 0.0, + "step": 2486, + "text_loss": 0.501268744468689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.680950983269739, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009083840346728631, + "loss": 0.0122, + "macro_f1": 0.3272727429866791, + "num_tokens": 4010547.0, + "repeat_count": 1.0, + "routers_loss": 0.020763102918863297, + "skip_count": 0.0, + "step": 2488, + "text_loss": 0.480196475982666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0009082053758886374, + "loss": 0.0117, + "macro_f1": 0.6666666865348816, + "num_tokens": 4014600.0, + "repeat_count": 0.0, + "routers_loss": 0.005801836494356394, + "skip_count": 1.0, + "step": 2490, + "text_loss": 0.18249782919883728 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009080265606793416, + "loss": 0.0128, + "macro_f1": 1.0, + "num_tokens": 4017964.0, + "repeat_count": 1.0, + "routers_loss": 0.004226063843816519, + "skip_count": 1.0, + "step": 2492, + "text_loss": 0.6573076248168945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.000907847589113498, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 4020694.0, + "repeat_count": 0.0, + "routers_loss": 0.004281101748347282, + "skip_count": 2.0, + "step": 2494, + "text_loss": 0.3944586217403412 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.000907668461259689, + "loss": 0.0152, + "macro_f1": 0.6666666865348816, + "num_tokens": 4023757.0, + "repeat_count": 0.0, + "routers_loss": 0.008786370046436787, + "skip_count": 1.0, + "step": 2496, + "text_loss": 0.6452898979187012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009074891771865566, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 4026601.0, + "repeat_count": 0.0, + "routers_loss": 0.005209595896303654, + "skip_count": 0.0, + "step": 2498, + "text_loss": 0.9633619785308838 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 11.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0009073097369628028, + "loss": 0.013, + "macro_f1": 1.0, + "num_tokens": 4030321.0, + "repeat_count": 3.0, + "routers_loss": 0.00860709697008133, + "skip_count": 1.0, + "step": 2500, + "text_loss": 0.48566827178001404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009071301406571893, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 4033234.0, + "repeat_count": 0.0, + "routers_loss": 0.0035277456045150757, + "skip_count": 0.0, + "step": 2502, + "text_loss": 0.3771554231643677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000906950388338538, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 4036417.0, + "repeat_count": 0.0, + "routers_loss": 0.0013424850767478347, + "skip_count": 0.0, + "step": 2504, + "text_loss": 0.8962806463241577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009067704800757301, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4039564.0, + "repeat_count": 0.0, + "routers_loss": 0.0010423909407109022, + "skip_count": 0.0, + "step": 2506, + "text_loss": 0.43170279264450073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.774875256824185, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.000906590415937707, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 4043212.0, + "repeat_count": 0.0, + "routers_loss": 0.021780289709568024, + "skip_count": 1.0, + "step": 2508, + "text_loss": 0.41495826840400696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0009064101959934696, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4046687.0, + "repeat_count": 0.0, + "routers_loss": 0.007261929102241993, + "skip_count": 1.0, + "step": 2510, + "text_loss": 0.21821187436580658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0009062298203120783, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 4050735.0, + "repeat_count": 0.0, + "routers_loss": 0.007447180338203907, + "skip_count": 2.0, + "step": 2512, + "text_loss": 0.1818767935037613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.803052538890519, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0009060492889626535, + "loss": 0.0142, + "macro_f1": 0.3272727429866791, + "num_tokens": 4054426.0, + "repeat_count": 1.0, + "routers_loss": 0.0718490406870842, + "skip_count": 0.0, + "step": 2514, + "text_loss": 0.22798970341682434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009058686020143753, + "loss": 0.0183, + "macro_f1": 0.3333333432674408, + "num_tokens": 4057615.0, + "repeat_count": 0.0, + "routers_loss": 0.0052676633931696415, + "skip_count": 0.0, + "step": 2516, + "text_loss": 0.1712338626384735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0009056877595364832, + "loss": 0.0137, + "macro_f1": 0.3333333432674408, + "num_tokens": 4060338.0, + "repeat_count": 0.0, + "routers_loss": 0.0018052728846669197, + "skip_count": 0.0, + "step": 2518, + "text_loss": 0.6811438798904419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009055067615982761, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4062887.0, + "repeat_count": 0.0, + "routers_loss": 0.0009029926732182503, + "skip_count": 0.0, + "step": 2520, + "text_loss": 0.5480356812477112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009053256082691133, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 4065357.0, + "repeat_count": 0.0, + "routers_loss": 0.0027515271212905645, + "skip_count": 0.0, + "step": 2522, + "text_loss": 0.5234101414680481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009051442996184127, + "loss": 0.0174, + "macro_f1": 0.3333333432674408, + "num_tokens": 4068111.0, + "repeat_count": 0.0, + "routers_loss": 0.002199822571128607, + "skip_count": 0.0, + "step": 2524, + "text_loss": 0.2418575882911682 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009049628357156521, + "loss": 0.0143, + "macro_f1": 0.6666666865348816, + "num_tokens": 4071284.0, + "repeat_count": 0.0, + "routers_loss": 0.006303096655756235, + "skip_count": 2.0, + "step": 2526, + "text_loss": 0.7948065996170044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.868799530378633, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000904781216630369, + "loss": 0.0068, + "macro_f1": 0.6601307392120361, + "num_tokens": 4074750.0, + "repeat_count": 1.0, + "routers_loss": 0.01791904680430889, + "skip_count": 2.0, + "step": 2528, + "text_loss": 0.809726357460022 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009045994424321602, + "loss": 0.0102, + "macro_f1": 1.0, + "num_tokens": 4078617.0, + "repeat_count": 2.0, + "routers_loss": 0.016553178429603577, + "skip_count": 2.0, + "step": 2530, + "text_loss": 0.8755000829696655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0009044175131906817, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 4080936.0, + "repeat_count": 0.0, + "routers_loss": 0.00884837657213211, + "skip_count": 0.0, + "step": 2532, + "text_loss": 0.795871913433075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.896976812444967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009042354289756491, + "loss": 0.0122, + "macro_f1": 0.3333333432674408, + "num_tokens": 4084459.0, + "repeat_count": 0.0, + "routers_loss": 0.0024387789890170097, + "skip_count": 0.0, + "step": 2534, + "text_loss": 0.18875400722026825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009040531898568379, + "loss": 0.0171, + "macro_f1": 0.3333333432674408, + "num_tokens": 4088464.0, + "repeat_count": 0.0, + "routers_loss": 0.00491489190608263, + "skip_count": 0.0, + "step": 2536, + "text_loss": 0.334369033575058 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.091796875, + "learning_rate": 0.000903870795904082, + "loss": 0.0145, + "macro_f1": 0.6666666865348816, + "num_tokens": 4091659.0, + "repeat_count": 0.0, + "routers_loss": 0.004592662677168846, + "skip_count": 2.0, + "step": 2538, + "text_loss": 0.21298295259475708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.925154094511301, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0458984375, + "learning_rate": 0.000903688247187275, + "loss": 0.0137, + "macro_f1": 0.5492662787437439, + "num_tokens": 4095496.0, + "repeat_count": 0.0, + "routers_loss": 0.011647242121398449, + "skip_count": 2.0, + "step": 2540, + "text_loss": 0.2985081672668457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009035055437763704, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4098663.0, + "repeat_count": 0.0, + "routers_loss": 0.0021238960325717926, + "skip_count": 0.0, + "step": 2542, + "text_loss": 0.35359489917755127 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0009033226857413803, + "loss": 0.0163, + "macro_f1": 0.6666666865348816, + "num_tokens": 4101588.0, + "repeat_count": 1.0, + "routers_loss": 0.0024701557122170925, + "skip_count": 0.0, + "step": 2544, + "text_loss": 1.1577601432800293 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.000903139673152376, + "loss": 0.012, + "macro_f1": 0.3333333432674408, + "num_tokens": 4104643.0, + "repeat_count": 0.0, + "routers_loss": 0.002499542199075222, + "skip_count": 0.0, + "step": 2546, + "text_loss": 1.0173401832580566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0009029565060794885, + "loss": 0.0165, + "macro_f1": 0.3333333432674408, + "num_tokens": 4109247.0, + "repeat_count": 0.0, + "routers_loss": 0.0034200598020106554, + "skip_count": 0.0, + "step": 2548, + "text_loss": 0.5690504312515259 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.972116231288524, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009027731845929079, + "loss": 0.0155, + "macro_f1": 0.8823530077934265, + "num_tokens": 4112597.0, + "repeat_count": 1.0, + "routers_loss": 0.015981333330273628, + "skip_count": 1.0, + "step": 2550, + "text_loss": 0.294549822807312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.981508658643968, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06103515625, + "learning_rate": 0.0009025897087628829, + "loss": 0.0064, + "macro_f1": 0.5492662787437439, + "num_tokens": 4115844.0, + "repeat_count": 0.0, + "routers_loss": 0.02606951631605625, + "skip_count": 2.0, + "step": 2552, + "text_loss": 0.22692419588565826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009024060786597222, + "loss": 0.0202, + "macro_f1": 0.3333333432674408, + "num_tokens": 4118634.0, + "repeat_count": 0.0, + "routers_loss": 0.001026194542646408, + "skip_count": 0.0, + "step": 2554, + "text_loss": 0.6807059645652771 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.000902222294353793, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4122024.0, + "repeat_count": 0.0, + "routers_loss": 0.001974924933165312, + "skip_count": 0.0, + "step": 2556, + "text_loss": 0.7373668551445007 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009020383559155219, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 4124803.0, + "repeat_count": 1.0, + "routers_loss": 0.004662613850086927, + "skip_count": 2.0, + "step": 2558, + "text_loss": 0.21808166801929474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0009018542634153943, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 4127680.0, + "repeat_count": 0.0, + "routers_loss": 0.006881687790155411, + "skip_count": 0.0, + "step": 2560, + "text_loss": 0.25192978978157043 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009016700169239551, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 4130431.0, + "repeat_count": 1.0, + "routers_loss": 0.005977808032184839, + "skip_count": 1.0, + "step": 2562, + "text_loss": 0.4700816869735718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009014856165118075, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 4133535.0, + "repeat_count": 0.0, + "routers_loss": 0.007005698047578335, + "skip_count": 1.0, + "step": 2564, + "text_loss": 0.6558199524879456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0009013010622496144, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 4136534.0, + "repeat_count": 0.0, + "routers_loss": 0.007262171246111393, + "skip_count": 0.0, + "step": 2566, + "text_loss": 0.2565421462059021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 12.056354564132668, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009011163542080971, + "loss": 0.0088, + "macro_f1": 0.5934640765190125, + "num_tokens": 4139762.0, + "repeat_count": 0.0, + "routers_loss": 0.05431923270225525, + "skip_count": 3.0, + "step": 2568, + "text_loss": 0.19896510243415833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0009009314924580363, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4143398.0, + "repeat_count": 0.0, + "routers_loss": 0.003667369019240141, + "skip_count": 0.0, + "step": 2570, + "text_loss": 0.6581419110298157 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0009007464770702712, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4146248.0, + "repeat_count": 0.0, + "routers_loss": 0.00132099783513695, + "skip_count": 0.0, + "step": 2572, + "text_loss": 0.5316711068153381 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0009005613081157002, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 4149455.0, + "repeat_count": 0.0, + "routers_loss": 0.0020061524119228125, + "skip_count": 0.0, + "step": 2574, + "text_loss": 0.5400773882865906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0009003759856652802, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 4152774.0, + "repeat_count": 0.0, + "routers_loss": 0.002621434163302183, + "skip_count": 1.0, + "step": 2576, + "text_loss": 0.3672606945037842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009001905097900273, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 4155835.0, + "repeat_count": 0.0, + "routers_loss": 0.005290219560265541, + "skip_count": 0.0, + "step": 2578, + "text_loss": 0.8159038424491882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0009000048805610161, + "loss": 0.0119, + "macro_f1": 0.3333333432674408, + "num_tokens": 4158874.0, + "repeat_count": 0.0, + "routers_loss": 0.0013576085912063718, + "skip_count": 0.0, + "step": 2580, + "text_loss": 0.5518951416015625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.00089981909804938, + "loss": 0.0143, + "macro_f1": 0.3333333432674408, + "num_tokens": 4162076.0, + "repeat_count": 0.0, + "routers_loss": 0.0021483441814780235, + "skip_count": 0.0, + "step": 2582, + "text_loss": 0.43552228808403015 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 12.131493982976226, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.068359375, + "learning_rate": 0.0008996331623263114, + "loss": 0.0117, + "macro_f1": 0.7795917987823486, + "num_tokens": 4165041.0, + "repeat_count": 1.0, + "routers_loss": 0.0544300302863121, + "skip_count": 4.0, + "step": 2584, + "text_loss": 0.24812501668930054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0008994470734630611, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 4168290.0, + "repeat_count": 0.0, + "routers_loss": 0.0017150711501017213, + "skip_count": 0.0, + "step": 2586, + "text_loss": 0.6392097473144531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0008992608315309388, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4171310.0, + "repeat_count": 0.0, + "routers_loss": 0.0046473173424601555, + "skip_count": 2.0, + "step": 2588, + "text_loss": 0.6534156799316406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.15967126504256, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0008990744366013125, + "loss": 0.0105, + "macro_f1": 0.3144654333591461, + "num_tokens": 4174042.0, + "repeat_count": 2.0, + "routers_loss": 0.060913100838661194, + "skip_count": 1.0, + "step": 2590, + "text_loss": 0.5365690588951111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 12.169063692398003, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008988878887456093, + "loss": 0.0118, + "macro_f1": 0.6051587462425232, + "num_tokens": 4177666.0, + "repeat_count": 1.0, + "routers_loss": 0.06268956512212753, + "skip_count": 4.0, + "step": 2592, + "text_loss": 0.226226806640625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.178456119753449, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008987011880353149, + "loss": 0.0089, + "macro_f1": 0.32098764181137085, + "num_tokens": 4180490.0, + "repeat_count": 0.0, + "routers_loss": 0.030141465365886688, + "skip_count": 2.0, + "step": 2594, + "text_loss": 0.2581401765346527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.187848547108894, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008985143345419729, + "loss": 0.0082, + "macro_f1": 0.5492662787437439, + "num_tokens": 4183300.0, + "repeat_count": 0.0, + "routers_loss": 0.018745863810181618, + "skip_count": 2.0, + "step": 2596, + "text_loss": 0.7778542637825012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.064453125, + "learning_rate": 0.0008983273283371862, + "loss": 0.0096, + "macro_f1": 0.5492662787437439, + "num_tokens": 4186535.0, + "repeat_count": 0.0, + "routers_loss": 0.026792079210281372, + "skip_count": 2.0, + "step": 2598, + "text_loss": 0.34700271487236023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008981401694926159, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4189082.0, + "repeat_count": 0.0, + "routers_loss": 0.001914160675369203, + "skip_count": 0.0, + "step": 2600, + "text_loss": 0.6879339218139648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0008979528580799815, + "loss": 0.0136, + "macro_f1": 0.6666666865348816, + "num_tokens": 4192330.0, + "repeat_count": 0.0, + "routers_loss": 0.007978348061442375, + "skip_count": 2.0, + "step": 2602, + "text_loss": 0.3524550497531891 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 12.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008977653941710613, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 4196117.0, + "repeat_count": 2.0, + "routers_loss": 0.0035376469604671, + "skip_count": 0.0, + "step": 2604, + "text_loss": 0.42356348037719727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.0008975777778376916, + "loss": 0.0156, + "macro_f1": 0.6666666865348816, + "num_tokens": 4200423.0, + "repeat_count": 0.0, + "routers_loss": 0.008262477815151215, + "skip_count": 1.0, + "step": 2606, + "text_loss": 0.5272893905639648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.244203111241562, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0008973900091517675, + "loss": 0.0114, + "macro_f1": 0.3272727429866791, + "num_tokens": 4203257.0, + "repeat_count": 0.0, + "routers_loss": 0.022957922890782356, + "skip_count": 1.0, + "step": 2608, + "text_loss": 0.2713734805583954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.253595538597006, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.000897202088185242, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 4206243.0, + "repeat_count": 0.0, + "routers_loss": 0.006623407825827599, + "skip_count": 2.0, + "step": 2610, + "text_loss": 0.5920525789260864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008970140150101274, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4209264.0, + "repeat_count": 0.0, + "routers_loss": 0.0008602747693657875, + "skip_count": 0.0, + "step": 2612, + "text_loss": 0.33421996235847473 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0008968257896984932, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 4212058.0, + "repeat_count": 0.0, + "routers_loss": 0.0024653903674334288, + "skip_count": 1.0, + "step": 2614, + "text_loss": 0.37923356890678406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008966374123224677, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4214929.0, + "repeat_count": 0.0, + "routers_loss": 0.010878405533730984, + "skip_count": 0.0, + "step": 2616, + "text_loss": 0.4350503981113434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.291165248018785, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0008964488829542376, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 4219170.0, + "repeat_count": 0.0, + "routers_loss": 0.02864212542772293, + "skip_count": 1.0, + "step": 2618, + "text_loss": 0.26250728964805603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008962602016660478, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4222077.0, + "repeat_count": 0.0, + "routers_loss": 0.010444172658026218, + "skip_count": 2.0, + "step": 2620, + "text_loss": 0.4718937575817108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008960713685302011, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4225383.0, + "repeat_count": 0.0, + "routers_loss": 0.006409442983567715, + "skip_count": 1.0, + "step": 2622, + "text_loss": 0.30420538783073425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.31934253008512, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0008958823836190588, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 4228349.0, + "repeat_count": 0.0, + "routers_loss": 0.009996986016631126, + "skip_count": 1.0, + "step": 2624, + "text_loss": 0.5392362475395203 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0008956932470050404, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 4232007.0, + "repeat_count": 0.0, + "routers_loss": 0.0014383369125425816, + "skip_count": 0.0, + "step": 2626, + "text_loss": 0.7112401127815247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0008955039587606233, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 4235122.0, + "repeat_count": 0.0, + "routers_loss": 0.00781513936817646, + "skip_count": 3.0, + "step": 2628, + "text_loss": 0.17802883684635162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 12.347519812151454, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008953145189583429, + "loss": 0.0126, + "macro_f1": 0.542222261428833, + "num_tokens": 4238248.0, + "repeat_count": 0.0, + "routers_loss": 0.062252625823020935, + "skip_count": 4.0, + "step": 2630, + "text_loss": 0.5551572442054749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008951249276707933, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4241042.0, + "repeat_count": 0.0, + "routers_loss": 0.0011421777307987213, + "skip_count": 0.0, + "step": 2632, + "text_loss": 0.7092233896255493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0008949351849706261, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4243939.0, + "repeat_count": 0.0, + "routers_loss": 0.0032689040526747704, + "skip_count": 0.0, + "step": 2634, + "text_loss": 0.19925718009471893 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008947452909305509, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 4247535.0, + "repeat_count": 1.0, + "routers_loss": 0.002066014800220728, + "skip_count": 0.0, + "step": 2636, + "text_loss": 0.5249715447425842 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 12.385089521573232, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.09326171875, + "learning_rate": 0.0008945552456233356, + "loss": 0.0169, + "macro_f1": 0.8820862174034119, + "num_tokens": 4251441.0, + "repeat_count": 2.0, + "routers_loss": 0.029332537204027176, + "skip_count": 2.0, + "step": 2638, + "text_loss": 0.19229578971862793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.078125, + "learning_rate": 0.0008943650491218058, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4254314.0, + "repeat_count": 0.0, + "routers_loss": 0.0075911120511591434, + "skip_count": 0.0, + "step": 2640, + "text_loss": 0.27059751749038696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008941747014988453, + "loss": 0.0156, + "macro_f1": 0.3333333432674408, + "num_tokens": 4257442.0, + "repeat_count": 0.0, + "routers_loss": 0.009030844084918499, + "skip_count": 0.0, + "step": 2642, + "text_loss": 0.36747801303863525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.123046875, + "learning_rate": 0.0008939842028273956, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 4260386.0, + "repeat_count": 0.0, + "routers_loss": 0.007844001986086369, + "skip_count": 1.0, + "step": 2644, + "text_loss": 0.6397647857666016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.422659230995011, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008937935531804562, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 4263516.0, + "repeat_count": 0.0, + "routers_loss": 0.0018789108144119382, + "skip_count": 0.0, + "step": 2646, + "text_loss": 0.4795534908771515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.432051658350455, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0008936027526310844, + "loss": 0.0098, + "macro_f1": 0.3272727429866791, + "num_tokens": 4266744.0, + "repeat_count": 0.0, + "routers_loss": 0.0348590686917305, + "skip_count": 1.0, + "step": 2648, + "text_loss": 0.27691999077796936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.000893411801252395, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4269766.0, + "repeat_count": 0.0, + "routers_loss": 0.004543309565633535, + "skip_count": 1.0, + "step": 2650, + "text_loss": 0.18867231905460358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008932206991175615, + "loss": 0.0141, + "macro_f1": 0.6666666865348816, + "num_tokens": 4273513.0, + "repeat_count": 0.0, + "routers_loss": 0.0035277456045150757, + "skip_count": 1.0, + "step": 2652, + "text_loss": 0.45613357424736023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008930294462998143, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4276878.0, + "repeat_count": 1.0, + "routers_loss": 0.011337592266499996, + "skip_count": 0.0, + "step": 2654, + "text_loss": 0.24733254313468933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0008928380428724419, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4279915.0, + "repeat_count": 0.0, + "routers_loss": 0.0010295971296727657, + "skip_count": 1.0, + "step": 2656, + "text_loss": 0.41722849011421204 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008926464889087903, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 4282888.0, + "repeat_count": 0.0, + "routers_loss": 0.0017198545392602682, + "skip_count": 2.0, + "step": 2658, + "text_loss": 0.738322377204895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0008924547844822634, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4285805.0, + "repeat_count": 0.0, + "routers_loss": 0.001339946174994111, + "skip_count": 0.0, + "step": 2660, + "text_loss": 0.4802379906177521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.000892262929666323, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4290282.0, + "repeat_count": 0.0, + "routers_loss": 0.0022340165451169014, + "skip_count": 0.0, + "step": 2662, + "text_loss": 0.6503544449806213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008920709245344878, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4294106.0, + "repeat_count": 0.0, + "routers_loss": 0.005288850050419569, + "skip_count": 1.0, + "step": 2664, + "text_loss": 0.12312037497758865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0008918787691603347, + "loss": 0.0121, + "macro_f1": 0.6666666865348816, + "num_tokens": 4298013.0, + "repeat_count": 0.0, + "routers_loss": 0.004259659443050623, + "skip_count": 1.0, + "step": 2666, + "text_loss": 0.3070000112056732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.000891686463617498, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 4300799.0, + "repeat_count": 0.0, + "routers_loss": 0.009489355608820915, + "skip_count": 1.0, + "step": 2668, + "text_loss": 0.18535588681697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008914940079796696, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4304641.0, + "repeat_count": 0.0, + "routers_loss": 0.0025417013093829155, + "skip_count": 0.0, + "step": 2670, + "text_loss": 0.482585072517395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008913014023205988, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4307462.0, + "repeat_count": 0.0, + "routers_loss": 0.006371749565005302, + "skip_count": 0.0, + "step": 2672, + "text_loss": 0.7064456939697266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008911086467140925, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4310396.0, + "repeat_count": 0.0, + "routers_loss": 0.0027512952219694853, + "skip_count": 0.0, + "step": 2674, + "text_loss": 0.23532851040363312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05712890625, + "learning_rate": 0.000890915741234015, + "loss": 0.0133, + "macro_f1": 0.6666666865348816, + "num_tokens": 4314781.0, + "repeat_count": 0.0, + "routers_loss": 0.008253013715147972, + "skip_count": 1.0, + "step": 2676, + "text_loss": 0.30950358510017395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008907226859542879, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4317988.0, + "repeat_count": 0.0, + "routers_loss": 0.005409995559602976, + "skip_count": 2.0, + "step": 2678, + "text_loss": 0.4930732846260071 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0008905294809488907, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 4321014.0, + "repeat_count": 1.0, + "routers_loss": 0.0029942214023321867, + "skip_count": 1.0, + "step": 2680, + "text_loss": 0.6224040389060974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0008903361262918595, + "loss": 0.0115, + "macro_f1": 0.6666666865348816, + "num_tokens": 4324268.0, + "repeat_count": 0.0, + "routers_loss": 0.008411120623350143, + "skip_count": 1.0, + "step": 2682, + "text_loss": 0.16296671330928802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05126953125, + "learning_rate": 0.0008901426220572884, + "loss": 0.0138, + "macro_f1": 1.0, + "num_tokens": 4327494.0, + "repeat_count": 2.0, + "routers_loss": 0.01039006095379591, + "skip_count": 4.0, + "step": 2684, + "text_loss": 0.43866512179374695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0008899489683193286, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 4330936.0, + "repeat_count": 0.0, + "routers_loss": 0.0009329111780971289, + "skip_count": 0.0, + "step": 2686, + "text_loss": 0.44250962138175964 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0008897551651521885, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 4334123.0, + "repeat_count": 0.0, + "routers_loss": 0.003197216661646962, + "skip_count": 0.0, + "step": 2688, + "text_loss": 0.48313501477241516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.629292632814794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09716796875, + "learning_rate": 0.0008895612126301339, + "loss": 0.0157, + "macro_f1": 0.3333333432674408, + "num_tokens": 4337610.0, + "repeat_count": 0.0, + "routers_loss": 0.0033548236824572086, + "skip_count": 0.0, + "step": 2690, + "text_loss": 0.4715327322483063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0008893671108274877, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 4341026.0, + "repeat_count": 0.0, + "routers_loss": 0.0024757643695920706, + "skip_count": 0.0, + "step": 2692, + "text_loss": 0.43402785062789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008891728598186302, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 4344422.0, + "repeat_count": 0.0, + "routers_loss": 0.003317243419587612, + "skip_count": 0.0, + "step": 2694, + "text_loss": 0.8498559594154358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 12.657469914881126, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008889784596779986, + "loss": 0.009, + "macro_f1": 0.5934640765190125, + "num_tokens": 4347507.0, + "repeat_count": 0.0, + "routers_loss": 0.01577926240861416, + "skip_count": 3.0, + "step": 2696, + "text_loss": 0.5646669864654541 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11328125, + "learning_rate": 0.0008887839104800876, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4350414.0, + "repeat_count": 0.0, + "routers_loss": 0.002953822258859873, + "skip_count": 0.0, + "step": 2698, + "text_loss": 0.5145012140274048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008885892122994486, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 4354110.0, + "repeat_count": 0.0, + "routers_loss": 0.005849295295774937, + "skip_count": 0.0, + "step": 2700, + "text_loss": 0.580982506275177 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008883943652106903, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 4357323.0, + "repeat_count": 1.0, + "routers_loss": 0.012347398325800896, + "skip_count": 2.0, + "step": 2702, + "text_loss": 0.2234988808631897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0008881993692884787, + "loss": 0.0128, + "macro_f1": 0.6666666865348816, + "num_tokens": 4360228.0, + "repeat_count": 0.0, + "routers_loss": 0.003574999049305916, + "skip_count": 1.0, + "step": 2704, + "text_loss": 0.4261806607246399 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.704432051658351, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008880042246075365, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4363905.0, + "repeat_count": 0.0, + "routers_loss": 0.0031574300955981016, + "skip_count": 0.0, + "step": 2706, + "text_loss": 0.691118061542511 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008878089312426433, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 4366736.0, + "repeat_count": 0.0, + "routers_loss": 0.003195564029738307, + "skip_count": 0.0, + "step": 2708, + "text_loss": 0.613926112651825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6000000238418579, + "avg_layers": 25.0, + "epoch": 12.72321690636924, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.0, + "f1_skip": 0.75, + "grad_norm": 0.054443359375, + "learning_rate": 0.0008876134892686363, + "loss": 0.011, + "macro_f1": 0.5694444179534912, + "num_tokens": 4370146.0, + "repeat_count": 0.0, + "routers_loss": 0.038784291595220566, + "skip_count": 5.0, + "step": 2710, + "text_loss": 0.2723451852798462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.000887417898760409, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 4373653.0, + "repeat_count": 0.0, + "routers_loss": 0.0006457131239585578, + "skip_count": 0.0, + "step": 2712, + "text_loss": 0.31667640805244446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.742001761080129, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10498046875, + "learning_rate": 0.000887222159792912, + "loss": 0.0155, + "macro_f1": 0.6603773832321167, + "num_tokens": 4376993.0, + "repeat_count": 1.0, + "routers_loss": 0.045078590512275696, + "skip_count": 1.0, + "step": 2714, + "text_loss": 0.5872798562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0008870262724411528, + "loss": 0.012, + "macro_f1": 0.3333333432674408, + "num_tokens": 4380160.0, + "repeat_count": 0.0, + "routers_loss": 0.003628545207902789, + "skip_count": 0.0, + "step": 2716, + "text_loss": 0.7468157410621643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.760786615791018, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0008868302367801962, + "loss": 0.0118, + "macro_f1": 0.6598639488220215, + "num_tokens": 4383100.0, + "repeat_count": 1.0, + "routers_loss": 0.05404464527964592, + "skip_count": 3.0, + "step": 2718, + "text_loss": 0.2970244884490967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008866340528851629, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4386700.0, + "repeat_count": 0.0, + "routers_loss": 0.007000274024903774, + "skip_count": 0.0, + "step": 2720, + "text_loss": 0.34521186351776123 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 12.779571470501908, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052978515625, + "learning_rate": 0.0008864377208312313, + "loss": 0.0082, + "macro_f1": 0.8823530077934265, + "num_tokens": 4390299.0, + "repeat_count": 1.0, + "routers_loss": 0.02025366574525833, + "skip_count": 2.0, + "step": 2722, + "text_loss": 1.0536936521530151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.788963897857352, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.000886241240693636, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 4393353.0, + "repeat_count": 0.0, + "routers_loss": 0.00251673418097198, + "skip_count": 0.0, + "step": 2724, + "text_loss": 0.5678093433380127 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008860446125476686, + "loss": 0.0135, + "macro_f1": 0.6666666865348816, + "num_tokens": 4396446.0, + "repeat_count": 1.0, + "routers_loss": 0.009532532654702663, + "skip_count": 0.0, + "step": 2726, + "text_loss": 0.23775041103363037 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.091796875, + "learning_rate": 0.0008858478364686776, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 4399977.0, + "repeat_count": 1.0, + "routers_loss": 0.008062181062996387, + "skip_count": 0.0, + "step": 2728, + "text_loss": 0.18888695538043976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.817141179923686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008856509125320678, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 4404406.0, + "repeat_count": 0.0, + "routers_loss": 0.0007731119985692203, + "skip_count": 0.0, + "step": 2730, + "text_loss": 0.47331541776657104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008854538408133006, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 4407165.0, + "repeat_count": 0.0, + "routers_loss": 0.003115242812782526, + "skip_count": 1.0, + "step": 2732, + "text_loss": 0.491370290517807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008852566213878947, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4410101.0, + "repeat_count": 0.0, + "routers_loss": 0.0008958528051152825, + "skip_count": 0.0, + "step": 2734, + "text_loss": 0.42188262939453125 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0008850592543314246, + "loss": 0.0118, + "macro_f1": 1.0, + "num_tokens": 4413015.0, + "repeat_count": 1.0, + "routers_loss": 0.01139112375676632, + "skip_count": 1.0, + "step": 2736, + "text_loss": 0.4716498553752899 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.854710889345466, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0008848617397195218, + "loss": 0.0084, + "macro_f1": 0.6603773832321167, + "num_tokens": 4416404.0, + "repeat_count": 1.0, + "routers_loss": 0.01609630137681961, + "skip_count": 1.0, + "step": 2738, + "text_loss": 0.19490821659564972 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008846640776278745, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 4419408.0, + "repeat_count": 0.0, + "routers_loss": 0.001489170710556209, + "skip_count": 0.0, + "step": 2740, + "text_loss": 0.6443108320236206 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0008844662681322269, + "loss": 0.0144, + "macro_f1": 0.6666666865348816, + "num_tokens": 4422067.0, + "repeat_count": 1.0, + "routers_loss": 0.0014755792217329144, + "skip_count": 0.0, + "step": 2742, + "text_loss": 0.9150356650352478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0008842683113083801, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 4425647.0, + "repeat_count": 0.0, + "routers_loss": 0.008962674997746944, + "skip_count": 1.0, + "step": 2744, + "text_loss": 0.7103227972984314 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.892280598767243, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0008840702072321915, + "loss": 0.0104, + "macro_f1": 0.6598639488220215, + "num_tokens": 4428855.0, + "repeat_count": 1.0, + "routers_loss": 0.02554207295179367, + "skip_count": 3.0, + "step": 2746, + "text_loss": 0.27141591906547546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0008838719559795751, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4432838.0, + "repeat_count": 0.0, + "routers_loss": 0.0011747616808861494, + "skip_count": 0.0, + "step": 2748, + "text_loss": 0.4007738530635834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.911065453478134, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008836735576265009, + "loss": 0.0073, + "macro_f1": 0.5492662787437439, + "num_tokens": 4435793.0, + "repeat_count": 0.0, + "routers_loss": 0.017564335837960243, + "skip_count": 2.0, + "step": 2750, + "text_loss": 0.5972410440444946 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0008834750122489956, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 4438871.0, + "repeat_count": 1.0, + "routers_loss": 0.007004009559750557, + "skip_count": 0.0, + "step": 2752, + "text_loss": 0.2294853925704956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008832763199231423, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 4441846.0, + "repeat_count": 0.0, + "routers_loss": 0.0014562139986082911, + "skip_count": 0.0, + "step": 2754, + "text_loss": 0.722432017326355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.939242735544468, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0008830774807250802, + "loss": 0.013, + "macro_f1": 0.3272727429866791, + "num_tokens": 4444786.0, + "repeat_count": 1.0, + "routers_loss": 0.024773593991994858, + "skip_count": 0.0, + "step": 2756, + "text_loss": 0.507905125617981 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 12.948635162899912, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008828784947310049, + "loss": 0.0129, + "macro_f1": 0.8823530077934265, + "num_tokens": 4448442.0, + "repeat_count": 1.0, + "routers_loss": 0.04959975928068161, + "skip_count": 2.0, + "step": 2758, + "text_loss": 0.3617522418498993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.000882679362017168, + "loss": 0.0149, + "macro_f1": 1.0, + "num_tokens": 4451401.0, + "repeat_count": 1.0, + "routers_loss": 0.005783245898783207, + "skip_count": 2.0, + "step": 2760, + "text_loss": 0.49187400937080383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0008824800826598778, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 4454537.0, + "repeat_count": 0.0, + "routers_loss": 0.00656260596588254, + "skip_count": 0.0, + "step": 2762, + "text_loss": 0.6823583245277405 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0008822806567354983, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 4457706.0, + "repeat_count": 1.0, + "routers_loss": 0.005298966076225042, + "skip_count": 0.0, + "step": 2764, + "text_loss": 0.554322361946106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.986204872321691, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008820810843204501, + "loss": 0.0096, + "macro_f1": 0.3272727429866791, + "num_tokens": 4460710.0, + "repeat_count": 0.0, + "routers_loss": 0.03164982795715332, + "skip_count": 1.0, + "step": 2766, + "text_loss": 0.1656961441040039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0008818813654912095, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 4464001.0, + "repeat_count": 0.0, + "routers_loss": 0.000715116853825748, + "skip_count": 0.0, + "step": 2768, + "text_loss": 0.5818144083023071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008816815003243093, + "loss": 0.0133, + "macro_f1": 0.3333333432674408, + "num_tokens": 4467364.0, + "repeat_count": 0.0, + "routers_loss": 0.002851625671610236, + "skip_count": 0.0, + "step": 2770, + "text_loss": 0.6068631410598755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008814814888963383, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 4470681.0, + "repeat_count": 0.0, + "routers_loss": 0.004729873035103083, + "skip_count": 1.0, + "step": 2772, + "text_loss": 0.5386646389961243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.000881281331283941, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 4473734.0, + "repeat_count": 0.0, + "routers_loss": 0.0031853127293288708, + "skip_count": 1.0, + "step": 2774, + "text_loss": 0.5695263147354126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008810810275638182, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4478404.0, + "repeat_count": 0.0, + "routers_loss": 0.0008977465913631022, + "skip_count": 0.0, + "step": 2776, + "text_loss": 0.4750773310661316 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008808805778127269, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4481287.0, + "repeat_count": 0.0, + "routers_loss": 0.00469845999032259, + "skip_count": 0.0, + "step": 2778, + "text_loss": 0.14078612625598907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 13.051658350454945, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008806799821074796, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 4483929.0, + "repeat_count": 0.0, + "routers_loss": 0.01789761893451214, + "skip_count": 2.0, + "step": 2780, + "text_loss": 0.2167191207408905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008804792405249451, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 4487468.0, + "repeat_count": 0.0, + "routers_loss": 0.001018838956952095, + "skip_count": 0.0, + "step": 2782, + "text_loss": 0.5424665212631226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 13.070443205165835, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.07373046875, + "learning_rate": 0.000880278353142048, + "loss": 0.0077, + "macro_f1": 0.8200000524520874, + "num_tokens": 4490942.0, + "repeat_count": 1.0, + "routers_loss": 0.03260354697704315, + "skip_count": 3.0, + "step": 2784, + "text_loss": 0.20994654297828674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008800773200357683, + "loss": 0.0122, + "macro_f1": 0.3333333432674408, + "num_tokens": 4493986.0, + "repeat_count": 0.0, + "routers_loss": 0.003019835101440549, + "skip_count": 0.0, + "step": 2786, + "text_loss": 0.5709528923034668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008798761412831429, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 4498232.0, + "repeat_count": 0.0, + "routers_loss": 0.00285192858427763, + "skip_count": 0.0, + "step": 2788, + "text_loss": 0.5103896260261536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0008796748169612634, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4501231.0, + "repeat_count": 0.0, + "routers_loss": 0.0012469831854104996, + "skip_count": 0.0, + "step": 2790, + "text_loss": 0.43669697642326355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0008794733471472778, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4504208.0, + "repeat_count": 0.0, + "routers_loss": 0.011512776836752892, + "skip_count": 1.0, + "step": 2792, + "text_loss": 0.2299770563840866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008792717319183899, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 4507013.0, + "repeat_count": 0.0, + "routers_loss": 0.00834917277097702, + "skip_count": 0.0, + "step": 2794, + "text_loss": 0.2130603939294815 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008790699713518587, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 4510286.0, + "repeat_count": 0.0, + "routers_loss": 0.008616939187049866, + "skip_count": 2.0, + "step": 2796, + "text_loss": 0.4377101957798004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0008788680655249994, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4513762.0, + "repeat_count": 0.0, + "routers_loss": 0.003408568911254406, + "skip_count": 0.0, + "step": 2798, + "text_loss": 0.435138463973999 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008786660145151826, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4516696.0, + "repeat_count": 1.0, + "routers_loss": 0.0029398901388049126, + "skip_count": 0.0, + "step": 2800, + "text_loss": 0.3195655047893524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008784638183998348, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4519760.0, + "repeat_count": 0.0, + "routers_loss": 0.0013777425047010183, + "skip_count": 0.0, + "step": 2802, + "text_loss": 0.8129430413246155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0008782614772564379, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4522106.0, + "repeat_count": 0.0, + "routers_loss": 0.0031694830395281315, + "skip_count": 0.0, + "step": 2804, + "text_loss": 0.18083660304546356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0008780589911625293, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4525743.0, + "repeat_count": 0.0, + "routers_loss": 0.002161208540201187, + "skip_count": 0.0, + "step": 2806, + "text_loss": 0.8228182792663574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0008778563601957021, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 4529573.0, + "repeat_count": 0.0, + "routers_loss": 0.0028444856870919466, + "skip_count": 1.0, + "step": 2808, + "text_loss": 0.3715563118457794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008776535844336049, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4532452.0, + "repeat_count": 0.0, + "routers_loss": 0.003807213855907321, + "skip_count": 0.0, + "step": 2810, + "text_loss": 0.6012523174285889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0008774506639539417, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 4536077.0, + "repeat_count": 0.0, + "routers_loss": 0.006698979996144772, + "skip_count": 0.0, + "step": 2812, + "text_loss": 0.27097949385643005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0008772475988344722, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 4539057.0, + "repeat_count": 0.0, + "routers_loss": 0.004849409218877554, + "skip_count": 1.0, + "step": 2814, + "text_loss": 1.026973843574524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 13.22072204285295, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008770443891530109, + "loss": 0.0115, + "macro_f1": 0.5934640765190125, + "num_tokens": 4542253.0, + "repeat_count": 0.0, + "routers_loss": 0.019148651510477066, + "skip_count": 3.0, + "step": 2816, + "text_loss": 0.2717585563659668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.230114470208395, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008768410349874286, + "loss": 0.0098, + "macro_f1": 0.6601307392120361, + "num_tokens": 4545047.0, + "repeat_count": 1.0, + "routers_loss": 0.02231316640973091, + "skip_count": 2.0, + "step": 2818, + "text_loss": 0.274346262216568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008766375364156508, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 4548371.0, + "repeat_count": 0.0, + "routers_loss": 0.008014129474759102, + "skip_count": 2.0, + "step": 2820, + "text_loss": 0.22850871086120605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008764338935156586, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4551276.0, + "repeat_count": 0.0, + "routers_loss": 0.0014544493751600385, + "skip_count": 0.0, + "step": 2822, + "text_loss": 0.6308462023735046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000876230106365488, + "loss": 0.0123, + "macro_f1": 0.6666666865348816, + "num_tokens": 4554143.0, + "repeat_count": 0.0, + "routers_loss": 0.00818584579974413, + "skip_count": 3.0, + "step": 2824, + "text_loss": 0.3484207093715668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0008760261750432312, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 4557256.0, + "repeat_count": 0.0, + "routers_loss": 0.006275608204305172, + "skip_count": 3.0, + "step": 2826, + "text_loss": 0.1927330046892166 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008758220996270348, + "loss": 0.0103, + "macro_f1": 1.0, + "num_tokens": 4560202.0, + "repeat_count": 2.0, + "routers_loss": 0.0055974251590669155, + "skip_count": 2.0, + "step": 2828, + "text_loss": 0.7796496748924255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008756178801951007, + "loss": 0.0129, + "macro_f1": 0.3333333432674408, + "num_tokens": 4563508.0, + "repeat_count": 0.0, + "routers_loss": 0.0019799957517534494, + "skip_count": 0.0, + "step": 2830, + "text_loss": 0.49633297324180603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008754135168256865, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4566776.0, + "repeat_count": 0.0, + "routers_loss": 0.004538947716355324, + "skip_count": 0.0, + "step": 2832, + "text_loss": 0.5346745252609253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008752090095971044, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 4569787.0, + "repeat_count": 0.0, + "routers_loss": 0.001663343166001141, + "skip_count": 0.0, + "step": 2834, + "text_loss": 0.5524004697799683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.000875004358587722, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 4572813.0, + "repeat_count": 0.0, + "routers_loss": 0.0022988212294876575, + "skip_count": 0.0, + "step": 2836, + "text_loss": 0.4232870042324066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.000874799563875962, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 4575563.0, + "repeat_count": 0.0, + "routers_loss": 0.007781553082168102, + "skip_count": 1.0, + "step": 2838, + "text_loss": 0.19239822030067444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 13.333431171118287, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03515625, + "learning_rate": 0.0008745946255403021, + "loss": 0.0072, + "macro_f1": 0.5492662787437439, + "num_tokens": 4578117.0, + "repeat_count": 0.0, + "routers_loss": 0.01872488670051098, + "skip_count": 2.0, + "step": 2840, + "text_loss": 0.2148810178041458 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0008743895436592749, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 4582330.0, + "repeat_count": 1.0, + "routers_loss": 0.005634195636957884, + "skip_count": 1.0, + "step": 2842, + "text_loss": 0.4929640591144562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0008741843183114685, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4585765.0, + "repeat_count": 0.0, + "routers_loss": 0.0008928569150157273, + "skip_count": 0.0, + "step": 2844, + "text_loss": 0.32702967524528503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008739789495755253, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4589000.0, + "repeat_count": 0.0, + "routers_loss": 0.014715569093823433, + "skip_count": 4.0, + "step": 2846, + "text_loss": 0.25125816464424133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008737734375301433, + "loss": 0.0135, + "macro_f1": 0.3333333432674408, + "num_tokens": 4592391.0, + "repeat_count": 0.0, + "routers_loss": 0.0017551190685480833, + "skip_count": 0.0, + "step": 2848, + "text_loss": 0.6595172882080078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0008735677822540749, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 4596662.0, + "repeat_count": 0.0, + "routers_loss": 0.0006456313421949744, + "skip_count": 0.0, + "step": 2850, + "text_loss": 0.6290773153305054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0008733619838261276, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 4599682.0, + "repeat_count": 0.0, + "routers_loss": 0.00765060493722558, + "skip_count": 2.0, + "step": 2852, + "text_loss": 0.3268161416053772 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008731560423251637, + "loss": 0.01, + "macro_f1": 1.0, + "num_tokens": 4603324.0, + "repeat_count": 1.0, + "routers_loss": 0.01161442045122385, + "skip_count": 2.0, + "step": 2854, + "text_loss": 0.3029932975769043 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 13.408570589961844, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008729499578301005, + "loss": 0.0098, + "macro_f1": 0.9555556178092957, + "num_tokens": 4606975.0, + "repeat_count": 1.0, + "routers_loss": 0.02055389992892742, + "skip_count": 5.0, + "step": 2856, + "text_loss": 0.6268532872200012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.00087274373041991, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4609629.0, + "repeat_count": 0.0, + "routers_loss": 0.0013911726418882608, + "skip_count": 0.0, + "step": 2858, + "text_loss": 0.534355640411377 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008725373601736188, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 4612913.0, + "repeat_count": 2.0, + "routers_loss": 0.01010701060295105, + "skip_count": 0.0, + "step": 2860, + "text_loss": 0.3391380310058594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0008723308471703085, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4616718.0, + "repeat_count": 0.0, + "routers_loss": 0.005969462916254997, + "skip_count": 1.0, + "step": 2862, + "text_loss": 0.47250816226005554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.446140299383622, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008721241914891152, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4619680.0, + "repeat_count": 0.0, + "routers_loss": 0.0027780034579336643, + "skip_count": 0.0, + "step": 2864, + "text_loss": 0.3249278664588928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008719173932092295, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 4622700.0, + "repeat_count": 0.0, + "routers_loss": 0.0015912104863673449, + "skip_count": 0.0, + "step": 2866, + "text_loss": 0.7789985537528992 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05126953125, + "learning_rate": 0.0008717104524098973, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4626637.0, + "repeat_count": 0.0, + "routers_loss": 0.0036539011634886265, + "skip_count": 0.0, + "step": 2868, + "text_loss": 0.619088351726532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0008715033691704187, + "loss": 0.0118, + "macro_f1": 0.6666666865348816, + "num_tokens": 4629863.0, + "repeat_count": 0.0, + "routers_loss": 0.008402476087212563, + "skip_count": 1.0, + "step": 2870, + "text_loss": 0.5550018548965454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008712961435701479, + "loss": 0.0161, + "macro_f1": 0.6666666865348816, + "num_tokens": 4632657.0, + "repeat_count": 0.0, + "routers_loss": 0.01400839351117611, + "skip_count": 1.0, + "step": 2872, + "text_loss": 0.17368625104427338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008710887756884947, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 4635885.0, + "repeat_count": 0.0, + "routers_loss": 0.0014573842054232955, + "skip_count": 0.0, + "step": 2874, + "text_loss": 0.5138643383979797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008708812656049225, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 4639341.0, + "repeat_count": 0.0, + "routers_loss": 0.002810224425047636, + "skip_count": 1.0, + "step": 2876, + "text_loss": 0.70310378074646 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 13.511887290871735, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008706736133989497, + "loss": 0.0105, + "macro_f1": 0.9449735879898071, + "num_tokens": 4642163.0, + "repeat_count": 2.0, + "routers_loss": 0.029783209785819054, + "skip_count": 4.0, + "step": 2878, + "text_loss": 0.26898008584976196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008704658191501491, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4645858.0, + "repeat_count": 0.0, + "routers_loss": 0.0009193966398015618, + "skip_count": 0.0, + "step": 2880, + "text_loss": 0.6047570705413818 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.530672145582624, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008702578829381475, + "loss": 0.0131, + "macro_f1": 0.8814815282821655, + "num_tokens": 4649237.0, + "repeat_count": 2.0, + "routers_loss": 0.05698608607053757, + "skip_count": 4.0, + "step": 2882, + "text_loss": 0.10695219784975052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0008700498048426269, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4652362.0, + "repeat_count": 0.0, + "routers_loss": 0.0011786938412114978, + "skip_count": 0.0, + "step": 2884, + "text_loss": 0.4442957937717438 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.549457000293513, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008698415849433229, + "loss": 0.0092, + "macro_f1": 0.5492662787437439, + "num_tokens": 4655616.0, + "repeat_count": 2.0, + "routers_loss": 0.02142646163702011, + "skip_count": 0.0, + "step": 2886, + "text_loss": 0.5820964574813843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008696332233200262, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 4659294.0, + "repeat_count": 0.0, + "routers_loss": 0.004038636106997728, + "skip_count": 0.0, + "step": 2888, + "text_loss": 0.11847645789384842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008694247200525806, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4662512.0, + "repeat_count": 0.0, + "routers_loss": 0.0013256469974294305, + "skip_count": 0.0, + "step": 2890, + "text_loss": 0.4873582720756531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.577634282359847, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008692160752208856, + "loss": 0.0129, + "macro_f1": 0.3272727429866791, + "num_tokens": 4666190.0, + "repeat_count": 0.0, + "routers_loss": 0.04477972164750099, + "skip_count": 1.0, + "step": 2892, + "text_loss": 0.44243401288986206 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.0008690072889048941, + "loss": 0.0127, + "macro_f1": 1.0, + "num_tokens": 4668884.0, + "repeat_count": 1.0, + "routers_loss": 0.004407547414302826, + "skip_count": 2.0, + "step": 2894, + "text_loss": 0.6847127079963684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008687983611846133, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4672093.0, + "repeat_count": 0.0, + "routers_loss": 0.005245382897555828, + "skip_count": 1.0, + "step": 2896, + "text_loss": 0.25583332777023315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008685892921401049, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4674917.0, + "repeat_count": 0.0, + "routers_loss": 0.0010470855049788952, + "skip_count": 0.0, + "step": 2898, + "text_loss": 0.41998377442359924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008683800818514844, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4677739.0, + "repeat_count": 0.0, + "routers_loss": 0.009026622399687767, + "skip_count": 2.0, + "step": 2900, + "text_loss": 0.303053081035614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.0008681707303989215, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4680721.0, + "repeat_count": 0.0, + "routers_loss": 0.004500916693359613, + "skip_count": 0.0, + "step": 2902, + "text_loss": 0.5573288798332214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0008679612378626404, + "loss": 0.0098, + "macro_f1": 0.6666666865348816, + "num_tokens": 4683339.0, + "repeat_count": 0.0, + "routers_loss": 0.005047840531915426, + "skip_count": 1.0, + "step": 2904, + "text_loss": 0.321353554725647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.643381273847961, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008677516043229187, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 4686453.0, + "repeat_count": 0.0, + "routers_loss": 0.010256914421916008, + "skip_count": 1.0, + "step": 2906, + "text_loss": 0.4300784468650818 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008675418298600883, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 4689645.0, + "repeat_count": 1.0, + "routers_loss": 0.0022669637110084295, + "skip_count": 0.0, + "step": 2908, + "text_loss": 0.5064885020256042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008673319145545358, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4692320.0, + "repeat_count": 0.0, + "routers_loss": 0.0011188550852239132, + "skip_count": 0.0, + "step": 2910, + "text_loss": 0.7114819884300232 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008671218584867003, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 4695116.0, + "repeat_count": 0.0, + "routers_loss": 0.002966561820358038, + "skip_count": 2.0, + "step": 2912, + "text_loss": 0.5662392973899841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0008669116617370762, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4698040.0, + "repeat_count": 0.0, + "routers_loss": 0.0012894890969619155, + "skip_count": 0.0, + "step": 2914, + "text_loss": 0.718977689743042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0008667013243862111, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 4700963.0, + "repeat_count": 0.0, + "routers_loss": 0.0007232456118799746, + "skip_count": 0.0, + "step": 2916, + "text_loss": 0.3447718024253845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.699735837980628, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.000866490846514707, + "loss": 0.0075, + "macro_f1": 0.3272727429866791, + "num_tokens": 4704471.0, + "repeat_count": 1.0, + "routers_loss": 0.015166680328547955, + "skip_count": 0.0, + "step": 2918, + "text_loss": 0.454946368932724 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.000866280228203219, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 4707238.0, + "repeat_count": 1.0, + "routers_loss": 0.0061312485486269, + "skip_count": 1.0, + "step": 2920, + "text_loss": 0.721788227558136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008660694695324564, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 4711323.0, + "repeat_count": 0.0, + "routers_loss": 0.00169933564029634, + "skip_count": 0.0, + "step": 2922, + "text_loss": 0.7562121748924255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008658585705831829, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 4714417.0, + "repeat_count": 0.0, + "routers_loss": 0.0022731393110007048, + "skip_count": 0.0, + "step": 2924, + "text_loss": 0.5726147890090942 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.737305547402407, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0008656475314362148, + "loss": 0.0131, + "macro_f1": 0.8817967176437378, + "num_tokens": 4717445.0, + "repeat_count": 2.0, + "routers_loss": 0.06477782875299454, + "skip_count": 3.0, + "step": 2926, + "text_loss": 0.4505867660045624 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 13.74669797475785, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.06396484375, + "learning_rate": 0.0008654363521724229, + "loss": 0.0129, + "macro_f1": 0.9449735879898071, + "num_tokens": 4722253.0, + "repeat_count": 2.0, + "routers_loss": 0.027405790984630585, + "skip_count": 4.0, + "step": 2928, + "text_loss": 0.24767601490020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0008652250328727315, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 4725465.0, + "repeat_count": 0.0, + "routers_loss": 0.006544729229062796, + "skip_count": 2.0, + "step": 2930, + "text_loss": 0.4478724002838135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008650135736181184, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 4729213.0, + "repeat_count": 1.0, + "routers_loss": 0.0055119614116847515, + "skip_count": 0.0, + "step": 2932, + "text_loss": 0.6749323010444641 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008648019744896154, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 4732280.0, + "repeat_count": 0.0, + "routers_loss": 0.008374541997909546, + "skip_count": 0.0, + "step": 2934, + "text_loss": 0.4647359251976013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.78426768417963, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0008645902355683077, + "loss": 0.0091, + "macro_f1": 0.6595745086669922, + "num_tokens": 4736244.0, + "repeat_count": 1.0, + "routers_loss": 0.068686343729496, + "skip_count": 4.0, + "step": 2936, + "text_loss": 0.5356017351150513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008643783569353339, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4739810.0, + "repeat_count": 2.0, + "routers_loss": 0.017954571172595024, + "skip_count": 0.0, + "step": 2938, + "text_loss": 0.3145926296710968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.803052538890519, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.0008641663386718863, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4742720.0, + "repeat_count": 0.0, + "routers_loss": 0.006261351052671671, + "skip_count": 1.0, + "step": 2940, + "text_loss": 0.3200613856315613 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008639541808592109, + "loss": 0.0093, + "macro_f1": 1.0, + "num_tokens": 4745870.0, + "repeat_count": 1.0, + "routers_loss": 0.0025341357104480267, + "skip_count": 1.0, + "step": 2942, + "text_loss": 0.5020416378974915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0008637418835786067, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4748943.0, + "repeat_count": 0.0, + "routers_loss": 0.008970048278570175, + "skip_count": 2.0, + "step": 2944, + "text_loss": 0.14517110586166382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008635294469114265, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 4751360.0, + "repeat_count": 0.0, + "routers_loss": 0.002133632078766823, + "skip_count": 0.0, + "step": 2946, + "text_loss": 0.5367856025695801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0008633168709390766, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4754403.0, + "repeat_count": 0.0, + "routers_loss": 0.0011866620043292642, + "skip_count": 0.0, + "step": 2948, + "text_loss": 0.38302522897720337 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0008631041557430163, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 4757867.0, + "repeat_count": 2.0, + "routers_loss": 0.0026854004245251417, + "skip_count": 0.0, + "step": 2950, + "text_loss": 0.43433454632759094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0008628913014047585, + "loss": 0.0102, + "macro_f1": 0.3333333432674408, + "num_tokens": 4761171.0, + "repeat_count": 0.0, + "routers_loss": 0.002433479530736804, + "skip_count": 0.0, + "step": 2952, + "text_loss": 0.4725971519947052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008626783080058696, + "loss": 0.0066, + "macro_f1": 0.3272727429866791, + "num_tokens": 4764752.0, + "repeat_count": 1.0, + "routers_loss": 0.017182493582367897, + "skip_count": 0.0, + "step": 2954, + "text_loss": 0.460641473531723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0008624651756279687, + "loss": 0.0198, + "macro_f1": 0.3333333432674408, + "num_tokens": 4767453.0, + "repeat_count": 0.0, + "routers_loss": 0.0018134774873033166, + "skip_count": 0.0, + "step": 2956, + "text_loss": 0.4091459810733795 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.887584385089522, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.000862251904352729, + "loss": 0.0108, + "macro_f1": 0.9259259104728699, + "num_tokens": 4771110.0, + "repeat_count": 3.0, + "routers_loss": 0.0365753099322319, + "skip_count": 3.0, + "step": 2958, + "text_loss": 0.22408585250377655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.896976812444967, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.000862038494261876, + "loss": 0.0109, + "macro_f1": 0.3272727429866791, + "num_tokens": 4774464.0, + "repeat_count": 0.0, + "routers_loss": 0.024343067780137062, + "skip_count": 1.0, + "step": 2960, + "text_loss": 0.16483014822006226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008618249454371891, + "loss": 0.01, + "macro_f1": 0.3333333432674408, + "num_tokens": 4777894.0, + "repeat_count": 0.0, + "routers_loss": 0.0008310087723657489, + "skip_count": 0.0, + "step": 2962, + "text_loss": 0.5573428869247437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008616112579605006, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4781116.0, + "repeat_count": 0.0, + "routers_loss": 0.0065494864247739315, + "skip_count": 0.0, + "step": 2964, + "text_loss": 0.18816794455051422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0008613974319136957, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 4784886.0, + "repeat_count": 0.0, + "routers_loss": 0.0019726944155991077, + "skip_count": 0.0, + "step": 2966, + "text_loss": 0.5097305774688721 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0008611834673787134, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 4787563.0, + "repeat_count": 0.0, + "routers_loss": 0.006327496841549873, + "skip_count": 0.0, + "step": 2968, + "text_loss": 0.6953814029693604 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.94393894922219, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.5, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0008609693644375449, + "loss": 0.0086, + "macro_f1": 0.8200000524520874, + "num_tokens": 4790421.0, + "repeat_count": 3.0, + "routers_loss": 0.042896661907434464, + "skip_count": 1.0, + "step": 2970, + "text_loss": 0.2573051154613495 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 13.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.14453125, + "learning_rate": 0.000860755123172235, + "loss": 0.0096, + "macro_f1": 1.0, + "num_tokens": 4793786.0, + "repeat_count": 2.0, + "routers_loss": 0.013228793628513813, + "skip_count": 1.0, + "step": 2972, + "text_loss": 0.46614497900009155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008605407436648815, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4796864.0, + "repeat_count": 0.0, + "routers_loss": 0.007294759154319763, + "skip_count": 2.0, + "step": 2974, + "text_loss": 0.21555091440677643 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008603262259976348, + "loss": 0.0129, + "macro_f1": 1.0, + "num_tokens": 4800080.0, + "repeat_count": 1.0, + "routers_loss": 0.0024024227168411016, + "skip_count": 5.0, + "step": 2976, + "text_loss": 0.7855485081672668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0008601115702526987, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4802899.0, + "repeat_count": 0.0, + "routers_loss": 0.001433031284250319, + "skip_count": 0.0, + "step": 2978, + "text_loss": 0.6777765154838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008598967765123293, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 4805835.0, + "repeat_count": 0.0, + "routers_loss": 0.003073975909501314, + "skip_count": 0.0, + "step": 2980, + "text_loss": 0.5926910638809204 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 14.0, + "f1_execute": 0.9333333373069763, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008596818448588364, + "loss": 0.0139, + "macro_f1": 0.8666667342185974, + "num_tokens": 4809028.0, + "repeat_count": 1.0, + "routers_loss": 0.06438573449850082, + "skip_count": 6.0, + "step": 2982, + "text_loss": 0.23975612223148346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.009392427355445, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0008594667753745821, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 4812831.0, + "repeat_count": 0.0, + "routers_loss": 0.014817612245678902, + "skip_count": 1.0, + "step": 2984, + "text_loss": 0.17292268574237823 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.018784854710889, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0008592515681419813, + "loss": 0.0078, + "macro_f1": 0.5492662787437439, + "num_tokens": 4816005.0, + "repeat_count": 2.0, + "routers_loss": 0.025407327339053154, + "skip_count": 0.0, + "step": 2986, + "text_loss": 0.6403061151504517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0008590362232435018, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4818901.0, + "repeat_count": 0.0, + "routers_loss": 0.006826757453382015, + "skip_count": 0.0, + "step": 2988, + "text_loss": 0.2572069466114044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008588207407616644, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 4823120.0, + "repeat_count": 0.0, + "routers_loss": 0.0009054148104041815, + "skip_count": 0.0, + "step": 2990, + "text_loss": 0.4827076196670532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0008586051207790422, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 4825774.0, + "repeat_count": 0.0, + "routers_loss": 0.0012294676853343844, + "skip_count": 0.0, + "step": 2992, + "text_loss": 0.40157821774482727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 14.056354564132668, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052734375, + "learning_rate": 0.0008583893633782612, + "loss": 0.0084, + "macro_f1": 0.5492662787437439, + "num_tokens": 4828841.0, + "repeat_count": 0.0, + "routers_loss": 0.011474622413516045, + "skip_count": 2.0, + "step": 2994, + "text_loss": 0.14842072129249573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.058837890625, + "learning_rate": 0.0008581734686419999, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4831458.0, + "repeat_count": 0.0, + "routers_loss": 0.009154081344604492, + "skip_count": 2.0, + "step": 2996, + "text_loss": 0.365400105714798 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00085795743665299, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4834609.0, + "repeat_count": 0.0, + "routers_loss": 0.002899336162954569, + "skip_count": 0.0, + "step": 2998, + "text_loss": 0.5574684143066406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008577412674940152, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4838324.0, + "repeat_count": 0.0, + "routers_loss": 0.0034664268605411053, + "skip_count": 0.0, + "step": 3000, + "text_loss": 0.6752855777740479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008575249612479117, + "loss": 0.0127, + "macro_f1": 0.6666666865348816, + "num_tokens": 4841877.0, + "repeat_count": 0.0, + "routers_loss": 0.0036425739526748657, + "skip_count": 2.0, + "step": 3002, + "text_loss": 0.6332980394363403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.0008573085179975685, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4845840.0, + "repeat_count": 0.0, + "routers_loss": 0.0013783496106043458, + "skip_count": 0.0, + "step": 3004, + "text_loss": 0.4219617545604706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008570919378259274, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4848766.0, + "repeat_count": 0.0, + "routers_loss": 0.004823608323931694, + "skip_count": 1.0, + "step": 3006, + "text_loss": 0.7987180948257446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000856875220815982, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4852310.0, + "repeat_count": 0.0, + "routers_loss": 0.0014760984340682626, + "skip_count": 0.0, + "step": 3008, + "text_loss": 0.35592713952064514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008566583670507788, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4856146.0, + "repeat_count": 0.0, + "routers_loss": 0.0031717263627797365, + "skip_count": 1.0, + "step": 3010, + "text_loss": 0.19379083812236786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008564413766134164, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 4859386.0, + "repeat_count": 0.0, + "routers_loss": 0.003361492184922099, + "skip_count": 0.0, + "step": 3012, + "text_loss": 0.39129266142845154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0008562242495870463, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4862661.0, + "repeat_count": 0.0, + "routers_loss": 0.0010563990799710155, + "skip_count": 0.0, + "step": 3014, + "text_loss": 0.5966938734054565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0008560069860548716, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 4865410.0, + "repeat_count": 0.0, + "routers_loss": 0.001233913702890277, + "skip_count": 0.0, + "step": 3016, + "text_loss": 0.3386077880859375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008557895861001484, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 4868931.0, + "repeat_count": 0.0, + "routers_loss": 0.0018066301709041, + "skip_count": 0.0, + "step": 3018, + "text_loss": 0.5222050547599792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008555720498061845, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4873492.0, + "repeat_count": 0.0, + "routers_loss": 0.0050385501235723495, + "skip_count": 1.0, + "step": 3020, + "text_loss": 0.4558849334716797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.187848547108894, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008553543772563403, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 4877026.0, + "repeat_count": 0.0, + "routers_loss": 0.004828717093914747, + "skip_count": 0.0, + "step": 3022, + "text_loss": 0.36598992347717285 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 14.197240974464338, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.06103515625, + "learning_rate": 0.0008551365685340285, + "loss": 0.0084, + "macro_f1": 0.9555556178092957, + "num_tokens": 4879655.0, + "repeat_count": 1.0, + "routers_loss": 0.02049369551241398, + "skip_count": 5.0, + "step": 3024, + "text_loss": 0.5069093704223633 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 14.206633401819783, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008549186237227138, + "loss": 0.0088, + "macro_f1": 0.8823530077934265, + "num_tokens": 4882606.0, + "repeat_count": 1.0, + "routers_loss": 0.03947242721915245, + "skip_count": 2.0, + "step": 3026, + "text_loss": 0.2600715458393097 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 14.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0008547005429059128, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 4885246.0, + "repeat_count": 2.0, + "routers_loss": 0.0026363315992057323, + "skip_count": 0.0, + "step": 3028, + "text_loss": 0.37642326951026917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008544823261671948, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 4888109.0, + "repeat_count": 0.0, + "routers_loss": 0.003858231008052826, + "skip_count": 0.0, + "step": 3030, + "text_loss": 0.5875385999679565 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 14.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0008542639735901804, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 4891168.0, + "repeat_count": 1.0, + "routers_loss": 0.004789089784026146, + "skip_count": 1.0, + "step": 3032, + "text_loss": 0.6417325139045715 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.244203111241562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008540454852585434, + "loss": 0.0115, + "macro_f1": 0.6666666865348816, + "num_tokens": 4894355.0, + "repeat_count": 0.0, + "routers_loss": 0.007334680762141943, + "skip_count": 2.0, + "step": 3034, + "text_loss": 0.23697198927402496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 14.253595538597006, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008538268612560084, + "loss": 0.0058, + "macro_f1": 0.4871794879436493, + "num_tokens": 4897543.0, + "repeat_count": 0.0, + "routers_loss": 0.022096361964941025, + "skip_count": 3.0, + "step": 3036, + "text_loss": 0.1989550143480301 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0008536081016663527, + "loss": 0.0101, + "macro_f1": 1.0, + "num_tokens": 4900752.0, + "repeat_count": 1.0, + "routers_loss": 0.0037680594250559807, + "skip_count": 2.0, + "step": 3038, + "text_loss": 0.5001366138458252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008533892065734055, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4903581.0, + "repeat_count": 0.0, + "routers_loss": 0.0032373068388551474, + "skip_count": 1.0, + "step": 3040, + "text_loss": 0.5019411444664001 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008531701760610476, + "loss": 0.0121, + "macro_f1": 1.0, + "num_tokens": 4907108.0, + "repeat_count": 1.0, + "routers_loss": 0.0078013185411691666, + "skip_count": 2.0, + "step": 3042, + "text_loss": 0.3460627794265747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 14.291165248018785, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.04833984375, + "learning_rate": 0.000852951010213212, + "loss": 0.0089, + "macro_f1": 0.8200000524520874, + "num_tokens": 4911269.0, + "repeat_count": 1.0, + "routers_loss": 0.03576689213514328, + "skip_count": 3.0, + "step": 3044, + "text_loss": 0.268994003534317 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 14.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0008527317091138835, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 4914203.0, + "repeat_count": 1.0, + "routers_loss": 0.0032140621915459633, + "skip_count": 1.0, + "step": 3046, + "text_loss": 0.9998719692230225 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008525122728470987, + "loss": 0.0102, + "macro_f1": 1.0, + "num_tokens": 4918562.0, + "repeat_count": 1.0, + "routers_loss": 0.008559177629649639, + "skip_count": 3.0, + "step": 3048, + "text_loss": 0.3062439560890198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0008522927014969459, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 4921940.0, + "repeat_count": 0.0, + "routers_loss": 0.008735597133636475, + "skip_count": 2.0, + "step": 3050, + "text_loss": 0.3637430965900421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0008520729951475652, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 4925416.0, + "repeat_count": 0.0, + "routers_loss": 0.0012709591537714005, + "skip_count": 0.0, + "step": 3052, + "text_loss": 0.542036235332489 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008518531538831488, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4928695.0, + "repeat_count": 0.0, + "routers_loss": 0.0010660928674042225, + "skip_count": 1.0, + "step": 3054, + "text_loss": 0.43144503235816956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.059326171875, + "learning_rate": 0.00085163317778794, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4931504.0, + "repeat_count": 0.0, + "routers_loss": 0.004558971151709557, + "skip_count": 2.0, + "step": 3056, + "text_loss": 0.5257010459899902 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008514130669462341, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4934935.0, + "repeat_count": 0.0, + "routers_loss": 0.010774781927466393, + "skip_count": 2.0, + "step": 3058, + "text_loss": 0.26061776280403137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.366304666862343, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008511928214423782, + "loss": 0.0103, + "macro_f1": 0.6601307392120361, + "num_tokens": 4938047.0, + "repeat_count": 1.0, + "routers_loss": 0.014763157814741135, + "skip_count": 2.0, + "step": 3060, + "text_loss": 0.2856905460357666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0008509724413607705, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 4941041.0, + "repeat_count": 1.0, + "routers_loss": 0.004613345488905907, + "skip_count": 0.0, + "step": 3062, + "text_loss": 0.2870287001132965 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008507519267858612, + "loss": 0.015, + "macro_f1": 1.0, + "num_tokens": 4944708.0, + "repeat_count": 1.0, + "routers_loss": 0.008584189228713512, + "skip_count": 2.0, + "step": 3064, + "text_loss": 0.15828095376491547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0008505312778021519, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 4948295.0, + "repeat_count": 0.0, + "routers_loss": 0.0014670816017314792, + "skip_count": 0.0, + "step": 3066, + "text_loss": 0.36697930097579956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0008503104944941958, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 4951983.0, + "repeat_count": 0.0, + "routers_loss": 0.005348859820514917, + "skip_count": 2.0, + "step": 3068, + "text_loss": 0.21612997353076935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008500895769465972, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 4955023.0, + "repeat_count": 0.0, + "routers_loss": 0.0013203793205320835, + "skip_count": 0.0, + "step": 3070, + "text_loss": 0.9757798314094543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.422659230995011, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008498685252440124, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 4957600.0, + "repeat_count": 0.0, + "routers_loss": 0.006907356437295675, + "skip_count": 0.0, + "step": 3072, + "text_loss": 0.356107234954834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.432051658350455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0008496473394711487, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 4960746.0, + "repeat_count": 0.0, + "routers_loss": 0.0027704904787242413, + "skip_count": 1.0, + "step": 3074, + "text_loss": 0.6812908053398132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0008494260197127649, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 4963845.0, + "repeat_count": 0.0, + "routers_loss": 0.0036796489730477333, + "skip_count": 2.0, + "step": 3076, + "text_loss": 0.7215370535850525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0008492045660536712, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 4966887.0, + "repeat_count": 0.0, + "routers_loss": 0.0037137691397219896, + "skip_count": 1.0, + "step": 3078, + "text_loss": 0.8700299859046936 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 14.460228940416789, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008489829785787291, + "loss": 0.0078, + "macro_f1": 0.8823530077934265, + "num_tokens": 4969859.0, + "repeat_count": 1.0, + "routers_loss": 0.016492314636707306, + "skip_count": 2.0, + "step": 3080, + "text_loss": 0.6520360112190247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008487612573728513, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4972628.0, + "repeat_count": 0.0, + "routers_loss": 0.004022917244583368, + "skip_count": 2.0, + "step": 3082, + "text_loss": 0.17498187720775604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008485394025210016, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 4975475.0, + "repeat_count": 0.0, + "routers_loss": 0.009141159243881702, + "skip_count": 1.0, + "step": 3084, + "text_loss": 0.5975366234779358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008483174141081956, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4978858.0, + "repeat_count": 0.0, + "routers_loss": 0.0031561285723000765, + "skip_count": 0.0, + "step": 3086, + "text_loss": 0.18748866021633148 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008480952922194991, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4982142.0, + "repeat_count": 0.0, + "routers_loss": 0.0007894713780842721, + "skip_count": 0.0, + "step": 3088, + "text_loss": 0.42083197832107544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008478730369400302, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4984872.0, + "repeat_count": 0.0, + "routers_loss": 0.0005908289458602667, + "skip_count": 0.0, + "step": 3090, + "text_loss": 0.45337188243865967 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0008476506483549573, + "loss": 0.0101, + "macro_f1": 1.0, + "num_tokens": 4988137.0, + "repeat_count": 1.0, + "routers_loss": 0.0016509373672306538, + "skip_count": 2.0, + "step": 3092, + "text_loss": 0.6397262811660767 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0008474281265495002, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 4991164.0, + "repeat_count": 0.0, + "routers_loss": 0.004088304936885834, + "skip_count": 1.0, + "step": 3094, + "text_loss": 0.18352322280406952 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008472054716089295, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 4993876.0, + "repeat_count": 0.0, + "routers_loss": 0.005200014915317297, + "skip_count": 0.0, + "step": 3096, + "text_loss": 0.2776511013507843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.544760786615791, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0008469826836185673, + "loss": 0.01, + "macro_f1": 0.6601307392120361, + "num_tokens": 4997068.0, + "repeat_count": 1.0, + "routers_loss": 0.012686059810221195, + "skip_count": 2.0, + "step": 3098, + "text_loss": 0.23209233582019806 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008467597626637858, + "loss": 0.0074, + "macro_f1": 1.0, + "num_tokens": 5000038.0, + "repeat_count": 1.0, + "routers_loss": 0.006401528604328632, + "skip_count": 2.0, + "step": 3100, + "text_loss": 0.45936745405197144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008465367088300093, + "loss": 0.0075, + "macro_f1": 0.3272727429866791, + "num_tokens": 5002870.0, + "repeat_count": 0.0, + "routers_loss": 0.016640547662973404, + "skip_count": 1.0, + "step": 3102, + "text_loss": 0.44502779841423035 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0008463135222027124, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 5006357.0, + "repeat_count": 0.0, + "routers_loss": 0.008411331102252007, + "skip_count": 2.0, + "step": 3104, + "text_loss": 0.3414570391178131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008460902028674204, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5009059.0, + "repeat_count": 0.0, + "routers_loss": 0.0010406570509076118, + "skip_count": 0.0, + "step": 3106, + "text_loss": 0.5931221842765808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0008458667509097098, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 5012327.0, + "repeat_count": 0.0, + "routers_loss": 0.001959054498001933, + "skip_count": 0.0, + "step": 3108, + "text_loss": 0.5191171169281006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008456431664152078, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 5015472.0, + "repeat_count": 0.0, + "routers_loss": 0.000994380097836256, + "skip_count": 0.0, + "step": 3110, + "text_loss": 0.4455361068248749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0008454194494695923, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 5018901.0, + "repeat_count": 0.0, + "routers_loss": 0.0037662344984710217, + "skip_count": 0.0, + "step": 3112, + "text_loss": 0.5335362553596497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 14.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0008451956001585923, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5022520.0, + "repeat_count": 0.0, + "routers_loss": 0.008664715103805065, + "skip_count": 3.0, + "step": 3114, + "text_loss": 0.16230148077011108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.629292632814794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.000844971618567987, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 5025505.0, + "repeat_count": 0.0, + "routers_loss": 0.0015904927859082818, + "skip_count": 0.0, + "step": 3116, + "text_loss": 0.6989432573318481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008447475047836068, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 5028767.0, + "repeat_count": 0.0, + "routers_loss": 0.005853322334587574, + "skip_count": 1.0, + "step": 3118, + "text_loss": 0.31420737504959106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 14.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008445232588913325, + "loss": 0.0115, + "macro_f1": 0.3272727429866791, + "num_tokens": 5032577.0, + "repeat_count": 0.0, + "routers_loss": 0.012760105542838573, + "skip_count": 0.0, + "step": 3120, + "text_loss": 0.5534627437591553 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008442988809770953, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 5035381.0, + "repeat_count": 0.0, + "routers_loss": 0.0022257440723478794, + "skip_count": 0.0, + "step": 3122, + "text_loss": 0.42492759227752686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008440743711268775, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 5038743.0, + "repeat_count": 0.0, + "routers_loss": 0.004648433532565832, + "skip_count": 0.0, + "step": 3124, + "text_loss": 0.16404685378074646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008438497294267117, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5041492.0, + "repeat_count": 0.0, + "routers_loss": 0.006313877180218697, + "skip_count": 0.0, + "step": 3126, + "text_loss": 0.23191484808921814 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0008436249559626807, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 5043955.0, + "repeat_count": 1.0, + "routers_loss": 0.0036270488053560257, + "skip_count": 0.0, + "step": 3128, + "text_loss": 0.5782018303871155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008434000508209187, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5047571.0, + "repeat_count": 0.0, + "routers_loss": 0.003809858812019229, + "skip_count": 1.0, + "step": 3130, + "text_loss": 0.7129825949668884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.704432051658351, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008431750140876092, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 5051608.0, + "repeat_count": 0.0, + "routers_loss": 0.0022369057405740023, + "skip_count": 0.0, + "step": 3132, + "text_loss": 0.4433445930480957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.000842949845848987, + "loss": 0.0135, + "macro_f1": 0.32098764181137085, + "num_tokens": 5054656.0, + "repeat_count": 0.0, + "routers_loss": 0.0425117202103138, + "skip_count": 2.0, + "step": 3134, + "text_loss": 0.38721024990081787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0008427245461913368, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 5059108.0, + "repeat_count": 0.0, + "routers_loss": 0.0018077283166348934, + "skip_count": 0.0, + "step": 3136, + "text_loss": 0.7496368885040283 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.12109375, + "learning_rate": 0.0008424991152009941, + "loss": 0.0111, + "macro_f1": 1.0, + "num_tokens": 5062371.0, + "repeat_count": 1.0, + "routers_loss": 0.008801834657788277, + "skip_count": 2.0, + "step": 3138, + "text_loss": 0.5337086319923401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 14.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0008422735529643444, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5065593.0, + "repeat_count": 0.0, + "routers_loss": 0.00548676960170269, + "skip_count": 3.0, + "step": 3140, + "text_loss": 0.2561623156070709 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0008420478595678233, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5068271.0, + "repeat_count": 0.0, + "routers_loss": 0.006389956455677748, + "skip_count": 0.0, + "step": 3142, + "text_loss": 0.15605193376541138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0008418220350979175, + "loss": 0.0128, + "macro_f1": 1.0, + "num_tokens": 5071358.0, + "repeat_count": 1.0, + "routers_loss": 0.012387622147798538, + "skip_count": 2.0, + "step": 3144, + "text_loss": 0.3085838258266449 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008415960796411628, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5075584.0, + "repeat_count": 0.0, + "routers_loss": 0.00311864772811532, + "skip_count": 1.0, + "step": 3146, + "text_loss": 0.4786977469921112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.779571470501908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0008413699932841461, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5078388.0, + "repeat_count": 0.0, + "routers_loss": 0.0030679800547659397, + "skip_count": 0.0, + "step": 3148, + "text_loss": 0.5222916603088379 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.788963897857352, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008411437761135039, + "loss": 0.011, + "macro_f1": 1.0, + "num_tokens": 5081584.0, + "repeat_count": 1.0, + "routers_loss": 0.012907958589494228, + "skip_count": 2.0, + "step": 3150, + "text_loss": 0.5369884371757507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0008409174282159232, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 5084450.0, + "repeat_count": 0.0, + "routers_loss": 0.012314042076468468, + "skip_count": 2.0, + "step": 3152, + "text_loss": 0.25685277581214905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.000840690949678141, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 5087865.0, + "repeat_count": 1.0, + "routers_loss": 0.00899206381291151, + "skip_count": 0.0, + "step": 3154, + "text_loss": 0.1717093288898468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.817141179923686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0008404643405869441, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 5090857.0, + "repeat_count": 0.0, + "routers_loss": 0.0013312003575265408, + "skip_count": 0.0, + "step": 3156, + "text_loss": 0.27446436882019043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0008402376010291695, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 5093917.0, + "repeat_count": 0.0, + "routers_loss": 0.002653320087119937, + "skip_count": 0.0, + "step": 3158, + "text_loss": 0.4237489402294159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008400107310917045, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5096656.0, + "repeat_count": 0.0, + "routers_loss": 0.012976993806660175, + "skip_count": 2.0, + "step": 3160, + "text_loss": 0.42361980676651 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.000839783730861486, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5099582.0, + "repeat_count": 0.0, + "routers_loss": 0.006936746649444103, + "skip_count": 2.0, + "step": 3162, + "text_loss": 0.26656073331832886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008395566004255008, + "loss": 0.0127, + "macro_f1": 0.6666666865348816, + "num_tokens": 5102908.0, + "repeat_count": 0.0, + "routers_loss": 0.006619359832257032, + "skip_count": 1.0, + "step": 3164, + "text_loss": 0.590774416923523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0008393293398707858, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 5105829.0, + "repeat_count": 0.0, + "routers_loss": 0.010120268911123276, + "skip_count": 2.0, + "step": 3166, + "text_loss": 0.605930507183075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008391019492844275, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 5109850.0, + "repeat_count": 0.0, + "routers_loss": 0.004940980114042759, + "skip_count": 2.0, + "step": 3168, + "text_loss": 0.12973152101039886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0008388744287535627, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 5113353.0, + "repeat_count": 0.0, + "routers_loss": 0.0031777634285390377, + "skip_count": 1.0, + "step": 3170, + "text_loss": 0.18577200174331665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0008386467783653775, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 5116421.0, + "repeat_count": 0.0, + "routers_loss": 0.005431659985333681, + "skip_count": 0.0, + "step": 3172, + "text_loss": 0.2302747517824173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 14.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.000838418998207108, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5119457.0, + "repeat_count": 0.0, + "routers_loss": 0.0077286697924137115, + "skip_count": 4.0, + "step": 3174, + "text_loss": 0.19606637954711914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0008381910883660399, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5123201.0, + "repeat_count": 0.0, + "routers_loss": 0.003982985392212868, + "skip_count": 0.0, + "step": 3176, + "text_loss": 0.716376006603241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0008379630489295089, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 5126035.0, + "repeat_count": 0.0, + "routers_loss": 0.005626026075333357, + "skip_count": 1.0, + "step": 3178, + "text_loss": 0.5144625902175903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008377348799849, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 5129179.0, + "repeat_count": 0.0, + "routers_loss": 0.015458245761692524, + "skip_count": 2.0, + "step": 3180, + "text_loss": 0.29887503385543823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 14.939242735544468, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008375065816196479, + "loss": 0.0086, + "macro_f1": 0.5492662787437439, + "num_tokens": 5132149.0, + "repeat_count": 0.0, + "routers_loss": 0.012210468761622906, + "skip_count": 2.0, + "step": 3182, + "text_loss": 0.8981851935386658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008372781539212371, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5135287.0, + "repeat_count": 0.0, + "routers_loss": 0.0052537876181304455, + "skip_count": 0.0, + "step": 3184, + "text_loss": 0.4245666563510895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0008370495969772014, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5138589.0, + "repeat_count": 0.0, + "routers_loss": 0.012873421423137188, + "skip_count": 2.0, + "step": 3186, + "text_loss": 0.40581050515174866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 14.9674200176108, + "f1_execute": 0.95652174949646, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0008368209108751244, + "loss": 0.0127, + "macro_f1": 0.6521739363670349, + "num_tokens": 5141635.0, + "repeat_count": 2.0, + "routers_loss": 0.07720445841550827, + "skip_count": 4.0, + "step": 3188, + "text_loss": 0.3755173981189728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0008365920957026389, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5144728.0, + "repeat_count": 0.0, + "routers_loss": 0.001440995605662465, + "skip_count": 0.0, + "step": 3190, + "text_loss": 0.5067034363746643 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.986204872321691, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008363631515474275, + "loss": 0.0089, + "macro_f1": 0.6538461446762085, + "num_tokens": 5147963.0, + "repeat_count": 1.0, + "routers_loss": 0.018752984702587128, + "skip_count": 2.0, + "step": 3192, + "text_loss": 0.20224551856517792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0008361340784972217, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 5151184.0, + "repeat_count": 0.0, + "routers_loss": 0.0005360354552976787, + "skip_count": 0.0, + "step": 3194, + "text_loss": 0.4588058292865753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008359048766398031, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5153889.0, + "repeat_count": 0.0, + "routers_loss": 0.0009184491937048733, + "skip_count": 1.0, + "step": 3196, + "text_loss": 0.2980220317840576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.000835675546063002, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5156758.0, + "repeat_count": 0.0, + "routers_loss": 0.001252970308996737, + "skip_count": 0.0, + "step": 3198, + "text_loss": 0.6775755882263184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008354460868546985, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 5160247.0, + "repeat_count": 0.0, + "routers_loss": 0.0037315806839615107, + "skip_count": 0.0, + "step": 3200, + "text_loss": 0.35867011547088623 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0008352164991028217, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 5163456.0, + "repeat_count": 1.0, + "routers_loss": 0.001497485558502376, + "skip_count": 0.0, + "step": 3202, + "text_loss": 0.690290093421936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008349867828953501, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 5166139.0, + "repeat_count": 0.0, + "routers_loss": 0.001051135826855898, + "skip_count": 0.0, + "step": 3204, + "text_loss": 0.3340415954589844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008347569383203113, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 5169009.0, + "repeat_count": 0.0, + "routers_loss": 0.0010544003453105688, + "skip_count": 0.0, + "step": 3206, + "text_loss": 0.8584878444671631 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008345269654657823, + "loss": 0.0085, + "macro_f1": 1.0, + "num_tokens": 5172618.0, + "repeat_count": 1.0, + "routers_loss": 0.007312417030334473, + "skip_count": 1.0, + "step": 3208, + "text_loss": 0.19500218331813812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008342968644198892, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 5175857.0, + "repeat_count": 0.0, + "routers_loss": 0.00276504410430789, + "skip_count": 0.0, + "step": 3210, + "text_loss": 0.5446314215660095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0008340666352708068, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 5178585.0, + "repeat_count": 0.0, + "routers_loss": 0.002669303445145488, + "skip_count": 0.0, + "step": 3212, + "text_loss": 0.3687484860420227 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008338362781067596, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5181777.0, + "repeat_count": 0.0, + "routers_loss": 0.0031585274264216423, + "skip_count": 0.0, + "step": 3214, + "text_loss": 0.27325859665870667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.000833605793016021, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 5184312.0, + "repeat_count": 0.0, + "routers_loss": 0.008807534351944923, + "skip_count": 2.0, + "step": 3216, + "text_loss": 0.4466548562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008333751800869133, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5187497.0, + "repeat_count": 0.0, + "routers_loss": 0.003171310294419527, + "skip_count": 0.0, + "step": 3218, + "text_loss": 0.5423526763916016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0008331444394078076, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 5190982.0, + "repeat_count": 0.0, + "routers_loss": 0.0016481258207932115, + "skip_count": 2.0, + "step": 3220, + "text_loss": 0.48984917998313904 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000832913571067124, + "loss": 0.0107, + "macro_f1": 1.0, + "num_tokens": 5194044.0, + "repeat_count": 1.0, + "routers_loss": 0.003957313951104879, + "skip_count": 1.0, + "step": 3222, + "text_loss": 0.4533331096172333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008326825751533322, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5197092.0, + "repeat_count": 0.0, + "routers_loss": 0.0016904744552448392, + "skip_count": 0.0, + "step": 3224, + "text_loss": 0.5538802742958069 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0008324514517549501, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5199941.0, + "repeat_count": 0.0, + "routers_loss": 0.005608258303254843, + "skip_count": 1.0, + "step": 3226, + "text_loss": 0.416242778301239 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 15.154975051364836, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008322202009605444, + "loss": 0.0072, + "macro_f1": 0.8823530077934265, + "num_tokens": 5202618.0, + "repeat_count": 1.0, + "routers_loss": 0.020965175703167915, + "skip_count": 2.0, + "step": 3228, + "text_loss": 0.17496295273303986 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 15.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008319888228587311, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 5206414.0, + "repeat_count": 1.0, + "routers_loss": 0.021259209141135216, + "skip_count": 5.0, + "step": 3230, + "text_loss": 0.22471418976783752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0008317573175381745, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 5209768.0, + "repeat_count": 0.0, + "routers_loss": 0.0018647604156285524, + "skip_count": 0.0, + "step": 3232, + "text_loss": 0.4415269196033478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008315256850875881, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5213257.0, + "repeat_count": 0.0, + "routers_loss": 0.002345515415072441, + "skip_count": 0.0, + "step": 3234, + "text_loss": 0.347247838973999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 15.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008312939255957336, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 5215800.0, + "repeat_count": 0.0, + "routers_loss": 0.007112892810255289, + "skip_count": 3.0, + "step": 3236, + "text_loss": 0.31091734766960144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008310620391514219, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5219205.0, + "repeat_count": 0.0, + "routers_loss": 0.00432228296995163, + "skip_count": 0.0, + "step": 3238, + "text_loss": 0.3421775996685028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0008308300258435124, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 5222422.0, + "repeat_count": 0.0, + "routers_loss": 0.0076514314860105515, + "skip_count": 2.0, + "step": 3240, + "text_loss": 0.22378318011760712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008305978857609128, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 5225625.0, + "repeat_count": 0.0, + "routers_loss": 0.0007617069641128182, + "skip_count": 0.0, + "step": 3242, + "text_loss": 0.5880323648452759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0008303656189925799, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5229113.0, + "repeat_count": 0.0, + "routers_loss": 0.0017418119823560119, + "skip_count": 0.0, + "step": 3244, + "text_loss": 0.3302813768386841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008301332256275183, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5232061.0, + "repeat_count": 0.0, + "routers_loss": 0.0026667986530810595, + "skip_count": 0.0, + "step": 3246, + "text_loss": 0.5679706335067749 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0008299007057547821, + "loss": 0.0106, + "macro_f1": 1.0, + "num_tokens": 5235279.0, + "repeat_count": 1.0, + "routers_loss": 0.011016624979674816, + "skip_count": 2.0, + "step": 3248, + "text_loss": 0.5081504583358765 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008296680594634731, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 5239655.0, + "repeat_count": 1.0, + "routers_loss": 0.005492044147104025, + "skip_count": 0.0, + "step": 3250, + "text_loss": 0.14675180613994598 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0008294352868427418, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5243579.0, + "repeat_count": 0.0, + "routers_loss": 0.00404445780441165, + "skip_count": 1.0, + "step": 3252, + "text_loss": 0.4201085865497589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0008292023879817871, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 5247059.0, + "repeat_count": 0.0, + "routers_loss": 0.006886140909045935, + "skip_count": 1.0, + "step": 3254, + "text_loss": 0.2289208322763443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008289693629698564, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 5249940.0, + "repeat_count": 0.0, + "routers_loss": 0.0005736657767556608, + "skip_count": 0.0, + "step": 3256, + "text_loss": 0.5670450925827026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.295861461696507, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0008287362118962452, + "loss": 0.006, + "macro_f1": 0.3272727429866791, + "num_tokens": 5253580.0, + "repeat_count": 0.0, + "routers_loss": 0.011349895037710667, + "skip_count": 1.0, + "step": 3258, + "text_loss": 0.5042323470115662 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0008285029348502973, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5257080.0, + "repeat_count": 0.0, + "routers_loss": 0.0013626761501654983, + "skip_count": 0.0, + "step": 3260, + "text_loss": 0.3227672874927521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0008282695319214053, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5259951.0, + "repeat_count": 0.0, + "routers_loss": 0.00471635302528739, + "skip_count": 0.0, + "step": 3262, + "text_loss": 0.20773714780807495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008280360031990093, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 5263314.0, + "repeat_count": 0.0, + "routers_loss": 0.010472415015101433, + "skip_count": 2.0, + "step": 3264, + "text_loss": 0.34397366642951965 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.333431171118287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.000827802348772598, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 5267358.0, + "repeat_count": 0.0, + "routers_loss": 0.0007814752752892673, + "skip_count": 0.0, + "step": 3266, + "text_loss": 0.747342586517334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008275685687317084, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5270400.0, + "repeat_count": 0.0, + "routers_loss": 0.000902949133887887, + "skip_count": 0.0, + "step": 3268, + "text_loss": 0.43782034516334534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008273346631659252, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5273147.0, + "repeat_count": 0.0, + "routers_loss": 0.00043462219764478505, + "skip_count": 0.0, + "step": 3270, + "text_loss": 0.6358205080032349 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008271006321648816, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 5277638.0, + "repeat_count": 0.0, + "routers_loss": 0.002211218234151602, + "skip_count": 0.0, + "step": 3272, + "text_loss": 0.20220105350017548 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008268664758182589, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5280638.0, + "repeat_count": 1.0, + "routers_loss": 0.010536720044910908, + "skip_count": 0.0, + "step": 3274, + "text_loss": 0.7579061388969421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008266321942157859, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5283847.0, + "repeat_count": 0.0, + "routers_loss": 0.0017158017726615071, + "skip_count": 0.0, + "step": 3276, + "text_loss": 0.669302761554718 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.389785735250953, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008263977874472399, + "loss": 0.0088, + "macro_f1": 0.9544159770011902, + "num_tokens": 5286627.0, + "repeat_count": 5.0, + "routers_loss": 0.011220700107514858, + "skip_count": 4.0, + "step": 3278, + "text_loss": 0.8703984022140503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008261632556024461, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5289766.0, + "repeat_count": 0.0, + "routers_loss": 0.0020442772656679153, + "skip_count": 0.0, + "step": 3280, + "text_loss": 0.5009346008300781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.0008259285987712774, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5293010.0, + "repeat_count": 0.0, + "routers_loss": 0.005645765457302332, + "skip_count": 0.0, + "step": 3282, + "text_loss": 0.2546011209487915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008256938170436549, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 5296732.0, + "repeat_count": 0.0, + "routers_loss": 0.0027385836001485586, + "skip_count": 2.0, + "step": 3284, + "text_loss": 0.5244000554084778 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008254589105095473, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 5299926.0, + "repeat_count": 1.0, + "routers_loss": 0.007451715879142284, + "skip_count": 1.0, + "step": 3286, + "text_loss": 0.28979742527008057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0008252238792589711, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5303006.0, + "repeat_count": 0.0, + "routers_loss": 0.004805843345820904, + "skip_count": 2.0, + "step": 3288, + "text_loss": 0.5131978392601013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000824988723381991, + "loss": 0.0091, + "macro_f1": 0.3272727429866791, + "num_tokens": 5306953.0, + "repeat_count": 0.0, + "routers_loss": 0.010639613494277, + "skip_count": 1.0, + "step": 3290, + "text_loss": 0.4901447296142578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 15.455532726739067, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008247534429687191, + "loss": 0.007, + "macro_f1": 0.5492662787437439, + "num_tokens": 5310516.0, + "repeat_count": 0.0, + "routers_loss": 0.013625577092170715, + "skip_count": 2.0, + "step": 3292, + "text_loss": 0.2124534696340561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008245180381093152, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 5313959.0, + "repeat_count": 0.0, + "routers_loss": 0.004958513658493757, + "skip_count": 1.0, + "step": 3294, + "text_loss": 0.46682238578796387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008242825088939867, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5316609.0, + "repeat_count": 0.0, + "routers_loss": 0.003962756600230932, + "skip_count": 0.0, + "step": 3296, + "text_loss": 0.7010108232498169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008240468554129892, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5319638.0, + "repeat_count": 0.0, + "routers_loss": 0.0006996620795689523, + "skip_count": 0.0, + "step": 3298, + "text_loss": 0.4966355860233307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0008238110777566255, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 5323019.0, + "repeat_count": 0.0, + "routers_loss": 0.0016031896229833364, + "skip_count": 0.0, + "step": 3300, + "text_loss": 0.38668957352638245 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0008235751760152459, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 5326099.0, + "repeat_count": 2.0, + "routers_loss": 0.00344281829893589, + "skip_count": 2.0, + "step": 3302, + "text_loss": 0.5330720543861389 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008233391502792484, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5328993.0, + "repeat_count": 0.0, + "routers_loss": 0.007886730134487152, + "skip_count": 1.0, + "step": 3304, + "text_loss": 0.5470269322395325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008231030006390786, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 5331554.0, + "repeat_count": 0.0, + "routers_loss": 0.008180000819265842, + "skip_count": 1.0, + "step": 3306, + "text_loss": 0.4023340344429016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0008228667271852294, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 5335712.0, + "repeat_count": 0.0, + "routers_loss": 0.0002942821884062141, + "skip_count": 0.0, + "step": 3308, + "text_loss": 0.5306711792945862 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008226303300082414, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5338701.0, + "repeat_count": 0.0, + "routers_loss": 0.0006134595023468137, + "skip_count": 0.0, + "step": 3310, + "text_loss": 0.5906263589859009 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0008223938091987022, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5342274.0, + "repeat_count": 0.0, + "routers_loss": 0.0016656654188409448, + "skip_count": 0.0, + "step": 3312, + "text_loss": 0.5201764106750488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008221571648472472, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5345185.0, + "repeat_count": 0.0, + "routers_loss": 0.0038612703792750835, + "skip_count": 0.0, + "step": 3314, + "text_loss": 0.36633720993995667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.568241855004402, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008219203970445589, + "loss": 0.011, + "macro_f1": 0.3272727429866791, + "num_tokens": 5348804.0, + "repeat_count": 0.0, + "routers_loss": 0.009782899171113968, + "skip_count": 1.0, + "step": 3316, + "text_loss": 0.3117460012435913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.577634282359847, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008216835058813672, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 5351896.0, + "repeat_count": 0.0, + "routers_loss": 0.007713229861110449, + "skip_count": 0.0, + "step": 3318, + "text_loss": 0.253496378660202 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008214464914484492, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 5355058.0, + "repeat_count": 0.0, + "routers_loss": 0.006227815989404917, + "skip_count": 2.0, + "step": 3320, + "text_loss": 0.32693132758140564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008212093538366292, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 5358365.0, + "repeat_count": 0.0, + "routers_loss": 0.002601418411359191, + "skip_count": 0.0, + "step": 3322, + "text_loss": 0.40394455194473267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 15.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000820972093136779, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5360981.0, + "repeat_count": 0.0, + "routers_loss": 0.005545300897210836, + "skip_count": 3.0, + "step": 3324, + "text_loss": 0.6758295893669128 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0008207347094398172, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 5364018.0, + "repeat_count": 1.0, + "routers_loss": 0.001924700103700161, + "skip_count": 0.0, + "step": 3326, + "text_loss": 0.5196860432624817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0008204972028367097, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5366986.0, + "repeat_count": 0.0, + "routers_loss": 0.012254828587174416, + "skip_count": 1.0, + "step": 3328, + "text_loss": 0.24661913514137268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0008202595734184694, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5371463.0, + "repeat_count": 0.0, + "routers_loss": 0.005094083491712809, + "skip_count": 0.0, + "step": 3330, + "text_loss": 0.2525769770145416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.643381273847961, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008200218212761566, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 5374823.0, + "repeat_count": 1.0, + "routers_loss": 0.0025883198250085115, + "skip_count": 0.0, + "step": 3332, + "text_loss": 0.21849912405014038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.000819783946500878, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5377640.0, + "repeat_count": 0.0, + "routers_loss": 0.008240507915616035, + "skip_count": 0.0, + "step": 3334, + "text_loss": 0.2662734091281891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 15.66216612855885, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.050537109375, + "learning_rate": 0.000819545949183788, + "loss": 0.01, + "macro_f1": 0.5934640765190125, + "num_tokens": 5380593.0, + "repeat_count": 0.0, + "routers_loss": 0.038378193974494934, + "skip_count": 3.0, + "step": 3336, + "text_loss": 0.2431795746088028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008193078294160874, + "loss": 0.0097, + "macro_f1": 1.0, + "num_tokens": 5384487.0, + "repeat_count": 1.0, + "routers_loss": 0.005926199723035097, + "skip_count": 1.0, + "step": 3338, + "text_loss": 0.5663705468177795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0008190695872890242, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5387511.0, + "repeat_count": 0.0, + "routers_loss": 0.010842559859156609, + "skip_count": 2.0, + "step": 3340, + "text_loss": 0.11517292261123657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008188312228938933, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 5390698.0, + "repeat_count": 0.0, + "routers_loss": 0.001304097007960081, + "skip_count": 0.0, + "step": 3342, + "text_loss": 0.4827076196670532 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008185927363220363, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5393778.0, + "repeat_count": 1.0, + "routers_loss": 0.005354117136448622, + "skip_count": 0.0, + "step": 3344, + "text_loss": 0.44467049837112427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008183541276648418, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5396925.0, + "repeat_count": 0.0, + "routers_loss": 0.004800073802471161, + "skip_count": 2.0, + "step": 3346, + "text_loss": 0.2032834142446518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0008181153970137449, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 5400522.0, + "repeat_count": 0.0, + "routers_loss": 0.0021674633026123047, + "skip_count": 0.0, + "step": 3348, + "text_loss": 0.4507528841495514 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.727913120046962, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0008178765444602278, + "loss": 0.0117, + "macro_f1": 0.8820862174034119, + "num_tokens": 5403526.0, + "repeat_count": 2.0, + "routers_loss": 0.04263930395245552, + "skip_count": 2.0, + "step": 3350, + "text_loss": 0.3606615960597992 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008176375700958194, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5407127.0, + "repeat_count": 1.0, + "routers_loss": 0.006953123956918716, + "skip_count": 0.0, + "step": 3352, + "text_loss": 0.2290353775024414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008173984740120948, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 5410829.0, + "repeat_count": 0.0, + "routers_loss": 0.0014363783411681652, + "skip_count": 0.0, + "step": 3354, + "text_loss": 0.4220392405986786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0008171592563006762, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5414152.0, + "repeat_count": 0.0, + "routers_loss": 0.00202389364130795, + "skip_count": 1.0, + "step": 3356, + "text_loss": 0.37729766964912415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008169199170532323, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 5417312.0, + "repeat_count": 0.0, + "routers_loss": 0.006253739818930626, + "skip_count": 2.0, + "step": 3358, + "text_loss": 0.1304289996623993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0008166804563614785, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 5421227.0, + "repeat_count": 2.0, + "routers_loss": 0.01622140221297741, + "skip_count": 2.0, + "step": 3360, + "text_loss": 0.298664391040802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0008164408743171763, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 5424646.0, + "repeat_count": 1.0, + "routers_loss": 0.0037176944315433502, + "skip_count": 2.0, + "step": 3362, + "text_loss": 0.12147632241249084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008162011710121339, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 5427897.0, + "repeat_count": 0.0, + "routers_loss": 0.0020403533708304167, + "skip_count": 1.0, + "step": 3364, + "text_loss": 0.2656533420085907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.803052538890519, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008159613465382066, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5430474.0, + "repeat_count": 0.0, + "routers_loss": 0.0018634048756211996, + "skip_count": 0.0, + "step": 3366, + "text_loss": 0.9133086204528809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0008157214009872951, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5433113.0, + "repeat_count": 0.0, + "routers_loss": 0.012944488786160946, + "skip_count": 2.0, + "step": 3368, + "text_loss": 0.24352453649044037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05712890625, + "learning_rate": 0.0008154813344513472, + "loss": 0.0143, + "macro_f1": 0.6666666865348816, + "num_tokens": 5436259.0, + "repeat_count": 0.0, + "routers_loss": 0.002347963862121105, + "skip_count": 2.0, + "step": 3370, + "text_loss": 0.7601244449615479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0008152411470223568, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 5439126.0, + "repeat_count": 0.0, + "routers_loss": 0.0016609140438959002, + "skip_count": 0.0, + "step": 3372, + "text_loss": 0.5551947355270386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008150008387923643, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 5442739.0, + "repeat_count": 0.0, + "routers_loss": 0.008321396075189114, + "skip_count": 0.0, + "step": 3374, + "text_loss": 0.25028282403945923 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 15.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.000814760409853456, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 5445247.0, + "repeat_count": 2.0, + "routers_loss": 0.009738070890307426, + "skip_count": 1.0, + "step": 3376, + "text_loss": 0.37271201610565186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008145198602977651, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5449044.0, + "repeat_count": 0.0, + "routers_loss": 0.0028421466704458, + "skip_count": 0.0, + "step": 3378, + "text_loss": 0.1458655595779419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.868799530378633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0008142791902174701, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 5453063.0, + "repeat_count": 0.0, + "routers_loss": 0.0015170135302469134, + "skip_count": 0.0, + "step": 3380, + "text_loss": 0.5548722743988037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0008140383997047966, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 5455814.0, + "repeat_count": 0.0, + "routers_loss": 0.0022444510832428932, + "skip_count": 1.0, + "step": 3382, + "text_loss": 0.8034513592720032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000813797488852016, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5459392.0, + "repeat_count": 0.0, + "routers_loss": 0.00038578867679461837, + "skip_count": 0.0, + "step": 3384, + "text_loss": 0.6940088868141174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.896976812444967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008135564577514458, + "loss": 0.011, + "macro_f1": 0.3333333432674408, + "num_tokens": 5462413.0, + "repeat_count": 0.0, + "routers_loss": 0.0019727381877601147, + "skip_count": 0.0, + "step": 3386, + "text_loss": 0.5124650597572327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0008133153064954495, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 5465552.0, + "repeat_count": 0.0, + "routers_loss": 0.0019896167796105146, + "skip_count": 0.0, + "step": 3388, + "text_loss": 0.4292517900466919 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008130740351764367, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 5468573.0, + "repeat_count": 1.0, + "routers_loss": 0.0030118159484118223, + "skip_count": 1.0, + "step": 3390, + "text_loss": 0.48903173208236694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.000812832643886863, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5471547.0, + "repeat_count": 0.0, + "routers_loss": 0.005084246397018433, + "skip_count": 2.0, + "step": 3392, + "text_loss": 0.35789889097213745 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008125911327192299, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5474331.0, + "repeat_count": 0.0, + "routers_loss": 0.0008874498889781535, + "skip_count": 0.0, + "step": 3394, + "text_loss": 0.6267408728599548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008123495017660851, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5477633.0, + "repeat_count": 0.0, + "routers_loss": 0.001794386887922883, + "skip_count": 0.0, + "step": 3396, + "text_loss": 0.3701885938644409 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008121077511200221, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5481277.0, + "repeat_count": 0.0, + "routers_loss": 0.002140481723472476, + "skip_count": 0.0, + "step": 3398, + "text_loss": 0.6362857818603516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.00081186588087368, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 5484237.0, + "repeat_count": 0.0, + "routers_loss": 0.000867189432028681, + "skip_count": 0.0, + "step": 3400, + "text_loss": 1.0847382545471191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008116238911197442, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5487423.0, + "repeat_count": 0.0, + "routers_loss": 0.0029817656613886356, + "skip_count": 0.0, + "step": 3402, + "text_loss": 0.3813740313053131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008113817819509454, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5490155.0, + "repeat_count": 0.0, + "routers_loss": 0.0035141287371516228, + "skip_count": 0.0, + "step": 3404, + "text_loss": 0.2113083451986313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0008111395534600603, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5493415.0, + "repeat_count": 0.0, + "routers_loss": 0.003317659953609109, + "skip_count": 0.0, + "step": 3406, + "text_loss": 0.5869330167770386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008108972057399114, + "loss": 0.0123, + "macro_f1": 0.6666666865348816, + "num_tokens": 5496032.0, + "repeat_count": 0.0, + "routers_loss": 0.003833734430372715, + "skip_count": 2.0, + "step": 3408, + "text_loss": 0.2938928008079529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11328125, + "learning_rate": 0.0008106547388833669, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 5498890.0, + "repeat_count": 0.0, + "routers_loss": 0.002622978063300252, + "skip_count": 1.0, + "step": 3410, + "text_loss": 0.3130980432033539 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008104121529833402, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 5502010.0, + "repeat_count": 1.0, + "routers_loss": 0.007447598036378622, + "skip_count": 0.0, + "step": 3412, + "text_loss": 0.4413072466850281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.000810169448132791, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5505212.0, + "repeat_count": 0.0, + "routers_loss": 0.0031087708193808794, + "skip_count": 1.0, + "step": 3414, + "text_loss": 0.2910428047180176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.037569709421778, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008099266244247243, + "loss": 0.0082, + "macro_f1": 0.3272727429866791, + "num_tokens": 5508755.0, + "repeat_count": 0.0, + "routers_loss": 0.02510393038392067, + "skip_count": 1.0, + "step": 3416, + "text_loss": 0.33022749423980713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008096836819521903, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5512034.0, + "repeat_count": 0.0, + "routers_loss": 0.0020537273958325386, + "skip_count": 1.0, + "step": 3418, + "text_loss": 0.4731218218803406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0008094406208082853, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5515707.0, + "repeat_count": 0.0, + "routers_loss": 0.004218162503093481, + "skip_count": 2.0, + "step": 3420, + "text_loss": 0.23429590463638306 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 16.065746991488112, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0869140625, + "learning_rate": 0.0008091974410861507, + "loss": 0.0069, + "macro_f1": 0.9265305995941162, + "num_tokens": 5518436.0, + "repeat_count": 1.0, + "routers_loss": 0.013488355092704296, + "skip_count": 3.0, + "step": 3422, + "text_loss": 0.45768749713897705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008089541428789733, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5522368.0, + "repeat_count": 0.0, + "routers_loss": 0.0010335417464375496, + "skip_count": 1.0, + "step": 3424, + "text_loss": 0.43423423171043396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0008087107262799855, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 5526061.0, + "repeat_count": 0.0, + "routers_loss": 0.002134323585778475, + "skip_count": 0.0, + "step": 3426, + "text_loss": 0.4031757414340973 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.0008084671913824651, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5529284.0, + "repeat_count": 0.0, + "routers_loss": 0.0097216060385108, + "skip_count": 2.0, + "step": 3428, + "text_loss": 0.2836039960384369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.000808223538279735, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 5532159.0, + "repeat_count": 0.0, + "routers_loss": 0.001684269867837429, + "skip_count": 0.0, + "step": 3430, + "text_loss": 0.5804527401924133 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008079797670651637, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 5536050.0, + "repeat_count": 1.0, + "routers_loss": 0.013918434269726276, + "skip_count": 1.0, + "step": 3432, + "text_loss": 0.31325826048851013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008077358778321647, + "loss": 0.011, + "macro_f1": 0.3333333432674408, + "num_tokens": 5538885.0, + "repeat_count": 0.0, + "routers_loss": 0.0007751787197776139, + "skip_count": 0.0, + "step": 3434, + "text_loss": 0.783108115196228 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.131493982976224, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008074918706741966, + "loss": 0.0063, + "macro_f1": 0.9262410998344421, + "num_tokens": 5541909.0, + "repeat_count": 3.0, + "routers_loss": 0.021819550544023514, + "skip_count": 2.0, + "step": 3436, + "text_loss": 0.6558083295822144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.14088641033167, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0008072477456847638, + "loss": 0.0057, + "macro_f1": 0.3272727429866791, + "num_tokens": 5545101.0, + "repeat_count": 1.0, + "routers_loss": 0.03309348225593567, + "skip_count": 0.0, + "step": 3438, + "text_loss": 0.9877075552940369 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008070035029574151, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 5548971.0, + "repeat_count": 1.0, + "routers_loss": 0.008696741424500942, + "skip_count": 1.0, + "step": 3440, + "text_loss": 0.24766330420970917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.000806759142585745, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 5552174.0, + "repeat_count": 0.0, + "routers_loss": 0.004240929149091244, + "skip_count": 3.0, + "step": 3442, + "text_loss": 0.37255001068115234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008065146646633927, + "loss": 0.0088, + "macro_f1": 0.6666666865348816, + "num_tokens": 5555005.0, + "repeat_count": 0.0, + "routers_loss": 0.014345484785735607, + "skip_count": 1.0, + "step": 3444, + "text_loss": 0.26157206296920776 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008062700692840428, + "loss": 0.0083, + "macro_f1": 1.0, + "num_tokens": 5559127.0, + "repeat_count": 1.0, + "routers_loss": 0.008315163664519787, + "skip_count": 2.0, + "step": 3446, + "text_loss": 0.21971040964126587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 16.187848547108892, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008060253565414246, + "loss": 0.009, + "macro_f1": 0.5934640765190125, + "num_tokens": 5562254.0, + "repeat_count": 0.0, + "routers_loss": 0.009582413360476494, + "skip_count": 3.0, + "step": 3448, + "text_loss": 0.6758295893669128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0008057805265293124, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 5565515.0, + "repeat_count": 0.0, + "routers_loss": 0.002429503947496414, + "skip_count": 0.0, + "step": 3450, + "text_loss": 0.696592390537262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008055355793415257, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5568392.0, + "repeat_count": 0.0, + "routers_loss": 0.0007724192109890282, + "skip_count": 0.0, + "step": 3452, + "text_loss": 0.7092870473861694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008052905150719285, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 5571090.0, + "repeat_count": 0.0, + "routers_loss": 0.0010859938338398933, + "skip_count": 0.0, + "step": 3454, + "text_loss": 0.6593860387802124 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008050453338144301, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 5574552.0, + "repeat_count": 1.0, + "routers_loss": 0.0030258705373853445, + "skip_count": 1.0, + "step": 3456, + "text_loss": 0.3479384481906891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008048000356629844, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 5577484.0, + "repeat_count": 0.0, + "routers_loss": 0.005052885971963406, + "skip_count": 2.0, + "step": 3458, + "text_loss": 0.21858671307563782 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0008045546207115901, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 5581605.0, + "repeat_count": 1.0, + "routers_loss": 0.009976249188184738, + "skip_count": 3.0, + "step": 3460, + "text_loss": 0.16868001222610474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0008043090890542904, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5584994.0, + "repeat_count": 0.0, + "routers_loss": 0.00270817126147449, + "skip_count": 0.0, + "step": 3462, + "text_loss": 0.785690426826477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008040634407851739, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 5588067.0, + "repeat_count": 0.0, + "routers_loss": 0.0018436965765431523, + "skip_count": 0.0, + "step": 3464, + "text_loss": 0.5006644129753113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0008038176759983731, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5590789.0, + "repeat_count": 0.0, + "routers_loss": 0.008516279980540276, + "skip_count": 2.0, + "step": 3466, + "text_loss": 0.20963478088378906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0008035717947880659, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 5593472.0, + "repeat_count": 0.0, + "routers_loss": 0.0016293043736368418, + "skip_count": 0.0, + "step": 3468, + "text_loss": 0.7376078963279724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0008033257972484742, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5596108.0, + "repeat_count": 0.0, + "routers_loss": 0.002364142332226038, + "skip_count": 0.0, + "step": 3470, + "text_loss": 0.5156455039978027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008030796834738649, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5599103.0, + "repeat_count": 0.0, + "routers_loss": 0.008872323669493198, + "skip_count": 0.0, + "step": 3472, + "text_loss": 0.2996419668197632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008028334535585491, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5602410.0, + "repeat_count": 0.0, + "routers_loss": 0.011508257128298283, + "skip_count": 3.0, + "step": 3474, + "text_loss": 0.25438693165779114 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0008025871075968827, + "loss": 0.0106, + "macro_f1": 1.0, + "num_tokens": 5605424.0, + "repeat_count": 2.0, + "routers_loss": 0.017225435003638268, + "skip_count": 2.0, + "step": 3476, + "text_loss": 0.2549574077129364 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.328734957440563, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008023406456832657, + "loss": 0.0111, + "macro_f1": 0.9262410998344421, + "num_tokens": 5608266.0, + "repeat_count": 3.0, + "routers_loss": 0.039165645837783813, + "skip_count": 2.0, + "step": 3478, + "text_loss": 0.1797947734594345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0008020940679121429, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5611471.0, + "repeat_count": 0.0, + "routers_loss": 0.0009718866203911602, + "skip_count": 0.0, + "step": 3480, + "text_loss": 0.8267702460289001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008018473743780036, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5615046.0, + "repeat_count": 0.0, + "routers_loss": 0.006087122485041618, + "skip_count": 2.0, + "step": 3482, + "text_loss": 0.7267677187919617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000801600565175381, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5618350.0, + "repeat_count": 0.0, + "routers_loss": 0.0007539413054473698, + "skip_count": 0.0, + "step": 3484, + "text_loss": 0.5910211801528931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008013536403988529, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 5621381.0, + "repeat_count": 0.0, + "routers_loss": 0.0008076327503658831, + "skip_count": 0.0, + "step": 3486, + "text_loss": 0.30616798996925354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 16.375697094217788, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008011066001430412, + "loss": 0.0086, + "macro_f1": 0.6122449040412903, + "num_tokens": 5624617.0, + "repeat_count": 0.0, + "routers_loss": 0.023835813626646996, + "skip_count": 4.0, + "step": 3488, + "text_loss": 0.3376443088054657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008008594445026122, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 5627989.0, + "repeat_count": 0.0, + "routers_loss": 0.004226419143378735, + "skip_count": 2.0, + "step": 3490, + "text_loss": 0.8185343146324158 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.394481948928675, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008006121735722767, + "loss": 0.0084, + "macro_f1": 0.32098764181137085, + "num_tokens": 5632286.0, + "repeat_count": 0.0, + "routers_loss": 0.0366671048104763, + "skip_count": 2.0, + "step": 3492, + "text_loss": 0.2209547609090805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008003647874467892, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 5635368.0, + "repeat_count": 1.0, + "routers_loss": 0.012956378981471062, + "skip_count": 0.0, + "step": 3494, + "text_loss": 0.20468664169311523 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0008001172862209485, + "loss": 0.0103, + "macro_f1": 0.6666666865348816, + "num_tokens": 5638440.0, + "repeat_count": 1.0, + "routers_loss": 0.0017375422175973654, + "skip_count": 0.0, + "step": 3496, + "text_loss": 0.6647221446037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 16.42265923099501, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0007998696699895976, + "loss": 0.0091, + "macro_f1": 0.6592592597007751, + "num_tokens": 5641996.0, + "repeat_count": 1.0, + "routers_loss": 0.025240756571292877, + "skip_count": 5.0, + "step": 3498, + "text_loss": 0.23892143368721008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0007996219388476236, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5645071.0, + "repeat_count": 0.0, + "routers_loss": 0.007436830550432205, + "skip_count": 1.0, + "step": 3500, + "text_loss": 0.7580804228782654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0007993740928899571, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 5648175.0, + "repeat_count": 0.0, + "routers_loss": 0.001126602990552783, + "skip_count": 0.0, + "step": 3502, + "text_loss": 0.5281378626823425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0007991261322115737, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 5650973.0, + "repeat_count": 0.0, + "routers_loss": 0.0007907263352535665, + "skip_count": 0.0, + "step": 3504, + "text_loss": 0.25220927596092224 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.000798878056907492, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 5654252.0, + "repeat_count": 2.0, + "routers_loss": 0.006263538729399443, + "skip_count": 2.0, + "step": 3506, + "text_loss": 0.46569153666496277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0007986298670727752, + "loss": 0.0098, + "macro_f1": 0.6666666865348816, + "num_tokens": 5657229.0, + "repeat_count": 0.0, + "routers_loss": 0.004049144219607115, + "skip_count": 3.0, + "step": 3508, + "text_loss": 0.15174436569213867 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 16.479013795127678, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0791015625, + "learning_rate": 0.0007983815628025301, + "loss": 0.0074, + "macro_f1": 0.9262410998344421, + "num_tokens": 5659974.0, + "repeat_count": 2.0, + "routers_loss": 0.0471976138651371, + "skip_count": 3.0, + "step": 3510, + "text_loss": 0.39072203636169434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.488406222483125, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000798133144191907, + "loss": 0.0082, + "macro_f1": 0.3272727429866791, + "num_tokens": 5662893.0, + "repeat_count": 0.0, + "routers_loss": 0.04030488431453705, + "skip_count": 1.0, + "step": 3512, + "text_loss": 0.3562147617340088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0007978846113361009, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 5666476.0, + "repeat_count": 0.0, + "routers_loss": 0.007475079502910376, + "skip_count": 1.0, + "step": 3514, + "text_loss": 0.26518192887306213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0007976359643303497, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 5669647.0, + "repeat_count": 0.0, + "routers_loss": 0.00558585487306118, + "skip_count": 2.0, + "step": 3516, + "text_loss": 0.29284560680389404 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007973872032699354, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 5673491.0, + "repeat_count": 1.0, + "routers_loss": 0.0026981087867170572, + "skip_count": 1.0, + "step": 3518, + "text_loss": 0.35089045763015747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.000797138328250184, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 5676529.0, + "repeat_count": 1.0, + "routers_loss": 0.0027328627184033394, + "skip_count": 0.0, + "step": 3520, + "text_loss": 0.41077399253845215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 16.535368359260346, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007968893393664646, + "loss": 0.01, + "macro_f1": 0.6592592597007751, + "num_tokens": 5679987.0, + "repeat_count": 1.0, + "routers_loss": 0.02695014327764511, + "skip_count": 5.0, + "step": 3522, + "text_loss": 0.44942837953567505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007966402367141903, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 5683185.0, + "repeat_count": 0.0, + "routers_loss": 0.00817026849836111, + "skip_count": 2.0, + "step": 3524, + "text_loss": 0.14528048038482666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0007963910203888176, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 5686544.0, + "repeat_count": 0.0, + "routers_loss": 0.0021973433904349804, + "skip_count": 0.0, + "step": 3526, + "text_loss": 0.22358648478984833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0007961416904858469, + "loss": 0.0078, + "macro_f1": 0.3272727429866791, + "num_tokens": 5689579.0, + "repeat_count": 0.0, + "routers_loss": 0.033712416887283325, + "skip_count": 1.0, + "step": 3528, + "text_loss": 0.3083649277687073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007958922471008217, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5692869.0, + "repeat_count": 0.0, + "routers_loss": 0.011182719841599464, + "skip_count": 2.0, + "step": 3530, + "text_loss": 0.21288011968135834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0007956426903293292, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5696007.0, + "repeat_count": 0.0, + "routers_loss": 0.0015808293828740716, + "skip_count": 0.0, + "step": 3532, + "text_loss": 0.6068631410598755 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.591722923393014, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007953930202670001, + "loss": 0.0062, + "macro_f1": 0.5492662787437439, + "num_tokens": 5699474.0, + "repeat_count": 2.0, + "routers_loss": 0.03205178305506706, + "skip_count": 0.0, + "step": 3534, + "text_loss": 0.4317135512828827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007951432370095084, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 5703483.0, + "repeat_count": 0.0, + "routers_loss": 0.003518853336572647, + "skip_count": 0.0, + "step": 3536, + "text_loss": 0.5432273149490356 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007948933406525715, + "loss": 0.01, + "macro_f1": 1.0, + "num_tokens": 5707301.0, + "repeat_count": 1.0, + "routers_loss": 0.004982157610356808, + "skip_count": 1.0, + "step": 3538, + "text_loss": 0.40061065554618835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0007946433312919502, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5710847.0, + "repeat_count": 0.0, + "routers_loss": 0.003067734418436885, + "skip_count": 0.0, + "step": 3540, + "text_loss": 0.5396234393119812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 16.629292632814792, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007943932090234486, + "loss": 0.0097, + "macro_f1": 0.5492662787437439, + "num_tokens": 5713683.0, + "repeat_count": 0.0, + "routers_loss": 0.03728383034467697, + "skip_count": 2.0, + "step": 3542, + "text_loss": 0.18310914933681488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007941429739429138, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 5716397.0, + "repeat_count": 0.0, + "routers_loss": 0.0025092530995607376, + "skip_count": 3.0, + "step": 3544, + "text_loss": 0.5806207060813904 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007938926261462366, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5719984.0, + "repeat_count": 0.0, + "routers_loss": 0.002493767999112606, + "skip_count": 0.0, + "step": 3546, + "text_loss": 0.38606807589530945 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 16.657469914881126, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05078125, + "learning_rate": 0.0007936421657293507, + "loss": 0.0094, + "macro_f1": 0.8823530077934265, + "num_tokens": 5723571.0, + "repeat_count": 1.0, + "routers_loss": 0.014810923486948013, + "skip_count": 2.0, + "step": 3548, + "text_loss": 0.49558472633361816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0007933915927882327, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5726405.0, + "repeat_count": 0.0, + "routers_loss": 0.00152928801253438, + "skip_count": 0.0, + "step": 3550, + "text_loss": 0.8674797415733337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000793140907418903, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5729955.0, + "repeat_count": 0.0, + "routers_loss": 0.005522782914340496, + "skip_count": 2.0, + "step": 3552, + "text_loss": 0.3274473249912262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007928901097174248, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5733030.0, + "repeat_count": 0.0, + "routers_loss": 0.009207013063132763, + "skip_count": 2.0, + "step": 3554, + "text_loss": 0.18237128853797913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0007926391997799039, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5735978.0, + "repeat_count": 0.0, + "routers_loss": 0.00695531303063035, + "skip_count": 0.0, + "step": 3556, + "text_loss": 0.3266434967517853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007923881777024898, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 5738901.0, + "repeat_count": 0.0, + "routers_loss": 0.002743212040513754, + "skip_count": 1.0, + "step": 3558, + "text_loss": 0.4971913695335388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0007921370435813741, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5741946.0, + "repeat_count": 1.0, + "routers_loss": 0.007037297356873751, + "skip_count": 0.0, + "step": 3560, + "text_loss": 0.5645473599433899 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007918857975127924, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5744987.0, + "repeat_count": 0.0, + "routers_loss": 0.0030746585689485073, + "skip_count": 0.0, + "step": 3562, + "text_loss": 0.17717665433883667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0007916344395930224, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 5747837.0, + "repeat_count": 0.0, + "routers_loss": 0.004522138275206089, + "skip_count": 0.0, + "step": 3564, + "text_loss": 0.7676118612289429 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.000791382969918385, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5750716.0, + "repeat_count": 0.0, + "routers_loss": 0.0026240211445838213, + "skip_count": 0.0, + "step": 3566, + "text_loss": 0.4975173771381378 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.751394188435572, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.000791131388585244, + "loss": 0.011, + "macro_f1": 0.8820862174034119, + "num_tokens": 5754368.0, + "repeat_count": 2.0, + "routers_loss": 0.021831991150975227, + "skip_count": 2.0, + "step": 3568, + "text_loss": 0.9670342206954956 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0007908796956900055, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5757076.0, + "repeat_count": 1.0, + "routers_loss": 0.0017586691537871957, + "skip_count": 0.0, + "step": 3570, + "text_loss": 0.3057977259159088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.000790627891329119, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5760613.0, + "repeat_count": 0.0, + "routers_loss": 0.005515786819159985, + "skip_count": 0.0, + "step": 3572, + "text_loss": 0.5860086679458618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007903759755990763, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 5763557.0, + "repeat_count": 0.0, + "routers_loss": 0.004096484277397394, + "skip_count": 0.0, + "step": 3574, + "text_loss": 0.17175781726837158 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.000790123948596412, + "loss": 0.0119, + "macro_f1": 0.6666666865348816, + "num_tokens": 5767430.0, + "repeat_count": 1.0, + "routers_loss": 0.005216122139245272, + "skip_count": 0.0, + "step": 3576, + "text_loss": 0.7520374059677124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0007898718104177031, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 5770175.0, + "repeat_count": 0.0, + "routers_loss": 0.0037980107590556145, + "skip_count": 0.0, + "step": 3578, + "text_loss": 0.18117885291576385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007896195611595699, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5773032.0, + "repeat_count": 0.0, + "routers_loss": 0.003672175807878375, + "skip_count": 2.0, + "step": 3580, + "text_loss": 0.7241058349609375 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0007893672009186744, + "loss": 0.0083, + "macro_f1": 1.0, + "num_tokens": 5776077.0, + "repeat_count": 1.0, + "routers_loss": 0.01229850109666586, + "skip_count": 3.0, + "step": 3582, + "text_loss": 0.29140418767929077 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007891147297917216, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5779088.0, + "repeat_count": 1.0, + "routers_loss": 0.0035251814406365156, + "skip_count": 0.0, + "step": 3584, + "text_loss": 0.1727485954761505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.000788862147875459, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 5782201.0, + "repeat_count": 0.0, + "routers_loss": 0.004725661128759384, + "skip_count": 2.0, + "step": 3586, + "text_loss": 0.43512848019599915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0007886094552666765, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5785039.0, + "repeat_count": 0.0, + "routers_loss": 0.005632172804325819, + "skip_count": 0.0, + "step": 3588, + "text_loss": 0.3534786105155945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0007883566520622062, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 5788017.0, + "repeat_count": 0.0, + "routers_loss": 0.006249965168535709, + "skip_count": 1.0, + "step": 3590, + "text_loss": 0.2089710384607315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0007881037383589229, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 5791168.0, + "repeat_count": 0.0, + "routers_loss": 0.0013797614956274629, + "skip_count": 0.0, + "step": 3592, + "text_loss": 0.4349329471588135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0007878507142537436, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 5793927.0, + "repeat_count": 0.0, + "routers_loss": 0.0019719740375876427, + "skip_count": 1.0, + "step": 3594, + "text_loss": 0.6087368726730347 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007875975798436274, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 5797214.0, + "repeat_count": 1.0, + "routers_loss": 0.0037070370744913816, + "skip_count": 0.0, + "step": 3596, + "text_loss": 0.4258122444152832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0007873443352255764, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5800691.0, + "repeat_count": 0.0, + "routers_loss": 0.008431311696767807, + "skip_count": 0.0, + "step": 3598, + "text_loss": 0.6006711721420288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0007870909804966337, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5804712.0, + "repeat_count": 0.0, + "routers_loss": 0.0017720256000757217, + "skip_count": 0.0, + "step": 3600, + "text_loss": 0.6055042743682861 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.911065453478134, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0007868375157538861, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 5807670.0, + "repeat_count": 1.0, + "routers_loss": 0.010697763413190842, + "skip_count": 0.0, + "step": 3602, + "text_loss": 0.8039056658744812 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0007865839410944611, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5810880.0, + "repeat_count": 1.0, + "routers_loss": 0.0030022128485143185, + "skip_count": 0.0, + "step": 3604, + "text_loss": 0.596110463142395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0007863302566155295, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5814171.0, + "repeat_count": 0.0, + "routers_loss": 0.006257854867726564, + "skip_count": 2.0, + "step": 3606, + "text_loss": 0.5700319409370422 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0007860764624143031, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 5817607.0, + "repeat_count": 1.0, + "routers_loss": 0.004838473163545132, + "skip_count": 0.0, + "step": 3608, + "text_loss": 0.8319530487060547 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 16.94863516289991, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08154296875, + "learning_rate": 0.0007858225585880369, + "loss": 0.0067, + "macro_f1": 0.8823530077934265, + "num_tokens": 5821452.0, + "repeat_count": 1.0, + "routers_loss": 0.02173662930727005, + "skip_count": 2.0, + "step": 3610, + "text_loss": 0.3738477826118469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007855685452340269, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5824683.0, + "repeat_count": 0.0, + "routers_loss": 0.0032719180453568697, + "skip_count": 0.0, + "step": 3612, + "text_loss": 0.4054839015007019 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.967420017610802, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007853144224496118, + "loss": 0.0093, + "macro_f1": 0.3272727429866791, + "num_tokens": 5827860.0, + "repeat_count": 1.0, + "routers_loss": 0.032171256840229034, + "skip_count": 0.0, + "step": 3614, + "text_loss": 0.18112395703792572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0007850601903321716, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5831651.0, + "repeat_count": 0.0, + "routers_loss": 0.013230946846306324, + "skip_count": 1.0, + "step": 3616, + "text_loss": 0.2698844075202942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.000784805848979129, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5834369.0, + "repeat_count": 0.0, + "routers_loss": 0.00162619655020535, + "skip_count": 0.0, + "step": 3618, + "text_loss": 0.2430931180715561 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0007845513984879477, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 5838102.0, + "repeat_count": 1.0, + "routers_loss": 0.002781603019684553, + "skip_count": 0.0, + "step": 3620, + "text_loss": 0.4968300759792328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007842968389561337, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 5841029.0, + "repeat_count": 0.0, + "routers_loss": 0.0023873315658420324, + "skip_count": 0.0, + "step": 3622, + "text_loss": 0.5842974781990051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0007840421704812346, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 5845158.0, + "repeat_count": 0.0, + "routers_loss": 0.00400173757225275, + "skip_count": 1.0, + "step": 3624, + "text_loss": 0.8312450647354126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00078378739316084, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 5849175.0, + "repeat_count": 0.0, + "routers_loss": 0.0004974664188921452, + "skip_count": 0.0, + "step": 3626, + "text_loss": 0.48637253046035767 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 17.032873495744056, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.10693359375, + "learning_rate": 0.000783532507092581, + "loss": 0.0079, + "macro_f1": 0.9555556178092957, + "num_tokens": 5852020.0, + "repeat_count": 1.0, + "routers_loss": 0.02555239573121071, + "skip_count": 5.0, + "step": 3628, + "text_loss": 0.5407033562660217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007832775123741306, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5854873.0, + "repeat_count": 0.0, + "routers_loss": 0.0025962977670133114, + "skip_count": 0.0, + "step": 3630, + "text_loss": 0.618230938911438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.000783022409103203, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5858086.0, + "repeat_count": 0.0, + "routers_loss": 0.0029271875973790884, + "skip_count": 0.0, + "step": 3632, + "text_loss": 0.21259798109531403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007827671973775542, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 5860886.0, + "repeat_count": 0.0, + "routers_loss": 0.004102068953216076, + "skip_count": 0.0, + "step": 3634, + "text_loss": 0.4991208016872406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0007825118772949819, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5864291.0, + "repeat_count": 0.0, + "routers_loss": 0.0023497689981013536, + "skip_count": 1.0, + "step": 3636, + "text_loss": 0.3878401517868042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0007822564489533255, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 5867155.0, + "repeat_count": 0.0, + "routers_loss": 0.007680345326662064, + "skip_count": 2.0, + "step": 3638, + "text_loss": 0.6132124066352844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0007820009124504653, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5870325.0, + "repeat_count": 0.0, + "routers_loss": 0.0008242831099778414, + "skip_count": 0.0, + "step": 3640, + "text_loss": 0.3552473187446594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.098620487232168, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007817452678843236, + "loss": 0.0073, + "macro_f1": 0.6601307392120361, + "num_tokens": 5873301.0, + "repeat_count": 1.0, + "routers_loss": 0.023831043392419815, + "skip_count": 2.0, + "step": 3642, + "text_loss": 0.18363867700099945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0007814895153528635, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5876225.0, + "repeat_count": 0.0, + "routers_loss": 0.001999989850446582, + "skip_count": 0.0, + "step": 3644, + "text_loss": 0.17581747472286224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0007812336549540903, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5879501.0, + "repeat_count": 0.0, + "routers_loss": 0.001098626758903265, + "skip_count": 0.0, + "step": 3646, + "text_loss": 0.5040884613990784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.126797769298502, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007809776867860499, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 5882608.0, + "repeat_count": 0.0, + "routers_loss": 0.012210183776915073, + "skip_count": 1.0, + "step": 3648, + "text_loss": 0.27114811539649963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00078072161094683, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 5886106.0, + "repeat_count": 0.0, + "routers_loss": 0.005191771313548088, + "skip_count": 2.0, + "step": 3650, + "text_loss": 0.5167917609214783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0007804654275345591, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5889122.0, + "repeat_count": 0.0, + "routers_loss": 0.0016411367105320096, + "skip_count": 1.0, + "step": 3652, + "text_loss": 0.7691274285316467 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.154975051364836, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0007802091366474074, + "loss": 0.005, + "macro_f1": 0.8823530077934265, + "num_tokens": 5892313.0, + "repeat_count": 2.0, + "routers_loss": 0.015627093613147736, + "skip_count": 1.0, + "step": 3654, + "text_loss": 0.4646325409412384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007799527383835858, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5895577.0, + "repeat_count": 0.0, + "routers_loss": 0.0009879748104140162, + "skip_count": 0.0, + "step": 3656, + "text_loss": 0.5587969422340393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007796962328413469, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5898546.0, + "repeat_count": 0.0, + "routers_loss": 0.004864919930696487, + "skip_count": 0.0, + "step": 3658, + "text_loss": 0.6981375813484192 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007794396201189839, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 5901618.0, + "repeat_count": 1.0, + "routers_loss": 0.006617432460188866, + "skip_count": 2.0, + "step": 3660, + "text_loss": 0.22521957755088806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.192544760786618, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007791829003148312, + "loss": 0.0098, + "macro_f1": 0.6601307392120361, + "num_tokens": 5904540.0, + "repeat_count": 1.0, + "routers_loss": 0.0782252699136734, + "skip_count": 2.0, + "step": 3662, + "text_loss": 0.2649642825126648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0007789260735272647, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 5907827.0, + "repeat_count": 0.0, + "routers_loss": 0.0012057392159476876, + "skip_count": 0.0, + "step": 3664, + "text_loss": 0.6943771243095398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.0007786691398547005, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 5911163.0, + "repeat_count": 0.0, + "routers_loss": 0.007476957980543375, + "skip_count": 2.0, + "step": 3666, + "text_loss": 0.1502683162689209 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007784120993955962, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5913948.0, + "repeat_count": 1.0, + "routers_loss": 0.004082011990249157, + "skip_count": 0.0, + "step": 3668, + "text_loss": 0.4127517640590668 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 17.230114470208395, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007781549522484503, + "loss": 0.0066, + "macro_f1": 0.9265305995941162, + "num_tokens": 5917360.0, + "repeat_count": 3.0, + "routers_loss": 0.027505695819854736, + "skip_count": 1.0, + "step": 3670, + "text_loss": 0.23892618715763092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007778976985118018, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 5920524.0, + "repeat_count": 0.0, + "routers_loss": 0.0024977331049740314, + "skip_count": 2.0, + "step": 3672, + "text_loss": 0.5076471567153931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0007776403382842312, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5923632.0, + "repeat_count": 0.0, + "routers_loss": 0.0015700991498306394, + "skip_count": 0.0, + "step": 3674, + "text_loss": 0.6287924647331238 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.25829175227473, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.0007773828716643591, + "loss": 0.0085, + "macro_f1": 0.3272727429866791, + "num_tokens": 5926438.0, + "repeat_count": 1.0, + "routers_loss": 0.05108916014432907, + "skip_count": 0.0, + "step": 3676, + "text_loss": 0.26517006754875183 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007771252987508474, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5930081.0, + "repeat_count": 0.0, + "routers_loss": 0.003439917229115963, + "skip_count": 0.0, + "step": 3678, + "text_loss": 0.5189079642295837 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0007768676196423984, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 5933463.0, + "repeat_count": 1.0, + "routers_loss": 0.001935846172273159, + "skip_count": 1.0, + "step": 3680, + "text_loss": 0.6703575849533081 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 17.286469034341064, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007766098344377553, + "loss": 0.0082, + "macro_f1": 0.31446540355682373, + "num_tokens": 5937098.0, + "repeat_count": 0.0, + "routers_loss": 0.0384826585650444, + "skip_count": 2.0, + "step": 3682, + "text_loss": 0.6424444913864136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0007763519432357018, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 5940436.0, + "repeat_count": 0.0, + "routers_loss": 0.0008654671837575734, + "skip_count": 0.0, + "step": 3684, + "text_loss": 0.4189988672733307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0007760939461350623, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 5943731.0, + "repeat_count": 0.0, + "routers_loss": 0.007468715775758028, + "skip_count": 2.0, + "step": 3686, + "text_loss": 0.2875453233718872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007758358432347019, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5946707.0, + "repeat_count": 0.0, + "routers_loss": 0.001252831774763763, + "skip_count": 0.0, + "step": 3688, + "text_loss": 0.5093055367469788 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007755776346335259, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 5949833.0, + "repeat_count": 0.0, + "routers_loss": 0.001680848654359579, + "skip_count": 0.0, + "step": 3690, + "text_loss": 0.4031114876270294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0007753193204304807, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 5953095.0, + "repeat_count": 0.0, + "routers_loss": 0.0047258250415325165, + "skip_count": 2.0, + "step": 3692, + "text_loss": 0.17632785439491272 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007750609007245524, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 5955971.0, + "repeat_count": 2.0, + "routers_loss": 0.001980359200388193, + "skip_count": 4.0, + "step": 3694, + "text_loss": 0.3423727750778198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0007748023756147679, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 5958948.0, + "repeat_count": 0.0, + "routers_loss": 0.00511702848598361, + "skip_count": 0.0, + "step": 3696, + "text_loss": 0.28279972076416016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007745437452001949, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 5961819.0, + "repeat_count": 0.0, + "routers_loss": 0.0005220443126745522, + "skip_count": 0.0, + "step": 3698, + "text_loss": 0.4793325662612915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.371000880540066, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007742850095799408, + "loss": 0.0084, + "macro_f1": 0.3272727429866791, + "num_tokens": 5964625.0, + "repeat_count": 1.0, + "routers_loss": 0.06411020457744598, + "skip_count": 0.0, + "step": 3700, + "text_loss": 0.2825184464454651 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0007740261688531536, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 5967134.0, + "repeat_count": 0.0, + "routers_loss": 0.004408109001815319, + "skip_count": 3.0, + "step": 3702, + "text_loss": 0.690429151058197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0007737672231190215, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 5969831.0, + "repeat_count": 0.0, + "routers_loss": 0.0006747521692886949, + "skip_count": 0.0, + "step": 3704, + "text_loss": 0.32556024193763733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007735081724767732, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 5973015.0, + "repeat_count": 0.0, + "routers_loss": 0.0020414739847183228, + "skip_count": 0.0, + "step": 3706, + "text_loss": 0.5876469612121582 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0007732490170256769, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 5975778.0, + "repeat_count": 1.0, + "routers_loss": 0.005610425490885973, + "skip_count": 0.0, + "step": 3708, + "text_loss": 0.2968577444553375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007729897568650422, + "loss": 0.0097, + "macro_f1": 0.3333333432674408, + "num_tokens": 5979115.0, + "repeat_count": 0.0, + "routers_loss": 0.001248046406544745, + "skip_count": 0.0, + "step": 3710, + "text_loss": 0.626361608505249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0007727303920942176, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 5982213.0, + "repeat_count": 0.0, + "routers_loss": 0.005791695322841406, + "skip_count": 2.0, + "step": 3712, + "text_loss": 0.4133484661579132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 17.436747872028178, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08740234375, + "learning_rate": 0.0007724709228125922, + "loss": 0.0105, + "macro_f1": 0.5492662787437439, + "num_tokens": 5984930.0, + "repeat_count": 0.0, + "routers_loss": 0.02114664763212204, + "skip_count": 2.0, + "step": 3714, + "text_loss": 0.4646461308002472 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0007722113491195952, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 5988017.0, + "repeat_count": 2.0, + "routers_loss": 0.005913930479437113, + "skip_count": 5.0, + "step": 3716, + "text_loss": 0.15474505722522736 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0007719516711146957, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 5991562.0, + "repeat_count": 0.0, + "routers_loss": 0.0075925313867628574, + "skip_count": 2.0, + "step": 3718, + "text_loss": 0.5293686985969543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.000771691888897403, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 5994675.0, + "repeat_count": 0.0, + "routers_loss": 0.0012335237115621567, + "skip_count": 0.0, + "step": 3720, + "text_loss": 0.5210637450218201 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0007714320025672657, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 5999070.0, + "repeat_count": 0.0, + "routers_loss": 0.010582062415778637, + "skip_count": 2.0, + "step": 3722, + "text_loss": 0.2783571779727936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.4837100088054, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.000771172012223873, + "loss": 0.0078, + "macro_f1": 0.6598639488220215, + "num_tokens": 6002702.0, + "repeat_count": 1.0, + "routers_loss": 0.015008784830570221, + "skip_count": 3.0, + "step": 3724, + "text_loss": 0.358705073595047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007709119179668538, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6005517.0, + "repeat_count": 0.0, + "routers_loss": 0.00111615180503577, + "skip_count": 0.0, + "step": 3726, + "text_loss": 0.45202162861824036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 17.50249486351629, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007706517198958764, + "loss": 0.0096, + "macro_f1": 0.6595745086669922, + "num_tokens": 6009111.0, + "repeat_count": 1.0, + "routers_loss": 0.05215252563357353, + "skip_count": 4.0, + "step": 3728, + "text_loss": 0.20360413193702698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007703914181106497, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6012989.0, + "repeat_count": 0.0, + "routers_loss": 0.010039499960839748, + "skip_count": 3.0, + "step": 3730, + "text_loss": 0.20334361493587494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.52127971822718, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0007701310127109211, + "loss": 0.0062, + "macro_f1": 0.3272727429866791, + "num_tokens": 6016420.0, + "repeat_count": 0.0, + "routers_loss": 0.01090205181390047, + "skip_count": 1.0, + "step": 3732, + "text_loss": 0.47959551215171814 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 17.530672145582624, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007698705037964791, + "loss": 0.0076, + "macro_f1": 0.6225374937057495, + "num_tokens": 6019551.0, + "repeat_count": 0.0, + "routers_loss": 0.02677762135863304, + "skip_count": 5.0, + "step": 3734, + "text_loss": 0.2621438801288605 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.000769609891467151, + "loss": 0.0119, + "macro_f1": 0.6666666865348816, + "num_tokens": 6022262.0, + "repeat_count": 1.0, + "routers_loss": 0.00460716662928462, + "skip_count": 0.0, + "step": 3736, + "text_loss": 0.3433022201061249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0007693491758228037, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6025723.0, + "repeat_count": 0.0, + "routers_loss": 0.0036111194640398026, + "skip_count": 2.0, + "step": 3738, + "text_loss": 0.38703784346580505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007690883569633442, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6028652.0, + "repeat_count": 0.0, + "routers_loss": 0.003299296135082841, + "skip_count": 0.0, + "step": 3740, + "text_loss": 0.24203069508075714 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0007688274349887188, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 6032280.0, + "repeat_count": 0.0, + "routers_loss": 0.003173880511894822, + "skip_count": 0.0, + "step": 3742, + "text_loss": 0.2827291488647461 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0007685664099989131, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6035111.0, + "repeat_count": 0.0, + "routers_loss": 0.0008576177642680705, + "skip_count": 0.0, + "step": 3744, + "text_loss": 0.43613526225090027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007683052820939524, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6038428.0, + "repeat_count": 0.0, + "routers_loss": 0.004335585981607437, + "skip_count": 2.0, + "step": 3746, + "text_loss": 1.0385624170303345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007680440513739015, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6041185.0, + "repeat_count": 0.0, + "routers_loss": 0.0008210531086660922, + "skip_count": 0.0, + "step": 3748, + "text_loss": 0.7070431709289551 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0007677827179388646, + "loss": 0.0089, + "macro_f1": 1.0, + "num_tokens": 6046333.0, + "repeat_count": 1.0, + "routers_loss": 0.003778942162171006, + "skip_count": 1.0, + "step": 3750, + "text_loss": 0.3682238757610321 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08984375, + "learning_rate": 0.000767521281888985, + "loss": 0.009, + "macro_f1": 1.0, + "num_tokens": 6049528.0, + "repeat_count": 1.0, + "routers_loss": 0.002767334459349513, + "skip_count": 1.0, + "step": 3752, + "text_loss": 0.7619418501853943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0007672597433244455, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 6053202.0, + "repeat_count": 0.0, + "routers_loss": 0.004796457476913929, + "skip_count": 2.0, + "step": 3754, + "text_loss": 0.4157083034515381 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0007669981023454682, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 6056609.0, + "repeat_count": 0.0, + "routers_loss": 0.0013067846884950995, + "skip_count": 0.0, + "step": 3756, + "text_loss": 0.4529118537902832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007667363590523142, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 6060504.0, + "repeat_count": 0.0, + "routers_loss": 0.0010285493917763233, + "skip_count": 0.0, + "step": 3758, + "text_loss": 0.8363246321678162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0007664745135452844, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 6063526.0, + "repeat_count": 0.0, + "routers_loss": 0.006289863493293524, + "skip_count": 3.0, + "step": 3760, + "text_loss": 0.5313657522201538 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0007662125659247183, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6067147.0, + "repeat_count": 0.0, + "routers_loss": 0.0028537956532090902, + "skip_count": 0.0, + "step": 3762, + "text_loss": 0.5668109059333801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007659505162909949, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6070350.0, + "repeat_count": 0.0, + "routers_loss": 0.0026814753655344248, + "skip_count": 0.0, + "step": 3764, + "text_loss": 0.4983512759208679 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0007656883647445318, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 6073091.0, + "repeat_count": 0.0, + "routers_loss": 0.005981382913887501, + "skip_count": 1.0, + "step": 3766, + "text_loss": 0.30372318625450134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0007654261113857863, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6076244.0, + "repeat_count": 0.0, + "routers_loss": 0.000803640519734472, + "skip_count": 0.0, + "step": 3768, + "text_loss": 0.6100738048553467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0007651637563152539, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 6078936.0, + "repeat_count": 0.0, + "routers_loss": 0.0013324898900464177, + "skip_count": 0.0, + "step": 3770, + "text_loss": 0.4733821153640747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0007649012996334701, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 6081951.0, + "repeat_count": 1.0, + "routers_loss": 0.0021543330512940884, + "skip_count": 0.0, + "step": 3772, + "text_loss": 0.6794875860214233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007646387414410085, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 6085165.0, + "repeat_count": 0.0, + "routers_loss": 0.0005426189745776355, + "skip_count": 0.0, + "step": 3774, + "text_loss": 0.5886107683181763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007643760818384819, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6088370.0, + "repeat_count": 0.0, + "routers_loss": 0.002537576947361231, + "skip_count": 0.0, + "step": 3776, + "text_loss": 0.23591920733451843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007641133209265423, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6092319.0, + "repeat_count": 0.0, + "routers_loss": 0.002613696036860347, + "skip_count": 0.0, + "step": 3778, + "text_loss": 0.3217754662036896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0007638504588058796, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 6095799.0, + "repeat_count": 0.0, + "routers_loss": 0.0007219464750960469, + "skip_count": 0.0, + "step": 3780, + "text_loss": 0.4276983141899109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0007635874955772234, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6098789.0, + "repeat_count": 0.0, + "routers_loss": 0.005965052172541618, + "skip_count": 3.0, + "step": 3782, + "text_loss": 0.30936646461486816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0007633244313413417, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6101631.0, + "repeat_count": 0.0, + "routers_loss": 0.0007469559786841273, + "skip_count": 0.0, + "step": 3784, + "text_loss": 0.44460123777389526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007630612661990412, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 6105097.0, + "repeat_count": 0.0, + "routers_loss": 0.004300760570913553, + "skip_count": 1.0, + "step": 3786, + "text_loss": 0.41950157284736633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007627980002511672, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6107847.0, + "repeat_count": 0.0, + "routers_loss": 0.0023050960153341293, + "skip_count": 1.0, + "step": 3788, + "text_loss": 0.48561373353004456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007625346335986039, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6110546.0, + "repeat_count": 0.0, + "routers_loss": 0.0018124044872820377, + "skip_count": 0.0, + "step": 3790, + "text_loss": 0.20882295072078705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007622711663422735, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6113600.0, + "repeat_count": 0.0, + "routers_loss": 0.0007613401976414025, + "skip_count": 0.0, + "step": 3792, + "text_loss": 0.31751760840415955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007620075985831375, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 6116916.0, + "repeat_count": 0.0, + "routers_loss": 0.005452962126582861, + "skip_count": 2.0, + "step": 3794, + "text_loss": 0.3246645927429199 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 17.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007617439304221956, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 6120056.0, + "repeat_count": 2.0, + "routers_loss": 0.0043787881731987, + "skip_count": 0.0, + "step": 3796, + "text_loss": 0.4859195947647095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0007614801619604856, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6122668.0, + "repeat_count": 0.0, + "routers_loss": 0.0033891722559928894, + "skip_count": 0.0, + "step": 3798, + "text_loss": 0.48194369673728943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0007612162932990845, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6126792.0, + "repeat_count": 0.0, + "routers_loss": 0.001883238204754889, + "skip_count": 0.0, + "step": 3800, + "text_loss": 0.3740062117576599 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007609523245391068, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 6129801.0, + "repeat_count": 0.0, + "routers_loss": 0.00882677361369133, + "skip_count": 2.0, + "step": 3802, + "text_loss": 0.5759486556053162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007606882557817062, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6133613.0, + "repeat_count": 0.0, + "routers_loss": 0.009537030011415482, + "skip_count": 2.0, + "step": 3804, + "text_loss": 0.3217554986476898 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0007604240871280742, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6137784.0, + "repeat_count": 0.0, + "routers_loss": 0.0023913346230983734, + "skip_count": 0.0, + "step": 3806, + "text_loss": 0.3718445599079132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.878191957734078, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007601598186794407, + "loss": 0.0081, + "macro_f1": 0.6603773832321167, + "num_tokens": 6141356.0, + "repeat_count": 1.0, + "routers_loss": 0.033796411007642746, + "skip_count": 1.0, + "step": 3808, + "text_loss": 0.2717749774456024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000759895450537074, + "loss": 0.01, + "macro_f1": 0.6666666865348816, + "num_tokens": 6144448.0, + "repeat_count": 0.0, + "routers_loss": 0.0037919918540865183, + "skip_count": 2.0, + "step": 3810, + "text_loss": 0.5935076475143433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007596309828022803, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6147526.0, + "repeat_count": 0.0, + "routers_loss": 0.0008182782912626863, + "skip_count": 0.0, + "step": 3812, + "text_loss": 0.449336439371109 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0007593664155764044, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6150620.0, + "repeat_count": 1.0, + "routers_loss": 0.001734903547912836, + "skip_count": 0.0, + "step": 3814, + "text_loss": 0.6647221446037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.915761667155856, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0007591017489608286, + "loss": 0.0088, + "macro_f1": 0.3272727429866791, + "num_tokens": 6153714.0, + "repeat_count": 1.0, + "routers_loss": 0.04721754416823387, + "skip_count": 0.0, + "step": 3816, + "text_loss": 0.25481200218200684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007588369830569738, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6156974.0, + "repeat_count": 0.0, + "routers_loss": 0.0002484306460246444, + "skip_count": 0.0, + "step": 3818, + "text_loss": 0.7195295691490173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007585721179662988, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6159660.0, + "repeat_count": 0.0, + "routers_loss": 0.0051363613456487656, + "skip_count": 2.0, + "step": 3820, + "text_loss": 0.5073586702346802 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007583071537903005, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6163146.0, + "repeat_count": 0.0, + "routers_loss": 0.006719176657497883, + "skip_count": 0.0, + "step": 3822, + "text_loss": 0.6950558423995972 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0007580420906305136, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 6166257.0, + "repeat_count": 1.0, + "routers_loss": 0.00871267355978489, + "skip_count": 3.0, + "step": 3824, + "text_loss": 0.2549148201942444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0007577769285885109, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 6169624.0, + "repeat_count": 0.0, + "routers_loss": 0.0015642556827515364, + "skip_count": 0.0, + "step": 3826, + "text_loss": 0.3720305860042572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0007575116677659029, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6172673.0, + "repeat_count": 0.0, + "routers_loss": 0.0011551049537956715, + "skip_count": 0.0, + "step": 3828, + "text_loss": 0.6819429397583008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0007572463082643377, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 6175414.0, + "repeat_count": 0.0, + "routers_loss": 0.0008922060951590538, + "skip_count": 0.0, + "step": 3830, + "text_loss": 0.5424665212631226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007569808501855023, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 6178701.0, + "repeat_count": 0.0, + "routers_loss": 0.004167596809566021, + "skip_count": 1.0, + "step": 3832, + "text_loss": 0.4429764151573181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.00075671529363112, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 6183036.0, + "repeat_count": 0.0, + "routers_loss": 0.0008732969872653484, + "skip_count": 0.0, + "step": 3834, + "text_loss": 0.8015334010124207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007564496387029531, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 6186325.0, + "repeat_count": 0.0, + "routers_loss": 0.0021374202333390713, + "skip_count": 1.0, + "step": 3836, + "text_loss": 0.4233771562576294 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000756183885502801, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6189919.0, + "repeat_count": 1.0, + "routers_loss": 0.004017227329313755, + "skip_count": 0.0, + "step": 3838, + "text_loss": 0.33691394329071045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.0007559180341325005, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6193412.0, + "repeat_count": 0.0, + "routers_loss": 0.0013120946241542697, + "skip_count": 0.0, + "step": 3840, + "text_loss": 0.14970099925994873 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 18.037569709421778, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007556520846939265, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 6196588.0, + "repeat_count": 0.0, + "routers_loss": 0.011793316341936588, + "skip_count": 2.0, + "step": 3842, + "text_loss": 0.2714047133922577 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0007553860372889914, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 6200841.0, + "repeat_count": 1.0, + "routers_loss": 0.019968654960393906, + "skip_count": 4.0, + "step": 3844, + "text_loss": 0.23680976033210754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 18.05635456413267, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052490234375, + "learning_rate": 0.0007551198920196452, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 6203797.0, + "repeat_count": 0.0, + "routers_loss": 0.013615630567073822, + "skip_count": 2.0, + "step": 3846, + "text_loss": 0.25839608907699585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.000754853648987875, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6206790.0, + "repeat_count": 0.0, + "routers_loss": 0.002420815173536539, + "skip_count": 1.0, + "step": 3848, + "text_loss": 0.5358025431632996 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 18.07513941884356, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007545873082957057, + "loss": 0.0072, + "macro_f1": 0.9265305995941162, + "num_tokens": 6209791.0, + "repeat_count": 1.0, + "routers_loss": 0.018236197531223297, + "skip_count": 3.0, + "step": 3850, + "text_loss": 0.1463700383901596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0007543208700451998, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6212792.0, + "repeat_count": 0.0, + "routers_loss": 0.006242573726922274, + "skip_count": 3.0, + "step": 3852, + "text_loss": 0.9441591501235962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.093924273554446, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007540543343384565, + "loss": 0.0062, + "macro_f1": 0.3272727429866791, + "num_tokens": 6215747.0, + "repeat_count": 0.0, + "routers_loss": 0.01451140083372593, + "skip_count": 1.0, + "step": 3854, + "text_loss": 0.41610902547836304 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007537877012776132, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6218593.0, + "repeat_count": 0.0, + "routers_loss": 0.00037674361374229193, + "skip_count": 0.0, + "step": 3856, + "text_loss": 0.6048852205276489 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0007535209709648439, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 6221315.0, + "repeat_count": 1.0, + "routers_loss": 0.005776284262537956, + "skip_count": 3.0, + "step": 3858, + "text_loss": 0.35627537965774536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0007532541435023605, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 6225012.0, + "repeat_count": 0.0, + "routers_loss": 0.0009280376834794879, + "skip_count": 0.0, + "step": 3860, + "text_loss": 0.6440183520317078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0007529872189924114, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6227650.0, + "repeat_count": 0.0, + "routers_loss": 0.0009876530384644866, + "skip_count": 0.0, + "step": 3862, + "text_loss": 0.35507893562316895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.14088641033167, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0007527201975372827, + "loss": 0.0045, + "macro_f1": 0.6603773832321167, + "num_tokens": 6230557.0, + "repeat_count": 1.0, + "routers_loss": 0.013780162669718266, + "skip_count": 1.0, + "step": 3864, + "text_loss": 0.38958442211151123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007524530792392977, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 6233371.0, + "repeat_count": 0.0, + "routers_loss": 0.004849869292229414, + "skip_count": 3.0, + "step": 3866, + "text_loss": 0.3826720714569092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0007521858642008163, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6236770.0, + "repeat_count": 0.0, + "routers_loss": 0.008618295192718506, + "skip_count": 1.0, + "step": 3868, + "text_loss": 0.3596078157424927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0007519185525242363, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6239661.0, + "repeat_count": 0.0, + "routers_loss": 0.0013421972980722785, + "skip_count": 0.0, + "step": 3870, + "text_loss": 0.5585550665855408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0007516511443119916, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 6242459.0, + "repeat_count": 0.0, + "routers_loss": 0.0038009448908269405, + "skip_count": 1.0, + "step": 3872, + "text_loss": 0.4418395757675171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007513836396665534, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 6245489.0, + "repeat_count": 1.0, + "routers_loss": 0.002785376040264964, + "skip_count": 2.0, + "step": 3874, + "text_loss": 0.551510751247406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0007511160386904305, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6249014.0, + "repeat_count": 0.0, + "routers_loss": 0.0021424589212983847, + "skip_count": 1.0, + "step": 3876, + "text_loss": 1.0502676963806152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0007508483414861679, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6252357.0, + "repeat_count": 0.0, + "routers_loss": 0.0085759861394763, + "skip_count": 1.0, + "step": 3878, + "text_loss": 0.49212515354156494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007505805481563477, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6254975.0, + "repeat_count": 0.0, + "routers_loss": 0.0010723904706537724, + "skip_count": 0.0, + "step": 3880, + "text_loss": 0.7022985816001892 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0007503126588035887, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 6258001.0, + "repeat_count": 1.0, + "routers_loss": 0.012809890322387218, + "skip_count": 2.0, + "step": 3882, + "text_loss": 0.1829151213169098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007500446735305466, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 6261795.0, + "repeat_count": 0.0, + "routers_loss": 0.0026790346018970013, + "skip_count": 1.0, + "step": 3884, + "text_loss": 0.20436066389083862 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.000749776592439914, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 6265585.0, + "repeat_count": 1.0, + "routers_loss": 0.005243788007646799, + "skip_count": 2.0, + "step": 3886, + "text_loss": 0.4479229748249054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00074950841563442, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 6269039.0, + "repeat_count": 0.0, + "routers_loss": 0.007998534478247166, + "skip_count": 1.0, + "step": 3888, + "text_loss": 0.2154676914215088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0007492401432168303, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6272315.0, + "repeat_count": 0.0, + "routers_loss": 0.004648822825402021, + "skip_count": 1.0, + "step": 3890, + "text_loss": 0.3375042676925659 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.272380393307895, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007489717752899477, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 6275342.0, + "repeat_count": 0.0, + "routers_loss": 0.012154200114309788, + "skip_count": 1.0, + "step": 3892, + "text_loss": 0.1964082419872284 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.000748703311956611, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6278700.0, + "repeat_count": 1.0, + "routers_loss": 0.004610476549714804, + "skip_count": 2.0, + "step": 3894, + "text_loss": 0.26545581221580505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0007484347533196961, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 6281864.0, + "repeat_count": 0.0, + "routers_loss": 0.0075586591847240925, + "skip_count": 2.0, + "step": 3896, + "text_loss": 0.3106999397277832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0007481660994821151, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6284676.0, + "repeat_count": 0.0, + "routers_loss": 0.007845268584787846, + "skip_count": 1.0, + "step": 3898, + "text_loss": 0.4094304144382477 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007478973505468165, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 6287470.0, + "repeat_count": 1.0, + "routers_loss": 0.011116391979157925, + "skip_count": 2.0, + "step": 3900, + "text_loss": 0.1838909536600113 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007476285066167857, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 6290432.0, + "repeat_count": 1.0, + "routers_loss": 0.004599364474415779, + "skip_count": 0.0, + "step": 3902, + "text_loss": 0.25872838497161865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0007473595677950439, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 6293557.0, + "repeat_count": 0.0, + "routers_loss": 0.0016367282951250672, + "skip_count": 1.0, + "step": 3904, + "text_loss": 0.5272360444068909 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007470905341846492, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 6295979.0, + "repeat_count": 0.0, + "routers_loss": 0.0004760588926728815, + "skip_count": 0.0, + "step": 3906, + "text_loss": 0.666959822177887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007468214058886956, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6299215.0, + "repeat_count": 0.0, + "routers_loss": 0.000524883100297302, + "skip_count": 0.0, + "step": 3908, + "text_loss": 0.5144801139831543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007465521830103137, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6302320.0, + "repeat_count": 0.0, + "routers_loss": 0.0016085522947832942, + "skip_count": 0.0, + "step": 3910, + "text_loss": 0.14342890679836273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007462828656526702, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6305212.0, + "repeat_count": 0.0, + "routers_loss": 0.002720315707847476, + "skip_count": 2.0, + "step": 3912, + "text_loss": 0.31109121441841125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0007460134539189681, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 6308964.0, + "repeat_count": 0.0, + "routers_loss": 0.0010418406454846263, + "skip_count": 1.0, + "step": 3914, + "text_loss": 0.5662030577659607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0007457439479124459, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 6313195.0, + "repeat_count": 0.0, + "routers_loss": 0.0020303844939917326, + "skip_count": 0.0, + "step": 3916, + "text_loss": 0.6358339190483093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007454743477363797, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 6315949.0, + "repeat_count": 0.0, + "routers_loss": 0.0006592223653569818, + "skip_count": 0.0, + "step": 3918, + "text_loss": 0.35648423433303833 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.403874376284122, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007452046534940803, + "loss": 0.0075, + "macro_f1": 0.6603773832321167, + "num_tokens": 6319024.0, + "repeat_count": 1.0, + "routers_loss": 0.024555351585149765, + "skip_count": 1.0, + "step": 3920, + "text_loss": 0.21955153346061707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0007449348652888952, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6321633.0, + "repeat_count": 0.0, + "routers_loss": 0.003606822807341814, + "skip_count": 1.0, + "step": 3922, + "text_loss": 0.6079489588737488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007446649832242075, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 6325209.0, + "repeat_count": 0.0, + "routers_loss": 0.0035831446293741465, + "skip_count": 1.0, + "step": 3924, + "text_loss": 0.2774808406829834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0007443950074034368, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6327822.0, + "repeat_count": 0.0, + "routers_loss": 0.006809544749557972, + "skip_count": 2.0, + "step": 3926, + "text_loss": 0.48236769437789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.4414440857059, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0007441249379300381, + "loss": 0.007, + "macro_f1": 0.6601307392120361, + "num_tokens": 6331662.0, + "repeat_count": 1.0, + "routers_loss": 0.023832591250538826, + "skip_count": 2.0, + "step": 3928, + "text_loss": 0.7287537455558777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007438547749075028, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 6335801.0, + "repeat_count": 1.0, + "routers_loss": 0.011755098588764668, + "skip_count": 3.0, + "step": 3930, + "text_loss": 0.17253030836582184 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0007435845184393577, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6338747.0, + "repeat_count": 1.0, + "routers_loss": 0.005972472485154867, + "skip_count": 0.0, + "step": 3932, + "text_loss": 0.6400216817855835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007433141686291657, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 6342772.0, + "repeat_count": 0.0, + "routers_loss": 0.0030393085908144712, + "skip_count": 1.0, + "step": 3934, + "text_loss": 0.6865074038505554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0007430437255805252, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6345957.0, + "repeat_count": 0.0, + "routers_loss": 0.0006984061910770833, + "skip_count": 0.0, + "step": 3936, + "text_loss": 0.40398702025413513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0007427731893970706, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 6349162.0, + "repeat_count": 1.0, + "routers_loss": 0.005219762213528156, + "skip_count": 0.0, + "step": 3938, + "text_loss": 0.5951031446456909 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007425025601824717, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 6352655.0, + "repeat_count": 0.0, + "routers_loss": 0.015575960278511047, + "skip_count": 3.0, + "step": 3940, + "text_loss": 0.26689088344573975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007422318380404346, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6355890.0, + "repeat_count": 0.0, + "routers_loss": 0.0012208883417770267, + "skip_count": 0.0, + "step": 3942, + "text_loss": 0.570725679397583 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0007419610230746999, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6358891.0, + "repeat_count": 1.0, + "routers_loss": 0.0029412026051431894, + "skip_count": 0.0, + "step": 3944, + "text_loss": 0.5521301031112671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007416901153890448, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6361586.0, + "repeat_count": 0.0, + "routers_loss": 0.0010283910669386387, + "skip_count": 0.0, + "step": 3946, + "text_loss": 0.4046417772769928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0007414191150872818, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6364954.0, + "repeat_count": 0.0, + "routers_loss": 0.008222512900829315, + "skip_count": 2.0, + "step": 3948, + "text_loss": 0.2803446352481842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007411480222732583, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6367660.0, + "repeat_count": 0.0, + "routers_loss": 0.001304348581470549, + "skip_count": 0.0, + "step": 3950, + "text_loss": 0.45553359389305115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007408768370508576, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6371585.0, + "repeat_count": 0.0, + "routers_loss": 0.0016345062758773565, + "skip_count": 0.0, + "step": 3952, + "text_loss": 0.25424402952194214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007406055595239986, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6374365.0, + "repeat_count": 0.0, + "routers_loss": 0.0005097290268167853, + "skip_count": 0.0, + "step": 3954, + "text_loss": 0.5856026411056519 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0007403341897966356, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6377335.0, + "repeat_count": 0.0, + "routers_loss": 0.002482263371348381, + "skip_count": 1.0, + "step": 3956, + "text_loss": 0.5145615339279175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0007400627279727574, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 6380799.0, + "repeat_count": 0.0, + "routers_loss": 0.0011743451468646526, + "skip_count": 0.0, + "step": 3958, + "text_loss": 0.31868961453437805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007397911741563892, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6383963.0, + "repeat_count": 1.0, + "routers_loss": 0.009861881844699383, + "skip_count": 0.0, + "step": 3960, + "text_loss": 0.21192194521427155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007395195284515905, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 6387410.0, + "repeat_count": 1.0, + "routers_loss": 0.004189098719507456, + "skip_count": 0.0, + "step": 3962, + "text_loss": 0.5809708833694458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007392477909624567, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6390670.0, + "repeat_count": 0.0, + "routers_loss": 0.001853612600825727, + "skip_count": 0.0, + "step": 3964, + "text_loss": 0.48985618352890015 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0007389759617931182, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6393609.0, + "repeat_count": 1.0, + "routers_loss": 0.003303771372884512, + "skip_count": 0.0, + "step": 3966, + "text_loss": 0.28729453682899475 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.629292632814792, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007387040410477404, + "loss": 0.0058, + "macro_f1": 0.9452888369560242, + "num_tokens": 6396608.0, + "repeat_count": 1.0, + "routers_loss": 0.01791577786207199, + "skip_count": 4.0, + "step": 3968, + "text_loss": 0.30386820435523987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0007384320288305235, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6399793.0, + "repeat_count": 0.0, + "routers_loss": 0.0005771282012574375, + "skip_count": 0.0, + "step": 3970, + "text_loss": 0.47285011410713196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0007381599252457037, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6403365.0, + "repeat_count": 0.0, + "routers_loss": 0.003010645741596818, + "skip_count": 0.0, + "step": 3972, + "text_loss": 0.5313063859939575 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000737887730397551, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6406205.0, + "repeat_count": 1.0, + "routers_loss": 0.006457438692450523, + "skip_count": 0.0, + "step": 3974, + "text_loss": 0.2323843240737915 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007376154443903713, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6409552.0, + "repeat_count": 1.0, + "routers_loss": 0.010693981312215328, + "skip_count": 0.0, + "step": 3976, + "text_loss": 0.6304101943969727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.676254769592017, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007373430673285051, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 6412386.0, + "repeat_count": 1.0, + "routers_loss": 0.03116440214216709, + "skip_count": 0.0, + "step": 3978, + "text_loss": 0.23448467254638672 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.68564719694746, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007370705993163278, + "loss": 0.0111, + "macro_f1": 0.3272727429866791, + "num_tokens": 6416054.0, + "repeat_count": 1.0, + "routers_loss": 0.011973714455962181, + "skip_count": 0.0, + "step": 3980, + "text_loss": 0.6371755599975586 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007367980404582497, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 6419238.0, + "repeat_count": 1.0, + "routers_loss": 0.005117347463965416, + "skip_count": 2.0, + "step": 3982, + "text_loss": 0.19822923839092255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0007365253908587158, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 6422122.0, + "repeat_count": 0.0, + "routers_loss": 0.0010648667812347412, + "skip_count": 0.0, + "step": 3984, + "text_loss": 0.566700279712677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0007362526506222058, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6425313.0, + "repeat_count": 0.0, + "routers_loss": 0.005726494826376438, + "skip_count": 0.0, + "step": 3986, + "text_loss": 0.6568437814712524 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007359798198532343, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 6428422.0, + "repeat_count": 1.0, + "routers_loss": 0.004504100419580936, + "skip_count": 0.0, + "step": 3988, + "text_loss": 0.598754346370697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007357068986563509, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 6431512.0, + "repeat_count": 0.0, + "routers_loss": 0.0019837068393826485, + "skip_count": 1.0, + "step": 3990, + "text_loss": 0.7152895927429199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007354338871361393, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6434358.0, + "repeat_count": 0.0, + "routers_loss": 0.0026031541638076305, + "skip_count": 1.0, + "step": 3992, + "text_loss": 0.4986513555049896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.000735160785397218, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6438175.0, + "repeat_count": 0.0, + "routers_loss": 0.0024831905029714108, + "skip_count": 2.0, + "step": 3994, + "text_loss": 0.4406205713748932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007348875935442401, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6441228.0, + "repeat_count": 0.0, + "routers_loss": 0.0008635876583866775, + "skip_count": 0.0, + "step": 3996, + "text_loss": 0.48884135484695435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007346143116818932, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6444318.0, + "repeat_count": 0.0, + "routers_loss": 0.004007008858025074, + "skip_count": 0.0, + "step": 3998, + "text_loss": 0.6669428944587708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0007343409399148994, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6448317.0, + "repeat_count": 0.0, + "routers_loss": 0.0031380734872072935, + "skip_count": 0.0, + "step": 4000, + "text_loss": 0.6468493938446045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0007340674783480154, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 6451673.0, + "repeat_count": 0.0, + "routers_loss": 0.004996029660105705, + "skip_count": 0.0, + "step": 4002, + "text_loss": 0.28135430812835693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.798356325212797, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007337939270860323, + "loss": 0.009, + "macro_f1": 0.3272727429866791, + "num_tokens": 6456372.0, + "repeat_count": 1.0, + "routers_loss": 0.03784399852156639, + "skip_count": 0.0, + "step": 4004, + "text_loss": 0.41668644547462463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007335202862337753, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6459047.0, + "repeat_count": 0.0, + "routers_loss": 0.0011750755365937948, + "skip_count": 0.0, + "step": 4006, + "text_loss": 0.6853910684585571 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.817141179923688, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.05908203125, + "learning_rate": 0.000733246555896104, + "loss": 0.0062, + "macro_f1": 0.9452888369560242, + "num_tokens": 6462390.0, + "repeat_count": 1.0, + "routers_loss": 0.01630394533276558, + "skip_count": 4.0, + "step": 4008, + "text_loss": 0.7110592126846313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0007329727361779124, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6466057.0, + "repeat_count": 0.0, + "routers_loss": 0.0052404399029910564, + "skip_count": 2.0, + "step": 4010, + "text_loss": 0.13856995105743408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.000732698827184129, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6468878.0, + "repeat_count": 0.0, + "routers_loss": 0.002138581359758973, + "skip_count": 0.0, + "step": 4012, + "text_loss": 0.3999565839767456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000732424829019716, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6472364.0, + "repeat_count": 0.0, + "routers_loss": 0.0037466560024768114, + "skip_count": 0.0, + "step": 4014, + "text_loss": 0.28161346912384033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007321507417896699, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 6475379.0, + "repeat_count": 0.0, + "routers_loss": 0.0010469373082742095, + "skip_count": 0.0, + "step": 4016, + "text_loss": 1.0490952730178833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0007318765655990218, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 6478585.0, + "repeat_count": 0.0, + "routers_loss": 0.009968385100364685, + "skip_count": 2.0, + "step": 4018, + "text_loss": 0.31696680188179016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0007316023005528362, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 6484153.0, + "repeat_count": 0.0, + "routers_loss": 0.002349073765799403, + "skip_count": 1.0, + "step": 4020, + "text_loss": 0.30981555581092834 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.8828881714118, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0007313279467562124, + "loss": 0.0053, + "macro_f1": 0.9452888369560242, + "num_tokens": 6487029.0, + "repeat_count": 1.0, + "routers_loss": 0.011854278855025768, + "skip_count": 4.0, + "step": 4022, + "text_loss": 0.9689550399780273 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007310535043142829, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 6490315.0, + "repeat_count": 1.0, + "routers_loss": 0.00908346101641655, + "skip_count": 3.0, + "step": 4024, + "text_loss": 0.1705625057220459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0007307789733322146, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 6493921.0, + "repeat_count": 0.0, + "routers_loss": 0.0007360641611739993, + "skip_count": 0.0, + "step": 4026, + "text_loss": 0.6252996325492859 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.087890625, + "learning_rate": 0.0007305043539152083, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6496689.0, + "repeat_count": 0.0, + "routers_loss": 0.0017757206223905087, + "skip_count": 0.0, + "step": 4028, + "text_loss": 0.40533265471458435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000730229646168499, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6500090.0, + "repeat_count": 0.0, + "routers_loss": 0.0022657213266938925, + "skip_count": 0.0, + "step": 4030, + "text_loss": 0.25954708456993103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007299548501973548, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6503023.0, + "repeat_count": 0.0, + "routers_loss": 0.0021747269202023745, + "skip_count": 0.0, + "step": 4032, + "text_loss": 0.6223418712615967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 18.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0007296799661070782, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6506382.0, + "repeat_count": 0.0, + "routers_loss": 0.006400502752512693, + "skip_count": 4.0, + "step": 4034, + "text_loss": 0.6873653531074524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.94863516289991, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0007294049940030055, + "loss": 0.0065, + "macro_f1": 0.3272727429866791, + "num_tokens": 6509194.0, + "repeat_count": 0.0, + "routers_loss": 0.0197185929864645, + "skip_count": 1.0, + "step": 4036, + "text_loss": 0.16156800091266632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007291299339905059, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6512271.0, + "repeat_count": 0.0, + "routers_loss": 0.0009541353792883456, + "skip_count": 0.0, + "step": 4038, + "text_loss": 0.5038442015647888 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007288547861749838, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 6516403.0, + "repeat_count": 0.0, + "routers_loss": 0.008226391859352589, + "skip_count": 2.0, + "step": 4040, + "text_loss": 0.3706657588481903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007285795506618758, + "loss": 0.0063, + "macro_f1": 0.3272727429866791, + "num_tokens": 6519310.0, + "repeat_count": 0.0, + "routers_loss": 0.017001887783408165, + "skip_count": 1.0, + "step": 4042, + "text_loss": 0.24296723306179047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0007283042275566528, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 6521979.0, + "repeat_count": 0.0, + "routers_loss": 0.01666323095560074, + "skip_count": 2.0, + "step": 4044, + "text_loss": 0.36904850602149963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0007280288169648192, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 6524976.0, + "repeat_count": 0.0, + "routers_loss": 0.0007593175978399813, + "skip_count": 0.0, + "step": 4046, + "text_loss": 0.7312731146812439 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 19.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0007277533189919127, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 6528638.0, + "repeat_count": 1.0, + "routers_loss": 0.005652119871228933, + "skip_count": 1.0, + "step": 4048, + "text_loss": 0.23326151072978973 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007274777337435046, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6532193.0, + "repeat_count": 0.0, + "routers_loss": 0.010509157553315163, + "skip_count": 2.0, + "step": 4050, + "text_loss": 0.23918013274669647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007272020613251999, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 6534994.0, + "repeat_count": 0.0, + "routers_loss": 0.002153293928131461, + "skip_count": 0.0, + "step": 4052, + "text_loss": 0.5890526175498962 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0007269263018426367, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 6537469.0, + "repeat_count": 1.0, + "routers_loss": 0.0018494052346795797, + "skip_count": 2.0, + "step": 4054, + "text_loss": 0.36058738827705383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0007266504554014866, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6541271.0, + "repeat_count": 0.0, + "routers_loss": 0.0007579320226795971, + "skip_count": 0.0, + "step": 4056, + "text_loss": 0.4089007079601288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.051658350454947, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007263745221074545, + "loss": 0.0086, + "macro_f1": 0.6601307392120361, + "num_tokens": 6544293.0, + "repeat_count": 1.0, + "routers_loss": 0.06202420964837074, + "skip_count": 2.0, + "step": 4058, + "text_loss": 0.2226305454969406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 19.06105077781039, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007260985020662784, + "loss": 0.0049, + "macro_f1": 0.5934640765190125, + "num_tokens": 6547640.0, + "repeat_count": 0.0, + "routers_loss": 0.044639844447374344, + "skip_count": 3.0, + "step": 4060, + "text_loss": 0.23004353046417236 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.0007258223953837298, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 6550840.0, + "repeat_count": 1.0, + "routers_loss": 0.004215611144900322, + "skip_count": 0.0, + "step": 4062, + "text_loss": 0.2891770601272583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0007255462021656132, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6554122.0, + "repeat_count": 0.0, + "routers_loss": 0.0011056234361603856, + "skip_count": 0.0, + "step": 4064, + "text_loss": 0.7485370635986328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007252699225177666, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6557138.0, + "repeat_count": 0.0, + "routers_loss": 0.008258933201432228, + "skip_count": 2.0, + "step": 4066, + "text_loss": 0.25219282507896423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007249935565460606, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6560654.0, + "repeat_count": 0.0, + "routers_loss": 0.005102175287902355, + "skip_count": 0.0, + "step": 4068, + "text_loss": 0.5553314089775085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007247171043563994, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6563814.0, + "repeat_count": 0.0, + "routers_loss": 0.01283820066601038, + "skip_count": 2.0, + "step": 4070, + "text_loss": 0.15729956328868866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0007244405660547199, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6567060.0, + "repeat_count": 0.0, + "routers_loss": 0.0009684927063062787, + "skip_count": 0.0, + "step": 4072, + "text_loss": 0.3725031912326813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01953125, + "learning_rate": 0.000724163941746992, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6571608.0, + "repeat_count": 0.0, + "routers_loss": 0.0007890827837400138, + "skip_count": 0.0, + "step": 4074, + "text_loss": 0.8438301682472229 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 19.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0007238872315392189, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 6575214.0, + "repeat_count": 1.0, + "routers_loss": 0.0040600355714559555, + "skip_count": 1.0, + "step": 4076, + "text_loss": 0.5923112034797668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0007236104355374363, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 6578383.0, + "repeat_count": 0.0, + "routers_loss": 0.0024899677373468876, + "skip_count": 2.0, + "step": 4078, + "text_loss": 0.20302526652812958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.000723333553847713, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6582175.0, + "repeat_count": 0.0, + "routers_loss": 0.006120906211435795, + "skip_count": 2.0, + "step": 4080, + "text_loss": 0.5400223731994629 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0007230565865761504, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 6585516.0, + "repeat_count": 0.0, + "routers_loss": 0.0029941233806312084, + "skip_count": 0.0, + "step": 4082, + "text_loss": 0.19460804760456085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0007227795338288831, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 6588266.0, + "repeat_count": 0.0, + "routers_loss": 0.009357884526252747, + "skip_count": 2.0, + "step": 4084, + "text_loss": 0.35237613320350647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007225023957120782, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 6591009.0, + "repeat_count": 0.0, + "routers_loss": 0.0023083325941115618, + "skip_count": 2.0, + "step": 4086, + "text_loss": 0.4336731433868408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0007222251723319356, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 6594472.0, + "repeat_count": 0.0, + "routers_loss": 0.0008416616474278271, + "skip_count": 0.0, + "step": 4088, + "text_loss": 0.6390535831451416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0007219478637946877, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6597477.0, + "repeat_count": 0.0, + "routers_loss": 0.004390760324895382, + "skip_count": 1.0, + "step": 4090, + "text_loss": 0.525839626789093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0007216704702065997, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 6600431.0, + "repeat_count": 0.0, + "routers_loss": 0.0010311100631952286, + "skip_count": 0.0, + "step": 4092, + "text_loss": 0.5310423374176025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0007213929916739695, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 6603899.0, + "repeat_count": 0.0, + "routers_loss": 0.0032497600186616182, + "skip_count": 1.0, + "step": 4094, + "text_loss": 0.2775326073169708 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000721115428303127, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 6606544.0, + "repeat_count": 1.0, + "routers_loss": 0.004692315589636564, + "skip_count": 3.0, + "step": 4096, + "text_loss": 0.6667124032974243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007208377802004353, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6610097.0, + "repeat_count": 0.0, + "routers_loss": 0.0007263485458679497, + "skip_count": 0.0, + "step": 4098, + "text_loss": 0.6916406750679016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007205600474722897, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6613836.0, + "repeat_count": 0.0, + "routers_loss": 0.0017989488551393151, + "skip_count": 0.0, + "step": 4100, + "text_loss": 0.5257929563522339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000720282230225118, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6616780.0, + "repeat_count": 0.0, + "routers_loss": 0.0011308686807751656, + "skip_count": 1.0, + "step": 4102, + "text_loss": 0.4410906732082367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0007200043285653799, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6620110.0, + "repeat_count": 0.0, + "routers_loss": 0.002058265497907996, + "skip_count": 2.0, + "step": 4104, + "text_loss": 0.8581191897392273 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007197263425995681, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 6622585.0, + "repeat_count": 1.0, + "routers_loss": 0.0017528717871755362, + "skip_count": 0.0, + "step": 4106, + "text_loss": 0.5000449419021606 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0007194482724342075, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6626356.0, + "repeat_count": 0.0, + "routers_loss": 0.0021995846182107925, + "skip_count": 0.0, + "step": 4108, + "text_loss": 0.401346892118454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007191701181758547, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6629738.0, + "repeat_count": 0.0, + "routers_loss": 0.0014869922306388617, + "skip_count": 0.0, + "step": 4110, + "text_loss": 0.9598422050476074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0007188918799310993, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 6632807.0, + "repeat_count": 0.0, + "routers_loss": 0.0012853415682911873, + "skip_count": 0.0, + "step": 4112, + "text_loss": 0.3996548354625702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0007186135578065627, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6636227.0, + "repeat_count": 0.0, + "routers_loss": 0.0009887361666187644, + "skip_count": 0.0, + "step": 4114, + "text_loss": 0.4127283990383148 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007183351519088982, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6639443.0, + "repeat_count": 0.0, + "routers_loss": 0.006282114889472723, + "skip_count": 1.0, + "step": 4116, + "text_loss": 0.20028606057167053 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.333431171118285, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0007180566623447917, + "loss": 0.0114, + "macro_f1": 0.6603773832321167, + "num_tokens": 6642127.0, + "repeat_count": 1.0, + "routers_loss": 0.008101986721158028, + "skip_count": 0.0, + "step": 4118, + "text_loss": 0.763931155204773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0007177780892209607, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6645376.0, + "repeat_count": 0.0, + "routers_loss": 0.001953610684722662, + "skip_count": 0.0, + "step": 4120, + "text_loss": 0.42317715287208557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007174994326441551, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6648150.0, + "repeat_count": 0.0, + "routers_loss": 0.003279355587437749, + "skip_count": 0.0, + "step": 4122, + "text_loss": 0.19656142592430115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007172206927211567, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6650935.0, + "repeat_count": 0.0, + "routers_loss": 0.0032076311763375998, + "skip_count": 0.0, + "step": 4124, + "text_loss": 0.13608409464359283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0007169418695587791, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6654464.0, + "repeat_count": 0.0, + "routers_loss": 0.004065621178597212, + "skip_count": 2.0, + "step": 4126, + "text_loss": 0.4882086217403412 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007166629632638678, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6657749.0, + "repeat_count": 0.0, + "routers_loss": 0.0009243001695722342, + "skip_count": 0.0, + "step": 4128, + "text_loss": 0.31632331013679504 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0007163839739433003, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6660997.0, + "repeat_count": 0.0, + "routers_loss": 0.0018459554994478822, + "skip_count": 0.0, + "step": 4130, + "text_loss": 0.6123947501182556 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.399178162606397, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0007161049017039857, + "loss": 0.0073, + "macro_f1": 0.8820862174034119, + "num_tokens": 6663542.0, + "repeat_count": 2.0, + "routers_loss": 0.030032536014914513, + "skip_count": 2.0, + "step": 4132, + "text_loss": 0.6985659003257751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0007158257466528652, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6666178.0, + "repeat_count": 0.0, + "routers_loss": 0.0013813833938911557, + "skip_count": 0.0, + "step": 4134, + "text_loss": 0.38380664587020874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0007155465088969114, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 6668852.0, + "repeat_count": 0.0, + "routers_loss": 0.00513424864038825, + "skip_count": 3.0, + "step": 4136, + "text_loss": 0.49724283814430237 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0007152671885431288, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 6671430.0, + "repeat_count": 0.0, + "routers_loss": 0.0005165594047866762, + "skip_count": 0.0, + "step": 4138, + "text_loss": 0.666959822177887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0007149877856985535, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6675215.0, + "repeat_count": 0.0, + "routers_loss": 0.001685218419879675, + "skip_count": 0.0, + "step": 4140, + "text_loss": 0.3127259612083435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.000714708300470253, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6678505.0, + "repeat_count": 0.0, + "routers_loss": 0.004025314934551716, + "skip_count": 0.0, + "step": 4142, + "text_loss": 0.3179470896720886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007144287329653269, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 6681127.0, + "repeat_count": 1.0, + "routers_loss": 0.005965690594166517, + "skip_count": 0.0, + "step": 4144, + "text_loss": 0.3862907886505127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.464925154094512, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007141490832909058, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 6683968.0, + "repeat_count": 0.0, + "routers_loss": 0.012896374799311161, + "skip_count": 1.0, + "step": 4146, + "text_loss": 0.48156118392944336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007138693515541519, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6687196.0, + "repeat_count": 0.0, + "routers_loss": 0.0006367767928168178, + "skip_count": 1.0, + "step": 4148, + "text_loss": 0.676702082157135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0007135895378622592, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 6689972.0, + "repeat_count": 0.0, + "routers_loss": 0.004532640799880028, + "skip_count": 3.0, + "step": 4150, + "text_loss": 0.5865558981895447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.493102436160846, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007133096423224526, + "loss": 0.0081, + "macro_f1": 0.3272727429866791, + "num_tokens": 6693568.0, + "repeat_count": 1.0, + "routers_loss": 0.0377078577876091, + "skip_count": 0.0, + "step": 4152, + "text_loss": 0.2790502607822418 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0007130296650419885, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6696468.0, + "repeat_count": 0.0, + "routers_loss": 0.004455826710909605, + "skip_count": 1.0, + "step": 4154, + "text_loss": 0.5869500041007996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0007127496061281551, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6699307.0, + "repeat_count": 0.0, + "routers_loss": 0.001998464809730649, + "skip_count": 0.0, + "step": 4156, + "text_loss": 0.6931945085525513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 19.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007124694656882713, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 6702647.0, + "repeat_count": 3.0, + "routers_loss": 0.004117495380342007, + "skip_count": 0.0, + "step": 4158, + "text_loss": 0.4325876832008362 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0007121892438296874, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6705964.0, + "repeat_count": 0.0, + "routers_loss": 0.0014713290147483349, + "skip_count": 0.0, + "step": 4160, + "text_loss": 0.3672060966491699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007119089406597849, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6710182.0, + "repeat_count": 0.0, + "routers_loss": 0.0037311650812625885, + "skip_count": 1.0, + "step": 4162, + "text_loss": 0.6643805503845215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007116285562859767, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6713410.0, + "repeat_count": 0.0, + "routers_loss": 0.006017287727445364, + "skip_count": 0.0, + "step": 4164, + "text_loss": 0.4606415927410126 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.55884942764896, + "f1_execute": 0.9545454382896423, + "f1_repeat": 0.5, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007113480908157065, + "loss": 0.0108, + "macro_f1": 0.8181818723678589, + "num_tokens": 6716056.0, + "repeat_count": 3.0, + "routers_loss": 0.08640352636575699, + "skip_count": 4.0, + "step": 4166, + "text_loss": 0.3139408528804779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0007110675443564491, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6719497.0, + "repeat_count": 0.0, + "routers_loss": 0.0012731150491163135, + "skip_count": 0.0, + "step": 4168, + "text_loss": 0.7283861637115479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007107869170157108, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 6722297.0, + "repeat_count": 0.0, + "routers_loss": 0.0021509863436222076, + "skip_count": 2.0, + "step": 4170, + "text_loss": 0.5767703056335449 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.000710506208901028, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6725762.0, + "repeat_count": 0.0, + "routers_loss": 0.00257494836114347, + "skip_count": 1.0, + "step": 4172, + "text_loss": 0.33571913838386536 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.000710225420119969, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 6728436.0, + "repeat_count": 1.0, + "routers_loss": 0.00943201594054699, + "skip_count": 3.0, + "step": 4174, + "text_loss": 0.6849368810653687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0007099445507801323, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6731427.0, + "repeat_count": 0.0, + "routers_loss": 0.01046718005090952, + "skip_count": 2.0, + "step": 4176, + "text_loss": 0.3346157670021057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007096636009891477, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6734800.0, + "repeat_count": 0.0, + "routers_loss": 0.0007813365664333105, + "skip_count": 0.0, + "step": 4178, + "text_loss": 0.49989959597587585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.000709382570854676, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6738244.0, + "repeat_count": 0.0, + "routers_loss": 0.002825600327923894, + "skip_count": 0.0, + "step": 4180, + "text_loss": 0.15744923055171967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007091014604844078, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6741695.0, + "repeat_count": 0.0, + "routers_loss": 0.0017124463338404894, + "skip_count": 0.0, + "step": 4182, + "text_loss": 0.3752405643463135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0007088202699860655, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 6744882.0, + "repeat_count": 1.0, + "routers_loss": 0.005134924780577421, + "skip_count": 3.0, + "step": 4184, + "text_loss": 0.18534569442272186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.000708538999467402, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6747811.0, + "repeat_count": 0.0, + "routers_loss": 0.002371585462242365, + "skip_count": 1.0, + "step": 4186, + "text_loss": 0.6251029968261719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007082576490362004, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6750765.0, + "repeat_count": 0.0, + "routers_loss": 0.002088436856865883, + "skip_count": 0.0, + "step": 4188, + "text_loss": 0.35471436381340027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.000707976218800275, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6754021.0, + "repeat_count": 0.0, + "routers_loss": 0.0012272283202037215, + "skip_count": 0.0, + "step": 4190, + "text_loss": 0.5737302899360657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0007076947088674701, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6756793.0, + "repeat_count": 0.0, + "routers_loss": 0.0026050808373838663, + "skip_count": 0.0, + "step": 4192, + "text_loss": 0.526336669921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.000707413119345661, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 6760221.0, + "repeat_count": 0.0, + "routers_loss": 0.0013151296880096197, + "skip_count": 0.0, + "step": 4194, + "text_loss": 0.5678895711898804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0007071314503427532, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6763721.0, + "repeat_count": 0.0, + "routers_loss": 0.001528652966953814, + "skip_count": 0.0, + "step": 4196, + "text_loss": 0.7640175223350525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0007068497019666829, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6768581.0, + "repeat_count": 0.0, + "routers_loss": 0.0019202446565032005, + "skip_count": 0.0, + "step": 4198, + "text_loss": 0.41878414154052734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0007065678743254167, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6772758.0, + "repeat_count": 0.0, + "routers_loss": 0.004667408298701048, + "skip_count": 1.0, + "step": 4200, + "text_loss": 0.3550313413143158 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 19.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0007062859675269513, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6776671.0, + "repeat_count": 3.0, + "routers_loss": 0.00568761583417654, + "skip_count": 0.0, + "step": 4202, + "text_loss": 0.1707649976015091 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007060039816793141, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6780284.0, + "repeat_count": 0.0, + "routers_loss": 0.0030401297844946384, + "skip_count": 0.0, + "step": 4204, + "text_loss": 0.2686377167701721 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 19.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007057219168905625, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 6783525.0, + "repeat_count": 1.0, + "routers_loss": 0.003353122156113386, + "skip_count": 5.0, + "step": 4206, + "text_loss": 0.5235374569892883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.000705439773268784, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6787691.0, + "repeat_count": 0.0, + "routers_loss": 0.0016532237641513348, + "skip_count": 1.0, + "step": 4208, + "text_loss": 0.5002681612968445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007051575509220972, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 6790833.0, + "repeat_count": 0.0, + "routers_loss": 0.0011808308772742748, + "skip_count": 0.0, + "step": 4210, + "text_loss": 0.7251001596450806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0007048752499586497, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 6794260.0, + "repeat_count": 0.0, + "routers_loss": 0.006246297620236874, + "skip_count": 2.0, + "step": 4212, + "text_loss": 0.2430499643087387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.00070459287048662, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6797413.0, + "repeat_count": 0.0, + "routers_loss": 0.0012964420020580292, + "skip_count": 0.0, + "step": 4214, + "text_loss": 0.48889362812042236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0007043104126142163, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6800815.0, + "repeat_count": 0.0, + "routers_loss": 0.0018109704833477736, + "skip_count": 0.0, + "step": 4216, + "text_loss": 0.5617026686668396 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 19.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0007040278764496771, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 6803937.0, + "repeat_count": 2.0, + "routers_loss": 0.0028699536342173815, + "skip_count": 1.0, + "step": 4218, + "text_loss": 0.548405647277832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007037452621012708, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6806946.0, + "repeat_count": 0.0, + "routers_loss": 0.0007951617590151727, + "skip_count": 0.0, + "step": 4220, + "text_loss": 0.5702725648880005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0007034625696772958, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6810083.0, + "repeat_count": 0.0, + "routers_loss": 0.003436052706092596, + "skip_count": 2.0, + "step": 4222, + "text_loss": 0.3898725211620331 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00070317979928608, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6812845.0, + "repeat_count": 0.0, + "routers_loss": 0.0005070401239208877, + "skip_count": 0.0, + "step": 4224, + "text_loss": 0.5244157910346985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.840622248312297, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000702896951035982, + "loss": 0.0101, + "macro_f1": 0.3272727429866791, + "num_tokens": 6815801.0, + "repeat_count": 0.0, + "routers_loss": 0.01560303382575512, + "skip_count": 1.0, + "step": 4226, + "text_loss": 0.26503118872642517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007026140250353896, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 6819464.0, + "repeat_count": 0.0, + "routers_loss": 0.009310240857303143, + "skip_count": 2.0, + "step": 4228, + "text_loss": 0.15597499907016754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0007023310213927208, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6822657.0, + "repeat_count": 0.0, + "routers_loss": 0.005309136584401131, + "skip_count": 0.0, + "step": 4230, + "text_loss": 0.5271651148796082 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046875, + "learning_rate": 0.0007020479402164226, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 6825661.0, + "repeat_count": 0.0, + "routers_loss": 0.005936166271567345, + "skip_count": 2.0, + "step": 4232, + "text_loss": 0.6105108857154846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007017647816149727, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6828688.0, + "repeat_count": 0.0, + "routers_loss": 0.001653556595556438, + "skip_count": 0.0, + "step": 4234, + "text_loss": 0.6966437101364136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.000701481545696878, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 6831850.0, + "repeat_count": 0.0, + "routers_loss": 0.0013501866487786174, + "skip_count": 0.0, + "step": 4236, + "text_loss": 1.259678840637207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0007011982325706747, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6834862.0, + "repeat_count": 0.0, + "routers_loss": 0.008970130234956741, + "skip_count": 1.0, + "step": 4238, + "text_loss": 0.24906545877456665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007009148423449292, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6838148.0, + "repeat_count": 0.0, + "routers_loss": 0.0026013399474322796, + "skip_count": 0.0, + "step": 4240, + "text_loss": 0.291467547416687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.915761667155856, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0007006313751282371, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 6841142.0, + "repeat_count": 0.0, + "routers_loss": 0.021415632218122482, + "skip_count": 1.0, + "step": 4242, + "text_loss": 0.507606029510498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007003478310292236, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6844042.0, + "repeat_count": 0.0, + "routers_loss": 0.0023636550176888704, + "skip_count": 0.0, + "step": 4244, + "text_loss": 0.11626995354890823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.934546521866746, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0007000642101565433, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 6847359.0, + "repeat_count": 1.0, + "routers_loss": 0.025154776871204376, + "skip_count": 0.0, + "step": 4246, + "text_loss": 0.42898693680763245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0006997805126188803, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6850443.0, + "repeat_count": 0.0, + "routers_loss": 0.00540317315608263, + "skip_count": 0.0, + "step": 4248, + "text_loss": 0.18085283041000366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000699496738524948, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 6853495.0, + "repeat_count": 0.0, + "routers_loss": 0.0014433214673772454, + "skip_count": 0.0, + "step": 4250, + "text_loss": 0.5524004697799683 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006992128879834891, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 6856774.0, + "repeat_count": 1.0, + "routers_loss": 0.013381492346525192, + "skip_count": 3.0, + "step": 4252, + "text_loss": 0.19605717062950134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006989289611032758, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 6860313.0, + "repeat_count": 0.0, + "routers_loss": 0.007140172645449638, + "skip_count": 1.0, + "step": 4254, + "text_loss": 0.3182447552680969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006986449579931091, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6863683.0, + "repeat_count": 0.0, + "routers_loss": 0.006486213766038418, + "skip_count": 1.0, + "step": 4256, + "text_loss": 0.19250160455703735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006983608787618201, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6867609.0, + "repeat_count": 0.0, + "routers_loss": 0.001465818495489657, + "skip_count": 0.0, + "step": 4258, + "text_loss": 0.5912898182868958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.000698076723518268, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6870040.0, + "repeat_count": 0.0, + "routers_loss": 0.0031106441747397184, + "skip_count": 0.0, + "step": 4260, + "text_loss": 0.13542121648788452 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006977924923713418, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6873441.0, + "repeat_count": 0.0, + "routers_loss": 0.0005377951893024147, + "skip_count": 0.0, + "step": 4262, + "text_loss": 0.352464497089386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006975081854299594, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 6876637.0, + "repeat_count": 0.0, + "routers_loss": 0.007052485831081867, + "skip_count": 0.0, + "step": 4264, + "text_loss": 0.5023844242095947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0006972238028030678, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6879928.0, + "repeat_count": 0.0, + "routers_loss": 0.0013608322478830814, + "skip_count": 0.0, + "step": 4266, + "text_loss": 0.8664718270301819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0006969393445996429, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6883425.0, + "repeat_count": 0.0, + "routers_loss": 0.0007607188890688121, + "skip_count": 0.0, + "step": 4268, + "text_loss": 0.5131992101669312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006966548109286897, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6886790.0, + "repeat_count": 0.0, + "routers_loss": 0.00035804163780994713, + "skip_count": 0.0, + "step": 4270, + "text_loss": 0.5352054834365845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.000696370201899242, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6889747.0, + "repeat_count": 0.0, + "routers_loss": 0.004451376851648092, + "skip_count": 1.0, + "step": 4272, + "text_loss": 0.47865036129951477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006960855176203623, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6892604.0, + "repeat_count": 0.0, + "routers_loss": 0.0015342880506068468, + "skip_count": 0.0, + "step": 4274, + "text_loss": 0.36278650164604187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0006958007582011425, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6895563.0, + "repeat_count": 0.0, + "routers_loss": 0.0022974940948188305, + "skip_count": 2.0, + "step": 4276, + "text_loss": 0.6695618629455566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006955159237507027, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6898591.0, + "repeat_count": 0.0, + "routers_loss": 0.00859096460044384, + "skip_count": 1.0, + "step": 4278, + "text_loss": 0.44284722208976746 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0006952310143781921, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6903119.0, + "repeat_count": 1.0, + "routers_loss": 0.007919861935079098, + "skip_count": 3.0, + "step": 4280, + "text_loss": 0.5006136298179626 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0006949460301927886, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6906394.0, + "repeat_count": 0.0, + "routers_loss": 0.0008476210059598088, + "skip_count": 0.0, + "step": 4282, + "text_loss": 0.8153555989265442 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.0006946609713036985, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 6909136.0, + "repeat_count": 0.0, + "routers_loss": 0.006711610127240419, + "skip_count": 2.0, + "step": 4284, + "text_loss": 0.43136683106422424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0006943758378201571, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 6912734.0, + "repeat_count": 0.0, + "routers_loss": 0.0038677838165313005, + "skip_count": 0.0, + "step": 4286, + "text_loss": 0.2693749964237213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0006940906298514278, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6915838.0, + "repeat_count": 0.0, + "routers_loss": 0.0012188015971332788, + "skip_count": 0.0, + "step": 4288, + "text_loss": 0.5809219479560852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0006938053475068031, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6919225.0, + "repeat_count": 0.0, + "routers_loss": 0.001955829095095396, + "skip_count": 0.0, + "step": 4290, + "text_loss": 0.5116089582443237 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006935199908956037, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 6922495.0, + "repeat_count": 1.0, + "routers_loss": 0.0035709093790501356, + "skip_count": 0.0, + "step": 4292, + "text_loss": 0.2745901644229889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0006932345601271786, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 6925317.0, + "repeat_count": 0.0, + "routers_loss": 0.0005745319649577141, + "skip_count": 0.0, + "step": 4294, + "text_loss": 0.6039219498634338 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 20.169063692398005, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0006929490553109056, + "loss": 0.0107, + "macro_f1": 0.9247862696647644, + "num_tokens": 6928054.0, + "repeat_count": 3.0, + "routers_loss": 0.061689916998147964, + "skip_count": 6.0, + "step": 4296, + "text_loss": 0.3904837667942047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006926634765561907, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 6931348.0, + "repeat_count": 0.0, + "routers_loss": 0.002007248578593135, + "skip_count": 0.0, + "step": 4298, + "text_loss": 0.5170742273330688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000692377823972468, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6934411.0, + "repeat_count": 0.0, + "routers_loss": 0.0005786226247437298, + "skip_count": 0.0, + "step": 4300, + "text_loss": 0.8032443523406982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.19724097446434, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006920920976692004, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 6938153.0, + "repeat_count": 1.0, + "routers_loss": 0.024602646008133888, + "skip_count": 0.0, + "step": 4302, + "text_loss": 0.446534663438797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006918062977558784, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6940731.0, + "repeat_count": 0.0, + "routers_loss": 0.005759815219789743, + "skip_count": 2.0, + "step": 4304, + "text_loss": 0.15479247272014618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006915204243420214, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6943246.0, + "repeat_count": 0.0, + "routers_loss": 0.005315347574651241, + "skip_count": 1.0, + "step": 4306, + "text_loss": 0.22127842903137207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006912344775371765, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6947197.0, + "repeat_count": 0.0, + "routers_loss": 0.0012061651796102524, + "skip_count": 0.0, + "step": 4308, + "text_loss": 0.7058854103088379 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006909484574509191, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6951817.0, + "repeat_count": 0.0, + "routers_loss": 0.0029203309677541256, + "skip_count": 0.0, + "step": 4310, + "text_loss": 0.6014000773429871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0006906623641928525, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6955094.0, + "repeat_count": 0.0, + "routers_loss": 0.005703397560864687, + "skip_count": 2.0, + "step": 4312, + "text_loss": 0.5923848152160645 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0006903761978726084, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 6958127.0, + "repeat_count": 1.0, + "routers_loss": 0.004489895887672901, + "skip_count": 2.0, + "step": 4314, + "text_loss": 0.36911651492118835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.000690089958599846, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 6960871.0, + "repeat_count": 0.0, + "routers_loss": 0.003871412482112646, + "skip_count": 2.0, + "step": 4316, + "text_loss": 0.442545086145401 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.000689803646484253, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6963980.0, + "repeat_count": 1.0, + "routers_loss": 0.008667866699397564, + "skip_count": 2.0, + "step": 4318, + "text_loss": 0.1987489014863968 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0006895172616355446, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6967132.0, + "repeat_count": 1.0, + "routers_loss": 0.00843339879065752, + "skip_count": 0.0, + "step": 4320, + "text_loss": 0.48267918825149536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006892308041634639, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6969971.0, + "repeat_count": 0.0, + "routers_loss": 0.0004312851815484464, + "skip_count": 0.0, + "step": 4322, + "text_loss": 0.3662732243537903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006889442741777822, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6973114.0, + "repeat_count": 0.0, + "routers_loss": 0.004588035400956869, + "skip_count": 3.0, + "step": 4324, + "text_loss": 0.6707104444503784 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.309950102729672, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0006886576717882982, + "loss": 0.0057, + "macro_f1": 0.8817967176437378, + "num_tokens": 6976013.0, + "repeat_count": 2.0, + "routers_loss": 0.0687296912074089, + "skip_count": 3.0, + "step": 4326, + "text_loss": 0.1662217676639557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006883709971048384, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6979200.0, + "repeat_count": 0.0, + "routers_loss": 0.002950174268335104, + "skip_count": 0.0, + "step": 4328, + "text_loss": 0.21168152987957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006880842502372572, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6982640.0, + "repeat_count": 0.0, + "routers_loss": 0.0032158740796148777, + "skip_count": 0.0, + "step": 4330, + "text_loss": 0.26790961623191833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0006877974312954365, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6985917.0, + "repeat_count": 0.0, + "routers_loss": 0.0005083635332994163, + "skip_count": 0.0, + "step": 4332, + "text_loss": 0.9736502170562744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.347519812151454, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.000687510540389286, + "loss": 0.0053, + "macro_f1": 0.32098764181137085, + "num_tokens": 6988388.0, + "repeat_count": 0.0, + "routers_loss": 0.03473830223083496, + "skip_count": 2.0, + "step": 4334, + "text_loss": 0.21662230789661407 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006872235776287425, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6991360.0, + "repeat_count": 0.0, + "routers_loss": 0.002206524135544896, + "skip_count": 0.0, + "step": 4336, + "text_loss": 0.6026972532272339 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0006869365431237711, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6995080.0, + "repeat_count": 1.0, + "routers_loss": 0.000969731598161161, + "skip_count": 0.0, + "step": 4338, + "text_loss": 0.5833017230033875 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.375697094217788, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006866494369843635, + "loss": 0.0054, + "macro_f1": 0.8820862174034119, + "num_tokens": 6998526.0, + "repeat_count": 2.0, + "routers_loss": 0.013962293043732643, + "skip_count": 2.0, + "step": 4340, + "text_loss": 0.41465985774993896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0006863622593205397, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 7001494.0, + "repeat_count": 0.0, + "routers_loss": 0.0064964210614562035, + "skip_count": 3.0, + "step": 4342, + "text_loss": 0.3774271011352539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 20.394481948928675, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006860750102423464, + "loss": 0.0062, + "macro_f1": 0.6589147448539734, + "num_tokens": 7005544.0, + "repeat_count": 1.0, + "routers_loss": 0.023250726982951164, + "skip_count": 6.0, + "step": 4344, + "text_loss": 0.2732464373111725 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0006857876898598582, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 7008847.0, + "repeat_count": 0.0, + "routers_loss": 0.0038170060142874718, + "skip_count": 2.0, + "step": 4346, + "text_loss": 0.29610875248908997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0006855002982831769, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7012577.0, + "repeat_count": 0.0, + "routers_loss": 0.0012856025714427233, + "skip_count": 0.0, + "step": 4348, + "text_loss": 0.6098502278327942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0006852128356224314, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7015650.0, + "repeat_count": 0.0, + "routers_loss": 0.008162742480635643, + "skip_count": 1.0, + "step": 4350, + "text_loss": 0.20868146419525146 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.432051658350456, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.0006849253019877778, + "loss": 0.0074, + "macro_f1": 0.8817967176437378, + "num_tokens": 7019925.0, + "repeat_count": 2.0, + "routers_loss": 0.023544032126665115, + "skip_count": 3.0, + "step": 4352, + "text_loss": 0.628226101398468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0006846376974893996, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 7023130.0, + "repeat_count": 0.0, + "routers_loss": 0.004982319660484791, + "skip_count": 2.0, + "step": 4354, + "text_loss": 0.7037544250488281 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0006843500222375074, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7026422.0, + "repeat_count": 1.0, + "routers_loss": 0.004015266429632902, + "skip_count": 0.0, + "step": 4356, + "text_loss": 0.22352729737758636 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 20.46022894041679, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006840622763423391, + "loss": 0.0071, + "macro_f1": 0.9449735879898071, + "num_tokens": 7029077.0, + "repeat_count": 2.0, + "routers_loss": 0.021162014454603195, + "skip_count": 4.0, + "step": 4358, + "text_loss": 0.2431403249502182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006837744599141591, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7032582.0, + "repeat_count": 0.0, + "routers_loss": 0.0007044129306450486, + "skip_count": 0.0, + "step": 4360, + "text_loss": 0.26667487621307373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0006834865730632594, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7035642.0, + "repeat_count": 0.0, + "routers_loss": 0.0067853196524083614, + "skip_count": 1.0, + "step": 4362, + "text_loss": 0.20965275168418884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006831986158999588, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7038601.0, + "repeat_count": 0.0, + "routers_loss": 0.00899333506822586, + "skip_count": 2.0, + "step": 4364, + "text_loss": 0.26860126852989197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.000682910588534603, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7042274.0, + "repeat_count": 0.0, + "routers_loss": 0.0019194348715245724, + "skip_count": 0.0, + "step": 4366, + "text_loss": 0.14046810567378998 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0006826224910775647, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 7045268.0, + "repeat_count": 1.0, + "routers_loss": 0.006915684789419174, + "skip_count": 3.0, + "step": 4368, + "text_loss": 0.5900366306304932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0006823343236392432, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7049407.0, + "repeat_count": 0.0, + "routers_loss": 0.001678116386756301, + "skip_count": 0.0, + "step": 4370, + "text_loss": 0.7868026494979858 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.000682046086330065, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7052783.0, + "repeat_count": 0.0, + "routers_loss": 0.0003459530707914382, + "skip_count": 0.0, + "step": 4372, + "text_loss": 0.6349637508392334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0006817577792604831, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7055757.0, + "repeat_count": 0.0, + "routers_loss": 0.0011729507241398096, + "skip_count": 0.0, + "step": 4374, + "text_loss": 0.43258991837501526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0006814694025409773, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 7058684.0, + "repeat_count": 0.0, + "routers_loss": 0.0006664610700681806, + "skip_count": 0.0, + "step": 4376, + "text_loss": 0.5307940244674683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.091796875, + "learning_rate": 0.0006811809562820542, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 7061902.0, + "repeat_count": 0.0, + "routers_loss": 0.004595907870680094, + "skip_count": 2.0, + "step": 4378, + "text_loss": 0.5830042362213135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0006808924405942467, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7065100.0, + "repeat_count": 0.0, + "routers_loss": 0.0032026609405875206, + "skip_count": 0.0, + "step": 4380, + "text_loss": 0.20797798037528992 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0006806038555881148, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 7068556.0, + "repeat_count": 1.0, + "routers_loss": 0.0024626904632896185, + "skip_count": 0.0, + "step": 4382, + "text_loss": 0.5791074633598328 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006803152013742448, + "loss": 0.0075, + "macro_f1": 1.0, + "num_tokens": 7071284.0, + "repeat_count": 1.0, + "routers_loss": 0.010723610408604145, + "skip_count": 2.0, + "step": 4384, + "text_loss": 0.13227243721485138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006800264780632495, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7074428.0, + "repeat_count": 1.0, + "routers_loss": 0.0011231007520109415, + "skip_count": 0.0, + "step": 4386, + "text_loss": 0.4360627233982086 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 20.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0006797376857657681, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 7078313.0, + "repeat_count": 2.0, + "routers_loss": 0.008419238030910492, + "skip_count": 1.0, + "step": 4388, + "text_loss": 0.5183924436569214 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006794488245924664, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 7081258.0, + "repeat_count": 1.0, + "routers_loss": 0.006582668516784906, + "skip_count": 3.0, + "step": 4390, + "text_loss": 0.2797473669052124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006791598946540368, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 7084527.0, + "repeat_count": 0.0, + "routers_loss": 0.00557357631623745, + "skip_count": 2.0, + "step": 4392, + "text_loss": 0.39495575428009033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0006788708960611975, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 7087675.0, + "repeat_count": 0.0, + "routers_loss": 0.007155992556363344, + "skip_count": 0.0, + "step": 4394, + "text_loss": 0.3785299062728882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.0006785818289246934, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7090171.0, + "repeat_count": 0.0, + "routers_loss": 0.0009265039698220789, + "skip_count": 0.0, + "step": 4396, + "text_loss": 0.42634522914886475 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 20.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006782926933552955, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 7092529.0, + "repeat_count": 1.0, + "routers_loss": 0.008679097518324852, + "skip_count": 7.0, + "step": 4398, + "text_loss": 0.4283660054206848 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006780034894638014, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7095141.0, + "repeat_count": 0.0, + "routers_loss": 0.002363949315622449, + "skip_count": 0.0, + "step": 4400, + "text_loss": 0.481539249420166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.000677714217361034, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7098208.0, + "repeat_count": 0.0, + "routers_loss": 0.004005146212875843, + "skip_count": 3.0, + "step": 4402, + "text_loss": 0.6443291902542114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006774248771578435, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7101681.0, + "repeat_count": 0.0, + "routers_loss": 0.0026864963583648205, + "skip_count": 0.0, + "step": 4404, + "text_loss": 0.16315312683582306 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 20.68564719694746, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006771354689651054, + "loss": 0.005, + "macro_f1": 0.9449735879898071, + "num_tokens": 7104719.0, + "repeat_count": 2.0, + "routers_loss": 0.02719845622777939, + "skip_count": 4.0, + "step": 4406, + "text_loss": 0.37855592370033264 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0006768459928937213, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7108697.0, + "repeat_count": 0.0, + "routers_loss": 0.010488593950867653, + "skip_count": 0.0, + "step": 4408, + "text_loss": 0.23133711516857147 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0006765564490546193, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7111426.0, + "repeat_count": 1.0, + "routers_loss": 0.0013637891970574856, + "skip_count": 0.0, + "step": 4410, + "text_loss": 0.41399383544921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0006762668375587528, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7114241.0, + "repeat_count": 0.0, + "routers_loss": 0.000900395680218935, + "skip_count": 0.0, + "step": 4412, + "text_loss": 0.6460412740707397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0006759771585171016, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7117031.0, + "repeat_count": 0.0, + "routers_loss": 0.0024001260753721, + "skip_count": 0.0, + "step": 4414, + "text_loss": 0.7645824551582336 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006756874120406714, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 7120766.0, + "repeat_count": 3.0, + "routers_loss": 0.005034091416746378, + "skip_count": 4.0, + "step": 4416, + "text_loss": 0.31753066182136536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0006753975982404934, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7125243.0, + "repeat_count": 0.0, + "routers_loss": 0.002483269665390253, + "skip_count": 0.0, + "step": 4418, + "text_loss": 0.5304268002510071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.751394188435572, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0006751077172276249, + "loss": 0.0052, + "macro_f1": 0.3272727429866791, + "num_tokens": 7127795.0, + "repeat_count": 0.0, + "routers_loss": 0.02676006779074669, + "skip_count": 1.0, + "step": 4420, + "text_loss": 0.22011354565620422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.000674817769113149, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7130837.0, + "repeat_count": 0.0, + "routers_loss": 0.003267093561589718, + "skip_count": 2.0, + "step": 4422, + "text_loss": 0.2906076908111572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 20.770179043146463, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.027099609375, + "learning_rate": 0.000674527754008174, + "loss": 0.0045, + "macro_f1": 0.5934640765190125, + "num_tokens": 7135090.0, + "repeat_count": 0.0, + "routers_loss": 0.022510390728712082, + "skip_count": 3.0, + "step": 4424, + "text_loss": 0.2544902563095093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006742376720238345, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 7138751.0, + "repeat_count": 0.0, + "routers_loss": 0.0011178571730852127, + "skip_count": 0.0, + "step": 4426, + "text_loss": 0.6811438798904419 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 20.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006739475232712904, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 7141762.0, + "repeat_count": 2.0, + "routers_loss": 0.005595206283032894, + "skip_count": 1.0, + "step": 4428, + "text_loss": 0.38743990659713745 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0006736573078617272, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7145235.0, + "repeat_count": 0.0, + "routers_loss": 0.002793942578136921, + "skip_count": 2.0, + "step": 4430, + "text_loss": 0.21894219517707825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0006733670259063561, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 7149042.0, + "repeat_count": 0.0, + "routers_loss": 0.006146818865090609, + "skip_count": 3.0, + "step": 4432, + "text_loss": 0.17822015285491943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 20.817141179923688, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006730766775164136, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 7152166.0, + "repeat_count": 0.0, + "routers_loss": 0.026045087724924088, + "skip_count": 2.0, + "step": 4434, + "text_loss": 0.2910420000553131 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 20.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0006727862628031618, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7155506.0, + "repeat_count": 2.0, + "routers_loss": 0.0022973387967795134, + "skip_count": 0.0, + "step": 4436, + "text_loss": 0.3502544164657593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0006724957818778882, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7158739.0, + "repeat_count": 0.0, + "routers_loss": 0.002357073128223419, + "skip_count": 1.0, + "step": 4438, + "text_loss": 0.26200664043426514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0006722052348519054, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 7161776.0, + "repeat_count": 0.0, + "routers_loss": 0.0005521026905626059, + "skip_count": 0.0, + "step": 4440, + "text_loss": 0.3922915458679199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000671914621836552, + "loss": 0.0106, + "macro_f1": 0.6666666865348816, + "num_tokens": 7164763.0, + "repeat_count": 0.0, + "routers_loss": 0.007691344246268272, + "skip_count": 2.0, + "step": 4442, + "text_loss": 0.6021351218223572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000671623942943191, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7167924.0, + "repeat_count": 0.0, + "routers_loss": 0.0032181134447455406, + "skip_count": 0.0, + "step": 4444, + "text_loss": 0.23639555275440216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.873495744056356, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0006713331982832113, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 7170743.0, + "repeat_count": 1.0, + "routers_loss": 0.024979131296277046, + "skip_count": 0.0, + "step": 4446, + "text_loss": 0.4957772493362427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006710423879680271, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7174660.0, + "repeat_count": 0.0, + "routers_loss": 0.002571308286860585, + "skip_count": 0.0, + "step": 4448, + "text_loss": 0.47968071699142456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000670751512109077, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7177965.0, + "repeat_count": 0.0, + "routers_loss": 0.00212799571454525, + "skip_count": 0.0, + "step": 4450, + "text_loss": 0.6550716161727905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006704605708178252, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 7181512.0, + "repeat_count": 0.0, + "routers_loss": 0.004176430404186249, + "skip_count": 1.0, + "step": 4452, + "text_loss": 0.36959558725357056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0006701695642057613, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7184555.0, + "repeat_count": 0.0, + "routers_loss": 0.0010968588758260012, + "skip_count": 0.0, + "step": 4454, + "text_loss": 0.6686749458312988 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0006698784923843993, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7187474.0, + "repeat_count": 0.0, + "routers_loss": 0.0014241471653804183, + "skip_count": 0.0, + "step": 4456, + "text_loss": 0.6147221922874451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006695873554652784, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7190649.0, + "repeat_count": 0.0, + "routers_loss": 0.008801907300949097, + "skip_count": 0.0, + "step": 4458, + "text_loss": 0.26381927728652954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006692961535599634, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 7193961.0, + "repeat_count": 0.0, + "routers_loss": 0.009027508087456226, + "skip_count": 1.0, + "step": 4460, + "text_loss": 0.1926470547914505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006690048867800427, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7197456.0, + "repeat_count": 0.0, + "routers_loss": 0.0022697453387081623, + "skip_count": 0.0, + "step": 4462, + "text_loss": 0.6736721992492676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006687135552371305, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7200290.0, + "repeat_count": 0.0, + "routers_loss": 0.006747903767973185, + "skip_count": 1.0, + "step": 4464, + "text_loss": 0.2026437371969223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006684221590428657, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7203320.0, + "repeat_count": 0.0, + "routers_loss": 0.0011565096210688353, + "skip_count": 0.0, + "step": 4466, + "text_loss": 0.7587730288505554 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.976812444966246, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0006681306983089121, + "loss": 0.0083, + "macro_f1": 0.8820862174034119, + "num_tokens": 7206411.0, + "repeat_count": 2.0, + "routers_loss": 0.023645581677556038, + "skip_count": 2.0, + "step": 4468, + "text_loss": 0.8981561660766602 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006678391731469575, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 7209421.0, + "repeat_count": 0.0, + "routers_loss": 0.0035848666448146105, + "skip_count": 0.0, + "step": 4470, + "text_loss": 0.1522839516401291 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 20.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006675475836687152, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 7212267.0, + "repeat_count": 1.0, + "routers_loss": 0.005046425387263298, + "skip_count": 1.0, + "step": 4472, + "text_loss": 0.46007999777793884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006672559299859228, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7215195.0, + "repeat_count": 0.0, + "routers_loss": 0.0019333874806761742, + "skip_count": 0.0, + "step": 4474, + "text_loss": 1.0859547853469849 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006669642122103423, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7217941.0, + "repeat_count": 0.0, + "routers_loss": 0.0005401032394729555, + "skip_count": 0.0, + "step": 4476, + "text_loss": 0.9754356145858765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.023481068388612, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006666724304537611, + "loss": 0.0053, + "macro_f1": 0.3272727429866791, + "num_tokens": 7222494.0, + "repeat_count": 1.0, + "routers_loss": 0.015569722279906273, + "skip_count": 0.0, + "step": 4478, + "text_loss": 0.2896423637866974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006663805848279898, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7225292.0, + "repeat_count": 0.0, + "routers_loss": 0.0020135147497057915, + "skip_count": 0.0, + "step": 4480, + "text_loss": 0.8492724299430847 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006660886754448648, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 7229184.0, + "repeat_count": 1.0, + "routers_loss": 0.002355351345613599, + "skip_count": 0.0, + "step": 4482, + "text_loss": 0.189764603972435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0006657967024162459, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7232906.0, + "repeat_count": 0.0, + "routers_loss": 0.003044391982257366, + "skip_count": 0.0, + "step": 4484, + "text_loss": 0.4239847660064697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006655046658540179, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 7235996.0, + "repeat_count": 0.0, + "routers_loss": 0.00602696230635047, + "skip_count": 2.0, + "step": 4486, + "text_loss": 0.217103973031044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0169677734375, + "learning_rate": 0.0006652125658700896, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 7238882.0, + "repeat_count": 0.0, + "routers_loss": 0.001470155781134963, + "skip_count": 1.0, + "step": 4488, + "text_loss": 0.6090770363807678 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006649204025763945, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 7241815.0, + "repeat_count": 1.0, + "routers_loss": 0.008737480267882347, + "skip_count": 2.0, + "step": 4490, + "text_loss": 0.48314425349235535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.0006646281760848902, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 7244848.0, + "repeat_count": 0.0, + "routers_loss": 0.0008257135050371289, + "skip_count": 0.0, + "step": 4492, + "text_loss": 0.5884748101234436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006643358865075581, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7247930.0, + "repeat_count": 0.0, + "routers_loss": 0.0016262239078059793, + "skip_count": 0.0, + "step": 4494, + "text_loss": 0.21444730460643768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0006640435339564042, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7251776.0, + "repeat_count": 0.0, + "routers_loss": 0.001315156347118318, + "skip_count": 0.0, + "step": 4496, + "text_loss": 0.6890370845794678 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006637511185434588, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 7255070.0, + "repeat_count": 1.0, + "routers_loss": 0.007614497095346451, + "skip_count": 3.0, + "step": 4498, + "text_loss": 0.516417920589447 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 21.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006634586403807758, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 7258115.0, + "repeat_count": 3.0, + "routers_loss": 0.004906686954200268, + "skip_count": 2.0, + "step": 4500, + "text_loss": 0.577463686466217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.13619019665395, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006631660995804334, + "loss": 0.0067, + "macro_f1": 0.6601307392120361, + "num_tokens": 7260769.0, + "repeat_count": 1.0, + "routers_loss": 0.013337121345102787, + "skip_count": 2.0, + "step": 4502, + "text_loss": 0.37124839425086975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006628734962545339, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7263908.0, + "repeat_count": 0.0, + "routers_loss": 0.0023418180644512177, + "skip_count": 0.0, + "step": 4504, + "text_loss": 0.17937727272510529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0006625808305152033, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7267391.0, + "repeat_count": 0.0, + "routers_loss": 0.0006556165171787143, + "skip_count": 0.0, + "step": 4506, + "text_loss": 0.45344987511634827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006622881024745919, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 7271402.0, + "repeat_count": 0.0, + "routers_loss": 0.0021988123189657927, + "skip_count": 0.0, + "step": 4508, + "text_loss": 0.5842905640602112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006619953122448734, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 7274354.0, + "repeat_count": 0.0, + "routers_loss": 0.00774174090474844, + "skip_count": 2.0, + "step": 4510, + "text_loss": 0.27159228920936584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0006617024599382456, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7277378.0, + "repeat_count": 0.0, + "routers_loss": 0.0006942499312572181, + "skip_count": 0.0, + "step": 4512, + "text_loss": 0.4464176297187805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0006614095456669302, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7280526.0, + "repeat_count": 0.0, + "routers_loss": 0.003003394464030862, + "skip_count": 0.0, + "step": 4514, + "text_loss": 0.31188079714775085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006611165695431725, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7283916.0, + "repeat_count": 0.0, + "routers_loss": 0.0006948060472495854, + "skip_count": 0.0, + "step": 4516, + "text_loss": 0.5266574025154114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006608235316792413, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7286843.0, + "repeat_count": 0.0, + "routers_loss": 0.0014080886030569673, + "skip_count": 0.0, + "step": 4518, + "text_loss": 0.5880120396614075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006605304321874295, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7289940.0, + "repeat_count": 0.0, + "routers_loss": 0.0016894340515136719, + "skip_count": 0.0, + "step": 4520, + "text_loss": 0.6623797416687012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006602372711800531, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7292869.0, + "repeat_count": 0.0, + "routers_loss": 0.003522444050759077, + "skip_count": 0.0, + "step": 4522, + "text_loss": 0.5488807559013367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006599440487694521, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7296618.0, + "repeat_count": 0.0, + "routers_loss": 0.0011981099378317595, + "skip_count": 0.0, + "step": 4524, + "text_loss": 0.4128517210483551 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.248899324919282, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00065965076506799, + "loss": 0.0047, + "macro_f1": 0.9262410998344421, + "num_tokens": 7300481.0, + "repeat_count": 3.0, + "routers_loss": 0.010548194870352745, + "skip_count": 2.0, + "step": 4526, + "text_loss": 0.26450902223587036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006593574201880536, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7303272.0, + "repeat_count": 0.0, + "routers_loss": 0.005642973352223635, + "skip_count": 1.0, + "step": 4528, + "text_loss": 0.35269856452941895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000659064014242053, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 7306615.0, + "repeat_count": 0.0, + "routers_loss": 0.004171932581812143, + "skip_count": 1.0, + "step": 4530, + "text_loss": 0.18814080953598022 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006587705473424223, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 7310368.0, + "repeat_count": 0.0, + "routers_loss": 0.002289367141202092, + "skip_count": 2.0, + "step": 4532, + "text_loss": 0.7363705635070801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000658477019601618, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 7313788.0, + "repeat_count": 0.0, + "routers_loss": 0.004440625663846731, + "skip_count": 1.0, + "step": 4534, + "text_loss": 0.8126176595687866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006581834311321211, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 7317864.0, + "repeat_count": 0.0, + "routers_loss": 0.0013160990783944726, + "skip_count": 2.0, + "step": 4536, + "text_loss": 0.7015916109085083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.000657889782046435, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7320693.0, + "repeat_count": 0.0, + "routers_loss": 0.0032275544945150614, + "skip_count": 2.0, + "step": 4538, + "text_loss": 0.6481677293777466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.314646316407398, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0006575960724570865, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 7324335.0, + "repeat_count": 0.0, + "routers_loss": 0.009769129566848278, + "skip_count": 1.0, + "step": 4540, + "text_loss": 0.22194676101207733 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006573023024766258, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 7327431.0, + "repeat_count": 2.0, + "routers_loss": 0.0036973082460463047, + "skip_count": 4.0, + "step": 4542, + "text_loss": 0.475127637386322 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.000657008472217626, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7330262.0, + "repeat_count": 0.0, + "routers_loss": 0.0007046440150588751, + "skip_count": 0.0, + "step": 4544, + "text_loss": 0.2649917006492615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0006567145817926836, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7333110.0, + "repeat_count": 0.0, + "routers_loss": 0.0026714997366070747, + "skip_count": 0.0, + "step": 4546, + "text_loss": 0.5490524768829346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0006564206313144175, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7336101.0, + "repeat_count": 0.0, + "routers_loss": 0.006552211008965969, + "skip_count": 0.0, + "step": 4548, + "text_loss": 0.14098678529262543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006561266208954707, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 7339435.0, + "repeat_count": 0.0, + "routers_loss": 0.0035560601390898228, + "skip_count": 2.0, + "step": 4550, + "text_loss": 0.20412275195121765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006558325506485081, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7342609.0, + "repeat_count": 0.0, + "routers_loss": 0.0020106974989175797, + "skip_count": 1.0, + "step": 4552, + "text_loss": 0.6184256076812744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0006555384206862183, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 7345614.0, + "repeat_count": 0.0, + "routers_loss": 0.0014235252747312188, + "skip_count": 0.0, + "step": 4554, + "text_loss": 1.0108838081359863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.389785735250953, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0006552442311213121, + "loss": 0.0041, + "macro_f1": 0.3272727429866791, + "num_tokens": 7348957.0, + "repeat_count": 1.0, + "routers_loss": 0.01703745685517788, + "skip_count": 0.0, + "step": 4556, + "text_loss": 0.21315747499465942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 21.399178162606397, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006549499820665237, + "loss": 0.0077, + "macro_f1": 0.5934640765190125, + "num_tokens": 7352724.0, + "repeat_count": 0.0, + "routers_loss": 0.013315381482243538, + "skip_count": 3.0, + "step": 4558, + "text_loss": 0.34369465708732605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00065465567363461, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7356592.0, + "repeat_count": 0.0, + "routers_loss": 0.0017354936571791768, + "skip_count": 0.0, + "step": 4560, + "text_loss": 0.6267461180686951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0006543613059383503, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7359774.0, + "repeat_count": 0.0, + "routers_loss": 0.011646085418760777, + "skip_count": 2.0, + "step": 4562, + "text_loss": 0.4400193989276886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006540668790905471, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7362765.0, + "repeat_count": 0.0, + "routers_loss": 0.0019345436012372375, + "skip_count": 0.0, + "step": 4564, + "text_loss": 0.49204275012016296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006537723932040251, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7366337.0, + "repeat_count": 0.0, + "routers_loss": 0.00562885170802474, + "skip_count": 1.0, + "step": 4566, + "text_loss": 0.22566382586956024 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006534778483916319, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 7369851.0, + "repeat_count": 2.0, + "routers_loss": 0.005508176051080227, + "skip_count": 2.0, + "step": 4568, + "text_loss": 0.8057850003242493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006531832447662377, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7373918.0, + "repeat_count": 0.0, + "routers_loss": 0.006460923235863447, + "skip_count": 2.0, + "step": 4570, + "text_loss": 0.5141497254371643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006528885824407351, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7376674.0, + "repeat_count": 0.0, + "routers_loss": 0.0032120654359459877, + "skip_count": 0.0, + "step": 4572, + "text_loss": 0.1281338930130005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0006525938615280394, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 7379791.0, + "repeat_count": 0.0, + "routers_loss": 0.00443810923025012, + "skip_count": 0.0, + "step": 4574, + "text_loss": 0.268352210521698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.000652299082141088, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 7382886.0, + "repeat_count": 0.0, + "routers_loss": 0.008284369483590126, + "skip_count": 2.0, + "step": 4576, + "text_loss": 0.30193832516670227 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.493102436160846, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006520042443928411, + "loss": 0.0068, + "macro_f1": 0.8823530077934265, + "num_tokens": 7386036.0, + "repeat_count": 2.0, + "routers_loss": 0.03383317217230797, + "skip_count": 1.0, + "step": 4578, + "text_loss": 0.23106542229652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.000651709348396281, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7388908.0, + "repeat_count": 0.0, + "routers_loss": 0.0017075951909646392, + "skip_count": 1.0, + "step": 4580, + "text_loss": 0.386099249124527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006514143942644124, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7392004.0, + "repeat_count": 0.0, + "routers_loss": 0.009516917169094086, + "skip_count": 1.0, + "step": 4582, + "text_loss": 0.3162059485912323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0006511193821102623, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 7395538.0, + "repeat_count": 0.0, + "routers_loss": 0.0031392278615385294, + "skip_count": 0.0, + "step": 4584, + "text_loss": 0.5536221861839294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006508243120468799, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7398461.0, + "repeat_count": 0.0, + "routers_loss": 0.0014138511614874005, + "skip_count": 0.0, + "step": 4586, + "text_loss": 0.7934318780899048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006505291841873367, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7401611.0, + "repeat_count": 0.0, + "routers_loss": 0.0005265916115604341, + "skip_count": 0.0, + "step": 4588, + "text_loss": 0.4569905698299408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.000650233998644726, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7404641.0, + "repeat_count": 0.0, + "routers_loss": 0.0024988956283777952, + "skip_count": 0.0, + "step": 4590, + "text_loss": 0.49998772144317627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0006499387555321636, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7407574.0, + "repeat_count": 0.0, + "routers_loss": 0.004110113717615604, + "skip_count": 1.0, + "step": 4592, + "text_loss": 0.5679413676261902 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006496434549627874, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7410806.0, + "repeat_count": 0.0, + "routers_loss": 0.0032845588866621256, + "skip_count": 0.0, + "step": 4594, + "text_loss": 0.35515281558036804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006493480970497568, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7413402.0, + "repeat_count": 0.0, + "routers_loss": 0.010577172972261906, + "skip_count": 1.0, + "step": 4596, + "text_loss": 0.26111698150634766 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006490526819062537, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 7417236.0, + "repeat_count": 1.0, + "routers_loss": 0.002054794691503048, + "skip_count": 2.0, + "step": 4598, + "text_loss": 0.6480993628501892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0006487572096454818, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7420278.0, + "repeat_count": 0.0, + "routers_loss": 0.0017989084590226412, + "skip_count": 0.0, + "step": 4600, + "text_loss": 0.4935401678085327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006484616803806665, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7423866.0, + "repeat_count": 0.0, + "routers_loss": 0.006671485956758261, + "skip_count": 1.0, + "step": 4602, + "text_loss": 0.15030258893966675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0006481660942250552, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7426884.0, + "repeat_count": 0.0, + "routers_loss": 0.008334980346262455, + "skip_count": 3.0, + "step": 4604, + "text_loss": 0.29933279752731323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006478704512919173, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 7431017.0, + "repeat_count": 0.0, + "routers_loss": 0.011923984624445438, + "skip_count": 3.0, + "step": 4606, + "text_loss": 0.35141825675964355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0006475747516945432, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7434406.0, + "repeat_count": 0.0, + "routers_loss": 0.0031092462595552206, + "skip_count": 3.0, + "step": 4608, + "text_loss": 0.21021464467048645 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.000647278995546246, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7437204.0, + "repeat_count": 1.0, + "routers_loss": 0.0006713552866131067, + "skip_count": 0.0, + "step": 4610, + "text_loss": 0.4052635431289673 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006469831829603598, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7439741.0, + "repeat_count": 0.0, + "routers_loss": 0.0022583482787013054, + "skip_count": 2.0, + "step": 4612, + "text_loss": 0.5443860292434692 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006466873140502407, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7443619.0, + "repeat_count": 0.0, + "routers_loss": 0.004187075886875391, + "skip_count": 2.0, + "step": 4614, + "text_loss": 0.30709847807884216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006463913889292661, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7446696.0, + "repeat_count": 0.0, + "routers_loss": 0.008314833045005798, + "skip_count": 0.0, + "step": 4616, + "text_loss": 0.22949637472629547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006460954077108353, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7450377.0, + "repeat_count": 0.0, + "routers_loss": 0.001277514616958797, + "skip_count": 0.0, + "step": 4618, + "text_loss": 0.37715134024620056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006457993705083684, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 7453271.0, + "repeat_count": 0.0, + "routers_loss": 0.0022756033577024937, + "skip_count": 2.0, + "step": 4620, + "text_loss": 0.7373883128166199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0006455032774353078, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7456492.0, + "repeat_count": 0.0, + "routers_loss": 0.0039057908579707146, + "skip_count": 2.0, + "step": 4622, + "text_loss": 0.5058769583702087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0006452071286051169, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 7459619.0, + "repeat_count": 0.0, + "routers_loss": 0.0019458672031760216, + "skip_count": 0.0, + "step": 4624, + "text_loss": 0.5110082030296326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0006449109241312802, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 7462552.0, + "repeat_count": 0.0, + "routers_loss": 0.0002716891176532954, + "skip_count": 1.0, + "step": 4626, + "text_loss": 0.6197522878646851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006446146641273042, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7466769.0, + "repeat_count": 0.0, + "routers_loss": 0.0037578947376459837, + "skip_count": 2.0, + "step": 4628, + "text_loss": 0.1653924286365509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.000644318348706716, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7470216.0, + "repeat_count": 0.0, + "routers_loss": 0.0012791058979928493, + "skip_count": 0.0, + "step": 4630, + "text_loss": 0.7114694118499756 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0006440219779830643, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 7472975.0, + "repeat_count": 0.0, + "routers_loss": 0.00736592011526227, + "skip_count": 2.0, + "step": 4632, + "text_loss": 0.26601463556289673 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000643725552069919, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7475672.0, + "repeat_count": 0.0, + "routers_loss": 0.00045455715735442936, + "skip_count": 0.0, + "step": 4634, + "text_loss": 0.5028402805328369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0006434290710808711, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7478850.0, + "repeat_count": 0.0, + "routers_loss": 0.004247233271598816, + "skip_count": 2.0, + "step": 4636, + "text_loss": 0.12746070325374603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 21.774875256824185, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04052734375, + "learning_rate": 0.0006431325351295324, + "loss": 0.0083, + "macro_f1": 0.5427350401878357, + "num_tokens": 7481747.0, + "repeat_count": 1.0, + "routers_loss": 0.047564394772052765, + "skip_count": 2.0, + "step": 4638, + "text_loss": 0.24056802690029144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0006428359443295362, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7484885.0, + "repeat_count": 0.0, + "routers_loss": 0.0011175100225955248, + "skip_count": 0.0, + "step": 4640, + "text_loss": 0.6265338063240051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 21.793660111535075, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006425392987945369, + "loss": 0.0086, + "macro_f1": 0.5492662787437439, + "num_tokens": 7487973.0, + "repeat_count": 0.0, + "routers_loss": 0.016879938542842865, + "skip_count": 2.0, + "step": 4642, + "text_loss": 0.2523447275161743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 21.80305253889052, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.032958984375, + "learning_rate": 0.0006422425986382093, + "loss": 0.0055, + "macro_f1": 0.5934640765190125, + "num_tokens": 7491024.0, + "repeat_count": 0.0, + "routers_loss": 0.018616504967212677, + "skip_count": 3.0, + "step": 4644, + "text_loss": 0.38890624046325684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.812444966245963, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0006419458439742496, + "loss": 0.0056, + "macro_f1": 0.3272727429866791, + "num_tokens": 7494199.0, + "repeat_count": 0.0, + "routers_loss": 0.023129139095544815, + "skip_count": 1.0, + "step": 4646, + "text_loss": 0.4060848355293274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006416490349163747, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 7497287.0, + "repeat_count": 0.0, + "routers_loss": 0.0018601802876219153, + "skip_count": 0.0, + "step": 4648, + "text_loss": 0.3387545943260193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006413521715783225, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 7500598.0, + "repeat_count": 0.0, + "routers_loss": 0.0017482215771451592, + "skip_count": 0.0, + "step": 4650, + "text_loss": 0.4290996193885803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.840622248312297, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0006410552540738514, + "loss": 0.007, + "macro_f1": 0.3272727429866791, + "num_tokens": 7503252.0, + "repeat_count": 1.0, + "routers_loss": 0.0420118011534214, + "skip_count": 0.0, + "step": 4652, + "text_loss": 0.439496248960495 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.000640758282516741, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 7506382.0, + "repeat_count": 1.0, + "routers_loss": 0.0017782216891646385, + "skip_count": 1.0, + "step": 4654, + "text_loss": 0.8513308167457581 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006404612570207911, + "loss": 0.0102, + "macro_f1": 0.3272727429866791, + "num_tokens": 7510423.0, + "repeat_count": 0.0, + "routers_loss": 0.010385853238403797, + "skip_count": 0.0, + "step": 4656, + "text_loss": 0.7159742712974548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006401641776998223, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7513394.0, + "repeat_count": 0.0, + "routers_loss": 0.0011917101219296455, + "skip_count": 0.0, + "step": 4658, + "text_loss": 0.6165401339530945 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006398670446676766, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7516828.0, + "repeat_count": 3.0, + "routers_loss": 0.008860073052346706, + "skip_count": 4.0, + "step": 4660, + "text_loss": 0.923275887966156 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0006395698580382153, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7519764.0, + "repeat_count": 0.0, + "routers_loss": 0.000505418807733804, + "skip_count": 0.0, + "step": 4662, + "text_loss": 0.6143050789833069 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006392726179253212, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7522390.0, + "repeat_count": 0.0, + "routers_loss": 0.004020806401968002, + "skip_count": 1.0, + "step": 4664, + "text_loss": 0.6935067176818848 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0006389753244428972, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 7525821.0, + "repeat_count": 1.0, + "routers_loss": 0.00957963801920414, + "skip_count": 2.0, + "step": 4666, + "text_loss": 0.3350338637828827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.915761667155856, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0006386779777048666, + "loss": 0.0063, + "macro_f1": 0.6601307392120361, + "num_tokens": 7529513.0, + "repeat_count": 1.0, + "routers_loss": 0.020673364400863647, + "skip_count": 2.0, + "step": 4668, + "text_loss": 0.47800472378730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006383805778251735, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7533450.0, + "repeat_count": 0.0, + "routers_loss": 0.007217096630483866, + "skip_count": 1.0, + "step": 4670, + "text_loss": 0.4506106972694397 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006380831249177817, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 7536287.0, + "repeat_count": 1.0, + "routers_loss": 0.007001714315265417, + "skip_count": 0.0, + "step": 4672, + "text_loss": 0.4081715941429138 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0006377856190966762, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 7539442.0, + "repeat_count": 0.0, + "routers_loss": 0.0015112817054614425, + "skip_count": 0.0, + "step": 4674, + "text_loss": 0.21451139450073242 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0006374880604758615, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 7542594.0, + "repeat_count": 0.0, + "routers_loss": 0.007311929017305374, + "skip_count": 2.0, + "step": 4676, + "text_loss": 0.14785248041152954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006371904491693626, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7545780.0, + "repeat_count": 0.0, + "routers_loss": 0.007489737123250961, + "skip_count": 1.0, + "step": 4678, + "text_loss": 0.2248108983039856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006368927852912247, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 7548287.0, + "repeat_count": 1.0, + "routers_loss": 0.009772555902600288, + "skip_count": 1.0, + "step": 4680, + "text_loss": 0.1566995233297348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006365950689555133, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7551424.0, + "repeat_count": 0.0, + "routers_loss": 0.002134992741048336, + "skip_count": 0.0, + "step": 4682, + "text_loss": 0.7322417497634888 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006362973002763139, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7554182.0, + "repeat_count": 1.0, + "routers_loss": 0.008511497639119625, + "skip_count": 4.0, + "step": 4684, + "text_loss": 0.24387991428375244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0006359994793677319, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 7557044.0, + "repeat_count": 0.0, + "routers_loss": 0.004151526838541031, + "skip_count": 2.0, + "step": 4686, + "text_loss": 0.6139411330223083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006357016063438928, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7560231.0, + "repeat_count": 0.0, + "routers_loss": 0.0009724601986818016, + "skip_count": 0.0, + "step": 4688, + "text_loss": 0.7875718474388123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0006354036813189421, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7562953.0, + "repeat_count": 0.0, + "routers_loss": 0.0008926765876822174, + "skip_count": 0.0, + "step": 4690, + "text_loss": 0.5195512771606445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006351057044070455, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 7566137.0, + "repeat_count": 0.0, + "routers_loss": 0.0031294538639485836, + "skip_count": 0.0, + "step": 4692, + "text_loss": 0.7288873195648193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0006348076757223877, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 7569073.0, + "repeat_count": 0.0, + "routers_loss": 0.0015065820189192891, + "skip_count": 2.0, + "step": 4694, + "text_loss": 0.7242236137390137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0006345095953791746, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7573025.0, + "repeat_count": 0.0, + "routers_loss": 0.0005603441968560219, + "skip_count": 0.0, + "step": 4696, + "text_loss": 0.34443899989128113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0006342114634916307, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7576546.0, + "repeat_count": 0.0, + "routers_loss": 0.0011047758162021637, + "skip_count": 0.0, + "step": 4698, + "text_loss": 0.4892682731151581 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0006339132801740008, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 7580711.0, + "repeat_count": 0.0, + "routers_loss": 0.0019803126342594624, + "skip_count": 2.0, + "step": 4700, + "text_loss": 0.4479489028453827 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0006336150455405494, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 7583385.0, + "repeat_count": 1.0, + "routers_loss": 0.0005326359532773495, + "skip_count": 0.0, + "step": 4702, + "text_loss": 0.627504825592041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006333167597055604, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 7586584.0, + "repeat_count": 0.0, + "routers_loss": 0.0005587987834587693, + "skip_count": 0.0, + "step": 4704, + "text_loss": 0.43891432881355286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0006330184227833376, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 7590408.0, + "repeat_count": 0.0, + "routers_loss": 0.007053783163428307, + "skip_count": 2.0, + "step": 4706, + "text_loss": 0.19946859776973724 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006327200348882043, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7593857.0, + "repeat_count": 1.0, + "routers_loss": 0.0009479080326855183, + "skip_count": 0.0, + "step": 4708, + "text_loss": 0.7973214387893677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006324215961345032, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7596429.0, + "repeat_count": 0.0, + "routers_loss": 0.0012403312139213085, + "skip_count": 0.0, + "step": 4710, + "text_loss": 0.48477989435195923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006321231066365966, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7599618.0, + "repeat_count": 0.0, + "routers_loss": 0.0005520360427908599, + "skip_count": 0.0, + "step": 4712, + "text_loss": 0.44222453236579895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006318245665088665, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 7603180.0, + "repeat_count": 0.0, + "routers_loss": 0.0015553623670712113, + "skip_count": 0.0, + "step": 4714, + "text_loss": 0.5132410526275635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0006315259758657138, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 7606457.0, + "repeat_count": 0.0, + "routers_loss": 0.004210884217172861, + "skip_count": 1.0, + "step": 4716, + "text_loss": 0.39850690960884094 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0006312273348215589, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 7609317.0, + "repeat_count": 1.0, + "routers_loss": 0.001220117206685245, + "skip_count": 0.0, + "step": 4718, + "text_loss": 0.3509018123149872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006309286434908419, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 7613076.0, + "repeat_count": 0.0, + "routers_loss": 0.007768960203975439, + "skip_count": 2.0, + "step": 4720, + "text_loss": 0.33361560106277466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006306299019880217, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7616242.0, + "repeat_count": 0.0, + "routers_loss": 0.006226699333637953, + "skip_count": 0.0, + "step": 4722, + "text_loss": 0.23661087453365326 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.17845611975345, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006303311104275766, + "loss": 0.0073, + "macro_f1": 0.6603773832321167, + "num_tokens": 7619069.0, + "repeat_count": 1.0, + "routers_loss": 0.015590761788189411, + "skip_count": 1.0, + "step": 4724, + "text_loss": 0.23373056948184967 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006300322689240041, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 7622581.0, + "repeat_count": 1.0, + "routers_loss": 0.006862971931695938, + "skip_count": 2.0, + "step": 4726, + "text_loss": 0.8301828503608704 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0006297333775918209, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 7625566.0, + "repeat_count": 1.0, + "routers_loss": 0.006256614346057177, + "skip_count": 1.0, + "step": 4728, + "text_loss": 0.3756707012653351 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0006294344365455626, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 7629047.0, + "repeat_count": 1.0, + "routers_loss": 0.009151885285973549, + "skip_count": 2.0, + "step": 4730, + "text_loss": 0.33362850546836853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006291354458997841, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7631847.0, + "repeat_count": 0.0, + "routers_loss": 0.0009307434665970504, + "skip_count": 0.0, + "step": 4732, + "text_loss": 0.4572524130344391 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0006288364057690591, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7635181.0, + "repeat_count": 0.0, + "routers_loss": 0.00041220212006010115, + "skip_count": 0.0, + "step": 4734, + "text_loss": 0.40211325883865356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0006285373162679804, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7637752.0, + "repeat_count": 0.0, + "routers_loss": 0.0006696670898236334, + "skip_count": 2.0, + "step": 4736, + "text_loss": 0.7588053345680237 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 22.24420311124156, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006282381775111597, + "loss": 0.0081, + "macro_f1": 0.9449735879898071, + "num_tokens": 7640719.0, + "repeat_count": 4.0, + "routers_loss": 0.016283133998513222, + "skip_count": 2.0, + "step": 4738, + "text_loss": 0.5697863101959229 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0006279389896132274, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7643524.0, + "repeat_count": 0.0, + "routers_loss": 0.00763951288536191, + "skip_count": 3.0, + "step": 4740, + "text_loss": 0.548592209815979 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.26298796595245, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006276397526888329, + "loss": 0.0094, + "macro_f1": 0.925203263759613, + "num_tokens": 7646919.0, + "repeat_count": 3.0, + "routers_loss": 0.038590483367443085, + "skip_count": 5.0, + "step": 4742, + "text_loss": 0.27226054668426514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0006273404668526443, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7650404.0, + "repeat_count": 0.0, + "routers_loss": 0.0012555639259517193, + "skip_count": 0.0, + "step": 4744, + "text_loss": 0.47892290353775024 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0006270411322193488, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7652942.0, + "repeat_count": 1.0, + "routers_loss": 0.0015356402145698667, + "skip_count": 0.0, + "step": 4746, + "text_loss": 0.5515767931938171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0006267417489036517, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7656269.0, + "repeat_count": 0.0, + "routers_loss": 0.005182140972465277, + "skip_count": 0.0, + "step": 4748, + "text_loss": 0.3496028184890747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0006264423170202773, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7658664.0, + "repeat_count": 0.0, + "routers_loss": 0.004144361708313227, + "skip_count": 0.0, + "step": 4750, + "text_loss": 0.2786032557487488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0006261428366839685, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7661471.0, + "repeat_count": 0.0, + "routers_loss": 0.00035335420398041606, + "skip_count": 0.0, + "step": 4752, + "text_loss": 0.4838487505912781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0006258433080094868, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7664593.0, + "repeat_count": 0.0, + "routers_loss": 0.0103341368958354, + "skip_count": 2.0, + "step": 4754, + "text_loss": 0.24325360357761383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0006255437311116119, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 7667573.0, + "repeat_count": 0.0, + "routers_loss": 0.014633853919804096, + "skip_count": 2.0, + "step": 4756, + "text_loss": 0.21569855511188507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0006252441061051426, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7671171.0, + "repeat_count": 0.0, + "routers_loss": 0.004900569561868906, + "skip_count": 0.0, + "step": 4758, + "text_loss": 0.12832018733024597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006249444331048955, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 7673932.0, + "repeat_count": 0.0, + "routers_loss": 0.0020371589343994856, + "skip_count": 0.0, + "step": 4760, + "text_loss": 0.38652482628822327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.000624644712225706, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7677396.0, + "repeat_count": 0.0, + "routers_loss": 0.0028059002943336964, + "skip_count": 2.0, + "step": 4762, + "text_loss": 0.7937633395195007 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0006243449435824276, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7680392.0, + "repeat_count": 0.0, + "routers_loss": 0.0007225095760077238, + "skip_count": 0.0, + "step": 4764, + "text_loss": 0.5690395832061768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006240451272899321, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7684121.0, + "repeat_count": 0.0, + "routers_loss": 0.002052050782367587, + "skip_count": 1.0, + "step": 4766, + "text_loss": 0.5321336984634399 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006237452634631099, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 7687236.0, + "repeat_count": 1.0, + "routers_loss": 0.0039039517287164927, + "skip_count": 0.0, + "step": 4768, + "text_loss": 0.30823320150375366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 22.394481948928675, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0006234453522168694, + "loss": 0.0084, + "macro_f1": 0.5492662787437439, + "num_tokens": 7690355.0, + "repeat_count": 0.0, + "routers_loss": 0.014570238068699837, + "skip_count": 2.0, + "step": 4770, + "text_loss": 0.21501587331295013 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 22.403874376284122, + "f1_execute": 0.949999988079071, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.04541015625, + "learning_rate": 0.000623145393666137, + "loss": 0.0069, + "macro_f1": 0.886363685131073, + "num_tokens": 7693559.0, + "repeat_count": 3.0, + "routers_loss": 0.061707716435194016, + "skip_count": 6.0, + "step": 4772, + "text_loss": 0.24371100962162018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006228453879258576, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 7696422.0, + "repeat_count": 0.0, + "routers_loss": 0.005053870379924774, + "skip_count": 2.0, + "step": 4774, + "text_loss": 0.237778440117836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0006225453351109934, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 7700460.0, + "repeat_count": 0.0, + "routers_loss": 0.0017990898340940475, + "skip_count": 0.0, + "step": 4776, + "text_loss": 0.612456738948822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.000622245235336526, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7703330.0, + "repeat_count": 0.0, + "routers_loss": 0.004507021512836218, + "skip_count": 2.0, + "step": 4778, + "text_loss": 0.36898812651634216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006219450887174537, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7707243.0, + "repeat_count": 0.0, + "routers_loss": 0.006295828148722649, + "skip_count": 1.0, + "step": 4780, + "text_loss": 0.14474599063396454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006216448953687932, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7711121.0, + "repeat_count": 0.0, + "routers_loss": 0.005049831233918667, + "skip_count": 0.0, + "step": 4782, + "text_loss": 0.4696790277957916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006213446554055795, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7714889.0, + "repeat_count": 0.0, + "routers_loss": 0.0006010758224874735, + "skip_count": 0.0, + "step": 4784, + "text_loss": 0.46253830194473267 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 22.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006210443689428649, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 7718420.0, + "repeat_count": 3.0, + "routers_loss": 0.006691234186291695, + "skip_count": 1.0, + "step": 4786, + "text_loss": 0.579987645149231 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00062074403609572, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7721720.0, + "repeat_count": 0.0, + "routers_loss": 0.001864895923063159, + "skip_count": 0.0, + "step": 4788, + "text_loss": 0.325242817401886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0006204436569792324, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 7724916.0, + "repeat_count": 0.0, + "routers_loss": 0.00202955212444067, + "skip_count": 0.0, + "step": 4790, + "text_loss": 0.49637556076049805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006201432317085083, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 7728081.0, + "repeat_count": 1.0, + "routers_loss": 0.0037843603640794754, + "skip_count": 0.0, + "step": 4792, + "text_loss": 0.38812628388404846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0006198427603986711, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7731457.0, + "repeat_count": 0.0, + "routers_loss": 0.012036679312586784, + "skip_count": 3.0, + "step": 4794, + "text_loss": 0.2996312379837036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0006195422431648623, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 7734595.0, + "repeat_count": 0.0, + "routers_loss": 0.0008874868508428335, + "skip_count": 1.0, + "step": 4796, + "text_loss": 0.3203189969062805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0006192416801222403, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 7737565.0, + "repeat_count": 1.0, + "routers_loss": 0.0032894534524530172, + "skip_count": 1.0, + "step": 4798, + "text_loss": 0.3283322751522064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0006189410713859815, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 7740439.0, + "repeat_count": 0.0, + "routers_loss": 0.009667043574154377, + "skip_count": 2.0, + "step": 4800, + "text_loss": 0.25219282507896423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 22.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006186404170712797, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 7743813.0, + "repeat_count": 0.0, + "routers_loss": 0.012643060646951199, + "skip_count": 4.0, + "step": 4802, + "text_loss": 0.22567439079284668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006183397172933462, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7747182.0, + "repeat_count": 0.0, + "routers_loss": 0.002678517485037446, + "skip_count": 0.0, + "step": 4804, + "text_loss": 0.19188879430294037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0006180389721674101, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 7750735.0, + "repeat_count": 0.0, + "routers_loss": 0.0013385121710598469, + "skip_count": 0.0, + "step": 4806, + "text_loss": 0.5860441327095032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000617738181808717, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7753843.0, + "repeat_count": 0.0, + "routers_loss": 0.0034869094379246235, + "skip_count": 1.0, + "step": 4808, + "text_loss": 0.4366260766983032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0006174373463325306, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7757039.0, + "repeat_count": 0.0, + "routers_loss": 0.0013648992171511054, + "skip_count": 0.0, + "step": 4810, + "text_loss": 0.5217258334159851 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0006171364658541314, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 7760016.0, + "repeat_count": 1.0, + "routers_loss": 0.0038017008919268847, + "skip_count": 2.0, + "step": 4812, + "text_loss": 0.8130963444709778 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0006168355404888177, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 7762961.0, + "repeat_count": 0.0, + "routers_loss": 0.006867518648505211, + "skip_count": 2.0, + "step": 4814, + "text_loss": 0.17822521924972534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006165345703519043, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7766399.0, + "repeat_count": 0.0, + "routers_loss": 0.0004653502255678177, + "skip_count": 0.0, + "step": 4816, + "text_loss": 0.5316070914268494 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006162335555587238, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 7769039.0, + "repeat_count": 1.0, + "routers_loss": 0.0016906452365219593, + "skip_count": 1.0, + "step": 4818, + "text_loss": 0.5680997967720032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0006159324962246257, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7772768.0, + "repeat_count": 0.0, + "routers_loss": 0.002541248919442296, + "skip_count": 0.0, + "step": 4820, + "text_loss": 0.6169226169586182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006156313924649762, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7775545.0, + "repeat_count": 0.0, + "routers_loss": 0.008644679561257362, + "skip_count": 2.0, + "step": 4822, + "text_loss": 0.2211475968360901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0006153302443951589, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7778837.0, + "repeat_count": 0.0, + "routers_loss": 0.0041346061043441296, + "skip_count": 2.0, + "step": 4824, + "text_loss": 0.5369775891304016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0006150290521305746, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 7782309.0, + "repeat_count": 0.0, + "routers_loss": 0.0012756052892655134, + "skip_count": 0.0, + "step": 4826, + "text_loss": 0.5294989943504333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.666862342236573, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006147278157866403, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 7785565.0, + "repeat_count": 0.0, + "routers_loss": 0.029718991369009018, + "skip_count": 1.0, + "step": 4828, + "text_loss": 0.6920449733734131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006144265354787906, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7788218.0, + "repeat_count": 0.0, + "routers_loss": 0.004829924553632736, + "skip_count": 0.0, + "step": 4830, + "text_loss": 0.17072243988513947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0006141252113224767, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7790788.0, + "repeat_count": 0.0, + "routers_loss": 0.00254037044942379, + "skip_count": 0.0, + "step": 4832, + "text_loss": 0.20075996220111847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01519775390625, + "learning_rate": 0.0006138238434331666, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7793913.0, + "repeat_count": 0.0, + "routers_loss": 0.0004426188243087381, + "skip_count": 0.0, + "step": 4834, + "text_loss": 0.695742130279541 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.000613522431926345, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 7796932.0, + "repeat_count": 1.0, + "routers_loss": 0.005176798906177282, + "skip_count": 3.0, + "step": 4836, + "text_loss": 0.4910822808742523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0006132209769175132, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7800686.0, + "repeat_count": 0.0, + "routers_loss": 0.004120545461773872, + "skip_count": 0.0, + "step": 4838, + "text_loss": 0.3701378405094147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0006129194785221894, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7804765.0, + "repeat_count": 0.0, + "routers_loss": 0.0043835826218128204, + "skip_count": 0.0, + "step": 4840, + "text_loss": 0.343635618686676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006126179368559086, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 7807498.0, + "repeat_count": 0.0, + "routers_loss": 0.001394893741235137, + "skip_count": 1.0, + "step": 4842, + "text_loss": 0.47756674885749817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.000612316352034222, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7810784.0, + "repeat_count": 0.0, + "routers_loss": 0.0031262130942195654, + "skip_count": 2.0, + "step": 4844, + "text_loss": 0.13077901303768158 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.751394188435572, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0006120147241726972, + "loss": 0.0081, + "macro_f1": 0.8823530077934265, + "num_tokens": 7814754.0, + "repeat_count": 2.0, + "routers_loss": 0.016139274463057518, + "skip_count": 1.0, + "step": 4846, + "text_loss": 0.18850074708461761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0006117130533869189, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7818245.0, + "repeat_count": 0.0, + "routers_loss": 0.0009124451316893101, + "skip_count": 0.0, + "step": 4848, + "text_loss": 0.42503559589385986 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006114113397924878, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7822214.0, + "repeat_count": 0.0, + "routers_loss": 0.0015132242115214467, + "skip_count": 0.0, + "step": 4850, + "text_loss": 0.16767354309558868 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006111095835050212, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 7825019.0, + "repeat_count": 2.0, + "routers_loss": 0.006253300234675407, + "skip_count": 2.0, + "step": 4852, + "text_loss": 0.44826745986938477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0006108077846401524, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 7828113.0, + "repeat_count": 0.0, + "routers_loss": 0.0024391328915953636, + "skip_count": 0.0, + "step": 4854, + "text_loss": 0.2009880244731903 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006105059433135317, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 7831177.0, + "repeat_count": 1.0, + "routers_loss": 0.0020866121631115675, + "skip_count": 1.0, + "step": 4856, + "text_loss": 0.7082528471946716 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0006102040596408251, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 7834485.0, + "repeat_count": 0.0, + "routers_loss": 0.004373365081846714, + "skip_count": 1.0, + "step": 4858, + "text_loss": 0.2541539669036865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006099021337377148, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7837749.0, + "repeat_count": 0.0, + "routers_loss": 0.004309024661779404, + "skip_count": 0.0, + "step": 4860, + "text_loss": 0.3163885176181793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 22.82653360727913, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.049072265625, + "learning_rate": 0.0006096001657198995, + "loss": 0.0065, + "macro_f1": 0.6122449040412903, + "num_tokens": 7840979.0, + "repeat_count": 0.0, + "routers_loss": 0.023044804111123085, + "skip_count": 4.0, + "step": 4862, + "text_loss": 0.49609798192977905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0006092981557030941, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 7844905.0, + "repeat_count": 1.0, + "routers_loss": 0.010683654807507992, + "skip_count": 3.0, + "step": 4864, + "text_loss": 0.16866883635520935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006089961038030291, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 7847800.0, + "repeat_count": 0.0, + "routers_loss": 0.0011224723421037197, + "skip_count": 0.0, + "step": 4866, + "text_loss": 0.5093055367469788 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0006086940101354515, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7850983.0, + "repeat_count": 0.0, + "routers_loss": 0.003944621421396732, + "skip_count": 1.0, + "step": 4868, + "text_loss": 0.5753747224807739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 22.86410331670091, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0006083918748161244, + "loss": 0.0069, + "macro_f1": 0.5492662787437439, + "num_tokens": 7855041.0, + "repeat_count": 0.0, + "routers_loss": 0.02532145567238331, + "skip_count": 2.0, + "step": 4870, + "text_loss": 0.8082366585731506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006080896979608262, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7858058.0, + "repeat_count": 0.0, + "routers_loss": 0.0007558314246125519, + "skip_count": 0.0, + "step": 4872, + "text_loss": 0.6476574540138245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.000607787479685352, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7861223.0, + "repeat_count": 0.0, + "routers_loss": 0.0009224560926668346, + "skip_count": 0.0, + "step": 4874, + "text_loss": 0.5012133717536926 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006074852201055121, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7864180.0, + "repeat_count": 0.0, + "routers_loss": 0.0028308273758739233, + "skip_count": 0.0, + "step": 4876, + "text_loss": 0.7447214722633362 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0006071829193371331, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7866726.0, + "repeat_count": 0.0, + "routers_loss": 0.0021505290642380714, + "skip_count": 0.0, + "step": 4878, + "text_loss": 0.5444929599761963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006068805774960573, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7870166.0, + "repeat_count": 0.0, + "routers_loss": 0.0021109723020344973, + "skip_count": 0.0, + "step": 4880, + "text_loss": 0.3577263355255127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0006065781946981425, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7873028.0, + "repeat_count": 0.0, + "routers_loss": 0.0027144821360707283, + "skip_count": 0.0, + "step": 4882, + "text_loss": 0.28464797139167786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006062757710592624, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7876747.0, + "repeat_count": 0.0, + "routers_loss": 0.0004638207610696554, + "skip_count": 0.0, + "step": 4884, + "text_loss": 0.381534606218338 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006059733066953066, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 7879524.0, + "repeat_count": 1.0, + "routers_loss": 0.002225410658866167, + "skip_count": 2.0, + "step": 4886, + "text_loss": 0.5167883634567261 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006056708017221796, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 7882809.0, + "repeat_count": 0.0, + "routers_loss": 0.00419368501752615, + "skip_count": 1.0, + "step": 4888, + "text_loss": 0.22688335180282593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000605368256255802, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7886310.0, + "repeat_count": 0.0, + "routers_loss": 0.0017340193735435605, + "skip_count": 1.0, + "step": 4890, + "text_loss": 1.0128135681152344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0006050656704121098, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 7889483.0, + "repeat_count": 0.0, + "routers_loss": 0.0016647159354761243, + "skip_count": 0.0, + "step": 4892, + "text_loss": 0.2213262915611267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006047630443070547, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7892615.0, + "repeat_count": 0.0, + "routers_loss": 0.0038971947506070137, + "skip_count": 3.0, + "step": 4894, + "text_loss": 0.45751357078552246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0006044603780566032, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 7895747.0, + "repeat_count": 1.0, + "routers_loss": 0.0036852145567536354, + "skip_count": 1.0, + "step": 4896, + "text_loss": 0.13489919900894165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0006041576717767379, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7899155.0, + "repeat_count": 0.0, + "routers_loss": 0.007661987561732531, + "skip_count": 1.0, + "step": 4898, + "text_loss": 0.281853586435318 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006038549255834563, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7901667.0, + "repeat_count": 2.0, + "routers_loss": 0.01836695335805416, + "skip_count": 5.0, + "step": 4900, + "text_loss": 0.24879895150661469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.000603552139592771, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7904506.0, + "repeat_count": 0.0, + "routers_loss": 0.0011829182039946318, + "skip_count": 0.0, + "step": 4902, + "text_loss": 0.7550268769264221 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 23.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006032493139207106, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7907316.0, + "repeat_count": 1.0, + "routers_loss": 0.0022891140542924404, + "skip_count": 0.0, + "step": 4904, + "text_loss": 0.37596020102500916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0006029464486833186, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 7911283.0, + "repeat_count": 0.0, + "routers_loss": 0.001990227960050106, + "skip_count": 0.0, + "step": 4906, + "text_loss": 0.5879577994346619 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0006026435439966531, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 7913907.0, + "repeat_count": 0.0, + "routers_loss": 0.0026039890944957733, + "skip_count": 1.0, + "step": 4908, + "text_loss": 0.41484713554382324 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0006023405999767879, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7916772.0, + "repeat_count": 0.0, + "routers_loss": 0.009183229878544807, + "skip_count": 1.0, + "step": 4910, + "text_loss": 0.20732562243938446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0006020376167398116, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7919346.0, + "repeat_count": 0.0, + "routers_loss": 0.005508727394044399, + "skip_count": 1.0, + "step": 4912, + "text_loss": 0.41416165232658386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 23.070443205165834, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0006017345944018284, + "loss": 0.0051, + "macro_f1": 0.3272727429866791, + "num_tokens": 7922404.0, + "repeat_count": 0.0, + "routers_loss": 0.008651934564113617, + "skip_count": 0.0, + "step": 4914, + "text_loss": 0.4290519952774048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0006014315330789563, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 7925165.0, + "repeat_count": 0.0, + "routers_loss": 0.003601635340601206, + "skip_count": 1.0, + "step": 4916, + "text_loss": 0.8447931408882141 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006011284328873296, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 7928146.0, + "repeat_count": 1.0, + "routers_loss": 0.0049415635876357555, + "skip_count": 2.0, + "step": 4918, + "text_loss": 0.32237401604652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0006008252939430967, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7931163.0, + "repeat_count": 0.0, + "routers_loss": 0.0024150956887751818, + "skip_count": 0.0, + "step": 4920, + "text_loss": 0.2251713126897812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.108012914587615, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006005221163624209, + "loss": 0.0057, + "macro_f1": 0.3272727429866791, + "num_tokens": 7934084.0, + "repeat_count": 1.0, + "routers_loss": 0.03181030973792076, + "skip_count": 0.0, + "step": 4922, + "text_loss": 0.4962928593158722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.0006002189002614806, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 7937021.0, + "repeat_count": 0.0, + "routers_loss": 0.00227518193423748, + "skip_count": 2.0, + "step": 4924, + "text_loss": 0.34440335631370544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005999156457564685, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 7940205.0, + "repeat_count": 0.0, + "routers_loss": 0.004331593867391348, + "skip_count": 1.0, + "step": 4926, + "text_loss": 0.14114083349704742 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005996123529635925, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7945174.0, + "repeat_count": 0.0, + "routers_loss": 0.000612895586527884, + "skip_count": 0.0, + "step": 4928, + "text_loss": 0.3895469009876251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.145582624009393, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000599309021999075, + "loss": 0.006, + "macro_f1": 0.3272727429866791, + "num_tokens": 7948716.0, + "repeat_count": 0.0, + "routers_loss": 0.02319233864545822, + "skip_count": 1.0, + "step": 4930, + "text_loss": 0.38103172183036804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005990056529791528, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7952497.0, + "repeat_count": 0.0, + "routers_loss": 0.003423231653869152, + "skip_count": 0.0, + "step": 4932, + "text_loss": 0.30447322130203247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017822265625, + "learning_rate": 0.0005987022460200778, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7955578.0, + "repeat_count": 0.0, + "routers_loss": 0.0007005351362749934, + "skip_count": 0.0, + "step": 4934, + "text_loss": 0.49621838331222534 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.173759906075727, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0005983988012381159, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 7958741.0, + "repeat_count": 2.0, + "routers_loss": 0.03962617367506027, + "skip_count": 1.0, + "step": 4936, + "text_loss": 0.1920493096113205 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.0005980953187495476, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 7962236.0, + "repeat_count": 0.0, + "routers_loss": 0.0026006060652434826, + "skip_count": 3.0, + "step": 4938, + "text_loss": 0.5286803841590881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0005977917986706681, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7965631.0, + "repeat_count": 0.0, + "routers_loss": 0.005010952707380056, + "skip_count": 0.0, + "step": 4940, + "text_loss": 0.3507745563983917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005974882411177871, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7968516.0, + "repeat_count": 0.0, + "routers_loss": 0.0023964287247508764, + "skip_count": 0.0, + "step": 4942, + "text_loss": 0.9110504388809204 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000597184646207228, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7971310.0, + "repeat_count": 0.0, + "routers_loss": 0.0026230409275740385, + "skip_count": 1.0, + "step": 4944, + "text_loss": 0.4131232798099518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005968810140553292, + "loss": 0.0102, + "macro_f1": 0.3333333432674408, + "num_tokens": 7974809.0, + "repeat_count": 0.0, + "routers_loss": 0.0007397596491500735, + "skip_count": 0.0, + "step": 4946, + "text_loss": 0.5130466222763062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0005965773447784431, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7977800.0, + "repeat_count": 0.0, + "routers_loss": 0.0009955473942682147, + "skip_count": 0.0, + "step": 4948, + "text_loss": 0.5366153717041016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01373291015625, + "learning_rate": 0.0005962736384929362, + "loss": 0.0026, + "macro_f1": 0.3333333432674408, + "num_tokens": 7981027.0, + "repeat_count": 0.0, + "routers_loss": 0.0049227322451770306, + "skip_count": 0.0, + "step": 4950, + "text_loss": 0.17266370356082916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0005959698953151895, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7983580.0, + "repeat_count": 0.0, + "routers_loss": 0.0009975163266062737, + "skip_count": 0.0, + "step": 4952, + "text_loss": 0.2474549114704132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0005956661153615979, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7986711.0, + "repeat_count": 0.0, + "routers_loss": 0.0006475782720372081, + "skip_count": 0.0, + "step": 4954, + "text_loss": 0.5748327970504761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0005953622987485703, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7990194.0, + "repeat_count": 0.0, + "routers_loss": 0.001449751085601747, + "skip_count": 0.0, + "step": 4956, + "text_loss": 0.5163559317588806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0005950584455925301, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7993050.0, + "repeat_count": 0.0, + "routers_loss": 0.0017087773885577917, + "skip_count": 0.0, + "step": 4958, + "text_loss": 0.15892620384693146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0005947545560099142, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 7996383.0, + "repeat_count": 0.0, + "routers_loss": 0.0044417232275009155, + "skip_count": 0.0, + "step": 4960, + "text_loss": 0.48022928833961487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 23.295861461696507, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031982421875, + "learning_rate": 0.0005944506301171734, + "loss": 0.0066, + "macro_f1": 0.5492662787437439, + "num_tokens": 7999843.0, + "repeat_count": 0.0, + "routers_loss": 0.010093312710523605, + "skip_count": 2.0, + "step": 4962, + "text_loss": 0.5050316452980042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005941466680307732, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8003504.0, + "repeat_count": 0.0, + "routers_loss": 0.009699694812297821, + "skip_count": 0.0, + "step": 4964, + "text_loss": 0.30474427342414856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 23.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0005938426698671922, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 8007427.0, + "repeat_count": 1.0, + "routers_loss": 0.0016759657301008701, + "skip_count": 0.0, + "step": 4966, + "text_loss": 0.25060293078422546 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005935386357429232, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8010265.0, + "repeat_count": 2.0, + "routers_loss": 0.006916914135217667, + "skip_count": 3.0, + "step": 4968, + "text_loss": 0.49084481596946716 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0005932345657744723, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 8013733.0, + "repeat_count": 1.0, + "routers_loss": 0.017182426527142525, + "skip_count": 5.0, + "step": 4970, + "text_loss": 0.2705717980861664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00059293046007836, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8017068.0, + "repeat_count": 0.0, + "routers_loss": 0.008485594764351845, + "skip_count": 2.0, + "step": 4972, + "text_loss": 0.18570218980312347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0005926263187711201, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8020185.0, + "repeat_count": 0.0, + "routers_loss": 0.0021750847809016705, + "skip_count": 2.0, + "step": 4974, + "text_loss": 0.4457069933414459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0005923221419693001, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 8023038.0, + "repeat_count": 0.0, + "routers_loss": 0.0020193420350551605, + "skip_count": 0.0, + "step": 4976, + "text_loss": 0.7394505143165588 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.0005920179297894613, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8026236.0, + "repeat_count": 0.0, + "routers_loss": 0.001450369250960648, + "skip_count": 1.0, + "step": 4978, + "text_loss": 0.5914503335952759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.000591713682348178, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8028765.0, + "repeat_count": 0.0, + "routers_loss": 0.0017808573320508003, + "skip_count": 0.0, + "step": 4980, + "text_loss": 0.19231407344341278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005914093997620388, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8032043.0, + "repeat_count": 0.0, + "routers_loss": 0.0018225493840873241, + "skip_count": 0.0, + "step": 4982, + "text_loss": 0.3567875325679779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005911050821476449, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8035086.0, + "repeat_count": 0.0, + "routers_loss": 0.0016285666497424245, + "skip_count": 0.0, + "step": 4984, + "text_loss": 0.34609633684158325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0005908007296216119, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8038193.0, + "repeat_count": 0.0, + "routers_loss": 0.0014699801104143262, + "skip_count": 0.0, + "step": 4986, + "text_loss": 0.4492359757423401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.000590496342300568, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8041099.0, + "repeat_count": 0.0, + "routers_loss": 0.002442725468426943, + "skip_count": 0.0, + "step": 4988, + "text_loss": 0.5162975788116455 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005901919203011548, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8044350.0, + "repeat_count": 0.0, + "routers_loss": 0.008624207228422165, + "skip_count": 2.0, + "step": 4990, + "text_loss": 0.2533033490180969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005898874637400279, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8047467.0, + "repeat_count": 0.0, + "routers_loss": 0.0015421364223584533, + "skip_count": 0.0, + "step": 4992, + "text_loss": 0.4890289306640625 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0005895829727338552, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 8050626.0, + "repeat_count": 1.0, + "routers_loss": 0.0024516626726835966, + "skip_count": 2.0, + "step": 4994, + "text_loss": 0.50797039270401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005892784473993184, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 8053386.0, + "repeat_count": 0.0, + "routers_loss": 0.0018553845584392548, + "skip_count": 2.0, + "step": 4996, + "text_loss": 0.628828763961792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.000588973887853112, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8055941.0, + "repeat_count": 0.0, + "routers_loss": 0.004258487373590469, + "skip_count": 0.0, + "step": 4998, + "text_loss": 0.2643229067325592 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.474317581449956, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005886692942119441, + "loss": 0.0062, + "macro_f1": 0.8820862174034119, + "num_tokens": 8058638.0, + "repeat_count": 2.0, + "routers_loss": 0.019064312800765038, + "skip_count": 2.0, + "step": 5000, + "text_loss": 0.4925006031990051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005883646665925353, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 8062097.0, + "repeat_count": 0.0, + "routers_loss": 0.0007969749276526272, + "skip_count": 0.0, + "step": 5002, + "text_loss": 0.49412909150123596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005880600051116196, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8065202.0, + "repeat_count": 0.0, + "routers_loss": 0.005813780706375837, + "skip_count": 2.0, + "step": 5004, + "text_loss": 0.5681346654891968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0005877553098859439, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8068574.0, + "repeat_count": 0.0, + "routers_loss": 0.005012941546738148, + "skip_count": 0.0, + "step": 5006, + "text_loss": 0.2682424485683441 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005874505810322678, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 8071834.0, + "repeat_count": 0.0, + "routers_loss": 0.005859757773578167, + "skip_count": 3.0, + "step": 5008, + "text_loss": 0.6460036039352417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.000587145818667364, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 8074687.0, + "repeat_count": 0.0, + "routers_loss": 0.002868571551516652, + "skip_count": 2.0, + "step": 5010, + "text_loss": 0.2405751347541809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005868410229080181, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8077617.0, + "repeat_count": 0.0, + "routers_loss": 0.0021759893279522657, + "skip_count": 1.0, + "step": 5012, + "text_loss": 0.7455595135688782 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005865361938710286, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8080734.0, + "repeat_count": 0.0, + "routers_loss": 0.0008311949786730111, + "skip_count": 0.0, + "step": 5014, + "text_loss": 0.44876906275749207 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 23.549457000293515, + "f1_execute": 0.9756097793579102, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.0390625, + "learning_rate": 0.0005862313316732063, + "loss": 0.0054, + "macro_f1": 0.9615669250488281, + "num_tokens": 8085092.0, + "repeat_count": 2.0, + "routers_loss": 0.012511664070189, + "skip_count": 6.0, + "step": 5016, + "text_loss": 0.26010942459106445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.000585926436431375, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 8088333.0, + "repeat_count": 0.0, + "routers_loss": 0.0035441694781184196, + "skip_count": 0.0, + "step": 5018, + "text_loss": 0.28225192427635193 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 23.568241855004402, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005856215082623711, + "loss": 0.0093, + "macro_f1": 0.8823530077934265, + "num_tokens": 8091298.0, + "repeat_count": 1.0, + "routers_loss": 0.023543989285826683, + "skip_count": 2.0, + "step": 5020, + "text_loss": 0.5757577419281006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0005853165472830439, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8094361.0, + "repeat_count": 0.0, + "routers_loss": 0.003124240320175886, + "skip_count": 0.0, + "step": 5022, + "text_loss": 0.4021305739879608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0005850115536102546, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8097514.0, + "repeat_count": 0.0, + "routers_loss": 0.008170558139681816, + "skip_count": 1.0, + "step": 5024, + "text_loss": 0.18926584720611572 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0005847065273608777, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 8100525.0, + "repeat_count": 1.0, + "routers_loss": 0.02127663604915142, + "skip_count": 5.0, + "step": 5026, + "text_loss": 0.18827557563781738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0005844014686517998, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8104016.0, + "repeat_count": 0.0, + "routers_loss": 0.00272122910246253, + "skip_count": 0.0, + "step": 5028, + "text_loss": 0.15534701943397522 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 23.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005840963775999199, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8106697.0, + "repeat_count": 5.0, + "routers_loss": 0.008979840204119682, + "skip_count": 4.0, + "step": 5030, + "text_loss": 0.8123718500137329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005837912543221493, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 8110986.0, + "repeat_count": 0.0, + "routers_loss": 0.005006929859519005, + "skip_count": 0.0, + "step": 5032, + "text_loss": 0.26128846406936646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0005834860989354121, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 8114010.0, + "repeat_count": 0.0, + "routers_loss": 0.0005531277856789529, + "skip_count": 0.0, + "step": 5034, + "text_loss": 0.5100266933441162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.64338127384796, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0005831809115566442, + "loss": 0.0073, + "macro_f1": 0.6538461446762085, + "num_tokens": 8117168.0, + "repeat_count": 2.0, + "routers_loss": 0.04978533461689949, + "skip_count": 1.0, + "step": 5036, + "text_loss": 0.41049885749816895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0005828756923027941, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8119900.0, + "repeat_count": 0.0, + "routers_loss": 0.0006322385743260384, + "skip_count": 0.0, + "step": 5038, + "text_loss": 0.5584380626678467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0005825704412908225, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8123928.0, + "repeat_count": 0.0, + "routers_loss": 0.001000594231300056, + "skip_count": 0.0, + "step": 5040, + "text_loss": 0.6460791230201721 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005822651586377019, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 8127926.0, + "repeat_count": 0.0, + "routers_loss": 0.011595834977924824, + "skip_count": 2.0, + "step": 5042, + "text_loss": 0.3131820261478424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0005819598444604173, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 8131092.0, + "repeat_count": 0.0, + "routers_loss": 0.004449303261935711, + "skip_count": 3.0, + "step": 5044, + "text_loss": 0.2774372696876526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0005816544988759658, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8134051.0, + "repeat_count": 0.0, + "routers_loss": 0.0007877505850046873, + "skip_count": 0.0, + "step": 5046, + "text_loss": 0.39496293663978577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0005813491220013563, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 8138725.0, + "repeat_count": 0.0, + "routers_loss": 0.002868623472750187, + "skip_count": 0.0, + "step": 5048, + "text_loss": 0.3779948651790619 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0005810437139536098, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 8141913.0, + "repeat_count": 2.0, + "routers_loss": 0.006244937423616648, + "skip_count": 4.0, + "step": 5050, + "text_loss": 0.4512978494167328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0005807382748497592, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 8146193.0, + "repeat_count": 0.0, + "routers_loss": 0.0011013929033651948, + "skip_count": 0.0, + "step": 5052, + "text_loss": 0.6194499731063843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0005804328048068493, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8149701.0, + "repeat_count": 0.0, + "routers_loss": 0.005505079869180918, + "skip_count": 1.0, + "step": 5054, + "text_loss": 0.2932305335998535 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005801273039419368, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 8152861.0, + "repeat_count": 1.0, + "routers_loss": 0.0057641929015517235, + "skip_count": 1.0, + "step": 5056, + "text_loss": 0.2631317973136902 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005798217723720904, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 8155843.0, + "repeat_count": 1.0, + "routers_loss": 0.0021671492140740156, + "skip_count": 5.0, + "step": 5058, + "text_loss": 0.2889988422393799 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0005795162102143902, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8158812.0, + "repeat_count": 0.0, + "routers_loss": 0.004476628266274929, + "skip_count": 1.0, + "step": 5060, + "text_loss": 0.48028868436813354 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005792106175859283, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 8162719.0, + "repeat_count": 1.0, + "routers_loss": 0.0038497636560350657, + "skip_count": 3.0, + "step": 5062, + "text_loss": 0.4559471607208252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0005789049946038083, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8165692.0, + "repeat_count": 0.0, + "routers_loss": 0.004451582673937082, + "skip_count": 0.0, + "step": 5064, + "text_loss": 0.3782602548599243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005785993413851456, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8168900.0, + "repeat_count": 0.0, + "routers_loss": 0.002951978938654065, + "skip_count": 0.0, + "step": 5066, + "text_loss": 0.32392629981040955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.000578293658047067, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8171661.0, + "repeat_count": 0.0, + "routers_loss": 0.011171254329383373, + "skip_count": 2.0, + "step": 5068, + "text_loss": 0.24492619931697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0005779879447067109, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 8175075.0, + "repeat_count": 0.0, + "routers_loss": 0.0016067599644884467, + "skip_count": 0.0, + "step": 5070, + "text_loss": 0.7738823294639587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.000577682201481227, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8178515.0, + "repeat_count": 0.0, + "routers_loss": 0.009113503620028496, + "skip_count": 1.0, + "step": 5072, + "text_loss": 0.2082248032093048 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0005773764284877774, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8181790.0, + "repeat_count": 1.0, + "routers_loss": 0.007332196459174156, + "skip_count": 1.0, + "step": 5074, + "text_loss": 0.4557662904262543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0005770706258435342, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8184854.0, + "repeat_count": 0.0, + "routers_loss": 0.0016252279747277498, + "skip_count": 0.0, + "step": 5076, + "text_loss": 0.2888098657131195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0005767647936656818, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8187860.0, + "repeat_count": 0.0, + "routers_loss": 0.003406575648114085, + "skip_count": 0.0, + "step": 5078, + "text_loss": 0.6533790230751038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005764589320714158, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 8191683.0, + "repeat_count": 0.0, + "routers_loss": 0.0006520140450447798, + "skip_count": 0.0, + "step": 5080, + "text_loss": 0.6903796195983887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0005761530411779426, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8195109.0, + "repeat_count": 0.0, + "routers_loss": 0.01188349537551403, + "skip_count": 1.0, + "step": 5082, + "text_loss": 0.20460398495197296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0005758471211024804, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 8198340.0, + "repeat_count": 0.0, + "routers_loss": 0.004826809279620647, + "skip_count": 3.0, + "step": 5084, + "text_loss": 0.2203969657421112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.0005755411719622584, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8200882.0, + "repeat_count": 0.0, + "routers_loss": 0.0019170823507010937, + "skip_count": 0.0, + "step": 5086, + "text_loss": 0.6744595170021057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005752351938745167, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 8203777.0, + "repeat_count": 0.0, + "routers_loss": 0.002110893838107586, + "skip_count": 1.0, + "step": 5088, + "text_loss": 0.4137859046459198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.000574929186956507, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 8207627.0, + "repeat_count": 0.0, + "routers_loss": 0.0018580821342766285, + "skip_count": 1.0, + "step": 5090, + "text_loss": 0.4830456078052521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.906369239800412, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0005746231513254912, + "loss": 0.0066, + "macro_f1": 0.3272727429866791, + "num_tokens": 8210263.0, + "repeat_count": 1.0, + "routers_loss": 0.0194723978638649, + "skip_count": 0.0, + "step": 5092, + "text_loss": 0.17383277416229248 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005743170870987433, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 8214166.0, + "repeat_count": 0.0, + "routers_loss": 0.006944256369024515, + "skip_count": 2.0, + "step": 5094, + "text_loss": 0.20003484189510345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0005740109943935472, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8217545.0, + "repeat_count": 0.0, + "routers_loss": 0.002044794149696827, + "skip_count": 1.0, + "step": 5096, + "text_loss": 0.5117167830467224 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0005737048733271986, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 8220673.0, + "repeat_count": 1.0, + "routers_loss": 0.009966124780476093, + "skip_count": 2.0, + "step": 5098, + "text_loss": 0.2705996036529541 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005733987240170035, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8223796.0, + "repeat_count": 0.0, + "routers_loss": 0.0009675708715803921, + "skip_count": 0.0, + "step": 5100, + "text_loss": 0.7016357183456421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0005730925465802788, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8227048.0, + "repeat_count": 0.0, + "routers_loss": 0.0009548200177960098, + "skip_count": 0.0, + "step": 5102, + "text_loss": 0.30823078751564026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005727863411343526, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8229971.0, + "repeat_count": 0.0, + "routers_loss": 0.0005767418188042939, + "skip_count": 0.0, + "step": 5104, + "text_loss": 0.6897505521774292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0005724801077965629, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8232758.0, + "repeat_count": 0.0, + "routers_loss": 0.009297889657318592, + "skip_count": 3.0, + "step": 5106, + "text_loss": 0.21293514966964722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.981508658643968, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005721738466842592, + "loss": 0.0079, + "macro_f1": 0.3272727429866791, + "num_tokens": 8238154.0, + "repeat_count": 1.0, + "routers_loss": 0.013964693062007427, + "skip_count": 0.0, + "step": 5108, + "text_loss": 0.7273620367050171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 23.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005718675579148014, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8240818.0, + "repeat_count": 3.0, + "routers_loss": 0.007218098267912865, + "skip_count": 1.0, + "step": 5110, + "text_loss": 0.5607150793075562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005715612416055598, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 8244048.0, + "repeat_count": 0.0, + "routers_loss": 0.007558444049209356, + "skip_count": 2.0, + "step": 5112, + "text_loss": 0.23694385588169098 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.009392427355444, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005712548978739154, + "loss": 0.0072, + "macro_f1": 0.6603773832321167, + "num_tokens": 8247240.0, + "repeat_count": 1.0, + "routers_loss": 0.015726923942565918, + "skip_count": 1.0, + "step": 5114, + "text_loss": 0.6032099723815918 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.01878485471089, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0005709485268372598, + "loss": 0.0046, + "macro_f1": 0.9262410998344421, + "num_tokens": 8250585.0, + "repeat_count": 3.0, + "routers_loss": 0.011148860678076744, + "skip_count": 2.0, + "step": 5116, + "text_loss": 0.6825997233390808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005706421286129948, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 8254240.0, + "repeat_count": 0.0, + "routers_loss": 0.006977916229516268, + "skip_count": 0.0, + "step": 5118, + "text_loss": 0.2532844543457031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0005703357033185328, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 8257133.0, + "repeat_count": 0.0, + "routers_loss": 0.006415650714188814, + "skip_count": 2.0, + "step": 5120, + "text_loss": 0.6132124066352844 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005700292510712967, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 8261076.0, + "repeat_count": 1.0, + "routers_loss": 0.0044475216418504715, + "skip_count": 1.0, + "step": 5122, + "text_loss": 0.4277699887752533 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005697227719887194, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8264607.0, + "repeat_count": 0.0, + "routers_loss": 0.005743155721575022, + "skip_count": 2.0, + "step": 5124, + "text_loss": 0.2570968270301819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005694162661882444, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8267992.0, + "repeat_count": 0.0, + "routers_loss": 0.0007581565878354013, + "skip_count": 0.0, + "step": 5126, + "text_loss": 0.5850184559822083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005691097337873252, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 8271010.0, + "repeat_count": 0.0, + "routers_loss": 0.0036611228715628386, + "skip_count": 0.0, + "step": 5128, + "text_loss": 0.660999059677124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0005688031749034258, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 8273638.0, + "repeat_count": 0.0, + "routers_loss": 0.0039906189776957035, + "skip_count": 0.0, + "step": 5130, + "text_loss": 0.5839648246765137 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0005684965896540198, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8276504.0, + "repeat_count": 1.0, + "routers_loss": 0.007539632264524698, + "skip_count": 3.0, + "step": 5132, + "text_loss": 0.27675092220306396 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 24.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005681899781565915, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 8279977.0, + "repeat_count": 2.0, + "routers_loss": 0.0026953567285090685, + "skip_count": 0.0, + "step": 5134, + "text_loss": 0.532974123954773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.000567883340528635, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 8282781.0, + "repeat_count": 0.0, + "routers_loss": 0.005754240322858095, + "skip_count": 1.0, + "step": 5136, + "text_loss": 0.31100207567214966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005675766768876542, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8286533.0, + "repeat_count": 0.0, + "routers_loss": 0.0051517849788069725, + "skip_count": 0.0, + "step": 5138, + "text_loss": 0.5734741687774658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005672699873511635, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 8289858.0, + "repeat_count": 0.0, + "routers_loss": 0.0025852699764072895, + "skip_count": 2.0, + "step": 5140, + "text_loss": 0.37045374512672424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005669632720366868, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8293038.0, + "repeat_count": 0.0, + "routers_loss": 0.0038520018570125103, + "skip_count": 0.0, + "step": 5142, + "text_loss": 0.25952374935150146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005666565310617577, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8295717.0, + "repeat_count": 0.0, + "routers_loss": 0.00026914477348327637, + "skip_count": 0.0, + "step": 5144, + "text_loss": 0.32531213760375977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0005663497645439203, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8299750.0, + "repeat_count": 0.0, + "routers_loss": 0.0055860537104308605, + "skip_count": 2.0, + "step": 5146, + "text_loss": 0.2520618438720703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005660429726007279, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8303075.0, + "repeat_count": 0.0, + "routers_loss": 0.004446739796549082, + "skip_count": 1.0, + "step": 5148, + "text_loss": 0.43672287464141846 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.17845611975345, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.07080078125, + "learning_rate": 0.000565736155349744, + "loss": 0.0076, + "macro_f1": 0.8814815282821655, + "num_tokens": 8306268.0, + "repeat_count": 2.0, + "routers_loss": 0.046915046870708466, + "skip_count": 4.0, + "step": 5150, + "text_loss": 0.35405927896499634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 24.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005654293129085412, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8310480.0, + "repeat_count": 0.0, + "routers_loss": 0.010549088008701801, + "skip_count": 4.0, + "step": 5152, + "text_loss": 0.3523249626159668 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005651224453947023, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8313367.0, + "repeat_count": 1.0, + "routers_loss": 0.002893900265917182, + "skip_count": 0.0, + "step": 5154, + "text_loss": 0.4503810703754425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0005648155529258195, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8318006.0, + "repeat_count": 0.0, + "routers_loss": 0.0018450213829055429, + "skip_count": 0.0, + "step": 5156, + "text_loss": 0.5687127113342285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0005645086356194943, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8320646.0, + "repeat_count": 0.0, + "routers_loss": 0.0026727779768407345, + "skip_count": 0.0, + "step": 5158, + "text_loss": 0.38920050859451294 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0005642016935933385, + "loss": 0.0035, + "macro_f1": 1.0, + "num_tokens": 8323915.0, + "repeat_count": 1.0, + "routers_loss": 0.00611621281132102, + "skip_count": 2.0, + "step": 5160, + "text_loss": 0.3003547787666321 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 24.234810683886117, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0005638947269649726, + "loss": 0.0063, + "macro_f1": 0.9619450569152832, + "num_tokens": 8327073.0, + "repeat_count": 1.0, + "routers_loss": 0.028447439894080162, + "skip_count": 6.0, + "step": 5162, + "text_loss": 0.24053414165973663 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0005635877358520268, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8330388.0, + "repeat_count": 0.0, + "routers_loss": 0.0013072624569758773, + "skip_count": 0.0, + "step": 5164, + "text_loss": 0.43772217631340027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005632807203721406, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 8333241.0, + "repeat_count": 0.0, + "routers_loss": 0.0009456822881475091, + "skip_count": 0.0, + "step": 5166, + "text_loss": 0.5217573046684265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.000562973680642963, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8337257.0, + "repeat_count": 0.0, + "routers_loss": 0.0023840824142098427, + "skip_count": 0.0, + "step": 5168, + "text_loss": 0.31814974546432495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0005626666167821521, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 8340143.0, + "repeat_count": 0.0, + "routers_loss": 0.0020231492817401886, + "skip_count": 3.0, + "step": 5170, + "text_loss": 0.5478505492210388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0162353515625, + "learning_rate": 0.0005623595289073755, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 8343566.0, + "repeat_count": 1.0, + "routers_loss": 0.01070715207606554, + "skip_count": 2.0, + "step": 5172, + "text_loss": 0.23213914036750793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005620524171363099, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8346836.0, + "repeat_count": 0.0, + "routers_loss": 0.003720001084730029, + "skip_count": 3.0, + "step": 5174, + "text_loss": 0.5114789009094238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005617452815866409, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 8349726.0, + "repeat_count": 1.0, + "routers_loss": 0.003322509117424488, + "skip_count": 1.0, + "step": 5176, + "text_loss": 0.4894506335258484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005614381223760635, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 8352478.0, + "repeat_count": 0.0, + "routers_loss": 0.00028752797516062856, + "skip_count": 0.0, + "step": 5178, + "text_loss": 0.6418307423591614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005611309396222817, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 8355766.0, + "repeat_count": 0.0, + "routers_loss": 0.0028724796138703823, + "skip_count": 0.0, + "step": 5180, + "text_loss": 0.23635952174663544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.328734957440563, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005608237334430085, + "loss": 0.0068, + "macro_f1": 0.6601307392120361, + "num_tokens": 8358888.0, + "repeat_count": 1.0, + "routers_loss": 0.058520980179309845, + "skip_count": 2.0, + "step": 5182, + "text_loss": 0.23434793949127197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1015625, + "learning_rate": 0.000560516503955966, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8361761.0, + "repeat_count": 0.0, + "routers_loss": 0.0021356395445764065, + "skip_count": 1.0, + "step": 5184, + "text_loss": 0.40855672955513 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000560209251278885, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 8364376.0, + "repeat_count": 0.0, + "routers_loss": 0.0016185789136216044, + "skip_count": 0.0, + "step": 5186, + "text_loss": 0.6265131831169128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005599019755295053, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8367769.0, + "repeat_count": 0.0, + "routers_loss": 0.0031490204855799675, + "skip_count": 2.0, + "step": 5188, + "text_loss": 0.4716353118419647 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0005595946768255756, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8370705.0, + "repeat_count": 1.0, + "routers_loss": 0.003500689286738634, + "skip_count": 0.0, + "step": 5190, + "text_loss": 0.5467679500579834 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0005592873552848532, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 8374217.0, + "repeat_count": 2.0, + "routers_loss": 0.010764475911855698, + "skip_count": 3.0, + "step": 5192, + "text_loss": 0.4345340132713318 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 24.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005589800110251045, + "loss": 0.0087, + "macro_f1": 1.0, + "num_tokens": 8378182.0, + "repeat_count": 2.0, + "routers_loss": 0.0010365343187004328, + "skip_count": 1.0, + "step": 5194, + "text_loss": 0.46722909808158875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005586726441641044, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8381227.0, + "repeat_count": 0.0, + "routers_loss": 0.006349093746393919, + "skip_count": 2.0, + "step": 5196, + "text_loss": 0.35410359501838684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0005583652548196362, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8384886.0, + "repeat_count": 0.0, + "routers_loss": 0.00038166221929714084, + "skip_count": 0.0, + "step": 5198, + "text_loss": 0.5950250625610352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005580578431094924, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8388939.0, + "repeat_count": 0.0, + "routers_loss": 0.0023578559048473835, + "skip_count": 2.0, + "step": 5200, + "text_loss": 0.6553771495819092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005577504091514735, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 8391629.0, + "repeat_count": 0.0, + "routers_loss": 0.0010771085508167744, + "skip_count": 0.0, + "step": 5202, + "text_loss": 0.4441985785961151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.000557442953063389, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8394440.0, + "repeat_count": 0.0, + "routers_loss": 0.005844325292855501, + "skip_count": 3.0, + "step": 5204, + "text_loss": 0.5807011723518372 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0005571354749630564, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8397731.0, + "repeat_count": 0.0, + "routers_loss": 0.006837233901023865, + "skip_count": 1.0, + "step": 5206, + "text_loss": 0.27780941128730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.000556827974968302, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8400859.0, + "repeat_count": 0.0, + "routers_loss": 0.007656649220734835, + "skip_count": 3.0, + "step": 5208, + "text_loss": 0.4746324121952057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0005565204531969606, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8404164.0, + "repeat_count": 0.0, + "routers_loss": 0.0028129038400948048, + "skip_count": 1.0, + "step": 5210, + "text_loss": 0.8513513803482056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0005562129097668746, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8407196.0, + "repeat_count": 0.0, + "routers_loss": 0.00492360582575202, + "skip_count": 1.0, + "step": 5212, + "text_loss": 0.12255420535802841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005559053447958958, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8410633.0, + "repeat_count": 0.0, + "routers_loss": 0.0020713545382022858, + "skip_count": 0.0, + "step": 5214, + "text_loss": 0.6878522634506226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0005555977584018833, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8413414.0, + "repeat_count": 0.0, + "routers_loss": 0.0007216963567771018, + "skip_count": 0.0, + "step": 5216, + "text_loss": 0.845878541469574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0005552901507027048, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8416817.0, + "repeat_count": 0.0, + "routers_loss": 0.002400130731984973, + "skip_count": 1.0, + "step": 5218, + "text_loss": 0.16753672063350677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0005549825218162365, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 8419617.0, + "repeat_count": 0.0, + "routers_loss": 0.004563181661069393, + "skip_count": 0.0, + "step": 5220, + "text_loss": 0.26107168197631836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.000554674871860362, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 8422686.0, + "repeat_count": 1.0, + "routers_loss": 0.006413881666958332, + "skip_count": 1.0, + "step": 5222, + "text_loss": 0.6333847045898438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005543672009529734, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 8425571.0, + "repeat_count": 0.0, + "routers_loss": 0.0057656955905258656, + "skip_count": 3.0, + "step": 5224, + "text_loss": 0.4552212357521057 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0005540595092119709, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 8429038.0, + "repeat_count": 2.0, + "routers_loss": 0.011755156330764294, + "skip_count": 2.0, + "step": 5226, + "text_loss": 0.16597330570220947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0005537517967552626, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8432117.0, + "repeat_count": 0.0, + "routers_loss": 0.0007519085193052888, + "skip_count": 0.0, + "step": 5228, + "text_loss": 0.6283590197563171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.000553444063700764, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 8435176.0, + "repeat_count": 0.0, + "routers_loss": 0.003066456411033869, + "skip_count": 0.0, + "step": 5230, + "text_loss": 0.2360922247171402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.0005531363101663998, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8438515.0, + "repeat_count": 0.0, + "routers_loss": 0.002865589689463377, + "skip_count": 0.0, + "step": 5232, + "text_loss": 0.8075396418571472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005528285362701011, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 8441731.0, + "repeat_count": 0.0, + "routers_loss": 0.0012521179160103202, + "skip_count": 0.0, + "step": 5234, + "text_loss": 0.584335446357727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0005525207421298077, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8444535.0, + "repeat_count": 0.0, + "routers_loss": 0.005398475099354982, + "skip_count": 3.0, + "step": 5236, + "text_loss": 0.22711622714996338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005522129278634669, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 8448337.0, + "repeat_count": 0.0, + "routers_loss": 0.002957914723083377, + "skip_count": 1.0, + "step": 5238, + "text_loss": 0.3157515823841095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0005519050935890335, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8451530.0, + "repeat_count": 0.0, + "routers_loss": 0.007757039275020361, + "skip_count": 3.0, + "step": 5240, + "text_loss": 0.2815830111503601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.610507778103905, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005515972394244704, + "loss": 0.0063, + "macro_f1": 0.6603773832321167, + "num_tokens": 8454171.0, + "repeat_count": 1.0, + "routers_loss": 0.021602008491754532, + "skip_count": 1.0, + "step": 5242, + "text_loss": 0.6024490594863892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005512893654877478, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8457544.0, + "repeat_count": 0.0, + "routers_loss": 0.006062488537281752, + "skip_count": 0.0, + "step": 5244, + "text_loss": 0.550110936164856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0005509814718968435, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 8460135.0, + "repeat_count": 0.0, + "routers_loss": 0.002793943975120783, + "skip_count": 0.0, + "step": 5246, + "text_loss": 0.4361286163330078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0005506735587697433, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8463516.0, + "repeat_count": 0.0, + "routers_loss": 0.0016669550677761436, + "skip_count": 0.0, + "step": 5248, + "text_loss": 0.4642958641052246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0005503656262244395, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8466406.0, + "repeat_count": 0.0, + "routers_loss": 0.0006051387754268944, + "skip_count": 0.0, + "step": 5250, + "text_loss": 0.3445641100406647 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 24.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005500576743789329, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 8468838.0, + "repeat_count": 2.0, + "routers_loss": 0.00654293829575181, + "skip_count": 1.0, + "step": 5252, + "text_loss": 0.2842808663845062 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.666862342236573, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005497497033512309, + "loss": 0.0077, + "macro_f1": 0.8817967176437378, + "num_tokens": 8471815.0, + "repeat_count": 2.0, + "routers_loss": 0.03845973685383797, + "skip_count": 3.0, + "step": 5254, + "text_loss": 0.2597215175628662 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 24.676254769592017, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0005494417132593487, + "loss": 0.0047, + "macro_f1": 0.9452888369560242, + "num_tokens": 8475202.0, + "repeat_count": 1.0, + "routers_loss": 0.02252381667494774, + "skip_count": 4.0, + "step": 5256, + "text_loss": 0.32269927859306335 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0005491337042213088, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8478650.0, + "repeat_count": 0.0, + "routers_loss": 0.01232751365751028, + "skip_count": 2.0, + "step": 5258, + "text_loss": 0.6523372530937195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005488256763551408, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8481724.0, + "repeat_count": 0.0, + "routers_loss": 0.0028322834987193346, + "skip_count": 0.0, + "step": 5260, + "text_loss": 0.4212580621242523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.0005485176297788814, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 8485833.0, + "repeat_count": 0.0, + "routers_loss": 0.002623105887323618, + "skip_count": 2.0, + "step": 5262, + "text_loss": 0.16906329989433289 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005482095646105748, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 8489089.0, + "repeat_count": 1.0, + "routers_loss": 0.0007179114618338645, + "skip_count": 0.0, + "step": 5264, + "text_loss": 0.4523872137069702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005479014809682721, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 8492905.0, + "repeat_count": 0.0, + "routers_loss": 0.005234059412032366, + "skip_count": 0.0, + "step": 5266, + "text_loss": 0.207139790058136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0005475933789700314, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 8495480.0, + "repeat_count": 0.0, + "routers_loss": 0.0023258263245224953, + "skip_count": 0.0, + "step": 5268, + "text_loss": 0.18060965836048126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005472852587339183, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8499070.0, + "repeat_count": 0.0, + "routers_loss": 0.0013497259933501482, + "skip_count": 0.0, + "step": 5270, + "text_loss": 0.7460769414901733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0005469771203780048, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 8502886.0, + "repeat_count": 0.0, + "routers_loss": 0.0003589815751183778, + "skip_count": 0.0, + "step": 5272, + "text_loss": 0.48119160532951355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005466689640203701, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8506646.0, + "repeat_count": 0.0, + "routers_loss": 0.006619705818593502, + "skip_count": 1.0, + "step": 5274, + "text_loss": 0.15656520426273346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005463607897791005, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 8509450.0, + "repeat_count": 0.0, + "routers_loss": 0.002992175053805113, + "skip_count": 1.0, + "step": 5276, + "text_loss": 0.486930251121521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0005460525977722886, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8512851.0, + "repeat_count": 0.0, + "routers_loss": 0.0027784097474068403, + "skip_count": 0.0, + "step": 5278, + "text_loss": 0.19654682278633118 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0005457443881180345, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8516858.0, + "repeat_count": 0.0, + "routers_loss": 0.0017648129723966122, + "skip_count": 0.0, + "step": 5280, + "text_loss": 0.580982506275177 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005454361609344444, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 8519912.0, + "repeat_count": 2.0, + "routers_loss": 0.010817649774253368, + "skip_count": 3.0, + "step": 5282, + "text_loss": 0.2644204795360565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000545127916339632, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8522396.0, + "repeat_count": 0.0, + "routers_loss": 0.001453282660804689, + "skip_count": 0.0, + "step": 5284, + "text_loss": 0.5014839172363281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005448196544517168, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8525326.0, + "repeat_count": 0.0, + "routers_loss": 0.006645771209150553, + "skip_count": 2.0, + "step": 5286, + "text_loss": 0.2983154058456421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005445113753888254, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8528611.0, + "repeat_count": 0.0, + "routers_loss": 0.0005447337171062827, + "skip_count": 0.0, + "step": 5288, + "text_loss": 0.43598243594169617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.000544203079269091, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8531571.0, + "repeat_count": 0.0, + "routers_loss": 0.0026976624503731728, + "skip_count": 0.0, + "step": 5290, + "text_loss": 0.6454944610595703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005438947662106533, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 8534565.0, + "repeat_count": 0.0, + "routers_loss": 0.002217630622908473, + "skip_count": 0.0, + "step": 5292, + "text_loss": 0.742935836315155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 24.854710889345466, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.029052734375, + "learning_rate": 0.0005435864363316584, + "loss": 0.0073, + "macro_f1": 0.8820862174034119, + "num_tokens": 8537581.0, + "repeat_count": 2.0, + "routers_loss": 0.030740609392523766, + "skip_count": 2.0, + "step": 5294, + "text_loss": 0.48913639783859253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005432780897502588, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8541271.0, + "repeat_count": 0.0, + "routers_loss": 0.005306888837367296, + "skip_count": 1.0, + "step": 5296, + "text_loss": 0.5820846557617188 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0005429697265846137, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8545052.0, + "repeat_count": 1.0, + "routers_loss": 0.002255369909107685, + "skip_count": 0.0, + "step": 5298, + "text_loss": 0.565483808517456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005426613469528881, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8548605.0, + "repeat_count": 0.0, + "routers_loss": 0.0010787079809233546, + "skip_count": 0.0, + "step": 5300, + "text_loss": 0.40154510736465454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000542352950973254, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8552581.0, + "repeat_count": 0.0, + "routers_loss": 0.0017972089117392898, + "skip_count": 0.0, + "step": 5302, + "text_loss": 0.5430748462677002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.0005420445387638891, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 8556360.0, + "repeat_count": 0.0, + "routers_loss": 0.0016180560924112797, + "skip_count": 2.0, + "step": 5304, + "text_loss": 0.544040322303772 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0005417361104429777, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 8559264.0, + "repeat_count": 1.0, + "routers_loss": 0.012688961811363697, + "skip_count": 2.0, + "step": 5306, + "text_loss": 0.2018517404794693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0005414276661287101, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8562169.0, + "repeat_count": 0.0, + "routers_loss": 0.0012141643092036247, + "skip_count": 0.0, + "step": 5308, + "text_loss": 0.5685747265815735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059326171875, + "learning_rate": 0.0005411192059392826, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 8565231.0, + "repeat_count": 0.0, + "routers_loss": 0.0015626107342541218, + "skip_count": 0.0, + "step": 5310, + "text_loss": 0.8073471784591675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0005408107299928979, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8568122.0, + "repeat_count": 0.0, + "routers_loss": 0.004773529712110758, + "skip_count": 0.0, + "step": 5312, + "text_loss": 0.22583355009555817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0005405022384077644, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8571056.0, + "repeat_count": 0.0, + "routers_loss": 0.0025621228851377964, + "skip_count": 1.0, + "step": 5314, + "text_loss": 0.25274428725242615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005401937313020967, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 8574300.0, + "repeat_count": 0.0, + "routers_loss": 0.009726752527058125, + "skip_count": 2.0, + "step": 5316, + "text_loss": 0.3283393979072571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 24.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005398852087941155, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8577424.0, + "repeat_count": 0.0, + "routers_loss": 0.012483839876949787, + "skip_count": 4.0, + "step": 5318, + "text_loss": 0.1876130849123001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.000539576671002047, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 8580309.0, + "repeat_count": 0.0, + "routers_loss": 0.0009830677881836891, + "skip_count": 0.0, + "step": 5320, + "text_loss": 0.6955490708351135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046875, + "learning_rate": 0.0005392681180441235, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8583399.0, + "repeat_count": 0.0, + "routers_loss": 0.0010819481685757637, + "skip_count": 0.0, + "step": 5322, + "text_loss": 0.4708341956138611 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.000538959550038583, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8586259.0, + "repeat_count": 0.0, + "routers_loss": 0.005763369146734476, + "skip_count": 0.0, + "step": 5324, + "text_loss": 0.20463642477989197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005386509671036695, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8589067.0, + "repeat_count": 0.0, + "routers_loss": 0.0006229027640074492, + "skip_count": 0.0, + "step": 5326, + "text_loss": 0.6819888353347778 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 25.014088641033165, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.03466796875, + "learning_rate": 0.0005383423693576325, + "loss": 0.0087, + "macro_f1": 0.9619450569152832, + "num_tokens": 8592837.0, + "repeat_count": 1.0, + "routers_loss": 0.030066559091210365, + "skip_count": 6.0, + "step": 5328, + "text_loss": 0.24606549739837646 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0005380337569187272, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8596293.0, + "repeat_count": 1.0, + "routers_loss": 0.007445990107953548, + "skip_count": 0.0, + "step": 5330, + "text_loss": 0.16730253398418427 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 25.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0005377251299052145, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8599360.0, + "repeat_count": 1.0, + "routers_loss": 0.004563331138342619, + "skip_count": 1.0, + "step": 5332, + "text_loss": 0.6856988668441772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0005374164884353608, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8602376.0, + "repeat_count": 0.0, + "routers_loss": 0.0015491938684135675, + "skip_count": 0.0, + "step": 5334, + "text_loss": 1.3248854875564575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005371078326274382, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8605400.0, + "repeat_count": 0.0, + "routers_loss": 0.0016098044579848647, + "skip_count": 0.0, + "step": 5336, + "text_loss": 0.747150182723999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 25.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0005367991625997243, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 8608100.0, + "repeat_count": 0.0, + "routers_loss": 0.0034471298567950726, + "skip_count": 3.0, + "step": 5338, + "text_loss": 0.6443291902542114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005364904784705015, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8611768.0, + "repeat_count": 0.0, + "routers_loss": 0.007947597652673721, + "skip_count": 1.0, + "step": 5340, + "text_loss": 0.7768037915229797 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 25.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0005361817803580588, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 8614424.0, + "repeat_count": 2.0, + "routers_loss": 0.009964234195649624, + "skip_count": 2.0, + "step": 5342, + "text_loss": 0.22826914489269257 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005358730683806896, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8617826.0, + "repeat_count": 0.0, + "routers_loss": 0.0014116480015218258, + "skip_count": 0.0, + "step": 5344, + "text_loss": 0.49022090435028076 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 25.098620487232168, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005355643426566929, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 8621220.0, + "repeat_count": 1.0, + "routers_loss": 0.013940622098743916, + "skip_count": 2.0, + "step": 5346, + "text_loss": 0.26819515228271484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000535255603304373, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8623957.0, + "repeat_count": 0.0, + "routers_loss": 0.0032230091746896505, + "skip_count": 2.0, + "step": 5348, + "text_loss": 0.46905452013015747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005349468504420395, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8626760.0, + "repeat_count": 0.0, + "routers_loss": 0.002631337149068713, + "skip_count": 1.0, + "step": 5350, + "text_loss": 0.5312309861183167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005346380841880068, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8630207.0, + "repeat_count": 0.0, + "routers_loss": 0.004526057746261358, + "skip_count": 2.0, + "step": 5352, + "text_loss": 0.5810666084289551 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0005343293046605949, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8633241.0, + "repeat_count": 0.0, + "routers_loss": 0.0023941127583384514, + "skip_count": 0.0, + "step": 5354, + "text_loss": 0.18468725681304932 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0005340205119781288, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8636215.0, + "repeat_count": 1.0, + "routers_loss": 0.0017020340310409665, + "skip_count": 0.0, + "step": 5356, + "text_loss": 0.6665788888931274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005337117062589383, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 8639326.0, + "repeat_count": 0.0, + "routers_loss": 0.004964717663824558, + "skip_count": 2.0, + "step": 5358, + "text_loss": 0.19770404696464539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005334028876213585, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8642157.0, + "repeat_count": 0.0, + "routers_loss": 0.006587155628949404, + "skip_count": 0.0, + "step": 5360, + "text_loss": 0.2295130044221878 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0005330940561837291, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8645355.0, + "repeat_count": 0.0, + "routers_loss": 0.0006586945964954793, + "skip_count": 0.0, + "step": 5362, + "text_loss": 0.2701159417629242 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005327852120643947, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8648911.0, + "repeat_count": 1.0, + "routers_loss": 0.0014281768817454576, + "skip_count": 0.0, + "step": 5364, + "text_loss": 0.8957229852676392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0005324763553817053, + "loss": 0.0027, + "macro_f1": 0.3333333432674408, + "num_tokens": 8652037.0, + "repeat_count": 0.0, + "routers_loss": 0.0005899337120354176, + "skip_count": 0.0, + "step": 5366, + "text_loss": 0.38642236590385437 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 25.20193718814206, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005321674862540154, + "loss": 0.0058, + "macro_f1": 0.9265305995941162, + "num_tokens": 8655381.0, + "repeat_count": 3.0, + "routers_loss": 0.024511313065886497, + "skip_count": 1.0, + "step": 5368, + "text_loss": 0.6439879536628723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000531858604799684, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 8658476.0, + "repeat_count": 0.0, + "routers_loss": 0.0012558114249259233, + "skip_count": 0.0, + "step": 5370, + "text_loss": 0.3227672874927521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0005315497111370752, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8661982.0, + "repeat_count": 0.0, + "routers_loss": 0.0013541636290028691, + "skip_count": 0.0, + "step": 5372, + "text_loss": 0.6375321745872498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.230114470208395, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.051513671875, + "learning_rate": 0.0005312408053845575, + "loss": 0.0052, + "macro_f1": 0.5492662787437439, + "num_tokens": 8665071.0, + "repeat_count": 0.0, + "routers_loss": 0.010432626120746136, + "skip_count": 2.0, + "step": 5374, + "text_loss": 0.536924421787262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005309318876605042, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8668411.0, + "repeat_count": 0.0, + "routers_loss": 0.004450209904462099, + "skip_count": 1.0, + "step": 5376, + "text_loss": 0.2643466889858246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005306229580832933, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 8672088.0, + "repeat_count": 1.0, + "routers_loss": 0.011189920827746391, + "skip_count": 3.0, + "step": 5378, + "text_loss": 0.8259533047676086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.000530314016771307, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8675206.0, + "repeat_count": 0.0, + "routers_loss": 0.0020095291547477245, + "skip_count": 0.0, + "step": 5380, + "text_loss": 0.31364113092422485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.267684179630173, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0005300050638429324, + "loss": 0.0078, + "macro_f1": 0.3272727429866791, + "num_tokens": 8678289.0, + "repeat_count": 0.0, + "routers_loss": 0.010738557204604149, + "skip_count": 1.0, + "step": 5382, + "text_loss": 0.19013966619968414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0005296960994165607, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8681555.0, + "repeat_count": 0.0, + "routers_loss": 0.0018534278497099876, + "skip_count": 1.0, + "step": 5384, + "text_loss": 0.762248694896698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0005293871236105877, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 8684413.0, + "repeat_count": 0.0, + "routers_loss": 0.009143726900219917, + "skip_count": 2.0, + "step": 5386, + "text_loss": 0.19994212687015533 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 25.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005290781365434134, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8687450.0, + "repeat_count": 2.0, + "routers_loss": 0.002034468576312065, + "skip_count": 0.0, + "step": 5388, + "text_loss": 0.5519160628318787 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0005287691383334425, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8690651.0, + "repeat_count": 1.0, + "routers_loss": 0.006834167055785656, + "skip_count": 0.0, + "step": 5390, + "text_loss": 0.5439304709434509 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0005284601290990832, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8693929.0, + "repeat_count": 1.0, + "routers_loss": 0.0022327799815684557, + "skip_count": 0.0, + "step": 5392, + "text_loss": 0.24108269810676575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0005281511089587491, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 8696727.0, + "repeat_count": 0.0, + "routers_loss": 0.002669565612450242, + "skip_count": 0.0, + "step": 5394, + "text_loss": 0.8659077286720276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005278420780308568, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8700934.0, + "repeat_count": 0.0, + "routers_loss": 0.007252473384141922, + "skip_count": 0.0, + "step": 5396, + "text_loss": 0.5592793226242065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005275330364338276, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 8704449.0, + "repeat_count": 0.0, + "routers_loss": 0.001793015981093049, + "skip_count": 0.0, + "step": 5398, + "text_loss": 0.5211784243583679 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 25.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0005272239842860868, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 8707384.0, + "repeat_count": 5.0, + "routers_loss": 0.00963665172457695, + "skip_count": 4.0, + "step": 5400, + "text_loss": 0.6092788577079773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.36160845318462, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03515625, + "learning_rate": 0.0005269149217060642, + "loss": 0.0059, + "macro_f1": 0.5492662787437439, + "num_tokens": 8710453.0, + "repeat_count": 0.0, + "routers_loss": 0.01758105307817459, + "skip_count": 2.0, + "step": 5402, + "text_loss": 0.3423936069011688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0005266058488121926, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8713514.0, + "repeat_count": 0.0, + "routers_loss": 0.0025636721402406693, + "skip_count": 1.0, + "step": 5404, + "text_loss": 0.484171986579895 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.38039330789551, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0005262967657229095, + "loss": 0.0064, + "macro_f1": 0.9255813956260681, + "num_tokens": 8717051.0, + "repeat_count": 3.0, + "routers_loss": 0.022406045347452164, + "skip_count": 4.0, + "step": 5406, + "text_loss": 0.23368191719055176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005259876725566563, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8719987.0, + "repeat_count": 0.0, + "routers_loss": 0.004114408977329731, + "skip_count": 2.0, + "step": 5408, + "text_loss": 0.20237496495246887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.000525678569431878, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 8723258.0, + "repeat_count": 0.0, + "routers_loss": 0.006741158664226532, + "skip_count": 2.0, + "step": 5410, + "text_loss": 0.7969435453414917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.0005253694564670233, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 8726294.0, + "repeat_count": 0.0, + "routers_loss": 0.0034468702506273985, + "skip_count": 0.0, + "step": 5412, + "text_loss": 0.5533816814422607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000525060333780545, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8729603.0, + "repeat_count": 0.0, + "routers_loss": 0.01086533535271883, + "skip_count": 2.0, + "step": 5414, + "text_loss": 0.31856611371040344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005247512014908998, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 8733423.0, + "repeat_count": 0.0, + "routers_loss": 0.00512756546959281, + "skip_count": 6.0, + "step": 5416, + "text_loss": 0.6710903644561768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0005244420597165472, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 8736457.0, + "repeat_count": 0.0, + "routers_loss": 0.0026201079599559307, + "skip_count": 0.0, + "step": 5418, + "text_loss": 0.6469964981079102 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0005241329085759514, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 8739617.0, + "repeat_count": 0.0, + "routers_loss": 0.004130818881094456, + "skip_count": 0.0, + "step": 5420, + "text_loss": 0.4868837296962738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005238237481875795, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8742653.0, + "repeat_count": 0.0, + "routers_loss": 0.003171122632920742, + "skip_count": 0.0, + "step": 5422, + "text_loss": 0.12026242166757584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0005235145786699021, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 8745835.0, + "repeat_count": 0.0, + "routers_loss": 0.0008553664083592594, + "skip_count": 0.0, + "step": 5424, + "text_loss": 0.601640522480011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005232054001413941, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8749006.0, + "repeat_count": 0.0, + "routers_loss": 0.0006958908052183688, + "skip_count": 0.0, + "step": 5426, + "text_loss": 0.7083519101142883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0005228962127205329, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 8752493.0, + "repeat_count": 0.0, + "routers_loss": 0.0012221037177368999, + "skip_count": 1.0, + "step": 5428, + "text_loss": 0.3949109613895416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005225870165257997, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 8755294.0, + "repeat_count": 1.0, + "routers_loss": 0.003924673888832331, + "skip_count": 2.0, + "step": 5430, + "text_loss": 0.7487186789512634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005222778116756793, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8758043.0, + "repeat_count": 0.0, + "routers_loss": 0.002388258930295706, + "skip_count": 0.0, + "step": 5432, + "text_loss": 0.4092858135700226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0005219685982886594, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 8760618.0, + "repeat_count": 1.0, + "routers_loss": 0.0045886957086622715, + "skip_count": 0.0, + "step": 5434, + "text_loss": 0.5889580249786377 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0005216593764832311, + "loss": 0.0074, + "macro_f1": 1.0, + "num_tokens": 8764269.0, + "repeat_count": 1.0, + "routers_loss": 0.00704155582934618, + "skip_count": 2.0, + "step": 5436, + "text_loss": 0.2634117007255554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005213501463778889, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8767142.0, + "repeat_count": 0.0, + "routers_loss": 0.00368728069588542, + "skip_count": 2.0, + "step": 5438, + "text_loss": 0.3512301445007324 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0005210409080911304, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8770239.0, + "repeat_count": 0.0, + "routers_loss": 0.0012925115879625082, + "skip_count": 0.0, + "step": 5440, + "text_loss": 0.9330073595046997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005207316617414561, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8772927.0, + "repeat_count": 0.0, + "routers_loss": 0.005604506935924292, + "skip_count": 0.0, + "step": 5442, + "text_loss": 0.23477613925933838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.55884942764896, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0005204224074473701, + "loss": 0.0049, + "macro_f1": 0.6601307392120361, + "num_tokens": 8776451.0, + "repeat_count": 1.0, + "routers_loss": 0.010945434682071209, + "skip_count": 2.0, + "step": 5444, + "text_loss": 0.6184295415878296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0005201131453273789, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 8779481.0, + "repeat_count": 0.0, + "routers_loss": 0.0024414353538304567, + "skip_count": 0.0, + "step": 5446, + "text_loss": 0.16186967492103577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.57763428235985, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005198038754999926, + "loss": 0.0052, + "macro_f1": 0.3272727429866791, + "num_tokens": 8782425.0, + "repeat_count": 1.0, + "routers_loss": 0.013872416689991951, + "skip_count": 0.0, + "step": 5448, + "text_loss": 0.42294546961784363 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0005194945980837237, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8785466.0, + "repeat_count": 0.0, + "routers_loss": 0.0006147907115519047, + "skip_count": 0.0, + "step": 5450, + "text_loss": 0.6285432577133179 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005191853131970881, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8788461.0, + "repeat_count": 0.0, + "routers_loss": 0.0010585964191704988, + "skip_count": 0.0, + "step": 5452, + "text_loss": 0.6032317876815796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005188760209586044, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8791572.0, + "repeat_count": 0.0, + "routers_loss": 0.005267909727990627, + "skip_count": 1.0, + "step": 5454, + "text_loss": 0.3015609681606293 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005185667214867937, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8794697.0, + "repeat_count": 0.0, + "routers_loss": 0.000532392121385783, + "skip_count": 0.0, + "step": 5456, + "text_loss": 0.9596265554428101 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0005182574149001805, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8797880.0, + "repeat_count": 0.0, + "routers_loss": 0.0007176774088293314, + "skip_count": 0.0, + "step": 5458, + "text_loss": 0.5599364638328552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0005179481013172912, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8801995.0, + "repeat_count": 0.0, + "routers_loss": 0.0022756673861294985, + "skip_count": 0.0, + "step": 5460, + "text_loss": 0.47327280044555664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005176387808566558, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8805138.0, + "repeat_count": 0.0, + "routers_loss": 0.0025084633380174637, + "skip_count": 0.0, + "step": 5462, + "text_loss": 0.26674970984458923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0005173294536368061, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 8808102.0, + "repeat_count": 0.0, + "routers_loss": 0.0008814680040813982, + "skip_count": 0.0, + "step": 5464, + "text_loss": 0.5981299877166748 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005170201197762773, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8811431.0, + "repeat_count": 0.0, + "routers_loss": 0.0005443177651613951, + "skip_count": 0.0, + "step": 5466, + "text_loss": 1.037438988685608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0005167107793936065, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8814256.0, + "repeat_count": 0.0, + "routers_loss": 0.000494555220939219, + "skip_count": 0.0, + "step": 5468, + "text_loss": 0.5005733966827393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005164014326073333, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 8817024.0, + "repeat_count": 0.0, + "routers_loss": 0.004793747793883085, + "skip_count": 2.0, + "step": 5470, + "text_loss": 0.6999614834785461 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005160920795360002, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 8819892.0, + "repeat_count": 0.0, + "routers_loss": 0.0020966180600225925, + "skip_count": 0.0, + "step": 5472, + "text_loss": 0.5536707043647766 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0005157827202981521, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8822928.0, + "repeat_count": 0.0, + "routers_loss": 0.0020367507822811604, + "skip_count": 0.0, + "step": 5474, + "text_loss": 0.43655988574028015 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005154733550123356, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8825842.0, + "repeat_count": 0.0, + "routers_loss": 0.0020070383325219154, + "skip_count": 0.0, + "step": 5476, + "text_loss": 0.48149657249450684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0005151639837971004, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8829534.0, + "repeat_count": 0.0, + "routers_loss": 0.0016327418852597475, + "skip_count": 0.0, + "step": 5478, + "text_loss": 0.6693689227104187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000514854606770998, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8833177.0, + "repeat_count": 0.0, + "routers_loss": 0.0012691980227828026, + "skip_count": 0.0, + "step": 5480, + "text_loss": 0.44926801323890686 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005145452240525822, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8836933.0, + "repeat_count": 1.0, + "routers_loss": 0.0007724820752628148, + "skip_count": 0.0, + "step": 5482, + "text_loss": 0.5759884119033813 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005142358357604092, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 8840093.0, + "repeat_count": 1.0, + "routers_loss": 0.008331702090799809, + "skip_count": 7.0, + "step": 5484, + "text_loss": 0.47393685579299927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0005139264420130368, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8843918.0, + "repeat_count": 0.0, + "routers_loss": 0.003124477108940482, + "skip_count": 2.0, + "step": 5486, + "text_loss": 0.5298711061477661 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005136170429290259, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8846558.0, + "repeat_count": 0.0, + "routers_loss": 0.0034127775579690933, + "skip_count": 2.0, + "step": 5488, + "text_loss": 0.43582668900489807 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005133076386269383, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8849724.0, + "repeat_count": 1.0, + "routers_loss": 0.0018056259723380208, + "skip_count": 0.0, + "step": 5490, + "text_loss": 0.8116800785064697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.784267684179632, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0005129982292253384, + "loss": 0.0063, + "macro_f1": 0.6589147448539734, + "num_tokens": 8852447.0, + "repeat_count": 1.0, + "routers_loss": 0.021452350541949272, + "skip_count": 6.0, + "step": 5492, + "text_loss": 0.31878748536109924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0005126888148427927, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8855886.0, + "repeat_count": 0.0, + "routers_loss": 0.0026911941822618246, + "skip_count": 0.0, + "step": 5494, + "text_loss": 0.4021807909011841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.80305253889052, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.025634765625, + "learning_rate": 0.0005123793955978693, + "loss": 0.007, + "macro_f1": 0.5492662787437439, + "num_tokens": 8859378.0, + "repeat_count": 0.0, + "routers_loss": 0.019764510914683342, + "skip_count": 2.0, + "step": 5496, + "text_loss": 0.21608132123947144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0005120699716091379, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 8862310.0, + "repeat_count": 0.0, + "routers_loss": 0.0008988190093077719, + "skip_count": 0.0, + "step": 5498, + "text_loss": 0.34666743874549866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005117605429951707, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8865166.0, + "repeat_count": 0.0, + "routers_loss": 0.011137975379824638, + "skip_count": 2.0, + "step": 5500, + "text_loss": 0.25385144352912903 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 25.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005114511098745412, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8869923.0, + "repeat_count": 1.0, + "routers_loss": 0.006476947572082281, + "skip_count": 4.0, + "step": 5502, + "text_loss": 0.4503856301307678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.000511141672365825, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8872451.0, + "repeat_count": 0.0, + "routers_loss": 0.0022727579344063997, + "skip_count": 0.0, + "step": 5504, + "text_loss": 0.7522464990615845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005108322305875987, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8875968.0, + "repeat_count": 0.0, + "routers_loss": 0.0020014268811792135, + "skip_count": 0.0, + "step": 5506, + "text_loss": 0.30184176564216614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0005105227846584414, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8879705.0, + "repeat_count": 0.0, + "routers_loss": 0.001179999322630465, + "skip_count": 0.0, + "step": 5508, + "text_loss": 0.6187804937362671 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0005102133346969329, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8883535.0, + "repeat_count": 1.0, + "routers_loss": 0.002946492750197649, + "skip_count": 0.0, + "step": 5510, + "text_loss": 0.5961501002311707 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005099038808216555, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 8886683.0, + "repeat_count": 1.0, + "routers_loss": 0.004532935563474894, + "skip_count": 3.0, + "step": 5512, + "text_loss": 0.38462957739830017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0005095944231511922, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 8891049.0, + "repeat_count": 0.0, + "routers_loss": 0.00917842984199524, + "skip_count": 2.0, + "step": 5514, + "text_loss": 0.27541956305503845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0005092849618041279, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 8893604.0, + "repeat_count": 0.0, + "routers_loss": 0.0008756510796956718, + "skip_count": 0.0, + "step": 5516, + "text_loss": 0.681315541267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005089754968990487, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8898072.0, + "repeat_count": 0.0, + "routers_loss": 0.0008704439387656748, + "skip_count": 1.0, + "step": 5518, + "text_loss": 0.5060005187988281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005086660285545422, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8901539.0, + "repeat_count": 0.0, + "routers_loss": 0.004750201944261789, + "skip_count": 1.0, + "step": 5520, + "text_loss": 0.6008047461509705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.000508356556889197, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8904525.0, + "repeat_count": 0.0, + "routers_loss": 0.0026552649214863777, + "skip_count": 0.0, + "step": 5522, + "text_loss": 0.4539012908935547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005080470820216037, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8907624.0, + "repeat_count": 0.0, + "routers_loss": 0.002621029270812869, + "skip_count": 1.0, + "step": 5524, + "text_loss": 0.20088370144367218 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 25.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005077376040703533, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 8910515.0, + "repeat_count": 3.0, + "routers_loss": 0.0028921898920089006, + "skip_count": 0.0, + "step": 5526, + "text_loss": 0.6575983166694641 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8888888955116272, + "avg_layers": 21.0, + "epoch": 25.953331376577633, + "f1_execute": 0.9729729890823364, + "f1_repeat": 1.0, + "f1_skip": 0.9411765336990356, + "grad_norm": 0.02734375, + "learning_rate": 0.0005074281231540384, + "loss": 0.0076, + "macro_f1": 0.9713832139968872, + "num_tokens": 8914419.0, + "repeat_count": 1.0, + "routers_loss": 0.024232301861047745, + "skip_count": 9.0, + "step": 5528, + "text_loss": 0.5435594916343689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005071186393912527, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8917543.0, + "repeat_count": 0.0, + "routers_loss": 0.003731841454282403, + "skip_count": 2.0, + "step": 5530, + "text_loss": 0.5152071118354797 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005068091529005909, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 8920728.0, + "repeat_count": 1.0, + "routers_loss": 0.005905418191105127, + "skip_count": 0.0, + "step": 5532, + "text_loss": 0.29741042852401733 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000506499663800649, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 8924112.0, + "repeat_count": 1.0, + "routers_loss": 0.0021933517418801785, + "skip_count": 0.0, + "step": 5534, + "text_loss": 0.45704230666160583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 25.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005061901722100235, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 8927323.0, + "repeat_count": 0.0, + "routers_loss": 0.009227502159774303, + "skip_count": 4.0, + "step": 5536, + "text_loss": 0.1968434453010559 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0005058806782473125, + "loss": 0.0053, + "macro_f1": 0.6601307392120361, + "num_tokens": 8931052.0, + "repeat_count": 1.0, + "routers_loss": 0.02054760232567787, + "skip_count": 2.0, + "step": 5538, + "text_loss": 0.23851273953914642 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0005055711820311144, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8934215.0, + "repeat_count": 0.0, + "routers_loss": 0.0008434011251665652, + "skip_count": 0.0, + "step": 5540, + "text_loss": 0.85942542552948 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005052616836800288, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8937173.0, + "repeat_count": 0.0, + "routers_loss": 0.011105241253972054, + "skip_count": 4.0, + "step": 5542, + "text_loss": 0.2614556849002838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005049521833126561, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8940553.0, + "repeat_count": 0.0, + "routers_loss": 0.0006273435428738594, + "skip_count": 0.0, + "step": 5544, + "text_loss": 0.6430498957633972 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005046426810475976, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8943753.0, + "repeat_count": 0.0, + "routers_loss": 0.0023464353289455175, + "skip_count": 1.0, + "step": 5546, + "text_loss": 0.7015808820724487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0005043331770034547, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 8947149.0, + "repeat_count": 0.0, + "routers_loss": 0.0016024730866774917, + "skip_count": 1.0, + "step": 5548, + "text_loss": 0.5875257253646851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005040236712988304, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8950374.0, + "repeat_count": 0.0, + "routers_loss": 0.004096277989447117, + "skip_count": 0.0, + "step": 5550, + "text_loss": 0.1712338626384735 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0005037141640523275, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8953256.0, + "repeat_count": 1.0, + "routers_loss": 0.00441550649702549, + "skip_count": 0.0, + "step": 5552, + "text_loss": 0.16560404002666473 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005034046553825501, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 8956845.0, + "repeat_count": 4.0, + "routers_loss": 0.011712636798620224, + "skip_count": 6.0, + "step": 5554, + "text_loss": 0.24278216063976288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005030951454081023, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8961165.0, + "repeat_count": 0.0, + "routers_loss": 0.00235542468726635, + "skip_count": 1.0, + "step": 5556, + "text_loss": 0.17214511334896088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.093924273554446, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0005027856342475888, + "loss": 0.0037, + "macro_f1": 0.3272727429866791, + "num_tokens": 8965262.0, + "repeat_count": 0.0, + "routers_loss": 0.0160827673971653, + "skip_count": 1.0, + "step": 5558, + "text_loss": 0.40229740738868713 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0005024761220196151, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 8968278.0, + "repeat_count": 1.0, + "routers_loss": 0.004786997567862272, + "skip_count": 0.0, + "step": 5560, + "text_loss": 0.24828575551509857 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0005021666088427868, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8971443.0, + "repeat_count": 1.0, + "routers_loss": 0.0015378865646198392, + "skip_count": 0.0, + "step": 5562, + "text_loss": 0.7269657254219055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.0005018570948357099, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8975312.0, + "repeat_count": 0.0, + "routers_loss": 0.0015218508196994662, + "skip_count": 0.0, + "step": 5564, + "text_loss": 0.5198811292648315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0005015475801169908, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 8977951.0, + "repeat_count": 0.0, + "routers_loss": 0.008865317329764366, + "skip_count": 1.0, + "step": 5566, + "text_loss": 0.1541406810283661 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005012380648052359, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8981325.0, + "repeat_count": 1.0, + "routers_loss": 0.0055318837985396385, + "skip_count": 0.0, + "step": 5568, + "text_loss": 0.510314404964447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005009285490190523, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8984661.0, + "repeat_count": 0.0, + "routers_loss": 0.0035060355439782143, + "skip_count": 0.0, + "step": 5570, + "text_loss": 0.29421761631965637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000500619032877047, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8987573.0, + "repeat_count": 0.0, + "routers_loss": 0.0050126477144658566, + "skip_count": 2.0, + "step": 5572, + "text_loss": 0.1984361708164215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005003095164978271, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 8991136.0, + "repeat_count": 0.0, + "routers_loss": 0.0019407360814511776, + "skip_count": 0.0, + "step": 5574, + "text_loss": 0.42751404643058777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8994198.0, + "repeat_count": 0.0, + "routers_loss": 0.0029819176997989416, + "skip_count": 2.0, + "step": 5576, + "text_loss": 0.20589640736579895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004996904835021729, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 8997907.0, + "repeat_count": 0.0, + "routers_loss": 0.000878945691511035, + "skip_count": 1.0, + "step": 5578, + "text_loss": 0.2801406979560852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000499380967122953, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9001141.0, + "repeat_count": 0.0, + "routers_loss": 0.005223734769970179, + "skip_count": 1.0, + "step": 5580, + "text_loss": 0.20542480051517487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004990714509809478, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9004794.0, + "repeat_count": 0.0, + "routers_loss": 0.0015868612099438906, + "skip_count": 0.0, + "step": 5582, + "text_loss": 0.32094934582710266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 26.216025829175226, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004987619351947643, + "loss": 0.0064, + "macro_f1": 0.6122449040412903, + "num_tokens": 9009250.0, + "repeat_count": 0.0, + "routers_loss": 0.031923454254865646, + "skip_count": 4.0, + "step": 5584, + "text_loss": 0.609201967716217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0004984524198830095, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9013254.0, + "repeat_count": 0.0, + "routers_loss": 0.0033124545589089394, + "skip_count": 0.0, + "step": 5586, + "text_loss": 0.3698650300502777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0004981429051642903, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9016598.0, + "repeat_count": 0.0, + "routers_loss": 0.0017190382350236177, + "skip_count": 1.0, + "step": 5588, + "text_loss": 0.5306026935577393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.24420311124156, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004978333911572132, + "loss": 0.0059, + "macro_f1": 0.3272727429866791, + "num_tokens": 9019558.0, + "repeat_count": 0.0, + "routers_loss": 0.02051064372062683, + "skip_count": 1.0, + "step": 5590, + "text_loss": 0.23494470119476318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0004975238779803849, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 9023024.0, + "repeat_count": 0.0, + "routers_loss": 0.0010489600244909525, + "skip_count": 0.0, + "step": 5592, + "text_loss": 0.579275906085968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0004972143657524112, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9026161.0, + "repeat_count": 0.0, + "routers_loss": 0.0012039231369271874, + "skip_count": 0.0, + "step": 5594, + "text_loss": 0.5776295065879822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0004969048545918978, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9028814.0, + "repeat_count": 0.0, + "routers_loss": 0.0010212450288236141, + "skip_count": 1.0, + "step": 5596, + "text_loss": 0.6816855669021606 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 26.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00049659534461745, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9032243.0, + "repeat_count": 2.0, + "routers_loss": 0.0024297661148011684, + "skip_count": 0.0, + "step": 5598, + "text_loss": 0.743188202381134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0004962858359476726, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 9035493.0, + "repeat_count": 0.0, + "routers_loss": 0.002151754219084978, + "skip_count": 0.0, + "step": 5600, + "text_loss": 0.5213983654975891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004959763287011698, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 9038213.0, + "repeat_count": 0.0, + "routers_loss": 0.0028108188416808844, + "skip_count": 2.0, + "step": 5602, + "text_loss": 0.5128397345542908 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004956668229965454, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9041152.0, + "repeat_count": 0.0, + "routers_loss": 0.004022551700472832, + "skip_count": 2.0, + "step": 5604, + "text_loss": 0.15361636877059937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0004953573189524026, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9044503.0, + "repeat_count": 0.0, + "routers_loss": 0.0010689410846680403, + "skip_count": 1.0, + "step": 5606, + "text_loss": 0.6454885005950928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0004950478166873439, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 9047742.0, + "repeat_count": 0.0, + "routers_loss": 0.0025760293938219547, + "skip_count": 0.0, + "step": 5608, + "text_loss": 0.7654000520706177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0004947383163199713, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 9050349.0, + "repeat_count": 0.0, + "routers_loss": 0.0009846165776252747, + "skip_count": 0.0, + "step": 5610, + "text_loss": 0.41533342003822327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0004944288179688858, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 9053667.0, + "repeat_count": 0.0, + "routers_loss": 0.0017193946987390518, + "skip_count": 1.0, + "step": 5612, + "text_loss": 1.0172475576400757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004941193217526875, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9056777.0, + "repeat_count": 0.0, + "routers_loss": 0.0026750199031084776, + "skip_count": 0.0, + "step": 5614, + "text_loss": 0.17584927380084991 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 26.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004938098277899765, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 9060609.0, + "repeat_count": 1.0, + "routers_loss": 0.005259076599031687, + "skip_count": 1.0, + "step": 5616, + "text_loss": 0.5522297024726868 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004935003361993511, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9063633.0, + "repeat_count": 0.0, + "routers_loss": 0.0006837095716036856, + "skip_count": 0.0, + "step": 5618, + "text_loss": 0.5212588310241699 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.38508952157323, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0004931908470994091, + "loss": 0.0059, + "macro_f1": 0.6603773832321167, + "num_tokens": 9067777.0, + "repeat_count": 1.0, + "routers_loss": 0.01067375484853983, + "skip_count": 1.0, + "step": 5620, + "text_loss": 0.5515062808990479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 26.394481948928675, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.019775390625, + "learning_rate": 0.0004928813606087474, + "loss": 0.0043, + "macro_f1": 0.5934640765190125, + "num_tokens": 9070938.0, + "repeat_count": 0.0, + "routers_loss": 0.016635602340102196, + "skip_count": 3.0, + "step": 5622, + "text_loss": 0.3225076198577881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004925718768459617, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9074050.0, + "repeat_count": 0.0, + "routers_loss": 0.002216119086369872, + "skip_count": 0.0, + "step": 5624, + "text_loss": 0.32438889145851135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0004922623959296469, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 9076785.0, + "repeat_count": 1.0, + "routers_loss": 0.012125075794756413, + "skip_count": 5.0, + "step": 5626, + "text_loss": 0.39563658833503723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0004919529179783965, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9080239.0, + "repeat_count": 0.0, + "routers_loss": 0.0026486809365451336, + "skip_count": 0.0, + "step": 5628, + "text_loss": 0.5401569604873657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0004916434431108031, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9083935.0, + "repeat_count": 0.0, + "routers_loss": 0.0011849761940538883, + "skip_count": 0.0, + "step": 5630, + "text_loss": 0.4798774719238281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.000491333971445458, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9087174.0, + "repeat_count": 0.0, + "routers_loss": 0.002799210138618946, + "skip_count": 0.0, + "step": 5632, + "text_loss": 0.22488386929035187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004910245031009515, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 9089803.0, + "repeat_count": 0.0, + "routers_loss": 0.00139117450453341, + "skip_count": 0.0, + "step": 5634, + "text_loss": 0.6237335205078125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0004907150381958723, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9093075.0, + "repeat_count": 0.0, + "routers_loss": 0.006503603886812925, + "skip_count": 1.0, + "step": 5636, + "text_loss": 0.18781614303588867 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0004904055768488077, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9096355.0, + "repeat_count": 0.0, + "routers_loss": 0.0009764843271113932, + "skip_count": 0.0, + "step": 5638, + "text_loss": 0.6821450591087341 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004900961191783445, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9098994.0, + "repeat_count": 1.0, + "routers_loss": 0.00693159457296133, + "skip_count": 3.0, + "step": 5640, + "text_loss": 0.214790940284729 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0004897866653030671, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9102048.0, + "repeat_count": 0.0, + "routers_loss": 0.002469591563567519, + "skip_count": 0.0, + "step": 5642, + "text_loss": 0.1556607335805893 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004894772153415588, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9105379.0, + "repeat_count": 0.0, + "routers_loss": 0.0004824921488761902, + "skip_count": 0.0, + "step": 5644, + "text_loss": 0.499972403049469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004891677694124013, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9108240.0, + "repeat_count": 0.0, + "routers_loss": 0.0029356612358242273, + "skip_count": 1.0, + "step": 5646, + "text_loss": 0.5169754028320312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0174560546875, + "learning_rate": 0.0004888583276341751, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 9111381.0, + "repeat_count": 0.0, + "routers_loss": 0.009489183314144611, + "skip_count": 1.0, + "step": 5648, + "text_loss": 0.23630797863006592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.017822265625, + "learning_rate": 0.0004885488901254588, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9114015.0, + "repeat_count": 0.0, + "routers_loss": 0.004154495894908905, + "skip_count": 1.0, + "step": 5650, + "text_loss": 0.3345947563648224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0004882394570048294, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9117044.0, + "repeat_count": 0.0, + "routers_loss": 0.0018865863094106317, + "skip_count": 0.0, + "step": 5652, + "text_loss": 0.32814112305641174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0004879300283908623, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9120035.0, + "repeat_count": 0.0, + "routers_loss": 0.0035278978757560253, + "skip_count": 1.0, + "step": 5654, + "text_loss": 0.4081386625766754 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00048762060440213096, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 9122955.0, + "repeat_count": 1.0, + "routers_loss": 0.0053498269990086555, + "skip_count": 0.0, + "step": 5656, + "text_loss": 0.31027838587760925 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004873111851572075, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9125635.0, + "repeat_count": 0.0, + "routers_loss": 0.004556098487228155, + "skip_count": 0.0, + "step": 5658, + "text_loss": 0.25703540444374084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004870017707746617, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 9128906.0, + "repeat_count": 0.0, + "routers_loss": 0.0031165245454758406, + "skip_count": 2.0, + "step": 5660, + "text_loss": 0.20663656294345856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004866923613730617, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 9132030.0, + "repeat_count": 1.0, + "routers_loss": 0.004887583665549755, + "skip_count": 2.0, + "step": 5662, + "text_loss": 0.6062649488449097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004863829570709741, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 9135274.0, + "repeat_count": 0.0, + "routers_loss": 0.0021857863757759333, + "skip_count": 0.0, + "step": 5664, + "text_loss": 0.49644309282302856 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.601115350748458, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0004860735579869631, + "loss": 0.0088, + "macro_f1": 0.925203263759613, + "num_tokens": 9139735.0, + "repeat_count": 3.0, + "routers_loss": 0.05413912236690521, + "skip_count": 5.0, + "step": 5666, + "text_loss": 0.25161290168762207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00048576416423959097, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9142419.0, + "repeat_count": 0.0, + "routers_loss": 0.002229376696050167, + "skip_count": 0.0, + "step": 5668, + "text_loss": 0.5332949161529541 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0004854547759474179, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 9145443.0, + "repeat_count": 1.0, + "routers_loss": 0.005968933925032616, + "skip_count": 4.0, + "step": 5670, + "text_loss": 0.5282154083251953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.629292632814792, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0004851453932290021, + "loss": 0.0085, + "macro_f1": 0.3272727429866791, + "num_tokens": 9147754.0, + "repeat_count": 0.0, + "routers_loss": 0.04015754163265228, + "skip_count": 1.0, + "step": 5672, + "text_loss": 0.8564629554748535 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.63868506017024, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00048483601620289974, + "loss": 0.0058, + "macro_f1": 0.8820862174034119, + "num_tokens": 9151714.0, + "repeat_count": 2.0, + "routers_loss": 0.019172413274645805, + "skip_count": 2.0, + "step": 5674, + "text_loss": 0.4149441123008728 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0004845266449876645, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9154524.0, + "repeat_count": 1.0, + "routers_loss": 0.005025535821914673, + "skip_count": 0.0, + "step": 5676, + "text_loss": 0.26525792479515076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.000484217279701848, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9158546.0, + "repeat_count": 0.0, + "routers_loss": 0.0012200147612020373, + "skip_count": 0.0, + "step": 5678, + "text_loss": 0.5532271862030029 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0004839079204639998, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9161003.0, + "repeat_count": 0.0, + "routers_loss": 0.0013485675444826484, + "skip_count": 1.0, + "step": 5680, + "text_loss": 0.36826151609420776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0004835985673926668, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9164741.0, + "repeat_count": 0.0, + "routers_loss": 0.00532014574855566, + "skip_count": 2.0, + "step": 5682, + "text_loss": 0.16154609620571136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0004832892206063938, + "loss": 0.0075, + "macro_f1": 1.0, + "num_tokens": 9168079.0, + "repeat_count": 2.0, + "routers_loss": 0.007782323285937309, + "skip_count": 3.0, + "step": 5684, + "text_loss": 0.4323575496673584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.0004829798802237228, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9171352.0, + "repeat_count": 0.0, + "routers_loss": 0.0024159469176083803, + "skip_count": 2.0, + "step": 5686, + "text_loss": 0.3163119852542877 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.000482670546363194, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 9175197.0, + "repeat_count": 0.0, + "routers_loss": 0.002455134643241763, + "skip_count": 0.0, + "step": 5688, + "text_loss": 0.59735506772995 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.713824479013795, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0004823612191433443, + "loss": 0.0042, + "macro_f1": 0.8820862174034119, + "num_tokens": 9177648.0, + "repeat_count": 2.0, + "routers_loss": 0.015524548478424549, + "skip_count": 2.0, + "step": 5690, + "text_loss": 0.759812593460083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00048205189868270887, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 9180694.0, + "repeat_count": 0.0, + "routers_loss": 0.002112736226990819, + "skip_count": 2.0, + "step": 5692, + "text_loss": 0.3516882061958313 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 26.732609333724685, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.025146484375, + "learning_rate": 0.00048174258509981973, + "loss": 0.0063, + "macro_f1": 0.9262410998344421, + "num_tokens": 9183502.0, + "repeat_count": 2.0, + "routers_loss": 0.03100527822971344, + "skip_count": 3.0, + "step": 5694, + "text_loss": 0.3722715973854065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004814332785132064, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9186417.0, + "repeat_count": 0.0, + "routers_loss": 0.009176591411232948, + "skip_count": 2.0, + "step": 5696, + "text_loss": 0.33363673090934753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.751394188435572, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004811239790413958, + "loss": 0.0076, + "macro_f1": 0.3272727429866791, + "num_tokens": 9189478.0, + "repeat_count": 0.0, + "routers_loss": 0.023586507886648178, + "skip_count": 1.0, + "step": 5698, + "text_loss": 0.19698107242584229 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00048081468680291194, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9192115.0, + "repeat_count": 0.0, + "routers_loss": 0.005083440337330103, + "skip_count": 1.0, + "step": 5700, + "text_loss": 0.3476336896419525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004805054019162764, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9195176.0, + "repeat_count": 0.0, + "routers_loss": 0.007766073569655418, + "skip_count": 1.0, + "step": 5702, + "text_loss": 0.27114811539649963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0004801961245000076, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9199091.0, + "repeat_count": 0.0, + "routers_loss": 0.0009058842551894486, + "skip_count": 0.0, + "step": 5704, + "text_loss": 0.6249846816062927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0004798868546726212, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9202003.0, + "repeat_count": 0.0, + "routers_loss": 0.005479823332279921, + "skip_count": 0.0, + "step": 5706, + "text_loss": 0.47223609685897827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0166015625, + "learning_rate": 0.00047957759255263014, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9205277.0, + "repeat_count": 0.0, + "routers_loss": 0.001055705244652927, + "skip_count": 0.0, + "step": 5708, + "text_loss": 0.677215576171875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00047926833825854377, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9208844.0, + "repeat_count": 0.0, + "routers_loss": 0.003291431115940213, + "skip_count": 2.0, + "step": 5710, + "text_loss": 0.12439999729394913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0004789590919088696, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 9211619.0, + "repeat_count": 0.0, + "routers_loss": 0.005120242480188608, + "skip_count": 2.0, + "step": 5712, + "text_loss": 0.5771954655647278 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0004786498536221111, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 9214914.0, + "repeat_count": 1.0, + "routers_loss": 0.004877795465290546, + "skip_count": 2.0, + "step": 5714, + "text_loss": 0.6432198882102966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.00047834062351676893, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 9218186.0, + "repeat_count": 0.0, + "routers_loss": 0.0026507999282330275, + "skip_count": 0.0, + "step": 5716, + "text_loss": 0.23814935982227325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00047803140171134075, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9221754.0, + "repeat_count": 0.0, + "routers_loss": 0.002605629386380315, + "skip_count": 1.0, + "step": 5718, + "text_loss": 0.2910388708114624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0004777221883243208, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9224502.0, + "repeat_count": 0.0, + "routers_loss": 0.0048494706861674786, + "skip_count": 3.0, + "step": 5720, + "text_loss": 0.6195104122161865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004774129834742004, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 9227350.0, + "repeat_count": 0.0, + "routers_loss": 0.003092368133366108, + "skip_count": 0.0, + "step": 5722, + "text_loss": 0.35447990894317627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00047710378727946725, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 9230166.0, + "repeat_count": 0.0, + "routers_loss": 0.012780336663126945, + "skip_count": 2.0, + "step": 5724, + "text_loss": 0.27581867575645447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00047679459985860604, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9233029.0, + "repeat_count": 0.0, + "routers_loss": 0.005429140292108059, + "skip_count": 1.0, + "step": 5726, + "text_loss": 0.2636827826499939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00047648542133009794, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9236317.0, + "repeat_count": 0.0, + "routers_loss": 0.0023909916635602713, + "skip_count": 0.0, + "step": 5728, + "text_loss": 0.4801979064941406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00047617625181242077, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9239796.0, + "repeat_count": 0.0, + "routers_loss": 0.003603481687605381, + "skip_count": 0.0, + "step": 5730, + "text_loss": 0.8374754786491394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0004758670914240488, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9243489.0, + "repeat_count": 0.0, + "routers_loss": 0.004478964954614639, + "skip_count": 2.0, + "step": 5732, + "text_loss": 0.3870154917240143 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000475557940283453, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9246758.0, + "repeat_count": 0.0, + "routers_loss": 0.00312575395219028, + "skip_count": 1.0, + "step": 5734, + "text_loss": 0.42341071367263794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00047524879850910026, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 9250053.0, + "repeat_count": 0.0, + "routers_loss": 0.010855631902813911, + "skip_count": 4.0, + "step": 5736, + "text_loss": 0.25729796290397644 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0004749396662194549, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 9253691.0, + "repeat_count": 0.0, + "routers_loss": 0.0009250419097952545, + "skip_count": 0.0, + "step": 5738, + "text_loss": 0.6151770949363708 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0004746305435329767, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 9256866.0, + "repeat_count": 1.0, + "routers_loss": 0.007521102204918861, + "skip_count": 3.0, + "step": 5740, + "text_loss": 0.3094986379146576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004743214305681221, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9259790.0, + "repeat_count": 0.0, + "routers_loss": 0.0022241887636482716, + "skip_count": 1.0, + "step": 5742, + "text_loss": 0.5418204069137573 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00047401232744334376, + "loss": 0.0071, + "macro_f1": 1.0, + "num_tokens": 9263205.0, + "repeat_count": 1.0, + "routers_loss": 0.008611299097537994, + "skip_count": 2.0, + "step": 5744, + "text_loss": 0.35824623703956604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 26.976812444966246, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004737032342770906, + "loss": 0.0062, + "macro_f1": 0.5492662787437439, + "num_tokens": 9266126.0, + "repeat_count": 0.0, + "routers_loss": 0.010788857005536556, + "skip_count": 2.0, + "step": 5746, + "text_loss": 0.2172674983739853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0004733941511878074, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9269308.0, + "repeat_count": 0.0, + "routers_loss": 0.005309196189045906, + "skip_count": 2.0, + "step": 5748, + "text_loss": 0.1696814000606537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.00047308507829393594, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 9272801.0, + "repeat_count": 0.0, + "routers_loss": 0.009940510615706444, + "skip_count": 2.0, + "step": 5750, + "text_loss": 0.24295592308044434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00047277601571391314, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9276197.0, + "repeat_count": 0.0, + "routers_loss": 0.000687236781232059, + "skip_count": 0.0, + "step": 5752, + "text_loss": 0.8511804342269897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.014088641033165, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00047246696356617254, + "loss": 0.0059, + "macro_f1": 0.6603773832321167, + "num_tokens": 9278965.0, + "repeat_count": 1.0, + "routers_loss": 0.009816894307732582, + "skip_count": 1.0, + "step": 5754, + "text_loss": 0.45420053601264954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0004721579219691434, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9282076.0, + "repeat_count": 0.0, + "routers_loss": 0.0015747188590466976, + "skip_count": 0.0, + "step": 5756, + "text_loss": 0.21671754121780396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0004718488910412511, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9285465.0, + "repeat_count": 0.0, + "routers_loss": 0.008654040284454823, + "skip_count": 2.0, + "step": 5758, + "text_loss": 0.25920194387435913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00047153987090091674, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9288156.0, + "repeat_count": 0.0, + "routers_loss": 0.0011430777376517653, + "skip_count": 0.0, + "step": 5760, + "text_loss": 0.7655444741249084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004712308616665576, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 9291529.0, + "repeat_count": 0.0, + "routers_loss": 0.003674200503155589, + "skip_count": 2.0, + "step": 5762, + "text_loss": 0.269486665725708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0004709218634565866, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9294699.0, + "repeat_count": 0.0, + "routers_loss": 0.003249827306717634, + "skip_count": 1.0, + "step": 5764, + "text_loss": 0.5073734521865845 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00047061287638941235, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 9297863.0, + "repeat_count": 1.0, + "routers_loss": 0.002763139782473445, + "skip_count": 2.0, + "step": 5766, + "text_loss": 0.2572014033794403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00047030390058343935, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9301124.0, + "repeat_count": 0.0, + "routers_loss": 0.007100266870111227, + "skip_count": 3.0, + "step": 5768, + "text_loss": 0.4147387742996216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0004699949361570676, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 9304330.0, + "repeat_count": 0.0, + "routers_loss": 0.005467240232974291, + "skip_count": 1.0, + "step": 5770, + "text_loss": 0.21510964632034302 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.000469685983228693, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9306882.0, + "repeat_count": 0.0, + "routers_loss": 0.003167890477925539, + "skip_count": 0.0, + "step": 5772, + "text_loss": 0.45717427134513855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.108012914587615, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00046937704191670675, + "loss": 0.0057, + "macro_f1": 0.6601307392120361, + "num_tokens": 9309767.0, + "repeat_count": 1.0, + "routers_loss": 0.014881107024848461, + "skip_count": 2.0, + "step": 5774, + "text_loss": 0.3464985191822052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004690681123394959, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9313045.0, + "repeat_count": 0.0, + "routers_loss": 0.00379011663608253, + "skip_count": 2.0, + "step": 5776, + "text_loss": 0.33194616436958313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00046875919461544265, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 9315736.0, + "repeat_count": 0.0, + "routers_loss": 0.0016733441734686494, + "skip_count": 0.0, + "step": 5778, + "text_loss": 0.5009998679161072 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00046845028886292493, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9318456.0, + "repeat_count": 0.0, + "routers_loss": 0.005318894516676664, + "skip_count": 1.0, + "step": 5780, + "text_loss": 0.17702752351760864 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.145582624009393, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.044921875, + "learning_rate": 0.00046814139520031615, + "loss": 0.006, + "macro_f1": 0.8820862174034119, + "num_tokens": 9323152.0, + "repeat_count": 2.0, + "routers_loss": 0.01133672520518303, + "skip_count": 2.0, + "step": 5782, + "text_loss": 0.2886650860309601 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0004678325137459845, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9326318.0, + "repeat_count": 0.0, + "routers_loss": 0.002458433620631695, + "skip_count": 0.0, + "step": 5784, + "text_loss": 0.5832745432853699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0004675236446182946, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9329779.0, + "repeat_count": 0.0, + "routers_loss": 0.0005402310052886605, + "skip_count": 0.0, + "step": 5786, + "text_loss": 0.5699237585067749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00046721478793560525, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 9333360.0, + "repeat_count": 0.0, + "routers_loss": 0.0002638917067088187, + "skip_count": 0.0, + "step": 5788, + "text_loss": 0.6555714011192322 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00046690594381627106, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9336498.0, + "repeat_count": 0.0, + "routers_loss": 0.003998351749032736, + "skip_count": 2.0, + "step": 5790, + "text_loss": 0.2076750248670578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00046659711237864157, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9339724.0, + "repeat_count": 0.0, + "routers_loss": 0.0045847659930586815, + "skip_count": 1.0, + "step": 5792, + "text_loss": 0.22027169167995453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.00046628829374106167, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 9342835.0, + "repeat_count": 0.0, + "routers_loss": 0.0014064523857086897, + "skip_count": 1.0, + "step": 5794, + "text_loss": 0.5120179057121277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004659794880218712, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9346757.0, + "repeat_count": 0.0, + "routers_loss": 0.0011155207175761461, + "skip_count": 1.0, + "step": 5796, + "text_loss": 0.6415372490882874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004656706953394051, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 9349652.0, + "repeat_count": 0.0, + "routers_loss": 0.0020385095849633217, + "skip_count": 0.0, + "step": 5798, + "text_loss": 0.5410398840904236 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0004653619158119933, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 9354286.0, + "repeat_count": 1.0, + "routers_loss": 0.0012847178149968386, + "skip_count": 0.0, + "step": 5800, + "text_loss": 0.4386860728263855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.00046505314955796074, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9357682.0, + "repeat_count": 0.0, + "routers_loss": 0.0035008061677217484, + "skip_count": 2.0, + "step": 5802, + "text_loss": 0.13655950129032135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00046474439669562715, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9361058.0, + "repeat_count": 0.0, + "routers_loss": 0.0020033426117151976, + "skip_count": 1.0, + "step": 5804, + "text_loss": 0.6293444037437439 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00046443565734330714, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9364173.0, + "repeat_count": 0.0, + "routers_loss": 0.0004935986362397671, + "skip_count": 0.0, + "step": 5806, + "text_loss": 0.2923166751861572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004641269316193104, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9366980.0, + "repeat_count": 0.0, + "routers_loss": 0.001654456602409482, + "skip_count": 0.0, + "step": 5808, + "text_loss": 0.7273373007774353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0004638182196419411, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 9370581.0, + "repeat_count": 0.0, + "routers_loss": 0.0017011919990181923, + "skip_count": 0.0, + "step": 5810, + "text_loss": 0.6029995083808899 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 27.286469034341064, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.038330078125, + "learning_rate": 0.0004635095215294984, + "loss": 0.0072, + "macro_f1": 0.9265305995941162, + "num_tokens": 9374233.0, + "repeat_count": 1.0, + "routers_loss": 0.01361197978258133, + "skip_count": 3.0, + "step": 5812, + "text_loss": 0.14051523804664612 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00046320083740027584, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 9377217.0, + "repeat_count": 0.0, + "routers_loss": 0.004597014281898737, + "skip_count": 0.0, + "step": 5814, + "text_loss": 0.2766880691051483 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 27.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00046289216737256184, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 9380336.0, + "repeat_count": 3.0, + "routers_loss": 0.006628422066569328, + "skip_count": 1.0, + "step": 5816, + "text_loss": 0.8092381954193115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0004625835115646393, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9382968.0, + "repeat_count": 0.0, + "routers_loss": 0.002737772185355425, + "skip_count": 0.0, + "step": 5818, + "text_loss": 0.22090643644332886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 27.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0004622748700947856, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 9386203.0, + "repeat_count": 1.0, + "routers_loss": 0.004552177153527737, + "skip_count": 1.0, + "step": 5820, + "text_loss": 0.42869850993156433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0004619662430812729, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9388968.0, + "repeat_count": 0.0, + "routers_loss": 0.003149240743368864, + "skip_count": 2.0, + "step": 5822, + "text_loss": 0.45137661695480347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0004616576306423677, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 9392487.0, + "repeat_count": 0.0, + "routers_loss": 0.0008133690571412444, + "skip_count": 0.0, + "step": 5824, + "text_loss": 0.638685941696167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0004613490328963307, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 9395665.0, + "repeat_count": 0.0, + "routers_loss": 0.00042717234464362264, + "skip_count": 0.0, + "step": 5826, + "text_loss": 0.8134317398071289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00046104044996141716, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9398831.0, + "repeat_count": 0.0, + "routers_loss": 0.0084775285795331, + "skip_count": 2.0, + "step": 5828, + "text_loss": 0.19263958930969238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0004607318819558768, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 9403118.0, + "repeat_count": 1.0, + "routers_loss": 0.0030239911284297705, + "skip_count": 0.0, + "step": 5830, + "text_loss": 0.45556432008743286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 27.38039330789551, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0458984375, + "learning_rate": 0.00046042332899795313, + "loss": 0.0075, + "macro_f1": 0.5492662787437439, + "num_tokens": 9406206.0, + "repeat_count": 0.0, + "routers_loss": 0.026389889419078827, + "skip_count": 2.0, + "step": 5832, + "text_loss": 0.26458361744880676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0004601147912058845, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 9409806.0, + "repeat_count": 0.0, + "routers_loss": 0.0013476534513756633, + "skip_count": 0.0, + "step": 5834, + "text_loss": 0.7443689107894897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004598062686979033, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9412737.0, + "repeat_count": 0.0, + "routers_loss": 0.004275512881577015, + "skip_count": 1.0, + "step": 5836, + "text_loss": 0.2808683514595032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00045949776159223563, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9415818.0, + "repeat_count": 0.0, + "routers_loss": 0.0027225434314459562, + "skip_count": 0.0, + "step": 5838, + "text_loss": 0.6283587217330933 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0004591892700071022, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 9419119.0, + "repeat_count": 1.0, + "routers_loss": 0.01574302278459072, + "skip_count": 2.0, + "step": 5840, + "text_loss": 0.33239027857780457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00045888079406071746, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 9422257.0, + "repeat_count": 0.0, + "routers_loss": 0.0007227854221127927, + "skip_count": 0.0, + "step": 5842, + "text_loss": 0.6658740043640137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00045857233387129, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9425071.0, + "repeat_count": 0.0, + "routers_loss": 0.0020696306601166725, + "skip_count": 2.0, + "step": 5844, + "text_loss": 0.5773820877075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0004582638895570224, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9427980.0, + "repeat_count": 0.0, + "routers_loss": 0.0019764541648328304, + "skip_count": 0.0, + "step": 5846, + "text_loss": 0.3388919532299042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.455532726739065, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.000457955461236111, + "loss": 0.0058, + "macro_f1": 0.3272727429866791, + "num_tokens": 9430733.0, + "repeat_count": 1.0, + "routers_loss": 0.04235004261136055, + "skip_count": 0.0, + "step": 5848, + "text_loss": 0.44346582889556885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004576470490267462, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 9433347.0, + "repeat_count": 0.0, + "routers_loss": 0.000801609072368592, + "skip_count": 0.0, + "step": 5850, + "text_loss": 0.5825944542884827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0004573386530471121, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9436172.0, + "repeat_count": 0.0, + "routers_loss": 0.0018224078230559826, + "skip_count": 2.0, + "step": 5852, + "text_loss": 0.8111652135848999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004570302734153866, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9439040.0, + "repeat_count": 0.0, + "routers_loss": 0.006614950485527515, + "skip_count": 2.0, + "step": 5854, + "text_loss": 0.31270334124565125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0004567219102497412, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9442138.0, + "repeat_count": 0.0, + "routers_loss": 0.0012984242057427764, + "skip_count": 0.0, + "step": 5856, + "text_loss": 0.6126856803894043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004564135636683416, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9445600.0, + "repeat_count": 0.0, + "routers_loss": 0.0008388847345486283, + "skip_count": 0.0, + "step": 5858, + "text_loss": 0.8526380658149719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046875, + "learning_rate": 0.0004561052337893467, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 9449609.0, + "repeat_count": 0.0, + "routers_loss": 0.008125773631036282, + "skip_count": 2.0, + "step": 5860, + "text_loss": 0.2843833863735199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.000455796920730909, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9452756.0, + "repeat_count": 0.0, + "routers_loss": 0.0019371749367564917, + "skip_count": 0.0, + "step": 5862, + "text_loss": 0.5293750166893005 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0004554886246111746, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9455467.0, + "repeat_count": 1.0, + "routers_loss": 0.005594742484390736, + "skip_count": 2.0, + "step": 5864, + "text_loss": 0.572329044342041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004551803455482833, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9458953.0, + "repeat_count": 0.0, + "routers_loss": 0.005960086826235056, + "skip_count": 3.0, + "step": 5866, + "text_loss": 0.19459208846092224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00045487208366036807, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 9462130.0, + "repeat_count": 0.0, + "routers_loss": 0.0034781871363520622, + "skip_count": 1.0, + "step": 5868, + "text_loss": 0.20467053353786469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00045456383906555554, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9465590.0, + "repeat_count": 0.0, + "routers_loss": 0.0012246103724464774, + "skip_count": 0.0, + "step": 5870, + "text_loss": 0.6086251735687256 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00045425561188196565, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9468092.0, + "repeat_count": 0.0, + "routers_loss": 0.002874316181987524, + "skip_count": 1.0, + "step": 5872, + "text_loss": 0.3430633544921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004539474022277115, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9471433.0, + "repeat_count": 0.0, + "routers_loss": 0.004340244457125664, + "skip_count": 2.0, + "step": 5874, + "text_loss": 0.28219133615493774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0004536392102208997, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9474363.0, + "repeat_count": 0.0, + "routers_loss": 0.0007322742021642625, + "skip_count": 0.0, + "step": 5876, + "text_loss": 0.7305856943130493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0004533310359796299, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9478469.0, + "repeat_count": 0.0, + "routers_loss": 0.0018631393322721124, + "skip_count": 0.0, + "step": 5878, + "text_loss": 0.5821442604064941 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 27.60581156442618, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0004530228796219952, + "loss": 0.0088, + "macro_f1": 0.9262410998344421, + "num_tokens": 9481200.0, + "repeat_count": 2.0, + "routers_loss": 0.026109615340828896, + "skip_count": 3.0, + "step": 5880, + "text_loss": 0.3962891101837158 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00045271474126608167, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9484200.0, + "repeat_count": 0.0, + "routers_loss": 0.0004716445691883564, + "skip_count": 0.0, + "step": 5882, + "text_loss": 0.31901776790618896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004524066210299685, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 9488939.0, + "repeat_count": 0.0, + "routers_loss": 0.0003797562967520207, + "skip_count": 0.0, + "step": 5884, + "text_loss": 0.3992912471294403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004520985190317279, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 9492010.0, + "repeat_count": 0.0, + "routers_loss": 0.005681614391505718, + "skip_count": 1.0, + "step": 5886, + "text_loss": 0.5318995118141174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0004517904353894253, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9494770.0, + "repeat_count": 0.0, + "routers_loss": 0.0021422000136226416, + "skip_count": 0.0, + "step": 5888, + "text_loss": 0.435088187456131 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.652773701203404, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0004514823702211187, + "loss": 0.0052, + "macro_f1": 0.8820862174034119, + "num_tokens": 9497327.0, + "repeat_count": 2.0, + "routers_loss": 0.01593884639441967, + "skip_count": 2.0, + "step": 5890, + "text_loss": 0.5068450570106506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.662166128558848, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00045117432364485927, + "loss": 0.0075, + "macro_f1": 0.6601307392120361, + "num_tokens": 9500488.0, + "repeat_count": 1.0, + "routers_loss": 0.0729660913348198, + "skip_count": 2.0, + "step": 5892, + "text_loss": 0.42718732357025146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.00045086629577869127, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9503593.0, + "repeat_count": 0.0, + "routers_loss": 0.007092897780239582, + "skip_count": 2.0, + "step": 5894, + "text_loss": 0.4264345169067383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.00045055828674065134, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9507188.0, + "repeat_count": 0.0, + "routers_loss": 0.004088073968887329, + "skip_count": 2.0, + "step": 5896, + "text_loss": 0.20932413637638092 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00045025029664876926, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9510126.0, + "repeat_count": 1.0, + "routers_loss": 0.0026970503386110067, + "skip_count": 0.0, + "step": 5898, + "text_loss": 0.47661110758781433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0164794921875, + "learning_rate": 0.0004499423256210673, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9513891.0, + "repeat_count": 0.0, + "routers_loss": 0.003428407246246934, + "skip_count": 0.0, + "step": 5900, + "text_loss": 0.18232668936252594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00044963437377556066, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9516718.0, + "repeat_count": 0.0, + "routers_loss": 0.0020270352251827717, + "skip_count": 0.0, + "step": 5902, + "text_loss": 0.16833586990833282 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.000449326441230257, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9520248.0, + "repeat_count": 0.0, + "routers_loss": 0.0019144838443025947, + "skip_count": 0.0, + "step": 5904, + "text_loss": 0.44434574246406555 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00044901852810315634, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9523651.0, + "repeat_count": 0.0, + "routers_loss": 0.0044578867964446545, + "skip_count": 2.0, + "step": 5906, + "text_loss": 0.1248839721083641 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0004487106345122522, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9527235.0, + "repeat_count": 0.0, + "routers_loss": 0.000827222247608006, + "skip_count": 0.0, + "step": 5908, + "text_loss": 0.6052893996238708 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.74669797475785, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004484027605755296, + "loss": 0.0065, + "macro_f1": 0.5492662787437439, + "num_tokens": 9530407.0, + "repeat_count": 2.0, + "routers_loss": 0.029739778488874435, + "skip_count": 0.0, + "step": 5910, + "text_loss": 0.7625715732574463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.00044809490641096653, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 9533229.0, + "repeat_count": 0.0, + "routers_loss": 0.0025658784434199333, + "skip_count": 0.0, + "step": 5912, + "text_loss": 0.27842655777931213 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 27.76548282946874, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.042724609375, + "learning_rate": 0.00044778707213653324, + "loss": 0.0069, + "macro_f1": 0.9265305995941162, + "num_tokens": 9537397.0, + "repeat_count": 1.0, + "routers_loss": 0.010157953947782516, + "skip_count": 3.0, + "step": 5914, + "text_loss": 0.45196083188056946 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004474792578701924, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9540564.0, + "repeat_count": 3.0, + "routers_loss": 0.011994685977697372, + "skip_count": 5.0, + "step": 5916, + "text_loss": 0.22617442905902863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000447171463729899, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9543602.0, + "repeat_count": 0.0, + "routers_loss": 0.0022214490454643965, + "skip_count": 0.0, + "step": 5918, + "text_loss": 0.5089073777198792 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004468636898336003, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 9546829.0, + "repeat_count": 1.0, + "routers_loss": 0.009353389963507652, + "skip_count": 2.0, + "step": 5920, + "text_loss": 0.7560386657714844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.00044655593629923596, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 9550259.0, + "repeat_count": 0.0, + "routers_loss": 0.005637963302433491, + "skip_count": 0.0, + "step": 5922, + "text_loss": 0.17084793746471405 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00044624820324473766, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9554376.0, + "repeat_count": 1.0, + "routers_loss": 0.008556432090699673, + "skip_count": 2.0, + "step": 5924, + "text_loss": 0.5906872749328613 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004459404907880292, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9558348.0, + "repeat_count": 1.0, + "routers_loss": 0.0016659445827826858, + "skip_count": 0.0, + "step": 5926, + "text_loss": 0.8197194933891296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.00044563279904702674, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9561139.0, + "repeat_count": 0.0, + "routers_loss": 0.01341368816792965, + "skip_count": 3.0, + "step": 5928, + "text_loss": 0.3264874815940857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.000445325128139638, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9564387.0, + "repeat_count": 0.0, + "routers_loss": 0.005023977253586054, + "skip_count": 2.0, + "step": 5930, + "text_loss": 0.9055862426757812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004450174781837635, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9567053.0, + "repeat_count": 0.0, + "routers_loss": 0.0006051476229913533, + "skip_count": 0.0, + "step": 5932, + "text_loss": 0.6908539533615112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0004447098492972951, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9570036.0, + "repeat_count": 0.0, + "routers_loss": 0.003152312943711877, + "skip_count": 0.0, + "step": 5934, + "text_loss": 0.6321061849594116 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0004444022415981167, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 9574146.0, + "repeat_count": 0.0, + "routers_loss": 0.004859412554651499, + "skip_count": 1.0, + "step": 5936, + "text_loss": 0.5905604958534241 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 27.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.00044409465520410426, + "loss": 0.0071, + "macro_f1": 1.0, + "num_tokens": 9577071.0, + "repeat_count": 1.0, + "routers_loss": 0.004376287572085857, + "skip_count": 1.0, + "step": 5938, + "text_loss": 0.6928377747535706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00044378709023312535, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9580537.0, + "repeat_count": 0.0, + "routers_loss": 0.004038849379867315, + "skip_count": 1.0, + "step": 5940, + "text_loss": 0.2686770558357239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0004434795468030396, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9583225.0, + "repeat_count": 0.0, + "routers_loss": 0.005459951236844063, + "skip_count": 2.0, + "step": 5942, + "text_loss": 0.16855180263519287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.000443172025031698, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 9586018.0, + "repeat_count": 0.0, + "routers_loss": 0.0032985717989504337, + "skip_count": 2.0, + "step": 5944, + "text_loss": 0.20335732400417328 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004428645250369437, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 9589321.0, + "repeat_count": 1.0, + "routers_loss": 0.003573323367163539, + "skip_count": 0.0, + "step": 5946, + "text_loss": 0.6318653225898743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00044255704693661117, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9592518.0, + "repeat_count": 0.0, + "routers_loss": 0.002226749900728464, + "skip_count": 0.0, + "step": 5948, + "text_loss": 0.5320658683776855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004422495908485265, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9595664.0, + "repeat_count": 0.0, + "routers_loss": 0.0007805621717125177, + "skip_count": 0.0, + "step": 5950, + "text_loss": 0.6330106258392334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0004419421568905077, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9598885.0, + "repeat_count": 0.0, + "routers_loss": 0.0017050127498805523, + "skip_count": 0.0, + "step": 5952, + "text_loss": 0.6098045706748962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00044163474518036375, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9603021.0, + "repeat_count": 0.0, + "routers_loss": 0.0025974081363528967, + "skip_count": 0.0, + "step": 5954, + "text_loss": 0.2655932903289795 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.00044132735583589567, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 9605841.0, + "repeat_count": 1.0, + "routers_loss": 0.010364850051701069, + "skip_count": 2.0, + "step": 5956, + "text_loss": 0.3028552532196045 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.015869140625, + "learning_rate": 0.00044101998897489553, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 9608810.0, + "repeat_count": 1.0, + "routers_loss": 0.0015063622267916799, + "skip_count": 0.0, + "step": 5958, + "text_loss": 0.5602094531059265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 27.981508658643968, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.02880859375, + "learning_rate": 0.00044071264471514683, + "loss": 0.0051, + "macro_f1": 0.5934640765190125, + "num_tokens": 9611995.0, + "repeat_count": 0.0, + "routers_loss": 0.011538165621459484, + "skip_count": 3.0, + "step": 5960, + "text_loss": 0.14332173764705658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00044040532317442455, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 9615434.0, + "repeat_count": 0.0, + "routers_loss": 0.004693889059126377, + "skip_count": 0.0, + "step": 5962, + "text_loss": 0.334369033575058 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00044009802447049474, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9618056.0, + "repeat_count": 1.0, + "routers_loss": 0.0045085870660841465, + "skip_count": 1.0, + "step": 5964, + "text_loss": 0.8163170218467712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00043979074872111507, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9621428.0, + "repeat_count": 0.0, + "routers_loss": 0.0018220023484900594, + "skip_count": 0.0, + "step": 5966, + "text_loss": 0.2513850927352905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0004394834960440341, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 9625433.0, + "repeat_count": 4.0, + "routers_loss": 0.007051277905702591, + "skip_count": 5.0, + "step": 5968, + "text_loss": 0.6263421177864075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.00043917626655699154, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 9629508.0, + "repeat_count": 0.0, + "routers_loss": 0.0006454752874560654, + "skip_count": 0.0, + "step": 5970, + "text_loss": 0.645618736743927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0004388690603777184, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9632504.0, + "repeat_count": 0.0, + "routers_loss": 0.004847112577408552, + "skip_count": 1.0, + "step": 5972, + "text_loss": 0.47306978702545166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00043856187762393665, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 9636685.0, + "repeat_count": 0.0, + "routers_loss": 0.0006580828921869397, + "skip_count": 0.0, + "step": 5974, + "text_loss": 0.42226532101631165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0004382547184133593, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 9639958.0, + "repeat_count": 0.0, + "routers_loss": 0.002188180573284626, + "skip_count": 0.0, + "step": 5976, + "text_loss": 0.4456600248813629 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004379475828636901, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 9643228.0, + "repeat_count": 1.0, + "routers_loss": 0.0017135308589786291, + "skip_count": 2.0, + "step": 5978, + "text_loss": 0.6295822262763977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004376404710926244, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 9646746.0, + "repeat_count": 0.0, + "routers_loss": 0.0008841048111207783, + "skip_count": 0.0, + "step": 5980, + "text_loss": 0.5102712512016296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00043733338321784784, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9649452.0, + "repeat_count": 0.0, + "routers_loss": 0.0006229099817574024, + "skip_count": 0.0, + "step": 5982, + "text_loss": 0.6944046020507812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000437026319357037, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9652700.0, + "repeat_count": 0.0, + "routers_loss": 0.005293759983032942, + "skip_count": 2.0, + "step": 5984, + "text_loss": 0.6748214960098267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00043671927962785946, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 9655825.0, + "repeat_count": 0.0, + "routers_loss": 0.0013537590857595205, + "skip_count": 0.0, + "step": 5986, + "text_loss": 1.000306248664856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004364122641479733, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9658713.0, + "repeat_count": 0.0, + "routers_loss": 0.004548195283859968, + "skip_count": 0.0, + "step": 5988, + "text_loss": 0.24580086767673492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 28.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004361052730350275, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9661535.0, + "repeat_count": 0.0, + "routers_loss": 0.011149964295327663, + "skip_count": 4.0, + "step": 5990, + "text_loss": 0.5737863779067993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00043579830640666154, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 9664406.0, + "repeat_count": 1.0, + "routers_loss": 0.003783488878980279, + "skip_count": 1.0, + "step": 5992, + "text_loss": 0.7836558222770691 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.00043549136438050573, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 9669050.0, + "repeat_count": 0.0, + "routers_loss": 0.0050374288111925125, + "skip_count": 1.0, + "step": 5994, + "text_loss": 0.13072487711906433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.00043518444707418076, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9672698.0, + "repeat_count": 0.0, + "routers_loss": 0.004047670867294073, + "skip_count": 2.0, + "step": 5996, + "text_loss": 0.4748993217945099 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00043487755460529796, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9676159.0, + "repeat_count": 0.0, + "routers_loss": 0.008628991432487965, + "skip_count": 2.0, + "step": 5998, + "text_loss": 0.1921990066766739 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00043457068709145904, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 9679528.0, + "repeat_count": 3.0, + "routers_loss": 0.01094671618193388, + "skip_count": 3.0, + "step": 6000, + "text_loss": 0.3651769459247589 + } + ], + "logging_steps": 2, + "max_steps": 10650, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.64345135714893e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6000/training_args.bin b/checkpoint-6000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a3d3ae372faf14539639f54454aa52b6ee730c4a --- /dev/null +++ b/checkpoint-6000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8 +size 5880 diff --git a/checkpoint-7000/chat_template.jinja b/checkpoint-7000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/checkpoint-7000/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-7000/config.json b/checkpoint-7000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3552bd1c531626bd125241ad5dfcd7fb677462cd --- /dev/null +++ b/checkpoint-7000/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 3072, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.55.2", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-7000/generation_config.json b/checkpoint-7000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b513e54e3195b917260c9a8a04c9f3683f19de35 --- /dev/null +++ b/checkpoint-7000/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.55.2" +} diff --git a/checkpoint-7000/model-00001-of-00002.safetensors b/checkpoint-7000/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..08a01e1ba553cdcb2222f034a209861d7b54e284 --- /dev/null +++ b/checkpoint-7000/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13cbd6d16e927a0c5bad54102514e6e18b4a47b3a6eb911e39d678d328d19f55 +size 4965799096 diff --git a/checkpoint-7000/model-00002-of-00002.safetensors b/checkpoint-7000/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d7309fd60ff923ab9f7e55bb709ef7618a2aa61e --- /dev/null +++ b/checkpoint-7000/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6331e3cc6a3d35c61cbdca807848c35fb40765e8055baf32e920432db2d570b4 +size 1481790520 diff --git a/checkpoint-7000/model.safetensors.index.json b/checkpoint-7000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..21bb567761d75ade0c0eef6495c450697dd3ff18 --- /dev/null +++ b/checkpoint-7000/model.safetensors.index.json @@ -0,0 +1,374 @@ +{ + "metadata": { + "total_parameters": 3223774292, + "total_size": 6447548584 + }, + "weight_map": { + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.routers.0.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.0.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.0.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.0.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.1.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.1.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.1.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.1.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.10.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.10.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.10.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.10.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.11.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.11.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.11.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.11.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.12.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.12.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.12.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.12.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.13.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.13.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.13.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.13.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.14.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.14.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.14.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.14.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.15.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.15.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.15.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.15.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.16.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.16.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.16.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.16.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.17.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.17.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.17.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.17.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.18.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.18.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.18.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.18.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.19.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.19.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.19.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.19.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.2.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.2.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.2.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.2.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.20.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.20.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.20.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.20.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.21.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.21.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.21.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.21.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.22.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.22.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.22.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.22.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.23.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.23.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.23.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.23.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.24.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.24.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.24.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.24.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.25.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.25.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.25.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.25.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.26.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.26.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.26.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.26.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.27.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.27.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.27.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.27.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.3.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.3.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.3.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.3.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.4.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.4.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.4.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.4.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.5.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.5.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.5.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.5.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.6.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.6.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.6.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.6.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.7.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.7.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.7.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.7.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.8.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.8.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.8.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.8.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.9.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.9.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.9.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.9.linear2.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-7000/optimizer.pt b/checkpoint-7000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bf6a78de294d3cf1525fb39507badd0601e5c81 --- /dev/null +++ b/checkpoint-7000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d15f322a2d5c12afe0a278937b7fb0f53c3b95cafa9ef5d63df75f1c1a6f3e55 +size 44191162 diff --git a/checkpoint-7000/rng_state.pth b/checkpoint-7000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e594c4f5890ddba9d82002b463b4c105950dd2f --- /dev/null +++ b/checkpoint-7000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbf1acd569be92a8afde23369728c9d2c1c586d4743ed904baa10b88431b0449 +size 14244 diff --git a/checkpoint-7000/scheduler.pt b/checkpoint-7000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..494851e86432d78b6b2b520694eac58f99d04846 --- /dev/null +++ b/checkpoint-7000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a930f8891e96246408663fa341a1ed0d7741b3da13234930b0960e23d56426c8 +size 1064 diff --git a/checkpoint-7000/special_tokens_map.json b/checkpoint-7000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..165b36bc2293dda9a2fb3c0daf6577d9eba9df7a --- /dev/null +++ b/checkpoint-7000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|finetune_right_pad_id|>" +} diff --git a/checkpoint-7000/tokenizer.json b/checkpoint-7000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-7000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-7000/tokenizer_config.json b/checkpoint-7000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c68051fe3c4d23234a59316bc52d21f6e3a4182c --- /dev/null +++ b/checkpoint-7000/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-7000/trainer_state.json b/checkpoint-7000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c4bf475dabb4d7ff2c6cb5d7a9a8cfb022defdb3 --- /dev/null +++ b/checkpoint-7000/trainer_state.json @@ -0,0 +1,66534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 32.86410331670091, + "eval_steps": 500, + "global_step": 7000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.009392427355444672, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.25, + "learning_rate": 2e-06, + "loss": 0.4974, + "macro_f1": 0.23255813121795654, + "num_tokens": 3175.0, + "repeat_count": 0.0, + "routers_loss": 0.4339469373226166, + "skip_count": 0.0, + "step": 2, + "text_loss": 0.3330848515033722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 23.0, + "epoch": 0.018784854710889344, + "f1_execute": 0.7272726893424988, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.8359375, + "learning_rate": 6e-06, + "loss": 0.4988, + "macro_f1": 0.24242423474788666, + "num_tokens": 5816.0, + "repeat_count": 0.0, + "routers_loss": 0.4511934816837311, + "skip_count": 1.0, + "step": 4, + "text_loss": 0.4571273922920227 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.02817728206633402, + "f1_execute": 0.6666666865348816, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.234375, + "learning_rate": 1e-05, + "loss": 0.5113, + "macro_f1": 0.222222238779068, + "num_tokens": 9739.0, + "repeat_count": 0.0, + "routers_loss": 0.49306994676589966, + "skip_count": 0.0, + "step": 6, + "text_loss": 0.41060560941696167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.03756970942177869, + "f1_execute": 0.5641025900840759, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7265625, + "learning_rate": 1.4e-05, + "loss": 0.4766, + "macro_f1": 0.18803420662879944, + "num_tokens": 12869.0, + "repeat_count": 1.0, + "routers_loss": 0.48872503638267517, + "skip_count": 2.0, + "step": 8, + "text_loss": 0.36678561568260193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.046962136777223364, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.78125, + "learning_rate": 1.8e-05, + "loss": 0.4806, + "macro_f1": 0.23255813121795654, + "num_tokens": 15845.0, + "repeat_count": 0.0, + "routers_loss": 0.45077216625213623, + "skip_count": 0.0, + "step": 10, + "text_loss": 0.5597779154777527 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 0.05635456413266804, + "f1_execute": 0.7179487347602844, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.20000000298023224, + "grad_norm": 1.5390625, + "learning_rate": 2.2e-05, + "loss": 0.4557, + "macro_f1": 0.40122103691101074, + "num_tokens": 19353.0, + "repeat_count": 2.0, + "routers_loss": 0.4130440056324005, + "skip_count": 3.0, + "step": 12, + "text_loss": 0.2056603729724884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.06574699148811271, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.4375, + "learning_rate": 2.6e-05, + "loss": 0.5129, + "macro_f1": 0.23255813121795654, + "num_tokens": 22675.0, + "repeat_count": 0.0, + "routers_loss": 0.4582902193069458, + "skip_count": 0.0, + "step": 14, + "text_loss": 0.32989829778671265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 0.07513941884355738, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.0, + "f1_skip": 0.2222222238779068, + "grad_norm": 1.7421875, + "learning_rate": 3e-05, + "loss": 0.4729, + "macro_f1": 0.3017163574695587, + "num_tokens": 26022.0, + "repeat_count": 0.0, + "routers_loss": 0.42910993099212646, + "skip_count": 1.0, + "step": 16, + "text_loss": 0.1353905349969864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.08453184619900206, + "f1_execute": 0.7555555105209351, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.4765625, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.4274, + "macro_f1": 0.2518518567085266, + "num_tokens": 29251.0, + "repeat_count": 0.0, + "routers_loss": 0.3990713059902191, + "skip_count": 0.0, + "step": 18, + "text_loss": 0.3806765377521515 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.09392427355444673, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.0, + "grad_norm": 1.3125, + "learning_rate": 3.8e-05, + "loss": 0.4261, + "macro_f1": 0.3228803873062134, + "num_tokens": 32545.0, + "repeat_count": 1.0, + "routers_loss": 0.40146592259407043, + "skip_count": 0.0, + "step": 20, + "text_loss": 0.25648367404937744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.1033167009098914, + "f1_execute": 0.7272727489471436, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.625, + "learning_rate": 4.2000000000000004e-05, + "loss": 0.404, + "macro_f1": 0.24242424964904785, + "num_tokens": 36560.0, + "repeat_count": 0.0, + "routers_loss": 0.372715026140213, + "skip_count": 0.0, + "step": 22, + "text_loss": 0.2799522578716278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.11270912826533608, + "f1_execute": 0.7555555105209351, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.6328125, + "learning_rate": 4.6e-05, + "loss": 0.4218, + "macro_f1": 0.2518518567085266, + "num_tokens": 39597.0, + "repeat_count": 0.0, + "routers_loss": 0.4504941403865814, + "skip_count": 0.0, + "step": 24, + "text_loss": 0.6635695695877075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.12210155562078075, + "f1_execute": 0.8085106015205383, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7109375, + "learning_rate": 5e-05, + "loss": 0.3886, + "macro_f1": 0.26950353384017944, + "num_tokens": 43080.0, + "repeat_count": 0.0, + "routers_loss": 0.3498791456222534, + "skip_count": 0.0, + "step": 26, + "text_loss": 0.7035041451454163 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.13149398297622542, + "f1_execute": 0.8085106015205383, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.34375, + "learning_rate": 5.4e-05, + "loss": 0.3724, + "macro_f1": 0.26950353384017944, + "num_tokens": 46406.0, + "repeat_count": 0.0, + "routers_loss": 0.31265875697135925, + "skip_count": 0.0, + "step": 28, + "text_loss": 0.6388277411460876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.1408864103316701, + "f1_execute": 0.8571428060531616, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.2578125, + "learning_rate": 5.800000000000001e-05, + "loss": 0.341, + "macro_f1": 0.2857142686843872, + "num_tokens": 49966.0, + "repeat_count": 0.0, + "routers_loss": 0.3200918138027191, + "skip_count": 2.0, + "step": 30, + "text_loss": 0.17372547090053558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.15027883768711475, + "f1_execute": 0.8571428060531616, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.4140625, + "learning_rate": 6.2e-05, + "loss": 0.3207, + "macro_f1": 0.2857142686843872, + "num_tokens": 53378.0, + "repeat_count": 1.0, + "routers_loss": 0.32304447889328003, + "skip_count": 1.0, + "step": 32, + "text_loss": 0.18196581304073334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.15967126504255943, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.46875, + "learning_rate": 6.6e-05, + "loss": 0.3304, + "macro_f1": 0.3006536364555359, + "num_tokens": 56933.0, + "repeat_count": 0.0, + "routers_loss": 0.24814388155937195, + "skip_count": 0.0, + "step": 34, + "text_loss": 0.28823015093803406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.16906369239800412, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.1171875, + "learning_rate": 7.000000000000001e-05, + "loss": 0.2778, + "macro_f1": 0.3006536066532135, + "num_tokens": 60744.0, + "repeat_count": 1.0, + "routers_loss": 0.22411039471626282, + "skip_count": 0.0, + "step": 36, + "text_loss": 0.5260357856750488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.17845611975344877, + "f1_execute": 0.8571428656578064, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.484375, + "learning_rate": 7.4e-05, + "loss": 0.2738, + "macro_f1": 0.2857142984867096, + "num_tokens": 64900.0, + "repeat_count": 0.0, + "routers_loss": 0.44355395436286926, + "skip_count": 0.0, + "step": 38, + "text_loss": 0.5382097363471985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.18784854710889345, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.3828125, + "learning_rate": 7.8e-05, + "loss": 0.2137, + "macro_f1": 0.3076923191547394, + "num_tokens": 68000.0, + "repeat_count": 0.0, + "routers_loss": 0.202330082654953, + "skip_count": 0.0, + "step": 40, + "text_loss": 0.5946118831634521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.19724097446433814, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.78125, + "learning_rate": 8.2e-05, + "loss": 0.21, + "macro_f1": 0.3144654333591461, + "num_tokens": 70529.0, + "repeat_count": 0.0, + "routers_loss": 0.18023855984210968, + "skip_count": 0.0, + "step": 42, + "text_loss": 0.5550904273986816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2066334018197828, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.609375, + "learning_rate": 8.599999999999999e-05, + "loss": 0.1918, + "macro_f1": 0.32098764181137085, + "num_tokens": 73427.0, + "repeat_count": 2.0, + "routers_loss": 0.2101590931415558, + "skip_count": 0.0, + "step": 44, + "text_loss": 0.4636923372745514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.21602582917522747, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.53125, + "learning_rate": 8.999999999999999e-05, + "loss": 0.1881, + "macro_f1": 0.3333333432674408, + "num_tokens": 76472.0, + "repeat_count": 0.0, + "routers_loss": 0.11800424009561539, + "skip_count": 0.0, + "step": 46, + "text_loss": 0.4187001883983612 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.22541825653067216, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.953125, + "learning_rate": 9.400000000000001e-05, + "loss": 0.1446, + "macro_f1": 0.3272727429866791, + "num_tokens": 79124.0, + "repeat_count": 1.0, + "routers_loss": 0.11632519960403442, + "skip_count": 0.0, + "step": 48, + "text_loss": 0.2253919243812561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.2348106838861168, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.58984375, + "learning_rate": 9.800000000000001e-05, + "loss": 0.1543, + "macro_f1": 0.32098767161369324, + "num_tokens": 81980.0, + "repeat_count": 1.0, + "routers_loss": 0.09669367223978043, + "skip_count": 0.0, + "step": 50, + "text_loss": 0.6053179502487183 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.2442031112415615, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.8515625, + "learning_rate": 0.000102, + "loss": 0.1393, + "macro_f1": 0.32098764181137085, + "num_tokens": 85236.0, + "repeat_count": 0.0, + "routers_loss": 0.12471720576286316, + "skip_count": 0.0, + "step": 52, + "text_loss": 0.6027331948280334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2535955385970062, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.421875, + "learning_rate": 0.000106, + "loss": 0.1473, + "macro_f1": 0.32098764181137085, + "num_tokens": 88238.0, + "repeat_count": 0.0, + "routers_loss": 0.1376056969165802, + "skip_count": 2.0, + "step": 54, + "text_loss": 0.2861751616001129 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.26298796595245083, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.35546875, + "learning_rate": 0.00011, + "loss": 0.1082, + "macro_f1": 0.3333333432674408, + "num_tokens": 91056.0, + "repeat_count": 0.0, + "routers_loss": 0.07449393719434738, + "skip_count": 0.0, + "step": 56, + "text_loss": 0.48106974363327026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.2723803933078955, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000114, + "loss": 0.1123, + "macro_f1": 0.32098764181137085, + "num_tokens": 94987.0, + "repeat_count": 0.0, + "routers_loss": 0.07064720243215561, + "skip_count": 0.0, + "step": 58, + "text_loss": 0.3554874658584595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2817728206633402, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.5390625, + "learning_rate": 0.000118, + "loss": 0.1234, + "macro_f1": 0.32098764181137085, + "num_tokens": 97909.0, + "repeat_count": 0.0, + "routers_loss": 0.16835889220237732, + "skip_count": 2.0, + "step": 60, + "text_loss": 0.5475804805755615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.29116524801878485, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000122, + "loss": 0.1224, + "macro_f1": 0.3333333432674408, + "num_tokens": 101043.0, + "repeat_count": 0.0, + "routers_loss": 0.06127442046999931, + "skip_count": 0.0, + "step": 62, + "text_loss": 0.5966938734054565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3005576753742295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000126, + "loss": 0.0931, + "macro_f1": 0.3333333432674408, + "num_tokens": 104103.0, + "repeat_count": 0.0, + "routers_loss": 0.047825805842876434, + "skip_count": 0.0, + "step": 64, + "text_loss": 0.5480486750602722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3099501027296742, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.00013000000000000002, + "loss": 0.1088, + "macro_f1": 0.3006536364555359, + "num_tokens": 107009.0, + "repeat_count": 1.0, + "routers_loss": 0.275174081325531, + "skip_count": 4.0, + "step": 66, + "text_loss": 0.41714492440223694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.31934253008511887, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.000134, + "loss": 0.1123, + "macro_f1": 0.3333333432674408, + "num_tokens": 110486.0, + "repeat_count": 0.0, + "routers_loss": 0.029025178402662277, + "skip_count": 0.0, + "step": 68, + "text_loss": 0.6775627732276917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3287349574405635, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.314453125, + "learning_rate": 0.00013800000000000002, + "loss": 0.1049, + "macro_f1": 0.3272727429866791, + "num_tokens": 113878.0, + "repeat_count": 0.0, + "routers_loss": 0.10141710191965103, + "skip_count": 1.0, + "step": 70, + "text_loss": 0.6678873896598816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.33812738479600823, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.248046875, + "learning_rate": 0.00014199999999999998, + "loss": 0.1119, + "macro_f1": 0.3272727429866791, + "num_tokens": 116989.0, + "repeat_count": 0.0, + "routers_loss": 0.08002066612243652, + "skip_count": 1.0, + "step": 72, + "text_loss": 0.405692994594574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3475198121514529, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1787109375, + "learning_rate": 0.000146, + "loss": 0.0944, + "macro_f1": 0.3144654333591461, + "num_tokens": 119883.0, + "repeat_count": 0.0, + "routers_loss": 0.1867009848356247, + "skip_count": 3.0, + "step": 74, + "text_loss": 0.44616150856018066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.35691223950689754, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.333984375, + "learning_rate": 0.00015, + "loss": 0.1003, + "macro_f1": 0.32098764181137085, + "num_tokens": 123325.0, + "repeat_count": 0.0, + "routers_loss": 0.07042168825864792, + "skip_count": 2.0, + "step": 76, + "text_loss": 0.11340200901031494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.36630466686234225, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26171875, + "learning_rate": 0.000154, + "loss": 0.1066, + "macro_f1": 0.32098764181137085, + "num_tokens": 126131.0, + "repeat_count": 0.0, + "routers_loss": 0.11535373330116272, + "skip_count": 2.0, + "step": 78, + "text_loss": 0.3269135355949402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3756970942177869, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.000158, + "loss": 0.0891, + "macro_f1": 0.3272727429866791, + "num_tokens": 130349.0, + "repeat_count": 0.0, + "routers_loss": 0.09497501701116562, + "skip_count": 1.0, + "step": 80, + "text_loss": 0.15273472666740417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.38508952157323156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000162, + "loss": 0.0929, + "macro_f1": 0.3333333432674408, + "num_tokens": 133607.0, + "repeat_count": 0.0, + "routers_loss": 0.030639523640275, + "skip_count": 0.0, + "step": 82, + "text_loss": 0.282884806394577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3944819489286763, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.00016600000000000002, + "loss": 0.1254, + "macro_f1": 0.3272727429866791, + "num_tokens": 136694.0, + "repeat_count": 0.0, + "routers_loss": 0.07906441390514374, + "skip_count": 1.0, + "step": 84, + "text_loss": 0.459094375371933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.40387437628412093, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.00017, + "loss": 0.1071, + "macro_f1": 0.3144654333591461, + "num_tokens": 139966.0, + "repeat_count": 1.0, + "routers_loss": 0.1124570444226265, + "skip_count": 2.0, + "step": 86, + "text_loss": 0.29985448718070984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4132668036395656, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.000174, + "loss": 0.1031, + "macro_f1": 0.32098764181137085, + "num_tokens": 142788.0, + "repeat_count": 2.0, + "routers_loss": 0.1966402679681778, + "skip_count": 0.0, + "step": 88, + "text_loss": 0.6435291767120361 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4226592309950103, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.349609375, + "learning_rate": 0.000178, + "loss": 0.0963, + "macro_f1": 0.3333333432674408, + "num_tokens": 146192.0, + "repeat_count": 0.0, + "routers_loss": 0.0325632207095623, + "skip_count": 0.0, + "step": 90, + "text_loss": 0.35170626640319824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.43205165835045495, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2265625, + "learning_rate": 0.000182, + "loss": 0.1073, + "macro_f1": 0.32098764181137085, + "num_tokens": 149792.0, + "repeat_count": 1.0, + "routers_loss": 0.15115146338939667, + "skip_count": 1.0, + "step": 92, + "text_loss": 0.83159339427948 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4414440857058996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.000186, + "loss": 0.1073, + "macro_f1": 0.3333333432674408, + "num_tokens": 152766.0, + "repeat_count": 0.0, + "routers_loss": 0.043313540518283844, + "skip_count": 0.0, + "step": 94, + "text_loss": 0.49707934260368347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4508365130613443, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019, + "loss": 0.0947, + "macro_f1": 0.3333333432674408, + "num_tokens": 156112.0, + "repeat_count": 0.0, + "routers_loss": 0.032021280378103256, + "skip_count": 0.0, + "step": 96, + "text_loss": 0.27608928084373474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.46022894041678897, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2099609375, + "learning_rate": 0.000194, + "loss": 0.0846, + "macro_f1": 0.3076923191547394, + "num_tokens": 159454.0, + "repeat_count": 2.0, + "routers_loss": 0.24473154544830322, + "skip_count": 2.0, + "step": 98, + "text_loss": 0.6026689410209656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4696213677722336, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.00019800000000000002, + "loss": 0.1028, + "macro_f1": 0.32098764181137085, + "num_tokens": 163661.0, + "repeat_count": 0.0, + "routers_loss": 0.11468276381492615, + "skip_count": 2.0, + "step": 100, + "text_loss": 0.46733155846595764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.47901379512767833, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000202, + "loss": 0.1089, + "macro_f1": 0.3333333432674408, + "num_tokens": 167134.0, + "repeat_count": 0.0, + "routers_loss": 0.021144939586520195, + "skip_count": 0.0, + "step": 102, + "text_loss": 0.6362994909286499 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.488406222483123, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000206, + "loss": 0.0621, + "macro_f1": 0.3272727429866791, + "num_tokens": 170433.0, + "repeat_count": 0.0, + "routers_loss": 0.06594710797071457, + "skip_count": 1.0, + "step": 104, + "text_loss": 0.4515477120876312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.49779864983856764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.00021, + "loss": 0.0929, + "macro_f1": 0.3333333432674408, + "num_tokens": 173387.0, + "repeat_count": 0.0, + "routers_loss": 0.032923027873039246, + "skip_count": 0.0, + "step": 106, + "text_loss": 0.6638453006744385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5071910771940124, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.240234375, + "learning_rate": 0.000214, + "loss": 0.0883, + "macro_f1": 0.3272727429866791, + "num_tokens": 176170.0, + "repeat_count": 1.0, + "routers_loss": 0.08034781366586685, + "skip_count": 0.0, + "step": 108, + "text_loss": 1.186936855316162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.516583504549457, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000218, + "loss": 0.0794, + "macro_f1": 0.3272727429866791, + "num_tokens": 179877.0, + "repeat_count": 0.0, + "routers_loss": 0.07814185321331024, + "skip_count": 1.0, + "step": 110, + "text_loss": 0.5488709211349487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5259759319049017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000222, + "loss": 0.0946, + "macro_f1": 0.3333333432674408, + "num_tokens": 182726.0, + "repeat_count": 0.0, + "routers_loss": 0.01884695515036583, + "skip_count": 0.0, + "step": 112, + "text_loss": 0.5195863842964172 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5353683592603463, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19921875, + "learning_rate": 0.00022600000000000002, + "loss": 0.0974, + "macro_f1": 0.32098764181137085, + "num_tokens": 185624.0, + "repeat_count": 0.0, + "routers_loss": 0.09657823294401169, + "skip_count": 2.0, + "step": 114, + "text_loss": 0.43858134746551514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3046875, + "learning_rate": 0.00023, + "loss": 0.0753, + "macro_f1": 0.3333333432674408, + "num_tokens": 188155.0, + "repeat_count": 0.0, + "routers_loss": 0.01463601179420948, + "skip_count": 0.0, + "step": 116, + "text_loss": 0.392981618642807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5541532139712357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.439453125, + "learning_rate": 0.00023400000000000002, + "loss": 0.0843, + "macro_f1": 0.3333333432674408, + "num_tokens": 190970.0, + "repeat_count": 0.0, + "routers_loss": 0.03859659656882286, + "skip_count": 0.0, + "step": 118, + "text_loss": 0.309179425239563 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5635456413266804, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2255859375, + "learning_rate": 0.00023799999999999998, + "loss": 0.053, + "macro_f1": 0.3333333432674408, + "num_tokens": 193988.0, + "repeat_count": 0.0, + "routers_loss": 0.019092386588454247, + "skip_count": 0.0, + "step": 120, + "text_loss": 0.48543134331703186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.572938068682125, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.35546875, + "learning_rate": 0.000242, + "loss": 0.1203, + "macro_f1": 0.3272727429866791, + "num_tokens": 196475.0, + "repeat_count": 0.0, + "routers_loss": 0.0619138665497303, + "skip_count": 1.0, + "step": 122, + "text_loss": 0.4615364074707031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5823304960375697, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1875, + "learning_rate": 0.000246, + "loss": 0.1002, + "macro_f1": 0.3272727429866791, + "num_tokens": 200045.0, + "repeat_count": 1.0, + "routers_loss": 0.09752107411623001, + "skip_count": 0.0, + "step": 124, + "text_loss": 0.15802054107189178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5917229233930144, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.00025, + "loss": 0.0773, + "macro_f1": 0.3333333432674408, + "num_tokens": 203214.0, + "repeat_count": 0.0, + "routers_loss": 0.02896115928888321, + "skip_count": 0.0, + "step": 126, + "text_loss": 0.4543360471725464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.601115350748459, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.4296875, + "learning_rate": 0.000254, + "loss": 0.0973, + "macro_f1": 0.3333333432674408, + "num_tokens": 206168.0, + "repeat_count": 0.0, + "routers_loss": 0.011423567309975624, + "skip_count": 0.0, + "step": 128, + "text_loss": 0.4730179011821747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6105077781039038, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.365234375, + "learning_rate": 0.00025800000000000004, + "loss": 0.099, + "macro_f1": 0.3333333432674408, + "num_tokens": 209907.0, + "repeat_count": 0.0, + "routers_loss": 0.01957600563764572, + "skip_count": 0.0, + "step": 130, + "text_loss": 0.45122358202934265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6199002054593484, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.000262, + "loss": 0.0868, + "macro_f1": 0.3272727429866791, + "num_tokens": 213521.0, + "repeat_count": 0.0, + "routers_loss": 0.04882373288273811, + "skip_count": 1.0, + "step": 132, + "text_loss": 0.4341491758823395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6292926328147931, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.000266, + "loss": 0.0834, + "macro_f1": 0.3333333432674408, + "num_tokens": 216484.0, + "repeat_count": 0.0, + "routers_loss": 0.016083380207419395, + "skip_count": 0.0, + "step": 134, + "text_loss": 0.46990111470222473 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6386850601702377, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.00027, + "loss": 0.0863, + "macro_f1": 0.3333333432674408, + "num_tokens": 219398.0, + "repeat_count": 0.0, + "routers_loss": 0.01733536459505558, + "skip_count": 0.0, + "step": 136, + "text_loss": 0.4455361068248749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6480774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.00027400000000000005, + "loss": 0.0997, + "macro_f1": 0.3333333432674408, + "num_tokens": 222430.0, + "repeat_count": 0.0, + "routers_loss": 0.01332803163677454, + "skip_count": 0.0, + "step": 138, + "text_loss": 0.47699397802352905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.657469914881127, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.333984375, + "learning_rate": 0.00027800000000000004, + "loss": 0.0922, + "macro_f1": 0.3144654333591461, + "num_tokens": 225458.0, + "repeat_count": 1.0, + "routers_loss": 0.14924728870391846, + "skip_count": 2.0, + "step": 140, + "text_loss": 0.5858222842216492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6668623422365718, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.00028199999999999997, + "loss": 0.0798, + "macro_f1": 0.3144654333591461, + "num_tokens": 229365.0, + "repeat_count": 1.0, + "routers_loss": 0.1860177218914032, + "skip_count": 2.0, + "step": 142, + "text_loss": 0.5003137588500977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6762547695920165, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.00028599999999999996, + "loss": 0.054, + "macro_f1": 0.32098764181137085, + "num_tokens": 231787.0, + "repeat_count": 1.0, + "routers_loss": 0.16498211026191711, + "skip_count": 1.0, + "step": 144, + "text_loss": 0.5026470422744751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6856471969474611, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.00029, + "loss": 0.0936, + "macro_f1": 0.32098764181137085, + "num_tokens": 235014.0, + "repeat_count": 1.0, + "routers_loss": 0.11801310628652573, + "skip_count": 1.0, + "step": 146, + "text_loss": 0.611888587474823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6950396243029058, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000294, + "loss": 0.0878, + "macro_f1": 0.3333333432674408, + "num_tokens": 238210.0, + "repeat_count": 0.0, + "routers_loss": 0.02422776259481907, + "skip_count": 0.0, + "step": 148, + "text_loss": 0.2876914143562317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7044320516583504, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.000298, + "loss": 0.0858, + "macro_f1": 0.32098764181137085, + "num_tokens": 241582.0, + "repeat_count": 0.0, + "routers_loss": 0.07282499223947525, + "skip_count": 2.0, + "step": 150, + "text_loss": 0.3919292390346527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7138244790137951, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.37890625, + "learning_rate": 0.000302, + "loss": 0.0797, + "macro_f1": 0.32098764181137085, + "num_tokens": 244621.0, + "repeat_count": 1.0, + "routers_loss": 0.20659038424491882, + "skip_count": 1.0, + "step": 152, + "text_loss": 0.4294498860836029 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7232169063692399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1787109375, + "learning_rate": 0.000306, + "loss": 0.072, + "macro_f1": 0.3333333432674408, + "num_tokens": 247833.0, + "repeat_count": 0.0, + "routers_loss": 0.02428400330245495, + "skip_count": 0.0, + "step": 154, + "text_loss": 0.5930765867233276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7326093337246845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.00031, + "loss": 0.0772, + "macro_f1": 0.3333333432674408, + "num_tokens": 251349.0, + "repeat_count": 0.0, + "routers_loss": 0.0167869683355093, + "skip_count": 0.0, + "step": 156, + "text_loss": 0.41063904762268066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7420017610801292, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.000314, + "loss": 0.0821, + "macro_f1": 0.3333333432674408, + "num_tokens": 254886.0, + "repeat_count": 0.0, + "routers_loss": 0.02531604655086994, + "skip_count": 0.0, + "step": 158, + "text_loss": 0.6739020347595215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7513941884355738, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.201171875, + "learning_rate": 0.00031800000000000003, + "loss": 0.09, + "macro_f1": 0.3333333432674408, + "num_tokens": 258260.0, + "repeat_count": 0.0, + "routers_loss": 0.017772775143384933, + "skip_count": 0.0, + "step": 160, + "text_loss": 0.46873849630355835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7607866157910185, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.224609375, + "learning_rate": 0.000322, + "loss": 0.0893, + "macro_f1": 0.3272727429866791, + "num_tokens": 261846.0, + "repeat_count": 0.0, + "routers_loss": 0.034902360290288925, + "skip_count": 1.0, + "step": 162, + "text_loss": 0.3727971017360687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7701790431464631, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000326, + "loss": 0.076, + "macro_f1": 0.3333333432674408, + "num_tokens": 264348.0, + "repeat_count": 0.0, + "routers_loss": 0.013553355820477009, + "skip_count": 0.0, + "step": 164, + "text_loss": 0.5798237323760986 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7795714705019078, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.408203125, + "learning_rate": 0.00033, + "loss": 0.0926, + "macro_f1": 0.32098764181137085, + "num_tokens": 267479.0, + "repeat_count": 1.0, + "routers_loss": 0.13571743667125702, + "skip_count": 1.0, + "step": 166, + "text_loss": 0.8084776997566223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7889638978573525, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2431640625, + "learning_rate": 0.00033400000000000004, + "loss": 0.0817, + "macro_f1": 0.32098764181137085, + "num_tokens": 270268.0, + "repeat_count": 2.0, + "routers_loss": 0.19884146749973297, + "skip_count": 0.0, + "step": 168, + "text_loss": 0.7366134524345398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7983563252127972, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.00033800000000000003, + "loss": 0.1022, + "macro_f1": 0.32098764181137085, + "num_tokens": 273518.0, + "repeat_count": 1.0, + "routers_loss": 0.15469175577163696, + "skip_count": 1.0, + "step": 170, + "text_loss": 0.27204006910324097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8077487525682419, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000342, + "loss": 0.0865, + "macro_f1": 0.32098764181137085, + "num_tokens": 277210.0, + "repeat_count": 0.0, + "routers_loss": 0.08603330701589584, + "skip_count": 2.0, + "step": 172, + "text_loss": 0.7137667536735535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8171411799236865, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000346, + "loss": 0.0902, + "macro_f1": 0.3076923191547394, + "num_tokens": 280389.0, + "repeat_count": 0.0, + "routers_loss": 0.17851492762565613, + "skip_count": 4.0, + "step": 174, + "text_loss": 0.5148105621337891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8265336072791312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.00035, + "loss": 0.0853, + "macro_f1": 0.3333333432674408, + "num_tokens": 283501.0, + "repeat_count": 0.0, + "routers_loss": 0.021331604570150375, + "skip_count": 0.0, + "step": 176, + "text_loss": 0.301013320684433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8359260346345758, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000354, + "loss": 0.0911, + "macro_f1": 0.32098764181137085, + "num_tokens": 287154.0, + "repeat_count": 0.0, + "routers_loss": 0.057273946702480316, + "skip_count": 2.0, + "step": 178, + "text_loss": 0.4740981459617615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8453184619900206, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.240234375, + "learning_rate": 0.000358, + "loss": 0.0904, + "macro_f1": 0.3272727429866791, + "num_tokens": 289929.0, + "repeat_count": 0.0, + "routers_loss": 0.04116598889231682, + "skip_count": 1.0, + "step": 180, + "text_loss": 0.4838573932647705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8547108893454652, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.14453125, + "learning_rate": 0.000362, + "loss": 0.0991, + "macro_f1": 0.3333333432674408, + "num_tokens": 294293.0, + "repeat_count": 0.0, + "routers_loss": 0.027111956849694252, + "skip_count": 0.0, + "step": 182, + "text_loss": 0.7495553493499756 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8641033167009099, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.158203125, + "learning_rate": 0.000366, + "loss": 0.1038, + "macro_f1": 0.3333333432674408, + "num_tokens": 297730.0, + "repeat_count": 0.0, + "routers_loss": 0.019166452810168266, + "skip_count": 0.0, + "step": 184, + "text_loss": 0.534831166267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 0.8734957440563546, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2236328125, + "learning_rate": 0.00037, + "loss": 0.0784, + "macro_f1": 0.5427350401878357, + "num_tokens": 300593.0, + "repeat_count": 1.0, + "routers_loss": 0.2349659502506256, + "skip_count": 2.0, + "step": 186, + "text_loss": 0.3549048602581024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8828881714117992, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2041015625, + "learning_rate": 0.000374, + "loss": 0.0827, + "macro_f1": 0.3076923191547394, + "num_tokens": 303456.0, + "repeat_count": 2.0, + "routers_loss": 0.22502389550209045, + "skip_count": 2.0, + "step": 188, + "text_loss": 0.8837642073631287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8922805987672439, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000378, + "loss": 0.1085, + "macro_f1": 0.3272727429866791, + "num_tokens": 306241.0, + "repeat_count": 1.0, + "routers_loss": 0.12291611731052399, + "skip_count": 0.0, + "step": 190, + "text_loss": 0.73353511095047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9016730261226886, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000382, + "loss": 0.0969, + "macro_f1": 0.3272727429866791, + "num_tokens": 310606.0, + "repeat_count": 0.0, + "routers_loss": 0.055988848209381104, + "skip_count": 1.0, + "step": 192, + "text_loss": 0.6261917352676392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9110654534781333, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.34375, + "learning_rate": 0.000386, + "loss": 0.1055, + "macro_f1": 0.3144654333591461, + "num_tokens": 313564.0, + "repeat_count": 0.0, + "routers_loss": 0.12363404780626297, + "skip_count": 3.0, + "step": 194, + "text_loss": 0.2790874242782593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9204578808335779, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.00039000000000000005, + "loss": 0.0964, + "macro_f1": 0.3076923191547394, + "num_tokens": 316958.0, + "repeat_count": 2.0, + "routers_loss": 0.2718356251716614, + "skip_count": 2.0, + "step": 196, + "text_loss": 0.14428086578845978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9298503081890226, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2021484375, + "learning_rate": 0.00039400000000000004, + "loss": 0.0917, + "macro_f1": 0.32098764181137085, + "num_tokens": 320103.0, + "repeat_count": 0.0, + "routers_loss": 0.07188102602958679, + "skip_count": 2.0, + "step": 198, + "text_loss": 0.27155816555023193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9392427355444672, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.201171875, + "learning_rate": 0.000398, + "loss": 0.0809, + "macro_f1": 0.32098764181137085, + "num_tokens": 323566.0, + "repeat_count": 1.0, + "routers_loss": 0.18038256466388702, + "skip_count": 1.0, + "step": 200, + "text_loss": 0.8453494310379028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9486351628999119, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.000402, + "loss": 0.0801, + "macro_f1": 0.3333333432674408, + "num_tokens": 326385.0, + "repeat_count": 0.0, + "routers_loss": 0.014639763161540031, + "skip_count": 0.0, + "step": 202, + "text_loss": 0.5733131766319275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9580275902553567, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21875, + "learning_rate": 0.00040600000000000006, + "loss": 0.104, + "macro_f1": 0.3333333432674408, + "num_tokens": 329266.0, + "repeat_count": 0.0, + "routers_loss": 0.015269627794623375, + "skip_count": 0.0, + "step": 204, + "text_loss": 0.7355639934539795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9674200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.00041, + "loss": 0.0833, + "macro_f1": 0.3333333432674408, + "num_tokens": 332984.0, + "repeat_count": 0.0, + "routers_loss": 0.018046971410512924, + "skip_count": 0.0, + "step": 206, + "text_loss": 0.587641179561615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.000414, + "loss": 0.0588, + "macro_f1": 0.3272727429866791, + "num_tokens": 335739.0, + "repeat_count": 1.0, + "routers_loss": 0.12791286408901215, + "skip_count": 0.0, + "step": 208, + "text_loss": 0.6538406610488892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9862048723216906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.24609375, + "learning_rate": 0.00041799999999999997, + "loss": 0.0732, + "macro_f1": 0.3272727429866791, + "num_tokens": 338966.0, + "repeat_count": 0.0, + "routers_loss": 0.050490595400333405, + "skip_count": 1.0, + "step": 210, + "text_loss": 0.4188295602798462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9955972996771353, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000422, + "loss": 0.0588, + "macro_f1": 0.3144654333591461, + "num_tokens": 342063.0, + "repeat_count": 0.0, + "routers_loss": 0.11652113497257233, + "skip_count": 3.0, + "step": 212, + "text_loss": 0.21822240948677063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0046962136777224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.000426, + "loss": 0.0621, + "macro_f1": 0.3333333432674408, + "num_tokens": 344887.0, + "repeat_count": 0.0, + "routers_loss": 0.023898238316178322, + "skip_count": 0.0, + "step": 214, + "text_loss": 0.24692800641059875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.014088641033167, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3671875, + "learning_rate": 0.00043, + "loss": 0.1005, + "macro_f1": 0.3272727429866791, + "num_tokens": 348700.0, + "repeat_count": 1.0, + "routers_loss": 0.06414655596017838, + "skip_count": 0.0, + "step": 216, + "text_loss": 0.4744548797607422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0234810683886117, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.00043400000000000003, + "loss": 0.0753, + "macro_f1": 0.32098764181137085, + "num_tokens": 351507.0, + "repeat_count": 1.0, + "routers_loss": 0.11702914535999298, + "skip_count": 1.0, + "step": 218, + "text_loss": 0.5614864826202393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0328734957440564, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000438, + "loss": 0.0792, + "macro_f1": 0.3333333432674408, + "num_tokens": 354484.0, + "repeat_count": 0.0, + "routers_loss": 0.014991643838584423, + "skip_count": 0.0, + "step": 220, + "text_loss": 0.47209832072257996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.042265923099501, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.251953125, + "learning_rate": 0.000442, + "loss": 0.106, + "macro_f1": 0.3272727429866791, + "num_tokens": 357954.0, + "repeat_count": 0.0, + "routers_loss": 0.04747112840414047, + "skip_count": 1.0, + "step": 222, + "text_loss": 0.2968728244304657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0516583504549457, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.40234375, + "learning_rate": 0.000446, + "loss": 0.0853, + "macro_f1": 0.32098764181137085, + "num_tokens": 360547.0, + "repeat_count": 0.0, + "routers_loss": 0.06754162162542343, + "skip_count": 2.0, + "step": 224, + "text_loss": 0.2364148646593094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0610507778103904, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.00045000000000000004, + "loss": 0.1016, + "macro_f1": 0.3272727429866791, + "num_tokens": 364529.0, + "repeat_count": 0.0, + "routers_loss": 0.07830183953046799, + "skip_count": 1.0, + "step": 226, + "text_loss": 0.4787476360797882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.00045400000000000003, + "loss": 0.0792, + "macro_f1": 0.3333333432674408, + "num_tokens": 367683.0, + "repeat_count": 0.0, + "routers_loss": 0.015735948458313942, + "skip_count": 0.0, + "step": 228, + "text_loss": 0.37148505449295044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.000458, + "loss": 0.0995, + "macro_f1": 0.3333333432674408, + "num_tokens": 371402.0, + "repeat_count": 0.0, + "routers_loss": 0.013354359194636345, + "skip_count": 0.0, + "step": 230, + "text_loss": 0.7464763522148132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.000462, + "loss": 0.0731, + "macro_f1": 0.3333333432674408, + "num_tokens": 374587.0, + "repeat_count": 0.0, + "routers_loss": 0.013763721100986004, + "skip_count": 0.0, + "step": 232, + "text_loss": 0.8754443526268005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.098620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3984375, + "learning_rate": 0.00046600000000000005, + "loss": 0.0861, + "macro_f1": 0.3333333432674408, + "num_tokens": 377513.0, + "repeat_count": 0.0, + "routers_loss": 0.010075435042381287, + "skip_count": 0.0, + "step": 234, + "text_loss": 0.31534913182258606 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1080129145876136, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.00047, + "loss": 0.0791, + "macro_f1": 0.3272727429866791, + "num_tokens": 380736.0, + "repeat_count": 0.0, + "routers_loss": 0.059825167059898376, + "skip_count": 1.0, + "step": 236, + "text_loss": 0.5936337113380432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1174053419430585, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000474, + "loss": 0.0514, + "macro_f1": 0.32098764181137085, + "num_tokens": 383236.0, + "repeat_count": 0.0, + "routers_loss": 0.09134846180677414, + "skip_count": 2.0, + "step": 238, + "text_loss": 0.5976157784461975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1267977692985032, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.208984375, + "learning_rate": 0.00047799999999999996, + "loss": 0.0858, + "macro_f1": 0.32098764181137085, + "num_tokens": 385778.0, + "repeat_count": 1.0, + "routers_loss": 0.11989791691303253, + "skip_count": 1.0, + "step": 240, + "text_loss": 0.3554210960865021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1361901966539478, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.000482, + "loss": 0.0734, + "macro_f1": 0.3333333432674408, + "num_tokens": 388777.0, + "repeat_count": 0.0, + "routers_loss": 0.013591105118393898, + "skip_count": 0.0, + "step": 242, + "text_loss": 0.4829460382461548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1455826240093925, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12060546875, + "learning_rate": 0.000486, + "loss": 0.0625, + "macro_f1": 0.32098764181137085, + "num_tokens": 391797.0, + "repeat_count": 0.0, + "routers_loss": 0.0920003354549408, + "skip_count": 2.0, + "step": 244, + "text_loss": 0.3085818886756897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1549750513648371, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.00049, + "loss": 0.0501, + "macro_f1": 0.3333333432674408, + "num_tokens": 396485.0, + "repeat_count": 0.0, + "routers_loss": 0.0129330949857831, + "skip_count": 0.0, + "step": 246, + "text_loss": 0.42803969979286194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1643674787202818, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.296875, + "learning_rate": 0.000494, + "loss": 0.0945, + "macro_f1": 0.3144654333591461, + "num_tokens": 399923.0, + "repeat_count": 0.0, + "routers_loss": 0.10677755624055862, + "skip_count": 3.0, + "step": 248, + "text_loss": 0.2908555567264557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1737599060757264, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.000498, + "loss": 0.0812, + "macro_f1": 0.3144654333591461, + "num_tokens": 403647.0, + "repeat_count": 0.0, + "routers_loss": 0.1504337340593338, + "skip_count": 3.0, + "step": 250, + "text_loss": 0.333095908164978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.183152333431171, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.22265625, + "learning_rate": 0.0005020000000000001, + "loss": 0.0828, + "macro_f1": 0.32098764181137085, + "num_tokens": 409147.0, + "repeat_count": 0.0, + "routers_loss": 0.06503184884786606, + "skip_count": 2.0, + "step": 252, + "text_loss": 0.16117942333221436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1925447607866158, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.287109375, + "learning_rate": 0.000506, + "loss": 0.0995, + "macro_f1": 0.3333333432674408, + "num_tokens": 412072.0, + "repeat_count": 0.0, + "routers_loss": 0.016280122101306915, + "skip_count": 0.0, + "step": 254, + "text_loss": 0.4217492640018463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2019371881420604, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.00051, + "loss": 0.0803, + "macro_f1": 0.3144654333591461, + "num_tokens": 415052.0, + "repeat_count": 2.0, + "routers_loss": 0.2117508500814438, + "skip_count": 1.0, + "step": 256, + "text_loss": 0.5795308947563171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.211329615497505, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2421875, + "learning_rate": 0.000514, + "loss": 0.0668, + "macro_f1": 0.3272727429866791, + "num_tokens": 418099.0, + "repeat_count": 1.0, + "routers_loss": 0.15002092719078064, + "skip_count": 0.0, + "step": 258, + "text_loss": 0.4840938448905945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2207220428529497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.000518, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 422526.0, + "repeat_count": 0.0, + "routers_loss": 0.012834074907004833, + "skip_count": 0.0, + "step": 260, + "text_loss": 0.36141225695610046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2301144702083944, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.000522, + "loss": 0.085, + "macro_f1": 0.3076923191547394, + "num_tokens": 425765.0, + "repeat_count": 2.0, + "routers_loss": 0.23808011412620544, + "skip_count": 2.0, + "step": 262, + "text_loss": 0.27572691440582275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2395068975638392, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000526, + "loss": 0.0708, + "macro_f1": 0.3272727429866791, + "num_tokens": 429048.0, + "repeat_count": 0.0, + "routers_loss": 0.055687375366687775, + "skip_count": 1.0, + "step": 264, + "text_loss": 0.37020301818847656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.248899324919284, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005300000000000001, + "loss": 0.0839, + "macro_f1": 0.3272727429866791, + "num_tokens": 431784.0, + "repeat_count": 0.0, + "routers_loss": 0.0872957780957222, + "skip_count": 1.0, + "step": 266, + "text_loss": 0.5937283039093018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2582917522747286, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.0005340000000000001, + "loss": 0.0733, + "macro_f1": 0.32098764181137085, + "num_tokens": 434297.0, + "repeat_count": 2.0, + "routers_loss": 0.23507654666900635, + "skip_count": 0.0, + "step": 268, + "text_loss": 0.3367372453212738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2676841796301732, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005380000000000001, + "loss": 0.0708, + "macro_f1": 0.32098764181137085, + "num_tokens": 437586.0, + "repeat_count": 0.0, + "routers_loss": 0.12860390543937683, + "skip_count": 2.0, + "step": 270, + "text_loss": 0.7149854302406311 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2770766069856179, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005420000000000001, + "loss": 0.1072, + "macro_f1": 0.3272727429866791, + "num_tokens": 440649.0, + "repeat_count": 0.0, + "routers_loss": 0.044308312237262726, + "skip_count": 1.0, + "step": 272, + "text_loss": 0.26778292655944824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2864690343410625, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.44921875, + "learning_rate": 0.000546, + "loss": 0.0938, + "macro_f1": 0.3144654333591461, + "num_tokens": 443907.0, + "repeat_count": 0.0, + "routers_loss": 0.11514109373092651, + "skip_count": 3.0, + "step": 274, + "text_loss": 0.23578761518001556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 1.2958614616965072, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2578125, + "learning_rate": 0.00055, + "loss": 0.0932, + "macro_f1": 0.5492662787437439, + "num_tokens": 447147.0, + "repeat_count": 0.0, + "routers_loss": 0.055705297738313675, + "skip_count": 2.0, + "step": 276, + "text_loss": 0.2513524889945984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3052538890519518, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.29296875, + "learning_rate": 0.000554, + "loss": 0.0667, + "macro_f1": 0.32098764181137085, + "num_tokens": 450032.0, + "repeat_count": 0.0, + "routers_loss": 0.13778971135616302, + "skip_count": 2.0, + "step": 278, + "text_loss": 0.4857243597507477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3146463164073965, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.000558, + "loss": 0.0672, + "macro_f1": 0.3272727429866791, + "num_tokens": 453195.0, + "repeat_count": 1.0, + "routers_loss": 0.0700262188911438, + "skip_count": 0.0, + "step": 280, + "text_loss": 0.7589789628982544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3240387437628411, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.0005620000000000001, + "loss": 0.0603, + "macro_f1": 0.3144654333591461, + "num_tokens": 455942.0, + "repeat_count": 1.0, + "routers_loss": 0.11706235259771347, + "skip_count": 2.0, + "step": 282, + "text_loss": 0.4783432185649872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3334311711182858, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.265625, + "learning_rate": 0.000566, + "loss": 0.0793, + "macro_f1": 0.3272727429866791, + "num_tokens": 458932.0, + "repeat_count": 0.0, + "routers_loss": 0.07073967158794403, + "skip_count": 1.0, + "step": 284, + "text_loss": 0.7117193937301636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3428235984737307, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.00057, + "loss": 0.0915, + "macro_f1": 0.3272727429866791, + "num_tokens": 462650.0, + "repeat_count": 0.0, + "routers_loss": 0.05301115661859512, + "skip_count": 1.0, + "step": 286, + "text_loss": 0.4175460636615753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.352216025829175, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000574, + "loss": 0.0675, + "macro_f1": 0.3272727429866791, + "num_tokens": 466290.0, + "repeat_count": 0.0, + "routers_loss": 0.06356479972600937, + "skip_count": 1.0, + "step": 288, + "text_loss": 0.5832946300506592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.36160845318462, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28515625, + "learning_rate": 0.000578, + "loss": 0.0805, + "macro_f1": 0.3006536066532135, + "num_tokens": 469296.0, + "repeat_count": 1.0, + "routers_loss": 0.21032999455928802, + "skip_count": 3.0, + "step": 290, + "text_loss": 0.36023473739624023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3710008805400646, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.0005819999999999999, + "loss": 0.0685, + "macro_f1": 0.32098764181137085, + "num_tokens": 472272.0, + "repeat_count": 1.0, + "routers_loss": 0.08062280714511871, + "skip_count": 1.0, + "step": 292, + "text_loss": 0.37197956442832947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3803933078955093, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0005859999999999999, + "loss": 0.0878, + "macro_f1": 0.32098764181137085, + "num_tokens": 475864.0, + "repeat_count": 0.0, + "routers_loss": 0.05023600533604622, + "skip_count": 2.0, + "step": 294, + "text_loss": 0.4765273630619049 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.389785735250954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2177734375, + "learning_rate": 0.00059, + "loss": 0.0728, + "macro_f1": 0.3333333432674408, + "num_tokens": 478916.0, + "repeat_count": 0.0, + "routers_loss": 0.011689410544931889, + "skip_count": 0.0, + "step": 296, + "text_loss": 0.5878773927688599 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3991781626063986, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000594, + "loss": 0.0727, + "macro_f1": 0.3333333432674408, + "num_tokens": 482369.0, + "repeat_count": 0.0, + "routers_loss": 0.010772093199193478, + "skip_count": 0.0, + "step": 298, + "text_loss": 0.4424116313457489 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4085705899618433, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.181640625, + "learning_rate": 0.000598, + "loss": 0.0787, + "macro_f1": 0.3076923191547394, + "num_tokens": 486049.0, + "repeat_count": 2.0, + "routers_loss": 0.23482851684093475, + "skip_count": 2.0, + "step": 300, + "text_loss": 0.21217775344848633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.417963017317288, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.000602, + "loss": 0.073, + "macro_f1": 0.3076923191547394, + "num_tokens": 488683.0, + "repeat_count": 1.0, + "routers_loss": 0.18843084573745728, + "skip_count": 3.0, + "step": 302, + "text_loss": 0.2109498232603073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4273554446727326, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.279296875, + "learning_rate": 0.000606, + "loss": 0.0945, + "macro_f1": 0.3144654333591461, + "num_tokens": 492010.0, + "repeat_count": 0.0, + "routers_loss": 0.17861786484718323, + "skip_count": 3.0, + "step": 304, + "text_loss": 0.8446305394172668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4367478720281772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.00061, + "loss": 0.0827, + "macro_f1": 0.3333333432674408, + "num_tokens": 494764.0, + "repeat_count": 0.0, + "routers_loss": 0.014124520123004913, + "skip_count": 0.0, + "step": 306, + "text_loss": 0.742735743522644 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4461402993836219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.000614, + "loss": 0.1071, + "macro_f1": 0.3333333432674408, + "num_tokens": 497820.0, + "repeat_count": 0.0, + "routers_loss": 0.017968112602829933, + "skip_count": 0.0, + "step": 308, + "text_loss": 0.28305482864379883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4555327267390665, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1689453125, + "learning_rate": 0.0006180000000000001, + "loss": 0.0775, + "macro_f1": 0.32098764181137085, + "num_tokens": 500694.0, + "repeat_count": 0.0, + "routers_loss": 0.08593655377626419, + "skip_count": 2.0, + "step": 310, + "text_loss": 0.3496848940849304 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4649251540945114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19140625, + "learning_rate": 0.000622, + "loss": 0.061, + "macro_f1": 0.3333333432674408, + "num_tokens": 503871.0, + "repeat_count": 0.0, + "routers_loss": 0.016449492424726486, + "skip_count": 0.0, + "step": 312, + "text_loss": 0.6691372990608215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4743175814499558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.000626, + "loss": 0.0815, + "macro_f1": 0.3333333432674408, + "num_tokens": 506730.0, + "repeat_count": 0.0, + "routers_loss": 0.014532964676618576, + "skip_count": 0.0, + "step": 314, + "text_loss": 0.6118118166923523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4837100088054007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2216796875, + "learning_rate": 0.00063, + "loss": 0.0742, + "macro_f1": 0.3333333432674408, + "num_tokens": 510323.0, + "repeat_count": 0.0, + "routers_loss": 0.013093139044940472, + "skip_count": 0.0, + "step": 316, + "text_loss": 0.38126271963119507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4931024361608454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.400390625, + "learning_rate": 0.000634, + "loss": 0.0915, + "macro_f1": 0.3333333432674408, + "num_tokens": 514075.0, + "repeat_count": 0.0, + "routers_loss": 0.008627045899629593, + "skip_count": 0.0, + "step": 318, + "text_loss": 0.5983037948608398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000638, + "loss": 0.1008, + "macro_f1": 0.3272727429866791, + "num_tokens": 517418.0, + "repeat_count": 0.0, + "routers_loss": 0.04561378434300423, + "skip_count": 1.0, + "step": 320, + "text_loss": 0.767257034778595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.5118872908717347, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.259765625, + "learning_rate": 0.000642, + "loss": 0.0926, + "macro_f1": 0.3272727429866791, + "num_tokens": 520443.0, + "repeat_count": 0.0, + "routers_loss": 0.024372953921556473, + "skip_count": 0.0, + "step": 322, + "text_loss": 0.6572105884552002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5212797182271793, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30078125, + "learning_rate": 0.000646, + "loss": 0.0822, + "macro_f1": 0.3272727429866791, + "num_tokens": 523317.0, + "repeat_count": 1.0, + "routers_loss": 0.08099937438964844, + "skip_count": 0.0, + "step": 324, + "text_loss": 0.205499529838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.530672145582624, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.0006500000000000001, + "loss": 0.0809, + "macro_f1": 0.32098767161369324, + "num_tokens": 526355.0, + "repeat_count": 0.0, + "routers_loss": 0.0657225176692009, + "skip_count": 1.0, + "step": 326, + "text_loss": 0.2587239742279053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5400645729380686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.111328125, + "learning_rate": 0.0006540000000000001, + "loss": 0.0779, + "macro_f1": 0.3333333432674408, + "num_tokens": 529689.0, + "repeat_count": 0.0, + "routers_loss": 0.01849208027124405, + "skip_count": 0.0, + "step": 328, + "text_loss": 0.2172023057937622 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5494570002935135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1845703125, + "learning_rate": 0.0006580000000000001, + "loss": 0.0758, + "macro_f1": 0.3333333432674408, + "num_tokens": 532603.0, + "repeat_count": 0.0, + "routers_loss": 0.016184113919734955, + "skip_count": 0.0, + "step": 330, + "text_loss": 0.5980568528175354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.000662, + "loss": 0.0439, + "macro_f1": 0.3333333432674408, + "num_tokens": 536056.0, + "repeat_count": 0.0, + "routers_loss": 0.01303898449987173, + "skip_count": 0.0, + "step": 332, + "text_loss": 0.5421966314315796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 1.5682418550044028, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.296875, + "learning_rate": 0.000666, + "loss": 0.0963, + "macro_f1": 0.465986430644989, + "num_tokens": 539231.0, + "repeat_count": 3.0, + "routers_loss": 0.3075675964355469, + "skip_count": 3.0, + "step": 334, + "text_loss": 0.19719554483890533 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5776342823598473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.00067, + "loss": 0.0706, + "macro_f1": 0.3333333432674408, + "num_tokens": 542038.0, + "repeat_count": 0.0, + "routers_loss": 0.009116224013268948, + "skip_count": 0.0, + "step": 336, + "text_loss": 0.3407036066055298 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5870267097152921, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2421875, + "learning_rate": 0.000674, + "loss": 0.0768, + "macro_f1": 0.3333333432674408, + "num_tokens": 545019.0, + "repeat_count": 0.0, + "routers_loss": 0.021463042125105858, + "skip_count": 0.0, + "step": 338, + "text_loss": 0.24486012756824493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5964191370707366, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.0006780000000000001, + "loss": 0.0889, + "macro_f1": 0.3333333432674408, + "num_tokens": 548036.0, + "repeat_count": 0.0, + "routers_loss": 0.01857556402683258, + "skip_count": 0.0, + "step": 340, + "text_loss": 0.28140124678611755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6058115644261814, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0006820000000000001, + "loss": 0.0617, + "macro_f1": 0.3006536364555359, + "num_tokens": 551419.0, + "repeat_count": 2.0, + "routers_loss": 0.27090007066726685, + "skip_count": 3.0, + "step": 342, + "text_loss": 0.20690307021141052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.615203991781626, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3046875, + "learning_rate": 0.0006860000000000001, + "loss": 0.1047, + "macro_f1": 0.32098764181137085, + "num_tokens": 554037.0, + "repeat_count": 0.0, + "routers_loss": 0.09231195598840714, + "skip_count": 2.0, + "step": 344, + "text_loss": 0.4479128420352936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6245964191370708, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.00069, + "loss": 0.0883, + "macro_f1": 0.3333333432674408, + "num_tokens": 556672.0, + "repeat_count": 0.0, + "routers_loss": 0.00935924518853426, + "skip_count": 0.0, + "step": 346, + "text_loss": 0.6377320289611816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6339888464925154, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.000694, + "loss": 0.0781, + "macro_f1": 0.32098764181137085, + "num_tokens": 559756.0, + "repeat_count": 0.0, + "routers_loss": 0.17641772329807281, + "skip_count": 2.0, + "step": 348, + "text_loss": 0.6097636222839355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 1.64338127384796, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.30078125, + "learning_rate": 0.0006979999999999999, + "loss": 0.0616, + "macro_f1": 0.5492662787437439, + "num_tokens": 563415.0, + "repeat_count": 0.0, + "routers_loss": 0.06240406632423401, + "skip_count": 2.0, + "step": 350, + "text_loss": 0.5291631817817688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6527737012034047, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.296875, + "learning_rate": 0.0007019999999999999, + "loss": 0.1026, + "macro_f1": 0.3333333432674408, + "num_tokens": 566357.0, + "repeat_count": 0.0, + "routers_loss": 0.012269247323274612, + "skip_count": 0.0, + "step": 352, + "text_loss": 0.5170195698738098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6621661285588494, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0007059999999999999, + "loss": 0.0815, + "macro_f1": 0.32098764181137085, + "num_tokens": 569449.0, + "repeat_count": 0.0, + "routers_loss": 0.07515309751033783, + "skip_count": 2.0, + "step": 354, + "text_loss": 0.34507250785827637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6715585559142943, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.00071, + "loss": 0.0791, + "macro_f1": 0.3144654333591461, + "num_tokens": 572761.0, + "repeat_count": 1.0, + "routers_loss": 0.20768006145954132, + "skip_count": 2.0, + "step": 356, + "text_loss": 0.3158532381057739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6809509832697387, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.000714, + "loss": 0.0682, + "macro_f1": 0.3333333432674408, + "num_tokens": 575909.0, + "repeat_count": 0.0, + "routers_loss": 0.025329967960715294, + "skip_count": 0.0, + "step": 358, + "text_loss": 0.21455390751361847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.6903434106251836, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.000718, + "loss": 0.0775, + "macro_f1": 0.32098767161369324, + "num_tokens": 579186.0, + "repeat_count": 1.0, + "routers_loss": 0.07676175981760025, + "skip_count": 0.0, + "step": 360, + "text_loss": 0.61895352602005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.699735837980628, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000722, + "loss": 0.0781, + "macro_f1": 0.32098767161369324, + "num_tokens": 582437.0, + "repeat_count": 0.0, + "routers_loss": 0.08070661872625351, + "skip_count": 1.0, + "step": 362, + "text_loss": 0.20557661354541779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7091282653360729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2216796875, + "learning_rate": 0.000726, + "loss": 0.11, + "macro_f1": 0.3333333432674408, + "num_tokens": 586096.0, + "repeat_count": 0.0, + "routers_loss": 0.015891313552856445, + "skip_count": 0.0, + "step": 364, + "text_loss": 0.597991943359375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7185206926915173, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.00073, + "loss": 0.0573, + "macro_f1": 0.3076923191547394, + "num_tokens": 589520.0, + "repeat_count": 1.0, + "routers_loss": 0.12844261527061462, + "skip_count": 3.0, + "step": 366, + "text_loss": 0.2944789230823517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7279131200469622, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.000734, + "loss": 0.1005, + "macro_f1": 0.3333333432674408, + "num_tokens": 592691.0, + "repeat_count": 0.0, + "routers_loss": 0.02382199838757515, + "skip_count": 0.0, + "step": 368, + "text_loss": 0.23989969491958618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7373055474024068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.000738, + "loss": 0.0661, + "macro_f1": 0.3333333432674408, + "num_tokens": 596004.0, + "repeat_count": 0.0, + "routers_loss": 0.018812084570527077, + "skip_count": 0.0, + "step": 370, + "text_loss": 0.22111408412456512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7466979747578515, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.000742, + "loss": 0.0666, + "macro_f1": 0.3272727429866791, + "num_tokens": 599087.0, + "repeat_count": 0.0, + "routers_loss": 0.08290331065654755, + "skip_count": 1.0, + "step": 372, + "text_loss": 0.2567356526851654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7560904021132961, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.000746, + "loss": 0.0941, + "macro_f1": 0.32098764181137085, + "num_tokens": 602330.0, + "repeat_count": 1.0, + "routers_loss": 0.11482042074203491, + "skip_count": 1.0, + "step": 374, + "text_loss": 0.7217292785644531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7654828294687408, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2265625, + "learning_rate": 0.00075, + "loss": 0.0728, + "macro_f1": 0.3272727429866791, + "num_tokens": 605503.0, + "repeat_count": 1.0, + "routers_loss": 0.11849870532751083, + "skip_count": 0.0, + "step": 376, + "text_loss": 0.5122153759002686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.7748752568241855, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2333984375, + "learning_rate": 0.000754, + "loss": 0.0835, + "macro_f1": 0.32098767161369324, + "num_tokens": 608505.0, + "repeat_count": 0.0, + "routers_loss": 0.07090992480516434, + "skip_count": 1.0, + "step": 378, + "text_loss": 0.2204965502023697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.78426768417963, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.000758, + "loss": 0.0794, + "macro_f1": 0.3272727429866791, + "num_tokens": 611193.0, + "repeat_count": 0.0, + "routers_loss": 0.03812089189887047, + "skip_count": 1.0, + "step": 380, + "text_loss": 0.44909021258354187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.793660111535075, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1689453125, + "learning_rate": 0.000762, + "loss": 0.0882, + "macro_f1": 0.3272727429866791, + "num_tokens": 614231.0, + "repeat_count": 1.0, + "routers_loss": 0.10270529240369797, + "skip_count": 0.0, + "step": 382, + "text_loss": 0.13624964654445648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8030525388905194, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.330078125, + "learning_rate": 0.0007660000000000001, + "loss": 0.1107, + "macro_f1": 0.32098764181137085, + "num_tokens": 617090.0, + "repeat_count": 1.0, + "routers_loss": 0.11624004691839218, + "skip_count": 1.0, + "step": 384, + "text_loss": 0.7314052581787109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8124449662459643, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0007700000000000001, + "loss": 0.0628, + "macro_f1": 0.32098764181137085, + "num_tokens": 620596.0, + "repeat_count": 0.0, + "routers_loss": 0.07114322483539581, + "skip_count": 2.0, + "step": 386, + "text_loss": 0.503322958946228 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8218373936014087, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.0007740000000000001, + "loss": 0.0829, + "macro_f1": 0.32098764181137085, + "num_tokens": 624108.0, + "repeat_count": 0.0, + "routers_loss": 0.06061873584985733, + "skip_count": 2.0, + "step": 388, + "text_loss": 0.11481904983520508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8312298209568536, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2099609375, + "learning_rate": 0.000778, + "loss": 0.0791, + "macro_f1": 0.3006536364555359, + "num_tokens": 626895.0, + "repeat_count": 1.0, + "routers_loss": 0.2921771705150604, + "skip_count": 4.0, + "step": 390, + "text_loss": 0.3069624602794647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8406222483122983, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30859375, + "learning_rate": 0.000782, + "loss": 0.0605, + "macro_f1": 0.3076923191547394, + "num_tokens": 630204.0, + "repeat_count": 0.0, + "routers_loss": 0.202707901597023, + "skip_count": 4.0, + "step": 392, + "text_loss": 0.6022785305976868 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.29296875, + "learning_rate": 0.000786, + "loss": 0.0877, + "macro_f1": 0.3333333432674408, + "num_tokens": 634373.0, + "repeat_count": 0.0, + "routers_loss": 0.0221510399132967, + "skip_count": 0.0, + "step": 394, + "text_loss": 0.26787394285202026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8594071030231876, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.37890625, + "learning_rate": 0.00079, + "loss": 0.0805, + "macro_f1": 0.32098764181137085, + "num_tokens": 637442.0, + "repeat_count": 2.0, + "routers_loss": 0.12636390328407288, + "skip_count": 0.0, + "step": 396, + "text_loss": 0.2799781560897827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8687995303786322, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.0007940000000000001, + "loss": 0.0724, + "macro_f1": 0.32098764181137085, + "num_tokens": 641231.0, + "repeat_count": 0.0, + "routers_loss": 0.07933453470468521, + "skip_count": 2.0, + "step": 398, + "text_loss": 0.2507784366607666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8781919577340769, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.0007980000000000001, + "loss": 0.0909, + "macro_f1": 0.3272727429866791, + "num_tokens": 644560.0, + "repeat_count": 1.0, + "routers_loss": 0.10324911028146744, + "skip_count": 0.0, + "step": 400, + "text_loss": 0.7756280303001404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8875843850895215, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0008020000000000001, + "loss": 0.0783, + "macro_f1": 0.3144654333591461, + "num_tokens": 647393.0, + "repeat_count": 1.0, + "routers_loss": 0.18546262383460999, + "skip_count": 2.0, + "step": 402, + "text_loss": 0.5013328194618225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8969768124449664, + "f1_execute": 0.8571428656578064, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0008060000000000001, + "loss": 0.0787, + "macro_f1": 0.2857142984867096, + "num_tokens": 650355.0, + "repeat_count": 3.0, + "routers_loss": 0.3280293643474579, + "skip_count": 4.0, + "step": 404, + "text_loss": 0.2842077314853668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9063692398004108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.0008100000000000001, + "loss": 0.0901, + "macro_f1": 0.3333333432674408, + "num_tokens": 654280.0, + "repeat_count": 0.0, + "routers_loss": 0.02623247355222702, + "skip_count": 0.0, + "step": 406, + "text_loss": 0.46742817759513855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9157616671558557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.0008139999999999999, + "loss": 0.0945, + "macro_f1": 0.3333333432674408, + "num_tokens": 657568.0, + "repeat_count": 0.0, + "routers_loss": 0.009744114242494106, + "skip_count": 0.0, + "step": 408, + "text_loss": 0.7168047428131104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9251540945113002, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.0008179999999999999, + "loss": 0.1065, + "macro_f1": 0.32098764181137085, + "num_tokens": 660593.0, + "repeat_count": 0.0, + "routers_loss": 0.07591600716114044, + "skip_count": 2.0, + "step": 410, + "text_loss": 0.449823260307312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0008219999999999999, + "loss": 0.0795, + "macro_f1": 0.3333333432674408, + "num_tokens": 663916.0, + "repeat_count": 0.0, + "routers_loss": 0.02076602540910244, + "skip_count": 0.0, + "step": 412, + "text_loss": 0.4764713943004608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9439389492221895, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.000826, + "loss": 0.0836, + "macro_f1": 0.3272727429866791, + "num_tokens": 667502.0, + "repeat_count": 0.0, + "routers_loss": 0.049170155078172684, + "skip_count": 1.0, + "step": 414, + "text_loss": 0.30333325266838074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9533313765776343, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.00083, + "loss": 0.1021, + "macro_f1": 0.3272727429866791, + "num_tokens": 670510.0, + "repeat_count": 1.0, + "routers_loss": 0.15554003417491913, + "skip_count": 0.0, + "step": 416, + "text_loss": 0.3691870868206024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000834, + "loss": 0.1013, + "macro_f1": 0.3333333432674408, + "num_tokens": 674761.0, + "repeat_count": 0.0, + "routers_loss": 0.024516675621271133, + "skip_count": 0.0, + "step": 418, + "text_loss": 0.32850381731987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9721162312885236, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.000838, + "loss": 0.0649, + "macro_f1": 0.3333333432674408, + "num_tokens": 678055.0, + "repeat_count": 0.0, + "routers_loss": 0.011026890948414803, + "skip_count": 0.0, + "step": 420, + "text_loss": 0.6637290716171265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9815086586439683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000842, + "loss": 0.0771, + "macro_f1": 0.3272727429866791, + "num_tokens": 680979.0, + "repeat_count": 0.0, + "routers_loss": 0.07451887428760529, + "skip_count": 1.0, + "step": 422, + "text_loss": 0.27131685614585876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.990901085999413, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000846, + "loss": 0.0714, + "macro_f1": 0.32098764181137085, + "num_tokens": 684144.0, + "repeat_count": 1.0, + "routers_loss": 0.11341800540685654, + "skip_count": 1.0, + "step": 424, + "text_loss": 0.652126669883728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.00085, + "loss": 0.0754, + "macro_f1": 0.3272727429866791, + "num_tokens": 687004.0, + "repeat_count": 1.0, + "routers_loss": 0.08985847979784012, + "skip_count": 0.0, + "step": 426, + "text_loss": 0.2589428424835205 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23828125, + "learning_rate": 0.000854, + "loss": 0.0866, + "macro_f1": 0.3333333432674408, + "num_tokens": 689702.0, + "repeat_count": 0.0, + "routers_loss": 0.011355436407029629, + "skip_count": 0.0, + "step": 428, + "text_loss": 0.8909716010093689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0187848547108893, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.000858, + "loss": 0.0623, + "macro_f1": 0.3333333432674408, + "num_tokens": 692698.0, + "repeat_count": 0.0, + "routers_loss": 0.013788948766887188, + "skip_count": 0.0, + "step": 430, + "text_loss": 0.19141142070293427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.028177282066334, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.000862, + "loss": 0.0499, + "macro_f1": 0.32098764181137085, + "num_tokens": 696007.0, + "repeat_count": 0.0, + "routers_loss": 0.07998392730951309, + "skip_count": 2.0, + "step": 432, + "text_loss": 0.1611809879541397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0375697094217786, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.000866, + "loss": 0.0541, + "macro_f1": 0.32098764181137085, + "num_tokens": 700271.0, + "repeat_count": 0.0, + "routers_loss": 0.06988382339477539, + "skip_count": 2.0, + "step": 434, + "text_loss": 0.37254223227500916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0469621367772235, + "f1_execute": 0.8333333730697632, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.00087, + "loss": 0.0834, + "macro_f1": 0.2777777910232544, + "num_tokens": 703519.0, + "repeat_count": 3.0, + "routers_loss": 0.28240787982940674, + "skip_count": 5.0, + "step": 436, + "text_loss": 0.29636648297309875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.056354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.423828125, + "learning_rate": 0.000874, + "loss": 0.0657, + "macro_f1": 0.3333333432674408, + "num_tokens": 706826.0, + "repeat_count": 0.0, + "routers_loss": 0.013924967497587204, + "skip_count": 0.0, + "step": 438, + "text_loss": 0.20867908000946045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.065746991488113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000878, + "loss": 0.0657, + "macro_f1": 0.3333333432674408, + "num_tokens": 710530.0, + "repeat_count": 0.0, + "routers_loss": 0.01170142088085413, + "skip_count": 0.0, + "step": 440, + "text_loss": 0.7273373007774353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.000882, + "loss": 0.076, + "macro_f1": 0.3333333432674408, + "num_tokens": 713503.0, + "repeat_count": 0.0, + "routers_loss": 0.011930872686207294, + "skip_count": 0.0, + "step": 442, + "text_loss": 0.39314430952072144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.084531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0008860000000000001, + "loss": 0.0592, + "macro_f1": 0.3333333432674408, + "num_tokens": 716582.0, + "repeat_count": 0.0, + "routers_loss": 0.008630385622382164, + "skip_count": 0.0, + "step": 444, + "text_loss": 0.5925271511077881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.0939242735544465, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0008900000000000001, + "loss": 0.0811, + "macro_f1": 0.3006536066532135, + "num_tokens": 719941.0, + "repeat_count": 3.0, + "routers_loss": 0.3015584945678711, + "skip_count": 1.0, + "step": 446, + "text_loss": 0.5059905052185059 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.1033167009098914, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.000894, + "loss": 0.0822, + "macro_f1": 0.31446540355682373, + "num_tokens": 723113.0, + "repeat_count": 1.0, + "routers_loss": 0.10897493362426758, + "skip_count": 1.0, + "step": 448, + "text_loss": 0.19616436958312988 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.112709128265336, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.33984375, + "learning_rate": 0.000898, + "loss": 0.0782, + "macro_f1": 0.32098764181137085, + "num_tokens": 726193.0, + "repeat_count": 0.0, + "routers_loss": 0.07236456125974655, + "skip_count": 2.0, + "step": 450, + "text_loss": 0.1773054152727127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1221015556207807, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3203125, + "learning_rate": 0.000902, + "loss": 0.058, + "macro_f1": 0.3272727429866791, + "num_tokens": 729275.0, + "repeat_count": 1.0, + "routers_loss": 0.08184371143579483, + "skip_count": 0.0, + "step": 452, + "text_loss": 0.4927310049533844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1314939829762256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.000906, + "loss": 0.0607, + "macro_f1": 0.3333333432674408, + "num_tokens": 731948.0, + "repeat_count": 0.0, + "routers_loss": 0.014033539220690727, + "skip_count": 0.0, + "step": 454, + "text_loss": 0.4745742678642273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.00091, + "loss": 0.0651, + "macro_f1": 0.3333333432674408, + "num_tokens": 735351.0, + "repeat_count": 0.0, + "routers_loss": 0.0071774693205952644, + "skip_count": 0.0, + "step": 456, + "text_loss": 0.18523462116718292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.150278837687115, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.400390625, + "learning_rate": 0.0009140000000000001, + "loss": 0.0738, + "macro_f1": 0.5492662787437439, + "num_tokens": 738587.0, + "repeat_count": 0.0, + "routers_loss": 0.07781517505645752, + "skip_count": 2.0, + "step": 458, + "text_loss": 0.3459635376930237 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 2.1596712650425594, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0009180000000000001, + "loss": 0.0723, + "macro_f1": 0.3076923191547394, + "num_tokens": 741779.0, + "repeat_count": 0.0, + "routers_loss": 0.09529037028551102, + "skip_count": 2.0, + "step": 460, + "text_loss": 0.20197433233261108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1690636923980042, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.0009220000000000001, + "loss": 0.0519, + "macro_f1": 0.3333333432674408, + "num_tokens": 745355.0, + "repeat_count": 0.0, + "routers_loss": 0.009765669703483582, + "skip_count": 0.0, + "step": 462, + "text_loss": 0.7031404376029968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1784561197534487, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009260000000000001, + "loss": 0.0527, + "macro_f1": 0.3272727429866791, + "num_tokens": 748628.0, + "repeat_count": 0.0, + "routers_loss": 0.03344850242137909, + "skip_count": 1.0, + "step": 464, + "text_loss": 0.21274663507938385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1878485471088935, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.00093, + "loss": 0.0534, + "macro_f1": 0.3076923191547394, + "num_tokens": 751472.0, + "repeat_count": 2.0, + "routers_loss": 0.1354292333126068, + "skip_count": 2.0, + "step": 466, + "text_loss": 0.5350717306137085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.197240974464338, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.000934, + "loss": 0.0598, + "macro_f1": 0.3272727429866791, + "num_tokens": 754479.0, + "repeat_count": 0.0, + "routers_loss": 0.056420840322971344, + "skip_count": 1.0, + "step": 468, + "text_loss": 0.28153330087661743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.206633401819783, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.234375, + "learning_rate": 0.0009379999999999999, + "loss": 0.0597, + "macro_f1": 0.31446540355682373, + "num_tokens": 757872.0, + "repeat_count": 1.0, + "routers_loss": 0.1622387170791626, + "skip_count": 1.0, + "step": 470, + "text_loss": 0.22956843674182892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2160258291752273, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.5, + "learning_rate": 0.000942, + "loss": 0.0953, + "macro_f1": 0.32098764181137085, + "num_tokens": 760468.0, + "repeat_count": 0.0, + "routers_loss": 0.05146972835063934, + "skip_count": 2.0, + "step": 472, + "text_loss": 0.4513966739177704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.225418256530672, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000946, + "loss": 0.0592, + "macro_f1": 0.3272727429866791, + "num_tokens": 763519.0, + "repeat_count": 1.0, + "routers_loss": 0.09022669494152069, + "skip_count": 0.0, + "step": 474, + "text_loss": 0.25758957862854004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.234810683886117, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.00095, + "loss": 0.0498, + "macro_f1": 0.3272727429866791, + "num_tokens": 767391.0, + "repeat_count": 0.0, + "routers_loss": 0.03044828027486801, + "skip_count": 1.0, + "step": 476, + "text_loss": 0.21366681158542633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2442031112415615, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.291015625, + "learning_rate": 0.000954, + "loss": 0.0802, + "macro_f1": 0.3272727429866791, + "num_tokens": 770338.0, + "repeat_count": 0.0, + "routers_loss": 0.10397060960531235, + "skip_count": 1.0, + "step": 478, + "text_loss": 1.0396177768707275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.2535955385970063, + "f1_execute": 0.8571429252624512, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000958, + "loss": 0.1099, + "macro_f1": 0.285714328289032, + "num_tokens": 773699.0, + "repeat_count": 2.0, + "routers_loss": 0.22604143619537354, + "skip_count": 4.0, + "step": 480, + "text_loss": 0.2570283114910126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.2629879659524508, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.146484375, + "learning_rate": 0.000962, + "loss": 0.0667, + "macro_f1": 0.32098767161369324, + "num_tokens": 777473.0, + "repeat_count": 0.0, + "routers_loss": 0.048258859664201736, + "skip_count": 1.0, + "step": 482, + "text_loss": 0.2540103495121002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2723803933078957, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000966, + "loss": 0.0592, + "macro_f1": 0.3333333432674408, + "num_tokens": 780833.0, + "repeat_count": 0.0, + "routers_loss": 0.023018671199679375, + "skip_count": 0.0, + "step": 484, + "text_loss": 0.38524550199508667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.28177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.314453125, + "learning_rate": 0.0009699999999999999, + "loss": 0.0709, + "macro_f1": 0.3272727429866791, + "num_tokens": 783656.0, + "repeat_count": 0.0, + "routers_loss": 0.044845327734947205, + "skip_count": 1.0, + "step": 486, + "text_loss": 0.5859048366546631 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000974, + "loss": 0.0615, + "macro_f1": 0.3333333432674408, + "num_tokens": 787173.0, + "repeat_count": 0.0, + "routers_loss": 0.010898692533373833, + "skip_count": 0.0, + "step": 488, + "text_loss": 0.3456067442893982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3005576753742294, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000978, + "loss": 0.0796, + "macro_f1": 0.32098764181137085, + "num_tokens": 790395.0, + "repeat_count": 0.0, + "routers_loss": 0.06497956812381744, + "skip_count": 2.0, + "step": 490, + "text_loss": 0.3751123249530792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3099501027296743, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000982, + "loss": 0.0772, + "macro_f1": 0.3272727429866791, + "num_tokens": 793137.0, + "repeat_count": 0.0, + "routers_loss": 0.07763728499412537, + "skip_count": 1.0, + "step": 492, + "text_loss": 0.43296709656715393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3193425300851187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009860000000000001, + "loss": 0.0819, + "macro_f1": 0.3333333432674408, + "num_tokens": 796497.0, + "repeat_count": 0.0, + "routers_loss": 0.02127906307578087, + "skip_count": 0.0, + "step": 494, + "text_loss": 0.4841311275959015 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3287349574405636, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.00099, + "loss": 0.073, + "macro_f1": 0.3272727429866791, + "num_tokens": 799361.0, + "repeat_count": 1.0, + "routers_loss": 0.09518691152334213, + "skip_count": 0.0, + "step": 496, + "text_loss": 0.5094487071037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.3381273847960085, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.130859375, + "learning_rate": 0.000994, + "loss": 0.0789, + "macro_f1": 0.5492662787437439, + "num_tokens": 802629.0, + "repeat_count": 0.0, + "routers_loss": 0.0563947930932045, + "skip_count": 2.0, + "step": 498, + "text_loss": 0.42783617973327637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.347519812151453, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.000998, + "loss": 0.0476, + "macro_f1": 0.3272727429866791, + "num_tokens": 805881.0, + "repeat_count": 1.0, + "routers_loss": 0.10570426285266876, + "skip_count": 0.0, + "step": 500, + "text_loss": 0.28395503759384155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.3569122395068973, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009999999760498814, + "loss": 0.0849, + "macro_f1": 0.5492662787437439, + "num_tokens": 809283.0, + "repeat_count": 0.0, + "routers_loss": 0.031202208250761032, + "skip_count": 2.0, + "step": 502, + "text_loss": 0.32970911264419556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.366304666862342, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009999997844489475, + "loss": 0.0574, + "macro_f1": 0.3272727429866791, + "num_tokens": 812440.0, + "repeat_count": 0.0, + "routers_loss": 0.07647835463285446, + "skip_count": 1.0, + "step": 504, + "text_loss": 0.4901447296142578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.375697094217787, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.000999999401247153, + "loss": 0.0668, + "macro_f1": 0.32098764181137085, + "num_tokens": 815716.0, + "repeat_count": 0.0, + "routers_loss": 0.08515176922082901, + "skip_count": 2.0, + "step": 506, + "text_loss": 0.6157599687576294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.0009999988264446445, + "loss": 0.0686, + "macro_f1": 0.3333333432674408, + "num_tokens": 819086.0, + "repeat_count": 0.0, + "routers_loss": 0.00946938619017601, + "skip_count": 0.0, + "step": 508, + "text_loss": 0.5053519010543823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3944819489286764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009999980600416424, + "loss": 0.0574, + "macro_f1": 0.3333333432674408, + "num_tokens": 822268.0, + "repeat_count": 0.0, + "routers_loss": 0.01058756373822689, + "skip_count": 0.0, + "step": 510, + "text_loss": 0.5570021867752075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.403874376284121, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.000999997102038441, + "loss": 0.0678, + "macro_f1": 0.3333333432674408, + "num_tokens": 825728.0, + "repeat_count": 0.0, + "routers_loss": 0.008705209009349346, + "skip_count": 0.0, + "step": 512, + "text_loss": 0.6519040465354919 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4132668036395657, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.0009999959524354064, + "loss": 0.083, + "macro_f1": 0.3272727429866791, + "num_tokens": 829459.0, + "repeat_count": 0.0, + "routers_loss": 0.04024193435907364, + "skip_count": 1.0, + "step": 514, + "text_loss": 0.5290043950080872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.00099999461123298, + "loss": 0.0727, + "macro_f1": 0.3333333432674408, + "num_tokens": 832291.0, + "repeat_count": 0.0, + "routers_loss": 0.015742862597107887, + "skip_count": 0.0, + "step": 516, + "text_loss": 0.7910057902336121 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.432051658350455, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.000999993078431675, + "loss": 0.0759, + "macro_f1": 0.3076923191547394, + "num_tokens": 835399.0, + "repeat_count": 1.0, + "routers_loss": 0.16753782331943512, + "skip_count": 3.0, + "step": 518, + "text_loss": 0.45196083188056946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.4414440857058994, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.236328125, + "learning_rate": 0.0009999913540320792, + "loss": 0.0968, + "macro_f1": 0.31446540355682373, + "num_tokens": 838993.0, + "repeat_count": 0.0, + "routers_loss": 0.09357143193483353, + "skip_count": 2.0, + "step": 520, + "text_loss": 0.5499435663223267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.4508365130613443, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2451171875, + "learning_rate": 0.0009999894380348536, + "loss": 0.0821, + "macro_f1": 0.5492662787437439, + "num_tokens": 842652.0, + "repeat_count": 0.0, + "routers_loss": 0.056803856045007706, + "skip_count": 2.0, + "step": 522, + "text_loss": 0.197520449757576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.4602289404167887, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.2333984375, + "learning_rate": 0.000999987330440732, + "loss": 0.0725, + "macro_f1": 0.4871794879436493, + "num_tokens": 847061.0, + "repeat_count": 0.0, + "routers_loss": 0.08962195366621017, + "skip_count": 3.0, + "step": 524, + "text_loss": 0.27509039640426636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4696213677722336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000999985031250522, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 850780.0, + "repeat_count": 0.0, + "routers_loss": 0.022930558770895004, + "skip_count": 0.0, + "step": 526, + "text_loss": 0.13291706144809723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4790137951276785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.0009999825404651053, + "loss": 0.0614, + "macro_f1": 0.3333333432674408, + "num_tokens": 853886.0, + "repeat_count": 0.0, + "routers_loss": 0.017097990959882736, + "skip_count": 0.0, + "step": 528, + "text_loss": 0.21706295013427734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.0009999798580854356, + "loss": 0.0724, + "macro_f1": 0.3333333432674408, + "num_tokens": 857364.0, + "repeat_count": 0.0, + "routers_loss": 0.02831801027059555, + "skip_count": 0.0, + "step": 530, + "text_loss": 0.9035662412643433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.000999976984112541, + "loss": 0.0674, + "macro_f1": 0.3333333432674408, + "num_tokens": 860661.0, + "repeat_count": 0.0, + "routers_loss": 0.019671892747282982, + "skip_count": 0.0, + "step": 532, + "text_loss": 0.8354863524436951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.5071910771940122, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.2890625, + "learning_rate": 0.0009999739185475231, + "loss": 0.0963, + "macro_f1": 0.47333335876464844, + "num_tokens": 864124.0, + "repeat_count": 2.0, + "routers_loss": 0.21383361518383026, + "skip_count": 3.0, + "step": 534, + "text_loss": 0.23422949016094208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.516583504549457, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0009999706613915565, + "loss": 0.0598, + "macro_f1": 0.32098767161369324, + "num_tokens": 866976.0, + "repeat_count": 0.0, + "routers_loss": 0.07158871740102768, + "skip_count": 1.0, + "step": 536, + "text_loss": 0.11800774186849594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5259759319049016, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.0009999672126458894, + "loss": 0.0822, + "macro_f1": 0.3272727429866791, + "num_tokens": 870549.0, + "repeat_count": 0.0, + "routers_loss": 0.08185924589633942, + "skip_count": 1.0, + "step": 538, + "text_loss": 0.19232480227947235 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5353683592603464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.000999963572311843, + "loss": 0.0604, + "macro_f1": 0.3333333432674408, + "num_tokens": 873733.0, + "repeat_count": 0.0, + "routers_loss": 0.01633382774889469, + "skip_count": 0.0, + "step": 540, + "text_loss": 0.3725031912326813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.544760786615791, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009999597403908128, + "loss": 0.0761, + "macro_f1": 0.3272727429866791, + "num_tokens": 877099.0, + "repeat_count": 0.0, + "routers_loss": 0.0782657191157341, + "skip_count": 1.0, + "step": 542, + "text_loss": 0.17589199542999268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.5541532139712357, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, + "learning_rate": 0.0009999557168842669, + "loss": 0.0716, + "macro_f1": 0.5492662787437439, + "num_tokens": 879883.0, + "repeat_count": 0.0, + "routers_loss": 0.05275818333029747, + "skip_count": 2.0, + "step": 544, + "text_loss": 0.26448264718055725 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.56354564132668, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0009999515017937468, + "loss": 0.071, + "macro_f1": 0.32098764181137085, + "num_tokens": 882223.0, + "repeat_count": 0.0, + "routers_loss": 0.09335892647504807, + "skip_count": 2.0, + "step": 546, + "text_loss": 0.208544060587883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.572938068682125, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.376953125, + "learning_rate": 0.0009999470951208684, + "loss": 0.0855, + "macro_f1": 0.32098764181137085, + "num_tokens": 885241.0, + "repeat_count": 2.0, + "routers_loss": 0.22983254492282867, + "skip_count": 0.0, + "step": 548, + "text_loss": 0.6612338423728943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.58233049603757, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.00099994249686732, + "loss": 0.0786, + "macro_f1": 0.3272727429866791, + "num_tokens": 887897.0, + "repeat_count": 1.0, + "routers_loss": 0.12858282029628754, + "skip_count": 0.0, + "step": 550, + "text_loss": 0.4673548936843872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5917229233930144, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009999377070348638, + "loss": 0.0944, + "macro_f1": 0.3333333432674408, + "num_tokens": 891224.0, + "repeat_count": 0.0, + "routers_loss": 0.017421770840883255, + "skip_count": 0.0, + "step": 552, + "text_loss": 0.6419258117675781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.601115350748459, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000999932725625335, + "loss": 0.0791, + "macro_f1": 0.32098764181137085, + "num_tokens": 894578.0, + "repeat_count": 0.0, + "routers_loss": 0.07890026271343231, + "skip_count": 2.0, + "step": 554, + "text_loss": 0.5970752239227295 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.6105077781039037, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.0009999275526406427, + "loss": 0.0796, + "macro_f1": 0.31446540355682373, + "num_tokens": 897145.0, + "repeat_count": 1.0, + "routers_loss": 0.09836960583925247, + "skip_count": 1.0, + "step": 556, + "text_loss": 0.752425491809845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6199002054593485, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1875, + "learning_rate": 0.0009999221880827693, + "loss": 0.0882, + "macro_f1": 0.3333333432674408, + "num_tokens": 900565.0, + "repeat_count": 0.0, + "routers_loss": 0.017694659531116486, + "skip_count": 0.0, + "step": 558, + "text_loss": 0.195619136095047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.629292632814793, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2021484375, + "learning_rate": 0.0009999166319537703, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 903506.0, + "repeat_count": 0.0, + "routers_loss": 0.019375264644622803, + "skip_count": 0.0, + "step": 560, + "text_loss": 0.4603337347507477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.638685060170238, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.146484375, + "learning_rate": 0.0009999108842557748, + "loss": 0.0953, + "macro_f1": 0.4871794879436493, + "num_tokens": 906380.0, + "repeat_count": 0.0, + "routers_loss": 0.12013207376003265, + "skip_count": 3.0, + "step": 562, + "text_loss": 0.6279402375221252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6480774875256823, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009999049449909854, + "loss": 0.0799, + "macro_f1": 0.3272727429866791, + "num_tokens": 909116.0, + "repeat_count": 0.0, + "routers_loss": 0.06441342830657959, + "skip_count": 1.0, + "step": 564, + "text_loss": 0.23741699755191803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.657469914881127, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009998988141616781, + "loss": 0.064, + "macro_f1": 0.32098767161369324, + "num_tokens": 912189.0, + "repeat_count": 0.0, + "routers_loss": 0.08309414982795715, + "skip_count": 1.0, + "step": 566, + "text_loss": 0.27780941128730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6668623422365716, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009998924917702023, + "loss": 0.0876, + "macro_f1": 0.3272727429866791, + "num_tokens": 916279.0, + "repeat_count": 1.0, + "routers_loss": 0.07197169959545135, + "skip_count": 0.0, + "step": 568, + "text_loss": 0.6371755599975586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6762547695920165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2255859375, + "learning_rate": 0.0009998859778189806, + "loss": 0.0706, + "macro_f1": 0.3333333432674408, + "num_tokens": 919490.0, + "repeat_count": 0.0, + "routers_loss": 0.008022273890674114, + "skip_count": 0.0, + "step": 570, + "text_loss": 0.6028938889503479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6856471969474613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.000999879272310509, + "loss": 0.084, + "macro_f1": 0.3333333432674408, + "num_tokens": 923694.0, + "repeat_count": 0.0, + "routers_loss": 0.01634674146771431, + "skip_count": 0.0, + "step": 572, + "text_loss": 0.7177054286003113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.0009998723752473574, + "loss": 0.0716, + "macro_f1": 0.3272727429866791, + "num_tokens": 926933.0, + "repeat_count": 0.0, + "routers_loss": 0.060559045523405075, + "skip_count": 1.0, + "step": 574, + "text_loss": 0.5203254818916321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.0009998652866321687, + "loss": 0.0801, + "macro_f1": 0.3333333432674408, + "num_tokens": 929832.0, + "repeat_count": 0.0, + "routers_loss": 0.011485611088573933, + "skip_count": 0.0, + "step": 576, + "text_loss": 0.6147452592849731 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.713824479013795, + "f1_execute": 0.8799999952316284, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.000999858006467659, + "loss": 0.0649, + "macro_f1": 0.29333335161209106, + "num_tokens": 933266.0, + "repeat_count": 2.0, + "routers_loss": 0.2929030954837799, + "skip_count": 4.0, + "step": 578, + "text_loss": 0.1720666140317917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.72321690636924, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.24609375, + "learning_rate": 0.0009998505347566186, + "loss": 0.0782, + "macro_f1": 0.32098764181137085, + "num_tokens": 937545.0, + "repeat_count": 0.0, + "routers_loss": 0.053780000656843185, + "skip_count": 2.0, + "step": 580, + "text_loss": 0.3258405327796936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7326093337246844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.00099984287150191, + "loss": 0.0582, + "macro_f1": 0.3333333432674408, + "num_tokens": 941001.0, + "repeat_count": 0.0, + "routers_loss": 0.02637636847794056, + "skip_count": 0.0, + "step": 582, + "text_loss": 0.23762771487236023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7420017610801293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009998350167064705, + "loss": 0.0672, + "macro_f1": 0.3333333432674408, + "num_tokens": 943989.0, + "repeat_count": 0.0, + "routers_loss": 0.01637580618262291, + "skip_count": 0.0, + "step": 584, + "text_loss": 0.7460582852363586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7513941884355737, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009998269703733096, + "loss": 0.0686, + "macro_f1": 0.3272727429866791, + "num_tokens": 947245.0, + "repeat_count": 1.0, + "routers_loss": 0.13934117555618286, + "skip_count": 0.0, + "step": 586, + "text_loss": 0.5284690260887146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7607866157910186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.13671875, + "learning_rate": 0.0009998187325055106, + "loss": 0.0667, + "macro_f1": 0.3333333432674408, + "num_tokens": 950116.0, + "repeat_count": 0.0, + "routers_loss": 0.02138397842645645, + "skip_count": 0.0, + "step": 588, + "text_loss": 0.3920256197452545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009998103031062305, + "loss": 0.0778, + "macro_f1": 0.3333333432674408, + "num_tokens": 953277.0, + "repeat_count": 0.0, + "routers_loss": 0.007098200265318155, + "skip_count": 0.0, + "step": 590, + "text_loss": 0.7472905516624451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.779571470501908, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.318359375, + "learning_rate": 0.0009998016821786994, + "loss": 0.0872, + "macro_f1": 0.32098764181137085, + "num_tokens": 958229.0, + "repeat_count": 1.0, + "routers_loss": 0.07946522533893585, + "skip_count": 1.0, + "step": 592, + "text_loss": 0.5506448745727539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7889638978573528, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.000999792869726221, + "loss": 0.0523, + "macro_f1": 0.3272727429866791, + "num_tokens": 961016.0, + "repeat_count": 0.0, + "routers_loss": 0.0850791186094284, + "skip_count": 1.0, + "step": 594, + "text_loss": 0.3824431002140045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009997838657521717, + "loss": 0.0632, + "macro_f1": 0.3333333432674408, + "num_tokens": 963847.0, + "repeat_count": 0.0, + "routers_loss": 0.016370445489883423, + "skip_count": 0.0, + "step": 596, + "text_loss": 0.2139475792646408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.8077487525682416, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009997746702600026, + "loss": 0.0702, + "macro_f1": 0.307692289352417, + "num_tokens": 966619.0, + "repeat_count": 0.0, + "routers_loss": 0.1310746818780899, + "skip_count": 3.0, + "step": 598, + "text_loss": 0.3651018440723419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8171411799236865, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23828125, + "learning_rate": 0.0009997652832532372, + "loss": 0.0792, + "macro_f1": 0.3272727429866791, + "num_tokens": 970418.0, + "repeat_count": 1.0, + "routers_loss": 0.14303378760814667, + "skip_count": 0.0, + "step": 600, + "text_loss": 0.7094736099243164 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8265336072791314, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009997557047354722, + "loss": 0.0531, + "macro_f1": 0.3272727429866791, + "num_tokens": 973491.0, + "repeat_count": 0.0, + "routers_loss": 0.03334212675690651, + "skip_count": 1.0, + "step": 602, + "text_loss": 0.4812237024307251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.835926034634576, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.0009997459347103783, + "loss": 0.0956, + "macro_f1": 0.3272727429866791, + "num_tokens": 976672.0, + "repeat_count": 0.0, + "routers_loss": 0.02831871062517166, + "skip_count": 0.0, + "step": 604, + "text_loss": 0.21737146377563477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8453184619900207, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009997359731816998, + "loss": 0.0646, + "macro_f1": 0.3333333432674408, + "num_tokens": 979898.0, + "repeat_count": 0.0, + "routers_loss": 0.017968013882637024, + "skip_count": 0.0, + "step": 606, + "text_loss": 0.5458008050918579 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.854710889345465, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.224609375, + "learning_rate": 0.0009997258201532536, + "loss": 0.0751, + "macro_f1": 0.3333333432674408, + "num_tokens": 982811.0, + "repeat_count": 0.0, + "routers_loss": 0.016256732866168022, + "skip_count": 0.0, + "step": 608, + "text_loss": 0.8643257021903992 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009997154756289303, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 985245.0, + "repeat_count": 0.0, + "routers_loss": 0.021214161068201065, + "skip_count": 0.0, + "step": 610, + "text_loss": 0.2204967886209488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8734957440563544, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.000999704939612694, + "loss": 0.0636, + "macro_f1": 0.3006536364555359, + "num_tokens": 988539.0, + "repeat_count": 3.0, + "routers_loss": 0.23249399662017822, + "skip_count": 2.0, + "step": 612, + "text_loss": 0.32489025592803955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8828881714117993, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009996942121085824, + "loss": 0.0445, + "macro_f1": 0.3333333432674408, + "num_tokens": 991660.0, + "repeat_count": 0.0, + "routers_loss": 0.010706410743296146, + "skip_count": 0.0, + "step": 614, + "text_loss": 0.4551754891872406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8922805987672437, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3671875, + "learning_rate": 0.000999683293120706, + "loss": 0.1016, + "macro_f1": 0.3333333432674408, + "num_tokens": 994828.0, + "repeat_count": 0.0, + "routers_loss": 0.006676184479147196, + "skip_count": 0.0, + "step": 616, + "text_loss": 0.6212068200111389 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9016730261226886, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.408203125, + "learning_rate": 0.0009996721826532491, + "loss": 0.0976, + "macro_f1": 0.3076923191547394, + "num_tokens": 997951.0, + "repeat_count": 2.0, + "routers_loss": 0.2148125320672989, + "skip_count": 2.0, + "step": 618, + "text_loss": 0.26514527201652527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.911065453478133, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1904296875, + "learning_rate": 0.000999660880710469, + "loss": 0.0909, + "macro_f1": 0.3333333432674408, + "num_tokens": 1001139.0, + "repeat_count": 0.0, + "routers_loss": 0.022332455962896347, + "skip_count": 0.0, + "step": 620, + "text_loss": 0.26131340861320496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.920457880833578, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009996493872966971, + "loss": 0.0732, + "macro_f1": 0.3272727429866791, + "num_tokens": 1003678.0, + "repeat_count": 1.0, + "routers_loss": 0.08348730951547623, + "skip_count": 0.0, + "step": 622, + "text_loss": 0.19151706993579865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009996377024163374, + "loss": 0.0822, + "macro_f1": 0.3333333432674408, + "num_tokens": 1007082.0, + "repeat_count": 0.0, + "routers_loss": 0.028577150776982307, + "skip_count": 0.0, + "step": 624, + "text_loss": 0.305387407541275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9392427355444672, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0009996258260738676, + "loss": 0.0892, + "macro_f1": 0.3272727429866791, + "num_tokens": 1010064.0, + "repeat_count": 1.0, + "routers_loss": 0.08312026411294937, + "skip_count": 0.0, + "step": 626, + "text_loss": 0.49436143040657043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9486351628999117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009996137582738388, + "loss": 0.0591, + "macro_f1": 0.3333333432674408, + "num_tokens": 1013462.0, + "repeat_count": 0.0, + "routers_loss": 0.013337327167391777, + "skip_count": 0.0, + "step": 628, + "text_loss": 0.6515294313430786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9580275902553566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000999601499020875, + "loss": 0.0537, + "macro_f1": 0.3333333432674408, + "num_tokens": 1016246.0, + "repeat_count": 0.0, + "routers_loss": 0.029126765206456184, + "skip_count": 0.0, + "step": 630, + "text_loss": 0.18834827840328217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9674200176108014, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009995890483196746, + "loss": 0.0602, + "macro_f1": 0.3272727429866791, + "num_tokens": 1019286.0, + "repeat_count": 0.0, + "routers_loss": 0.054844800382852554, + "skip_count": 1.0, + "step": 632, + "text_loss": 0.6988179087638855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.322265625, + "learning_rate": 0.0009995764061750086, + "loss": 0.0767, + "macro_f1": 0.3333333432674408, + "num_tokens": 1022207.0, + "repeat_count": 0.0, + "routers_loss": 0.010095693171024323, + "skip_count": 0.0, + "step": 634, + "text_loss": 0.558451771736145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9862048723216907, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.000999563572591721, + "loss": 0.0521, + "macro_f1": 0.32098764181137085, + "num_tokens": 1025319.0, + "repeat_count": 1.0, + "routers_loss": 0.0698433518409729, + "skip_count": 1.0, + "step": 636, + "text_loss": 0.5961872935295105 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.995597299677135, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009995505475747302, + "loss": 0.0849, + "macro_f1": 0.3272727429866791, + "num_tokens": 1028362.0, + "repeat_count": 0.0, + "routers_loss": 0.040211405605077744, + "skip_count": 1.0, + "step": 638, + "text_loss": 0.546863317489624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.004696213677722, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.0009995373311290272, + "loss": 0.0709, + "macro_f1": 0.3144654333591461, + "num_tokens": 1032199.0, + "repeat_count": 2.0, + "routers_loss": 0.1457643061876297, + "skip_count": 1.0, + "step": 640, + "text_loss": 0.2137298285961151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009995239232596764, + "loss": 0.0545, + "macro_f1": 0.3333333432674408, + "num_tokens": 1035801.0, + "repeat_count": 0.0, + "routers_loss": 0.011394930072128773, + "skip_count": 0.0, + "step": 642, + "text_loss": 0.43054503202438354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0234810683886115, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009995103239718163, + "loss": 0.0665, + "macro_f1": 0.3333333432674408, + "num_tokens": 1039223.0, + "repeat_count": 0.0, + "routers_loss": 0.00997432041913271, + "skip_count": 0.0, + "step": 644, + "text_loss": 0.7749615907669067 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0328734957440564, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009994965332706573, + "loss": 0.0755, + "macro_f1": 0.3144654333591461, + "num_tokens": 1042154.0, + "repeat_count": 3.0, + "routers_loss": 0.10589150339365005, + "skip_count": 0.0, + "step": 646, + "text_loss": 0.7812211513519287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.042265923099501, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.0009994825511614846, + "loss": 0.0383, + "macro_f1": 0.3272727429866791, + "num_tokens": 1045250.0, + "repeat_count": 0.0, + "routers_loss": 0.0748734176158905, + "skip_count": 1.0, + "step": 648, + "text_loss": 0.844803512096405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0516583504549457, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1220703125, + "learning_rate": 0.0009994683776496562, + "loss": 0.0433, + "macro_f1": 0.3272727429866791, + "num_tokens": 1048446.0, + "repeat_count": 0.0, + "routers_loss": 0.03742415830492973, + "skip_count": 1.0, + "step": 650, + "text_loss": 0.2098839282989502 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0610507778103906, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009994540127406034, + "loss": 0.0591, + "macro_f1": 0.32098764181137085, + "num_tokens": 1051840.0, + "repeat_count": 0.0, + "routers_loss": 0.06025516986846924, + "skip_count": 2.0, + "step": 652, + "text_loss": 0.27727583050727844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.070443205165835, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.181640625, + "learning_rate": 0.0009994394564398306, + "loss": 0.0519, + "macro_f1": 0.521541953086853, + "num_tokens": 1055142.0, + "repeat_count": 4.0, + "routers_loss": 0.22807340323925018, + "skip_count": 2.0, + "step": 654, + "text_loss": 0.9672397971153259 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009994247087529158, + "loss": 0.0618, + "macro_f1": 0.3333333432674408, + "num_tokens": 1057698.0, + "repeat_count": 0.0, + "routers_loss": 0.01348950993269682, + "skip_count": 0.0, + "step": 656, + "text_loss": 0.6375506520271301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.0009994097696855106, + "loss": 0.0412, + "macro_f1": 0.3333333432674408, + "num_tokens": 1060624.0, + "repeat_count": 0.0, + "routers_loss": 0.009649243205785751, + "skip_count": 0.0, + "step": 658, + "text_loss": 0.5315385460853577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.098620487232169, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2041015625, + "learning_rate": 0.0009993946392433395, + "loss": 0.0609, + "macro_f1": 0.307692289352417, + "num_tokens": 1065076.0, + "repeat_count": 0.0, + "routers_loss": 0.1250980943441391, + "skip_count": 3.0, + "step": 660, + "text_loss": 0.25780341029167175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1080129145876136, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009993793174322006, + "loss": 0.0471, + "macro_f1": 0.3333333432674408, + "num_tokens": 1068365.0, + "repeat_count": 0.0, + "routers_loss": 0.011544390581548214, + "skip_count": 0.0, + "step": 662, + "text_loss": 0.34876301884651184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1174053419430585, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009993638042579654, + "loss": 0.0473, + "macro_f1": 0.3272727429866791, + "num_tokens": 1071693.0, + "repeat_count": 0.0, + "routers_loss": 0.03777370601892471, + "skip_count": 1.0, + "step": 664, + "text_loss": 0.21811571717262268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.126797769298503, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.203125, + "learning_rate": 0.0009993480997265783, + "loss": 0.0475, + "macro_f1": 0.5492662787437439, + "num_tokens": 1074733.0, + "repeat_count": 0.0, + "routers_loss": 0.049949806183576584, + "skip_count": 2.0, + "step": 666, + "text_loss": 0.38410288095474243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.136190196653948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10302734375, + "learning_rate": 0.0009993322038440572, + "loss": 0.0605, + "macro_f1": 0.3333333432674408, + "num_tokens": 1077993.0, + "repeat_count": 0.0, + "routers_loss": 0.0247171800583601, + "skip_count": 0.0, + "step": 668, + "text_loss": 0.25576895475387573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1455826240093923, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.000999316116616494, + "loss": 0.0619, + "macro_f1": 0.3333333432674408, + "num_tokens": 1080491.0, + "repeat_count": 0.0, + "routers_loss": 0.008118715137243271, + "skip_count": 0.0, + "step": 670, + "text_loss": 0.6269792914390564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.154975051364837, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009992998380500527, + "loss": 0.0462, + "macro_f1": 0.3272727429866791, + "num_tokens": 1083817.0, + "repeat_count": 0.0, + "routers_loss": 0.03366057574748993, + "skip_count": 1.0, + "step": 672, + "text_loss": 0.26891493797302246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1643674787202816, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009992833681509716, + "loss": 0.0529, + "macro_f1": 0.3333333432674408, + "num_tokens": 1087368.0, + "repeat_count": 0.0, + "routers_loss": 0.020552074536681175, + "skip_count": 0.0, + "step": 674, + "text_loss": 0.14421936869621277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.1737599060757264, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.18359375, + "learning_rate": 0.0009992667069255619, + "loss": 0.0696, + "macro_f1": 0.31446540355682373, + "num_tokens": 1090452.0, + "repeat_count": 0.0, + "routers_loss": 0.06937336176633835, + "skip_count": 2.0, + "step": 676, + "text_loss": 0.24999259412288666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1831523334311713, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08740234375, + "learning_rate": 0.0009992498543802085, + "loss": 0.0588, + "macro_f1": 0.3272727429866791, + "num_tokens": 1093996.0, + "repeat_count": 1.0, + "routers_loss": 0.0380021296441555, + "skip_count": 0.0, + "step": 678, + "text_loss": 0.42473849654197693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 3.1925447607866158, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.2119140625, + "learning_rate": 0.0009992328105213688, + "loss": 0.0411, + "macro_f1": 0.4400000274181366, + "num_tokens": 1096837.0, + "repeat_count": 1.0, + "routers_loss": 0.20885063707828522, + "skip_count": 4.0, + "step": 680, + "text_loss": 0.3829527199268341 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.2019371881420606, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009992155753555747, + "loss": 0.0722, + "macro_f1": 0.5492662787437439, + "num_tokens": 1100320.0, + "repeat_count": 0.0, + "routers_loss": 0.018230699002742767, + "skip_count": 2.0, + "step": 682, + "text_loss": 0.6190969944000244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.211329615497505, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30859375, + "learning_rate": 0.0009991981488894303, + "loss": 0.0681, + "macro_f1": 0.32098767161369324, + "num_tokens": 1103682.0, + "repeat_count": 0.0, + "routers_loss": 0.05550144240260124, + "skip_count": 1.0, + "step": 684, + "text_loss": 0.44418027997016907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.22072204285295, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.0009991805311296133, + "loss": 0.0507, + "macro_f1": 0.32098764181137085, + "num_tokens": 1106427.0, + "repeat_count": 0.0, + "routers_loss": 0.07990608364343643, + "skip_count": 2.0, + "step": 686, + "text_loss": 0.5577231645584106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2301144702083944, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009991627220828753, + "loss": 0.0568, + "macro_f1": 0.32098764181137085, + "num_tokens": 1109314.0, + "repeat_count": 0.0, + "routers_loss": 0.05167485028505325, + "skip_count": 2.0, + "step": 688, + "text_loss": 0.27325430512428284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.2395068975638392, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009991447217560408, + "loss": 0.0521, + "macro_f1": 0.5492662787437439, + "num_tokens": 1112748.0, + "repeat_count": 0.0, + "routers_loss": 0.04621964320540428, + "skip_count": 2.0, + "step": 690, + "text_loss": 0.5288321375846863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.2488993249192837, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.000999126530156007, + "loss": 0.0499, + "macro_f1": 0.307692289352417, + "num_tokens": 1116965.0, + "repeat_count": 1.0, + "routers_loss": 0.11950276792049408, + "skip_count": 2.0, + "step": 692, + "text_loss": 0.14215624332427979 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2582917522747286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.0009991081472897454, + "loss": 0.0722, + "macro_f1": 0.3333333432674408, + "num_tokens": 1120570.0, + "repeat_count": 0.0, + "routers_loss": 0.01905500330030918, + "skip_count": 0.0, + "step": 694, + "text_loss": 0.41862696409225464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.267684179630173, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009990895731643002, + "loss": 0.0464, + "macro_f1": 0.3272727429866791, + "num_tokens": 1124009.0, + "repeat_count": 1.0, + "routers_loss": 0.06974572688341141, + "skip_count": 0.0, + "step": 696, + "text_loss": 0.41160130500793457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.277076606985618, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.000999070807786789, + "loss": 0.0531, + "macro_f1": 0.3272727429866791, + "num_tokens": 1127370.0, + "repeat_count": 1.0, + "routers_loss": 0.07055293023586273, + "skip_count": 0.0, + "step": 698, + "text_loss": 0.48068273067474365 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2864690343410627, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000999051851164403, + "loss": 0.0619, + "macro_f1": 0.32098764181137085, + "num_tokens": 1130234.0, + "repeat_count": 1.0, + "routers_loss": 0.12506946921348572, + "skip_count": 1.0, + "step": 700, + "text_loss": 0.47925490140914917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000999032703304406, + "loss": 0.0674, + "macro_f1": 0.3333333432674408, + "num_tokens": 1132874.0, + "repeat_count": 0.0, + "routers_loss": 0.00809287466108799, + "skip_count": 0.0, + "step": 702, + "text_loss": 0.47433632612228394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.305253889051952, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009990133642141358, + "loss": 0.0497, + "macro_f1": 0.5492662787437439, + "num_tokens": 1136011.0, + "repeat_count": 0.0, + "routers_loss": 0.0319170281291008, + "skip_count": 2.0, + "step": 704, + "text_loss": 0.6574832201004028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3146463164073965, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.33984375, + "learning_rate": 0.000998993833901003, + "loss": 0.0619, + "macro_f1": 0.32098764181137085, + "num_tokens": 1139674.0, + "repeat_count": 0.0, + "routers_loss": 0.09850362688302994, + "skip_count": 2.0, + "step": 706, + "text_loss": 0.7660127282142639 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3240387437628414, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009989741123724919, + "loss": 0.0574, + "macro_f1": 0.3333333432674408, + "num_tokens": 1143558.0, + "repeat_count": 0.0, + "routers_loss": 0.006673311349004507, + "skip_count": 0.0, + "step": 708, + "text_loss": 0.5976111888885498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009989541996361594, + "loss": 0.045, + "macro_f1": 0.3333333432674408, + "num_tokens": 1146122.0, + "repeat_count": 0.0, + "routers_loss": 0.004988791421055794, + "skip_count": 0.0, + "step": 710, + "text_loss": 0.5256119966506958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3428235984737307, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009989340956996367, + "loss": 0.0528, + "macro_f1": 0.3333333432674408, + "num_tokens": 1149546.0, + "repeat_count": 0.0, + "routers_loss": 0.0067769973538815975, + "skip_count": 0.0, + "step": 712, + "text_loss": 0.5040497779846191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.352216025829175, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.0009989138005706273, + "loss": 0.0735, + "macro_f1": 0.32098764181137085, + "num_tokens": 1153195.0, + "repeat_count": 0.0, + "routers_loss": 0.09899546951055527, + "skip_count": 2.0, + "step": 714, + "text_loss": 0.20803412795066833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.000998893314256908, + "loss": 0.064, + "macro_f1": 0.3333333432674408, + "num_tokens": 1157081.0, + "repeat_count": 0.0, + "routers_loss": 0.010492355562746525, + "skip_count": 0.0, + "step": 716, + "text_loss": 0.23077639937400818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3710008805400644, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009988726367663298, + "loss": 0.0539, + "macro_f1": 0.3333333432674408, + "num_tokens": 1160079.0, + "repeat_count": 0.0, + "routers_loss": 0.01063773687928915, + "skip_count": 0.0, + "step": 718, + "text_loss": 0.6085864901542664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3803933078955093, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009988517681068163, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1163249.0, + "repeat_count": 1.0, + "routers_loss": 0.05981874838471413, + "skip_count": 0.0, + "step": 720, + "text_loss": 0.4047050476074219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3897857352509537, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009988307082863638, + "loss": 0.0361, + "macro_f1": 0.3333333432674408, + "num_tokens": 1166259.0, + "repeat_count": 0.0, + "routers_loss": 0.009750043973326683, + "skip_count": 0.0, + "step": 722, + "text_loss": 0.5306474566459656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.3991781626063986, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.240234375, + "learning_rate": 0.0009988094573130434, + "loss": 0.063, + "macro_f1": 0.5359477400779724, + "num_tokens": 1168887.0, + "repeat_count": 2.0, + "routers_loss": 0.18601104617118835, + "skip_count": 2.0, + "step": 724, + "text_loss": 0.53528892993927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.408570589961843, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009987880151949974, + "loss": 0.0496, + "macro_f1": 0.3272727429866791, + "num_tokens": 1172625.0, + "repeat_count": 0.0, + "routers_loss": 0.02845010720193386, + "skip_count": 1.0, + "step": 726, + "text_loss": 0.4760453701019287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.417963017317288, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, + "learning_rate": 0.0009987663819404434, + "loss": 0.06, + "macro_f1": 0.5492662787437439, + "num_tokens": 1176580.0, + "repeat_count": 0.0, + "routers_loss": 0.017596980556845665, + "skip_count": 2.0, + "step": 728, + "text_loss": 0.5146099328994751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.427355444672733, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000998744557557671, + "loss": 0.0484, + "macro_f1": 0.3272727429866791, + "num_tokens": 1179804.0, + "repeat_count": 0.0, + "routers_loss": 0.0625474750995636, + "skip_count": 1.0, + "step": 730, + "text_loss": 0.27738022804260254 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.436747872028177, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.0009987225420550433, + "loss": 0.0796, + "macro_f1": 0.307692289352417, + "num_tokens": 1182658.0, + "repeat_count": 1.0, + "routers_loss": 0.16188351809978485, + "skip_count": 2.0, + "step": 732, + "text_loss": 0.23231445252895355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2001953125, + "learning_rate": 0.0009987003354409965, + "loss": 0.0626, + "macro_f1": 0.3272727429866791, + "num_tokens": 1185451.0, + "repeat_count": 0.0, + "routers_loss": 0.02391529455780983, + "skip_count": 0.0, + "step": 734, + "text_loss": 0.4496627151966095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.4555327267390665, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.234375, + "learning_rate": 0.0009986779377240405, + "loss": 0.0513, + "macro_f1": 0.32098767161369324, + "num_tokens": 1188666.0, + "repeat_count": 0.0, + "routers_loss": 0.08435963839292526, + "skip_count": 1.0, + "step": 736, + "text_loss": 0.4950787127017975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.4649251540945114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1220703125, + "learning_rate": 0.000998655348912758, + "loss": 0.0515, + "macro_f1": 0.3333333432674408, + "num_tokens": 1193035.0, + "repeat_count": 0.0, + "routers_loss": 0.01648722216486931, + "skip_count": 0.0, + "step": 738, + "text_loss": 0.24761848151683807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.0009986325690158051, + "loss": 0.0435, + "macro_f1": 0.3333333432674408, + "num_tokens": 1196840.0, + "repeat_count": 0.0, + "routers_loss": 0.013143910095095634, + "skip_count": 0.0, + "step": 740, + "text_loss": 0.15662719309329987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.4837100088054007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009986095980419113, + "loss": 0.0757, + "macro_f1": 0.3333333432674408, + "num_tokens": 1200573.0, + "repeat_count": 0.0, + "routers_loss": 0.026706280186772346, + "skip_count": 0.0, + "step": 742, + "text_loss": 0.16725164651870728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.493102436160845, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1982421875, + "learning_rate": 0.0009985864359998787, + "loss": 0.0795, + "macro_f1": 0.3006536364555359, + "num_tokens": 1203589.0, + "repeat_count": 2.0, + "routers_loss": 0.28607678413391113, + "skip_count": 3.0, + "step": 744, + "text_loss": 0.6350882053375244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009985630828985835, + "loss": 0.0572, + "macro_f1": 0.3272727429866791, + "num_tokens": 1206422.0, + "repeat_count": 0.0, + "routers_loss": 0.05685260891914368, + "skip_count": 1.0, + "step": 746, + "text_loss": 0.33779552578926086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.5118872908717345, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009985395387469742, + "loss": 0.0458, + "macro_f1": 0.5492662787437439, + "num_tokens": 1211588.0, + "repeat_count": 0.0, + "routers_loss": 0.0437830351293087, + "skip_count": 2.0, + "step": 748, + "text_loss": 0.28664472699165344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5212797182271793, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009985158035540735, + "loss": 0.0714, + "macro_f1": 0.32098764181137085, + "num_tokens": 1214580.0, + "repeat_count": 2.0, + "routers_loss": 0.07074898481369019, + "skip_count": 0.0, + "step": 750, + "text_loss": 0.3939313292503357 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.0009984918773289762, + "loss": 0.0699, + "macro_f1": 0.3333333432674408, + "num_tokens": 1217388.0, + "repeat_count": 0.0, + "routers_loss": 0.009757856838405132, + "skip_count": 0.0, + "step": 752, + "text_loss": 0.37641215324401855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5400645729380686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009984677600808512, + "loss": 0.054, + "macro_f1": 0.3333333432674408, + "num_tokens": 1219960.0, + "repeat_count": 0.0, + "routers_loss": 0.02515069581568241, + "skip_count": 0.0, + "step": 754, + "text_loss": 0.155938982963562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5494570002935135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30078125, + "learning_rate": 0.0009984434518189405, + "loss": 0.0764, + "macro_f1": 0.3333333432674408, + "num_tokens": 1223234.0, + "repeat_count": 0.0, + "routers_loss": 0.025766927748918533, + "skip_count": 0.0, + "step": 756, + "text_loss": 0.691118061542511 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 3.558849427648958, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009984189525525584, + "loss": 0.0451, + "macro_f1": 0.5359477400779724, + "num_tokens": 1225764.0, + "repeat_count": 2.0, + "routers_loss": 0.1782722771167755, + "skip_count": 2.0, + "step": 758, + "text_loss": 0.3592209219932556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.568241855004403, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.0009983942622910935, + "loss": 0.0659, + "macro_f1": 0.3333333432674408, + "num_tokens": 1230097.0, + "repeat_count": 0.0, + "routers_loss": 0.00825568474829197, + "skip_count": 0.0, + "step": 760, + "text_loss": 0.4646475315093994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5776342823598473, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009983693810440074, + "loss": 0.0477, + "macro_f1": 0.32098764181137085, + "num_tokens": 1233140.0, + "repeat_count": 0.0, + "routers_loss": 0.04156976938247681, + "skip_count": 2.0, + "step": 762, + "text_loss": 0.298682302236557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.587026709715292, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3515625, + "learning_rate": 0.000998344308820834, + "loss": 0.0666, + "macro_f1": 0.3272727429866791, + "num_tokens": 1236305.0, + "repeat_count": 0.0, + "routers_loss": 0.05697929114103317, + "skip_count": 1.0, + "step": 764, + "text_loss": 0.5249121189117432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5964191370707366, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.18359375, + "learning_rate": 0.0009983190456311817, + "loss": 0.0592, + "macro_f1": 0.3144654333591461, + "num_tokens": 1239673.0, + "repeat_count": 0.0, + "routers_loss": 0.09547408670186996, + "skip_count": 3.0, + "step": 766, + "text_loss": 0.41277334094047546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.6058115644261814, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.185546875, + "learning_rate": 0.000998293591484731, + "loss": 0.0484, + "macro_f1": 0.5492662787437439, + "num_tokens": 1242292.0, + "repeat_count": 0.0, + "routers_loss": 0.030693158507347107, + "skip_count": 2.0, + "step": 768, + "text_loss": 0.1583656519651413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000998267946391236, + "loss": 0.051, + "macro_f1": 0.3333333432674408, + "num_tokens": 1244661.0, + "repeat_count": 0.0, + "routers_loss": 0.01211300864815712, + "skip_count": 0.0, + "step": 770, + "text_loss": 0.4629349112510681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6245964191370708, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009982421103605238, + "loss": 0.0441, + "macro_f1": 0.32098764181137085, + "num_tokens": 1248688.0, + "repeat_count": 0.0, + "routers_loss": 0.0665968507528305, + "skip_count": 2.0, + "step": 772, + "text_loss": 0.4019293785095215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6339888464925156, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.000998216083402495, + "loss": 0.0613, + "macro_f1": 0.32098764181137085, + "num_tokens": 1251395.0, + "repeat_count": 0.0, + "routers_loss": 0.07186859846115112, + "skip_count": 2.0, + "step": 774, + "text_loss": 0.4659276604652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.302734375, + "learning_rate": 0.0009981898655271235, + "loss": 0.0488, + "macro_f1": 0.3333333432674408, + "num_tokens": 1254888.0, + "repeat_count": 0.0, + "routers_loss": 0.007823926396667957, + "skip_count": 0.0, + "step": 776, + "text_loss": 0.5160359740257263 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 3.6527737012034045, + "f1_execute": 0.9130434989929199, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009981634567444557, + "loss": 0.0775, + "macro_f1": 0.590062141418457, + "num_tokens": 1258250.0, + "repeat_count": 3.0, + "routers_loss": 0.24624499678611755, + "skip_count": 4.0, + "step": 778, + "text_loss": 0.29319918155670166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6621661285588494, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.0009981368570646115, + "loss": 0.0885, + "macro_f1": 0.3272727429866791, + "num_tokens": 1260916.0, + "repeat_count": 0.0, + "routers_loss": 0.030730176717042923, + "skip_count": 1.0, + "step": 780, + "text_loss": 0.624981164932251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6715585559142943, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009981100664977838, + "loss": 0.0699, + "macro_f1": 0.3333333432674408, + "num_tokens": 1264004.0, + "repeat_count": 0.0, + "routers_loss": 0.006829176563769579, + "skip_count": 0.0, + "step": 782, + "text_loss": 0.6137266159057617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6809509832697387, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009980830850542391, + "loss": 0.058, + "macro_f1": 0.3333333432674408, + "num_tokens": 1267130.0, + "repeat_count": 0.0, + "routers_loss": 0.018471000716090202, + "skip_count": 0.0, + "step": 784, + "text_loss": 0.15213175117969513 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6903434106251836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.0009980559127443166, + "loss": 0.052, + "macro_f1": 0.3333333432674408, + "num_tokens": 1271129.0, + "repeat_count": 0.0, + "routers_loss": 0.007903140969574451, + "skip_count": 0.0, + "step": 786, + "text_loss": 0.5768613219261169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.699735837980628, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.000998028549578429, + "loss": 0.0719, + "macro_f1": 0.307692289352417, + "num_tokens": 1274232.0, + "repeat_count": 0.0, + "routers_loss": 0.06737866252660751, + "skip_count": 3.0, + "step": 788, + "text_loss": 0.2877073585987091 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.709128265336073, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009980009955670615, + "loss": 0.0698, + "macro_f1": 0.3144654333591461, + "num_tokens": 1277193.0, + "repeat_count": 0.0, + "routers_loss": 0.10194934904575348, + "skip_count": 3.0, + "step": 790, + "text_loss": 0.11860492825508118 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7185206926915173, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.000997973250720773, + "loss": 0.0552, + "macro_f1": 0.32098764181137085, + "num_tokens": 1280960.0, + "repeat_count": 0.0, + "routers_loss": 0.10297708213329315, + "skip_count": 2.0, + "step": 792, + "text_loss": 0.13477706909179688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.727913120046962, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009979453150501954, + "loss": 0.0663, + "macro_f1": 0.32098764181137085, + "num_tokens": 1284611.0, + "repeat_count": 1.0, + "routers_loss": 0.06122037023305893, + "skip_count": 1.0, + "step": 794, + "text_loss": 0.40569379925727844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.737305547402407, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000997917188566034, + "loss": 0.062, + "macro_f1": 0.32098764181137085, + "num_tokens": 1287834.0, + "repeat_count": 0.0, + "routers_loss": 0.061135001480579376, + "skip_count": 2.0, + "step": 796, + "text_loss": 0.2829287648200989 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7466979747578515, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.109375, + "learning_rate": 0.0009978888712790664, + "loss": 0.0654, + "macro_f1": 0.3272727429866791, + "num_tokens": 1291666.0, + "repeat_count": 0.0, + "routers_loss": 0.04841872677206993, + "skip_count": 1.0, + "step": 798, + "text_loss": 1.011757254600525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.20000000298023224, + "avg_layers": 27.0, + "epoch": 3.756090402113296, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.3333333134651184, + "grad_norm": 0.14453125, + "learning_rate": 0.0009978603632001444, + "loss": 0.0636, + "macro_f1": 0.4104308485984802, + "num_tokens": 1294627.0, + "repeat_count": 1.0, + "routers_loss": 0.15698759257793427, + "skip_count": 5.0, + "step": 800, + "text_loss": 0.4457623362541199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0009978316643401916, + "loss": 0.0688, + "macro_f1": 0.3333333432674408, + "num_tokens": 1297711.0, + "repeat_count": 0.0, + "routers_loss": 0.018952010199427605, + "skip_count": 0.0, + "step": 802, + "text_loss": 0.2069481462240219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7748752568241857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.14453125, + "learning_rate": 0.0009978027747102062, + "loss": 0.0479, + "macro_f1": 0.3333333432674408, + "num_tokens": 1300569.0, + "repeat_count": 0.0, + "routers_loss": 0.014538386836647987, + "skip_count": 0.0, + "step": 804, + "text_loss": 0.4983852505683899 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.78426768417963, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2109375, + "learning_rate": 0.0009977736943212584, + "loss": 0.0721, + "macro_f1": 0.32098764181137085, + "num_tokens": 1303969.0, + "repeat_count": 0.0, + "routers_loss": 0.11164087057113647, + "skip_count": 2.0, + "step": 806, + "text_loss": 0.2910642921924591 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.793660111535075, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.000997744423184492, + "loss": 0.0424, + "macro_f1": 0.3272727429866791, + "num_tokens": 1307263.0, + "repeat_count": 0.0, + "routers_loss": 0.06073406711220741, + "skip_count": 1.0, + "step": 808, + "text_loss": 0.18831779062747955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 3.8030525388905194, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.26171875, + "learning_rate": 0.0009977149613111236, + "loss": 0.0486, + "macro_f1": 0.4400000274181366, + "num_tokens": 1309953.0, + "repeat_count": 1.0, + "routers_loss": 0.11035524308681488, + "skip_count": 4.0, + "step": 810, + "text_loss": 0.7872759699821472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8124449662459643, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009976853087124433, + "loss": 0.0536, + "macro_f1": 0.3333333432674408, + "num_tokens": 1313243.0, + "repeat_count": 0.0, + "routers_loss": 0.021804286167025566, + "skip_count": 0.0, + "step": 812, + "text_loss": 0.22349292039871216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.8218373936014087, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0009976554653998138, + "loss": 0.0612, + "macro_f1": 0.31446540355682373, + "num_tokens": 1316165.0, + "repeat_count": 0.0, + "routers_loss": 0.10715524107217789, + "skip_count": 2.0, + "step": 814, + "text_loss": 0.18035532534122467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8312298209568536, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000997625431384671, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1319206.0, + "repeat_count": 0.0, + "routers_loss": 0.007173649035394192, + "skip_count": 0.0, + "step": 816, + "text_loss": 0.48928648233413696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8406222483122985, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009975952066785243, + "loss": 0.0655, + "macro_f1": 0.3006536364555359, + "num_tokens": 1322549.0, + "repeat_count": 1.0, + "routers_loss": 0.22308112680912018, + "skip_count": 4.0, + "step": 818, + "text_loss": 0.5211259722709656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009975647912929557, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1325213.0, + "repeat_count": 0.0, + "routers_loss": 0.00998698640614748, + "skip_count": 0.0, + "step": 820, + "text_loss": 0.7117052674293518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8594071030231873, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009975341852396205, + "loss": 0.0723, + "macro_f1": 0.32098764181137085, + "num_tokens": 1328383.0, + "repeat_count": 0.0, + "routers_loss": 0.07454588264226913, + "skip_count": 2.0, + "step": 822, + "text_loss": 0.34539610147476196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8687995303786322, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009975033885302469, + "loss": 0.0604, + "macro_f1": 0.3333333432674408, + "num_tokens": 1331406.0, + "repeat_count": 0.0, + "routers_loss": 0.009157589636743069, + "skip_count": 0.0, + "step": 824, + "text_loss": 0.7484824657440186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.878191957734077, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009974724011766363, + "loss": 0.0474, + "macro_f1": 0.3272727429866791, + "num_tokens": 1334410.0, + "repeat_count": 1.0, + "routers_loss": 0.17149391770362854, + "skip_count": 0.0, + "step": 826, + "text_loss": 0.5913820266723633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8875843850895215, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009974412231906632, + "loss": 0.058, + "macro_f1": 0.32098764181137085, + "num_tokens": 1337653.0, + "repeat_count": 1.0, + "routers_loss": 0.09743282198905945, + "skip_count": 1.0, + "step": 828, + "text_loss": 0.2505693733692169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8969768124449664, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009974098545842748, + "loss": 0.0638, + "macro_f1": 0.3272727429866791, + "num_tokens": 1340860.0, + "repeat_count": 0.0, + "routers_loss": 0.041490405797958374, + "skip_count": 1.0, + "step": 830, + "text_loss": 0.5585370063781738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.906369239800411, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009973782953694918, + "loss": 0.0746, + "macro_f1": 0.3006536066532135, + "num_tokens": 1344232.0, + "repeat_count": 1.0, + "routers_loss": 0.16080693900585175, + "skip_count": 3.0, + "step": 832, + "text_loss": 0.4782734513282776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9157616671558557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.000997346545558408, + "loss": 0.0522, + "macro_f1": 0.3333333432674408, + "num_tokens": 1347667.0, + "repeat_count": 0.0, + "routers_loss": 0.01173500344157219, + "skip_count": 0.0, + "step": 834, + "text_loss": 0.25036177039146423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009973146051631895, + "loss": 0.0522, + "macro_f1": 0.3333333432674408, + "num_tokens": 1350707.0, + "repeat_count": 0.0, + "routers_loss": 0.011477196589112282, + "skip_count": 0.0, + "step": 836, + "text_loss": 0.5482863187789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009972824741960764, + "loss": 0.0536, + "macro_f1": 0.3333333432674408, + "num_tokens": 1353704.0, + "repeat_count": 0.0, + "routers_loss": 0.010528896935284138, + "skip_count": 0.0, + "step": 838, + "text_loss": 0.6732596158981323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9439389492221895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1181640625, + "learning_rate": 0.000997250152669381, + "loss": 0.0573, + "macro_f1": 0.3333333432674408, + "num_tokens": 1356608.0, + "repeat_count": 0.0, + "routers_loss": 0.010678744874894619, + "skip_count": 0.0, + "step": 840, + "text_loss": 0.5479338765144348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9533313765776343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.181640625, + "learning_rate": 0.000997217640595489, + "loss": 0.0631, + "macro_f1": 0.3333333432674408, + "num_tokens": 1359809.0, + "repeat_count": 0.0, + "routers_loss": 0.00835978239774704, + "skip_count": 0.0, + "step": 842, + "text_loss": 0.42543259263038635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9627238039330788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009971849379868593, + "loss": 0.0653, + "macro_f1": 0.3333333432674408, + "num_tokens": 1362201.0, + "repeat_count": 0.0, + "routers_loss": 0.009930923581123352, + "skip_count": 0.0, + "step": 844, + "text_loss": 0.720462441444397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9721162312885236, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009971520448560235, + "loss": 0.0615, + "macro_f1": 0.3272727429866791, + "num_tokens": 1365790.0, + "repeat_count": 0.0, + "routers_loss": 0.06344373524188995, + "skip_count": 1.0, + "step": 846, + "text_loss": 0.8423607349395752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 3.9815086586439685, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.16796875, + "learning_rate": 0.000997118961215586, + "loss": 0.0674, + "macro_f1": 0.4533333480358124, + "num_tokens": 1368387.0, + "repeat_count": 1.0, + "routers_loss": 0.14688406884670258, + "skip_count": 3.0, + "step": 848, + "text_loss": 0.3933577537536621 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000997085687078225, + "loss": 0.0518, + "macro_f1": 0.3333333432674408, + "num_tokens": 1371189.0, + "repeat_count": 0.0, + "routers_loss": 0.009953443892300129, + "skip_count": 0.0, + "step": 850, + "text_loss": 0.41469162702560425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.0, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009970522224566909, + "loss": 0.0555, + "macro_f1": 0.32098767161369324, + "num_tokens": 1374008.0, + "repeat_count": 0.0, + "routers_loss": 0.048870690166950226, + "skip_count": 1.0, + "step": 852, + "text_loss": 0.613615870475769 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.009392427355444, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0009970185673638075, + "loss": 0.0629, + "macro_f1": 0.32098764181137085, + "num_tokens": 1376662.0, + "repeat_count": 1.0, + "routers_loss": 0.06865929812192917, + "skip_count": 1.0, + "step": 854, + "text_loss": 0.4392736256122589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 4.01878485471089, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.162109375, + "learning_rate": 0.0009969847218124716, + "loss": 0.0506, + "macro_f1": 0.5492662787437439, + "num_tokens": 1380049.0, + "repeat_count": 0.0, + "routers_loss": 0.02382219396531582, + "skip_count": 1.0, + "step": 856, + "text_loss": 0.19115346670150757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.028177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009969506858156527, + "loss": 0.0344, + "macro_f1": 0.3272727429866791, + "num_tokens": 1383008.0, + "repeat_count": 0.0, + "routers_loss": 0.03907281160354614, + "skip_count": 1.0, + "step": 858, + "text_loss": 0.34842637181282043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.037569709421779, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12060546875, + "learning_rate": 0.0009969164593863935, + "loss": 0.0365, + "macro_f1": 0.3333333432674408, + "num_tokens": 1387051.0, + "repeat_count": 0.0, + "routers_loss": 0.007645803038030863, + "skip_count": 0.0, + "step": 860, + "text_loss": 0.3810436725616455 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.046962136777223, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009968820425378098, + "loss": 0.0463, + "macro_f1": 0.3272727429866791, + "num_tokens": 1390244.0, + "repeat_count": 1.0, + "routers_loss": 0.04435238987207413, + "skip_count": 0.0, + "step": 862, + "text_loss": 0.34853485226631165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.056354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28515625, + "learning_rate": 0.00099684743528309, + "loss": 0.0424, + "macro_f1": 0.3333333432674408, + "num_tokens": 1392976.0, + "repeat_count": 0.0, + "routers_loss": 0.006071661598980427, + "skip_count": 0.0, + "step": 864, + "text_loss": 0.6395178437232971 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.065746991488113, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009968126376354958, + "loss": 0.0477, + "macro_f1": 0.5492662787437439, + "num_tokens": 1396061.0, + "repeat_count": 0.0, + "routers_loss": 0.05011235550045967, + "skip_count": 2.0, + "step": 866, + "text_loss": 0.09103966504335403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.075139418843557, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009967776496083616, + "loss": 0.0509, + "macro_f1": 0.3272727429866791, + "num_tokens": 1398993.0, + "repeat_count": 1.0, + "routers_loss": 0.03979124873876572, + "skip_count": 0.0, + "step": 868, + "text_loss": 0.27257058024406433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.084531846199002, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.14453125, + "learning_rate": 0.000996742471215095, + "loss": 0.0516, + "macro_f1": 0.5492662787437439, + "num_tokens": 1402080.0, + "repeat_count": 0.0, + "routers_loss": 0.030823837965726852, + "skip_count": 2.0, + "step": 870, + "text_loss": 0.7047103047370911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.093924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009967071024691763, + "loss": 0.0461, + "macro_f1": 0.3333333432674408, + "num_tokens": 1404890.0, + "repeat_count": 0.0, + "routers_loss": 0.009721715934574604, + "skip_count": 0.0, + "step": 872, + "text_loss": 0.959106981754303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.000996671543384159, + "loss": 0.05, + "macro_f1": 0.3333333432674408, + "num_tokens": 1407853.0, + "repeat_count": 0.0, + "routers_loss": 0.006025883834809065, + "skip_count": 0.0, + "step": 874, + "text_loss": 0.47571972012519836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.112709128265336, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.0009966357939736692, + "loss": 0.0416, + "macro_f1": 0.3272727429866791, + "num_tokens": 1410723.0, + "repeat_count": 0.0, + "routers_loss": 0.025964925065636635, + "skip_count": 0.0, + "step": 876, + "text_loss": 0.4964611530303955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.122101555620781, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009965998542514065, + "loss": 0.0415, + "macro_f1": 0.32098764181137085, + "num_tokens": 1414008.0, + "repeat_count": 0.0, + "routers_loss": 0.09509637206792831, + "skip_count": 2.0, + "step": 878, + "text_loss": 0.621494710445404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 4.131493982976226, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009965637242311427, + "loss": 0.0472, + "macro_f1": 0.542222261428833, + "num_tokens": 1417447.0, + "repeat_count": 0.0, + "routers_loss": 0.02520318515598774, + "skip_count": 4.0, + "step": 880, + "text_loss": 0.40209758281707764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 25.0, + "epoch": 4.14088641033167, + "f1_execute": 0.936170220375061, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.263671875, + "learning_rate": 0.000996527403926723, + "loss": 0.0495, + "macro_f1": 0.5342789888381958, + "num_tokens": 1419905.0, + "repeat_count": 0.0, + "routers_loss": 0.13183781504631042, + "skip_count": 6.0, + "step": 882, + "text_loss": 0.642185389995575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.1502788376871145, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009964908933520655, + "loss": 0.0375, + "macro_f1": 0.3333333432674408, + "num_tokens": 1423436.0, + "repeat_count": 0.0, + "routers_loss": 0.009429510682821274, + "skip_count": 0.0, + "step": 884, + "text_loss": 0.48232755064964294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.15967126504256, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1669921875, + "learning_rate": 0.0009964541925211613, + "loss": 0.0349, + "macro_f1": 0.32098764181137085, + "num_tokens": 1426842.0, + "repeat_count": 0.0, + "routers_loss": 0.07629609107971191, + "skip_count": 2.0, + "step": 886, + "text_loss": 0.16620934009552002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.169063692398004, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009964173014480738, + "loss": 0.0348, + "macro_f1": 0.5492662787437439, + "num_tokens": 1430430.0, + "repeat_count": 0.0, + "routers_loss": 0.036814019083976746, + "skip_count": 2.0, + "step": 888, + "text_loss": 0.4866008758544922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009963802201469398, + "loss": 0.0476, + "macro_f1": 0.3333333432674408, + "num_tokens": 1433821.0, + "repeat_count": 0.0, + "routers_loss": 0.0041250260546803474, + "skip_count": 0.0, + "step": 890, + "text_loss": 0.578216552734375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.187848547108893, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2373046875, + "learning_rate": 0.0009963429486319693, + "loss": 0.0463, + "macro_f1": 0.32098764181137085, + "num_tokens": 1436976.0, + "repeat_count": 0.0, + "routers_loss": 0.06213559955358505, + "skip_count": 2.0, + "step": 892, + "text_loss": 0.221701517701149 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 4.197240974464338, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.361328125, + "learning_rate": 0.0009963054869174446, + "loss": 0.0313, + "macro_f1": 0.4871794879436493, + "num_tokens": 1440397.0, + "repeat_count": 0.0, + "routers_loss": 0.07532428950071335, + "skip_count": 2.0, + "step": 894, + "text_loss": 0.6922838091850281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.206633401819783, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009962678350177209, + "loss": 0.0472, + "macro_f1": 0.3272727429866791, + "num_tokens": 1443604.0, + "repeat_count": 0.0, + "routers_loss": 0.0419243648648262, + "skip_count": 1.0, + "step": 896, + "text_loss": 0.22092342376708984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.216025829175227, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009962299929472268, + "loss": 0.034, + "macro_f1": 0.32098764181137085, + "num_tokens": 1446257.0, + "repeat_count": 2.0, + "routers_loss": 0.10849297791719437, + "skip_count": 0.0, + "step": 898, + "text_loss": 0.26394811272621155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.000996191960720463, + "loss": 0.0394, + "macro_f1": 0.3333333432674408, + "num_tokens": 1449669.0, + "repeat_count": 0.0, + "routers_loss": 0.0092767970636487, + "skip_count": 0.0, + "step": 900, + "text_loss": 0.5338577628135681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.234810683886117, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009961537383520042, + "loss": 0.0354, + "macro_f1": 0.3272727429866791, + "num_tokens": 1452450.0, + "repeat_count": 1.0, + "routers_loss": 0.02985367365181446, + "skip_count": 0.0, + "step": 902, + "text_loss": 0.5875228047370911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.2442031112415615, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009961153258564966, + "loss": 0.0378, + "macro_f1": 0.3144654333591461, + "num_tokens": 1456909.0, + "repeat_count": 0.0, + "routers_loss": 0.06794842332601547, + "skip_count": 3.0, + "step": 904, + "text_loss": 0.40959444642066956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.253595538597006, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009960767232486604, + "loss": 0.0476, + "macro_f1": 0.3333333432674408, + "num_tokens": 1461712.0, + "repeat_count": 0.0, + "routers_loss": 0.0023562447167932987, + "skip_count": 0.0, + "step": 906, + "text_loss": 0.3932875096797943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.000996037930543288, + "loss": 0.0505, + "macro_f1": 0.3272727429866791, + "num_tokens": 1464817.0, + "repeat_count": 0.0, + "routers_loss": 0.03880339860916138, + "skip_count": 1.0, + "step": 908, + "text_loss": 0.17482402920722961 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.272380393307896, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2119140625, + "learning_rate": 0.000995998947755245, + "loss": 0.0479, + "macro_f1": 0.3272727429866791, + "num_tokens": 1467810.0, + "repeat_count": 0.0, + "routers_loss": 0.01736828312277794, + "skip_count": 1.0, + "step": 910, + "text_loss": 0.4140470325946808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009959597748994695, + "loss": 0.0752, + "macro_f1": 0.3333333432674408, + "num_tokens": 1470802.0, + "repeat_count": 0.0, + "routers_loss": 0.011824851855635643, + "skip_count": 0.0, + "step": 912, + "text_loss": 0.7153383493423462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.2911652480187845, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009959204119909726, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1474539.0, + "repeat_count": 0.0, + "routers_loss": 0.025456594303250313, + "skip_count": 0.0, + "step": 914, + "text_loss": 0.42812058329582214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009958808590448385, + "loss": 0.0489, + "macro_f1": 0.3333333432674408, + "num_tokens": 1477552.0, + "repeat_count": 0.0, + "routers_loss": 0.006795851048082113, + "skip_count": 0.0, + "step": 916, + "text_loss": 0.5402814149856567 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009958411160762234, + "loss": 0.039, + "macro_f1": 0.3333333432674408, + "num_tokens": 1482547.0, + "repeat_count": 0.0, + "routers_loss": 0.015615932643413544, + "skip_count": 0.0, + "step": 918, + "text_loss": 0.3836168050765991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.319342530085119, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009958011831003577, + "loss": 0.0448, + "macro_f1": 0.3272727429866791, + "num_tokens": 1485807.0, + "repeat_count": 0.0, + "routers_loss": 0.043541423976421356, + "skip_count": 1.0, + "step": 920, + "text_loss": 0.4333936274051666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 4.328734957440563, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1337890625, + "learning_rate": 0.000995761060132543, + "loss": 0.0418, + "macro_f1": 0.6538461446762085, + "num_tokens": 1488941.0, + "repeat_count": 1.0, + "routers_loss": 0.05866432189941406, + "skip_count": 2.0, + "step": 922, + "text_loss": 0.4106994867324829 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.3381273847960085, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009957207471881552, + "loss": 0.0531, + "macro_f1": 0.5492662787437439, + "num_tokens": 1492026.0, + "repeat_count": 0.0, + "routers_loss": 0.02714901603758335, + "skip_count": 2.0, + "step": 924, + "text_loss": 0.542091429233551 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.347519812151453, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.0009956802442826415, + "loss": 0.0386, + "macro_f1": 0.3272727429866791, + "num_tokens": 1494543.0, + "repeat_count": 1.0, + "routers_loss": 0.0563737191259861, + "skip_count": 0.0, + "step": 926, + "text_loss": 0.47209203243255615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.356912239506897, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009956395514315235, + "loss": 0.0496, + "macro_f1": 0.3272727429866791, + "num_tokens": 1497831.0, + "repeat_count": 1.0, + "routers_loss": 0.03285066783428192, + "skip_count": 0.0, + "step": 928, + "text_loss": 0.6628931164741516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.366304666862343, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009955986686503943, + "loss": 0.0466, + "macro_f1": 0.3272727429866791, + "num_tokens": 1501375.0, + "repeat_count": 0.0, + "routers_loss": 0.024297121912240982, + "skip_count": 1.0, + "step": 930, + "text_loss": 0.495676189661026 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 4.375697094217787, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009955575959549202, + "loss": 0.0424, + "macro_f1": 0.7795917987823486, + "num_tokens": 1504363.0, + "repeat_count": 1.0, + "routers_loss": 0.12196464836597443, + "skip_count": 4.0, + "step": 932, + "text_loss": 0.26123273372650146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.0009955163333608408, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 1507178.0, + "repeat_count": 0.0, + "routers_loss": 0.012947078794240952, + "skip_count": 0.0, + "step": 934, + "text_loss": 0.32552677392959595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.394481948928676, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009954748808839674, + "loss": 0.0379, + "macro_f1": 0.3333333432674408, + "num_tokens": 1509910.0, + "repeat_count": 0.0, + "routers_loss": 0.008946365676820278, + "skip_count": 0.0, + "step": 936, + "text_loss": 0.533141016960144 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.403874376284121, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000995433238540185, + "loss": 0.0466, + "macro_f1": 0.6538461446762085, + "num_tokens": 1512826.0, + "repeat_count": 1.0, + "routers_loss": 0.029975678771734238, + "skip_count": 1.0, + "step": 938, + "text_loss": 0.2953577935695648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.413266803639566, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009953914063454512, + "loss": 0.0497, + "macro_f1": 0.3144654333591461, + "num_tokens": 1517230.0, + "repeat_count": 1.0, + "routers_loss": 0.0889134630560875, + "skip_count": 2.0, + "step": 940, + "text_loss": 0.5368834733963013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.000995349384315796, + "loss": 0.0413, + "macro_f1": 0.3333333432674408, + "num_tokens": 1519876.0, + "repeat_count": 0.0, + "routers_loss": 0.013458753935992718, + "skip_count": 0.0, + "step": 942, + "text_loss": 0.2005518227815628 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.432051658350455, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.000995307172467322, + "loss": 0.0444, + "macro_f1": 0.31446540355682373, + "num_tokens": 1522998.0, + "repeat_count": 1.0, + "routers_loss": 0.08850377053022385, + "skip_count": 1.0, + "step": 944, + "text_loss": 0.227926567196846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.4414440857059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009952647708162054, + "loss": 0.0503, + "macro_f1": 0.3272727429866791, + "num_tokens": 1527100.0, + "repeat_count": 0.0, + "routers_loss": 0.03199794515967369, + "skip_count": 1.0, + "step": 946, + "text_loss": 0.4859686493873596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.450836513061344, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009952221793786942, + "loss": 0.0354, + "macro_f1": 0.3333333432674408, + "num_tokens": 1530028.0, + "repeat_count": 0.0, + "routers_loss": 0.006507779937237501, + "skip_count": 0.0, + "step": 948, + "text_loss": 0.6855354905128479 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.460228940416789, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009951793981711097, + "loss": 0.0584, + "macro_f1": 0.6538461446762085, + "num_tokens": 1533254.0, + "repeat_count": 1.0, + "routers_loss": 0.06175103038549423, + "skip_count": 1.0, + "step": 950, + "text_loss": 0.7590400576591492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.469621367772234, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009951364272098458, + "loss": 0.0295, + "macro_f1": 0.5492662787437439, + "num_tokens": 1536239.0, + "repeat_count": 0.0, + "routers_loss": 0.03773383051156998, + "skip_count": 2.0, + "step": 952, + "text_loss": 0.669784665107727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.4790137951276785, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009950932665113688, + "loss": 0.0507, + "macro_f1": 0.32098764181137085, + "num_tokens": 1539682.0, + "repeat_count": 0.0, + "routers_loss": 0.07280613481998444, + "skip_count": 2.0, + "step": 954, + "text_loss": 0.3365570902824402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009950499160922184, + "loss": 0.0541, + "macro_f1": 0.3333333432674408, + "num_tokens": 1542875.0, + "repeat_count": 0.0, + "routers_loss": 0.01770266517996788, + "skip_count": 0.0, + "step": 956, + "text_loss": 0.0921545997262001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.497798649838567, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09375, + "learning_rate": 0.000995006375969006, + "loss": 0.0473, + "macro_f1": 0.3272727429866791, + "num_tokens": 1547135.0, + "repeat_count": 1.0, + "routers_loss": 0.07672002166509628, + "skip_count": 0.0, + "step": 958, + "text_loss": 0.5887606739997864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.507191077194013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009949626461584165, + "loss": 0.043, + "macro_f1": 0.3333333432674408, + "num_tokens": 1550100.0, + "repeat_count": 0.0, + "routers_loss": 0.006247182376682758, + "skip_count": 0.0, + "step": 960, + "text_loss": 0.5777931213378906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.119140625, + "learning_rate": 0.0009949187266772076, + "loss": 0.0366, + "macro_f1": 0.5492662787437439, + "num_tokens": 1553192.0, + "repeat_count": 0.0, + "routers_loss": 0.030319908633828163, + "skip_count": 2.0, + "step": 962, + "text_loss": 0.2370252162218094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.5259759319049016, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009948746175422088, + "loss": 0.0511, + "macro_f1": 0.3333333432674408, + "num_tokens": 1556318.0, + "repeat_count": 0.0, + "routers_loss": 0.006004320923238993, + "skip_count": 0.0, + "step": 964, + "text_loss": 0.6271032094955444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000994830318770323, + "loss": 0.0514, + "macro_f1": 0.3333333432674408, + "num_tokens": 1559195.0, + "repeat_count": 0.0, + "routers_loss": 0.011544366367161274, + "skip_count": 0.0, + "step": 966, + "text_loss": 0.47256720066070557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 4.544760786615791, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009947858303785255, + "loss": 0.0374, + "macro_f1": 0.6603773832321167, + "num_tokens": 1561813.0, + "repeat_count": 1.0, + "routers_loss": 0.05258861929178238, + "skip_count": 1.0, + "step": 968, + "text_loss": 0.7703132629394531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.554153213971236, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.0009947411523838648, + "loss": 0.0453, + "macro_f1": 0.3333333432674408, + "num_tokens": 1564634.0, + "repeat_count": 0.0, + "routers_loss": 0.011216280050575733, + "skip_count": 0.0, + "step": 970, + "text_loss": 0.4666804075241089 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009946962848034608, + "loss": 0.0696, + "macro_f1": 0.3333333432674408, + "num_tokens": 1567959.0, + "repeat_count": 0.0, + "routers_loss": 0.009387624450027943, + "skip_count": 0.0, + "step": 972, + "text_loss": 0.4067264199256897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.5729380686821255, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.0009946512276545075, + "loss": 0.0397, + "macro_f1": 0.3272727429866791, + "num_tokens": 1571221.0, + "repeat_count": 1.0, + "routers_loss": 0.041713520884513855, + "skip_count": 0.0, + "step": 974, + "text_loss": 0.5242366194725037 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 4.58233049603757, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.228515625, + "learning_rate": 0.0009946059809542705, + "loss": 0.0487, + "macro_f1": 0.7644445300102234, + "num_tokens": 1575033.0, + "repeat_count": 2.0, + "routers_loss": 0.05748331546783447, + "skip_count": 2.0, + "step": 976, + "text_loss": 0.5704690217971802 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 4.591722923393014, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009945605447200887, + "loss": 0.0445, + "macro_f1": 0.3272727429866791, + "num_tokens": 1579050.0, + "repeat_count": 0.0, + "routers_loss": 0.016765203326940536, + "skip_count": 0.0, + "step": 978, + "text_loss": 0.4804173707962036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.601115350748459, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009945149189693732, + "loss": 0.0406, + "macro_f1": 0.5492662787437439, + "num_tokens": 1582967.0, + "repeat_count": 0.0, + "routers_loss": 0.021518222987651825, + "skip_count": 2.0, + "step": 980, + "text_loss": 0.4138598144054413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.610507778103904, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009944691037196078, + "loss": 0.0456, + "macro_f1": 0.3333333432674408, + "num_tokens": 1586282.0, + "repeat_count": 0.0, + "routers_loss": 0.012246460653841496, + "skip_count": 0.0, + "step": 982, + "text_loss": 0.22561736404895782 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 4.6199002054593485, + "f1_execute": 0.930232584476471, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.8000000715255737, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009944230989883491, + "loss": 0.0456, + "macro_f1": 0.7989664077758789, + "num_tokens": 1589279.0, + "repeat_count": 2.0, + "routers_loss": 0.09344895929098129, + "skip_count": 5.0, + "step": 984, + "text_loss": 0.4416656494140625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.629292632814793, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.111328125, + "learning_rate": 0.0009943769047932264, + "loss": 0.0404, + "macro_f1": 0.5359477400779724, + "num_tokens": 1592398.0, + "repeat_count": 2.0, + "routers_loss": 0.08916857838630676, + "skip_count": 2.0, + "step": 986, + "text_loss": 0.5536438822746277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.638685060170237, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000994330521151941, + "loss": 0.039, + "macro_f1": 0.32098764181137085, + "num_tokens": 1596213.0, + "repeat_count": 1.0, + "routers_loss": 0.06114347651600838, + "skip_count": 1.0, + "step": 988, + "text_loss": 0.5835405588150024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.000994283948082267, + "loss": 0.0573, + "macro_f1": 0.3333333432674408, + "num_tokens": 1598827.0, + "repeat_count": 0.0, + "routers_loss": 0.0017335431184619665, + "skip_count": 0.0, + "step": 990, + "text_loss": 0.5857380032539368 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.657469914881127, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009942371856020522, + "loss": 0.0341, + "macro_f1": 0.3333333432674408, + "num_tokens": 1602915.0, + "repeat_count": 0.0, + "routers_loss": 0.014606470242142677, + "skip_count": 0.0, + "step": 992, + "text_loss": 0.6939892768859863 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 4.666862342236572, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009941902337292155, + "loss": 0.06, + "macro_f1": 0.6598639488220215, + "num_tokens": 1605776.0, + "repeat_count": 3.0, + "routers_loss": 0.06297315657138824, + "skip_count": 1.0, + "step": 994, + "text_loss": 0.37616831064224243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.676254769592017, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009941430924817487, + "loss": 0.0572, + "macro_f1": 0.5492662787437439, + "num_tokens": 1609856.0, + "repeat_count": 0.0, + "routers_loss": 0.03297794610261917, + "skip_count": 2.0, + "step": 996, + "text_loss": 0.2098303586244583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.685647196947461, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.000994095761877717, + "loss": 0.0499, + "macro_f1": 0.3333333432674408, + "num_tokens": 1612904.0, + "repeat_count": 0.0, + "routers_loss": 0.012901155278086662, + "skip_count": 0.0, + "step": 998, + "text_loss": 0.20103533565998077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.259765625, + "learning_rate": 0.000994048241935257, + "loss": 0.0535, + "macro_f1": 0.3272727429866791, + "num_tokens": 1615540.0, + "repeat_count": 0.0, + "routers_loss": 0.020434845238924026, + "skip_count": 0.0, + "step": 1000, + "text_loss": 0.32709044218063354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.70443205165835, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1669921875, + "learning_rate": 0.0009940005326725789, + "loss": 0.0453, + "macro_f1": 0.32098764181137085, + "num_tokens": 1618786.0, + "repeat_count": 0.0, + "routers_loss": 0.07831378281116486, + "skip_count": 2.0, + "step": 1002, + "text_loss": 0.5789632797241211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21875, + "learning_rate": 0.0009939526341079647, + "loss": 0.0511, + "macro_f1": 0.32098764181137085, + "num_tokens": 1621736.0, + "repeat_count": 2.0, + "routers_loss": 0.04863874986767769, + "skip_count": 0.0, + "step": 1004, + "text_loss": 0.6128849387168884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009939045462597693, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 1624649.0, + "repeat_count": 0.0, + "routers_loss": 0.00677989237010479, + "skip_count": 0.0, + "step": 1006, + "text_loss": 0.6168264150619507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.732609333724684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009938562691464202, + "loss": 0.0524, + "macro_f1": 0.3333333432674408, + "num_tokens": 1627700.0, + "repeat_count": 0.0, + "routers_loss": 0.019490402191877365, + "skip_count": 0.0, + "step": 1008, + "text_loss": 0.17463822662830353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.000993807802786417, + "loss": 0.0475, + "macro_f1": 0.3333333432674408, + "num_tokens": 1630714.0, + "repeat_count": 0.0, + "routers_loss": 0.0019022391643375158, + "skip_count": 0.0, + "step": 1010, + "text_loss": 0.5675593018531799 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 4.751394188435574, + "f1_execute": 0.9599999785423279, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1640625, + "learning_rate": 0.0009937591471983322, + "loss": 0.0501, + "macro_f1": 0.7644444704055786, + "num_tokens": 1633770.0, + "repeat_count": 1.0, + "routers_loss": 0.042485643178224564, + "skip_count": 2.0, + "step": 1012, + "text_loss": 0.42387229204177856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.760786615791019, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009937103024008109, + "loss": 0.0545, + "macro_f1": 0.3272727429866791, + "num_tokens": 1637120.0, + "repeat_count": 0.0, + "routers_loss": 0.09427817165851593, + "skip_count": 1.0, + "step": 1014, + "text_loss": 0.49511051177978516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009936612684125702, + "loss": 0.0503, + "macro_f1": 0.3333333432674408, + "num_tokens": 1640165.0, + "repeat_count": 0.0, + "routers_loss": 0.005106127820909023, + "skip_count": 0.0, + "step": 1016, + "text_loss": 0.5398799180984497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.7795714705019074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2734375, + "learning_rate": 0.0009936120452524004, + "loss": 0.0506, + "macro_f1": 0.3333333432674408, + "num_tokens": 1643251.0, + "repeat_count": 0.0, + "routers_loss": 0.016914300620555878, + "skip_count": 0.0, + "step": 1018, + "text_loss": 0.20882178843021393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.788963897857353, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009935626329391637, + "loss": 0.0537, + "macro_f1": 0.32098764181137085, + "num_tokens": 1646560.0, + "repeat_count": 0.0, + "routers_loss": 0.13481520116329193, + "skip_count": 2.0, + "step": 1020, + "text_loss": 0.5719883441925049 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.798356325212797, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009935130314917948, + "loss": 0.0602, + "macro_f1": 0.5492662787437439, + "num_tokens": 1649538.0, + "repeat_count": 0.0, + "routers_loss": 0.07700438797473907, + "skip_count": 2.0, + "step": 1022, + "text_loss": 0.1303367167711258 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.807748752568242, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009934632409293015, + "loss": 0.0611, + "macro_f1": 0.32098764181137085, + "num_tokens": 1652397.0, + "repeat_count": 1.0, + "routers_loss": 0.11416907608509064, + "skip_count": 1.0, + "step": 1024, + "text_loss": 0.24076920747756958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.817141179923686, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.0009934132612707631, + "loss": 0.0507, + "macro_f1": 0.31446540355682373, + "num_tokens": 1654938.0, + "repeat_count": 0.0, + "routers_loss": 0.09484589844942093, + "skip_count": 2.0, + "step": 1026, + "text_loss": 0.1652517318725586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009933630925353324, + "loss": 0.0395, + "macro_f1": 0.3333333432674408, + "num_tokens": 1658536.0, + "repeat_count": 0.0, + "routers_loss": 0.00741987070068717, + "skip_count": 0.0, + "step": 1028, + "text_loss": 0.49296700954437256 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.835926034634576, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1845703125, + "learning_rate": 0.0009933127347422337, + "loss": 0.0602, + "macro_f1": 0.32098764181137085, + "num_tokens": 1661446.0, + "repeat_count": 0.0, + "routers_loss": 0.08399344235658646, + "skip_count": 2.0, + "step": 1030, + "text_loss": 0.22363591194152832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.158203125, + "learning_rate": 0.0009932621879107648, + "loss": 0.0475, + "macro_f1": 0.3333333432674408, + "num_tokens": 1664612.0, + "repeat_count": 0.0, + "routers_loss": 0.0031781597062945366, + "skip_count": 0.0, + "step": 1032, + "text_loss": 0.36083245277404785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.854710889345466, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.000993211452060295, + "loss": 0.042, + "macro_f1": 0.3272727429866791, + "num_tokens": 1667467.0, + "repeat_count": 0.0, + "routers_loss": 0.03595469892024994, + "skip_count": 1.0, + "step": 1034, + "text_loss": 0.16372856497764587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.86410331670091, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000993160527210266, + "loss": 0.061, + "macro_f1": 0.3144654333591461, + "num_tokens": 1670675.0, + "repeat_count": 3.0, + "routers_loss": 0.1597205102443695, + "skip_count": 0.0, + "step": 1036, + "text_loss": 0.6049913763999939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2197265625, + "learning_rate": 0.000993109413380193, + "loss": 0.0562, + "macro_f1": 0.3333333432674408, + "num_tokens": 1673477.0, + "repeat_count": 0.0, + "routers_loss": 0.009756010957062244, + "skip_count": 0.0, + "step": 1038, + "text_loss": 0.7034620642662048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.882888171411799, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.0009930581105896624, + "loss": 0.0559, + "macro_f1": 0.3272727429866791, + "num_tokens": 1676809.0, + "repeat_count": 0.0, + "routers_loss": 0.020718922838568687, + "skip_count": 0.0, + "step": 1040, + "text_loss": 0.2814720571041107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.892280598767244, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009930066188583338, + "loss": 0.0445, + "macro_f1": 0.32098764181137085, + "num_tokens": 1679398.0, + "repeat_count": 1.0, + "routers_loss": 0.04755603149533272, + "skip_count": 1.0, + "step": 1042, + "text_loss": 0.5445759296417236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.0009929549382059388, + "loss": 0.0509, + "macro_f1": 0.3333333432674408, + "num_tokens": 1682269.0, + "repeat_count": 0.0, + "routers_loss": 0.01040949858725071, + "skip_count": 0.0, + "step": 1044, + "text_loss": 0.2876914143562317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.911065453478133, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009929030686522816, + "loss": 0.0363, + "macro_f1": 0.3333333432674408, + "num_tokens": 1685428.0, + "repeat_count": 0.0, + "routers_loss": 0.008158888667821884, + "skip_count": 0.0, + "step": 1046, + "text_loss": 0.49053525924682617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.9204578808335775, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009928510102172386, + "loss": 0.0498, + "macro_f1": 0.3333333432674408, + "num_tokens": 1688252.0, + "repeat_count": 0.0, + "routers_loss": 0.005102572031319141, + "skip_count": 0.0, + "step": 1048, + "text_loss": 0.5274341106414795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009927987629207587, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1691289.0, + "repeat_count": 0.0, + "routers_loss": 0.016768503934144974, + "skip_count": 0.0, + "step": 1050, + "text_loss": 0.9935035109519958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.939242735544467, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009927463267828634, + "loss": 0.0488, + "macro_f1": 0.3333333432674408, + "num_tokens": 1694148.0, + "repeat_count": 0.0, + "routers_loss": 0.010905829258263111, + "skip_count": 0.0, + "step": 1052, + "text_loss": 0.20895758271217346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.948635162899912, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.000992693701823646, + "loss": 0.0624, + "macro_f1": 0.3272727429866791, + "num_tokens": 1698543.0, + "repeat_count": 1.0, + "routers_loss": 0.10533971339464188, + "skip_count": 0.0, + "step": 1054, + "text_loss": 0.5776236653327942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.958027590255357, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009926408880632726, + "loss": 0.0556, + "macro_f1": 0.3272727429866791, + "num_tokens": 1702460.0, + "repeat_count": 0.0, + "routers_loss": 0.026313411071896553, + "skip_count": 1.0, + "step": 1056, + "text_loss": 0.34990596771240234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.967420017610801, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009925878855219818, + "loss": 0.0391, + "macro_f1": 0.3333333432674408, + "num_tokens": 1705686.0, + "repeat_count": 0.0, + "routers_loss": 0.007763393223285675, + "skip_count": 0.0, + "step": 1058, + "text_loss": 0.4980163276195526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.177734375, + "learning_rate": 0.000992534694220084, + "loss": 0.0613, + "macro_f1": 0.3272727429866791, + "num_tokens": 1708739.0, + "repeat_count": 0.0, + "routers_loss": 0.03998444974422455, + "skip_count": 1.0, + "step": 1060, + "text_loss": 0.29092350602149963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.98620487232169, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.000992481314177962, + "loss": 0.0312, + "macro_f1": 0.32098764181137085, + "num_tokens": 1711903.0, + "repeat_count": 1.0, + "routers_loss": 0.06966045498847961, + "skip_count": 1.0, + "step": 1062, + "text_loss": 0.6267179250717163 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.995597299677136, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.244140625, + "learning_rate": 0.0009924277454160717, + "loss": 0.0548, + "macro_f1": 0.3272727429866791, + "num_tokens": 1715974.0, + "repeat_count": 0.0, + "routers_loss": 0.05536063387989998, + "skip_count": 1.0, + "step": 1064, + "text_loss": 0.5813798904418945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.004696213677723, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009923739879549402, + "loss": 0.0423, + "macro_f1": 0.3333333432674408, + "num_tokens": 1718828.0, + "repeat_count": 0.0, + "routers_loss": 0.020993782207369804, + "skip_count": 0.0, + "step": 1066, + "text_loss": 0.22665327787399292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009923200418151677, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 1722419.0, + "repeat_count": 0.0, + "routers_loss": 0.007351701147854328, + "skip_count": 0.0, + "step": 1068, + "text_loss": 0.5796169638633728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.0234810683886115, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009922659070174264, + "loss": 0.0452, + "macro_f1": 0.3272727429866791, + "num_tokens": 1725663.0, + "repeat_count": 1.0, + "routers_loss": 0.026033315807580948, + "skip_count": 0.0, + "step": 1070, + "text_loss": 0.25742828845977783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009922115835824612, + "loss": 0.041, + "macro_f1": 0.3333333432674408, + "num_tokens": 1729239.0, + "repeat_count": 0.0, + "routers_loss": 0.0118600158020854, + "skip_count": 0.0, + "step": 1072, + "text_loss": 0.21630282700061798 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009921570715310884, + "loss": 0.0364, + "macro_f1": 0.6666666865348816, + "num_tokens": 1732507.0, + "repeat_count": 1.0, + "routers_loss": 0.016118815168738365, + "skip_count": 0.0, + "step": 1074, + "text_loss": 0.5639925003051758 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.051658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009921023708841974, + "loss": 0.0407, + "macro_f1": 0.3333333432674408, + "num_tokens": 1736182.0, + "repeat_count": 0.0, + "routers_loss": 0.004275390412658453, + "skip_count": 0.0, + "step": 1076, + "text_loss": 0.5758615136146545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009920474816627496, + "loss": 0.037, + "macro_f1": 0.3333333432674408, + "num_tokens": 1739559.0, + "repeat_count": 0.0, + "routers_loss": 0.01299292128533125, + "skip_count": 0.0, + "step": 1078, + "text_loss": 0.18221625685691833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.0704432051658355, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009919924038877788, + "loss": 0.0343, + "macro_f1": 0.32098764181137085, + "num_tokens": 1742890.0, + "repeat_count": 0.0, + "routers_loss": 0.038295745849609375, + "skip_count": 2.0, + "step": 1080, + "text_loss": 0.17354349792003632 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 5.07983563252128, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009919371375803905, + "loss": 0.0455, + "macro_f1": 0.8194444179534912, + "num_tokens": 1746433.0, + "repeat_count": 2.0, + "routers_loss": 0.04052971675992012, + "skip_count": 3.0, + "step": 1082, + "text_loss": 0.2250112146139145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009918816827617632, + "loss": 0.0353, + "macro_f1": 0.3333333432674408, + "num_tokens": 1750802.0, + "repeat_count": 0.0, + "routers_loss": 0.009114136919379234, + "skip_count": 0.0, + "step": 1084, + "text_loss": 0.2526719272136688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.098620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000991826039453147, + "loss": 0.0392, + "macro_f1": 0.3333333432674408, + "num_tokens": 1754272.0, + "repeat_count": 0.0, + "routers_loss": 0.004904678091406822, + "skip_count": 0.0, + "step": 1086, + "text_loss": 0.7308789491653442 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 5.108012914587614, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.138671875, + "learning_rate": 0.000991770207675865, + "loss": 0.0327, + "macro_f1": 0.6666666865348816, + "num_tokens": 1757231.0, + "repeat_count": 0.0, + "routers_loss": 0.02129189297556877, + "skip_count": 2.0, + "step": 1088, + "text_loss": 0.21764220297336578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.1174053419430585, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009917141874513113, + "loss": 0.0315, + "macro_f1": 0.3333333432674408, + "num_tokens": 1760003.0, + "repeat_count": 0.0, + "routers_loss": 0.01310618408024311, + "skip_count": 0.0, + "step": 1090, + "text_loss": 0.33892181515693665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.126797769298503, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.171875, + "learning_rate": 0.0009916579788009537, + "loss": 0.0457, + "macro_f1": 0.5492662787437439, + "num_tokens": 1763052.0, + "repeat_count": 0.0, + "routers_loss": 0.02059309557080269, + "skip_count": 2.0, + "step": 1092, + "text_loss": 0.6551769375801086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.136190196653947, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10546875, + "learning_rate": 0.0009916015817463312, + "loss": 0.0385, + "macro_f1": 0.5492662787437439, + "num_tokens": 1766655.0, + "repeat_count": 0.0, + "routers_loss": 0.0274797435849905, + "skip_count": 2.0, + "step": 1094, + "text_loss": 0.3984372019767761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.000991544996309055, + "loss": 0.0271, + "macro_f1": 0.3333333432674408, + "num_tokens": 1769997.0, + "repeat_count": 0.0, + "routers_loss": 0.01437368243932724, + "skip_count": 0.0, + "step": 1096, + "text_loss": 0.4203338921070099 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.154975051364837, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.000991488222510809, + "loss": 0.0292, + "macro_f1": 0.3333333432674408, + "num_tokens": 1773130.0, + "repeat_count": 0.0, + "routers_loss": 0.001382062560878694, + "skip_count": 0.0, + "step": 1098, + "text_loss": 0.43132516741752625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.164367478720282, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.123046875, + "learning_rate": 0.000991431260373349, + "loss": 0.0329, + "macro_f1": 0.3144654333591461, + "num_tokens": 1775682.0, + "repeat_count": 1.0, + "routers_loss": 0.1115434318780899, + "skip_count": 2.0, + "step": 1100, + "text_loss": 0.3218227028846741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.111328125, + "learning_rate": 0.000991374109918503, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 1778407.0, + "repeat_count": 0.0, + "routers_loss": 0.009529678151011467, + "skip_count": 0.0, + "step": 1102, + "text_loss": 0.17183731496334076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.183152333431171, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1142578125, + "learning_rate": 0.000991316771168171, + "loss": 0.044, + "macro_f1": 0.5492662787437439, + "num_tokens": 1781518.0, + "repeat_count": 0.0, + "routers_loss": 0.018668074160814285, + "skip_count": 2.0, + "step": 1104, + "text_loss": 1.1324785947799683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.192544760786616, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.125, + "learning_rate": 0.0009912592441443258, + "loss": 0.0411, + "macro_f1": 0.3272727429866791, + "num_tokens": 1784878.0, + "repeat_count": 0.0, + "routers_loss": 0.04145100712776184, + "skip_count": 1.0, + "step": 1106, + "text_loss": 0.6082063317298889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.20193718814206, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009912015288690112, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1788978.0, + "repeat_count": 0.0, + "routers_loss": 0.021450644358992577, + "skip_count": 1.0, + "step": 1108, + "text_loss": 0.5597621202468872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.2113296154975055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009911436253643444, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 1792321.0, + "repeat_count": 0.0, + "routers_loss": 0.017405325546860695, + "skip_count": 0.0, + "step": 1110, + "text_loss": 0.2560598850250244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.0009910855336525137, + "loss": 0.0383, + "macro_f1": 0.3333333432674408, + "num_tokens": 1795182.0, + "repeat_count": 0.0, + "routers_loss": 0.007162237539887428, + "skip_count": 0.0, + "step": 1112, + "text_loss": 0.3438240587711334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.230114470208394, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.115234375, + "learning_rate": 0.00099102725375578, + "loss": 0.0326, + "macro_f1": 0.480392187833786, + "num_tokens": 1798987.0, + "repeat_count": 1.0, + "routers_loss": 0.11149197816848755, + "skip_count": 3.0, + "step": 1114, + "text_loss": 0.20455503463745117 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.239506897563839, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009909687856964767, + "loss": 0.035, + "macro_f1": 0.3006536364555359, + "num_tokens": 1802064.0, + "repeat_count": 2.0, + "routers_loss": 0.12679415941238403, + "skip_count": 3.0, + "step": 1116, + "text_loss": 0.11996729671955109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.248899324919284, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009909101294970082, + "loss": 0.0365, + "macro_f1": 0.5492662787437439, + "num_tokens": 1805412.0, + "repeat_count": 0.0, + "routers_loss": 0.05108053982257843, + "skip_count": 2.0, + "step": 1118, + "text_loss": 0.13224145770072937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.258291752274729, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.123046875, + "learning_rate": 0.0009908512851798522, + "loss": 0.0455, + "macro_f1": 0.6603773832321167, + "num_tokens": 1808196.0, + "repeat_count": 1.0, + "routers_loss": 0.02131766639649868, + "skip_count": 1.0, + "step": 1120, + "text_loss": 0.7824069261550903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.0009907922527675576, + "loss": 0.0405, + "macro_f1": 0.3333333432674408, + "num_tokens": 1811622.0, + "repeat_count": 0.0, + "routers_loss": 0.006226244382560253, + "skip_count": 0.0, + "step": 1122, + "text_loss": 0.5419743061065674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.277076606985618, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12890625, + "learning_rate": 0.000990733032282746, + "loss": 0.0535, + "macro_f1": 0.5492662787437439, + "num_tokens": 1814628.0, + "repeat_count": 0.0, + "routers_loss": 0.03088250942528248, + "skip_count": 2.0, + "step": 1124, + "text_loss": 0.37100958824157715 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.286469034341063, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.000990673623748111, + "loss": 0.0348, + "macro_f1": 0.32098767161369324, + "num_tokens": 1817205.0, + "repeat_count": 0.0, + "routers_loss": 0.05495348572731018, + "skip_count": 1.0, + "step": 1126, + "text_loss": 0.20241330564022064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.295861461696507, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009906140271864173, + "loss": 0.0433, + "macro_f1": 0.4871794879436493, + "num_tokens": 1820141.0, + "repeat_count": 0.0, + "routers_loss": 0.037809282541275024, + "skip_count": 2.0, + "step": 1128, + "text_loss": 0.32965806126594543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.305253889051952, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009905542426205032, + "loss": 0.0348, + "macro_f1": 0.32098767161369324, + "num_tokens": 1824011.0, + "repeat_count": 0.0, + "routers_loss": 0.03320181369781494, + "skip_count": 1.0, + "step": 1130, + "text_loss": 0.36329755187034607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.314646316407397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009904942700732777, + "loss": 0.0335, + "macro_f1": 0.3333333432674408, + "num_tokens": 1826873.0, + "repeat_count": 0.0, + "routers_loss": 0.004102326463907957, + "skip_count": 0.0, + "step": 1132, + "text_loss": 0.6692602038383484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.324038743762841, + "f1_execute": 0.8799999952316284, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.0009904341095677226, + "loss": 0.03, + "macro_f1": 0.29333335161209106, + "num_tokens": 1830103.0, + "repeat_count": 2.0, + "routers_loss": 0.2376193106174469, + "skip_count": 4.0, + "step": 1134, + "text_loss": 0.19212862849235535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.0009903737611268919, + "loss": 0.0445, + "macro_f1": 0.3333333432674408, + "num_tokens": 1833201.0, + "repeat_count": 0.0, + "routers_loss": 0.005253395065665245, + "skip_count": 0.0, + "step": 1136, + "text_loss": 0.6773360371589661 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.34282359847373, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009903132247739107, + "loss": 0.0305, + "macro_f1": 0.3076923191547394, + "num_tokens": 1836045.0, + "repeat_count": 1.0, + "routers_loss": 0.14382585883140564, + "skip_count": 3.0, + "step": 1138, + "text_loss": 0.2882297933101654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.3522160258291755, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.150390625, + "learning_rate": 0.0009902525005319766, + "loss": 0.04, + "macro_f1": 0.5427350401878357, + "num_tokens": 1839721.0, + "repeat_count": 1.0, + "routers_loss": 0.04033960774540901, + "skip_count": 2.0, + "step": 1140, + "text_loss": 0.7172559499740601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12109375, + "learning_rate": 0.0009901915884243597, + "loss": 0.0351, + "macro_f1": 0.6666666865348816, + "num_tokens": 1842614.0, + "repeat_count": 1.0, + "routers_loss": 0.005162308923900127, + "skip_count": 0.0, + "step": 1142, + "text_loss": 0.42892804741859436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.371000880540064, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009901304884744014, + "loss": 0.0386, + "macro_f1": 0.3144654333591461, + "num_tokens": 1845444.0, + "repeat_count": 1.0, + "routers_loss": 0.10117656737565994, + "skip_count": 2.0, + "step": 1144, + "text_loss": 0.20806430280208588 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.380393307895509, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009900692007055152, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 1848558.0, + "repeat_count": 0.0, + "routers_loss": 0.014107038266956806, + "skip_count": 0.0, + "step": 1146, + "text_loss": 0.5355974435806274 + }, + { + "acc_repeat": 0.25, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 5.389785735250954, + "f1_execute": 0.9166666865348816, + "f1_repeat": 0.4000000059604645, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.16015625, + "learning_rate": 0.000990007725141187, + "loss": 0.0449, + "macro_f1": 0.6611111164093018, + "num_tokens": 1852723.0, + "repeat_count": 4.0, + "routers_loss": 0.15537866950035095, + "skip_count": 2.0, + "step": 1148, + "text_loss": 0.6388513445854187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1181640625, + "learning_rate": 0.0009899460618049741, + "loss": 0.0397, + "macro_f1": 0.3333333432674408, + "num_tokens": 1856181.0, + "repeat_count": 0.0, + "routers_loss": 0.011800912208855152, + "skip_count": 0.0, + "step": 1150, + "text_loss": 0.6113069653511047 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 5.408570589961843, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.000989884210720506, + "loss": 0.0331, + "macro_f1": 0.6666666865348816, + "num_tokens": 1859685.0, + "repeat_count": 2.0, + "routers_loss": 0.022900646552443504, + "skip_count": 0.0, + "step": 1152, + "text_loss": 0.25718021392822266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.4179630173172875, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009898221719114844, + "loss": 0.0354, + "macro_f1": 0.3272727429866791, + "num_tokens": 1862505.0, + "repeat_count": 0.0, + "routers_loss": 0.026814989745616913, + "skip_count": 1.0, + "step": 1154, + "text_loss": 0.5426549911499023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009897599454016823, + "loss": 0.0401, + "macro_f1": 0.3333333432674408, + "num_tokens": 1866266.0, + "repeat_count": 0.0, + "routers_loss": 0.0032623792067170143, + "skip_count": 0.0, + "step": 1156, + "text_loss": 0.37752896547317505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.436747872028177, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07080078125, + "learning_rate": 0.0009896975312149454, + "loss": 0.0369, + "macro_f1": 0.3333333432674408, + "num_tokens": 1870216.0, + "repeat_count": 0.0, + "routers_loss": 0.015617577359080315, + "skip_count": 0.0, + "step": 1158, + "text_loss": 0.18207129836082458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009896349293751906, + "loss": 0.0423, + "macro_f1": 0.3272727429866791, + "num_tokens": 1873338.0, + "repeat_count": 0.0, + "routers_loss": 0.02250153198838234, + "skip_count": 1.0, + "step": 1160, + "text_loss": 0.548884391784668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.455532726739067, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009895721399064072, + "loss": 0.0388, + "macro_f1": 0.32098764181137085, + "num_tokens": 1876470.0, + "repeat_count": 1.0, + "routers_loss": 0.055204521864652634, + "skip_count": 1.0, + "step": 1162, + "text_loss": 0.48052409291267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.464925154094511, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009895091628326564, + "loss": 0.0293, + "macro_f1": 0.3333333432674408, + "num_tokens": 1879354.0, + "repeat_count": 0.0, + "routers_loss": 0.009093789383769035, + "skip_count": 0.0, + "step": 1164, + "text_loss": 0.3908069431781769 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.474317581449956, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000989445998178071, + "loss": 0.0323, + "macro_f1": 0.3272727429866791, + "num_tokens": 1881941.0, + "repeat_count": 0.0, + "routers_loss": 0.015086972154676914, + "skip_count": 1.0, + "step": 1166, + "text_loss": 0.4884725511074066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.4837100088054, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009893826459668558, + "loss": 0.0386, + "macro_f1": 0.3144654333591461, + "num_tokens": 1885374.0, + "repeat_count": 0.0, + "routers_loss": 0.06587666273117065, + "skip_count": 3.0, + "step": 1168, + "text_loss": 0.12760137021541595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009893191062232873, + "loss": 0.0322, + "macro_f1": 0.3333333432674408, + "num_tokens": 1888612.0, + "repeat_count": 0.0, + "routers_loss": 0.006088624242693186, + "skip_count": 0.0, + "step": 1170, + "text_loss": 0.4821319580078125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009892553789717143, + "loss": 0.0389, + "macro_f1": 0.3333333432674408, + "num_tokens": 1891463.0, + "repeat_count": 0.0, + "routers_loss": 0.010113578289747238, + "skip_count": 0.0, + "step": 1172, + "text_loss": 0.3613642454147339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.5118872908717345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009891914642365573, + "loss": 0.0404, + "macro_f1": 0.3333333432674408, + "num_tokens": 1894230.0, + "repeat_count": 0.0, + "routers_loss": 0.004947459790855646, + "skip_count": 0.0, + "step": 1174, + "text_loss": 0.5037549138069153 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.521279718227179, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009891273620423083, + "loss": 0.0428, + "macro_f1": 0.3272727429866791, + "num_tokens": 1897294.0, + "repeat_count": 1.0, + "routers_loss": 0.026075217872858047, + "skip_count": 0.0, + "step": 1176, + "text_loss": 0.32558977603912354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.530672145582624, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009890630724135314, + "loss": 0.0351, + "macro_f1": 0.3272727429866791, + "num_tokens": 1901553.0, + "repeat_count": 0.0, + "routers_loss": 0.06650999188423157, + "skip_count": 1.0, + "step": 1178, + "text_loss": 0.23473620414733887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.540064572938069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009889985953748625, + "loss": 0.0268, + "macro_f1": 0.6666666865348816, + "num_tokens": 1904556.0, + "repeat_count": 0.0, + "routers_loss": 0.010361116379499435, + "skip_count": 1.0, + "step": 1180, + "text_loss": 0.6927042007446289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.0009889339309510094, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 1908053.0, + "repeat_count": 0.0, + "routers_loss": 0.013286533765494823, + "skip_count": 0.0, + "step": 1182, + "text_loss": 0.19977325201034546 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 5.558849427648958, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009888690791667518, + "loss": 0.0204, + "macro_f1": 0.7018141150474548, + "num_tokens": 1911754.0, + "repeat_count": 2.0, + "routers_loss": 0.11920545995235443, + "skip_count": 3.0, + "step": 1184, + "text_loss": 0.4072858691215515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.568241855004403, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009888040400469408, + "loss": 0.0391, + "macro_f1": 0.3272727429866791, + "num_tokens": 1914862.0, + "repeat_count": 0.0, + "routers_loss": 0.03652849420905113, + "skip_count": 1.0, + "step": 1186, + "text_loss": 0.2654043138027191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.577634282359847, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1689453125, + "learning_rate": 0.0009887388136164996, + "loss": 0.0336, + "macro_f1": 0.5492662787437439, + "num_tokens": 1918542.0, + "repeat_count": 0.0, + "routers_loss": 0.03991910070180893, + "skip_count": 2.0, + "step": 1188, + "text_loss": 0.21130657196044922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.587026709715292, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.000988673399900423, + "loss": 0.0429, + "macro_f1": 0.3272727429866791, + "num_tokens": 1921589.0, + "repeat_count": 0.0, + "routers_loss": 0.014900135807693005, + "skip_count": 0.0, + "step": 1190, + "text_loss": 0.5519335865974426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.596419137070737, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009886077989237777, + "loss": 0.0405, + "macro_f1": 0.3272727429866791, + "num_tokens": 1924320.0, + "repeat_count": 0.0, + "routers_loss": 0.06271552294492722, + "skip_count": 1.0, + "step": 1192, + "text_loss": 0.213813915848732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 5.6058115644261814, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.1875, + "learning_rate": 0.000988542010711702, + "loss": 0.0342, + "macro_f1": 0.6225374937057495, + "num_tokens": 1927178.0, + "repeat_count": 0.0, + "routers_loss": 0.03081391751766205, + "skip_count": 5.0, + "step": 1194, + "text_loss": 0.7524349093437195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009884760352894064, + "loss": 0.0518, + "macro_f1": 0.3333333432674408, + "num_tokens": 1930216.0, + "repeat_count": 0.0, + "routers_loss": 0.008556773886084557, + "skip_count": 0.0, + "step": 1196, + "text_loss": 0.28230375051498413 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.62459641913707, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009884098726821726, + "loss": 0.0472, + "macro_f1": 0.4871794879436493, + "num_tokens": 1933312.0, + "repeat_count": 3.0, + "routers_loss": 0.05344727262854576, + "skip_count": 0.0, + "step": 1198, + "text_loss": 0.5509607195854187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.633988846492516, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1298828125, + "learning_rate": 0.000988343522915354, + "loss": 0.0441, + "macro_f1": 0.480392187833786, + "num_tokens": 1936160.0, + "repeat_count": 1.0, + "routers_loss": 0.07324771583080292, + "skip_count": 3.0, + "step": 1200, + "text_loss": 0.30565372109413147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 25.0, + "epoch": 5.64338127384796, + "f1_execute": 0.8936169743537903, + "f1_repeat": 0.0, + "f1_skip": 0.444444477558136, + "grad_norm": 0.2470703125, + "learning_rate": 0.0009882769860143764, + "loss": 0.0317, + "macro_f1": 0.4460204839706421, + "num_tokens": 1939266.0, + "repeat_count": 0.0, + "routers_loss": 0.18620699644088745, + "skip_count": 6.0, + "step": 1202, + "text_loss": 0.976121723651886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.6527737012034045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000988210262004737, + "loss": 0.0474, + "macro_f1": 0.6666666865348816, + "num_tokens": 1942173.0, + "repeat_count": 0.0, + "routers_loss": 0.007703613489866257, + "skip_count": 1.0, + "step": 1204, + "text_loss": 0.5647401809692383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.66216612855885, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1484375, + "learning_rate": 0.0009881433509120036, + "loss": 0.0376, + "macro_f1": 0.5492662787437439, + "num_tokens": 1945071.0, + "repeat_count": 0.0, + "routers_loss": 0.02162683941423893, + "skip_count": 2.0, + "step": 1206, + "text_loss": 0.24229218065738678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.671558555914294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009880762527618176, + "loss": 0.0383, + "macro_f1": 0.3333333432674408, + "num_tokens": 1949060.0, + "repeat_count": 0.0, + "routers_loss": 0.017667081207036972, + "skip_count": 0.0, + "step": 1208, + "text_loss": 0.4035970866680145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009880089675798908, + "loss": 0.0367, + "macro_f1": 0.3333333432674408, + "num_tokens": 1951698.0, + "repeat_count": 0.0, + "routers_loss": 0.006405784282833338, + "skip_count": 0.0, + "step": 1210, + "text_loss": 0.5319879055023193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.690343410625183, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009879414953920071, + "loss": 0.0294, + "macro_f1": 0.3333333432674408, + "num_tokens": 1955266.0, + "repeat_count": 0.0, + "routers_loss": 0.009859707206487656, + "skip_count": 0.0, + "step": 1212, + "text_loss": 0.6687407493591309 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.699735837980628, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.130859375, + "learning_rate": 0.0009878738362240219, + "loss": 0.045, + "macro_f1": 0.5492662787437439, + "num_tokens": 1958538.0, + "repeat_count": 0.0, + "routers_loss": 0.030890554189682007, + "skip_count": 2.0, + "step": 1214, + "text_loss": 0.20820017158985138 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 5.709128265336073, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000987805990101862, + "loss": 0.0317, + "macro_f1": 0.47333335876464844, + "num_tokens": 1961419.0, + "repeat_count": 2.0, + "routers_loss": 0.10383198410272598, + "skip_count": 2.0, + "step": 1216, + "text_loss": 0.8664976358413696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.718520692691517, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009877379570515268, + "loss": 0.0366, + "macro_f1": 0.3333333432674408, + "num_tokens": 1964836.0, + "repeat_count": 0.0, + "routers_loss": 0.013376163318753242, + "skip_count": 0.0, + "step": 1218, + "text_loss": 0.4223395884037018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0859375, + "learning_rate": 0.0009876697370990865, + "loss": 0.0343, + "macro_f1": 0.3333333432674408, + "num_tokens": 1967620.0, + "repeat_count": 0.0, + "routers_loss": 0.008577900938689709, + "skip_count": 0.0, + "step": 1220, + "text_loss": 0.4789901375770569 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009876013302706828, + "loss": 0.049, + "macro_f1": 0.3333333432674408, + "num_tokens": 1971100.0, + "repeat_count": 0.0, + "routers_loss": 0.004730266984552145, + "skip_count": 0.0, + "step": 1222, + "text_loss": 0.6799837946891785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.7466979747578515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009875327365925295, + "loss": 0.0341, + "macro_f1": 0.3333333432674408, + "num_tokens": 1974408.0, + "repeat_count": 0.0, + "routers_loss": 0.010849526152014732, + "skip_count": 0.0, + "step": 1224, + "text_loss": 0.18967926502227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.756090402113296, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009874639560909118, + "loss": 0.0498, + "macro_f1": 0.32098767161369324, + "num_tokens": 1977046.0, + "repeat_count": 0.0, + "routers_loss": 0.04841252416372299, + "skip_count": 1.0, + "step": 1226, + "text_loss": 0.6133310198783875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.765482829468741, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.0009873949887921867, + "loss": 0.0402, + "macro_f1": 0.3272727429866791, + "num_tokens": 1980330.0, + "repeat_count": 0.0, + "routers_loss": 0.029638588428497314, + "skip_count": 1.0, + "step": 1228, + "text_loss": 0.15649555623531342 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.774875256824186, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009873258347227823, + "loss": 0.0331, + "macro_f1": 0.3272727429866791, + "num_tokens": 1983173.0, + "repeat_count": 0.0, + "routers_loss": 0.009955910965800285, + "skip_count": 0.0, + "step": 1230, + "text_loss": 0.4741005599498749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009872564939091989, + "loss": 0.0342, + "macro_f1": 0.3333333432674408, + "num_tokens": 1986825.0, + "repeat_count": 0.0, + "routers_loss": 0.010205300524830818, + "skip_count": 0.0, + "step": 1232, + "text_loss": 0.5315462350845337 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5714285969734192, + "avg_layers": 25.0, + "epoch": 5.7936601115350745, + "f1_execute": 0.9302325248718262, + "f1_repeat": 1.0, + "f1_skip": 0.7272727489471436, + "grad_norm": 0.11865234375, + "learning_rate": 0.0009871869663780077, + "loss": 0.0336, + "macro_f1": 0.8858351111412048, + "num_tokens": 1990448.0, + "repeat_count": 1.0, + "routers_loss": 0.09120134264230728, + "skip_count": 7.0, + "step": 1234, + "text_loss": 0.6187508702278137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.125, + "learning_rate": 0.0009871172521558522, + "loss": 0.0475, + "macro_f1": 0.6666666865348816, + "num_tokens": 1993474.0, + "repeat_count": 0.0, + "routers_loss": 0.016188839450478554, + "skip_count": 1.0, + "step": 1236, + "text_loss": 0.20783066749572754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 5.812444966245964, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.216796875, + "learning_rate": 0.0009870473512694465, + "loss": 0.0373, + "macro_f1": 0.5934640765190125, + "num_tokens": 1996536.0, + "repeat_count": 0.0, + "routers_loss": 0.05046704784035683, + "skip_count": 3.0, + "step": 1238, + "text_loss": 0.247748002409935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.821837393601409, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.09033203125, + "learning_rate": 0.0009869772637455772, + "loss": 0.0251, + "macro_f1": 0.4871794879436493, + "num_tokens": 1999530.0, + "repeat_count": 0.0, + "routers_loss": 0.044926248490810394, + "skip_count": 2.0, + "step": 1240, + "text_loss": 0.26001980900764465 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.831229820956853, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.000986906989611102, + "loss": 0.0446, + "macro_f1": 0.3272727429866791, + "num_tokens": 2002782.0, + "repeat_count": 0.0, + "routers_loss": 0.025911526754498482, + "skip_count": 0.0, + "step": 1242, + "text_loss": 0.9009982943534851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.8406222483122985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009868365288929492, + "loss": 0.0371, + "macro_f1": 0.3333333432674408, + "num_tokens": 2005331.0, + "repeat_count": 0.0, + "routers_loss": 0.0043760035187006, + "skip_count": 0.0, + "step": 1244, + "text_loss": 0.5547386407852173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009867658816181206, + "loss": 0.0374, + "macro_f1": 0.3333333432674408, + "num_tokens": 2008115.0, + "repeat_count": 0.0, + "routers_loss": 0.009227181784808636, + "skip_count": 0.0, + "step": 1246, + "text_loss": 1.0067731142044067 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.000986695047813688, + "loss": 0.0261, + "macro_f1": 0.3272727429866791, + "num_tokens": 2011137.0, + "repeat_count": 1.0, + "routers_loss": 0.023822437971830368, + "skip_count": 0.0, + "step": 1248, + "text_loss": 0.30058956146240234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.868799530378633, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009866240275067948, + "loss": 0.044, + "macro_f1": 0.47333335876464844, + "num_tokens": 2014159.0, + "repeat_count": 2.0, + "routers_loss": 0.21523773670196533, + "skip_count": 3.0, + "step": 1250, + "text_loss": 0.39072203636169434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.878191957734077, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009865528207246563, + "loss": 0.0351, + "macro_f1": 0.5492662787437439, + "num_tokens": 2017731.0, + "repeat_count": 0.0, + "routers_loss": 0.06184682995080948, + "skip_count": 2.0, + "step": 1252, + "text_loss": 0.35751575231552124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.8875843850895215, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.166015625, + "learning_rate": 0.000986481427494559, + "loss": 0.0336, + "macro_f1": 0.3333333432674408, + "num_tokens": 2020485.0, + "repeat_count": 0.0, + "routers_loss": 0.007573372684419155, + "skip_count": 0.0, + "step": 1254, + "text_loss": 0.4061077833175659 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.896976812444966, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.000986409847843861, + "loss": 0.0382, + "macro_f1": 0.3272727429866791, + "num_tokens": 2024149.0, + "repeat_count": 1.0, + "routers_loss": 0.07447971403598785, + "skip_count": 0.0, + "step": 1256, + "text_loss": 0.41876497864723206 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.906369239800411, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000986338081799992, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 2026545.0, + "repeat_count": 0.0, + "routers_loss": 0.006609147880226374, + "skip_count": 0.0, + "step": 1258, + "text_loss": 0.4673794209957123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.915761667155856, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009862661293904523, + "loss": 0.0498, + "macro_f1": 0.32098764181137085, + "num_tokens": 2029581.0, + "repeat_count": 0.0, + "routers_loss": 0.10624702274799347, + "skip_count": 2.0, + "step": 1260, + "text_loss": 0.3483233153820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009861939906428145, + "loss": 0.0525, + "macro_f1": 0.3333333432674408, + "num_tokens": 2033936.0, + "repeat_count": 0.0, + "routers_loss": 0.007944886572659016, + "skip_count": 0.0, + "step": 1262, + "text_loss": 0.16362667083740234 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009861216655847225, + "loss": 0.0376, + "macro_f1": 0.6666666865348816, + "num_tokens": 2037876.0, + "repeat_count": 1.0, + "routers_loss": 0.007004092447459698, + "skip_count": 0.0, + "step": 1264, + "text_loss": 0.43228110671043396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.94393894922219, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009860491542438912, + "loss": 0.047, + "macro_f1": 0.3272727429866791, + "num_tokens": 2040842.0, + "repeat_count": 0.0, + "routers_loss": 0.026916226372122765, + "skip_count": 1.0, + "step": 1266, + "text_loss": 0.5901188850402832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.953331376577634, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.000985976456648107, + "loss": 0.0353, + "macro_f1": 0.3333333432674408, + "num_tokens": 2043890.0, + "repeat_count": 0.0, + "routers_loss": 0.007325216196477413, + "skip_count": 0.0, + "step": 1268, + "text_loss": 0.8780109882354736 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.962723803933079, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.10205078125, + "learning_rate": 0.000985903572825228, + "loss": 0.0306, + "macro_f1": 0.4871794879436493, + "num_tokens": 2048848.0, + "repeat_count": 0.0, + "routers_loss": 0.05007527023553848, + "skip_count": 2.0, + "step": 1270, + "text_loss": 0.5863722562789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.972116231288524, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.000985830502803183, + "loss": 0.0396, + "macro_f1": 0.3272727429866791, + "num_tokens": 2051561.0, + "repeat_count": 0.0, + "routers_loss": 0.023995524272322655, + "skip_count": 0.0, + "step": 1272, + "text_loss": 0.7460709810256958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.9815086586439685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009857572466099732, + "loss": 0.0431, + "macro_f1": 0.3333333432674408, + "num_tokens": 2054752.0, + "repeat_count": 0.0, + "routers_loss": 0.006928362417966127, + "skip_count": 0.0, + "step": 1274, + "text_loss": 0.5130293369293213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.162109375, + "learning_rate": 0.0009856838042736698, + "loss": 0.0501, + "macro_f1": 0.3333333432674408, + "num_tokens": 2058151.0, + "repeat_count": 0.0, + "routers_loss": 0.006969396956264973, + "skip_count": 0.0, + "step": 1276, + "text_loss": 0.5911393761634827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009856101758224166, + "loss": 0.0441, + "macro_f1": 0.3333333432674408, + "num_tokens": 2061012.0, + "repeat_count": 0.0, + "routers_loss": 0.003499418031424284, + "skip_count": 0.0, + "step": 1278, + "text_loss": 0.25347545742988586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.000985536361284428, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2064597.0, + "repeat_count": 0.0, + "routers_loss": 0.007856054231524467, + "skip_count": 0.0, + "step": 1280, + "text_loss": 0.7476963400840759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.01878485471089, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009854623606879898, + "loss": 0.0245, + "macro_f1": 0.3272727429866791, + "num_tokens": 2067972.0, + "repeat_count": 0.0, + "routers_loss": 0.02617792971432209, + "skip_count": 1.0, + "step": 1282, + "text_loss": 0.5775872468948364 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.028177282066334, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.000985388174061459, + "loss": 0.0356, + "macro_f1": 0.32098767161369324, + "num_tokens": 2071812.0, + "repeat_count": 0.0, + "routers_loss": 0.035979997366666794, + "skip_count": 1.0, + "step": 1284, + "text_loss": 0.2933400869369507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.037569709421779, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0009853138014332646, + "loss": 0.0273, + "macro_f1": 0.3333333432674408, + "num_tokens": 2074868.0, + "repeat_count": 0.0, + "routers_loss": 0.005142854526638985, + "skip_count": 0.0, + "step": 1286, + "text_loss": 0.29085102677345276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.0009852392428319058, + "loss": 0.0306, + "macro_f1": 0.3333333432674408, + "num_tokens": 2078225.0, + "repeat_count": 0.0, + "routers_loss": 0.0032799106556922197, + "skip_count": 0.0, + "step": 1288, + "text_loss": 0.7293626070022583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 6.056354564132668, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009851644982859537, + "loss": 0.0273, + "macro_f1": 0.480392187833786, + "num_tokens": 2081495.0, + "repeat_count": 1.0, + "routers_loss": 0.12224318832159042, + "skip_count": 3.0, + "step": 1290, + "text_loss": 0.26125892996788025 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.065746991488113, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009850895678240508, + "loss": 0.0283, + "macro_f1": 0.6666666865348816, + "num_tokens": 2084390.0, + "repeat_count": 1.0, + "routers_loss": 0.010662888176739216, + "skip_count": 0.0, + "step": 1292, + "text_loss": 0.3510764539241791 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.075139418843557, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1689453125, + "learning_rate": 0.0009850144514749104, + "loss": 0.0332, + "macro_f1": 0.5492662787437439, + "num_tokens": 2087210.0, + "repeat_count": 0.0, + "routers_loss": 0.01979079470038414, + "skip_count": 2.0, + "step": 1294, + "text_loss": 0.40202176570892334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.084531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.000984939149267317, + "loss": 0.0253, + "macro_f1": 0.6666666865348816, + "num_tokens": 2090777.0, + "repeat_count": 0.0, + "routers_loss": 0.005172552540898323, + "skip_count": 1.0, + "step": 1296, + "text_loss": 0.5275651216506958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.093924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009848636612301272, + "loss": 0.0299, + "macro_f1": 0.3333333432674408, + "num_tokens": 2094248.0, + "repeat_count": 0.0, + "routers_loss": 0.0029599082190543413, + "skip_count": 0.0, + "step": 1298, + "text_loss": 0.4517653286457062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0009847879873922675, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 2097139.0, + "repeat_count": 0.0, + "routers_loss": 0.011455860920250416, + "skip_count": 0.0, + "step": 1300, + "text_loss": 0.16888445615768433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.112709128265336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.0009847121277827366, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 2100415.0, + "repeat_count": 0.0, + "routers_loss": 0.008091195486485958, + "skip_count": 0.0, + "step": 1302, + "text_loss": 0.40061676502227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.122101555620781, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.000984636082430604, + "loss": 0.0285, + "macro_f1": 0.3333333432674408, + "num_tokens": 2103285.0, + "repeat_count": 0.0, + "routers_loss": 0.009593960829079151, + "skip_count": 0.0, + "step": 1304, + "text_loss": 0.7211073637008667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.107421875, + "learning_rate": 0.0009845598513650103, + "loss": 0.0231, + "macro_f1": 0.3333333432674408, + "num_tokens": 2106255.0, + "repeat_count": 0.0, + "routers_loss": 0.0023068038281053305, + "skip_count": 0.0, + "step": 1306, + "text_loss": 0.7077119946479797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009844834346151674, + "loss": 0.043, + "macro_f1": 0.3333333432674408, + "num_tokens": 2109305.0, + "repeat_count": 0.0, + "routers_loss": 0.007703019306063652, + "skip_count": 0.0, + "step": 1308, + "text_loss": 0.3534316122531891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.1502788376871145, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009844068322103585, + "loss": 0.0287, + "macro_f1": 0.3272727429866791, + "num_tokens": 2112216.0, + "repeat_count": 0.0, + "routers_loss": 0.023549847304821014, + "skip_count": 1.0, + "step": 1310, + "text_loss": 0.6792599558830261 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009843300441799378, + "loss": 0.0211, + "macro_f1": 0.3333333432674408, + "num_tokens": 2114925.0, + "repeat_count": 0.0, + "routers_loss": 0.007605871185660362, + "skip_count": 0.0, + "step": 1312, + "text_loss": 0.1571389138698578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.169063692398004, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009842530705533304, + "loss": 0.0253, + "macro_f1": 0.3272727429866791, + "num_tokens": 2117744.0, + "repeat_count": 0.0, + "routers_loss": 0.014964760281145573, + "skip_count": 0.0, + "step": 1314, + "text_loss": 0.7840361595153809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.000984175911360033, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2120848.0, + "repeat_count": 0.0, + "routers_loss": 0.004663798492401838, + "skip_count": 0.0, + "step": 1316, + "text_loss": 0.536246120929718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.187848547108893, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1201171875, + "learning_rate": 0.000984098566629613, + "loss": 0.0288, + "macro_f1": 0.5492662787437439, + "num_tokens": 2123651.0, + "repeat_count": 0.0, + "routers_loss": 0.022852955386042595, + "skip_count": 2.0, + "step": 1318, + "text_loss": 0.43372172117233276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.197240974464338, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009840210363917087, + "loss": 0.0216, + "macro_f1": 0.3333333432674408, + "num_tokens": 2128011.0, + "repeat_count": 0.0, + "routers_loss": 0.012578422203660011, + "skip_count": 0.0, + "step": 1320, + "text_loss": 0.28190380334854126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009839433206760306, + "loss": 0.0204, + "macro_f1": 0.3333333432674408, + "num_tokens": 2131035.0, + "repeat_count": 0.0, + "routers_loss": 0.006863643880933523, + "skip_count": 0.0, + "step": 1322, + "text_loss": 0.6340444087982178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.216025829175227, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.0009838654195123589, + "loss": 0.0243, + "macro_f1": 0.3333333432674408, + "num_tokens": 2133856.0, + "repeat_count": 0.0, + "routers_loss": 0.00468854233622551, + "skip_count": 0.0, + "step": 1324, + "text_loss": 0.5138425827026367 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009837873329305458, + "loss": 0.0396, + "macro_f1": 0.6666666865348816, + "num_tokens": 2136451.0, + "repeat_count": 1.0, + "routers_loss": 0.005731126759201288, + "skip_count": 0.0, + "step": 1326, + "text_loss": 0.742124617099762 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000983709060960514, + "loss": 0.0416, + "macro_f1": 0.3333333432674408, + "num_tokens": 2139496.0, + "repeat_count": 0.0, + "routers_loss": 0.0056343949399888515, + "skip_count": 0.0, + "step": 1328, + "text_loss": 0.7317464351654053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.2442031112415615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009836306036322576, + "loss": 0.0312, + "macro_f1": 0.3333333432674408, + "num_tokens": 2143120.0, + "repeat_count": 0.0, + "routers_loss": 0.005127966403961182, + "skip_count": 0.0, + "step": 1330, + "text_loss": 0.538652241230011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 6.253595538597006, + "f1_execute": 0.9130434989929199, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009835519609758415, + "loss": 0.0301, + "macro_f1": 0.590062141418457, + "num_tokens": 2145807.0, + "repeat_count": 3.0, + "routers_loss": 0.1673707216978073, + "skip_count": 4.0, + "step": 1332, + "text_loss": 0.3498198091983795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009834731330214017, + "loss": 0.0293, + "macro_f1": 0.3272727429866791, + "num_tokens": 2148397.0, + "repeat_count": 1.0, + "routers_loss": 0.04026653990149498, + "skip_count": 0.0, + "step": 1334, + "text_loss": 0.8153424859046936 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 27.0, + "epoch": 6.272380393307896, + "f1_execute": 0.8999999761581421, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.8000000715255737, + "grad_norm": 0.16015625, + "learning_rate": 0.0009833941197991455, + "loss": 0.0329, + "macro_f1": 0.7888889312744141, + "num_tokens": 2152226.0, + "repeat_count": 2.0, + "routers_loss": 0.05481519177556038, + "skip_count": 5.0, + "step": 1336, + "text_loss": 0.7802760004997253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.28177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009833149213393506, + "loss": 0.0304, + "macro_f1": 0.3272727429866791, + "num_tokens": 2156023.0, + "repeat_count": 0.0, + "routers_loss": 0.01760484278202057, + "skip_count": 0.0, + "step": 1338, + "text_loss": 0.19721226394176483 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.2911652480187845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.000983235537672366, + "loss": 0.0256, + "macro_f1": 0.3333333432674408, + "num_tokens": 2160037.0, + "repeat_count": 0.0, + "routers_loss": 0.013206037692725658, + "skip_count": 0.0, + "step": 1340, + "text_loss": 0.5003817081451416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.000983155968828612, + "loss": 0.0315, + "macro_f1": 0.6666666865348816, + "num_tokens": 2163910.0, + "repeat_count": 1.0, + "routers_loss": 0.01256406120955944, + "skip_count": 0.0, + "step": 1342, + "text_loss": 0.5996923446655273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.309950102729674, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009830762148385793, + "loss": 0.0313, + "macro_f1": 0.3272727429866791, + "num_tokens": 2166921.0, + "repeat_count": 0.0, + "routers_loss": 0.015086234547197819, + "skip_count": 1.0, + "step": 1344, + "text_loss": 0.45356282591819763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.319342530085119, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0009829962757328297, + "loss": 0.0223, + "macro_f1": 0.32098764181137085, + "num_tokens": 2170135.0, + "repeat_count": 0.0, + "routers_loss": 0.07909081131219864, + "skip_count": 2.0, + "step": 1346, + "text_loss": 0.2874644994735718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 6.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009829161515419959, + "loss": 0.0246, + "macro_f1": 0.6666666865348816, + "num_tokens": 2173029.0, + "repeat_count": 0.0, + "routers_loss": 0.013569854199886322, + "skip_count": 2.0, + "step": 1348, + "text_loss": 0.25533875823020935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.3381273847960085, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009828358422967823, + "loss": 0.0226, + "macro_f1": 0.32098764181137085, + "num_tokens": 2176605.0, + "repeat_count": 1.0, + "routers_loss": 0.08111091703176498, + "skip_count": 1.0, + "step": 1350, + "text_loss": 0.32827726006507874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 6.347519812151453, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.091796875, + "learning_rate": 0.0009827553480279627, + "loss": 0.03, + "macro_f1": 0.5427350401878357, + "num_tokens": 2179406.0, + "repeat_count": 0.0, + "routers_loss": 0.026550088077783585, + "skip_count": 2.0, + "step": 1352, + "text_loss": 0.2966301143169403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009826746687663832, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 2182353.0, + "repeat_count": 0.0, + "routers_loss": 0.003914554137736559, + "skip_count": 0.0, + "step": 1354, + "text_loss": 0.7596251964569092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 6.366304666862343, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0859375, + "learning_rate": 0.0009825938045429602, + "loss": 0.0324, + "macro_f1": 0.5866667032241821, + "num_tokens": 2185786.0, + "repeat_count": 1.0, + "routers_loss": 0.059612665325403214, + "skip_count": 3.0, + "step": 1356, + "text_loss": 0.12325898557901382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.375697094217787, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10009765625, + "learning_rate": 0.0009825127553886807, + "loss": 0.0375, + "macro_f1": 0.3333333432674408, + "num_tokens": 2190157.0, + "repeat_count": 0.0, + "routers_loss": 0.0071132429875433445, + "skip_count": 0.0, + "step": 1358, + "text_loss": 0.9287898540496826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009824315213346033, + "loss": 0.0348, + "macro_f1": 0.3333333432674408, + "num_tokens": 2193077.0, + "repeat_count": 0.0, + "routers_loss": 0.009611099027097225, + "skip_count": 0.0, + "step": 1360, + "text_loss": 0.20427259802818298 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.394481948928676, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009823501024118569, + "loss": 0.0285, + "macro_f1": 0.3333333432674408, + "num_tokens": 2196494.0, + "repeat_count": 0.0, + "routers_loss": 0.006913455203175545, + "skip_count": 0.0, + "step": 1362, + "text_loss": 0.574759840965271 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.403874376284121, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009822684986516411, + "loss": 0.0245, + "macro_f1": 0.3333333432674408, + "num_tokens": 2199839.0, + "repeat_count": 0.0, + "routers_loss": 0.009208920411765575, + "skip_count": 0.0, + "step": 1364, + "text_loss": 0.42422571778297424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.413266803639566, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.000982186710085227, + "loss": 0.0208, + "macro_f1": 0.32098764181137085, + "num_tokens": 2203212.0, + "repeat_count": 1.0, + "routers_loss": 0.059975091367959976, + "skip_count": 1.0, + "step": 1366, + "text_loss": 0.29213017225265503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 6.42265923099501, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.181640625, + "learning_rate": 0.0009821047367439561, + "loss": 0.0358, + "macro_f1": 0.44705885648727417, + "num_tokens": 2206240.0, + "repeat_count": 0.0, + "routers_loss": 0.048244867473840714, + "skip_count": 4.0, + "step": 1368, + "text_loss": 0.3072395324707031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.432051658350455, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009820225786592405, + "loss": 0.0375, + "macro_f1": 0.3272727429866791, + "num_tokens": 2209903.0, + "repeat_count": 1.0, + "routers_loss": 0.026068156585097313, + "skip_count": 0.0, + "step": 1370, + "text_loss": 0.5961400270462036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.4414440857059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.109375, + "learning_rate": 0.0009819402358625634, + "loss": 0.0366, + "macro_f1": 0.3272727429866791, + "num_tokens": 2213439.0, + "repeat_count": 0.0, + "routers_loss": 0.022615568712353706, + "skip_count": 1.0, + "step": 1372, + "text_loss": 0.19375644624233246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.450836513061344, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.000981857708385479, + "loss": 0.0346, + "macro_f1": 0.3333333432674408, + "num_tokens": 2216457.0, + "repeat_count": 0.0, + "routers_loss": 0.005855285096913576, + "skip_count": 0.0, + "step": 1374, + "text_loss": 0.5123368501663208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.460228940416789, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009817749962596114, + "loss": 0.0249, + "macro_f1": 0.3272727429866791, + "num_tokens": 2219975.0, + "repeat_count": 1.0, + "routers_loss": 0.0651634931564331, + "skip_count": 0.0, + "step": 1376, + "text_loss": 0.5999220609664917 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009816920995166568, + "loss": 0.0371, + "macro_f1": 0.6666666865348816, + "num_tokens": 2222833.0, + "repeat_count": 1.0, + "routers_loss": 0.011408994905650616, + "skip_count": 0.0, + "step": 1378, + "text_loss": 0.5323230624198914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.4790137951276785, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.0009816090181883807, + "loss": 0.0313, + "macro_f1": 0.32098764181137085, + "num_tokens": 2225842.0, + "repeat_count": 0.0, + "routers_loss": 0.039720915257930756, + "skip_count": 2.0, + "step": 1380, + "text_loss": 0.23363439738750458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009815257523066204, + "loss": 0.0249, + "macro_f1": 0.3333333432674408, + "num_tokens": 2229430.0, + "repeat_count": 0.0, + "routers_loss": 0.002765297656878829, + "skip_count": 0.0, + "step": 1382, + "text_loss": 0.718977689743042 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.497798649838567, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009814423019032835, + "loss": 0.0396, + "macro_f1": 0.5492662787437439, + "num_tokens": 2232594.0, + "repeat_count": 2.0, + "routers_loss": 0.05362323671579361, + "skip_count": 0.0, + "step": 1384, + "text_loss": 0.6392166614532471 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.507191077194013, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009813586670103483, + "loss": 0.0426, + "macro_f1": 0.6603773832321167, + "num_tokens": 2236327.0, + "repeat_count": 1.0, + "routers_loss": 0.031728316098451614, + "skip_count": 1.0, + "step": 1386, + "text_loss": 0.5951619148254395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.126953125, + "learning_rate": 0.0009812748476598638, + "loss": 0.031, + "macro_f1": 0.5492662787437439, + "num_tokens": 2239746.0, + "repeat_count": 0.0, + "routers_loss": 0.03981253132224083, + "skip_count": 2.0, + "step": 1388, + "text_loss": 0.22756551206111908 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.5259759319049016, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009811908438839498, + "loss": 0.0331, + "macro_f1": 0.5492662787437439, + "num_tokens": 2242786.0, + "repeat_count": 0.0, + "routers_loss": 0.04617162421345711, + "skip_count": 2.0, + "step": 1390, + "text_loss": 0.3233799934387207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.535368359260346, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.000981106655714797, + "loss": 0.0358, + "macro_f1": 0.3272727429866791, + "num_tokens": 2245696.0, + "repeat_count": 0.0, + "routers_loss": 0.046828847378492355, + "skip_count": 1.0, + "step": 1392, + "text_loss": 0.24273279309272766 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.544760786615791, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009810222831846656, + "loss": 0.0307, + "macro_f1": 0.5492662787437439, + "num_tokens": 2249326.0, + "repeat_count": 0.0, + "routers_loss": 0.010921589098870754, + "skip_count": 2.0, + "step": 1394, + "text_loss": 0.3921460807323456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.554153213971236, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009809377263258882, + "loss": 0.0315, + "macro_f1": 0.32098767161369324, + "num_tokens": 2253393.0, + "repeat_count": 0.0, + "routers_loss": 0.04564022272825241, + "skip_count": 1.0, + "step": 1396, + "text_loss": 0.582602858543396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.000980852985170867, + "loss": 0.0328, + "macro_f1": 0.3272727429866791, + "num_tokens": 2256626.0, + "repeat_count": 0.0, + "routers_loss": 0.013289985246956348, + "skip_count": 0.0, + "step": 1398, + "text_loss": 0.41031694412231445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.5729380686821255, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009807680597520745, + "loss": 0.0264, + "macro_f1": 0.3333333432674408, + "num_tokens": 2259326.0, + "repeat_count": 0.0, + "routers_loss": 0.0065213534981012344, + "skip_count": 0.0, + "step": 1400, + "text_loss": 0.2888098657131195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.58233049603757, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0009806829501020546, + "loss": 0.0358, + "macro_f1": 0.3272727429866791, + "num_tokens": 2262344.0, + "repeat_count": 0.0, + "routers_loss": 0.04199840500950813, + "skip_count": 1.0, + "step": 1402, + "text_loss": 0.31973034143447876 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.591722923393014, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009805976562534215, + "loss": 0.0317, + "macro_f1": 0.6603773832321167, + "num_tokens": 2266354.0, + "repeat_count": 1.0, + "routers_loss": 0.015434930101037025, + "skip_count": 1.0, + "step": 1404, + "text_loss": 0.508630633354187 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 6.601115350748459, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009805121782388599, + "loss": 0.0339, + "macro_f1": 0.6533333659172058, + "num_tokens": 2269660.0, + "repeat_count": 2.0, + "routers_loss": 0.0720924660563469, + "skip_count": 2.0, + "step": 1406, + "text_loss": 0.40927737951278687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.610507778103904, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009804265160911253, + "loss": 0.0266, + "macro_f1": 0.5492662787437439, + "num_tokens": 2273335.0, + "repeat_count": 0.0, + "routers_loss": 0.02400495670735836, + "skip_count": 2.0, + "step": 1408, + "text_loss": 0.1777762621641159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.6199002054593485, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2314453125, + "learning_rate": 0.0009803406698430433, + "loss": 0.0371, + "macro_f1": 0.3272727429866791, + "num_tokens": 2277107.0, + "repeat_count": 0.0, + "routers_loss": 0.02560107782483101, + "skip_count": 1.0, + "step": 1410, + "text_loss": 0.17955881357192993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.629292632814793, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009802546395275104, + "loss": 0.0349, + "macro_f1": 0.3333333432674408, + "num_tokens": 2281638.0, + "repeat_count": 0.0, + "routers_loss": 0.006655813194811344, + "skip_count": 0.0, + "step": 1412, + "text_loss": 0.20882295072078705 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 6.638685060170237, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.08740234375, + "learning_rate": 0.000980168425177494, + "loss": 0.0342, + "macro_f1": 0.8200000524520874, + "num_tokens": 2284876.0, + "repeat_count": 1.0, + "routers_loss": 0.06325097382068634, + "skip_count": 3.0, + "step": 1414, + "text_loss": 0.26035264134407043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.000980082026826031, + "loss": 0.0315, + "macro_f1": 0.3272727429866791, + "num_tokens": 2288938.0, + "repeat_count": 1.0, + "routers_loss": 0.013436575420200825, + "skip_count": 0.0, + "step": 1416, + "text_loss": 0.5502325892448425 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.657469914881127, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0009799954445062296, + "loss": 0.0193, + "macro_f1": 0.6603773832321167, + "num_tokens": 2292317.0, + "repeat_count": 1.0, + "routers_loss": 0.011264479719102383, + "skip_count": 1.0, + "step": 1418, + "text_loss": 0.48075684905052185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.666862342236572, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009799086782512686, + "loss": 0.0292, + "macro_f1": 0.5492662787437439, + "num_tokens": 2295935.0, + "repeat_count": 0.0, + "routers_loss": 0.02833271212875843, + "skip_count": 2.0, + "step": 1420, + "text_loss": 0.18221206963062286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09375, + "learning_rate": 0.0009798217280943967, + "loss": 0.0356, + "macro_f1": 0.6666666865348816, + "num_tokens": 2298927.0, + "repeat_count": 0.0, + "routers_loss": 0.009208574891090393, + "skip_count": 1.0, + "step": 1422, + "text_loss": 0.48686322569847107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.685647196947461, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009797345940689335, + "loss": 0.0267, + "macro_f1": 0.3272727429866791, + "num_tokens": 2301541.0, + "repeat_count": 0.0, + "routers_loss": 0.015011847950518131, + "skip_count": 0.0, + "step": 1424, + "text_loss": 0.49446266889572144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.4000000059604645, + "avg_layers": 26.0, + "epoch": 6.695039624302906, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.0, + "f1_skip": 0.5714285969734192, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009796472762082687, + "loss": 0.0338, + "macro_f1": 0.5034013986587524, + "num_tokens": 2304589.0, + "repeat_count": 0.0, + "routers_loss": 0.05912091210484505, + "skip_count": 5.0, + "step": 1426, + "text_loss": 0.23945684731006622 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.70443205165835, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.000979559774545863, + "loss": 0.0405, + "macro_f1": 0.3272727429866791, + "num_tokens": 2307860.0, + "repeat_count": 0.0, + "routers_loss": 0.021242303773760796, + "skip_count": 1.0, + "step": 1428, + "text_loss": 0.531273365020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.000979472089115247, + "loss": 0.0276, + "macro_f1": 0.32098764181137085, + "num_tokens": 2311581.0, + "repeat_count": 0.0, + "routers_loss": 0.02768544852733612, + "skip_count": 2.0, + "step": 1430, + "text_loss": 0.2497459501028061 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.000979384219950022, + "loss": 0.0346, + "macro_f1": 0.3333333432674408, + "num_tokens": 2314639.0, + "repeat_count": 0.0, + "routers_loss": 0.008678150363266468, + "skip_count": 0.0, + "step": 1432, + "text_loss": 0.6579355001449585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.732609333724684, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08056640625, + "learning_rate": 0.0009792961670838595, + "loss": 0.0362, + "macro_f1": 0.3272727429866791, + "num_tokens": 2317927.0, + "repeat_count": 1.0, + "routers_loss": 0.03325597569346428, + "skip_count": 0.0, + "step": 1434, + "text_loss": 0.5209436416625977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.742001761080129, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009792079305505016, + "loss": 0.0306, + "macro_f1": 0.3272727429866791, + "num_tokens": 2321065.0, + "repeat_count": 1.0, + "routers_loss": 0.019228918477892876, + "skip_count": 0.0, + "step": 1436, + "text_loss": 0.41087067127227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.000979119510383761, + "loss": 0.0371, + "macro_f1": 0.3333333432674408, + "num_tokens": 2323714.0, + "repeat_count": 0.0, + "routers_loss": 0.017071325331926346, + "skip_count": 0.0, + "step": 1438, + "text_loss": 0.21490029990673065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.760786615791019, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.00097903090661752, + "loss": 0.0309, + "macro_f1": 0.3333333432674408, + "num_tokens": 2326454.0, + "repeat_count": 0.0, + "routers_loss": 0.00991755723953247, + "skip_count": 0.0, + "step": 1440, + "text_loss": 0.23847346007823944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.232421875, + "learning_rate": 0.000978942119285732, + "loss": 0.0404, + "macro_f1": 0.3272727429866791, + "num_tokens": 2329462.0, + "repeat_count": 0.0, + "routers_loss": 0.04908733069896698, + "skip_count": 1.0, + "step": 1442, + "text_loss": 0.23343028128147125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.7795714705019074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009788531484224204, + "loss": 0.0264, + "macro_f1": 0.3333333432674408, + "num_tokens": 2332146.0, + "repeat_count": 0.0, + "routers_loss": 0.0032628148328512907, + "skip_count": 0.0, + "step": 1444, + "text_loss": 0.47423800826072693 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 6.788963897857353, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009787639940616788, + "loss": 0.0405, + "macro_f1": 0.7018141150474548, + "num_tokens": 2335738.0, + "repeat_count": 1.0, + "routers_loss": 0.14336998760700226, + "skip_count": 3.0, + "step": 1446, + "text_loss": 0.21837592124938965 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.189453125, + "learning_rate": 0.0009786746562376717, + "loss": 0.0241, + "macro_f1": 0.6666666865348816, + "num_tokens": 2338488.0, + "repeat_count": 0.0, + "routers_loss": 0.010542908683419228, + "skip_count": 1.0, + "step": 1448, + "text_loss": 1.0614757537841797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.807748752568242, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009785851349846334, + "loss": 0.0268, + "macro_f1": 0.3333333432674408, + "num_tokens": 2342074.0, + "repeat_count": 0.0, + "routers_loss": 0.005998016335070133, + "skip_count": 0.0, + "step": 1450, + "text_loss": 0.4269719421863556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 6.817141179923686, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009784954303368686, + "loss": 0.0384, + "macro_f1": 0.44705885648727417, + "num_tokens": 2345838.0, + "repeat_count": 0.0, + "routers_loss": 0.0959126204252243, + "skip_count": 3.0, + "step": 1452, + "text_loss": 0.3315916955471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009784055423287521, + "loss": 0.0218, + "macro_f1": 0.3333333432674408, + "num_tokens": 2348939.0, + "repeat_count": 0.0, + "routers_loss": 0.0025467623490840197, + "skip_count": 0.0, + "step": 1454, + "text_loss": 0.6162732839584351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.835926034634576, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009783154709947293, + "loss": 0.0256, + "macro_f1": 0.3272727429866791, + "num_tokens": 2352232.0, + "repeat_count": 0.0, + "routers_loss": 0.01860538125038147, + "skip_count": 1.0, + "step": 1456, + "text_loss": 0.23928768932819366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.84531846199002, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009782252163693158, + "loss": 0.0201, + "macro_f1": 0.3272727429866791, + "num_tokens": 2355159.0, + "repeat_count": 0.0, + "routers_loss": 0.04412713274359703, + "skip_count": 1.0, + "step": 1458, + "text_loss": 0.3371323347091675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.0009781347784870973, + "loss": 0.0379, + "macro_f1": 0.3333333432674408, + "num_tokens": 2358175.0, + "repeat_count": 0.0, + "routers_loss": 0.006809141952544451, + "skip_count": 0.0, + "step": 1460, + "text_loss": 0.547267735004425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.86410331670091, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009780441573827296, + "loss": 0.03, + "macro_f1": 0.3076923191547394, + "num_tokens": 2360991.0, + "repeat_count": 0.0, + "routers_loss": 0.08924390375614166, + "skip_count": 4.0, + "step": 1462, + "text_loss": 0.7026563882827759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.000977953353090939, + "loss": 0.0272, + "macro_f1": 0.3333333432674408, + "num_tokens": 2363894.0, + "repeat_count": 0.0, + "routers_loss": 0.021858472377061844, + "skip_count": 0.0, + "step": 1464, + "text_loss": 0.2718065083026886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.882888171411799, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009778623656465219, + "loss": 0.0338, + "macro_f1": 0.32098764181137085, + "num_tokens": 2367265.0, + "repeat_count": 0.0, + "routers_loss": 0.044781096279621124, + "skip_count": 0.0, + "step": 1466, + "text_loss": 0.5008095502853394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.892280598767244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009777711950843448, + "loss": 0.0212, + "macro_f1": 0.3333333432674408, + "num_tokens": 2370186.0, + "repeat_count": 0.0, + "routers_loss": 0.0040459707379341125, + "skip_count": 0.0, + "step": 1468, + "text_loss": 0.5242461562156677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 6.901673026122689, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009776798414393446, + "loss": 0.0279, + "macro_f1": 0.6598639488220215, + "num_tokens": 2373314.0, + "repeat_count": 1.0, + "routers_loss": 0.0708528608083725, + "skip_count": 3.0, + "step": 1470, + "text_loss": 0.2821732461452484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.911065453478133, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1328125, + "learning_rate": 0.0009775883047465279, + "loss": 0.0414, + "macro_f1": 0.31446540355682373, + "num_tokens": 2376435.0, + "repeat_count": 1.0, + "routers_loss": 0.0290578193962574, + "skip_count": 1.0, + "step": 1472, + "text_loss": 0.8438440561294556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.9204578808335775, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10546875, + "learning_rate": 0.000977496585040972, + "loss": 0.0373, + "macro_f1": 0.3333333432674408, + "num_tokens": 2380244.0, + "repeat_count": 0.0, + "routers_loss": 0.010360375046730042, + "skip_count": 0.0, + "step": 1474, + "text_loss": 0.4356135427951813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.929850308189023, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.000977404682357824, + "loss": 0.0294, + "macro_f1": 0.3272727429866791, + "num_tokens": 2383498.0, + "repeat_count": 0.0, + "routers_loss": 0.023518972098827362, + "skip_count": 0.0, + "step": 1476, + "text_loss": 0.25195425748825073 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 6.939242735544467, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.000977312596732301, + "loss": 0.0375, + "macro_f1": 0.9544159770011902, + "num_tokens": 2386414.0, + "repeat_count": 5.0, + "routers_loss": 0.08190606534481049, + "skip_count": 4.0, + "step": 1478, + "text_loss": 0.6586798429489136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10546875, + "learning_rate": 0.0009772203281996905, + "loss": 0.0336, + "macro_f1": 1.0, + "num_tokens": 2389399.0, + "repeat_count": 1.0, + "routers_loss": 0.016441475600004196, + "skip_count": 2.0, + "step": 1480, + "text_loss": 0.3671986758708954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009771278767953502, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 2392400.0, + "repeat_count": 0.0, + "routers_loss": 0.019211363047361374, + "skip_count": 0.0, + "step": 1482, + "text_loss": 0.27418580651283264 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.967420017610801, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009770352425547072, + "loss": 0.0292, + "macro_f1": 0.3333333432674408, + "num_tokens": 2395123.0, + "repeat_count": 0.0, + "routers_loss": 0.015800386667251587, + "skip_count": 0.0, + "step": 1484, + "text_loss": 0.19896622002124786 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.976812444966246, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009769424255132596, + "loss": 0.0256, + "macro_f1": 0.4871794879436493, + "num_tokens": 2397359.0, + "repeat_count": 3.0, + "routers_loss": 0.06670158356428146, + "skip_count": 0.0, + "step": 1486, + "text_loss": 0.4229799509048462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.98620487232169, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1162109375, + "learning_rate": 0.0009768494257065747, + "loss": 0.0218, + "macro_f1": 0.3272727429866791, + "num_tokens": 2400387.0, + "repeat_count": 0.0, + "routers_loss": 0.011144762858748436, + "skip_count": 1.0, + "step": 1488, + "text_loss": 0.4264226257801056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.995597299677136, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0009767562431702904, + "loss": 0.0387, + "macro_f1": 0.3006536364555359, + "num_tokens": 2403241.0, + "repeat_count": 2.0, + "routers_loss": 0.12339717149734497, + "skip_count": 3.0, + "step": 1490, + "text_loss": 0.2850193977355957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.004696213677723, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0009766628779401142, + "loss": 0.0215, + "macro_f1": 0.6666666865348816, + "num_tokens": 2406087.0, + "repeat_count": 0.0, + "routers_loss": 0.008174685761332512, + "skip_count": 1.0, + "step": 1492, + "text_loss": 0.6756544709205627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.000976569330051824, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 2409312.0, + "repeat_count": 0.0, + "routers_loss": 0.0021256296895444393, + "skip_count": 0.0, + "step": 1494, + "text_loss": 0.4789894223213196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.0234810683886115, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009764755995412677, + "loss": 0.0193, + "macro_f1": 0.3333333432674408, + "num_tokens": 2412758.0, + "repeat_count": 0.0, + "routers_loss": 0.003944927826523781, + "skip_count": 0.0, + "step": 1496, + "text_loss": 0.5157490968704224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.032873495744056, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009763816864443627, + "loss": 0.0239, + "macro_f1": 0.3272727429866791, + "num_tokens": 2416079.0, + "repeat_count": 1.0, + "routers_loss": 0.03893325850367546, + "skip_count": 0.0, + "step": 1498, + "text_loss": 0.28045418858528137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009762875907970968, + "loss": 0.0199, + "macro_f1": 0.3333333432674408, + "num_tokens": 2420340.0, + "repeat_count": 0.0, + "routers_loss": 0.0017725443467497826, + "skip_count": 0.0, + "step": 1500, + "text_loss": 0.35550856590270996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.051658350454946, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009761933126355277, + "loss": 0.0245, + "macro_f1": 0.3272727429866791, + "num_tokens": 2424735.0, + "repeat_count": 0.0, + "routers_loss": 0.01393749937415123, + "skip_count": 1.0, + "step": 1502, + "text_loss": 0.38840189576148987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009760988519957828, + "loss": 0.0249, + "macro_f1": 0.6666666865348816, + "num_tokens": 2428132.0, + "repeat_count": 0.0, + "routers_loss": 0.01687910407781601, + "skip_count": 2.0, + "step": 1504, + "text_loss": 0.3031681478023529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.0704432051658355, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009760042089140598, + "loss": 0.0193, + "macro_f1": 0.3144654333591461, + "num_tokens": 2431592.0, + "repeat_count": 1.0, + "routers_loss": 0.04704280197620392, + "skip_count": 2.0, + "step": 1506, + "text_loss": 0.16355200111865997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009759093834266259, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 2434236.0, + "repeat_count": 0.0, + "routers_loss": 0.0016075772000476718, + "skip_count": 0.0, + "step": 1508, + "text_loss": 0.6080073118209839 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009758143755698186, + "loss": 0.015, + "macro_f1": 0.3333333432674408, + "num_tokens": 2437170.0, + "repeat_count": 0.0, + "routers_loss": 0.008451299741864204, + "skip_count": 0.0, + "step": 1510, + "text_loss": 0.22100484371185303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 7.098620487232169, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009757191853800449, + "loss": 0.0227, + "macro_f1": 0.5866667032241821, + "num_tokens": 2441187.0, + "repeat_count": 1.0, + "routers_loss": 0.046565692871809006, + "skip_count": 3.0, + "step": 1512, + "text_loss": 0.25098952651023865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.108012914587614, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.000975623812893782, + "loss": 0.0276, + "macro_f1": 0.3272727429866791, + "num_tokens": 2444664.0, + "repeat_count": 0.0, + "routers_loss": 0.02872578240931034, + "skip_count": 1.0, + "step": 1514, + "text_loss": 0.4952253997325897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.1174053419430585, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.0009755282581475768, + "loss": 0.0233, + "macro_f1": 0.3333333432674408, + "num_tokens": 2447748.0, + "repeat_count": 0.0, + "routers_loss": 0.002055214950814843, + "skip_count": 0.0, + "step": 1516, + "text_loss": 0.7465500831604004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.126797769298503, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10302734375, + "learning_rate": 0.000975432521178046, + "loss": 0.0216, + "macro_f1": 0.3272727429866791, + "num_tokens": 2450834.0, + "repeat_count": 1.0, + "routers_loss": 0.04498551785945892, + "skip_count": 0.0, + "step": 1518, + "text_loss": 0.28144413232803345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009753366020218763, + "loss": 0.0234, + "macro_f1": 0.3333333432674408, + "num_tokens": 2454233.0, + "repeat_count": 0.0, + "routers_loss": 0.003669742727652192, + "skip_count": 0.0, + "step": 1520, + "text_loss": 0.5667551755905151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009752405007158238, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2457331.0, + "repeat_count": 0.0, + "routers_loss": 0.010455607436597347, + "skip_count": 0.0, + "step": 1522, + "text_loss": 0.19575810432434082 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 7.154975051364837, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009751442172967151, + "loss": 0.0193, + "macro_f1": 0.8823530077934265, + "num_tokens": 2459935.0, + "repeat_count": 2.0, + "routers_loss": 0.025189083069562912, + "skip_count": 1.0, + "step": 1524, + "text_loss": 0.45453405380249023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.164367478720282, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.000975047751801446, + "loss": 0.0187, + "macro_f1": 0.3272727429866791, + "num_tokens": 2463008.0, + "repeat_count": 0.0, + "routers_loss": 0.012297490611672401, + "skip_count": 0.0, + "step": 1526, + "text_loss": 0.31437572836875916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009749511042669823, + "loss": 0.0233, + "macro_f1": 0.3333333432674408, + "num_tokens": 2466475.0, + "repeat_count": 0.0, + "routers_loss": 0.011026266030967236, + "skip_count": 0.0, + "step": 1528, + "text_loss": 0.46604859828948975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.183152333431171, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009748542747303595, + "loss": 0.0182, + "macro_f1": 0.3272727429866791, + "num_tokens": 2469320.0, + "repeat_count": 0.0, + "routers_loss": 0.011934996582567692, + "skip_count": 1.0, + "step": 1530, + "text_loss": 0.7764923572540283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009747572632286827, + "loss": 0.0203, + "macro_f1": 0.3333333432674408, + "num_tokens": 2472468.0, + "repeat_count": 0.0, + "routers_loss": 0.005786920432001352, + "skip_count": 0.0, + "step": 1532, + "text_loss": 0.3555782437324524 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009746600697991271, + "loss": 0.02, + "macro_f1": 0.6666666865348816, + "num_tokens": 2475736.0, + "repeat_count": 1.0, + "routers_loss": 0.0026990731712430716, + "skip_count": 0.0, + "step": 1534, + "text_loss": 0.49561792612075806 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 7.2113296154975055, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0556640625, + "learning_rate": 0.0009745626944789375, + "loss": 0.0204, + "macro_f1": 0.8823530077934265, + "num_tokens": 2478887.0, + "repeat_count": 1.0, + "routers_loss": 0.020221207290887833, + "skip_count": 2.0, + "step": 1536, + "text_loss": 0.5375416278839111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.22072204285295, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009744651373054279, + "loss": 0.0286, + "macro_f1": 0.3272727429866791, + "num_tokens": 2481293.0, + "repeat_count": 0.0, + "routers_loss": 0.03131086751818657, + "skip_count": 1.0, + "step": 1538, + "text_loss": 0.5241039395332336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 7.230114470208394, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.08984375, + "learning_rate": 0.0009743673983159828, + "loss": 0.0241, + "macro_f1": 0.6122449040412903, + "num_tokens": 2484403.0, + "repeat_count": 0.0, + "routers_loss": 0.04448170214891434, + "skip_count": 4.0, + "step": 1540, + "text_loss": 0.7465724349021912 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009742694775480557, + "loss": 0.0265, + "macro_f1": 0.6666666865348816, + "num_tokens": 2487952.0, + "repeat_count": 0.0, + "routers_loss": 0.007171491626650095, + "skip_count": 1.0, + "step": 1542, + "text_loss": 0.2877117097377777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009741713750391703, + "loss": 0.0171, + "macro_f1": 0.6666666865348816, + "num_tokens": 2490815.0, + "repeat_count": 1.0, + "routers_loss": 0.004559285007417202, + "skip_count": 0.0, + "step": 1544, + "text_loss": 0.6097800135612488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.258291752274729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009740730908269193, + "loss": 0.0174, + "macro_f1": 0.3333333432674408, + "num_tokens": 2494727.0, + "repeat_count": 0.0, + "routers_loss": 0.005271553061902523, + "skip_count": 0.0, + "step": 1546, + "text_loss": 0.5431114435195923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009739746249489658, + "loss": 0.0239, + "macro_f1": 0.3333333432674408, + "num_tokens": 2499266.0, + "repeat_count": 0.0, + "routers_loss": 0.0015409323386847973, + "skip_count": 0.0, + "step": 1548, + "text_loss": 0.4702678322792053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.277076606985618, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1171875, + "learning_rate": 0.0009738759774430417, + "loss": 0.0216, + "macro_f1": 0.32098764181137085, + "num_tokens": 2502273.0, + "repeat_count": 1.0, + "routers_loss": 0.030183158814907074, + "skip_count": 1.0, + "step": 1550, + "text_loss": 0.3239189088344574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.286469034341063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009737771483469493, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 2507624.0, + "repeat_count": 0.0, + "routers_loss": 0.005410848651081324, + "skip_count": 0.0, + "step": 1552, + "text_loss": 0.4014642834663391 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009736781376985598, + "loss": 0.0168, + "macro_f1": 0.6666666865348816, + "num_tokens": 2510366.0, + "repeat_count": 0.0, + "routers_loss": 0.0066976165398955345, + "skip_count": 1.0, + "step": 1554, + "text_loss": 0.5924848914146423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.13671875, + "learning_rate": 0.0009735789455358144, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 2513317.0, + "repeat_count": 0.0, + "routers_loss": 0.002763477386906743, + "skip_count": 0.0, + "step": 1556, + "text_loss": 0.3222943842411041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.314646316407397, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009734795718967237, + "loss": 0.0283, + "macro_f1": 0.32098764181137085, + "num_tokens": 2516628.0, + "repeat_count": 0.0, + "routers_loss": 0.061566028743982315, + "skip_count": 2.0, + "step": 1558, + "text_loss": 0.3249334692955017 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009733800168193679, + "loss": 0.0228, + "macro_f1": 1.0, + "num_tokens": 2519424.0, + "repeat_count": 2.0, + "routers_loss": 0.017976421862840652, + "skip_count": 4.0, + "step": 1560, + "text_loss": 0.3341919481754303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.0009732802803418966, + "loss": 0.023, + "macro_f1": 0.3333333432674408, + "num_tokens": 2522922.0, + "repeat_count": 0.0, + "routers_loss": 0.002525332849472761, + "skip_count": 0.0, + "step": 1562, + "text_loss": 0.3176332712173462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.34282359847373, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07861328125, + "learning_rate": 0.0009731803625025292, + "loss": 0.0196, + "macro_f1": 0.3272727429866791, + "num_tokens": 2525811.0, + "repeat_count": 0.0, + "routers_loss": 0.015524424612522125, + "skip_count": 1.0, + "step": 1564, + "text_loss": 0.532774031162262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.3522160258291755, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009730802633395541, + "loss": 0.0257, + "macro_f1": 0.6603773832321167, + "num_tokens": 2529157.0, + "repeat_count": 1.0, + "routers_loss": 0.08138631284236908, + "skip_count": 1.0, + "step": 1566, + "text_loss": 0.529487133026123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009729799828913298, + "loss": 0.0223, + "macro_f1": 0.3333333432674408, + "num_tokens": 2532249.0, + "repeat_count": 0.0, + "routers_loss": 0.0035867292899638414, + "skip_count": 0.0, + "step": 1568, + "text_loss": 0.503160297870636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.371000880540064, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009728795211962838, + "loss": 0.0259, + "macro_f1": 0.5492662787437439, + "num_tokens": 2535904.0, + "repeat_count": 0.0, + "routers_loss": 0.02987455204129219, + "skip_count": 2.0, + "step": 1570, + "text_loss": 0.9170270562171936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.380393307895509, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11865234375, + "learning_rate": 0.0009727788782929131, + "loss": 0.0273, + "macro_f1": 0.3272727429866791, + "num_tokens": 2538943.0, + "repeat_count": 1.0, + "routers_loss": 0.04676021635532379, + "skip_count": 0.0, + "step": 1572, + "text_loss": 0.29146310687065125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.389785735250954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009726780542197844, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 2541805.0, + "repeat_count": 0.0, + "routers_loss": 0.002127803163602948, + "skip_count": 0.0, + "step": 1574, + "text_loss": 1.0126502513885498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009725770490155338, + "loss": 0.0262, + "macro_f1": 0.3333333432674408, + "num_tokens": 2546213.0, + "repeat_count": 0.0, + "routers_loss": 0.007609677035361528, + "skip_count": 0.0, + "step": 1576, + "text_loss": 0.190168559551239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.408570589961843, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009724758627188665, + "loss": 0.0356, + "macro_f1": 0.3272727429866791, + "num_tokens": 2549554.0, + "repeat_count": 0.0, + "routers_loss": 0.033554721623659134, + "skip_count": 1.0, + "step": 1578, + "text_loss": 0.2977406084537506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.4179630173172875, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009723744953685572, + "loss": 0.028, + "macro_f1": 0.3272727429866791, + "num_tokens": 2552785.0, + "repeat_count": 1.0, + "routers_loss": 0.027864238247275352, + "skip_count": 0.0, + "step": 1580, + "text_loss": 0.2700682580471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19921875, + "learning_rate": 0.0009722729470034503, + "loss": 0.0224, + "macro_f1": 0.3333333432674408, + "num_tokens": 2556550.0, + "repeat_count": 0.0, + "routers_loss": 0.004798175301402807, + "skip_count": 0.0, + "step": 1582, + "text_loss": 0.6559903025627136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.436747872028177, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.078125, + "learning_rate": 0.0009721712176624591, + "loss": 0.0242, + "macro_f1": 0.3333333432674408, + "num_tokens": 2559862.0, + "repeat_count": 0.0, + "routers_loss": 0.013764148578047752, + "skip_count": 0.0, + "step": 1584, + "text_loss": 0.2257535308599472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.446140299383622, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009720693073845667, + "loss": 0.032, + "macro_f1": 0.5492662787437439, + "num_tokens": 2562766.0, + "repeat_count": 0.0, + "routers_loss": 0.01937069371342659, + "skip_count": 2.0, + "step": 1586, + "text_loss": 0.178413525223732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.455532726739067, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009719672162088252, + "loss": 0.0306, + "macro_f1": 0.32098767161369324, + "num_tokens": 2566583.0, + "repeat_count": 1.0, + "routers_loss": 0.06224144622683525, + "skip_count": 0.0, + "step": 1588, + "text_loss": 0.3992367684841156 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 7.464925154094511, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.185546875, + "learning_rate": 0.0009718649441743559, + "loss": 0.0239, + "macro_f1": 0.9449735879898071, + "num_tokens": 2569516.0, + "repeat_count": 2.0, + "routers_loss": 0.06937911361455917, + "skip_count": 4.0, + "step": 1590, + "text_loss": 0.1945122629404068 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.00097176249132035, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2572418.0, + "repeat_count": 0.0, + "routers_loss": 0.0034326619934290648, + "skip_count": 0.0, + "step": 1592, + "text_loss": 0.6259906888008118 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009716598576860676, + "loss": 0.0278, + "macro_f1": 0.6666666865348816, + "num_tokens": 2575235.0, + "repeat_count": 1.0, + "routers_loss": 0.004557516425848007, + "skip_count": 0.0, + "step": 1594, + "text_loss": 0.6638736724853516 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 7.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009715570433108378, + "loss": 0.0198, + "macro_f1": 1.0, + "num_tokens": 2578157.0, + "repeat_count": 1.0, + "routers_loss": 0.015363055281341076, + "skip_count": 1.0, + "step": 1596, + "text_loss": 0.6530464887619019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009714540482340595, + "loss": 0.0268, + "macro_f1": 0.6666666865348816, + "num_tokens": 2581801.0, + "repeat_count": 1.0, + "routers_loss": 0.01257144846022129, + "skip_count": 0.0, + "step": 1598, + "text_loss": 0.5916110277175903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.5118872908717345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009713508724952006, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 2585204.0, + "repeat_count": 0.0, + "routers_loss": 0.003175645601004362, + "skip_count": 0.0, + "step": 1600, + "text_loss": 0.27901601791381836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0009712475161337981, + "loss": 0.0261, + "macro_f1": 0.3333333432674408, + "num_tokens": 2588286.0, + "repeat_count": 0.0, + "routers_loss": 0.004122321493923664, + "skip_count": 0.0, + "step": 1602, + "text_loss": 0.42420244216918945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009711439791894585, + "loss": 0.0341, + "macro_f1": 0.6666666865348816, + "num_tokens": 2591476.0, + "repeat_count": 0.0, + "routers_loss": 0.011215819045901299, + "skip_count": 1.0, + "step": 1604, + "text_loss": 0.5549933910369873 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.540064572938069, + "f1_execute": 0.9599999785423279, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.0703125, + "learning_rate": 0.0009710402617018574, + "loss": 0.0172, + "macro_f1": 0.8200000524520874, + "num_tokens": 2594336.0, + "repeat_count": 1.0, + "routers_loss": 0.02916567400097847, + "skip_count": 2.0, + "step": 1606, + "text_loss": 0.3263779282569885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009709363637107393, + "loss": 0.0209, + "macro_f1": 0.6666666865348816, + "num_tokens": 2597462.0, + "repeat_count": 0.0, + "routers_loss": 0.015897957608103752, + "skip_count": 1.0, + "step": 1608, + "text_loss": 0.20917139947414398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009708322852559184, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2601543.0, + "repeat_count": 0.0, + "routers_loss": 0.002211357234045863, + "skip_count": 0.0, + "step": 1610, + "text_loss": 0.450550377368927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.568241855004403, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009707280263772776, + "loss": 0.0277, + "macro_f1": 0.6666666865348816, + "num_tokens": 2604462.0, + "repeat_count": 0.0, + "routers_loss": 0.01615734025835991, + "skip_count": 2.0, + "step": 1612, + "text_loss": 0.6908381581306458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.577634282359847, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009706235871147688, + "loss": 0.0241, + "macro_f1": 0.5492662787437439, + "num_tokens": 2607484.0, + "repeat_count": 0.0, + "routers_loss": 0.022048067301511765, + "skip_count": 2.0, + "step": 1614, + "text_loss": 0.36691340804100037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.587026709715292, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10546875, + "learning_rate": 0.0009705189675084138, + "loss": 0.0176, + "macro_f1": 0.6666666865348816, + "num_tokens": 2610204.0, + "repeat_count": 0.0, + "routers_loss": 0.008503952994942665, + "skip_count": 1.0, + "step": 1616, + "text_loss": 0.5226598381996155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.596419137070737, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009704141675983029, + "loss": 0.0248, + "macro_f1": 0.3333333432674408, + "num_tokens": 2613128.0, + "repeat_count": 0.0, + "routers_loss": 0.0019020626787096262, + "skip_count": 0.0, + "step": 1618, + "text_loss": 0.6465088725090027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5714285969734192, + "avg_layers": 24.0, + "epoch": 7.6058115644261814, + "f1_execute": 0.9333333373069763, + "f1_repeat": 0.0, + "f1_skip": 0.7272727489471436, + "grad_norm": 0.107421875, + "learning_rate": 0.0009703091874245956, + "loss": 0.032, + "macro_f1": 0.5535354018211365, + "num_tokens": 2616360.0, + "repeat_count": 0.0, + "routers_loss": 0.11837691068649292, + "skip_count": 7.0, + "step": 1620, + "text_loss": 0.2987039089202881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009702040270275204, + "loss": 0.0181, + "macro_f1": 0.3333333432674408, + "num_tokens": 2619606.0, + "repeat_count": 0.0, + "routers_loss": 0.0065958453342318535, + "skip_count": 0.0, + "step": 1622, + "text_loss": 0.6262096166610718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.103515625, + "learning_rate": 0.000970098686447375, + "loss": 0.0257, + "macro_f1": 0.6666666865348816, + "num_tokens": 2622499.0, + "repeat_count": 0.0, + "routers_loss": 0.013632026500999928, + "skip_count": 1.0, + "step": 1624, + "text_loss": 0.2392602562904358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.633988846492516, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.125, + "learning_rate": 0.0009699931657245264, + "loss": 0.0245, + "macro_f1": 0.5492662787437439, + "num_tokens": 2626002.0, + "repeat_count": 0.0, + "routers_loss": 0.012147823348641396, + "skip_count": 2.0, + "step": 1626, + "text_loss": 0.4742976129055023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009698874648994098, + "loss": 0.0285, + "macro_f1": 1.0, + "num_tokens": 2629847.0, + "repeat_count": 1.0, + "routers_loss": 0.010692884214222431, + "skip_count": 3.0, + "step": 1628, + "text_loss": 0.5090685486793518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.6527737012034045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009697815840125304, + "loss": 0.0265, + "macro_f1": 0.3333333432674408, + "num_tokens": 2633529.0, + "repeat_count": 0.0, + "routers_loss": 0.011442207731306553, + "skip_count": 0.0, + "step": 1630, + "text_loss": 0.1874329298734665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2119140625, + "learning_rate": 0.0009696755231044618, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 2636321.0, + "repeat_count": 0.0, + "routers_loss": 0.0026681360322982073, + "skip_count": 0.0, + "step": 1632, + "text_loss": 0.7650400400161743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.671558555914294, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10498046875, + "learning_rate": 0.0009695692822158466, + "loss": 0.0242, + "macro_f1": 0.3272727429866791, + "num_tokens": 2638840.0, + "repeat_count": 1.0, + "routers_loss": 0.033965807408094406, + "skip_count": 0.0, + "step": 1634, + "text_loss": 0.6175784468650818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009694628613873968, + "loss": 0.018, + "macro_f1": 0.3333333432674408, + "num_tokens": 2641886.0, + "repeat_count": 0.0, + "routers_loss": 0.007568214554339647, + "skip_count": 0.0, + "step": 1636, + "text_loss": 0.43139931559562683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.690343410625183, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009693562606598929, + "loss": 0.025, + "macro_f1": 0.3333333432674408, + "num_tokens": 2645028.0, + "repeat_count": 0.0, + "routers_loss": 0.004973865579813719, + "skip_count": 0.0, + "step": 1638, + "text_loss": 0.6430339217185974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.699735837980628, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009692494800741844, + "loss": 0.0313, + "macro_f1": 0.3272727429866791, + "num_tokens": 2648209.0, + "repeat_count": 1.0, + "routers_loss": 0.049863800406455994, + "skip_count": 0.0, + "step": 1640, + "text_loss": 0.28138160705566406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.709128265336073, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.0009691425196711901, + "loss": 0.0398, + "macro_f1": 0.3272727429866791, + "num_tokens": 2651171.0, + "repeat_count": 0.0, + "routers_loss": 0.02112230286002159, + "skip_count": 0.0, + "step": 1642, + "text_loss": 0.3745322525501251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.718520692691517, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009690353794918971, + "loss": 0.0275, + "macro_f1": 0.3333333432674408, + "num_tokens": 2654093.0, + "repeat_count": 0.0, + "routers_loss": 0.0024304776452481747, + "skip_count": 0.0, + "step": 1644, + "text_loss": 0.4275154173374176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.000968928059577362, + "loss": 0.0244, + "macro_f1": 0.6666666865348816, + "num_tokens": 2657079.0, + "repeat_count": 0.0, + "routers_loss": 0.009320619516074657, + "skip_count": 1.0, + "step": 1646, + "text_loss": 0.46650025248527527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.737305547402407, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009688205599687099, + "loss": 0.0209, + "macro_f1": 0.3272727429866791, + "num_tokens": 2660951.0, + "repeat_count": 0.0, + "routers_loss": 0.011913162656128407, + "skip_count": 0.0, + "step": 1648, + "text_loss": 0.46644100546836853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.7466979747578515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009687128807071347, + "loss": 0.0284, + "macro_f1": 0.3333333432674408, + "num_tokens": 2663823.0, + "repeat_count": 0.0, + "routers_loss": 0.013754756189882755, + "skip_count": 0.0, + "step": 1650, + "text_loss": 0.40808847546577454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.0009686050218338996, + "loss": 0.0286, + "macro_f1": 0.3333333432674408, + "num_tokens": 2667079.0, + "repeat_count": 0.0, + "routers_loss": 0.009099726565182209, + "skip_count": 0.0, + "step": 1652, + "text_loss": 0.2389989197254181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009684969833903359, + "loss": 0.0283, + "macro_f1": 0.6666666865348816, + "num_tokens": 2670162.0, + "repeat_count": 0.0, + "routers_loss": 0.0034928603563457727, + "skip_count": 1.0, + "step": 1654, + "text_loss": 0.6930749416351318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.774875256824186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009683887654178445, + "loss": 0.0261, + "macro_f1": 0.6666666865348816, + "num_tokens": 2673031.0, + "repeat_count": 0.0, + "routers_loss": 0.008340462110936642, + "skip_count": 1.0, + "step": 1656, + "text_loss": 0.277752548456192 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009682803679578947, + "loss": 0.0259, + "macro_f1": 0.3333333432674408, + "num_tokens": 2676092.0, + "repeat_count": 0.0, + "routers_loss": 0.004337446764111519, + "skip_count": 0.0, + "step": 1658, + "text_loss": 0.5176776051521301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.7936601115350745, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009681717910520244, + "loss": 0.0242, + "macro_f1": 0.32098764181137085, + "num_tokens": 2679479.0, + "repeat_count": 0.0, + "routers_loss": 0.034611742943525314, + "skip_count": 2.0, + "step": 1660, + "text_loss": 0.21485982835292816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.80305253889052, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009680630347418406, + "loss": 0.022, + "macro_f1": 0.5492662787437439, + "num_tokens": 2683289.0, + "repeat_count": 0.0, + "routers_loss": 0.03297121450304985, + "skip_count": 2.0, + "step": 1662, + "text_loss": 0.33801013231277466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.812444966245964, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.000967954099069019, + "loss": 0.0411, + "macro_f1": 0.32098764181137085, + "num_tokens": 2685879.0, + "repeat_count": 1.0, + "routers_loss": 0.04551183059811592, + "skip_count": 1.0, + "step": 1664, + "text_loss": 0.41123488545417786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.821837393601409, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009678449840753038, + "loss": 0.0324, + "macro_f1": 0.32098764181137085, + "num_tokens": 2688910.0, + "repeat_count": 0.0, + "routers_loss": 0.05866450071334839, + "skip_count": 2.0, + "step": 1666, + "text_loss": 0.1740892380475998 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009677356898025082, + "loss": 0.023, + "macro_f1": 0.3333333432674408, + "num_tokens": 2691680.0, + "repeat_count": 0.0, + "routers_loss": 0.009243223816156387, + "skip_count": 0.0, + "step": 1668, + "text_loss": 0.2512350380420685 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.8406222483122985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.000967626216292514, + "loss": 0.0195, + "macro_f1": 0.3333333432674408, + "num_tokens": 2694895.0, + "repeat_count": 0.0, + "routers_loss": 0.005576452240347862, + "skip_count": 0.0, + "step": 1670, + "text_loss": 0.43294376134872437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 7.850014675667743, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.09130859375, + "learning_rate": 0.0009675165635872715, + "loss": 0.0306, + "macro_f1": 0.44705885648727417, + "num_tokens": 2697806.0, + "repeat_count": 0.0, + "routers_loss": 0.05372785031795502, + "skip_count": 3.0, + "step": 1672, + "text_loss": 0.1614082306623459 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009674067317288, + "loss": 0.0296, + "macro_f1": 0.6666666865348816, + "num_tokens": 2700529.0, + "repeat_count": 1.0, + "routers_loss": 0.018131591379642487, + "skip_count": 0.0, + "step": 1674, + "text_loss": 0.2093173861503601 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009672967207591869, + "loss": 0.0257, + "macro_f1": 0.3272727429866791, + "num_tokens": 2703650.0, + "repeat_count": 0.0, + "routers_loss": 0.0673515796661377, + "skip_count": 1.0, + "step": 1676, + "text_loss": 0.3029400110244751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.878191957734077, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009671865307205892, + "loss": 0.021, + "macro_f1": 0.32098767161369324, + "num_tokens": 2707615.0, + "repeat_count": 0.0, + "routers_loss": 0.03821169584989548, + "skip_count": 1.0, + "step": 1678, + "text_loss": 0.2262786477804184 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 7.8875843850895215, + "f1_execute": 0.9756097793579102, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009670761616552315, + "loss": 0.0465, + "macro_f1": 0.9615669250488281, + "num_tokens": 2710894.0, + "repeat_count": 2.0, + "routers_loss": 0.042625464498996735, + "skip_count": 6.0, + "step": 1680, + "text_loss": 0.29623574018478394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.896976812444966, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009669656136054074, + "loss": 0.0289, + "macro_f1": 0.3333333432674408, + "num_tokens": 2714330.0, + "repeat_count": 0.0, + "routers_loss": 0.0037571541033685207, + "skip_count": 0.0, + "step": 1682, + "text_loss": 0.7510389089584351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.906369239800411, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0009668548866134795, + "loss": 0.0256, + "macro_f1": 0.3333333432674408, + "num_tokens": 2717176.0, + "repeat_count": 0.0, + "routers_loss": 0.004142968449741602, + "skip_count": 0.0, + "step": 1684, + "text_loss": 0.3273485600948334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009667439807218783, + "loss": 0.0233, + "macro_f1": 0.6666666865348816, + "num_tokens": 2720628.0, + "repeat_count": 0.0, + "routers_loss": 0.008753842674195766, + "skip_count": 2.0, + "step": 1686, + "text_loss": 0.4314708709716797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.9251540945113, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009666328959731033, + "loss": 0.0211, + "macro_f1": 0.6603773832321167, + "num_tokens": 2723739.0, + "repeat_count": 1.0, + "routers_loss": 0.022674910724163055, + "skip_count": 1.0, + "step": 1688, + "text_loss": 0.25734150409698486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 7.934546521866745, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009665216324097222, + "loss": 0.0324, + "macro_f1": 0.5934640765190125, + "num_tokens": 2726644.0, + "repeat_count": 0.0, + "routers_loss": 0.03932750225067139, + "skip_count": 3.0, + "step": 1690, + "text_loss": 0.24511034786701202 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.94393894922219, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.0009664101900743714, + "loss": 0.0255, + "macro_f1": 0.3272727429866791, + "num_tokens": 2729662.0, + "repeat_count": 0.0, + "routers_loss": 0.012672754004597664, + "skip_count": 1.0, + "step": 1692, + "text_loss": 0.39431414008140564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.953331376577634, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.076171875, + "learning_rate": 0.000966298569009756, + "loss": 0.0231, + "macro_f1": 0.5492662787437439, + "num_tokens": 2732578.0, + "repeat_count": 0.0, + "routers_loss": 0.01548632513731718, + "skip_count": 2.0, + "step": 1694, + "text_loss": 0.12439999729394913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.962723803933079, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009661867692586494, + "loss": 0.0153, + "macro_f1": 0.32098764181137085, + "num_tokens": 2735887.0, + "repeat_count": 0.0, + "routers_loss": 0.05622401833534241, + "skip_count": 2.0, + "step": 1696, + "text_loss": 0.29024389386177063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.972116231288524, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.087890625, + "learning_rate": 0.0009660747908638933, + "loss": 0.0205, + "macro_f1": 0.3272727429866791, + "num_tokens": 2739293.0, + "repeat_count": 0.0, + "routers_loss": 0.041060201823711395, + "skip_count": 1.0, + "step": 1698, + "text_loss": 0.39461007714271545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.9815086586439685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1767578125, + "learning_rate": 0.0009659626338683981, + "loss": 0.0369, + "macro_f1": 0.3333333432674408, + "num_tokens": 2742468.0, + "repeat_count": 0.0, + "routers_loss": 0.007251353468745947, + "skip_count": 0.0, + "step": 1700, + "text_loss": 0.2751767635345459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.990901085999413, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009658502983151427, + "loss": 0.0186, + "macro_f1": 0.3272727429866791, + "num_tokens": 2745123.0, + "repeat_count": 0.0, + "routers_loss": 0.012847424484789371, + "skip_count": 1.0, + "step": 1702, + "text_loss": 0.4756404757499695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009657377842471742, + "loss": 0.0313, + "macro_f1": 0.6666666865348816, + "num_tokens": 2748016.0, + "repeat_count": 0.0, + "routers_loss": 0.007060411386191845, + "skip_count": 1.0, + "step": 1704, + "text_loss": 0.9571210145950317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.009392427355445, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10009765625, + "learning_rate": 0.0009656250917076081, + "loss": 0.0188, + "macro_f1": 0.5492662787437439, + "num_tokens": 2750717.0, + "repeat_count": 0.0, + "routers_loss": 0.016748681664466858, + "skip_count": 2.0, + "step": 1706, + "text_loss": 0.14542843401432037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0009655122207396285, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 2753635.0, + "repeat_count": 0.0, + "routers_loss": 0.013607042841613293, + "skip_count": 0.0, + "step": 1708, + "text_loss": 0.21836471557617188 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009653991713864878, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2756643.0, + "repeat_count": 0.0, + "routers_loss": 0.0012097888393327594, + "skip_count": 0.0, + "step": 1710, + "text_loss": 0.635187029838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1171875, + "learning_rate": 0.0009652859436915066, + "loss": 0.0231, + "macro_f1": 0.3333333432674408, + "num_tokens": 2759432.0, + "repeat_count": 0.0, + "routers_loss": 0.006196760106831789, + "skip_count": 0.0, + "step": 1712, + "text_loss": 0.5629420876502991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009651725376980743, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 2762538.0, + "repeat_count": 0.0, + "routers_loss": 0.0042513771913945675, + "skip_count": 0.0, + "step": 1714, + "text_loss": 0.39522525668144226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 8.056354564132668, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009650589534496479, + "loss": 0.0194, + "macro_f1": 0.8194444179534912, + "num_tokens": 2765571.0, + "repeat_count": 2.0, + "routers_loss": 0.03596706688404083, + "skip_count": 3.0, + "step": 1716, + "text_loss": 0.6252416968345642 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009649451909897532, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 2769206.0, + "repeat_count": 0.0, + "routers_loss": 0.0025788163766264915, + "skip_count": 0.0, + "step": 1718, + "text_loss": 0.8851634860038757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009648312503619843, + "loss": 0.0265, + "macro_f1": 0.3333333432674408, + "num_tokens": 2772488.0, + "repeat_count": 0.0, + "routers_loss": 0.004443451762199402, + "skip_count": 0.0, + "step": 1720, + "text_loss": 0.8568580746650696 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 8.084531846199003, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009647171316100034, + "loss": 0.0265, + "macro_f1": 0.9265305995941162, + "num_tokens": 2776482.0, + "repeat_count": 1.0, + "routers_loss": 0.022948263213038445, + "skip_count": 3.0, + "step": 1722, + "text_loss": 0.13431036472320557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009646028347775409, + "loss": 0.0204, + "macro_f1": 0.6666666865348816, + "num_tokens": 2778966.0, + "repeat_count": 0.0, + "routers_loss": 0.011328035034239292, + "skip_count": 1.0, + "step": 1724, + "text_loss": 0.2085491120815277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009644883599083958, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2781968.0, + "repeat_count": 0.0, + "routers_loss": 0.002208018908277154, + "skip_count": 0.0, + "step": 1726, + "text_loss": 0.4948323965072632 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.112709128265337, + "f1_execute": 0.9411764740943909, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009643737070464349, + "loss": 0.0158, + "macro_f1": 0.6470588445663452, + "num_tokens": 2784666.0, + "repeat_count": 1.0, + "routers_loss": 0.04391832649707794, + "skip_count": 2.0, + "step": 1728, + "text_loss": 0.39060094952583313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0009642588762355935, + "loss": 0.0212, + "macro_f1": 0.6666666865348816, + "num_tokens": 2787558.0, + "repeat_count": 0.0, + "routers_loss": 0.004497280344367027, + "skip_count": 1.0, + "step": 1730, + "text_loss": 0.34908708930015564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009641438675198748, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 2790474.0, + "repeat_count": 0.0, + "routers_loss": 0.00583475548774004, + "skip_count": 0.0, + "step": 1732, + "text_loss": 0.5720033049583435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0009640286809433508, + "loss": 0.0235, + "macro_f1": 0.3333333432674408, + "num_tokens": 2793272.0, + "repeat_count": 0.0, + "routers_loss": 0.007826375775039196, + "skip_count": 0.0, + "step": 1734, + "text_loss": 0.32181721925735474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009639133165501606, + "loss": 0.0192, + "macro_f1": 0.3333333432674408, + "num_tokens": 2797726.0, + "repeat_count": 0.0, + "routers_loss": 0.0019055595621466637, + "skip_count": 0.0, + "step": 1736, + "text_loss": 0.620936393737793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009637977743845124, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2800706.0, + "repeat_count": 0.0, + "routers_loss": 0.0028302327264100313, + "skip_count": 0.0, + "step": 1738, + "text_loss": 0.6473138332366943 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009636820544906823, + "loss": 0.0146, + "macro_f1": 1.0, + "num_tokens": 2803847.0, + "repeat_count": 1.0, + "routers_loss": 0.01105099730193615, + "skip_count": 2.0, + "step": 1740, + "text_loss": 0.4401201903820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.178456119753449, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009635661569130141, + "loss": 0.0195, + "macro_f1": 0.5934640765190125, + "num_tokens": 2807235.0, + "repeat_count": 0.0, + "routers_loss": 0.02619045600295067, + "skip_count": 3.0, + "step": 1742, + "text_loss": 0.459264874458313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.187848547108894, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009634500816959202, + "loss": 0.0162, + "macro_f1": 0.6666666865348816, + "num_tokens": 2810396.0, + "repeat_count": 0.0, + "routers_loss": 0.007915694266557693, + "skip_count": 2.0, + "step": 1744, + "text_loss": 0.5084020495414734 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009633338288838805, + "loss": 0.0271, + "macro_f1": 0.5492662787437439, + "num_tokens": 2813215.0, + "repeat_count": 2.0, + "routers_loss": 0.08364596217870712, + "skip_count": 0.0, + "step": 1746, + "text_loss": 0.27681824564933777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 30.0, + "epoch": 8.206633401819783, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009632173985214438, + "loss": 0.0156, + "macro_f1": 0.8817967176437378, + "num_tokens": 2816452.0, + "repeat_count": 3.0, + "routers_loss": 0.028805451467633247, + "skip_count": 2.0, + "step": 1748, + "text_loss": 0.4678419530391693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.216025829175228, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.000963100790653226, + "loss": 0.0188, + "macro_f1": 0.3272727429866791, + "num_tokens": 2819364.0, + "repeat_count": 0.0, + "routers_loss": 0.03056817688047886, + "skip_count": 1.0, + "step": 1750, + "text_loss": 0.3078109920024872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009629840053239116, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2823469.0, + "repeat_count": 0.0, + "routers_loss": 0.0019477814203128219, + "skip_count": 0.0, + "step": 1752, + "text_loss": 0.45501336455345154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.000962867042578253, + "loss": 0.0173, + "macro_f1": 0.3333333432674408, + "num_tokens": 2826716.0, + "repeat_count": 0.0, + "routers_loss": 0.0032963966950774193, + "skip_count": 0.0, + "step": 1754, + "text_loss": 0.49234694242477417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.244203111241562, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009627499024610707, + "loss": 0.0239, + "macro_f1": 0.3272727429866791, + "num_tokens": 2829733.0, + "repeat_count": 0.0, + "routers_loss": 0.010289114899933338, + "skip_count": 1.0, + "step": 1756, + "text_loss": 0.22335539758205414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.253595538597006, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009626325850172527, + "loss": 0.0174, + "macro_f1": 0.3272727429866791, + "num_tokens": 2833350.0, + "repeat_count": 0.0, + "routers_loss": 0.03249066323041916, + "skip_count": 1.0, + "step": 1758, + "text_loss": 0.6581931114196777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009625150902917555, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 2836558.0, + "repeat_count": 0.0, + "routers_loss": 0.00870000571012497, + "skip_count": 0.0, + "step": 1760, + "text_loss": 0.22938725352287292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009623974183296031, + "loss": 0.0192, + "macro_f1": 0.3333333432674408, + "num_tokens": 2840560.0, + "repeat_count": 0.0, + "routers_loss": 0.007767196744680405, + "skip_count": 0.0, + "step": 1762, + "text_loss": 0.24473799765110016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009622795691758876, + "loss": 0.0244, + "macro_f1": 0.3333333432674408, + "num_tokens": 2843548.0, + "repeat_count": 0.0, + "routers_loss": 0.0021693643648177385, + "skip_count": 0.0, + "step": 1764, + "text_loss": 0.3084608018398285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009621615428757693, + "loss": 0.0149, + "macro_f1": 0.3333333432674408, + "num_tokens": 2847076.0, + "repeat_count": 0.0, + "routers_loss": 0.0024727333802729845, + "skip_count": 0.0, + "step": 1766, + "text_loss": 0.5251734852790833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.000962043339474476, + "loss": 0.0194, + "macro_f1": 0.3333333432674408, + "num_tokens": 2849751.0, + "repeat_count": 0.0, + "routers_loss": 0.005174890160560608, + "skip_count": 0.0, + "step": 1768, + "text_loss": 0.4410129189491272 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0009619249590173032, + "loss": 0.016, + "macro_f1": 0.6666666865348816, + "num_tokens": 2853916.0, + "repeat_count": 0.0, + "routers_loss": 0.006785830482840538, + "skip_count": 2.0, + "step": 1770, + "text_loss": 0.550076425075531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.31934253008512, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06591796875, + "learning_rate": 0.0009618064015496149, + "loss": 0.0192, + "macro_f1": 0.5934640765190125, + "num_tokens": 2857372.0, + "repeat_count": 0.0, + "routers_loss": 0.021370256319642067, + "skip_count": 3.0, + "step": 1772, + "text_loss": 0.1988629847764969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0009616876671168423, + "loss": 0.0162, + "macro_f1": 0.6666666865348816, + "num_tokens": 2861028.0, + "repeat_count": 0.0, + "routers_loss": 0.004313841462135315, + "skip_count": 1.0, + "step": 1774, + "text_loss": 0.42581331729888916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009615687557644847, + "loss": 0.0268, + "macro_f1": 0.3333333432674408, + "num_tokens": 2864847.0, + "repeat_count": 0.0, + "routers_loss": 0.0025742491707205772, + "skip_count": 0.0, + "step": 1776, + "text_loss": 0.46510905027389526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009614496675381093, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 2867392.0, + "repeat_count": 0.0, + "routers_loss": 0.0016813480760902166, + "skip_count": 0.0, + "step": 1778, + "text_loss": 0.5922174453735352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009613304024833507, + "loss": 0.0166, + "macro_f1": 0.3333333432674408, + "num_tokens": 2871273.0, + "repeat_count": 0.0, + "routers_loss": 0.004948933608829975, + "skip_count": 0.0, + "step": 1780, + "text_loss": 0.6776977777481079 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009612109606459117, + "loss": 0.0186, + "macro_f1": 1.0, + "num_tokens": 2874172.0, + "repeat_count": 1.0, + "routers_loss": 0.016950147226452827, + "skip_count": 2.0, + "step": 1782, + "text_loss": 0.48758944869041443 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.375697094217786, + "f1_execute": 0.9599999785423279, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08251953125, + "learning_rate": 0.0009610913420715623, + "loss": 0.0237, + "macro_f1": 0.7644444704055786, + "num_tokens": 2877528.0, + "repeat_count": 2.0, + "routers_loss": 0.04880943149328232, + "skip_count": 1.0, + "step": 1784, + "text_loss": 0.4404778480529785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0009609715468061411, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2880627.0, + "repeat_count": 0.0, + "routers_loss": 0.004678630735725164, + "skip_count": 0.0, + "step": 1786, + "text_loss": 0.7295402884483337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009608515748955535, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2883333.0, + "repeat_count": 0.0, + "routers_loss": 0.0026695074047893286, + "skip_count": 0.0, + "step": 1788, + "text_loss": 0.9697831273078918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 8.40387437628412, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.107421875, + "learning_rate": 0.000960731426385773, + "loss": 0.0157, + "macro_f1": 0.4871794879436493, + "num_tokens": 2887444.0, + "repeat_count": 0.0, + "routers_loss": 0.029743613675236702, + "skip_count": 2.0, + "step": 1790, + "text_loss": 0.4737568199634552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.0009606111013228407, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 2890221.0, + "repeat_count": 0.0, + "routers_loss": 0.0016153788892552257, + "skip_count": 0.0, + "step": 1792, + "text_loss": 0.6693558096885681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.422659230995011, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009604905997528655, + "loss": 0.02, + "macro_f1": 0.3272727429866791, + "num_tokens": 2893262.0, + "repeat_count": 0.0, + "routers_loss": 0.01965433731675148, + "skip_count": 1.0, + "step": 1794, + "text_loss": 0.45227760076522827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.432051658350455, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009603699217220239, + "loss": 0.0117, + "macro_f1": 0.6601307392120361, + "num_tokens": 2896823.0, + "repeat_count": 1.0, + "routers_loss": 0.024017298594117165, + "skip_count": 2.0, + "step": 1796, + "text_loss": 0.48865509033203125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009602490672765597, + "loss": 0.0182, + "macro_f1": 0.3333333432674408, + "num_tokens": 2899707.0, + "repeat_count": 0.0, + "routers_loss": 0.0012420224957168102, + "skip_count": 0.0, + "step": 1798, + "text_loss": 0.43292415142059326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07861328125, + "learning_rate": 0.0009601280364627848, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 2902795.0, + "repeat_count": 0.0, + "routers_loss": 0.0020389219280332327, + "skip_count": 0.0, + "step": 1800, + "text_loss": 0.41021591424942017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009600068293270783, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 2905769.0, + "repeat_count": 0.0, + "routers_loss": 0.002006303984671831, + "skip_count": 0.0, + "step": 1802, + "text_loss": 0.46892106533050537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08740234375, + "learning_rate": 0.000959885445915887, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 2909475.0, + "repeat_count": 0.0, + "routers_loss": 0.003734810510650277, + "skip_count": 0.0, + "step": 1804, + "text_loss": 0.45364710688591003 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 8.479013795127678, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009597638862757254, + "loss": 0.0182, + "macro_f1": 0.8823530077934265, + "num_tokens": 2914348.0, + "repeat_count": 1.0, + "routers_loss": 0.038971323519945145, + "skip_count": 2.0, + "step": 1806, + "text_loss": 0.42913779616355896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.488406222483123, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009596421504531751, + "loss": 0.0249, + "macro_f1": 0.3272727429866791, + "num_tokens": 2917467.0, + "repeat_count": 1.0, + "routers_loss": 0.04800829663872719, + "skip_count": 0.0, + "step": 1808, + "text_loss": 0.17332297563552856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009595202384948858, + "loss": 0.0227, + "macro_f1": 0.6666666865348816, + "num_tokens": 2920223.0, + "repeat_count": 1.0, + "routers_loss": 0.009164143353700638, + "skip_count": 0.0, + "step": 1810, + "text_loss": 0.33740702271461487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009593981504475742, + "loss": 0.0275, + "macro_f1": 0.6666666865348816, + "num_tokens": 2923780.0, + "repeat_count": 0.0, + "routers_loss": 0.011236993595957756, + "skip_count": 2.0, + "step": 1812, + "text_loss": 0.1609916388988495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009592758863580248, + "loss": 0.0259, + "macro_f1": 0.5492662787437439, + "num_tokens": 2926259.0, + "repeat_count": 0.0, + "routers_loss": 0.019026532769203186, + "skip_count": 2.0, + "step": 1814, + "text_loss": 0.6460903882980347 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.525975931904902, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009591534462730894, + "loss": 0.0206, + "macro_f1": 0.5492662787437439, + "num_tokens": 2929173.0, + "repeat_count": 2.0, + "routers_loss": 0.0608333982527256, + "skip_count": 0.0, + "step": 1816, + "text_loss": 0.476126492023468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.000959030830239687, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 2932703.0, + "repeat_count": 0.0, + "routers_loss": 0.0093300249427557, + "skip_count": 0.0, + "step": 1818, + "text_loss": 0.5471875667572021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2001953125, + "learning_rate": 0.0009589080383048048, + "loss": 0.0235, + "macro_f1": 0.3333333432674408, + "num_tokens": 2936195.0, + "repeat_count": 0.0, + "routers_loss": 0.010434109717607498, + "skip_count": 0.0, + "step": 1820, + "text_loss": 0.5068115592002869 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009587850705154964, + "loss": 0.0291, + "macro_f1": 0.3333333432674408, + "num_tokens": 2939412.0, + "repeat_count": 0.0, + "routers_loss": 0.004347751382738352, + "skip_count": 0.0, + "step": 1822, + "text_loss": 0.4241984784603119 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.56354564132668, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0859375, + "learning_rate": 0.0009586619269188836, + "loss": 0.0224, + "macro_f1": 0.32098767161369324, + "num_tokens": 2942318.0, + "repeat_count": 0.0, + "routers_loss": 0.034238871186971664, + "skip_count": 1.0, + "step": 1824, + "text_loss": 0.2328975349664688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009585386075621553, + "loss": 0.027, + "macro_f1": 0.3333333432674408, + "num_tokens": 2945731.0, + "repeat_count": 0.0, + "routers_loss": 0.006097695790231228, + "skip_count": 0.0, + "step": 1826, + "text_loss": 0.22816994786262512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.582330496037569, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009584151124925676, + "loss": 0.0208, + "macro_f1": 0.3272727429866791, + "num_tokens": 2948944.0, + "repeat_count": 0.0, + "routers_loss": 0.007790776435285807, + "skip_count": 1.0, + "step": 1828, + "text_loss": 0.5009413361549377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009582914417574438, + "loss": 0.0145, + "macro_f1": 0.6666666865348816, + "num_tokens": 2951723.0, + "repeat_count": 0.0, + "routers_loss": 0.009144559502601624, + "skip_count": 2.0, + "step": 1830, + "text_loss": 0.1402502954006195 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0009581675954041751, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 2954726.0, + "repeat_count": 1.0, + "routers_loss": 0.006593191530555487, + "skip_count": 0.0, + "step": 1832, + "text_loss": 0.4871736466884613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009580435734802196, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 2957853.0, + "repeat_count": 0.0, + "routers_loss": 0.01241068821400404, + "skip_count": 0.0, + "step": 1834, + "text_loss": 0.30100154876708984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009579193760331027, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 2960783.0, + "repeat_count": 0.0, + "routers_loss": 0.002219218760728836, + "skip_count": 0.0, + "step": 1836, + "text_loss": 0.4961516559123993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.629292632814794, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009577950031104169, + "loss": 0.0166, + "macro_f1": 0.6601307392120361, + "num_tokens": 2963328.0, + "repeat_count": 1.0, + "routers_loss": 0.029363535344600677, + "skip_count": 2.0, + "step": 1838, + "text_loss": 0.42814353108406067 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 8.638685060170237, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009576704547598226, + "loss": 0.0257, + "macro_f1": 0.7795917987823486, + "num_tokens": 2966108.0, + "repeat_count": 1.0, + "routers_loss": 0.0579402856528759, + "skip_count": 4.0, + "step": 1840, + "text_loss": 0.20523512363433838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009575457310290463, + "loss": 0.0121, + "macro_f1": 0.3272727429866791, + "num_tokens": 2969137.0, + "repeat_count": 0.0, + "routers_loss": 0.008810589089989662, + "skip_count": 0.0, + "step": 1842, + "text_loss": 0.6199528574943542 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009574208319658831, + "loss": 0.0208, + "macro_f1": 0.6666666865348816, + "num_tokens": 2972407.0, + "repeat_count": 0.0, + "routers_loss": 0.0012295129708945751, + "skip_count": 1.0, + "step": 1844, + "text_loss": 0.66938316822052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 8.666862342236572, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1474609375, + "learning_rate": 0.000957295757618194, + "loss": 0.0152, + "macro_f1": 0.4871794879436493, + "num_tokens": 2976045.0, + "repeat_count": 0.0, + "routers_loss": 0.06162935495376587, + "skip_count": 2.0, + "step": 1846, + "text_loss": 0.5381782650947571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009571705080339079, + "loss": 0.0144, + "macro_f1": 0.3333333432674408, + "num_tokens": 2979025.0, + "repeat_count": 0.0, + "routers_loss": 0.003950524143874645, + "skip_count": 0.0, + "step": 1848, + "text_loss": 0.5831671357154846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.0009570450832610208, + "loss": 0.0209, + "macro_f1": 0.3333333432674408, + "num_tokens": 2982276.0, + "repeat_count": 0.0, + "routers_loss": 0.010354886762797832, + "skip_count": 0.0, + "step": 1850, + "text_loss": 0.27448201179504395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009569194833475956, + "loss": 0.0199, + "macro_f1": 0.3272727429866791, + "num_tokens": 2985691.0, + "repeat_count": 0.0, + "routers_loss": 0.010167439468204975, + "skip_count": 0.0, + "step": 1852, + "text_loss": 0.5264663696289062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.704432051658351, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1328125, + "learning_rate": 0.0009567937083417624, + "loss": 0.0194, + "macro_f1": 0.3272727429866791, + "num_tokens": 2989126.0, + "repeat_count": 0.0, + "routers_loss": 0.0371871180832386, + "skip_count": 1.0, + "step": 1854, + "text_loss": 0.2008018046617508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009566677582917185, + "loss": 0.0184, + "macro_f1": 0.3333333432674408, + "num_tokens": 2992814.0, + "repeat_count": 0.0, + "routers_loss": 0.010190588422119617, + "skip_count": 0.0, + "step": 1856, + "text_loss": 0.749717116355896 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.72321690636924, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009565416332457282, + "loss": 0.0132, + "macro_f1": 0.6538461446762085, + "num_tokens": 2995729.0, + "repeat_count": 1.0, + "routers_loss": 0.022285036742687225, + "skip_count": 1.0, + "step": 1858, + "text_loss": 0.5870219469070435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.732609333724685, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009564153332521228, + "loss": 0.0224, + "macro_f1": 0.3272727429866791, + "num_tokens": 2998812.0, + "repeat_count": 0.0, + "routers_loss": 0.011050296947360039, + "skip_count": 1.0, + "step": 1860, + "text_loss": 0.8444408774375916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0009562888583593005, + "loss": 0.0163, + "macro_f1": 0.3333333432674408, + "num_tokens": 3001799.0, + "repeat_count": 0.0, + "routers_loss": 0.007125461008399725, + "skip_count": 0.0, + "step": 1862, + "text_loss": 0.41510361433029175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009561622086157272, + "loss": 0.0236, + "macro_f1": 0.3333333432674408, + "num_tokens": 3005088.0, + "repeat_count": 0.0, + "routers_loss": 0.0049054501578211784, + "skip_count": 0.0, + "step": 1864, + "text_loss": 0.3801248073577881 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.000956035384069935, + "loss": 0.0238, + "macro_f1": 1.0, + "num_tokens": 3008178.0, + "repeat_count": 1.0, + "routers_loss": 0.005162427201867104, + "skip_count": 1.0, + "step": 1866, + "text_loss": 0.2687684893608093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0009559083847705233, + "loss": 0.0214, + "macro_f1": 0.3272727429866791, + "num_tokens": 3010923.0, + "repeat_count": 0.0, + "routers_loss": 0.028984658420085907, + "skip_count": 1.0, + "step": 1868, + "text_loss": 0.6277349591255188 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.779571470501908, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009557812107661584, + "loss": 0.0208, + "macro_f1": 1.0, + "num_tokens": 3015030.0, + "repeat_count": 1.0, + "routers_loss": 0.012200530618429184, + "skip_count": 1.0, + "step": 1870, + "text_loss": 0.6293368339538574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.788963897857352, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009556538621055739, + "loss": 0.0268, + "macro_f1": 0.3272727429866791, + "num_tokens": 3019067.0, + "repeat_count": 0.0, + "routers_loss": 0.06365182995796204, + "skip_count": 1.0, + "step": 1872, + "text_loss": 0.39046618342399597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009555263388375699, + "loss": 0.014, + "macro_f1": 0.6666666865348816, + "num_tokens": 3022166.0, + "repeat_count": 0.0, + "routers_loss": 0.0041703456081449986, + "skip_count": 1.0, + "step": 1874, + "text_loss": 0.42232340574264526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11572265625, + "learning_rate": 0.0009553986410110134, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3025865.0, + "repeat_count": 0.0, + "routers_loss": 0.005841755773872137, + "skip_count": 0.0, + "step": 1876, + "text_loss": 0.37600573897361755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.817141179923686, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009552707686748388, + "loss": 0.0219, + "macro_f1": 0.3272727429866791, + "num_tokens": 3029950.0, + "repeat_count": 0.0, + "routers_loss": 0.05165952071547508, + "skip_count": 1.0, + "step": 1878, + "text_loss": 0.33717799186706543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009551427218780467, + "loss": 0.0219, + "macro_f1": 0.6666666865348816, + "num_tokens": 3033649.0, + "repeat_count": 0.0, + "routers_loss": 0.020680008456110954, + "skip_count": 2.0, + "step": 1880, + "text_loss": 0.5011783838272095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.835926034634575, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009550145006697048, + "loss": 0.0217, + "macro_f1": 0.32098764181137085, + "num_tokens": 3036847.0, + "repeat_count": 0.0, + "routers_loss": 0.07626450061798096, + "skip_count": 2.0, + "step": 1882, + "text_loss": 0.3066408336162567 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009548861050989482, + "loss": 0.0136, + "macro_f1": 1.0, + "num_tokens": 3040353.0, + "repeat_count": 1.0, + "routers_loss": 0.010884666815400124, + "skip_count": 1.0, + "step": 1884, + "text_loss": 0.49779415130615234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009547575352149778, + "loss": 0.0213, + "macro_f1": 0.6666666865348816, + "num_tokens": 3043504.0, + "repeat_count": 0.0, + "routers_loss": 0.006704333238303661, + "skip_count": 2.0, + "step": 1886, + "text_loss": 0.12284614145755768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.86410331670091, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009546287910670621, + "loss": 0.0211, + "macro_f1": 0.5427350401878357, + "num_tokens": 3046422.0, + "repeat_count": 1.0, + "routers_loss": 0.04799000173807144, + "skip_count": 2.0, + "step": 1888, + "text_loss": 0.1824081838130951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009544998727045361, + "loss": 0.0306, + "macro_f1": 0.3333333432674408, + "num_tokens": 3049819.0, + "repeat_count": 0.0, + "routers_loss": 0.008139612153172493, + "skip_count": 0.0, + "step": 1890, + "text_loss": 0.18929053843021393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.8828881714118, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.09375, + "learning_rate": 0.0009543707801768015, + "loss": 0.0175, + "macro_f1": 0.5934640765190125, + "num_tokens": 3052766.0, + "repeat_count": 0.0, + "routers_loss": 0.02966771461069584, + "skip_count": 3.0, + "step": 1892, + "text_loss": 0.247748002409935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 25.0, + "epoch": 8.892280598767243, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009542415135333267, + "loss": 0.0193, + "macro_f1": 0.44705885648727417, + "num_tokens": 3056427.0, + "repeat_count": 0.0, + "routers_loss": 0.03637036308646202, + "skip_count": 2.0, + "step": 1894, + "text_loss": 0.2583999037742615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009541120728236472, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3059497.0, + "repeat_count": 0.0, + "routers_loss": 0.007026574574410915, + "skip_count": 0.0, + "step": 1896, + "text_loss": 0.5222375988960266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.076171875, + "learning_rate": 0.0009539824580973646, + "loss": 0.0219, + "macro_f1": 0.3333333432674408, + "num_tokens": 3062187.0, + "repeat_count": 0.0, + "routers_loss": 0.003449335927143693, + "skip_count": 0.0, + "step": 1898, + "text_loss": 0.5736427307128906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009538526694041477, + "loss": 0.0163, + "macro_f1": 0.3333333432674408, + "num_tokens": 3066100.0, + "repeat_count": 0.0, + "routers_loss": 0.0035463871899992228, + "skip_count": 0.0, + "step": 1900, + "text_loss": 0.5471583604812622 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009537227067937318, + "loss": 0.0233, + "macro_f1": 1.0, + "num_tokens": 3068737.0, + "repeat_count": 3.0, + "routers_loss": 0.00597514258697629, + "skip_count": 3.0, + "step": 1902, + "text_loss": 0.36644190549850464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.939242735544468, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.166015625, + "learning_rate": 0.0009535925703159186, + "loss": 0.0301, + "macro_f1": 0.32098764181137085, + "num_tokens": 3071686.0, + "repeat_count": 0.0, + "routers_loss": 0.025420479476451874, + "skip_count": 2.0, + "step": 1904, + "text_loss": 0.535789966583252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009534622600205769, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 3074954.0, + "repeat_count": 0.0, + "routers_loss": 0.014377486892044544, + "skip_count": 0.0, + "step": 1906, + "text_loss": 0.19009549915790558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009533317759576416, + "loss": 0.0197, + "macro_f1": 0.3333333432674408, + "num_tokens": 3077540.0, + "repeat_count": 0.0, + "routers_loss": 0.004848944488912821, + "skip_count": 0.0, + "step": 1908, + "text_loss": 0.5022001266479492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009532011181771148, + "loss": 0.0217, + "macro_f1": 0.6666666865348816, + "num_tokens": 3080445.0, + "repeat_count": 0.0, + "routers_loss": 0.009480170905590057, + "skip_count": 2.0, + "step": 1910, + "text_loss": 0.35135936737060547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0009530702867290644, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 3083657.0, + "repeat_count": 0.0, + "routers_loss": 0.0019353039097040892, + "skip_count": 0.0, + "step": 1912, + "text_loss": 0.5123994946479797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.986204872321691, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009529392816636256, + "loss": 0.0249, + "macro_f1": 0.3333333432674408, + "num_tokens": 3086837.0, + "repeat_count": 0.0, + "routers_loss": 0.0010921972570940852, + "skip_count": 0.0, + "step": 1914, + "text_loss": 0.44477662444114685 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19140625, + "learning_rate": 0.0009528081030309995, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 3089892.0, + "repeat_count": 0.0, + "routers_loss": 0.0018027103506028652, + "skip_count": 0.0, + "step": 1916, + "text_loss": 0.7356183528900146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009526767508814542, + "loss": 0.0236, + "macro_f1": 0.3333333432674408, + "num_tokens": 3093058.0, + "repeat_count": 0.0, + "routers_loss": 0.003243023296818137, + "skip_count": 0.0, + "step": 1918, + "text_loss": 0.48823556303977966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009525452252653239, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 3096404.0, + "repeat_count": 0.0, + "routers_loss": 0.009360014460980892, + "skip_count": 0.0, + "step": 1920, + "text_loss": 0.21498437225818634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.023481068388612, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.140625, + "learning_rate": 0.0009524135262330098, + "loss": 0.0224, + "macro_f1": 0.9265305995941162, + "num_tokens": 3099520.0, + "repeat_count": 1.0, + "routers_loss": 0.017444295808672905, + "skip_count": 3.0, + "step": 1922, + "text_loss": 0.27608850598335266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.032873495744056, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009522816538349789, + "loss": 0.0162, + "macro_f1": 0.5492662787437439, + "num_tokens": 3102956.0, + "repeat_count": 0.0, + "routers_loss": 0.06424452364444733, + "skip_count": 2.0, + "step": 1924, + "text_loss": 0.21558666229248047 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009521496081217651, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 3106565.0, + "repeat_count": 1.0, + "routers_loss": 0.002270506462082267, + "skip_count": 0.0, + "step": 1926, + "text_loss": 0.5641813278198242 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009520173891439684, + "loss": 0.0216, + "macro_f1": 0.6666666865348816, + "num_tokens": 3109314.0, + "repeat_count": 0.0, + "routers_loss": 0.011512448079884052, + "skip_count": 1.0, + "step": 1928, + "text_loss": 0.6351624727249146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009518849969522556, + "loss": 0.0198, + "macro_f1": 0.3333333432674408, + "num_tokens": 3112956.0, + "repeat_count": 0.0, + "routers_loss": 0.003883908037096262, + "skip_count": 0.0, + "step": 1930, + "text_loss": 0.35160085558891296 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009517524315973595, + "loss": 0.019, + "macro_f1": 1.0, + "num_tokens": 3115593.0, + "repeat_count": 1.0, + "routers_loss": 0.009479222819209099, + "skip_count": 3.0, + "step": 1932, + "text_loss": 0.2900560200214386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0009516196931300794, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3118516.0, + "repeat_count": 0.0, + "routers_loss": 0.017834696918725967, + "skip_count": 2.0, + "step": 1934, + "text_loss": 0.20094378292560577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009514867816012809, + "loss": 0.02, + "macro_f1": 0.3333333432674408, + "num_tokens": 3122242.0, + "repeat_count": 0.0, + "routers_loss": 0.0017964740982279181, + "skip_count": 0.0, + "step": 1936, + "text_loss": 0.6498590707778931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0009513536970618961, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 3125645.0, + "repeat_count": 0.0, + "routers_loss": 0.007437168620526791, + "skip_count": 2.0, + "step": 1938, + "text_loss": 0.25863033533096313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009512204395629232, + "loss": 0.0184, + "macro_f1": 0.6666666865348816, + "num_tokens": 3128740.0, + "repeat_count": 0.0, + "routers_loss": 0.0008759932243265212, + "skip_count": 1.0, + "step": 1940, + "text_loss": 0.5638351440429688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.117405341943059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009510870091554264, + "loss": 0.0153, + "macro_f1": 0.3272727429866791, + "num_tokens": 3131742.0, + "repeat_count": 1.0, + "routers_loss": 0.019906625151634216, + "skip_count": 0.0, + "step": 1942, + "text_loss": 0.8410717844963074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009509534058905369, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3134407.0, + "repeat_count": 0.0, + "routers_loss": 0.0009229081333614886, + "skip_count": 0.0, + "step": 1944, + "text_loss": 0.47506049275398254 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009508196298194517, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3137053.0, + "repeat_count": 0.0, + "routers_loss": 0.003630586201325059, + "skip_count": 0.0, + "step": 1946, + "text_loss": 0.32225799560546875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009506856809934338, + "loss": 0.0119, + "macro_f1": 0.3333333432674408, + "num_tokens": 3140943.0, + "repeat_count": 0.0, + "routers_loss": 0.007580445148050785, + "skip_count": 0.0, + "step": 1948, + "text_loss": 0.3120577931404114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009505515594638127, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3144298.0, + "repeat_count": 0.0, + "routers_loss": 0.004471861757338047, + "skip_count": 0.0, + "step": 1950, + "text_loss": 0.22052447497844696 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 9.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.09130859375, + "learning_rate": 0.0009504172652819843, + "loss": 0.023, + "macro_f1": 1.0, + "num_tokens": 3147069.0, + "repeat_count": 1.0, + "routers_loss": 0.009606664068996906, + "skip_count": 1.0, + "step": 1952, + "text_loss": 0.34773921966552734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009502827984994099, + "loss": 0.0148, + "macro_f1": 0.6666666865348816, + "num_tokens": 3149992.0, + "repeat_count": 0.0, + "routers_loss": 0.006443799939006567, + "skip_count": 1.0, + "step": 1954, + "text_loss": 0.6442171335220337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009501481591676177, + "loss": 0.0188, + "macro_f1": 0.3333333432674408, + "num_tokens": 3153167.0, + "repeat_count": 0.0, + "routers_loss": 0.003219039412215352, + "skip_count": 0.0, + "step": 1956, + "text_loss": 0.43369221687316895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.192544760786616, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.000950013347338202, + "loss": 0.0152, + "macro_f1": 0.3272727429866791, + "num_tokens": 3156590.0, + "repeat_count": 0.0, + "routers_loss": 0.025551019236445427, + "skip_count": 1.0, + "step": 1958, + "text_loss": 0.294479101896286 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009498783630628225, + "loss": 0.0158, + "macro_f1": 1.0, + "num_tokens": 3159451.0, + "repeat_count": 1.0, + "routers_loss": 0.013802438974380493, + "skip_count": 2.0, + "step": 1960, + "text_loss": 0.20888492465019226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.211329615497505, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009497432063932057, + "loss": 0.0137, + "macro_f1": 0.6601307392120361, + "num_tokens": 3162889.0, + "repeat_count": 1.0, + "routers_loss": 0.02852988988161087, + "skip_count": 2.0, + "step": 1962, + "text_loss": 0.5027125477790833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009496078773811437, + "loss": 0.0136, + "macro_f1": 0.6666666865348816, + "num_tokens": 3165979.0, + "repeat_count": 0.0, + "routers_loss": 0.01784522272646427, + "skip_count": 2.0, + "step": 1964, + "text_loss": 0.1696339100599289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.000949472376078495, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3168683.0, + "repeat_count": 0.0, + "routers_loss": 0.0017019887454807758, + "skip_count": 0.0, + "step": 1966, + "text_loss": 0.48905447125434875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.000949336702537184, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 3171968.0, + "repeat_count": 0.0, + "routers_loss": 0.004817947279661894, + "skip_count": 2.0, + "step": 1968, + "text_loss": 0.20984773337841034 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009492008568092007, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 3175947.0, + "repeat_count": 0.0, + "routers_loss": 0.0012963006738573313, + "skip_count": 0.0, + "step": 1970, + "text_loss": 0.5215106010437012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 9.258291752274728, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.044921875, + "learning_rate": 0.0009490648389466019, + "loss": 0.0135, + "macro_f1": 0.4871794879436493, + "num_tokens": 3179348.0, + "repeat_count": 0.0, + "routers_loss": 0.03950481489300728, + "skip_count": 2.0, + "step": 1972, + "text_loss": 0.24640929698944092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09326171875, + "learning_rate": 0.0009489286490015097, + "loss": 0.0183, + "macro_f1": 0.6666666865348816, + "num_tokens": 3182640.0, + "repeat_count": 0.0, + "routers_loss": 0.0043345349840819836, + "skip_count": 2.0, + "step": 1974, + "text_loss": 0.6362852454185486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009487922870261122, + "loss": 0.0155, + "macro_f1": 0.3333333432674408, + "num_tokens": 3185657.0, + "repeat_count": 0.0, + "routers_loss": 0.0015687479171901941, + "skip_count": 0.0, + "step": 1976, + "text_loss": 0.8977144360542297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009486557530726638, + "loss": 0.0139, + "macro_f1": 0.3333333432674408, + "num_tokens": 3188772.0, + "repeat_count": 0.0, + "routers_loss": 0.0010977238416671753, + "skip_count": 0.0, + "step": 1978, + "text_loss": 0.38512736558914185 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0009485190471934844, + "loss": 0.0196, + "macro_f1": 0.6666666865348816, + "num_tokens": 3193131.0, + "repeat_count": 2.0, + "routers_loss": 0.002264744369313121, + "skip_count": 0.0, + "step": 1980, + "text_loss": 0.4171289801597595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.305253889051952, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.00094838216944096, + "loss": 0.0219, + "macro_f1": 0.3272727429866791, + "num_tokens": 3196668.0, + "repeat_count": 0.0, + "routers_loss": 0.042320676147937775, + "skip_count": 1.0, + "step": 1982, + "text_loss": 0.19008000195026398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 9.314646316407396, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009482451198675424, + "loss": 0.0151, + "macro_f1": 0.32098767161369324, + "num_tokens": 3200282.0, + "repeat_count": 0.0, + "routers_loss": 0.01796630397439003, + "skip_count": 1.0, + "step": 1984, + "text_loss": 0.5009249448776245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0009481078985257494, + "loss": 0.0147, + "macro_f1": 0.6666666865348816, + "num_tokens": 3204439.0, + "repeat_count": 0.0, + "routers_loss": 0.01052347756922245, + "skip_count": 1.0, + "step": 1986, + "text_loss": 0.15319275856018066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.333431171118287, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009479705054681644, + "loss": 0.015, + "macro_f1": 0.3076923191547394, + "num_tokens": 3207590.0, + "repeat_count": 1.0, + "routers_loss": 0.09640293568372726, + "skip_count": 3.0, + "step": 1988, + "text_loss": 0.3654652535915375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.34282359847373, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009478329407474366, + "loss": 0.0183, + "macro_f1": 0.5492662787437439, + "num_tokens": 3211172.0, + "repeat_count": 0.0, + "routers_loss": 0.012670112773776054, + "skip_count": 1.0, + "step": 1990, + "text_loss": 0.5817596316337585 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.000947695204416281, + "loss": 0.0121, + "macro_f1": 0.6666666865348816, + "num_tokens": 3214050.0, + "repeat_count": 1.0, + "routers_loss": 0.005263707600533962, + "skip_count": 0.0, + "step": 1992, + "text_loss": 0.5985888242721558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.361608453184619, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009475572965274787, + "loss": 0.0144, + "macro_f1": 0.3272727429866791, + "num_tokens": 3217318.0, + "repeat_count": 1.0, + "routers_loss": 0.0682850033044815, + "skip_count": 0.0, + "step": 1994, + "text_loss": 0.316506564617157 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.000947419217133876, + "loss": 0.019, + "macro_f1": 0.6666666865348816, + "num_tokens": 3220012.0, + "repeat_count": 0.0, + "routers_loss": 0.008508823812007904, + "skip_count": 2.0, + "step": 1996, + "text_loss": 0.09665893763303757 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0009472809662883852, + "loss": 0.0155, + "macro_f1": 1.0, + "num_tokens": 3223019.0, + "repeat_count": 1.0, + "routers_loss": 0.01100847590714693, + "skip_count": 2.0, + "step": 1998, + "text_loss": 0.4938808083534241 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.389785735250953, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009471425440439844, + "loss": 0.0135, + "macro_f1": 0.8817967176437378, + "num_tokens": 3226013.0, + "repeat_count": 2.0, + "routers_loss": 0.04953207075595856, + "skip_count": 3.0, + "step": 2000, + "text_loss": 0.22258254885673523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 9.399178162606399, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009470039504537173, + "loss": 0.0186, + "macro_f1": 0.31446540355682373, + "num_tokens": 3230031.0, + "repeat_count": 0.0, + "routers_loss": 0.052884332835674286, + "skip_count": 2.0, + "step": 2002, + "text_loss": 0.1741616576910019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009468651855706931, + "loss": 0.0204, + "macro_f1": 0.6666666865348816, + "num_tokens": 3232991.0, + "repeat_count": 1.0, + "routers_loss": 0.008056716993451118, + "skip_count": 0.0, + "step": 2004, + "text_loss": 0.3173636198043823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009467262494480868, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3236390.0, + "repeat_count": 0.0, + "routers_loss": 0.0053409393876791, + "skip_count": 0.0, + "step": 2006, + "text_loss": 0.5806330442428589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.000946587142139139, + "loss": 0.0147, + "macro_f1": 0.3333333432674408, + "num_tokens": 3239267.0, + "repeat_count": 0.0, + "routers_loss": 0.0015652200672775507, + "skip_count": 0.0, + "step": 2008, + "text_loss": 0.6214317679405212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.436747872028178, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.000946447863697156, + "loss": 0.0151, + "macro_f1": 0.6601307392120361, + "num_tokens": 3242569.0, + "repeat_count": 1.0, + "routers_loss": 0.011673987843096256, + "skip_count": 2.0, + "step": 2010, + "text_loss": 0.532565712928772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0009463084141755093, + "loss": 0.0159, + "macro_f1": 0.3272727429866791, + "num_tokens": 3245669.0, + "repeat_count": 0.0, + "routers_loss": 0.028480790555477142, + "skip_count": 1.0, + "step": 2012, + "text_loss": 0.25210800766944885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009461687936276364, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3248751.0, + "repeat_count": 0.0, + "routers_loss": 0.007234727032482624, + "skip_count": 0.0, + "step": 2014, + "text_loss": 0.35922971367836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009460290021070402, + "loss": 0.0195, + "macro_f1": 0.6666666865348816, + "num_tokens": 3252614.0, + "repeat_count": 1.0, + "routers_loss": 0.014691276475787163, + "skip_count": 0.0, + "step": 2016, + "text_loss": 0.2747853398323059 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009458890396672888, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 3256374.0, + "repeat_count": 0.0, + "routers_loss": 0.002385235857218504, + "skip_count": 0.0, + "step": 2018, + "text_loss": 0.5268719792366028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 9.483710008805401, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009457489063620164, + "loss": 0.0133, + "macro_f1": 0.8823530077934265, + "num_tokens": 3259792.0, + "repeat_count": 1.0, + "routers_loss": 0.047268565744161606, + "skip_count": 2.0, + "step": 2020, + "text_loss": 0.7785539627075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.493102436160845, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009456086022449221, + "loss": 0.0218, + "macro_f1": 0.3272727429866791, + "num_tokens": 3262833.0, + "repeat_count": 0.0, + "routers_loss": 0.015878718346357346, + "skip_count": 1.0, + "step": 2022, + "text_loss": 0.42270028591156006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009454681273697711, + "loss": 0.0117, + "macro_f1": 0.3272727429866791, + "num_tokens": 3265718.0, + "repeat_count": 1.0, + "routers_loss": 0.030749641358852386, + "skip_count": 0.0, + "step": 2024, + "text_loss": 0.18668225407600403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0009453274817903931, + "loss": 0.012, + "macro_f1": 0.6666666865348816, + "num_tokens": 3268158.0, + "repeat_count": 0.0, + "routers_loss": 0.011538166552782059, + "skip_count": 1.0, + "step": 2026, + "text_loss": 0.34090787172317505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.000945186665560684, + "loss": 0.0218, + "macro_f1": 0.3333333432674408, + "num_tokens": 3271082.0, + "repeat_count": 0.0, + "routers_loss": 0.009527760557830334, + "skip_count": 0.0, + "step": 2028, + "text_loss": 0.2110334187746048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.530672145582624, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.000945045678734605, + "loss": 0.0175, + "macro_f1": 0.3144654333591461, + "num_tokens": 3273488.0, + "repeat_count": 0.0, + "routers_loss": 0.03317151218652725, + "skip_count": 3.0, + "step": 2030, + "text_loss": 0.2233227640390396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.540064572938068, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009449045213661822, + "loss": 0.0201, + "macro_f1": 0.3272727429866791, + "num_tokens": 3276646.0, + "repeat_count": 0.0, + "routers_loss": 0.018510591238737106, + "skip_count": 1.0, + "step": 2032, + "text_loss": 0.16100332140922546 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 9.549457000293513, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.1318359375, + "learning_rate": 0.0009447631935095077, + "loss": 0.0185, + "macro_f1": 0.9452888369560242, + "num_tokens": 3279441.0, + "repeat_count": 1.0, + "routers_loss": 0.028113311156630516, + "skip_count": 4.0, + "step": 2034, + "text_loss": 0.29208317399024963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009446216952187384, + "loss": 0.0164, + "macro_f1": 0.3333333432674408, + "num_tokens": 3282697.0, + "repeat_count": 0.0, + "routers_loss": 0.008379172533750534, + "skip_count": 0.0, + "step": 2036, + "text_loss": 0.16026398539543152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009444800265480967, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 3285574.0, + "repeat_count": 0.0, + "routers_loss": 0.00941354501992464, + "skip_count": 0.0, + "step": 2038, + "text_loss": 0.29523080587387085 + }, + { + "acc_repeat": 0.75, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.577634282359847, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.8571428656578064, + "f1_skip": 0.800000011920929, + "grad_norm": 0.076171875, + "learning_rate": 0.0009443381875518703, + "loss": 0.0197, + "macro_f1": 0.8600732684135437, + "num_tokens": 3289159.0, + "repeat_count": 4.0, + "routers_loss": 0.04974055662751198, + "skip_count": 6.0, + "step": 2040, + "text_loss": 0.23033179342746735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.587026709715293, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0009441961782844123, + "loss": 0.0146, + "macro_f1": 0.3272727429866791, + "num_tokens": 3293598.0, + "repeat_count": 0.0, + "routers_loss": 0.022241825237870216, + "skip_count": 1.0, + "step": 2042, + "text_loss": 0.8299165368080139 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009440539988001408, + "loss": 0.0159, + "macro_f1": 0.3333333432674408, + "num_tokens": 3296648.0, + "repeat_count": 0.0, + "routers_loss": 0.011019332334399223, + "skip_count": 0.0, + "step": 2044, + "text_loss": 0.18207129836082458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0009439116491535394, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 3300058.0, + "repeat_count": 0.0, + "routers_loss": 0.002889640862122178, + "skip_count": 0.0, + "step": 2046, + "text_loss": 0.7051978707313538 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 9.615203991781627, + "f1_execute": 0.9333333373069763, + "f1_repeat": 0.5, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.078125, + "learning_rate": 0.0009437691293991563, + "loss": 0.0192, + "macro_f1": 0.7634921073913574, + "num_tokens": 3303296.0, + "repeat_count": 3.0, + "routers_loss": 0.07741832733154297, + "skip_count": 4.0, + "step": 2048, + "text_loss": 0.15563532710075378 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.0009436264395916061, + "loss": 0.0209, + "macro_f1": 0.6666666865348816, + "num_tokens": 3306204.0, + "repeat_count": 0.0, + "routers_loss": 0.014225383289158344, + "skip_count": 2.0, + "step": 2050, + "text_loss": 0.18117287755012512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009434835797855672, + "loss": 0.0165, + "macro_f1": 0.3333333432674408, + "num_tokens": 3309444.0, + "repeat_count": 0.0, + "routers_loss": 0.0023932650219649076, + "skip_count": 0.0, + "step": 2052, + "text_loss": 0.4645874798297882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.643381273847961, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009433405500357839, + "loss": 0.0153, + "macro_f1": 0.3272727429866791, + "num_tokens": 3312488.0, + "repeat_count": 0.0, + "routers_loss": 0.03193361684679985, + "skip_count": 1.0, + "step": 2054, + "text_loss": 0.5291082859039307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009431973503970655, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 3315765.0, + "repeat_count": 0.0, + "routers_loss": 0.0020529816392809153, + "skip_count": 0.0, + "step": 2056, + "text_loss": 0.5877931118011475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.66216612855885, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009430539809242864, + "loss": 0.0185, + "macro_f1": 0.32098764181137085, + "num_tokens": 3318877.0, + "repeat_count": 2.0, + "routers_loss": 0.07907948642969131, + "skip_count": 0.0, + "step": 2058, + "text_loss": 0.3836737871170044 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009429104416723862, + "loss": 0.0163, + "macro_f1": 0.6666666865348816, + "num_tokens": 3322576.0, + "repeat_count": 2.0, + "routers_loss": 0.003006070153787732, + "skip_count": 0.0, + "step": 2060, + "text_loss": 0.3480920195579529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009427667326963689, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 3325974.0, + "repeat_count": 0.0, + "routers_loss": 0.005013179033994675, + "skip_count": 0.0, + "step": 2062, + "text_loss": 0.931358814239502 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009426228540513047, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 3329398.0, + "repeat_count": 0.0, + "routers_loss": 0.0059848143719136715, + "skip_count": 0.0, + "step": 2064, + "text_loss": 0.47568953037261963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009424788057923277, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3332029.0, + "repeat_count": 0.0, + "routers_loss": 0.00783882662653923, + "skip_count": 0.0, + "step": 2066, + "text_loss": 0.22887596487998962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.709128265336073, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009423345879746376, + "loss": 0.0128, + "macro_f1": 0.5492662787437439, + "num_tokens": 3334858.0, + "repeat_count": 0.0, + "routers_loss": 0.01866884157061577, + "skip_count": 2.0, + "step": 2068, + "text_loss": 0.17724967002868652 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.718520692691518, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.000942190200653499, + "loss": 0.0162, + "macro_f1": 0.32098764181137085, + "num_tokens": 3338094.0, + "repeat_count": 0.0, + "routers_loss": 0.028636593371629715, + "skip_count": 2.0, + "step": 2070, + "text_loss": 0.34344956278800964 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.727913120046962, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009420456438842413, + "loss": 0.0165, + "macro_f1": 0.5492662787437439, + "num_tokens": 3340526.0, + "repeat_count": 0.0, + "routers_loss": 0.023245645686984062, + "skip_count": 2.0, + "step": 2072, + "text_loss": 0.7276164293289185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.737305547402407, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11328125, + "learning_rate": 0.000941900917722259, + "loss": 0.0143, + "macro_f1": 0.3272727429866791, + "num_tokens": 3343303.0, + "repeat_count": 1.0, + "routers_loss": 0.01565689593553543, + "skip_count": 0.0, + "step": 2074, + "text_loss": 0.5665070414543152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009417560222230115, + "loss": 0.0245, + "macro_f1": 0.3333333432674408, + "num_tokens": 3346409.0, + "repeat_count": 0.0, + "routers_loss": 0.0035056080669164658, + "skip_count": 0.0, + "step": 2076, + "text_loss": 0.5112795233726501 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009416109574420229, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3349220.0, + "repeat_count": 0.0, + "routers_loss": 0.0027565446216613054, + "skip_count": 0.0, + "step": 2078, + "text_loss": 0.5240910053253174 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 9.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009414657234348823, + "loss": 0.0186, + "macro_f1": 1.0, + "num_tokens": 3352627.0, + "repeat_count": 3.0, + "routers_loss": 0.01652451977133751, + "skip_count": 2.0, + "step": 2080, + "text_loss": 1.0217112302780151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.774875256824185, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009413203202572438, + "loss": 0.0179, + "macro_f1": 0.32098764181137085, + "num_tokens": 3355392.0, + "repeat_count": 0.0, + "routers_loss": 0.1012420505285263, + "skip_count": 2.0, + "step": 2082, + "text_loss": 0.4085482358932495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08251953125, + "learning_rate": 0.000941174747964826, + "loss": 0.0154, + "macro_f1": 0.3333333432674408, + "num_tokens": 3358425.0, + "repeat_count": 0.0, + "routers_loss": 0.004962718114256859, + "skip_count": 0.0, + "step": 2084, + "text_loss": 0.5833504796028137 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.793660111535075, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.800000011920929, + "grad_norm": 0.11376953125, + "learning_rate": 0.0009410290066134124, + "loss": 0.0211, + "macro_f1": 0.8083333373069763, + "num_tokens": 3361925.0, + "repeat_count": 2.0, + "routers_loss": 0.07889176905155182, + "skip_count": 3.0, + "step": 2086, + "text_loss": 0.38126569986343384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.803052538890519, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009408830962588517, + "loss": 0.0195, + "macro_f1": 0.6601307392120361, + "num_tokens": 3365963.0, + "repeat_count": 1.0, + "routers_loss": 0.033715736120939255, + "skip_count": 2.0, + "step": 2088, + "text_loss": 0.23213914036750793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009407370169570567, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3369422.0, + "repeat_count": 0.0, + "routers_loss": 0.0014188943896442652, + "skip_count": 0.0, + "step": 2090, + "text_loss": 0.4648318886756897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.82183739360141, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009405907687640054, + "loss": 0.013, + "macro_f1": 0.3272727429866791, + "num_tokens": 3372506.0, + "repeat_count": 0.0, + "routers_loss": 0.015339684672653675, + "skip_count": 1.0, + "step": 2092, + "text_loss": 0.2563800811767578 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.831229820956853, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.0009404443517357404, + "loss": 0.0146, + "macro_f1": 0.542222261428833, + "num_tokens": 3375653.0, + "repeat_count": 4.0, + "routers_loss": 0.06562861055135727, + "skip_count": 0.0, + "step": 2094, + "text_loss": 0.797835111618042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.000940297765928369, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3379018.0, + "repeat_count": 0.0, + "routers_loss": 0.005745889153331518, + "skip_count": 0.0, + "step": 2096, + "text_loss": 0.4238114655017853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009401510113980631, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 3382855.0, + "repeat_count": 0.0, + "routers_loss": 0.0026634482201188803, + "skip_count": 0.0, + "step": 2098, + "text_loss": 0.4967166483402252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009400040882010592, + "loss": 0.0166, + "macro_f1": 0.3333333432674408, + "num_tokens": 3386386.0, + "repeat_count": 0.0, + "routers_loss": 0.0020642587915062904, + "skip_count": 0.0, + "step": 2100, + "text_loss": 0.44390562176704407 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0009398569963936589, + "loss": 0.017, + "macro_f1": 0.3272727429866791, + "num_tokens": 3389958.0, + "repeat_count": 0.0, + "routers_loss": 0.013722737319767475, + "skip_count": 1.0, + "step": 2102, + "text_loss": 0.7207565903663635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009397097360322276, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 3392892.0, + "repeat_count": 0.0, + "routers_loss": 0.002051608171314001, + "skip_count": 0.0, + "step": 2104, + "text_loss": 0.3196398913860321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.000939562307173196, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 3396636.0, + "repeat_count": 0.0, + "routers_loss": 0.007085663266479969, + "skip_count": 0.0, + "step": 2106, + "text_loss": 0.5663776397705078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.896976812444967, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11328125, + "learning_rate": 0.0009394147098730592, + "loss": 0.02, + "macro_f1": 0.5492662787437439, + "num_tokens": 3399475.0, + "repeat_count": 0.0, + "routers_loss": 0.019473131746053696, + "skip_count": 2.0, + "step": 2108, + "text_loss": 0.7708223462104797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0009392669441883767, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 3402350.0, + "repeat_count": 0.0, + "routers_loss": 0.0028328890912234783, + "skip_count": 0.0, + "step": 2110, + "text_loss": 0.5888006091117859 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009391190101757724, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3405561.0, + "repeat_count": 0.0, + "routers_loss": 0.023098422214388847, + "skip_count": 2.0, + "step": 2112, + "text_loss": 0.09865197539329529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.000938970907891935, + "loss": 0.0247, + "macro_f1": 0.3333333432674408, + "num_tokens": 3408513.0, + "repeat_count": 0.0, + "routers_loss": 0.002896632067859173, + "skip_count": 0.0, + "step": 2114, + "text_loss": 0.6613234281539917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009388226373936179, + "loss": 0.0211, + "macro_f1": 0.3333333432674408, + "num_tokens": 3411195.0, + "repeat_count": 0.0, + "routers_loss": 0.015814457088708878, + "skip_count": 0.0, + "step": 2116, + "text_loss": 0.17363053560256958 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.94393894922219, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009386741987376381, + "loss": 0.015, + "macro_f1": 0.6603773832321167, + "num_tokens": 3414875.0, + "repeat_count": 1.0, + "routers_loss": 0.02676783688366413, + "skip_count": 0.0, + "step": 2118, + "text_loss": 0.674056887626648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009385255919808778, + "loss": 0.0203, + "macro_f1": 0.6666666865348816, + "num_tokens": 3418410.0, + "repeat_count": 0.0, + "routers_loss": 0.01022857241332531, + "skip_count": 1.0, + "step": 2120, + "text_loss": 0.235092431306839 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.962723803933079, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009383768171802836, + "loss": 0.0244, + "macro_f1": 0.5492662787437439, + "num_tokens": 3421289.0, + "repeat_count": 0.0, + "routers_loss": 0.013572212308645248, + "skip_count": 2.0, + "step": 2122, + "text_loss": 0.5992844104766846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009382278743928659, + "loss": 0.0201, + "macro_f1": 0.6666666865348816, + "num_tokens": 3424781.0, + "repeat_count": 0.0, + "routers_loss": 0.0051873656921088696, + "skip_count": 2.0, + "step": 2124, + "text_loss": 0.29915499687194824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 9.981508658643968, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.07421875, + "learning_rate": 0.0009380787636757001, + "loss": 0.0155, + "macro_f1": 0.6122449040412903, + "num_tokens": 3427942.0, + "repeat_count": 0.0, + "routers_loss": 0.030079292133450508, + "skip_count": 4.0, + "step": 2126, + "text_loss": 0.24181491136550903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009379294850859256, + "loss": 0.0141, + "macro_f1": 0.3333333432674408, + "num_tokens": 3431314.0, + "repeat_count": 0.0, + "routers_loss": 0.002675612922757864, + "skip_count": 0.0, + "step": 2128, + "text_loss": 0.4669873118400574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009377800386807465, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 3435020.0, + "repeat_count": 0.0, + "routers_loss": 0.009334275498986244, + "skip_count": 0.0, + "step": 2130, + "text_loss": 0.6478219628334045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.134765625, + "learning_rate": 0.0009376304245174306, + "loss": 0.0137, + "macro_f1": 0.6000000238418579, + "num_tokens": 3438276.0, + "repeat_count": 1.0, + "routers_loss": 0.038227908313274384, + "skip_count": 2.0, + "step": 2132, + "text_loss": 0.4401201903820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0009374806426533104, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 3440938.0, + "repeat_count": 0.0, + "routers_loss": 0.006901399698108435, + "skip_count": 0.0, + "step": 2134, + "text_loss": 0.5948942303657532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009373306931457827, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3444028.0, + "repeat_count": 0.0, + "routers_loss": 0.0037061909679323435, + "skip_count": 0.0, + "step": 2136, + "text_loss": 0.5349751114845276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0009371805760523086, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 3448331.0, + "repeat_count": 0.0, + "routers_loss": 0.0025877030566334724, + "skip_count": 0.0, + "step": 2138, + "text_loss": 0.4591051936149597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.046962136777223, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009370302914304129, + "loss": 0.0144, + "macro_f1": 0.5934640765190125, + "num_tokens": 3451434.0, + "repeat_count": 0.0, + "routers_loss": 0.018742674961686134, + "skip_count": 3.0, + "step": 2140, + "text_loss": 0.23470863699913025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.056354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009368798393376851, + "loss": 0.0122, + "macro_f1": 0.3272727429866791, + "num_tokens": 3454375.0, + "repeat_count": 0.0, + "routers_loss": 0.02382594160735607, + "skip_count": 1.0, + "step": 2142, + "text_loss": 0.6077954769134521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.065746991488112, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05517578125, + "learning_rate": 0.0009367292198317787, + "loss": 0.0164, + "macro_f1": 0.5492662787437439, + "num_tokens": 3457591.0, + "repeat_count": 0.0, + "routers_loss": 0.03331060707569122, + "skip_count": 2.0, + "step": 2144, + "text_loss": 0.3691073954105377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009365784329704115, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 3460895.0, + "repeat_count": 0.0, + "routers_loss": 0.0016955457394942641, + "skip_count": 0.0, + "step": 2146, + "text_loss": 0.3947436511516571 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009364274788113651, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 3464101.0, + "repeat_count": 1.0, + "routers_loss": 0.006169239990413189, + "skip_count": 0.0, + "step": 2148, + "text_loss": 0.3348555266857147 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 10.093924273554446, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009362763574124858, + "loss": 0.019, + "macro_f1": 0.9265305995941162, + "num_tokens": 3467417.0, + "repeat_count": 3.0, + "routers_loss": 0.024033790454268456, + "skip_count": 1.0, + "step": 2150, + "text_loss": 0.496633380651474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0009361250688316829, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 3470917.0, + "repeat_count": 0.0, + "routers_loss": 0.0024986129719763994, + "skip_count": 0.0, + "step": 2152, + "text_loss": 0.6857671737670898 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0009359736131269312, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3473624.0, + "repeat_count": 0.0, + "routers_loss": 0.008183322846889496, + "skip_count": 1.0, + "step": 2154, + "text_loss": 0.13883116841316223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009358219903562684, + "loss": 0.0106, + "macro_f1": 0.6666666865348816, + "num_tokens": 3476472.0, + "repeat_count": 0.0, + "routers_loss": 0.011198793537914753, + "skip_count": 3.0, + "step": 2156, + "text_loss": 0.24243666231632233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009356702005777969, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 3479688.0, + "repeat_count": 0.0, + "routers_loss": 0.002520184963941574, + "skip_count": 0.0, + "step": 2158, + "text_loss": 0.6407818794250488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009355182438496825, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 3482598.0, + "repeat_count": 0.0, + "routers_loss": 0.0011065017897635698, + "skip_count": 0.0, + "step": 2160, + "text_loss": 0.7214245796203613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009353661202301557, + "loss": 0.0144, + "macro_f1": 0.3333333432674408, + "num_tokens": 3486271.0, + "repeat_count": 0.0, + "routers_loss": 0.0017824085662141442, + "skip_count": 0.0, + "step": 2162, + "text_loss": 0.5140969157218933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0009352138297775101, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 3489206.0, + "repeat_count": 0.0, + "routers_loss": 0.001542879967018962, + "skip_count": 0.0, + "step": 2164, + "text_loss": 0.7956416606903076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.000935061372550104, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 3492003.0, + "repeat_count": 0.0, + "routers_loss": 0.01420794241130352, + "skip_count": 3.0, + "step": 2166, + "text_loss": 0.27489882707595825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009349087486063594, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3494784.0, + "repeat_count": 0.0, + "routers_loss": 0.003614309709519148, + "skip_count": 1.0, + "step": 2168, + "text_loss": 0.2962227761745453 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.187848547108894, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009347559580047618, + "loss": 0.0175, + "macro_f1": 0.8814815282821655, + "num_tokens": 3497886.0, + "repeat_count": 2.0, + "routers_loss": 0.02122853323817253, + "skip_count": 4.0, + "step": 2170, + "text_loss": 0.5919580459594727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06396484375, + "learning_rate": 0.000934603000803861, + "loss": 0.0135, + "macro_f1": 0.5492662787437439, + "num_tokens": 3500939.0, + "repeat_count": 0.0, + "routers_loss": 0.02042219042778015, + "skip_count": 1.0, + "step": 2172, + "text_loss": 0.28722381591796875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009344498770622704, + "loss": 0.013, + "macro_f1": 0.3333333432674408, + "num_tokens": 3504852.0, + "repeat_count": 0.0, + "routers_loss": 0.004345106892287731, + "skip_count": 0.0, + "step": 2174, + "text_loss": 0.603236734867096 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009342965868386673, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 3508320.0, + "repeat_count": 0.0, + "routers_loss": 0.00368050136603415, + "skip_count": 0.0, + "step": 2176, + "text_loss": 0.6020491719245911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.000934143130191793, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 3511278.0, + "repeat_count": 0.0, + "routers_loss": 0.013425769284367561, + "skip_count": 0.0, + "step": 2178, + "text_loss": 0.5954724550247192 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060546875, + "learning_rate": 0.000933989507180452, + "loss": 0.0149, + "macro_f1": 0.3333333432674408, + "num_tokens": 3514361.0, + "repeat_count": 0.0, + "routers_loss": 0.002896249992772937, + "skip_count": 0.0, + "step": 2180, + "text_loss": 0.39175131916999817 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.244203111241562, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0009338357178635135, + "loss": 0.0147, + "macro_f1": 0.6603773832321167, + "num_tokens": 3517962.0, + "repeat_count": 1.0, + "routers_loss": 0.011538350023329258, + "skip_count": 1.0, + "step": 2182, + "text_loss": 0.4482830762863159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.253595538597006, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009336817622999093, + "loss": 0.011, + "macro_f1": 0.3272727429866791, + "num_tokens": 3521299.0, + "repeat_count": 1.0, + "routers_loss": 0.022787930443882942, + "skip_count": 0.0, + "step": 2184, + "text_loss": 0.35177817940711975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009335276405486357, + "loss": 0.0139, + "macro_f1": 0.3272727429866791, + "num_tokens": 3524611.0, + "repeat_count": 0.0, + "routers_loss": 0.011597735807299614, + "skip_count": 1.0, + "step": 2186, + "text_loss": 0.24868851900100708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009333733526687524, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 3528012.0, + "repeat_count": 0.0, + "routers_loss": 0.014253967441618443, + "skip_count": 0.0, + "step": 2188, + "text_loss": 0.3970910310745239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.000933218898719383, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 3530908.0, + "repeat_count": 0.0, + "routers_loss": 0.001659149187617004, + "skip_count": 0.0, + "step": 2190, + "text_loss": 0.7618573307991028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009330642787597141, + "loss": 0.0159, + "macro_f1": 0.3333333432674408, + "num_tokens": 3533993.0, + "repeat_count": 0.0, + "routers_loss": 0.005574346985667944, + "skip_count": 0.0, + "step": 2192, + "text_loss": 0.16470147669315338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009329094928489969, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3537310.0, + "repeat_count": 0.0, + "routers_loss": 0.0026400673668831587, + "skip_count": 0.0, + "step": 2194, + "text_loss": 0.3400416374206543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009327545410465452, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3540045.0, + "repeat_count": 0.0, + "routers_loss": 0.008448398672044277, + "skip_count": 3.0, + "step": 2196, + "text_loss": 0.3110542297363281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.31934253008512, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009325994234117372, + "loss": 0.0122, + "macro_f1": 0.32098764181137085, + "num_tokens": 3544097.0, + "repeat_count": 0.0, + "routers_loss": 0.037553198635578156, + "skip_count": 2.0, + "step": 2198, + "text_loss": 0.36126700043678284 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.09716796875, + "learning_rate": 0.000932444140004014, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3547054.0, + "repeat_count": 1.0, + "routers_loss": 0.006464479025453329, + "skip_count": 0.0, + "step": 2200, + "text_loss": 0.4947047233581543 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009322886908828805, + "loss": 0.0138, + "macro_f1": 0.6666666865348816, + "num_tokens": 3549903.0, + "repeat_count": 1.0, + "routers_loss": 0.005384812597185373, + "skip_count": 0.0, + "step": 2202, + "text_loss": 0.5923738479614258 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009321330761079052, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 3553745.0, + "repeat_count": 0.0, + "routers_loss": 0.015346619300544262, + "skip_count": 2.0, + "step": 2204, + "text_loss": 0.1904175877571106 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.356912239506897, + "f1_execute": 0.9268292784690857, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06494140625, + "learning_rate": 0.00093197729573872, + "loss": 0.0203, + "macro_f1": 0.8422764539718628, + "num_tokens": 3557235.0, + "repeat_count": 3.0, + "routers_loss": 0.1207597479224205, + "skip_count": 6.0, + "step": 2206, + "text_loss": 0.3904837667942047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0009318213498350202, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3560795.0, + "repeat_count": 0.0, + "routers_loss": 0.003334777895361185, + "skip_count": 0.0, + "step": 2208, + "text_loss": 0.4268290102481842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0009316652384565645, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3563754.0, + "repeat_count": 0.0, + "routers_loss": 0.004230072256177664, + "skip_count": 0.0, + "step": 2210, + "text_loss": 0.40049710869789124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046875, + "learning_rate": 0.0009315089616631751, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 3567173.0, + "repeat_count": 0.0, + "routers_loss": 0.0006645230459980667, + "skip_count": 0.0, + "step": 2212, + "text_loss": 0.42568323016166687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009313525195147376, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3570831.0, + "repeat_count": 0.0, + "routers_loss": 0.0097877848893404, + "skip_count": 0.0, + "step": 2214, + "text_loss": 0.45808279514312744 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 10.40387437628412, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.076171875, + "learning_rate": 0.000931195912071201, + "loss": 0.0187, + "macro_f1": 0.7018141150474548, + "num_tokens": 3573745.0, + "repeat_count": 2.0, + "routers_loss": 0.07351134717464447, + "skip_count": 3.0, + "step": 2216, + "text_loss": 0.285696804523468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009310391393925775, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 3576785.0, + "repeat_count": 0.0, + "routers_loss": 0.0033160944003611803, + "skip_count": 0.0, + "step": 2218, + "text_loss": 0.17516443133354187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.422659230995011, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.047119140625, + "learning_rate": 0.0009308822015389424, + "loss": 0.0241, + "macro_f1": 0.5427350401878357, + "num_tokens": 3580695.0, + "repeat_count": 1.0, + "routers_loss": 0.052930232137441635, + "skip_count": 1.0, + "step": 2220, + "text_loss": 0.5918155908584595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 10.432051658350455, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.072265625, + "learning_rate": 0.0009307250985704352, + "loss": 0.0128, + "macro_f1": 0.6122449040412903, + "num_tokens": 3583729.0, + "repeat_count": 0.0, + "routers_loss": 0.025454653427004814, + "skip_count": 4.0, + "step": 2222, + "text_loss": 0.2652169466018677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0009305678305472575, + "loss": 0.0158, + "macro_f1": 0.3333333432674408, + "num_tokens": 3586775.0, + "repeat_count": 0.0, + "routers_loss": 0.011279845610260963, + "skip_count": 0.0, + "step": 2224, + "text_loss": 0.3511691987514496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.000930410397529675, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 3589676.0, + "repeat_count": 0.0, + "routers_loss": 0.002700264798477292, + "skip_count": 0.0, + "step": 2226, + "text_loss": 0.24045433104038239 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.000930252799578016, + "loss": 0.0146, + "macro_f1": 1.0, + "num_tokens": 3593242.0, + "repeat_count": 1.0, + "routers_loss": 0.00826631672680378, + "skip_count": 2.0, + "step": 2228, + "text_loss": 0.3777645528316498 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.469621367772234, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009300950367526728, + "loss": 0.0131, + "macro_f1": 0.8820862174034119, + "num_tokens": 3596807.0, + "repeat_count": 2.0, + "routers_loss": 0.036221496760845184, + "skip_count": 2.0, + "step": 2230, + "text_loss": 0.502962589263916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009299371091141001, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3600150.0, + "repeat_count": 0.0, + "routers_loss": 0.006449893582612276, + "skip_count": 0.0, + "step": 2232, + "text_loss": 0.20256924629211426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009297790167228161, + "loss": 0.012, + "macro_f1": 0.6666666865348816, + "num_tokens": 3602988.0, + "repeat_count": 0.0, + "routers_loss": 0.007872486487030983, + "skip_count": 2.0, + "step": 2234, + "text_loss": 0.42476826906204224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.497798649838568, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009296207596394022, + "loss": 0.0101, + "macro_f1": 0.32098764181137085, + "num_tokens": 3606071.0, + "repeat_count": 0.0, + "routers_loss": 0.027397040277719498, + "skip_count": 2.0, + "step": 2236, + "text_loss": 0.23432791233062744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009294623379245028, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 3609389.0, + "repeat_count": 0.0, + "routers_loss": 0.01042645052075386, + "skip_count": 0.0, + "step": 2238, + "text_loss": 0.16665785014629364 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009293037516388252, + "loss": 0.0161, + "macro_f1": 0.3333333432674408, + "num_tokens": 3612105.0, + "repeat_count": 0.0, + "routers_loss": 0.0012458425480872393, + "skip_count": 0.0, + "step": 2240, + "text_loss": 0.59421306848526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 10.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009291450008431404, + "loss": 0.0185, + "macro_f1": 1.0, + "num_tokens": 3615439.0, + "repeat_count": 1.0, + "routers_loss": 0.005781981628388166, + "skip_count": 1.0, + "step": 2242, + "text_loss": 0.510798454284668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 10.535368359260346, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009289860855982814, + "loss": 0.0166, + "macro_f1": 0.4871794879436493, + "num_tokens": 3618842.0, + "repeat_count": 0.0, + "routers_loss": 0.031195320188999176, + "skip_count": 3.0, + "step": 2244, + "text_loss": 0.7574363350868225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0009288270059651454, + "loss": 0.0133, + "macro_f1": 0.3333333432674408, + "num_tokens": 3621823.0, + "repeat_count": 0.0, + "routers_loss": 0.001746491645462811, + "skip_count": 0.0, + "step": 2246, + "text_loss": 0.5125683546066284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.554153213971237, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.220703125, + "learning_rate": 0.0009286677620046918, + "loss": 0.0159, + "macro_f1": 0.5492662787437439, + "num_tokens": 3624502.0, + "repeat_count": 0.0, + "routers_loss": 0.03792348504066467, + "skip_count": 2.0, + "step": 2248, + "text_loss": 0.7533677220344543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009285083537779429, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3627057.0, + "repeat_count": 0.0, + "routers_loss": 0.0009684451506473124, + "skip_count": 0.0, + "step": 2250, + "text_loss": 0.2219279706478119 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.572938068682125, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009283487813459845, + "loss": 0.0148, + "macro_f1": 0.5492662787437439, + "num_tokens": 3629720.0, + "repeat_count": 0.0, + "routers_loss": 0.022757573053240776, + "skip_count": 2.0, + "step": 2252, + "text_loss": 0.6903313994407654 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009281890447699652, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 3633234.0, + "repeat_count": 1.0, + "routers_loss": 0.003613058477640152, + "skip_count": 0.0, + "step": 2254, + "text_loss": 0.6278893351554871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0009280291441110961, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3636289.0, + "repeat_count": 0.0, + "routers_loss": 0.006214062683284283, + "skip_count": 0.0, + "step": 2256, + "text_loss": 0.3011114001274109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.60111535074846, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.041015625, + "learning_rate": 0.0009278690794306517, + "loss": 0.014, + "macro_f1": 0.5492662787437439, + "num_tokens": 3640251.0, + "repeat_count": 0.0, + "routers_loss": 0.052556321024894714, + "skip_count": 2.0, + "step": 2258, + "text_loss": 0.19894185662269592 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 10.610507778103903, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.08251953125, + "learning_rate": 0.0009277088507899689, + "loss": 0.0163, + "macro_f1": 0.9452888369560242, + "num_tokens": 3643527.0, + "repeat_count": 4.0, + "routers_loss": 0.0572301521897316, + "skip_count": 1.0, + "step": 2260, + "text_loss": 0.5593410134315491 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009275484582504475, + "loss": 0.0104, + "macro_f1": 0.3333333432674408, + "num_tokens": 3646959.0, + "repeat_count": 0.0, + "routers_loss": 0.008010074496269226, + "skip_count": 0.0, + "step": 2262, + "text_loss": 0.2128177285194397 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 10.629292632814794, + "f1_execute": 0.95652174949646, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.800000011920929, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009273879018735505, + "loss": 0.0138, + "macro_f1": 0.8521739840507507, + "num_tokens": 3651298.0, + "repeat_count": 3.0, + "routers_loss": 0.035729870200157166, + "skip_count": 3.0, + "step": 2264, + "text_loss": 0.2987811267375946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009272271817208031, + "loss": 0.0182, + "macro_f1": 0.3333333432674408, + "num_tokens": 3655609.0, + "repeat_count": 0.0, + "routers_loss": 0.002379779238253832, + "skip_count": 0.0, + "step": 2266, + "text_loss": 0.6024088263511658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009270662978537939, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 3658444.0, + "repeat_count": 0.0, + "routers_loss": 0.008943650871515274, + "skip_count": 0.0, + "step": 2268, + "text_loss": 0.1741207242012024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 10.657469914881126, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009269052503341736, + "loss": 0.0161, + "macro_f1": 0.6595745086669922, + "num_tokens": 3662282.0, + "repeat_count": 1.0, + "routers_loss": 0.030201267451047897, + "skip_count": 4.0, + "step": 2270, + "text_loss": 0.7300035953521729 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0009267440392236562, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 3665531.0, + "repeat_count": 0.0, + "routers_loss": 0.0026635683607310057, + "skip_count": 0.0, + "step": 2272, + "text_loss": 0.31535038352012634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009265826645840178, + "loss": 0.0151, + "macro_f1": 0.3333333432674408, + "num_tokens": 3668407.0, + "repeat_count": 0.0, + "routers_loss": 0.004258926957845688, + "skip_count": 0.0, + "step": 2274, + "text_loss": 0.7272579073905945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 10.68564719694746, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.125, + "learning_rate": 0.0009264211264770976, + "loss": 0.0154, + "macro_f1": 0.6122449040412903, + "num_tokens": 3671503.0, + "repeat_count": 0.0, + "routers_loss": 0.038987524807453156, + "skip_count": 4.0, + "step": 2276, + "text_loss": 0.7488982677459717 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009262594249647975, + "loss": 0.0164, + "macro_f1": 0.6666666865348816, + "num_tokens": 3674107.0, + "repeat_count": 0.0, + "routers_loss": 0.007211760152131319, + "skip_count": 1.0, + "step": 2278, + "text_loss": 0.1992369294166565 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 10.704432051658351, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0009260975601090815, + "loss": 0.0112, + "macro_f1": 0.9446290731430054, + "num_tokens": 3677184.0, + "repeat_count": 4.0, + "routers_loss": 0.02538592554628849, + "skip_count": 3.0, + "step": 2280, + "text_loss": 0.46402135491371155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009259355319719768, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 3680683.0, + "repeat_count": 0.0, + "routers_loss": 0.0038464947137981653, + "skip_count": 0.0, + "step": 2282, + "text_loss": 0.5804527401924133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009257733406155726, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3683928.0, + "repeat_count": 0.0, + "routers_loss": 0.004841136280447245, + "skip_count": 0.0, + "step": 2284, + "text_loss": 0.4834538400173187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009256109861020212, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 3687101.0, + "repeat_count": 0.0, + "routers_loss": 0.002191900508478284, + "skip_count": 0.0, + "step": 2286, + "text_loss": 0.8199604749679565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.742001761080129, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0927734375, + "learning_rate": 0.000925448468493537, + "loss": 0.0162, + "macro_f1": 0.5427350401878357, + "num_tokens": 3690490.0, + "repeat_count": 1.0, + "routers_loss": 0.03488675877451897, + "skip_count": 2.0, + "step": 2288, + "text_loss": 0.33263635635375977 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009252857878523971, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 3694109.0, + "repeat_count": 1.0, + "routers_loss": 0.002897309372201562, + "skip_count": 0.0, + "step": 2290, + "text_loss": 0.47494807839393616 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.000925122944240941, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3697233.0, + "repeat_count": 0.0, + "routers_loss": 0.01842675730586052, + "skip_count": 2.0, + "step": 2292, + "text_loss": 0.14693495631217957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.770179043146463, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.045654296875, + "learning_rate": 0.0009249599377215707, + "loss": 0.0146, + "macro_f1": 0.5866667032241821, + "num_tokens": 3700376.0, + "repeat_count": 1.0, + "routers_loss": 0.04169808700680733, + "skip_count": 3.0, + "step": 2294, + "text_loss": 0.38051268458366394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.779571470501908, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0009247967683567507, + "loss": 0.0112, + "macro_f1": 0.3272727429866791, + "num_tokens": 3703212.0, + "repeat_count": 0.0, + "routers_loss": 0.012183113023638725, + "skip_count": 1.0, + "step": 2296, + "text_loss": 0.23789077997207642 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 10.788963897857352, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05712890625, + "learning_rate": 0.0009246334362090077, + "loss": 0.0137, + "macro_f1": 0.8823530077934265, + "num_tokens": 3706490.0, + "repeat_count": 1.0, + "routers_loss": 0.01880069635808468, + "skip_count": 2.0, + "step": 2298, + "text_loss": 0.29067978262901306 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.798356325212797, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.000924469941340931, + "loss": 0.0173, + "macro_f1": 0.3272727429866791, + "num_tokens": 3709804.0, + "repeat_count": 1.0, + "routers_loss": 0.027359159663319588, + "skip_count": 0.0, + "step": 2300, + "text_loss": 0.67828369140625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.000924306283815172, + "loss": 0.0153, + "macro_f1": 0.3333333432674408, + "num_tokens": 3712824.0, + "repeat_count": 0.0, + "routers_loss": 0.003152279881760478, + "skip_count": 0.0, + "step": 2302, + "text_loss": 0.8333184719085693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.817141179923686, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0703125, + "learning_rate": 0.0009241424636944445, + "loss": 0.0159, + "macro_f1": 0.5492662787437439, + "num_tokens": 3715385.0, + "repeat_count": 0.0, + "routers_loss": 0.0442950464785099, + "skip_count": 2.0, + "step": 2304, + "text_loss": 0.41893699765205383 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 10.826533607279131, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009239784810415249, + "loss": 0.0137, + "macro_f1": 0.8823530077934265, + "num_tokens": 3719080.0, + "repeat_count": 1.0, + "routers_loss": 0.015729321166872978, + "skip_count": 2.0, + "step": 2306, + "text_loss": 0.13360483944416046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.835926034634575, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009238143359192514, + "loss": 0.0136, + "macro_f1": 0.5934640765190125, + "num_tokens": 3722439.0, + "repeat_count": 0.0, + "routers_loss": 0.028816604986786842, + "skip_count": 3.0, + "step": 2308, + "text_loss": 0.39594101905822754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.000923650028390525, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3725092.0, + "repeat_count": 0.0, + "routers_loss": 0.0036455015651881695, + "skip_count": 2.0, + "step": 2310, + "text_loss": 0.6169708371162415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009234855585183086, + "loss": 0.014, + "macro_f1": 0.6666666865348816, + "num_tokens": 3728412.0, + "repeat_count": 0.0, + "routers_loss": 0.007565604057163, + "skip_count": 1.0, + "step": 2312, + "text_loss": 0.21257059276103973 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 10.86410331670091, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0517578125, + "learning_rate": 0.0009233209263656273, + "loss": 0.0184, + "macro_f1": 0.9262410998344421, + "num_tokens": 3731467.0, + "repeat_count": 2.0, + "routers_loss": 0.02510629966855049, + "skip_count": 3.0, + "step": 2314, + "text_loss": 0.21639840304851532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0009231561319955684, + "loss": 0.0154, + "macro_f1": 0.3333333432674408, + "num_tokens": 3734906.0, + "repeat_count": 0.0, + "routers_loss": 0.00872227642685175, + "skip_count": 0.0, + "step": 2316, + "text_loss": 0.35639774799346924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009229911754712815, + "loss": 0.0176, + "macro_f1": 0.3333333432674408, + "num_tokens": 3737943.0, + "repeat_count": 0.0, + "routers_loss": 0.004695790819823742, + "skip_count": 0.0, + "step": 2318, + "text_loss": 0.5269573330879211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.892280598767243, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0009228260568559781, + "loss": 0.0115, + "macro_f1": 0.3272727429866791, + "num_tokens": 3741833.0, + "repeat_count": 1.0, + "routers_loss": 0.0217357836663723, + "skip_count": 0.0, + "step": 2320, + "text_loss": 0.5110208988189697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.901673026122689, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.0009226607762129322, + "loss": 0.0201, + "macro_f1": 0.32098764181137085, + "num_tokens": 3744642.0, + "repeat_count": 1.0, + "routers_loss": 0.05595960095524788, + "skip_count": 1.0, + "step": 2322, + "text_loss": 0.6291998624801636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0009224953336054796, + "loss": 0.0161, + "macro_f1": 0.3333333432674408, + "num_tokens": 3748127.0, + "repeat_count": 0.0, + "routers_loss": 0.0071634589694440365, + "skip_count": 0.0, + "step": 2324, + "text_loss": 0.7404762506484985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.000922329729097018, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3751373.0, + "repeat_count": 0.0, + "routers_loss": 0.0011676300782710314, + "skip_count": 0.0, + "step": 2326, + "text_loss": 0.2915459871292114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009221639627510075, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3754518.0, + "repeat_count": 0.0, + "routers_loss": 0.01039792038500309, + "skip_count": 0.0, + "step": 2328, + "text_loss": 0.22066321969032288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009219980346309702, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 3757621.0, + "repeat_count": 0.0, + "routers_loss": 0.0032070958986878395, + "skip_count": 0.0, + "step": 2330, + "text_loss": 0.5558560490608215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.076171875, + "learning_rate": 0.0009218319448004899, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 3760885.0, + "repeat_count": 0.0, + "routers_loss": 0.007085457909852266, + "skip_count": 0.0, + "step": 2332, + "text_loss": 0.4348253607749939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009216656933232129, + "loss": 0.016, + "macro_f1": 0.6666666865348816, + "num_tokens": 3764462.0, + "repeat_count": 0.0, + "routers_loss": 0.005504854489117861, + "skip_count": 1.0, + "step": 2334, + "text_loss": 0.35828644037246704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0009214992802628463, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3767159.0, + "repeat_count": 0.0, + "routers_loss": 0.0013970810687169433, + "skip_count": 0.0, + "step": 2336, + "text_loss": 0.2956557869911194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009213327056831607, + "loss": 0.0181, + "macro_f1": 0.3272727429866791, + "num_tokens": 3770408.0, + "repeat_count": 0.0, + "routers_loss": 0.0427570566534996, + "skip_count": 1.0, + "step": 2338, + "text_loss": 0.14883014559745789 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.986204872321691, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0009211659696479875, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 3773474.0, + "repeat_count": 0.0, + "routers_loss": 0.0011273405980318785, + "skip_count": 0.0, + "step": 2340, + "text_loss": 0.26011669635772705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.00092099907222122, + "loss": 0.0148, + "macro_f1": 0.3333333432674408, + "num_tokens": 3776909.0, + "repeat_count": 0.0, + "routers_loss": 0.0016178421210497618, + "skip_count": 0.0, + "step": 2342, + "text_loss": 0.49078530073165894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.000920832013466814, + "loss": 0.0129, + "macro_f1": 0.3333333432674408, + "num_tokens": 3780741.0, + "repeat_count": 0.0, + "routers_loss": 0.005510095041245222, + "skip_count": 0.0, + "step": 2344, + "text_loss": 0.4870249927043915 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0009206647934487866, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3784673.0, + "repeat_count": 1.0, + "routers_loss": 0.0047357892617583275, + "skip_count": 0.0, + "step": 2346, + "text_loss": 0.3251725733280182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0009204974122312167, + "loss": 0.0142, + "macro_f1": 0.6666666865348816, + "num_tokens": 3787503.0, + "repeat_count": 0.0, + "routers_loss": 0.00795028731226921, + "skip_count": 1.0, + "step": 2348, + "text_loss": 0.18282145261764526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0009203298698782452, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 3790528.0, + "repeat_count": 1.0, + "routers_loss": 0.0009506374481134117, + "skip_count": 0.0, + "step": 2350, + "text_loss": 0.4093080461025238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009201621664540747, + "loss": 0.0155, + "macro_f1": 0.6666666865348816, + "num_tokens": 3794134.0, + "repeat_count": 1.0, + "routers_loss": 0.005159572698175907, + "skip_count": 0.0, + "step": 2352, + "text_loss": 0.5451981425285339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009199943020229694, + "loss": 0.0148, + "macro_f1": 0.3333333432674408, + "num_tokens": 3797414.0, + "repeat_count": 0.0, + "routers_loss": 0.002356168581172824, + "skip_count": 0.0, + "step": 2354, + "text_loss": 0.3070453405380249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009198262766492554, + "loss": 0.0141, + "macro_f1": 0.6666666865348816, + "num_tokens": 3800094.0, + "repeat_count": 0.0, + "routers_loss": 0.0051761893555521965, + "skip_count": 1.0, + "step": 2356, + "text_loss": 0.5880904197692871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.00091965809039732, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3803280.0, + "repeat_count": 0.0, + "routers_loss": 0.0025952060241252184, + "skip_count": 0.0, + "step": 2358, + "text_loss": 0.5210731625556946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009194897433316127, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 3805866.0, + "repeat_count": 0.0, + "routers_loss": 0.0042560105212032795, + "skip_count": 2.0, + "step": 2360, + "text_loss": 0.6472984552383423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009193212355166446, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3808952.0, + "repeat_count": 0.0, + "routers_loss": 0.0026232977397739887, + "skip_count": 0.0, + "step": 2362, + "text_loss": 0.450063556432724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009191525670169881, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3812080.0, + "repeat_count": 0.0, + "routers_loss": 0.0034355956595391035, + "skip_count": 0.0, + "step": 2364, + "text_loss": 0.49727216362953186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.000918983737897277, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 3815282.0, + "repeat_count": 0.0, + "routers_loss": 0.0055653867311775684, + "skip_count": 1.0, + "step": 2366, + "text_loss": 0.6336377859115601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0009188147482222071, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 3818106.0, + "repeat_count": 2.0, + "routers_loss": 0.011016021482646465, + "skip_count": 2.0, + "step": 2368, + "text_loss": 0.22513329982757568 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009186455980565358, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 3821228.0, + "repeat_count": 1.0, + "routers_loss": 0.014039464294910431, + "skip_count": 0.0, + "step": 2370, + "text_loss": 0.21331638097763062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009184762874650816, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 3825048.0, + "repeat_count": 0.0, + "routers_loss": 0.001088051125407219, + "skip_count": 0.0, + "step": 2372, + "text_loss": 0.6031543612480164 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009183068165127245, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 3828781.0, + "repeat_count": 0.0, + "routers_loss": 0.006263940595090389, + "skip_count": 1.0, + "step": 2374, + "text_loss": 0.6249601244926453 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009181371852644062, + "loss": 0.0133, + "macro_f1": 0.6666666865348816, + "num_tokens": 3832507.0, + "repeat_count": 1.0, + "routers_loss": 0.001987969037145376, + "skip_count": 0.0, + "step": 2376, + "text_loss": 0.37972065806388855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009179673937851299, + "loss": 0.0158, + "macro_f1": 0.6666666865348816, + "num_tokens": 3835644.0, + "repeat_count": 0.0, + "routers_loss": 0.007635094691067934, + "skip_count": 1.0, + "step": 2378, + "text_loss": 0.46319663524627686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009177974421399598, + "loss": 0.0137, + "macro_f1": 0.6666666865348816, + "num_tokens": 3838700.0, + "repeat_count": 0.0, + "routers_loss": 0.01617279462516308, + "skip_count": 2.0, + "step": 2380, + "text_loss": 0.32141056656837463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009176273303940217, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 3841953.0, + "repeat_count": 0.0, + "routers_loss": 0.0022273799404501915, + "skip_count": 2.0, + "step": 2382, + "text_loss": 0.5908139944076538 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.192544760786616, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009174570586125026, + "loss": 0.0122, + "macro_f1": 0.32098767161369324, + "num_tokens": 3845763.0, + "repeat_count": 1.0, + "routers_loss": 0.030915161594748497, + "skip_count": 0.0, + "step": 2384, + "text_loss": 0.41400137543678284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0009172866268606513, + "loss": 0.0122, + "macro_f1": 0.6666666865348816, + "num_tokens": 3848984.0, + "repeat_count": 0.0, + "routers_loss": 0.010480951517820358, + "skip_count": 2.0, + "step": 2386, + "text_loss": 0.2560874819755554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009171160352037775, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3852118.0, + "repeat_count": 0.0, + "routers_loss": 0.00809961836785078, + "skip_count": 1.0, + "step": 2388, + "text_loss": 0.28236693143844604 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009169452837072521, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 3855314.0, + "repeat_count": 1.0, + "routers_loss": 0.005569872446358204, + "skip_count": 1.0, + "step": 2390, + "text_loss": 0.4578137695789337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009167743724365073, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 3858301.0, + "repeat_count": 0.0, + "routers_loss": 0.0038610948249697685, + "skip_count": 1.0, + "step": 2392, + "text_loss": 0.14082716405391693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009166033014570368, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3861296.0, + "repeat_count": 0.0, + "routers_loss": 0.0017607157351449132, + "skip_count": 0.0, + "step": 2394, + "text_loss": 0.384442001581192 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 11.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009164320708343954, + "loss": 0.0131, + "macro_f1": 0.6666666865348816, + "num_tokens": 3863985.0, + "repeat_count": 2.0, + "routers_loss": 0.009627950377762318, + "skip_count": 0.0, + "step": 2396, + "text_loss": 0.6969521045684814 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009162606806341989, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 3866636.0, + "repeat_count": 0.0, + "routers_loss": 0.006915586534887552, + "skip_count": 0.0, + "step": 2398, + "text_loss": 0.48069697618484497 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0009160891309221242, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 3870867.0, + "repeat_count": 1.0, + "routers_loss": 0.0013031222624704242, + "skip_count": 0.0, + "step": 2400, + "text_loss": 0.3882075846195221 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.277076606985618, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009159174217639096, + "loss": 0.0112, + "macro_f1": 0.5427350401878357, + "num_tokens": 3873663.0, + "repeat_count": 2.0, + "routers_loss": 0.06621067970991135, + "skip_count": 1.0, + "step": 2402, + "text_loss": 0.5740041136741638 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0009157455532253547, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 3876788.0, + "repeat_count": 1.0, + "routers_loss": 0.005957918707281351, + "skip_count": 0.0, + "step": 2404, + "text_loss": 0.26025933027267456 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 11.295861461696507, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009155735253723191, + "loss": 0.0126, + "macro_f1": 0.9452888369560242, + "num_tokens": 3879942.0, + "repeat_count": 1.0, + "routers_loss": 0.039429809898138046, + "skip_count": 4.0, + "step": 2406, + "text_loss": 1.1349908113479614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009154013382707251, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 3882682.0, + "repeat_count": 0.0, + "routers_loss": 0.0012570557883009315, + "skip_count": 0.0, + "step": 2408, + "text_loss": 0.5611135363578796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0009152289919865543, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3886425.0, + "repeat_count": 0.0, + "routers_loss": 0.0017455556662753224, + "skip_count": 0.0, + "step": 2410, + "text_loss": 0.7523751854896545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0009150564865858506, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3889273.0, + "repeat_count": 0.0, + "routers_loss": 0.011178011074662209, + "skip_count": 1.0, + "step": 2412, + "text_loss": 0.26942551136016846 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 11.333431171118287, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009148838221347182, + "loss": 0.0107, + "macro_f1": 0.5934640765190125, + "num_tokens": 3892199.0, + "repeat_count": 3.0, + "routers_loss": 0.019628092646598816, + "skip_count": 0.0, + "step": 2414, + "text_loss": 0.5492315888404846 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0009147109986993225, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 3895362.0, + "repeat_count": 1.0, + "routers_loss": 0.012255983427166939, + "skip_count": 0.0, + "step": 2416, + "text_loss": 0.23798216879367828 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009145380163458899, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 3898476.0, + "repeat_count": 0.0, + "routers_loss": 0.007018954027444124, + "skip_count": 0.0, + "step": 2418, + "text_loss": 0.1923145055770874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0009143648751407074, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 3901817.0, + "repeat_count": 0.0, + "routers_loss": 0.0008574824314564466, + "skip_count": 0.0, + "step": 2420, + "text_loss": 0.4001806974411011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.371000880540064, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11328125, + "learning_rate": 0.0009141915751501231, + "loss": 0.0102, + "macro_f1": 0.5492662787437439, + "num_tokens": 3905461.0, + "repeat_count": 0.0, + "routers_loss": 0.01572350226342678, + "skip_count": 2.0, + "step": 2422, + "text_loss": 0.19519129395484924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0009140181164405458, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3908878.0, + "repeat_count": 0.0, + "routers_loss": 0.0005503420252352953, + "skip_count": 0.0, + "step": 2424, + "text_loss": 0.6937088370323181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009138444990784454, + "loss": 0.013, + "macro_f1": 0.3333333432674408, + "num_tokens": 3912053.0, + "repeat_count": 0.0, + "routers_loss": 0.007556677330285311, + "skip_count": 0.0, + "step": 2426, + "text_loss": 0.35431069135665894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.000913670723130352, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 3915192.0, + "repeat_count": 0.0, + "routers_loss": 0.0013609991874545813, + "skip_count": 0.0, + "step": 2428, + "text_loss": 0.5171207189559937 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009134967886628573, + "loss": 0.0115, + "macro_f1": 1.0, + "num_tokens": 3917927.0, + "repeat_count": 2.0, + "routers_loss": 0.010895746760070324, + "skip_count": 2.0, + "step": 2430, + "text_loss": 0.2852934002876282 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.417963017317287, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009133226957426133, + "loss": 0.0132, + "macro_f1": 0.5492662787437439, + "num_tokens": 3921460.0, + "repeat_count": 2.0, + "routers_loss": 0.04196908697485924, + "skip_count": 0.0, + "step": 2432, + "text_loss": 0.4864770770072937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009131484444363324, + "loss": 0.0155, + "macro_f1": 0.3333333432674408, + "num_tokens": 3924662.0, + "repeat_count": 0.0, + "routers_loss": 0.004484197124838829, + "skip_count": 0.0, + "step": 2434, + "text_loss": 0.7568684220314026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0009129740348107882, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3927337.0, + "repeat_count": 0.0, + "routers_loss": 0.004351360257714987, + "skip_count": 2.0, + "step": 2436, + "text_loss": 0.5953161716461182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 11.446140299383622, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.04736328125, + "learning_rate": 0.0009127994669328151, + "loss": 0.0085, + "macro_f1": 0.6122449040412903, + "num_tokens": 3930407.0, + "repeat_count": 0.0, + "routers_loss": 0.01664198748767376, + "skip_count": 4.0, + "step": 2438, + "text_loss": 0.5320524573326111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009126247408693071, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 3933184.0, + "repeat_count": 0.0, + "routers_loss": 0.0017819046042859554, + "skip_count": 1.0, + "step": 2440, + "text_loss": 0.6051273345947266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009124498566872204, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 3936620.0, + "repeat_count": 0.0, + "routers_loss": 0.005519696045666933, + "skip_count": 0.0, + "step": 2442, + "text_loss": 0.12987950444221497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.474317581449956, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009122748144535704, + "loss": 0.0111, + "macro_f1": 0.32098764181137085, + "num_tokens": 3940010.0, + "repeat_count": 0.0, + "routers_loss": 0.04543351009488106, + "skip_count": 2.0, + "step": 2444, + "text_loss": 0.4642033576965332 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009120996142354338, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3943135.0, + "repeat_count": 0.0, + "routers_loss": 0.00550565542653203, + "skip_count": 0.0, + "step": 2446, + "text_loss": 0.5697627067565918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009119242560999477, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3946650.0, + "repeat_count": 0.0, + "routers_loss": 0.008842485956847668, + "skip_count": 0.0, + "step": 2448, + "text_loss": 0.17046524584293365 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0009117487401143095, + "loss": 0.0154, + "macro_f1": 0.6666666865348816, + "num_tokens": 3949470.0, + "repeat_count": 1.0, + "routers_loss": 0.005900127813220024, + "skip_count": 0.0, + "step": 2450, + "text_loss": 0.37260866165161133 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0009115730663457773, + "loss": 0.0137, + "macro_f1": 1.0, + "num_tokens": 3952546.0, + "repeat_count": 1.0, + "routers_loss": 0.003409258322790265, + "skip_count": 1.0, + "step": 2452, + "text_loss": 0.5308008193969727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009113972348616698, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 3955817.0, + "repeat_count": 0.0, + "routers_loss": 0.010098597034811974, + "skip_count": 1.0, + "step": 2454, + "text_loss": 0.39226648211479187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 11.530672145582624, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009112212457293658, + "loss": 0.0102, + "macro_f1": 0.3272727429866791, + "num_tokens": 3958911.0, + "repeat_count": 0.0, + "routers_loss": 0.08184818178415298, + "skip_count": 0.0, + "step": 2456, + "text_loss": 0.45411455631256104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0009110450990163047, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 3962584.0, + "repeat_count": 0.0, + "routers_loss": 0.0009352223132736981, + "skip_count": 0.0, + "step": 2458, + "text_loss": 0.47292324900627136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0009108687947899863, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 3965597.0, + "repeat_count": 1.0, + "routers_loss": 0.008150188252329826, + "skip_count": 2.0, + "step": 2460, + "text_loss": 0.33208340406417847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.558849427648958, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009106923331179707, + "loss": 0.0125, + "macro_f1": 0.5492662787437439, + "num_tokens": 3968664.0, + "repeat_count": 0.0, + "routers_loss": 0.050999004393815994, + "skip_count": 2.0, + "step": 2462, + "text_loss": 0.2459995150566101 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009105157140678782, + "loss": 0.0126, + "macro_f1": 0.6666666865348816, + "num_tokens": 3971772.0, + "repeat_count": 0.0, + "routers_loss": 0.006196586415171623, + "skip_count": 1.0, + "step": 2464, + "text_loss": 0.23956991732120514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.577634282359847, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009103389377073896, + "loss": 0.01, + "macro_f1": 0.3333333432674408, + "num_tokens": 3976224.0, + "repeat_count": 0.0, + "routers_loss": 0.008181816898286343, + "skip_count": 0.0, + "step": 2466, + "text_loss": 0.3235875070095062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.0009101620041042462, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3978876.0, + "repeat_count": 0.0, + "routers_loss": 0.0015451472718268633, + "skip_count": 0.0, + "step": 2468, + "text_loss": 0.4038759469985962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.596419137070736, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09130859375, + "learning_rate": 0.000909984913326249, + "loss": 0.0131, + "macro_f1": 0.3272727429866791, + "num_tokens": 3981992.0, + "repeat_count": 0.0, + "routers_loss": 0.021785033866763115, + "skip_count": 1.0, + "step": 2470, + "text_loss": 0.6346460580825806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009098076654412595, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 3984560.0, + "repeat_count": 0.0, + "routers_loss": 0.0011462471447885036, + "skip_count": 0.0, + "step": 2472, + "text_loss": 0.3449646532535553 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009096302605171996, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 3987548.0, + "repeat_count": 0.0, + "routers_loss": 0.0014367027906700969, + "skip_count": 0.0, + "step": 2474, + "text_loss": 0.5918350219726562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0009094526986220513, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 3990727.0, + "repeat_count": 0.0, + "routers_loss": 0.0008977655088528991, + "skip_count": 0.0, + "step": 2476, + "text_loss": 0.463350385427475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.633988846492516, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0009092749798238563, + "loss": 0.015, + "macro_f1": 0.3272727429866791, + "num_tokens": 3993757.0, + "repeat_count": 1.0, + "routers_loss": 0.016712551936507225, + "skip_count": 0.0, + "step": 2478, + "text_loss": 0.5621229410171509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.643381273847961, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.000909097104190717, + "loss": 0.0172, + "macro_f1": 0.32098764181137085, + "num_tokens": 3997259.0, + "repeat_count": 0.0, + "routers_loss": 0.04134179651737213, + "skip_count": 2.0, + "step": 2480, + "text_loss": 0.375476598739624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0009089190717907956, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4000563.0, + "repeat_count": 0.0, + "routers_loss": 0.003462378401309252, + "skip_count": 0.0, + "step": 2482, + "text_loss": 0.5553798675537109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009087408826923146, + "loss": 0.0182, + "macro_f1": 0.6666666865348816, + "num_tokens": 4004065.0, + "repeat_count": 0.0, + "routers_loss": 0.008057428523898125, + "skip_count": 2.0, + "step": 2484, + "text_loss": 0.4329465329647064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009085625369635564, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4007119.0, + "repeat_count": 0.0, + "routers_loss": 0.005759050603955984, + "skip_count": 0.0, + "step": 2486, + "text_loss": 0.501268744468689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.680950983269739, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009083840346728631, + "loss": 0.0122, + "macro_f1": 0.3272727429866791, + "num_tokens": 4010547.0, + "repeat_count": 1.0, + "routers_loss": 0.020763102918863297, + "skip_count": 0.0, + "step": 2488, + "text_loss": 0.480196475982666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0009082053758886374, + "loss": 0.0117, + "macro_f1": 0.6666666865348816, + "num_tokens": 4014600.0, + "repeat_count": 0.0, + "routers_loss": 0.005801836494356394, + "skip_count": 1.0, + "step": 2490, + "text_loss": 0.18249782919883728 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009080265606793416, + "loss": 0.0128, + "macro_f1": 1.0, + "num_tokens": 4017964.0, + "repeat_count": 1.0, + "routers_loss": 0.004226063843816519, + "skip_count": 1.0, + "step": 2492, + "text_loss": 0.6573076248168945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.000907847589113498, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 4020694.0, + "repeat_count": 0.0, + "routers_loss": 0.004281101748347282, + "skip_count": 2.0, + "step": 2494, + "text_loss": 0.3944586217403412 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.000907668461259689, + "loss": 0.0152, + "macro_f1": 0.6666666865348816, + "num_tokens": 4023757.0, + "repeat_count": 0.0, + "routers_loss": 0.008786370046436787, + "skip_count": 1.0, + "step": 2496, + "text_loss": 0.6452898979187012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009074891771865566, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 4026601.0, + "repeat_count": 0.0, + "routers_loss": 0.005209595896303654, + "skip_count": 0.0, + "step": 2498, + "text_loss": 0.9633619785308838 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 11.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0009073097369628028, + "loss": 0.013, + "macro_f1": 1.0, + "num_tokens": 4030321.0, + "repeat_count": 3.0, + "routers_loss": 0.00860709697008133, + "skip_count": 1.0, + "step": 2500, + "text_loss": 0.48566827178001404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009071301406571893, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 4033234.0, + "repeat_count": 0.0, + "routers_loss": 0.0035277456045150757, + "skip_count": 0.0, + "step": 2502, + "text_loss": 0.3771554231643677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000906950388338538, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 4036417.0, + "repeat_count": 0.0, + "routers_loss": 0.0013424850767478347, + "skip_count": 0.0, + "step": 2504, + "text_loss": 0.8962806463241577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009067704800757301, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4039564.0, + "repeat_count": 0.0, + "routers_loss": 0.0010423909407109022, + "skip_count": 0.0, + "step": 2506, + "text_loss": 0.43170279264450073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.774875256824185, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.000906590415937707, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 4043212.0, + "repeat_count": 0.0, + "routers_loss": 0.021780289709568024, + "skip_count": 1.0, + "step": 2508, + "text_loss": 0.41495826840400696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0009064101959934696, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4046687.0, + "repeat_count": 0.0, + "routers_loss": 0.007261929102241993, + "skip_count": 1.0, + "step": 2510, + "text_loss": 0.21821187436580658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0009062298203120783, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 4050735.0, + "repeat_count": 0.0, + "routers_loss": 0.007447180338203907, + "skip_count": 2.0, + "step": 2512, + "text_loss": 0.1818767935037613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.803052538890519, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0009060492889626535, + "loss": 0.0142, + "macro_f1": 0.3272727429866791, + "num_tokens": 4054426.0, + "repeat_count": 1.0, + "routers_loss": 0.0718490406870842, + "skip_count": 0.0, + "step": 2514, + "text_loss": 0.22798970341682434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009058686020143753, + "loss": 0.0183, + "macro_f1": 0.3333333432674408, + "num_tokens": 4057615.0, + "repeat_count": 0.0, + "routers_loss": 0.0052676633931696415, + "skip_count": 0.0, + "step": 2516, + "text_loss": 0.1712338626384735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0009056877595364832, + "loss": 0.0137, + "macro_f1": 0.3333333432674408, + "num_tokens": 4060338.0, + "repeat_count": 0.0, + "routers_loss": 0.0018052728846669197, + "skip_count": 0.0, + "step": 2518, + "text_loss": 0.6811438798904419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009055067615982761, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4062887.0, + "repeat_count": 0.0, + "routers_loss": 0.0009029926732182503, + "skip_count": 0.0, + "step": 2520, + "text_loss": 0.5480356812477112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009053256082691133, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 4065357.0, + "repeat_count": 0.0, + "routers_loss": 0.0027515271212905645, + "skip_count": 0.0, + "step": 2522, + "text_loss": 0.5234101414680481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009051442996184127, + "loss": 0.0174, + "macro_f1": 0.3333333432674408, + "num_tokens": 4068111.0, + "repeat_count": 0.0, + "routers_loss": 0.002199822571128607, + "skip_count": 0.0, + "step": 2524, + "text_loss": 0.2418575882911682 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009049628357156521, + "loss": 0.0143, + "macro_f1": 0.6666666865348816, + "num_tokens": 4071284.0, + "repeat_count": 0.0, + "routers_loss": 0.006303096655756235, + "skip_count": 2.0, + "step": 2526, + "text_loss": 0.7948065996170044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.868799530378633, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000904781216630369, + "loss": 0.0068, + "macro_f1": 0.6601307392120361, + "num_tokens": 4074750.0, + "repeat_count": 1.0, + "routers_loss": 0.01791904680430889, + "skip_count": 2.0, + "step": 2528, + "text_loss": 0.809726357460022 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009045994424321602, + "loss": 0.0102, + "macro_f1": 1.0, + "num_tokens": 4078617.0, + "repeat_count": 2.0, + "routers_loss": 0.016553178429603577, + "skip_count": 2.0, + "step": 2530, + "text_loss": 0.8755000829696655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0009044175131906817, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 4080936.0, + "repeat_count": 0.0, + "routers_loss": 0.00884837657213211, + "skip_count": 0.0, + "step": 2532, + "text_loss": 0.795871913433075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.896976812444967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009042354289756491, + "loss": 0.0122, + "macro_f1": 0.3333333432674408, + "num_tokens": 4084459.0, + "repeat_count": 0.0, + "routers_loss": 0.0024387789890170097, + "skip_count": 0.0, + "step": 2534, + "text_loss": 0.18875400722026825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009040531898568379, + "loss": 0.0171, + "macro_f1": 0.3333333432674408, + "num_tokens": 4088464.0, + "repeat_count": 0.0, + "routers_loss": 0.00491489190608263, + "skip_count": 0.0, + "step": 2536, + "text_loss": 0.334369033575058 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.091796875, + "learning_rate": 0.000903870795904082, + "loss": 0.0145, + "macro_f1": 0.6666666865348816, + "num_tokens": 4091659.0, + "repeat_count": 0.0, + "routers_loss": 0.004592662677168846, + "skip_count": 2.0, + "step": 2538, + "text_loss": 0.21298295259475708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.925154094511301, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0458984375, + "learning_rate": 0.000903688247187275, + "loss": 0.0137, + "macro_f1": 0.5492662787437439, + "num_tokens": 4095496.0, + "repeat_count": 0.0, + "routers_loss": 0.011647242121398449, + "skip_count": 2.0, + "step": 2540, + "text_loss": 0.2985081672668457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009035055437763704, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4098663.0, + "repeat_count": 0.0, + "routers_loss": 0.0021238960325717926, + "skip_count": 0.0, + "step": 2542, + "text_loss": 0.35359489917755127 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0009033226857413803, + "loss": 0.0163, + "macro_f1": 0.6666666865348816, + "num_tokens": 4101588.0, + "repeat_count": 1.0, + "routers_loss": 0.0024701557122170925, + "skip_count": 0.0, + "step": 2544, + "text_loss": 1.1577601432800293 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.000903139673152376, + "loss": 0.012, + "macro_f1": 0.3333333432674408, + "num_tokens": 4104643.0, + "repeat_count": 0.0, + "routers_loss": 0.002499542199075222, + "skip_count": 0.0, + "step": 2546, + "text_loss": 1.0173401832580566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0009029565060794885, + "loss": 0.0165, + "macro_f1": 0.3333333432674408, + "num_tokens": 4109247.0, + "repeat_count": 0.0, + "routers_loss": 0.0034200598020106554, + "skip_count": 0.0, + "step": 2548, + "text_loss": 0.5690504312515259 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.972116231288524, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009027731845929079, + "loss": 0.0155, + "macro_f1": 0.8823530077934265, + "num_tokens": 4112597.0, + "repeat_count": 1.0, + "routers_loss": 0.015981333330273628, + "skip_count": 1.0, + "step": 2550, + "text_loss": 0.294549822807312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.981508658643968, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06103515625, + "learning_rate": 0.0009025897087628829, + "loss": 0.0064, + "macro_f1": 0.5492662787437439, + "num_tokens": 4115844.0, + "repeat_count": 0.0, + "routers_loss": 0.02606951631605625, + "skip_count": 2.0, + "step": 2552, + "text_loss": 0.22692419588565826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009024060786597222, + "loss": 0.0202, + "macro_f1": 0.3333333432674408, + "num_tokens": 4118634.0, + "repeat_count": 0.0, + "routers_loss": 0.001026194542646408, + "skip_count": 0.0, + "step": 2554, + "text_loss": 0.6807059645652771 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.000902222294353793, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4122024.0, + "repeat_count": 0.0, + "routers_loss": 0.001974924933165312, + "skip_count": 0.0, + "step": 2556, + "text_loss": 0.7373668551445007 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009020383559155219, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 4124803.0, + "repeat_count": 1.0, + "routers_loss": 0.004662613850086927, + "skip_count": 2.0, + "step": 2558, + "text_loss": 0.21808166801929474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0009018542634153943, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 4127680.0, + "repeat_count": 0.0, + "routers_loss": 0.006881687790155411, + "skip_count": 0.0, + "step": 2560, + "text_loss": 0.25192978978157043 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009016700169239551, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 4130431.0, + "repeat_count": 1.0, + "routers_loss": 0.005977808032184839, + "skip_count": 1.0, + "step": 2562, + "text_loss": 0.4700816869735718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009014856165118075, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 4133535.0, + "repeat_count": 0.0, + "routers_loss": 0.007005698047578335, + "skip_count": 1.0, + "step": 2564, + "text_loss": 0.6558199524879456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0009013010622496144, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 4136534.0, + "repeat_count": 0.0, + "routers_loss": 0.007262171246111393, + "skip_count": 0.0, + "step": 2566, + "text_loss": 0.2565421462059021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 12.056354564132668, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009011163542080971, + "loss": 0.0088, + "macro_f1": 0.5934640765190125, + "num_tokens": 4139762.0, + "repeat_count": 0.0, + "routers_loss": 0.05431923270225525, + "skip_count": 3.0, + "step": 2568, + "text_loss": 0.19896510243415833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0009009314924580363, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4143398.0, + "repeat_count": 0.0, + "routers_loss": 0.003667369019240141, + "skip_count": 0.0, + "step": 2570, + "text_loss": 0.6581419110298157 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0009007464770702712, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4146248.0, + "repeat_count": 0.0, + "routers_loss": 0.00132099783513695, + "skip_count": 0.0, + "step": 2572, + "text_loss": 0.5316711068153381 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0009005613081157002, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 4149455.0, + "repeat_count": 0.0, + "routers_loss": 0.0020061524119228125, + "skip_count": 0.0, + "step": 2574, + "text_loss": 0.5400773882865906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0009003759856652802, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 4152774.0, + "repeat_count": 0.0, + "routers_loss": 0.002621434163302183, + "skip_count": 1.0, + "step": 2576, + "text_loss": 0.3672606945037842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009001905097900273, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 4155835.0, + "repeat_count": 0.0, + "routers_loss": 0.005290219560265541, + "skip_count": 0.0, + "step": 2578, + "text_loss": 0.8159038424491882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0009000048805610161, + "loss": 0.0119, + "macro_f1": 0.3333333432674408, + "num_tokens": 4158874.0, + "repeat_count": 0.0, + "routers_loss": 0.0013576085912063718, + "skip_count": 0.0, + "step": 2580, + "text_loss": 0.5518951416015625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.00089981909804938, + "loss": 0.0143, + "macro_f1": 0.3333333432674408, + "num_tokens": 4162076.0, + "repeat_count": 0.0, + "routers_loss": 0.0021483441814780235, + "skip_count": 0.0, + "step": 2582, + "text_loss": 0.43552228808403015 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 12.131493982976226, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.068359375, + "learning_rate": 0.0008996331623263114, + "loss": 0.0117, + "macro_f1": 0.7795917987823486, + "num_tokens": 4165041.0, + "repeat_count": 1.0, + "routers_loss": 0.0544300302863121, + "skip_count": 4.0, + "step": 2584, + "text_loss": 0.24812501668930054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0008994470734630611, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 4168290.0, + "repeat_count": 0.0, + "routers_loss": 0.0017150711501017213, + "skip_count": 0.0, + "step": 2586, + "text_loss": 0.6392097473144531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0008992608315309388, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4171310.0, + "repeat_count": 0.0, + "routers_loss": 0.0046473173424601555, + "skip_count": 2.0, + "step": 2588, + "text_loss": 0.6534156799316406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.15967126504256, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0008990744366013125, + "loss": 0.0105, + "macro_f1": 0.3144654333591461, + "num_tokens": 4174042.0, + "repeat_count": 2.0, + "routers_loss": 0.060913100838661194, + "skip_count": 1.0, + "step": 2590, + "text_loss": 0.5365690588951111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 12.169063692398003, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008988878887456093, + "loss": 0.0118, + "macro_f1": 0.6051587462425232, + "num_tokens": 4177666.0, + "repeat_count": 1.0, + "routers_loss": 0.06268956512212753, + "skip_count": 4.0, + "step": 2592, + "text_loss": 0.226226806640625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.178456119753449, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008987011880353149, + "loss": 0.0089, + "macro_f1": 0.32098764181137085, + "num_tokens": 4180490.0, + "repeat_count": 0.0, + "routers_loss": 0.030141465365886688, + "skip_count": 2.0, + "step": 2594, + "text_loss": 0.2581401765346527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.187848547108894, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008985143345419729, + "loss": 0.0082, + "macro_f1": 0.5492662787437439, + "num_tokens": 4183300.0, + "repeat_count": 0.0, + "routers_loss": 0.018745863810181618, + "skip_count": 2.0, + "step": 2596, + "text_loss": 0.7778542637825012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.064453125, + "learning_rate": 0.0008983273283371862, + "loss": 0.0096, + "macro_f1": 0.5492662787437439, + "num_tokens": 4186535.0, + "repeat_count": 0.0, + "routers_loss": 0.026792079210281372, + "skip_count": 2.0, + "step": 2598, + "text_loss": 0.34700271487236023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008981401694926159, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4189082.0, + "repeat_count": 0.0, + "routers_loss": 0.001914160675369203, + "skip_count": 0.0, + "step": 2600, + "text_loss": 0.6879339218139648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0008979528580799815, + "loss": 0.0136, + "macro_f1": 0.6666666865348816, + "num_tokens": 4192330.0, + "repeat_count": 0.0, + "routers_loss": 0.007978348061442375, + "skip_count": 2.0, + "step": 2602, + "text_loss": 0.3524550497531891 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 12.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008977653941710613, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 4196117.0, + "repeat_count": 2.0, + "routers_loss": 0.0035376469604671, + "skip_count": 0.0, + "step": 2604, + "text_loss": 0.42356348037719727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.0008975777778376916, + "loss": 0.0156, + "macro_f1": 0.6666666865348816, + "num_tokens": 4200423.0, + "repeat_count": 0.0, + "routers_loss": 0.008262477815151215, + "skip_count": 1.0, + "step": 2606, + "text_loss": 0.5272893905639648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.244203111241562, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0008973900091517675, + "loss": 0.0114, + "macro_f1": 0.3272727429866791, + "num_tokens": 4203257.0, + "repeat_count": 0.0, + "routers_loss": 0.022957922890782356, + "skip_count": 1.0, + "step": 2608, + "text_loss": 0.2713734805583954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.253595538597006, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.000897202088185242, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 4206243.0, + "repeat_count": 0.0, + "routers_loss": 0.006623407825827599, + "skip_count": 2.0, + "step": 2610, + "text_loss": 0.5920525789260864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008970140150101274, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4209264.0, + "repeat_count": 0.0, + "routers_loss": 0.0008602747693657875, + "skip_count": 0.0, + "step": 2612, + "text_loss": 0.33421996235847473 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0008968257896984932, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 4212058.0, + "repeat_count": 0.0, + "routers_loss": 0.0024653903674334288, + "skip_count": 1.0, + "step": 2614, + "text_loss": 0.37923356890678406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008966374123224677, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4214929.0, + "repeat_count": 0.0, + "routers_loss": 0.010878405533730984, + "skip_count": 0.0, + "step": 2616, + "text_loss": 0.4350503981113434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.291165248018785, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0008964488829542376, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 4219170.0, + "repeat_count": 0.0, + "routers_loss": 0.02864212542772293, + "skip_count": 1.0, + "step": 2618, + "text_loss": 0.26250728964805603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008962602016660478, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4222077.0, + "repeat_count": 0.0, + "routers_loss": 0.010444172658026218, + "skip_count": 2.0, + "step": 2620, + "text_loss": 0.4718937575817108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008960713685302011, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4225383.0, + "repeat_count": 0.0, + "routers_loss": 0.006409442983567715, + "skip_count": 1.0, + "step": 2622, + "text_loss": 0.30420538783073425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.31934253008512, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0008958823836190588, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 4228349.0, + "repeat_count": 0.0, + "routers_loss": 0.009996986016631126, + "skip_count": 1.0, + "step": 2624, + "text_loss": 0.5392362475395203 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0008956932470050404, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 4232007.0, + "repeat_count": 0.0, + "routers_loss": 0.0014383369125425816, + "skip_count": 0.0, + "step": 2626, + "text_loss": 0.7112401127815247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0008955039587606233, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 4235122.0, + "repeat_count": 0.0, + "routers_loss": 0.00781513936817646, + "skip_count": 3.0, + "step": 2628, + "text_loss": 0.17802883684635162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 12.347519812151454, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008953145189583429, + "loss": 0.0126, + "macro_f1": 0.542222261428833, + "num_tokens": 4238248.0, + "repeat_count": 0.0, + "routers_loss": 0.062252625823020935, + "skip_count": 4.0, + "step": 2630, + "text_loss": 0.5551572442054749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008951249276707933, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4241042.0, + "repeat_count": 0.0, + "routers_loss": 0.0011421777307987213, + "skip_count": 0.0, + "step": 2632, + "text_loss": 0.7092233896255493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0008949351849706261, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4243939.0, + "repeat_count": 0.0, + "routers_loss": 0.0032689040526747704, + "skip_count": 0.0, + "step": 2634, + "text_loss": 0.19925718009471893 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008947452909305509, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 4247535.0, + "repeat_count": 1.0, + "routers_loss": 0.002066014800220728, + "skip_count": 0.0, + "step": 2636, + "text_loss": 0.5249715447425842 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 12.385089521573232, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.09326171875, + "learning_rate": 0.0008945552456233356, + "loss": 0.0169, + "macro_f1": 0.8820862174034119, + "num_tokens": 4251441.0, + "repeat_count": 2.0, + "routers_loss": 0.029332537204027176, + "skip_count": 2.0, + "step": 2638, + "text_loss": 0.19229578971862793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.078125, + "learning_rate": 0.0008943650491218058, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4254314.0, + "repeat_count": 0.0, + "routers_loss": 0.0075911120511591434, + "skip_count": 0.0, + "step": 2640, + "text_loss": 0.27059751749038696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008941747014988453, + "loss": 0.0156, + "macro_f1": 0.3333333432674408, + "num_tokens": 4257442.0, + "repeat_count": 0.0, + "routers_loss": 0.009030844084918499, + "skip_count": 0.0, + "step": 2642, + "text_loss": 0.36747801303863525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.123046875, + "learning_rate": 0.0008939842028273956, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 4260386.0, + "repeat_count": 0.0, + "routers_loss": 0.007844001986086369, + "skip_count": 1.0, + "step": 2644, + "text_loss": 0.6397647857666016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.422659230995011, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008937935531804562, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 4263516.0, + "repeat_count": 0.0, + "routers_loss": 0.0018789108144119382, + "skip_count": 0.0, + "step": 2646, + "text_loss": 0.4795534908771515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.432051658350455, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0008936027526310844, + "loss": 0.0098, + "macro_f1": 0.3272727429866791, + "num_tokens": 4266744.0, + "repeat_count": 0.0, + "routers_loss": 0.0348590686917305, + "skip_count": 1.0, + "step": 2648, + "text_loss": 0.27691999077796936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.000893411801252395, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4269766.0, + "repeat_count": 0.0, + "routers_loss": 0.004543309565633535, + "skip_count": 1.0, + "step": 2650, + "text_loss": 0.18867231905460358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008932206991175615, + "loss": 0.0141, + "macro_f1": 0.6666666865348816, + "num_tokens": 4273513.0, + "repeat_count": 0.0, + "routers_loss": 0.0035277456045150757, + "skip_count": 1.0, + "step": 2652, + "text_loss": 0.45613357424736023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008930294462998143, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4276878.0, + "repeat_count": 1.0, + "routers_loss": 0.011337592266499996, + "skip_count": 0.0, + "step": 2654, + "text_loss": 0.24733254313468933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0008928380428724419, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4279915.0, + "repeat_count": 0.0, + "routers_loss": 0.0010295971296727657, + "skip_count": 1.0, + "step": 2656, + "text_loss": 0.41722849011421204 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008926464889087903, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 4282888.0, + "repeat_count": 0.0, + "routers_loss": 0.0017198545392602682, + "skip_count": 2.0, + "step": 2658, + "text_loss": 0.738322377204895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0008924547844822634, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4285805.0, + "repeat_count": 0.0, + "routers_loss": 0.001339946174994111, + "skip_count": 0.0, + "step": 2660, + "text_loss": 0.4802379906177521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.000892262929666323, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4290282.0, + "repeat_count": 0.0, + "routers_loss": 0.0022340165451169014, + "skip_count": 0.0, + "step": 2662, + "text_loss": 0.6503544449806213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008920709245344878, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4294106.0, + "repeat_count": 0.0, + "routers_loss": 0.005288850050419569, + "skip_count": 1.0, + "step": 2664, + "text_loss": 0.12312037497758865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0008918787691603347, + "loss": 0.0121, + "macro_f1": 0.6666666865348816, + "num_tokens": 4298013.0, + "repeat_count": 0.0, + "routers_loss": 0.004259659443050623, + "skip_count": 1.0, + "step": 2666, + "text_loss": 0.3070000112056732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.000891686463617498, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 4300799.0, + "repeat_count": 0.0, + "routers_loss": 0.009489355608820915, + "skip_count": 1.0, + "step": 2668, + "text_loss": 0.18535588681697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008914940079796696, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4304641.0, + "repeat_count": 0.0, + "routers_loss": 0.0025417013093829155, + "skip_count": 0.0, + "step": 2670, + "text_loss": 0.482585072517395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008913014023205988, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4307462.0, + "repeat_count": 0.0, + "routers_loss": 0.006371749565005302, + "skip_count": 0.0, + "step": 2672, + "text_loss": 0.7064456939697266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008911086467140925, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4310396.0, + "repeat_count": 0.0, + "routers_loss": 0.0027512952219694853, + "skip_count": 0.0, + "step": 2674, + "text_loss": 0.23532851040363312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05712890625, + "learning_rate": 0.000890915741234015, + "loss": 0.0133, + "macro_f1": 0.6666666865348816, + "num_tokens": 4314781.0, + "repeat_count": 0.0, + "routers_loss": 0.008253013715147972, + "skip_count": 1.0, + "step": 2676, + "text_loss": 0.30950358510017395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008907226859542879, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4317988.0, + "repeat_count": 0.0, + "routers_loss": 0.005409995559602976, + "skip_count": 2.0, + "step": 2678, + "text_loss": 0.4930732846260071 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0008905294809488907, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 4321014.0, + "repeat_count": 1.0, + "routers_loss": 0.0029942214023321867, + "skip_count": 1.0, + "step": 2680, + "text_loss": 0.6224040389060974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0008903361262918595, + "loss": 0.0115, + "macro_f1": 0.6666666865348816, + "num_tokens": 4324268.0, + "repeat_count": 0.0, + "routers_loss": 0.008411120623350143, + "skip_count": 1.0, + "step": 2682, + "text_loss": 0.16296671330928802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05126953125, + "learning_rate": 0.0008901426220572884, + "loss": 0.0138, + "macro_f1": 1.0, + "num_tokens": 4327494.0, + "repeat_count": 2.0, + "routers_loss": 0.01039006095379591, + "skip_count": 4.0, + "step": 2684, + "text_loss": 0.43866512179374695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0008899489683193286, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 4330936.0, + "repeat_count": 0.0, + "routers_loss": 0.0009329111780971289, + "skip_count": 0.0, + "step": 2686, + "text_loss": 0.44250962138175964 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0008897551651521885, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 4334123.0, + "repeat_count": 0.0, + "routers_loss": 0.003197216661646962, + "skip_count": 0.0, + "step": 2688, + "text_loss": 0.48313501477241516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.629292632814794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09716796875, + "learning_rate": 0.0008895612126301339, + "loss": 0.0157, + "macro_f1": 0.3333333432674408, + "num_tokens": 4337610.0, + "repeat_count": 0.0, + "routers_loss": 0.0033548236824572086, + "skip_count": 0.0, + "step": 2690, + "text_loss": 0.4715327322483063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0008893671108274877, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 4341026.0, + "repeat_count": 0.0, + "routers_loss": 0.0024757643695920706, + "skip_count": 0.0, + "step": 2692, + "text_loss": 0.43402785062789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008891728598186302, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 4344422.0, + "repeat_count": 0.0, + "routers_loss": 0.003317243419587612, + "skip_count": 0.0, + "step": 2694, + "text_loss": 0.8498559594154358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 12.657469914881126, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008889784596779986, + "loss": 0.009, + "macro_f1": 0.5934640765190125, + "num_tokens": 4347507.0, + "repeat_count": 0.0, + "routers_loss": 0.01577926240861416, + "skip_count": 3.0, + "step": 2696, + "text_loss": 0.5646669864654541 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11328125, + "learning_rate": 0.0008887839104800876, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4350414.0, + "repeat_count": 0.0, + "routers_loss": 0.002953822258859873, + "skip_count": 0.0, + "step": 2698, + "text_loss": 0.5145012140274048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008885892122994486, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 4354110.0, + "repeat_count": 0.0, + "routers_loss": 0.005849295295774937, + "skip_count": 0.0, + "step": 2700, + "text_loss": 0.580982506275177 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008883943652106903, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 4357323.0, + "repeat_count": 1.0, + "routers_loss": 0.012347398325800896, + "skip_count": 2.0, + "step": 2702, + "text_loss": 0.2234988808631897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0008881993692884787, + "loss": 0.0128, + "macro_f1": 0.6666666865348816, + "num_tokens": 4360228.0, + "repeat_count": 0.0, + "routers_loss": 0.003574999049305916, + "skip_count": 1.0, + "step": 2704, + "text_loss": 0.4261806607246399 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.704432051658351, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008880042246075365, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4363905.0, + "repeat_count": 0.0, + "routers_loss": 0.0031574300955981016, + "skip_count": 0.0, + "step": 2706, + "text_loss": 0.691118061542511 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008878089312426433, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 4366736.0, + "repeat_count": 0.0, + "routers_loss": 0.003195564029738307, + "skip_count": 0.0, + "step": 2708, + "text_loss": 0.613926112651825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6000000238418579, + "avg_layers": 25.0, + "epoch": 12.72321690636924, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.0, + "f1_skip": 0.75, + "grad_norm": 0.054443359375, + "learning_rate": 0.0008876134892686363, + "loss": 0.011, + "macro_f1": 0.5694444179534912, + "num_tokens": 4370146.0, + "repeat_count": 0.0, + "routers_loss": 0.038784291595220566, + "skip_count": 5.0, + "step": 2710, + "text_loss": 0.2723451852798462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.000887417898760409, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 4373653.0, + "repeat_count": 0.0, + "routers_loss": 0.0006457131239585578, + "skip_count": 0.0, + "step": 2712, + "text_loss": 0.31667640805244446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.742001761080129, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10498046875, + "learning_rate": 0.000887222159792912, + "loss": 0.0155, + "macro_f1": 0.6603773832321167, + "num_tokens": 4376993.0, + "repeat_count": 1.0, + "routers_loss": 0.045078590512275696, + "skip_count": 1.0, + "step": 2714, + "text_loss": 0.5872798562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0008870262724411528, + "loss": 0.012, + "macro_f1": 0.3333333432674408, + "num_tokens": 4380160.0, + "repeat_count": 0.0, + "routers_loss": 0.003628545207902789, + "skip_count": 0.0, + "step": 2716, + "text_loss": 0.7468157410621643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.760786615791018, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0008868302367801962, + "loss": 0.0118, + "macro_f1": 0.6598639488220215, + "num_tokens": 4383100.0, + "repeat_count": 1.0, + "routers_loss": 0.05404464527964592, + "skip_count": 3.0, + "step": 2718, + "text_loss": 0.2970244884490967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008866340528851629, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4386700.0, + "repeat_count": 0.0, + "routers_loss": 0.007000274024903774, + "skip_count": 0.0, + "step": 2720, + "text_loss": 0.34521186351776123 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 12.779571470501908, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052978515625, + "learning_rate": 0.0008864377208312313, + "loss": 0.0082, + "macro_f1": 0.8823530077934265, + "num_tokens": 4390299.0, + "repeat_count": 1.0, + "routers_loss": 0.02025366574525833, + "skip_count": 2.0, + "step": 2722, + "text_loss": 1.0536936521530151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.788963897857352, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.000886241240693636, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 4393353.0, + "repeat_count": 0.0, + "routers_loss": 0.00251673418097198, + "skip_count": 0.0, + "step": 2724, + "text_loss": 0.5678093433380127 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008860446125476686, + "loss": 0.0135, + "macro_f1": 0.6666666865348816, + "num_tokens": 4396446.0, + "repeat_count": 1.0, + "routers_loss": 0.009532532654702663, + "skip_count": 0.0, + "step": 2726, + "text_loss": 0.23775041103363037 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.091796875, + "learning_rate": 0.0008858478364686776, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 4399977.0, + "repeat_count": 1.0, + "routers_loss": 0.008062181062996387, + "skip_count": 0.0, + "step": 2728, + "text_loss": 0.18888695538043976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.817141179923686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008856509125320678, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 4404406.0, + "repeat_count": 0.0, + "routers_loss": 0.0007731119985692203, + "skip_count": 0.0, + "step": 2730, + "text_loss": 0.47331541776657104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008854538408133006, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 4407165.0, + "repeat_count": 0.0, + "routers_loss": 0.003115242812782526, + "skip_count": 1.0, + "step": 2732, + "text_loss": 0.491370290517807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008852566213878947, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4410101.0, + "repeat_count": 0.0, + "routers_loss": 0.0008958528051152825, + "skip_count": 0.0, + "step": 2734, + "text_loss": 0.42188262939453125 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0008850592543314246, + "loss": 0.0118, + "macro_f1": 1.0, + "num_tokens": 4413015.0, + "repeat_count": 1.0, + "routers_loss": 0.01139112375676632, + "skip_count": 1.0, + "step": 2736, + "text_loss": 0.4716498553752899 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.854710889345466, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0008848617397195218, + "loss": 0.0084, + "macro_f1": 0.6603773832321167, + "num_tokens": 4416404.0, + "repeat_count": 1.0, + "routers_loss": 0.01609630137681961, + "skip_count": 1.0, + "step": 2738, + "text_loss": 0.19490821659564972 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008846640776278745, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 4419408.0, + "repeat_count": 0.0, + "routers_loss": 0.001489170710556209, + "skip_count": 0.0, + "step": 2740, + "text_loss": 0.6443108320236206 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0008844662681322269, + "loss": 0.0144, + "macro_f1": 0.6666666865348816, + "num_tokens": 4422067.0, + "repeat_count": 1.0, + "routers_loss": 0.0014755792217329144, + "skip_count": 0.0, + "step": 2742, + "text_loss": 0.9150356650352478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0008842683113083801, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 4425647.0, + "repeat_count": 0.0, + "routers_loss": 0.008962674997746944, + "skip_count": 1.0, + "step": 2744, + "text_loss": 0.7103227972984314 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.892280598767243, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0008840702072321915, + "loss": 0.0104, + "macro_f1": 0.6598639488220215, + "num_tokens": 4428855.0, + "repeat_count": 1.0, + "routers_loss": 0.02554207295179367, + "skip_count": 3.0, + "step": 2746, + "text_loss": 0.27141591906547546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0008838719559795751, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4432838.0, + "repeat_count": 0.0, + "routers_loss": 0.0011747616808861494, + "skip_count": 0.0, + "step": 2748, + "text_loss": 0.4007738530635834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.911065453478134, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008836735576265009, + "loss": 0.0073, + "macro_f1": 0.5492662787437439, + "num_tokens": 4435793.0, + "repeat_count": 0.0, + "routers_loss": 0.017564335837960243, + "skip_count": 2.0, + "step": 2750, + "text_loss": 0.5972410440444946 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0008834750122489956, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 4438871.0, + "repeat_count": 1.0, + "routers_loss": 0.007004009559750557, + "skip_count": 0.0, + "step": 2752, + "text_loss": 0.2294853925704956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008832763199231423, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 4441846.0, + "repeat_count": 0.0, + "routers_loss": 0.0014562139986082911, + "skip_count": 0.0, + "step": 2754, + "text_loss": 0.722432017326355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.939242735544468, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0008830774807250802, + "loss": 0.013, + "macro_f1": 0.3272727429866791, + "num_tokens": 4444786.0, + "repeat_count": 1.0, + "routers_loss": 0.024773593991994858, + "skip_count": 0.0, + "step": 2756, + "text_loss": 0.507905125617981 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 12.948635162899912, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008828784947310049, + "loss": 0.0129, + "macro_f1": 0.8823530077934265, + "num_tokens": 4448442.0, + "repeat_count": 1.0, + "routers_loss": 0.04959975928068161, + "skip_count": 2.0, + "step": 2758, + "text_loss": 0.3617522418498993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.000882679362017168, + "loss": 0.0149, + "macro_f1": 1.0, + "num_tokens": 4451401.0, + "repeat_count": 1.0, + "routers_loss": 0.005783245898783207, + "skip_count": 2.0, + "step": 2760, + "text_loss": 0.49187400937080383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0008824800826598778, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 4454537.0, + "repeat_count": 0.0, + "routers_loss": 0.00656260596588254, + "skip_count": 0.0, + "step": 2762, + "text_loss": 0.6823583245277405 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0008822806567354983, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 4457706.0, + "repeat_count": 1.0, + "routers_loss": 0.005298966076225042, + "skip_count": 0.0, + "step": 2764, + "text_loss": 0.554322361946106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.986204872321691, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008820810843204501, + "loss": 0.0096, + "macro_f1": 0.3272727429866791, + "num_tokens": 4460710.0, + "repeat_count": 0.0, + "routers_loss": 0.03164982795715332, + "skip_count": 1.0, + "step": 2766, + "text_loss": 0.1656961441040039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0008818813654912095, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 4464001.0, + "repeat_count": 0.0, + "routers_loss": 0.000715116853825748, + "skip_count": 0.0, + "step": 2768, + "text_loss": 0.5818144083023071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008816815003243093, + "loss": 0.0133, + "macro_f1": 0.3333333432674408, + "num_tokens": 4467364.0, + "repeat_count": 0.0, + "routers_loss": 0.002851625671610236, + "skip_count": 0.0, + "step": 2770, + "text_loss": 0.6068631410598755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008814814888963383, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 4470681.0, + "repeat_count": 0.0, + "routers_loss": 0.004729873035103083, + "skip_count": 1.0, + "step": 2772, + "text_loss": 0.5386646389961243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.000881281331283941, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 4473734.0, + "repeat_count": 0.0, + "routers_loss": 0.0031853127293288708, + "skip_count": 1.0, + "step": 2774, + "text_loss": 0.5695263147354126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008810810275638182, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4478404.0, + "repeat_count": 0.0, + "routers_loss": 0.0008977465913631022, + "skip_count": 0.0, + "step": 2776, + "text_loss": 0.4750773310661316 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008808805778127269, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4481287.0, + "repeat_count": 0.0, + "routers_loss": 0.00469845999032259, + "skip_count": 0.0, + "step": 2778, + "text_loss": 0.14078612625598907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 13.051658350454945, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008806799821074796, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 4483929.0, + "repeat_count": 0.0, + "routers_loss": 0.01789761893451214, + "skip_count": 2.0, + "step": 2780, + "text_loss": 0.2167191207408905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008804792405249451, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 4487468.0, + "repeat_count": 0.0, + "routers_loss": 0.001018838956952095, + "skip_count": 0.0, + "step": 2782, + "text_loss": 0.5424665212631226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 13.070443205165835, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.07373046875, + "learning_rate": 0.000880278353142048, + "loss": 0.0077, + "macro_f1": 0.8200000524520874, + "num_tokens": 4490942.0, + "repeat_count": 1.0, + "routers_loss": 0.03260354697704315, + "skip_count": 3.0, + "step": 2784, + "text_loss": 0.20994654297828674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008800773200357683, + "loss": 0.0122, + "macro_f1": 0.3333333432674408, + "num_tokens": 4493986.0, + "repeat_count": 0.0, + "routers_loss": 0.003019835101440549, + "skip_count": 0.0, + "step": 2786, + "text_loss": 0.5709528923034668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008798761412831429, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 4498232.0, + "repeat_count": 0.0, + "routers_loss": 0.00285192858427763, + "skip_count": 0.0, + "step": 2788, + "text_loss": 0.5103896260261536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0008796748169612634, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4501231.0, + "repeat_count": 0.0, + "routers_loss": 0.0012469831854104996, + "skip_count": 0.0, + "step": 2790, + "text_loss": 0.43669697642326355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0008794733471472778, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4504208.0, + "repeat_count": 0.0, + "routers_loss": 0.011512776836752892, + "skip_count": 1.0, + "step": 2792, + "text_loss": 0.2299770563840866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008792717319183899, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 4507013.0, + "repeat_count": 0.0, + "routers_loss": 0.00834917277097702, + "skip_count": 0.0, + "step": 2794, + "text_loss": 0.2130603939294815 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008790699713518587, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 4510286.0, + "repeat_count": 0.0, + "routers_loss": 0.008616939187049866, + "skip_count": 2.0, + "step": 2796, + "text_loss": 0.4377101957798004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0008788680655249994, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4513762.0, + "repeat_count": 0.0, + "routers_loss": 0.003408568911254406, + "skip_count": 0.0, + "step": 2798, + "text_loss": 0.435138463973999 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008786660145151826, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4516696.0, + "repeat_count": 1.0, + "routers_loss": 0.0029398901388049126, + "skip_count": 0.0, + "step": 2800, + "text_loss": 0.3195655047893524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008784638183998348, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4519760.0, + "repeat_count": 0.0, + "routers_loss": 0.0013777425047010183, + "skip_count": 0.0, + "step": 2802, + "text_loss": 0.8129430413246155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0008782614772564379, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4522106.0, + "repeat_count": 0.0, + "routers_loss": 0.0031694830395281315, + "skip_count": 0.0, + "step": 2804, + "text_loss": 0.18083660304546356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0008780589911625293, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4525743.0, + "repeat_count": 0.0, + "routers_loss": 0.002161208540201187, + "skip_count": 0.0, + "step": 2806, + "text_loss": 0.8228182792663574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0008778563601957021, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 4529573.0, + "repeat_count": 0.0, + "routers_loss": 0.0028444856870919466, + "skip_count": 1.0, + "step": 2808, + "text_loss": 0.3715563118457794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008776535844336049, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4532452.0, + "repeat_count": 0.0, + "routers_loss": 0.003807213855907321, + "skip_count": 0.0, + "step": 2810, + "text_loss": 0.6012523174285889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0008774506639539417, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 4536077.0, + "repeat_count": 0.0, + "routers_loss": 0.006698979996144772, + "skip_count": 0.0, + "step": 2812, + "text_loss": 0.27097949385643005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0008772475988344722, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 4539057.0, + "repeat_count": 0.0, + "routers_loss": 0.004849409218877554, + "skip_count": 1.0, + "step": 2814, + "text_loss": 1.026973843574524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 13.22072204285295, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008770443891530109, + "loss": 0.0115, + "macro_f1": 0.5934640765190125, + "num_tokens": 4542253.0, + "repeat_count": 0.0, + "routers_loss": 0.019148651510477066, + "skip_count": 3.0, + "step": 2816, + "text_loss": 0.2717585563659668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.230114470208395, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008768410349874286, + "loss": 0.0098, + "macro_f1": 0.6601307392120361, + "num_tokens": 4545047.0, + "repeat_count": 1.0, + "routers_loss": 0.02231316640973091, + "skip_count": 2.0, + "step": 2818, + "text_loss": 0.274346262216568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008766375364156508, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 4548371.0, + "repeat_count": 0.0, + "routers_loss": 0.008014129474759102, + "skip_count": 2.0, + "step": 2820, + "text_loss": 0.22850871086120605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008764338935156586, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4551276.0, + "repeat_count": 0.0, + "routers_loss": 0.0014544493751600385, + "skip_count": 0.0, + "step": 2822, + "text_loss": 0.6308462023735046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000876230106365488, + "loss": 0.0123, + "macro_f1": 0.6666666865348816, + "num_tokens": 4554143.0, + "repeat_count": 0.0, + "routers_loss": 0.00818584579974413, + "skip_count": 3.0, + "step": 2824, + "text_loss": 0.3484207093715668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0008760261750432312, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 4557256.0, + "repeat_count": 0.0, + "routers_loss": 0.006275608204305172, + "skip_count": 3.0, + "step": 2826, + "text_loss": 0.1927330046892166 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008758220996270348, + "loss": 0.0103, + "macro_f1": 1.0, + "num_tokens": 4560202.0, + "repeat_count": 2.0, + "routers_loss": 0.0055974251590669155, + "skip_count": 2.0, + "step": 2828, + "text_loss": 0.7796496748924255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008756178801951007, + "loss": 0.0129, + "macro_f1": 0.3333333432674408, + "num_tokens": 4563508.0, + "repeat_count": 0.0, + "routers_loss": 0.0019799957517534494, + "skip_count": 0.0, + "step": 2830, + "text_loss": 0.49633297324180603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008754135168256865, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4566776.0, + "repeat_count": 0.0, + "routers_loss": 0.004538947716355324, + "skip_count": 0.0, + "step": 2832, + "text_loss": 0.5346745252609253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008752090095971044, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 4569787.0, + "repeat_count": 0.0, + "routers_loss": 0.001663343166001141, + "skip_count": 0.0, + "step": 2834, + "text_loss": 0.5524004697799683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.000875004358587722, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 4572813.0, + "repeat_count": 0.0, + "routers_loss": 0.0022988212294876575, + "skip_count": 0.0, + "step": 2836, + "text_loss": 0.4232870042324066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.000874799563875962, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 4575563.0, + "repeat_count": 0.0, + "routers_loss": 0.007781553082168102, + "skip_count": 1.0, + "step": 2838, + "text_loss": 0.19239822030067444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 13.333431171118287, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03515625, + "learning_rate": 0.0008745946255403021, + "loss": 0.0072, + "macro_f1": 0.5492662787437439, + "num_tokens": 4578117.0, + "repeat_count": 0.0, + "routers_loss": 0.01872488670051098, + "skip_count": 2.0, + "step": 2840, + "text_loss": 0.2148810178041458 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0008743895436592749, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 4582330.0, + "repeat_count": 1.0, + "routers_loss": 0.005634195636957884, + "skip_count": 1.0, + "step": 2842, + "text_loss": 0.4929640591144562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0008741843183114685, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4585765.0, + "repeat_count": 0.0, + "routers_loss": 0.0008928569150157273, + "skip_count": 0.0, + "step": 2844, + "text_loss": 0.32702967524528503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008739789495755253, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4589000.0, + "repeat_count": 0.0, + "routers_loss": 0.014715569093823433, + "skip_count": 4.0, + "step": 2846, + "text_loss": 0.25125816464424133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008737734375301433, + "loss": 0.0135, + "macro_f1": 0.3333333432674408, + "num_tokens": 4592391.0, + "repeat_count": 0.0, + "routers_loss": 0.0017551190685480833, + "skip_count": 0.0, + "step": 2848, + "text_loss": 0.6595172882080078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0008735677822540749, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 4596662.0, + "repeat_count": 0.0, + "routers_loss": 0.0006456313421949744, + "skip_count": 0.0, + "step": 2850, + "text_loss": 0.6290773153305054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0008733619838261276, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 4599682.0, + "repeat_count": 0.0, + "routers_loss": 0.00765060493722558, + "skip_count": 2.0, + "step": 2852, + "text_loss": 0.3268161416053772 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008731560423251637, + "loss": 0.01, + "macro_f1": 1.0, + "num_tokens": 4603324.0, + "repeat_count": 1.0, + "routers_loss": 0.01161442045122385, + "skip_count": 2.0, + "step": 2854, + "text_loss": 0.3029932975769043 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 13.408570589961844, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008729499578301005, + "loss": 0.0098, + "macro_f1": 0.9555556178092957, + "num_tokens": 4606975.0, + "repeat_count": 1.0, + "routers_loss": 0.02055389992892742, + "skip_count": 5.0, + "step": 2856, + "text_loss": 0.6268532872200012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.00087274373041991, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4609629.0, + "repeat_count": 0.0, + "routers_loss": 0.0013911726418882608, + "skip_count": 0.0, + "step": 2858, + "text_loss": 0.534355640411377 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008725373601736188, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 4612913.0, + "repeat_count": 2.0, + "routers_loss": 0.01010701060295105, + "skip_count": 0.0, + "step": 2860, + "text_loss": 0.3391380310058594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0008723308471703085, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4616718.0, + "repeat_count": 0.0, + "routers_loss": 0.005969462916254997, + "skip_count": 1.0, + "step": 2862, + "text_loss": 0.47250816226005554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.446140299383622, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008721241914891152, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4619680.0, + "repeat_count": 0.0, + "routers_loss": 0.0027780034579336643, + "skip_count": 0.0, + "step": 2864, + "text_loss": 0.3249278664588928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008719173932092295, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 4622700.0, + "repeat_count": 0.0, + "routers_loss": 0.0015912104863673449, + "skip_count": 0.0, + "step": 2866, + "text_loss": 0.7789985537528992 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05126953125, + "learning_rate": 0.0008717104524098973, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4626637.0, + "repeat_count": 0.0, + "routers_loss": 0.0036539011634886265, + "skip_count": 0.0, + "step": 2868, + "text_loss": 0.619088351726532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0008715033691704187, + "loss": 0.0118, + "macro_f1": 0.6666666865348816, + "num_tokens": 4629863.0, + "repeat_count": 0.0, + "routers_loss": 0.008402476087212563, + "skip_count": 1.0, + "step": 2870, + "text_loss": 0.5550018548965454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008712961435701479, + "loss": 0.0161, + "macro_f1": 0.6666666865348816, + "num_tokens": 4632657.0, + "repeat_count": 0.0, + "routers_loss": 0.01400839351117611, + "skip_count": 1.0, + "step": 2872, + "text_loss": 0.17368625104427338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008710887756884947, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 4635885.0, + "repeat_count": 0.0, + "routers_loss": 0.0014573842054232955, + "skip_count": 0.0, + "step": 2874, + "text_loss": 0.5138643383979797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008708812656049225, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 4639341.0, + "repeat_count": 0.0, + "routers_loss": 0.002810224425047636, + "skip_count": 1.0, + "step": 2876, + "text_loss": 0.70310378074646 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 13.511887290871735, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008706736133989497, + "loss": 0.0105, + "macro_f1": 0.9449735879898071, + "num_tokens": 4642163.0, + "repeat_count": 2.0, + "routers_loss": 0.029783209785819054, + "skip_count": 4.0, + "step": 2878, + "text_loss": 0.26898008584976196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008704658191501491, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4645858.0, + "repeat_count": 0.0, + "routers_loss": 0.0009193966398015618, + "skip_count": 0.0, + "step": 2880, + "text_loss": 0.6047570705413818 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.530672145582624, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008702578829381475, + "loss": 0.0131, + "macro_f1": 0.8814815282821655, + "num_tokens": 4649237.0, + "repeat_count": 2.0, + "routers_loss": 0.05698608607053757, + "skip_count": 4.0, + "step": 2882, + "text_loss": 0.10695219784975052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0008700498048426269, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4652362.0, + "repeat_count": 0.0, + "routers_loss": 0.0011786938412114978, + "skip_count": 0.0, + "step": 2884, + "text_loss": 0.4442957937717438 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.549457000293513, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008698415849433229, + "loss": 0.0092, + "macro_f1": 0.5492662787437439, + "num_tokens": 4655616.0, + "repeat_count": 2.0, + "routers_loss": 0.02142646163702011, + "skip_count": 0.0, + "step": 2886, + "text_loss": 0.5820964574813843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008696332233200262, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 4659294.0, + "repeat_count": 0.0, + "routers_loss": 0.004038636106997728, + "skip_count": 0.0, + "step": 2888, + "text_loss": 0.11847645789384842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008694247200525806, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4662512.0, + "repeat_count": 0.0, + "routers_loss": 0.0013256469974294305, + "skip_count": 0.0, + "step": 2890, + "text_loss": 0.4873582720756531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.577634282359847, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008692160752208856, + "loss": 0.0129, + "macro_f1": 0.3272727429866791, + "num_tokens": 4666190.0, + "repeat_count": 0.0, + "routers_loss": 0.04477972164750099, + "skip_count": 1.0, + "step": 2892, + "text_loss": 0.44243401288986206 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.0008690072889048941, + "loss": 0.0127, + "macro_f1": 1.0, + "num_tokens": 4668884.0, + "repeat_count": 1.0, + "routers_loss": 0.004407547414302826, + "skip_count": 2.0, + "step": 2894, + "text_loss": 0.6847127079963684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008687983611846133, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4672093.0, + "repeat_count": 0.0, + "routers_loss": 0.005245382897555828, + "skip_count": 1.0, + "step": 2896, + "text_loss": 0.25583332777023315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008685892921401049, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4674917.0, + "repeat_count": 0.0, + "routers_loss": 0.0010470855049788952, + "skip_count": 0.0, + "step": 2898, + "text_loss": 0.41998377442359924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008683800818514844, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4677739.0, + "repeat_count": 0.0, + "routers_loss": 0.009026622399687767, + "skip_count": 2.0, + "step": 2900, + "text_loss": 0.303053081035614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.0008681707303989215, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4680721.0, + "repeat_count": 0.0, + "routers_loss": 0.004500916693359613, + "skip_count": 0.0, + "step": 2902, + "text_loss": 0.5573288798332214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0008679612378626404, + "loss": 0.0098, + "macro_f1": 0.6666666865348816, + "num_tokens": 4683339.0, + "repeat_count": 0.0, + "routers_loss": 0.005047840531915426, + "skip_count": 1.0, + "step": 2904, + "text_loss": 0.321353554725647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.643381273847961, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008677516043229187, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 4686453.0, + "repeat_count": 0.0, + "routers_loss": 0.010256914421916008, + "skip_count": 1.0, + "step": 2906, + "text_loss": 0.4300784468650818 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008675418298600883, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 4689645.0, + "repeat_count": 1.0, + "routers_loss": 0.0022669637110084295, + "skip_count": 0.0, + "step": 2908, + "text_loss": 0.5064885020256042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008673319145545358, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4692320.0, + "repeat_count": 0.0, + "routers_loss": 0.0011188550852239132, + "skip_count": 0.0, + "step": 2910, + "text_loss": 0.7114819884300232 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008671218584867003, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 4695116.0, + "repeat_count": 0.0, + "routers_loss": 0.002966561820358038, + "skip_count": 2.0, + "step": 2912, + "text_loss": 0.5662392973899841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0008669116617370762, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4698040.0, + "repeat_count": 0.0, + "routers_loss": 0.0012894890969619155, + "skip_count": 0.0, + "step": 2914, + "text_loss": 0.718977689743042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0008667013243862111, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 4700963.0, + "repeat_count": 0.0, + "routers_loss": 0.0007232456118799746, + "skip_count": 0.0, + "step": 2916, + "text_loss": 0.3447718024253845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.699735837980628, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.000866490846514707, + "loss": 0.0075, + "macro_f1": 0.3272727429866791, + "num_tokens": 4704471.0, + "repeat_count": 1.0, + "routers_loss": 0.015166680328547955, + "skip_count": 0.0, + "step": 2918, + "text_loss": 0.454946368932724 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.000866280228203219, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 4707238.0, + "repeat_count": 1.0, + "routers_loss": 0.0061312485486269, + "skip_count": 1.0, + "step": 2920, + "text_loss": 0.721788227558136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008660694695324564, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 4711323.0, + "repeat_count": 0.0, + "routers_loss": 0.00169933564029634, + "skip_count": 0.0, + "step": 2922, + "text_loss": 0.7562121748924255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008658585705831829, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 4714417.0, + "repeat_count": 0.0, + "routers_loss": 0.0022731393110007048, + "skip_count": 0.0, + "step": 2924, + "text_loss": 0.5726147890090942 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.737305547402407, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0008656475314362148, + "loss": 0.0131, + "macro_f1": 0.8817967176437378, + "num_tokens": 4717445.0, + "repeat_count": 2.0, + "routers_loss": 0.06477782875299454, + "skip_count": 3.0, + "step": 2926, + "text_loss": 0.4505867660045624 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 13.74669797475785, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.06396484375, + "learning_rate": 0.0008654363521724229, + "loss": 0.0129, + "macro_f1": 0.9449735879898071, + "num_tokens": 4722253.0, + "repeat_count": 2.0, + "routers_loss": 0.027405790984630585, + "skip_count": 4.0, + "step": 2928, + "text_loss": 0.24767601490020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0008652250328727315, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 4725465.0, + "repeat_count": 0.0, + "routers_loss": 0.006544729229062796, + "skip_count": 2.0, + "step": 2930, + "text_loss": 0.4478724002838135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008650135736181184, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 4729213.0, + "repeat_count": 1.0, + "routers_loss": 0.0055119614116847515, + "skip_count": 0.0, + "step": 2932, + "text_loss": 0.6749323010444641 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008648019744896154, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 4732280.0, + "repeat_count": 0.0, + "routers_loss": 0.008374541997909546, + "skip_count": 0.0, + "step": 2934, + "text_loss": 0.4647359251976013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.78426768417963, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0008645902355683077, + "loss": 0.0091, + "macro_f1": 0.6595745086669922, + "num_tokens": 4736244.0, + "repeat_count": 1.0, + "routers_loss": 0.068686343729496, + "skip_count": 4.0, + "step": 2936, + "text_loss": 0.5356017351150513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008643783569353339, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4739810.0, + "repeat_count": 2.0, + "routers_loss": 0.017954571172595024, + "skip_count": 0.0, + "step": 2938, + "text_loss": 0.3145926296710968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.803052538890519, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.0008641663386718863, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4742720.0, + "repeat_count": 0.0, + "routers_loss": 0.006261351052671671, + "skip_count": 1.0, + "step": 2940, + "text_loss": 0.3200613856315613 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008639541808592109, + "loss": 0.0093, + "macro_f1": 1.0, + "num_tokens": 4745870.0, + "repeat_count": 1.0, + "routers_loss": 0.0025341357104480267, + "skip_count": 1.0, + "step": 2942, + "text_loss": 0.5020416378974915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0008637418835786067, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4748943.0, + "repeat_count": 0.0, + "routers_loss": 0.008970048278570175, + "skip_count": 2.0, + "step": 2944, + "text_loss": 0.14517110586166382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008635294469114265, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 4751360.0, + "repeat_count": 0.0, + "routers_loss": 0.002133632078766823, + "skip_count": 0.0, + "step": 2946, + "text_loss": 0.5367856025695801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0008633168709390766, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4754403.0, + "repeat_count": 0.0, + "routers_loss": 0.0011866620043292642, + "skip_count": 0.0, + "step": 2948, + "text_loss": 0.38302522897720337 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0008631041557430163, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 4757867.0, + "repeat_count": 2.0, + "routers_loss": 0.0026854004245251417, + "skip_count": 0.0, + "step": 2950, + "text_loss": 0.43433454632759094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0008628913014047585, + "loss": 0.0102, + "macro_f1": 0.3333333432674408, + "num_tokens": 4761171.0, + "repeat_count": 0.0, + "routers_loss": 0.002433479530736804, + "skip_count": 0.0, + "step": 2952, + "text_loss": 0.4725971519947052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008626783080058696, + "loss": 0.0066, + "macro_f1": 0.3272727429866791, + "num_tokens": 4764752.0, + "repeat_count": 1.0, + "routers_loss": 0.017182493582367897, + "skip_count": 0.0, + "step": 2954, + "text_loss": 0.460641473531723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0008624651756279687, + "loss": 0.0198, + "macro_f1": 0.3333333432674408, + "num_tokens": 4767453.0, + "repeat_count": 0.0, + "routers_loss": 0.0018134774873033166, + "skip_count": 0.0, + "step": 2956, + "text_loss": 0.4091459810733795 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.887584385089522, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.000862251904352729, + "loss": 0.0108, + "macro_f1": 0.9259259104728699, + "num_tokens": 4771110.0, + "repeat_count": 3.0, + "routers_loss": 0.0365753099322319, + "skip_count": 3.0, + "step": 2958, + "text_loss": 0.22408585250377655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.896976812444967, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.000862038494261876, + "loss": 0.0109, + "macro_f1": 0.3272727429866791, + "num_tokens": 4774464.0, + "repeat_count": 0.0, + "routers_loss": 0.024343067780137062, + "skip_count": 1.0, + "step": 2960, + "text_loss": 0.16483014822006226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008618249454371891, + "loss": 0.01, + "macro_f1": 0.3333333432674408, + "num_tokens": 4777894.0, + "repeat_count": 0.0, + "routers_loss": 0.0008310087723657489, + "skip_count": 0.0, + "step": 2962, + "text_loss": 0.5573428869247437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008616112579605006, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4781116.0, + "repeat_count": 0.0, + "routers_loss": 0.0065494864247739315, + "skip_count": 0.0, + "step": 2964, + "text_loss": 0.18816794455051422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0008613974319136957, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 4784886.0, + "repeat_count": 0.0, + "routers_loss": 0.0019726944155991077, + "skip_count": 0.0, + "step": 2966, + "text_loss": 0.5097305774688721 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0008611834673787134, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 4787563.0, + "repeat_count": 0.0, + "routers_loss": 0.006327496841549873, + "skip_count": 0.0, + "step": 2968, + "text_loss": 0.6953814029693604 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.94393894922219, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.5, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0008609693644375449, + "loss": 0.0086, + "macro_f1": 0.8200000524520874, + "num_tokens": 4790421.0, + "repeat_count": 3.0, + "routers_loss": 0.042896661907434464, + "skip_count": 1.0, + "step": 2970, + "text_loss": 0.2573051154613495 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 13.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.14453125, + "learning_rate": 0.000860755123172235, + "loss": 0.0096, + "macro_f1": 1.0, + "num_tokens": 4793786.0, + "repeat_count": 2.0, + "routers_loss": 0.013228793628513813, + "skip_count": 1.0, + "step": 2972, + "text_loss": 0.46614497900009155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008605407436648815, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4796864.0, + "repeat_count": 0.0, + "routers_loss": 0.007294759154319763, + "skip_count": 2.0, + "step": 2974, + "text_loss": 0.21555091440677643 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008603262259976348, + "loss": 0.0129, + "macro_f1": 1.0, + "num_tokens": 4800080.0, + "repeat_count": 1.0, + "routers_loss": 0.0024024227168411016, + "skip_count": 5.0, + "step": 2976, + "text_loss": 0.7855485081672668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0008601115702526987, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4802899.0, + "repeat_count": 0.0, + "routers_loss": 0.001433031284250319, + "skip_count": 0.0, + "step": 2978, + "text_loss": 0.6777765154838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008598967765123293, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 4805835.0, + "repeat_count": 0.0, + "routers_loss": 0.003073975909501314, + "skip_count": 0.0, + "step": 2980, + "text_loss": 0.5926910638809204 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 14.0, + "f1_execute": 0.9333333373069763, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008596818448588364, + "loss": 0.0139, + "macro_f1": 0.8666667342185974, + "num_tokens": 4809028.0, + "repeat_count": 1.0, + "routers_loss": 0.06438573449850082, + "skip_count": 6.0, + "step": 2982, + "text_loss": 0.23975612223148346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.009392427355445, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0008594667753745821, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 4812831.0, + "repeat_count": 0.0, + "routers_loss": 0.014817612245678902, + "skip_count": 1.0, + "step": 2984, + "text_loss": 0.17292268574237823 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.018784854710889, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0008592515681419813, + "loss": 0.0078, + "macro_f1": 0.5492662787437439, + "num_tokens": 4816005.0, + "repeat_count": 2.0, + "routers_loss": 0.025407327339053154, + "skip_count": 0.0, + "step": 2986, + "text_loss": 0.6403061151504517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0008590362232435018, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4818901.0, + "repeat_count": 0.0, + "routers_loss": 0.006826757453382015, + "skip_count": 0.0, + "step": 2988, + "text_loss": 0.2572069466114044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008588207407616644, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 4823120.0, + "repeat_count": 0.0, + "routers_loss": 0.0009054148104041815, + "skip_count": 0.0, + "step": 2990, + "text_loss": 0.4827076196670532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0008586051207790422, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 4825774.0, + "repeat_count": 0.0, + "routers_loss": 0.0012294676853343844, + "skip_count": 0.0, + "step": 2992, + "text_loss": 0.40157821774482727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 14.056354564132668, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052734375, + "learning_rate": 0.0008583893633782612, + "loss": 0.0084, + "macro_f1": 0.5492662787437439, + "num_tokens": 4828841.0, + "repeat_count": 0.0, + "routers_loss": 0.011474622413516045, + "skip_count": 2.0, + "step": 2994, + "text_loss": 0.14842072129249573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.058837890625, + "learning_rate": 0.0008581734686419999, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4831458.0, + "repeat_count": 0.0, + "routers_loss": 0.009154081344604492, + "skip_count": 2.0, + "step": 2996, + "text_loss": 0.365400105714798 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00085795743665299, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4834609.0, + "repeat_count": 0.0, + "routers_loss": 0.002899336162954569, + "skip_count": 0.0, + "step": 2998, + "text_loss": 0.5574684143066406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008577412674940152, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4838324.0, + "repeat_count": 0.0, + "routers_loss": 0.0034664268605411053, + "skip_count": 0.0, + "step": 3000, + "text_loss": 0.6752855777740479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008575249612479117, + "loss": 0.0127, + "macro_f1": 0.6666666865348816, + "num_tokens": 4841877.0, + "repeat_count": 0.0, + "routers_loss": 0.0036425739526748657, + "skip_count": 2.0, + "step": 3002, + "text_loss": 0.6332980394363403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.0008573085179975685, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4845840.0, + "repeat_count": 0.0, + "routers_loss": 0.0013783496106043458, + "skip_count": 0.0, + "step": 3004, + "text_loss": 0.4219617545604706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008570919378259274, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4848766.0, + "repeat_count": 0.0, + "routers_loss": 0.004823608323931694, + "skip_count": 1.0, + "step": 3006, + "text_loss": 0.7987180948257446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000856875220815982, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4852310.0, + "repeat_count": 0.0, + "routers_loss": 0.0014760984340682626, + "skip_count": 0.0, + "step": 3008, + "text_loss": 0.35592713952064514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008566583670507788, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4856146.0, + "repeat_count": 0.0, + "routers_loss": 0.0031717263627797365, + "skip_count": 1.0, + "step": 3010, + "text_loss": 0.19379083812236786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008564413766134164, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 4859386.0, + "repeat_count": 0.0, + "routers_loss": 0.003361492184922099, + "skip_count": 0.0, + "step": 3012, + "text_loss": 0.39129266142845154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0008562242495870463, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4862661.0, + "repeat_count": 0.0, + "routers_loss": 0.0010563990799710155, + "skip_count": 0.0, + "step": 3014, + "text_loss": 0.5966938734054565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0008560069860548716, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 4865410.0, + "repeat_count": 0.0, + "routers_loss": 0.001233913702890277, + "skip_count": 0.0, + "step": 3016, + "text_loss": 0.3386077880859375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008557895861001484, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 4868931.0, + "repeat_count": 0.0, + "routers_loss": 0.0018066301709041, + "skip_count": 0.0, + "step": 3018, + "text_loss": 0.5222050547599792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008555720498061845, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4873492.0, + "repeat_count": 0.0, + "routers_loss": 0.0050385501235723495, + "skip_count": 1.0, + "step": 3020, + "text_loss": 0.4558849334716797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.187848547108894, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008553543772563403, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 4877026.0, + "repeat_count": 0.0, + "routers_loss": 0.004828717093914747, + "skip_count": 0.0, + "step": 3022, + "text_loss": 0.36598992347717285 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 14.197240974464338, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.06103515625, + "learning_rate": 0.0008551365685340285, + "loss": 0.0084, + "macro_f1": 0.9555556178092957, + "num_tokens": 4879655.0, + "repeat_count": 1.0, + "routers_loss": 0.02049369551241398, + "skip_count": 5.0, + "step": 3024, + "text_loss": 0.5069093704223633 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 14.206633401819783, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008549186237227138, + "loss": 0.0088, + "macro_f1": 0.8823530077934265, + "num_tokens": 4882606.0, + "repeat_count": 1.0, + "routers_loss": 0.03947242721915245, + "skip_count": 2.0, + "step": 3026, + "text_loss": 0.2600715458393097 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 14.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0008547005429059128, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 4885246.0, + "repeat_count": 2.0, + "routers_loss": 0.0026363315992057323, + "skip_count": 0.0, + "step": 3028, + "text_loss": 0.37642326951026917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008544823261671948, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 4888109.0, + "repeat_count": 0.0, + "routers_loss": 0.003858231008052826, + "skip_count": 0.0, + "step": 3030, + "text_loss": 0.5875385999679565 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 14.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0008542639735901804, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 4891168.0, + "repeat_count": 1.0, + "routers_loss": 0.004789089784026146, + "skip_count": 1.0, + "step": 3032, + "text_loss": 0.6417325139045715 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.244203111241562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008540454852585434, + "loss": 0.0115, + "macro_f1": 0.6666666865348816, + "num_tokens": 4894355.0, + "repeat_count": 0.0, + "routers_loss": 0.007334680762141943, + "skip_count": 2.0, + "step": 3034, + "text_loss": 0.23697198927402496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 14.253595538597006, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008538268612560084, + "loss": 0.0058, + "macro_f1": 0.4871794879436493, + "num_tokens": 4897543.0, + "repeat_count": 0.0, + "routers_loss": 0.022096361964941025, + "skip_count": 3.0, + "step": 3036, + "text_loss": 0.1989550143480301 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0008536081016663527, + "loss": 0.0101, + "macro_f1": 1.0, + "num_tokens": 4900752.0, + "repeat_count": 1.0, + "routers_loss": 0.0037680594250559807, + "skip_count": 2.0, + "step": 3038, + "text_loss": 0.5001366138458252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008533892065734055, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4903581.0, + "repeat_count": 0.0, + "routers_loss": 0.0032373068388551474, + "skip_count": 1.0, + "step": 3040, + "text_loss": 0.5019411444664001 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008531701760610476, + "loss": 0.0121, + "macro_f1": 1.0, + "num_tokens": 4907108.0, + "repeat_count": 1.0, + "routers_loss": 0.0078013185411691666, + "skip_count": 2.0, + "step": 3042, + "text_loss": 0.3460627794265747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 14.291165248018785, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.04833984375, + "learning_rate": 0.000852951010213212, + "loss": 0.0089, + "macro_f1": 0.8200000524520874, + "num_tokens": 4911269.0, + "repeat_count": 1.0, + "routers_loss": 0.03576689213514328, + "skip_count": 3.0, + "step": 3044, + "text_loss": 0.268994003534317 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 14.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0008527317091138835, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 4914203.0, + "repeat_count": 1.0, + "routers_loss": 0.0032140621915459633, + "skip_count": 1.0, + "step": 3046, + "text_loss": 0.9998719692230225 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008525122728470987, + "loss": 0.0102, + "macro_f1": 1.0, + "num_tokens": 4918562.0, + "repeat_count": 1.0, + "routers_loss": 0.008559177629649639, + "skip_count": 3.0, + "step": 3048, + "text_loss": 0.3062439560890198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0008522927014969459, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 4921940.0, + "repeat_count": 0.0, + "routers_loss": 0.008735597133636475, + "skip_count": 2.0, + "step": 3050, + "text_loss": 0.3637430965900421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0008520729951475652, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 4925416.0, + "repeat_count": 0.0, + "routers_loss": 0.0012709591537714005, + "skip_count": 0.0, + "step": 3052, + "text_loss": 0.542036235332489 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008518531538831488, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4928695.0, + "repeat_count": 0.0, + "routers_loss": 0.0010660928674042225, + "skip_count": 1.0, + "step": 3054, + "text_loss": 0.43144503235816956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.059326171875, + "learning_rate": 0.00085163317778794, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4931504.0, + "repeat_count": 0.0, + "routers_loss": 0.004558971151709557, + "skip_count": 2.0, + "step": 3056, + "text_loss": 0.5257010459899902 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008514130669462341, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4934935.0, + "repeat_count": 0.0, + "routers_loss": 0.010774781927466393, + "skip_count": 2.0, + "step": 3058, + "text_loss": 0.26061776280403137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.366304666862343, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008511928214423782, + "loss": 0.0103, + "macro_f1": 0.6601307392120361, + "num_tokens": 4938047.0, + "repeat_count": 1.0, + "routers_loss": 0.014763157814741135, + "skip_count": 2.0, + "step": 3060, + "text_loss": 0.2856905460357666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0008509724413607705, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 4941041.0, + "repeat_count": 1.0, + "routers_loss": 0.004613345488905907, + "skip_count": 0.0, + "step": 3062, + "text_loss": 0.2870287001132965 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008507519267858612, + "loss": 0.015, + "macro_f1": 1.0, + "num_tokens": 4944708.0, + "repeat_count": 1.0, + "routers_loss": 0.008584189228713512, + "skip_count": 2.0, + "step": 3064, + "text_loss": 0.15828095376491547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0008505312778021519, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 4948295.0, + "repeat_count": 0.0, + "routers_loss": 0.0014670816017314792, + "skip_count": 0.0, + "step": 3066, + "text_loss": 0.36697930097579956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0008503104944941958, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 4951983.0, + "repeat_count": 0.0, + "routers_loss": 0.005348859820514917, + "skip_count": 2.0, + "step": 3068, + "text_loss": 0.21612997353076935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008500895769465972, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 4955023.0, + "repeat_count": 0.0, + "routers_loss": 0.0013203793205320835, + "skip_count": 0.0, + "step": 3070, + "text_loss": 0.9757798314094543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.422659230995011, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008498685252440124, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 4957600.0, + "repeat_count": 0.0, + "routers_loss": 0.006907356437295675, + "skip_count": 0.0, + "step": 3072, + "text_loss": 0.356107234954834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.432051658350455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0008496473394711487, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 4960746.0, + "repeat_count": 0.0, + "routers_loss": 0.0027704904787242413, + "skip_count": 1.0, + "step": 3074, + "text_loss": 0.6812908053398132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0008494260197127649, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 4963845.0, + "repeat_count": 0.0, + "routers_loss": 0.0036796489730477333, + "skip_count": 2.0, + "step": 3076, + "text_loss": 0.7215370535850525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0008492045660536712, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 4966887.0, + "repeat_count": 0.0, + "routers_loss": 0.0037137691397219896, + "skip_count": 1.0, + "step": 3078, + "text_loss": 0.8700299859046936 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 14.460228940416789, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008489829785787291, + "loss": 0.0078, + "macro_f1": 0.8823530077934265, + "num_tokens": 4969859.0, + "repeat_count": 1.0, + "routers_loss": 0.016492314636707306, + "skip_count": 2.0, + "step": 3080, + "text_loss": 0.6520360112190247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008487612573728513, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4972628.0, + "repeat_count": 0.0, + "routers_loss": 0.004022917244583368, + "skip_count": 2.0, + "step": 3082, + "text_loss": 0.17498187720775604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008485394025210016, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 4975475.0, + "repeat_count": 0.0, + "routers_loss": 0.009141159243881702, + "skip_count": 1.0, + "step": 3084, + "text_loss": 0.5975366234779358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008483174141081956, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4978858.0, + "repeat_count": 0.0, + "routers_loss": 0.0031561285723000765, + "skip_count": 0.0, + "step": 3086, + "text_loss": 0.18748866021633148 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008480952922194991, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4982142.0, + "repeat_count": 0.0, + "routers_loss": 0.0007894713780842721, + "skip_count": 0.0, + "step": 3088, + "text_loss": 0.42083197832107544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008478730369400302, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4984872.0, + "repeat_count": 0.0, + "routers_loss": 0.0005908289458602667, + "skip_count": 0.0, + "step": 3090, + "text_loss": 0.45337188243865967 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0008476506483549573, + "loss": 0.0101, + "macro_f1": 1.0, + "num_tokens": 4988137.0, + "repeat_count": 1.0, + "routers_loss": 0.0016509373672306538, + "skip_count": 2.0, + "step": 3092, + "text_loss": 0.6397262811660767 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0008474281265495002, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 4991164.0, + "repeat_count": 0.0, + "routers_loss": 0.004088304936885834, + "skip_count": 1.0, + "step": 3094, + "text_loss": 0.18352322280406952 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008472054716089295, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 4993876.0, + "repeat_count": 0.0, + "routers_loss": 0.005200014915317297, + "skip_count": 0.0, + "step": 3096, + "text_loss": 0.2776511013507843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.544760786615791, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0008469826836185673, + "loss": 0.01, + "macro_f1": 0.6601307392120361, + "num_tokens": 4997068.0, + "repeat_count": 1.0, + "routers_loss": 0.012686059810221195, + "skip_count": 2.0, + "step": 3098, + "text_loss": 0.23209233582019806 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008467597626637858, + "loss": 0.0074, + "macro_f1": 1.0, + "num_tokens": 5000038.0, + "repeat_count": 1.0, + "routers_loss": 0.006401528604328632, + "skip_count": 2.0, + "step": 3100, + "text_loss": 0.45936745405197144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008465367088300093, + "loss": 0.0075, + "macro_f1": 0.3272727429866791, + "num_tokens": 5002870.0, + "repeat_count": 0.0, + "routers_loss": 0.016640547662973404, + "skip_count": 1.0, + "step": 3102, + "text_loss": 0.44502779841423035 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0008463135222027124, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 5006357.0, + "repeat_count": 0.0, + "routers_loss": 0.008411331102252007, + "skip_count": 2.0, + "step": 3104, + "text_loss": 0.3414570391178131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008460902028674204, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5009059.0, + "repeat_count": 0.0, + "routers_loss": 0.0010406570509076118, + "skip_count": 0.0, + "step": 3106, + "text_loss": 0.5931221842765808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0008458667509097098, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 5012327.0, + "repeat_count": 0.0, + "routers_loss": 0.001959054498001933, + "skip_count": 0.0, + "step": 3108, + "text_loss": 0.5191171169281006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008456431664152078, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 5015472.0, + "repeat_count": 0.0, + "routers_loss": 0.000994380097836256, + "skip_count": 0.0, + "step": 3110, + "text_loss": 0.4455361068248749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0008454194494695923, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 5018901.0, + "repeat_count": 0.0, + "routers_loss": 0.0037662344984710217, + "skip_count": 0.0, + "step": 3112, + "text_loss": 0.5335362553596497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 14.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0008451956001585923, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5022520.0, + "repeat_count": 0.0, + "routers_loss": 0.008664715103805065, + "skip_count": 3.0, + "step": 3114, + "text_loss": 0.16230148077011108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.629292632814794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.000844971618567987, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 5025505.0, + "repeat_count": 0.0, + "routers_loss": 0.0015904927859082818, + "skip_count": 0.0, + "step": 3116, + "text_loss": 0.6989432573318481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008447475047836068, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 5028767.0, + "repeat_count": 0.0, + "routers_loss": 0.005853322334587574, + "skip_count": 1.0, + "step": 3118, + "text_loss": 0.31420737504959106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 14.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008445232588913325, + "loss": 0.0115, + "macro_f1": 0.3272727429866791, + "num_tokens": 5032577.0, + "repeat_count": 0.0, + "routers_loss": 0.012760105542838573, + "skip_count": 0.0, + "step": 3120, + "text_loss": 0.5534627437591553 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008442988809770953, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 5035381.0, + "repeat_count": 0.0, + "routers_loss": 0.0022257440723478794, + "skip_count": 0.0, + "step": 3122, + "text_loss": 0.42492759227752686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008440743711268775, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 5038743.0, + "repeat_count": 0.0, + "routers_loss": 0.004648433532565832, + "skip_count": 0.0, + "step": 3124, + "text_loss": 0.16404685378074646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008438497294267117, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5041492.0, + "repeat_count": 0.0, + "routers_loss": 0.006313877180218697, + "skip_count": 0.0, + "step": 3126, + "text_loss": 0.23191484808921814 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0008436249559626807, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 5043955.0, + "repeat_count": 1.0, + "routers_loss": 0.0036270488053560257, + "skip_count": 0.0, + "step": 3128, + "text_loss": 0.5782018303871155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008434000508209187, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5047571.0, + "repeat_count": 0.0, + "routers_loss": 0.003809858812019229, + "skip_count": 1.0, + "step": 3130, + "text_loss": 0.7129825949668884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.704432051658351, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008431750140876092, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 5051608.0, + "repeat_count": 0.0, + "routers_loss": 0.0022369057405740023, + "skip_count": 0.0, + "step": 3132, + "text_loss": 0.4433445930480957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.000842949845848987, + "loss": 0.0135, + "macro_f1": 0.32098764181137085, + "num_tokens": 5054656.0, + "repeat_count": 0.0, + "routers_loss": 0.0425117202103138, + "skip_count": 2.0, + "step": 3134, + "text_loss": 0.38721024990081787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0008427245461913368, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 5059108.0, + "repeat_count": 0.0, + "routers_loss": 0.0018077283166348934, + "skip_count": 0.0, + "step": 3136, + "text_loss": 0.7496368885040283 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.12109375, + "learning_rate": 0.0008424991152009941, + "loss": 0.0111, + "macro_f1": 1.0, + "num_tokens": 5062371.0, + "repeat_count": 1.0, + "routers_loss": 0.008801834657788277, + "skip_count": 2.0, + "step": 3138, + "text_loss": 0.5337086319923401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 14.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0008422735529643444, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5065593.0, + "repeat_count": 0.0, + "routers_loss": 0.00548676960170269, + "skip_count": 3.0, + "step": 3140, + "text_loss": 0.2561623156070709 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0008420478595678233, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5068271.0, + "repeat_count": 0.0, + "routers_loss": 0.006389956455677748, + "skip_count": 0.0, + "step": 3142, + "text_loss": 0.15605193376541138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0008418220350979175, + "loss": 0.0128, + "macro_f1": 1.0, + "num_tokens": 5071358.0, + "repeat_count": 1.0, + "routers_loss": 0.012387622147798538, + "skip_count": 2.0, + "step": 3144, + "text_loss": 0.3085838258266449 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008415960796411628, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5075584.0, + "repeat_count": 0.0, + "routers_loss": 0.00311864772811532, + "skip_count": 1.0, + "step": 3146, + "text_loss": 0.4786977469921112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.779571470501908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0008413699932841461, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5078388.0, + "repeat_count": 0.0, + "routers_loss": 0.0030679800547659397, + "skip_count": 0.0, + "step": 3148, + "text_loss": 0.5222916603088379 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.788963897857352, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008411437761135039, + "loss": 0.011, + "macro_f1": 1.0, + "num_tokens": 5081584.0, + "repeat_count": 1.0, + "routers_loss": 0.012907958589494228, + "skip_count": 2.0, + "step": 3150, + "text_loss": 0.5369884371757507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0008409174282159232, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 5084450.0, + "repeat_count": 0.0, + "routers_loss": 0.012314042076468468, + "skip_count": 2.0, + "step": 3152, + "text_loss": 0.25685277581214905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.000840690949678141, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 5087865.0, + "repeat_count": 1.0, + "routers_loss": 0.00899206381291151, + "skip_count": 0.0, + "step": 3154, + "text_loss": 0.1717093288898468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.817141179923686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0008404643405869441, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 5090857.0, + "repeat_count": 0.0, + "routers_loss": 0.0013312003575265408, + "skip_count": 0.0, + "step": 3156, + "text_loss": 0.27446436882019043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0008402376010291695, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 5093917.0, + "repeat_count": 0.0, + "routers_loss": 0.002653320087119937, + "skip_count": 0.0, + "step": 3158, + "text_loss": 0.4237489402294159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008400107310917045, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5096656.0, + "repeat_count": 0.0, + "routers_loss": 0.012976993806660175, + "skip_count": 2.0, + "step": 3160, + "text_loss": 0.42361980676651 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.000839783730861486, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5099582.0, + "repeat_count": 0.0, + "routers_loss": 0.006936746649444103, + "skip_count": 2.0, + "step": 3162, + "text_loss": 0.26656073331832886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008395566004255008, + "loss": 0.0127, + "macro_f1": 0.6666666865348816, + "num_tokens": 5102908.0, + "repeat_count": 0.0, + "routers_loss": 0.006619359832257032, + "skip_count": 1.0, + "step": 3164, + "text_loss": 0.590774416923523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0008393293398707858, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 5105829.0, + "repeat_count": 0.0, + "routers_loss": 0.010120268911123276, + "skip_count": 2.0, + "step": 3166, + "text_loss": 0.605930507183075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008391019492844275, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 5109850.0, + "repeat_count": 0.0, + "routers_loss": 0.004940980114042759, + "skip_count": 2.0, + "step": 3168, + "text_loss": 0.12973152101039886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0008388744287535627, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 5113353.0, + "repeat_count": 0.0, + "routers_loss": 0.0031777634285390377, + "skip_count": 1.0, + "step": 3170, + "text_loss": 0.18577200174331665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0008386467783653775, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 5116421.0, + "repeat_count": 0.0, + "routers_loss": 0.005431659985333681, + "skip_count": 0.0, + "step": 3172, + "text_loss": 0.2302747517824173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 14.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.000838418998207108, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5119457.0, + "repeat_count": 0.0, + "routers_loss": 0.0077286697924137115, + "skip_count": 4.0, + "step": 3174, + "text_loss": 0.19606637954711914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0008381910883660399, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5123201.0, + "repeat_count": 0.0, + "routers_loss": 0.003982985392212868, + "skip_count": 0.0, + "step": 3176, + "text_loss": 0.716376006603241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0008379630489295089, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 5126035.0, + "repeat_count": 0.0, + "routers_loss": 0.005626026075333357, + "skip_count": 1.0, + "step": 3178, + "text_loss": 0.5144625902175903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008377348799849, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 5129179.0, + "repeat_count": 0.0, + "routers_loss": 0.015458245761692524, + "skip_count": 2.0, + "step": 3180, + "text_loss": 0.29887503385543823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 14.939242735544468, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008375065816196479, + "loss": 0.0086, + "macro_f1": 0.5492662787437439, + "num_tokens": 5132149.0, + "repeat_count": 0.0, + "routers_loss": 0.012210468761622906, + "skip_count": 2.0, + "step": 3182, + "text_loss": 0.8981851935386658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008372781539212371, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5135287.0, + "repeat_count": 0.0, + "routers_loss": 0.0052537876181304455, + "skip_count": 0.0, + "step": 3184, + "text_loss": 0.4245666563510895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0008370495969772014, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5138589.0, + "repeat_count": 0.0, + "routers_loss": 0.012873421423137188, + "skip_count": 2.0, + "step": 3186, + "text_loss": 0.40581050515174866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 14.9674200176108, + "f1_execute": 0.95652174949646, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0008368209108751244, + "loss": 0.0127, + "macro_f1": 0.6521739363670349, + "num_tokens": 5141635.0, + "repeat_count": 2.0, + "routers_loss": 0.07720445841550827, + "skip_count": 4.0, + "step": 3188, + "text_loss": 0.3755173981189728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0008365920957026389, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5144728.0, + "repeat_count": 0.0, + "routers_loss": 0.001440995605662465, + "skip_count": 0.0, + "step": 3190, + "text_loss": 0.5067034363746643 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.986204872321691, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008363631515474275, + "loss": 0.0089, + "macro_f1": 0.6538461446762085, + "num_tokens": 5147963.0, + "repeat_count": 1.0, + "routers_loss": 0.018752984702587128, + "skip_count": 2.0, + "step": 3192, + "text_loss": 0.20224551856517792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0008361340784972217, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 5151184.0, + "repeat_count": 0.0, + "routers_loss": 0.0005360354552976787, + "skip_count": 0.0, + "step": 3194, + "text_loss": 0.4588058292865753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008359048766398031, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5153889.0, + "repeat_count": 0.0, + "routers_loss": 0.0009184491937048733, + "skip_count": 1.0, + "step": 3196, + "text_loss": 0.2980220317840576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.000835675546063002, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5156758.0, + "repeat_count": 0.0, + "routers_loss": 0.001252970308996737, + "skip_count": 0.0, + "step": 3198, + "text_loss": 0.6775755882263184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008354460868546985, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 5160247.0, + "repeat_count": 0.0, + "routers_loss": 0.0037315806839615107, + "skip_count": 0.0, + "step": 3200, + "text_loss": 0.35867011547088623 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0008352164991028217, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 5163456.0, + "repeat_count": 1.0, + "routers_loss": 0.001497485558502376, + "skip_count": 0.0, + "step": 3202, + "text_loss": 0.690290093421936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008349867828953501, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 5166139.0, + "repeat_count": 0.0, + "routers_loss": 0.001051135826855898, + "skip_count": 0.0, + "step": 3204, + "text_loss": 0.3340415954589844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008347569383203113, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 5169009.0, + "repeat_count": 0.0, + "routers_loss": 0.0010544003453105688, + "skip_count": 0.0, + "step": 3206, + "text_loss": 0.8584878444671631 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008345269654657823, + "loss": 0.0085, + "macro_f1": 1.0, + "num_tokens": 5172618.0, + "repeat_count": 1.0, + "routers_loss": 0.007312417030334473, + "skip_count": 1.0, + "step": 3208, + "text_loss": 0.19500218331813812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008342968644198892, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 5175857.0, + "repeat_count": 0.0, + "routers_loss": 0.00276504410430789, + "skip_count": 0.0, + "step": 3210, + "text_loss": 0.5446314215660095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0008340666352708068, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 5178585.0, + "repeat_count": 0.0, + "routers_loss": 0.002669303445145488, + "skip_count": 0.0, + "step": 3212, + "text_loss": 0.3687484860420227 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008338362781067596, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5181777.0, + "repeat_count": 0.0, + "routers_loss": 0.0031585274264216423, + "skip_count": 0.0, + "step": 3214, + "text_loss": 0.27325859665870667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.000833605793016021, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 5184312.0, + "repeat_count": 0.0, + "routers_loss": 0.008807534351944923, + "skip_count": 2.0, + "step": 3216, + "text_loss": 0.4466548562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008333751800869133, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5187497.0, + "repeat_count": 0.0, + "routers_loss": 0.003171310294419527, + "skip_count": 0.0, + "step": 3218, + "text_loss": 0.5423526763916016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0008331444394078076, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 5190982.0, + "repeat_count": 0.0, + "routers_loss": 0.0016481258207932115, + "skip_count": 2.0, + "step": 3220, + "text_loss": 0.48984917998313904 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000832913571067124, + "loss": 0.0107, + "macro_f1": 1.0, + "num_tokens": 5194044.0, + "repeat_count": 1.0, + "routers_loss": 0.003957313951104879, + "skip_count": 1.0, + "step": 3222, + "text_loss": 0.4533331096172333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008326825751533322, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5197092.0, + "repeat_count": 0.0, + "routers_loss": 0.0016904744552448392, + "skip_count": 0.0, + "step": 3224, + "text_loss": 0.5538802742958069 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0008324514517549501, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5199941.0, + "repeat_count": 0.0, + "routers_loss": 0.005608258303254843, + "skip_count": 1.0, + "step": 3226, + "text_loss": 0.416242778301239 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 15.154975051364836, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008322202009605444, + "loss": 0.0072, + "macro_f1": 0.8823530077934265, + "num_tokens": 5202618.0, + "repeat_count": 1.0, + "routers_loss": 0.020965175703167915, + "skip_count": 2.0, + "step": 3228, + "text_loss": 0.17496295273303986 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 15.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008319888228587311, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 5206414.0, + "repeat_count": 1.0, + "routers_loss": 0.021259209141135216, + "skip_count": 5.0, + "step": 3230, + "text_loss": 0.22471418976783752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0008317573175381745, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 5209768.0, + "repeat_count": 0.0, + "routers_loss": 0.0018647604156285524, + "skip_count": 0.0, + "step": 3232, + "text_loss": 0.4415269196033478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008315256850875881, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5213257.0, + "repeat_count": 0.0, + "routers_loss": 0.002345515415072441, + "skip_count": 0.0, + "step": 3234, + "text_loss": 0.347247838973999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 15.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008312939255957336, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 5215800.0, + "repeat_count": 0.0, + "routers_loss": 0.007112892810255289, + "skip_count": 3.0, + "step": 3236, + "text_loss": 0.31091734766960144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008310620391514219, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5219205.0, + "repeat_count": 0.0, + "routers_loss": 0.00432228296995163, + "skip_count": 0.0, + "step": 3238, + "text_loss": 0.3421775996685028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0008308300258435124, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 5222422.0, + "repeat_count": 0.0, + "routers_loss": 0.0076514314860105515, + "skip_count": 2.0, + "step": 3240, + "text_loss": 0.22378318011760712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008305978857609128, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 5225625.0, + "repeat_count": 0.0, + "routers_loss": 0.0007617069641128182, + "skip_count": 0.0, + "step": 3242, + "text_loss": 0.5880323648452759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0008303656189925799, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5229113.0, + "repeat_count": 0.0, + "routers_loss": 0.0017418119823560119, + "skip_count": 0.0, + "step": 3244, + "text_loss": 0.3302813768386841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008301332256275183, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5232061.0, + "repeat_count": 0.0, + "routers_loss": 0.0026667986530810595, + "skip_count": 0.0, + "step": 3246, + "text_loss": 0.5679706335067749 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0008299007057547821, + "loss": 0.0106, + "macro_f1": 1.0, + "num_tokens": 5235279.0, + "repeat_count": 1.0, + "routers_loss": 0.011016624979674816, + "skip_count": 2.0, + "step": 3248, + "text_loss": 0.5081504583358765 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008296680594634731, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 5239655.0, + "repeat_count": 1.0, + "routers_loss": 0.005492044147104025, + "skip_count": 0.0, + "step": 3250, + "text_loss": 0.14675180613994598 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0008294352868427418, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5243579.0, + "repeat_count": 0.0, + "routers_loss": 0.00404445780441165, + "skip_count": 1.0, + "step": 3252, + "text_loss": 0.4201085865497589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0008292023879817871, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 5247059.0, + "repeat_count": 0.0, + "routers_loss": 0.006886140909045935, + "skip_count": 1.0, + "step": 3254, + "text_loss": 0.2289208322763443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008289693629698564, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 5249940.0, + "repeat_count": 0.0, + "routers_loss": 0.0005736657767556608, + "skip_count": 0.0, + "step": 3256, + "text_loss": 0.5670450925827026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.295861461696507, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0008287362118962452, + "loss": 0.006, + "macro_f1": 0.3272727429866791, + "num_tokens": 5253580.0, + "repeat_count": 0.0, + "routers_loss": 0.011349895037710667, + "skip_count": 1.0, + "step": 3258, + "text_loss": 0.5042323470115662 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0008285029348502973, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5257080.0, + "repeat_count": 0.0, + "routers_loss": 0.0013626761501654983, + "skip_count": 0.0, + "step": 3260, + "text_loss": 0.3227672874927521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0008282695319214053, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5259951.0, + "repeat_count": 0.0, + "routers_loss": 0.00471635302528739, + "skip_count": 0.0, + "step": 3262, + "text_loss": 0.20773714780807495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008280360031990093, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 5263314.0, + "repeat_count": 0.0, + "routers_loss": 0.010472415015101433, + "skip_count": 2.0, + "step": 3264, + "text_loss": 0.34397366642951965 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.333431171118287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.000827802348772598, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 5267358.0, + "repeat_count": 0.0, + "routers_loss": 0.0007814752752892673, + "skip_count": 0.0, + "step": 3266, + "text_loss": 0.747342586517334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008275685687317084, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5270400.0, + "repeat_count": 0.0, + "routers_loss": 0.000902949133887887, + "skip_count": 0.0, + "step": 3268, + "text_loss": 0.43782034516334534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008273346631659252, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5273147.0, + "repeat_count": 0.0, + "routers_loss": 0.00043462219764478505, + "skip_count": 0.0, + "step": 3270, + "text_loss": 0.6358205080032349 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008271006321648816, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 5277638.0, + "repeat_count": 0.0, + "routers_loss": 0.002211218234151602, + "skip_count": 0.0, + "step": 3272, + "text_loss": 0.20220105350017548 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008268664758182589, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5280638.0, + "repeat_count": 1.0, + "routers_loss": 0.010536720044910908, + "skip_count": 0.0, + "step": 3274, + "text_loss": 0.7579061388969421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008266321942157859, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5283847.0, + "repeat_count": 0.0, + "routers_loss": 0.0017158017726615071, + "skip_count": 0.0, + "step": 3276, + "text_loss": 0.669302761554718 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.389785735250953, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008263977874472399, + "loss": 0.0088, + "macro_f1": 0.9544159770011902, + "num_tokens": 5286627.0, + "repeat_count": 5.0, + "routers_loss": 0.011220700107514858, + "skip_count": 4.0, + "step": 3278, + "text_loss": 0.8703984022140503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008261632556024461, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5289766.0, + "repeat_count": 0.0, + "routers_loss": 0.0020442772656679153, + "skip_count": 0.0, + "step": 3280, + "text_loss": 0.5009346008300781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.0008259285987712774, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5293010.0, + "repeat_count": 0.0, + "routers_loss": 0.005645765457302332, + "skip_count": 0.0, + "step": 3282, + "text_loss": 0.2546011209487915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008256938170436549, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 5296732.0, + "repeat_count": 0.0, + "routers_loss": 0.0027385836001485586, + "skip_count": 2.0, + "step": 3284, + "text_loss": 0.5244000554084778 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008254589105095473, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 5299926.0, + "repeat_count": 1.0, + "routers_loss": 0.007451715879142284, + "skip_count": 1.0, + "step": 3286, + "text_loss": 0.28979742527008057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0008252238792589711, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5303006.0, + "repeat_count": 0.0, + "routers_loss": 0.004805843345820904, + "skip_count": 2.0, + "step": 3288, + "text_loss": 0.5131978392601013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000824988723381991, + "loss": 0.0091, + "macro_f1": 0.3272727429866791, + "num_tokens": 5306953.0, + "repeat_count": 0.0, + "routers_loss": 0.010639613494277, + "skip_count": 1.0, + "step": 3290, + "text_loss": 0.4901447296142578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 15.455532726739067, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008247534429687191, + "loss": 0.007, + "macro_f1": 0.5492662787437439, + "num_tokens": 5310516.0, + "repeat_count": 0.0, + "routers_loss": 0.013625577092170715, + "skip_count": 2.0, + "step": 3292, + "text_loss": 0.2124534696340561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008245180381093152, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 5313959.0, + "repeat_count": 0.0, + "routers_loss": 0.004958513658493757, + "skip_count": 1.0, + "step": 3294, + "text_loss": 0.46682238578796387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008242825088939867, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5316609.0, + "repeat_count": 0.0, + "routers_loss": 0.003962756600230932, + "skip_count": 0.0, + "step": 3296, + "text_loss": 0.7010108232498169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008240468554129892, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5319638.0, + "repeat_count": 0.0, + "routers_loss": 0.0006996620795689523, + "skip_count": 0.0, + "step": 3298, + "text_loss": 0.4966355860233307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0008238110777566255, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 5323019.0, + "repeat_count": 0.0, + "routers_loss": 0.0016031896229833364, + "skip_count": 0.0, + "step": 3300, + "text_loss": 0.38668957352638245 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0008235751760152459, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 5326099.0, + "repeat_count": 2.0, + "routers_loss": 0.00344281829893589, + "skip_count": 2.0, + "step": 3302, + "text_loss": 0.5330720543861389 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008233391502792484, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5328993.0, + "repeat_count": 0.0, + "routers_loss": 0.007886730134487152, + "skip_count": 1.0, + "step": 3304, + "text_loss": 0.5470269322395325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008231030006390786, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 5331554.0, + "repeat_count": 0.0, + "routers_loss": 0.008180000819265842, + "skip_count": 1.0, + "step": 3306, + "text_loss": 0.4023340344429016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0008228667271852294, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 5335712.0, + "repeat_count": 0.0, + "routers_loss": 0.0002942821884062141, + "skip_count": 0.0, + "step": 3308, + "text_loss": 0.5306711792945862 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008226303300082414, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5338701.0, + "repeat_count": 0.0, + "routers_loss": 0.0006134595023468137, + "skip_count": 0.0, + "step": 3310, + "text_loss": 0.5906263589859009 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0008223938091987022, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5342274.0, + "repeat_count": 0.0, + "routers_loss": 0.0016656654188409448, + "skip_count": 0.0, + "step": 3312, + "text_loss": 0.5201764106750488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008221571648472472, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5345185.0, + "repeat_count": 0.0, + "routers_loss": 0.0038612703792750835, + "skip_count": 0.0, + "step": 3314, + "text_loss": 0.36633720993995667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.568241855004402, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008219203970445589, + "loss": 0.011, + "macro_f1": 0.3272727429866791, + "num_tokens": 5348804.0, + "repeat_count": 0.0, + "routers_loss": 0.009782899171113968, + "skip_count": 1.0, + "step": 3316, + "text_loss": 0.3117460012435913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.577634282359847, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008216835058813672, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 5351896.0, + "repeat_count": 0.0, + "routers_loss": 0.007713229861110449, + "skip_count": 0.0, + "step": 3318, + "text_loss": 0.253496378660202 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008214464914484492, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 5355058.0, + "repeat_count": 0.0, + "routers_loss": 0.006227815989404917, + "skip_count": 2.0, + "step": 3320, + "text_loss": 0.32693132758140564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008212093538366292, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 5358365.0, + "repeat_count": 0.0, + "routers_loss": 0.002601418411359191, + "skip_count": 0.0, + "step": 3322, + "text_loss": 0.40394455194473267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 15.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000820972093136779, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5360981.0, + "repeat_count": 0.0, + "routers_loss": 0.005545300897210836, + "skip_count": 3.0, + "step": 3324, + "text_loss": 0.6758295893669128 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0008207347094398172, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 5364018.0, + "repeat_count": 1.0, + "routers_loss": 0.001924700103700161, + "skip_count": 0.0, + "step": 3326, + "text_loss": 0.5196860432624817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0008204972028367097, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5366986.0, + "repeat_count": 0.0, + "routers_loss": 0.012254828587174416, + "skip_count": 1.0, + "step": 3328, + "text_loss": 0.24661913514137268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0008202595734184694, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5371463.0, + "repeat_count": 0.0, + "routers_loss": 0.005094083491712809, + "skip_count": 0.0, + "step": 3330, + "text_loss": 0.2525769770145416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.643381273847961, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008200218212761566, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 5374823.0, + "repeat_count": 1.0, + "routers_loss": 0.0025883198250085115, + "skip_count": 0.0, + "step": 3332, + "text_loss": 0.21849912405014038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.000819783946500878, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5377640.0, + "repeat_count": 0.0, + "routers_loss": 0.008240507915616035, + "skip_count": 0.0, + "step": 3334, + "text_loss": 0.2662734091281891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 15.66216612855885, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.050537109375, + "learning_rate": 0.000819545949183788, + "loss": 0.01, + "macro_f1": 0.5934640765190125, + "num_tokens": 5380593.0, + "repeat_count": 0.0, + "routers_loss": 0.038378193974494934, + "skip_count": 3.0, + "step": 3336, + "text_loss": 0.2431795746088028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008193078294160874, + "loss": 0.0097, + "macro_f1": 1.0, + "num_tokens": 5384487.0, + "repeat_count": 1.0, + "routers_loss": 0.005926199723035097, + "skip_count": 1.0, + "step": 3338, + "text_loss": 0.5663705468177795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0008190695872890242, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5387511.0, + "repeat_count": 0.0, + "routers_loss": 0.010842559859156609, + "skip_count": 2.0, + "step": 3340, + "text_loss": 0.11517292261123657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008188312228938933, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 5390698.0, + "repeat_count": 0.0, + "routers_loss": 0.001304097007960081, + "skip_count": 0.0, + "step": 3342, + "text_loss": 0.4827076196670532 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008185927363220363, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5393778.0, + "repeat_count": 1.0, + "routers_loss": 0.005354117136448622, + "skip_count": 0.0, + "step": 3344, + "text_loss": 0.44467049837112427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008183541276648418, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5396925.0, + "repeat_count": 0.0, + "routers_loss": 0.004800073802471161, + "skip_count": 2.0, + "step": 3346, + "text_loss": 0.2032834142446518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0008181153970137449, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 5400522.0, + "repeat_count": 0.0, + "routers_loss": 0.0021674633026123047, + "skip_count": 0.0, + "step": 3348, + "text_loss": 0.4507528841495514 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.727913120046962, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0008178765444602278, + "loss": 0.0117, + "macro_f1": 0.8820862174034119, + "num_tokens": 5403526.0, + "repeat_count": 2.0, + "routers_loss": 0.04263930395245552, + "skip_count": 2.0, + "step": 3350, + "text_loss": 0.3606615960597992 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008176375700958194, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5407127.0, + "repeat_count": 1.0, + "routers_loss": 0.006953123956918716, + "skip_count": 0.0, + "step": 3352, + "text_loss": 0.2290353775024414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008173984740120948, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 5410829.0, + "repeat_count": 0.0, + "routers_loss": 0.0014363783411681652, + "skip_count": 0.0, + "step": 3354, + "text_loss": 0.4220392405986786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0008171592563006762, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5414152.0, + "repeat_count": 0.0, + "routers_loss": 0.00202389364130795, + "skip_count": 1.0, + "step": 3356, + "text_loss": 0.37729766964912415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008169199170532323, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 5417312.0, + "repeat_count": 0.0, + "routers_loss": 0.006253739818930626, + "skip_count": 2.0, + "step": 3358, + "text_loss": 0.1304289996623993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0008166804563614785, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 5421227.0, + "repeat_count": 2.0, + "routers_loss": 0.01622140221297741, + "skip_count": 2.0, + "step": 3360, + "text_loss": 0.298664391040802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0008164408743171763, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 5424646.0, + "repeat_count": 1.0, + "routers_loss": 0.0037176944315433502, + "skip_count": 2.0, + "step": 3362, + "text_loss": 0.12147632241249084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008162011710121339, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 5427897.0, + "repeat_count": 0.0, + "routers_loss": 0.0020403533708304167, + "skip_count": 1.0, + "step": 3364, + "text_loss": 0.2656533420085907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.803052538890519, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008159613465382066, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5430474.0, + "repeat_count": 0.0, + "routers_loss": 0.0018634048756211996, + "skip_count": 0.0, + "step": 3366, + "text_loss": 0.9133086204528809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0008157214009872951, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5433113.0, + "repeat_count": 0.0, + "routers_loss": 0.012944488786160946, + "skip_count": 2.0, + "step": 3368, + "text_loss": 0.24352453649044037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05712890625, + "learning_rate": 0.0008154813344513472, + "loss": 0.0143, + "macro_f1": 0.6666666865348816, + "num_tokens": 5436259.0, + "repeat_count": 0.0, + "routers_loss": 0.002347963862121105, + "skip_count": 2.0, + "step": 3370, + "text_loss": 0.7601244449615479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0008152411470223568, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 5439126.0, + "repeat_count": 0.0, + "routers_loss": 0.0016609140438959002, + "skip_count": 0.0, + "step": 3372, + "text_loss": 0.5551947355270386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008150008387923643, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 5442739.0, + "repeat_count": 0.0, + "routers_loss": 0.008321396075189114, + "skip_count": 0.0, + "step": 3374, + "text_loss": 0.25028282403945923 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 15.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.000814760409853456, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 5445247.0, + "repeat_count": 2.0, + "routers_loss": 0.009738070890307426, + "skip_count": 1.0, + "step": 3376, + "text_loss": 0.37271201610565186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008145198602977651, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5449044.0, + "repeat_count": 0.0, + "routers_loss": 0.0028421466704458, + "skip_count": 0.0, + "step": 3378, + "text_loss": 0.1458655595779419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.868799530378633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0008142791902174701, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 5453063.0, + "repeat_count": 0.0, + "routers_loss": 0.0015170135302469134, + "skip_count": 0.0, + "step": 3380, + "text_loss": 0.5548722743988037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0008140383997047966, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 5455814.0, + "repeat_count": 0.0, + "routers_loss": 0.0022444510832428932, + "skip_count": 1.0, + "step": 3382, + "text_loss": 0.8034513592720032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000813797488852016, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5459392.0, + "repeat_count": 0.0, + "routers_loss": 0.00038578867679461837, + "skip_count": 0.0, + "step": 3384, + "text_loss": 0.6940088868141174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.896976812444967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008135564577514458, + "loss": 0.011, + "macro_f1": 0.3333333432674408, + "num_tokens": 5462413.0, + "repeat_count": 0.0, + "routers_loss": 0.0019727381877601147, + "skip_count": 0.0, + "step": 3386, + "text_loss": 0.5124650597572327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0008133153064954495, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 5465552.0, + "repeat_count": 0.0, + "routers_loss": 0.0019896167796105146, + "skip_count": 0.0, + "step": 3388, + "text_loss": 0.4292517900466919 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008130740351764367, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 5468573.0, + "repeat_count": 1.0, + "routers_loss": 0.0030118159484118223, + "skip_count": 1.0, + "step": 3390, + "text_loss": 0.48903173208236694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.000812832643886863, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5471547.0, + "repeat_count": 0.0, + "routers_loss": 0.005084246397018433, + "skip_count": 2.0, + "step": 3392, + "text_loss": 0.35789889097213745 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008125911327192299, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5474331.0, + "repeat_count": 0.0, + "routers_loss": 0.0008874498889781535, + "skip_count": 0.0, + "step": 3394, + "text_loss": 0.6267408728599548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008123495017660851, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5477633.0, + "repeat_count": 0.0, + "routers_loss": 0.001794386887922883, + "skip_count": 0.0, + "step": 3396, + "text_loss": 0.3701885938644409 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008121077511200221, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5481277.0, + "repeat_count": 0.0, + "routers_loss": 0.002140481723472476, + "skip_count": 0.0, + "step": 3398, + "text_loss": 0.6362857818603516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.00081186588087368, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 5484237.0, + "repeat_count": 0.0, + "routers_loss": 0.000867189432028681, + "skip_count": 0.0, + "step": 3400, + "text_loss": 1.0847382545471191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008116238911197442, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5487423.0, + "repeat_count": 0.0, + "routers_loss": 0.0029817656613886356, + "skip_count": 0.0, + "step": 3402, + "text_loss": 0.3813740313053131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008113817819509454, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5490155.0, + "repeat_count": 0.0, + "routers_loss": 0.0035141287371516228, + "skip_count": 0.0, + "step": 3404, + "text_loss": 0.2113083451986313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0008111395534600603, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5493415.0, + "repeat_count": 0.0, + "routers_loss": 0.003317659953609109, + "skip_count": 0.0, + "step": 3406, + "text_loss": 0.5869330167770386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008108972057399114, + "loss": 0.0123, + "macro_f1": 0.6666666865348816, + "num_tokens": 5496032.0, + "repeat_count": 0.0, + "routers_loss": 0.003833734430372715, + "skip_count": 2.0, + "step": 3408, + "text_loss": 0.2938928008079529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11328125, + "learning_rate": 0.0008106547388833669, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 5498890.0, + "repeat_count": 0.0, + "routers_loss": 0.002622978063300252, + "skip_count": 1.0, + "step": 3410, + "text_loss": 0.3130980432033539 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008104121529833402, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 5502010.0, + "repeat_count": 1.0, + "routers_loss": 0.007447598036378622, + "skip_count": 0.0, + "step": 3412, + "text_loss": 0.4413072466850281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.000810169448132791, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5505212.0, + "repeat_count": 0.0, + "routers_loss": 0.0031087708193808794, + "skip_count": 1.0, + "step": 3414, + "text_loss": 0.2910428047180176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.037569709421778, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008099266244247243, + "loss": 0.0082, + "macro_f1": 0.3272727429866791, + "num_tokens": 5508755.0, + "repeat_count": 0.0, + "routers_loss": 0.02510393038392067, + "skip_count": 1.0, + "step": 3416, + "text_loss": 0.33022749423980713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008096836819521903, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5512034.0, + "repeat_count": 0.0, + "routers_loss": 0.0020537273958325386, + "skip_count": 1.0, + "step": 3418, + "text_loss": 0.4731218218803406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0008094406208082853, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5515707.0, + "repeat_count": 0.0, + "routers_loss": 0.004218162503093481, + "skip_count": 2.0, + "step": 3420, + "text_loss": 0.23429590463638306 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 16.065746991488112, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0869140625, + "learning_rate": 0.0008091974410861507, + "loss": 0.0069, + "macro_f1": 0.9265305995941162, + "num_tokens": 5518436.0, + "repeat_count": 1.0, + "routers_loss": 0.013488355092704296, + "skip_count": 3.0, + "step": 3422, + "text_loss": 0.45768749713897705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008089541428789733, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5522368.0, + "repeat_count": 0.0, + "routers_loss": 0.0010335417464375496, + "skip_count": 1.0, + "step": 3424, + "text_loss": 0.43423423171043396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0008087107262799855, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 5526061.0, + "repeat_count": 0.0, + "routers_loss": 0.002134323585778475, + "skip_count": 0.0, + "step": 3426, + "text_loss": 0.4031757414340973 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.0008084671913824651, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5529284.0, + "repeat_count": 0.0, + "routers_loss": 0.0097216060385108, + "skip_count": 2.0, + "step": 3428, + "text_loss": 0.2836039960384369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.000808223538279735, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 5532159.0, + "repeat_count": 0.0, + "routers_loss": 0.001684269867837429, + "skip_count": 0.0, + "step": 3430, + "text_loss": 0.5804527401924133 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008079797670651637, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 5536050.0, + "repeat_count": 1.0, + "routers_loss": 0.013918434269726276, + "skip_count": 1.0, + "step": 3432, + "text_loss": 0.31325826048851013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008077358778321647, + "loss": 0.011, + "macro_f1": 0.3333333432674408, + "num_tokens": 5538885.0, + "repeat_count": 0.0, + "routers_loss": 0.0007751787197776139, + "skip_count": 0.0, + "step": 3434, + "text_loss": 0.783108115196228 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.131493982976224, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008074918706741966, + "loss": 0.0063, + "macro_f1": 0.9262410998344421, + "num_tokens": 5541909.0, + "repeat_count": 3.0, + "routers_loss": 0.021819550544023514, + "skip_count": 2.0, + "step": 3436, + "text_loss": 0.6558083295822144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.14088641033167, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0008072477456847638, + "loss": 0.0057, + "macro_f1": 0.3272727429866791, + "num_tokens": 5545101.0, + "repeat_count": 1.0, + "routers_loss": 0.03309348225593567, + "skip_count": 0.0, + "step": 3438, + "text_loss": 0.9877075552940369 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008070035029574151, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 5548971.0, + "repeat_count": 1.0, + "routers_loss": 0.008696741424500942, + "skip_count": 1.0, + "step": 3440, + "text_loss": 0.24766330420970917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.000806759142585745, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 5552174.0, + "repeat_count": 0.0, + "routers_loss": 0.004240929149091244, + "skip_count": 3.0, + "step": 3442, + "text_loss": 0.37255001068115234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008065146646633927, + "loss": 0.0088, + "macro_f1": 0.6666666865348816, + "num_tokens": 5555005.0, + "repeat_count": 0.0, + "routers_loss": 0.014345484785735607, + "skip_count": 1.0, + "step": 3444, + "text_loss": 0.26157206296920776 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008062700692840428, + "loss": 0.0083, + "macro_f1": 1.0, + "num_tokens": 5559127.0, + "repeat_count": 1.0, + "routers_loss": 0.008315163664519787, + "skip_count": 2.0, + "step": 3446, + "text_loss": 0.21971040964126587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 16.187848547108892, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008060253565414246, + "loss": 0.009, + "macro_f1": 0.5934640765190125, + "num_tokens": 5562254.0, + "repeat_count": 0.0, + "routers_loss": 0.009582413360476494, + "skip_count": 3.0, + "step": 3448, + "text_loss": 0.6758295893669128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0008057805265293124, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 5565515.0, + "repeat_count": 0.0, + "routers_loss": 0.002429503947496414, + "skip_count": 0.0, + "step": 3450, + "text_loss": 0.696592390537262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008055355793415257, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5568392.0, + "repeat_count": 0.0, + "routers_loss": 0.0007724192109890282, + "skip_count": 0.0, + "step": 3452, + "text_loss": 0.7092870473861694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008052905150719285, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 5571090.0, + "repeat_count": 0.0, + "routers_loss": 0.0010859938338398933, + "skip_count": 0.0, + "step": 3454, + "text_loss": 0.6593860387802124 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008050453338144301, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 5574552.0, + "repeat_count": 1.0, + "routers_loss": 0.0030258705373853445, + "skip_count": 1.0, + "step": 3456, + "text_loss": 0.3479384481906891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008048000356629844, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 5577484.0, + "repeat_count": 0.0, + "routers_loss": 0.005052885971963406, + "skip_count": 2.0, + "step": 3458, + "text_loss": 0.21858671307563782 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0008045546207115901, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 5581605.0, + "repeat_count": 1.0, + "routers_loss": 0.009976249188184738, + "skip_count": 3.0, + "step": 3460, + "text_loss": 0.16868001222610474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0008043090890542904, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5584994.0, + "repeat_count": 0.0, + "routers_loss": 0.00270817126147449, + "skip_count": 0.0, + "step": 3462, + "text_loss": 0.785690426826477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008040634407851739, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 5588067.0, + "repeat_count": 0.0, + "routers_loss": 0.0018436965765431523, + "skip_count": 0.0, + "step": 3464, + "text_loss": 0.5006644129753113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0008038176759983731, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5590789.0, + "repeat_count": 0.0, + "routers_loss": 0.008516279980540276, + "skip_count": 2.0, + "step": 3466, + "text_loss": 0.20963478088378906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0008035717947880659, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 5593472.0, + "repeat_count": 0.0, + "routers_loss": 0.0016293043736368418, + "skip_count": 0.0, + "step": 3468, + "text_loss": 0.7376078963279724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0008033257972484742, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5596108.0, + "repeat_count": 0.0, + "routers_loss": 0.002364142332226038, + "skip_count": 0.0, + "step": 3470, + "text_loss": 0.5156455039978027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008030796834738649, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5599103.0, + "repeat_count": 0.0, + "routers_loss": 0.008872323669493198, + "skip_count": 0.0, + "step": 3472, + "text_loss": 0.2996419668197632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008028334535585491, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5602410.0, + "repeat_count": 0.0, + "routers_loss": 0.011508257128298283, + "skip_count": 3.0, + "step": 3474, + "text_loss": 0.25438693165779114 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0008025871075968827, + "loss": 0.0106, + "macro_f1": 1.0, + "num_tokens": 5605424.0, + "repeat_count": 2.0, + "routers_loss": 0.017225435003638268, + "skip_count": 2.0, + "step": 3476, + "text_loss": 0.2549574077129364 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.328734957440563, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008023406456832657, + "loss": 0.0111, + "macro_f1": 0.9262410998344421, + "num_tokens": 5608266.0, + "repeat_count": 3.0, + "routers_loss": 0.039165645837783813, + "skip_count": 2.0, + "step": 3478, + "text_loss": 0.1797947734594345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0008020940679121429, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5611471.0, + "repeat_count": 0.0, + "routers_loss": 0.0009718866203911602, + "skip_count": 0.0, + "step": 3480, + "text_loss": 0.8267702460289001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008018473743780036, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5615046.0, + "repeat_count": 0.0, + "routers_loss": 0.006087122485041618, + "skip_count": 2.0, + "step": 3482, + "text_loss": 0.7267677187919617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000801600565175381, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5618350.0, + "repeat_count": 0.0, + "routers_loss": 0.0007539413054473698, + "skip_count": 0.0, + "step": 3484, + "text_loss": 0.5910211801528931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008013536403988529, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 5621381.0, + "repeat_count": 0.0, + "routers_loss": 0.0008076327503658831, + "skip_count": 0.0, + "step": 3486, + "text_loss": 0.30616798996925354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 16.375697094217788, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008011066001430412, + "loss": 0.0086, + "macro_f1": 0.6122449040412903, + "num_tokens": 5624617.0, + "repeat_count": 0.0, + "routers_loss": 0.023835813626646996, + "skip_count": 4.0, + "step": 3488, + "text_loss": 0.3376443088054657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008008594445026122, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 5627989.0, + "repeat_count": 0.0, + "routers_loss": 0.004226419143378735, + "skip_count": 2.0, + "step": 3490, + "text_loss": 0.8185343146324158 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.394481948928675, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008006121735722767, + "loss": 0.0084, + "macro_f1": 0.32098764181137085, + "num_tokens": 5632286.0, + "repeat_count": 0.0, + "routers_loss": 0.0366671048104763, + "skip_count": 2.0, + "step": 3492, + "text_loss": 0.2209547609090805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008003647874467892, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 5635368.0, + "repeat_count": 1.0, + "routers_loss": 0.012956378981471062, + "skip_count": 0.0, + "step": 3494, + "text_loss": 0.20468664169311523 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0008001172862209485, + "loss": 0.0103, + "macro_f1": 0.6666666865348816, + "num_tokens": 5638440.0, + "repeat_count": 1.0, + "routers_loss": 0.0017375422175973654, + "skip_count": 0.0, + "step": 3496, + "text_loss": 0.6647221446037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 16.42265923099501, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0007998696699895976, + "loss": 0.0091, + "macro_f1": 0.6592592597007751, + "num_tokens": 5641996.0, + "repeat_count": 1.0, + "routers_loss": 0.025240756571292877, + "skip_count": 5.0, + "step": 3498, + "text_loss": 0.23892143368721008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0007996219388476236, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5645071.0, + "repeat_count": 0.0, + "routers_loss": 0.007436830550432205, + "skip_count": 1.0, + "step": 3500, + "text_loss": 0.7580804228782654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0007993740928899571, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 5648175.0, + "repeat_count": 0.0, + "routers_loss": 0.001126602990552783, + "skip_count": 0.0, + "step": 3502, + "text_loss": 0.5281378626823425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0007991261322115737, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 5650973.0, + "repeat_count": 0.0, + "routers_loss": 0.0007907263352535665, + "skip_count": 0.0, + "step": 3504, + "text_loss": 0.25220927596092224 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.000798878056907492, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 5654252.0, + "repeat_count": 2.0, + "routers_loss": 0.006263538729399443, + "skip_count": 2.0, + "step": 3506, + "text_loss": 0.46569153666496277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0007986298670727752, + "loss": 0.0098, + "macro_f1": 0.6666666865348816, + "num_tokens": 5657229.0, + "repeat_count": 0.0, + "routers_loss": 0.004049144219607115, + "skip_count": 3.0, + "step": 3508, + "text_loss": 0.15174436569213867 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 16.479013795127678, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0791015625, + "learning_rate": 0.0007983815628025301, + "loss": 0.0074, + "macro_f1": 0.9262410998344421, + "num_tokens": 5659974.0, + "repeat_count": 2.0, + "routers_loss": 0.0471976138651371, + "skip_count": 3.0, + "step": 3510, + "text_loss": 0.39072203636169434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.488406222483125, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000798133144191907, + "loss": 0.0082, + "macro_f1": 0.3272727429866791, + "num_tokens": 5662893.0, + "repeat_count": 0.0, + "routers_loss": 0.04030488431453705, + "skip_count": 1.0, + "step": 3512, + "text_loss": 0.3562147617340088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0007978846113361009, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 5666476.0, + "repeat_count": 0.0, + "routers_loss": 0.007475079502910376, + "skip_count": 1.0, + "step": 3514, + "text_loss": 0.26518192887306213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0007976359643303497, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 5669647.0, + "repeat_count": 0.0, + "routers_loss": 0.00558585487306118, + "skip_count": 2.0, + "step": 3516, + "text_loss": 0.29284560680389404 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007973872032699354, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 5673491.0, + "repeat_count": 1.0, + "routers_loss": 0.0026981087867170572, + "skip_count": 1.0, + "step": 3518, + "text_loss": 0.35089045763015747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.000797138328250184, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 5676529.0, + "repeat_count": 1.0, + "routers_loss": 0.0027328627184033394, + "skip_count": 0.0, + "step": 3520, + "text_loss": 0.41077399253845215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 16.535368359260346, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007968893393664646, + "loss": 0.01, + "macro_f1": 0.6592592597007751, + "num_tokens": 5679987.0, + "repeat_count": 1.0, + "routers_loss": 0.02695014327764511, + "skip_count": 5.0, + "step": 3522, + "text_loss": 0.44942837953567505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007966402367141903, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 5683185.0, + "repeat_count": 0.0, + "routers_loss": 0.00817026849836111, + "skip_count": 2.0, + "step": 3524, + "text_loss": 0.14528048038482666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0007963910203888176, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 5686544.0, + "repeat_count": 0.0, + "routers_loss": 0.0021973433904349804, + "skip_count": 0.0, + "step": 3526, + "text_loss": 0.22358648478984833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0007961416904858469, + "loss": 0.0078, + "macro_f1": 0.3272727429866791, + "num_tokens": 5689579.0, + "repeat_count": 0.0, + "routers_loss": 0.033712416887283325, + "skip_count": 1.0, + "step": 3528, + "text_loss": 0.3083649277687073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007958922471008217, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5692869.0, + "repeat_count": 0.0, + "routers_loss": 0.011182719841599464, + "skip_count": 2.0, + "step": 3530, + "text_loss": 0.21288011968135834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0007956426903293292, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5696007.0, + "repeat_count": 0.0, + "routers_loss": 0.0015808293828740716, + "skip_count": 0.0, + "step": 3532, + "text_loss": 0.6068631410598755 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.591722923393014, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007953930202670001, + "loss": 0.0062, + "macro_f1": 0.5492662787437439, + "num_tokens": 5699474.0, + "repeat_count": 2.0, + "routers_loss": 0.03205178305506706, + "skip_count": 0.0, + "step": 3534, + "text_loss": 0.4317135512828827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007951432370095084, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 5703483.0, + "repeat_count": 0.0, + "routers_loss": 0.003518853336572647, + "skip_count": 0.0, + "step": 3536, + "text_loss": 0.5432273149490356 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007948933406525715, + "loss": 0.01, + "macro_f1": 1.0, + "num_tokens": 5707301.0, + "repeat_count": 1.0, + "routers_loss": 0.004982157610356808, + "skip_count": 1.0, + "step": 3538, + "text_loss": 0.40061065554618835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0007946433312919502, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5710847.0, + "repeat_count": 0.0, + "routers_loss": 0.003067734418436885, + "skip_count": 0.0, + "step": 3540, + "text_loss": 0.5396234393119812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 16.629292632814792, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007943932090234486, + "loss": 0.0097, + "macro_f1": 0.5492662787437439, + "num_tokens": 5713683.0, + "repeat_count": 0.0, + "routers_loss": 0.03728383034467697, + "skip_count": 2.0, + "step": 3542, + "text_loss": 0.18310914933681488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007941429739429138, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 5716397.0, + "repeat_count": 0.0, + "routers_loss": 0.0025092530995607376, + "skip_count": 3.0, + "step": 3544, + "text_loss": 0.5806207060813904 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007938926261462366, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5719984.0, + "repeat_count": 0.0, + "routers_loss": 0.002493767999112606, + "skip_count": 0.0, + "step": 3546, + "text_loss": 0.38606807589530945 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 16.657469914881126, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05078125, + "learning_rate": 0.0007936421657293507, + "loss": 0.0094, + "macro_f1": 0.8823530077934265, + "num_tokens": 5723571.0, + "repeat_count": 1.0, + "routers_loss": 0.014810923486948013, + "skip_count": 2.0, + "step": 3548, + "text_loss": 0.49558472633361816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0007933915927882327, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5726405.0, + "repeat_count": 0.0, + "routers_loss": 0.00152928801253438, + "skip_count": 0.0, + "step": 3550, + "text_loss": 0.8674797415733337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000793140907418903, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5729955.0, + "repeat_count": 0.0, + "routers_loss": 0.005522782914340496, + "skip_count": 2.0, + "step": 3552, + "text_loss": 0.3274473249912262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007928901097174248, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5733030.0, + "repeat_count": 0.0, + "routers_loss": 0.009207013063132763, + "skip_count": 2.0, + "step": 3554, + "text_loss": 0.18237128853797913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0007926391997799039, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5735978.0, + "repeat_count": 0.0, + "routers_loss": 0.00695531303063035, + "skip_count": 0.0, + "step": 3556, + "text_loss": 0.3266434967517853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007923881777024898, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 5738901.0, + "repeat_count": 0.0, + "routers_loss": 0.002743212040513754, + "skip_count": 1.0, + "step": 3558, + "text_loss": 0.4971913695335388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0007921370435813741, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5741946.0, + "repeat_count": 1.0, + "routers_loss": 0.007037297356873751, + "skip_count": 0.0, + "step": 3560, + "text_loss": 0.5645473599433899 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007918857975127924, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5744987.0, + "repeat_count": 0.0, + "routers_loss": 0.0030746585689485073, + "skip_count": 0.0, + "step": 3562, + "text_loss": 0.17717665433883667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0007916344395930224, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 5747837.0, + "repeat_count": 0.0, + "routers_loss": 0.004522138275206089, + "skip_count": 0.0, + "step": 3564, + "text_loss": 0.7676118612289429 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.000791382969918385, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5750716.0, + "repeat_count": 0.0, + "routers_loss": 0.0026240211445838213, + "skip_count": 0.0, + "step": 3566, + "text_loss": 0.4975173771381378 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.751394188435572, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.000791131388585244, + "loss": 0.011, + "macro_f1": 0.8820862174034119, + "num_tokens": 5754368.0, + "repeat_count": 2.0, + "routers_loss": 0.021831991150975227, + "skip_count": 2.0, + "step": 3568, + "text_loss": 0.9670342206954956 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0007908796956900055, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5757076.0, + "repeat_count": 1.0, + "routers_loss": 0.0017586691537871957, + "skip_count": 0.0, + "step": 3570, + "text_loss": 0.3057977259159088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.000790627891329119, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5760613.0, + "repeat_count": 0.0, + "routers_loss": 0.005515786819159985, + "skip_count": 0.0, + "step": 3572, + "text_loss": 0.5860086679458618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007903759755990763, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 5763557.0, + "repeat_count": 0.0, + "routers_loss": 0.004096484277397394, + "skip_count": 0.0, + "step": 3574, + "text_loss": 0.17175781726837158 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.000790123948596412, + "loss": 0.0119, + "macro_f1": 0.6666666865348816, + "num_tokens": 5767430.0, + "repeat_count": 1.0, + "routers_loss": 0.005216122139245272, + "skip_count": 0.0, + "step": 3576, + "text_loss": 0.7520374059677124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0007898718104177031, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 5770175.0, + "repeat_count": 0.0, + "routers_loss": 0.0037980107590556145, + "skip_count": 0.0, + "step": 3578, + "text_loss": 0.18117885291576385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007896195611595699, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5773032.0, + "repeat_count": 0.0, + "routers_loss": 0.003672175807878375, + "skip_count": 2.0, + "step": 3580, + "text_loss": 0.7241058349609375 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0007893672009186744, + "loss": 0.0083, + "macro_f1": 1.0, + "num_tokens": 5776077.0, + "repeat_count": 1.0, + "routers_loss": 0.01229850109666586, + "skip_count": 3.0, + "step": 3582, + "text_loss": 0.29140418767929077 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007891147297917216, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5779088.0, + "repeat_count": 1.0, + "routers_loss": 0.0035251814406365156, + "skip_count": 0.0, + "step": 3584, + "text_loss": 0.1727485954761505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.000788862147875459, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 5782201.0, + "repeat_count": 0.0, + "routers_loss": 0.004725661128759384, + "skip_count": 2.0, + "step": 3586, + "text_loss": 0.43512848019599915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0007886094552666765, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5785039.0, + "repeat_count": 0.0, + "routers_loss": 0.005632172804325819, + "skip_count": 0.0, + "step": 3588, + "text_loss": 0.3534786105155945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0007883566520622062, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 5788017.0, + "repeat_count": 0.0, + "routers_loss": 0.006249965168535709, + "skip_count": 1.0, + "step": 3590, + "text_loss": 0.2089710384607315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0007881037383589229, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 5791168.0, + "repeat_count": 0.0, + "routers_loss": 0.0013797614956274629, + "skip_count": 0.0, + "step": 3592, + "text_loss": 0.4349329471588135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0007878507142537436, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 5793927.0, + "repeat_count": 0.0, + "routers_loss": 0.0019719740375876427, + "skip_count": 1.0, + "step": 3594, + "text_loss": 0.6087368726730347 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007875975798436274, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 5797214.0, + "repeat_count": 1.0, + "routers_loss": 0.0037070370744913816, + "skip_count": 0.0, + "step": 3596, + "text_loss": 0.4258122444152832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0007873443352255764, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5800691.0, + "repeat_count": 0.0, + "routers_loss": 0.008431311696767807, + "skip_count": 0.0, + "step": 3598, + "text_loss": 0.6006711721420288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0007870909804966337, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5804712.0, + "repeat_count": 0.0, + "routers_loss": 0.0017720256000757217, + "skip_count": 0.0, + "step": 3600, + "text_loss": 0.6055042743682861 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.911065453478134, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0007868375157538861, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 5807670.0, + "repeat_count": 1.0, + "routers_loss": 0.010697763413190842, + "skip_count": 0.0, + "step": 3602, + "text_loss": 0.8039056658744812 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0007865839410944611, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5810880.0, + "repeat_count": 1.0, + "routers_loss": 0.0030022128485143185, + "skip_count": 0.0, + "step": 3604, + "text_loss": 0.596110463142395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0007863302566155295, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5814171.0, + "repeat_count": 0.0, + "routers_loss": 0.006257854867726564, + "skip_count": 2.0, + "step": 3606, + "text_loss": 0.5700319409370422 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0007860764624143031, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 5817607.0, + "repeat_count": 1.0, + "routers_loss": 0.004838473163545132, + "skip_count": 0.0, + "step": 3608, + "text_loss": 0.8319530487060547 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 16.94863516289991, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08154296875, + "learning_rate": 0.0007858225585880369, + "loss": 0.0067, + "macro_f1": 0.8823530077934265, + "num_tokens": 5821452.0, + "repeat_count": 1.0, + "routers_loss": 0.02173662930727005, + "skip_count": 2.0, + "step": 3610, + "text_loss": 0.3738477826118469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007855685452340269, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5824683.0, + "repeat_count": 0.0, + "routers_loss": 0.0032719180453568697, + "skip_count": 0.0, + "step": 3612, + "text_loss": 0.4054839015007019 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.967420017610802, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007853144224496118, + "loss": 0.0093, + "macro_f1": 0.3272727429866791, + "num_tokens": 5827860.0, + "repeat_count": 1.0, + "routers_loss": 0.032171256840229034, + "skip_count": 0.0, + "step": 3614, + "text_loss": 0.18112395703792572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0007850601903321716, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5831651.0, + "repeat_count": 0.0, + "routers_loss": 0.013230946846306324, + "skip_count": 1.0, + "step": 3616, + "text_loss": 0.2698844075202942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.000784805848979129, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5834369.0, + "repeat_count": 0.0, + "routers_loss": 0.00162619655020535, + "skip_count": 0.0, + "step": 3618, + "text_loss": 0.2430931180715561 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0007845513984879477, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 5838102.0, + "repeat_count": 1.0, + "routers_loss": 0.002781603019684553, + "skip_count": 0.0, + "step": 3620, + "text_loss": 0.4968300759792328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007842968389561337, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 5841029.0, + "repeat_count": 0.0, + "routers_loss": 0.0023873315658420324, + "skip_count": 0.0, + "step": 3622, + "text_loss": 0.5842974781990051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0007840421704812346, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 5845158.0, + "repeat_count": 0.0, + "routers_loss": 0.00400173757225275, + "skip_count": 1.0, + "step": 3624, + "text_loss": 0.8312450647354126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00078378739316084, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 5849175.0, + "repeat_count": 0.0, + "routers_loss": 0.0004974664188921452, + "skip_count": 0.0, + "step": 3626, + "text_loss": 0.48637253046035767 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 17.032873495744056, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.10693359375, + "learning_rate": 0.000783532507092581, + "loss": 0.0079, + "macro_f1": 0.9555556178092957, + "num_tokens": 5852020.0, + "repeat_count": 1.0, + "routers_loss": 0.02555239573121071, + "skip_count": 5.0, + "step": 3628, + "text_loss": 0.5407033562660217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007832775123741306, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5854873.0, + "repeat_count": 0.0, + "routers_loss": 0.0025962977670133114, + "skip_count": 0.0, + "step": 3630, + "text_loss": 0.618230938911438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.000783022409103203, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5858086.0, + "repeat_count": 0.0, + "routers_loss": 0.0029271875973790884, + "skip_count": 0.0, + "step": 3632, + "text_loss": 0.21259798109531403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007827671973775542, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 5860886.0, + "repeat_count": 0.0, + "routers_loss": 0.004102068953216076, + "skip_count": 0.0, + "step": 3634, + "text_loss": 0.4991208016872406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0007825118772949819, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5864291.0, + "repeat_count": 0.0, + "routers_loss": 0.0023497689981013536, + "skip_count": 1.0, + "step": 3636, + "text_loss": 0.3878401517868042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0007822564489533255, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 5867155.0, + "repeat_count": 0.0, + "routers_loss": 0.007680345326662064, + "skip_count": 2.0, + "step": 3638, + "text_loss": 0.6132124066352844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0007820009124504653, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5870325.0, + "repeat_count": 0.0, + "routers_loss": 0.0008242831099778414, + "skip_count": 0.0, + "step": 3640, + "text_loss": 0.3552473187446594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.098620487232168, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007817452678843236, + "loss": 0.0073, + "macro_f1": 0.6601307392120361, + "num_tokens": 5873301.0, + "repeat_count": 1.0, + "routers_loss": 0.023831043392419815, + "skip_count": 2.0, + "step": 3642, + "text_loss": 0.18363867700099945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0007814895153528635, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5876225.0, + "repeat_count": 0.0, + "routers_loss": 0.001999989850446582, + "skip_count": 0.0, + "step": 3644, + "text_loss": 0.17581747472286224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0007812336549540903, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5879501.0, + "repeat_count": 0.0, + "routers_loss": 0.001098626758903265, + "skip_count": 0.0, + "step": 3646, + "text_loss": 0.5040884613990784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.126797769298502, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007809776867860499, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 5882608.0, + "repeat_count": 0.0, + "routers_loss": 0.012210183776915073, + "skip_count": 1.0, + "step": 3648, + "text_loss": 0.27114811539649963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00078072161094683, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 5886106.0, + "repeat_count": 0.0, + "routers_loss": 0.005191771313548088, + "skip_count": 2.0, + "step": 3650, + "text_loss": 0.5167917609214783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0007804654275345591, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5889122.0, + "repeat_count": 0.0, + "routers_loss": 0.0016411367105320096, + "skip_count": 1.0, + "step": 3652, + "text_loss": 0.7691274285316467 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.154975051364836, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0007802091366474074, + "loss": 0.005, + "macro_f1": 0.8823530077934265, + "num_tokens": 5892313.0, + "repeat_count": 2.0, + "routers_loss": 0.015627093613147736, + "skip_count": 1.0, + "step": 3654, + "text_loss": 0.4646325409412384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007799527383835858, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5895577.0, + "repeat_count": 0.0, + "routers_loss": 0.0009879748104140162, + "skip_count": 0.0, + "step": 3656, + "text_loss": 0.5587969422340393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007796962328413469, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5898546.0, + "repeat_count": 0.0, + "routers_loss": 0.004864919930696487, + "skip_count": 0.0, + "step": 3658, + "text_loss": 0.6981375813484192 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007794396201189839, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 5901618.0, + "repeat_count": 1.0, + "routers_loss": 0.006617432460188866, + "skip_count": 2.0, + "step": 3660, + "text_loss": 0.22521957755088806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.192544760786618, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007791829003148312, + "loss": 0.0098, + "macro_f1": 0.6601307392120361, + "num_tokens": 5904540.0, + "repeat_count": 1.0, + "routers_loss": 0.0782252699136734, + "skip_count": 2.0, + "step": 3662, + "text_loss": 0.2649642825126648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0007789260735272647, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 5907827.0, + "repeat_count": 0.0, + "routers_loss": 0.0012057392159476876, + "skip_count": 0.0, + "step": 3664, + "text_loss": 0.6943771243095398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.0007786691398547005, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 5911163.0, + "repeat_count": 0.0, + "routers_loss": 0.007476957980543375, + "skip_count": 2.0, + "step": 3666, + "text_loss": 0.1502683162689209 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007784120993955962, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5913948.0, + "repeat_count": 1.0, + "routers_loss": 0.004082011990249157, + "skip_count": 0.0, + "step": 3668, + "text_loss": 0.4127517640590668 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 17.230114470208395, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007781549522484503, + "loss": 0.0066, + "macro_f1": 0.9265305995941162, + "num_tokens": 5917360.0, + "repeat_count": 3.0, + "routers_loss": 0.027505695819854736, + "skip_count": 1.0, + "step": 3670, + "text_loss": 0.23892618715763092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007778976985118018, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 5920524.0, + "repeat_count": 0.0, + "routers_loss": 0.0024977331049740314, + "skip_count": 2.0, + "step": 3672, + "text_loss": 0.5076471567153931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0007776403382842312, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5923632.0, + "repeat_count": 0.0, + "routers_loss": 0.0015700991498306394, + "skip_count": 0.0, + "step": 3674, + "text_loss": 0.6287924647331238 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.25829175227473, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.0007773828716643591, + "loss": 0.0085, + "macro_f1": 0.3272727429866791, + "num_tokens": 5926438.0, + "repeat_count": 1.0, + "routers_loss": 0.05108916014432907, + "skip_count": 0.0, + "step": 3676, + "text_loss": 0.26517006754875183 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007771252987508474, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5930081.0, + "repeat_count": 0.0, + "routers_loss": 0.003439917229115963, + "skip_count": 0.0, + "step": 3678, + "text_loss": 0.5189079642295837 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0007768676196423984, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 5933463.0, + "repeat_count": 1.0, + "routers_loss": 0.001935846172273159, + "skip_count": 1.0, + "step": 3680, + "text_loss": 0.6703575849533081 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 17.286469034341064, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007766098344377553, + "loss": 0.0082, + "macro_f1": 0.31446540355682373, + "num_tokens": 5937098.0, + "repeat_count": 0.0, + "routers_loss": 0.0384826585650444, + "skip_count": 2.0, + "step": 3682, + "text_loss": 0.6424444913864136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0007763519432357018, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 5940436.0, + "repeat_count": 0.0, + "routers_loss": 0.0008654671837575734, + "skip_count": 0.0, + "step": 3684, + "text_loss": 0.4189988672733307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0007760939461350623, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 5943731.0, + "repeat_count": 0.0, + "routers_loss": 0.007468715775758028, + "skip_count": 2.0, + "step": 3686, + "text_loss": 0.2875453233718872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007758358432347019, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5946707.0, + "repeat_count": 0.0, + "routers_loss": 0.001252831774763763, + "skip_count": 0.0, + "step": 3688, + "text_loss": 0.5093055367469788 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007755776346335259, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 5949833.0, + "repeat_count": 0.0, + "routers_loss": 0.001680848654359579, + "skip_count": 0.0, + "step": 3690, + "text_loss": 0.4031114876270294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0007753193204304807, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 5953095.0, + "repeat_count": 0.0, + "routers_loss": 0.0047258250415325165, + "skip_count": 2.0, + "step": 3692, + "text_loss": 0.17632785439491272 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007750609007245524, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 5955971.0, + "repeat_count": 2.0, + "routers_loss": 0.001980359200388193, + "skip_count": 4.0, + "step": 3694, + "text_loss": 0.3423727750778198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0007748023756147679, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 5958948.0, + "repeat_count": 0.0, + "routers_loss": 0.00511702848598361, + "skip_count": 0.0, + "step": 3696, + "text_loss": 0.28279972076416016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007745437452001949, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 5961819.0, + "repeat_count": 0.0, + "routers_loss": 0.0005220443126745522, + "skip_count": 0.0, + "step": 3698, + "text_loss": 0.4793325662612915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.371000880540066, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007742850095799408, + "loss": 0.0084, + "macro_f1": 0.3272727429866791, + "num_tokens": 5964625.0, + "repeat_count": 1.0, + "routers_loss": 0.06411020457744598, + "skip_count": 0.0, + "step": 3700, + "text_loss": 0.2825184464454651 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0007740261688531536, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 5967134.0, + "repeat_count": 0.0, + "routers_loss": 0.004408109001815319, + "skip_count": 3.0, + "step": 3702, + "text_loss": 0.690429151058197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0007737672231190215, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 5969831.0, + "repeat_count": 0.0, + "routers_loss": 0.0006747521692886949, + "skip_count": 0.0, + "step": 3704, + "text_loss": 0.32556024193763733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007735081724767732, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 5973015.0, + "repeat_count": 0.0, + "routers_loss": 0.0020414739847183228, + "skip_count": 0.0, + "step": 3706, + "text_loss": 0.5876469612121582 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0007732490170256769, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 5975778.0, + "repeat_count": 1.0, + "routers_loss": 0.005610425490885973, + "skip_count": 0.0, + "step": 3708, + "text_loss": 0.2968577444553375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007729897568650422, + "loss": 0.0097, + "macro_f1": 0.3333333432674408, + "num_tokens": 5979115.0, + "repeat_count": 0.0, + "routers_loss": 0.001248046406544745, + "skip_count": 0.0, + "step": 3710, + "text_loss": 0.626361608505249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0007727303920942176, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 5982213.0, + "repeat_count": 0.0, + "routers_loss": 0.005791695322841406, + "skip_count": 2.0, + "step": 3712, + "text_loss": 0.4133484661579132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 17.436747872028178, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08740234375, + "learning_rate": 0.0007724709228125922, + "loss": 0.0105, + "macro_f1": 0.5492662787437439, + "num_tokens": 5984930.0, + "repeat_count": 0.0, + "routers_loss": 0.02114664763212204, + "skip_count": 2.0, + "step": 3714, + "text_loss": 0.4646461308002472 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0007722113491195952, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 5988017.0, + "repeat_count": 2.0, + "routers_loss": 0.005913930479437113, + "skip_count": 5.0, + "step": 3716, + "text_loss": 0.15474505722522736 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0007719516711146957, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 5991562.0, + "repeat_count": 0.0, + "routers_loss": 0.0075925313867628574, + "skip_count": 2.0, + "step": 3718, + "text_loss": 0.5293686985969543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.000771691888897403, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 5994675.0, + "repeat_count": 0.0, + "routers_loss": 0.0012335237115621567, + "skip_count": 0.0, + "step": 3720, + "text_loss": 0.5210637450218201 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0007714320025672657, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 5999070.0, + "repeat_count": 0.0, + "routers_loss": 0.010582062415778637, + "skip_count": 2.0, + "step": 3722, + "text_loss": 0.2783571779727936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.4837100088054, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.000771172012223873, + "loss": 0.0078, + "macro_f1": 0.6598639488220215, + "num_tokens": 6002702.0, + "repeat_count": 1.0, + "routers_loss": 0.015008784830570221, + "skip_count": 3.0, + "step": 3724, + "text_loss": 0.358705073595047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007709119179668538, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6005517.0, + "repeat_count": 0.0, + "routers_loss": 0.00111615180503577, + "skip_count": 0.0, + "step": 3726, + "text_loss": 0.45202162861824036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 17.50249486351629, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007706517198958764, + "loss": 0.0096, + "macro_f1": 0.6595745086669922, + "num_tokens": 6009111.0, + "repeat_count": 1.0, + "routers_loss": 0.05215252563357353, + "skip_count": 4.0, + "step": 3728, + "text_loss": 0.20360413193702698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007703914181106497, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6012989.0, + "repeat_count": 0.0, + "routers_loss": 0.010039499960839748, + "skip_count": 3.0, + "step": 3730, + "text_loss": 0.20334361493587494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.52127971822718, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0007701310127109211, + "loss": 0.0062, + "macro_f1": 0.3272727429866791, + "num_tokens": 6016420.0, + "repeat_count": 0.0, + "routers_loss": 0.01090205181390047, + "skip_count": 1.0, + "step": 3732, + "text_loss": 0.47959551215171814 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 17.530672145582624, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007698705037964791, + "loss": 0.0076, + "macro_f1": 0.6225374937057495, + "num_tokens": 6019551.0, + "repeat_count": 0.0, + "routers_loss": 0.02677762135863304, + "skip_count": 5.0, + "step": 3734, + "text_loss": 0.2621438801288605 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.000769609891467151, + "loss": 0.0119, + "macro_f1": 0.6666666865348816, + "num_tokens": 6022262.0, + "repeat_count": 1.0, + "routers_loss": 0.00460716662928462, + "skip_count": 0.0, + "step": 3736, + "text_loss": 0.3433022201061249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0007693491758228037, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6025723.0, + "repeat_count": 0.0, + "routers_loss": 0.0036111194640398026, + "skip_count": 2.0, + "step": 3738, + "text_loss": 0.38703784346580505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007690883569633442, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6028652.0, + "repeat_count": 0.0, + "routers_loss": 0.003299296135082841, + "skip_count": 0.0, + "step": 3740, + "text_loss": 0.24203069508075714 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0007688274349887188, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 6032280.0, + "repeat_count": 0.0, + "routers_loss": 0.003173880511894822, + "skip_count": 0.0, + "step": 3742, + "text_loss": 0.2827291488647461 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0007685664099989131, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6035111.0, + "repeat_count": 0.0, + "routers_loss": 0.0008576177642680705, + "skip_count": 0.0, + "step": 3744, + "text_loss": 0.43613526225090027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007683052820939524, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6038428.0, + "repeat_count": 0.0, + "routers_loss": 0.004335585981607437, + "skip_count": 2.0, + "step": 3746, + "text_loss": 1.0385624170303345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007680440513739015, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6041185.0, + "repeat_count": 0.0, + "routers_loss": 0.0008210531086660922, + "skip_count": 0.0, + "step": 3748, + "text_loss": 0.7070431709289551 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0007677827179388646, + "loss": 0.0089, + "macro_f1": 1.0, + "num_tokens": 6046333.0, + "repeat_count": 1.0, + "routers_loss": 0.003778942162171006, + "skip_count": 1.0, + "step": 3750, + "text_loss": 0.3682238757610321 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08984375, + "learning_rate": 0.000767521281888985, + "loss": 0.009, + "macro_f1": 1.0, + "num_tokens": 6049528.0, + "repeat_count": 1.0, + "routers_loss": 0.002767334459349513, + "skip_count": 1.0, + "step": 3752, + "text_loss": 0.7619418501853943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0007672597433244455, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 6053202.0, + "repeat_count": 0.0, + "routers_loss": 0.004796457476913929, + "skip_count": 2.0, + "step": 3754, + "text_loss": 0.4157083034515381 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0007669981023454682, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 6056609.0, + "repeat_count": 0.0, + "routers_loss": 0.0013067846884950995, + "skip_count": 0.0, + "step": 3756, + "text_loss": 0.4529118537902832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007667363590523142, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 6060504.0, + "repeat_count": 0.0, + "routers_loss": 0.0010285493917763233, + "skip_count": 0.0, + "step": 3758, + "text_loss": 0.8363246321678162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0007664745135452844, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 6063526.0, + "repeat_count": 0.0, + "routers_loss": 0.006289863493293524, + "skip_count": 3.0, + "step": 3760, + "text_loss": 0.5313657522201538 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0007662125659247183, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6067147.0, + "repeat_count": 0.0, + "routers_loss": 0.0028537956532090902, + "skip_count": 0.0, + "step": 3762, + "text_loss": 0.5668109059333801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007659505162909949, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6070350.0, + "repeat_count": 0.0, + "routers_loss": 0.0026814753655344248, + "skip_count": 0.0, + "step": 3764, + "text_loss": 0.4983512759208679 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0007656883647445318, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 6073091.0, + "repeat_count": 0.0, + "routers_loss": 0.005981382913887501, + "skip_count": 1.0, + "step": 3766, + "text_loss": 0.30372318625450134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0007654261113857863, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6076244.0, + "repeat_count": 0.0, + "routers_loss": 0.000803640519734472, + "skip_count": 0.0, + "step": 3768, + "text_loss": 0.6100738048553467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0007651637563152539, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 6078936.0, + "repeat_count": 0.0, + "routers_loss": 0.0013324898900464177, + "skip_count": 0.0, + "step": 3770, + "text_loss": 0.4733821153640747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0007649012996334701, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 6081951.0, + "repeat_count": 1.0, + "routers_loss": 0.0021543330512940884, + "skip_count": 0.0, + "step": 3772, + "text_loss": 0.6794875860214233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007646387414410085, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 6085165.0, + "repeat_count": 0.0, + "routers_loss": 0.0005426189745776355, + "skip_count": 0.0, + "step": 3774, + "text_loss": 0.5886107683181763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007643760818384819, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6088370.0, + "repeat_count": 0.0, + "routers_loss": 0.002537576947361231, + "skip_count": 0.0, + "step": 3776, + "text_loss": 0.23591920733451843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007641133209265423, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6092319.0, + "repeat_count": 0.0, + "routers_loss": 0.002613696036860347, + "skip_count": 0.0, + "step": 3778, + "text_loss": 0.3217754662036896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0007638504588058796, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 6095799.0, + "repeat_count": 0.0, + "routers_loss": 0.0007219464750960469, + "skip_count": 0.0, + "step": 3780, + "text_loss": 0.4276983141899109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0007635874955772234, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6098789.0, + "repeat_count": 0.0, + "routers_loss": 0.005965052172541618, + "skip_count": 3.0, + "step": 3782, + "text_loss": 0.30936646461486816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0007633244313413417, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6101631.0, + "repeat_count": 0.0, + "routers_loss": 0.0007469559786841273, + "skip_count": 0.0, + "step": 3784, + "text_loss": 0.44460123777389526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007630612661990412, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 6105097.0, + "repeat_count": 0.0, + "routers_loss": 0.004300760570913553, + "skip_count": 1.0, + "step": 3786, + "text_loss": 0.41950157284736633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007627980002511672, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6107847.0, + "repeat_count": 0.0, + "routers_loss": 0.0023050960153341293, + "skip_count": 1.0, + "step": 3788, + "text_loss": 0.48561373353004456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007625346335986039, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6110546.0, + "repeat_count": 0.0, + "routers_loss": 0.0018124044872820377, + "skip_count": 0.0, + "step": 3790, + "text_loss": 0.20882295072078705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007622711663422735, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6113600.0, + "repeat_count": 0.0, + "routers_loss": 0.0007613401976414025, + "skip_count": 0.0, + "step": 3792, + "text_loss": 0.31751760840415955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007620075985831375, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 6116916.0, + "repeat_count": 0.0, + "routers_loss": 0.005452962126582861, + "skip_count": 2.0, + "step": 3794, + "text_loss": 0.3246645927429199 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 17.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007617439304221956, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 6120056.0, + "repeat_count": 2.0, + "routers_loss": 0.0043787881731987, + "skip_count": 0.0, + "step": 3796, + "text_loss": 0.4859195947647095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0007614801619604856, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6122668.0, + "repeat_count": 0.0, + "routers_loss": 0.0033891722559928894, + "skip_count": 0.0, + "step": 3798, + "text_loss": 0.48194369673728943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0007612162932990845, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6126792.0, + "repeat_count": 0.0, + "routers_loss": 0.001883238204754889, + "skip_count": 0.0, + "step": 3800, + "text_loss": 0.3740062117576599 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007609523245391068, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 6129801.0, + "repeat_count": 0.0, + "routers_loss": 0.00882677361369133, + "skip_count": 2.0, + "step": 3802, + "text_loss": 0.5759486556053162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007606882557817062, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6133613.0, + "repeat_count": 0.0, + "routers_loss": 0.009537030011415482, + "skip_count": 2.0, + "step": 3804, + "text_loss": 0.3217554986476898 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0007604240871280742, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6137784.0, + "repeat_count": 0.0, + "routers_loss": 0.0023913346230983734, + "skip_count": 0.0, + "step": 3806, + "text_loss": 0.3718445599079132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.878191957734078, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007601598186794407, + "loss": 0.0081, + "macro_f1": 0.6603773832321167, + "num_tokens": 6141356.0, + "repeat_count": 1.0, + "routers_loss": 0.033796411007642746, + "skip_count": 1.0, + "step": 3808, + "text_loss": 0.2717749774456024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000759895450537074, + "loss": 0.01, + "macro_f1": 0.6666666865348816, + "num_tokens": 6144448.0, + "repeat_count": 0.0, + "routers_loss": 0.0037919918540865183, + "skip_count": 2.0, + "step": 3810, + "text_loss": 0.5935076475143433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007596309828022803, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6147526.0, + "repeat_count": 0.0, + "routers_loss": 0.0008182782912626863, + "skip_count": 0.0, + "step": 3812, + "text_loss": 0.449336439371109 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0007593664155764044, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6150620.0, + "repeat_count": 1.0, + "routers_loss": 0.001734903547912836, + "skip_count": 0.0, + "step": 3814, + "text_loss": 0.6647221446037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.915761667155856, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0007591017489608286, + "loss": 0.0088, + "macro_f1": 0.3272727429866791, + "num_tokens": 6153714.0, + "repeat_count": 1.0, + "routers_loss": 0.04721754416823387, + "skip_count": 0.0, + "step": 3816, + "text_loss": 0.25481200218200684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007588369830569738, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6156974.0, + "repeat_count": 0.0, + "routers_loss": 0.0002484306460246444, + "skip_count": 0.0, + "step": 3818, + "text_loss": 0.7195295691490173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007585721179662988, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6159660.0, + "repeat_count": 0.0, + "routers_loss": 0.0051363613456487656, + "skip_count": 2.0, + "step": 3820, + "text_loss": 0.5073586702346802 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007583071537903005, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6163146.0, + "repeat_count": 0.0, + "routers_loss": 0.006719176657497883, + "skip_count": 0.0, + "step": 3822, + "text_loss": 0.6950558423995972 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0007580420906305136, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 6166257.0, + "repeat_count": 1.0, + "routers_loss": 0.00871267355978489, + "skip_count": 3.0, + "step": 3824, + "text_loss": 0.2549148201942444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0007577769285885109, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 6169624.0, + "repeat_count": 0.0, + "routers_loss": 0.0015642556827515364, + "skip_count": 0.0, + "step": 3826, + "text_loss": 0.3720305860042572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0007575116677659029, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6172673.0, + "repeat_count": 0.0, + "routers_loss": 0.0011551049537956715, + "skip_count": 0.0, + "step": 3828, + "text_loss": 0.6819429397583008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0007572463082643377, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 6175414.0, + "repeat_count": 0.0, + "routers_loss": 0.0008922060951590538, + "skip_count": 0.0, + "step": 3830, + "text_loss": 0.5424665212631226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007569808501855023, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 6178701.0, + "repeat_count": 0.0, + "routers_loss": 0.004167596809566021, + "skip_count": 1.0, + "step": 3832, + "text_loss": 0.4429764151573181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.00075671529363112, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 6183036.0, + "repeat_count": 0.0, + "routers_loss": 0.0008732969872653484, + "skip_count": 0.0, + "step": 3834, + "text_loss": 0.8015334010124207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007564496387029531, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 6186325.0, + "repeat_count": 0.0, + "routers_loss": 0.0021374202333390713, + "skip_count": 1.0, + "step": 3836, + "text_loss": 0.4233771562576294 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000756183885502801, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6189919.0, + "repeat_count": 1.0, + "routers_loss": 0.004017227329313755, + "skip_count": 0.0, + "step": 3838, + "text_loss": 0.33691394329071045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.0007559180341325005, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6193412.0, + "repeat_count": 0.0, + "routers_loss": 0.0013120946241542697, + "skip_count": 0.0, + "step": 3840, + "text_loss": 0.14970099925994873 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 18.037569709421778, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007556520846939265, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 6196588.0, + "repeat_count": 0.0, + "routers_loss": 0.011793316341936588, + "skip_count": 2.0, + "step": 3842, + "text_loss": 0.2714047133922577 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0007553860372889914, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 6200841.0, + "repeat_count": 1.0, + "routers_loss": 0.019968654960393906, + "skip_count": 4.0, + "step": 3844, + "text_loss": 0.23680976033210754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 18.05635456413267, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052490234375, + "learning_rate": 0.0007551198920196452, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 6203797.0, + "repeat_count": 0.0, + "routers_loss": 0.013615630567073822, + "skip_count": 2.0, + "step": 3846, + "text_loss": 0.25839608907699585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.000754853648987875, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6206790.0, + "repeat_count": 0.0, + "routers_loss": 0.002420815173536539, + "skip_count": 1.0, + "step": 3848, + "text_loss": 0.5358025431632996 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 18.07513941884356, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007545873082957057, + "loss": 0.0072, + "macro_f1": 0.9265305995941162, + "num_tokens": 6209791.0, + "repeat_count": 1.0, + "routers_loss": 0.018236197531223297, + "skip_count": 3.0, + "step": 3850, + "text_loss": 0.1463700383901596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0007543208700451998, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6212792.0, + "repeat_count": 0.0, + "routers_loss": 0.006242573726922274, + "skip_count": 3.0, + "step": 3852, + "text_loss": 0.9441591501235962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.093924273554446, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007540543343384565, + "loss": 0.0062, + "macro_f1": 0.3272727429866791, + "num_tokens": 6215747.0, + "repeat_count": 0.0, + "routers_loss": 0.01451140083372593, + "skip_count": 1.0, + "step": 3854, + "text_loss": 0.41610902547836304 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007537877012776132, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6218593.0, + "repeat_count": 0.0, + "routers_loss": 0.00037674361374229193, + "skip_count": 0.0, + "step": 3856, + "text_loss": 0.6048852205276489 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0007535209709648439, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 6221315.0, + "repeat_count": 1.0, + "routers_loss": 0.005776284262537956, + "skip_count": 3.0, + "step": 3858, + "text_loss": 0.35627537965774536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0007532541435023605, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 6225012.0, + "repeat_count": 0.0, + "routers_loss": 0.0009280376834794879, + "skip_count": 0.0, + "step": 3860, + "text_loss": 0.6440183520317078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0007529872189924114, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6227650.0, + "repeat_count": 0.0, + "routers_loss": 0.0009876530384644866, + "skip_count": 0.0, + "step": 3862, + "text_loss": 0.35507893562316895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.14088641033167, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0007527201975372827, + "loss": 0.0045, + "macro_f1": 0.6603773832321167, + "num_tokens": 6230557.0, + "repeat_count": 1.0, + "routers_loss": 0.013780162669718266, + "skip_count": 1.0, + "step": 3864, + "text_loss": 0.38958442211151123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007524530792392977, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 6233371.0, + "repeat_count": 0.0, + "routers_loss": 0.004849869292229414, + "skip_count": 3.0, + "step": 3866, + "text_loss": 0.3826720714569092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0007521858642008163, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6236770.0, + "repeat_count": 0.0, + "routers_loss": 0.008618295192718506, + "skip_count": 1.0, + "step": 3868, + "text_loss": 0.3596078157424927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0007519185525242363, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6239661.0, + "repeat_count": 0.0, + "routers_loss": 0.0013421972980722785, + "skip_count": 0.0, + "step": 3870, + "text_loss": 0.5585550665855408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0007516511443119916, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 6242459.0, + "repeat_count": 0.0, + "routers_loss": 0.0038009448908269405, + "skip_count": 1.0, + "step": 3872, + "text_loss": 0.4418395757675171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007513836396665534, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 6245489.0, + "repeat_count": 1.0, + "routers_loss": 0.002785376040264964, + "skip_count": 2.0, + "step": 3874, + "text_loss": 0.551510751247406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0007511160386904305, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6249014.0, + "repeat_count": 0.0, + "routers_loss": 0.0021424589212983847, + "skip_count": 1.0, + "step": 3876, + "text_loss": 1.0502676963806152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0007508483414861679, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6252357.0, + "repeat_count": 0.0, + "routers_loss": 0.0085759861394763, + "skip_count": 1.0, + "step": 3878, + "text_loss": 0.49212515354156494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007505805481563477, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6254975.0, + "repeat_count": 0.0, + "routers_loss": 0.0010723904706537724, + "skip_count": 0.0, + "step": 3880, + "text_loss": 0.7022985816001892 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0007503126588035887, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 6258001.0, + "repeat_count": 1.0, + "routers_loss": 0.012809890322387218, + "skip_count": 2.0, + "step": 3882, + "text_loss": 0.1829151213169098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007500446735305466, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 6261795.0, + "repeat_count": 0.0, + "routers_loss": 0.0026790346018970013, + "skip_count": 1.0, + "step": 3884, + "text_loss": 0.20436066389083862 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.000749776592439914, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 6265585.0, + "repeat_count": 1.0, + "routers_loss": 0.005243788007646799, + "skip_count": 2.0, + "step": 3886, + "text_loss": 0.4479229748249054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00074950841563442, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 6269039.0, + "repeat_count": 0.0, + "routers_loss": 0.007998534478247166, + "skip_count": 1.0, + "step": 3888, + "text_loss": 0.2154676914215088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0007492401432168303, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6272315.0, + "repeat_count": 0.0, + "routers_loss": 0.004648822825402021, + "skip_count": 1.0, + "step": 3890, + "text_loss": 0.3375042676925659 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.272380393307895, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007489717752899477, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 6275342.0, + "repeat_count": 0.0, + "routers_loss": 0.012154200114309788, + "skip_count": 1.0, + "step": 3892, + "text_loss": 0.1964082419872284 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.000748703311956611, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6278700.0, + "repeat_count": 1.0, + "routers_loss": 0.004610476549714804, + "skip_count": 2.0, + "step": 3894, + "text_loss": 0.26545581221580505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0007484347533196961, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 6281864.0, + "repeat_count": 0.0, + "routers_loss": 0.0075586591847240925, + "skip_count": 2.0, + "step": 3896, + "text_loss": 0.3106999397277832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0007481660994821151, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6284676.0, + "repeat_count": 0.0, + "routers_loss": 0.007845268584787846, + "skip_count": 1.0, + "step": 3898, + "text_loss": 0.4094304144382477 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007478973505468165, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 6287470.0, + "repeat_count": 1.0, + "routers_loss": 0.011116391979157925, + "skip_count": 2.0, + "step": 3900, + "text_loss": 0.1838909536600113 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007476285066167857, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 6290432.0, + "repeat_count": 1.0, + "routers_loss": 0.004599364474415779, + "skip_count": 0.0, + "step": 3902, + "text_loss": 0.25872838497161865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0007473595677950439, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 6293557.0, + "repeat_count": 0.0, + "routers_loss": 0.0016367282951250672, + "skip_count": 1.0, + "step": 3904, + "text_loss": 0.5272360444068909 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007470905341846492, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 6295979.0, + "repeat_count": 0.0, + "routers_loss": 0.0004760588926728815, + "skip_count": 0.0, + "step": 3906, + "text_loss": 0.666959822177887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007468214058886956, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6299215.0, + "repeat_count": 0.0, + "routers_loss": 0.000524883100297302, + "skip_count": 0.0, + "step": 3908, + "text_loss": 0.5144801139831543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007465521830103137, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6302320.0, + "repeat_count": 0.0, + "routers_loss": 0.0016085522947832942, + "skip_count": 0.0, + "step": 3910, + "text_loss": 0.14342890679836273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007462828656526702, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6305212.0, + "repeat_count": 0.0, + "routers_loss": 0.002720315707847476, + "skip_count": 2.0, + "step": 3912, + "text_loss": 0.31109121441841125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0007460134539189681, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 6308964.0, + "repeat_count": 0.0, + "routers_loss": 0.0010418406454846263, + "skip_count": 1.0, + "step": 3914, + "text_loss": 0.5662030577659607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0007457439479124459, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 6313195.0, + "repeat_count": 0.0, + "routers_loss": 0.0020303844939917326, + "skip_count": 0.0, + "step": 3916, + "text_loss": 0.6358339190483093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007454743477363797, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 6315949.0, + "repeat_count": 0.0, + "routers_loss": 0.0006592223653569818, + "skip_count": 0.0, + "step": 3918, + "text_loss": 0.35648423433303833 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.403874376284122, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007452046534940803, + "loss": 0.0075, + "macro_f1": 0.6603773832321167, + "num_tokens": 6319024.0, + "repeat_count": 1.0, + "routers_loss": 0.024555351585149765, + "skip_count": 1.0, + "step": 3920, + "text_loss": 0.21955153346061707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0007449348652888952, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6321633.0, + "repeat_count": 0.0, + "routers_loss": 0.003606822807341814, + "skip_count": 1.0, + "step": 3922, + "text_loss": 0.6079489588737488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007446649832242075, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 6325209.0, + "repeat_count": 0.0, + "routers_loss": 0.0035831446293741465, + "skip_count": 1.0, + "step": 3924, + "text_loss": 0.2774808406829834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0007443950074034368, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6327822.0, + "repeat_count": 0.0, + "routers_loss": 0.006809544749557972, + "skip_count": 2.0, + "step": 3926, + "text_loss": 0.48236769437789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.4414440857059, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0007441249379300381, + "loss": 0.007, + "macro_f1": 0.6601307392120361, + "num_tokens": 6331662.0, + "repeat_count": 1.0, + "routers_loss": 0.023832591250538826, + "skip_count": 2.0, + "step": 3928, + "text_loss": 0.7287537455558777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007438547749075028, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 6335801.0, + "repeat_count": 1.0, + "routers_loss": 0.011755098588764668, + "skip_count": 3.0, + "step": 3930, + "text_loss": 0.17253030836582184 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0007435845184393577, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6338747.0, + "repeat_count": 1.0, + "routers_loss": 0.005972472485154867, + "skip_count": 0.0, + "step": 3932, + "text_loss": 0.6400216817855835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007433141686291657, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 6342772.0, + "repeat_count": 0.0, + "routers_loss": 0.0030393085908144712, + "skip_count": 1.0, + "step": 3934, + "text_loss": 0.6865074038505554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0007430437255805252, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6345957.0, + "repeat_count": 0.0, + "routers_loss": 0.0006984061910770833, + "skip_count": 0.0, + "step": 3936, + "text_loss": 0.40398702025413513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0007427731893970706, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 6349162.0, + "repeat_count": 1.0, + "routers_loss": 0.005219762213528156, + "skip_count": 0.0, + "step": 3938, + "text_loss": 0.5951031446456909 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007425025601824717, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 6352655.0, + "repeat_count": 0.0, + "routers_loss": 0.015575960278511047, + "skip_count": 3.0, + "step": 3940, + "text_loss": 0.26689088344573975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007422318380404346, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6355890.0, + "repeat_count": 0.0, + "routers_loss": 0.0012208883417770267, + "skip_count": 0.0, + "step": 3942, + "text_loss": 0.570725679397583 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0007419610230746999, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6358891.0, + "repeat_count": 1.0, + "routers_loss": 0.0029412026051431894, + "skip_count": 0.0, + "step": 3944, + "text_loss": 0.5521301031112671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007416901153890448, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6361586.0, + "repeat_count": 0.0, + "routers_loss": 0.0010283910669386387, + "skip_count": 0.0, + "step": 3946, + "text_loss": 0.4046417772769928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0007414191150872818, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6364954.0, + "repeat_count": 0.0, + "routers_loss": 0.008222512900829315, + "skip_count": 2.0, + "step": 3948, + "text_loss": 0.2803446352481842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007411480222732583, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6367660.0, + "repeat_count": 0.0, + "routers_loss": 0.001304348581470549, + "skip_count": 0.0, + "step": 3950, + "text_loss": 0.45553359389305115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007408768370508576, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6371585.0, + "repeat_count": 0.0, + "routers_loss": 0.0016345062758773565, + "skip_count": 0.0, + "step": 3952, + "text_loss": 0.25424402952194214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007406055595239986, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6374365.0, + "repeat_count": 0.0, + "routers_loss": 0.0005097290268167853, + "skip_count": 0.0, + "step": 3954, + "text_loss": 0.5856026411056519 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0007403341897966356, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6377335.0, + "repeat_count": 0.0, + "routers_loss": 0.002482263371348381, + "skip_count": 1.0, + "step": 3956, + "text_loss": 0.5145615339279175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0007400627279727574, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 6380799.0, + "repeat_count": 0.0, + "routers_loss": 0.0011743451468646526, + "skip_count": 0.0, + "step": 3958, + "text_loss": 0.31868961453437805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007397911741563892, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6383963.0, + "repeat_count": 1.0, + "routers_loss": 0.009861881844699383, + "skip_count": 0.0, + "step": 3960, + "text_loss": 0.21192194521427155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007395195284515905, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 6387410.0, + "repeat_count": 1.0, + "routers_loss": 0.004189098719507456, + "skip_count": 0.0, + "step": 3962, + "text_loss": 0.5809708833694458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007392477909624567, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6390670.0, + "repeat_count": 0.0, + "routers_loss": 0.001853612600825727, + "skip_count": 0.0, + "step": 3964, + "text_loss": 0.48985618352890015 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0007389759617931182, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6393609.0, + "repeat_count": 1.0, + "routers_loss": 0.003303771372884512, + "skip_count": 0.0, + "step": 3966, + "text_loss": 0.28729453682899475 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.629292632814792, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007387040410477404, + "loss": 0.0058, + "macro_f1": 0.9452888369560242, + "num_tokens": 6396608.0, + "repeat_count": 1.0, + "routers_loss": 0.01791577786207199, + "skip_count": 4.0, + "step": 3968, + "text_loss": 0.30386820435523987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0007384320288305235, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6399793.0, + "repeat_count": 0.0, + "routers_loss": 0.0005771282012574375, + "skip_count": 0.0, + "step": 3970, + "text_loss": 0.47285011410713196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0007381599252457037, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6403365.0, + "repeat_count": 0.0, + "routers_loss": 0.003010645741596818, + "skip_count": 0.0, + "step": 3972, + "text_loss": 0.5313063859939575 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000737887730397551, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6406205.0, + "repeat_count": 1.0, + "routers_loss": 0.006457438692450523, + "skip_count": 0.0, + "step": 3974, + "text_loss": 0.2323843240737915 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007376154443903713, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6409552.0, + "repeat_count": 1.0, + "routers_loss": 0.010693981312215328, + "skip_count": 0.0, + "step": 3976, + "text_loss": 0.6304101943969727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.676254769592017, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007373430673285051, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 6412386.0, + "repeat_count": 1.0, + "routers_loss": 0.03116440214216709, + "skip_count": 0.0, + "step": 3978, + "text_loss": 0.23448467254638672 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.68564719694746, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007370705993163278, + "loss": 0.0111, + "macro_f1": 0.3272727429866791, + "num_tokens": 6416054.0, + "repeat_count": 1.0, + "routers_loss": 0.011973714455962181, + "skip_count": 0.0, + "step": 3980, + "text_loss": 0.6371755599975586 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007367980404582497, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 6419238.0, + "repeat_count": 1.0, + "routers_loss": 0.005117347463965416, + "skip_count": 2.0, + "step": 3982, + "text_loss": 0.19822923839092255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0007365253908587158, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 6422122.0, + "repeat_count": 0.0, + "routers_loss": 0.0010648667812347412, + "skip_count": 0.0, + "step": 3984, + "text_loss": 0.566700279712677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0007362526506222058, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6425313.0, + "repeat_count": 0.0, + "routers_loss": 0.005726494826376438, + "skip_count": 0.0, + "step": 3986, + "text_loss": 0.6568437814712524 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007359798198532343, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 6428422.0, + "repeat_count": 1.0, + "routers_loss": 0.004504100419580936, + "skip_count": 0.0, + "step": 3988, + "text_loss": 0.598754346370697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007357068986563509, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 6431512.0, + "repeat_count": 0.0, + "routers_loss": 0.0019837068393826485, + "skip_count": 1.0, + "step": 3990, + "text_loss": 0.7152895927429199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007354338871361393, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6434358.0, + "repeat_count": 0.0, + "routers_loss": 0.0026031541638076305, + "skip_count": 1.0, + "step": 3992, + "text_loss": 0.4986513555049896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.000735160785397218, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6438175.0, + "repeat_count": 0.0, + "routers_loss": 0.0024831905029714108, + "skip_count": 2.0, + "step": 3994, + "text_loss": 0.4406205713748932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007348875935442401, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6441228.0, + "repeat_count": 0.0, + "routers_loss": 0.0008635876583866775, + "skip_count": 0.0, + "step": 3996, + "text_loss": 0.48884135484695435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007346143116818932, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6444318.0, + "repeat_count": 0.0, + "routers_loss": 0.004007008858025074, + "skip_count": 0.0, + "step": 3998, + "text_loss": 0.6669428944587708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0007343409399148994, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6448317.0, + "repeat_count": 0.0, + "routers_loss": 0.0031380734872072935, + "skip_count": 0.0, + "step": 4000, + "text_loss": 0.6468493938446045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0007340674783480154, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 6451673.0, + "repeat_count": 0.0, + "routers_loss": 0.004996029660105705, + "skip_count": 0.0, + "step": 4002, + "text_loss": 0.28135430812835693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.798356325212797, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007337939270860323, + "loss": 0.009, + "macro_f1": 0.3272727429866791, + "num_tokens": 6456372.0, + "repeat_count": 1.0, + "routers_loss": 0.03784399852156639, + "skip_count": 0.0, + "step": 4004, + "text_loss": 0.41668644547462463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007335202862337753, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6459047.0, + "repeat_count": 0.0, + "routers_loss": 0.0011750755365937948, + "skip_count": 0.0, + "step": 4006, + "text_loss": 0.6853910684585571 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.817141179923688, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.05908203125, + "learning_rate": 0.000733246555896104, + "loss": 0.0062, + "macro_f1": 0.9452888369560242, + "num_tokens": 6462390.0, + "repeat_count": 1.0, + "routers_loss": 0.01630394533276558, + "skip_count": 4.0, + "step": 4008, + "text_loss": 0.7110592126846313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0007329727361779124, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6466057.0, + "repeat_count": 0.0, + "routers_loss": 0.0052404399029910564, + "skip_count": 2.0, + "step": 4010, + "text_loss": 0.13856995105743408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.000732698827184129, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6468878.0, + "repeat_count": 0.0, + "routers_loss": 0.002138581359758973, + "skip_count": 0.0, + "step": 4012, + "text_loss": 0.3999565839767456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000732424829019716, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6472364.0, + "repeat_count": 0.0, + "routers_loss": 0.0037466560024768114, + "skip_count": 0.0, + "step": 4014, + "text_loss": 0.28161346912384033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007321507417896699, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 6475379.0, + "repeat_count": 0.0, + "routers_loss": 0.0010469373082742095, + "skip_count": 0.0, + "step": 4016, + "text_loss": 1.0490952730178833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0007318765655990218, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 6478585.0, + "repeat_count": 0.0, + "routers_loss": 0.009968385100364685, + "skip_count": 2.0, + "step": 4018, + "text_loss": 0.31696680188179016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0007316023005528362, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 6484153.0, + "repeat_count": 0.0, + "routers_loss": 0.002349073765799403, + "skip_count": 1.0, + "step": 4020, + "text_loss": 0.30981555581092834 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.8828881714118, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0007313279467562124, + "loss": 0.0053, + "macro_f1": 0.9452888369560242, + "num_tokens": 6487029.0, + "repeat_count": 1.0, + "routers_loss": 0.011854278855025768, + "skip_count": 4.0, + "step": 4022, + "text_loss": 0.9689550399780273 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007310535043142829, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 6490315.0, + "repeat_count": 1.0, + "routers_loss": 0.00908346101641655, + "skip_count": 3.0, + "step": 4024, + "text_loss": 0.1705625057220459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0007307789733322146, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 6493921.0, + "repeat_count": 0.0, + "routers_loss": 0.0007360641611739993, + "skip_count": 0.0, + "step": 4026, + "text_loss": 0.6252996325492859 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.087890625, + "learning_rate": 0.0007305043539152083, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6496689.0, + "repeat_count": 0.0, + "routers_loss": 0.0017757206223905087, + "skip_count": 0.0, + "step": 4028, + "text_loss": 0.40533265471458435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000730229646168499, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6500090.0, + "repeat_count": 0.0, + "routers_loss": 0.0022657213266938925, + "skip_count": 0.0, + "step": 4030, + "text_loss": 0.25954708456993103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007299548501973548, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6503023.0, + "repeat_count": 0.0, + "routers_loss": 0.0021747269202023745, + "skip_count": 0.0, + "step": 4032, + "text_loss": 0.6223418712615967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 18.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0007296799661070782, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6506382.0, + "repeat_count": 0.0, + "routers_loss": 0.006400502752512693, + "skip_count": 4.0, + "step": 4034, + "text_loss": 0.6873653531074524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.94863516289991, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0007294049940030055, + "loss": 0.0065, + "macro_f1": 0.3272727429866791, + "num_tokens": 6509194.0, + "repeat_count": 0.0, + "routers_loss": 0.0197185929864645, + "skip_count": 1.0, + "step": 4036, + "text_loss": 0.16156800091266632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007291299339905059, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6512271.0, + "repeat_count": 0.0, + "routers_loss": 0.0009541353792883456, + "skip_count": 0.0, + "step": 4038, + "text_loss": 0.5038442015647888 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007288547861749838, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 6516403.0, + "repeat_count": 0.0, + "routers_loss": 0.008226391859352589, + "skip_count": 2.0, + "step": 4040, + "text_loss": 0.3706657588481903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007285795506618758, + "loss": 0.0063, + "macro_f1": 0.3272727429866791, + "num_tokens": 6519310.0, + "repeat_count": 0.0, + "routers_loss": 0.017001887783408165, + "skip_count": 1.0, + "step": 4042, + "text_loss": 0.24296723306179047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0007283042275566528, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 6521979.0, + "repeat_count": 0.0, + "routers_loss": 0.01666323095560074, + "skip_count": 2.0, + "step": 4044, + "text_loss": 0.36904850602149963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0007280288169648192, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 6524976.0, + "repeat_count": 0.0, + "routers_loss": 0.0007593175978399813, + "skip_count": 0.0, + "step": 4046, + "text_loss": 0.7312731146812439 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 19.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0007277533189919127, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 6528638.0, + "repeat_count": 1.0, + "routers_loss": 0.005652119871228933, + "skip_count": 1.0, + "step": 4048, + "text_loss": 0.23326151072978973 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007274777337435046, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6532193.0, + "repeat_count": 0.0, + "routers_loss": 0.010509157553315163, + "skip_count": 2.0, + "step": 4050, + "text_loss": 0.23918013274669647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007272020613251999, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 6534994.0, + "repeat_count": 0.0, + "routers_loss": 0.002153293928131461, + "skip_count": 0.0, + "step": 4052, + "text_loss": 0.5890526175498962 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0007269263018426367, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 6537469.0, + "repeat_count": 1.0, + "routers_loss": 0.0018494052346795797, + "skip_count": 2.0, + "step": 4054, + "text_loss": 0.36058738827705383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0007266504554014866, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6541271.0, + "repeat_count": 0.0, + "routers_loss": 0.0007579320226795971, + "skip_count": 0.0, + "step": 4056, + "text_loss": 0.4089007079601288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.051658350454947, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007263745221074545, + "loss": 0.0086, + "macro_f1": 0.6601307392120361, + "num_tokens": 6544293.0, + "repeat_count": 1.0, + "routers_loss": 0.06202420964837074, + "skip_count": 2.0, + "step": 4058, + "text_loss": 0.2226305454969406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 19.06105077781039, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007260985020662784, + "loss": 0.0049, + "macro_f1": 0.5934640765190125, + "num_tokens": 6547640.0, + "repeat_count": 0.0, + "routers_loss": 0.044639844447374344, + "skip_count": 3.0, + "step": 4060, + "text_loss": 0.23004353046417236 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.0007258223953837298, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 6550840.0, + "repeat_count": 1.0, + "routers_loss": 0.004215611144900322, + "skip_count": 0.0, + "step": 4062, + "text_loss": 0.2891770601272583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0007255462021656132, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6554122.0, + "repeat_count": 0.0, + "routers_loss": 0.0011056234361603856, + "skip_count": 0.0, + "step": 4064, + "text_loss": 0.7485370635986328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007252699225177666, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6557138.0, + "repeat_count": 0.0, + "routers_loss": 0.008258933201432228, + "skip_count": 2.0, + "step": 4066, + "text_loss": 0.25219282507896423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007249935565460606, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6560654.0, + "repeat_count": 0.0, + "routers_loss": 0.005102175287902355, + "skip_count": 0.0, + "step": 4068, + "text_loss": 0.5553314089775085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007247171043563994, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6563814.0, + "repeat_count": 0.0, + "routers_loss": 0.01283820066601038, + "skip_count": 2.0, + "step": 4070, + "text_loss": 0.15729956328868866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0007244405660547199, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6567060.0, + "repeat_count": 0.0, + "routers_loss": 0.0009684927063062787, + "skip_count": 0.0, + "step": 4072, + "text_loss": 0.3725031912326813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01953125, + "learning_rate": 0.000724163941746992, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6571608.0, + "repeat_count": 0.0, + "routers_loss": 0.0007890827837400138, + "skip_count": 0.0, + "step": 4074, + "text_loss": 0.8438301682472229 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 19.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0007238872315392189, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 6575214.0, + "repeat_count": 1.0, + "routers_loss": 0.0040600355714559555, + "skip_count": 1.0, + "step": 4076, + "text_loss": 0.5923112034797668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0007236104355374363, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 6578383.0, + "repeat_count": 0.0, + "routers_loss": 0.0024899677373468876, + "skip_count": 2.0, + "step": 4078, + "text_loss": 0.20302526652812958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.000723333553847713, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6582175.0, + "repeat_count": 0.0, + "routers_loss": 0.006120906211435795, + "skip_count": 2.0, + "step": 4080, + "text_loss": 0.5400223731994629 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0007230565865761504, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 6585516.0, + "repeat_count": 0.0, + "routers_loss": 0.0029941233806312084, + "skip_count": 0.0, + "step": 4082, + "text_loss": 0.19460804760456085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0007227795338288831, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 6588266.0, + "repeat_count": 0.0, + "routers_loss": 0.009357884526252747, + "skip_count": 2.0, + "step": 4084, + "text_loss": 0.35237613320350647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007225023957120782, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 6591009.0, + "repeat_count": 0.0, + "routers_loss": 0.0023083325941115618, + "skip_count": 2.0, + "step": 4086, + "text_loss": 0.4336731433868408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0007222251723319356, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 6594472.0, + "repeat_count": 0.0, + "routers_loss": 0.0008416616474278271, + "skip_count": 0.0, + "step": 4088, + "text_loss": 0.6390535831451416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0007219478637946877, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6597477.0, + "repeat_count": 0.0, + "routers_loss": 0.004390760324895382, + "skip_count": 1.0, + "step": 4090, + "text_loss": 0.525839626789093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0007216704702065997, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 6600431.0, + "repeat_count": 0.0, + "routers_loss": 0.0010311100631952286, + "skip_count": 0.0, + "step": 4092, + "text_loss": 0.5310423374176025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0007213929916739695, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 6603899.0, + "repeat_count": 0.0, + "routers_loss": 0.0032497600186616182, + "skip_count": 1.0, + "step": 4094, + "text_loss": 0.2775326073169708 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000721115428303127, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 6606544.0, + "repeat_count": 1.0, + "routers_loss": 0.004692315589636564, + "skip_count": 3.0, + "step": 4096, + "text_loss": 0.6667124032974243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007208377802004353, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6610097.0, + "repeat_count": 0.0, + "routers_loss": 0.0007263485458679497, + "skip_count": 0.0, + "step": 4098, + "text_loss": 0.6916406750679016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007205600474722897, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6613836.0, + "repeat_count": 0.0, + "routers_loss": 0.0017989488551393151, + "skip_count": 0.0, + "step": 4100, + "text_loss": 0.5257929563522339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000720282230225118, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6616780.0, + "repeat_count": 0.0, + "routers_loss": 0.0011308686807751656, + "skip_count": 1.0, + "step": 4102, + "text_loss": 0.4410906732082367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0007200043285653799, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6620110.0, + "repeat_count": 0.0, + "routers_loss": 0.002058265497907996, + "skip_count": 2.0, + "step": 4104, + "text_loss": 0.8581191897392273 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007197263425995681, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 6622585.0, + "repeat_count": 1.0, + "routers_loss": 0.0017528717871755362, + "skip_count": 0.0, + "step": 4106, + "text_loss": 0.5000449419021606 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0007194482724342075, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6626356.0, + "repeat_count": 0.0, + "routers_loss": 0.0021995846182107925, + "skip_count": 0.0, + "step": 4108, + "text_loss": 0.401346892118454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007191701181758547, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6629738.0, + "repeat_count": 0.0, + "routers_loss": 0.0014869922306388617, + "skip_count": 0.0, + "step": 4110, + "text_loss": 0.9598422050476074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0007188918799310993, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 6632807.0, + "repeat_count": 0.0, + "routers_loss": 0.0012853415682911873, + "skip_count": 0.0, + "step": 4112, + "text_loss": 0.3996548354625702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0007186135578065627, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6636227.0, + "repeat_count": 0.0, + "routers_loss": 0.0009887361666187644, + "skip_count": 0.0, + "step": 4114, + "text_loss": 0.4127283990383148 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007183351519088982, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6639443.0, + "repeat_count": 0.0, + "routers_loss": 0.006282114889472723, + "skip_count": 1.0, + "step": 4116, + "text_loss": 0.20028606057167053 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.333431171118285, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0007180566623447917, + "loss": 0.0114, + "macro_f1": 0.6603773832321167, + "num_tokens": 6642127.0, + "repeat_count": 1.0, + "routers_loss": 0.008101986721158028, + "skip_count": 0.0, + "step": 4118, + "text_loss": 0.763931155204773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0007177780892209607, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6645376.0, + "repeat_count": 0.0, + "routers_loss": 0.001953610684722662, + "skip_count": 0.0, + "step": 4120, + "text_loss": 0.42317715287208557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007174994326441551, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6648150.0, + "repeat_count": 0.0, + "routers_loss": 0.003279355587437749, + "skip_count": 0.0, + "step": 4122, + "text_loss": 0.19656142592430115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007172206927211567, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6650935.0, + "repeat_count": 0.0, + "routers_loss": 0.0032076311763375998, + "skip_count": 0.0, + "step": 4124, + "text_loss": 0.13608409464359283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0007169418695587791, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6654464.0, + "repeat_count": 0.0, + "routers_loss": 0.004065621178597212, + "skip_count": 2.0, + "step": 4126, + "text_loss": 0.4882086217403412 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007166629632638678, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6657749.0, + "repeat_count": 0.0, + "routers_loss": 0.0009243001695722342, + "skip_count": 0.0, + "step": 4128, + "text_loss": 0.31632331013679504 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0007163839739433003, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6660997.0, + "repeat_count": 0.0, + "routers_loss": 0.0018459554994478822, + "skip_count": 0.0, + "step": 4130, + "text_loss": 0.6123947501182556 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.399178162606397, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0007161049017039857, + "loss": 0.0073, + "macro_f1": 0.8820862174034119, + "num_tokens": 6663542.0, + "repeat_count": 2.0, + "routers_loss": 0.030032536014914513, + "skip_count": 2.0, + "step": 4132, + "text_loss": 0.6985659003257751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0007158257466528652, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6666178.0, + "repeat_count": 0.0, + "routers_loss": 0.0013813833938911557, + "skip_count": 0.0, + "step": 4134, + "text_loss": 0.38380664587020874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0007155465088969114, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 6668852.0, + "repeat_count": 0.0, + "routers_loss": 0.00513424864038825, + "skip_count": 3.0, + "step": 4136, + "text_loss": 0.49724283814430237 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0007152671885431288, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 6671430.0, + "repeat_count": 0.0, + "routers_loss": 0.0005165594047866762, + "skip_count": 0.0, + "step": 4138, + "text_loss": 0.666959822177887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0007149877856985535, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6675215.0, + "repeat_count": 0.0, + "routers_loss": 0.001685218419879675, + "skip_count": 0.0, + "step": 4140, + "text_loss": 0.3127259612083435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.000714708300470253, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6678505.0, + "repeat_count": 0.0, + "routers_loss": 0.004025314934551716, + "skip_count": 0.0, + "step": 4142, + "text_loss": 0.3179470896720886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007144287329653269, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 6681127.0, + "repeat_count": 1.0, + "routers_loss": 0.005965690594166517, + "skip_count": 0.0, + "step": 4144, + "text_loss": 0.3862907886505127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.464925154094512, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007141490832909058, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 6683968.0, + "repeat_count": 0.0, + "routers_loss": 0.012896374799311161, + "skip_count": 1.0, + "step": 4146, + "text_loss": 0.48156118392944336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007138693515541519, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6687196.0, + "repeat_count": 0.0, + "routers_loss": 0.0006367767928168178, + "skip_count": 1.0, + "step": 4148, + "text_loss": 0.676702082157135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0007135895378622592, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 6689972.0, + "repeat_count": 0.0, + "routers_loss": 0.004532640799880028, + "skip_count": 3.0, + "step": 4150, + "text_loss": 0.5865558981895447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.493102436160846, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007133096423224526, + "loss": 0.0081, + "macro_f1": 0.3272727429866791, + "num_tokens": 6693568.0, + "repeat_count": 1.0, + "routers_loss": 0.0377078577876091, + "skip_count": 0.0, + "step": 4152, + "text_loss": 0.2790502607822418 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0007130296650419885, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6696468.0, + "repeat_count": 0.0, + "routers_loss": 0.004455826710909605, + "skip_count": 1.0, + "step": 4154, + "text_loss": 0.5869500041007996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0007127496061281551, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6699307.0, + "repeat_count": 0.0, + "routers_loss": 0.001998464809730649, + "skip_count": 0.0, + "step": 4156, + "text_loss": 0.6931945085525513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 19.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007124694656882713, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 6702647.0, + "repeat_count": 3.0, + "routers_loss": 0.004117495380342007, + "skip_count": 0.0, + "step": 4158, + "text_loss": 0.4325876832008362 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0007121892438296874, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6705964.0, + "repeat_count": 0.0, + "routers_loss": 0.0014713290147483349, + "skip_count": 0.0, + "step": 4160, + "text_loss": 0.3672060966491699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007119089406597849, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6710182.0, + "repeat_count": 0.0, + "routers_loss": 0.0037311650812625885, + "skip_count": 1.0, + "step": 4162, + "text_loss": 0.6643805503845215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007116285562859767, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6713410.0, + "repeat_count": 0.0, + "routers_loss": 0.006017287727445364, + "skip_count": 0.0, + "step": 4164, + "text_loss": 0.4606415927410126 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.55884942764896, + "f1_execute": 0.9545454382896423, + "f1_repeat": 0.5, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007113480908157065, + "loss": 0.0108, + "macro_f1": 0.8181818723678589, + "num_tokens": 6716056.0, + "repeat_count": 3.0, + "routers_loss": 0.08640352636575699, + "skip_count": 4.0, + "step": 4166, + "text_loss": 0.3139408528804779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0007110675443564491, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6719497.0, + "repeat_count": 0.0, + "routers_loss": 0.0012731150491163135, + "skip_count": 0.0, + "step": 4168, + "text_loss": 0.7283861637115479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007107869170157108, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 6722297.0, + "repeat_count": 0.0, + "routers_loss": 0.0021509863436222076, + "skip_count": 2.0, + "step": 4170, + "text_loss": 0.5767703056335449 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.000710506208901028, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6725762.0, + "repeat_count": 0.0, + "routers_loss": 0.00257494836114347, + "skip_count": 1.0, + "step": 4172, + "text_loss": 0.33571913838386536 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.000710225420119969, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 6728436.0, + "repeat_count": 1.0, + "routers_loss": 0.00943201594054699, + "skip_count": 3.0, + "step": 4174, + "text_loss": 0.6849368810653687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0007099445507801323, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6731427.0, + "repeat_count": 0.0, + "routers_loss": 0.01046718005090952, + "skip_count": 2.0, + "step": 4176, + "text_loss": 0.3346157670021057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007096636009891477, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6734800.0, + "repeat_count": 0.0, + "routers_loss": 0.0007813365664333105, + "skip_count": 0.0, + "step": 4178, + "text_loss": 0.49989959597587585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.000709382570854676, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6738244.0, + "repeat_count": 0.0, + "routers_loss": 0.002825600327923894, + "skip_count": 0.0, + "step": 4180, + "text_loss": 0.15744923055171967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007091014604844078, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6741695.0, + "repeat_count": 0.0, + "routers_loss": 0.0017124463338404894, + "skip_count": 0.0, + "step": 4182, + "text_loss": 0.3752405643463135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0007088202699860655, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 6744882.0, + "repeat_count": 1.0, + "routers_loss": 0.005134924780577421, + "skip_count": 3.0, + "step": 4184, + "text_loss": 0.18534569442272186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.000708538999467402, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6747811.0, + "repeat_count": 0.0, + "routers_loss": 0.002371585462242365, + "skip_count": 1.0, + "step": 4186, + "text_loss": 0.6251029968261719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007082576490362004, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6750765.0, + "repeat_count": 0.0, + "routers_loss": 0.002088436856865883, + "skip_count": 0.0, + "step": 4188, + "text_loss": 0.35471436381340027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.000707976218800275, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6754021.0, + "repeat_count": 0.0, + "routers_loss": 0.0012272283202037215, + "skip_count": 0.0, + "step": 4190, + "text_loss": 0.5737302899360657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0007076947088674701, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6756793.0, + "repeat_count": 0.0, + "routers_loss": 0.0026050808373838663, + "skip_count": 0.0, + "step": 4192, + "text_loss": 0.526336669921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.000707413119345661, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 6760221.0, + "repeat_count": 0.0, + "routers_loss": 0.0013151296880096197, + "skip_count": 0.0, + "step": 4194, + "text_loss": 0.5678895711898804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0007071314503427532, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6763721.0, + "repeat_count": 0.0, + "routers_loss": 0.001528652966953814, + "skip_count": 0.0, + "step": 4196, + "text_loss": 0.7640175223350525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0007068497019666829, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6768581.0, + "repeat_count": 0.0, + "routers_loss": 0.0019202446565032005, + "skip_count": 0.0, + "step": 4198, + "text_loss": 0.41878414154052734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0007065678743254167, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6772758.0, + "repeat_count": 0.0, + "routers_loss": 0.004667408298701048, + "skip_count": 1.0, + "step": 4200, + "text_loss": 0.3550313413143158 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 19.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0007062859675269513, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6776671.0, + "repeat_count": 3.0, + "routers_loss": 0.00568761583417654, + "skip_count": 0.0, + "step": 4202, + "text_loss": 0.1707649976015091 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007060039816793141, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6780284.0, + "repeat_count": 0.0, + "routers_loss": 0.0030401297844946384, + "skip_count": 0.0, + "step": 4204, + "text_loss": 0.2686377167701721 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 19.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007057219168905625, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 6783525.0, + "repeat_count": 1.0, + "routers_loss": 0.003353122156113386, + "skip_count": 5.0, + "step": 4206, + "text_loss": 0.5235374569892883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.000705439773268784, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6787691.0, + "repeat_count": 0.0, + "routers_loss": 0.0016532237641513348, + "skip_count": 1.0, + "step": 4208, + "text_loss": 0.5002681612968445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007051575509220972, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 6790833.0, + "repeat_count": 0.0, + "routers_loss": 0.0011808308772742748, + "skip_count": 0.0, + "step": 4210, + "text_loss": 0.7251001596450806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0007048752499586497, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 6794260.0, + "repeat_count": 0.0, + "routers_loss": 0.006246297620236874, + "skip_count": 2.0, + "step": 4212, + "text_loss": 0.2430499643087387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.00070459287048662, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6797413.0, + "repeat_count": 0.0, + "routers_loss": 0.0012964420020580292, + "skip_count": 0.0, + "step": 4214, + "text_loss": 0.48889362812042236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0007043104126142163, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6800815.0, + "repeat_count": 0.0, + "routers_loss": 0.0018109704833477736, + "skip_count": 0.0, + "step": 4216, + "text_loss": 0.5617026686668396 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 19.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0007040278764496771, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 6803937.0, + "repeat_count": 2.0, + "routers_loss": 0.0028699536342173815, + "skip_count": 1.0, + "step": 4218, + "text_loss": 0.548405647277832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007037452621012708, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6806946.0, + "repeat_count": 0.0, + "routers_loss": 0.0007951617590151727, + "skip_count": 0.0, + "step": 4220, + "text_loss": 0.5702725648880005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0007034625696772958, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6810083.0, + "repeat_count": 0.0, + "routers_loss": 0.003436052706092596, + "skip_count": 2.0, + "step": 4222, + "text_loss": 0.3898725211620331 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00070317979928608, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6812845.0, + "repeat_count": 0.0, + "routers_loss": 0.0005070401239208877, + "skip_count": 0.0, + "step": 4224, + "text_loss": 0.5244157910346985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.840622248312297, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000702896951035982, + "loss": 0.0101, + "macro_f1": 0.3272727429866791, + "num_tokens": 6815801.0, + "repeat_count": 0.0, + "routers_loss": 0.01560303382575512, + "skip_count": 1.0, + "step": 4226, + "text_loss": 0.26503118872642517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007026140250353896, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 6819464.0, + "repeat_count": 0.0, + "routers_loss": 0.009310240857303143, + "skip_count": 2.0, + "step": 4228, + "text_loss": 0.15597499907016754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0007023310213927208, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6822657.0, + "repeat_count": 0.0, + "routers_loss": 0.005309136584401131, + "skip_count": 0.0, + "step": 4230, + "text_loss": 0.5271651148796082 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046875, + "learning_rate": 0.0007020479402164226, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 6825661.0, + "repeat_count": 0.0, + "routers_loss": 0.005936166271567345, + "skip_count": 2.0, + "step": 4232, + "text_loss": 0.6105108857154846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007017647816149727, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6828688.0, + "repeat_count": 0.0, + "routers_loss": 0.001653556595556438, + "skip_count": 0.0, + "step": 4234, + "text_loss": 0.6966437101364136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.000701481545696878, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 6831850.0, + "repeat_count": 0.0, + "routers_loss": 0.0013501866487786174, + "skip_count": 0.0, + "step": 4236, + "text_loss": 1.259678840637207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0007011982325706747, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6834862.0, + "repeat_count": 0.0, + "routers_loss": 0.008970130234956741, + "skip_count": 1.0, + "step": 4238, + "text_loss": 0.24906545877456665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007009148423449292, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6838148.0, + "repeat_count": 0.0, + "routers_loss": 0.0026013399474322796, + "skip_count": 0.0, + "step": 4240, + "text_loss": 0.291467547416687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.915761667155856, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0007006313751282371, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 6841142.0, + "repeat_count": 0.0, + "routers_loss": 0.021415632218122482, + "skip_count": 1.0, + "step": 4242, + "text_loss": 0.507606029510498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007003478310292236, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6844042.0, + "repeat_count": 0.0, + "routers_loss": 0.0023636550176888704, + "skip_count": 0.0, + "step": 4244, + "text_loss": 0.11626995354890823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.934546521866746, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0007000642101565433, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 6847359.0, + "repeat_count": 1.0, + "routers_loss": 0.025154776871204376, + "skip_count": 0.0, + "step": 4246, + "text_loss": 0.42898693680763245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0006997805126188803, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6850443.0, + "repeat_count": 0.0, + "routers_loss": 0.00540317315608263, + "skip_count": 0.0, + "step": 4248, + "text_loss": 0.18085283041000366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000699496738524948, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 6853495.0, + "repeat_count": 0.0, + "routers_loss": 0.0014433214673772454, + "skip_count": 0.0, + "step": 4250, + "text_loss": 0.5524004697799683 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006992128879834891, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 6856774.0, + "repeat_count": 1.0, + "routers_loss": 0.013381492346525192, + "skip_count": 3.0, + "step": 4252, + "text_loss": 0.19605717062950134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006989289611032758, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 6860313.0, + "repeat_count": 0.0, + "routers_loss": 0.007140172645449638, + "skip_count": 1.0, + "step": 4254, + "text_loss": 0.3182447552680969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006986449579931091, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6863683.0, + "repeat_count": 0.0, + "routers_loss": 0.006486213766038418, + "skip_count": 1.0, + "step": 4256, + "text_loss": 0.19250160455703735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006983608787618201, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6867609.0, + "repeat_count": 0.0, + "routers_loss": 0.001465818495489657, + "skip_count": 0.0, + "step": 4258, + "text_loss": 0.5912898182868958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.000698076723518268, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6870040.0, + "repeat_count": 0.0, + "routers_loss": 0.0031106441747397184, + "skip_count": 0.0, + "step": 4260, + "text_loss": 0.13542121648788452 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006977924923713418, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6873441.0, + "repeat_count": 0.0, + "routers_loss": 0.0005377951893024147, + "skip_count": 0.0, + "step": 4262, + "text_loss": 0.352464497089386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006975081854299594, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 6876637.0, + "repeat_count": 0.0, + "routers_loss": 0.007052485831081867, + "skip_count": 0.0, + "step": 4264, + "text_loss": 0.5023844242095947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0006972238028030678, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6879928.0, + "repeat_count": 0.0, + "routers_loss": 0.0013608322478830814, + "skip_count": 0.0, + "step": 4266, + "text_loss": 0.8664718270301819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0006969393445996429, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6883425.0, + "repeat_count": 0.0, + "routers_loss": 0.0007607188890688121, + "skip_count": 0.0, + "step": 4268, + "text_loss": 0.5131992101669312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006966548109286897, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6886790.0, + "repeat_count": 0.0, + "routers_loss": 0.00035804163780994713, + "skip_count": 0.0, + "step": 4270, + "text_loss": 0.5352054834365845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.000696370201899242, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6889747.0, + "repeat_count": 0.0, + "routers_loss": 0.004451376851648092, + "skip_count": 1.0, + "step": 4272, + "text_loss": 0.47865036129951477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006960855176203623, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6892604.0, + "repeat_count": 0.0, + "routers_loss": 0.0015342880506068468, + "skip_count": 0.0, + "step": 4274, + "text_loss": 0.36278650164604187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0006958007582011425, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6895563.0, + "repeat_count": 0.0, + "routers_loss": 0.0022974940948188305, + "skip_count": 2.0, + "step": 4276, + "text_loss": 0.6695618629455566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006955159237507027, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6898591.0, + "repeat_count": 0.0, + "routers_loss": 0.00859096460044384, + "skip_count": 1.0, + "step": 4278, + "text_loss": 0.44284722208976746 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0006952310143781921, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6903119.0, + "repeat_count": 1.0, + "routers_loss": 0.007919861935079098, + "skip_count": 3.0, + "step": 4280, + "text_loss": 0.5006136298179626 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0006949460301927886, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6906394.0, + "repeat_count": 0.0, + "routers_loss": 0.0008476210059598088, + "skip_count": 0.0, + "step": 4282, + "text_loss": 0.8153555989265442 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.0006946609713036985, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 6909136.0, + "repeat_count": 0.0, + "routers_loss": 0.006711610127240419, + "skip_count": 2.0, + "step": 4284, + "text_loss": 0.43136683106422424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0006943758378201571, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 6912734.0, + "repeat_count": 0.0, + "routers_loss": 0.0038677838165313005, + "skip_count": 0.0, + "step": 4286, + "text_loss": 0.2693749964237213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0006940906298514278, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6915838.0, + "repeat_count": 0.0, + "routers_loss": 0.0012188015971332788, + "skip_count": 0.0, + "step": 4288, + "text_loss": 0.5809219479560852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0006938053475068031, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6919225.0, + "repeat_count": 0.0, + "routers_loss": 0.001955829095095396, + "skip_count": 0.0, + "step": 4290, + "text_loss": 0.5116089582443237 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006935199908956037, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 6922495.0, + "repeat_count": 1.0, + "routers_loss": 0.0035709093790501356, + "skip_count": 0.0, + "step": 4292, + "text_loss": 0.2745901644229889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0006932345601271786, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 6925317.0, + "repeat_count": 0.0, + "routers_loss": 0.0005745319649577141, + "skip_count": 0.0, + "step": 4294, + "text_loss": 0.6039219498634338 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 20.169063692398005, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0006929490553109056, + "loss": 0.0107, + "macro_f1": 0.9247862696647644, + "num_tokens": 6928054.0, + "repeat_count": 3.0, + "routers_loss": 0.061689916998147964, + "skip_count": 6.0, + "step": 4296, + "text_loss": 0.3904837667942047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006926634765561907, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 6931348.0, + "repeat_count": 0.0, + "routers_loss": 0.002007248578593135, + "skip_count": 0.0, + "step": 4298, + "text_loss": 0.5170742273330688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000692377823972468, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6934411.0, + "repeat_count": 0.0, + "routers_loss": 0.0005786226247437298, + "skip_count": 0.0, + "step": 4300, + "text_loss": 0.8032443523406982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.19724097446434, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006920920976692004, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 6938153.0, + "repeat_count": 1.0, + "routers_loss": 0.024602646008133888, + "skip_count": 0.0, + "step": 4302, + "text_loss": 0.446534663438797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006918062977558784, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6940731.0, + "repeat_count": 0.0, + "routers_loss": 0.005759815219789743, + "skip_count": 2.0, + "step": 4304, + "text_loss": 0.15479247272014618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006915204243420214, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6943246.0, + "repeat_count": 0.0, + "routers_loss": 0.005315347574651241, + "skip_count": 1.0, + "step": 4306, + "text_loss": 0.22127842903137207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006912344775371765, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6947197.0, + "repeat_count": 0.0, + "routers_loss": 0.0012061651796102524, + "skip_count": 0.0, + "step": 4308, + "text_loss": 0.7058854103088379 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006909484574509191, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6951817.0, + "repeat_count": 0.0, + "routers_loss": 0.0029203309677541256, + "skip_count": 0.0, + "step": 4310, + "text_loss": 0.6014000773429871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0006906623641928525, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6955094.0, + "repeat_count": 0.0, + "routers_loss": 0.005703397560864687, + "skip_count": 2.0, + "step": 4312, + "text_loss": 0.5923848152160645 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0006903761978726084, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 6958127.0, + "repeat_count": 1.0, + "routers_loss": 0.004489895887672901, + "skip_count": 2.0, + "step": 4314, + "text_loss": 0.36911651492118835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.000690089958599846, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 6960871.0, + "repeat_count": 0.0, + "routers_loss": 0.003871412482112646, + "skip_count": 2.0, + "step": 4316, + "text_loss": 0.442545086145401 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.000689803646484253, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6963980.0, + "repeat_count": 1.0, + "routers_loss": 0.008667866699397564, + "skip_count": 2.0, + "step": 4318, + "text_loss": 0.1987489014863968 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0006895172616355446, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6967132.0, + "repeat_count": 1.0, + "routers_loss": 0.00843339879065752, + "skip_count": 0.0, + "step": 4320, + "text_loss": 0.48267918825149536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006892308041634639, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6969971.0, + "repeat_count": 0.0, + "routers_loss": 0.0004312851815484464, + "skip_count": 0.0, + "step": 4322, + "text_loss": 0.3662732243537903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006889442741777822, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6973114.0, + "repeat_count": 0.0, + "routers_loss": 0.004588035400956869, + "skip_count": 3.0, + "step": 4324, + "text_loss": 0.6707104444503784 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.309950102729672, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0006886576717882982, + "loss": 0.0057, + "macro_f1": 0.8817967176437378, + "num_tokens": 6976013.0, + "repeat_count": 2.0, + "routers_loss": 0.0687296912074089, + "skip_count": 3.0, + "step": 4326, + "text_loss": 0.1662217676639557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006883709971048384, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6979200.0, + "repeat_count": 0.0, + "routers_loss": 0.002950174268335104, + "skip_count": 0.0, + "step": 4328, + "text_loss": 0.21168152987957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006880842502372572, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6982640.0, + "repeat_count": 0.0, + "routers_loss": 0.0032158740796148777, + "skip_count": 0.0, + "step": 4330, + "text_loss": 0.26790961623191833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0006877974312954365, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6985917.0, + "repeat_count": 0.0, + "routers_loss": 0.0005083635332994163, + "skip_count": 0.0, + "step": 4332, + "text_loss": 0.9736502170562744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.347519812151454, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.000687510540389286, + "loss": 0.0053, + "macro_f1": 0.32098764181137085, + "num_tokens": 6988388.0, + "repeat_count": 0.0, + "routers_loss": 0.03473830223083496, + "skip_count": 2.0, + "step": 4334, + "text_loss": 0.21662230789661407 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006872235776287425, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6991360.0, + "repeat_count": 0.0, + "routers_loss": 0.002206524135544896, + "skip_count": 0.0, + "step": 4336, + "text_loss": 0.6026972532272339 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0006869365431237711, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6995080.0, + "repeat_count": 1.0, + "routers_loss": 0.000969731598161161, + "skip_count": 0.0, + "step": 4338, + "text_loss": 0.5833017230033875 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.375697094217788, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006866494369843635, + "loss": 0.0054, + "macro_f1": 0.8820862174034119, + "num_tokens": 6998526.0, + "repeat_count": 2.0, + "routers_loss": 0.013962293043732643, + "skip_count": 2.0, + "step": 4340, + "text_loss": 0.41465985774993896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0006863622593205397, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 7001494.0, + "repeat_count": 0.0, + "routers_loss": 0.0064964210614562035, + "skip_count": 3.0, + "step": 4342, + "text_loss": 0.3774271011352539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 20.394481948928675, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006860750102423464, + "loss": 0.0062, + "macro_f1": 0.6589147448539734, + "num_tokens": 7005544.0, + "repeat_count": 1.0, + "routers_loss": 0.023250726982951164, + "skip_count": 6.0, + "step": 4344, + "text_loss": 0.2732464373111725 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0006857876898598582, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 7008847.0, + "repeat_count": 0.0, + "routers_loss": 0.0038170060142874718, + "skip_count": 2.0, + "step": 4346, + "text_loss": 0.29610875248908997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0006855002982831769, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7012577.0, + "repeat_count": 0.0, + "routers_loss": 0.0012856025714427233, + "skip_count": 0.0, + "step": 4348, + "text_loss": 0.6098502278327942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0006852128356224314, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7015650.0, + "repeat_count": 0.0, + "routers_loss": 0.008162742480635643, + "skip_count": 1.0, + "step": 4350, + "text_loss": 0.20868146419525146 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.432051658350456, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.0006849253019877778, + "loss": 0.0074, + "macro_f1": 0.8817967176437378, + "num_tokens": 7019925.0, + "repeat_count": 2.0, + "routers_loss": 0.023544032126665115, + "skip_count": 3.0, + "step": 4352, + "text_loss": 0.628226101398468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0006846376974893996, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 7023130.0, + "repeat_count": 0.0, + "routers_loss": 0.004982319660484791, + "skip_count": 2.0, + "step": 4354, + "text_loss": 0.7037544250488281 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0006843500222375074, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7026422.0, + "repeat_count": 1.0, + "routers_loss": 0.004015266429632902, + "skip_count": 0.0, + "step": 4356, + "text_loss": 0.22352729737758636 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 20.46022894041679, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006840622763423391, + "loss": 0.0071, + "macro_f1": 0.9449735879898071, + "num_tokens": 7029077.0, + "repeat_count": 2.0, + "routers_loss": 0.021162014454603195, + "skip_count": 4.0, + "step": 4358, + "text_loss": 0.2431403249502182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006837744599141591, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7032582.0, + "repeat_count": 0.0, + "routers_loss": 0.0007044129306450486, + "skip_count": 0.0, + "step": 4360, + "text_loss": 0.26667487621307373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0006834865730632594, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7035642.0, + "repeat_count": 0.0, + "routers_loss": 0.0067853196524083614, + "skip_count": 1.0, + "step": 4362, + "text_loss": 0.20965275168418884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006831986158999588, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7038601.0, + "repeat_count": 0.0, + "routers_loss": 0.00899333506822586, + "skip_count": 2.0, + "step": 4364, + "text_loss": 0.26860126852989197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.000682910588534603, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7042274.0, + "repeat_count": 0.0, + "routers_loss": 0.0019194348715245724, + "skip_count": 0.0, + "step": 4366, + "text_loss": 0.14046810567378998 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0006826224910775647, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 7045268.0, + "repeat_count": 1.0, + "routers_loss": 0.006915684789419174, + "skip_count": 3.0, + "step": 4368, + "text_loss": 0.5900366306304932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0006823343236392432, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7049407.0, + "repeat_count": 0.0, + "routers_loss": 0.001678116386756301, + "skip_count": 0.0, + "step": 4370, + "text_loss": 0.7868026494979858 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.000682046086330065, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7052783.0, + "repeat_count": 0.0, + "routers_loss": 0.0003459530707914382, + "skip_count": 0.0, + "step": 4372, + "text_loss": 0.6349637508392334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0006817577792604831, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7055757.0, + "repeat_count": 0.0, + "routers_loss": 0.0011729507241398096, + "skip_count": 0.0, + "step": 4374, + "text_loss": 0.43258991837501526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0006814694025409773, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 7058684.0, + "repeat_count": 0.0, + "routers_loss": 0.0006664610700681806, + "skip_count": 0.0, + "step": 4376, + "text_loss": 0.5307940244674683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.091796875, + "learning_rate": 0.0006811809562820542, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 7061902.0, + "repeat_count": 0.0, + "routers_loss": 0.004595907870680094, + "skip_count": 2.0, + "step": 4378, + "text_loss": 0.5830042362213135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0006808924405942467, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7065100.0, + "repeat_count": 0.0, + "routers_loss": 0.0032026609405875206, + "skip_count": 0.0, + "step": 4380, + "text_loss": 0.20797798037528992 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0006806038555881148, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 7068556.0, + "repeat_count": 1.0, + "routers_loss": 0.0024626904632896185, + "skip_count": 0.0, + "step": 4382, + "text_loss": 0.5791074633598328 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006803152013742448, + "loss": 0.0075, + "macro_f1": 1.0, + "num_tokens": 7071284.0, + "repeat_count": 1.0, + "routers_loss": 0.010723610408604145, + "skip_count": 2.0, + "step": 4384, + "text_loss": 0.13227243721485138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006800264780632495, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7074428.0, + "repeat_count": 1.0, + "routers_loss": 0.0011231007520109415, + "skip_count": 0.0, + "step": 4386, + "text_loss": 0.4360627233982086 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 20.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0006797376857657681, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 7078313.0, + "repeat_count": 2.0, + "routers_loss": 0.008419238030910492, + "skip_count": 1.0, + "step": 4388, + "text_loss": 0.5183924436569214 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006794488245924664, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 7081258.0, + "repeat_count": 1.0, + "routers_loss": 0.006582668516784906, + "skip_count": 3.0, + "step": 4390, + "text_loss": 0.2797473669052124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006791598946540368, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 7084527.0, + "repeat_count": 0.0, + "routers_loss": 0.00557357631623745, + "skip_count": 2.0, + "step": 4392, + "text_loss": 0.39495575428009033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0006788708960611975, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 7087675.0, + "repeat_count": 0.0, + "routers_loss": 0.007155992556363344, + "skip_count": 0.0, + "step": 4394, + "text_loss": 0.3785299062728882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.0006785818289246934, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7090171.0, + "repeat_count": 0.0, + "routers_loss": 0.0009265039698220789, + "skip_count": 0.0, + "step": 4396, + "text_loss": 0.42634522914886475 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 20.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006782926933552955, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 7092529.0, + "repeat_count": 1.0, + "routers_loss": 0.008679097518324852, + "skip_count": 7.0, + "step": 4398, + "text_loss": 0.4283660054206848 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006780034894638014, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7095141.0, + "repeat_count": 0.0, + "routers_loss": 0.002363949315622449, + "skip_count": 0.0, + "step": 4400, + "text_loss": 0.481539249420166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.000677714217361034, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7098208.0, + "repeat_count": 0.0, + "routers_loss": 0.004005146212875843, + "skip_count": 3.0, + "step": 4402, + "text_loss": 0.6443291902542114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006774248771578435, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7101681.0, + "repeat_count": 0.0, + "routers_loss": 0.0026864963583648205, + "skip_count": 0.0, + "step": 4404, + "text_loss": 0.16315312683582306 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 20.68564719694746, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006771354689651054, + "loss": 0.005, + "macro_f1": 0.9449735879898071, + "num_tokens": 7104719.0, + "repeat_count": 2.0, + "routers_loss": 0.02719845622777939, + "skip_count": 4.0, + "step": 4406, + "text_loss": 0.37855592370033264 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0006768459928937213, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7108697.0, + "repeat_count": 0.0, + "routers_loss": 0.010488593950867653, + "skip_count": 0.0, + "step": 4408, + "text_loss": 0.23133711516857147 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0006765564490546193, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7111426.0, + "repeat_count": 1.0, + "routers_loss": 0.0013637891970574856, + "skip_count": 0.0, + "step": 4410, + "text_loss": 0.41399383544921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0006762668375587528, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7114241.0, + "repeat_count": 0.0, + "routers_loss": 0.000900395680218935, + "skip_count": 0.0, + "step": 4412, + "text_loss": 0.6460412740707397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0006759771585171016, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7117031.0, + "repeat_count": 0.0, + "routers_loss": 0.0024001260753721, + "skip_count": 0.0, + "step": 4414, + "text_loss": 0.7645824551582336 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006756874120406714, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 7120766.0, + "repeat_count": 3.0, + "routers_loss": 0.005034091416746378, + "skip_count": 4.0, + "step": 4416, + "text_loss": 0.31753066182136536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0006753975982404934, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7125243.0, + "repeat_count": 0.0, + "routers_loss": 0.002483269665390253, + "skip_count": 0.0, + "step": 4418, + "text_loss": 0.5304268002510071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.751394188435572, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0006751077172276249, + "loss": 0.0052, + "macro_f1": 0.3272727429866791, + "num_tokens": 7127795.0, + "repeat_count": 0.0, + "routers_loss": 0.02676006779074669, + "skip_count": 1.0, + "step": 4420, + "text_loss": 0.22011354565620422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.000674817769113149, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7130837.0, + "repeat_count": 0.0, + "routers_loss": 0.003267093561589718, + "skip_count": 2.0, + "step": 4422, + "text_loss": 0.2906076908111572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 20.770179043146463, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.027099609375, + "learning_rate": 0.000674527754008174, + "loss": 0.0045, + "macro_f1": 0.5934640765190125, + "num_tokens": 7135090.0, + "repeat_count": 0.0, + "routers_loss": 0.022510390728712082, + "skip_count": 3.0, + "step": 4424, + "text_loss": 0.2544902563095093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006742376720238345, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 7138751.0, + "repeat_count": 0.0, + "routers_loss": 0.0011178571730852127, + "skip_count": 0.0, + "step": 4426, + "text_loss": 0.6811438798904419 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 20.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006739475232712904, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 7141762.0, + "repeat_count": 2.0, + "routers_loss": 0.005595206283032894, + "skip_count": 1.0, + "step": 4428, + "text_loss": 0.38743990659713745 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0006736573078617272, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7145235.0, + "repeat_count": 0.0, + "routers_loss": 0.002793942578136921, + "skip_count": 2.0, + "step": 4430, + "text_loss": 0.21894219517707825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0006733670259063561, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 7149042.0, + "repeat_count": 0.0, + "routers_loss": 0.006146818865090609, + "skip_count": 3.0, + "step": 4432, + "text_loss": 0.17822015285491943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 20.817141179923688, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006730766775164136, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 7152166.0, + "repeat_count": 0.0, + "routers_loss": 0.026045087724924088, + "skip_count": 2.0, + "step": 4434, + "text_loss": 0.2910420000553131 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 20.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0006727862628031618, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7155506.0, + "repeat_count": 2.0, + "routers_loss": 0.0022973387967795134, + "skip_count": 0.0, + "step": 4436, + "text_loss": 0.3502544164657593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0006724957818778882, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7158739.0, + "repeat_count": 0.0, + "routers_loss": 0.002357073128223419, + "skip_count": 1.0, + "step": 4438, + "text_loss": 0.26200664043426514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0006722052348519054, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 7161776.0, + "repeat_count": 0.0, + "routers_loss": 0.0005521026905626059, + "skip_count": 0.0, + "step": 4440, + "text_loss": 0.3922915458679199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000671914621836552, + "loss": 0.0106, + "macro_f1": 0.6666666865348816, + "num_tokens": 7164763.0, + "repeat_count": 0.0, + "routers_loss": 0.007691344246268272, + "skip_count": 2.0, + "step": 4442, + "text_loss": 0.6021351218223572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000671623942943191, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7167924.0, + "repeat_count": 0.0, + "routers_loss": 0.0032181134447455406, + "skip_count": 0.0, + "step": 4444, + "text_loss": 0.23639555275440216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.873495744056356, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0006713331982832113, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 7170743.0, + "repeat_count": 1.0, + "routers_loss": 0.024979131296277046, + "skip_count": 0.0, + "step": 4446, + "text_loss": 0.4957772493362427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006710423879680271, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7174660.0, + "repeat_count": 0.0, + "routers_loss": 0.002571308286860585, + "skip_count": 0.0, + "step": 4448, + "text_loss": 0.47968071699142456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000670751512109077, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7177965.0, + "repeat_count": 0.0, + "routers_loss": 0.00212799571454525, + "skip_count": 0.0, + "step": 4450, + "text_loss": 0.6550716161727905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006704605708178252, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 7181512.0, + "repeat_count": 0.0, + "routers_loss": 0.004176430404186249, + "skip_count": 1.0, + "step": 4452, + "text_loss": 0.36959558725357056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0006701695642057613, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7184555.0, + "repeat_count": 0.0, + "routers_loss": 0.0010968588758260012, + "skip_count": 0.0, + "step": 4454, + "text_loss": 0.6686749458312988 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0006698784923843993, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7187474.0, + "repeat_count": 0.0, + "routers_loss": 0.0014241471653804183, + "skip_count": 0.0, + "step": 4456, + "text_loss": 0.6147221922874451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006695873554652784, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7190649.0, + "repeat_count": 0.0, + "routers_loss": 0.008801907300949097, + "skip_count": 0.0, + "step": 4458, + "text_loss": 0.26381927728652954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006692961535599634, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 7193961.0, + "repeat_count": 0.0, + "routers_loss": 0.009027508087456226, + "skip_count": 1.0, + "step": 4460, + "text_loss": 0.1926470547914505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006690048867800427, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7197456.0, + "repeat_count": 0.0, + "routers_loss": 0.0022697453387081623, + "skip_count": 0.0, + "step": 4462, + "text_loss": 0.6736721992492676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006687135552371305, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7200290.0, + "repeat_count": 0.0, + "routers_loss": 0.006747903767973185, + "skip_count": 1.0, + "step": 4464, + "text_loss": 0.2026437371969223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006684221590428657, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7203320.0, + "repeat_count": 0.0, + "routers_loss": 0.0011565096210688353, + "skip_count": 0.0, + "step": 4466, + "text_loss": 0.7587730288505554 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.976812444966246, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0006681306983089121, + "loss": 0.0083, + "macro_f1": 0.8820862174034119, + "num_tokens": 7206411.0, + "repeat_count": 2.0, + "routers_loss": 0.023645581677556038, + "skip_count": 2.0, + "step": 4468, + "text_loss": 0.8981561660766602 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006678391731469575, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 7209421.0, + "repeat_count": 0.0, + "routers_loss": 0.0035848666448146105, + "skip_count": 0.0, + "step": 4470, + "text_loss": 0.1522839516401291 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 20.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006675475836687152, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 7212267.0, + "repeat_count": 1.0, + "routers_loss": 0.005046425387263298, + "skip_count": 1.0, + "step": 4472, + "text_loss": 0.46007999777793884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006672559299859228, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7215195.0, + "repeat_count": 0.0, + "routers_loss": 0.0019333874806761742, + "skip_count": 0.0, + "step": 4474, + "text_loss": 1.0859547853469849 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006669642122103423, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7217941.0, + "repeat_count": 0.0, + "routers_loss": 0.0005401032394729555, + "skip_count": 0.0, + "step": 4476, + "text_loss": 0.9754356145858765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.023481068388612, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006666724304537611, + "loss": 0.0053, + "macro_f1": 0.3272727429866791, + "num_tokens": 7222494.0, + "repeat_count": 1.0, + "routers_loss": 0.015569722279906273, + "skip_count": 0.0, + "step": 4478, + "text_loss": 0.2896423637866974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006663805848279898, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7225292.0, + "repeat_count": 0.0, + "routers_loss": 0.0020135147497057915, + "skip_count": 0.0, + "step": 4480, + "text_loss": 0.8492724299430847 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006660886754448648, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 7229184.0, + "repeat_count": 1.0, + "routers_loss": 0.002355351345613599, + "skip_count": 0.0, + "step": 4482, + "text_loss": 0.189764603972435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0006657967024162459, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7232906.0, + "repeat_count": 0.0, + "routers_loss": 0.003044391982257366, + "skip_count": 0.0, + "step": 4484, + "text_loss": 0.4239847660064697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006655046658540179, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 7235996.0, + "repeat_count": 0.0, + "routers_loss": 0.00602696230635047, + "skip_count": 2.0, + "step": 4486, + "text_loss": 0.217103973031044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0169677734375, + "learning_rate": 0.0006652125658700896, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 7238882.0, + "repeat_count": 0.0, + "routers_loss": 0.001470155781134963, + "skip_count": 1.0, + "step": 4488, + "text_loss": 0.6090770363807678 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006649204025763945, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 7241815.0, + "repeat_count": 1.0, + "routers_loss": 0.008737480267882347, + "skip_count": 2.0, + "step": 4490, + "text_loss": 0.48314425349235535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.0006646281760848902, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 7244848.0, + "repeat_count": 0.0, + "routers_loss": 0.0008257135050371289, + "skip_count": 0.0, + "step": 4492, + "text_loss": 0.5884748101234436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006643358865075581, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7247930.0, + "repeat_count": 0.0, + "routers_loss": 0.0016262239078059793, + "skip_count": 0.0, + "step": 4494, + "text_loss": 0.21444730460643768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0006640435339564042, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7251776.0, + "repeat_count": 0.0, + "routers_loss": 0.001315156347118318, + "skip_count": 0.0, + "step": 4496, + "text_loss": 0.6890370845794678 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006637511185434588, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 7255070.0, + "repeat_count": 1.0, + "routers_loss": 0.007614497095346451, + "skip_count": 3.0, + "step": 4498, + "text_loss": 0.516417920589447 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 21.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006634586403807758, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 7258115.0, + "repeat_count": 3.0, + "routers_loss": 0.004906686954200268, + "skip_count": 2.0, + "step": 4500, + "text_loss": 0.577463686466217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.13619019665395, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006631660995804334, + "loss": 0.0067, + "macro_f1": 0.6601307392120361, + "num_tokens": 7260769.0, + "repeat_count": 1.0, + "routers_loss": 0.013337121345102787, + "skip_count": 2.0, + "step": 4502, + "text_loss": 0.37124839425086975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006628734962545339, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7263908.0, + "repeat_count": 0.0, + "routers_loss": 0.0023418180644512177, + "skip_count": 0.0, + "step": 4504, + "text_loss": 0.17937727272510529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0006625808305152033, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7267391.0, + "repeat_count": 0.0, + "routers_loss": 0.0006556165171787143, + "skip_count": 0.0, + "step": 4506, + "text_loss": 0.45344987511634827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006622881024745919, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 7271402.0, + "repeat_count": 0.0, + "routers_loss": 0.0021988123189657927, + "skip_count": 0.0, + "step": 4508, + "text_loss": 0.5842905640602112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006619953122448734, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 7274354.0, + "repeat_count": 0.0, + "routers_loss": 0.00774174090474844, + "skip_count": 2.0, + "step": 4510, + "text_loss": 0.27159228920936584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0006617024599382456, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7277378.0, + "repeat_count": 0.0, + "routers_loss": 0.0006942499312572181, + "skip_count": 0.0, + "step": 4512, + "text_loss": 0.4464176297187805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0006614095456669302, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7280526.0, + "repeat_count": 0.0, + "routers_loss": 0.003003394464030862, + "skip_count": 0.0, + "step": 4514, + "text_loss": 0.31188079714775085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006611165695431725, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7283916.0, + "repeat_count": 0.0, + "routers_loss": 0.0006948060472495854, + "skip_count": 0.0, + "step": 4516, + "text_loss": 0.5266574025154114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006608235316792413, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7286843.0, + "repeat_count": 0.0, + "routers_loss": 0.0014080886030569673, + "skip_count": 0.0, + "step": 4518, + "text_loss": 0.5880120396614075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006605304321874295, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7289940.0, + "repeat_count": 0.0, + "routers_loss": 0.0016894340515136719, + "skip_count": 0.0, + "step": 4520, + "text_loss": 0.6623797416687012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006602372711800531, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7292869.0, + "repeat_count": 0.0, + "routers_loss": 0.003522444050759077, + "skip_count": 0.0, + "step": 4522, + "text_loss": 0.5488807559013367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006599440487694521, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7296618.0, + "repeat_count": 0.0, + "routers_loss": 0.0011981099378317595, + "skip_count": 0.0, + "step": 4524, + "text_loss": 0.4128517210483551 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.248899324919282, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00065965076506799, + "loss": 0.0047, + "macro_f1": 0.9262410998344421, + "num_tokens": 7300481.0, + "repeat_count": 3.0, + "routers_loss": 0.010548194870352745, + "skip_count": 2.0, + "step": 4526, + "text_loss": 0.26450902223587036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006593574201880536, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7303272.0, + "repeat_count": 0.0, + "routers_loss": 0.005642973352223635, + "skip_count": 1.0, + "step": 4528, + "text_loss": 0.35269856452941895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000659064014242053, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 7306615.0, + "repeat_count": 0.0, + "routers_loss": 0.004171932581812143, + "skip_count": 1.0, + "step": 4530, + "text_loss": 0.18814080953598022 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006587705473424223, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 7310368.0, + "repeat_count": 0.0, + "routers_loss": 0.002289367141202092, + "skip_count": 2.0, + "step": 4532, + "text_loss": 0.7363705635070801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000658477019601618, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 7313788.0, + "repeat_count": 0.0, + "routers_loss": 0.004440625663846731, + "skip_count": 1.0, + "step": 4534, + "text_loss": 0.8126176595687866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006581834311321211, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 7317864.0, + "repeat_count": 0.0, + "routers_loss": 0.0013160990783944726, + "skip_count": 2.0, + "step": 4536, + "text_loss": 0.7015916109085083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.000657889782046435, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7320693.0, + "repeat_count": 0.0, + "routers_loss": 0.0032275544945150614, + "skip_count": 2.0, + "step": 4538, + "text_loss": 0.6481677293777466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.314646316407398, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0006575960724570865, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 7324335.0, + "repeat_count": 0.0, + "routers_loss": 0.009769129566848278, + "skip_count": 1.0, + "step": 4540, + "text_loss": 0.22194676101207733 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006573023024766258, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 7327431.0, + "repeat_count": 2.0, + "routers_loss": 0.0036973082460463047, + "skip_count": 4.0, + "step": 4542, + "text_loss": 0.475127637386322 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.000657008472217626, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7330262.0, + "repeat_count": 0.0, + "routers_loss": 0.0007046440150588751, + "skip_count": 0.0, + "step": 4544, + "text_loss": 0.2649917006492615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0006567145817926836, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7333110.0, + "repeat_count": 0.0, + "routers_loss": 0.0026714997366070747, + "skip_count": 0.0, + "step": 4546, + "text_loss": 0.5490524768829346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0006564206313144175, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7336101.0, + "repeat_count": 0.0, + "routers_loss": 0.006552211008965969, + "skip_count": 0.0, + "step": 4548, + "text_loss": 0.14098678529262543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006561266208954707, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 7339435.0, + "repeat_count": 0.0, + "routers_loss": 0.0035560601390898228, + "skip_count": 2.0, + "step": 4550, + "text_loss": 0.20412275195121765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006558325506485081, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7342609.0, + "repeat_count": 0.0, + "routers_loss": 0.0020106974989175797, + "skip_count": 1.0, + "step": 4552, + "text_loss": 0.6184256076812744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0006555384206862183, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 7345614.0, + "repeat_count": 0.0, + "routers_loss": 0.0014235252747312188, + "skip_count": 0.0, + "step": 4554, + "text_loss": 1.0108838081359863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.389785735250953, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0006552442311213121, + "loss": 0.0041, + "macro_f1": 0.3272727429866791, + "num_tokens": 7348957.0, + "repeat_count": 1.0, + "routers_loss": 0.01703745685517788, + "skip_count": 0.0, + "step": 4556, + "text_loss": 0.21315747499465942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 21.399178162606397, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006549499820665237, + "loss": 0.0077, + "macro_f1": 0.5934640765190125, + "num_tokens": 7352724.0, + "repeat_count": 0.0, + "routers_loss": 0.013315381482243538, + "skip_count": 3.0, + "step": 4558, + "text_loss": 0.34369465708732605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00065465567363461, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7356592.0, + "repeat_count": 0.0, + "routers_loss": 0.0017354936571791768, + "skip_count": 0.0, + "step": 4560, + "text_loss": 0.6267461180686951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0006543613059383503, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7359774.0, + "repeat_count": 0.0, + "routers_loss": 0.011646085418760777, + "skip_count": 2.0, + "step": 4562, + "text_loss": 0.4400193989276886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006540668790905471, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7362765.0, + "repeat_count": 0.0, + "routers_loss": 0.0019345436012372375, + "skip_count": 0.0, + "step": 4564, + "text_loss": 0.49204275012016296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006537723932040251, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7366337.0, + "repeat_count": 0.0, + "routers_loss": 0.00562885170802474, + "skip_count": 1.0, + "step": 4566, + "text_loss": 0.22566382586956024 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006534778483916319, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 7369851.0, + "repeat_count": 2.0, + "routers_loss": 0.005508176051080227, + "skip_count": 2.0, + "step": 4568, + "text_loss": 0.8057850003242493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006531832447662377, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7373918.0, + "repeat_count": 0.0, + "routers_loss": 0.006460923235863447, + "skip_count": 2.0, + "step": 4570, + "text_loss": 0.5141497254371643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006528885824407351, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7376674.0, + "repeat_count": 0.0, + "routers_loss": 0.0032120654359459877, + "skip_count": 0.0, + "step": 4572, + "text_loss": 0.1281338930130005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0006525938615280394, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 7379791.0, + "repeat_count": 0.0, + "routers_loss": 0.00443810923025012, + "skip_count": 0.0, + "step": 4574, + "text_loss": 0.268352210521698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.000652299082141088, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 7382886.0, + "repeat_count": 0.0, + "routers_loss": 0.008284369483590126, + "skip_count": 2.0, + "step": 4576, + "text_loss": 0.30193832516670227 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.493102436160846, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006520042443928411, + "loss": 0.0068, + "macro_f1": 0.8823530077934265, + "num_tokens": 7386036.0, + "repeat_count": 2.0, + "routers_loss": 0.03383317217230797, + "skip_count": 1.0, + "step": 4578, + "text_loss": 0.23106542229652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.000651709348396281, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7388908.0, + "repeat_count": 0.0, + "routers_loss": 0.0017075951909646392, + "skip_count": 1.0, + "step": 4580, + "text_loss": 0.386099249124527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006514143942644124, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7392004.0, + "repeat_count": 0.0, + "routers_loss": 0.009516917169094086, + "skip_count": 1.0, + "step": 4582, + "text_loss": 0.3162059485912323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0006511193821102623, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 7395538.0, + "repeat_count": 0.0, + "routers_loss": 0.0031392278615385294, + "skip_count": 0.0, + "step": 4584, + "text_loss": 0.5536221861839294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006508243120468799, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7398461.0, + "repeat_count": 0.0, + "routers_loss": 0.0014138511614874005, + "skip_count": 0.0, + "step": 4586, + "text_loss": 0.7934318780899048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006505291841873367, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7401611.0, + "repeat_count": 0.0, + "routers_loss": 0.0005265916115604341, + "skip_count": 0.0, + "step": 4588, + "text_loss": 0.4569905698299408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.000650233998644726, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7404641.0, + "repeat_count": 0.0, + "routers_loss": 0.0024988956283777952, + "skip_count": 0.0, + "step": 4590, + "text_loss": 0.49998772144317627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0006499387555321636, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7407574.0, + "repeat_count": 0.0, + "routers_loss": 0.004110113717615604, + "skip_count": 1.0, + "step": 4592, + "text_loss": 0.5679413676261902 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006496434549627874, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7410806.0, + "repeat_count": 0.0, + "routers_loss": 0.0032845588866621256, + "skip_count": 0.0, + "step": 4594, + "text_loss": 0.35515281558036804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006493480970497568, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7413402.0, + "repeat_count": 0.0, + "routers_loss": 0.010577172972261906, + "skip_count": 1.0, + "step": 4596, + "text_loss": 0.26111698150634766 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006490526819062537, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 7417236.0, + "repeat_count": 1.0, + "routers_loss": 0.002054794691503048, + "skip_count": 2.0, + "step": 4598, + "text_loss": 0.6480993628501892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0006487572096454818, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7420278.0, + "repeat_count": 0.0, + "routers_loss": 0.0017989084590226412, + "skip_count": 0.0, + "step": 4600, + "text_loss": 0.4935401678085327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006484616803806665, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7423866.0, + "repeat_count": 0.0, + "routers_loss": 0.006671485956758261, + "skip_count": 1.0, + "step": 4602, + "text_loss": 0.15030258893966675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0006481660942250552, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7426884.0, + "repeat_count": 0.0, + "routers_loss": 0.008334980346262455, + "skip_count": 3.0, + "step": 4604, + "text_loss": 0.29933279752731323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006478704512919173, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 7431017.0, + "repeat_count": 0.0, + "routers_loss": 0.011923984624445438, + "skip_count": 3.0, + "step": 4606, + "text_loss": 0.35141825675964355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0006475747516945432, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7434406.0, + "repeat_count": 0.0, + "routers_loss": 0.0031092462595552206, + "skip_count": 3.0, + "step": 4608, + "text_loss": 0.21021464467048645 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.000647278995546246, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7437204.0, + "repeat_count": 1.0, + "routers_loss": 0.0006713552866131067, + "skip_count": 0.0, + "step": 4610, + "text_loss": 0.4052635431289673 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006469831829603598, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7439741.0, + "repeat_count": 0.0, + "routers_loss": 0.0022583482787013054, + "skip_count": 2.0, + "step": 4612, + "text_loss": 0.5443860292434692 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006466873140502407, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7443619.0, + "repeat_count": 0.0, + "routers_loss": 0.004187075886875391, + "skip_count": 2.0, + "step": 4614, + "text_loss": 0.30709847807884216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006463913889292661, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7446696.0, + "repeat_count": 0.0, + "routers_loss": 0.008314833045005798, + "skip_count": 0.0, + "step": 4616, + "text_loss": 0.22949637472629547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006460954077108353, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7450377.0, + "repeat_count": 0.0, + "routers_loss": 0.001277514616958797, + "skip_count": 0.0, + "step": 4618, + "text_loss": 0.37715134024620056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006457993705083684, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 7453271.0, + "repeat_count": 0.0, + "routers_loss": 0.0022756033577024937, + "skip_count": 2.0, + "step": 4620, + "text_loss": 0.7373883128166199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0006455032774353078, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7456492.0, + "repeat_count": 0.0, + "routers_loss": 0.0039057908579707146, + "skip_count": 2.0, + "step": 4622, + "text_loss": 0.5058769583702087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0006452071286051169, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 7459619.0, + "repeat_count": 0.0, + "routers_loss": 0.0019458672031760216, + "skip_count": 0.0, + "step": 4624, + "text_loss": 0.5110082030296326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0006449109241312802, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 7462552.0, + "repeat_count": 0.0, + "routers_loss": 0.0002716891176532954, + "skip_count": 1.0, + "step": 4626, + "text_loss": 0.6197522878646851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006446146641273042, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7466769.0, + "repeat_count": 0.0, + "routers_loss": 0.0037578947376459837, + "skip_count": 2.0, + "step": 4628, + "text_loss": 0.1653924286365509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.000644318348706716, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7470216.0, + "repeat_count": 0.0, + "routers_loss": 0.0012791058979928493, + "skip_count": 0.0, + "step": 4630, + "text_loss": 0.7114694118499756 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0006440219779830643, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 7472975.0, + "repeat_count": 0.0, + "routers_loss": 0.00736592011526227, + "skip_count": 2.0, + "step": 4632, + "text_loss": 0.26601463556289673 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000643725552069919, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7475672.0, + "repeat_count": 0.0, + "routers_loss": 0.00045455715735442936, + "skip_count": 0.0, + "step": 4634, + "text_loss": 0.5028402805328369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0006434290710808711, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7478850.0, + "repeat_count": 0.0, + "routers_loss": 0.004247233271598816, + "skip_count": 2.0, + "step": 4636, + "text_loss": 0.12746070325374603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 21.774875256824185, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04052734375, + "learning_rate": 0.0006431325351295324, + "loss": 0.0083, + "macro_f1": 0.5427350401878357, + "num_tokens": 7481747.0, + "repeat_count": 1.0, + "routers_loss": 0.047564394772052765, + "skip_count": 2.0, + "step": 4638, + "text_loss": 0.24056802690029144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0006428359443295362, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7484885.0, + "repeat_count": 0.0, + "routers_loss": 0.0011175100225955248, + "skip_count": 0.0, + "step": 4640, + "text_loss": 0.6265338063240051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 21.793660111535075, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006425392987945369, + "loss": 0.0086, + "macro_f1": 0.5492662787437439, + "num_tokens": 7487973.0, + "repeat_count": 0.0, + "routers_loss": 0.016879938542842865, + "skip_count": 2.0, + "step": 4642, + "text_loss": 0.2523447275161743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 21.80305253889052, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.032958984375, + "learning_rate": 0.0006422425986382093, + "loss": 0.0055, + "macro_f1": 0.5934640765190125, + "num_tokens": 7491024.0, + "repeat_count": 0.0, + "routers_loss": 0.018616504967212677, + "skip_count": 3.0, + "step": 4644, + "text_loss": 0.38890624046325684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.812444966245963, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0006419458439742496, + "loss": 0.0056, + "macro_f1": 0.3272727429866791, + "num_tokens": 7494199.0, + "repeat_count": 0.0, + "routers_loss": 0.023129139095544815, + "skip_count": 1.0, + "step": 4646, + "text_loss": 0.4060848355293274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006416490349163747, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 7497287.0, + "repeat_count": 0.0, + "routers_loss": 0.0018601802876219153, + "skip_count": 0.0, + "step": 4648, + "text_loss": 0.3387545943260193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006413521715783225, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 7500598.0, + "repeat_count": 0.0, + "routers_loss": 0.0017482215771451592, + "skip_count": 0.0, + "step": 4650, + "text_loss": 0.4290996193885803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.840622248312297, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0006410552540738514, + "loss": 0.007, + "macro_f1": 0.3272727429866791, + "num_tokens": 7503252.0, + "repeat_count": 1.0, + "routers_loss": 0.0420118011534214, + "skip_count": 0.0, + "step": 4652, + "text_loss": 0.439496248960495 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.000640758282516741, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 7506382.0, + "repeat_count": 1.0, + "routers_loss": 0.0017782216891646385, + "skip_count": 1.0, + "step": 4654, + "text_loss": 0.8513308167457581 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006404612570207911, + "loss": 0.0102, + "macro_f1": 0.3272727429866791, + "num_tokens": 7510423.0, + "repeat_count": 0.0, + "routers_loss": 0.010385853238403797, + "skip_count": 0.0, + "step": 4656, + "text_loss": 0.7159742712974548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006401641776998223, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7513394.0, + "repeat_count": 0.0, + "routers_loss": 0.0011917101219296455, + "skip_count": 0.0, + "step": 4658, + "text_loss": 0.6165401339530945 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006398670446676766, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7516828.0, + "repeat_count": 3.0, + "routers_loss": 0.008860073052346706, + "skip_count": 4.0, + "step": 4660, + "text_loss": 0.923275887966156 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0006395698580382153, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7519764.0, + "repeat_count": 0.0, + "routers_loss": 0.000505418807733804, + "skip_count": 0.0, + "step": 4662, + "text_loss": 0.6143050789833069 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006392726179253212, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7522390.0, + "repeat_count": 0.0, + "routers_loss": 0.004020806401968002, + "skip_count": 1.0, + "step": 4664, + "text_loss": 0.6935067176818848 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0006389753244428972, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 7525821.0, + "repeat_count": 1.0, + "routers_loss": 0.00957963801920414, + "skip_count": 2.0, + "step": 4666, + "text_loss": 0.3350338637828827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.915761667155856, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0006386779777048666, + "loss": 0.0063, + "macro_f1": 0.6601307392120361, + "num_tokens": 7529513.0, + "repeat_count": 1.0, + "routers_loss": 0.020673364400863647, + "skip_count": 2.0, + "step": 4668, + "text_loss": 0.47800472378730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006383805778251735, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7533450.0, + "repeat_count": 0.0, + "routers_loss": 0.007217096630483866, + "skip_count": 1.0, + "step": 4670, + "text_loss": 0.4506106972694397 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006380831249177817, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 7536287.0, + "repeat_count": 1.0, + "routers_loss": 0.007001714315265417, + "skip_count": 0.0, + "step": 4672, + "text_loss": 0.4081715941429138 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0006377856190966762, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 7539442.0, + "repeat_count": 0.0, + "routers_loss": 0.0015112817054614425, + "skip_count": 0.0, + "step": 4674, + "text_loss": 0.21451139450073242 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0006374880604758615, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 7542594.0, + "repeat_count": 0.0, + "routers_loss": 0.007311929017305374, + "skip_count": 2.0, + "step": 4676, + "text_loss": 0.14785248041152954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006371904491693626, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7545780.0, + "repeat_count": 0.0, + "routers_loss": 0.007489737123250961, + "skip_count": 1.0, + "step": 4678, + "text_loss": 0.2248108983039856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006368927852912247, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 7548287.0, + "repeat_count": 1.0, + "routers_loss": 0.009772555902600288, + "skip_count": 1.0, + "step": 4680, + "text_loss": 0.1566995233297348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006365950689555133, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7551424.0, + "repeat_count": 0.0, + "routers_loss": 0.002134992741048336, + "skip_count": 0.0, + "step": 4682, + "text_loss": 0.7322417497634888 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006362973002763139, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7554182.0, + "repeat_count": 1.0, + "routers_loss": 0.008511497639119625, + "skip_count": 4.0, + "step": 4684, + "text_loss": 0.24387991428375244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0006359994793677319, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 7557044.0, + "repeat_count": 0.0, + "routers_loss": 0.004151526838541031, + "skip_count": 2.0, + "step": 4686, + "text_loss": 0.6139411330223083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006357016063438928, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7560231.0, + "repeat_count": 0.0, + "routers_loss": 0.0009724601986818016, + "skip_count": 0.0, + "step": 4688, + "text_loss": 0.7875718474388123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0006354036813189421, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7562953.0, + "repeat_count": 0.0, + "routers_loss": 0.0008926765876822174, + "skip_count": 0.0, + "step": 4690, + "text_loss": 0.5195512771606445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006351057044070455, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 7566137.0, + "repeat_count": 0.0, + "routers_loss": 0.0031294538639485836, + "skip_count": 0.0, + "step": 4692, + "text_loss": 0.7288873195648193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0006348076757223877, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 7569073.0, + "repeat_count": 0.0, + "routers_loss": 0.0015065820189192891, + "skip_count": 2.0, + "step": 4694, + "text_loss": 0.7242236137390137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0006345095953791746, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7573025.0, + "repeat_count": 0.0, + "routers_loss": 0.0005603441968560219, + "skip_count": 0.0, + "step": 4696, + "text_loss": 0.34443899989128113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0006342114634916307, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7576546.0, + "repeat_count": 0.0, + "routers_loss": 0.0011047758162021637, + "skip_count": 0.0, + "step": 4698, + "text_loss": 0.4892682731151581 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0006339132801740008, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 7580711.0, + "repeat_count": 0.0, + "routers_loss": 0.0019803126342594624, + "skip_count": 2.0, + "step": 4700, + "text_loss": 0.4479489028453827 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0006336150455405494, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 7583385.0, + "repeat_count": 1.0, + "routers_loss": 0.0005326359532773495, + "skip_count": 0.0, + "step": 4702, + "text_loss": 0.627504825592041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006333167597055604, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 7586584.0, + "repeat_count": 0.0, + "routers_loss": 0.0005587987834587693, + "skip_count": 0.0, + "step": 4704, + "text_loss": 0.43891432881355286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0006330184227833376, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 7590408.0, + "repeat_count": 0.0, + "routers_loss": 0.007053783163428307, + "skip_count": 2.0, + "step": 4706, + "text_loss": 0.19946859776973724 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006327200348882043, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7593857.0, + "repeat_count": 1.0, + "routers_loss": 0.0009479080326855183, + "skip_count": 0.0, + "step": 4708, + "text_loss": 0.7973214387893677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006324215961345032, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7596429.0, + "repeat_count": 0.0, + "routers_loss": 0.0012403312139213085, + "skip_count": 0.0, + "step": 4710, + "text_loss": 0.48477989435195923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006321231066365966, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7599618.0, + "repeat_count": 0.0, + "routers_loss": 0.0005520360427908599, + "skip_count": 0.0, + "step": 4712, + "text_loss": 0.44222453236579895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006318245665088665, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 7603180.0, + "repeat_count": 0.0, + "routers_loss": 0.0015553623670712113, + "skip_count": 0.0, + "step": 4714, + "text_loss": 0.5132410526275635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0006315259758657138, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 7606457.0, + "repeat_count": 0.0, + "routers_loss": 0.004210884217172861, + "skip_count": 1.0, + "step": 4716, + "text_loss": 0.39850690960884094 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0006312273348215589, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 7609317.0, + "repeat_count": 1.0, + "routers_loss": 0.001220117206685245, + "skip_count": 0.0, + "step": 4718, + "text_loss": 0.3509018123149872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006309286434908419, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 7613076.0, + "repeat_count": 0.0, + "routers_loss": 0.007768960203975439, + "skip_count": 2.0, + "step": 4720, + "text_loss": 0.33361560106277466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006306299019880217, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7616242.0, + "repeat_count": 0.0, + "routers_loss": 0.006226699333637953, + "skip_count": 0.0, + "step": 4722, + "text_loss": 0.23661087453365326 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.17845611975345, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006303311104275766, + "loss": 0.0073, + "macro_f1": 0.6603773832321167, + "num_tokens": 7619069.0, + "repeat_count": 1.0, + "routers_loss": 0.015590761788189411, + "skip_count": 1.0, + "step": 4724, + "text_loss": 0.23373056948184967 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006300322689240041, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 7622581.0, + "repeat_count": 1.0, + "routers_loss": 0.006862971931695938, + "skip_count": 2.0, + "step": 4726, + "text_loss": 0.8301828503608704 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0006297333775918209, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 7625566.0, + "repeat_count": 1.0, + "routers_loss": 0.006256614346057177, + "skip_count": 1.0, + "step": 4728, + "text_loss": 0.3756707012653351 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0006294344365455626, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 7629047.0, + "repeat_count": 1.0, + "routers_loss": 0.009151885285973549, + "skip_count": 2.0, + "step": 4730, + "text_loss": 0.33362850546836853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006291354458997841, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7631847.0, + "repeat_count": 0.0, + "routers_loss": 0.0009307434665970504, + "skip_count": 0.0, + "step": 4732, + "text_loss": 0.4572524130344391 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0006288364057690591, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7635181.0, + "repeat_count": 0.0, + "routers_loss": 0.00041220212006010115, + "skip_count": 0.0, + "step": 4734, + "text_loss": 0.40211325883865356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0006285373162679804, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7637752.0, + "repeat_count": 0.0, + "routers_loss": 0.0006696670898236334, + "skip_count": 2.0, + "step": 4736, + "text_loss": 0.7588053345680237 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 22.24420311124156, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006282381775111597, + "loss": 0.0081, + "macro_f1": 0.9449735879898071, + "num_tokens": 7640719.0, + "repeat_count": 4.0, + "routers_loss": 0.016283133998513222, + "skip_count": 2.0, + "step": 4738, + "text_loss": 0.5697863101959229 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0006279389896132274, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7643524.0, + "repeat_count": 0.0, + "routers_loss": 0.00763951288536191, + "skip_count": 3.0, + "step": 4740, + "text_loss": 0.548592209815979 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.26298796595245, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006276397526888329, + "loss": 0.0094, + "macro_f1": 0.925203263759613, + "num_tokens": 7646919.0, + "repeat_count": 3.0, + "routers_loss": 0.038590483367443085, + "skip_count": 5.0, + "step": 4742, + "text_loss": 0.27226054668426514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0006273404668526443, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7650404.0, + "repeat_count": 0.0, + "routers_loss": 0.0012555639259517193, + "skip_count": 0.0, + "step": 4744, + "text_loss": 0.47892290353775024 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0006270411322193488, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7652942.0, + "repeat_count": 1.0, + "routers_loss": 0.0015356402145698667, + "skip_count": 0.0, + "step": 4746, + "text_loss": 0.5515767931938171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0006267417489036517, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7656269.0, + "repeat_count": 0.0, + "routers_loss": 0.005182140972465277, + "skip_count": 0.0, + "step": 4748, + "text_loss": 0.3496028184890747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0006264423170202773, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7658664.0, + "repeat_count": 0.0, + "routers_loss": 0.004144361708313227, + "skip_count": 0.0, + "step": 4750, + "text_loss": 0.2786032557487488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0006261428366839685, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7661471.0, + "repeat_count": 0.0, + "routers_loss": 0.00035335420398041606, + "skip_count": 0.0, + "step": 4752, + "text_loss": 0.4838487505912781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0006258433080094868, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7664593.0, + "repeat_count": 0.0, + "routers_loss": 0.0103341368958354, + "skip_count": 2.0, + "step": 4754, + "text_loss": 0.24325360357761383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0006255437311116119, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 7667573.0, + "repeat_count": 0.0, + "routers_loss": 0.014633853919804096, + "skip_count": 2.0, + "step": 4756, + "text_loss": 0.21569855511188507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0006252441061051426, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7671171.0, + "repeat_count": 0.0, + "routers_loss": 0.004900569561868906, + "skip_count": 0.0, + "step": 4758, + "text_loss": 0.12832018733024597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006249444331048955, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 7673932.0, + "repeat_count": 0.0, + "routers_loss": 0.0020371589343994856, + "skip_count": 0.0, + "step": 4760, + "text_loss": 0.38652482628822327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.000624644712225706, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7677396.0, + "repeat_count": 0.0, + "routers_loss": 0.0028059002943336964, + "skip_count": 2.0, + "step": 4762, + "text_loss": 0.7937633395195007 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0006243449435824276, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7680392.0, + "repeat_count": 0.0, + "routers_loss": 0.0007225095760077238, + "skip_count": 0.0, + "step": 4764, + "text_loss": 0.5690395832061768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006240451272899321, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7684121.0, + "repeat_count": 0.0, + "routers_loss": 0.002052050782367587, + "skip_count": 1.0, + "step": 4766, + "text_loss": 0.5321336984634399 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006237452634631099, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 7687236.0, + "repeat_count": 1.0, + "routers_loss": 0.0039039517287164927, + "skip_count": 0.0, + "step": 4768, + "text_loss": 0.30823320150375366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 22.394481948928675, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0006234453522168694, + "loss": 0.0084, + "macro_f1": 0.5492662787437439, + "num_tokens": 7690355.0, + "repeat_count": 0.0, + "routers_loss": 0.014570238068699837, + "skip_count": 2.0, + "step": 4770, + "text_loss": 0.21501587331295013 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 22.403874376284122, + "f1_execute": 0.949999988079071, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.04541015625, + "learning_rate": 0.000623145393666137, + "loss": 0.0069, + "macro_f1": 0.886363685131073, + "num_tokens": 7693559.0, + "repeat_count": 3.0, + "routers_loss": 0.061707716435194016, + "skip_count": 6.0, + "step": 4772, + "text_loss": 0.24371100962162018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006228453879258576, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 7696422.0, + "repeat_count": 0.0, + "routers_loss": 0.005053870379924774, + "skip_count": 2.0, + "step": 4774, + "text_loss": 0.237778440117836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0006225453351109934, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 7700460.0, + "repeat_count": 0.0, + "routers_loss": 0.0017990898340940475, + "skip_count": 0.0, + "step": 4776, + "text_loss": 0.612456738948822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.000622245235336526, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7703330.0, + "repeat_count": 0.0, + "routers_loss": 0.004507021512836218, + "skip_count": 2.0, + "step": 4778, + "text_loss": 0.36898812651634216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006219450887174537, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7707243.0, + "repeat_count": 0.0, + "routers_loss": 0.006295828148722649, + "skip_count": 1.0, + "step": 4780, + "text_loss": 0.14474599063396454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006216448953687932, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7711121.0, + "repeat_count": 0.0, + "routers_loss": 0.005049831233918667, + "skip_count": 0.0, + "step": 4782, + "text_loss": 0.4696790277957916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006213446554055795, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7714889.0, + "repeat_count": 0.0, + "routers_loss": 0.0006010758224874735, + "skip_count": 0.0, + "step": 4784, + "text_loss": 0.46253830194473267 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 22.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006210443689428649, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 7718420.0, + "repeat_count": 3.0, + "routers_loss": 0.006691234186291695, + "skip_count": 1.0, + "step": 4786, + "text_loss": 0.579987645149231 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00062074403609572, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7721720.0, + "repeat_count": 0.0, + "routers_loss": 0.001864895923063159, + "skip_count": 0.0, + "step": 4788, + "text_loss": 0.325242817401886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0006204436569792324, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 7724916.0, + "repeat_count": 0.0, + "routers_loss": 0.00202955212444067, + "skip_count": 0.0, + "step": 4790, + "text_loss": 0.49637556076049805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006201432317085083, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 7728081.0, + "repeat_count": 1.0, + "routers_loss": 0.0037843603640794754, + "skip_count": 0.0, + "step": 4792, + "text_loss": 0.38812628388404846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0006198427603986711, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7731457.0, + "repeat_count": 0.0, + "routers_loss": 0.012036679312586784, + "skip_count": 3.0, + "step": 4794, + "text_loss": 0.2996312379837036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0006195422431648623, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 7734595.0, + "repeat_count": 0.0, + "routers_loss": 0.0008874868508428335, + "skip_count": 1.0, + "step": 4796, + "text_loss": 0.3203189969062805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0006192416801222403, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 7737565.0, + "repeat_count": 1.0, + "routers_loss": 0.0032894534524530172, + "skip_count": 1.0, + "step": 4798, + "text_loss": 0.3283322751522064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0006189410713859815, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 7740439.0, + "repeat_count": 0.0, + "routers_loss": 0.009667043574154377, + "skip_count": 2.0, + "step": 4800, + "text_loss": 0.25219282507896423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 22.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006186404170712797, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 7743813.0, + "repeat_count": 0.0, + "routers_loss": 0.012643060646951199, + "skip_count": 4.0, + "step": 4802, + "text_loss": 0.22567439079284668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006183397172933462, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7747182.0, + "repeat_count": 0.0, + "routers_loss": 0.002678517485037446, + "skip_count": 0.0, + "step": 4804, + "text_loss": 0.19188879430294037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0006180389721674101, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 7750735.0, + "repeat_count": 0.0, + "routers_loss": 0.0013385121710598469, + "skip_count": 0.0, + "step": 4806, + "text_loss": 0.5860441327095032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000617738181808717, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7753843.0, + "repeat_count": 0.0, + "routers_loss": 0.0034869094379246235, + "skip_count": 1.0, + "step": 4808, + "text_loss": 0.4366260766983032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0006174373463325306, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7757039.0, + "repeat_count": 0.0, + "routers_loss": 0.0013648992171511054, + "skip_count": 0.0, + "step": 4810, + "text_loss": 0.5217258334159851 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0006171364658541314, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 7760016.0, + "repeat_count": 1.0, + "routers_loss": 0.0038017008919268847, + "skip_count": 2.0, + "step": 4812, + "text_loss": 0.8130963444709778 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0006168355404888177, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 7762961.0, + "repeat_count": 0.0, + "routers_loss": 0.006867518648505211, + "skip_count": 2.0, + "step": 4814, + "text_loss": 0.17822521924972534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006165345703519043, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7766399.0, + "repeat_count": 0.0, + "routers_loss": 0.0004653502255678177, + "skip_count": 0.0, + "step": 4816, + "text_loss": 0.5316070914268494 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006162335555587238, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 7769039.0, + "repeat_count": 1.0, + "routers_loss": 0.0016906452365219593, + "skip_count": 1.0, + "step": 4818, + "text_loss": 0.5680997967720032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0006159324962246257, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7772768.0, + "repeat_count": 0.0, + "routers_loss": 0.002541248919442296, + "skip_count": 0.0, + "step": 4820, + "text_loss": 0.6169226169586182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006156313924649762, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7775545.0, + "repeat_count": 0.0, + "routers_loss": 0.008644679561257362, + "skip_count": 2.0, + "step": 4822, + "text_loss": 0.2211475968360901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0006153302443951589, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7778837.0, + "repeat_count": 0.0, + "routers_loss": 0.0041346061043441296, + "skip_count": 2.0, + "step": 4824, + "text_loss": 0.5369775891304016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0006150290521305746, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 7782309.0, + "repeat_count": 0.0, + "routers_loss": 0.0012756052892655134, + "skip_count": 0.0, + "step": 4826, + "text_loss": 0.5294989943504333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.666862342236573, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006147278157866403, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 7785565.0, + "repeat_count": 0.0, + "routers_loss": 0.029718991369009018, + "skip_count": 1.0, + "step": 4828, + "text_loss": 0.6920449733734131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006144265354787906, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7788218.0, + "repeat_count": 0.0, + "routers_loss": 0.004829924553632736, + "skip_count": 0.0, + "step": 4830, + "text_loss": 0.17072243988513947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0006141252113224767, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7790788.0, + "repeat_count": 0.0, + "routers_loss": 0.00254037044942379, + "skip_count": 0.0, + "step": 4832, + "text_loss": 0.20075996220111847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01519775390625, + "learning_rate": 0.0006138238434331666, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7793913.0, + "repeat_count": 0.0, + "routers_loss": 0.0004426188243087381, + "skip_count": 0.0, + "step": 4834, + "text_loss": 0.695742130279541 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.000613522431926345, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 7796932.0, + "repeat_count": 1.0, + "routers_loss": 0.005176798906177282, + "skip_count": 3.0, + "step": 4836, + "text_loss": 0.4910822808742523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0006132209769175132, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7800686.0, + "repeat_count": 0.0, + "routers_loss": 0.004120545461773872, + "skip_count": 0.0, + "step": 4838, + "text_loss": 0.3701378405094147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0006129194785221894, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7804765.0, + "repeat_count": 0.0, + "routers_loss": 0.0043835826218128204, + "skip_count": 0.0, + "step": 4840, + "text_loss": 0.343635618686676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006126179368559086, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 7807498.0, + "repeat_count": 0.0, + "routers_loss": 0.001394893741235137, + "skip_count": 1.0, + "step": 4842, + "text_loss": 0.47756674885749817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.000612316352034222, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7810784.0, + "repeat_count": 0.0, + "routers_loss": 0.0031262130942195654, + "skip_count": 2.0, + "step": 4844, + "text_loss": 0.13077901303768158 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.751394188435572, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0006120147241726972, + "loss": 0.0081, + "macro_f1": 0.8823530077934265, + "num_tokens": 7814754.0, + "repeat_count": 2.0, + "routers_loss": 0.016139274463057518, + "skip_count": 1.0, + "step": 4846, + "text_loss": 0.18850074708461761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0006117130533869189, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7818245.0, + "repeat_count": 0.0, + "routers_loss": 0.0009124451316893101, + "skip_count": 0.0, + "step": 4848, + "text_loss": 0.42503559589385986 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006114113397924878, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7822214.0, + "repeat_count": 0.0, + "routers_loss": 0.0015132242115214467, + "skip_count": 0.0, + "step": 4850, + "text_loss": 0.16767354309558868 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006111095835050212, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 7825019.0, + "repeat_count": 2.0, + "routers_loss": 0.006253300234675407, + "skip_count": 2.0, + "step": 4852, + "text_loss": 0.44826745986938477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0006108077846401524, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 7828113.0, + "repeat_count": 0.0, + "routers_loss": 0.0024391328915953636, + "skip_count": 0.0, + "step": 4854, + "text_loss": 0.2009880244731903 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006105059433135317, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 7831177.0, + "repeat_count": 1.0, + "routers_loss": 0.0020866121631115675, + "skip_count": 1.0, + "step": 4856, + "text_loss": 0.7082528471946716 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0006102040596408251, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 7834485.0, + "repeat_count": 0.0, + "routers_loss": 0.004373365081846714, + "skip_count": 1.0, + "step": 4858, + "text_loss": 0.2541539669036865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006099021337377148, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7837749.0, + "repeat_count": 0.0, + "routers_loss": 0.004309024661779404, + "skip_count": 0.0, + "step": 4860, + "text_loss": 0.3163885176181793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 22.82653360727913, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.049072265625, + "learning_rate": 0.0006096001657198995, + "loss": 0.0065, + "macro_f1": 0.6122449040412903, + "num_tokens": 7840979.0, + "repeat_count": 0.0, + "routers_loss": 0.023044804111123085, + "skip_count": 4.0, + "step": 4862, + "text_loss": 0.49609798192977905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0006092981557030941, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 7844905.0, + "repeat_count": 1.0, + "routers_loss": 0.010683654807507992, + "skip_count": 3.0, + "step": 4864, + "text_loss": 0.16866883635520935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006089961038030291, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 7847800.0, + "repeat_count": 0.0, + "routers_loss": 0.0011224723421037197, + "skip_count": 0.0, + "step": 4866, + "text_loss": 0.5093055367469788 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0006086940101354515, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7850983.0, + "repeat_count": 0.0, + "routers_loss": 0.003944621421396732, + "skip_count": 1.0, + "step": 4868, + "text_loss": 0.5753747224807739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 22.86410331670091, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0006083918748161244, + "loss": 0.0069, + "macro_f1": 0.5492662787437439, + "num_tokens": 7855041.0, + "repeat_count": 0.0, + "routers_loss": 0.02532145567238331, + "skip_count": 2.0, + "step": 4870, + "text_loss": 0.8082366585731506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006080896979608262, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7858058.0, + "repeat_count": 0.0, + "routers_loss": 0.0007558314246125519, + "skip_count": 0.0, + "step": 4872, + "text_loss": 0.6476574540138245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.000607787479685352, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7861223.0, + "repeat_count": 0.0, + "routers_loss": 0.0009224560926668346, + "skip_count": 0.0, + "step": 4874, + "text_loss": 0.5012133717536926 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006074852201055121, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7864180.0, + "repeat_count": 0.0, + "routers_loss": 0.0028308273758739233, + "skip_count": 0.0, + "step": 4876, + "text_loss": 0.7447214722633362 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0006071829193371331, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7866726.0, + "repeat_count": 0.0, + "routers_loss": 0.0021505290642380714, + "skip_count": 0.0, + "step": 4878, + "text_loss": 0.5444929599761963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006068805774960573, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7870166.0, + "repeat_count": 0.0, + "routers_loss": 0.0021109723020344973, + "skip_count": 0.0, + "step": 4880, + "text_loss": 0.3577263355255127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0006065781946981425, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7873028.0, + "repeat_count": 0.0, + "routers_loss": 0.0027144821360707283, + "skip_count": 0.0, + "step": 4882, + "text_loss": 0.28464797139167786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006062757710592624, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7876747.0, + "repeat_count": 0.0, + "routers_loss": 0.0004638207610696554, + "skip_count": 0.0, + "step": 4884, + "text_loss": 0.381534606218338 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006059733066953066, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 7879524.0, + "repeat_count": 1.0, + "routers_loss": 0.002225410658866167, + "skip_count": 2.0, + "step": 4886, + "text_loss": 0.5167883634567261 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006056708017221796, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 7882809.0, + "repeat_count": 0.0, + "routers_loss": 0.00419368501752615, + "skip_count": 1.0, + "step": 4888, + "text_loss": 0.22688335180282593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000605368256255802, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7886310.0, + "repeat_count": 0.0, + "routers_loss": 0.0017340193735435605, + "skip_count": 1.0, + "step": 4890, + "text_loss": 1.0128135681152344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0006050656704121098, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 7889483.0, + "repeat_count": 0.0, + "routers_loss": 0.0016647159354761243, + "skip_count": 0.0, + "step": 4892, + "text_loss": 0.2213262915611267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006047630443070547, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7892615.0, + "repeat_count": 0.0, + "routers_loss": 0.0038971947506070137, + "skip_count": 3.0, + "step": 4894, + "text_loss": 0.45751357078552246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0006044603780566032, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 7895747.0, + "repeat_count": 1.0, + "routers_loss": 0.0036852145567536354, + "skip_count": 1.0, + "step": 4896, + "text_loss": 0.13489919900894165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0006041576717767379, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7899155.0, + "repeat_count": 0.0, + "routers_loss": 0.007661987561732531, + "skip_count": 1.0, + "step": 4898, + "text_loss": 0.281853586435318 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006038549255834563, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7901667.0, + "repeat_count": 2.0, + "routers_loss": 0.01836695335805416, + "skip_count": 5.0, + "step": 4900, + "text_loss": 0.24879895150661469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.000603552139592771, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7904506.0, + "repeat_count": 0.0, + "routers_loss": 0.0011829182039946318, + "skip_count": 0.0, + "step": 4902, + "text_loss": 0.7550268769264221 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 23.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006032493139207106, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7907316.0, + "repeat_count": 1.0, + "routers_loss": 0.0022891140542924404, + "skip_count": 0.0, + "step": 4904, + "text_loss": 0.37596020102500916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0006029464486833186, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 7911283.0, + "repeat_count": 0.0, + "routers_loss": 0.001990227960050106, + "skip_count": 0.0, + "step": 4906, + "text_loss": 0.5879577994346619 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0006026435439966531, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 7913907.0, + "repeat_count": 0.0, + "routers_loss": 0.0026039890944957733, + "skip_count": 1.0, + "step": 4908, + "text_loss": 0.41484713554382324 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0006023405999767879, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7916772.0, + "repeat_count": 0.0, + "routers_loss": 0.009183229878544807, + "skip_count": 1.0, + "step": 4910, + "text_loss": 0.20732562243938446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0006020376167398116, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7919346.0, + "repeat_count": 0.0, + "routers_loss": 0.005508727394044399, + "skip_count": 1.0, + "step": 4912, + "text_loss": 0.41416165232658386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 23.070443205165834, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0006017345944018284, + "loss": 0.0051, + "macro_f1": 0.3272727429866791, + "num_tokens": 7922404.0, + "repeat_count": 0.0, + "routers_loss": 0.008651934564113617, + "skip_count": 0.0, + "step": 4914, + "text_loss": 0.4290519952774048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0006014315330789563, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 7925165.0, + "repeat_count": 0.0, + "routers_loss": 0.003601635340601206, + "skip_count": 1.0, + "step": 4916, + "text_loss": 0.8447931408882141 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006011284328873296, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 7928146.0, + "repeat_count": 1.0, + "routers_loss": 0.0049415635876357555, + "skip_count": 2.0, + "step": 4918, + "text_loss": 0.32237401604652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0006008252939430967, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7931163.0, + "repeat_count": 0.0, + "routers_loss": 0.0024150956887751818, + "skip_count": 0.0, + "step": 4920, + "text_loss": 0.2251713126897812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.108012914587615, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006005221163624209, + "loss": 0.0057, + "macro_f1": 0.3272727429866791, + "num_tokens": 7934084.0, + "repeat_count": 1.0, + "routers_loss": 0.03181030973792076, + "skip_count": 0.0, + "step": 4922, + "text_loss": 0.4962928593158722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.0006002189002614806, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 7937021.0, + "repeat_count": 0.0, + "routers_loss": 0.00227518193423748, + "skip_count": 2.0, + "step": 4924, + "text_loss": 0.34440335631370544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005999156457564685, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 7940205.0, + "repeat_count": 0.0, + "routers_loss": 0.004331593867391348, + "skip_count": 1.0, + "step": 4926, + "text_loss": 0.14114083349704742 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005996123529635925, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7945174.0, + "repeat_count": 0.0, + "routers_loss": 0.000612895586527884, + "skip_count": 0.0, + "step": 4928, + "text_loss": 0.3895469009876251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.145582624009393, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000599309021999075, + "loss": 0.006, + "macro_f1": 0.3272727429866791, + "num_tokens": 7948716.0, + "repeat_count": 0.0, + "routers_loss": 0.02319233864545822, + "skip_count": 1.0, + "step": 4930, + "text_loss": 0.38103172183036804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005990056529791528, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7952497.0, + "repeat_count": 0.0, + "routers_loss": 0.003423231653869152, + "skip_count": 0.0, + "step": 4932, + "text_loss": 0.30447322130203247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017822265625, + "learning_rate": 0.0005987022460200778, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7955578.0, + "repeat_count": 0.0, + "routers_loss": 0.0007005351362749934, + "skip_count": 0.0, + "step": 4934, + "text_loss": 0.49621838331222534 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.173759906075727, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0005983988012381159, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 7958741.0, + "repeat_count": 2.0, + "routers_loss": 0.03962617367506027, + "skip_count": 1.0, + "step": 4936, + "text_loss": 0.1920493096113205 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.0005980953187495476, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 7962236.0, + "repeat_count": 0.0, + "routers_loss": 0.0026006060652434826, + "skip_count": 3.0, + "step": 4938, + "text_loss": 0.5286803841590881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0005977917986706681, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7965631.0, + "repeat_count": 0.0, + "routers_loss": 0.005010952707380056, + "skip_count": 0.0, + "step": 4940, + "text_loss": 0.3507745563983917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005974882411177871, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7968516.0, + "repeat_count": 0.0, + "routers_loss": 0.0023964287247508764, + "skip_count": 0.0, + "step": 4942, + "text_loss": 0.9110504388809204 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000597184646207228, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7971310.0, + "repeat_count": 0.0, + "routers_loss": 0.0026230409275740385, + "skip_count": 1.0, + "step": 4944, + "text_loss": 0.4131232798099518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005968810140553292, + "loss": 0.0102, + "macro_f1": 0.3333333432674408, + "num_tokens": 7974809.0, + "repeat_count": 0.0, + "routers_loss": 0.0007397596491500735, + "skip_count": 0.0, + "step": 4946, + "text_loss": 0.5130466222763062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0005965773447784431, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7977800.0, + "repeat_count": 0.0, + "routers_loss": 0.0009955473942682147, + "skip_count": 0.0, + "step": 4948, + "text_loss": 0.5366153717041016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01373291015625, + "learning_rate": 0.0005962736384929362, + "loss": 0.0026, + "macro_f1": 0.3333333432674408, + "num_tokens": 7981027.0, + "repeat_count": 0.0, + "routers_loss": 0.0049227322451770306, + "skip_count": 0.0, + "step": 4950, + "text_loss": 0.17266370356082916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0005959698953151895, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7983580.0, + "repeat_count": 0.0, + "routers_loss": 0.0009975163266062737, + "skip_count": 0.0, + "step": 4952, + "text_loss": 0.2474549114704132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0005956661153615979, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7986711.0, + "repeat_count": 0.0, + "routers_loss": 0.0006475782720372081, + "skip_count": 0.0, + "step": 4954, + "text_loss": 0.5748327970504761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0005953622987485703, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7990194.0, + "repeat_count": 0.0, + "routers_loss": 0.001449751085601747, + "skip_count": 0.0, + "step": 4956, + "text_loss": 0.5163559317588806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0005950584455925301, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7993050.0, + "repeat_count": 0.0, + "routers_loss": 0.0017087773885577917, + "skip_count": 0.0, + "step": 4958, + "text_loss": 0.15892620384693146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0005947545560099142, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 7996383.0, + "repeat_count": 0.0, + "routers_loss": 0.0044417232275009155, + "skip_count": 0.0, + "step": 4960, + "text_loss": 0.48022928833961487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 23.295861461696507, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031982421875, + "learning_rate": 0.0005944506301171734, + "loss": 0.0066, + "macro_f1": 0.5492662787437439, + "num_tokens": 7999843.0, + "repeat_count": 0.0, + "routers_loss": 0.010093312710523605, + "skip_count": 2.0, + "step": 4962, + "text_loss": 0.5050316452980042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005941466680307732, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8003504.0, + "repeat_count": 0.0, + "routers_loss": 0.009699694812297821, + "skip_count": 0.0, + "step": 4964, + "text_loss": 0.30474427342414856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 23.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0005938426698671922, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 8007427.0, + "repeat_count": 1.0, + "routers_loss": 0.0016759657301008701, + "skip_count": 0.0, + "step": 4966, + "text_loss": 0.25060293078422546 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005935386357429232, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8010265.0, + "repeat_count": 2.0, + "routers_loss": 0.006916914135217667, + "skip_count": 3.0, + "step": 4968, + "text_loss": 0.49084481596946716 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0005932345657744723, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 8013733.0, + "repeat_count": 1.0, + "routers_loss": 0.017182426527142525, + "skip_count": 5.0, + "step": 4970, + "text_loss": 0.2705717980861664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00059293046007836, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8017068.0, + "repeat_count": 0.0, + "routers_loss": 0.008485594764351845, + "skip_count": 2.0, + "step": 4972, + "text_loss": 0.18570218980312347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0005926263187711201, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8020185.0, + "repeat_count": 0.0, + "routers_loss": 0.0021750847809016705, + "skip_count": 2.0, + "step": 4974, + "text_loss": 0.4457069933414459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0005923221419693001, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 8023038.0, + "repeat_count": 0.0, + "routers_loss": 0.0020193420350551605, + "skip_count": 0.0, + "step": 4976, + "text_loss": 0.7394505143165588 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.0005920179297894613, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8026236.0, + "repeat_count": 0.0, + "routers_loss": 0.001450369250960648, + "skip_count": 1.0, + "step": 4978, + "text_loss": 0.5914503335952759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.000591713682348178, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8028765.0, + "repeat_count": 0.0, + "routers_loss": 0.0017808573320508003, + "skip_count": 0.0, + "step": 4980, + "text_loss": 0.19231407344341278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005914093997620388, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8032043.0, + "repeat_count": 0.0, + "routers_loss": 0.0018225493840873241, + "skip_count": 0.0, + "step": 4982, + "text_loss": 0.3567875325679779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005911050821476449, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8035086.0, + "repeat_count": 0.0, + "routers_loss": 0.0016285666497424245, + "skip_count": 0.0, + "step": 4984, + "text_loss": 0.34609633684158325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0005908007296216119, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8038193.0, + "repeat_count": 0.0, + "routers_loss": 0.0014699801104143262, + "skip_count": 0.0, + "step": 4986, + "text_loss": 0.4492359757423401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.000590496342300568, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8041099.0, + "repeat_count": 0.0, + "routers_loss": 0.002442725468426943, + "skip_count": 0.0, + "step": 4988, + "text_loss": 0.5162975788116455 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005901919203011548, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8044350.0, + "repeat_count": 0.0, + "routers_loss": 0.008624207228422165, + "skip_count": 2.0, + "step": 4990, + "text_loss": 0.2533033490180969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005898874637400279, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8047467.0, + "repeat_count": 0.0, + "routers_loss": 0.0015421364223584533, + "skip_count": 0.0, + "step": 4992, + "text_loss": 0.4890289306640625 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0005895829727338552, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 8050626.0, + "repeat_count": 1.0, + "routers_loss": 0.0024516626726835966, + "skip_count": 2.0, + "step": 4994, + "text_loss": 0.50797039270401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005892784473993184, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 8053386.0, + "repeat_count": 0.0, + "routers_loss": 0.0018553845584392548, + "skip_count": 2.0, + "step": 4996, + "text_loss": 0.628828763961792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.000588973887853112, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8055941.0, + "repeat_count": 0.0, + "routers_loss": 0.004258487373590469, + "skip_count": 0.0, + "step": 4998, + "text_loss": 0.2643229067325592 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.474317581449956, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005886692942119441, + "loss": 0.0062, + "macro_f1": 0.8820862174034119, + "num_tokens": 8058638.0, + "repeat_count": 2.0, + "routers_loss": 0.019064312800765038, + "skip_count": 2.0, + "step": 5000, + "text_loss": 0.4925006031990051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005883646665925353, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 8062097.0, + "repeat_count": 0.0, + "routers_loss": 0.0007969749276526272, + "skip_count": 0.0, + "step": 5002, + "text_loss": 0.49412909150123596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005880600051116196, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8065202.0, + "repeat_count": 0.0, + "routers_loss": 0.005813780706375837, + "skip_count": 2.0, + "step": 5004, + "text_loss": 0.5681346654891968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0005877553098859439, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8068574.0, + "repeat_count": 0.0, + "routers_loss": 0.005012941546738148, + "skip_count": 0.0, + "step": 5006, + "text_loss": 0.2682424485683441 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005874505810322678, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 8071834.0, + "repeat_count": 0.0, + "routers_loss": 0.005859757773578167, + "skip_count": 3.0, + "step": 5008, + "text_loss": 0.6460036039352417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.000587145818667364, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 8074687.0, + "repeat_count": 0.0, + "routers_loss": 0.002868571551516652, + "skip_count": 2.0, + "step": 5010, + "text_loss": 0.2405751347541809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005868410229080181, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8077617.0, + "repeat_count": 0.0, + "routers_loss": 0.0021759893279522657, + "skip_count": 1.0, + "step": 5012, + "text_loss": 0.7455595135688782 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005865361938710286, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8080734.0, + "repeat_count": 0.0, + "routers_loss": 0.0008311949786730111, + "skip_count": 0.0, + "step": 5014, + "text_loss": 0.44876906275749207 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 23.549457000293515, + "f1_execute": 0.9756097793579102, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.0390625, + "learning_rate": 0.0005862313316732063, + "loss": 0.0054, + "macro_f1": 0.9615669250488281, + "num_tokens": 8085092.0, + "repeat_count": 2.0, + "routers_loss": 0.012511664070189, + "skip_count": 6.0, + "step": 5016, + "text_loss": 0.26010942459106445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.000585926436431375, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 8088333.0, + "repeat_count": 0.0, + "routers_loss": 0.0035441694781184196, + "skip_count": 0.0, + "step": 5018, + "text_loss": 0.28225192427635193 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 23.568241855004402, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005856215082623711, + "loss": 0.0093, + "macro_f1": 0.8823530077934265, + "num_tokens": 8091298.0, + "repeat_count": 1.0, + "routers_loss": 0.023543989285826683, + "skip_count": 2.0, + "step": 5020, + "text_loss": 0.5757577419281006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0005853165472830439, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8094361.0, + "repeat_count": 0.0, + "routers_loss": 0.003124240320175886, + "skip_count": 0.0, + "step": 5022, + "text_loss": 0.4021305739879608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0005850115536102546, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8097514.0, + "repeat_count": 0.0, + "routers_loss": 0.008170558139681816, + "skip_count": 1.0, + "step": 5024, + "text_loss": 0.18926584720611572 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0005847065273608777, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 8100525.0, + "repeat_count": 1.0, + "routers_loss": 0.02127663604915142, + "skip_count": 5.0, + "step": 5026, + "text_loss": 0.18827557563781738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0005844014686517998, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8104016.0, + "repeat_count": 0.0, + "routers_loss": 0.00272122910246253, + "skip_count": 0.0, + "step": 5028, + "text_loss": 0.15534701943397522 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 23.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005840963775999199, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8106697.0, + "repeat_count": 5.0, + "routers_loss": 0.008979840204119682, + "skip_count": 4.0, + "step": 5030, + "text_loss": 0.8123718500137329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005837912543221493, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 8110986.0, + "repeat_count": 0.0, + "routers_loss": 0.005006929859519005, + "skip_count": 0.0, + "step": 5032, + "text_loss": 0.26128846406936646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0005834860989354121, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 8114010.0, + "repeat_count": 0.0, + "routers_loss": 0.0005531277856789529, + "skip_count": 0.0, + "step": 5034, + "text_loss": 0.5100266933441162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.64338127384796, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0005831809115566442, + "loss": 0.0073, + "macro_f1": 0.6538461446762085, + "num_tokens": 8117168.0, + "repeat_count": 2.0, + "routers_loss": 0.04978533461689949, + "skip_count": 1.0, + "step": 5036, + "text_loss": 0.41049885749816895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0005828756923027941, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8119900.0, + "repeat_count": 0.0, + "routers_loss": 0.0006322385743260384, + "skip_count": 0.0, + "step": 5038, + "text_loss": 0.5584380626678467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0005825704412908225, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8123928.0, + "repeat_count": 0.0, + "routers_loss": 0.001000594231300056, + "skip_count": 0.0, + "step": 5040, + "text_loss": 0.6460791230201721 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005822651586377019, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 8127926.0, + "repeat_count": 0.0, + "routers_loss": 0.011595834977924824, + "skip_count": 2.0, + "step": 5042, + "text_loss": 0.3131820261478424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0005819598444604173, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 8131092.0, + "repeat_count": 0.0, + "routers_loss": 0.004449303261935711, + "skip_count": 3.0, + "step": 5044, + "text_loss": 0.2774372696876526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0005816544988759658, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8134051.0, + "repeat_count": 0.0, + "routers_loss": 0.0007877505850046873, + "skip_count": 0.0, + "step": 5046, + "text_loss": 0.39496293663978577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0005813491220013563, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 8138725.0, + "repeat_count": 0.0, + "routers_loss": 0.002868623472750187, + "skip_count": 0.0, + "step": 5048, + "text_loss": 0.3779948651790619 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0005810437139536098, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 8141913.0, + "repeat_count": 2.0, + "routers_loss": 0.006244937423616648, + "skip_count": 4.0, + "step": 5050, + "text_loss": 0.4512978494167328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0005807382748497592, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 8146193.0, + "repeat_count": 0.0, + "routers_loss": 0.0011013929033651948, + "skip_count": 0.0, + "step": 5052, + "text_loss": 0.6194499731063843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0005804328048068493, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8149701.0, + "repeat_count": 0.0, + "routers_loss": 0.005505079869180918, + "skip_count": 1.0, + "step": 5054, + "text_loss": 0.2932305335998535 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005801273039419368, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 8152861.0, + "repeat_count": 1.0, + "routers_loss": 0.0057641929015517235, + "skip_count": 1.0, + "step": 5056, + "text_loss": 0.2631317973136902 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005798217723720904, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 8155843.0, + "repeat_count": 1.0, + "routers_loss": 0.0021671492140740156, + "skip_count": 5.0, + "step": 5058, + "text_loss": 0.2889988422393799 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0005795162102143902, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8158812.0, + "repeat_count": 0.0, + "routers_loss": 0.004476628266274929, + "skip_count": 1.0, + "step": 5060, + "text_loss": 0.48028868436813354 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005792106175859283, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 8162719.0, + "repeat_count": 1.0, + "routers_loss": 0.0038497636560350657, + "skip_count": 3.0, + "step": 5062, + "text_loss": 0.4559471607208252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0005789049946038083, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8165692.0, + "repeat_count": 0.0, + "routers_loss": 0.004451582673937082, + "skip_count": 0.0, + "step": 5064, + "text_loss": 0.3782602548599243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005785993413851456, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8168900.0, + "repeat_count": 0.0, + "routers_loss": 0.002951978938654065, + "skip_count": 0.0, + "step": 5066, + "text_loss": 0.32392629981040955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.000578293658047067, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8171661.0, + "repeat_count": 0.0, + "routers_loss": 0.011171254329383373, + "skip_count": 2.0, + "step": 5068, + "text_loss": 0.24492619931697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0005779879447067109, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 8175075.0, + "repeat_count": 0.0, + "routers_loss": 0.0016067599644884467, + "skip_count": 0.0, + "step": 5070, + "text_loss": 0.7738823294639587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.000577682201481227, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8178515.0, + "repeat_count": 0.0, + "routers_loss": 0.009113503620028496, + "skip_count": 1.0, + "step": 5072, + "text_loss": 0.2082248032093048 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0005773764284877774, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8181790.0, + "repeat_count": 1.0, + "routers_loss": 0.007332196459174156, + "skip_count": 1.0, + "step": 5074, + "text_loss": 0.4557662904262543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0005770706258435342, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8184854.0, + "repeat_count": 0.0, + "routers_loss": 0.0016252279747277498, + "skip_count": 0.0, + "step": 5076, + "text_loss": 0.2888098657131195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0005767647936656818, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8187860.0, + "repeat_count": 0.0, + "routers_loss": 0.003406575648114085, + "skip_count": 0.0, + "step": 5078, + "text_loss": 0.6533790230751038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005764589320714158, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 8191683.0, + "repeat_count": 0.0, + "routers_loss": 0.0006520140450447798, + "skip_count": 0.0, + "step": 5080, + "text_loss": 0.6903796195983887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0005761530411779426, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8195109.0, + "repeat_count": 0.0, + "routers_loss": 0.01188349537551403, + "skip_count": 1.0, + "step": 5082, + "text_loss": 0.20460398495197296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0005758471211024804, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 8198340.0, + "repeat_count": 0.0, + "routers_loss": 0.004826809279620647, + "skip_count": 3.0, + "step": 5084, + "text_loss": 0.2203969657421112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.0005755411719622584, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8200882.0, + "repeat_count": 0.0, + "routers_loss": 0.0019170823507010937, + "skip_count": 0.0, + "step": 5086, + "text_loss": 0.6744595170021057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005752351938745167, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 8203777.0, + "repeat_count": 0.0, + "routers_loss": 0.002110893838107586, + "skip_count": 1.0, + "step": 5088, + "text_loss": 0.4137859046459198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.000574929186956507, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 8207627.0, + "repeat_count": 0.0, + "routers_loss": 0.0018580821342766285, + "skip_count": 1.0, + "step": 5090, + "text_loss": 0.4830456078052521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.906369239800412, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0005746231513254912, + "loss": 0.0066, + "macro_f1": 0.3272727429866791, + "num_tokens": 8210263.0, + "repeat_count": 1.0, + "routers_loss": 0.0194723978638649, + "skip_count": 0.0, + "step": 5092, + "text_loss": 0.17383277416229248 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005743170870987433, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 8214166.0, + "repeat_count": 0.0, + "routers_loss": 0.006944256369024515, + "skip_count": 2.0, + "step": 5094, + "text_loss": 0.20003484189510345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0005740109943935472, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8217545.0, + "repeat_count": 0.0, + "routers_loss": 0.002044794149696827, + "skip_count": 1.0, + "step": 5096, + "text_loss": 0.5117167830467224 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0005737048733271986, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 8220673.0, + "repeat_count": 1.0, + "routers_loss": 0.009966124780476093, + "skip_count": 2.0, + "step": 5098, + "text_loss": 0.2705996036529541 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005733987240170035, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8223796.0, + "repeat_count": 0.0, + "routers_loss": 0.0009675708715803921, + "skip_count": 0.0, + "step": 5100, + "text_loss": 0.7016357183456421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0005730925465802788, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8227048.0, + "repeat_count": 0.0, + "routers_loss": 0.0009548200177960098, + "skip_count": 0.0, + "step": 5102, + "text_loss": 0.30823078751564026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005727863411343526, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8229971.0, + "repeat_count": 0.0, + "routers_loss": 0.0005767418188042939, + "skip_count": 0.0, + "step": 5104, + "text_loss": 0.6897505521774292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0005724801077965629, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8232758.0, + "repeat_count": 0.0, + "routers_loss": 0.009297889657318592, + "skip_count": 3.0, + "step": 5106, + "text_loss": 0.21293514966964722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.981508658643968, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005721738466842592, + "loss": 0.0079, + "macro_f1": 0.3272727429866791, + "num_tokens": 8238154.0, + "repeat_count": 1.0, + "routers_loss": 0.013964693062007427, + "skip_count": 0.0, + "step": 5108, + "text_loss": 0.7273620367050171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 23.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005718675579148014, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8240818.0, + "repeat_count": 3.0, + "routers_loss": 0.007218098267912865, + "skip_count": 1.0, + "step": 5110, + "text_loss": 0.5607150793075562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005715612416055598, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 8244048.0, + "repeat_count": 0.0, + "routers_loss": 0.007558444049209356, + "skip_count": 2.0, + "step": 5112, + "text_loss": 0.23694385588169098 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.009392427355444, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005712548978739154, + "loss": 0.0072, + "macro_f1": 0.6603773832321167, + "num_tokens": 8247240.0, + "repeat_count": 1.0, + "routers_loss": 0.015726923942565918, + "skip_count": 1.0, + "step": 5114, + "text_loss": 0.6032099723815918 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.01878485471089, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0005709485268372598, + "loss": 0.0046, + "macro_f1": 0.9262410998344421, + "num_tokens": 8250585.0, + "repeat_count": 3.0, + "routers_loss": 0.011148860678076744, + "skip_count": 2.0, + "step": 5116, + "text_loss": 0.6825997233390808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005706421286129948, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 8254240.0, + "repeat_count": 0.0, + "routers_loss": 0.006977916229516268, + "skip_count": 0.0, + "step": 5118, + "text_loss": 0.2532844543457031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0005703357033185328, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 8257133.0, + "repeat_count": 0.0, + "routers_loss": 0.006415650714188814, + "skip_count": 2.0, + "step": 5120, + "text_loss": 0.6132124066352844 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005700292510712967, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 8261076.0, + "repeat_count": 1.0, + "routers_loss": 0.0044475216418504715, + "skip_count": 1.0, + "step": 5122, + "text_loss": 0.4277699887752533 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005697227719887194, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8264607.0, + "repeat_count": 0.0, + "routers_loss": 0.005743155721575022, + "skip_count": 2.0, + "step": 5124, + "text_loss": 0.2570968270301819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005694162661882444, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8267992.0, + "repeat_count": 0.0, + "routers_loss": 0.0007581565878354013, + "skip_count": 0.0, + "step": 5126, + "text_loss": 0.5850184559822083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005691097337873252, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 8271010.0, + "repeat_count": 0.0, + "routers_loss": 0.0036611228715628386, + "skip_count": 0.0, + "step": 5128, + "text_loss": 0.660999059677124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0005688031749034258, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 8273638.0, + "repeat_count": 0.0, + "routers_loss": 0.0039906189776957035, + "skip_count": 0.0, + "step": 5130, + "text_loss": 0.5839648246765137 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0005684965896540198, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8276504.0, + "repeat_count": 1.0, + "routers_loss": 0.007539632264524698, + "skip_count": 3.0, + "step": 5132, + "text_loss": 0.27675092220306396 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 24.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005681899781565915, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 8279977.0, + "repeat_count": 2.0, + "routers_loss": 0.0026953567285090685, + "skip_count": 0.0, + "step": 5134, + "text_loss": 0.532974123954773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.000567883340528635, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 8282781.0, + "repeat_count": 0.0, + "routers_loss": 0.005754240322858095, + "skip_count": 1.0, + "step": 5136, + "text_loss": 0.31100207567214966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005675766768876542, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8286533.0, + "repeat_count": 0.0, + "routers_loss": 0.0051517849788069725, + "skip_count": 0.0, + "step": 5138, + "text_loss": 0.5734741687774658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005672699873511635, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 8289858.0, + "repeat_count": 0.0, + "routers_loss": 0.0025852699764072895, + "skip_count": 2.0, + "step": 5140, + "text_loss": 0.37045374512672424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005669632720366868, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8293038.0, + "repeat_count": 0.0, + "routers_loss": 0.0038520018570125103, + "skip_count": 0.0, + "step": 5142, + "text_loss": 0.25952374935150146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005666565310617577, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8295717.0, + "repeat_count": 0.0, + "routers_loss": 0.00026914477348327637, + "skip_count": 0.0, + "step": 5144, + "text_loss": 0.32531213760375977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0005663497645439203, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8299750.0, + "repeat_count": 0.0, + "routers_loss": 0.0055860537104308605, + "skip_count": 2.0, + "step": 5146, + "text_loss": 0.2520618438720703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005660429726007279, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8303075.0, + "repeat_count": 0.0, + "routers_loss": 0.004446739796549082, + "skip_count": 1.0, + "step": 5148, + "text_loss": 0.43672287464141846 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.17845611975345, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.07080078125, + "learning_rate": 0.000565736155349744, + "loss": 0.0076, + "macro_f1": 0.8814815282821655, + "num_tokens": 8306268.0, + "repeat_count": 2.0, + "routers_loss": 0.046915046870708466, + "skip_count": 4.0, + "step": 5150, + "text_loss": 0.35405927896499634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 24.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005654293129085412, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8310480.0, + "repeat_count": 0.0, + "routers_loss": 0.010549088008701801, + "skip_count": 4.0, + "step": 5152, + "text_loss": 0.3523249626159668 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005651224453947023, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8313367.0, + "repeat_count": 1.0, + "routers_loss": 0.002893900265917182, + "skip_count": 0.0, + "step": 5154, + "text_loss": 0.4503810703754425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0005648155529258195, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8318006.0, + "repeat_count": 0.0, + "routers_loss": 0.0018450213829055429, + "skip_count": 0.0, + "step": 5156, + "text_loss": 0.5687127113342285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0005645086356194943, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8320646.0, + "repeat_count": 0.0, + "routers_loss": 0.0026727779768407345, + "skip_count": 0.0, + "step": 5158, + "text_loss": 0.38920050859451294 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0005642016935933385, + "loss": 0.0035, + "macro_f1": 1.0, + "num_tokens": 8323915.0, + "repeat_count": 1.0, + "routers_loss": 0.00611621281132102, + "skip_count": 2.0, + "step": 5160, + "text_loss": 0.3003547787666321 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 24.234810683886117, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0005638947269649726, + "loss": 0.0063, + "macro_f1": 0.9619450569152832, + "num_tokens": 8327073.0, + "repeat_count": 1.0, + "routers_loss": 0.028447439894080162, + "skip_count": 6.0, + "step": 5162, + "text_loss": 0.24053414165973663 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0005635877358520268, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8330388.0, + "repeat_count": 0.0, + "routers_loss": 0.0013072624569758773, + "skip_count": 0.0, + "step": 5164, + "text_loss": 0.43772217631340027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005632807203721406, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 8333241.0, + "repeat_count": 0.0, + "routers_loss": 0.0009456822881475091, + "skip_count": 0.0, + "step": 5166, + "text_loss": 0.5217573046684265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.000562973680642963, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8337257.0, + "repeat_count": 0.0, + "routers_loss": 0.0023840824142098427, + "skip_count": 0.0, + "step": 5168, + "text_loss": 0.31814974546432495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0005626666167821521, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 8340143.0, + "repeat_count": 0.0, + "routers_loss": 0.0020231492817401886, + "skip_count": 3.0, + "step": 5170, + "text_loss": 0.5478505492210388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0162353515625, + "learning_rate": 0.0005623595289073755, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 8343566.0, + "repeat_count": 1.0, + "routers_loss": 0.01070715207606554, + "skip_count": 2.0, + "step": 5172, + "text_loss": 0.23213914036750793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005620524171363099, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8346836.0, + "repeat_count": 0.0, + "routers_loss": 0.003720001084730029, + "skip_count": 3.0, + "step": 5174, + "text_loss": 0.5114789009094238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005617452815866409, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 8349726.0, + "repeat_count": 1.0, + "routers_loss": 0.003322509117424488, + "skip_count": 1.0, + "step": 5176, + "text_loss": 0.4894506335258484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005614381223760635, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 8352478.0, + "repeat_count": 0.0, + "routers_loss": 0.00028752797516062856, + "skip_count": 0.0, + "step": 5178, + "text_loss": 0.6418307423591614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005611309396222817, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 8355766.0, + "repeat_count": 0.0, + "routers_loss": 0.0028724796138703823, + "skip_count": 0.0, + "step": 5180, + "text_loss": 0.23635952174663544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.328734957440563, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005608237334430085, + "loss": 0.0068, + "macro_f1": 0.6601307392120361, + "num_tokens": 8358888.0, + "repeat_count": 1.0, + "routers_loss": 0.058520980179309845, + "skip_count": 2.0, + "step": 5182, + "text_loss": 0.23434793949127197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1015625, + "learning_rate": 0.000560516503955966, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8361761.0, + "repeat_count": 0.0, + "routers_loss": 0.0021356395445764065, + "skip_count": 1.0, + "step": 5184, + "text_loss": 0.40855672955513 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000560209251278885, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 8364376.0, + "repeat_count": 0.0, + "routers_loss": 0.0016185789136216044, + "skip_count": 0.0, + "step": 5186, + "text_loss": 0.6265131831169128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005599019755295053, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8367769.0, + "repeat_count": 0.0, + "routers_loss": 0.0031490204855799675, + "skip_count": 2.0, + "step": 5188, + "text_loss": 0.4716353118419647 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0005595946768255756, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8370705.0, + "repeat_count": 1.0, + "routers_loss": 0.003500689286738634, + "skip_count": 0.0, + "step": 5190, + "text_loss": 0.5467679500579834 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0005592873552848532, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 8374217.0, + "repeat_count": 2.0, + "routers_loss": 0.010764475911855698, + "skip_count": 3.0, + "step": 5192, + "text_loss": 0.4345340132713318 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 24.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005589800110251045, + "loss": 0.0087, + "macro_f1": 1.0, + "num_tokens": 8378182.0, + "repeat_count": 2.0, + "routers_loss": 0.0010365343187004328, + "skip_count": 1.0, + "step": 5194, + "text_loss": 0.46722909808158875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005586726441641044, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8381227.0, + "repeat_count": 0.0, + "routers_loss": 0.006349093746393919, + "skip_count": 2.0, + "step": 5196, + "text_loss": 0.35410359501838684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0005583652548196362, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8384886.0, + "repeat_count": 0.0, + "routers_loss": 0.00038166221929714084, + "skip_count": 0.0, + "step": 5198, + "text_loss": 0.5950250625610352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005580578431094924, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8388939.0, + "repeat_count": 0.0, + "routers_loss": 0.0023578559048473835, + "skip_count": 2.0, + "step": 5200, + "text_loss": 0.6553771495819092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005577504091514735, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 8391629.0, + "repeat_count": 0.0, + "routers_loss": 0.0010771085508167744, + "skip_count": 0.0, + "step": 5202, + "text_loss": 0.4441985785961151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.000557442953063389, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8394440.0, + "repeat_count": 0.0, + "routers_loss": 0.005844325292855501, + "skip_count": 3.0, + "step": 5204, + "text_loss": 0.5807011723518372 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0005571354749630564, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8397731.0, + "repeat_count": 0.0, + "routers_loss": 0.006837233901023865, + "skip_count": 1.0, + "step": 5206, + "text_loss": 0.27780941128730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.000556827974968302, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8400859.0, + "repeat_count": 0.0, + "routers_loss": 0.007656649220734835, + "skip_count": 3.0, + "step": 5208, + "text_loss": 0.4746324121952057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0005565204531969606, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8404164.0, + "repeat_count": 0.0, + "routers_loss": 0.0028129038400948048, + "skip_count": 1.0, + "step": 5210, + "text_loss": 0.8513513803482056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0005562129097668746, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8407196.0, + "repeat_count": 0.0, + "routers_loss": 0.00492360582575202, + "skip_count": 1.0, + "step": 5212, + "text_loss": 0.12255420535802841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005559053447958958, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8410633.0, + "repeat_count": 0.0, + "routers_loss": 0.0020713545382022858, + "skip_count": 0.0, + "step": 5214, + "text_loss": 0.6878522634506226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0005555977584018833, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8413414.0, + "repeat_count": 0.0, + "routers_loss": 0.0007216963567771018, + "skip_count": 0.0, + "step": 5216, + "text_loss": 0.845878541469574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0005552901507027048, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8416817.0, + "repeat_count": 0.0, + "routers_loss": 0.002400130731984973, + "skip_count": 1.0, + "step": 5218, + "text_loss": 0.16753672063350677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0005549825218162365, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 8419617.0, + "repeat_count": 0.0, + "routers_loss": 0.004563181661069393, + "skip_count": 0.0, + "step": 5220, + "text_loss": 0.26107168197631836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.000554674871860362, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 8422686.0, + "repeat_count": 1.0, + "routers_loss": 0.006413881666958332, + "skip_count": 1.0, + "step": 5222, + "text_loss": 0.6333847045898438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005543672009529734, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 8425571.0, + "repeat_count": 0.0, + "routers_loss": 0.0057656955905258656, + "skip_count": 3.0, + "step": 5224, + "text_loss": 0.4552212357521057 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0005540595092119709, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 8429038.0, + "repeat_count": 2.0, + "routers_loss": 0.011755156330764294, + "skip_count": 2.0, + "step": 5226, + "text_loss": 0.16597330570220947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0005537517967552626, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8432117.0, + "repeat_count": 0.0, + "routers_loss": 0.0007519085193052888, + "skip_count": 0.0, + "step": 5228, + "text_loss": 0.6283590197563171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.000553444063700764, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 8435176.0, + "repeat_count": 0.0, + "routers_loss": 0.003066456411033869, + "skip_count": 0.0, + "step": 5230, + "text_loss": 0.2360922247171402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.0005531363101663998, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8438515.0, + "repeat_count": 0.0, + "routers_loss": 0.002865589689463377, + "skip_count": 0.0, + "step": 5232, + "text_loss": 0.8075396418571472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005528285362701011, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 8441731.0, + "repeat_count": 0.0, + "routers_loss": 0.0012521179160103202, + "skip_count": 0.0, + "step": 5234, + "text_loss": 0.584335446357727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0005525207421298077, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8444535.0, + "repeat_count": 0.0, + "routers_loss": 0.005398475099354982, + "skip_count": 3.0, + "step": 5236, + "text_loss": 0.22711622714996338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005522129278634669, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 8448337.0, + "repeat_count": 0.0, + "routers_loss": 0.002957914723083377, + "skip_count": 1.0, + "step": 5238, + "text_loss": 0.3157515823841095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0005519050935890335, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8451530.0, + "repeat_count": 0.0, + "routers_loss": 0.007757039275020361, + "skip_count": 3.0, + "step": 5240, + "text_loss": 0.2815830111503601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.610507778103905, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005515972394244704, + "loss": 0.0063, + "macro_f1": 0.6603773832321167, + "num_tokens": 8454171.0, + "repeat_count": 1.0, + "routers_loss": 0.021602008491754532, + "skip_count": 1.0, + "step": 5242, + "text_loss": 0.6024490594863892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005512893654877478, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8457544.0, + "repeat_count": 0.0, + "routers_loss": 0.006062488537281752, + "skip_count": 0.0, + "step": 5244, + "text_loss": 0.550110936164856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0005509814718968435, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 8460135.0, + "repeat_count": 0.0, + "routers_loss": 0.002793943975120783, + "skip_count": 0.0, + "step": 5246, + "text_loss": 0.4361286163330078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0005506735587697433, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8463516.0, + "repeat_count": 0.0, + "routers_loss": 0.0016669550677761436, + "skip_count": 0.0, + "step": 5248, + "text_loss": 0.4642958641052246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0005503656262244395, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8466406.0, + "repeat_count": 0.0, + "routers_loss": 0.0006051387754268944, + "skip_count": 0.0, + "step": 5250, + "text_loss": 0.3445641100406647 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 24.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005500576743789329, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 8468838.0, + "repeat_count": 2.0, + "routers_loss": 0.00654293829575181, + "skip_count": 1.0, + "step": 5252, + "text_loss": 0.2842808663845062 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.666862342236573, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005497497033512309, + "loss": 0.0077, + "macro_f1": 0.8817967176437378, + "num_tokens": 8471815.0, + "repeat_count": 2.0, + "routers_loss": 0.03845973685383797, + "skip_count": 3.0, + "step": 5254, + "text_loss": 0.2597215175628662 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 24.676254769592017, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0005494417132593487, + "loss": 0.0047, + "macro_f1": 0.9452888369560242, + "num_tokens": 8475202.0, + "repeat_count": 1.0, + "routers_loss": 0.02252381667494774, + "skip_count": 4.0, + "step": 5256, + "text_loss": 0.32269927859306335 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0005491337042213088, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8478650.0, + "repeat_count": 0.0, + "routers_loss": 0.01232751365751028, + "skip_count": 2.0, + "step": 5258, + "text_loss": 0.6523372530937195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005488256763551408, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8481724.0, + "repeat_count": 0.0, + "routers_loss": 0.0028322834987193346, + "skip_count": 0.0, + "step": 5260, + "text_loss": 0.4212580621242523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.0005485176297788814, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 8485833.0, + "repeat_count": 0.0, + "routers_loss": 0.002623105887323618, + "skip_count": 2.0, + "step": 5262, + "text_loss": 0.16906329989433289 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005482095646105748, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 8489089.0, + "repeat_count": 1.0, + "routers_loss": 0.0007179114618338645, + "skip_count": 0.0, + "step": 5264, + "text_loss": 0.4523872137069702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005479014809682721, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 8492905.0, + "repeat_count": 0.0, + "routers_loss": 0.005234059412032366, + "skip_count": 0.0, + "step": 5266, + "text_loss": 0.207139790058136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0005475933789700314, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 8495480.0, + "repeat_count": 0.0, + "routers_loss": 0.0023258263245224953, + "skip_count": 0.0, + "step": 5268, + "text_loss": 0.18060965836048126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005472852587339183, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8499070.0, + "repeat_count": 0.0, + "routers_loss": 0.0013497259933501482, + "skip_count": 0.0, + "step": 5270, + "text_loss": 0.7460769414901733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0005469771203780048, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 8502886.0, + "repeat_count": 0.0, + "routers_loss": 0.0003589815751183778, + "skip_count": 0.0, + "step": 5272, + "text_loss": 0.48119160532951355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005466689640203701, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8506646.0, + "repeat_count": 0.0, + "routers_loss": 0.006619705818593502, + "skip_count": 1.0, + "step": 5274, + "text_loss": 0.15656520426273346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005463607897791005, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 8509450.0, + "repeat_count": 0.0, + "routers_loss": 0.002992175053805113, + "skip_count": 1.0, + "step": 5276, + "text_loss": 0.486930251121521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0005460525977722886, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8512851.0, + "repeat_count": 0.0, + "routers_loss": 0.0027784097474068403, + "skip_count": 0.0, + "step": 5278, + "text_loss": 0.19654682278633118 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0005457443881180345, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8516858.0, + "repeat_count": 0.0, + "routers_loss": 0.0017648129723966122, + "skip_count": 0.0, + "step": 5280, + "text_loss": 0.580982506275177 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005454361609344444, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 8519912.0, + "repeat_count": 2.0, + "routers_loss": 0.010817649774253368, + "skip_count": 3.0, + "step": 5282, + "text_loss": 0.2644204795360565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000545127916339632, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8522396.0, + "repeat_count": 0.0, + "routers_loss": 0.001453282660804689, + "skip_count": 0.0, + "step": 5284, + "text_loss": 0.5014839172363281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005448196544517168, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8525326.0, + "repeat_count": 0.0, + "routers_loss": 0.006645771209150553, + "skip_count": 2.0, + "step": 5286, + "text_loss": 0.2983154058456421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005445113753888254, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8528611.0, + "repeat_count": 0.0, + "routers_loss": 0.0005447337171062827, + "skip_count": 0.0, + "step": 5288, + "text_loss": 0.43598243594169617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.000544203079269091, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8531571.0, + "repeat_count": 0.0, + "routers_loss": 0.0026976624503731728, + "skip_count": 0.0, + "step": 5290, + "text_loss": 0.6454944610595703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005438947662106533, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 8534565.0, + "repeat_count": 0.0, + "routers_loss": 0.002217630622908473, + "skip_count": 0.0, + "step": 5292, + "text_loss": 0.742935836315155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 24.854710889345466, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.029052734375, + "learning_rate": 0.0005435864363316584, + "loss": 0.0073, + "macro_f1": 0.8820862174034119, + "num_tokens": 8537581.0, + "repeat_count": 2.0, + "routers_loss": 0.030740609392523766, + "skip_count": 2.0, + "step": 5294, + "text_loss": 0.48913639783859253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005432780897502588, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8541271.0, + "repeat_count": 0.0, + "routers_loss": 0.005306888837367296, + "skip_count": 1.0, + "step": 5296, + "text_loss": 0.5820846557617188 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0005429697265846137, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8545052.0, + "repeat_count": 1.0, + "routers_loss": 0.002255369909107685, + "skip_count": 0.0, + "step": 5298, + "text_loss": 0.565483808517456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005426613469528881, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8548605.0, + "repeat_count": 0.0, + "routers_loss": 0.0010787079809233546, + "skip_count": 0.0, + "step": 5300, + "text_loss": 0.40154510736465454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000542352950973254, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8552581.0, + "repeat_count": 0.0, + "routers_loss": 0.0017972089117392898, + "skip_count": 0.0, + "step": 5302, + "text_loss": 0.5430748462677002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.0005420445387638891, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 8556360.0, + "repeat_count": 0.0, + "routers_loss": 0.0016180560924112797, + "skip_count": 2.0, + "step": 5304, + "text_loss": 0.544040322303772 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0005417361104429777, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 8559264.0, + "repeat_count": 1.0, + "routers_loss": 0.012688961811363697, + "skip_count": 2.0, + "step": 5306, + "text_loss": 0.2018517404794693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0005414276661287101, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8562169.0, + "repeat_count": 0.0, + "routers_loss": 0.0012141643092036247, + "skip_count": 0.0, + "step": 5308, + "text_loss": 0.5685747265815735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059326171875, + "learning_rate": 0.0005411192059392826, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 8565231.0, + "repeat_count": 0.0, + "routers_loss": 0.0015626107342541218, + "skip_count": 0.0, + "step": 5310, + "text_loss": 0.8073471784591675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0005408107299928979, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8568122.0, + "repeat_count": 0.0, + "routers_loss": 0.004773529712110758, + "skip_count": 0.0, + "step": 5312, + "text_loss": 0.22583355009555817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0005405022384077644, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8571056.0, + "repeat_count": 0.0, + "routers_loss": 0.0025621228851377964, + "skip_count": 1.0, + "step": 5314, + "text_loss": 0.25274428725242615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005401937313020967, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 8574300.0, + "repeat_count": 0.0, + "routers_loss": 0.009726752527058125, + "skip_count": 2.0, + "step": 5316, + "text_loss": 0.3283393979072571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 24.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005398852087941155, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8577424.0, + "repeat_count": 0.0, + "routers_loss": 0.012483839876949787, + "skip_count": 4.0, + "step": 5318, + "text_loss": 0.1876130849123001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.000539576671002047, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 8580309.0, + "repeat_count": 0.0, + "routers_loss": 0.0009830677881836891, + "skip_count": 0.0, + "step": 5320, + "text_loss": 0.6955490708351135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046875, + "learning_rate": 0.0005392681180441235, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8583399.0, + "repeat_count": 0.0, + "routers_loss": 0.0010819481685757637, + "skip_count": 0.0, + "step": 5322, + "text_loss": 0.4708341956138611 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.000538959550038583, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8586259.0, + "repeat_count": 0.0, + "routers_loss": 0.005763369146734476, + "skip_count": 0.0, + "step": 5324, + "text_loss": 0.20463642477989197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005386509671036695, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8589067.0, + "repeat_count": 0.0, + "routers_loss": 0.0006229027640074492, + "skip_count": 0.0, + "step": 5326, + "text_loss": 0.6819888353347778 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 25.014088641033165, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.03466796875, + "learning_rate": 0.0005383423693576325, + "loss": 0.0087, + "macro_f1": 0.9619450569152832, + "num_tokens": 8592837.0, + "repeat_count": 1.0, + "routers_loss": 0.030066559091210365, + "skip_count": 6.0, + "step": 5328, + "text_loss": 0.24606549739837646 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0005380337569187272, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8596293.0, + "repeat_count": 1.0, + "routers_loss": 0.007445990107953548, + "skip_count": 0.0, + "step": 5330, + "text_loss": 0.16730253398418427 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 25.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0005377251299052145, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8599360.0, + "repeat_count": 1.0, + "routers_loss": 0.004563331138342619, + "skip_count": 1.0, + "step": 5332, + "text_loss": 0.6856988668441772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0005374164884353608, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8602376.0, + "repeat_count": 0.0, + "routers_loss": 0.0015491938684135675, + "skip_count": 0.0, + "step": 5334, + "text_loss": 1.3248854875564575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005371078326274382, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8605400.0, + "repeat_count": 0.0, + "routers_loss": 0.0016098044579848647, + "skip_count": 0.0, + "step": 5336, + "text_loss": 0.747150182723999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 25.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0005367991625997243, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 8608100.0, + "repeat_count": 0.0, + "routers_loss": 0.0034471298567950726, + "skip_count": 3.0, + "step": 5338, + "text_loss": 0.6443291902542114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005364904784705015, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8611768.0, + "repeat_count": 0.0, + "routers_loss": 0.007947597652673721, + "skip_count": 1.0, + "step": 5340, + "text_loss": 0.7768037915229797 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 25.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0005361817803580588, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 8614424.0, + "repeat_count": 2.0, + "routers_loss": 0.009964234195649624, + "skip_count": 2.0, + "step": 5342, + "text_loss": 0.22826914489269257 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005358730683806896, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8617826.0, + "repeat_count": 0.0, + "routers_loss": 0.0014116480015218258, + "skip_count": 0.0, + "step": 5344, + "text_loss": 0.49022090435028076 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 25.098620487232168, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005355643426566929, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 8621220.0, + "repeat_count": 1.0, + "routers_loss": 0.013940622098743916, + "skip_count": 2.0, + "step": 5346, + "text_loss": 0.26819515228271484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000535255603304373, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8623957.0, + "repeat_count": 0.0, + "routers_loss": 0.0032230091746896505, + "skip_count": 2.0, + "step": 5348, + "text_loss": 0.46905452013015747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005349468504420395, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8626760.0, + "repeat_count": 0.0, + "routers_loss": 0.002631337149068713, + "skip_count": 1.0, + "step": 5350, + "text_loss": 0.5312309861183167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005346380841880068, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8630207.0, + "repeat_count": 0.0, + "routers_loss": 0.004526057746261358, + "skip_count": 2.0, + "step": 5352, + "text_loss": 0.5810666084289551 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0005343293046605949, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8633241.0, + "repeat_count": 0.0, + "routers_loss": 0.0023941127583384514, + "skip_count": 0.0, + "step": 5354, + "text_loss": 0.18468725681304932 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0005340205119781288, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8636215.0, + "repeat_count": 1.0, + "routers_loss": 0.0017020340310409665, + "skip_count": 0.0, + "step": 5356, + "text_loss": 0.6665788888931274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005337117062589383, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 8639326.0, + "repeat_count": 0.0, + "routers_loss": 0.004964717663824558, + "skip_count": 2.0, + "step": 5358, + "text_loss": 0.19770404696464539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005334028876213585, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8642157.0, + "repeat_count": 0.0, + "routers_loss": 0.006587155628949404, + "skip_count": 0.0, + "step": 5360, + "text_loss": 0.2295130044221878 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0005330940561837291, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8645355.0, + "repeat_count": 0.0, + "routers_loss": 0.0006586945964954793, + "skip_count": 0.0, + "step": 5362, + "text_loss": 0.2701159417629242 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005327852120643947, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8648911.0, + "repeat_count": 1.0, + "routers_loss": 0.0014281768817454576, + "skip_count": 0.0, + "step": 5364, + "text_loss": 0.8957229852676392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0005324763553817053, + "loss": 0.0027, + "macro_f1": 0.3333333432674408, + "num_tokens": 8652037.0, + "repeat_count": 0.0, + "routers_loss": 0.0005899337120354176, + "skip_count": 0.0, + "step": 5366, + "text_loss": 0.38642236590385437 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 25.20193718814206, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005321674862540154, + "loss": 0.0058, + "macro_f1": 0.9265305995941162, + "num_tokens": 8655381.0, + "repeat_count": 3.0, + "routers_loss": 0.024511313065886497, + "skip_count": 1.0, + "step": 5368, + "text_loss": 0.6439879536628723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000531858604799684, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 8658476.0, + "repeat_count": 0.0, + "routers_loss": 0.0012558114249259233, + "skip_count": 0.0, + "step": 5370, + "text_loss": 0.3227672874927521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0005315497111370752, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8661982.0, + "repeat_count": 0.0, + "routers_loss": 0.0013541636290028691, + "skip_count": 0.0, + "step": 5372, + "text_loss": 0.6375321745872498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.230114470208395, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.051513671875, + "learning_rate": 0.0005312408053845575, + "loss": 0.0052, + "macro_f1": 0.5492662787437439, + "num_tokens": 8665071.0, + "repeat_count": 0.0, + "routers_loss": 0.010432626120746136, + "skip_count": 2.0, + "step": 5374, + "text_loss": 0.536924421787262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005309318876605042, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8668411.0, + "repeat_count": 0.0, + "routers_loss": 0.004450209904462099, + "skip_count": 1.0, + "step": 5376, + "text_loss": 0.2643466889858246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005306229580832933, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 8672088.0, + "repeat_count": 1.0, + "routers_loss": 0.011189920827746391, + "skip_count": 3.0, + "step": 5378, + "text_loss": 0.8259533047676086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.000530314016771307, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8675206.0, + "repeat_count": 0.0, + "routers_loss": 0.0020095291547477245, + "skip_count": 0.0, + "step": 5380, + "text_loss": 0.31364113092422485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.267684179630173, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0005300050638429324, + "loss": 0.0078, + "macro_f1": 0.3272727429866791, + "num_tokens": 8678289.0, + "repeat_count": 0.0, + "routers_loss": 0.010738557204604149, + "skip_count": 1.0, + "step": 5382, + "text_loss": 0.19013966619968414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0005296960994165607, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8681555.0, + "repeat_count": 0.0, + "routers_loss": 0.0018534278497099876, + "skip_count": 1.0, + "step": 5384, + "text_loss": 0.762248694896698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0005293871236105877, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 8684413.0, + "repeat_count": 0.0, + "routers_loss": 0.009143726900219917, + "skip_count": 2.0, + "step": 5386, + "text_loss": 0.19994212687015533 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 25.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005290781365434134, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8687450.0, + "repeat_count": 2.0, + "routers_loss": 0.002034468576312065, + "skip_count": 0.0, + "step": 5388, + "text_loss": 0.5519160628318787 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0005287691383334425, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8690651.0, + "repeat_count": 1.0, + "routers_loss": 0.006834167055785656, + "skip_count": 0.0, + "step": 5390, + "text_loss": 0.5439304709434509 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0005284601290990832, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8693929.0, + "repeat_count": 1.0, + "routers_loss": 0.0022327799815684557, + "skip_count": 0.0, + "step": 5392, + "text_loss": 0.24108269810676575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0005281511089587491, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 8696727.0, + "repeat_count": 0.0, + "routers_loss": 0.002669565612450242, + "skip_count": 0.0, + "step": 5394, + "text_loss": 0.8659077286720276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005278420780308568, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8700934.0, + "repeat_count": 0.0, + "routers_loss": 0.007252473384141922, + "skip_count": 0.0, + "step": 5396, + "text_loss": 0.5592793226242065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005275330364338276, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 8704449.0, + "repeat_count": 0.0, + "routers_loss": 0.001793015981093049, + "skip_count": 0.0, + "step": 5398, + "text_loss": 0.5211784243583679 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 25.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0005272239842860868, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 8707384.0, + "repeat_count": 5.0, + "routers_loss": 0.00963665172457695, + "skip_count": 4.0, + "step": 5400, + "text_loss": 0.6092788577079773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.36160845318462, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03515625, + "learning_rate": 0.0005269149217060642, + "loss": 0.0059, + "macro_f1": 0.5492662787437439, + "num_tokens": 8710453.0, + "repeat_count": 0.0, + "routers_loss": 0.01758105307817459, + "skip_count": 2.0, + "step": 5402, + "text_loss": 0.3423936069011688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0005266058488121926, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8713514.0, + "repeat_count": 0.0, + "routers_loss": 0.0025636721402406693, + "skip_count": 1.0, + "step": 5404, + "text_loss": 0.484171986579895 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.38039330789551, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0005262967657229095, + "loss": 0.0064, + "macro_f1": 0.9255813956260681, + "num_tokens": 8717051.0, + "repeat_count": 3.0, + "routers_loss": 0.022406045347452164, + "skip_count": 4.0, + "step": 5406, + "text_loss": 0.23368191719055176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005259876725566563, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8719987.0, + "repeat_count": 0.0, + "routers_loss": 0.004114408977329731, + "skip_count": 2.0, + "step": 5408, + "text_loss": 0.20237496495246887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.000525678569431878, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 8723258.0, + "repeat_count": 0.0, + "routers_loss": 0.006741158664226532, + "skip_count": 2.0, + "step": 5410, + "text_loss": 0.7969435453414917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.0005253694564670233, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 8726294.0, + "repeat_count": 0.0, + "routers_loss": 0.0034468702506273985, + "skip_count": 0.0, + "step": 5412, + "text_loss": 0.5533816814422607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000525060333780545, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8729603.0, + "repeat_count": 0.0, + "routers_loss": 0.01086533535271883, + "skip_count": 2.0, + "step": 5414, + "text_loss": 0.31856611371040344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005247512014908998, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 8733423.0, + "repeat_count": 0.0, + "routers_loss": 0.00512756546959281, + "skip_count": 6.0, + "step": 5416, + "text_loss": 0.6710903644561768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0005244420597165472, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 8736457.0, + "repeat_count": 0.0, + "routers_loss": 0.0026201079599559307, + "skip_count": 0.0, + "step": 5418, + "text_loss": 0.6469964981079102 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0005241329085759514, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 8739617.0, + "repeat_count": 0.0, + "routers_loss": 0.004130818881094456, + "skip_count": 0.0, + "step": 5420, + "text_loss": 0.4868837296962738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005238237481875795, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8742653.0, + "repeat_count": 0.0, + "routers_loss": 0.003171122632920742, + "skip_count": 0.0, + "step": 5422, + "text_loss": 0.12026242166757584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0005235145786699021, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 8745835.0, + "repeat_count": 0.0, + "routers_loss": 0.0008553664083592594, + "skip_count": 0.0, + "step": 5424, + "text_loss": 0.601640522480011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005232054001413941, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8749006.0, + "repeat_count": 0.0, + "routers_loss": 0.0006958908052183688, + "skip_count": 0.0, + "step": 5426, + "text_loss": 0.7083519101142883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0005228962127205329, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 8752493.0, + "repeat_count": 0.0, + "routers_loss": 0.0012221037177368999, + "skip_count": 1.0, + "step": 5428, + "text_loss": 0.3949109613895416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005225870165257997, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 8755294.0, + "repeat_count": 1.0, + "routers_loss": 0.003924673888832331, + "skip_count": 2.0, + "step": 5430, + "text_loss": 0.7487186789512634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005222778116756793, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8758043.0, + "repeat_count": 0.0, + "routers_loss": 0.002388258930295706, + "skip_count": 0.0, + "step": 5432, + "text_loss": 0.4092858135700226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0005219685982886594, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 8760618.0, + "repeat_count": 1.0, + "routers_loss": 0.0045886957086622715, + "skip_count": 0.0, + "step": 5434, + "text_loss": 0.5889580249786377 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0005216593764832311, + "loss": 0.0074, + "macro_f1": 1.0, + "num_tokens": 8764269.0, + "repeat_count": 1.0, + "routers_loss": 0.00704155582934618, + "skip_count": 2.0, + "step": 5436, + "text_loss": 0.2634117007255554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005213501463778889, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8767142.0, + "repeat_count": 0.0, + "routers_loss": 0.00368728069588542, + "skip_count": 2.0, + "step": 5438, + "text_loss": 0.3512301445007324 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0005210409080911304, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8770239.0, + "repeat_count": 0.0, + "routers_loss": 0.0012925115879625082, + "skip_count": 0.0, + "step": 5440, + "text_loss": 0.9330073595046997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005207316617414561, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8772927.0, + "repeat_count": 0.0, + "routers_loss": 0.005604506935924292, + "skip_count": 0.0, + "step": 5442, + "text_loss": 0.23477613925933838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.55884942764896, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0005204224074473701, + "loss": 0.0049, + "macro_f1": 0.6601307392120361, + "num_tokens": 8776451.0, + "repeat_count": 1.0, + "routers_loss": 0.010945434682071209, + "skip_count": 2.0, + "step": 5444, + "text_loss": 0.6184295415878296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0005201131453273789, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 8779481.0, + "repeat_count": 0.0, + "routers_loss": 0.0024414353538304567, + "skip_count": 0.0, + "step": 5446, + "text_loss": 0.16186967492103577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.57763428235985, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005198038754999926, + "loss": 0.0052, + "macro_f1": 0.3272727429866791, + "num_tokens": 8782425.0, + "repeat_count": 1.0, + "routers_loss": 0.013872416689991951, + "skip_count": 0.0, + "step": 5448, + "text_loss": 0.42294546961784363 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0005194945980837237, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8785466.0, + "repeat_count": 0.0, + "routers_loss": 0.0006147907115519047, + "skip_count": 0.0, + "step": 5450, + "text_loss": 0.6285432577133179 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005191853131970881, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8788461.0, + "repeat_count": 0.0, + "routers_loss": 0.0010585964191704988, + "skip_count": 0.0, + "step": 5452, + "text_loss": 0.6032317876815796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005188760209586044, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8791572.0, + "repeat_count": 0.0, + "routers_loss": 0.005267909727990627, + "skip_count": 1.0, + "step": 5454, + "text_loss": 0.3015609681606293 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005185667214867937, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8794697.0, + "repeat_count": 0.0, + "routers_loss": 0.000532392121385783, + "skip_count": 0.0, + "step": 5456, + "text_loss": 0.9596265554428101 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0005182574149001805, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8797880.0, + "repeat_count": 0.0, + "routers_loss": 0.0007176774088293314, + "skip_count": 0.0, + "step": 5458, + "text_loss": 0.5599364638328552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0005179481013172912, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8801995.0, + "repeat_count": 0.0, + "routers_loss": 0.0022756673861294985, + "skip_count": 0.0, + "step": 5460, + "text_loss": 0.47327280044555664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005176387808566558, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8805138.0, + "repeat_count": 0.0, + "routers_loss": 0.0025084633380174637, + "skip_count": 0.0, + "step": 5462, + "text_loss": 0.26674970984458923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0005173294536368061, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 8808102.0, + "repeat_count": 0.0, + "routers_loss": 0.0008814680040813982, + "skip_count": 0.0, + "step": 5464, + "text_loss": 0.5981299877166748 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005170201197762773, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8811431.0, + "repeat_count": 0.0, + "routers_loss": 0.0005443177651613951, + "skip_count": 0.0, + "step": 5466, + "text_loss": 1.037438988685608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0005167107793936065, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8814256.0, + "repeat_count": 0.0, + "routers_loss": 0.000494555220939219, + "skip_count": 0.0, + "step": 5468, + "text_loss": 0.5005733966827393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005164014326073333, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 8817024.0, + "repeat_count": 0.0, + "routers_loss": 0.004793747793883085, + "skip_count": 2.0, + "step": 5470, + "text_loss": 0.6999614834785461 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005160920795360002, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 8819892.0, + "repeat_count": 0.0, + "routers_loss": 0.0020966180600225925, + "skip_count": 0.0, + "step": 5472, + "text_loss": 0.5536707043647766 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0005157827202981521, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8822928.0, + "repeat_count": 0.0, + "routers_loss": 0.0020367507822811604, + "skip_count": 0.0, + "step": 5474, + "text_loss": 0.43655988574028015 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005154733550123356, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8825842.0, + "repeat_count": 0.0, + "routers_loss": 0.0020070383325219154, + "skip_count": 0.0, + "step": 5476, + "text_loss": 0.48149657249450684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0005151639837971004, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8829534.0, + "repeat_count": 0.0, + "routers_loss": 0.0016327418852597475, + "skip_count": 0.0, + "step": 5478, + "text_loss": 0.6693689227104187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000514854606770998, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8833177.0, + "repeat_count": 0.0, + "routers_loss": 0.0012691980227828026, + "skip_count": 0.0, + "step": 5480, + "text_loss": 0.44926801323890686 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005145452240525822, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8836933.0, + "repeat_count": 1.0, + "routers_loss": 0.0007724820752628148, + "skip_count": 0.0, + "step": 5482, + "text_loss": 0.5759884119033813 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005142358357604092, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 8840093.0, + "repeat_count": 1.0, + "routers_loss": 0.008331702090799809, + "skip_count": 7.0, + "step": 5484, + "text_loss": 0.47393685579299927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0005139264420130368, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8843918.0, + "repeat_count": 0.0, + "routers_loss": 0.003124477108940482, + "skip_count": 2.0, + "step": 5486, + "text_loss": 0.5298711061477661 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005136170429290259, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8846558.0, + "repeat_count": 0.0, + "routers_loss": 0.0034127775579690933, + "skip_count": 2.0, + "step": 5488, + "text_loss": 0.43582668900489807 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005133076386269383, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8849724.0, + "repeat_count": 1.0, + "routers_loss": 0.0018056259723380208, + "skip_count": 0.0, + "step": 5490, + "text_loss": 0.8116800785064697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.784267684179632, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0005129982292253384, + "loss": 0.0063, + "macro_f1": 0.6589147448539734, + "num_tokens": 8852447.0, + "repeat_count": 1.0, + "routers_loss": 0.021452350541949272, + "skip_count": 6.0, + "step": 5492, + "text_loss": 0.31878748536109924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0005126888148427927, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8855886.0, + "repeat_count": 0.0, + "routers_loss": 0.0026911941822618246, + "skip_count": 0.0, + "step": 5494, + "text_loss": 0.4021807909011841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.80305253889052, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.025634765625, + "learning_rate": 0.0005123793955978693, + "loss": 0.007, + "macro_f1": 0.5492662787437439, + "num_tokens": 8859378.0, + "repeat_count": 0.0, + "routers_loss": 0.019764510914683342, + "skip_count": 2.0, + "step": 5496, + "text_loss": 0.21608132123947144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0005120699716091379, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 8862310.0, + "repeat_count": 0.0, + "routers_loss": 0.0008988190093077719, + "skip_count": 0.0, + "step": 5498, + "text_loss": 0.34666743874549866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005117605429951707, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8865166.0, + "repeat_count": 0.0, + "routers_loss": 0.011137975379824638, + "skip_count": 2.0, + "step": 5500, + "text_loss": 0.25385144352912903 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 25.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005114511098745412, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8869923.0, + "repeat_count": 1.0, + "routers_loss": 0.006476947572082281, + "skip_count": 4.0, + "step": 5502, + "text_loss": 0.4503856301307678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.000511141672365825, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8872451.0, + "repeat_count": 0.0, + "routers_loss": 0.0022727579344063997, + "skip_count": 0.0, + "step": 5504, + "text_loss": 0.7522464990615845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005108322305875987, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8875968.0, + "repeat_count": 0.0, + "routers_loss": 0.0020014268811792135, + "skip_count": 0.0, + "step": 5506, + "text_loss": 0.30184176564216614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0005105227846584414, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8879705.0, + "repeat_count": 0.0, + "routers_loss": 0.001179999322630465, + "skip_count": 0.0, + "step": 5508, + "text_loss": 0.6187804937362671 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0005102133346969329, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8883535.0, + "repeat_count": 1.0, + "routers_loss": 0.002946492750197649, + "skip_count": 0.0, + "step": 5510, + "text_loss": 0.5961501002311707 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005099038808216555, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 8886683.0, + "repeat_count": 1.0, + "routers_loss": 0.004532935563474894, + "skip_count": 3.0, + "step": 5512, + "text_loss": 0.38462957739830017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0005095944231511922, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 8891049.0, + "repeat_count": 0.0, + "routers_loss": 0.00917842984199524, + "skip_count": 2.0, + "step": 5514, + "text_loss": 0.27541956305503845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0005092849618041279, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 8893604.0, + "repeat_count": 0.0, + "routers_loss": 0.0008756510796956718, + "skip_count": 0.0, + "step": 5516, + "text_loss": 0.681315541267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005089754968990487, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8898072.0, + "repeat_count": 0.0, + "routers_loss": 0.0008704439387656748, + "skip_count": 1.0, + "step": 5518, + "text_loss": 0.5060005187988281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005086660285545422, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8901539.0, + "repeat_count": 0.0, + "routers_loss": 0.004750201944261789, + "skip_count": 1.0, + "step": 5520, + "text_loss": 0.6008047461509705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.000508356556889197, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8904525.0, + "repeat_count": 0.0, + "routers_loss": 0.0026552649214863777, + "skip_count": 0.0, + "step": 5522, + "text_loss": 0.4539012908935547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005080470820216037, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8907624.0, + "repeat_count": 0.0, + "routers_loss": 0.002621029270812869, + "skip_count": 1.0, + "step": 5524, + "text_loss": 0.20088370144367218 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 25.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005077376040703533, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 8910515.0, + "repeat_count": 3.0, + "routers_loss": 0.0028921898920089006, + "skip_count": 0.0, + "step": 5526, + "text_loss": 0.6575983166694641 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8888888955116272, + "avg_layers": 21.0, + "epoch": 25.953331376577633, + "f1_execute": 0.9729729890823364, + "f1_repeat": 1.0, + "f1_skip": 0.9411765336990356, + "grad_norm": 0.02734375, + "learning_rate": 0.0005074281231540384, + "loss": 0.0076, + "macro_f1": 0.9713832139968872, + "num_tokens": 8914419.0, + "repeat_count": 1.0, + "routers_loss": 0.024232301861047745, + "skip_count": 9.0, + "step": 5528, + "text_loss": 0.5435594916343689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005071186393912527, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8917543.0, + "repeat_count": 0.0, + "routers_loss": 0.003731841454282403, + "skip_count": 2.0, + "step": 5530, + "text_loss": 0.5152071118354797 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005068091529005909, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 8920728.0, + "repeat_count": 1.0, + "routers_loss": 0.005905418191105127, + "skip_count": 0.0, + "step": 5532, + "text_loss": 0.29741042852401733 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000506499663800649, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 8924112.0, + "repeat_count": 1.0, + "routers_loss": 0.0021933517418801785, + "skip_count": 0.0, + "step": 5534, + "text_loss": 0.45704230666160583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 25.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005061901722100235, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 8927323.0, + "repeat_count": 0.0, + "routers_loss": 0.009227502159774303, + "skip_count": 4.0, + "step": 5536, + "text_loss": 0.1968434453010559 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0005058806782473125, + "loss": 0.0053, + "macro_f1": 0.6601307392120361, + "num_tokens": 8931052.0, + "repeat_count": 1.0, + "routers_loss": 0.02054760232567787, + "skip_count": 2.0, + "step": 5538, + "text_loss": 0.23851273953914642 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0005055711820311144, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8934215.0, + "repeat_count": 0.0, + "routers_loss": 0.0008434011251665652, + "skip_count": 0.0, + "step": 5540, + "text_loss": 0.85942542552948 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005052616836800288, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8937173.0, + "repeat_count": 0.0, + "routers_loss": 0.011105241253972054, + "skip_count": 4.0, + "step": 5542, + "text_loss": 0.2614556849002838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005049521833126561, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8940553.0, + "repeat_count": 0.0, + "routers_loss": 0.0006273435428738594, + "skip_count": 0.0, + "step": 5544, + "text_loss": 0.6430498957633972 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005046426810475976, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8943753.0, + "repeat_count": 0.0, + "routers_loss": 0.0023464353289455175, + "skip_count": 1.0, + "step": 5546, + "text_loss": 0.7015808820724487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0005043331770034547, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 8947149.0, + "repeat_count": 0.0, + "routers_loss": 0.0016024730866774917, + "skip_count": 1.0, + "step": 5548, + "text_loss": 0.5875257253646851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005040236712988304, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8950374.0, + "repeat_count": 0.0, + "routers_loss": 0.004096277989447117, + "skip_count": 0.0, + "step": 5550, + "text_loss": 0.1712338626384735 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0005037141640523275, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8953256.0, + "repeat_count": 1.0, + "routers_loss": 0.00441550649702549, + "skip_count": 0.0, + "step": 5552, + "text_loss": 0.16560404002666473 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005034046553825501, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 8956845.0, + "repeat_count": 4.0, + "routers_loss": 0.011712636798620224, + "skip_count": 6.0, + "step": 5554, + "text_loss": 0.24278216063976288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005030951454081023, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8961165.0, + "repeat_count": 0.0, + "routers_loss": 0.00235542468726635, + "skip_count": 1.0, + "step": 5556, + "text_loss": 0.17214511334896088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.093924273554446, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0005027856342475888, + "loss": 0.0037, + "macro_f1": 0.3272727429866791, + "num_tokens": 8965262.0, + "repeat_count": 0.0, + "routers_loss": 0.0160827673971653, + "skip_count": 1.0, + "step": 5558, + "text_loss": 0.40229740738868713 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0005024761220196151, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 8968278.0, + "repeat_count": 1.0, + "routers_loss": 0.004786997567862272, + "skip_count": 0.0, + "step": 5560, + "text_loss": 0.24828575551509857 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0005021666088427868, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8971443.0, + "repeat_count": 1.0, + "routers_loss": 0.0015378865646198392, + "skip_count": 0.0, + "step": 5562, + "text_loss": 0.7269657254219055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.0005018570948357099, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8975312.0, + "repeat_count": 0.0, + "routers_loss": 0.0015218508196994662, + "skip_count": 0.0, + "step": 5564, + "text_loss": 0.5198811292648315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0005015475801169908, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 8977951.0, + "repeat_count": 0.0, + "routers_loss": 0.008865317329764366, + "skip_count": 1.0, + "step": 5566, + "text_loss": 0.1541406810283661 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005012380648052359, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8981325.0, + "repeat_count": 1.0, + "routers_loss": 0.0055318837985396385, + "skip_count": 0.0, + "step": 5568, + "text_loss": 0.510314404964447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005009285490190523, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8984661.0, + "repeat_count": 0.0, + "routers_loss": 0.0035060355439782143, + "skip_count": 0.0, + "step": 5570, + "text_loss": 0.29421761631965637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000500619032877047, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8987573.0, + "repeat_count": 0.0, + "routers_loss": 0.0050126477144658566, + "skip_count": 2.0, + "step": 5572, + "text_loss": 0.1984361708164215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005003095164978271, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 8991136.0, + "repeat_count": 0.0, + "routers_loss": 0.0019407360814511776, + "skip_count": 0.0, + "step": 5574, + "text_loss": 0.42751404643058777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8994198.0, + "repeat_count": 0.0, + "routers_loss": 0.0029819176997989416, + "skip_count": 2.0, + "step": 5576, + "text_loss": 0.20589640736579895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004996904835021729, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 8997907.0, + "repeat_count": 0.0, + "routers_loss": 0.000878945691511035, + "skip_count": 1.0, + "step": 5578, + "text_loss": 0.2801406979560852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000499380967122953, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9001141.0, + "repeat_count": 0.0, + "routers_loss": 0.005223734769970179, + "skip_count": 1.0, + "step": 5580, + "text_loss": 0.20542480051517487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004990714509809478, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9004794.0, + "repeat_count": 0.0, + "routers_loss": 0.0015868612099438906, + "skip_count": 0.0, + "step": 5582, + "text_loss": 0.32094934582710266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 26.216025829175226, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004987619351947643, + "loss": 0.0064, + "macro_f1": 0.6122449040412903, + "num_tokens": 9009250.0, + "repeat_count": 0.0, + "routers_loss": 0.031923454254865646, + "skip_count": 4.0, + "step": 5584, + "text_loss": 0.609201967716217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0004984524198830095, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9013254.0, + "repeat_count": 0.0, + "routers_loss": 0.0033124545589089394, + "skip_count": 0.0, + "step": 5586, + "text_loss": 0.3698650300502777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0004981429051642903, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9016598.0, + "repeat_count": 0.0, + "routers_loss": 0.0017190382350236177, + "skip_count": 1.0, + "step": 5588, + "text_loss": 0.5306026935577393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.24420311124156, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004978333911572132, + "loss": 0.0059, + "macro_f1": 0.3272727429866791, + "num_tokens": 9019558.0, + "repeat_count": 0.0, + "routers_loss": 0.02051064372062683, + "skip_count": 1.0, + "step": 5590, + "text_loss": 0.23494470119476318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0004975238779803849, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 9023024.0, + "repeat_count": 0.0, + "routers_loss": 0.0010489600244909525, + "skip_count": 0.0, + "step": 5592, + "text_loss": 0.579275906085968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0004972143657524112, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9026161.0, + "repeat_count": 0.0, + "routers_loss": 0.0012039231369271874, + "skip_count": 0.0, + "step": 5594, + "text_loss": 0.5776295065879822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0004969048545918978, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9028814.0, + "repeat_count": 0.0, + "routers_loss": 0.0010212450288236141, + "skip_count": 1.0, + "step": 5596, + "text_loss": 0.6816855669021606 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 26.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00049659534461745, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9032243.0, + "repeat_count": 2.0, + "routers_loss": 0.0024297661148011684, + "skip_count": 0.0, + "step": 5598, + "text_loss": 0.743188202381134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0004962858359476726, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 9035493.0, + "repeat_count": 0.0, + "routers_loss": 0.002151754219084978, + "skip_count": 0.0, + "step": 5600, + "text_loss": 0.5213983654975891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004959763287011698, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 9038213.0, + "repeat_count": 0.0, + "routers_loss": 0.0028108188416808844, + "skip_count": 2.0, + "step": 5602, + "text_loss": 0.5128397345542908 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004956668229965454, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9041152.0, + "repeat_count": 0.0, + "routers_loss": 0.004022551700472832, + "skip_count": 2.0, + "step": 5604, + "text_loss": 0.15361636877059937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0004953573189524026, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9044503.0, + "repeat_count": 0.0, + "routers_loss": 0.0010689410846680403, + "skip_count": 1.0, + "step": 5606, + "text_loss": 0.6454885005950928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0004950478166873439, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 9047742.0, + "repeat_count": 0.0, + "routers_loss": 0.0025760293938219547, + "skip_count": 0.0, + "step": 5608, + "text_loss": 0.7654000520706177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0004947383163199713, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 9050349.0, + "repeat_count": 0.0, + "routers_loss": 0.0009846165776252747, + "skip_count": 0.0, + "step": 5610, + "text_loss": 0.41533342003822327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0004944288179688858, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 9053667.0, + "repeat_count": 0.0, + "routers_loss": 0.0017193946987390518, + "skip_count": 1.0, + "step": 5612, + "text_loss": 1.0172475576400757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004941193217526875, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9056777.0, + "repeat_count": 0.0, + "routers_loss": 0.0026750199031084776, + "skip_count": 0.0, + "step": 5614, + "text_loss": 0.17584927380084991 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 26.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004938098277899765, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 9060609.0, + "repeat_count": 1.0, + "routers_loss": 0.005259076599031687, + "skip_count": 1.0, + "step": 5616, + "text_loss": 0.5522297024726868 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004935003361993511, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9063633.0, + "repeat_count": 0.0, + "routers_loss": 0.0006837095716036856, + "skip_count": 0.0, + "step": 5618, + "text_loss": 0.5212588310241699 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.38508952157323, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0004931908470994091, + "loss": 0.0059, + "macro_f1": 0.6603773832321167, + "num_tokens": 9067777.0, + "repeat_count": 1.0, + "routers_loss": 0.01067375484853983, + "skip_count": 1.0, + "step": 5620, + "text_loss": 0.5515062808990479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 26.394481948928675, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.019775390625, + "learning_rate": 0.0004928813606087474, + "loss": 0.0043, + "macro_f1": 0.5934640765190125, + "num_tokens": 9070938.0, + "repeat_count": 0.0, + "routers_loss": 0.016635602340102196, + "skip_count": 3.0, + "step": 5622, + "text_loss": 0.3225076198577881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004925718768459617, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9074050.0, + "repeat_count": 0.0, + "routers_loss": 0.002216119086369872, + "skip_count": 0.0, + "step": 5624, + "text_loss": 0.32438889145851135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0004922623959296469, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 9076785.0, + "repeat_count": 1.0, + "routers_loss": 0.012125075794756413, + "skip_count": 5.0, + "step": 5626, + "text_loss": 0.39563658833503723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0004919529179783965, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9080239.0, + "repeat_count": 0.0, + "routers_loss": 0.0026486809365451336, + "skip_count": 0.0, + "step": 5628, + "text_loss": 0.5401569604873657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0004916434431108031, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9083935.0, + "repeat_count": 0.0, + "routers_loss": 0.0011849761940538883, + "skip_count": 0.0, + "step": 5630, + "text_loss": 0.4798774719238281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.000491333971445458, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9087174.0, + "repeat_count": 0.0, + "routers_loss": 0.002799210138618946, + "skip_count": 0.0, + "step": 5632, + "text_loss": 0.22488386929035187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004910245031009515, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 9089803.0, + "repeat_count": 0.0, + "routers_loss": 0.00139117450453341, + "skip_count": 0.0, + "step": 5634, + "text_loss": 0.6237335205078125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0004907150381958723, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9093075.0, + "repeat_count": 0.0, + "routers_loss": 0.006503603886812925, + "skip_count": 1.0, + "step": 5636, + "text_loss": 0.18781614303588867 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0004904055768488077, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9096355.0, + "repeat_count": 0.0, + "routers_loss": 0.0009764843271113932, + "skip_count": 0.0, + "step": 5638, + "text_loss": 0.6821450591087341 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004900961191783445, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9098994.0, + "repeat_count": 1.0, + "routers_loss": 0.00693159457296133, + "skip_count": 3.0, + "step": 5640, + "text_loss": 0.214790940284729 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0004897866653030671, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9102048.0, + "repeat_count": 0.0, + "routers_loss": 0.002469591563567519, + "skip_count": 0.0, + "step": 5642, + "text_loss": 0.1556607335805893 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004894772153415588, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9105379.0, + "repeat_count": 0.0, + "routers_loss": 0.0004824921488761902, + "skip_count": 0.0, + "step": 5644, + "text_loss": 0.499972403049469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004891677694124013, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9108240.0, + "repeat_count": 0.0, + "routers_loss": 0.0029356612358242273, + "skip_count": 1.0, + "step": 5646, + "text_loss": 0.5169754028320312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0174560546875, + "learning_rate": 0.0004888583276341751, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 9111381.0, + "repeat_count": 0.0, + "routers_loss": 0.009489183314144611, + "skip_count": 1.0, + "step": 5648, + "text_loss": 0.23630797863006592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.017822265625, + "learning_rate": 0.0004885488901254588, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9114015.0, + "repeat_count": 0.0, + "routers_loss": 0.004154495894908905, + "skip_count": 1.0, + "step": 5650, + "text_loss": 0.3345947563648224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0004882394570048294, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9117044.0, + "repeat_count": 0.0, + "routers_loss": 0.0018865863094106317, + "skip_count": 0.0, + "step": 5652, + "text_loss": 0.32814112305641174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0004879300283908623, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9120035.0, + "repeat_count": 0.0, + "routers_loss": 0.0035278978757560253, + "skip_count": 1.0, + "step": 5654, + "text_loss": 0.4081386625766754 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00048762060440213096, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 9122955.0, + "repeat_count": 1.0, + "routers_loss": 0.0053498269990086555, + "skip_count": 0.0, + "step": 5656, + "text_loss": 0.31027838587760925 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004873111851572075, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9125635.0, + "repeat_count": 0.0, + "routers_loss": 0.004556098487228155, + "skip_count": 0.0, + "step": 5658, + "text_loss": 0.25703540444374084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004870017707746617, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 9128906.0, + "repeat_count": 0.0, + "routers_loss": 0.0031165245454758406, + "skip_count": 2.0, + "step": 5660, + "text_loss": 0.20663656294345856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004866923613730617, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 9132030.0, + "repeat_count": 1.0, + "routers_loss": 0.004887583665549755, + "skip_count": 2.0, + "step": 5662, + "text_loss": 0.6062649488449097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004863829570709741, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 9135274.0, + "repeat_count": 0.0, + "routers_loss": 0.0021857863757759333, + "skip_count": 0.0, + "step": 5664, + "text_loss": 0.49644309282302856 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.601115350748458, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0004860735579869631, + "loss": 0.0088, + "macro_f1": 0.925203263759613, + "num_tokens": 9139735.0, + "repeat_count": 3.0, + "routers_loss": 0.05413912236690521, + "skip_count": 5.0, + "step": 5666, + "text_loss": 0.25161290168762207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00048576416423959097, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9142419.0, + "repeat_count": 0.0, + "routers_loss": 0.002229376696050167, + "skip_count": 0.0, + "step": 5668, + "text_loss": 0.5332949161529541 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0004854547759474179, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 9145443.0, + "repeat_count": 1.0, + "routers_loss": 0.005968933925032616, + "skip_count": 4.0, + "step": 5670, + "text_loss": 0.5282154083251953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.629292632814792, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0004851453932290021, + "loss": 0.0085, + "macro_f1": 0.3272727429866791, + "num_tokens": 9147754.0, + "repeat_count": 0.0, + "routers_loss": 0.04015754163265228, + "skip_count": 1.0, + "step": 5672, + "text_loss": 0.8564629554748535 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.63868506017024, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00048483601620289974, + "loss": 0.0058, + "macro_f1": 0.8820862174034119, + "num_tokens": 9151714.0, + "repeat_count": 2.0, + "routers_loss": 0.019172413274645805, + "skip_count": 2.0, + "step": 5674, + "text_loss": 0.4149441123008728 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0004845266449876645, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9154524.0, + "repeat_count": 1.0, + "routers_loss": 0.005025535821914673, + "skip_count": 0.0, + "step": 5676, + "text_loss": 0.26525792479515076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.000484217279701848, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9158546.0, + "repeat_count": 0.0, + "routers_loss": 0.0012200147612020373, + "skip_count": 0.0, + "step": 5678, + "text_loss": 0.5532271862030029 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0004839079204639998, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9161003.0, + "repeat_count": 0.0, + "routers_loss": 0.0013485675444826484, + "skip_count": 1.0, + "step": 5680, + "text_loss": 0.36826151609420776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0004835985673926668, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9164741.0, + "repeat_count": 0.0, + "routers_loss": 0.00532014574855566, + "skip_count": 2.0, + "step": 5682, + "text_loss": 0.16154609620571136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0004832892206063938, + "loss": 0.0075, + "macro_f1": 1.0, + "num_tokens": 9168079.0, + "repeat_count": 2.0, + "routers_loss": 0.007782323285937309, + "skip_count": 3.0, + "step": 5684, + "text_loss": 0.4323575496673584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.0004829798802237228, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9171352.0, + "repeat_count": 0.0, + "routers_loss": 0.0024159469176083803, + "skip_count": 2.0, + "step": 5686, + "text_loss": 0.3163119852542877 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.000482670546363194, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 9175197.0, + "repeat_count": 0.0, + "routers_loss": 0.002455134643241763, + "skip_count": 0.0, + "step": 5688, + "text_loss": 0.59735506772995 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.713824479013795, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0004823612191433443, + "loss": 0.0042, + "macro_f1": 0.8820862174034119, + "num_tokens": 9177648.0, + "repeat_count": 2.0, + "routers_loss": 0.015524548478424549, + "skip_count": 2.0, + "step": 5690, + "text_loss": 0.759812593460083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00048205189868270887, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 9180694.0, + "repeat_count": 0.0, + "routers_loss": 0.002112736226990819, + "skip_count": 2.0, + "step": 5692, + "text_loss": 0.3516882061958313 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 26.732609333724685, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.025146484375, + "learning_rate": 0.00048174258509981973, + "loss": 0.0063, + "macro_f1": 0.9262410998344421, + "num_tokens": 9183502.0, + "repeat_count": 2.0, + "routers_loss": 0.03100527822971344, + "skip_count": 3.0, + "step": 5694, + "text_loss": 0.3722715973854065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004814332785132064, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9186417.0, + "repeat_count": 0.0, + "routers_loss": 0.009176591411232948, + "skip_count": 2.0, + "step": 5696, + "text_loss": 0.33363673090934753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.751394188435572, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004811239790413958, + "loss": 0.0076, + "macro_f1": 0.3272727429866791, + "num_tokens": 9189478.0, + "repeat_count": 0.0, + "routers_loss": 0.023586507886648178, + "skip_count": 1.0, + "step": 5698, + "text_loss": 0.19698107242584229 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00048081468680291194, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9192115.0, + "repeat_count": 0.0, + "routers_loss": 0.005083440337330103, + "skip_count": 1.0, + "step": 5700, + "text_loss": 0.3476336896419525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004805054019162764, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9195176.0, + "repeat_count": 0.0, + "routers_loss": 0.007766073569655418, + "skip_count": 1.0, + "step": 5702, + "text_loss": 0.27114811539649963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0004801961245000076, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9199091.0, + "repeat_count": 0.0, + "routers_loss": 0.0009058842551894486, + "skip_count": 0.0, + "step": 5704, + "text_loss": 0.6249846816062927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0004798868546726212, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9202003.0, + "repeat_count": 0.0, + "routers_loss": 0.005479823332279921, + "skip_count": 0.0, + "step": 5706, + "text_loss": 0.47223609685897827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0166015625, + "learning_rate": 0.00047957759255263014, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9205277.0, + "repeat_count": 0.0, + "routers_loss": 0.001055705244652927, + "skip_count": 0.0, + "step": 5708, + "text_loss": 0.677215576171875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00047926833825854377, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9208844.0, + "repeat_count": 0.0, + "routers_loss": 0.003291431115940213, + "skip_count": 2.0, + "step": 5710, + "text_loss": 0.12439999729394913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0004789590919088696, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 9211619.0, + "repeat_count": 0.0, + "routers_loss": 0.005120242480188608, + "skip_count": 2.0, + "step": 5712, + "text_loss": 0.5771954655647278 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0004786498536221111, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 9214914.0, + "repeat_count": 1.0, + "routers_loss": 0.004877795465290546, + "skip_count": 2.0, + "step": 5714, + "text_loss": 0.6432198882102966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.00047834062351676893, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 9218186.0, + "repeat_count": 0.0, + "routers_loss": 0.0026507999282330275, + "skip_count": 0.0, + "step": 5716, + "text_loss": 0.23814935982227325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00047803140171134075, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9221754.0, + "repeat_count": 0.0, + "routers_loss": 0.002605629386380315, + "skip_count": 1.0, + "step": 5718, + "text_loss": 0.2910388708114624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0004777221883243208, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9224502.0, + "repeat_count": 0.0, + "routers_loss": 0.0048494706861674786, + "skip_count": 3.0, + "step": 5720, + "text_loss": 0.6195104122161865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004774129834742004, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 9227350.0, + "repeat_count": 0.0, + "routers_loss": 0.003092368133366108, + "skip_count": 0.0, + "step": 5722, + "text_loss": 0.35447990894317627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00047710378727946725, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 9230166.0, + "repeat_count": 0.0, + "routers_loss": 0.012780336663126945, + "skip_count": 2.0, + "step": 5724, + "text_loss": 0.27581867575645447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00047679459985860604, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9233029.0, + "repeat_count": 0.0, + "routers_loss": 0.005429140292108059, + "skip_count": 1.0, + "step": 5726, + "text_loss": 0.2636827826499939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00047648542133009794, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9236317.0, + "repeat_count": 0.0, + "routers_loss": 0.0023909916635602713, + "skip_count": 0.0, + "step": 5728, + "text_loss": 0.4801979064941406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00047617625181242077, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9239796.0, + "repeat_count": 0.0, + "routers_loss": 0.003603481687605381, + "skip_count": 0.0, + "step": 5730, + "text_loss": 0.8374754786491394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0004758670914240488, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9243489.0, + "repeat_count": 0.0, + "routers_loss": 0.004478964954614639, + "skip_count": 2.0, + "step": 5732, + "text_loss": 0.3870154917240143 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000475557940283453, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9246758.0, + "repeat_count": 0.0, + "routers_loss": 0.00312575395219028, + "skip_count": 1.0, + "step": 5734, + "text_loss": 0.42341071367263794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00047524879850910026, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 9250053.0, + "repeat_count": 0.0, + "routers_loss": 0.010855631902813911, + "skip_count": 4.0, + "step": 5736, + "text_loss": 0.25729796290397644 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0004749396662194549, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 9253691.0, + "repeat_count": 0.0, + "routers_loss": 0.0009250419097952545, + "skip_count": 0.0, + "step": 5738, + "text_loss": 0.6151770949363708 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0004746305435329767, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 9256866.0, + "repeat_count": 1.0, + "routers_loss": 0.007521102204918861, + "skip_count": 3.0, + "step": 5740, + "text_loss": 0.3094986379146576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004743214305681221, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9259790.0, + "repeat_count": 0.0, + "routers_loss": 0.0022241887636482716, + "skip_count": 1.0, + "step": 5742, + "text_loss": 0.5418204069137573 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00047401232744334376, + "loss": 0.0071, + "macro_f1": 1.0, + "num_tokens": 9263205.0, + "repeat_count": 1.0, + "routers_loss": 0.008611299097537994, + "skip_count": 2.0, + "step": 5744, + "text_loss": 0.35824623703956604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 26.976812444966246, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004737032342770906, + "loss": 0.0062, + "macro_f1": 0.5492662787437439, + "num_tokens": 9266126.0, + "repeat_count": 0.0, + "routers_loss": 0.010788857005536556, + "skip_count": 2.0, + "step": 5746, + "text_loss": 0.2172674983739853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0004733941511878074, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9269308.0, + "repeat_count": 0.0, + "routers_loss": 0.005309196189045906, + "skip_count": 2.0, + "step": 5748, + "text_loss": 0.1696814000606537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.00047308507829393594, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 9272801.0, + "repeat_count": 0.0, + "routers_loss": 0.009940510615706444, + "skip_count": 2.0, + "step": 5750, + "text_loss": 0.24295592308044434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00047277601571391314, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9276197.0, + "repeat_count": 0.0, + "routers_loss": 0.000687236781232059, + "skip_count": 0.0, + "step": 5752, + "text_loss": 0.8511804342269897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.014088641033165, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00047246696356617254, + "loss": 0.0059, + "macro_f1": 0.6603773832321167, + "num_tokens": 9278965.0, + "repeat_count": 1.0, + "routers_loss": 0.009816894307732582, + "skip_count": 1.0, + "step": 5754, + "text_loss": 0.45420053601264954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0004721579219691434, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9282076.0, + "repeat_count": 0.0, + "routers_loss": 0.0015747188590466976, + "skip_count": 0.0, + "step": 5756, + "text_loss": 0.21671754121780396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0004718488910412511, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9285465.0, + "repeat_count": 0.0, + "routers_loss": 0.008654040284454823, + "skip_count": 2.0, + "step": 5758, + "text_loss": 0.25920194387435913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00047153987090091674, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9288156.0, + "repeat_count": 0.0, + "routers_loss": 0.0011430777376517653, + "skip_count": 0.0, + "step": 5760, + "text_loss": 0.7655444741249084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004712308616665576, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 9291529.0, + "repeat_count": 0.0, + "routers_loss": 0.003674200503155589, + "skip_count": 2.0, + "step": 5762, + "text_loss": 0.269486665725708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0004709218634565866, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9294699.0, + "repeat_count": 0.0, + "routers_loss": 0.003249827306717634, + "skip_count": 1.0, + "step": 5764, + "text_loss": 0.5073734521865845 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00047061287638941235, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 9297863.0, + "repeat_count": 1.0, + "routers_loss": 0.002763139782473445, + "skip_count": 2.0, + "step": 5766, + "text_loss": 0.2572014033794403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00047030390058343935, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9301124.0, + "repeat_count": 0.0, + "routers_loss": 0.007100266870111227, + "skip_count": 3.0, + "step": 5768, + "text_loss": 0.4147387742996216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0004699949361570676, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 9304330.0, + "repeat_count": 0.0, + "routers_loss": 0.005467240232974291, + "skip_count": 1.0, + "step": 5770, + "text_loss": 0.21510964632034302 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.000469685983228693, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9306882.0, + "repeat_count": 0.0, + "routers_loss": 0.003167890477925539, + "skip_count": 0.0, + "step": 5772, + "text_loss": 0.45717427134513855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.108012914587615, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00046937704191670675, + "loss": 0.0057, + "macro_f1": 0.6601307392120361, + "num_tokens": 9309767.0, + "repeat_count": 1.0, + "routers_loss": 0.014881107024848461, + "skip_count": 2.0, + "step": 5774, + "text_loss": 0.3464985191822052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004690681123394959, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9313045.0, + "repeat_count": 0.0, + "routers_loss": 0.00379011663608253, + "skip_count": 2.0, + "step": 5776, + "text_loss": 0.33194616436958313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00046875919461544265, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 9315736.0, + "repeat_count": 0.0, + "routers_loss": 0.0016733441734686494, + "skip_count": 0.0, + "step": 5778, + "text_loss": 0.5009998679161072 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00046845028886292493, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9318456.0, + "repeat_count": 0.0, + "routers_loss": 0.005318894516676664, + "skip_count": 1.0, + "step": 5780, + "text_loss": 0.17702752351760864 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.145582624009393, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.044921875, + "learning_rate": 0.00046814139520031615, + "loss": 0.006, + "macro_f1": 0.8820862174034119, + "num_tokens": 9323152.0, + "repeat_count": 2.0, + "routers_loss": 0.01133672520518303, + "skip_count": 2.0, + "step": 5782, + "text_loss": 0.2886650860309601 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0004678325137459845, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9326318.0, + "repeat_count": 0.0, + "routers_loss": 0.002458433620631695, + "skip_count": 0.0, + "step": 5784, + "text_loss": 0.5832745432853699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0004675236446182946, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9329779.0, + "repeat_count": 0.0, + "routers_loss": 0.0005402310052886605, + "skip_count": 0.0, + "step": 5786, + "text_loss": 0.5699237585067749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00046721478793560525, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 9333360.0, + "repeat_count": 0.0, + "routers_loss": 0.0002638917067088187, + "skip_count": 0.0, + "step": 5788, + "text_loss": 0.6555714011192322 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00046690594381627106, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9336498.0, + "repeat_count": 0.0, + "routers_loss": 0.003998351749032736, + "skip_count": 2.0, + "step": 5790, + "text_loss": 0.2076750248670578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00046659711237864157, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9339724.0, + "repeat_count": 0.0, + "routers_loss": 0.0045847659930586815, + "skip_count": 1.0, + "step": 5792, + "text_loss": 0.22027169167995453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.00046628829374106167, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 9342835.0, + "repeat_count": 0.0, + "routers_loss": 0.0014064523857086897, + "skip_count": 1.0, + "step": 5794, + "text_loss": 0.5120179057121277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004659794880218712, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9346757.0, + "repeat_count": 0.0, + "routers_loss": 0.0011155207175761461, + "skip_count": 1.0, + "step": 5796, + "text_loss": 0.6415372490882874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004656706953394051, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 9349652.0, + "repeat_count": 0.0, + "routers_loss": 0.0020385095849633217, + "skip_count": 0.0, + "step": 5798, + "text_loss": 0.5410398840904236 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0004653619158119933, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 9354286.0, + "repeat_count": 1.0, + "routers_loss": 0.0012847178149968386, + "skip_count": 0.0, + "step": 5800, + "text_loss": 0.4386860728263855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.00046505314955796074, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9357682.0, + "repeat_count": 0.0, + "routers_loss": 0.0035008061677217484, + "skip_count": 2.0, + "step": 5802, + "text_loss": 0.13655950129032135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00046474439669562715, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9361058.0, + "repeat_count": 0.0, + "routers_loss": 0.0020033426117151976, + "skip_count": 1.0, + "step": 5804, + "text_loss": 0.6293444037437439 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00046443565734330714, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9364173.0, + "repeat_count": 0.0, + "routers_loss": 0.0004935986362397671, + "skip_count": 0.0, + "step": 5806, + "text_loss": 0.2923166751861572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004641269316193104, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9366980.0, + "repeat_count": 0.0, + "routers_loss": 0.001654456602409482, + "skip_count": 0.0, + "step": 5808, + "text_loss": 0.7273373007774353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0004638182196419411, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 9370581.0, + "repeat_count": 0.0, + "routers_loss": 0.0017011919990181923, + "skip_count": 0.0, + "step": 5810, + "text_loss": 0.6029995083808899 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 27.286469034341064, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.038330078125, + "learning_rate": 0.0004635095215294984, + "loss": 0.0072, + "macro_f1": 0.9265305995941162, + "num_tokens": 9374233.0, + "repeat_count": 1.0, + "routers_loss": 0.01361197978258133, + "skip_count": 3.0, + "step": 5812, + "text_loss": 0.14051523804664612 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00046320083740027584, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 9377217.0, + "repeat_count": 0.0, + "routers_loss": 0.004597014281898737, + "skip_count": 0.0, + "step": 5814, + "text_loss": 0.2766880691051483 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 27.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00046289216737256184, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 9380336.0, + "repeat_count": 3.0, + "routers_loss": 0.006628422066569328, + "skip_count": 1.0, + "step": 5816, + "text_loss": 0.8092381954193115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0004625835115646393, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9382968.0, + "repeat_count": 0.0, + "routers_loss": 0.002737772185355425, + "skip_count": 0.0, + "step": 5818, + "text_loss": 0.22090643644332886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 27.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0004622748700947856, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 9386203.0, + "repeat_count": 1.0, + "routers_loss": 0.004552177153527737, + "skip_count": 1.0, + "step": 5820, + "text_loss": 0.42869850993156433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0004619662430812729, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9388968.0, + "repeat_count": 0.0, + "routers_loss": 0.003149240743368864, + "skip_count": 2.0, + "step": 5822, + "text_loss": 0.45137661695480347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0004616576306423677, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 9392487.0, + "repeat_count": 0.0, + "routers_loss": 0.0008133690571412444, + "skip_count": 0.0, + "step": 5824, + "text_loss": 0.638685941696167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0004613490328963307, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 9395665.0, + "repeat_count": 0.0, + "routers_loss": 0.00042717234464362264, + "skip_count": 0.0, + "step": 5826, + "text_loss": 0.8134317398071289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00046104044996141716, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9398831.0, + "repeat_count": 0.0, + "routers_loss": 0.0084775285795331, + "skip_count": 2.0, + "step": 5828, + "text_loss": 0.19263958930969238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0004607318819558768, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 9403118.0, + "repeat_count": 1.0, + "routers_loss": 0.0030239911284297705, + "skip_count": 0.0, + "step": 5830, + "text_loss": 0.45556432008743286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 27.38039330789551, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0458984375, + "learning_rate": 0.00046042332899795313, + "loss": 0.0075, + "macro_f1": 0.5492662787437439, + "num_tokens": 9406206.0, + "repeat_count": 0.0, + "routers_loss": 0.026389889419078827, + "skip_count": 2.0, + "step": 5832, + "text_loss": 0.26458361744880676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0004601147912058845, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 9409806.0, + "repeat_count": 0.0, + "routers_loss": 0.0013476534513756633, + "skip_count": 0.0, + "step": 5834, + "text_loss": 0.7443689107894897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004598062686979033, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9412737.0, + "repeat_count": 0.0, + "routers_loss": 0.004275512881577015, + "skip_count": 1.0, + "step": 5836, + "text_loss": 0.2808683514595032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00045949776159223563, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9415818.0, + "repeat_count": 0.0, + "routers_loss": 0.0027225434314459562, + "skip_count": 0.0, + "step": 5838, + "text_loss": 0.6283587217330933 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0004591892700071022, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 9419119.0, + "repeat_count": 1.0, + "routers_loss": 0.01574302278459072, + "skip_count": 2.0, + "step": 5840, + "text_loss": 0.33239027857780457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00045888079406071746, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 9422257.0, + "repeat_count": 0.0, + "routers_loss": 0.0007227854221127927, + "skip_count": 0.0, + "step": 5842, + "text_loss": 0.6658740043640137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00045857233387129, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9425071.0, + "repeat_count": 0.0, + "routers_loss": 0.0020696306601166725, + "skip_count": 2.0, + "step": 5844, + "text_loss": 0.5773820877075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0004582638895570224, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9427980.0, + "repeat_count": 0.0, + "routers_loss": 0.0019764541648328304, + "skip_count": 0.0, + "step": 5846, + "text_loss": 0.3388919532299042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.455532726739065, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.000457955461236111, + "loss": 0.0058, + "macro_f1": 0.3272727429866791, + "num_tokens": 9430733.0, + "repeat_count": 1.0, + "routers_loss": 0.04235004261136055, + "skip_count": 0.0, + "step": 5848, + "text_loss": 0.44346582889556885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004576470490267462, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 9433347.0, + "repeat_count": 0.0, + "routers_loss": 0.000801609072368592, + "skip_count": 0.0, + "step": 5850, + "text_loss": 0.5825944542884827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0004573386530471121, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9436172.0, + "repeat_count": 0.0, + "routers_loss": 0.0018224078230559826, + "skip_count": 2.0, + "step": 5852, + "text_loss": 0.8111652135848999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004570302734153866, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9439040.0, + "repeat_count": 0.0, + "routers_loss": 0.006614950485527515, + "skip_count": 2.0, + "step": 5854, + "text_loss": 0.31270334124565125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0004567219102497412, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9442138.0, + "repeat_count": 0.0, + "routers_loss": 0.0012984242057427764, + "skip_count": 0.0, + "step": 5856, + "text_loss": 0.6126856803894043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004564135636683416, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9445600.0, + "repeat_count": 0.0, + "routers_loss": 0.0008388847345486283, + "skip_count": 0.0, + "step": 5858, + "text_loss": 0.8526380658149719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046875, + "learning_rate": 0.0004561052337893467, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 9449609.0, + "repeat_count": 0.0, + "routers_loss": 0.008125773631036282, + "skip_count": 2.0, + "step": 5860, + "text_loss": 0.2843833863735199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.000455796920730909, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9452756.0, + "repeat_count": 0.0, + "routers_loss": 0.0019371749367564917, + "skip_count": 0.0, + "step": 5862, + "text_loss": 0.5293750166893005 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0004554886246111746, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9455467.0, + "repeat_count": 1.0, + "routers_loss": 0.005594742484390736, + "skip_count": 2.0, + "step": 5864, + "text_loss": 0.572329044342041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004551803455482833, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9458953.0, + "repeat_count": 0.0, + "routers_loss": 0.005960086826235056, + "skip_count": 3.0, + "step": 5866, + "text_loss": 0.19459208846092224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00045487208366036807, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 9462130.0, + "repeat_count": 0.0, + "routers_loss": 0.0034781871363520622, + "skip_count": 1.0, + "step": 5868, + "text_loss": 0.20467053353786469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00045456383906555554, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9465590.0, + "repeat_count": 0.0, + "routers_loss": 0.0012246103724464774, + "skip_count": 0.0, + "step": 5870, + "text_loss": 0.6086251735687256 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00045425561188196565, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9468092.0, + "repeat_count": 0.0, + "routers_loss": 0.002874316181987524, + "skip_count": 1.0, + "step": 5872, + "text_loss": 0.3430633544921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004539474022277115, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9471433.0, + "repeat_count": 0.0, + "routers_loss": 0.004340244457125664, + "skip_count": 2.0, + "step": 5874, + "text_loss": 0.28219133615493774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0004536392102208997, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9474363.0, + "repeat_count": 0.0, + "routers_loss": 0.0007322742021642625, + "skip_count": 0.0, + "step": 5876, + "text_loss": 0.7305856943130493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0004533310359796299, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9478469.0, + "repeat_count": 0.0, + "routers_loss": 0.0018631393322721124, + "skip_count": 0.0, + "step": 5878, + "text_loss": 0.5821442604064941 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 27.60581156442618, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0004530228796219952, + "loss": 0.0088, + "macro_f1": 0.9262410998344421, + "num_tokens": 9481200.0, + "repeat_count": 2.0, + "routers_loss": 0.026109615340828896, + "skip_count": 3.0, + "step": 5880, + "text_loss": 0.3962891101837158 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00045271474126608167, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9484200.0, + "repeat_count": 0.0, + "routers_loss": 0.0004716445691883564, + "skip_count": 0.0, + "step": 5882, + "text_loss": 0.31901776790618896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004524066210299685, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 9488939.0, + "repeat_count": 0.0, + "routers_loss": 0.0003797562967520207, + "skip_count": 0.0, + "step": 5884, + "text_loss": 0.3992912471294403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004520985190317279, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 9492010.0, + "repeat_count": 0.0, + "routers_loss": 0.005681614391505718, + "skip_count": 1.0, + "step": 5886, + "text_loss": 0.5318995118141174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0004517904353894253, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9494770.0, + "repeat_count": 0.0, + "routers_loss": 0.0021422000136226416, + "skip_count": 0.0, + "step": 5888, + "text_loss": 0.435088187456131 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.652773701203404, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0004514823702211187, + "loss": 0.0052, + "macro_f1": 0.8820862174034119, + "num_tokens": 9497327.0, + "repeat_count": 2.0, + "routers_loss": 0.01593884639441967, + "skip_count": 2.0, + "step": 5890, + "text_loss": 0.5068450570106506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.662166128558848, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00045117432364485927, + "loss": 0.0075, + "macro_f1": 0.6601307392120361, + "num_tokens": 9500488.0, + "repeat_count": 1.0, + "routers_loss": 0.0729660913348198, + "skip_count": 2.0, + "step": 5892, + "text_loss": 0.42718732357025146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.00045086629577869127, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9503593.0, + "repeat_count": 0.0, + "routers_loss": 0.007092897780239582, + "skip_count": 2.0, + "step": 5894, + "text_loss": 0.4264345169067383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.00045055828674065134, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9507188.0, + "repeat_count": 0.0, + "routers_loss": 0.004088073968887329, + "skip_count": 2.0, + "step": 5896, + "text_loss": 0.20932413637638092 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00045025029664876926, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9510126.0, + "repeat_count": 1.0, + "routers_loss": 0.0026970503386110067, + "skip_count": 0.0, + "step": 5898, + "text_loss": 0.47661110758781433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0164794921875, + "learning_rate": 0.0004499423256210673, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9513891.0, + "repeat_count": 0.0, + "routers_loss": 0.003428407246246934, + "skip_count": 0.0, + "step": 5900, + "text_loss": 0.18232668936252594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00044963437377556066, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9516718.0, + "repeat_count": 0.0, + "routers_loss": 0.0020270352251827717, + "skip_count": 0.0, + "step": 5902, + "text_loss": 0.16833586990833282 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.000449326441230257, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9520248.0, + "repeat_count": 0.0, + "routers_loss": 0.0019144838443025947, + "skip_count": 0.0, + "step": 5904, + "text_loss": 0.44434574246406555 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00044901852810315634, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9523651.0, + "repeat_count": 0.0, + "routers_loss": 0.0044578867964446545, + "skip_count": 2.0, + "step": 5906, + "text_loss": 0.1248839721083641 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0004487106345122522, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9527235.0, + "repeat_count": 0.0, + "routers_loss": 0.000827222247608006, + "skip_count": 0.0, + "step": 5908, + "text_loss": 0.6052893996238708 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.74669797475785, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004484027605755296, + "loss": 0.0065, + "macro_f1": 0.5492662787437439, + "num_tokens": 9530407.0, + "repeat_count": 2.0, + "routers_loss": 0.029739778488874435, + "skip_count": 0.0, + "step": 5910, + "text_loss": 0.7625715732574463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.00044809490641096653, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 9533229.0, + "repeat_count": 0.0, + "routers_loss": 0.0025658784434199333, + "skip_count": 0.0, + "step": 5912, + "text_loss": 0.27842655777931213 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 27.76548282946874, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.042724609375, + "learning_rate": 0.00044778707213653324, + "loss": 0.0069, + "macro_f1": 0.9265305995941162, + "num_tokens": 9537397.0, + "repeat_count": 1.0, + "routers_loss": 0.010157953947782516, + "skip_count": 3.0, + "step": 5914, + "text_loss": 0.45196083188056946 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004474792578701924, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9540564.0, + "repeat_count": 3.0, + "routers_loss": 0.011994685977697372, + "skip_count": 5.0, + "step": 5916, + "text_loss": 0.22617442905902863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000447171463729899, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9543602.0, + "repeat_count": 0.0, + "routers_loss": 0.0022214490454643965, + "skip_count": 0.0, + "step": 5918, + "text_loss": 0.5089073777198792 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004468636898336003, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 9546829.0, + "repeat_count": 1.0, + "routers_loss": 0.009353389963507652, + "skip_count": 2.0, + "step": 5920, + "text_loss": 0.7560386657714844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.00044655593629923596, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 9550259.0, + "repeat_count": 0.0, + "routers_loss": 0.005637963302433491, + "skip_count": 0.0, + "step": 5922, + "text_loss": 0.17084793746471405 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00044624820324473766, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9554376.0, + "repeat_count": 1.0, + "routers_loss": 0.008556432090699673, + "skip_count": 2.0, + "step": 5924, + "text_loss": 0.5906872749328613 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004459404907880292, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9558348.0, + "repeat_count": 1.0, + "routers_loss": 0.0016659445827826858, + "skip_count": 0.0, + "step": 5926, + "text_loss": 0.8197194933891296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.00044563279904702674, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9561139.0, + "repeat_count": 0.0, + "routers_loss": 0.01341368816792965, + "skip_count": 3.0, + "step": 5928, + "text_loss": 0.3264874815940857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.000445325128139638, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9564387.0, + "repeat_count": 0.0, + "routers_loss": 0.005023977253586054, + "skip_count": 2.0, + "step": 5930, + "text_loss": 0.9055862426757812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004450174781837635, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9567053.0, + "repeat_count": 0.0, + "routers_loss": 0.0006051476229913533, + "skip_count": 0.0, + "step": 5932, + "text_loss": 0.6908539533615112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0004447098492972951, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9570036.0, + "repeat_count": 0.0, + "routers_loss": 0.003152312943711877, + "skip_count": 0.0, + "step": 5934, + "text_loss": 0.6321061849594116 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0004444022415981167, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 9574146.0, + "repeat_count": 0.0, + "routers_loss": 0.004859412554651499, + "skip_count": 1.0, + "step": 5936, + "text_loss": 0.5905604958534241 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 27.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.00044409465520410426, + "loss": 0.0071, + "macro_f1": 1.0, + "num_tokens": 9577071.0, + "repeat_count": 1.0, + "routers_loss": 0.004376287572085857, + "skip_count": 1.0, + "step": 5938, + "text_loss": 0.6928377747535706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00044378709023312535, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9580537.0, + "repeat_count": 0.0, + "routers_loss": 0.004038849379867315, + "skip_count": 1.0, + "step": 5940, + "text_loss": 0.2686770558357239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0004434795468030396, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9583225.0, + "repeat_count": 0.0, + "routers_loss": 0.005459951236844063, + "skip_count": 2.0, + "step": 5942, + "text_loss": 0.16855180263519287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.000443172025031698, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 9586018.0, + "repeat_count": 0.0, + "routers_loss": 0.0032985717989504337, + "skip_count": 2.0, + "step": 5944, + "text_loss": 0.20335732400417328 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004428645250369437, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 9589321.0, + "repeat_count": 1.0, + "routers_loss": 0.003573323367163539, + "skip_count": 0.0, + "step": 5946, + "text_loss": 0.6318653225898743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00044255704693661117, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9592518.0, + "repeat_count": 0.0, + "routers_loss": 0.002226749900728464, + "skip_count": 0.0, + "step": 5948, + "text_loss": 0.5320658683776855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004422495908485265, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9595664.0, + "repeat_count": 0.0, + "routers_loss": 0.0007805621717125177, + "skip_count": 0.0, + "step": 5950, + "text_loss": 0.6330106258392334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0004419421568905077, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9598885.0, + "repeat_count": 0.0, + "routers_loss": 0.0017050127498805523, + "skip_count": 0.0, + "step": 5952, + "text_loss": 0.6098045706748962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00044163474518036375, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9603021.0, + "repeat_count": 0.0, + "routers_loss": 0.0025974081363528967, + "skip_count": 0.0, + "step": 5954, + "text_loss": 0.2655932903289795 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.00044132735583589567, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 9605841.0, + "repeat_count": 1.0, + "routers_loss": 0.010364850051701069, + "skip_count": 2.0, + "step": 5956, + "text_loss": 0.3028552532196045 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.015869140625, + "learning_rate": 0.00044101998897489553, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 9608810.0, + "repeat_count": 1.0, + "routers_loss": 0.0015063622267916799, + "skip_count": 0.0, + "step": 5958, + "text_loss": 0.5602094531059265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 27.981508658643968, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.02880859375, + "learning_rate": 0.00044071264471514683, + "loss": 0.0051, + "macro_f1": 0.5934640765190125, + "num_tokens": 9611995.0, + "repeat_count": 0.0, + "routers_loss": 0.011538165621459484, + "skip_count": 3.0, + "step": 5960, + "text_loss": 0.14332173764705658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00044040532317442455, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 9615434.0, + "repeat_count": 0.0, + "routers_loss": 0.004693889059126377, + "skip_count": 0.0, + "step": 5962, + "text_loss": 0.334369033575058 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00044009802447049474, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9618056.0, + "repeat_count": 1.0, + "routers_loss": 0.0045085870660841465, + "skip_count": 1.0, + "step": 5964, + "text_loss": 0.8163170218467712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00043979074872111507, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9621428.0, + "repeat_count": 0.0, + "routers_loss": 0.0018220023484900594, + "skip_count": 0.0, + "step": 5966, + "text_loss": 0.2513850927352905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0004394834960440341, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 9625433.0, + "repeat_count": 4.0, + "routers_loss": 0.007051277905702591, + "skip_count": 5.0, + "step": 5968, + "text_loss": 0.6263421177864075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.00043917626655699154, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 9629508.0, + "repeat_count": 0.0, + "routers_loss": 0.0006454752874560654, + "skip_count": 0.0, + "step": 5970, + "text_loss": 0.645618736743927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0004388690603777184, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9632504.0, + "repeat_count": 0.0, + "routers_loss": 0.004847112577408552, + "skip_count": 1.0, + "step": 5972, + "text_loss": 0.47306978702545166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00043856187762393665, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 9636685.0, + "repeat_count": 0.0, + "routers_loss": 0.0006580828921869397, + "skip_count": 0.0, + "step": 5974, + "text_loss": 0.42226532101631165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0004382547184133593, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 9639958.0, + "repeat_count": 0.0, + "routers_loss": 0.002188180573284626, + "skip_count": 0.0, + "step": 5976, + "text_loss": 0.4456600248813629 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004379475828636901, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 9643228.0, + "repeat_count": 1.0, + "routers_loss": 0.0017135308589786291, + "skip_count": 2.0, + "step": 5978, + "text_loss": 0.6295822262763977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004376404710926244, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 9646746.0, + "repeat_count": 0.0, + "routers_loss": 0.0008841048111207783, + "skip_count": 0.0, + "step": 5980, + "text_loss": 0.5102712512016296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00043733338321784784, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9649452.0, + "repeat_count": 0.0, + "routers_loss": 0.0006229099817574024, + "skip_count": 0.0, + "step": 5982, + "text_loss": 0.6944046020507812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000437026319357037, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9652700.0, + "repeat_count": 0.0, + "routers_loss": 0.005293759983032942, + "skip_count": 2.0, + "step": 5984, + "text_loss": 0.6748214960098267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00043671927962785946, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 9655825.0, + "repeat_count": 0.0, + "routers_loss": 0.0013537590857595205, + "skip_count": 0.0, + "step": 5986, + "text_loss": 1.000306248664856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004364122641479733, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9658713.0, + "repeat_count": 0.0, + "routers_loss": 0.004548195283859968, + "skip_count": 0.0, + "step": 5988, + "text_loss": 0.24580086767673492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 28.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004361052730350275, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9661535.0, + "repeat_count": 0.0, + "routers_loss": 0.011149964295327663, + "skip_count": 4.0, + "step": 5990, + "text_loss": 0.5737863779067993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00043579830640666154, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 9664406.0, + "repeat_count": 1.0, + "routers_loss": 0.003783488878980279, + "skip_count": 1.0, + "step": 5992, + "text_loss": 0.7836558222770691 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.00043549136438050573, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 9669050.0, + "repeat_count": 0.0, + "routers_loss": 0.0050374288111925125, + "skip_count": 1.0, + "step": 5994, + "text_loss": 0.13072487711906433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.00043518444707418076, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9672698.0, + "repeat_count": 0.0, + "routers_loss": 0.004047670867294073, + "skip_count": 2.0, + "step": 5996, + "text_loss": 0.4748993217945099 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00043487755460529796, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9676159.0, + "repeat_count": 0.0, + "routers_loss": 0.008628991432487965, + "skip_count": 2.0, + "step": 5998, + "text_loss": 0.1921990066766739 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00043457068709145904, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 9679528.0, + "repeat_count": 3.0, + "routers_loss": 0.01094671618193388, + "skip_count": 3.0, + "step": 6000, + "text_loss": 0.3651769459247589 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 28.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00043426384465025604, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 9682677.0, + "repeat_count": 2.0, + "routers_loss": 0.0011284075444564223, + "skip_count": 0.0, + "step": 6002, + "text_loss": 0.28305181860923767 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.000433957027399272, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9685310.0, + "repeat_count": 0.0, + "routers_loss": 0.0030473743099719286, + "skip_count": 1.0, + "step": 6004, + "text_loss": 0.3650054931640625 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00043365023545607965, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9687944.0, + "repeat_count": 1.0, + "routers_loss": 0.011621905490756035, + "skip_count": 2.0, + "step": 6006, + "text_loss": 0.5409000515937805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004333434689382423, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 9690932.0, + "repeat_count": 0.0, + "routers_loss": 0.0005297541501931846, + "skip_count": 0.0, + "step": 6008, + "text_loss": 0.4311029314994812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.216025829175226, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.00043303672796331336, + "loss": 0.0058, + "macro_f1": 0.3272727429866791, + "num_tokens": 9693972.0, + "repeat_count": 1.0, + "routers_loss": 0.06166421249508858, + "skip_count": 0.0, + "step": 6010, + "text_loss": 0.2658997178077698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00043273001264883655, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 9697712.0, + "repeat_count": 0.0, + "routers_loss": 0.0018419031985104084, + "skip_count": 0.0, + "step": 6012, + "text_loss": 0.5813497304916382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004324233231123458, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 9700746.0, + "repeat_count": 0.0, + "routers_loss": 0.003635555040091276, + "skip_count": 0.0, + "step": 6014, + "text_loss": 0.24211904406547546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 28.24420311124156, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.038330078125, + "learning_rate": 0.0004321166594713651, + "loss": 0.0048, + "macro_f1": 0.5492662787437439, + "num_tokens": 9704087.0, + "repeat_count": 0.0, + "routers_loss": 0.021067705005407333, + "skip_count": 2.0, + "step": 6016, + "text_loss": 0.5908042788505554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00043181002184340857, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 9708695.0, + "repeat_count": 0.0, + "routers_loss": 0.0008712753187865019, + "skip_count": 0.0, + "step": 6018, + "text_loss": 0.7788549661636353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.26298796595245, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0004315034103459803, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 9711631.0, + "repeat_count": 1.0, + "routers_loss": 0.03231092542409897, + "skip_count": 0.0, + "step": 6020, + "text_loss": 0.6127741932868958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0004311968250965743, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9715526.0, + "repeat_count": 0.0, + "routers_loss": 0.0020149527117609978, + "skip_count": 2.0, + "step": 6022, + "text_loss": 0.49970078468322754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0004308902662126748, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9718475.0, + "repeat_count": 0.0, + "routers_loss": 0.0031795913819223642, + "skip_count": 0.0, + "step": 6024, + "text_loss": 0.3254713714122772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.291165248018785, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00043058373381175567, + "loss": 0.004, + "macro_f1": 0.3272727429866791, + "num_tokens": 9722194.0, + "repeat_count": 0.0, + "routers_loss": 0.0148378387093544, + "skip_count": 1.0, + "step": 6026, + "text_loss": 0.17670343816280365 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0004302772280112806, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 9725489.0, + "repeat_count": 1.0, + "routers_loss": 0.005742347799241543, + "skip_count": 2.0, + "step": 6028, + "text_loss": 0.26184776425361633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00042997074892870335, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9729416.0, + "repeat_count": 0.0, + "routers_loss": 0.0023561837151646614, + "skip_count": 0.0, + "step": 6030, + "text_loss": 0.3026008605957031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0004296642966814673, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9732559.0, + "repeat_count": 0.0, + "routers_loss": 0.0010108393616974354, + "skip_count": 1.0, + "step": 6032, + "text_loss": 0.43198078870773315 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00042935787138700525, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 9736324.0, + "repeat_count": 2.0, + "routers_loss": 0.005443581845611334, + "skip_count": 2.0, + "step": 6034, + "text_loss": 0.24883155524730682 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0004290514731627403, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 9739630.0, + "repeat_count": 1.0, + "routers_loss": 0.010645060800015926, + "skip_count": 2.0, + "step": 6036, + "text_loss": 0.24207182228565216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018798828125, + "learning_rate": 0.0004287451021260846, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9742221.0, + "repeat_count": 0.0, + "routers_loss": 0.0008162845042534173, + "skip_count": 0.0, + "step": 6038, + "text_loss": 0.33018553256988525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0004284387583944403, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9744925.0, + "repeat_count": 0.0, + "routers_loss": 0.003782407147809863, + "skip_count": 1.0, + "step": 6040, + "text_loss": 0.6600399613380432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0004281324420851987, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9748103.0, + "repeat_count": 0.0, + "routers_loss": 0.0009834285592660308, + "skip_count": 0.0, + "step": 6042, + "text_loss": 0.6402350664138794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0004278261533157409, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 9751128.0, + "repeat_count": 0.0, + "routers_loss": 0.004100334830582142, + "skip_count": 2.0, + "step": 6044, + "text_loss": 0.1545136719942093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0004275198922034372, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 9754140.0, + "repeat_count": 0.0, + "routers_loss": 0.0017166603356599808, + "skip_count": 1.0, + "step": 6046, + "text_loss": 0.5875935554504395 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00042721365886564766, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 9756945.0, + "repeat_count": 1.0, + "routers_loss": 0.00915827602148056, + "skip_count": 2.0, + "step": 6048, + "text_loss": 0.3885214328765869 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00042690745341972134, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9759738.0, + "repeat_count": 0.0, + "routers_loss": 0.0057020667009055614, + "skip_count": 2.0, + "step": 6050, + "text_loss": 0.3107164204120636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.00042660127598299647, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9762987.0, + "repeat_count": 0.0, + "routers_loss": 0.004196313209831715, + "skip_count": 2.0, + "step": 6052, + "text_loss": 0.3073577582836151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00042629512667280135, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 9765828.0, + "repeat_count": 0.0, + "routers_loss": 0.0023119752295315266, + "skip_count": 1.0, + "step": 6054, + "text_loss": 0.8228643536567688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004259890056064527, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 9769129.0, + "repeat_count": 0.0, + "routers_loss": 0.0021007524337619543, + "skip_count": 1.0, + "step": 6056, + "text_loss": 0.8334706425666809 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0004256829129012568, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 9771821.0, + "repeat_count": 1.0, + "routers_loss": 0.00671970471739769, + "skip_count": 2.0, + "step": 6058, + "text_loss": 0.17845536768436432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00042537684867450875, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 9774566.0, + "repeat_count": 0.0, + "routers_loss": 0.0014770646812394261, + "skip_count": 0.0, + "step": 6060, + "text_loss": 0.4445459246635437 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.46022894041679, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00042507081304349315, + "loss": 0.0067, + "macro_f1": 0.5492662787437439, + "num_tokens": 9777909.0, + "repeat_count": 2.0, + "routers_loss": 0.014822427183389664, + "skip_count": 0.0, + "step": 6062, + "text_loss": 0.45526158809661865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004247648061254833, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9781159.0, + "repeat_count": 0.0, + "routers_loss": 0.00568385748192668, + "skip_count": 1.0, + "step": 6064, + "text_loss": 0.18535588681697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.479013795127678, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00042445882803774173, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 9784960.0, + "repeat_count": 1.0, + "routers_loss": 0.0179694052785635, + "skip_count": 0.0, + "step": 6066, + "text_loss": 0.23591181635856628 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00042415287889751966, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9787941.0, + "repeat_count": 0.0, + "routers_loss": 0.0019039154285565019, + "skip_count": 0.0, + "step": 6068, + "text_loss": 0.9447930455207825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0004238469588220575, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9791096.0, + "repeat_count": 0.0, + "routers_loss": 0.004039563238620758, + "skip_count": 0.0, + "step": 6070, + "text_loss": 0.3134256601333618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00042354106792858446, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 9794082.0, + "repeat_count": 0.0, + "routers_loss": 0.0018352365586906672, + "skip_count": 0.0, + "step": 6072, + "text_loss": 0.5681536197662354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.00042323520633431833, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 9797303.0, + "repeat_count": 0.0, + "routers_loss": 0.0019325513858348131, + "skip_count": 0.0, + "step": 6074, + "text_loss": 0.2835809290409088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00042292937415646574, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 9800435.0, + "repeat_count": 0.0, + "routers_loss": 0.002513401210308075, + "skip_count": 0.0, + "step": 6076, + "text_loss": 0.1931663602590561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00042262357151222265, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9803873.0, + "repeat_count": 0.0, + "routers_loss": 0.004864581860601902, + "skip_count": 0.0, + "step": 6078, + "text_loss": 0.25809767842292786 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004223177985187728, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9806438.0, + "repeat_count": 1.0, + "routers_loss": 0.004932792857289314, + "skip_count": 0.0, + "step": 6080, + "text_loss": 0.6409249305725098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00042201205529328925, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9809400.0, + "repeat_count": 0.0, + "routers_loss": 0.00590938376262784, + "skip_count": 1.0, + "step": 6082, + "text_loss": 0.31158050894737244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00042170634195293314, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9813246.0, + "repeat_count": 0.0, + "routers_loss": 0.006805860437452793, + "skip_count": 0.0, + "step": 6084, + "text_loss": 0.32945963740348816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004214006586148545, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 9816513.0, + "repeat_count": 0.0, + "routers_loss": 0.0010186503641307354, + "skip_count": 0.0, + "step": 6086, + "text_loss": 0.48659923672676086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0004210950053961917, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9819908.0, + "repeat_count": 0.0, + "routers_loss": 0.00402973173186183, + "skip_count": 1.0, + "step": 6088, + "text_loss": 0.6249601244926453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00042078938241407174, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9822950.0, + "repeat_count": 0.0, + "routers_loss": 0.00236532068811357, + "skip_count": 1.0, + "step": 6090, + "text_loss": 0.26589256525039673 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0004204837897856098, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 9826493.0, + "repeat_count": 1.0, + "routers_loss": 0.003072192659601569, + "skip_count": 2.0, + "step": 6092, + "text_loss": 0.5216912627220154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004201782276279096, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9829698.0, + "repeat_count": 0.0, + "routers_loss": 0.0027553171385079622, + "skip_count": 1.0, + "step": 6094, + "text_loss": 0.40127676725387573 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.61990020545935, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00041987269605806325, + "loss": 0.0045, + "macro_f1": 0.9442509412765503, + "num_tokens": 9833719.0, + "repeat_count": 4.0, + "routers_loss": 0.013845407404005527, + "skip_count": 4.0, + "step": 6096, + "text_loss": 0.23114071786403656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0004195671951931509, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 9838235.0, + "repeat_count": 0.0, + "routers_loss": 0.0019887303933501244, + "skip_count": 2.0, + "step": 6098, + "text_loss": 0.7467341423034668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004192617251502409, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 9840867.0, + "repeat_count": 0.0, + "routers_loss": 0.0007213905337266624, + "skip_count": 0.0, + "step": 6100, + "text_loss": 0.6283472180366516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00041895628604639036, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 9843827.0, + "repeat_count": 0.0, + "routers_loss": 0.003863139310851693, + "skip_count": 1.0, + "step": 6102, + "text_loss": 0.3602744936943054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00041865087799864374, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 9846939.0, + "repeat_count": 0.0, + "routers_loss": 0.0013336286647245288, + "skip_count": 0.0, + "step": 6104, + "text_loss": 0.4182434678077698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0004183455011240341, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 9849827.0, + "repeat_count": 0.0, + "routers_loss": 0.00038455065805464983, + "skip_count": 0.0, + "step": 6106, + "text_loss": 0.7122722864151001 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 28.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004180401555395826, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 9853487.0, + "repeat_count": 3.0, + "routers_loss": 0.0038226440083235502, + "skip_count": 1.0, + "step": 6108, + "text_loss": 0.2521185576915741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004177348413622981, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 9856321.0, + "repeat_count": 0.0, + "routers_loss": 0.0015809801407158375, + "skip_count": 0.0, + "step": 6110, + "text_loss": 0.423979252576828 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004174295587091776, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 9859238.0, + "repeat_count": 0.0, + "routers_loss": 0.0007586454739794135, + "skip_count": 0.0, + "step": 6112, + "text_loss": 0.4720100462436676 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00041712430769720593, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 9862282.0, + "repeat_count": 1.0, + "routers_loss": 0.0045816488564014435, + "skip_count": 1.0, + "step": 6114, + "text_loss": 0.279577374458313 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0004168190884433559, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 9865394.0, + "repeat_count": 1.0, + "routers_loss": 0.004728195257484913, + "skip_count": 1.0, + "step": 6116, + "text_loss": 0.3826395571231842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 28.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.0004165139010645881, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9869165.0, + "repeat_count": 0.0, + "routers_loss": 0.006160226184874773, + "skip_count": 3.0, + "step": 6118, + "text_loss": 0.4668935537338257 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 28.732609333724685, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.04736328125, + "learning_rate": 0.0004162087456778509, + "loss": 0.0074, + "macro_f1": 0.9619450569152832, + "num_tokens": 9872381.0, + "repeat_count": 1.0, + "routers_loss": 0.027831824496388435, + "skip_count": 6.0, + "step": 6120, + "text_loss": 0.28708913922309875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004159036224000804, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9875668.0, + "repeat_count": 0.0, + "routers_loss": 0.0030764432158321142, + "skip_count": 1.0, + "step": 6122, + "text_loss": 0.37078607082366943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004155985313482002, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9878533.0, + "repeat_count": 0.0, + "routers_loss": 0.00043521137558855116, + "skip_count": 0.0, + "step": 6124, + "text_loss": 0.34975379705429077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00041529347263912224, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 9881478.0, + "repeat_count": 0.0, + "routers_loss": 0.0016251741908490658, + "skip_count": 0.0, + "step": 6126, + "text_loss": 0.39166271686553955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00041498844638974535, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 9884252.0, + "repeat_count": 1.0, + "routers_loss": 0.019553523510694504, + "skip_count": 0.0, + "step": 6128, + "text_loss": 0.2309480905532837 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0004146834527169562, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 9887485.0, + "repeat_count": 1.0, + "routers_loss": 0.0036251386627554893, + "skip_count": 0.0, + "step": 6130, + "text_loss": 0.4464457631111145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00041437849173762894, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9890711.0, + "repeat_count": 0.0, + "routers_loss": 0.0008515548543073237, + "skip_count": 0.0, + "step": 6132, + "text_loss": 0.5012133717536926 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0004140735635686251, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9894458.0, + "repeat_count": 1.0, + "routers_loss": 0.001084602321498096, + "skip_count": 0.0, + "step": 6134, + "text_loss": 0.32015663385391235 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004137686683267938, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 9897634.0, + "repeat_count": 0.0, + "routers_loss": 0.0025203595869243145, + "skip_count": 0.0, + "step": 6136, + "text_loss": 0.15804508328437805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0004134638061289715, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9901157.0, + "repeat_count": 0.0, + "routers_loss": 0.0029381231870502234, + "skip_count": 0.0, + "step": 6138, + "text_loss": 0.14375236630439758 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0004131589770919819, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 9903958.0, + "repeat_count": 0.0, + "routers_loss": 0.002789110178127885, + "skip_count": 0.0, + "step": 6140, + "text_loss": 0.2474033683538437 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004128541813326361, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 9906799.0, + "repeat_count": 2.0, + "routers_loss": 0.010770512744784355, + "skip_count": 3.0, + "step": 6142, + "text_loss": 0.2304249256849289 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0004125494189677325, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 9909286.0, + "repeat_count": 1.0, + "routers_loss": 0.003122122259810567, + "skip_count": 0.0, + "step": 6144, + "text_loss": 0.3781827688217163 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.00041224469011405643, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 9912416.0, + "repeat_count": 1.0, + "routers_loss": 0.008443298749625683, + "skip_count": 1.0, + "step": 6146, + "text_loss": 0.3004767596721649 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0004119399948883806, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9915290.0, + "repeat_count": 0.0, + "routers_loss": 0.0033219947945326567, + "skip_count": 1.0, + "step": 6148, + "text_loss": 0.748744547367096 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0004116353334074647, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 9918493.0, + "repeat_count": 1.0, + "routers_loss": 0.005501769948750734, + "skip_count": 0.0, + "step": 6150, + "text_loss": 0.330759733915329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.000411330705788056, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 9921027.0, + "repeat_count": 0.0, + "routers_loss": 0.0013694261433556676, + "skip_count": 0.0, + "step": 6152, + "text_loss": 0.43070924282073975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0164794921875, + "learning_rate": 0.000411026112146888, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9924303.0, + "repeat_count": 0.0, + "routers_loss": 0.00046192589798010886, + "skip_count": 0.0, + "step": 6154, + "text_loss": 0.5674887895584106 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004107215526006817, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9927065.0, + "repeat_count": 1.0, + "routers_loss": 0.004311304073780775, + "skip_count": 0.0, + "step": 6156, + "text_loss": 0.16138267517089844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004104170272661449, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9930713.0, + "repeat_count": 0.0, + "routers_loss": 0.0035845425445586443, + "skip_count": 0.0, + "step": 6158, + "text_loss": 0.18728356063365936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00041011253625997227, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 9934393.0, + "repeat_count": 0.0, + "routers_loss": 0.00247366214171052, + "skip_count": 0.0, + "step": 6160, + "text_loss": 0.3624019920825958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0004098080796988452, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 9937457.0, + "repeat_count": 0.0, + "routers_loss": 0.003240241203457117, + "skip_count": 0.0, + "step": 6162, + "text_loss": 0.12348521500825882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.0004095036576994321, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 9940523.0, + "repeat_count": 0.0, + "routers_loss": 0.001985874492675066, + "skip_count": 1.0, + "step": 6164, + "text_loss": 0.2688066363334656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 28.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.00040919927037838815, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9943802.0, + "repeat_count": 0.0, + "routers_loss": 0.004264154937118292, + "skip_count": 3.0, + "step": 6166, + "text_loss": 0.49316367506980896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.00040889491785235513, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 9946649.0, + "repeat_count": 0.0, + "routers_loss": 0.002545441733673215, + "skip_count": 0.0, + "step": 6168, + "text_loss": 0.4079313576221466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004085906002379614, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9949800.0, + "repeat_count": 0.0, + "routers_loss": 0.0009590961271896958, + "skip_count": 0.0, + "step": 6170, + "text_loss": 0.6166561245918274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004082863176518221, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9954008.0, + "repeat_count": 0.0, + "routers_loss": 0.003795337164774537, + "skip_count": 2.0, + "step": 6172, + "text_loss": 0.4791361689567566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0004079820702105388, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 9957153.0, + "repeat_count": 0.0, + "routers_loss": 0.0015634822193533182, + "skip_count": 0.0, + "step": 6174, + "text_loss": 0.7208777666091919 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.995597299677137, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004076778580306999, + "loss": 0.0056, + "macro_f1": 0.8820862174034119, + "num_tokens": 9960060.0, + "repeat_count": 2.0, + "routers_loss": 0.03223998099565506, + "skip_count": 2.0, + "step": 6176, + "text_loss": 0.6617992520332336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00040737368122887983, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9963396.0, + "repeat_count": 0.0, + "routers_loss": 0.0033978577703237534, + "skip_count": 0.0, + "step": 6178, + "text_loss": 0.7339215278625488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00040706953992164, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9966364.0, + "repeat_count": 0.0, + "routers_loss": 0.0005358994239941239, + "skip_count": 0.0, + "step": 6180, + "text_loss": 0.44187214970588684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00040676543422552767, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9969813.0, + "repeat_count": 0.0, + "routers_loss": 0.0018544091144576669, + "skip_count": 1.0, + "step": 6182, + "text_loss": 0.6244927048683167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004064613642570769, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9973015.0, + "repeat_count": 0.0, + "routers_loss": 0.005692692007869482, + "skip_count": 0.0, + "step": 6184, + "text_loss": 0.18860043585300446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00040615733013280784, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 9976201.0, + "repeat_count": 0.0, + "routers_loss": 0.0018737476784735918, + "skip_count": 0.0, + "step": 6186, + "text_loss": 0.21189232170581818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00040585333196922687, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9979711.0, + "repeat_count": 0.0, + "routers_loss": 0.011945146135985851, + "skip_count": 2.0, + "step": 6188, + "text_loss": 0.2628154456615448 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00040554936988282663, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9983003.0, + "repeat_count": 0.0, + "routers_loss": 0.0036045778542757034, + "skip_count": 1.0, + "step": 6190, + "text_loss": 0.5926038026809692 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0004052454439900861, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9986841.0, + "repeat_count": 0.0, + "routers_loss": 0.004170368425548077, + "skip_count": 0.0, + "step": 6192, + "text_loss": 0.3088737726211548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00040494155440747015, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9989596.0, + "repeat_count": 0.0, + "routers_loss": 0.002254750579595566, + "skip_count": 2.0, + "step": 6194, + "text_loss": 0.6309700012207031 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.089228059876724, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00040463770125142987, + "loss": 0.0087, + "macro_f1": 0.8814815282821655, + "num_tokens": 9992789.0, + "repeat_count": 2.0, + "routers_loss": 0.04092822223901749, + "skip_count": 4.0, + "step": 6196, + "text_loss": 0.09625697880983353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00040433388463840213, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 9995782.0, + "repeat_count": 0.0, + "routers_loss": 0.00029065192211419344, + "skip_count": 0.0, + "step": 6198, + "text_loss": 0.5600258111953735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0004040301046848105, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 9998712.0, + "repeat_count": 0.0, + "routers_loss": 0.0005865268758498132, + "skip_count": 0.0, + "step": 6200, + "text_loss": 0.6426429748535156 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 29.11740534194306, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0283203125, + "learning_rate": 0.0004037263615070638, + "loss": 0.0078, + "macro_f1": 0.9265305995941162, + "num_tokens": 10002020.0, + "repeat_count": 1.0, + "routers_loss": 0.025357060134410858, + "skip_count": 3.0, + "step": 6202, + "text_loss": 0.25125735998153687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000403422655221557, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10005381.0, + "repeat_count": 0.0, + "routers_loss": 0.003139561740681529, + "skip_count": 1.0, + "step": 6204, + "text_loss": 0.3639419376850128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00040311898594467085, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 10008348.0, + "repeat_count": 0.0, + "routers_loss": 0.004091196693480015, + "skip_count": 2.0, + "step": 6206, + "text_loss": 0.1602363884449005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00040281535379277204, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10011171.0, + "repeat_count": 0.0, + "routers_loss": 0.005771483760327101, + "skip_count": 0.0, + "step": 6208, + "text_loss": 0.5593504905700684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.000402511758882213, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10014374.0, + "repeat_count": 0.0, + "routers_loss": 0.005212264601141214, + "skip_count": 1.0, + "step": 6210, + "text_loss": 0.15668229758739471 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0004022082013293319, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 10017327.0, + "repeat_count": 0.0, + "routers_loss": 0.0027585842180997133, + "skip_count": 1.0, + "step": 6212, + "text_loss": 0.21188466250896454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.173759906075727, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00040190468125045255, + "loss": 0.0061, + "macro_f1": 0.3272727429866791, + "num_tokens": 10020518.0, + "repeat_count": 0.0, + "routers_loss": 0.013210589066147804, + "skip_count": 1.0, + "step": 6214, + "text_loss": 0.2551073729991913 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 29.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.00040160119876188436, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10023799.0, + "repeat_count": 1.0, + "routers_loss": 0.001590219559147954, + "skip_count": 0.0, + "step": 6216, + "text_loss": 0.5634782314300537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0004012977539799224, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 10027107.0, + "repeat_count": 0.0, + "routers_loss": 0.003917343448847532, + "skip_count": 0.0, + "step": 6218, + "text_loss": 0.6412819027900696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0004009943470208473, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 10030460.0, + "repeat_count": 0.0, + "routers_loss": 0.00874288845807314, + "skip_count": 2.0, + "step": 6220, + "text_loss": 0.13269923627376556 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.211329615497505, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.000400690978000925, + "loss": 0.0075, + "macro_f1": 0.8817967176437378, + "num_tokens": 10034086.0, + "repeat_count": 2.0, + "routers_loss": 0.03736349940299988, + "skip_count": 3.0, + "step": 6222, + "text_loss": 0.4956454336643219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004003876470364075, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10037312.0, + "repeat_count": 0.0, + "routers_loss": 0.008481289260089397, + "skip_count": 2.0, + "step": 6224, + "text_loss": 0.2148810178041458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0152587890625, + "learning_rate": 0.0004000843542435315, + "loss": 0.0028, + "macro_f1": 0.3333333432674408, + "num_tokens": 10040393.0, + "repeat_count": 0.0, + "routers_loss": 0.002235144842416048, + "skip_count": 0.0, + "step": 6226, + "text_loss": 0.17645306885242462 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 29.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0003997810997385195, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 10044386.0, + "repeat_count": 1.0, + "routers_loss": 0.004541373811662197, + "skip_count": 0.0, + "step": 6228, + "text_loss": 0.5098661184310913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00039947788363757915, + "loss": 0.0088, + "macro_f1": 0.6666666865348816, + "num_tokens": 10049046.0, + "repeat_count": 0.0, + "routers_loss": 0.0019183673430234194, + "skip_count": 1.0, + "step": 6230, + "text_loss": 0.6953724026679993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00039917470605690334, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 10051787.0, + "repeat_count": 2.0, + "routers_loss": 0.0032311067916452885, + "skip_count": 4.0, + "step": 6232, + "text_loss": 0.475127637386322 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 29.267684179630173, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00039887156711267043, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 10055396.0, + "repeat_count": 2.0, + "routers_loss": 0.03247373178601265, + "skip_count": 0.0, + "step": 6234, + "text_loss": 0.4239100515842438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.00039856846692104363, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10058395.0, + "repeat_count": 0.0, + "routers_loss": 0.006287421099841595, + "skip_count": 3.0, + "step": 6236, + "text_loss": 0.24084535241127014 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.016357421875, + "learning_rate": 0.0003982654055981718, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10061302.0, + "repeat_count": 1.0, + "routers_loss": 0.0008686117362231016, + "skip_count": 1.0, + "step": 6238, + "text_loss": 0.4740419089794159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0003979623832601884, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10065318.0, + "repeat_count": 0.0, + "routers_loss": 0.0037686119321733713, + "skip_count": 2.0, + "step": 6240, + "text_loss": 0.43965795636177063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0003976594000232123, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10068291.0, + "repeat_count": 0.0, + "routers_loss": 0.005804901942610741, + "skip_count": 0.0, + "step": 6242, + "text_loss": 0.24424348771572113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.00039735645600334714, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 10071645.0, + "repeat_count": 0.0, + "routers_loss": 0.002001055981963873, + "skip_count": 1.0, + "step": 6244, + "text_loss": 0.6524377465248108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0003970535513166815, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 10075136.0, + "repeat_count": 0.0, + "routers_loss": 0.001252001617103815, + "skip_count": 0.0, + "step": 6246, + "text_loss": 0.22803714871406555 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0003967506860792893, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 10078230.0, + "repeat_count": 0.0, + "routers_loss": 0.004913780372589827, + "skip_count": 1.0, + "step": 6248, + "text_loss": 0.9835516214370728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.000396447860407229, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 10080852.0, + "repeat_count": 0.0, + "routers_loss": 0.0037437966093420982, + "skip_count": 2.0, + "step": 6250, + "text_loss": 0.4021640121936798 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.00039614507441654393, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10084139.0, + "repeat_count": 0.0, + "routers_loss": 0.005433002021163702, + "skip_count": 2.0, + "step": 6252, + "text_loss": 0.23060470819473267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00039584232822326224, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10088501.0, + "repeat_count": 0.0, + "routers_loss": 0.0007705377647653222, + "skip_count": 0.0, + "step": 6254, + "text_loss": 0.5994830131530762 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0003955396219433969, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10091506.0, + "repeat_count": 0.0, + "routers_loss": 0.0012310115853324533, + "skip_count": 0.0, + "step": 6256, + "text_loss": 0.4639038145542145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0003952369556929455, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10096236.0, + "repeat_count": 0.0, + "routers_loss": 0.008964627049863338, + "skip_count": 2.0, + "step": 6258, + "text_loss": 0.24845287203788757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003949343295878903, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 10099213.0, + "repeat_count": 0.0, + "routers_loss": 0.0033088945783674717, + "skip_count": 0.0, + "step": 6260, + "text_loss": 0.6527073979377747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 29.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00039463174374419817, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10103160.0, + "repeat_count": 2.0, + "routers_loss": 0.003462672932073474, + "skip_count": 1.0, + "step": 6262, + "text_loss": 0.4209299683570862 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00039432919827782066, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 10105881.0, + "repeat_count": 2.0, + "routers_loss": 0.0027124532498419285, + "skip_count": 2.0, + "step": 6264, + "text_loss": 0.4442266821861267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00039402669330469367, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 10108596.0, + "repeat_count": 0.0, + "routers_loss": 0.005055282264947891, + "skip_count": 2.0, + "step": 6266, + "text_loss": 0.3331456780433655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00039372422894073765, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10111673.0, + "repeat_count": 0.0, + "routers_loss": 0.0009340311517007649, + "skip_count": 0.0, + "step": 6268, + "text_loss": 0.7664456367492676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.00039342180530185745, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10116141.0, + "repeat_count": 0.0, + "routers_loss": 0.00032052272581495345, + "skip_count": 0.0, + "step": 6270, + "text_loss": 0.47610244154930115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00039311942250394274, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 10119151.0, + "repeat_count": 0.0, + "routers_loss": 0.0015820999396964908, + "skip_count": 0.0, + "step": 6272, + "text_loss": 0.3815282881259918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0003928170806628669, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10122684.0, + "repeat_count": 0.0, + "routers_loss": 0.0007423736387863755, + "skip_count": 0.0, + "step": 6274, + "text_loss": 0.4630914628505707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00039251477989448797, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10126751.0, + "repeat_count": 0.0, + "routers_loss": 0.0006216703332029283, + "skip_count": 0.0, + "step": 6276, + "text_loss": 0.4342454671859741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.00039221252031464816, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10129784.0, + "repeat_count": 0.0, + "routers_loss": 0.004239698871970177, + "skip_count": 3.0, + "step": 6278, + "text_loss": 0.24661089479923248 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 29.4837100088054, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0003919103020391738, + "loss": 0.006, + "macro_f1": 0.8803418874740601, + "num_tokens": 10133066.0, + "repeat_count": 2.0, + "routers_loss": 0.027879100292921066, + "skip_count": 7.0, + "step": 6280, + "text_loss": 0.4705188274383545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00039160812518387574, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 10136860.0, + "repeat_count": 0.0, + "routers_loss": 0.002533538034185767, + "skip_count": 0.0, + "step": 6282, + "text_loss": 0.1953880786895752 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00039130598986454845, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 10140066.0, + "repeat_count": 1.0, + "routers_loss": 0.002462630858644843, + "skip_count": 2.0, + "step": 6284, + "text_loss": 0.378487765789032 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.000391003896196971, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 10143646.0, + "repeat_count": 1.0, + "routers_loss": 0.011922914534807205, + "skip_count": 1.0, + "step": 6286, + "text_loss": 0.2467316836118698 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00039070184429690607, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 10146507.0, + "repeat_count": 1.0, + "routers_loss": 0.0059767309576272964, + "skip_count": 1.0, + "step": 6288, + "text_loss": 0.9603674411773682 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0003903998342801006, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10149301.0, + "repeat_count": 1.0, + "routers_loss": 0.0030056277755647898, + "skip_count": 2.0, + "step": 6290, + "text_loss": 0.36631715297698975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00039009786626228543, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 10152158.0, + "repeat_count": 0.0, + "routers_loss": 0.005298118572682142, + "skip_count": 3.0, + "step": 6292, + "text_loss": 0.2876455783843994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003897959403591751, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 10155852.0, + "repeat_count": 0.0, + "routers_loss": 0.004937763791531324, + "skip_count": 2.0, + "step": 6294, + "text_loss": 0.14649681746959686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0003894940566864683, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 10159164.0, + "repeat_count": 0.0, + "routers_loss": 0.0021474575623869896, + "skip_count": 0.0, + "step": 6296, + "text_loss": 0.5694304704666138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 29.568241855004402, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08251953125, + "learning_rate": 0.00038919221535984753, + "loss": 0.0073, + "macro_f1": 0.875, + "num_tokens": 10161806.0, + "repeat_count": 1.0, + "routers_loss": 0.040340203791856766, + "skip_count": 3.0, + "step": 6298, + "text_loss": 0.1574537754058838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.00038889041649497894, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10165669.0, + "repeat_count": 0.0, + "routers_loss": 0.0028486931696534157, + "skip_count": 0.0, + "step": 6300, + "text_loss": 0.9158071279525757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0003885886602075123, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10168945.0, + "repeat_count": 0.0, + "routers_loss": 0.006565484683960676, + "skip_count": 2.0, + "step": 6302, + "text_loss": 0.3530846834182739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.00038828694661308116, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10171914.0, + "repeat_count": 0.0, + "routers_loss": 0.0009084723424166441, + "skip_count": 0.0, + "step": 6304, + "text_loss": 0.4603337347507477 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0003879852758273029, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 10175737.0, + "repeat_count": 1.0, + "routers_loss": 0.004121702630072832, + "skip_count": 2.0, + "step": 6306, + "text_loss": 0.5294032096862793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00038768364796577814, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10178543.0, + "repeat_count": 0.0, + "routers_loss": 0.0013208909658715129, + "skip_count": 0.0, + "step": 6308, + "text_loss": 0.41084006428718567 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 29.62459641913707, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00038738206314409144, + "loss": 0.0079, + "macro_f1": 0.9247862696647644, + "num_tokens": 10181880.0, + "repeat_count": 3.0, + "routers_loss": 0.03674180060625076, + "skip_count": 6.0, + "step": 6310, + "text_loss": 0.6920746564865112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0003870805214778106, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10185173.0, + "repeat_count": 0.0, + "routers_loss": 0.00221974472515285, + "skip_count": 2.0, + "step": 6312, + "text_loss": 0.1376657634973526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0003867790230824869, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10188642.0, + "repeat_count": 0.0, + "routers_loss": 0.001809283159673214, + "skip_count": 0.0, + "step": 6314, + "text_loss": 0.5220870971679688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0003864775680736552, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 10191750.0, + "repeat_count": 0.0, + "routers_loss": 0.0013956360053271055, + "skip_count": 0.0, + "step": 6316, + "text_loss": 0.4109838902950287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00038617615656683356, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 10194578.0, + "repeat_count": 0.0, + "routers_loss": 0.002947692759335041, + "skip_count": 2.0, + "step": 6318, + "text_loss": 0.4818590581417084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0003858747886775232, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10197131.0, + "repeat_count": 0.0, + "routers_loss": 0.0008140999125316739, + "skip_count": 2.0, + "step": 6320, + "text_loss": 0.4004709720611572 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.68095098326974, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0003855734645212093, + "loss": 0.0089, + "macro_f1": 0.8820862174034119, + "num_tokens": 10199965.0, + "repeat_count": 2.0, + "routers_loss": 0.013056626543402672, + "skip_count": 2.0, + "step": 6322, + "text_loss": 0.3367139995098114 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00038527218421335977, + "loss": 0.0087, + "macro_f1": 1.0, + "num_tokens": 10203184.0, + "repeat_count": 1.0, + "routers_loss": 0.0038112467154860497, + "skip_count": 2.0, + "step": 6324, + "text_loss": 0.5747989416122437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0003849709478694255, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 10206436.0, + "repeat_count": 0.0, + "routers_loss": 0.001232540002092719, + "skip_count": 0.0, + "step": 6326, + "text_loss": 0.4981732964515686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00038466975560484115, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 10209889.0, + "repeat_count": 0.0, + "routers_loss": 0.004343799781054258, + "skip_count": 0.0, + "step": 6328, + "text_loss": 0.2160186469554901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.000384368607535024, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10212520.0, + "repeat_count": 0.0, + "routers_loss": 0.0014161963481456041, + "skip_count": 1.0, + "step": 6330, + "text_loss": 0.3556232154369354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0003840675037753745, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10215456.0, + "repeat_count": 0.0, + "routers_loss": 0.0014989010524004698, + "skip_count": 0.0, + "step": 6332, + "text_loss": 0.8510926961898804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003837664444412762, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10218558.0, + "repeat_count": 0.0, + "routers_loss": 0.006702739745378494, + "skip_count": 0.0, + "step": 6334, + "text_loss": 0.3995226323604584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0003834654296480958, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10221862.0, + "repeat_count": 0.0, + "routers_loss": 0.00826781615614891, + "skip_count": 2.0, + "step": 6336, + "text_loss": 0.3534671664237976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0003831644595111825, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10224820.0, + "repeat_count": 0.0, + "routers_loss": 0.002143894787877798, + "skip_count": 0.0, + "step": 6338, + "text_loss": 0.20216144621372223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 29.76548282946874, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04736328125, + "learning_rate": 0.0003828635341458687, + "loss": 0.0064, + "macro_f1": 0.5492662787437439, + "num_tokens": 10227479.0, + "repeat_count": 0.0, + "routers_loss": 0.012319118715822697, + "skip_count": 2.0, + "step": 6340, + "text_loss": 0.26248639822006226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0003825626536674697, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10231347.0, + "repeat_count": 0.0, + "routers_loss": 0.00334449321962893, + "skip_count": 0.0, + "step": 6342, + "text_loss": 0.6357201337814331 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.000382261818191283, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10234347.0, + "repeat_count": 0.0, + "routers_loss": 0.0027788348961621523, + "skip_count": 0.0, + "step": 6344, + "text_loss": 0.2813846468925476 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00038196102783258996, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 10237105.0, + "repeat_count": 0.0, + "routers_loss": 0.001545077539049089, + "skip_count": 0.0, + "step": 6346, + "text_loss": 0.47612661123275757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0003816602827066537, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 10240249.0, + "repeat_count": 0.0, + "routers_loss": 0.005602670833468437, + "skip_count": 2.0, + "step": 6348, + "text_loss": 0.18197228014469147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0003813595829287204, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 10243417.0, + "repeat_count": 0.0, + "routers_loss": 0.0004317959537729621, + "skip_count": 0.0, + "step": 6350, + "text_loss": 0.3818575143814087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.0003810589286140186, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 10246824.0, + "repeat_count": 0.0, + "routers_loss": 0.002225276781246066, + "skip_count": 0.0, + "step": 6352, + "text_loss": 0.14129821956157684 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 29.831229820956853, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0003807583198777599, + "loss": 0.0062, + "macro_f1": 0.9265305995941162, + "num_tokens": 10249836.0, + "repeat_count": 3.0, + "routers_loss": 0.02445496805012226, + "skip_count": 1.0, + "step": 6354, + "text_loss": 0.3237064480781555 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00038045775683513786, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10252900.0, + "repeat_count": 0.0, + "routers_loss": 0.0009264222462661564, + "skip_count": 0.0, + "step": 6356, + "text_loss": 0.6777551174163818 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 29.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0003801572396013289, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 10255526.0, + "repeat_count": 1.0, + "routers_loss": 0.007189550437033176, + "skip_count": 5.0, + "step": 6358, + "text_loss": 0.25438982248306274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00037985676829149187, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10258865.0, + "repeat_count": 0.0, + "routers_loss": 0.0014201018493622541, + "skip_count": 0.0, + "step": 6360, + "text_loss": 0.5063154101371765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0003795563430207678, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 10261677.0, + "repeat_count": 0.0, + "routers_loss": 0.0035477925557643175, + "skip_count": 3.0, + "step": 6362, + "text_loss": 0.4815357029438019 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.878191957734078, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0003792559639042803, + "loss": 0.0049, + "macro_f1": 0.3272727429866791, + "num_tokens": 10264805.0, + "repeat_count": 0.0, + "routers_loss": 0.013723359443247318, + "skip_count": 1.0, + "step": 6364, + "text_loss": 0.5563676357269287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0003789556310571351, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 10267885.0, + "repeat_count": 0.0, + "routers_loss": 0.0028159532230347395, + "skip_count": 0.0, + "step": 6366, + "text_loss": 0.7284183502197266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0003786553445944204, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10270934.0, + "repeat_count": 0.0, + "routers_loss": 0.0005918835522606969, + "skip_count": 0.0, + "step": 6368, + "text_loss": 0.7387746572494507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0003783551046312067, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 10273818.0, + "repeat_count": 0.0, + "routers_loss": 0.0011416864581406116, + "skip_count": 0.0, + "step": 6370, + "text_loss": 0.5360285043716431 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 29.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.00037805491128254645, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 10276494.0, + "repeat_count": 2.0, + "routers_loss": 0.002382483799010515, + "skip_count": 1.0, + "step": 6372, + "text_loss": 0.7536854147911072 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.00037775476466347414, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 10279719.0, + "repeat_count": 0.0, + "routers_loss": 0.0021104486659169197, + "skip_count": 1.0, + "step": 6374, + "text_loss": 0.6807253956794739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0003774546648890066, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 10283000.0, + "repeat_count": 0.0, + "routers_loss": 0.003148776013404131, + "skip_count": 2.0, + "step": 6376, + "text_loss": 0.30774110555648804 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0003771546120741426, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 10285666.0, + "repeat_count": 1.0, + "routers_loss": 0.007700880523771048, + "skip_count": 1.0, + "step": 6378, + "text_loss": 0.4476076364517212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003768546063338631, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10289127.0, + "repeat_count": 0.0, + "routers_loss": 0.0023625255562365055, + "skip_count": 1.0, + "step": 6380, + "text_loss": 0.4350969195365906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.0003765546477831307, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10292485.0, + "repeat_count": 0.0, + "routers_loss": 0.001428726245649159, + "skip_count": 0.0, + "step": 6382, + "text_loss": 0.49078530073165894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0003762547365368902, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 10295361.0, + "repeat_count": 0.0, + "routers_loss": 0.0027160397730767727, + "skip_count": 2.0, + "step": 6384, + "text_loss": 0.3476370573043823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00037595487271006807, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10298717.0, + "repeat_count": 0.0, + "routers_loss": 0.002456068294122815, + "skip_count": 0.0, + "step": 6386, + "text_loss": 0.3634916841983795 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 29.99090108599941, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.021240234375, + "learning_rate": 0.0003756550564175727, + "loss": 0.0049, + "macro_f1": 0.9265305995941162, + "num_tokens": 10302102.0, + "repeat_count": 1.0, + "routers_loss": 0.02546076290309429, + "skip_count": 3.0, + "step": 6388, + "text_loss": 0.2422582060098648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00037535528777429426, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10305060.0, + "repeat_count": 0.0, + "routers_loss": 0.001045907847583294, + "skip_count": 0.0, + "step": 6390, + "text_loss": 0.5563194155693054 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0003750555668951045, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 10307903.0, + "repeat_count": 1.0, + "routers_loss": 0.007391332648694515, + "skip_count": 2.0, + "step": 6392, + "text_loss": 0.3423991799354553 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00037475589389485744, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 10311396.0, + "repeat_count": 1.0, + "routers_loss": 0.0029360291082412004, + "skip_count": 1.0, + "step": 6394, + "text_loss": 0.9877024292945862 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00037445626888838807, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10314250.0, + "repeat_count": 0.0, + "routers_loss": 0.0014932662015780807, + "skip_count": 0.0, + "step": 6396, + "text_loss": 0.3978523313999176 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 30.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003741566919905133, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 10316894.0, + "repeat_count": 1.0, + "routers_loss": 0.007003722712397575, + "skip_count": 5.0, + "step": 6398, + "text_loss": 0.2945566475391388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00037385716331603155, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 10319603.0, + "repeat_count": 1.0, + "routers_loss": 0.006710570305585861, + "skip_count": 1.0, + "step": 6400, + "text_loss": 0.2984389662742615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.00037355768297972275, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 10322670.0, + "repeat_count": 0.0, + "routers_loss": 0.00048738415353000164, + "skip_count": 0.0, + "step": 6402, + "text_loss": 0.483262300491333 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00037325825109634837, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 10326280.0, + "repeat_count": 1.0, + "routers_loss": 0.001625525183044374, + "skip_count": 1.0, + "step": 6404, + "text_loss": 0.42678722739219666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0003729588677806513, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10329008.0, + "repeat_count": 0.0, + "routers_loss": 0.004408636130392551, + "skip_count": 0.0, + "step": 6406, + "text_loss": 0.2264070063829422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0003726595331473557, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 10332533.0, + "repeat_count": 0.0, + "routers_loss": 0.0038099216762930155, + "skip_count": 2.0, + "step": 6408, + "text_loss": 0.6670092940330505 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0003723602473111672, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10335643.0, + "repeat_count": 1.0, + "routers_loss": 0.003097689710557461, + "skip_count": 0.0, + "step": 6410, + "text_loss": 0.45228812098503113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.00037206101038677274, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 10338522.0, + "repeat_count": 0.0, + "routers_loss": 0.005268602631986141, + "skip_count": 1.0, + "step": 6412, + "text_loss": 0.7288079857826233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0003717618224888405, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 10341516.0, + "repeat_count": 0.0, + "routers_loss": 0.004640138708055019, + "skip_count": 2.0, + "step": 6414, + "text_loss": 0.22850871086120605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00037146268373201954, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10344831.0, + "repeat_count": 0.0, + "routers_loss": 0.0006379318656399846, + "skip_count": 0.0, + "step": 6416, + "text_loss": 0.7864460945129395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0003711635942309408, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 10348499.0, + "repeat_count": 0.0, + "routers_loss": 0.0004005273221991956, + "skip_count": 0.0, + "step": 6418, + "text_loss": 0.605839192867279 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0157470703125, + "learning_rate": 0.0003708645541002159, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10351722.0, + "repeat_count": 0.0, + "routers_loss": 0.001061634044162929, + "skip_count": 0.0, + "step": 6420, + "text_loss": 0.8226510286331177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 30.150278837687114, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0003705655634544374, + "loss": 0.0052, + "macro_f1": 0.5492662787437439, + "num_tokens": 10355275.0, + "repeat_count": 0.0, + "routers_loss": 0.013980664312839508, + "skip_count": 2.0, + "step": 6422, + "text_loss": 0.2709597647190094 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0003702666224081792, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10359702.0, + "repeat_count": 1.0, + "routers_loss": 0.0013196271611377597, + "skip_count": 0.0, + "step": 6424, + "text_loss": 0.6451483368873596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00036996773107599604, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10363364.0, + "repeat_count": 0.0, + "routers_loss": 0.0028023163322359324, + "skip_count": 1.0, + "step": 6426, + "text_loss": 0.2770799398422241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01373291015625, + "learning_rate": 0.0003696688895724235, + "loss": 0.0029, + "macro_f1": 0.3333333432674408, + "num_tokens": 10366554.0, + "repeat_count": 0.0, + "routers_loss": 0.0011023655533790588, + "skip_count": 0.0, + "step": 6428, + "text_loss": 0.5466503500938416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0003693700980119784, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10369733.0, + "repeat_count": 0.0, + "routers_loss": 0.00230707717128098, + "skip_count": 0.0, + "step": 6430, + "text_loss": 0.45667049288749695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00036907135650915824, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 10373382.0, + "repeat_count": 0.0, + "routers_loss": 0.0036784098483622074, + "skip_count": 2.0, + "step": 6432, + "text_loss": 0.13856995105743408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00036877266517844115, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 10376202.0, + "repeat_count": 0.0, + "routers_loss": 0.0008461157558485866, + "skip_count": 0.0, + "step": 6434, + "text_loss": 0.27238601446151733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.0003684740241342863, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 10380748.0, + "repeat_count": 0.0, + "routers_loss": 0.0052765593864023685, + "skip_count": 0.0, + "step": 6436, + "text_loss": 0.6182295083999634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.00036817543349113355, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 10386148.0, + "repeat_count": 1.0, + "routers_loss": 0.005562922917306423, + "skip_count": 2.0, + "step": 6438, + "text_loss": 0.5591027140617371 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0003678768933634033, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10389385.0, + "repeat_count": 0.0, + "routers_loss": 0.0008686366491019726, + "skip_count": 0.0, + "step": 6440, + "text_loss": 0.5158660411834717 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0003675784038654968, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 10391893.0, + "repeat_count": 0.0, + "routers_loss": 0.0022222092375159264, + "skip_count": 1.0, + "step": 6442, + "text_loss": 0.2865697741508484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0003672799651117958, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 10395082.0, + "repeat_count": 0.0, + "routers_loss": 0.0030799773521721363, + "skip_count": 2.0, + "step": 6444, + "text_loss": 0.21298295259475708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003669815772166625, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10398015.0, + "repeat_count": 0.0, + "routers_loss": 0.0035721305757761, + "skip_count": 3.0, + "step": 6446, + "text_loss": 0.5286803841590881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 30.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.00036668324029443975, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 10400749.0, + "repeat_count": 0.0, + "routers_loss": 0.00741040613502264, + "skip_count": 4.0, + "step": 6448, + "text_loss": 0.3922366201877594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0003663849544594507, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 10404439.0, + "repeat_count": 0.0, + "routers_loss": 0.002974750241264701, + "skip_count": 2.0, + "step": 6450, + "text_loss": 0.21894219517707825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.00036608671982599927, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10408476.0, + "repeat_count": 0.0, + "routers_loss": 0.004810616374015808, + "skip_count": 0.0, + "step": 6452, + "text_loss": 0.3928622305393219 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0003657885365083694, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 10411533.0, + "repeat_count": 1.0, + "routers_loss": 0.005527745466679335, + "skip_count": 0.0, + "step": 6454, + "text_loss": 0.22816279530525208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.00036549040462082556, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 10414501.0, + "repeat_count": 0.0, + "routers_loss": 0.0021297158673405647, + "skip_count": 0.0, + "step": 6456, + "text_loss": 0.20487719774246216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 30.31934253008512, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003651923242776124, + "loss": 0.0082, + "macro_f1": 0.6592592597007751, + "num_tokens": 10418296.0, + "repeat_count": 1.0, + "routers_loss": 0.046412210911512375, + "skip_count": 5.0, + "step": 6458, + "text_loss": 0.2890419065952301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00036489429559295484, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10421211.0, + "repeat_count": 0.0, + "routers_loss": 0.004002603702247143, + "skip_count": 0.0, + "step": 6460, + "text_loss": 0.23165544867515564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003645963186810581, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 10424231.0, + "repeat_count": 0.0, + "routers_loss": 0.003480088198557496, + "skip_count": 1.0, + "step": 6462, + "text_loss": 0.6286683082580566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003642983936561075, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 10427387.0, + "repeat_count": 0.0, + "routers_loss": 0.009358933195471764, + "skip_count": 2.0, + "step": 6464, + "text_loss": 0.3258316218852997 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.356912239506897, + "f1_execute": 0.9729729890823364, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00036400052063226816, + "loss": 0.0048, + "macro_f1": 0.9539539813995361, + "num_tokens": 10430813.0, + "repeat_count": 5.0, + "routers_loss": 0.03567950055003166, + "skip_count": 5.0, + "step": 6466, + "text_loss": 0.7278715968132019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00036370269972368615, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 10434175.0, + "repeat_count": 1.0, + "routers_loss": 0.00226925453171134, + "skip_count": 2.0, + "step": 6468, + "text_loss": 0.5652450919151306 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0174560546875, + "learning_rate": 0.0003634049310444867, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10437393.0, + "repeat_count": 0.0, + "routers_loss": 0.0013644809368997812, + "skip_count": 0.0, + "step": 6470, + "text_loss": 0.5985191464424133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0003631072147087753, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 10440412.0, + "repeat_count": 0.0, + "routers_loss": 0.0003114990540780127, + "skip_count": 0.0, + "step": 6472, + "text_loss": 0.5588209629058838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00036280955083063747, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 10443471.0, + "repeat_count": 0.0, + "routers_loss": 0.0005486322334036231, + "skip_count": 0.0, + "step": 6474, + "text_loss": 0.6969016194343567 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00036251193952413865, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 10446548.0, + "repeat_count": 1.0, + "routers_loss": 0.008256378583610058, + "skip_count": 2.0, + "step": 6476, + "text_loss": 0.27083566784858704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0003622143809033239, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10449478.0, + "repeat_count": 0.0, + "routers_loss": 0.001008771825581789, + "skip_count": 0.0, + "step": 6478, + "text_loss": 0.1689433604478836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00036191687508221827, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 10453017.0, + "repeat_count": 1.0, + "routers_loss": 0.0014678959269076586, + "skip_count": 0.0, + "step": 6480, + "text_loss": 0.9571998715400696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0003616194221748267, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10456061.0, + "repeat_count": 0.0, + "routers_loss": 0.001516164978966117, + "skip_count": 0.0, + "step": 6482, + "text_loss": 0.5750429034233093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0003613220222951335, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10459130.0, + "repeat_count": 0.0, + "routers_loss": 0.0031315975356847048, + "skip_count": 0.0, + "step": 6484, + "text_loss": 0.47120073437690735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0003610246755571029, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10462190.0, + "repeat_count": 0.0, + "routers_loss": 0.0006079549202695489, + "skip_count": 0.0, + "step": 6486, + "text_loss": 0.8426173329353333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000360727382074679, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10465233.0, + "repeat_count": 0.0, + "routers_loss": 0.00596054969355464, + "skip_count": 0.0, + "step": 6488, + "text_loss": 0.18435880541801453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.469621367772234, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00036043014196178463, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 10468135.0, + "repeat_count": 0.0, + "routers_loss": 0.008584967814385891, + "skip_count": 1.0, + "step": 6490, + "text_loss": 0.3827758729457855 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.00036013295533232344, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10471032.0, + "repeat_count": 2.0, + "routers_loss": 0.005076571833342314, + "skip_count": 5.0, + "step": 6492, + "text_loss": 0.1215854063630104 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 30.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0003598358223001776, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 10474779.0, + "repeat_count": 3.0, + "routers_loss": 0.005972118582576513, + "skip_count": 0.0, + "step": 6494, + "text_loss": 0.22768665850162506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0003595387429792091, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 10478015.0, + "repeat_count": 0.0, + "routers_loss": 0.004733685404062271, + "skip_count": 1.0, + "step": 6496, + "text_loss": 0.5013535618782043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00035924171748325916, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10481113.0, + "repeat_count": 0.0, + "routers_loss": 0.01148980576545, + "skip_count": 2.0, + "step": 6498, + "text_loss": 0.3281762897968292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0003589447459261487, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10484049.0, + "repeat_count": 0.0, + "routers_loss": 0.007726775947958231, + "skip_count": 2.0, + "step": 6500, + "text_loss": 0.46294569969177246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00035864782842167763, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 10487443.0, + "repeat_count": 1.0, + "routers_loss": 0.0013331319205462933, + "skip_count": 0.0, + "step": 6502, + "text_loss": 0.5122153759002686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.00035835096508362544, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10490535.0, + "repeat_count": 0.0, + "routers_loss": 0.0011629529763013124, + "skip_count": 0.0, + "step": 6504, + "text_loss": 0.40683525800704956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00035805415602575054, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10493575.0, + "repeat_count": 0.0, + "routers_loss": 0.004780632443726063, + "skip_count": 0.0, + "step": 6506, + "text_loss": 0.37263134121894836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00035775740136179075, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10496193.0, + "repeat_count": 0.0, + "routers_loss": 0.0018355643842369318, + "skip_count": 0.0, + "step": 6508, + "text_loss": 0.2074306458234787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00035746070120546314, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10500135.0, + "repeat_count": 0.0, + "routers_loss": 0.004067617934197187, + "skip_count": 1.0, + "step": 6510, + "text_loss": 0.26313406229019165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00035716405567046383, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10503533.0, + "repeat_count": 0.0, + "routers_loss": 0.005438363179564476, + "skip_count": 0.0, + "step": 6512, + "text_loss": 0.3448122441768646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.00035686746487046767, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 10506207.0, + "repeat_count": 0.0, + "routers_loss": 0.0012895528925582767, + "skip_count": 0.0, + "step": 6514, + "text_loss": 0.43096476793289185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0003565709289191291, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10509257.0, + "repeat_count": 0.0, + "routers_loss": 0.003141741268336773, + "skip_count": 0.0, + "step": 6516, + "text_loss": 0.22349724173545837 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0003562744479300811, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10512554.0, + "repeat_count": 0.0, + "routers_loss": 0.0005669888923875988, + "skip_count": 0.0, + "step": 6518, + "text_loss": 0.5319190621376038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00035597802201693587, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10515720.0, + "repeat_count": 0.0, + "routers_loss": 0.0020814717281609774, + "skip_count": 0.0, + "step": 6520, + "text_loss": 0.20216144621372223 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0003556816512932841, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 10518517.0, + "repeat_count": 2.0, + "routers_loss": 0.010716461576521397, + "skip_count": 3.0, + "step": 6522, + "text_loss": 0.15843836963176727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.0003553853358726959, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 10521414.0, + "repeat_count": 0.0, + "routers_loss": 0.0014748790999874473, + "skip_count": 0.0, + "step": 6524, + "text_loss": 0.393892377614975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00035508907586871984, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10524210.0, + "repeat_count": 0.0, + "routers_loss": 0.0004757299611810595, + "skip_count": 0.0, + "step": 6526, + "text_loss": 0.2557907700538635 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00035479287139488327, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 10527327.0, + "repeat_count": 1.0, + "routers_loss": 0.002445317106321454, + "skip_count": 0.0, + "step": 6528, + "text_loss": 0.48338422179222107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0003544967225646922, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 10530363.0, + "repeat_count": 0.0, + "routers_loss": 0.0015845977468416095, + "skip_count": 0.0, + "step": 6530, + "text_loss": 0.6474354267120361 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.00035420062949163166, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 10533444.0, + "repeat_count": 0.0, + "routers_loss": 0.002190655330196023, + "skip_count": 0.0, + "step": 6532, + "text_loss": 0.3789777457714081 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0003539045922891649, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10536711.0, + "repeat_count": 0.0, + "routers_loss": 0.00317079434171319, + "skip_count": 0.0, + "step": 6534, + "text_loss": 0.25758084654808044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00035360861107073394, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 10539849.0, + "repeat_count": 0.0, + "routers_loss": 0.0010938458144664764, + "skip_count": 0.0, + "step": 6536, + "text_loss": 0.9821014404296875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0003533126859497592, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10543004.0, + "repeat_count": 0.0, + "routers_loss": 0.003071998478844762, + "skip_count": 2.0, + "step": 6538, + "text_loss": 0.6314182281494141 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0003530168170396401, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 10545965.0, + "repeat_count": 0.0, + "routers_loss": 0.006067665759474039, + "skip_count": 2.0, + "step": 6540, + "text_loss": 0.5021927356719971 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0167236328125, + "learning_rate": 0.000352721004453754, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 10549188.0, + "repeat_count": 0.0, + "routers_loss": 0.0019109295681118965, + "skip_count": 0.0, + "step": 6542, + "text_loss": 0.3008780777454376 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00035242524830545683, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10552298.0, + "repeat_count": 0.0, + "routers_loss": 0.007457790896296501, + "skip_count": 3.0, + "step": 6544, + "text_loss": 0.5675695538520813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0003521295487080829, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 10555123.0, + "repeat_count": 0.0, + "routers_loss": 0.007243642583489418, + "skip_count": 1.0, + "step": 6546, + "text_loss": 0.17955881357192993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00035183390577494476, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 10559653.0, + "repeat_count": 0.0, + "routers_loss": 0.004024330526590347, + "skip_count": 0.0, + "step": 6548, + "text_loss": 0.2634682357311249 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.017578125, + "learning_rate": 0.0003515383196193336, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10563770.0, + "repeat_count": 1.0, + "routers_loss": 0.010837121866643429, + "skip_count": 0.0, + "step": 6550, + "text_loss": 0.1608252227306366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0003512427903545183, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10567117.0, + "repeat_count": 0.0, + "routers_loss": 0.003473864868283272, + "skip_count": 0.0, + "step": 6552, + "text_loss": 0.231611430644989 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0003509473180937464, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10570622.0, + "repeat_count": 0.0, + "routers_loss": 0.004441239405423403, + "skip_count": 1.0, + "step": 6554, + "text_loss": 0.3193909227848053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0003506519029502433, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10573411.0, + "repeat_count": 0.0, + "routers_loss": 0.0008821079391054809, + "skip_count": 0.0, + "step": 6556, + "text_loss": 0.4478783905506134 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0003503565450372128, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 10576422.0, + "repeat_count": 1.0, + "routers_loss": 0.0014448441797867417, + "skip_count": 0.0, + "step": 6558, + "text_loss": 0.46065983176231384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0003500612444678365, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 10579879.0, + "repeat_count": 0.0, + "routers_loss": 0.007939066737890244, + "skip_count": 1.0, + "step": 6560, + "text_loss": 0.3299395740032196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000349766001355274, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 10583067.0, + "repeat_count": 0.0, + "routers_loss": 0.010073966346681118, + "skip_count": 2.0, + "step": 6562, + "text_loss": 0.278255820274353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.00034947081581266335, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 10586276.0, + "repeat_count": 0.0, + "routers_loss": 0.0062315030954778194, + "skip_count": 1.0, + "step": 6564, + "text_loss": 0.22706018388271332 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003491756879531201, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10589257.0, + "repeat_count": 3.0, + "routers_loss": 0.0023778853937983513, + "skip_count": 4.0, + "step": 6566, + "text_loss": 0.5567800998687744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0003488806178897377, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 10592163.0, + "repeat_count": 0.0, + "routers_loss": 0.0004184350254945457, + "skip_count": 0.0, + "step": 6568, + "text_loss": 0.4027897119522095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003485856057355876, + "loss": 0.0027, + "macro_f1": 0.6666666865348816, + "num_tokens": 10595326.0, + "repeat_count": 0.0, + "routers_loss": 0.0035254736430943012, + "skip_count": 1.0, + "step": 6570, + "text_loss": 0.3044572174549103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000348290651603719, + "loss": 0.0029, + "macro_f1": 0.3333333432674408, + "num_tokens": 10598236.0, + "repeat_count": 0.0, + "routers_loss": 0.0030894684605300426, + "skip_count": 0.0, + "step": 6572, + "text_loss": 0.23021161556243896 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.00034799575560715896, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 10601653.0, + "repeat_count": 1.0, + "routers_loss": 0.0036557347048074007, + "skip_count": 0.0, + "step": 6574, + "text_loss": 0.5437754392623901 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0003477009178589121, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10604581.0, + "repeat_count": 2.0, + "routers_loss": 0.021344119682908058, + "skip_count": 4.0, + "step": 6576, + "text_loss": 0.29078927636146545 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0003474061384719608, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10607676.0, + "repeat_count": 1.0, + "routers_loss": 0.0037169242277741432, + "skip_count": 1.0, + "step": 6578, + "text_loss": 1.1790896654129028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0003471114175592649, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 10611269.0, + "repeat_count": 2.0, + "routers_loss": 0.005873420741409063, + "skip_count": 4.0, + "step": 6580, + "text_loss": 0.36204129457473755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0003468167552337624, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 10614335.0, + "repeat_count": 1.0, + "routers_loss": 0.01030842587351799, + "skip_count": 2.0, + "step": 6582, + "text_loss": 0.20400437712669373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.00034652215160836826, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 10617565.0, + "repeat_count": 0.0, + "routers_loss": 0.0025721401907503605, + "skip_count": 0.0, + "step": 6584, + "text_loss": 0.44676345586776733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00034622760679597507, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10620706.0, + "repeat_count": 0.0, + "routers_loss": 0.005751762073487043, + "skip_count": 1.0, + "step": 6586, + "text_loss": 0.4733653664588928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00034593312090945306, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 10623916.0, + "repeat_count": 0.0, + "routers_loss": 0.0029759553726762533, + "skip_count": 3.0, + "step": 6588, + "text_loss": 0.49876922369003296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0003456386940616498, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10628093.0, + "repeat_count": 0.0, + "routers_loss": 0.0010031822603195906, + "skip_count": 0.0, + "step": 6590, + "text_loss": 0.42708611488342285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00034534432636539004, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10631739.0, + "repeat_count": 0.0, + "routers_loss": 0.0014793311711400747, + "skip_count": 0.0, + "step": 6592, + "text_loss": 0.18193726241588593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0003450500179334762, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10634862.0, + "repeat_count": 0.0, + "routers_loss": 0.0059733521193265915, + "skip_count": 2.0, + "step": 6594, + "text_loss": 0.28596529364585876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.967420017610802, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0003447557688786879, + "loss": 0.0043, + "macro_f1": 0.3272727429866791, + "num_tokens": 10637758.0, + "repeat_count": 0.0, + "routers_loss": 0.0076768649742007256, + "skip_count": 1.0, + "step": 6596, + "text_loss": 0.39428210258483887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00034446157931378185, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10640440.0, + "repeat_count": 0.0, + "routers_loss": 0.0015128811355680227, + "skip_count": 0.0, + "step": 6598, + "text_loss": 0.45584383606910706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.00034416744935149193, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 10643600.0, + "repeat_count": 0.0, + "routers_loss": 0.000757391273509711, + "skip_count": 0.0, + "step": 6600, + "text_loss": 0.503209114074707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0003438733791045294, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10646907.0, + "repeat_count": 0.0, + "routers_loss": 0.0025944956578314304, + "skip_count": 2.0, + "step": 6602, + "text_loss": 0.4370735287666321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00034357936868558255, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10649995.0, + "repeat_count": 0.0, + "routers_loss": 0.0006543452036567032, + "skip_count": 0.0, + "step": 6604, + "text_loss": 0.4125586748123169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00034328541820731663, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10653251.0, + "repeat_count": 0.0, + "routers_loss": 0.00027016724925488234, + "skip_count": 1.0, + "step": 6606, + "text_loss": 0.7309898734092712 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 31.023481068388612, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.020751953125, + "learning_rate": 0.00034299152778237413, + "loss": 0.0062, + "macro_f1": 0.8823530077934265, + "num_tokens": 10657229.0, + "repeat_count": 1.0, + "routers_loss": 0.01905548945069313, + "skip_count": 2.0, + "step": 6608, + "text_loss": 0.42367079854011536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0003426976975233744, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10660524.0, + "repeat_count": 0.0, + "routers_loss": 0.0004718089767266065, + "skip_count": 0.0, + "step": 6610, + "text_loss": 0.6613664627075195 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00034240392754291343, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 10663908.0, + "repeat_count": 1.0, + "routers_loss": 0.0027069442439824343, + "skip_count": 0.0, + "step": 6612, + "text_loss": 0.859471321105957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.000342110217953565, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10667814.0, + "repeat_count": 0.0, + "routers_loss": 0.0015497280983254313, + "skip_count": 0.0, + "step": 6614, + "text_loss": 0.18337638676166534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0003418165688678788, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10671630.0, + "repeat_count": 0.0, + "routers_loss": 0.0013396464055404067, + "skip_count": 0.0, + "step": 6616, + "text_loss": 0.860016405582428 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 31.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0003415229803983819, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 10675308.0, + "repeat_count": 0.0, + "routers_loss": 0.007542039267718792, + "skip_count": 3.0, + "step": 6618, + "text_loss": 0.15481022000312805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0003412294526575779, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 10678092.0, + "repeat_count": 0.0, + "routers_loss": 0.002029839437454939, + "skip_count": 2.0, + "step": 6620, + "text_loss": 0.5121933221817017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00034093598575794706, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10681382.0, + "repeat_count": 0.0, + "routers_loss": 0.0013001341139897704, + "skip_count": 0.0, + "step": 6622, + "text_loss": 0.4555061161518097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00034064257981194655, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 10684255.0, + "repeat_count": 0.0, + "routers_loss": 0.0007926415419206023, + "skip_count": 0.0, + "step": 6624, + "text_loss": 0.7298227548599243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003403492349320101, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 10686904.0, + "repeat_count": 0.0, + "routers_loss": 0.0021080176811665297, + "skip_count": 1.0, + "step": 6626, + "text_loss": 0.45434215664863586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.000340055951230548, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10690311.0, + "repeat_count": 0.0, + "routers_loss": 0.004011874087154865, + "skip_count": 0.0, + "step": 6628, + "text_loss": 0.15496443212032318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00033976272881994707, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10693395.0, + "repeat_count": 0.0, + "routers_loss": 0.0031893099658191204, + "skip_count": 2.0, + "step": 6630, + "text_loss": 0.5291517972946167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003394695678125708, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 10697046.0, + "repeat_count": 0.0, + "routers_loss": 0.0033124347683042288, + "skip_count": 1.0, + "step": 6632, + "text_loss": 0.2893230617046356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00033917646832075886, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10700111.0, + "repeat_count": 0.0, + "routers_loss": 0.002547801472246647, + "skip_count": 0.0, + "step": 6634, + "text_loss": 0.10363512486219406 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 31.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0003388834304568275, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 10703939.0, + "repeat_count": 2.0, + "routers_loss": 0.0019040531478822231, + "skip_count": 0.0, + "step": 6636, + "text_loss": 0.5185034275054932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00033859045433306975, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 10707187.0, + "repeat_count": 0.0, + "routers_loss": 0.0074104927480220795, + "skip_count": 2.0, + "step": 6638, + "text_loss": 0.1618153154850006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0003382975400617543, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 10710029.0, + "repeat_count": 0.0, + "routers_loss": 0.0013861875049769878, + "skip_count": 1.0, + "step": 6640, + "text_loss": 0.6674485206604004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0003380046877551266, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10713318.0, + "repeat_count": 0.0, + "routers_loss": 0.0034452753607183695, + "skip_count": 0.0, + "step": 6642, + "text_loss": 0.39299124479293823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0003377118975254082, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 10716130.0, + "repeat_count": 0.0, + "routers_loss": 0.006802885327488184, + "skip_count": 2.0, + "step": 6644, + "text_loss": 0.12942606210708618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.20193718814206, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003374191694847968, + "loss": 0.0052, + "macro_f1": 0.6601307392120361, + "num_tokens": 10719400.0, + "repeat_count": 1.0, + "routers_loss": 0.03718209266662598, + "skip_count": 2.0, + "step": 6646, + "text_loss": 0.34327754378318787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0003371265037454663, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 10722108.0, + "repeat_count": 0.0, + "routers_loss": 0.006016947794705629, + "skip_count": 2.0, + "step": 6648, + "text_loss": 0.15644726157188416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.220722042852948, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00033683390041956663, + "loss": 0.0075, + "macro_f1": 0.6601307392120361, + "num_tokens": 10725709.0, + "repeat_count": 1.0, + "routers_loss": 0.04308273270726204, + "skip_count": 2.0, + "step": 6650, + "text_loss": 0.1875772923231125 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 31.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003365413596192243, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 10728717.0, + "repeat_count": 2.0, + "routers_loss": 0.006372809875756502, + "skip_count": 1.0, + "step": 6652, + "text_loss": 0.4948291778564453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00033624888145654137, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10732082.0, + "repeat_count": 0.0, + "routers_loss": 0.0014530479675158858, + "skip_count": 0.0, + "step": 6654, + "text_loss": 0.44932305812835693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00033595646604359585, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10734663.0, + "repeat_count": 0.0, + "routers_loss": 0.001924810465425253, + "skip_count": 0.0, + "step": 6656, + "text_loss": 0.45626893639564514 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00033566411349244206, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10737470.0, + "repeat_count": 1.0, + "routers_loss": 0.0040014320984482765, + "skip_count": 0.0, + "step": 6658, + "text_loss": 0.2700682580471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00033537182391510996, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10740228.0, + "repeat_count": 0.0, + "routers_loss": 0.0008573737577535212, + "skip_count": 0.0, + "step": 6660, + "text_loss": 0.5626822113990784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003350795974236055, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 10742883.0, + "repeat_count": 0.0, + "routers_loss": 0.011166860349476337, + "skip_count": 1.0, + "step": 6662, + "text_loss": 0.23357805609703064 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 31.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00033478743412991037, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 10746459.0, + "repeat_count": 1.0, + "routers_loss": 0.01719980500638485, + "skip_count": 6.0, + "step": 6664, + "text_loss": 0.150017648935318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00033449533414598223, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 10749984.0, + "repeat_count": 0.0, + "routers_loss": 0.0038280142471194267, + "skip_count": 2.0, + "step": 6666, + "text_loss": 0.6312657594680786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00033420329758375423, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 10752792.0, + "repeat_count": 0.0, + "routers_loss": 0.0007688060286454856, + "skip_count": 1.0, + "step": 6668, + "text_loss": 0.6794863939285278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00033391132455513537, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10756125.0, + "repeat_count": 0.0, + "routers_loss": 0.003196930279955268, + "skip_count": 2.0, + "step": 6670, + "text_loss": 0.22897565364837646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0003336194151720102, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 10759296.0, + "repeat_count": 0.0, + "routers_loss": 0.0026212623342871666, + "skip_count": 0.0, + "step": 6672, + "text_loss": 0.5236268639564514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0003333275695462391, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10762574.0, + "repeat_count": 0.0, + "routers_loss": 0.007855101488530636, + "skip_count": 2.0, + "step": 6674, + "text_loss": 0.2971038818359375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0003330357877896577, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10765758.0, + "repeat_count": 0.0, + "routers_loss": 0.004191791173070669, + "skip_count": 2.0, + "step": 6676, + "text_loss": 0.17358586192131042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0003327440700140774, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 10769396.0, + "repeat_count": 0.0, + "routers_loss": 0.004101858474314213, + "skip_count": 1.0, + "step": 6678, + "text_loss": 0.28932204842567444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.000332452416331285, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 10772605.0, + "repeat_count": 0.0, + "routers_loss": 0.0008305918308906257, + "skip_count": 0.0, + "step": 6680, + "text_loss": 0.47090092301368713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0003321608268530427, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 10776576.0, + "repeat_count": 0.0, + "routers_loss": 0.003022305201739073, + "skip_count": 1.0, + "step": 6682, + "text_loss": 0.4467788338661194 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00033186930169108795, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10779648.0, + "repeat_count": 1.0, + "routers_loss": 0.0021474999375641346, + "skip_count": 0.0, + "step": 6684, + "text_loss": 0.6249470710754395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.00033157784095713417, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 10782665.0, + "repeat_count": 0.0, + "routers_loss": 0.0025120675563812256, + "skip_count": 1.0, + "step": 6686, + "text_loss": 0.6763803958892822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0003312864447628695, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 10785789.0, + "repeat_count": 0.0, + "routers_loss": 0.0013111691223457456, + "skip_count": 1.0, + "step": 6688, + "text_loss": 0.6609058380126953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00033099511321995744, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 10788846.0, + "repeat_count": 0.0, + "routers_loss": 0.0012354454956948757, + "skip_count": 0.0, + "step": 6690, + "text_loss": 0.4421829283237457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0003307038464400368, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10791611.0, + "repeat_count": 0.0, + "routers_loss": 0.0035219944547861814, + "skip_count": 2.0, + "step": 6692, + "text_loss": 0.16222824156284332 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00033041264453472153, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10794868.0, + "repeat_count": 1.0, + "routers_loss": 0.0007216202793642879, + "skip_count": 0.0, + "step": 6694, + "text_loss": 0.37388721108436584 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 31.436747872028178, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0003301215076156008, + "loss": 0.0063, + "macro_f1": 0.8803418874740601, + "num_tokens": 10797737.0, + "repeat_count": 2.0, + "routers_loss": 0.025403080508112907, + "skip_count": 7.0, + "step": 6696, + "text_loss": 0.5086690187454224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0003298304357942389, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 10800972.0, + "repeat_count": 0.0, + "routers_loss": 0.010532539337873459, + "skip_count": 2.0, + "step": 6698, + "text_loss": 0.22500646114349365 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00032953942918217494, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 10803654.0, + "repeat_count": 0.0, + "routers_loss": 0.0009591903653927147, + "skip_count": 0.0, + "step": 6700, + "text_loss": 0.6256277561187744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0003292484878909232, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10807506.0, + "repeat_count": 0.0, + "routers_loss": 0.003801517654210329, + "skip_count": 2.0, + "step": 6702, + "text_loss": 0.522081196308136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00032895761203197317, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 10810163.0, + "repeat_count": 0.0, + "routers_loss": 0.002608039416372776, + "skip_count": 2.0, + "step": 6704, + "text_loss": 0.3600201904773712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00032866680171678874, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10813202.0, + "repeat_count": 0.0, + "routers_loss": 0.0026464913971722126, + "skip_count": 0.0, + "step": 6706, + "text_loss": 0.2513798773288727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00032837605705680895, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 10816484.0, + "repeat_count": 0.0, + "routers_loss": 0.0027157769072800875, + "skip_count": 0.0, + "step": 6708, + "text_loss": 0.34391456842422485 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0003280853781634481, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 10819794.0, + "repeat_count": 1.0, + "routers_loss": 0.0016086180694401264, + "skip_count": 1.0, + "step": 6710, + "text_loss": 0.6535179615020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003277947651480946, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10823033.0, + "repeat_count": 0.0, + "routers_loss": 0.002368347719311714, + "skip_count": 0.0, + "step": 6712, + "text_loss": 0.5596423745155334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0003275042181221119, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 10826276.0, + "repeat_count": 0.0, + "routers_loss": 0.003124286886304617, + "skip_count": 0.0, + "step": 6714, + "text_loss": 0.6584402322769165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0003272137371968382, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10828846.0, + "repeat_count": 0.0, + "routers_loss": 0.0006088328082114458, + "skip_count": 0.0, + "step": 6716, + "text_loss": 0.4602710008621216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.00032692332248358645, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10832025.0, + "repeat_count": 0.0, + "routers_loss": 0.002511275466531515, + "skip_count": 2.0, + "step": 6718, + "text_loss": 0.42790886759757996 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.000326632974093644, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 10835110.0, + "repeat_count": 1.0, + "routers_loss": 0.01076667383313179, + "skip_count": 0.0, + "step": 6720, + "text_loss": 0.5659847855567932 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0003263426921382728, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 10838279.0, + "repeat_count": 2.0, + "routers_loss": 0.004973042290657759, + "skip_count": 2.0, + "step": 6722, + "text_loss": 0.675341010093689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.00032605247672870964, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 10841381.0, + "repeat_count": 0.0, + "routers_loss": 0.0013990222942084074, + "skip_count": 0.0, + "step": 6724, + "text_loss": 0.5389315485954285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00032576232797616554, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 10844583.0, + "repeat_count": 0.0, + "routers_loss": 0.003186358604580164, + "skip_count": 1.0, + "step": 6726, + "text_loss": 0.5603348016738892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003254722459918261, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 10847670.0, + "repeat_count": 0.0, + "routers_loss": 0.001443870598450303, + "skip_count": 0.0, + "step": 6728, + "text_loss": 0.6922405362129211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0003251822308868512, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 10851479.0, + "repeat_count": 0.0, + "routers_loss": 0.004294445738196373, + "skip_count": 0.0, + "step": 6730, + "text_loss": 0.7145437002182007 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.00032489228277237514, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10854489.0, + "repeat_count": 0.0, + "routers_loss": 0.0032078945077955723, + "skip_count": 0.0, + "step": 6732, + "text_loss": 0.4077773094177246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.00032460240175950664, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 10856954.0, + "repeat_count": 1.0, + "routers_loss": 0.0038214854430407286, + "skip_count": 2.0, + "step": 6734, + "text_loss": 0.32071781158447266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0003243125879593286, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10860016.0, + "repeat_count": 0.0, + "routers_loss": 0.0013407845981419086, + "skip_count": 0.0, + "step": 6736, + "text_loss": 0.45335495471954346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0003240228414828984, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 10863021.0, + "repeat_count": 0.0, + "routers_loss": 0.0010989385191351175, + "skip_count": 0.0, + "step": 6738, + "text_loss": 0.562619149684906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0003237331624412473, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 10866548.0, + "repeat_count": 0.0, + "routers_loss": 0.006139552686363459, + "skip_count": 0.0, + "step": 6740, + "text_loss": 0.14510060846805573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00032344355094538087, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10869402.0, + "repeat_count": 0.0, + "routers_loss": 0.004785746335983276, + "skip_count": 0.0, + "step": 6742, + "text_loss": 0.5655979514122009 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00032315400710627876, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 10874165.0, + "repeat_count": 0.0, + "routers_loss": 0.0052397786639630795, + "skip_count": 0.0, + "step": 6744, + "text_loss": 0.4785873591899872 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 31.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0003228645310348948, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 10876919.0, + "repeat_count": 3.0, + "routers_loss": 0.00460197776556015, + "skip_count": 1.0, + "step": 6746, + "text_loss": 0.5683879256248474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0003225751228421566, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10880179.0, + "repeat_count": 0.0, + "routers_loss": 0.0032690472435206175, + "skip_count": 0.0, + "step": 6748, + "text_loss": 0.5268497467041016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.00032228578263896607, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 10883711.0, + "repeat_count": 0.0, + "routers_loss": 0.0036305058747529984, + "skip_count": 0.0, + "step": 6750, + "text_loss": 0.16675594449043274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0003219965105361989, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10887041.0, + "repeat_count": 0.0, + "routers_loss": 0.002453352091833949, + "skip_count": 1.0, + "step": 6752, + "text_loss": 0.7010246515274048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.00032170730664470465, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10890053.0, + "repeat_count": 0.0, + "routers_loss": 0.0020381701178848743, + "skip_count": 0.0, + "step": 6754, + "text_loss": 0.46637895703315735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003214181710753069, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10893501.0, + "repeat_count": 0.0, + "routers_loss": 0.004525696858763695, + "skip_count": 0.0, + "step": 6756, + "text_loss": 0.1768684983253479 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003211291039388026, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10896480.0, + "repeat_count": 1.0, + "routers_loss": 0.0038154330104589462, + "skip_count": 0.0, + "step": 6758, + "text_loss": 0.7908347845077515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.00032084010534596326, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 10899158.0, + "repeat_count": 0.0, + "routers_loss": 0.004711449146270752, + "skip_count": 2.0, + "step": 6760, + "text_loss": 0.37209007143974304 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0003205511754075335, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 10901791.0, + "repeat_count": 1.0, + "routers_loss": 0.0025003373157233, + "skip_count": 1.0, + "step": 6762, + "text_loss": 0.8081201314926147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 31.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00032026231423423204, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10904817.0, + "repeat_count": 0.0, + "routers_loss": 0.007387075573205948, + "skip_count": 3.0, + "step": 6764, + "text_loss": 0.30355480313301086 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.76548282946874, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0003199735219367507, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 10908018.0, + "repeat_count": 2.0, + "routers_loss": 0.04275592789053917, + "skip_count": 0.0, + "step": 6766, + "text_loss": 0.26562029123306274 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.774875256824185, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0003196847986257553, + "loss": 0.008, + "macro_f1": 0.9255813956260681, + "num_tokens": 10911264.0, + "repeat_count": 3.0, + "routers_loss": 0.034824032336473465, + "skip_count": 4.0, + "step": 6768, + "text_loss": 0.2761698067188263 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00031939614441188523, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10915964.0, + "repeat_count": 0.0, + "routers_loss": 0.0011179742868989706, + "skip_count": 0.0, + "step": 6770, + "text_loss": 0.4107927083969116 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00031910755940575344, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 10918678.0, + "repeat_count": 0.0, + "routers_loss": 0.0011521469568833709, + "skip_count": 0.0, + "step": 6772, + "text_loss": 0.43064895272254944 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.000318819043717946, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10921757.0, + "repeat_count": 1.0, + "routers_loss": 0.002861087443307042, + "skip_count": 1.0, + "step": 6774, + "text_loss": 0.5945150852203369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0003185305974590229, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 10924767.0, + "repeat_count": 0.0, + "routers_loss": 0.0011365334503352642, + "skip_count": 0.0, + "step": 6776, + "text_loss": 0.36615172028541565 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0003182422207395171, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10927750.0, + "repeat_count": 1.0, + "routers_loss": 0.0034391419030725956, + "skip_count": 0.0, + "step": 6778, + "text_loss": 0.17081251740455627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003179539136699351, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10930817.0, + "repeat_count": 0.0, + "routers_loss": 0.004941808991134167, + "skip_count": 2.0, + "step": 6780, + "text_loss": 0.7683762311935425 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 31.840622248312297, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.038330078125, + "learning_rate": 0.00031766567636075675, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 10933882.0, + "repeat_count": 1.0, + "routers_loss": 0.017502857372164726, + "skip_count": 2.0, + "step": 6782, + "text_loss": 0.38010457158088684 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003173775089224353, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 10936909.0, + "repeat_count": 1.0, + "routers_loss": 0.0035372809506952763, + "skip_count": 2.0, + "step": 6784, + "text_loss": 0.5760656595230103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00031708941146539707, + "loss": 0.0061, + "macro_f1": 0.3272727429866791, + "num_tokens": 10940032.0, + "repeat_count": 1.0, + "routers_loss": 0.02229934185743332, + "skip_count": 0.0, + "step": 6786, + "text_loss": 0.5767728090286255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00031680138410004123, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 10943217.0, + "repeat_count": 0.0, + "routers_loss": 0.0028649091254919767, + "skip_count": 1.0, + "step": 6788, + "text_loss": 0.9756367802619934 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00031651342693674066, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 10947847.0, + "repeat_count": 0.0, + "routers_loss": 0.0039158593863248825, + "skip_count": 2.0, + "step": 6790, + "text_loss": 0.2504335045814514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.000316225540085841, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 10950879.0, + "repeat_count": 0.0, + "routers_loss": 0.0022091215942054987, + "skip_count": 0.0, + "step": 6792, + "text_loss": 0.525842547416687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00031593772365766105, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 10954960.0, + "repeat_count": 0.0, + "routers_loss": 0.0006841494468972087, + "skip_count": 0.0, + "step": 6794, + "text_loss": 0.6383582353591919 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.906369239800412, + "f1_execute": 0.9729729890823364, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003156499777624926, + "loss": 0.006, + "macro_f1": 0.9539539813995361, + "num_tokens": 10958278.0, + "repeat_count": 5.0, + "routers_loss": 0.03810702636837959, + "skip_count": 5.0, + "step": 6796, + "text_loss": 0.5901661515235901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.0003153623025106005, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 10962412.0, + "repeat_count": 0.0, + "routers_loss": 0.00046833412488922477, + "skip_count": 0.0, + "step": 6798, + "text_loss": 0.42693984508514404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00031507469801222233, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 10966037.0, + "repeat_count": 0.0, + "routers_loss": 0.006818041671067476, + "skip_count": 2.0, + "step": 6800, + "text_loss": 0.5326262712478638 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00031478716437756876, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10969369.0, + "repeat_count": 0.0, + "routers_loss": 0.0029889161232858896, + "skip_count": 0.0, + "step": 6802, + "text_loss": 0.49028220772743225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0003144997017168232, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10972016.0, + "repeat_count": 0.0, + "routers_loss": 0.0038266500923782587, + "skip_count": 2.0, + "step": 6804, + "text_loss": 0.43391722440719604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0003142123101401417, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10975153.0, + "repeat_count": 0.0, + "routers_loss": 0.0005866789724677801, + "skip_count": 0.0, + "step": 6806, + "text_loss": 0.5888382196426392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00031392498975765353, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 10977881.0, + "repeat_count": 0.0, + "routers_loss": 0.002122384263202548, + "skip_count": 0.0, + "step": 6808, + "text_loss": 0.30313390493392944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0003136377406794604, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 10982025.0, + "repeat_count": 0.0, + "routers_loss": 0.0005535652744583786, + "skip_count": 0.0, + "step": 6810, + "text_loss": 0.5788959264755249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003133505630156365, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 10985419.0, + "repeat_count": 0.0, + "routers_loss": 0.010623604990541935, + "skip_count": 2.0, + "step": 6812, + "text_loss": 0.18577243387699127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.00031306345687622905, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10989116.0, + "repeat_count": 0.0, + "routers_loss": 0.0004721239674836397, + "skip_count": 0.0, + "step": 6814, + "text_loss": 0.4818301200866699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0167236328125, + "learning_rate": 0.0003127764223712575, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10992064.0, + "repeat_count": 0.0, + "routers_loss": 0.0004238430701661855, + "skip_count": 0.0, + "step": 6816, + "text_loss": 0.7482771277427673 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003124894596107141, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10994903.0, + "repeat_count": 1.0, + "routers_loss": 0.005224394146353006, + "skip_count": 2.0, + "step": 6818, + "text_loss": 0.186603844165802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.00031220256870456356, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 10998692.0, + "repeat_count": 1.0, + "routers_loss": 0.0021751862950623035, + "skip_count": 2.0, + "step": 6820, + "text_loss": 0.45633986592292786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 32.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00031191574976274284, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11001284.0, + "repeat_count": 0.0, + "routers_loss": 0.004747046157717705, + "skip_count": 4.0, + "step": 6822, + "text_loss": 0.5651670694351196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003116290028951617, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 11004293.0, + "repeat_count": 0.0, + "routers_loss": 0.0008316585444845259, + "skip_count": 0.0, + "step": 6824, + "text_loss": 0.3167279362678528 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.000311342328211702, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11007080.0, + "repeat_count": 0.0, + "routers_loss": 0.0004732926026917994, + "skip_count": 0.0, + "step": 6826, + "text_loss": 0.49171411991119385 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000311055725822218, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 11010078.0, + "repeat_count": 1.0, + "routers_loss": 0.004238729365170002, + "skip_count": 0.0, + "step": 6828, + "text_loss": 0.21484950184822083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0003107691958365361, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 11013368.0, + "repeat_count": 0.0, + "routers_loss": 0.0029175232630223036, + "skip_count": 2.0, + "step": 6830, + "text_loss": 0.3718266189098358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003104827383644555, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11016704.0, + "repeat_count": 0.0, + "routers_loss": 0.00191891985014081, + "skip_count": 0.0, + "step": 6832, + "text_loss": 0.28772637248039246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00031019635351574705, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 11019651.0, + "repeat_count": 0.0, + "routers_loss": 0.004300855100154877, + "skip_count": 2.0, + "step": 6834, + "text_loss": 0.6583508849143982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.000309910041400154, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11023847.0, + "repeat_count": 0.0, + "routers_loss": 0.00037701442488469183, + "skip_count": 0.0, + "step": 6836, + "text_loss": 0.36090534925460815 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 32.10331670090989, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0003096238021273917, + "loss": 0.0077, + "macro_f1": 0.9265305995941162, + "num_tokens": 11027804.0, + "repeat_count": 1.0, + "routers_loss": 0.03601725772023201, + "skip_count": 3.0, + "step": 6838, + "text_loss": 0.24180401861667633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.11270912826534, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00030933763580714757, + "loss": 0.0052, + "macro_f1": 0.6601307392120361, + "num_tokens": 11030778.0, + "repeat_count": 1.0, + "routers_loss": 0.023780640214681625, + "skip_count": 2.0, + "step": 6840, + "text_loss": 0.4978102743625641 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00030905154254908104, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 11034863.0, + "repeat_count": 1.0, + "routers_loss": 0.00565778324380517, + "skip_count": 0.0, + "step": 6842, + "text_loss": 0.558772623538971 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00030876552246282356, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11038488.0, + "repeat_count": 0.0, + "routers_loss": 0.010575232096016407, + "skip_count": 0.0, + "step": 6844, + "text_loss": 0.2955974340438843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0003084795756579787, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11041796.0, + "repeat_count": 0.0, + "routers_loss": 0.0015910190995782614, + "skip_count": 0.0, + "step": 6846, + "text_loss": 0.5009704828262329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0003081937022441217, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 11045141.0, + "repeat_count": 0.0, + "routers_loss": 0.0008034126949496567, + "skip_count": 0.0, + "step": 6848, + "text_loss": 0.3965311646461487 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 32.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0003079079023307999, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11047814.0, + "repeat_count": 2.0, + "routers_loss": 0.00810160581022501, + "skip_count": 0.0, + "step": 6850, + "text_loss": 0.24341927468776703 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0003076221760275321, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 11051330.0, + "repeat_count": 1.0, + "routers_loss": 0.006590691395103931, + "skip_count": 0.0, + "step": 6852, + "text_loss": 0.5887606739997864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00030733652344380936, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11055006.0, + "repeat_count": 0.0, + "routers_loss": 0.0005845054984092712, + "skip_count": 0.0, + "step": 6854, + "text_loss": 0.6621366739273071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003070509446890944, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 11058470.0, + "repeat_count": 0.0, + "routers_loss": 0.0041051446460187435, + "skip_count": 1.0, + "step": 6856, + "text_loss": 0.31603100895881653 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0003067654398728214, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 11061620.0, + "repeat_count": 1.0, + "routers_loss": 0.001603201380930841, + "skip_count": 0.0, + "step": 6858, + "text_loss": 0.5167516469955444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.00030648000910439636, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11064727.0, + "repeat_count": 0.0, + "routers_loss": 0.0024816282093524933, + "skip_count": 0.0, + "step": 6860, + "text_loss": 0.5869330167770386 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00030619465249319693, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11068208.0, + "repeat_count": 1.0, + "routers_loss": 0.003121294779703021, + "skip_count": 0.0, + "step": 6862, + "text_loss": 0.3920222818851471 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0003059093701485722, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11071315.0, + "repeat_count": 0.0, + "routers_loss": 0.0033239589538425207, + "skip_count": 1.0, + "step": 6864, + "text_loss": 0.4201887845993042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00030562416217984296, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 11074144.0, + "repeat_count": 0.0, + "routers_loss": 0.0016117560444399714, + "skip_count": 0.0, + "step": 6866, + "text_loss": 0.5283045172691345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0003053390286963015, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11077152.0, + "repeat_count": 0.0, + "routers_loss": 0.003879208816215396, + "skip_count": 0.0, + "step": 6868, + "text_loss": 0.16188788414001465 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00030505396980721143, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 11080200.0, + "repeat_count": 0.0, + "routers_loss": 0.007632353343069553, + "skip_count": 1.0, + "step": 6870, + "text_loss": 0.25986847281455994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00030476898562180793, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 11083356.0, + "repeat_count": 0.0, + "routers_loss": 0.004322016146034002, + "skip_count": 2.0, + "step": 6872, + "text_loss": 0.49556297063827515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0003044840762492974, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 11086354.0, + "repeat_count": 0.0, + "routers_loss": 0.0031272871419787407, + "skip_count": 2.0, + "step": 6874, + "text_loss": 0.1658666580915451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003041992417988577, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11088850.0, + "repeat_count": 0.0, + "routers_loss": 0.005371398758143187, + "skip_count": 2.0, + "step": 6876, + "text_loss": 0.22437214851379395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003039144823796378, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 11091784.0, + "repeat_count": 0.0, + "routers_loss": 0.0025086402893066406, + "skip_count": 0.0, + "step": 6878, + "text_loss": 0.7293354868888855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0003036297981007581, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11095204.0, + "repeat_count": 0.0, + "routers_loss": 0.015590827912092209, + "skip_count": 1.0, + "step": 6880, + "text_loss": 0.6406328678131104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0003033451890713103, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11098367.0, + "repeat_count": 0.0, + "routers_loss": 0.0013142531970515847, + "skip_count": 0.0, + "step": 6882, + "text_loss": 0.5209086537361145 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 32.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003030606554003571, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 11101047.0, + "repeat_count": 2.0, + "routers_loss": 0.0018484699539840221, + "skip_count": 0.0, + "step": 6884, + "text_loss": 0.743188202381134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00030277619719693217, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11104269.0, + "repeat_count": 0.0, + "routers_loss": 0.0016667681047692895, + "skip_count": 0.0, + "step": 6886, + "text_loss": 0.7918420433998108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0003024918145700406, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 11107248.0, + "repeat_count": 0.0, + "routers_loss": 0.0008098077378235757, + "skip_count": 0.0, + "step": 6888, + "text_loss": 0.3871288299560547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0003022075076286582, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 11111204.0, + "repeat_count": 0.0, + "routers_loss": 0.002324736909940839, + "skip_count": 0.0, + "step": 6890, + "text_loss": 0.3722921907901764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003019232764817321, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 11114363.0, + "repeat_count": 0.0, + "routers_loss": 0.00254769716411829, + "skip_count": 0.0, + "step": 6892, + "text_loss": 0.418519526720047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00030163912123818006, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11117718.0, + "repeat_count": 0.0, + "routers_loss": 0.000547234492842108, + "skip_count": 0.0, + "step": 6894, + "text_loss": 0.6087009310722351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0003013550420068909, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 11120437.0, + "repeat_count": 0.0, + "routers_loss": 0.00015221568173728883, + "skip_count": 0.0, + "step": 6896, + "text_loss": 0.6013991832733154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 32.385089521573235, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.046142578125, + "learning_rate": 0.00030107103889672436, + "loss": 0.0085, + "macro_f1": 0.5492662787437439, + "num_tokens": 11123708.0, + "repeat_count": 0.0, + "routers_loss": 0.024048971012234688, + "skip_count": 2.0, + "step": 6898, + "text_loss": 0.3612423837184906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0003007871120165111, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 11127294.0, + "repeat_count": 0.0, + "routers_loss": 0.0013236473314464092, + "skip_count": 0.0, + "step": 6900, + "text_loss": 0.5277031064033508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00030050326147505226, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11130270.0, + "repeat_count": 0.0, + "routers_loss": 0.0028277861420065165, + "skip_count": 0.0, + "step": 6902, + "text_loss": 0.5726971626281738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0003002194873811197, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11132955.0, + "repeat_count": 0.0, + "routers_loss": 0.0022369837388396263, + "skip_count": 0.0, + "step": 6904, + "text_loss": 0.18510448932647705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00029993578984345673, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 11136387.0, + "repeat_count": 0.0, + "routers_loss": 0.0038351211696863174, + "skip_count": 0.0, + "step": 6906, + "text_loss": 0.28313153982162476 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0002996521689707764, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 11139740.0, + "repeat_count": 0.0, + "routers_loss": 0.00032925375853665173, + "skip_count": 0.0, + "step": 6908, + "text_loss": 0.7315025329589844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0002993686248717629, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 11142587.0, + "repeat_count": 0.0, + "routers_loss": 0.002886304398998618, + "skip_count": 0.0, + "step": 6910, + "text_loss": 0.677378237247467 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00029908515765507084, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 11145415.0, + "repeat_count": 1.0, + "routers_loss": 0.0038471966981887817, + "skip_count": 0.0, + "step": 6912, + "text_loss": 0.5207083225250244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0002988017674293254, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 11148524.0, + "repeat_count": 0.0, + "routers_loss": 0.0023522782139480114, + "skip_count": 0.0, + "step": 6914, + "text_loss": 0.42507871985435486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0002985184543031222, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11152069.0, + "repeat_count": 0.0, + "routers_loss": 0.0012464249739423394, + "skip_count": 0.0, + "step": 6916, + "text_loss": 0.5694169998168945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.0002982352183850274, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 11155675.0, + "repeat_count": 0.0, + "routers_loss": 0.00828156154602766, + "skip_count": 2.0, + "step": 6918, + "text_loss": 0.22304373979568481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.00029795205978357754, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11158555.0, + "repeat_count": 0.0, + "routers_loss": 0.0019234733190387487, + "skip_count": 0.0, + "step": 6920, + "text_loss": 0.5519064664840698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0002976689786072795, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11161407.0, + "repeat_count": 0.0, + "routers_loss": 0.0003542431222740561, + "skip_count": 0.0, + "step": 6922, + "text_loss": 0.6748810410499573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0002973859749646104, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11166007.0, + "repeat_count": 0.0, + "routers_loss": 0.0004024899681098759, + "skip_count": 0.0, + "step": 6924, + "text_loss": 0.6613664627075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 32.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.000297103048964018, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 11169007.0, + "repeat_count": 0.0, + "routers_loss": 0.005519595462828875, + "skip_count": 3.0, + "step": 6926, + "text_loss": 0.3815552592277527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00029682020071392, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11172939.0, + "repeat_count": 0.0, + "routers_loss": 0.0016999440267682076, + "skip_count": 0.0, + "step": 6928, + "text_loss": 0.6727893352508545 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.535368359260346, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0002965374303227044, + "loss": 0.0055, + "macro_f1": 0.5492662787437439, + "num_tokens": 11176232.0, + "repeat_count": 2.0, + "routers_loss": 0.030950307846069336, + "skip_count": 0.0, + "step": 6930, + "text_loss": 0.5577763915061951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00029625473789872923, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11179775.0, + "repeat_count": 0.0, + "routers_loss": 0.00525702815502882, + "skip_count": 1.0, + "step": 6932, + "text_loss": 0.5860039591789246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.000295972123550323, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 11183262.0, + "repeat_count": 1.0, + "routers_loss": 0.0048187971115112305, + "skip_count": 2.0, + "step": 6934, + "text_loss": 0.7328732013702393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.016357421875, + "learning_rate": 0.00029568958738578364, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11186591.0, + "repeat_count": 0.0, + "routers_loss": 0.0015159632312133908, + "skip_count": 0.0, + "step": 6936, + "text_loss": 0.40563541650772095 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 32.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.017333984375, + "learning_rate": 0.0002954071295133801, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 11190056.0, + "repeat_count": 1.0, + "routers_loss": 0.011282073333859444, + "skip_count": 1.0, + "step": 6938, + "text_loss": 0.15986496210098267 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0002951247500413504, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 11193504.0, + "repeat_count": 3.0, + "routers_loss": 0.010220487602055073, + "skip_count": 5.0, + "step": 6940, + "text_loss": 0.2604432702064514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0002948424490779029, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 11196725.0, + "repeat_count": 0.0, + "routers_loss": 0.002620660001412034, + "skip_count": 1.0, + "step": 6942, + "text_loss": 0.48028868436813354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00029456022673121597, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11199303.0, + "repeat_count": 0.0, + "routers_loss": 0.00042651945841498673, + "skip_count": 0.0, + "step": 6944, + "text_loss": 0.5135554671287537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0002942780831094377, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11202319.0, + "repeat_count": 0.0, + "routers_loss": 0.005366047378629446, + "skip_count": 2.0, + "step": 6946, + "text_loss": 0.2809196710586548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0002939960183206861, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 11205622.0, + "repeat_count": 0.0, + "routers_loss": 0.0033479216508567333, + "skip_count": 0.0, + "step": 6948, + "text_loss": 0.2013140618801117 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00029371403247304887, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 11208637.0, + "repeat_count": 1.0, + "routers_loss": 0.0013508419506251812, + "skip_count": 0.0, + "step": 6950, + "text_loss": 0.4427332580089569 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0002934321256745833, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11211618.0, + "repeat_count": 0.0, + "routers_loss": 0.0020944071002304554, + "skip_count": 0.0, + "step": 6952, + "text_loss": 0.5406652688980103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00029315029803331704, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11214432.0, + "repeat_count": 0.0, + "routers_loss": 0.0012655078899115324, + "skip_count": 0.0, + "step": 6954, + "text_loss": 0.7720552086830139 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.00029286854965724686, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 11218127.0, + "repeat_count": 0.0, + "routers_loss": 0.009041395038366318, + "skip_count": 0.0, + "step": 6956, + "text_loss": 0.258109986782074 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 32.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0002925868806543391, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 11221440.0, + "repeat_count": 1.0, + "routers_loss": 0.0034558263141661882, + "skip_count": 1.0, + "step": 6958, + "text_loss": 0.5378029942512512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00029230529113253, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 11225391.0, + "repeat_count": 0.0, + "routers_loss": 0.005263930186629295, + "skip_count": 2.0, + "step": 6960, + "text_loss": 0.3616539537906647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0002920237811997251, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 11228648.0, + "repeat_count": 0.0, + "routers_loss": 0.003730480559170246, + "skip_count": 1.0, + "step": 6962, + "text_loss": 0.46682238578796387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.00029174235096379963, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11231828.0, + "repeat_count": 0.0, + "routers_loss": 0.004831735976040363, + "skip_count": 1.0, + "step": 6964, + "text_loss": 0.5718355178833008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 32.70443205165835, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.046875, + "learning_rate": 0.0002914610005325981, + "loss": 0.0102, + "macro_f1": 0.5492662787437439, + "num_tokens": 11234984.0, + "repeat_count": 0.0, + "routers_loss": 0.03880132734775543, + "skip_count": 2.0, + "step": 6966, + "text_loss": 0.3139013946056366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0002911797300139345, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 11239153.0, + "repeat_count": 0.0, + "routers_loss": 0.0006673726020380855, + "skip_count": 0.0, + "step": 6968, + "text_loss": 0.6040399074554443 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00029089853951559235, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11242178.0, + "repeat_count": 1.0, + "routers_loss": 0.0028971200808882713, + "skip_count": 0.0, + "step": 6970, + "text_loss": 0.304967999458313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00029061742914532427, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11245865.0, + "repeat_count": 0.0, + "routers_loss": 0.0010410466929897666, + "skip_count": 0.0, + "step": 6972, + "text_loss": 0.47892290353775024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0002903363990108524, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11248806.0, + "repeat_count": 0.0, + "routers_loss": 0.002133697969838977, + "skip_count": 0.0, + "step": 6974, + "text_loss": 0.2561415433883667 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 32.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0002900554492198677, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 11251807.0, + "repeat_count": 2.0, + "routers_loss": 0.002402493730187416, + "skip_count": 0.0, + "step": 6976, + "text_loss": 0.652428388595581 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0002897745798800311, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 11254615.0, + "repeat_count": 1.0, + "routers_loss": 0.006423915736377239, + "skip_count": 0.0, + "step": 6978, + "text_loss": 0.22414511442184448 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.000289493791098972, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 11257721.0, + "repeat_count": 0.0, + "routers_loss": 0.002536606043577194, + "skip_count": 0.0, + "step": 6980, + "text_loss": 0.1328018754720688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00028921308298428933, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11260840.0, + "repeat_count": 0.0, + "routers_loss": 0.000745086173992604, + "skip_count": 0.0, + "step": 6982, + "text_loss": 0.61724853515625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0002889324556435509, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 11264279.0, + "repeat_count": 0.0, + "routers_loss": 0.005258981604129076, + "skip_count": 0.0, + "step": 6984, + "text_loss": 0.1664455235004425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00028865190918429356, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 11268096.0, + "repeat_count": 0.0, + "routers_loss": 0.0008756023598834872, + "skip_count": 0.0, + "step": 6986, + "text_loss": 0.45111921429634094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.00028837144371402336, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11270611.0, + "repeat_count": 0.0, + "routers_loss": 0.0008175788098014891, + "skip_count": 0.0, + "step": 6988, + "text_loss": 0.5332239270210266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00028809105934021517, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11273826.0, + "repeat_count": 0.0, + "routers_loss": 0.003494064789265394, + "skip_count": 0.0, + "step": 6990, + "text_loss": 0.20264241099357605 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.82653360727913, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0002878107561703127, + "loss": 0.0056, + "macro_f1": 0.8817967176437378, + "num_tokens": 11276917.0, + "repeat_count": 2.0, + "routers_loss": 0.025257345288991928, + "skip_count": 3.0, + "step": 6992, + "text_loss": 0.18000070750713348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.835926034634575, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0002875305343117289, + "loss": 0.0044, + "macro_f1": 0.6603773832321167, + "num_tokens": 11279637.0, + "repeat_count": 1.0, + "routers_loss": 0.019206687808036804, + "skip_count": 1.0, + "step": 6994, + "text_loss": 0.5872798562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00028725039387184504, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11282717.0, + "repeat_count": 0.0, + "routers_loss": 0.009358765557408333, + "skip_count": 1.0, + "step": 6996, + "text_loss": 0.3412095904350281 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 32.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00028697033495801163, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 11285433.0, + "repeat_count": 1.0, + "routers_loss": 0.0038775671273469925, + "skip_count": 1.0, + "step": 6998, + "text_loss": 0.4316727817058563 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0002866903576775475, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11288414.0, + "repeat_count": 1.0, + "routers_loss": 0.004292591474950314, + "skip_count": 0.0, + "step": 7000, + "text_loss": 0.45106515288352966 + } + ], + "logging_steps": 2, + "max_steps": 10650, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9166181768738112e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-7000/training_args.bin b/checkpoint-7000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a3d3ae372faf14539639f54454aa52b6ee730c4a --- /dev/null +++ b/checkpoint-7000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8 +size 5880 diff --git a/checkpoint-8000/chat_template.jinja b/checkpoint-8000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/checkpoint-8000/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-8000/config.json b/checkpoint-8000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3552bd1c531626bd125241ad5dfcd7fb677462cd --- /dev/null +++ b/checkpoint-8000/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 3072, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.55.2", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-8000/generation_config.json b/checkpoint-8000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b513e54e3195b917260c9a8a04c9f3683f19de35 --- /dev/null +++ b/checkpoint-8000/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.55.2" +} diff --git a/checkpoint-8000/model-00001-of-00002.safetensors b/checkpoint-8000/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..08a01e1ba553cdcb2222f034a209861d7b54e284 --- /dev/null +++ b/checkpoint-8000/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13cbd6d16e927a0c5bad54102514e6e18b4a47b3a6eb911e39d678d328d19f55 +size 4965799096 diff --git a/checkpoint-8000/model-00002-of-00002.safetensors b/checkpoint-8000/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9540acdafc29cec8964cfb0948bd82eb4dc07732 --- /dev/null +++ b/checkpoint-8000/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b514f3ac35b6ff8370c10f2c52709d5aae295c3fa5dd9400fe0732477bbf00f8 +size 1481790520 diff --git a/checkpoint-8000/model.safetensors.index.json b/checkpoint-8000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..21bb567761d75ade0c0eef6495c450697dd3ff18 --- /dev/null +++ b/checkpoint-8000/model.safetensors.index.json @@ -0,0 +1,374 @@ +{ + "metadata": { + "total_parameters": 3223774292, + "total_size": 6447548584 + }, + "weight_map": { + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.routers.0.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.0.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.0.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.0.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.1.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.1.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.1.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.1.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.10.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.10.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.10.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.10.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.11.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.11.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.11.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.11.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.12.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.12.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.12.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.12.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.13.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.13.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.13.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.13.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.14.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.14.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.14.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.14.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.15.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.15.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.15.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.15.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.16.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.16.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.16.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.16.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.17.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.17.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.17.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.17.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.18.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.18.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.18.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.18.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.19.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.19.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.19.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.19.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.2.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.2.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.2.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.2.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.20.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.20.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.20.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.20.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.21.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.21.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.21.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.21.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.22.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.22.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.22.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.22.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.23.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.23.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.23.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.23.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.24.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.24.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.24.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.24.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.25.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.25.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.25.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.25.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.26.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.26.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.26.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.26.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.27.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.27.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.27.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.27.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.3.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.3.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.3.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.3.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.4.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.4.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.4.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.4.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.5.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.5.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.5.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.5.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.6.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.6.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.6.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.6.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.7.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.7.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.7.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.7.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.8.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.8.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.8.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.8.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.9.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.9.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.9.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.9.linear2.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-8000/optimizer.pt b/checkpoint-8000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d96ad13047fa4bd123c403ec1d53abc4dcec4ef --- /dev/null +++ b/checkpoint-8000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2f5c6b51d746c116c247f75bea11f8702fbb09521b88bfd9a655c0d881ea0b5 +size 44191162 diff --git a/checkpoint-8000/rng_state.pth b/checkpoint-8000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a2ad2ae095913360ed8c4ab8db8e13e499268627 --- /dev/null +++ b/checkpoint-8000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fecfbdc8e0af78a7c19f82a42c820a4fbcb783e0769234b434484ff0e2ab62ba +size 14244 diff --git a/checkpoint-8000/scheduler.pt b/checkpoint-8000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6ca5594101475430a6cdfa318ef890dac43ec18 --- /dev/null +++ b/checkpoint-8000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47e92d2c614e00ec0ab08a19213aa8e21fb67f3d153f7162e65ec3c6d33fcead +size 1064 diff --git a/checkpoint-8000/special_tokens_map.json b/checkpoint-8000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..165b36bc2293dda9a2fb3c0daf6577d9eba9df7a --- /dev/null +++ b/checkpoint-8000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|finetune_right_pad_id|>" +} diff --git a/checkpoint-8000/tokenizer.json b/checkpoint-8000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-8000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-8000/tokenizer_config.json b/checkpoint-8000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c68051fe3c4d23234a59316bc52d21f6e3a4182c --- /dev/null +++ b/checkpoint-8000/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-8000/trainer_state.json b/checkpoint-8000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4d574b95cfe41acff564897b4fb8fcafbe5d82ee --- /dev/null +++ b/checkpoint-8000/trainer_state.json @@ -0,0 +1,76034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 37.55884942764896, + "eval_steps": 500, + "global_step": 8000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.009392427355444672, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.25, + "learning_rate": 2e-06, + "loss": 0.4974, + "macro_f1": 0.23255813121795654, + "num_tokens": 3175.0, + "repeat_count": 0.0, + "routers_loss": 0.4339469373226166, + "skip_count": 0.0, + "step": 2, + "text_loss": 0.3330848515033722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 23.0, + "epoch": 0.018784854710889344, + "f1_execute": 0.7272726893424988, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.8359375, + "learning_rate": 6e-06, + "loss": 0.4988, + "macro_f1": 0.24242423474788666, + "num_tokens": 5816.0, + "repeat_count": 0.0, + "routers_loss": 0.4511934816837311, + "skip_count": 1.0, + "step": 4, + "text_loss": 0.4571273922920227 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.02817728206633402, + "f1_execute": 0.6666666865348816, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.234375, + "learning_rate": 1e-05, + "loss": 0.5113, + "macro_f1": 0.222222238779068, + "num_tokens": 9739.0, + "repeat_count": 0.0, + "routers_loss": 0.49306994676589966, + "skip_count": 0.0, + "step": 6, + "text_loss": 0.41060560941696167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.03756970942177869, + "f1_execute": 0.5641025900840759, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7265625, + "learning_rate": 1.4e-05, + "loss": 0.4766, + "macro_f1": 0.18803420662879944, + "num_tokens": 12869.0, + "repeat_count": 1.0, + "routers_loss": 0.48872503638267517, + "skip_count": 2.0, + "step": 8, + "text_loss": 0.36678561568260193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.046962136777223364, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.78125, + "learning_rate": 1.8e-05, + "loss": 0.4806, + "macro_f1": 0.23255813121795654, + "num_tokens": 15845.0, + "repeat_count": 0.0, + "routers_loss": 0.45077216625213623, + "skip_count": 0.0, + "step": 10, + "text_loss": 0.5597779154777527 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 0.05635456413266804, + "f1_execute": 0.7179487347602844, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.20000000298023224, + "grad_norm": 1.5390625, + "learning_rate": 2.2e-05, + "loss": 0.4557, + "macro_f1": 0.40122103691101074, + "num_tokens": 19353.0, + "repeat_count": 2.0, + "routers_loss": 0.4130440056324005, + "skip_count": 3.0, + "step": 12, + "text_loss": 0.2056603729724884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.06574699148811271, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.4375, + "learning_rate": 2.6e-05, + "loss": 0.5129, + "macro_f1": 0.23255813121795654, + "num_tokens": 22675.0, + "repeat_count": 0.0, + "routers_loss": 0.4582902193069458, + "skip_count": 0.0, + "step": 14, + "text_loss": 0.32989829778671265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 0.07513941884355738, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.0, + "f1_skip": 0.2222222238779068, + "grad_norm": 1.7421875, + "learning_rate": 3e-05, + "loss": 0.4729, + "macro_f1": 0.3017163574695587, + "num_tokens": 26022.0, + "repeat_count": 0.0, + "routers_loss": 0.42910993099212646, + "skip_count": 1.0, + "step": 16, + "text_loss": 0.1353905349969864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.08453184619900206, + "f1_execute": 0.7555555105209351, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.4765625, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.4274, + "macro_f1": 0.2518518567085266, + "num_tokens": 29251.0, + "repeat_count": 0.0, + "routers_loss": 0.3990713059902191, + "skip_count": 0.0, + "step": 18, + "text_loss": 0.3806765377521515 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.09392427355444673, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.0, + "grad_norm": 1.3125, + "learning_rate": 3.8e-05, + "loss": 0.4261, + "macro_f1": 0.3228803873062134, + "num_tokens": 32545.0, + "repeat_count": 1.0, + "routers_loss": 0.40146592259407043, + "skip_count": 0.0, + "step": 20, + "text_loss": 0.25648367404937744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.1033167009098914, + "f1_execute": 0.7272727489471436, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.625, + "learning_rate": 4.2000000000000004e-05, + "loss": 0.404, + "macro_f1": 0.24242424964904785, + "num_tokens": 36560.0, + "repeat_count": 0.0, + "routers_loss": 0.372715026140213, + "skip_count": 0.0, + "step": 22, + "text_loss": 0.2799522578716278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.11270912826533608, + "f1_execute": 0.7555555105209351, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.6328125, + "learning_rate": 4.6e-05, + "loss": 0.4218, + "macro_f1": 0.2518518567085266, + "num_tokens": 39597.0, + "repeat_count": 0.0, + "routers_loss": 0.4504941403865814, + "skip_count": 0.0, + "step": 24, + "text_loss": 0.6635695695877075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.12210155562078075, + "f1_execute": 0.8085106015205383, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7109375, + "learning_rate": 5e-05, + "loss": 0.3886, + "macro_f1": 0.26950353384017944, + "num_tokens": 43080.0, + "repeat_count": 0.0, + "routers_loss": 0.3498791456222534, + "skip_count": 0.0, + "step": 26, + "text_loss": 0.7035041451454163 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.13149398297622542, + "f1_execute": 0.8085106015205383, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.34375, + "learning_rate": 5.4e-05, + "loss": 0.3724, + "macro_f1": 0.26950353384017944, + "num_tokens": 46406.0, + "repeat_count": 0.0, + "routers_loss": 0.31265875697135925, + "skip_count": 0.0, + "step": 28, + "text_loss": 0.6388277411460876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.1408864103316701, + "f1_execute": 0.8571428060531616, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.2578125, + "learning_rate": 5.800000000000001e-05, + "loss": 0.341, + "macro_f1": 0.2857142686843872, + "num_tokens": 49966.0, + "repeat_count": 0.0, + "routers_loss": 0.3200918138027191, + "skip_count": 2.0, + "step": 30, + "text_loss": 0.17372547090053558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.15027883768711475, + "f1_execute": 0.8571428060531616, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.4140625, + "learning_rate": 6.2e-05, + "loss": 0.3207, + "macro_f1": 0.2857142686843872, + "num_tokens": 53378.0, + "repeat_count": 1.0, + "routers_loss": 0.32304447889328003, + "skip_count": 1.0, + "step": 32, + "text_loss": 0.18196581304073334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.15967126504255943, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.46875, + "learning_rate": 6.6e-05, + "loss": 0.3304, + "macro_f1": 0.3006536364555359, + "num_tokens": 56933.0, + "repeat_count": 0.0, + "routers_loss": 0.24814388155937195, + "skip_count": 0.0, + "step": 34, + "text_loss": 0.28823015093803406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.16906369239800412, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.1171875, + "learning_rate": 7.000000000000001e-05, + "loss": 0.2778, + "macro_f1": 0.3006536066532135, + "num_tokens": 60744.0, + "repeat_count": 1.0, + "routers_loss": 0.22411039471626282, + "skip_count": 0.0, + "step": 36, + "text_loss": 0.5260357856750488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.17845611975344877, + "f1_execute": 0.8571428656578064, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.484375, + "learning_rate": 7.4e-05, + "loss": 0.2738, + "macro_f1": 0.2857142984867096, + "num_tokens": 64900.0, + "repeat_count": 0.0, + "routers_loss": 0.44355395436286926, + "skip_count": 0.0, + "step": 38, + "text_loss": 0.5382097363471985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.18784854710889345, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.3828125, + "learning_rate": 7.8e-05, + "loss": 0.2137, + "macro_f1": 0.3076923191547394, + "num_tokens": 68000.0, + "repeat_count": 0.0, + "routers_loss": 0.202330082654953, + "skip_count": 0.0, + "step": 40, + "text_loss": 0.5946118831634521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.19724097446433814, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.78125, + "learning_rate": 8.2e-05, + "loss": 0.21, + "macro_f1": 0.3144654333591461, + "num_tokens": 70529.0, + "repeat_count": 0.0, + "routers_loss": 0.18023855984210968, + "skip_count": 0.0, + "step": 42, + "text_loss": 0.5550904273986816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2066334018197828, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.609375, + "learning_rate": 8.599999999999999e-05, + "loss": 0.1918, + "macro_f1": 0.32098764181137085, + "num_tokens": 73427.0, + "repeat_count": 2.0, + "routers_loss": 0.2101590931415558, + "skip_count": 0.0, + "step": 44, + "text_loss": 0.4636923372745514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.21602582917522747, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.53125, + "learning_rate": 8.999999999999999e-05, + "loss": 0.1881, + "macro_f1": 0.3333333432674408, + "num_tokens": 76472.0, + "repeat_count": 0.0, + "routers_loss": 0.11800424009561539, + "skip_count": 0.0, + "step": 46, + "text_loss": 0.4187001883983612 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.22541825653067216, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.953125, + "learning_rate": 9.400000000000001e-05, + "loss": 0.1446, + "macro_f1": 0.3272727429866791, + "num_tokens": 79124.0, + "repeat_count": 1.0, + "routers_loss": 0.11632519960403442, + "skip_count": 0.0, + "step": 48, + "text_loss": 0.2253919243812561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.2348106838861168, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.58984375, + "learning_rate": 9.800000000000001e-05, + "loss": 0.1543, + "macro_f1": 0.32098767161369324, + "num_tokens": 81980.0, + "repeat_count": 1.0, + "routers_loss": 0.09669367223978043, + "skip_count": 0.0, + "step": 50, + "text_loss": 0.6053179502487183 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.2442031112415615, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.8515625, + "learning_rate": 0.000102, + "loss": 0.1393, + "macro_f1": 0.32098764181137085, + "num_tokens": 85236.0, + "repeat_count": 0.0, + "routers_loss": 0.12471720576286316, + "skip_count": 0.0, + "step": 52, + "text_loss": 0.6027331948280334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2535955385970062, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.421875, + "learning_rate": 0.000106, + "loss": 0.1473, + "macro_f1": 0.32098764181137085, + "num_tokens": 88238.0, + "repeat_count": 0.0, + "routers_loss": 0.1376056969165802, + "skip_count": 2.0, + "step": 54, + "text_loss": 0.2861751616001129 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.26298796595245083, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.35546875, + "learning_rate": 0.00011, + "loss": 0.1082, + "macro_f1": 0.3333333432674408, + "num_tokens": 91056.0, + "repeat_count": 0.0, + "routers_loss": 0.07449393719434738, + "skip_count": 0.0, + "step": 56, + "text_loss": 0.48106974363327026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.2723803933078955, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000114, + "loss": 0.1123, + "macro_f1": 0.32098764181137085, + "num_tokens": 94987.0, + "repeat_count": 0.0, + "routers_loss": 0.07064720243215561, + "skip_count": 0.0, + "step": 58, + "text_loss": 0.3554874658584595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2817728206633402, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.5390625, + "learning_rate": 0.000118, + "loss": 0.1234, + "macro_f1": 0.32098764181137085, + "num_tokens": 97909.0, + "repeat_count": 0.0, + "routers_loss": 0.16835889220237732, + "skip_count": 2.0, + "step": 60, + "text_loss": 0.5475804805755615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.29116524801878485, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000122, + "loss": 0.1224, + "macro_f1": 0.3333333432674408, + "num_tokens": 101043.0, + "repeat_count": 0.0, + "routers_loss": 0.06127442046999931, + "skip_count": 0.0, + "step": 62, + "text_loss": 0.5966938734054565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3005576753742295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000126, + "loss": 0.0931, + "macro_f1": 0.3333333432674408, + "num_tokens": 104103.0, + "repeat_count": 0.0, + "routers_loss": 0.047825805842876434, + "skip_count": 0.0, + "step": 64, + "text_loss": 0.5480486750602722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3099501027296742, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.00013000000000000002, + "loss": 0.1088, + "macro_f1": 0.3006536364555359, + "num_tokens": 107009.0, + "repeat_count": 1.0, + "routers_loss": 0.275174081325531, + "skip_count": 4.0, + "step": 66, + "text_loss": 0.41714492440223694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.31934253008511887, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.000134, + "loss": 0.1123, + "macro_f1": 0.3333333432674408, + "num_tokens": 110486.0, + "repeat_count": 0.0, + "routers_loss": 0.029025178402662277, + "skip_count": 0.0, + "step": 68, + "text_loss": 0.6775627732276917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3287349574405635, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.314453125, + "learning_rate": 0.00013800000000000002, + "loss": 0.1049, + "macro_f1": 0.3272727429866791, + "num_tokens": 113878.0, + "repeat_count": 0.0, + "routers_loss": 0.10141710191965103, + "skip_count": 1.0, + "step": 70, + "text_loss": 0.6678873896598816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.33812738479600823, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.248046875, + "learning_rate": 0.00014199999999999998, + "loss": 0.1119, + "macro_f1": 0.3272727429866791, + "num_tokens": 116989.0, + "repeat_count": 0.0, + "routers_loss": 0.08002066612243652, + "skip_count": 1.0, + "step": 72, + "text_loss": 0.405692994594574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3475198121514529, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1787109375, + "learning_rate": 0.000146, + "loss": 0.0944, + "macro_f1": 0.3144654333591461, + "num_tokens": 119883.0, + "repeat_count": 0.0, + "routers_loss": 0.1867009848356247, + "skip_count": 3.0, + "step": 74, + "text_loss": 0.44616150856018066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.35691223950689754, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.333984375, + "learning_rate": 0.00015, + "loss": 0.1003, + "macro_f1": 0.32098764181137085, + "num_tokens": 123325.0, + "repeat_count": 0.0, + "routers_loss": 0.07042168825864792, + "skip_count": 2.0, + "step": 76, + "text_loss": 0.11340200901031494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.36630466686234225, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26171875, + "learning_rate": 0.000154, + "loss": 0.1066, + "macro_f1": 0.32098764181137085, + "num_tokens": 126131.0, + "repeat_count": 0.0, + "routers_loss": 0.11535373330116272, + "skip_count": 2.0, + "step": 78, + "text_loss": 0.3269135355949402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3756970942177869, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.000158, + "loss": 0.0891, + "macro_f1": 0.3272727429866791, + "num_tokens": 130349.0, + "repeat_count": 0.0, + "routers_loss": 0.09497501701116562, + "skip_count": 1.0, + "step": 80, + "text_loss": 0.15273472666740417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.38508952157323156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000162, + "loss": 0.0929, + "macro_f1": 0.3333333432674408, + "num_tokens": 133607.0, + "repeat_count": 0.0, + "routers_loss": 0.030639523640275, + "skip_count": 0.0, + "step": 82, + "text_loss": 0.282884806394577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3944819489286763, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.00016600000000000002, + "loss": 0.1254, + "macro_f1": 0.3272727429866791, + "num_tokens": 136694.0, + "repeat_count": 0.0, + "routers_loss": 0.07906441390514374, + "skip_count": 1.0, + "step": 84, + "text_loss": 0.459094375371933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.40387437628412093, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.00017, + "loss": 0.1071, + "macro_f1": 0.3144654333591461, + "num_tokens": 139966.0, + "repeat_count": 1.0, + "routers_loss": 0.1124570444226265, + "skip_count": 2.0, + "step": 86, + "text_loss": 0.29985448718070984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4132668036395656, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.000174, + "loss": 0.1031, + "macro_f1": 0.32098764181137085, + "num_tokens": 142788.0, + "repeat_count": 2.0, + "routers_loss": 0.1966402679681778, + "skip_count": 0.0, + "step": 88, + "text_loss": 0.6435291767120361 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4226592309950103, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.349609375, + "learning_rate": 0.000178, + "loss": 0.0963, + "macro_f1": 0.3333333432674408, + "num_tokens": 146192.0, + "repeat_count": 0.0, + "routers_loss": 0.0325632207095623, + "skip_count": 0.0, + "step": 90, + "text_loss": 0.35170626640319824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.43205165835045495, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2265625, + "learning_rate": 0.000182, + "loss": 0.1073, + "macro_f1": 0.32098764181137085, + "num_tokens": 149792.0, + "repeat_count": 1.0, + "routers_loss": 0.15115146338939667, + "skip_count": 1.0, + "step": 92, + "text_loss": 0.83159339427948 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4414440857058996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.000186, + "loss": 0.1073, + "macro_f1": 0.3333333432674408, + "num_tokens": 152766.0, + "repeat_count": 0.0, + "routers_loss": 0.043313540518283844, + "skip_count": 0.0, + "step": 94, + "text_loss": 0.49707934260368347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4508365130613443, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019, + "loss": 0.0947, + "macro_f1": 0.3333333432674408, + "num_tokens": 156112.0, + "repeat_count": 0.0, + "routers_loss": 0.032021280378103256, + "skip_count": 0.0, + "step": 96, + "text_loss": 0.27608928084373474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.46022894041678897, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2099609375, + "learning_rate": 0.000194, + "loss": 0.0846, + "macro_f1": 0.3076923191547394, + "num_tokens": 159454.0, + "repeat_count": 2.0, + "routers_loss": 0.24473154544830322, + "skip_count": 2.0, + "step": 98, + "text_loss": 0.6026689410209656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4696213677722336, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.00019800000000000002, + "loss": 0.1028, + "macro_f1": 0.32098764181137085, + "num_tokens": 163661.0, + "repeat_count": 0.0, + "routers_loss": 0.11468276381492615, + "skip_count": 2.0, + "step": 100, + "text_loss": 0.46733155846595764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.47901379512767833, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000202, + "loss": 0.1089, + "macro_f1": 0.3333333432674408, + "num_tokens": 167134.0, + "repeat_count": 0.0, + "routers_loss": 0.021144939586520195, + "skip_count": 0.0, + "step": 102, + "text_loss": 0.6362994909286499 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.488406222483123, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000206, + "loss": 0.0621, + "macro_f1": 0.3272727429866791, + "num_tokens": 170433.0, + "repeat_count": 0.0, + "routers_loss": 0.06594710797071457, + "skip_count": 1.0, + "step": 104, + "text_loss": 0.4515477120876312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.49779864983856764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.00021, + "loss": 0.0929, + "macro_f1": 0.3333333432674408, + "num_tokens": 173387.0, + "repeat_count": 0.0, + "routers_loss": 0.032923027873039246, + "skip_count": 0.0, + "step": 106, + "text_loss": 0.6638453006744385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5071910771940124, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.240234375, + "learning_rate": 0.000214, + "loss": 0.0883, + "macro_f1": 0.3272727429866791, + "num_tokens": 176170.0, + "repeat_count": 1.0, + "routers_loss": 0.08034781366586685, + "skip_count": 0.0, + "step": 108, + "text_loss": 1.186936855316162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.516583504549457, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000218, + "loss": 0.0794, + "macro_f1": 0.3272727429866791, + "num_tokens": 179877.0, + "repeat_count": 0.0, + "routers_loss": 0.07814185321331024, + "skip_count": 1.0, + "step": 110, + "text_loss": 0.5488709211349487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5259759319049017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000222, + "loss": 0.0946, + "macro_f1": 0.3333333432674408, + "num_tokens": 182726.0, + "repeat_count": 0.0, + "routers_loss": 0.01884695515036583, + "skip_count": 0.0, + "step": 112, + "text_loss": 0.5195863842964172 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5353683592603463, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19921875, + "learning_rate": 0.00022600000000000002, + "loss": 0.0974, + "macro_f1": 0.32098764181137085, + "num_tokens": 185624.0, + "repeat_count": 0.0, + "routers_loss": 0.09657823294401169, + "skip_count": 2.0, + "step": 114, + "text_loss": 0.43858134746551514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3046875, + "learning_rate": 0.00023, + "loss": 0.0753, + "macro_f1": 0.3333333432674408, + "num_tokens": 188155.0, + "repeat_count": 0.0, + "routers_loss": 0.01463601179420948, + "skip_count": 0.0, + "step": 116, + "text_loss": 0.392981618642807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5541532139712357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.439453125, + "learning_rate": 0.00023400000000000002, + "loss": 0.0843, + "macro_f1": 0.3333333432674408, + "num_tokens": 190970.0, + "repeat_count": 0.0, + "routers_loss": 0.03859659656882286, + "skip_count": 0.0, + "step": 118, + "text_loss": 0.309179425239563 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5635456413266804, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2255859375, + "learning_rate": 0.00023799999999999998, + "loss": 0.053, + "macro_f1": 0.3333333432674408, + "num_tokens": 193988.0, + "repeat_count": 0.0, + "routers_loss": 0.019092386588454247, + "skip_count": 0.0, + "step": 120, + "text_loss": 0.48543134331703186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.572938068682125, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.35546875, + "learning_rate": 0.000242, + "loss": 0.1203, + "macro_f1": 0.3272727429866791, + "num_tokens": 196475.0, + "repeat_count": 0.0, + "routers_loss": 0.0619138665497303, + "skip_count": 1.0, + "step": 122, + "text_loss": 0.4615364074707031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5823304960375697, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1875, + "learning_rate": 0.000246, + "loss": 0.1002, + "macro_f1": 0.3272727429866791, + "num_tokens": 200045.0, + "repeat_count": 1.0, + "routers_loss": 0.09752107411623001, + "skip_count": 0.0, + "step": 124, + "text_loss": 0.15802054107189178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5917229233930144, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.00025, + "loss": 0.0773, + "macro_f1": 0.3333333432674408, + "num_tokens": 203214.0, + "repeat_count": 0.0, + "routers_loss": 0.02896115928888321, + "skip_count": 0.0, + "step": 126, + "text_loss": 0.4543360471725464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.601115350748459, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.4296875, + "learning_rate": 0.000254, + "loss": 0.0973, + "macro_f1": 0.3333333432674408, + "num_tokens": 206168.0, + "repeat_count": 0.0, + "routers_loss": 0.011423567309975624, + "skip_count": 0.0, + "step": 128, + "text_loss": 0.4730179011821747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6105077781039038, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.365234375, + "learning_rate": 0.00025800000000000004, + "loss": 0.099, + "macro_f1": 0.3333333432674408, + "num_tokens": 209907.0, + "repeat_count": 0.0, + "routers_loss": 0.01957600563764572, + "skip_count": 0.0, + "step": 130, + "text_loss": 0.45122358202934265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6199002054593484, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.000262, + "loss": 0.0868, + "macro_f1": 0.3272727429866791, + "num_tokens": 213521.0, + "repeat_count": 0.0, + "routers_loss": 0.04882373288273811, + "skip_count": 1.0, + "step": 132, + "text_loss": 0.4341491758823395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6292926328147931, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.000266, + "loss": 0.0834, + "macro_f1": 0.3333333432674408, + "num_tokens": 216484.0, + "repeat_count": 0.0, + "routers_loss": 0.016083380207419395, + "skip_count": 0.0, + "step": 134, + "text_loss": 0.46990111470222473 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6386850601702377, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.00027, + "loss": 0.0863, + "macro_f1": 0.3333333432674408, + "num_tokens": 219398.0, + "repeat_count": 0.0, + "routers_loss": 0.01733536459505558, + "skip_count": 0.0, + "step": 136, + "text_loss": 0.4455361068248749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6480774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.00027400000000000005, + "loss": 0.0997, + "macro_f1": 0.3333333432674408, + "num_tokens": 222430.0, + "repeat_count": 0.0, + "routers_loss": 0.01332803163677454, + "skip_count": 0.0, + "step": 138, + "text_loss": 0.47699397802352905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.657469914881127, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.333984375, + "learning_rate": 0.00027800000000000004, + "loss": 0.0922, + "macro_f1": 0.3144654333591461, + "num_tokens": 225458.0, + "repeat_count": 1.0, + "routers_loss": 0.14924728870391846, + "skip_count": 2.0, + "step": 140, + "text_loss": 0.5858222842216492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6668623422365718, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.00028199999999999997, + "loss": 0.0798, + "macro_f1": 0.3144654333591461, + "num_tokens": 229365.0, + "repeat_count": 1.0, + "routers_loss": 0.1860177218914032, + "skip_count": 2.0, + "step": 142, + "text_loss": 0.5003137588500977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6762547695920165, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.00028599999999999996, + "loss": 0.054, + "macro_f1": 0.32098764181137085, + "num_tokens": 231787.0, + "repeat_count": 1.0, + "routers_loss": 0.16498211026191711, + "skip_count": 1.0, + "step": 144, + "text_loss": 0.5026470422744751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6856471969474611, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.00029, + "loss": 0.0936, + "macro_f1": 0.32098764181137085, + "num_tokens": 235014.0, + "repeat_count": 1.0, + "routers_loss": 0.11801310628652573, + "skip_count": 1.0, + "step": 146, + "text_loss": 0.611888587474823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6950396243029058, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000294, + "loss": 0.0878, + "macro_f1": 0.3333333432674408, + "num_tokens": 238210.0, + "repeat_count": 0.0, + "routers_loss": 0.02422776259481907, + "skip_count": 0.0, + "step": 148, + "text_loss": 0.2876914143562317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7044320516583504, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.000298, + "loss": 0.0858, + "macro_f1": 0.32098764181137085, + "num_tokens": 241582.0, + "repeat_count": 0.0, + "routers_loss": 0.07282499223947525, + "skip_count": 2.0, + "step": 150, + "text_loss": 0.3919292390346527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7138244790137951, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.37890625, + "learning_rate": 0.000302, + "loss": 0.0797, + "macro_f1": 0.32098764181137085, + "num_tokens": 244621.0, + "repeat_count": 1.0, + "routers_loss": 0.20659038424491882, + "skip_count": 1.0, + "step": 152, + "text_loss": 0.4294498860836029 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7232169063692399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1787109375, + "learning_rate": 0.000306, + "loss": 0.072, + "macro_f1": 0.3333333432674408, + "num_tokens": 247833.0, + "repeat_count": 0.0, + "routers_loss": 0.02428400330245495, + "skip_count": 0.0, + "step": 154, + "text_loss": 0.5930765867233276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7326093337246845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.00031, + "loss": 0.0772, + "macro_f1": 0.3333333432674408, + "num_tokens": 251349.0, + "repeat_count": 0.0, + "routers_loss": 0.0167869683355093, + "skip_count": 0.0, + "step": 156, + "text_loss": 0.41063904762268066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7420017610801292, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.000314, + "loss": 0.0821, + "macro_f1": 0.3333333432674408, + "num_tokens": 254886.0, + "repeat_count": 0.0, + "routers_loss": 0.02531604655086994, + "skip_count": 0.0, + "step": 158, + "text_loss": 0.6739020347595215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7513941884355738, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.201171875, + "learning_rate": 0.00031800000000000003, + "loss": 0.09, + "macro_f1": 0.3333333432674408, + "num_tokens": 258260.0, + "repeat_count": 0.0, + "routers_loss": 0.017772775143384933, + "skip_count": 0.0, + "step": 160, + "text_loss": 0.46873849630355835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7607866157910185, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.224609375, + "learning_rate": 0.000322, + "loss": 0.0893, + "macro_f1": 0.3272727429866791, + "num_tokens": 261846.0, + "repeat_count": 0.0, + "routers_loss": 0.034902360290288925, + "skip_count": 1.0, + "step": 162, + "text_loss": 0.3727971017360687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7701790431464631, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000326, + "loss": 0.076, + "macro_f1": 0.3333333432674408, + "num_tokens": 264348.0, + "repeat_count": 0.0, + "routers_loss": 0.013553355820477009, + "skip_count": 0.0, + "step": 164, + "text_loss": 0.5798237323760986 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7795714705019078, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.408203125, + "learning_rate": 0.00033, + "loss": 0.0926, + "macro_f1": 0.32098764181137085, + "num_tokens": 267479.0, + "repeat_count": 1.0, + "routers_loss": 0.13571743667125702, + "skip_count": 1.0, + "step": 166, + "text_loss": 0.8084776997566223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7889638978573525, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2431640625, + "learning_rate": 0.00033400000000000004, + "loss": 0.0817, + "macro_f1": 0.32098764181137085, + "num_tokens": 270268.0, + "repeat_count": 2.0, + "routers_loss": 0.19884146749973297, + "skip_count": 0.0, + "step": 168, + "text_loss": 0.7366134524345398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7983563252127972, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.00033800000000000003, + "loss": 0.1022, + "macro_f1": 0.32098764181137085, + "num_tokens": 273518.0, + "repeat_count": 1.0, + "routers_loss": 0.15469175577163696, + "skip_count": 1.0, + "step": 170, + "text_loss": 0.27204006910324097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8077487525682419, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000342, + "loss": 0.0865, + "macro_f1": 0.32098764181137085, + "num_tokens": 277210.0, + "repeat_count": 0.0, + "routers_loss": 0.08603330701589584, + "skip_count": 2.0, + "step": 172, + "text_loss": 0.7137667536735535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8171411799236865, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000346, + "loss": 0.0902, + "macro_f1": 0.3076923191547394, + "num_tokens": 280389.0, + "repeat_count": 0.0, + "routers_loss": 0.17851492762565613, + "skip_count": 4.0, + "step": 174, + "text_loss": 0.5148105621337891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8265336072791312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.00035, + "loss": 0.0853, + "macro_f1": 0.3333333432674408, + "num_tokens": 283501.0, + "repeat_count": 0.0, + "routers_loss": 0.021331604570150375, + "skip_count": 0.0, + "step": 176, + "text_loss": 0.301013320684433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8359260346345758, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000354, + "loss": 0.0911, + "macro_f1": 0.32098764181137085, + "num_tokens": 287154.0, + "repeat_count": 0.0, + "routers_loss": 0.057273946702480316, + "skip_count": 2.0, + "step": 178, + "text_loss": 0.4740981459617615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8453184619900206, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.240234375, + "learning_rate": 0.000358, + "loss": 0.0904, + "macro_f1": 0.3272727429866791, + "num_tokens": 289929.0, + "repeat_count": 0.0, + "routers_loss": 0.04116598889231682, + "skip_count": 1.0, + "step": 180, + "text_loss": 0.4838573932647705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8547108893454652, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.14453125, + "learning_rate": 0.000362, + "loss": 0.0991, + "macro_f1": 0.3333333432674408, + "num_tokens": 294293.0, + "repeat_count": 0.0, + "routers_loss": 0.027111956849694252, + "skip_count": 0.0, + "step": 182, + "text_loss": 0.7495553493499756 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8641033167009099, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.158203125, + "learning_rate": 0.000366, + "loss": 0.1038, + "macro_f1": 0.3333333432674408, + "num_tokens": 297730.0, + "repeat_count": 0.0, + "routers_loss": 0.019166452810168266, + "skip_count": 0.0, + "step": 184, + "text_loss": 0.534831166267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 0.8734957440563546, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2236328125, + "learning_rate": 0.00037, + "loss": 0.0784, + "macro_f1": 0.5427350401878357, + "num_tokens": 300593.0, + "repeat_count": 1.0, + "routers_loss": 0.2349659502506256, + "skip_count": 2.0, + "step": 186, + "text_loss": 0.3549048602581024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8828881714117992, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2041015625, + "learning_rate": 0.000374, + "loss": 0.0827, + "macro_f1": 0.3076923191547394, + "num_tokens": 303456.0, + "repeat_count": 2.0, + "routers_loss": 0.22502389550209045, + "skip_count": 2.0, + "step": 188, + "text_loss": 0.8837642073631287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8922805987672439, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000378, + "loss": 0.1085, + "macro_f1": 0.3272727429866791, + "num_tokens": 306241.0, + "repeat_count": 1.0, + "routers_loss": 0.12291611731052399, + "skip_count": 0.0, + "step": 190, + "text_loss": 0.73353511095047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9016730261226886, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000382, + "loss": 0.0969, + "macro_f1": 0.3272727429866791, + "num_tokens": 310606.0, + "repeat_count": 0.0, + "routers_loss": 0.055988848209381104, + "skip_count": 1.0, + "step": 192, + "text_loss": 0.6261917352676392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9110654534781333, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.34375, + "learning_rate": 0.000386, + "loss": 0.1055, + "macro_f1": 0.3144654333591461, + "num_tokens": 313564.0, + "repeat_count": 0.0, + "routers_loss": 0.12363404780626297, + "skip_count": 3.0, + "step": 194, + "text_loss": 0.2790874242782593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9204578808335779, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.00039000000000000005, + "loss": 0.0964, + "macro_f1": 0.3076923191547394, + "num_tokens": 316958.0, + "repeat_count": 2.0, + "routers_loss": 0.2718356251716614, + "skip_count": 2.0, + "step": 196, + "text_loss": 0.14428086578845978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9298503081890226, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2021484375, + "learning_rate": 0.00039400000000000004, + "loss": 0.0917, + "macro_f1": 0.32098764181137085, + "num_tokens": 320103.0, + "repeat_count": 0.0, + "routers_loss": 0.07188102602958679, + "skip_count": 2.0, + "step": 198, + "text_loss": 0.27155816555023193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9392427355444672, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.201171875, + "learning_rate": 0.000398, + "loss": 0.0809, + "macro_f1": 0.32098764181137085, + "num_tokens": 323566.0, + "repeat_count": 1.0, + "routers_loss": 0.18038256466388702, + "skip_count": 1.0, + "step": 200, + "text_loss": 0.8453494310379028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9486351628999119, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.000402, + "loss": 0.0801, + "macro_f1": 0.3333333432674408, + "num_tokens": 326385.0, + "repeat_count": 0.0, + "routers_loss": 0.014639763161540031, + "skip_count": 0.0, + "step": 202, + "text_loss": 0.5733131766319275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9580275902553567, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21875, + "learning_rate": 0.00040600000000000006, + "loss": 0.104, + "macro_f1": 0.3333333432674408, + "num_tokens": 329266.0, + "repeat_count": 0.0, + "routers_loss": 0.015269627794623375, + "skip_count": 0.0, + "step": 204, + "text_loss": 0.7355639934539795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9674200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.00041, + "loss": 0.0833, + "macro_f1": 0.3333333432674408, + "num_tokens": 332984.0, + "repeat_count": 0.0, + "routers_loss": 0.018046971410512924, + "skip_count": 0.0, + "step": 206, + "text_loss": 0.587641179561615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.000414, + "loss": 0.0588, + "macro_f1": 0.3272727429866791, + "num_tokens": 335739.0, + "repeat_count": 1.0, + "routers_loss": 0.12791286408901215, + "skip_count": 0.0, + "step": 208, + "text_loss": 0.6538406610488892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9862048723216906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.24609375, + "learning_rate": 0.00041799999999999997, + "loss": 0.0732, + "macro_f1": 0.3272727429866791, + "num_tokens": 338966.0, + "repeat_count": 0.0, + "routers_loss": 0.050490595400333405, + "skip_count": 1.0, + "step": 210, + "text_loss": 0.4188295602798462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9955972996771353, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000422, + "loss": 0.0588, + "macro_f1": 0.3144654333591461, + "num_tokens": 342063.0, + "repeat_count": 0.0, + "routers_loss": 0.11652113497257233, + "skip_count": 3.0, + "step": 212, + "text_loss": 0.21822240948677063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0046962136777224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.000426, + "loss": 0.0621, + "macro_f1": 0.3333333432674408, + "num_tokens": 344887.0, + "repeat_count": 0.0, + "routers_loss": 0.023898238316178322, + "skip_count": 0.0, + "step": 214, + "text_loss": 0.24692800641059875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.014088641033167, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3671875, + "learning_rate": 0.00043, + "loss": 0.1005, + "macro_f1": 0.3272727429866791, + "num_tokens": 348700.0, + "repeat_count": 1.0, + "routers_loss": 0.06414655596017838, + "skip_count": 0.0, + "step": 216, + "text_loss": 0.4744548797607422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0234810683886117, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.00043400000000000003, + "loss": 0.0753, + "macro_f1": 0.32098764181137085, + "num_tokens": 351507.0, + "repeat_count": 1.0, + "routers_loss": 0.11702914535999298, + "skip_count": 1.0, + "step": 218, + "text_loss": 0.5614864826202393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0328734957440564, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000438, + "loss": 0.0792, + "macro_f1": 0.3333333432674408, + "num_tokens": 354484.0, + "repeat_count": 0.0, + "routers_loss": 0.014991643838584423, + "skip_count": 0.0, + "step": 220, + "text_loss": 0.47209832072257996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.042265923099501, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.251953125, + "learning_rate": 0.000442, + "loss": 0.106, + "macro_f1": 0.3272727429866791, + "num_tokens": 357954.0, + "repeat_count": 0.0, + "routers_loss": 0.04747112840414047, + "skip_count": 1.0, + "step": 222, + "text_loss": 0.2968728244304657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0516583504549457, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.40234375, + "learning_rate": 0.000446, + "loss": 0.0853, + "macro_f1": 0.32098764181137085, + "num_tokens": 360547.0, + "repeat_count": 0.0, + "routers_loss": 0.06754162162542343, + "skip_count": 2.0, + "step": 224, + "text_loss": 0.2364148646593094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0610507778103904, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.00045000000000000004, + "loss": 0.1016, + "macro_f1": 0.3272727429866791, + "num_tokens": 364529.0, + "repeat_count": 0.0, + "routers_loss": 0.07830183953046799, + "skip_count": 1.0, + "step": 226, + "text_loss": 0.4787476360797882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.00045400000000000003, + "loss": 0.0792, + "macro_f1": 0.3333333432674408, + "num_tokens": 367683.0, + "repeat_count": 0.0, + "routers_loss": 0.015735948458313942, + "skip_count": 0.0, + "step": 228, + "text_loss": 0.37148505449295044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.000458, + "loss": 0.0995, + "macro_f1": 0.3333333432674408, + "num_tokens": 371402.0, + "repeat_count": 0.0, + "routers_loss": 0.013354359194636345, + "skip_count": 0.0, + "step": 230, + "text_loss": 0.7464763522148132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.000462, + "loss": 0.0731, + "macro_f1": 0.3333333432674408, + "num_tokens": 374587.0, + "repeat_count": 0.0, + "routers_loss": 0.013763721100986004, + "skip_count": 0.0, + "step": 232, + "text_loss": 0.8754443526268005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.098620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3984375, + "learning_rate": 0.00046600000000000005, + "loss": 0.0861, + "macro_f1": 0.3333333432674408, + "num_tokens": 377513.0, + "repeat_count": 0.0, + "routers_loss": 0.010075435042381287, + "skip_count": 0.0, + "step": 234, + "text_loss": 0.31534913182258606 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1080129145876136, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.00047, + "loss": 0.0791, + "macro_f1": 0.3272727429866791, + "num_tokens": 380736.0, + "repeat_count": 0.0, + "routers_loss": 0.059825167059898376, + "skip_count": 1.0, + "step": 236, + "text_loss": 0.5936337113380432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1174053419430585, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000474, + "loss": 0.0514, + "macro_f1": 0.32098764181137085, + "num_tokens": 383236.0, + "repeat_count": 0.0, + "routers_loss": 0.09134846180677414, + "skip_count": 2.0, + "step": 238, + "text_loss": 0.5976157784461975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1267977692985032, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.208984375, + "learning_rate": 0.00047799999999999996, + "loss": 0.0858, + "macro_f1": 0.32098764181137085, + "num_tokens": 385778.0, + "repeat_count": 1.0, + "routers_loss": 0.11989791691303253, + "skip_count": 1.0, + "step": 240, + "text_loss": 0.3554210960865021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1361901966539478, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.000482, + "loss": 0.0734, + "macro_f1": 0.3333333432674408, + "num_tokens": 388777.0, + "repeat_count": 0.0, + "routers_loss": 0.013591105118393898, + "skip_count": 0.0, + "step": 242, + "text_loss": 0.4829460382461548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1455826240093925, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12060546875, + "learning_rate": 0.000486, + "loss": 0.0625, + "macro_f1": 0.32098764181137085, + "num_tokens": 391797.0, + "repeat_count": 0.0, + "routers_loss": 0.0920003354549408, + "skip_count": 2.0, + "step": 244, + "text_loss": 0.3085818886756897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1549750513648371, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.00049, + "loss": 0.0501, + "macro_f1": 0.3333333432674408, + "num_tokens": 396485.0, + "repeat_count": 0.0, + "routers_loss": 0.0129330949857831, + "skip_count": 0.0, + "step": 246, + "text_loss": 0.42803969979286194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1643674787202818, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.296875, + "learning_rate": 0.000494, + "loss": 0.0945, + "macro_f1": 0.3144654333591461, + "num_tokens": 399923.0, + "repeat_count": 0.0, + "routers_loss": 0.10677755624055862, + "skip_count": 3.0, + "step": 248, + "text_loss": 0.2908555567264557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1737599060757264, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.000498, + "loss": 0.0812, + "macro_f1": 0.3144654333591461, + "num_tokens": 403647.0, + "repeat_count": 0.0, + "routers_loss": 0.1504337340593338, + "skip_count": 3.0, + "step": 250, + "text_loss": 0.333095908164978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.183152333431171, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.22265625, + "learning_rate": 0.0005020000000000001, + "loss": 0.0828, + "macro_f1": 0.32098764181137085, + "num_tokens": 409147.0, + "repeat_count": 0.0, + "routers_loss": 0.06503184884786606, + "skip_count": 2.0, + "step": 252, + "text_loss": 0.16117942333221436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1925447607866158, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.287109375, + "learning_rate": 0.000506, + "loss": 0.0995, + "macro_f1": 0.3333333432674408, + "num_tokens": 412072.0, + "repeat_count": 0.0, + "routers_loss": 0.016280122101306915, + "skip_count": 0.0, + "step": 254, + "text_loss": 0.4217492640018463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2019371881420604, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.00051, + "loss": 0.0803, + "macro_f1": 0.3144654333591461, + "num_tokens": 415052.0, + "repeat_count": 2.0, + "routers_loss": 0.2117508500814438, + "skip_count": 1.0, + "step": 256, + "text_loss": 0.5795308947563171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.211329615497505, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2421875, + "learning_rate": 0.000514, + "loss": 0.0668, + "macro_f1": 0.3272727429866791, + "num_tokens": 418099.0, + "repeat_count": 1.0, + "routers_loss": 0.15002092719078064, + "skip_count": 0.0, + "step": 258, + "text_loss": 0.4840938448905945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2207220428529497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.000518, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 422526.0, + "repeat_count": 0.0, + "routers_loss": 0.012834074907004833, + "skip_count": 0.0, + "step": 260, + "text_loss": 0.36141225695610046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2301144702083944, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.000522, + "loss": 0.085, + "macro_f1": 0.3076923191547394, + "num_tokens": 425765.0, + "repeat_count": 2.0, + "routers_loss": 0.23808011412620544, + "skip_count": 2.0, + "step": 262, + "text_loss": 0.27572691440582275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2395068975638392, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000526, + "loss": 0.0708, + "macro_f1": 0.3272727429866791, + "num_tokens": 429048.0, + "repeat_count": 0.0, + "routers_loss": 0.055687375366687775, + "skip_count": 1.0, + "step": 264, + "text_loss": 0.37020301818847656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.248899324919284, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005300000000000001, + "loss": 0.0839, + "macro_f1": 0.3272727429866791, + "num_tokens": 431784.0, + "repeat_count": 0.0, + "routers_loss": 0.0872957780957222, + "skip_count": 1.0, + "step": 266, + "text_loss": 0.5937283039093018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2582917522747286, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.0005340000000000001, + "loss": 0.0733, + "macro_f1": 0.32098764181137085, + "num_tokens": 434297.0, + "repeat_count": 2.0, + "routers_loss": 0.23507654666900635, + "skip_count": 0.0, + "step": 268, + "text_loss": 0.3367372453212738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2676841796301732, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005380000000000001, + "loss": 0.0708, + "macro_f1": 0.32098764181137085, + "num_tokens": 437586.0, + "repeat_count": 0.0, + "routers_loss": 0.12860390543937683, + "skip_count": 2.0, + "step": 270, + "text_loss": 0.7149854302406311 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2770766069856179, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005420000000000001, + "loss": 0.1072, + "macro_f1": 0.3272727429866791, + "num_tokens": 440649.0, + "repeat_count": 0.0, + "routers_loss": 0.044308312237262726, + "skip_count": 1.0, + "step": 272, + "text_loss": 0.26778292655944824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2864690343410625, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.44921875, + "learning_rate": 0.000546, + "loss": 0.0938, + "macro_f1": 0.3144654333591461, + "num_tokens": 443907.0, + "repeat_count": 0.0, + "routers_loss": 0.11514109373092651, + "skip_count": 3.0, + "step": 274, + "text_loss": 0.23578761518001556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 1.2958614616965072, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2578125, + "learning_rate": 0.00055, + "loss": 0.0932, + "macro_f1": 0.5492662787437439, + "num_tokens": 447147.0, + "repeat_count": 0.0, + "routers_loss": 0.055705297738313675, + "skip_count": 2.0, + "step": 276, + "text_loss": 0.2513524889945984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3052538890519518, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.29296875, + "learning_rate": 0.000554, + "loss": 0.0667, + "macro_f1": 0.32098764181137085, + "num_tokens": 450032.0, + "repeat_count": 0.0, + "routers_loss": 0.13778971135616302, + "skip_count": 2.0, + "step": 278, + "text_loss": 0.4857243597507477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3146463164073965, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.000558, + "loss": 0.0672, + "macro_f1": 0.3272727429866791, + "num_tokens": 453195.0, + "repeat_count": 1.0, + "routers_loss": 0.0700262188911438, + "skip_count": 0.0, + "step": 280, + "text_loss": 0.7589789628982544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3240387437628411, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.0005620000000000001, + "loss": 0.0603, + "macro_f1": 0.3144654333591461, + "num_tokens": 455942.0, + "repeat_count": 1.0, + "routers_loss": 0.11706235259771347, + "skip_count": 2.0, + "step": 282, + "text_loss": 0.4783432185649872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3334311711182858, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.265625, + "learning_rate": 0.000566, + "loss": 0.0793, + "macro_f1": 0.3272727429866791, + "num_tokens": 458932.0, + "repeat_count": 0.0, + "routers_loss": 0.07073967158794403, + "skip_count": 1.0, + "step": 284, + "text_loss": 0.7117193937301636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3428235984737307, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.00057, + "loss": 0.0915, + "macro_f1": 0.3272727429866791, + "num_tokens": 462650.0, + "repeat_count": 0.0, + "routers_loss": 0.05301115661859512, + "skip_count": 1.0, + "step": 286, + "text_loss": 0.4175460636615753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.352216025829175, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000574, + "loss": 0.0675, + "macro_f1": 0.3272727429866791, + "num_tokens": 466290.0, + "repeat_count": 0.0, + "routers_loss": 0.06356479972600937, + "skip_count": 1.0, + "step": 288, + "text_loss": 0.5832946300506592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.36160845318462, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28515625, + "learning_rate": 0.000578, + "loss": 0.0805, + "macro_f1": 0.3006536066532135, + "num_tokens": 469296.0, + "repeat_count": 1.0, + "routers_loss": 0.21032999455928802, + "skip_count": 3.0, + "step": 290, + "text_loss": 0.36023473739624023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3710008805400646, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.0005819999999999999, + "loss": 0.0685, + "macro_f1": 0.32098764181137085, + "num_tokens": 472272.0, + "repeat_count": 1.0, + "routers_loss": 0.08062280714511871, + "skip_count": 1.0, + "step": 292, + "text_loss": 0.37197956442832947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3803933078955093, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0005859999999999999, + "loss": 0.0878, + "macro_f1": 0.32098764181137085, + "num_tokens": 475864.0, + "repeat_count": 0.0, + "routers_loss": 0.05023600533604622, + "skip_count": 2.0, + "step": 294, + "text_loss": 0.4765273630619049 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.389785735250954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2177734375, + "learning_rate": 0.00059, + "loss": 0.0728, + "macro_f1": 0.3333333432674408, + "num_tokens": 478916.0, + "repeat_count": 0.0, + "routers_loss": 0.011689410544931889, + "skip_count": 0.0, + "step": 296, + "text_loss": 0.5878773927688599 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3991781626063986, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000594, + "loss": 0.0727, + "macro_f1": 0.3333333432674408, + "num_tokens": 482369.0, + "repeat_count": 0.0, + "routers_loss": 0.010772093199193478, + "skip_count": 0.0, + "step": 298, + "text_loss": 0.4424116313457489 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4085705899618433, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.181640625, + "learning_rate": 0.000598, + "loss": 0.0787, + "macro_f1": 0.3076923191547394, + "num_tokens": 486049.0, + "repeat_count": 2.0, + "routers_loss": 0.23482851684093475, + "skip_count": 2.0, + "step": 300, + "text_loss": 0.21217775344848633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.417963017317288, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.000602, + "loss": 0.073, + "macro_f1": 0.3076923191547394, + "num_tokens": 488683.0, + "repeat_count": 1.0, + "routers_loss": 0.18843084573745728, + "skip_count": 3.0, + "step": 302, + "text_loss": 0.2109498232603073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4273554446727326, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.279296875, + "learning_rate": 0.000606, + "loss": 0.0945, + "macro_f1": 0.3144654333591461, + "num_tokens": 492010.0, + "repeat_count": 0.0, + "routers_loss": 0.17861786484718323, + "skip_count": 3.0, + "step": 304, + "text_loss": 0.8446305394172668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4367478720281772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.00061, + "loss": 0.0827, + "macro_f1": 0.3333333432674408, + "num_tokens": 494764.0, + "repeat_count": 0.0, + "routers_loss": 0.014124520123004913, + "skip_count": 0.0, + "step": 306, + "text_loss": 0.742735743522644 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4461402993836219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.000614, + "loss": 0.1071, + "macro_f1": 0.3333333432674408, + "num_tokens": 497820.0, + "repeat_count": 0.0, + "routers_loss": 0.017968112602829933, + "skip_count": 0.0, + "step": 308, + "text_loss": 0.28305482864379883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4555327267390665, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1689453125, + "learning_rate": 0.0006180000000000001, + "loss": 0.0775, + "macro_f1": 0.32098764181137085, + "num_tokens": 500694.0, + "repeat_count": 0.0, + "routers_loss": 0.08593655377626419, + "skip_count": 2.0, + "step": 310, + "text_loss": 0.3496848940849304 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4649251540945114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19140625, + "learning_rate": 0.000622, + "loss": 0.061, + "macro_f1": 0.3333333432674408, + "num_tokens": 503871.0, + "repeat_count": 0.0, + "routers_loss": 0.016449492424726486, + "skip_count": 0.0, + "step": 312, + "text_loss": 0.6691372990608215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4743175814499558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.000626, + "loss": 0.0815, + "macro_f1": 0.3333333432674408, + "num_tokens": 506730.0, + "repeat_count": 0.0, + "routers_loss": 0.014532964676618576, + "skip_count": 0.0, + "step": 314, + "text_loss": 0.6118118166923523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4837100088054007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2216796875, + "learning_rate": 0.00063, + "loss": 0.0742, + "macro_f1": 0.3333333432674408, + "num_tokens": 510323.0, + "repeat_count": 0.0, + "routers_loss": 0.013093139044940472, + "skip_count": 0.0, + "step": 316, + "text_loss": 0.38126271963119507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4931024361608454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.400390625, + "learning_rate": 0.000634, + "loss": 0.0915, + "macro_f1": 0.3333333432674408, + "num_tokens": 514075.0, + "repeat_count": 0.0, + "routers_loss": 0.008627045899629593, + "skip_count": 0.0, + "step": 318, + "text_loss": 0.5983037948608398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000638, + "loss": 0.1008, + "macro_f1": 0.3272727429866791, + "num_tokens": 517418.0, + "repeat_count": 0.0, + "routers_loss": 0.04561378434300423, + "skip_count": 1.0, + "step": 320, + "text_loss": 0.767257034778595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.5118872908717347, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.259765625, + "learning_rate": 0.000642, + "loss": 0.0926, + "macro_f1": 0.3272727429866791, + "num_tokens": 520443.0, + "repeat_count": 0.0, + "routers_loss": 0.024372953921556473, + "skip_count": 0.0, + "step": 322, + "text_loss": 0.6572105884552002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5212797182271793, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30078125, + "learning_rate": 0.000646, + "loss": 0.0822, + "macro_f1": 0.3272727429866791, + "num_tokens": 523317.0, + "repeat_count": 1.0, + "routers_loss": 0.08099937438964844, + "skip_count": 0.0, + "step": 324, + "text_loss": 0.205499529838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.530672145582624, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.0006500000000000001, + "loss": 0.0809, + "macro_f1": 0.32098767161369324, + "num_tokens": 526355.0, + "repeat_count": 0.0, + "routers_loss": 0.0657225176692009, + "skip_count": 1.0, + "step": 326, + "text_loss": 0.2587239742279053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5400645729380686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.111328125, + "learning_rate": 0.0006540000000000001, + "loss": 0.0779, + "macro_f1": 0.3333333432674408, + "num_tokens": 529689.0, + "repeat_count": 0.0, + "routers_loss": 0.01849208027124405, + "skip_count": 0.0, + "step": 328, + "text_loss": 0.2172023057937622 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5494570002935135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1845703125, + "learning_rate": 0.0006580000000000001, + "loss": 0.0758, + "macro_f1": 0.3333333432674408, + "num_tokens": 532603.0, + "repeat_count": 0.0, + "routers_loss": 0.016184113919734955, + "skip_count": 0.0, + "step": 330, + "text_loss": 0.5980568528175354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.000662, + "loss": 0.0439, + "macro_f1": 0.3333333432674408, + "num_tokens": 536056.0, + "repeat_count": 0.0, + "routers_loss": 0.01303898449987173, + "skip_count": 0.0, + "step": 332, + "text_loss": 0.5421966314315796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 1.5682418550044028, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.296875, + "learning_rate": 0.000666, + "loss": 0.0963, + "macro_f1": 0.465986430644989, + "num_tokens": 539231.0, + "repeat_count": 3.0, + "routers_loss": 0.3075675964355469, + "skip_count": 3.0, + "step": 334, + "text_loss": 0.19719554483890533 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5776342823598473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.00067, + "loss": 0.0706, + "macro_f1": 0.3333333432674408, + "num_tokens": 542038.0, + "repeat_count": 0.0, + "routers_loss": 0.009116224013268948, + "skip_count": 0.0, + "step": 336, + "text_loss": 0.3407036066055298 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5870267097152921, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2421875, + "learning_rate": 0.000674, + "loss": 0.0768, + "macro_f1": 0.3333333432674408, + "num_tokens": 545019.0, + "repeat_count": 0.0, + "routers_loss": 0.021463042125105858, + "skip_count": 0.0, + "step": 338, + "text_loss": 0.24486012756824493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5964191370707366, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.0006780000000000001, + "loss": 0.0889, + "macro_f1": 0.3333333432674408, + "num_tokens": 548036.0, + "repeat_count": 0.0, + "routers_loss": 0.01857556402683258, + "skip_count": 0.0, + "step": 340, + "text_loss": 0.28140124678611755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6058115644261814, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0006820000000000001, + "loss": 0.0617, + "macro_f1": 0.3006536364555359, + "num_tokens": 551419.0, + "repeat_count": 2.0, + "routers_loss": 0.27090007066726685, + "skip_count": 3.0, + "step": 342, + "text_loss": 0.20690307021141052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.615203991781626, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3046875, + "learning_rate": 0.0006860000000000001, + "loss": 0.1047, + "macro_f1": 0.32098764181137085, + "num_tokens": 554037.0, + "repeat_count": 0.0, + "routers_loss": 0.09231195598840714, + "skip_count": 2.0, + "step": 344, + "text_loss": 0.4479128420352936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6245964191370708, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.00069, + "loss": 0.0883, + "macro_f1": 0.3333333432674408, + "num_tokens": 556672.0, + "repeat_count": 0.0, + "routers_loss": 0.00935924518853426, + "skip_count": 0.0, + "step": 346, + "text_loss": 0.6377320289611816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6339888464925154, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.000694, + "loss": 0.0781, + "macro_f1": 0.32098764181137085, + "num_tokens": 559756.0, + "repeat_count": 0.0, + "routers_loss": 0.17641772329807281, + "skip_count": 2.0, + "step": 348, + "text_loss": 0.6097636222839355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 1.64338127384796, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.30078125, + "learning_rate": 0.0006979999999999999, + "loss": 0.0616, + "macro_f1": 0.5492662787437439, + "num_tokens": 563415.0, + "repeat_count": 0.0, + "routers_loss": 0.06240406632423401, + "skip_count": 2.0, + "step": 350, + "text_loss": 0.5291631817817688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6527737012034047, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.296875, + "learning_rate": 0.0007019999999999999, + "loss": 0.1026, + "macro_f1": 0.3333333432674408, + "num_tokens": 566357.0, + "repeat_count": 0.0, + "routers_loss": 0.012269247323274612, + "skip_count": 0.0, + "step": 352, + "text_loss": 0.5170195698738098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6621661285588494, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0007059999999999999, + "loss": 0.0815, + "macro_f1": 0.32098764181137085, + "num_tokens": 569449.0, + "repeat_count": 0.0, + "routers_loss": 0.07515309751033783, + "skip_count": 2.0, + "step": 354, + "text_loss": 0.34507250785827637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6715585559142943, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.00071, + "loss": 0.0791, + "macro_f1": 0.3144654333591461, + "num_tokens": 572761.0, + "repeat_count": 1.0, + "routers_loss": 0.20768006145954132, + "skip_count": 2.0, + "step": 356, + "text_loss": 0.3158532381057739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6809509832697387, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.000714, + "loss": 0.0682, + "macro_f1": 0.3333333432674408, + "num_tokens": 575909.0, + "repeat_count": 0.0, + "routers_loss": 0.025329967960715294, + "skip_count": 0.0, + "step": 358, + "text_loss": 0.21455390751361847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.6903434106251836, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.000718, + "loss": 0.0775, + "macro_f1": 0.32098767161369324, + "num_tokens": 579186.0, + "repeat_count": 1.0, + "routers_loss": 0.07676175981760025, + "skip_count": 0.0, + "step": 360, + "text_loss": 0.61895352602005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.699735837980628, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000722, + "loss": 0.0781, + "macro_f1": 0.32098767161369324, + "num_tokens": 582437.0, + "repeat_count": 0.0, + "routers_loss": 0.08070661872625351, + "skip_count": 1.0, + "step": 362, + "text_loss": 0.20557661354541779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7091282653360729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2216796875, + "learning_rate": 0.000726, + "loss": 0.11, + "macro_f1": 0.3333333432674408, + "num_tokens": 586096.0, + "repeat_count": 0.0, + "routers_loss": 0.015891313552856445, + "skip_count": 0.0, + "step": 364, + "text_loss": 0.597991943359375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7185206926915173, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.00073, + "loss": 0.0573, + "macro_f1": 0.3076923191547394, + "num_tokens": 589520.0, + "repeat_count": 1.0, + "routers_loss": 0.12844261527061462, + "skip_count": 3.0, + "step": 366, + "text_loss": 0.2944789230823517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7279131200469622, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.000734, + "loss": 0.1005, + "macro_f1": 0.3333333432674408, + "num_tokens": 592691.0, + "repeat_count": 0.0, + "routers_loss": 0.02382199838757515, + "skip_count": 0.0, + "step": 368, + "text_loss": 0.23989969491958618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7373055474024068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.000738, + "loss": 0.0661, + "macro_f1": 0.3333333432674408, + "num_tokens": 596004.0, + "repeat_count": 0.0, + "routers_loss": 0.018812084570527077, + "skip_count": 0.0, + "step": 370, + "text_loss": 0.22111408412456512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7466979747578515, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.000742, + "loss": 0.0666, + "macro_f1": 0.3272727429866791, + "num_tokens": 599087.0, + "repeat_count": 0.0, + "routers_loss": 0.08290331065654755, + "skip_count": 1.0, + "step": 372, + "text_loss": 0.2567356526851654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7560904021132961, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.000746, + "loss": 0.0941, + "macro_f1": 0.32098764181137085, + "num_tokens": 602330.0, + "repeat_count": 1.0, + "routers_loss": 0.11482042074203491, + "skip_count": 1.0, + "step": 374, + "text_loss": 0.7217292785644531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7654828294687408, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2265625, + "learning_rate": 0.00075, + "loss": 0.0728, + "macro_f1": 0.3272727429866791, + "num_tokens": 605503.0, + "repeat_count": 1.0, + "routers_loss": 0.11849870532751083, + "skip_count": 0.0, + "step": 376, + "text_loss": 0.5122153759002686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.7748752568241855, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2333984375, + "learning_rate": 0.000754, + "loss": 0.0835, + "macro_f1": 0.32098767161369324, + "num_tokens": 608505.0, + "repeat_count": 0.0, + "routers_loss": 0.07090992480516434, + "skip_count": 1.0, + "step": 378, + "text_loss": 0.2204965502023697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.78426768417963, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.000758, + "loss": 0.0794, + "macro_f1": 0.3272727429866791, + "num_tokens": 611193.0, + "repeat_count": 0.0, + "routers_loss": 0.03812089189887047, + "skip_count": 1.0, + "step": 380, + "text_loss": 0.44909021258354187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.793660111535075, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1689453125, + "learning_rate": 0.000762, + "loss": 0.0882, + "macro_f1": 0.3272727429866791, + "num_tokens": 614231.0, + "repeat_count": 1.0, + "routers_loss": 0.10270529240369797, + "skip_count": 0.0, + "step": 382, + "text_loss": 0.13624964654445648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8030525388905194, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.330078125, + "learning_rate": 0.0007660000000000001, + "loss": 0.1107, + "macro_f1": 0.32098764181137085, + "num_tokens": 617090.0, + "repeat_count": 1.0, + "routers_loss": 0.11624004691839218, + "skip_count": 1.0, + "step": 384, + "text_loss": 0.7314052581787109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8124449662459643, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0007700000000000001, + "loss": 0.0628, + "macro_f1": 0.32098764181137085, + "num_tokens": 620596.0, + "repeat_count": 0.0, + "routers_loss": 0.07114322483539581, + "skip_count": 2.0, + "step": 386, + "text_loss": 0.503322958946228 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8218373936014087, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.0007740000000000001, + "loss": 0.0829, + "macro_f1": 0.32098764181137085, + "num_tokens": 624108.0, + "repeat_count": 0.0, + "routers_loss": 0.06061873584985733, + "skip_count": 2.0, + "step": 388, + "text_loss": 0.11481904983520508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8312298209568536, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2099609375, + "learning_rate": 0.000778, + "loss": 0.0791, + "macro_f1": 0.3006536364555359, + "num_tokens": 626895.0, + "repeat_count": 1.0, + "routers_loss": 0.2921771705150604, + "skip_count": 4.0, + "step": 390, + "text_loss": 0.3069624602794647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8406222483122983, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30859375, + "learning_rate": 0.000782, + "loss": 0.0605, + "macro_f1": 0.3076923191547394, + "num_tokens": 630204.0, + "repeat_count": 0.0, + "routers_loss": 0.202707901597023, + "skip_count": 4.0, + "step": 392, + "text_loss": 0.6022785305976868 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.29296875, + "learning_rate": 0.000786, + "loss": 0.0877, + "macro_f1": 0.3333333432674408, + "num_tokens": 634373.0, + "repeat_count": 0.0, + "routers_loss": 0.0221510399132967, + "skip_count": 0.0, + "step": 394, + "text_loss": 0.26787394285202026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8594071030231876, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.37890625, + "learning_rate": 0.00079, + "loss": 0.0805, + "macro_f1": 0.32098764181137085, + "num_tokens": 637442.0, + "repeat_count": 2.0, + "routers_loss": 0.12636390328407288, + "skip_count": 0.0, + "step": 396, + "text_loss": 0.2799781560897827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8687995303786322, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.0007940000000000001, + "loss": 0.0724, + "macro_f1": 0.32098764181137085, + "num_tokens": 641231.0, + "repeat_count": 0.0, + "routers_loss": 0.07933453470468521, + "skip_count": 2.0, + "step": 398, + "text_loss": 0.2507784366607666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8781919577340769, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.0007980000000000001, + "loss": 0.0909, + "macro_f1": 0.3272727429866791, + "num_tokens": 644560.0, + "repeat_count": 1.0, + "routers_loss": 0.10324911028146744, + "skip_count": 0.0, + "step": 400, + "text_loss": 0.7756280303001404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8875843850895215, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0008020000000000001, + "loss": 0.0783, + "macro_f1": 0.3144654333591461, + "num_tokens": 647393.0, + "repeat_count": 1.0, + "routers_loss": 0.18546262383460999, + "skip_count": 2.0, + "step": 402, + "text_loss": 0.5013328194618225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8969768124449664, + "f1_execute": 0.8571428656578064, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0008060000000000001, + "loss": 0.0787, + "macro_f1": 0.2857142984867096, + "num_tokens": 650355.0, + "repeat_count": 3.0, + "routers_loss": 0.3280293643474579, + "skip_count": 4.0, + "step": 404, + "text_loss": 0.2842077314853668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9063692398004108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.0008100000000000001, + "loss": 0.0901, + "macro_f1": 0.3333333432674408, + "num_tokens": 654280.0, + "repeat_count": 0.0, + "routers_loss": 0.02623247355222702, + "skip_count": 0.0, + "step": 406, + "text_loss": 0.46742817759513855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9157616671558557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.0008139999999999999, + "loss": 0.0945, + "macro_f1": 0.3333333432674408, + "num_tokens": 657568.0, + "repeat_count": 0.0, + "routers_loss": 0.009744114242494106, + "skip_count": 0.0, + "step": 408, + "text_loss": 0.7168047428131104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9251540945113002, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.0008179999999999999, + "loss": 0.1065, + "macro_f1": 0.32098764181137085, + "num_tokens": 660593.0, + "repeat_count": 0.0, + "routers_loss": 0.07591600716114044, + "skip_count": 2.0, + "step": 410, + "text_loss": 0.449823260307312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0008219999999999999, + "loss": 0.0795, + "macro_f1": 0.3333333432674408, + "num_tokens": 663916.0, + "repeat_count": 0.0, + "routers_loss": 0.02076602540910244, + "skip_count": 0.0, + "step": 412, + "text_loss": 0.4764713943004608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9439389492221895, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.000826, + "loss": 0.0836, + "macro_f1": 0.3272727429866791, + "num_tokens": 667502.0, + "repeat_count": 0.0, + "routers_loss": 0.049170155078172684, + "skip_count": 1.0, + "step": 414, + "text_loss": 0.30333325266838074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9533313765776343, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.00083, + "loss": 0.1021, + "macro_f1": 0.3272727429866791, + "num_tokens": 670510.0, + "repeat_count": 1.0, + "routers_loss": 0.15554003417491913, + "skip_count": 0.0, + "step": 416, + "text_loss": 0.3691870868206024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000834, + "loss": 0.1013, + "macro_f1": 0.3333333432674408, + "num_tokens": 674761.0, + "repeat_count": 0.0, + "routers_loss": 0.024516675621271133, + "skip_count": 0.0, + "step": 418, + "text_loss": 0.32850381731987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9721162312885236, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.000838, + "loss": 0.0649, + "macro_f1": 0.3333333432674408, + "num_tokens": 678055.0, + "repeat_count": 0.0, + "routers_loss": 0.011026890948414803, + "skip_count": 0.0, + "step": 420, + "text_loss": 0.6637290716171265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9815086586439683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000842, + "loss": 0.0771, + "macro_f1": 0.3272727429866791, + "num_tokens": 680979.0, + "repeat_count": 0.0, + "routers_loss": 0.07451887428760529, + "skip_count": 1.0, + "step": 422, + "text_loss": 0.27131685614585876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.990901085999413, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000846, + "loss": 0.0714, + "macro_f1": 0.32098764181137085, + "num_tokens": 684144.0, + "repeat_count": 1.0, + "routers_loss": 0.11341800540685654, + "skip_count": 1.0, + "step": 424, + "text_loss": 0.652126669883728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.00085, + "loss": 0.0754, + "macro_f1": 0.3272727429866791, + "num_tokens": 687004.0, + "repeat_count": 1.0, + "routers_loss": 0.08985847979784012, + "skip_count": 0.0, + "step": 426, + "text_loss": 0.2589428424835205 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23828125, + "learning_rate": 0.000854, + "loss": 0.0866, + "macro_f1": 0.3333333432674408, + "num_tokens": 689702.0, + "repeat_count": 0.0, + "routers_loss": 0.011355436407029629, + "skip_count": 0.0, + "step": 428, + "text_loss": 0.8909716010093689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0187848547108893, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.000858, + "loss": 0.0623, + "macro_f1": 0.3333333432674408, + "num_tokens": 692698.0, + "repeat_count": 0.0, + "routers_loss": 0.013788948766887188, + "skip_count": 0.0, + "step": 430, + "text_loss": 0.19141142070293427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.028177282066334, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.000862, + "loss": 0.0499, + "macro_f1": 0.32098764181137085, + "num_tokens": 696007.0, + "repeat_count": 0.0, + "routers_loss": 0.07998392730951309, + "skip_count": 2.0, + "step": 432, + "text_loss": 0.1611809879541397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0375697094217786, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.000866, + "loss": 0.0541, + "macro_f1": 0.32098764181137085, + "num_tokens": 700271.0, + "repeat_count": 0.0, + "routers_loss": 0.06988382339477539, + "skip_count": 2.0, + "step": 434, + "text_loss": 0.37254223227500916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0469621367772235, + "f1_execute": 0.8333333730697632, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.00087, + "loss": 0.0834, + "macro_f1": 0.2777777910232544, + "num_tokens": 703519.0, + "repeat_count": 3.0, + "routers_loss": 0.28240787982940674, + "skip_count": 5.0, + "step": 436, + "text_loss": 0.29636648297309875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.056354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.423828125, + "learning_rate": 0.000874, + "loss": 0.0657, + "macro_f1": 0.3333333432674408, + "num_tokens": 706826.0, + "repeat_count": 0.0, + "routers_loss": 0.013924967497587204, + "skip_count": 0.0, + "step": 438, + "text_loss": 0.20867908000946045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.065746991488113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000878, + "loss": 0.0657, + "macro_f1": 0.3333333432674408, + "num_tokens": 710530.0, + "repeat_count": 0.0, + "routers_loss": 0.01170142088085413, + "skip_count": 0.0, + "step": 440, + "text_loss": 0.7273373007774353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.000882, + "loss": 0.076, + "macro_f1": 0.3333333432674408, + "num_tokens": 713503.0, + "repeat_count": 0.0, + "routers_loss": 0.011930872686207294, + "skip_count": 0.0, + "step": 442, + "text_loss": 0.39314430952072144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.084531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0008860000000000001, + "loss": 0.0592, + "macro_f1": 0.3333333432674408, + "num_tokens": 716582.0, + "repeat_count": 0.0, + "routers_loss": 0.008630385622382164, + "skip_count": 0.0, + "step": 444, + "text_loss": 0.5925271511077881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.0939242735544465, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0008900000000000001, + "loss": 0.0811, + "macro_f1": 0.3006536066532135, + "num_tokens": 719941.0, + "repeat_count": 3.0, + "routers_loss": 0.3015584945678711, + "skip_count": 1.0, + "step": 446, + "text_loss": 0.5059905052185059 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.1033167009098914, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.000894, + "loss": 0.0822, + "macro_f1": 0.31446540355682373, + "num_tokens": 723113.0, + "repeat_count": 1.0, + "routers_loss": 0.10897493362426758, + "skip_count": 1.0, + "step": 448, + "text_loss": 0.19616436958312988 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.112709128265336, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.33984375, + "learning_rate": 0.000898, + "loss": 0.0782, + "macro_f1": 0.32098764181137085, + "num_tokens": 726193.0, + "repeat_count": 0.0, + "routers_loss": 0.07236456125974655, + "skip_count": 2.0, + "step": 450, + "text_loss": 0.1773054152727127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1221015556207807, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3203125, + "learning_rate": 0.000902, + "loss": 0.058, + "macro_f1": 0.3272727429866791, + "num_tokens": 729275.0, + "repeat_count": 1.0, + "routers_loss": 0.08184371143579483, + "skip_count": 0.0, + "step": 452, + "text_loss": 0.4927310049533844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1314939829762256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.000906, + "loss": 0.0607, + "macro_f1": 0.3333333432674408, + "num_tokens": 731948.0, + "repeat_count": 0.0, + "routers_loss": 0.014033539220690727, + "skip_count": 0.0, + "step": 454, + "text_loss": 0.4745742678642273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.00091, + "loss": 0.0651, + "macro_f1": 0.3333333432674408, + "num_tokens": 735351.0, + "repeat_count": 0.0, + "routers_loss": 0.0071774693205952644, + "skip_count": 0.0, + "step": 456, + "text_loss": 0.18523462116718292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.150278837687115, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.400390625, + "learning_rate": 0.0009140000000000001, + "loss": 0.0738, + "macro_f1": 0.5492662787437439, + "num_tokens": 738587.0, + "repeat_count": 0.0, + "routers_loss": 0.07781517505645752, + "skip_count": 2.0, + "step": 458, + "text_loss": 0.3459635376930237 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 2.1596712650425594, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0009180000000000001, + "loss": 0.0723, + "macro_f1": 0.3076923191547394, + "num_tokens": 741779.0, + "repeat_count": 0.0, + "routers_loss": 0.09529037028551102, + "skip_count": 2.0, + "step": 460, + "text_loss": 0.20197433233261108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1690636923980042, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.0009220000000000001, + "loss": 0.0519, + "macro_f1": 0.3333333432674408, + "num_tokens": 745355.0, + "repeat_count": 0.0, + "routers_loss": 0.009765669703483582, + "skip_count": 0.0, + "step": 462, + "text_loss": 0.7031404376029968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1784561197534487, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009260000000000001, + "loss": 0.0527, + "macro_f1": 0.3272727429866791, + "num_tokens": 748628.0, + "repeat_count": 0.0, + "routers_loss": 0.03344850242137909, + "skip_count": 1.0, + "step": 464, + "text_loss": 0.21274663507938385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1878485471088935, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.00093, + "loss": 0.0534, + "macro_f1": 0.3076923191547394, + "num_tokens": 751472.0, + "repeat_count": 2.0, + "routers_loss": 0.1354292333126068, + "skip_count": 2.0, + "step": 466, + "text_loss": 0.5350717306137085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.197240974464338, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.000934, + "loss": 0.0598, + "macro_f1": 0.3272727429866791, + "num_tokens": 754479.0, + "repeat_count": 0.0, + "routers_loss": 0.056420840322971344, + "skip_count": 1.0, + "step": 468, + "text_loss": 0.28153330087661743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.206633401819783, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.234375, + "learning_rate": 0.0009379999999999999, + "loss": 0.0597, + "macro_f1": 0.31446540355682373, + "num_tokens": 757872.0, + "repeat_count": 1.0, + "routers_loss": 0.1622387170791626, + "skip_count": 1.0, + "step": 470, + "text_loss": 0.22956843674182892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2160258291752273, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.5, + "learning_rate": 0.000942, + "loss": 0.0953, + "macro_f1": 0.32098764181137085, + "num_tokens": 760468.0, + "repeat_count": 0.0, + "routers_loss": 0.05146972835063934, + "skip_count": 2.0, + "step": 472, + "text_loss": 0.4513966739177704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.225418256530672, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000946, + "loss": 0.0592, + "macro_f1": 0.3272727429866791, + "num_tokens": 763519.0, + "repeat_count": 1.0, + "routers_loss": 0.09022669494152069, + "skip_count": 0.0, + "step": 474, + "text_loss": 0.25758957862854004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.234810683886117, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.00095, + "loss": 0.0498, + "macro_f1": 0.3272727429866791, + "num_tokens": 767391.0, + "repeat_count": 0.0, + "routers_loss": 0.03044828027486801, + "skip_count": 1.0, + "step": 476, + "text_loss": 0.21366681158542633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2442031112415615, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.291015625, + "learning_rate": 0.000954, + "loss": 0.0802, + "macro_f1": 0.3272727429866791, + "num_tokens": 770338.0, + "repeat_count": 0.0, + "routers_loss": 0.10397060960531235, + "skip_count": 1.0, + "step": 478, + "text_loss": 1.0396177768707275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.2535955385970063, + "f1_execute": 0.8571429252624512, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000958, + "loss": 0.1099, + "macro_f1": 0.285714328289032, + "num_tokens": 773699.0, + "repeat_count": 2.0, + "routers_loss": 0.22604143619537354, + "skip_count": 4.0, + "step": 480, + "text_loss": 0.2570283114910126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.2629879659524508, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.146484375, + "learning_rate": 0.000962, + "loss": 0.0667, + "macro_f1": 0.32098767161369324, + "num_tokens": 777473.0, + "repeat_count": 0.0, + "routers_loss": 0.048258859664201736, + "skip_count": 1.0, + "step": 482, + "text_loss": 0.2540103495121002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2723803933078957, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000966, + "loss": 0.0592, + "macro_f1": 0.3333333432674408, + "num_tokens": 780833.0, + "repeat_count": 0.0, + "routers_loss": 0.023018671199679375, + "skip_count": 0.0, + "step": 484, + "text_loss": 0.38524550199508667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.28177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.314453125, + "learning_rate": 0.0009699999999999999, + "loss": 0.0709, + "macro_f1": 0.3272727429866791, + "num_tokens": 783656.0, + "repeat_count": 0.0, + "routers_loss": 0.044845327734947205, + "skip_count": 1.0, + "step": 486, + "text_loss": 0.5859048366546631 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000974, + "loss": 0.0615, + "macro_f1": 0.3333333432674408, + "num_tokens": 787173.0, + "repeat_count": 0.0, + "routers_loss": 0.010898692533373833, + "skip_count": 0.0, + "step": 488, + "text_loss": 0.3456067442893982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3005576753742294, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000978, + "loss": 0.0796, + "macro_f1": 0.32098764181137085, + "num_tokens": 790395.0, + "repeat_count": 0.0, + "routers_loss": 0.06497956812381744, + "skip_count": 2.0, + "step": 490, + "text_loss": 0.3751123249530792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3099501027296743, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000982, + "loss": 0.0772, + "macro_f1": 0.3272727429866791, + "num_tokens": 793137.0, + "repeat_count": 0.0, + "routers_loss": 0.07763728499412537, + "skip_count": 1.0, + "step": 492, + "text_loss": 0.43296709656715393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3193425300851187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009860000000000001, + "loss": 0.0819, + "macro_f1": 0.3333333432674408, + "num_tokens": 796497.0, + "repeat_count": 0.0, + "routers_loss": 0.02127906307578087, + "skip_count": 0.0, + "step": 494, + "text_loss": 0.4841311275959015 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3287349574405636, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.00099, + "loss": 0.073, + "macro_f1": 0.3272727429866791, + "num_tokens": 799361.0, + "repeat_count": 1.0, + "routers_loss": 0.09518691152334213, + "skip_count": 0.0, + "step": 496, + "text_loss": 0.5094487071037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.3381273847960085, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.130859375, + "learning_rate": 0.000994, + "loss": 0.0789, + "macro_f1": 0.5492662787437439, + "num_tokens": 802629.0, + "repeat_count": 0.0, + "routers_loss": 0.0563947930932045, + "skip_count": 2.0, + "step": 498, + "text_loss": 0.42783617973327637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.347519812151453, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.000998, + "loss": 0.0476, + "macro_f1": 0.3272727429866791, + "num_tokens": 805881.0, + "repeat_count": 1.0, + "routers_loss": 0.10570426285266876, + "skip_count": 0.0, + "step": 500, + "text_loss": 0.28395503759384155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.3569122395068973, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009999999760498814, + "loss": 0.0849, + "macro_f1": 0.5492662787437439, + "num_tokens": 809283.0, + "repeat_count": 0.0, + "routers_loss": 0.031202208250761032, + "skip_count": 2.0, + "step": 502, + "text_loss": 0.32970911264419556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.366304666862342, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009999997844489475, + "loss": 0.0574, + "macro_f1": 0.3272727429866791, + "num_tokens": 812440.0, + "repeat_count": 0.0, + "routers_loss": 0.07647835463285446, + "skip_count": 1.0, + "step": 504, + "text_loss": 0.4901447296142578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.375697094217787, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.000999999401247153, + "loss": 0.0668, + "macro_f1": 0.32098764181137085, + "num_tokens": 815716.0, + "repeat_count": 0.0, + "routers_loss": 0.08515176922082901, + "skip_count": 2.0, + "step": 506, + "text_loss": 0.6157599687576294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.0009999988264446445, + "loss": 0.0686, + "macro_f1": 0.3333333432674408, + "num_tokens": 819086.0, + "repeat_count": 0.0, + "routers_loss": 0.00946938619017601, + "skip_count": 0.0, + "step": 508, + "text_loss": 0.5053519010543823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3944819489286764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009999980600416424, + "loss": 0.0574, + "macro_f1": 0.3333333432674408, + "num_tokens": 822268.0, + "repeat_count": 0.0, + "routers_loss": 0.01058756373822689, + "skip_count": 0.0, + "step": 510, + "text_loss": 0.5570021867752075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.403874376284121, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.000999997102038441, + "loss": 0.0678, + "macro_f1": 0.3333333432674408, + "num_tokens": 825728.0, + "repeat_count": 0.0, + "routers_loss": 0.008705209009349346, + "skip_count": 0.0, + "step": 512, + "text_loss": 0.6519040465354919 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4132668036395657, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.0009999959524354064, + "loss": 0.083, + "macro_f1": 0.3272727429866791, + "num_tokens": 829459.0, + "repeat_count": 0.0, + "routers_loss": 0.04024193435907364, + "skip_count": 1.0, + "step": 514, + "text_loss": 0.5290043950080872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.00099999461123298, + "loss": 0.0727, + "macro_f1": 0.3333333432674408, + "num_tokens": 832291.0, + "repeat_count": 0.0, + "routers_loss": 0.015742862597107887, + "skip_count": 0.0, + "step": 516, + "text_loss": 0.7910057902336121 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.432051658350455, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.000999993078431675, + "loss": 0.0759, + "macro_f1": 0.3076923191547394, + "num_tokens": 835399.0, + "repeat_count": 1.0, + "routers_loss": 0.16753782331943512, + "skip_count": 3.0, + "step": 518, + "text_loss": 0.45196083188056946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.4414440857058994, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.236328125, + "learning_rate": 0.0009999913540320792, + "loss": 0.0968, + "macro_f1": 0.31446540355682373, + "num_tokens": 838993.0, + "repeat_count": 0.0, + "routers_loss": 0.09357143193483353, + "skip_count": 2.0, + "step": 520, + "text_loss": 0.5499435663223267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.4508365130613443, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2451171875, + "learning_rate": 0.0009999894380348536, + "loss": 0.0821, + "macro_f1": 0.5492662787437439, + "num_tokens": 842652.0, + "repeat_count": 0.0, + "routers_loss": 0.056803856045007706, + "skip_count": 2.0, + "step": 522, + "text_loss": 0.197520449757576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.4602289404167887, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.2333984375, + "learning_rate": 0.000999987330440732, + "loss": 0.0725, + "macro_f1": 0.4871794879436493, + "num_tokens": 847061.0, + "repeat_count": 0.0, + "routers_loss": 0.08962195366621017, + "skip_count": 3.0, + "step": 524, + "text_loss": 0.27509039640426636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4696213677722336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000999985031250522, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 850780.0, + "repeat_count": 0.0, + "routers_loss": 0.022930558770895004, + "skip_count": 0.0, + "step": 526, + "text_loss": 0.13291706144809723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4790137951276785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.0009999825404651053, + "loss": 0.0614, + "macro_f1": 0.3333333432674408, + "num_tokens": 853886.0, + "repeat_count": 0.0, + "routers_loss": 0.017097990959882736, + "skip_count": 0.0, + "step": 528, + "text_loss": 0.21706295013427734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.0009999798580854356, + "loss": 0.0724, + "macro_f1": 0.3333333432674408, + "num_tokens": 857364.0, + "repeat_count": 0.0, + "routers_loss": 0.02831801027059555, + "skip_count": 0.0, + "step": 530, + "text_loss": 0.9035662412643433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.000999976984112541, + "loss": 0.0674, + "macro_f1": 0.3333333432674408, + "num_tokens": 860661.0, + "repeat_count": 0.0, + "routers_loss": 0.019671892747282982, + "skip_count": 0.0, + "step": 532, + "text_loss": 0.8354863524436951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.5071910771940122, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.2890625, + "learning_rate": 0.0009999739185475231, + "loss": 0.0963, + "macro_f1": 0.47333335876464844, + "num_tokens": 864124.0, + "repeat_count": 2.0, + "routers_loss": 0.21383361518383026, + "skip_count": 3.0, + "step": 534, + "text_loss": 0.23422949016094208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.516583504549457, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0009999706613915565, + "loss": 0.0598, + "macro_f1": 0.32098767161369324, + "num_tokens": 866976.0, + "repeat_count": 0.0, + "routers_loss": 0.07158871740102768, + "skip_count": 1.0, + "step": 536, + "text_loss": 0.11800774186849594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5259759319049016, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.0009999672126458894, + "loss": 0.0822, + "macro_f1": 0.3272727429866791, + "num_tokens": 870549.0, + "repeat_count": 0.0, + "routers_loss": 0.08185924589633942, + "skip_count": 1.0, + "step": 538, + "text_loss": 0.19232480227947235 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5353683592603464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.000999963572311843, + "loss": 0.0604, + "macro_f1": 0.3333333432674408, + "num_tokens": 873733.0, + "repeat_count": 0.0, + "routers_loss": 0.01633382774889469, + "skip_count": 0.0, + "step": 540, + "text_loss": 0.3725031912326813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.544760786615791, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009999597403908128, + "loss": 0.0761, + "macro_f1": 0.3272727429866791, + "num_tokens": 877099.0, + "repeat_count": 0.0, + "routers_loss": 0.0782657191157341, + "skip_count": 1.0, + "step": 542, + "text_loss": 0.17589199542999268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.5541532139712357, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, + "learning_rate": 0.0009999557168842669, + "loss": 0.0716, + "macro_f1": 0.5492662787437439, + "num_tokens": 879883.0, + "repeat_count": 0.0, + "routers_loss": 0.05275818333029747, + "skip_count": 2.0, + "step": 544, + "text_loss": 0.26448264718055725 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.56354564132668, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0009999515017937468, + "loss": 0.071, + "macro_f1": 0.32098764181137085, + "num_tokens": 882223.0, + "repeat_count": 0.0, + "routers_loss": 0.09335892647504807, + "skip_count": 2.0, + "step": 546, + "text_loss": 0.208544060587883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.572938068682125, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.376953125, + "learning_rate": 0.0009999470951208684, + "loss": 0.0855, + "macro_f1": 0.32098764181137085, + "num_tokens": 885241.0, + "repeat_count": 2.0, + "routers_loss": 0.22983254492282867, + "skip_count": 0.0, + "step": 548, + "text_loss": 0.6612338423728943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.58233049603757, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.00099994249686732, + "loss": 0.0786, + "macro_f1": 0.3272727429866791, + "num_tokens": 887897.0, + "repeat_count": 1.0, + "routers_loss": 0.12858282029628754, + "skip_count": 0.0, + "step": 550, + "text_loss": 0.4673548936843872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5917229233930144, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009999377070348638, + "loss": 0.0944, + "macro_f1": 0.3333333432674408, + "num_tokens": 891224.0, + "repeat_count": 0.0, + "routers_loss": 0.017421770840883255, + "skip_count": 0.0, + "step": 552, + "text_loss": 0.6419258117675781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.601115350748459, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000999932725625335, + "loss": 0.0791, + "macro_f1": 0.32098764181137085, + "num_tokens": 894578.0, + "repeat_count": 0.0, + "routers_loss": 0.07890026271343231, + "skip_count": 2.0, + "step": 554, + "text_loss": 0.5970752239227295 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.6105077781039037, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.0009999275526406427, + "loss": 0.0796, + "macro_f1": 0.31446540355682373, + "num_tokens": 897145.0, + "repeat_count": 1.0, + "routers_loss": 0.09836960583925247, + "skip_count": 1.0, + "step": 556, + "text_loss": 0.752425491809845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6199002054593485, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1875, + "learning_rate": 0.0009999221880827693, + "loss": 0.0882, + "macro_f1": 0.3333333432674408, + "num_tokens": 900565.0, + "repeat_count": 0.0, + "routers_loss": 0.017694659531116486, + "skip_count": 0.0, + "step": 558, + "text_loss": 0.195619136095047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.629292632814793, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2021484375, + "learning_rate": 0.0009999166319537703, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 903506.0, + "repeat_count": 0.0, + "routers_loss": 0.019375264644622803, + "skip_count": 0.0, + "step": 560, + "text_loss": 0.4603337347507477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.638685060170238, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.146484375, + "learning_rate": 0.0009999108842557748, + "loss": 0.0953, + "macro_f1": 0.4871794879436493, + "num_tokens": 906380.0, + "repeat_count": 0.0, + "routers_loss": 0.12013207376003265, + "skip_count": 3.0, + "step": 562, + "text_loss": 0.6279402375221252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6480774875256823, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009999049449909854, + "loss": 0.0799, + "macro_f1": 0.3272727429866791, + "num_tokens": 909116.0, + "repeat_count": 0.0, + "routers_loss": 0.06441342830657959, + "skip_count": 1.0, + "step": 564, + "text_loss": 0.23741699755191803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.657469914881127, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009998988141616781, + "loss": 0.064, + "macro_f1": 0.32098767161369324, + "num_tokens": 912189.0, + "repeat_count": 0.0, + "routers_loss": 0.08309414982795715, + "skip_count": 1.0, + "step": 566, + "text_loss": 0.27780941128730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6668623422365716, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009998924917702023, + "loss": 0.0876, + "macro_f1": 0.3272727429866791, + "num_tokens": 916279.0, + "repeat_count": 1.0, + "routers_loss": 0.07197169959545135, + "skip_count": 0.0, + "step": 568, + "text_loss": 0.6371755599975586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6762547695920165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2255859375, + "learning_rate": 0.0009998859778189806, + "loss": 0.0706, + "macro_f1": 0.3333333432674408, + "num_tokens": 919490.0, + "repeat_count": 0.0, + "routers_loss": 0.008022273890674114, + "skip_count": 0.0, + "step": 570, + "text_loss": 0.6028938889503479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6856471969474613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.000999879272310509, + "loss": 0.084, + "macro_f1": 0.3333333432674408, + "num_tokens": 923694.0, + "repeat_count": 0.0, + "routers_loss": 0.01634674146771431, + "skip_count": 0.0, + "step": 572, + "text_loss": 0.7177054286003113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.0009998723752473574, + "loss": 0.0716, + "macro_f1": 0.3272727429866791, + "num_tokens": 926933.0, + "repeat_count": 0.0, + "routers_loss": 0.060559045523405075, + "skip_count": 1.0, + "step": 574, + "text_loss": 0.5203254818916321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.0009998652866321687, + "loss": 0.0801, + "macro_f1": 0.3333333432674408, + "num_tokens": 929832.0, + "repeat_count": 0.0, + "routers_loss": 0.011485611088573933, + "skip_count": 0.0, + "step": 576, + "text_loss": 0.6147452592849731 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.713824479013795, + "f1_execute": 0.8799999952316284, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.000999858006467659, + "loss": 0.0649, + "macro_f1": 0.29333335161209106, + "num_tokens": 933266.0, + "repeat_count": 2.0, + "routers_loss": 0.2929030954837799, + "skip_count": 4.0, + "step": 578, + "text_loss": 0.1720666140317917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.72321690636924, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.24609375, + "learning_rate": 0.0009998505347566186, + "loss": 0.0782, + "macro_f1": 0.32098764181137085, + "num_tokens": 937545.0, + "repeat_count": 0.0, + "routers_loss": 0.053780000656843185, + "skip_count": 2.0, + "step": 580, + "text_loss": 0.3258405327796936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7326093337246844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.00099984287150191, + "loss": 0.0582, + "macro_f1": 0.3333333432674408, + "num_tokens": 941001.0, + "repeat_count": 0.0, + "routers_loss": 0.02637636847794056, + "skip_count": 0.0, + "step": 582, + "text_loss": 0.23762771487236023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7420017610801293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009998350167064705, + "loss": 0.0672, + "macro_f1": 0.3333333432674408, + "num_tokens": 943989.0, + "repeat_count": 0.0, + "routers_loss": 0.01637580618262291, + "skip_count": 0.0, + "step": 584, + "text_loss": 0.7460582852363586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7513941884355737, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009998269703733096, + "loss": 0.0686, + "macro_f1": 0.3272727429866791, + "num_tokens": 947245.0, + "repeat_count": 1.0, + "routers_loss": 0.13934117555618286, + "skip_count": 0.0, + "step": 586, + "text_loss": 0.5284690260887146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7607866157910186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.13671875, + "learning_rate": 0.0009998187325055106, + "loss": 0.0667, + "macro_f1": 0.3333333432674408, + "num_tokens": 950116.0, + "repeat_count": 0.0, + "routers_loss": 0.02138397842645645, + "skip_count": 0.0, + "step": 588, + "text_loss": 0.3920256197452545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009998103031062305, + "loss": 0.0778, + "macro_f1": 0.3333333432674408, + "num_tokens": 953277.0, + "repeat_count": 0.0, + "routers_loss": 0.007098200265318155, + "skip_count": 0.0, + "step": 590, + "text_loss": 0.7472905516624451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.779571470501908, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.318359375, + "learning_rate": 0.0009998016821786994, + "loss": 0.0872, + "macro_f1": 0.32098764181137085, + "num_tokens": 958229.0, + "repeat_count": 1.0, + "routers_loss": 0.07946522533893585, + "skip_count": 1.0, + "step": 592, + "text_loss": 0.5506448745727539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7889638978573528, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.000999792869726221, + "loss": 0.0523, + "macro_f1": 0.3272727429866791, + "num_tokens": 961016.0, + "repeat_count": 0.0, + "routers_loss": 0.0850791186094284, + "skip_count": 1.0, + "step": 594, + "text_loss": 0.3824431002140045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009997838657521717, + "loss": 0.0632, + "macro_f1": 0.3333333432674408, + "num_tokens": 963847.0, + "repeat_count": 0.0, + "routers_loss": 0.016370445489883423, + "skip_count": 0.0, + "step": 596, + "text_loss": 0.2139475792646408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.8077487525682416, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009997746702600026, + "loss": 0.0702, + "macro_f1": 0.307692289352417, + "num_tokens": 966619.0, + "repeat_count": 0.0, + "routers_loss": 0.1310746818780899, + "skip_count": 3.0, + "step": 598, + "text_loss": 0.3651018440723419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8171411799236865, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23828125, + "learning_rate": 0.0009997652832532372, + "loss": 0.0792, + "macro_f1": 0.3272727429866791, + "num_tokens": 970418.0, + "repeat_count": 1.0, + "routers_loss": 0.14303378760814667, + "skip_count": 0.0, + "step": 600, + "text_loss": 0.7094736099243164 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8265336072791314, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009997557047354722, + "loss": 0.0531, + "macro_f1": 0.3272727429866791, + "num_tokens": 973491.0, + "repeat_count": 0.0, + "routers_loss": 0.03334212675690651, + "skip_count": 1.0, + "step": 602, + "text_loss": 0.4812237024307251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.835926034634576, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.0009997459347103783, + "loss": 0.0956, + "macro_f1": 0.3272727429866791, + "num_tokens": 976672.0, + "repeat_count": 0.0, + "routers_loss": 0.02831871062517166, + "skip_count": 0.0, + "step": 604, + "text_loss": 0.21737146377563477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8453184619900207, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009997359731816998, + "loss": 0.0646, + "macro_f1": 0.3333333432674408, + "num_tokens": 979898.0, + "repeat_count": 0.0, + "routers_loss": 0.017968013882637024, + "skip_count": 0.0, + "step": 606, + "text_loss": 0.5458008050918579 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.854710889345465, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.224609375, + "learning_rate": 0.0009997258201532536, + "loss": 0.0751, + "macro_f1": 0.3333333432674408, + "num_tokens": 982811.0, + "repeat_count": 0.0, + "routers_loss": 0.016256732866168022, + "skip_count": 0.0, + "step": 608, + "text_loss": 0.8643257021903992 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009997154756289303, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 985245.0, + "repeat_count": 0.0, + "routers_loss": 0.021214161068201065, + "skip_count": 0.0, + "step": 610, + "text_loss": 0.2204967886209488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8734957440563544, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.000999704939612694, + "loss": 0.0636, + "macro_f1": 0.3006536364555359, + "num_tokens": 988539.0, + "repeat_count": 3.0, + "routers_loss": 0.23249399662017822, + "skip_count": 2.0, + "step": 612, + "text_loss": 0.32489025592803955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8828881714117993, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009996942121085824, + "loss": 0.0445, + "macro_f1": 0.3333333432674408, + "num_tokens": 991660.0, + "repeat_count": 0.0, + "routers_loss": 0.010706410743296146, + "skip_count": 0.0, + "step": 614, + "text_loss": 0.4551754891872406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8922805987672437, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3671875, + "learning_rate": 0.000999683293120706, + "loss": 0.1016, + "macro_f1": 0.3333333432674408, + "num_tokens": 994828.0, + "repeat_count": 0.0, + "routers_loss": 0.006676184479147196, + "skip_count": 0.0, + "step": 616, + "text_loss": 0.6212068200111389 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9016730261226886, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.408203125, + "learning_rate": 0.0009996721826532491, + "loss": 0.0976, + "macro_f1": 0.3076923191547394, + "num_tokens": 997951.0, + "repeat_count": 2.0, + "routers_loss": 0.2148125320672989, + "skip_count": 2.0, + "step": 618, + "text_loss": 0.26514527201652527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.911065453478133, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1904296875, + "learning_rate": 0.000999660880710469, + "loss": 0.0909, + "macro_f1": 0.3333333432674408, + "num_tokens": 1001139.0, + "repeat_count": 0.0, + "routers_loss": 0.022332455962896347, + "skip_count": 0.0, + "step": 620, + "text_loss": 0.26131340861320496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.920457880833578, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009996493872966971, + "loss": 0.0732, + "macro_f1": 0.3272727429866791, + "num_tokens": 1003678.0, + "repeat_count": 1.0, + "routers_loss": 0.08348730951547623, + "skip_count": 0.0, + "step": 622, + "text_loss": 0.19151706993579865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009996377024163374, + "loss": 0.0822, + "macro_f1": 0.3333333432674408, + "num_tokens": 1007082.0, + "repeat_count": 0.0, + "routers_loss": 0.028577150776982307, + "skip_count": 0.0, + "step": 624, + "text_loss": 0.305387407541275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9392427355444672, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0009996258260738676, + "loss": 0.0892, + "macro_f1": 0.3272727429866791, + "num_tokens": 1010064.0, + "repeat_count": 1.0, + "routers_loss": 0.08312026411294937, + "skip_count": 0.0, + "step": 626, + "text_loss": 0.49436143040657043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9486351628999117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009996137582738388, + "loss": 0.0591, + "macro_f1": 0.3333333432674408, + "num_tokens": 1013462.0, + "repeat_count": 0.0, + "routers_loss": 0.013337327167391777, + "skip_count": 0.0, + "step": 628, + "text_loss": 0.6515294313430786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9580275902553566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000999601499020875, + "loss": 0.0537, + "macro_f1": 0.3333333432674408, + "num_tokens": 1016246.0, + "repeat_count": 0.0, + "routers_loss": 0.029126765206456184, + "skip_count": 0.0, + "step": 630, + "text_loss": 0.18834827840328217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9674200176108014, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009995890483196746, + "loss": 0.0602, + "macro_f1": 0.3272727429866791, + "num_tokens": 1019286.0, + "repeat_count": 0.0, + "routers_loss": 0.054844800382852554, + "skip_count": 1.0, + "step": 632, + "text_loss": 0.6988179087638855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.322265625, + "learning_rate": 0.0009995764061750086, + "loss": 0.0767, + "macro_f1": 0.3333333432674408, + "num_tokens": 1022207.0, + "repeat_count": 0.0, + "routers_loss": 0.010095693171024323, + "skip_count": 0.0, + "step": 634, + "text_loss": 0.558451771736145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9862048723216907, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.000999563572591721, + "loss": 0.0521, + "macro_f1": 0.32098764181137085, + "num_tokens": 1025319.0, + "repeat_count": 1.0, + "routers_loss": 0.0698433518409729, + "skip_count": 1.0, + "step": 636, + "text_loss": 0.5961872935295105 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.995597299677135, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009995505475747302, + "loss": 0.0849, + "macro_f1": 0.3272727429866791, + "num_tokens": 1028362.0, + "repeat_count": 0.0, + "routers_loss": 0.040211405605077744, + "skip_count": 1.0, + "step": 638, + "text_loss": 0.546863317489624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.004696213677722, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.0009995373311290272, + "loss": 0.0709, + "macro_f1": 0.3144654333591461, + "num_tokens": 1032199.0, + "repeat_count": 2.0, + "routers_loss": 0.1457643061876297, + "skip_count": 1.0, + "step": 640, + "text_loss": 0.2137298285961151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009995239232596764, + "loss": 0.0545, + "macro_f1": 0.3333333432674408, + "num_tokens": 1035801.0, + "repeat_count": 0.0, + "routers_loss": 0.011394930072128773, + "skip_count": 0.0, + "step": 642, + "text_loss": 0.43054503202438354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0234810683886115, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009995103239718163, + "loss": 0.0665, + "macro_f1": 0.3333333432674408, + "num_tokens": 1039223.0, + "repeat_count": 0.0, + "routers_loss": 0.00997432041913271, + "skip_count": 0.0, + "step": 644, + "text_loss": 0.7749615907669067 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0328734957440564, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009994965332706573, + "loss": 0.0755, + "macro_f1": 0.3144654333591461, + "num_tokens": 1042154.0, + "repeat_count": 3.0, + "routers_loss": 0.10589150339365005, + "skip_count": 0.0, + "step": 646, + "text_loss": 0.7812211513519287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.042265923099501, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.0009994825511614846, + "loss": 0.0383, + "macro_f1": 0.3272727429866791, + "num_tokens": 1045250.0, + "repeat_count": 0.0, + "routers_loss": 0.0748734176158905, + "skip_count": 1.0, + "step": 648, + "text_loss": 0.844803512096405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0516583504549457, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1220703125, + "learning_rate": 0.0009994683776496562, + "loss": 0.0433, + "macro_f1": 0.3272727429866791, + "num_tokens": 1048446.0, + "repeat_count": 0.0, + "routers_loss": 0.03742415830492973, + "skip_count": 1.0, + "step": 650, + "text_loss": 0.2098839282989502 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0610507778103906, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009994540127406034, + "loss": 0.0591, + "macro_f1": 0.32098764181137085, + "num_tokens": 1051840.0, + "repeat_count": 0.0, + "routers_loss": 0.06025516986846924, + "skip_count": 2.0, + "step": 652, + "text_loss": 0.27727583050727844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.070443205165835, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.181640625, + "learning_rate": 0.0009994394564398306, + "loss": 0.0519, + "macro_f1": 0.521541953086853, + "num_tokens": 1055142.0, + "repeat_count": 4.0, + "routers_loss": 0.22807340323925018, + "skip_count": 2.0, + "step": 654, + "text_loss": 0.9672397971153259 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009994247087529158, + "loss": 0.0618, + "macro_f1": 0.3333333432674408, + "num_tokens": 1057698.0, + "repeat_count": 0.0, + "routers_loss": 0.01348950993269682, + "skip_count": 0.0, + "step": 656, + "text_loss": 0.6375506520271301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.0009994097696855106, + "loss": 0.0412, + "macro_f1": 0.3333333432674408, + "num_tokens": 1060624.0, + "repeat_count": 0.0, + "routers_loss": 0.009649243205785751, + "skip_count": 0.0, + "step": 658, + "text_loss": 0.5315385460853577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.098620487232169, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2041015625, + "learning_rate": 0.0009993946392433395, + "loss": 0.0609, + "macro_f1": 0.307692289352417, + "num_tokens": 1065076.0, + "repeat_count": 0.0, + "routers_loss": 0.1250980943441391, + "skip_count": 3.0, + "step": 660, + "text_loss": 0.25780341029167175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1080129145876136, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009993793174322006, + "loss": 0.0471, + "macro_f1": 0.3333333432674408, + "num_tokens": 1068365.0, + "repeat_count": 0.0, + "routers_loss": 0.011544390581548214, + "skip_count": 0.0, + "step": 662, + "text_loss": 0.34876301884651184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1174053419430585, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009993638042579654, + "loss": 0.0473, + "macro_f1": 0.3272727429866791, + "num_tokens": 1071693.0, + "repeat_count": 0.0, + "routers_loss": 0.03777370601892471, + "skip_count": 1.0, + "step": 664, + "text_loss": 0.21811571717262268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.126797769298503, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.203125, + "learning_rate": 0.0009993480997265783, + "loss": 0.0475, + "macro_f1": 0.5492662787437439, + "num_tokens": 1074733.0, + "repeat_count": 0.0, + "routers_loss": 0.049949806183576584, + "skip_count": 2.0, + "step": 666, + "text_loss": 0.38410288095474243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.136190196653948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10302734375, + "learning_rate": 0.0009993322038440572, + "loss": 0.0605, + "macro_f1": 0.3333333432674408, + "num_tokens": 1077993.0, + "repeat_count": 0.0, + "routers_loss": 0.0247171800583601, + "skip_count": 0.0, + "step": 668, + "text_loss": 0.25576895475387573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1455826240093923, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.000999316116616494, + "loss": 0.0619, + "macro_f1": 0.3333333432674408, + "num_tokens": 1080491.0, + "repeat_count": 0.0, + "routers_loss": 0.008118715137243271, + "skip_count": 0.0, + "step": 670, + "text_loss": 0.6269792914390564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.154975051364837, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009992998380500527, + "loss": 0.0462, + "macro_f1": 0.3272727429866791, + "num_tokens": 1083817.0, + "repeat_count": 0.0, + "routers_loss": 0.03366057574748993, + "skip_count": 1.0, + "step": 672, + "text_loss": 0.26891493797302246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1643674787202816, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009992833681509716, + "loss": 0.0529, + "macro_f1": 0.3333333432674408, + "num_tokens": 1087368.0, + "repeat_count": 0.0, + "routers_loss": 0.020552074536681175, + "skip_count": 0.0, + "step": 674, + "text_loss": 0.14421936869621277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.1737599060757264, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.18359375, + "learning_rate": 0.0009992667069255619, + "loss": 0.0696, + "macro_f1": 0.31446540355682373, + "num_tokens": 1090452.0, + "repeat_count": 0.0, + "routers_loss": 0.06937336176633835, + "skip_count": 2.0, + "step": 676, + "text_loss": 0.24999259412288666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1831523334311713, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08740234375, + "learning_rate": 0.0009992498543802085, + "loss": 0.0588, + "macro_f1": 0.3272727429866791, + "num_tokens": 1093996.0, + "repeat_count": 1.0, + "routers_loss": 0.0380021296441555, + "skip_count": 0.0, + "step": 678, + "text_loss": 0.42473849654197693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 3.1925447607866158, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.2119140625, + "learning_rate": 0.0009992328105213688, + "loss": 0.0411, + "macro_f1": 0.4400000274181366, + "num_tokens": 1096837.0, + "repeat_count": 1.0, + "routers_loss": 0.20885063707828522, + "skip_count": 4.0, + "step": 680, + "text_loss": 0.3829527199268341 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.2019371881420606, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009992155753555747, + "loss": 0.0722, + "macro_f1": 0.5492662787437439, + "num_tokens": 1100320.0, + "repeat_count": 0.0, + "routers_loss": 0.018230699002742767, + "skip_count": 2.0, + "step": 682, + "text_loss": 0.6190969944000244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.211329615497505, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30859375, + "learning_rate": 0.0009991981488894303, + "loss": 0.0681, + "macro_f1": 0.32098767161369324, + "num_tokens": 1103682.0, + "repeat_count": 0.0, + "routers_loss": 0.05550144240260124, + "skip_count": 1.0, + "step": 684, + "text_loss": 0.44418027997016907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.22072204285295, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.0009991805311296133, + "loss": 0.0507, + "macro_f1": 0.32098764181137085, + "num_tokens": 1106427.0, + "repeat_count": 0.0, + "routers_loss": 0.07990608364343643, + "skip_count": 2.0, + "step": 686, + "text_loss": 0.5577231645584106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2301144702083944, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009991627220828753, + "loss": 0.0568, + "macro_f1": 0.32098764181137085, + "num_tokens": 1109314.0, + "repeat_count": 0.0, + "routers_loss": 0.05167485028505325, + "skip_count": 2.0, + "step": 688, + "text_loss": 0.27325430512428284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.2395068975638392, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009991447217560408, + "loss": 0.0521, + "macro_f1": 0.5492662787437439, + "num_tokens": 1112748.0, + "repeat_count": 0.0, + "routers_loss": 0.04621964320540428, + "skip_count": 2.0, + "step": 690, + "text_loss": 0.5288321375846863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.2488993249192837, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.000999126530156007, + "loss": 0.0499, + "macro_f1": 0.307692289352417, + "num_tokens": 1116965.0, + "repeat_count": 1.0, + "routers_loss": 0.11950276792049408, + "skip_count": 2.0, + "step": 692, + "text_loss": 0.14215624332427979 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2582917522747286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.0009991081472897454, + "loss": 0.0722, + "macro_f1": 0.3333333432674408, + "num_tokens": 1120570.0, + "repeat_count": 0.0, + "routers_loss": 0.01905500330030918, + "skip_count": 0.0, + "step": 694, + "text_loss": 0.41862696409225464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.267684179630173, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009990895731643002, + "loss": 0.0464, + "macro_f1": 0.3272727429866791, + "num_tokens": 1124009.0, + "repeat_count": 1.0, + "routers_loss": 0.06974572688341141, + "skip_count": 0.0, + "step": 696, + "text_loss": 0.41160130500793457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.277076606985618, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.000999070807786789, + "loss": 0.0531, + "macro_f1": 0.3272727429866791, + "num_tokens": 1127370.0, + "repeat_count": 1.0, + "routers_loss": 0.07055293023586273, + "skip_count": 0.0, + "step": 698, + "text_loss": 0.48068273067474365 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2864690343410627, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000999051851164403, + "loss": 0.0619, + "macro_f1": 0.32098764181137085, + "num_tokens": 1130234.0, + "repeat_count": 1.0, + "routers_loss": 0.12506946921348572, + "skip_count": 1.0, + "step": 700, + "text_loss": 0.47925490140914917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000999032703304406, + "loss": 0.0674, + "macro_f1": 0.3333333432674408, + "num_tokens": 1132874.0, + "repeat_count": 0.0, + "routers_loss": 0.00809287466108799, + "skip_count": 0.0, + "step": 702, + "text_loss": 0.47433632612228394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.305253889051952, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009990133642141358, + "loss": 0.0497, + "macro_f1": 0.5492662787437439, + "num_tokens": 1136011.0, + "repeat_count": 0.0, + "routers_loss": 0.0319170281291008, + "skip_count": 2.0, + "step": 704, + "text_loss": 0.6574832201004028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3146463164073965, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.33984375, + "learning_rate": 0.000998993833901003, + "loss": 0.0619, + "macro_f1": 0.32098764181137085, + "num_tokens": 1139674.0, + "repeat_count": 0.0, + "routers_loss": 0.09850362688302994, + "skip_count": 2.0, + "step": 706, + "text_loss": 0.7660127282142639 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3240387437628414, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009989741123724919, + "loss": 0.0574, + "macro_f1": 0.3333333432674408, + "num_tokens": 1143558.0, + "repeat_count": 0.0, + "routers_loss": 0.006673311349004507, + "skip_count": 0.0, + "step": 708, + "text_loss": 0.5976111888885498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009989541996361594, + "loss": 0.045, + "macro_f1": 0.3333333432674408, + "num_tokens": 1146122.0, + "repeat_count": 0.0, + "routers_loss": 0.004988791421055794, + "skip_count": 0.0, + "step": 710, + "text_loss": 0.5256119966506958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3428235984737307, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009989340956996367, + "loss": 0.0528, + "macro_f1": 0.3333333432674408, + "num_tokens": 1149546.0, + "repeat_count": 0.0, + "routers_loss": 0.0067769973538815975, + "skip_count": 0.0, + "step": 712, + "text_loss": 0.5040497779846191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.352216025829175, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.0009989138005706273, + "loss": 0.0735, + "macro_f1": 0.32098764181137085, + "num_tokens": 1153195.0, + "repeat_count": 0.0, + "routers_loss": 0.09899546951055527, + "skip_count": 2.0, + "step": 714, + "text_loss": 0.20803412795066833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.000998893314256908, + "loss": 0.064, + "macro_f1": 0.3333333432674408, + "num_tokens": 1157081.0, + "repeat_count": 0.0, + "routers_loss": 0.010492355562746525, + "skip_count": 0.0, + "step": 716, + "text_loss": 0.23077639937400818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3710008805400644, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009988726367663298, + "loss": 0.0539, + "macro_f1": 0.3333333432674408, + "num_tokens": 1160079.0, + "repeat_count": 0.0, + "routers_loss": 0.01063773687928915, + "skip_count": 0.0, + "step": 718, + "text_loss": 0.6085864901542664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3803933078955093, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009988517681068163, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1163249.0, + "repeat_count": 1.0, + "routers_loss": 0.05981874838471413, + "skip_count": 0.0, + "step": 720, + "text_loss": 0.4047050476074219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3897857352509537, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009988307082863638, + "loss": 0.0361, + "macro_f1": 0.3333333432674408, + "num_tokens": 1166259.0, + "repeat_count": 0.0, + "routers_loss": 0.009750043973326683, + "skip_count": 0.0, + "step": 722, + "text_loss": 0.5306474566459656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.3991781626063986, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.240234375, + "learning_rate": 0.0009988094573130434, + "loss": 0.063, + "macro_f1": 0.5359477400779724, + "num_tokens": 1168887.0, + "repeat_count": 2.0, + "routers_loss": 0.18601104617118835, + "skip_count": 2.0, + "step": 724, + "text_loss": 0.53528892993927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.408570589961843, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009987880151949974, + "loss": 0.0496, + "macro_f1": 0.3272727429866791, + "num_tokens": 1172625.0, + "repeat_count": 0.0, + "routers_loss": 0.02845010720193386, + "skip_count": 1.0, + "step": 726, + "text_loss": 0.4760453701019287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.417963017317288, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, + "learning_rate": 0.0009987663819404434, + "loss": 0.06, + "macro_f1": 0.5492662787437439, + "num_tokens": 1176580.0, + "repeat_count": 0.0, + "routers_loss": 0.017596980556845665, + "skip_count": 2.0, + "step": 728, + "text_loss": 0.5146099328994751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.427355444672733, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000998744557557671, + "loss": 0.0484, + "macro_f1": 0.3272727429866791, + "num_tokens": 1179804.0, + "repeat_count": 0.0, + "routers_loss": 0.0625474750995636, + "skip_count": 1.0, + "step": 730, + "text_loss": 0.27738022804260254 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.436747872028177, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.0009987225420550433, + "loss": 0.0796, + "macro_f1": 0.307692289352417, + "num_tokens": 1182658.0, + "repeat_count": 1.0, + "routers_loss": 0.16188351809978485, + "skip_count": 2.0, + "step": 732, + "text_loss": 0.23231445252895355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2001953125, + "learning_rate": 0.0009987003354409965, + "loss": 0.0626, + "macro_f1": 0.3272727429866791, + "num_tokens": 1185451.0, + "repeat_count": 0.0, + "routers_loss": 0.02391529455780983, + "skip_count": 0.0, + "step": 734, + "text_loss": 0.4496627151966095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.4555327267390665, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.234375, + "learning_rate": 0.0009986779377240405, + "loss": 0.0513, + "macro_f1": 0.32098767161369324, + "num_tokens": 1188666.0, + "repeat_count": 0.0, + "routers_loss": 0.08435963839292526, + "skip_count": 1.0, + "step": 736, + "text_loss": 0.4950787127017975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.4649251540945114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1220703125, + "learning_rate": 0.000998655348912758, + "loss": 0.0515, + "macro_f1": 0.3333333432674408, + "num_tokens": 1193035.0, + "repeat_count": 0.0, + "routers_loss": 0.01648722216486931, + "skip_count": 0.0, + "step": 738, + "text_loss": 0.24761848151683807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.0009986325690158051, + "loss": 0.0435, + "macro_f1": 0.3333333432674408, + "num_tokens": 1196840.0, + "repeat_count": 0.0, + "routers_loss": 0.013143910095095634, + "skip_count": 0.0, + "step": 740, + "text_loss": 0.15662719309329987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.4837100088054007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009986095980419113, + "loss": 0.0757, + "macro_f1": 0.3333333432674408, + "num_tokens": 1200573.0, + "repeat_count": 0.0, + "routers_loss": 0.026706280186772346, + "skip_count": 0.0, + "step": 742, + "text_loss": 0.16725164651870728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.493102436160845, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1982421875, + "learning_rate": 0.0009985864359998787, + "loss": 0.0795, + "macro_f1": 0.3006536364555359, + "num_tokens": 1203589.0, + "repeat_count": 2.0, + "routers_loss": 0.28607678413391113, + "skip_count": 3.0, + "step": 744, + "text_loss": 0.6350882053375244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009985630828985835, + "loss": 0.0572, + "macro_f1": 0.3272727429866791, + "num_tokens": 1206422.0, + "repeat_count": 0.0, + "routers_loss": 0.05685260891914368, + "skip_count": 1.0, + "step": 746, + "text_loss": 0.33779552578926086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.5118872908717345, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009985395387469742, + "loss": 0.0458, + "macro_f1": 0.5492662787437439, + "num_tokens": 1211588.0, + "repeat_count": 0.0, + "routers_loss": 0.0437830351293087, + "skip_count": 2.0, + "step": 748, + "text_loss": 0.28664472699165344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5212797182271793, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009985158035540735, + "loss": 0.0714, + "macro_f1": 0.32098764181137085, + "num_tokens": 1214580.0, + "repeat_count": 2.0, + "routers_loss": 0.07074898481369019, + "skip_count": 0.0, + "step": 750, + "text_loss": 0.3939313292503357 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.0009984918773289762, + "loss": 0.0699, + "macro_f1": 0.3333333432674408, + "num_tokens": 1217388.0, + "repeat_count": 0.0, + "routers_loss": 0.009757856838405132, + "skip_count": 0.0, + "step": 752, + "text_loss": 0.37641215324401855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5400645729380686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009984677600808512, + "loss": 0.054, + "macro_f1": 0.3333333432674408, + "num_tokens": 1219960.0, + "repeat_count": 0.0, + "routers_loss": 0.02515069581568241, + "skip_count": 0.0, + "step": 754, + "text_loss": 0.155938982963562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5494570002935135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30078125, + "learning_rate": 0.0009984434518189405, + "loss": 0.0764, + "macro_f1": 0.3333333432674408, + "num_tokens": 1223234.0, + "repeat_count": 0.0, + "routers_loss": 0.025766927748918533, + "skip_count": 0.0, + "step": 756, + "text_loss": 0.691118061542511 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 3.558849427648958, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009984189525525584, + "loss": 0.0451, + "macro_f1": 0.5359477400779724, + "num_tokens": 1225764.0, + "repeat_count": 2.0, + "routers_loss": 0.1782722771167755, + "skip_count": 2.0, + "step": 758, + "text_loss": 0.3592209219932556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.568241855004403, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.0009983942622910935, + "loss": 0.0659, + "macro_f1": 0.3333333432674408, + "num_tokens": 1230097.0, + "repeat_count": 0.0, + "routers_loss": 0.00825568474829197, + "skip_count": 0.0, + "step": 760, + "text_loss": 0.4646475315093994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5776342823598473, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009983693810440074, + "loss": 0.0477, + "macro_f1": 0.32098764181137085, + "num_tokens": 1233140.0, + "repeat_count": 0.0, + "routers_loss": 0.04156976938247681, + "skip_count": 2.0, + "step": 762, + "text_loss": 0.298682302236557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.587026709715292, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3515625, + "learning_rate": 0.000998344308820834, + "loss": 0.0666, + "macro_f1": 0.3272727429866791, + "num_tokens": 1236305.0, + "repeat_count": 0.0, + "routers_loss": 0.05697929114103317, + "skip_count": 1.0, + "step": 764, + "text_loss": 0.5249121189117432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5964191370707366, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.18359375, + "learning_rate": 0.0009983190456311817, + "loss": 0.0592, + "macro_f1": 0.3144654333591461, + "num_tokens": 1239673.0, + "repeat_count": 0.0, + "routers_loss": 0.09547408670186996, + "skip_count": 3.0, + "step": 766, + "text_loss": 0.41277334094047546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.6058115644261814, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.185546875, + "learning_rate": 0.000998293591484731, + "loss": 0.0484, + "macro_f1": 0.5492662787437439, + "num_tokens": 1242292.0, + "repeat_count": 0.0, + "routers_loss": 0.030693158507347107, + "skip_count": 2.0, + "step": 768, + "text_loss": 0.1583656519651413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000998267946391236, + "loss": 0.051, + "macro_f1": 0.3333333432674408, + "num_tokens": 1244661.0, + "repeat_count": 0.0, + "routers_loss": 0.01211300864815712, + "skip_count": 0.0, + "step": 770, + "text_loss": 0.4629349112510681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6245964191370708, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009982421103605238, + "loss": 0.0441, + "macro_f1": 0.32098764181137085, + "num_tokens": 1248688.0, + "repeat_count": 0.0, + "routers_loss": 0.0665968507528305, + "skip_count": 2.0, + "step": 772, + "text_loss": 0.4019293785095215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6339888464925156, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.000998216083402495, + "loss": 0.0613, + "macro_f1": 0.32098764181137085, + "num_tokens": 1251395.0, + "repeat_count": 0.0, + "routers_loss": 0.07186859846115112, + "skip_count": 2.0, + "step": 774, + "text_loss": 0.4659276604652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.302734375, + "learning_rate": 0.0009981898655271235, + "loss": 0.0488, + "macro_f1": 0.3333333432674408, + "num_tokens": 1254888.0, + "repeat_count": 0.0, + "routers_loss": 0.007823926396667957, + "skip_count": 0.0, + "step": 776, + "text_loss": 0.5160359740257263 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 3.6527737012034045, + "f1_execute": 0.9130434989929199, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009981634567444557, + "loss": 0.0775, + "macro_f1": 0.590062141418457, + "num_tokens": 1258250.0, + "repeat_count": 3.0, + "routers_loss": 0.24624499678611755, + "skip_count": 4.0, + "step": 778, + "text_loss": 0.29319918155670166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6621661285588494, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.0009981368570646115, + "loss": 0.0885, + "macro_f1": 0.3272727429866791, + "num_tokens": 1260916.0, + "repeat_count": 0.0, + "routers_loss": 0.030730176717042923, + "skip_count": 1.0, + "step": 780, + "text_loss": 0.624981164932251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6715585559142943, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009981100664977838, + "loss": 0.0699, + "macro_f1": 0.3333333432674408, + "num_tokens": 1264004.0, + "repeat_count": 0.0, + "routers_loss": 0.006829176563769579, + "skip_count": 0.0, + "step": 782, + "text_loss": 0.6137266159057617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6809509832697387, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009980830850542391, + "loss": 0.058, + "macro_f1": 0.3333333432674408, + "num_tokens": 1267130.0, + "repeat_count": 0.0, + "routers_loss": 0.018471000716090202, + "skip_count": 0.0, + "step": 784, + "text_loss": 0.15213175117969513 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6903434106251836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.0009980559127443166, + "loss": 0.052, + "macro_f1": 0.3333333432674408, + "num_tokens": 1271129.0, + "repeat_count": 0.0, + "routers_loss": 0.007903140969574451, + "skip_count": 0.0, + "step": 786, + "text_loss": 0.5768613219261169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.699735837980628, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.000998028549578429, + "loss": 0.0719, + "macro_f1": 0.307692289352417, + "num_tokens": 1274232.0, + "repeat_count": 0.0, + "routers_loss": 0.06737866252660751, + "skip_count": 3.0, + "step": 788, + "text_loss": 0.2877073585987091 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.709128265336073, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009980009955670615, + "loss": 0.0698, + "macro_f1": 0.3144654333591461, + "num_tokens": 1277193.0, + "repeat_count": 0.0, + "routers_loss": 0.10194934904575348, + "skip_count": 3.0, + "step": 790, + "text_loss": 0.11860492825508118 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7185206926915173, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.000997973250720773, + "loss": 0.0552, + "macro_f1": 0.32098764181137085, + "num_tokens": 1280960.0, + "repeat_count": 0.0, + "routers_loss": 0.10297708213329315, + "skip_count": 2.0, + "step": 792, + "text_loss": 0.13477706909179688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.727913120046962, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009979453150501954, + "loss": 0.0663, + "macro_f1": 0.32098764181137085, + "num_tokens": 1284611.0, + "repeat_count": 1.0, + "routers_loss": 0.06122037023305893, + "skip_count": 1.0, + "step": 794, + "text_loss": 0.40569379925727844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.737305547402407, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000997917188566034, + "loss": 0.062, + "macro_f1": 0.32098764181137085, + "num_tokens": 1287834.0, + "repeat_count": 0.0, + "routers_loss": 0.061135001480579376, + "skip_count": 2.0, + "step": 796, + "text_loss": 0.2829287648200989 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7466979747578515, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.109375, + "learning_rate": 0.0009978888712790664, + "loss": 0.0654, + "macro_f1": 0.3272727429866791, + "num_tokens": 1291666.0, + "repeat_count": 0.0, + "routers_loss": 0.04841872677206993, + "skip_count": 1.0, + "step": 798, + "text_loss": 1.011757254600525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.20000000298023224, + "avg_layers": 27.0, + "epoch": 3.756090402113296, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.3333333134651184, + "grad_norm": 0.14453125, + "learning_rate": 0.0009978603632001444, + "loss": 0.0636, + "macro_f1": 0.4104308485984802, + "num_tokens": 1294627.0, + "repeat_count": 1.0, + "routers_loss": 0.15698759257793427, + "skip_count": 5.0, + "step": 800, + "text_loss": 0.4457623362541199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0009978316643401916, + "loss": 0.0688, + "macro_f1": 0.3333333432674408, + "num_tokens": 1297711.0, + "repeat_count": 0.0, + "routers_loss": 0.018952010199427605, + "skip_count": 0.0, + "step": 802, + "text_loss": 0.2069481462240219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7748752568241857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.14453125, + "learning_rate": 0.0009978027747102062, + "loss": 0.0479, + "macro_f1": 0.3333333432674408, + "num_tokens": 1300569.0, + "repeat_count": 0.0, + "routers_loss": 0.014538386836647987, + "skip_count": 0.0, + "step": 804, + "text_loss": 0.4983852505683899 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.78426768417963, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2109375, + "learning_rate": 0.0009977736943212584, + "loss": 0.0721, + "macro_f1": 0.32098764181137085, + "num_tokens": 1303969.0, + "repeat_count": 0.0, + "routers_loss": 0.11164087057113647, + "skip_count": 2.0, + "step": 806, + "text_loss": 0.2910642921924591 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.793660111535075, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.000997744423184492, + "loss": 0.0424, + "macro_f1": 0.3272727429866791, + "num_tokens": 1307263.0, + "repeat_count": 0.0, + "routers_loss": 0.06073406711220741, + "skip_count": 1.0, + "step": 808, + "text_loss": 0.18831779062747955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 3.8030525388905194, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.26171875, + "learning_rate": 0.0009977149613111236, + "loss": 0.0486, + "macro_f1": 0.4400000274181366, + "num_tokens": 1309953.0, + "repeat_count": 1.0, + "routers_loss": 0.11035524308681488, + "skip_count": 4.0, + "step": 810, + "text_loss": 0.7872759699821472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8124449662459643, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009976853087124433, + "loss": 0.0536, + "macro_f1": 0.3333333432674408, + "num_tokens": 1313243.0, + "repeat_count": 0.0, + "routers_loss": 0.021804286167025566, + "skip_count": 0.0, + "step": 812, + "text_loss": 0.22349292039871216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.8218373936014087, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0009976554653998138, + "loss": 0.0612, + "macro_f1": 0.31446540355682373, + "num_tokens": 1316165.0, + "repeat_count": 0.0, + "routers_loss": 0.10715524107217789, + "skip_count": 2.0, + "step": 814, + "text_loss": 0.18035532534122467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8312298209568536, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000997625431384671, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1319206.0, + "repeat_count": 0.0, + "routers_loss": 0.007173649035394192, + "skip_count": 0.0, + "step": 816, + "text_loss": 0.48928648233413696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8406222483122985, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009975952066785243, + "loss": 0.0655, + "macro_f1": 0.3006536364555359, + "num_tokens": 1322549.0, + "repeat_count": 1.0, + "routers_loss": 0.22308112680912018, + "skip_count": 4.0, + "step": 818, + "text_loss": 0.5211259722709656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009975647912929557, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1325213.0, + "repeat_count": 0.0, + "routers_loss": 0.00998698640614748, + "skip_count": 0.0, + "step": 820, + "text_loss": 0.7117052674293518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8594071030231873, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009975341852396205, + "loss": 0.0723, + "macro_f1": 0.32098764181137085, + "num_tokens": 1328383.0, + "repeat_count": 0.0, + "routers_loss": 0.07454588264226913, + "skip_count": 2.0, + "step": 822, + "text_loss": 0.34539610147476196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8687995303786322, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009975033885302469, + "loss": 0.0604, + "macro_f1": 0.3333333432674408, + "num_tokens": 1331406.0, + "repeat_count": 0.0, + "routers_loss": 0.009157589636743069, + "skip_count": 0.0, + "step": 824, + "text_loss": 0.7484824657440186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.878191957734077, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009974724011766363, + "loss": 0.0474, + "macro_f1": 0.3272727429866791, + "num_tokens": 1334410.0, + "repeat_count": 1.0, + "routers_loss": 0.17149391770362854, + "skip_count": 0.0, + "step": 826, + "text_loss": 0.5913820266723633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8875843850895215, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009974412231906632, + "loss": 0.058, + "macro_f1": 0.32098764181137085, + "num_tokens": 1337653.0, + "repeat_count": 1.0, + "routers_loss": 0.09743282198905945, + "skip_count": 1.0, + "step": 828, + "text_loss": 0.2505693733692169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8969768124449664, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009974098545842748, + "loss": 0.0638, + "macro_f1": 0.3272727429866791, + "num_tokens": 1340860.0, + "repeat_count": 0.0, + "routers_loss": 0.041490405797958374, + "skip_count": 1.0, + "step": 830, + "text_loss": 0.5585370063781738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.906369239800411, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009973782953694918, + "loss": 0.0746, + "macro_f1": 0.3006536066532135, + "num_tokens": 1344232.0, + "repeat_count": 1.0, + "routers_loss": 0.16080693900585175, + "skip_count": 3.0, + "step": 832, + "text_loss": 0.4782734513282776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9157616671558557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.000997346545558408, + "loss": 0.0522, + "macro_f1": 0.3333333432674408, + "num_tokens": 1347667.0, + "repeat_count": 0.0, + "routers_loss": 0.01173500344157219, + "skip_count": 0.0, + "step": 834, + "text_loss": 0.25036177039146423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009973146051631895, + "loss": 0.0522, + "macro_f1": 0.3333333432674408, + "num_tokens": 1350707.0, + "repeat_count": 0.0, + "routers_loss": 0.011477196589112282, + "skip_count": 0.0, + "step": 836, + "text_loss": 0.5482863187789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009972824741960764, + "loss": 0.0536, + "macro_f1": 0.3333333432674408, + "num_tokens": 1353704.0, + "repeat_count": 0.0, + "routers_loss": 0.010528896935284138, + "skip_count": 0.0, + "step": 838, + "text_loss": 0.6732596158981323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9439389492221895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1181640625, + "learning_rate": 0.000997250152669381, + "loss": 0.0573, + "macro_f1": 0.3333333432674408, + "num_tokens": 1356608.0, + "repeat_count": 0.0, + "routers_loss": 0.010678744874894619, + "skip_count": 0.0, + "step": 840, + "text_loss": 0.5479338765144348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9533313765776343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.181640625, + "learning_rate": 0.000997217640595489, + "loss": 0.0631, + "macro_f1": 0.3333333432674408, + "num_tokens": 1359809.0, + "repeat_count": 0.0, + "routers_loss": 0.00835978239774704, + "skip_count": 0.0, + "step": 842, + "text_loss": 0.42543259263038635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9627238039330788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009971849379868593, + "loss": 0.0653, + "macro_f1": 0.3333333432674408, + "num_tokens": 1362201.0, + "repeat_count": 0.0, + "routers_loss": 0.009930923581123352, + "skip_count": 0.0, + "step": 844, + "text_loss": 0.720462441444397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9721162312885236, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009971520448560235, + "loss": 0.0615, + "macro_f1": 0.3272727429866791, + "num_tokens": 1365790.0, + "repeat_count": 0.0, + "routers_loss": 0.06344373524188995, + "skip_count": 1.0, + "step": 846, + "text_loss": 0.8423607349395752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 3.9815086586439685, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.16796875, + "learning_rate": 0.000997118961215586, + "loss": 0.0674, + "macro_f1": 0.4533333480358124, + "num_tokens": 1368387.0, + "repeat_count": 1.0, + "routers_loss": 0.14688406884670258, + "skip_count": 3.0, + "step": 848, + "text_loss": 0.3933577537536621 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000997085687078225, + "loss": 0.0518, + "macro_f1": 0.3333333432674408, + "num_tokens": 1371189.0, + "repeat_count": 0.0, + "routers_loss": 0.009953443892300129, + "skip_count": 0.0, + "step": 850, + "text_loss": 0.41469162702560425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.0, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009970522224566909, + "loss": 0.0555, + "macro_f1": 0.32098767161369324, + "num_tokens": 1374008.0, + "repeat_count": 0.0, + "routers_loss": 0.048870690166950226, + "skip_count": 1.0, + "step": 852, + "text_loss": 0.613615870475769 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.009392427355444, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0009970185673638075, + "loss": 0.0629, + "macro_f1": 0.32098764181137085, + "num_tokens": 1376662.0, + "repeat_count": 1.0, + "routers_loss": 0.06865929812192917, + "skip_count": 1.0, + "step": 854, + "text_loss": 0.4392736256122589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 4.01878485471089, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.162109375, + "learning_rate": 0.0009969847218124716, + "loss": 0.0506, + "macro_f1": 0.5492662787437439, + "num_tokens": 1380049.0, + "repeat_count": 0.0, + "routers_loss": 0.02382219396531582, + "skip_count": 1.0, + "step": 856, + "text_loss": 0.19115346670150757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.028177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009969506858156527, + "loss": 0.0344, + "macro_f1": 0.3272727429866791, + "num_tokens": 1383008.0, + "repeat_count": 0.0, + "routers_loss": 0.03907281160354614, + "skip_count": 1.0, + "step": 858, + "text_loss": 0.34842637181282043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.037569709421779, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12060546875, + "learning_rate": 0.0009969164593863935, + "loss": 0.0365, + "macro_f1": 0.3333333432674408, + "num_tokens": 1387051.0, + "repeat_count": 0.0, + "routers_loss": 0.007645803038030863, + "skip_count": 0.0, + "step": 860, + "text_loss": 0.3810436725616455 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.046962136777223, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009968820425378098, + "loss": 0.0463, + "macro_f1": 0.3272727429866791, + "num_tokens": 1390244.0, + "repeat_count": 1.0, + "routers_loss": 0.04435238987207413, + "skip_count": 0.0, + "step": 862, + "text_loss": 0.34853485226631165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.056354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28515625, + "learning_rate": 0.00099684743528309, + "loss": 0.0424, + "macro_f1": 0.3333333432674408, + "num_tokens": 1392976.0, + "repeat_count": 0.0, + "routers_loss": 0.006071661598980427, + "skip_count": 0.0, + "step": 864, + "text_loss": 0.6395178437232971 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.065746991488113, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009968126376354958, + "loss": 0.0477, + "macro_f1": 0.5492662787437439, + "num_tokens": 1396061.0, + "repeat_count": 0.0, + "routers_loss": 0.05011235550045967, + "skip_count": 2.0, + "step": 866, + "text_loss": 0.09103966504335403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.075139418843557, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009967776496083616, + "loss": 0.0509, + "macro_f1": 0.3272727429866791, + "num_tokens": 1398993.0, + "repeat_count": 1.0, + "routers_loss": 0.03979124873876572, + "skip_count": 0.0, + "step": 868, + "text_loss": 0.27257058024406433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.084531846199002, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.14453125, + "learning_rate": 0.000996742471215095, + "loss": 0.0516, + "macro_f1": 0.5492662787437439, + "num_tokens": 1402080.0, + "repeat_count": 0.0, + "routers_loss": 0.030823837965726852, + "skip_count": 2.0, + "step": 870, + "text_loss": 0.7047103047370911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.093924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009967071024691763, + "loss": 0.0461, + "macro_f1": 0.3333333432674408, + "num_tokens": 1404890.0, + "repeat_count": 0.0, + "routers_loss": 0.009721715934574604, + "skip_count": 0.0, + "step": 872, + "text_loss": 0.959106981754303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.000996671543384159, + "loss": 0.05, + "macro_f1": 0.3333333432674408, + "num_tokens": 1407853.0, + "repeat_count": 0.0, + "routers_loss": 0.006025883834809065, + "skip_count": 0.0, + "step": 874, + "text_loss": 0.47571972012519836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.112709128265336, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.0009966357939736692, + "loss": 0.0416, + "macro_f1": 0.3272727429866791, + "num_tokens": 1410723.0, + "repeat_count": 0.0, + "routers_loss": 0.025964925065636635, + "skip_count": 0.0, + "step": 876, + "text_loss": 0.4964611530303955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.122101555620781, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009965998542514065, + "loss": 0.0415, + "macro_f1": 0.32098764181137085, + "num_tokens": 1414008.0, + "repeat_count": 0.0, + "routers_loss": 0.09509637206792831, + "skip_count": 2.0, + "step": 878, + "text_loss": 0.621494710445404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 4.131493982976226, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009965637242311427, + "loss": 0.0472, + "macro_f1": 0.542222261428833, + "num_tokens": 1417447.0, + "repeat_count": 0.0, + "routers_loss": 0.02520318515598774, + "skip_count": 4.0, + "step": 880, + "text_loss": 0.40209758281707764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 25.0, + "epoch": 4.14088641033167, + "f1_execute": 0.936170220375061, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.263671875, + "learning_rate": 0.000996527403926723, + "loss": 0.0495, + "macro_f1": 0.5342789888381958, + "num_tokens": 1419905.0, + "repeat_count": 0.0, + "routers_loss": 0.13183781504631042, + "skip_count": 6.0, + "step": 882, + "text_loss": 0.642185389995575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.1502788376871145, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009964908933520655, + "loss": 0.0375, + "macro_f1": 0.3333333432674408, + "num_tokens": 1423436.0, + "repeat_count": 0.0, + "routers_loss": 0.009429510682821274, + "skip_count": 0.0, + "step": 884, + "text_loss": 0.48232755064964294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.15967126504256, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1669921875, + "learning_rate": 0.0009964541925211613, + "loss": 0.0349, + "macro_f1": 0.32098764181137085, + "num_tokens": 1426842.0, + "repeat_count": 0.0, + "routers_loss": 0.07629609107971191, + "skip_count": 2.0, + "step": 886, + "text_loss": 0.16620934009552002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.169063692398004, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009964173014480738, + "loss": 0.0348, + "macro_f1": 0.5492662787437439, + "num_tokens": 1430430.0, + "repeat_count": 0.0, + "routers_loss": 0.036814019083976746, + "skip_count": 2.0, + "step": 888, + "text_loss": 0.4866008758544922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009963802201469398, + "loss": 0.0476, + "macro_f1": 0.3333333432674408, + "num_tokens": 1433821.0, + "repeat_count": 0.0, + "routers_loss": 0.0041250260546803474, + "skip_count": 0.0, + "step": 890, + "text_loss": 0.578216552734375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.187848547108893, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2373046875, + "learning_rate": 0.0009963429486319693, + "loss": 0.0463, + "macro_f1": 0.32098764181137085, + "num_tokens": 1436976.0, + "repeat_count": 0.0, + "routers_loss": 0.06213559955358505, + "skip_count": 2.0, + "step": 892, + "text_loss": 0.221701517701149 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 4.197240974464338, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.361328125, + "learning_rate": 0.0009963054869174446, + "loss": 0.0313, + "macro_f1": 0.4871794879436493, + "num_tokens": 1440397.0, + "repeat_count": 0.0, + "routers_loss": 0.07532428950071335, + "skip_count": 2.0, + "step": 894, + "text_loss": 0.6922838091850281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.206633401819783, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009962678350177209, + "loss": 0.0472, + "macro_f1": 0.3272727429866791, + "num_tokens": 1443604.0, + "repeat_count": 0.0, + "routers_loss": 0.0419243648648262, + "skip_count": 1.0, + "step": 896, + "text_loss": 0.22092342376708984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.216025829175227, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009962299929472268, + "loss": 0.034, + "macro_f1": 0.32098764181137085, + "num_tokens": 1446257.0, + "repeat_count": 2.0, + "routers_loss": 0.10849297791719437, + "skip_count": 0.0, + "step": 898, + "text_loss": 0.26394811272621155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.000996191960720463, + "loss": 0.0394, + "macro_f1": 0.3333333432674408, + "num_tokens": 1449669.0, + "repeat_count": 0.0, + "routers_loss": 0.0092767970636487, + "skip_count": 0.0, + "step": 900, + "text_loss": 0.5338577628135681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.234810683886117, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009961537383520042, + "loss": 0.0354, + "macro_f1": 0.3272727429866791, + "num_tokens": 1452450.0, + "repeat_count": 1.0, + "routers_loss": 0.02985367365181446, + "skip_count": 0.0, + "step": 902, + "text_loss": 0.5875228047370911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.2442031112415615, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009961153258564966, + "loss": 0.0378, + "macro_f1": 0.3144654333591461, + "num_tokens": 1456909.0, + "repeat_count": 0.0, + "routers_loss": 0.06794842332601547, + "skip_count": 3.0, + "step": 904, + "text_loss": 0.40959444642066956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.253595538597006, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009960767232486604, + "loss": 0.0476, + "macro_f1": 0.3333333432674408, + "num_tokens": 1461712.0, + "repeat_count": 0.0, + "routers_loss": 0.0023562447167932987, + "skip_count": 0.0, + "step": 906, + "text_loss": 0.3932875096797943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.000996037930543288, + "loss": 0.0505, + "macro_f1": 0.3272727429866791, + "num_tokens": 1464817.0, + "repeat_count": 0.0, + "routers_loss": 0.03880339860916138, + "skip_count": 1.0, + "step": 908, + "text_loss": 0.17482402920722961 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.272380393307896, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2119140625, + "learning_rate": 0.000995998947755245, + "loss": 0.0479, + "macro_f1": 0.3272727429866791, + "num_tokens": 1467810.0, + "repeat_count": 0.0, + "routers_loss": 0.01736828312277794, + "skip_count": 1.0, + "step": 910, + "text_loss": 0.4140470325946808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009959597748994695, + "loss": 0.0752, + "macro_f1": 0.3333333432674408, + "num_tokens": 1470802.0, + "repeat_count": 0.0, + "routers_loss": 0.011824851855635643, + "skip_count": 0.0, + "step": 912, + "text_loss": 0.7153383493423462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.2911652480187845, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009959204119909726, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1474539.0, + "repeat_count": 0.0, + "routers_loss": 0.025456594303250313, + "skip_count": 0.0, + "step": 914, + "text_loss": 0.42812058329582214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009958808590448385, + "loss": 0.0489, + "macro_f1": 0.3333333432674408, + "num_tokens": 1477552.0, + "repeat_count": 0.0, + "routers_loss": 0.006795851048082113, + "skip_count": 0.0, + "step": 916, + "text_loss": 0.5402814149856567 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009958411160762234, + "loss": 0.039, + "macro_f1": 0.3333333432674408, + "num_tokens": 1482547.0, + "repeat_count": 0.0, + "routers_loss": 0.015615932643413544, + "skip_count": 0.0, + "step": 918, + "text_loss": 0.3836168050765991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.319342530085119, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009958011831003577, + "loss": 0.0448, + "macro_f1": 0.3272727429866791, + "num_tokens": 1485807.0, + "repeat_count": 0.0, + "routers_loss": 0.043541423976421356, + "skip_count": 1.0, + "step": 920, + "text_loss": 0.4333936274051666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 4.328734957440563, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1337890625, + "learning_rate": 0.000995761060132543, + "loss": 0.0418, + "macro_f1": 0.6538461446762085, + "num_tokens": 1488941.0, + "repeat_count": 1.0, + "routers_loss": 0.05866432189941406, + "skip_count": 2.0, + "step": 922, + "text_loss": 0.4106994867324829 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.3381273847960085, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009957207471881552, + "loss": 0.0531, + "macro_f1": 0.5492662787437439, + "num_tokens": 1492026.0, + "repeat_count": 0.0, + "routers_loss": 0.02714901603758335, + "skip_count": 2.0, + "step": 924, + "text_loss": 0.542091429233551 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.347519812151453, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.0009956802442826415, + "loss": 0.0386, + "macro_f1": 0.3272727429866791, + "num_tokens": 1494543.0, + "repeat_count": 1.0, + "routers_loss": 0.0563737191259861, + "skip_count": 0.0, + "step": 926, + "text_loss": 0.47209203243255615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.356912239506897, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009956395514315235, + "loss": 0.0496, + "macro_f1": 0.3272727429866791, + "num_tokens": 1497831.0, + "repeat_count": 1.0, + "routers_loss": 0.03285066783428192, + "skip_count": 0.0, + "step": 928, + "text_loss": 0.6628931164741516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.366304666862343, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009955986686503943, + "loss": 0.0466, + "macro_f1": 0.3272727429866791, + "num_tokens": 1501375.0, + "repeat_count": 0.0, + "routers_loss": 0.024297121912240982, + "skip_count": 1.0, + "step": 930, + "text_loss": 0.495676189661026 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 4.375697094217787, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009955575959549202, + "loss": 0.0424, + "macro_f1": 0.7795917987823486, + "num_tokens": 1504363.0, + "repeat_count": 1.0, + "routers_loss": 0.12196464836597443, + "skip_count": 4.0, + "step": 932, + "text_loss": 0.26123273372650146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.0009955163333608408, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 1507178.0, + "repeat_count": 0.0, + "routers_loss": 0.012947078794240952, + "skip_count": 0.0, + "step": 934, + "text_loss": 0.32552677392959595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.394481948928676, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009954748808839674, + "loss": 0.0379, + "macro_f1": 0.3333333432674408, + "num_tokens": 1509910.0, + "repeat_count": 0.0, + "routers_loss": 0.008946365676820278, + "skip_count": 0.0, + "step": 936, + "text_loss": 0.533141016960144 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.403874376284121, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000995433238540185, + "loss": 0.0466, + "macro_f1": 0.6538461446762085, + "num_tokens": 1512826.0, + "repeat_count": 1.0, + "routers_loss": 0.029975678771734238, + "skip_count": 1.0, + "step": 938, + "text_loss": 0.2953577935695648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.413266803639566, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009953914063454512, + "loss": 0.0497, + "macro_f1": 0.3144654333591461, + "num_tokens": 1517230.0, + "repeat_count": 1.0, + "routers_loss": 0.0889134630560875, + "skip_count": 2.0, + "step": 940, + "text_loss": 0.5368834733963013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.000995349384315796, + "loss": 0.0413, + "macro_f1": 0.3333333432674408, + "num_tokens": 1519876.0, + "repeat_count": 0.0, + "routers_loss": 0.013458753935992718, + "skip_count": 0.0, + "step": 942, + "text_loss": 0.2005518227815628 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.432051658350455, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.000995307172467322, + "loss": 0.0444, + "macro_f1": 0.31446540355682373, + "num_tokens": 1522998.0, + "repeat_count": 1.0, + "routers_loss": 0.08850377053022385, + "skip_count": 1.0, + "step": 944, + "text_loss": 0.227926567196846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.4414440857059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009952647708162054, + "loss": 0.0503, + "macro_f1": 0.3272727429866791, + "num_tokens": 1527100.0, + "repeat_count": 0.0, + "routers_loss": 0.03199794515967369, + "skip_count": 1.0, + "step": 946, + "text_loss": 0.4859686493873596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.450836513061344, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009952221793786942, + "loss": 0.0354, + "macro_f1": 0.3333333432674408, + "num_tokens": 1530028.0, + "repeat_count": 0.0, + "routers_loss": 0.006507779937237501, + "skip_count": 0.0, + "step": 948, + "text_loss": 0.6855354905128479 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.460228940416789, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009951793981711097, + "loss": 0.0584, + "macro_f1": 0.6538461446762085, + "num_tokens": 1533254.0, + "repeat_count": 1.0, + "routers_loss": 0.06175103038549423, + "skip_count": 1.0, + "step": 950, + "text_loss": 0.7590400576591492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.469621367772234, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009951364272098458, + "loss": 0.0295, + "macro_f1": 0.5492662787437439, + "num_tokens": 1536239.0, + "repeat_count": 0.0, + "routers_loss": 0.03773383051156998, + "skip_count": 2.0, + "step": 952, + "text_loss": 0.669784665107727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.4790137951276785, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009950932665113688, + "loss": 0.0507, + "macro_f1": 0.32098764181137085, + "num_tokens": 1539682.0, + "repeat_count": 0.0, + "routers_loss": 0.07280613481998444, + "skip_count": 2.0, + "step": 954, + "text_loss": 0.3365570902824402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009950499160922184, + "loss": 0.0541, + "macro_f1": 0.3333333432674408, + "num_tokens": 1542875.0, + "repeat_count": 0.0, + "routers_loss": 0.01770266517996788, + "skip_count": 0.0, + "step": 956, + "text_loss": 0.0921545997262001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.497798649838567, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09375, + "learning_rate": 0.000995006375969006, + "loss": 0.0473, + "macro_f1": 0.3272727429866791, + "num_tokens": 1547135.0, + "repeat_count": 1.0, + "routers_loss": 0.07672002166509628, + "skip_count": 0.0, + "step": 958, + "text_loss": 0.5887606739997864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.507191077194013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009949626461584165, + "loss": 0.043, + "macro_f1": 0.3333333432674408, + "num_tokens": 1550100.0, + "repeat_count": 0.0, + "routers_loss": 0.006247182376682758, + "skip_count": 0.0, + "step": 960, + "text_loss": 0.5777931213378906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.119140625, + "learning_rate": 0.0009949187266772076, + "loss": 0.0366, + "macro_f1": 0.5492662787437439, + "num_tokens": 1553192.0, + "repeat_count": 0.0, + "routers_loss": 0.030319908633828163, + "skip_count": 2.0, + "step": 962, + "text_loss": 0.2370252162218094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.5259759319049016, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009948746175422088, + "loss": 0.0511, + "macro_f1": 0.3333333432674408, + "num_tokens": 1556318.0, + "repeat_count": 0.0, + "routers_loss": 0.006004320923238993, + "skip_count": 0.0, + "step": 964, + "text_loss": 0.6271032094955444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000994830318770323, + "loss": 0.0514, + "macro_f1": 0.3333333432674408, + "num_tokens": 1559195.0, + "repeat_count": 0.0, + "routers_loss": 0.011544366367161274, + "skip_count": 0.0, + "step": 966, + "text_loss": 0.47256720066070557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 4.544760786615791, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009947858303785255, + "loss": 0.0374, + "macro_f1": 0.6603773832321167, + "num_tokens": 1561813.0, + "repeat_count": 1.0, + "routers_loss": 0.05258861929178238, + "skip_count": 1.0, + "step": 968, + "text_loss": 0.7703132629394531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.554153213971236, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.0009947411523838648, + "loss": 0.0453, + "macro_f1": 0.3333333432674408, + "num_tokens": 1564634.0, + "repeat_count": 0.0, + "routers_loss": 0.011216280050575733, + "skip_count": 0.0, + "step": 970, + "text_loss": 0.4666804075241089 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009946962848034608, + "loss": 0.0696, + "macro_f1": 0.3333333432674408, + "num_tokens": 1567959.0, + "repeat_count": 0.0, + "routers_loss": 0.009387624450027943, + "skip_count": 0.0, + "step": 972, + "text_loss": 0.4067264199256897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.5729380686821255, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.0009946512276545075, + "loss": 0.0397, + "macro_f1": 0.3272727429866791, + "num_tokens": 1571221.0, + "repeat_count": 1.0, + "routers_loss": 0.041713520884513855, + "skip_count": 0.0, + "step": 974, + "text_loss": 0.5242366194725037 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 4.58233049603757, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.228515625, + "learning_rate": 0.0009946059809542705, + "loss": 0.0487, + "macro_f1": 0.7644445300102234, + "num_tokens": 1575033.0, + "repeat_count": 2.0, + "routers_loss": 0.05748331546783447, + "skip_count": 2.0, + "step": 976, + "text_loss": 0.5704690217971802 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 4.591722923393014, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009945605447200887, + "loss": 0.0445, + "macro_f1": 0.3272727429866791, + "num_tokens": 1579050.0, + "repeat_count": 0.0, + "routers_loss": 0.016765203326940536, + "skip_count": 0.0, + "step": 978, + "text_loss": 0.4804173707962036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.601115350748459, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009945149189693732, + "loss": 0.0406, + "macro_f1": 0.5492662787437439, + "num_tokens": 1582967.0, + "repeat_count": 0.0, + "routers_loss": 0.021518222987651825, + "skip_count": 2.0, + "step": 980, + "text_loss": 0.4138598144054413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.610507778103904, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009944691037196078, + "loss": 0.0456, + "macro_f1": 0.3333333432674408, + "num_tokens": 1586282.0, + "repeat_count": 0.0, + "routers_loss": 0.012246460653841496, + "skip_count": 0.0, + "step": 982, + "text_loss": 0.22561736404895782 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 4.6199002054593485, + "f1_execute": 0.930232584476471, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.8000000715255737, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009944230989883491, + "loss": 0.0456, + "macro_f1": 0.7989664077758789, + "num_tokens": 1589279.0, + "repeat_count": 2.0, + "routers_loss": 0.09344895929098129, + "skip_count": 5.0, + "step": 984, + "text_loss": 0.4416656494140625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.629292632814793, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.111328125, + "learning_rate": 0.0009943769047932264, + "loss": 0.0404, + "macro_f1": 0.5359477400779724, + "num_tokens": 1592398.0, + "repeat_count": 2.0, + "routers_loss": 0.08916857838630676, + "skip_count": 2.0, + "step": 986, + "text_loss": 0.5536438822746277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.638685060170237, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000994330521151941, + "loss": 0.039, + "macro_f1": 0.32098764181137085, + "num_tokens": 1596213.0, + "repeat_count": 1.0, + "routers_loss": 0.06114347651600838, + "skip_count": 1.0, + "step": 988, + "text_loss": 0.5835405588150024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.000994283948082267, + "loss": 0.0573, + "macro_f1": 0.3333333432674408, + "num_tokens": 1598827.0, + "repeat_count": 0.0, + "routers_loss": 0.0017335431184619665, + "skip_count": 0.0, + "step": 990, + "text_loss": 0.5857380032539368 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.657469914881127, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009942371856020522, + "loss": 0.0341, + "macro_f1": 0.3333333432674408, + "num_tokens": 1602915.0, + "repeat_count": 0.0, + "routers_loss": 0.014606470242142677, + "skip_count": 0.0, + "step": 992, + "text_loss": 0.6939892768859863 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 4.666862342236572, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009941902337292155, + "loss": 0.06, + "macro_f1": 0.6598639488220215, + "num_tokens": 1605776.0, + "repeat_count": 3.0, + "routers_loss": 0.06297315657138824, + "skip_count": 1.0, + "step": 994, + "text_loss": 0.37616831064224243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.676254769592017, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009941430924817487, + "loss": 0.0572, + "macro_f1": 0.5492662787437439, + "num_tokens": 1609856.0, + "repeat_count": 0.0, + "routers_loss": 0.03297794610261917, + "skip_count": 2.0, + "step": 996, + "text_loss": 0.2098303586244583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.685647196947461, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.000994095761877717, + "loss": 0.0499, + "macro_f1": 0.3333333432674408, + "num_tokens": 1612904.0, + "repeat_count": 0.0, + "routers_loss": 0.012901155278086662, + "skip_count": 0.0, + "step": 998, + "text_loss": 0.20103533565998077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.259765625, + "learning_rate": 0.000994048241935257, + "loss": 0.0535, + "macro_f1": 0.3272727429866791, + "num_tokens": 1615540.0, + "repeat_count": 0.0, + "routers_loss": 0.020434845238924026, + "skip_count": 0.0, + "step": 1000, + "text_loss": 0.32709044218063354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.70443205165835, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1669921875, + "learning_rate": 0.0009940005326725789, + "loss": 0.0453, + "macro_f1": 0.32098764181137085, + "num_tokens": 1618786.0, + "repeat_count": 0.0, + "routers_loss": 0.07831378281116486, + "skip_count": 2.0, + "step": 1002, + "text_loss": 0.5789632797241211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21875, + "learning_rate": 0.0009939526341079647, + "loss": 0.0511, + "macro_f1": 0.32098764181137085, + "num_tokens": 1621736.0, + "repeat_count": 2.0, + "routers_loss": 0.04863874986767769, + "skip_count": 0.0, + "step": 1004, + "text_loss": 0.6128849387168884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009939045462597693, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 1624649.0, + "repeat_count": 0.0, + "routers_loss": 0.00677989237010479, + "skip_count": 0.0, + "step": 1006, + "text_loss": 0.6168264150619507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.732609333724684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009938562691464202, + "loss": 0.0524, + "macro_f1": 0.3333333432674408, + "num_tokens": 1627700.0, + "repeat_count": 0.0, + "routers_loss": 0.019490402191877365, + "skip_count": 0.0, + "step": 1008, + "text_loss": 0.17463822662830353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.000993807802786417, + "loss": 0.0475, + "macro_f1": 0.3333333432674408, + "num_tokens": 1630714.0, + "repeat_count": 0.0, + "routers_loss": 0.0019022391643375158, + "skip_count": 0.0, + "step": 1010, + "text_loss": 0.5675593018531799 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 4.751394188435574, + "f1_execute": 0.9599999785423279, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1640625, + "learning_rate": 0.0009937591471983322, + "loss": 0.0501, + "macro_f1": 0.7644444704055786, + "num_tokens": 1633770.0, + "repeat_count": 1.0, + "routers_loss": 0.042485643178224564, + "skip_count": 2.0, + "step": 1012, + "text_loss": 0.42387229204177856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.760786615791019, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009937103024008109, + "loss": 0.0545, + "macro_f1": 0.3272727429866791, + "num_tokens": 1637120.0, + "repeat_count": 0.0, + "routers_loss": 0.09427817165851593, + "skip_count": 1.0, + "step": 1014, + "text_loss": 0.49511051177978516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009936612684125702, + "loss": 0.0503, + "macro_f1": 0.3333333432674408, + "num_tokens": 1640165.0, + "repeat_count": 0.0, + "routers_loss": 0.005106127820909023, + "skip_count": 0.0, + "step": 1016, + "text_loss": 0.5398799180984497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.7795714705019074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2734375, + "learning_rate": 0.0009936120452524004, + "loss": 0.0506, + "macro_f1": 0.3333333432674408, + "num_tokens": 1643251.0, + "repeat_count": 0.0, + "routers_loss": 0.016914300620555878, + "skip_count": 0.0, + "step": 1018, + "text_loss": 0.20882178843021393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.788963897857353, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009935626329391637, + "loss": 0.0537, + "macro_f1": 0.32098764181137085, + "num_tokens": 1646560.0, + "repeat_count": 0.0, + "routers_loss": 0.13481520116329193, + "skip_count": 2.0, + "step": 1020, + "text_loss": 0.5719883441925049 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.798356325212797, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009935130314917948, + "loss": 0.0602, + "macro_f1": 0.5492662787437439, + "num_tokens": 1649538.0, + "repeat_count": 0.0, + "routers_loss": 0.07700438797473907, + "skip_count": 2.0, + "step": 1022, + "text_loss": 0.1303367167711258 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.807748752568242, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009934632409293015, + "loss": 0.0611, + "macro_f1": 0.32098764181137085, + "num_tokens": 1652397.0, + "repeat_count": 1.0, + "routers_loss": 0.11416907608509064, + "skip_count": 1.0, + "step": 1024, + "text_loss": 0.24076920747756958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.817141179923686, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.0009934132612707631, + "loss": 0.0507, + "macro_f1": 0.31446540355682373, + "num_tokens": 1654938.0, + "repeat_count": 0.0, + "routers_loss": 0.09484589844942093, + "skip_count": 2.0, + "step": 1026, + "text_loss": 0.1652517318725586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009933630925353324, + "loss": 0.0395, + "macro_f1": 0.3333333432674408, + "num_tokens": 1658536.0, + "repeat_count": 0.0, + "routers_loss": 0.00741987070068717, + "skip_count": 0.0, + "step": 1028, + "text_loss": 0.49296700954437256 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.835926034634576, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1845703125, + "learning_rate": 0.0009933127347422337, + "loss": 0.0602, + "macro_f1": 0.32098764181137085, + "num_tokens": 1661446.0, + "repeat_count": 0.0, + "routers_loss": 0.08399344235658646, + "skip_count": 2.0, + "step": 1030, + "text_loss": 0.22363591194152832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.158203125, + "learning_rate": 0.0009932621879107648, + "loss": 0.0475, + "macro_f1": 0.3333333432674408, + "num_tokens": 1664612.0, + "repeat_count": 0.0, + "routers_loss": 0.0031781597062945366, + "skip_count": 0.0, + "step": 1032, + "text_loss": 0.36083245277404785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.854710889345466, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.000993211452060295, + "loss": 0.042, + "macro_f1": 0.3272727429866791, + "num_tokens": 1667467.0, + "repeat_count": 0.0, + "routers_loss": 0.03595469892024994, + "skip_count": 1.0, + "step": 1034, + "text_loss": 0.16372856497764587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.86410331670091, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000993160527210266, + "loss": 0.061, + "macro_f1": 0.3144654333591461, + "num_tokens": 1670675.0, + "repeat_count": 3.0, + "routers_loss": 0.1597205102443695, + "skip_count": 0.0, + "step": 1036, + "text_loss": 0.6049913763999939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2197265625, + "learning_rate": 0.000993109413380193, + "loss": 0.0562, + "macro_f1": 0.3333333432674408, + "num_tokens": 1673477.0, + "repeat_count": 0.0, + "routers_loss": 0.009756010957062244, + "skip_count": 0.0, + "step": 1038, + "text_loss": 0.7034620642662048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.882888171411799, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.0009930581105896624, + "loss": 0.0559, + "macro_f1": 0.3272727429866791, + "num_tokens": 1676809.0, + "repeat_count": 0.0, + "routers_loss": 0.020718922838568687, + "skip_count": 0.0, + "step": 1040, + "text_loss": 0.2814720571041107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.892280598767244, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009930066188583338, + "loss": 0.0445, + "macro_f1": 0.32098764181137085, + "num_tokens": 1679398.0, + "repeat_count": 1.0, + "routers_loss": 0.04755603149533272, + "skip_count": 1.0, + "step": 1042, + "text_loss": 0.5445759296417236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.0009929549382059388, + "loss": 0.0509, + "macro_f1": 0.3333333432674408, + "num_tokens": 1682269.0, + "repeat_count": 0.0, + "routers_loss": 0.01040949858725071, + "skip_count": 0.0, + "step": 1044, + "text_loss": 0.2876914143562317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.911065453478133, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009929030686522816, + "loss": 0.0363, + "macro_f1": 0.3333333432674408, + "num_tokens": 1685428.0, + "repeat_count": 0.0, + "routers_loss": 0.008158888667821884, + "skip_count": 0.0, + "step": 1046, + "text_loss": 0.49053525924682617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.9204578808335775, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009928510102172386, + "loss": 0.0498, + "macro_f1": 0.3333333432674408, + "num_tokens": 1688252.0, + "repeat_count": 0.0, + "routers_loss": 0.005102572031319141, + "skip_count": 0.0, + "step": 1048, + "text_loss": 0.5274341106414795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009927987629207587, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1691289.0, + "repeat_count": 0.0, + "routers_loss": 0.016768503934144974, + "skip_count": 0.0, + "step": 1050, + "text_loss": 0.9935035109519958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.939242735544467, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009927463267828634, + "loss": 0.0488, + "macro_f1": 0.3333333432674408, + "num_tokens": 1694148.0, + "repeat_count": 0.0, + "routers_loss": 0.010905829258263111, + "skip_count": 0.0, + "step": 1052, + "text_loss": 0.20895758271217346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.948635162899912, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.000992693701823646, + "loss": 0.0624, + "macro_f1": 0.3272727429866791, + "num_tokens": 1698543.0, + "repeat_count": 1.0, + "routers_loss": 0.10533971339464188, + "skip_count": 0.0, + "step": 1054, + "text_loss": 0.5776236653327942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.958027590255357, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009926408880632726, + "loss": 0.0556, + "macro_f1": 0.3272727429866791, + "num_tokens": 1702460.0, + "repeat_count": 0.0, + "routers_loss": 0.026313411071896553, + "skip_count": 1.0, + "step": 1056, + "text_loss": 0.34990596771240234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.967420017610801, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009925878855219818, + "loss": 0.0391, + "macro_f1": 0.3333333432674408, + "num_tokens": 1705686.0, + "repeat_count": 0.0, + "routers_loss": 0.007763393223285675, + "skip_count": 0.0, + "step": 1058, + "text_loss": 0.4980163276195526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.177734375, + "learning_rate": 0.000992534694220084, + "loss": 0.0613, + "macro_f1": 0.3272727429866791, + "num_tokens": 1708739.0, + "repeat_count": 0.0, + "routers_loss": 0.03998444974422455, + "skip_count": 1.0, + "step": 1060, + "text_loss": 0.29092350602149963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.98620487232169, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.000992481314177962, + "loss": 0.0312, + "macro_f1": 0.32098764181137085, + "num_tokens": 1711903.0, + "repeat_count": 1.0, + "routers_loss": 0.06966045498847961, + "skip_count": 1.0, + "step": 1062, + "text_loss": 0.6267179250717163 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.995597299677136, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.244140625, + "learning_rate": 0.0009924277454160717, + "loss": 0.0548, + "macro_f1": 0.3272727429866791, + "num_tokens": 1715974.0, + "repeat_count": 0.0, + "routers_loss": 0.05536063387989998, + "skip_count": 1.0, + "step": 1064, + "text_loss": 0.5813798904418945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.004696213677723, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009923739879549402, + "loss": 0.0423, + "macro_f1": 0.3333333432674408, + "num_tokens": 1718828.0, + "repeat_count": 0.0, + "routers_loss": 0.020993782207369804, + "skip_count": 0.0, + "step": 1066, + "text_loss": 0.22665327787399292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009923200418151677, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 1722419.0, + "repeat_count": 0.0, + "routers_loss": 0.007351701147854328, + "skip_count": 0.0, + "step": 1068, + "text_loss": 0.5796169638633728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.0234810683886115, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009922659070174264, + "loss": 0.0452, + "macro_f1": 0.3272727429866791, + "num_tokens": 1725663.0, + "repeat_count": 1.0, + "routers_loss": 0.026033315807580948, + "skip_count": 0.0, + "step": 1070, + "text_loss": 0.25742828845977783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009922115835824612, + "loss": 0.041, + "macro_f1": 0.3333333432674408, + "num_tokens": 1729239.0, + "repeat_count": 0.0, + "routers_loss": 0.0118600158020854, + "skip_count": 0.0, + "step": 1072, + "text_loss": 0.21630282700061798 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009921570715310884, + "loss": 0.0364, + "macro_f1": 0.6666666865348816, + "num_tokens": 1732507.0, + "repeat_count": 1.0, + "routers_loss": 0.016118815168738365, + "skip_count": 0.0, + "step": 1074, + "text_loss": 0.5639925003051758 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.051658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009921023708841974, + "loss": 0.0407, + "macro_f1": 0.3333333432674408, + "num_tokens": 1736182.0, + "repeat_count": 0.0, + "routers_loss": 0.004275390412658453, + "skip_count": 0.0, + "step": 1076, + "text_loss": 0.5758615136146545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009920474816627496, + "loss": 0.037, + "macro_f1": 0.3333333432674408, + "num_tokens": 1739559.0, + "repeat_count": 0.0, + "routers_loss": 0.01299292128533125, + "skip_count": 0.0, + "step": 1078, + "text_loss": 0.18221625685691833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.0704432051658355, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009919924038877788, + "loss": 0.0343, + "macro_f1": 0.32098764181137085, + "num_tokens": 1742890.0, + "repeat_count": 0.0, + "routers_loss": 0.038295745849609375, + "skip_count": 2.0, + "step": 1080, + "text_loss": 0.17354349792003632 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 5.07983563252128, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009919371375803905, + "loss": 0.0455, + "macro_f1": 0.8194444179534912, + "num_tokens": 1746433.0, + "repeat_count": 2.0, + "routers_loss": 0.04052971675992012, + "skip_count": 3.0, + "step": 1082, + "text_loss": 0.2250112146139145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009918816827617632, + "loss": 0.0353, + "macro_f1": 0.3333333432674408, + "num_tokens": 1750802.0, + "repeat_count": 0.0, + "routers_loss": 0.009114136919379234, + "skip_count": 0.0, + "step": 1084, + "text_loss": 0.2526719272136688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.098620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000991826039453147, + "loss": 0.0392, + "macro_f1": 0.3333333432674408, + "num_tokens": 1754272.0, + "repeat_count": 0.0, + "routers_loss": 0.004904678091406822, + "skip_count": 0.0, + "step": 1086, + "text_loss": 0.7308789491653442 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 5.108012914587614, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.138671875, + "learning_rate": 0.000991770207675865, + "loss": 0.0327, + "macro_f1": 0.6666666865348816, + "num_tokens": 1757231.0, + "repeat_count": 0.0, + "routers_loss": 0.02129189297556877, + "skip_count": 2.0, + "step": 1088, + "text_loss": 0.21764220297336578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.1174053419430585, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009917141874513113, + "loss": 0.0315, + "macro_f1": 0.3333333432674408, + "num_tokens": 1760003.0, + "repeat_count": 0.0, + "routers_loss": 0.01310618408024311, + "skip_count": 0.0, + "step": 1090, + "text_loss": 0.33892181515693665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.126797769298503, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.171875, + "learning_rate": 0.0009916579788009537, + "loss": 0.0457, + "macro_f1": 0.5492662787437439, + "num_tokens": 1763052.0, + "repeat_count": 0.0, + "routers_loss": 0.02059309557080269, + "skip_count": 2.0, + "step": 1092, + "text_loss": 0.6551769375801086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.136190196653947, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10546875, + "learning_rate": 0.0009916015817463312, + "loss": 0.0385, + "macro_f1": 0.5492662787437439, + "num_tokens": 1766655.0, + "repeat_count": 0.0, + "routers_loss": 0.0274797435849905, + "skip_count": 2.0, + "step": 1094, + "text_loss": 0.3984372019767761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.000991544996309055, + "loss": 0.0271, + "macro_f1": 0.3333333432674408, + "num_tokens": 1769997.0, + "repeat_count": 0.0, + "routers_loss": 0.01437368243932724, + "skip_count": 0.0, + "step": 1096, + "text_loss": 0.4203338921070099 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.154975051364837, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.000991488222510809, + "loss": 0.0292, + "macro_f1": 0.3333333432674408, + "num_tokens": 1773130.0, + "repeat_count": 0.0, + "routers_loss": 0.001382062560878694, + "skip_count": 0.0, + "step": 1098, + "text_loss": 0.43132516741752625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.164367478720282, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.123046875, + "learning_rate": 0.000991431260373349, + "loss": 0.0329, + "macro_f1": 0.3144654333591461, + "num_tokens": 1775682.0, + "repeat_count": 1.0, + "routers_loss": 0.1115434318780899, + "skip_count": 2.0, + "step": 1100, + "text_loss": 0.3218227028846741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.111328125, + "learning_rate": 0.000991374109918503, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 1778407.0, + "repeat_count": 0.0, + "routers_loss": 0.009529678151011467, + "skip_count": 0.0, + "step": 1102, + "text_loss": 0.17183731496334076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.183152333431171, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1142578125, + "learning_rate": 0.000991316771168171, + "loss": 0.044, + "macro_f1": 0.5492662787437439, + "num_tokens": 1781518.0, + "repeat_count": 0.0, + "routers_loss": 0.018668074160814285, + "skip_count": 2.0, + "step": 1104, + "text_loss": 1.1324785947799683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.192544760786616, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.125, + "learning_rate": 0.0009912592441443258, + "loss": 0.0411, + "macro_f1": 0.3272727429866791, + "num_tokens": 1784878.0, + "repeat_count": 0.0, + "routers_loss": 0.04145100712776184, + "skip_count": 1.0, + "step": 1106, + "text_loss": 0.6082063317298889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.20193718814206, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009912015288690112, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1788978.0, + "repeat_count": 0.0, + "routers_loss": 0.021450644358992577, + "skip_count": 1.0, + "step": 1108, + "text_loss": 0.5597621202468872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.2113296154975055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009911436253643444, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 1792321.0, + "repeat_count": 0.0, + "routers_loss": 0.017405325546860695, + "skip_count": 0.0, + "step": 1110, + "text_loss": 0.2560598850250244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.0009910855336525137, + "loss": 0.0383, + "macro_f1": 0.3333333432674408, + "num_tokens": 1795182.0, + "repeat_count": 0.0, + "routers_loss": 0.007162237539887428, + "skip_count": 0.0, + "step": 1112, + "text_loss": 0.3438240587711334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.230114470208394, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.115234375, + "learning_rate": 0.00099102725375578, + "loss": 0.0326, + "macro_f1": 0.480392187833786, + "num_tokens": 1798987.0, + "repeat_count": 1.0, + "routers_loss": 0.11149197816848755, + "skip_count": 3.0, + "step": 1114, + "text_loss": 0.20455503463745117 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.239506897563839, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009909687856964767, + "loss": 0.035, + "macro_f1": 0.3006536364555359, + "num_tokens": 1802064.0, + "repeat_count": 2.0, + "routers_loss": 0.12679415941238403, + "skip_count": 3.0, + "step": 1116, + "text_loss": 0.11996729671955109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.248899324919284, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009909101294970082, + "loss": 0.0365, + "macro_f1": 0.5492662787437439, + "num_tokens": 1805412.0, + "repeat_count": 0.0, + "routers_loss": 0.05108053982257843, + "skip_count": 2.0, + "step": 1118, + "text_loss": 0.13224145770072937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.258291752274729, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.123046875, + "learning_rate": 0.0009908512851798522, + "loss": 0.0455, + "macro_f1": 0.6603773832321167, + "num_tokens": 1808196.0, + "repeat_count": 1.0, + "routers_loss": 0.02131766639649868, + "skip_count": 1.0, + "step": 1120, + "text_loss": 0.7824069261550903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.0009907922527675576, + "loss": 0.0405, + "macro_f1": 0.3333333432674408, + "num_tokens": 1811622.0, + "repeat_count": 0.0, + "routers_loss": 0.006226244382560253, + "skip_count": 0.0, + "step": 1122, + "text_loss": 0.5419743061065674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.277076606985618, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12890625, + "learning_rate": 0.000990733032282746, + "loss": 0.0535, + "macro_f1": 0.5492662787437439, + "num_tokens": 1814628.0, + "repeat_count": 0.0, + "routers_loss": 0.03088250942528248, + "skip_count": 2.0, + "step": 1124, + "text_loss": 0.37100958824157715 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.286469034341063, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.000990673623748111, + "loss": 0.0348, + "macro_f1": 0.32098767161369324, + "num_tokens": 1817205.0, + "repeat_count": 0.0, + "routers_loss": 0.05495348572731018, + "skip_count": 1.0, + "step": 1126, + "text_loss": 0.20241330564022064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.295861461696507, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009906140271864173, + "loss": 0.0433, + "macro_f1": 0.4871794879436493, + "num_tokens": 1820141.0, + "repeat_count": 0.0, + "routers_loss": 0.037809282541275024, + "skip_count": 2.0, + "step": 1128, + "text_loss": 0.32965806126594543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.305253889051952, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009905542426205032, + "loss": 0.0348, + "macro_f1": 0.32098767161369324, + "num_tokens": 1824011.0, + "repeat_count": 0.0, + "routers_loss": 0.03320181369781494, + "skip_count": 1.0, + "step": 1130, + "text_loss": 0.36329755187034607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.314646316407397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009904942700732777, + "loss": 0.0335, + "macro_f1": 0.3333333432674408, + "num_tokens": 1826873.0, + "repeat_count": 0.0, + "routers_loss": 0.004102326463907957, + "skip_count": 0.0, + "step": 1132, + "text_loss": 0.6692602038383484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.324038743762841, + "f1_execute": 0.8799999952316284, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.0009904341095677226, + "loss": 0.03, + "macro_f1": 0.29333335161209106, + "num_tokens": 1830103.0, + "repeat_count": 2.0, + "routers_loss": 0.2376193106174469, + "skip_count": 4.0, + "step": 1134, + "text_loss": 0.19212862849235535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.0009903737611268919, + "loss": 0.0445, + "macro_f1": 0.3333333432674408, + "num_tokens": 1833201.0, + "repeat_count": 0.0, + "routers_loss": 0.005253395065665245, + "skip_count": 0.0, + "step": 1136, + "text_loss": 0.6773360371589661 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.34282359847373, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009903132247739107, + "loss": 0.0305, + "macro_f1": 0.3076923191547394, + "num_tokens": 1836045.0, + "repeat_count": 1.0, + "routers_loss": 0.14382585883140564, + "skip_count": 3.0, + "step": 1138, + "text_loss": 0.2882297933101654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.3522160258291755, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.150390625, + "learning_rate": 0.0009902525005319766, + "loss": 0.04, + "macro_f1": 0.5427350401878357, + "num_tokens": 1839721.0, + "repeat_count": 1.0, + "routers_loss": 0.04033960774540901, + "skip_count": 2.0, + "step": 1140, + "text_loss": 0.7172559499740601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12109375, + "learning_rate": 0.0009901915884243597, + "loss": 0.0351, + "macro_f1": 0.6666666865348816, + "num_tokens": 1842614.0, + "repeat_count": 1.0, + "routers_loss": 0.005162308923900127, + "skip_count": 0.0, + "step": 1142, + "text_loss": 0.42892804741859436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.371000880540064, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009901304884744014, + "loss": 0.0386, + "macro_f1": 0.3144654333591461, + "num_tokens": 1845444.0, + "repeat_count": 1.0, + "routers_loss": 0.10117656737565994, + "skip_count": 2.0, + "step": 1144, + "text_loss": 0.20806430280208588 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.380393307895509, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009900692007055152, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 1848558.0, + "repeat_count": 0.0, + "routers_loss": 0.014107038266956806, + "skip_count": 0.0, + "step": 1146, + "text_loss": 0.5355974435806274 + }, + { + "acc_repeat": 0.25, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 5.389785735250954, + "f1_execute": 0.9166666865348816, + "f1_repeat": 0.4000000059604645, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.16015625, + "learning_rate": 0.000990007725141187, + "loss": 0.0449, + "macro_f1": 0.6611111164093018, + "num_tokens": 1852723.0, + "repeat_count": 4.0, + "routers_loss": 0.15537866950035095, + "skip_count": 2.0, + "step": 1148, + "text_loss": 0.6388513445854187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1181640625, + "learning_rate": 0.0009899460618049741, + "loss": 0.0397, + "macro_f1": 0.3333333432674408, + "num_tokens": 1856181.0, + "repeat_count": 0.0, + "routers_loss": 0.011800912208855152, + "skip_count": 0.0, + "step": 1150, + "text_loss": 0.6113069653511047 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 5.408570589961843, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.000989884210720506, + "loss": 0.0331, + "macro_f1": 0.6666666865348816, + "num_tokens": 1859685.0, + "repeat_count": 2.0, + "routers_loss": 0.022900646552443504, + "skip_count": 0.0, + "step": 1152, + "text_loss": 0.25718021392822266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.4179630173172875, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009898221719114844, + "loss": 0.0354, + "macro_f1": 0.3272727429866791, + "num_tokens": 1862505.0, + "repeat_count": 0.0, + "routers_loss": 0.026814989745616913, + "skip_count": 1.0, + "step": 1154, + "text_loss": 0.5426549911499023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009897599454016823, + "loss": 0.0401, + "macro_f1": 0.3333333432674408, + "num_tokens": 1866266.0, + "repeat_count": 0.0, + "routers_loss": 0.0032623792067170143, + "skip_count": 0.0, + "step": 1156, + "text_loss": 0.37752896547317505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.436747872028177, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07080078125, + "learning_rate": 0.0009896975312149454, + "loss": 0.0369, + "macro_f1": 0.3333333432674408, + "num_tokens": 1870216.0, + "repeat_count": 0.0, + "routers_loss": 0.015617577359080315, + "skip_count": 0.0, + "step": 1158, + "text_loss": 0.18207129836082458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009896349293751906, + "loss": 0.0423, + "macro_f1": 0.3272727429866791, + "num_tokens": 1873338.0, + "repeat_count": 0.0, + "routers_loss": 0.02250153198838234, + "skip_count": 1.0, + "step": 1160, + "text_loss": 0.548884391784668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.455532726739067, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009895721399064072, + "loss": 0.0388, + "macro_f1": 0.32098764181137085, + "num_tokens": 1876470.0, + "repeat_count": 1.0, + "routers_loss": 0.055204521864652634, + "skip_count": 1.0, + "step": 1162, + "text_loss": 0.48052409291267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.464925154094511, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009895091628326564, + "loss": 0.0293, + "macro_f1": 0.3333333432674408, + "num_tokens": 1879354.0, + "repeat_count": 0.0, + "routers_loss": 0.009093789383769035, + "skip_count": 0.0, + "step": 1164, + "text_loss": 0.3908069431781769 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.474317581449956, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000989445998178071, + "loss": 0.0323, + "macro_f1": 0.3272727429866791, + "num_tokens": 1881941.0, + "repeat_count": 0.0, + "routers_loss": 0.015086972154676914, + "skip_count": 1.0, + "step": 1166, + "text_loss": 0.4884725511074066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.4837100088054, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009893826459668558, + "loss": 0.0386, + "macro_f1": 0.3144654333591461, + "num_tokens": 1885374.0, + "repeat_count": 0.0, + "routers_loss": 0.06587666273117065, + "skip_count": 3.0, + "step": 1168, + "text_loss": 0.12760137021541595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009893191062232873, + "loss": 0.0322, + "macro_f1": 0.3333333432674408, + "num_tokens": 1888612.0, + "repeat_count": 0.0, + "routers_loss": 0.006088624242693186, + "skip_count": 0.0, + "step": 1170, + "text_loss": 0.4821319580078125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009892553789717143, + "loss": 0.0389, + "macro_f1": 0.3333333432674408, + "num_tokens": 1891463.0, + "repeat_count": 0.0, + "routers_loss": 0.010113578289747238, + "skip_count": 0.0, + "step": 1172, + "text_loss": 0.3613642454147339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.5118872908717345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009891914642365573, + "loss": 0.0404, + "macro_f1": 0.3333333432674408, + "num_tokens": 1894230.0, + "repeat_count": 0.0, + "routers_loss": 0.004947459790855646, + "skip_count": 0.0, + "step": 1174, + "text_loss": 0.5037549138069153 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.521279718227179, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009891273620423083, + "loss": 0.0428, + "macro_f1": 0.3272727429866791, + "num_tokens": 1897294.0, + "repeat_count": 1.0, + "routers_loss": 0.026075217872858047, + "skip_count": 0.0, + "step": 1176, + "text_loss": 0.32558977603912354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.530672145582624, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009890630724135314, + "loss": 0.0351, + "macro_f1": 0.3272727429866791, + "num_tokens": 1901553.0, + "repeat_count": 0.0, + "routers_loss": 0.06650999188423157, + "skip_count": 1.0, + "step": 1178, + "text_loss": 0.23473620414733887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.540064572938069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009889985953748625, + "loss": 0.0268, + "macro_f1": 0.6666666865348816, + "num_tokens": 1904556.0, + "repeat_count": 0.0, + "routers_loss": 0.010361116379499435, + "skip_count": 1.0, + "step": 1180, + "text_loss": 0.6927042007446289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.0009889339309510094, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 1908053.0, + "repeat_count": 0.0, + "routers_loss": 0.013286533765494823, + "skip_count": 0.0, + "step": 1182, + "text_loss": 0.19977325201034546 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 5.558849427648958, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009888690791667518, + "loss": 0.0204, + "macro_f1": 0.7018141150474548, + "num_tokens": 1911754.0, + "repeat_count": 2.0, + "routers_loss": 0.11920545995235443, + "skip_count": 3.0, + "step": 1184, + "text_loss": 0.4072858691215515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.568241855004403, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009888040400469408, + "loss": 0.0391, + "macro_f1": 0.3272727429866791, + "num_tokens": 1914862.0, + "repeat_count": 0.0, + "routers_loss": 0.03652849420905113, + "skip_count": 1.0, + "step": 1186, + "text_loss": 0.2654043138027191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.577634282359847, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1689453125, + "learning_rate": 0.0009887388136164996, + "loss": 0.0336, + "macro_f1": 0.5492662787437439, + "num_tokens": 1918542.0, + "repeat_count": 0.0, + "routers_loss": 0.03991910070180893, + "skip_count": 2.0, + "step": 1188, + "text_loss": 0.21130657196044922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.587026709715292, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.000988673399900423, + "loss": 0.0429, + "macro_f1": 0.3272727429866791, + "num_tokens": 1921589.0, + "repeat_count": 0.0, + "routers_loss": 0.014900135807693005, + "skip_count": 0.0, + "step": 1190, + "text_loss": 0.5519335865974426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.596419137070737, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009886077989237777, + "loss": 0.0405, + "macro_f1": 0.3272727429866791, + "num_tokens": 1924320.0, + "repeat_count": 0.0, + "routers_loss": 0.06271552294492722, + "skip_count": 1.0, + "step": 1192, + "text_loss": 0.213813915848732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 5.6058115644261814, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.1875, + "learning_rate": 0.000988542010711702, + "loss": 0.0342, + "macro_f1": 0.6225374937057495, + "num_tokens": 1927178.0, + "repeat_count": 0.0, + "routers_loss": 0.03081391751766205, + "skip_count": 5.0, + "step": 1194, + "text_loss": 0.7524349093437195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009884760352894064, + "loss": 0.0518, + "macro_f1": 0.3333333432674408, + "num_tokens": 1930216.0, + "repeat_count": 0.0, + "routers_loss": 0.008556773886084557, + "skip_count": 0.0, + "step": 1196, + "text_loss": 0.28230375051498413 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.62459641913707, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009884098726821726, + "loss": 0.0472, + "macro_f1": 0.4871794879436493, + "num_tokens": 1933312.0, + "repeat_count": 3.0, + "routers_loss": 0.05344727262854576, + "skip_count": 0.0, + "step": 1198, + "text_loss": 0.5509607195854187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.633988846492516, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1298828125, + "learning_rate": 0.000988343522915354, + "loss": 0.0441, + "macro_f1": 0.480392187833786, + "num_tokens": 1936160.0, + "repeat_count": 1.0, + "routers_loss": 0.07324771583080292, + "skip_count": 3.0, + "step": 1200, + "text_loss": 0.30565372109413147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 25.0, + "epoch": 5.64338127384796, + "f1_execute": 0.8936169743537903, + "f1_repeat": 0.0, + "f1_skip": 0.444444477558136, + "grad_norm": 0.2470703125, + "learning_rate": 0.0009882769860143764, + "loss": 0.0317, + "macro_f1": 0.4460204839706421, + "num_tokens": 1939266.0, + "repeat_count": 0.0, + "routers_loss": 0.18620699644088745, + "skip_count": 6.0, + "step": 1202, + "text_loss": 0.976121723651886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.6527737012034045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000988210262004737, + "loss": 0.0474, + "macro_f1": 0.6666666865348816, + "num_tokens": 1942173.0, + "repeat_count": 0.0, + "routers_loss": 0.007703613489866257, + "skip_count": 1.0, + "step": 1204, + "text_loss": 0.5647401809692383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.66216612855885, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1484375, + "learning_rate": 0.0009881433509120036, + "loss": 0.0376, + "macro_f1": 0.5492662787437439, + "num_tokens": 1945071.0, + "repeat_count": 0.0, + "routers_loss": 0.02162683941423893, + "skip_count": 2.0, + "step": 1206, + "text_loss": 0.24229218065738678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.671558555914294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009880762527618176, + "loss": 0.0383, + "macro_f1": 0.3333333432674408, + "num_tokens": 1949060.0, + "repeat_count": 0.0, + "routers_loss": 0.017667081207036972, + "skip_count": 0.0, + "step": 1208, + "text_loss": 0.4035970866680145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009880089675798908, + "loss": 0.0367, + "macro_f1": 0.3333333432674408, + "num_tokens": 1951698.0, + "repeat_count": 0.0, + "routers_loss": 0.006405784282833338, + "skip_count": 0.0, + "step": 1210, + "text_loss": 0.5319879055023193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.690343410625183, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009879414953920071, + "loss": 0.0294, + "macro_f1": 0.3333333432674408, + "num_tokens": 1955266.0, + "repeat_count": 0.0, + "routers_loss": 0.009859707206487656, + "skip_count": 0.0, + "step": 1212, + "text_loss": 0.6687407493591309 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.699735837980628, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.130859375, + "learning_rate": 0.0009878738362240219, + "loss": 0.045, + "macro_f1": 0.5492662787437439, + "num_tokens": 1958538.0, + "repeat_count": 0.0, + "routers_loss": 0.030890554189682007, + "skip_count": 2.0, + "step": 1214, + "text_loss": 0.20820017158985138 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 5.709128265336073, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000987805990101862, + "loss": 0.0317, + "macro_f1": 0.47333335876464844, + "num_tokens": 1961419.0, + "repeat_count": 2.0, + "routers_loss": 0.10383198410272598, + "skip_count": 2.0, + "step": 1216, + "text_loss": 0.8664976358413696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.718520692691517, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009877379570515268, + "loss": 0.0366, + "macro_f1": 0.3333333432674408, + "num_tokens": 1964836.0, + "repeat_count": 0.0, + "routers_loss": 0.013376163318753242, + "skip_count": 0.0, + "step": 1218, + "text_loss": 0.4223395884037018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0859375, + "learning_rate": 0.0009876697370990865, + "loss": 0.0343, + "macro_f1": 0.3333333432674408, + "num_tokens": 1967620.0, + "repeat_count": 0.0, + "routers_loss": 0.008577900938689709, + "skip_count": 0.0, + "step": 1220, + "text_loss": 0.4789901375770569 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009876013302706828, + "loss": 0.049, + "macro_f1": 0.3333333432674408, + "num_tokens": 1971100.0, + "repeat_count": 0.0, + "routers_loss": 0.004730266984552145, + "skip_count": 0.0, + "step": 1222, + "text_loss": 0.6799837946891785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.7466979747578515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009875327365925295, + "loss": 0.0341, + "macro_f1": 0.3333333432674408, + "num_tokens": 1974408.0, + "repeat_count": 0.0, + "routers_loss": 0.010849526152014732, + "skip_count": 0.0, + "step": 1224, + "text_loss": 0.18967926502227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.756090402113296, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009874639560909118, + "loss": 0.0498, + "macro_f1": 0.32098767161369324, + "num_tokens": 1977046.0, + "repeat_count": 0.0, + "routers_loss": 0.04841252416372299, + "skip_count": 1.0, + "step": 1226, + "text_loss": 0.6133310198783875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.765482829468741, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.0009873949887921867, + "loss": 0.0402, + "macro_f1": 0.3272727429866791, + "num_tokens": 1980330.0, + "repeat_count": 0.0, + "routers_loss": 0.029638588428497314, + "skip_count": 1.0, + "step": 1228, + "text_loss": 0.15649555623531342 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.774875256824186, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009873258347227823, + "loss": 0.0331, + "macro_f1": 0.3272727429866791, + "num_tokens": 1983173.0, + "repeat_count": 0.0, + "routers_loss": 0.009955910965800285, + "skip_count": 0.0, + "step": 1230, + "text_loss": 0.4741005599498749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009872564939091989, + "loss": 0.0342, + "macro_f1": 0.3333333432674408, + "num_tokens": 1986825.0, + "repeat_count": 0.0, + "routers_loss": 0.010205300524830818, + "skip_count": 0.0, + "step": 1232, + "text_loss": 0.5315462350845337 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5714285969734192, + "avg_layers": 25.0, + "epoch": 5.7936601115350745, + "f1_execute": 0.9302325248718262, + "f1_repeat": 1.0, + "f1_skip": 0.7272727489471436, + "grad_norm": 0.11865234375, + "learning_rate": 0.0009871869663780077, + "loss": 0.0336, + "macro_f1": 0.8858351111412048, + "num_tokens": 1990448.0, + "repeat_count": 1.0, + "routers_loss": 0.09120134264230728, + "skip_count": 7.0, + "step": 1234, + "text_loss": 0.6187508702278137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.125, + "learning_rate": 0.0009871172521558522, + "loss": 0.0475, + "macro_f1": 0.6666666865348816, + "num_tokens": 1993474.0, + "repeat_count": 0.0, + "routers_loss": 0.016188839450478554, + "skip_count": 1.0, + "step": 1236, + "text_loss": 0.20783066749572754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 5.812444966245964, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.216796875, + "learning_rate": 0.0009870473512694465, + "loss": 0.0373, + "macro_f1": 0.5934640765190125, + "num_tokens": 1996536.0, + "repeat_count": 0.0, + "routers_loss": 0.05046704784035683, + "skip_count": 3.0, + "step": 1238, + "text_loss": 0.247748002409935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.821837393601409, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.09033203125, + "learning_rate": 0.0009869772637455772, + "loss": 0.0251, + "macro_f1": 0.4871794879436493, + "num_tokens": 1999530.0, + "repeat_count": 0.0, + "routers_loss": 0.044926248490810394, + "skip_count": 2.0, + "step": 1240, + "text_loss": 0.26001980900764465 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.831229820956853, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.000986906989611102, + "loss": 0.0446, + "macro_f1": 0.3272727429866791, + "num_tokens": 2002782.0, + "repeat_count": 0.0, + "routers_loss": 0.025911526754498482, + "skip_count": 0.0, + "step": 1242, + "text_loss": 0.9009982943534851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.8406222483122985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009868365288929492, + "loss": 0.0371, + "macro_f1": 0.3333333432674408, + "num_tokens": 2005331.0, + "repeat_count": 0.0, + "routers_loss": 0.0043760035187006, + "skip_count": 0.0, + "step": 1244, + "text_loss": 0.5547386407852173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009867658816181206, + "loss": 0.0374, + "macro_f1": 0.3333333432674408, + "num_tokens": 2008115.0, + "repeat_count": 0.0, + "routers_loss": 0.009227181784808636, + "skip_count": 0.0, + "step": 1246, + "text_loss": 1.0067731142044067 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.000986695047813688, + "loss": 0.0261, + "macro_f1": 0.3272727429866791, + "num_tokens": 2011137.0, + "repeat_count": 1.0, + "routers_loss": 0.023822437971830368, + "skip_count": 0.0, + "step": 1248, + "text_loss": 0.30058956146240234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.868799530378633, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009866240275067948, + "loss": 0.044, + "macro_f1": 0.47333335876464844, + "num_tokens": 2014159.0, + "repeat_count": 2.0, + "routers_loss": 0.21523773670196533, + "skip_count": 3.0, + "step": 1250, + "text_loss": 0.39072203636169434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.878191957734077, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009865528207246563, + "loss": 0.0351, + "macro_f1": 0.5492662787437439, + "num_tokens": 2017731.0, + "repeat_count": 0.0, + "routers_loss": 0.06184682995080948, + "skip_count": 2.0, + "step": 1252, + "text_loss": 0.35751575231552124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.8875843850895215, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.166015625, + "learning_rate": 0.000986481427494559, + "loss": 0.0336, + "macro_f1": 0.3333333432674408, + "num_tokens": 2020485.0, + "repeat_count": 0.0, + "routers_loss": 0.007573372684419155, + "skip_count": 0.0, + "step": 1254, + "text_loss": 0.4061077833175659 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.896976812444966, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.000986409847843861, + "loss": 0.0382, + "macro_f1": 0.3272727429866791, + "num_tokens": 2024149.0, + "repeat_count": 1.0, + "routers_loss": 0.07447971403598785, + "skip_count": 0.0, + "step": 1256, + "text_loss": 0.41876497864723206 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.906369239800411, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000986338081799992, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 2026545.0, + "repeat_count": 0.0, + "routers_loss": 0.006609147880226374, + "skip_count": 0.0, + "step": 1258, + "text_loss": 0.4673794209957123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.915761667155856, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009862661293904523, + "loss": 0.0498, + "macro_f1": 0.32098764181137085, + "num_tokens": 2029581.0, + "repeat_count": 0.0, + "routers_loss": 0.10624702274799347, + "skip_count": 2.0, + "step": 1260, + "text_loss": 0.3483233153820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009861939906428145, + "loss": 0.0525, + "macro_f1": 0.3333333432674408, + "num_tokens": 2033936.0, + "repeat_count": 0.0, + "routers_loss": 0.007944886572659016, + "skip_count": 0.0, + "step": 1262, + "text_loss": 0.16362667083740234 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009861216655847225, + "loss": 0.0376, + "macro_f1": 0.6666666865348816, + "num_tokens": 2037876.0, + "repeat_count": 1.0, + "routers_loss": 0.007004092447459698, + "skip_count": 0.0, + "step": 1264, + "text_loss": 0.43228110671043396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.94393894922219, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009860491542438912, + "loss": 0.047, + "macro_f1": 0.3272727429866791, + "num_tokens": 2040842.0, + "repeat_count": 0.0, + "routers_loss": 0.026916226372122765, + "skip_count": 1.0, + "step": 1266, + "text_loss": 0.5901188850402832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.953331376577634, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.000985976456648107, + "loss": 0.0353, + "macro_f1": 0.3333333432674408, + "num_tokens": 2043890.0, + "repeat_count": 0.0, + "routers_loss": 0.007325216196477413, + "skip_count": 0.0, + "step": 1268, + "text_loss": 0.8780109882354736 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.962723803933079, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.10205078125, + "learning_rate": 0.000985903572825228, + "loss": 0.0306, + "macro_f1": 0.4871794879436493, + "num_tokens": 2048848.0, + "repeat_count": 0.0, + "routers_loss": 0.05007527023553848, + "skip_count": 2.0, + "step": 1270, + "text_loss": 0.5863722562789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.972116231288524, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.000985830502803183, + "loss": 0.0396, + "macro_f1": 0.3272727429866791, + "num_tokens": 2051561.0, + "repeat_count": 0.0, + "routers_loss": 0.023995524272322655, + "skip_count": 0.0, + "step": 1272, + "text_loss": 0.7460709810256958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.9815086586439685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009857572466099732, + "loss": 0.0431, + "macro_f1": 0.3333333432674408, + "num_tokens": 2054752.0, + "repeat_count": 0.0, + "routers_loss": 0.006928362417966127, + "skip_count": 0.0, + "step": 1274, + "text_loss": 0.5130293369293213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.162109375, + "learning_rate": 0.0009856838042736698, + "loss": 0.0501, + "macro_f1": 0.3333333432674408, + "num_tokens": 2058151.0, + "repeat_count": 0.0, + "routers_loss": 0.006969396956264973, + "skip_count": 0.0, + "step": 1276, + "text_loss": 0.5911393761634827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009856101758224166, + "loss": 0.0441, + "macro_f1": 0.3333333432674408, + "num_tokens": 2061012.0, + "repeat_count": 0.0, + "routers_loss": 0.003499418031424284, + "skip_count": 0.0, + "step": 1278, + "text_loss": 0.25347545742988586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.000985536361284428, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2064597.0, + "repeat_count": 0.0, + "routers_loss": 0.007856054231524467, + "skip_count": 0.0, + "step": 1280, + "text_loss": 0.7476963400840759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.01878485471089, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009854623606879898, + "loss": 0.0245, + "macro_f1": 0.3272727429866791, + "num_tokens": 2067972.0, + "repeat_count": 0.0, + "routers_loss": 0.02617792971432209, + "skip_count": 1.0, + "step": 1282, + "text_loss": 0.5775872468948364 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.028177282066334, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.000985388174061459, + "loss": 0.0356, + "macro_f1": 0.32098767161369324, + "num_tokens": 2071812.0, + "repeat_count": 0.0, + "routers_loss": 0.035979997366666794, + "skip_count": 1.0, + "step": 1284, + "text_loss": 0.2933400869369507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.037569709421779, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0009853138014332646, + "loss": 0.0273, + "macro_f1": 0.3333333432674408, + "num_tokens": 2074868.0, + "repeat_count": 0.0, + "routers_loss": 0.005142854526638985, + "skip_count": 0.0, + "step": 1286, + "text_loss": 0.29085102677345276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.0009852392428319058, + "loss": 0.0306, + "macro_f1": 0.3333333432674408, + "num_tokens": 2078225.0, + "repeat_count": 0.0, + "routers_loss": 0.0032799106556922197, + "skip_count": 0.0, + "step": 1288, + "text_loss": 0.7293626070022583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 6.056354564132668, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009851644982859537, + "loss": 0.0273, + "macro_f1": 0.480392187833786, + "num_tokens": 2081495.0, + "repeat_count": 1.0, + "routers_loss": 0.12224318832159042, + "skip_count": 3.0, + "step": 1290, + "text_loss": 0.26125892996788025 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.065746991488113, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009850895678240508, + "loss": 0.0283, + "macro_f1": 0.6666666865348816, + "num_tokens": 2084390.0, + "repeat_count": 1.0, + "routers_loss": 0.010662888176739216, + "skip_count": 0.0, + "step": 1292, + "text_loss": 0.3510764539241791 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.075139418843557, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1689453125, + "learning_rate": 0.0009850144514749104, + "loss": 0.0332, + "macro_f1": 0.5492662787437439, + "num_tokens": 2087210.0, + "repeat_count": 0.0, + "routers_loss": 0.01979079470038414, + "skip_count": 2.0, + "step": 1294, + "text_loss": 0.40202176570892334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.084531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.000984939149267317, + "loss": 0.0253, + "macro_f1": 0.6666666865348816, + "num_tokens": 2090777.0, + "repeat_count": 0.0, + "routers_loss": 0.005172552540898323, + "skip_count": 1.0, + "step": 1296, + "text_loss": 0.5275651216506958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.093924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009848636612301272, + "loss": 0.0299, + "macro_f1": 0.3333333432674408, + "num_tokens": 2094248.0, + "repeat_count": 0.0, + "routers_loss": 0.0029599082190543413, + "skip_count": 0.0, + "step": 1298, + "text_loss": 0.4517653286457062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0009847879873922675, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 2097139.0, + "repeat_count": 0.0, + "routers_loss": 0.011455860920250416, + "skip_count": 0.0, + "step": 1300, + "text_loss": 0.16888445615768433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.112709128265336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.0009847121277827366, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 2100415.0, + "repeat_count": 0.0, + "routers_loss": 0.008091195486485958, + "skip_count": 0.0, + "step": 1302, + "text_loss": 0.40061676502227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.122101555620781, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.000984636082430604, + "loss": 0.0285, + "macro_f1": 0.3333333432674408, + "num_tokens": 2103285.0, + "repeat_count": 0.0, + "routers_loss": 0.009593960829079151, + "skip_count": 0.0, + "step": 1304, + "text_loss": 0.7211073637008667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.107421875, + "learning_rate": 0.0009845598513650103, + "loss": 0.0231, + "macro_f1": 0.3333333432674408, + "num_tokens": 2106255.0, + "repeat_count": 0.0, + "routers_loss": 0.0023068038281053305, + "skip_count": 0.0, + "step": 1306, + "text_loss": 0.7077119946479797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009844834346151674, + "loss": 0.043, + "macro_f1": 0.3333333432674408, + "num_tokens": 2109305.0, + "repeat_count": 0.0, + "routers_loss": 0.007703019306063652, + "skip_count": 0.0, + "step": 1308, + "text_loss": 0.3534316122531891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.1502788376871145, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009844068322103585, + "loss": 0.0287, + "macro_f1": 0.3272727429866791, + "num_tokens": 2112216.0, + "repeat_count": 0.0, + "routers_loss": 0.023549847304821014, + "skip_count": 1.0, + "step": 1310, + "text_loss": 0.6792599558830261 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009843300441799378, + "loss": 0.0211, + "macro_f1": 0.3333333432674408, + "num_tokens": 2114925.0, + "repeat_count": 0.0, + "routers_loss": 0.007605871185660362, + "skip_count": 0.0, + "step": 1312, + "text_loss": 0.1571389138698578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.169063692398004, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009842530705533304, + "loss": 0.0253, + "macro_f1": 0.3272727429866791, + "num_tokens": 2117744.0, + "repeat_count": 0.0, + "routers_loss": 0.014964760281145573, + "skip_count": 0.0, + "step": 1314, + "text_loss": 0.7840361595153809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.000984175911360033, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2120848.0, + "repeat_count": 0.0, + "routers_loss": 0.004663798492401838, + "skip_count": 0.0, + "step": 1316, + "text_loss": 0.536246120929718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.187848547108893, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1201171875, + "learning_rate": 0.000984098566629613, + "loss": 0.0288, + "macro_f1": 0.5492662787437439, + "num_tokens": 2123651.0, + "repeat_count": 0.0, + "routers_loss": 0.022852955386042595, + "skip_count": 2.0, + "step": 1318, + "text_loss": 0.43372172117233276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.197240974464338, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009840210363917087, + "loss": 0.0216, + "macro_f1": 0.3333333432674408, + "num_tokens": 2128011.0, + "repeat_count": 0.0, + "routers_loss": 0.012578422203660011, + "skip_count": 0.0, + "step": 1320, + "text_loss": 0.28190380334854126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009839433206760306, + "loss": 0.0204, + "macro_f1": 0.3333333432674408, + "num_tokens": 2131035.0, + "repeat_count": 0.0, + "routers_loss": 0.006863643880933523, + "skip_count": 0.0, + "step": 1322, + "text_loss": 0.6340444087982178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.216025829175227, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.0009838654195123589, + "loss": 0.0243, + "macro_f1": 0.3333333432674408, + "num_tokens": 2133856.0, + "repeat_count": 0.0, + "routers_loss": 0.00468854233622551, + "skip_count": 0.0, + "step": 1324, + "text_loss": 0.5138425827026367 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009837873329305458, + "loss": 0.0396, + "macro_f1": 0.6666666865348816, + "num_tokens": 2136451.0, + "repeat_count": 1.0, + "routers_loss": 0.005731126759201288, + "skip_count": 0.0, + "step": 1326, + "text_loss": 0.742124617099762 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000983709060960514, + "loss": 0.0416, + "macro_f1": 0.3333333432674408, + "num_tokens": 2139496.0, + "repeat_count": 0.0, + "routers_loss": 0.0056343949399888515, + "skip_count": 0.0, + "step": 1328, + "text_loss": 0.7317464351654053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.2442031112415615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009836306036322576, + "loss": 0.0312, + "macro_f1": 0.3333333432674408, + "num_tokens": 2143120.0, + "repeat_count": 0.0, + "routers_loss": 0.005127966403961182, + "skip_count": 0.0, + "step": 1330, + "text_loss": 0.538652241230011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 6.253595538597006, + "f1_execute": 0.9130434989929199, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009835519609758415, + "loss": 0.0301, + "macro_f1": 0.590062141418457, + "num_tokens": 2145807.0, + "repeat_count": 3.0, + "routers_loss": 0.1673707216978073, + "skip_count": 4.0, + "step": 1332, + "text_loss": 0.3498198091983795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009834731330214017, + "loss": 0.0293, + "macro_f1": 0.3272727429866791, + "num_tokens": 2148397.0, + "repeat_count": 1.0, + "routers_loss": 0.04026653990149498, + "skip_count": 0.0, + "step": 1334, + "text_loss": 0.8153424859046936 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 27.0, + "epoch": 6.272380393307896, + "f1_execute": 0.8999999761581421, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.8000000715255737, + "grad_norm": 0.16015625, + "learning_rate": 0.0009833941197991455, + "loss": 0.0329, + "macro_f1": 0.7888889312744141, + "num_tokens": 2152226.0, + "repeat_count": 2.0, + "routers_loss": 0.05481519177556038, + "skip_count": 5.0, + "step": 1336, + "text_loss": 0.7802760004997253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.28177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009833149213393506, + "loss": 0.0304, + "macro_f1": 0.3272727429866791, + "num_tokens": 2156023.0, + "repeat_count": 0.0, + "routers_loss": 0.01760484278202057, + "skip_count": 0.0, + "step": 1338, + "text_loss": 0.19721226394176483 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.2911652480187845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.000983235537672366, + "loss": 0.0256, + "macro_f1": 0.3333333432674408, + "num_tokens": 2160037.0, + "repeat_count": 0.0, + "routers_loss": 0.013206037692725658, + "skip_count": 0.0, + "step": 1340, + "text_loss": 0.5003817081451416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.000983155968828612, + "loss": 0.0315, + "macro_f1": 0.6666666865348816, + "num_tokens": 2163910.0, + "repeat_count": 1.0, + "routers_loss": 0.01256406120955944, + "skip_count": 0.0, + "step": 1342, + "text_loss": 0.5996923446655273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.309950102729674, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009830762148385793, + "loss": 0.0313, + "macro_f1": 0.3272727429866791, + "num_tokens": 2166921.0, + "repeat_count": 0.0, + "routers_loss": 0.015086234547197819, + "skip_count": 1.0, + "step": 1344, + "text_loss": 0.45356282591819763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.319342530085119, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0009829962757328297, + "loss": 0.0223, + "macro_f1": 0.32098764181137085, + "num_tokens": 2170135.0, + "repeat_count": 0.0, + "routers_loss": 0.07909081131219864, + "skip_count": 2.0, + "step": 1346, + "text_loss": 0.2874644994735718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 6.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009829161515419959, + "loss": 0.0246, + "macro_f1": 0.6666666865348816, + "num_tokens": 2173029.0, + "repeat_count": 0.0, + "routers_loss": 0.013569854199886322, + "skip_count": 2.0, + "step": 1348, + "text_loss": 0.25533875823020935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.3381273847960085, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009828358422967823, + "loss": 0.0226, + "macro_f1": 0.32098764181137085, + "num_tokens": 2176605.0, + "repeat_count": 1.0, + "routers_loss": 0.08111091703176498, + "skip_count": 1.0, + "step": 1350, + "text_loss": 0.32827726006507874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 6.347519812151453, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.091796875, + "learning_rate": 0.0009827553480279627, + "loss": 0.03, + "macro_f1": 0.5427350401878357, + "num_tokens": 2179406.0, + "repeat_count": 0.0, + "routers_loss": 0.026550088077783585, + "skip_count": 2.0, + "step": 1352, + "text_loss": 0.2966301143169403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009826746687663832, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 2182353.0, + "repeat_count": 0.0, + "routers_loss": 0.003914554137736559, + "skip_count": 0.0, + "step": 1354, + "text_loss": 0.7596251964569092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 6.366304666862343, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0859375, + "learning_rate": 0.0009825938045429602, + "loss": 0.0324, + "macro_f1": 0.5866667032241821, + "num_tokens": 2185786.0, + "repeat_count": 1.0, + "routers_loss": 0.059612665325403214, + "skip_count": 3.0, + "step": 1356, + "text_loss": 0.12325898557901382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.375697094217787, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10009765625, + "learning_rate": 0.0009825127553886807, + "loss": 0.0375, + "macro_f1": 0.3333333432674408, + "num_tokens": 2190157.0, + "repeat_count": 0.0, + "routers_loss": 0.0071132429875433445, + "skip_count": 0.0, + "step": 1358, + "text_loss": 0.9287898540496826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009824315213346033, + "loss": 0.0348, + "macro_f1": 0.3333333432674408, + "num_tokens": 2193077.0, + "repeat_count": 0.0, + "routers_loss": 0.009611099027097225, + "skip_count": 0.0, + "step": 1360, + "text_loss": 0.20427259802818298 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.394481948928676, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009823501024118569, + "loss": 0.0285, + "macro_f1": 0.3333333432674408, + "num_tokens": 2196494.0, + "repeat_count": 0.0, + "routers_loss": 0.006913455203175545, + "skip_count": 0.0, + "step": 1362, + "text_loss": 0.574759840965271 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.403874376284121, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009822684986516411, + "loss": 0.0245, + "macro_f1": 0.3333333432674408, + "num_tokens": 2199839.0, + "repeat_count": 0.0, + "routers_loss": 0.009208920411765575, + "skip_count": 0.0, + "step": 1364, + "text_loss": 0.42422571778297424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.413266803639566, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.000982186710085227, + "loss": 0.0208, + "macro_f1": 0.32098764181137085, + "num_tokens": 2203212.0, + "repeat_count": 1.0, + "routers_loss": 0.059975091367959976, + "skip_count": 1.0, + "step": 1366, + "text_loss": 0.29213017225265503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 6.42265923099501, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.181640625, + "learning_rate": 0.0009821047367439561, + "loss": 0.0358, + "macro_f1": 0.44705885648727417, + "num_tokens": 2206240.0, + "repeat_count": 0.0, + "routers_loss": 0.048244867473840714, + "skip_count": 4.0, + "step": 1368, + "text_loss": 0.3072395324707031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.432051658350455, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009820225786592405, + "loss": 0.0375, + "macro_f1": 0.3272727429866791, + "num_tokens": 2209903.0, + "repeat_count": 1.0, + "routers_loss": 0.026068156585097313, + "skip_count": 0.0, + "step": 1370, + "text_loss": 0.5961400270462036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.4414440857059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.109375, + "learning_rate": 0.0009819402358625634, + "loss": 0.0366, + "macro_f1": 0.3272727429866791, + "num_tokens": 2213439.0, + "repeat_count": 0.0, + "routers_loss": 0.022615568712353706, + "skip_count": 1.0, + "step": 1372, + "text_loss": 0.19375644624233246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.450836513061344, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.000981857708385479, + "loss": 0.0346, + "macro_f1": 0.3333333432674408, + "num_tokens": 2216457.0, + "repeat_count": 0.0, + "routers_loss": 0.005855285096913576, + "skip_count": 0.0, + "step": 1374, + "text_loss": 0.5123368501663208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.460228940416789, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009817749962596114, + "loss": 0.0249, + "macro_f1": 0.3272727429866791, + "num_tokens": 2219975.0, + "repeat_count": 1.0, + "routers_loss": 0.0651634931564331, + "skip_count": 0.0, + "step": 1376, + "text_loss": 0.5999220609664917 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009816920995166568, + "loss": 0.0371, + "macro_f1": 0.6666666865348816, + "num_tokens": 2222833.0, + "repeat_count": 1.0, + "routers_loss": 0.011408994905650616, + "skip_count": 0.0, + "step": 1378, + "text_loss": 0.5323230624198914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.4790137951276785, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.0009816090181883807, + "loss": 0.0313, + "macro_f1": 0.32098764181137085, + "num_tokens": 2225842.0, + "repeat_count": 0.0, + "routers_loss": 0.039720915257930756, + "skip_count": 2.0, + "step": 1380, + "text_loss": 0.23363439738750458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009815257523066204, + "loss": 0.0249, + "macro_f1": 0.3333333432674408, + "num_tokens": 2229430.0, + "repeat_count": 0.0, + "routers_loss": 0.002765297656878829, + "skip_count": 0.0, + "step": 1382, + "text_loss": 0.718977689743042 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.497798649838567, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009814423019032835, + "loss": 0.0396, + "macro_f1": 0.5492662787437439, + "num_tokens": 2232594.0, + "repeat_count": 2.0, + "routers_loss": 0.05362323671579361, + "skip_count": 0.0, + "step": 1384, + "text_loss": 0.6392166614532471 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.507191077194013, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009813586670103483, + "loss": 0.0426, + "macro_f1": 0.6603773832321167, + "num_tokens": 2236327.0, + "repeat_count": 1.0, + "routers_loss": 0.031728316098451614, + "skip_count": 1.0, + "step": 1386, + "text_loss": 0.5951619148254395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.126953125, + "learning_rate": 0.0009812748476598638, + "loss": 0.031, + "macro_f1": 0.5492662787437439, + "num_tokens": 2239746.0, + "repeat_count": 0.0, + "routers_loss": 0.03981253132224083, + "skip_count": 2.0, + "step": 1388, + "text_loss": 0.22756551206111908 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.5259759319049016, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009811908438839498, + "loss": 0.0331, + "macro_f1": 0.5492662787437439, + "num_tokens": 2242786.0, + "repeat_count": 0.0, + "routers_loss": 0.04617162421345711, + "skip_count": 2.0, + "step": 1390, + "text_loss": 0.3233799934387207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.535368359260346, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.000981106655714797, + "loss": 0.0358, + "macro_f1": 0.3272727429866791, + "num_tokens": 2245696.0, + "repeat_count": 0.0, + "routers_loss": 0.046828847378492355, + "skip_count": 1.0, + "step": 1392, + "text_loss": 0.24273279309272766 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.544760786615791, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009810222831846656, + "loss": 0.0307, + "macro_f1": 0.5492662787437439, + "num_tokens": 2249326.0, + "repeat_count": 0.0, + "routers_loss": 0.010921589098870754, + "skip_count": 2.0, + "step": 1394, + "text_loss": 0.3921460807323456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.554153213971236, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009809377263258882, + "loss": 0.0315, + "macro_f1": 0.32098767161369324, + "num_tokens": 2253393.0, + "repeat_count": 0.0, + "routers_loss": 0.04564022272825241, + "skip_count": 1.0, + "step": 1396, + "text_loss": 0.582602858543396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.000980852985170867, + "loss": 0.0328, + "macro_f1": 0.3272727429866791, + "num_tokens": 2256626.0, + "repeat_count": 0.0, + "routers_loss": 0.013289985246956348, + "skip_count": 0.0, + "step": 1398, + "text_loss": 0.41031694412231445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.5729380686821255, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009807680597520745, + "loss": 0.0264, + "macro_f1": 0.3333333432674408, + "num_tokens": 2259326.0, + "repeat_count": 0.0, + "routers_loss": 0.0065213534981012344, + "skip_count": 0.0, + "step": 1400, + "text_loss": 0.2888098657131195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.58233049603757, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0009806829501020546, + "loss": 0.0358, + "macro_f1": 0.3272727429866791, + "num_tokens": 2262344.0, + "repeat_count": 0.0, + "routers_loss": 0.04199840500950813, + "skip_count": 1.0, + "step": 1402, + "text_loss": 0.31973034143447876 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.591722923393014, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009805976562534215, + "loss": 0.0317, + "macro_f1": 0.6603773832321167, + "num_tokens": 2266354.0, + "repeat_count": 1.0, + "routers_loss": 0.015434930101037025, + "skip_count": 1.0, + "step": 1404, + "text_loss": 0.508630633354187 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 6.601115350748459, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009805121782388599, + "loss": 0.0339, + "macro_f1": 0.6533333659172058, + "num_tokens": 2269660.0, + "repeat_count": 2.0, + "routers_loss": 0.0720924660563469, + "skip_count": 2.0, + "step": 1406, + "text_loss": 0.40927737951278687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.610507778103904, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009804265160911253, + "loss": 0.0266, + "macro_f1": 0.5492662787437439, + "num_tokens": 2273335.0, + "repeat_count": 0.0, + "routers_loss": 0.02400495670735836, + "skip_count": 2.0, + "step": 1408, + "text_loss": 0.1777762621641159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.6199002054593485, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2314453125, + "learning_rate": 0.0009803406698430433, + "loss": 0.0371, + "macro_f1": 0.3272727429866791, + "num_tokens": 2277107.0, + "repeat_count": 0.0, + "routers_loss": 0.02560107782483101, + "skip_count": 1.0, + "step": 1410, + "text_loss": 0.17955881357192993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.629292632814793, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009802546395275104, + "loss": 0.0349, + "macro_f1": 0.3333333432674408, + "num_tokens": 2281638.0, + "repeat_count": 0.0, + "routers_loss": 0.006655813194811344, + "skip_count": 0.0, + "step": 1412, + "text_loss": 0.20882295072078705 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 6.638685060170237, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.08740234375, + "learning_rate": 0.000980168425177494, + "loss": 0.0342, + "macro_f1": 0.8200000524520874, + "num_tokens": 2284876.0, + "repeat_count": 1.0, + "routers_loss": 0.06325097382068634, + "skip_count": 3.0, + "step": 1414, + "text_loss": 0.26035264134407043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.000980082026826031, + "loss": 0.0315, + "macro_f1": 0.3272727429866791, + "num_tokens": 2288938.0, + "repeat_count": 1.0, + "routers_loss": 0.013436575420200825, + "skip_count": 0.0, + "step": 1416, + "text_loss": 0.5502325892448425 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.657469914881127, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0009799954445062296, + "loss": 0.0193, + "macro_f1": 0.6603773832321167, + "num_tokens": 2292317.0, + "repeat_count": 1.0, + "routers_loss": 0.011264479719102383, + "skip_count": 1.0, + "step": 1418, + "text_loss": 0.48075684905052185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.666862342236572, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009799086782512686, + "loss": 0.0292, + "macro_f1": 0.5492662787437439, + "num_tokens": 2295935.0, + "repeat_count": 0.0, + "routers_loss": 0.02833271212875843, + "skip_count": 2.0, + "step": 1420, + "text_loss": 0.18221206963062286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09375, + "learning_rate": 0.0009798217280943967, + "loss": 0.0356, + "macro_f1": 0.6666666865348816, + "num_tokens": 2298927.0, + "repeat_count": 0.0, + "routers_loss": 0.009208574891090393, + "skip_count": 1.0, + "step": 1422, + "text_loss": 0.48686322569847107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.685647196947461, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009797345940689335, + "loss": 0.0267, + "macro_f1": 0.3272727429866791, + "num_tokens": 2301541.0, + "repeat_count": 0.0, + "routers_loss": 0.015011847950518131, + "skip_count": 0.0, + "step": 1424, + "text_loss": 0.49446266889572144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.4000000059604645, + "avg_layers": 26.0, + "epoch": 6.695039624302906, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.0, + "f1_skip": 0.5714285969734192, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009796472762082687, + "loss": 0.0338, + "macro_f1": 0.5034013986587524, + "num_tokens": 2304589.0, + "repeat_count": 0.0, + "routers_loss": 0.05912091210484505, + "skip_count": 5.0, + "step": 1426, + "text_loss": 0.23945684731006622 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.70443205165835, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.000979559774545863, + "loss": 0.0405, + "macro_f1": 0.3272727429866791, + "num_tokens": 2307860.0, + "repeat_count": 0.0, + "routers_loss": 0.021242303773760796, + "skip_count": 1.0, + "step": 1428, + "text_loss": 0.531273365020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.000979472089115247, + "loss": 0.0276, + "macro_f1": 0.32098764181137085, + "num_tokens": 2311581.0, + "repeat_count": 0.0, + "routers_loss": 0.02768544852733612, + "skip_count": 2.0, + "step": 1430, + "text_loss": 0.2497459501028061 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.000979384219950022, + "loss": 0.0346, + "macro_f1": 0.3333333432674408, + "num_tokens": 2314639.0, + "repeat_count": 0.0, + "routers_loss": 0.008678150363266468, + "skip_count": 0.0, + "step": 1432, + "text_loss": 0.6579355001449585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.732609333724684, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08056640625, + "learning_rate": 0.0009792961670838595, + "loss": 0.0362, + "macro_f1": 0.3272727429866791, + "num_tokens": 2317927.0, + "repeat_count": 1.0, + "routers_loss": 0.03325597569346428, + "skip_count": 0.0, + "step": 1434, + "text_loss": 0.5209436416625977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.742001761080129, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009792079305505016, + "loss": 0.0306, + "macro_f1": 0.3272727429866791, + "num_tokens": 2321065.0, + "repeat_count": 1.0, + "routers_loss": 0.019228918477892876, + "skip_count": 0.0, + "step": 1436, + "text_loss": 0.41087067127227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.000979119510383761, + "loss": 0.0371, + "macro_f1": 0.3333333432674408, + "num_tokens": 2323714.0, + "repeat_count": 0.0, + "routers_loss": 0.017071325331926346, + "skip_count": 0.0, + "step": 1438, + "text_loss": 0.21490029990673065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.760786615791019, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.00097903090661752, + "loss": 0.0309, + "macro_f1": 0.3333333432674408, + "num_tokens": 2326454.0, + "repeat_count": 0.0, + "routers_loss": 0.00991755723953247, + "skip_count": 0.0, + "step": 1440, + "text_loss": 0.23847346007823944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.232421875, + "learning_rate": 0.000978942119285732, + "loss": 0.0404, + "macro_f1": 0.3272727429866791, + "num_tokens": 2329462.0, + "repeat_count": 0.0, + "routers_loss": 0.04908733069896698, + "skip_count": 1.0, + "step": 1442, + "text_loss": 0.23343028128147125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.7795714705019074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009788531484224204, + "loss": 0.0264, + "macro_f1": 0.3333333432674408, + "num_tokens": 2332146.0, + "repeat_count": 0.0, + "routers_loss": 0.0032628148328512907, + "skip_count": 0.0, + "step": 1444, + "text_loss": 0.47423800826072693 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 6.788963897857353, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009787639940616788, + "loss": 0.0405, + "macro_f1": 0.7018141150474548, + "num_tokens": 2335738.0, + "repeat_count": 1.0, + "routers_loss": 0.14336998760700226, + "skip_count": 3.0, + "step": 1446, + "text_loss": 0.21837592124938965 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.189453125, + "learning_rate": 0.0009786746562376717, + "loss": 0.0241, + "macro_f1": 0.6666666865348816, + "num_tokens": 2338488.0, + "repeat_count": 0.0, + "routers_loss": 0.010542908683419228, + "skip_count": 1.0, + "step": 1448, + "text_loss": 1.0614757537841797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.807748752568242, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009785851349846334, + "loss": 0.0268, + "macro_f1": 0.3333333432674408, + "num_tokens": 2342074.0, + "repeat_count": 0.0, + "routers_loss": 0.005998016335070133, + "skip_count": 0.0, + "step": 1450, + "text_loss": 0.4269719421863556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 6.817141179923686, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009784954303368686, + "loss": 0.0384, + "macro_f1": 0.44705885648727417, + "num_tokens": 2345838.0, + "repeat_count": 0.0, + "routers_loss": 0.0959126204252243, + "skip_count": 3.0, + "step": 1452, + "text_loss": 0.3315916955471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009784055423287521, + "loss": 0.0218, + "macro_f1": 0.3333333432674408, + "num_tokens": 2348939.0, + "repeat_count": 0.0, + "routers_loss": 0.0025467623490840197, + "skip_count": 0.0, + "step": 1454, + "text_loss": 0.6162732839584351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.835926034634576, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009783154709947293, + "loss": 0.0256, + "macro_f1": 0.3272727429866791, + "num_tokens": 2352232.0, + "repeat_count": 0.0, + "routers_loss": 0.01860538125038147, + "skip_count": 1.0, + "step": 1456, + "text_loss": 0.23928768932819366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.84531846199002, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009782252163693158, + "loss": 0.0201, + "macro_f1": 0.3272727429866791, + "num_tokens": 2355159.0, + "repeat_count": 0.0, + "routers_loss": 0.04412713274359703, + "skip_count": 1.0, + "step": 1458, + "text_loss": 0.3371323347091675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.0009781347784870973, + "loss": 0.0379, + "macro_f1": 0.3333333432674408, + "num_tokens": 2358175.0, + "repeat_count": 0.0, + "routers_loss": 0.006809141952544451, + "skip_count": 0.0, + "step": 1460, + "text_loss": 0.547267735004425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.86410331670091, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009780441573827296, + "loss": 0.03, + "macro_f1": 0.3076923191547394, + "num_tokens": 2360991.0, + "repeat_count": 0.0, + "routers_loss": 0.08924390375614166, + "skip_count": 4.0, + "step": 1462, + "text_loss": 0.7026563882827759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.000977953353090939, + "loss": 0.0272, + "macro_f1": 0.3333333432674408, + "num_tokens": 2363894.0, + "repeat_count": 0.0, + "routers_loss": 0.021858472377061844, + "skip_count": 0.0, + "step": 1464, + "text_loss": 0.2718065083026886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.882888171411799, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009778623656465219, + "loss": 0.0338, + "macro_f1": 0.32098764181137085, + "num_tokens": 2367265.0, + "repeat_count": 0.0, + "routers_loss": 0.044781096279621124, + "skip_count": 0.0, + "step": 1466, + "text_loss": 0.5008095502853394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.892280598767244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009777711950843448, + "loss": 0.0212, + "macro_f1": 0.3333333432674408, + "num_tokens": 2370186.0, + "repeat_count": 0.0, + "routers_loss": 0.0040459707379341125, + "skip_count": 0.0, + "step": 1468, + "text_loss": 0.5242461562156677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 6.901673026122689, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009776798414393446, + "loss": 0.0279, + "macro_f1": 0.6598639488220215, + "num_tokens": 2373314.0, + "repeat_count": 1.0, + "routers_loss": 0.0708528608083725, + "skip_count": 3.0, + "step": 1470, + "text_loss": 0.2821732461452484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.911065453478133, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1328125, + "learning_rate": 0.0009775883047465279, + "loss": 0.0414, + "macro_f1": 0.31446540355682373, + "num_tokens": 2376435.0, + "repeat_count": 1.0, + "routers_loss": 0.0290578193962574, + "skip_count": 1.0, + "step": 1472, + "text_loss": 0.8438440561294556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.9204578808335775, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10546875, + "learning_rate": 0.000977496585040972, + "loss": 0.0373, + "macro_f1": 0.3333333432674408, + "num_tokens": 2380244.0, + "repeat_count": 0.0, + "routers_loss": 0.010360375046730042, + "skip_count": 0.0, + "step": 1474, + "text_loss": 0.4356135427951813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.929850308189023, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.000977404682357824, + "loss": 0.0294, + "macro_f1": 0.3272727429866791, + "num_tokens": 2383498.0, + "repeat_count": 0.0, + "routers_loss": 0.023518972098827362, + "skip_count": 0.0, + "step": 1476, + "text_loss": 0.25195425748825073 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 6.939242735544467, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.000977312596732301, + "loss": 0.0375, + "macro_f1": 0.9544159770011902, + "num_tokens": 2386414.0, + "repeat_count": 5.0, + "routers_loss": 0.08190606534481049, + "skip_count": 4.0, + "step": 1478, + "text_loss": 0.6586798429489136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10546875, + "learning_rate": 0.0009772203281996905, + "loss": 0.0336, + "macro_f1": 1.0, + "num_tokens": 2389399.0, + "repeat_count": 1.0, + "routers_loss": 0.016441475600004196, + "skip_count": 2.0, + "step": 1480, + "text_loss": 0.3671986758708954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009771278767953502, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 2392400.0, + "repeat_count": 0.0, + "routers_loss": 0.019211363047361374, + "skip_count": 0.0, + "step": 1482, + "text_loss": 0.27418580651283264 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.967420017610801, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009770352425547072, + "loss": 0.0292, + "macro_f1": 0.3333333432674408, + "num_tokens": 2395123.0, + "repeat_count": 0.0, + "routers_loss": 0.015800386667251587, + "skip_count": 0.0, + "step": 1484, + "text_loss": 0.19896622002124786 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.976812444966246, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009769424255132596, + "loss": 0.0256, + "macro_f1": 0.4871794879436493, + "num_tokens": 2397359.0, + "repeat_count": 3.0, + "routers_loss": 0.06670158356428146, + "skip_count": 0.0, + "step": 1486, + "text_loss": 0.4229799509048462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.98620487232169, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1162109375, + "learning_rate": 0.0009768494257065747, + "loss": 0.0218, + "macro_f1": 0.3272727429866791, + "num_tokens": 2400387.0, + "repeat_count": 0.0, + "routers_loss": 0.011144762858748436, + "skip_count": 1.0, + "step": 1488, + "text_loss": 0.4264226257801056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.995597299677136, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0009767562431702904, + "loss": 0.0387, + "macro_f1": 0.3006536364555359, + "num_tokens": 2403241.0, + "repeat_count": 2.0, + "routers_loss": 0.12339717149734497, + "skip_count": 3.0, + "step": 1490, + "text_loss": 0.2850193977355957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.004696213677723, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0009766628779401142, + "loss": 0.0215, + "macro_f1": 0.6666666865348816, + "num_tokens": 2406087.0, + "repeat_count": 0.0, + "routers_loss": 0.008174685761332512, + "skip_count": 1.0, + "step": 1492, + "text_loss": 0.6756544709205627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.000976569330051824, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 2409312.0, + "repeat_count": 0.0, + "routers_loss": 0.0021256296895444393, + "skip_count": 0.0, + "step": 1494, + "text_loss": 0.4789894223213196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.0234810683886115, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009764755995412677, + "loss": 0.0193, + "macro_f1": 0.3333333432674408, + "num_tokens": 2412758.0, + "repeat_count": 0.0, + "routers_loss": 0.003944927826523781, + "skip_count": 0.0, + "step": 1496, + "text_loss": 0.5157490968704224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.032873495744056, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009763816864443627, + "loss": 0.0239, + "macro_f1": 0.3272727429866791, + "num_tokens": 2416079.0, + "repeat_count": 1.0, + "routers_loss": 0.03893325850367546, + "skip_count": 0.0, + "step": 1498, + "text_loss": 0.28045418858528137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009762875907970968, + "loss": 0.0199, + "macro_f1": 0.3333333432674408, + "num_tokens": 2420340.0, + "repeat_count": 0.0, + "routers_loss": 0.0017725443467497826, + "skip_count": 0.0, + "step": 1500, + "text_loss": 0.35550856590270996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.051658350454946, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009761933126355277, + "loss": 0.0245, + "macro_f1": 0.3272727429866791, + "num_tokens": 2424735.0, + "repeat_count": 0.0, + "routers_loss": 0.01393749937415123, + "skip_count": 1.0, + "step": 1502, + "text_loss": 0.38840189576148987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009760988519957828, + "loss": 0.0249, + "macro_f1": 0.6666666865348816, + "num_tokens": 2428132.0, + "repeat_count": 0.0, + "routers_loss": 0.01687910407781601, + "skip_count": 2.0, + "step": 1504, + "text_loss": 0.3031681478023529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.0704432051658355, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009760042089140598, + "loss": 0.0193, + "macro_f1": 0.3144654333591461, + "num_tokens": 2431592.0, + "repeat_count": 1.0, + "routers_loss": 0.04704280197620392, + "skip_count": 2.0, + "step": 1506, + "text_loss": 0.16355200111865997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009759093834266259, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 2434236.0, + "repeat_count": 0.0, + "routers_loss": 0.0016075772000476718, + "skip_count": 0.0, + "step": 1508, + "text_loss": 0.6080073118209839 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009758143755698186, + "loss": 0.015, + "macro_f1": 0.3333333432674408, + "num_tokens": 2437170.0, + "repeat_count": 0.0, + "routers_loss": 0.008451299741864204, + "skip_count": 0.0, + "step": 1510, + "text_loss": 0.22100484371185303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 7.098620487232169, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009757191853800449, + "loss": 0.0227, + "macro_f1": 0.5866667032241821, + "num_tokens": 2441187.0, + "repeat_count": 1.0, + "routers_loss": 0.046565692871809006, + "skip_count": 3.0, + "step": 1512, + "text_loss": 0.25098952651023865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.108012914587614, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.000975623812893782, + "loss": 0.0276, + "macro_f1": 0.3272727429866791, + "num_tokens": 2444664.0, + "repeat_count": 0.0, + "routers_loss": 0.02872578240931034, + "skip_count": 1.0, + "step": 1514, + "text_loss": 0.4952253997325897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.1174053419430585, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.0009755282581475768, + "loss": 0.0233, + "macro_f1": 0.3333333432674408, + "num_tokens": 2447748.0, + "repeat_count": 0.0, + "routers_loss": 0.002055214950814843, + "skip_count": 0.0, + "step": 1516, + "text_loss": 0.7465500831604004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.126797769298503, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10302734375, + "learning_rate": 0.000975432521178046, + "loss": 0.0216, + "macro_f1": 0.3272727429866791, + "num_tokens": 2450834.0, + "repeat_count": 1.0, + "routers_loss": 0.04498551785945892, + "skip_count": 0.0, + "step": 1518, + "text_loss": 0.28144413232803345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009753366020218763, + "loss": 0.0234, + "macro_f1": 0.3333333432674408, + "num_tokens": 2454233.0, + "repeat_count": 0.0, + "routers_loss": 0.003669742727652192, + "skip_count": 0.0, + "step": 1520, + "text_loss": 0.5667551755905151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009752405007158238, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2457331.0, + "repeat_count": 0.0, + "routers_loss": 0.010455607436597347, + "skip_count": 0.0, + "step": 1522, + "text_loss": 0.19575810432434082 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 7.154975051364837, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009751442172967151, + "loss": 0.0193, + "macro_f1": 0.8823530077934265, + "num_tokens": 2459935.0, + "repeat_count": 2.0, + "routers_loss": 0.025189083069562912, + "skip_count": 1.0, + "step": 1524, + "text_loss": 0.45453405380249023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.164367478720282, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.000975047751801446, + "loss": 0.0187, + "macro_f1": 0.3272727429866791, + "num_tokens": 2463008.0, + "repeat_count": 0.0, + "routers_loss": 0.012297490611672401, + "skip_count": 0.0, + "step": 1526, + "text_loss": 0.31437572836875916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009749511042669823, + "loss": 0.0233, + "macro_f1": 0.3333333432674408, + "num_tokens": 2466475.0, + "repeat_count": 0.0, + "routers_loss": 0.011026266030967236, + "skip_count": 0.0, + "step": 1528, + "text_loss": 0.46604859828948975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.183152333431171, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009748542747303595, + "loss": 0.0182, + "macro_f1": 0.3272727429866791, + "num_tokens": 2469320.0, + "repeat_count": 0.0, + "routers_loss": 0.011934996582567692, + "skip_count": 1.0, + "step": 1530, + "text_loss": 0.7764923572540283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009747572632286827, + "loss": 0.0203, + "macro_f1": 0.3333333432674408, + "num_tokens": 2472468.0, + "repeat_count": 0.0, + "routers_loss": 0.005786920432001352, + "skip_count": 0.0, + "step": 1532, + "text_loss": 0.3555782437324524 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009746600697991271, + "loss": 0.02, + "macro_f1": 0.6666666865348816, + "num_tokens": 2475736.0, + "repeat_count": 1.0, + "routers_loss": 0.0026990731712430716, + "skip_count": 0.0, + "step": 1534, + "text_loss": 0.49561792612075806 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 7.2113296154975055, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0556640625, + "learning_rate": 0.0009745626944789375, + "loss": 0.0204, + "macro_f1": 0.8823530077934265, + "num_tokens": 2478887.0, + "repeat_count": 1.0, + "routers_loss": 0.020221207290887833, + "skip_count": 2.0, + "step": 1536, + "text_loss": 0.5375416278839111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.22072204285295, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009744651373054279, + "loss": 0.0286, + "macro_f1": 0.3272727429866791, + "num_tokens": 2481293.0, + "repeat_count": 0.0, + "routers_loss": 0.03131086751818657, + "skip_count": 1.0, + "step": 1538, + "text_loss": 0.5241039395332336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 7.230114470208394, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.08984375, + "learning_rate": 0.0009743673983159828, + "loss": 0.0241, + "macro_f1": 0.6122449040412903, + "num_tokens": 2484403.0, + "repeat_count": 0.0, + "routers_loss": 0.04448170214891434, + "skip_count": 4.0, + "step": 1540, + "text_loss": 0.7465724349021912 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009742694775480557, + "loss": 0.0265, + "macro_f1": 0.6666666865348816, + "num_tokens": 2487952.0, + "repeat_count": 0.0, + "routers_loss": 0.007171491626650095, + "skip_count": 1.0, + "step": 1542, + "text_loss": 0.2877117097377777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009741713750391703, + "loss": 0.0171, + "macro_f1": 0.6666666865348816, + "num_tokens": 2490815.0, + "repeat_count": 1.0, + "routers_loss": 0.004559285007417202, + "skip_count": 0.0, + "step": 1544, + "text_loss": 0.6097800135612488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.258291752274729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009740730908269193, + "loss": 0.0174, + "macro_f1": 0.3333333432674408, + "num_tokens": 2494727.0, + "repeat_count": 0.0, + "routers_loss": 0.005271553061902523, + "skip_count": 0.0, + "step": 1546, + "text_loss": 0.5431114435195923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009739746249489658, + "loss": 0.0239, + "macro_f1": 0.3333333432674408, + "num_tokens": 2499266.0, + "repeat_count": 0.0, + "routers_loss": 0.0015409323386847973, + "skip_count": 0.0, + "step": 1548, + "text_loss": 0.4702678322792053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.277076606985618, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1171875, + "learning_rate": 0.0009738759774430417, + "loss": 0.0216, + "macro_f1": 0.32098764181137085, + "num_tokens": 2502273.0, + "repeat_count": 1.0, + "routers_loss": 0.030183158814907074, + "skip_count": 1.0, + "step": 1550, + "text_loss": 0.3239189088344574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.286469034341063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009737771483469493, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 2507624.0, + "repeat_count": 0.0, + "routers_loss": 0.005410848651081324, + "skip_count": 0.0, + "step": 1552, + "text_loss": 0.4014642834663391 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009736781376985598, + "loss": 0.0168, + "macro_f1": 0.6666666865348816, + "num_tokens": 2510366.0, + "repeat_count": 0.0, + "routers_loss": 0.0066976165398955345, + "skip_count": 1.0, + "step": 1554, + "text_loss": 0.5924848914146423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.13671875, + "learning_rate": 0.0009735789455358144, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 2513317.0, + "repeat_count": 0.0, + "routers_loss": 0.002763477386906743, + "skip_count": 0.0, + "step": 1556, + "text_loss": 0.3222943842411041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.314646316407397, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009734795718967237, + "loss": 0.0283, + "macro_f1": 0.32098764181137085, + "num_tokens": 2516628.0, + "repeat_count": 0.0, + "routers_loss": 0.061566028743982315, + "skip_count": 2.0, + "step": 1558, + "text_loss": 0.3249334692955017 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009733800168193679, + "loss": 0.0228, + "macro_f1": 1.0, + "num_tokens": 2519424.0, + "repeat_count": 2.0, + "routers_loss": 0.017976421862840652, + "skip_count": 4.0, + "step": 1560, + "text_loss": 0.3341919481754303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.0009732802803418966, + "loss": 0.023, + "macro_f1": 0.3333333432674408, + "num_tokens": 2522922.0, + "repeat_count": 0.0, + "routers_loss": 0.002525332849472761, + "skip_count": 0.0, + "step": 1562, + "text_loss": 0.3176332712173462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.34282359847373, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07861328125, + "learning_rate": 0.0009731803625025292, + "loss": 0.0196, + "macro_f1": 0.3272727429866791, + "num_tokens": 2525811.0, + "repeat_count": 0.0, + "routers_loss": 0.015524424612522125, + "skip_count": 1.0, + "step": 1564, + "text_loss": 0.532774031162262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.3522160258291755, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009730802633395541, + "loss": 0.0257, + "macro_f1": 0.6603773832321167, + "num_tokens": 2529157.0, + "repeat_count": 1.0, + "routers_loss": 0.08138631284236908, + "skip_count": 1.0, + "step": 1566, + "text_loss": 0.529487133026123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009729799828913298, + "loss": 0.0223, + "macro_f1": 0.3333333432674408, + "num_tokens": 2532249.0, + "repeat_count": 0.0, + "routers_loss": 0.0035867292899638414, + "skip_count": 0.0, + "step": 1568, + "text_loss": 0.503160297870636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.371000880540064, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009728795211962838, + "loss": 0.0259, + "macro_f1": 0.5492662787437439, + "num_tokens": 2535904.0, + "repeat_count": 0.0, + "routers_loss": 0.02987455204129219, + "skip_count": 2.0, + "step": 1570, + "text_loss": 0.9170270562171936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.380393307895509, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11865234375, + "learning_rate": 0.0009727788782929131, + "loss": 0.0273, + "macro_f1": 0.3272727429866791, + "num_tokens": 2538943.0, + "repeat_count": 1.0, + "routers_loss": 0.04676021635532379, + "skip_count": 0.0, + "step": 1572, + "text_loss": 0.29146310687065125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.389785735250954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009726780542197844, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 2541805.0, + "repeat_count": 0.0, + "routers_loss": 0.002127803163602948, + "skip_count": 0.0, + "step": 1574, + "text_loss": 1.0126502513885498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009725770490155338, + "loss": 0.0262, + "macro_f1": 0.3333333432674408, + "num_tokens": 2546213.0, + "repeat_count": 0.0, + "routers_loss": 0.007609677035361528, + "skip_count": 0.0, + "step": 1576, + "text_loss": 0.190168559551239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.408570589961843, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009724758627188665, + "loss": 0.0356, + "macro_f1": 0.3272727429866791, + "num_tokens": 2549554.0, + "repeat_count": 0.0, + "routers_loss": 0.033554721623659134, + "skip_count": 1.0, + "step": 1578, + "text_loss": 0.2977406084537506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.4179630173172875, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009723744953685572, + "loss": 0.028, + "macro_f1": 0.3272727429866791, + "num_tokens": 2552785.0, + "repeat_count": 1.0, + "routers_loss": 0.027864238247275352, + "skip_count": 0.0, + "step": 1580, + "text_loss": 0.2700682580471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19921875, + "learning_rate": 0.0009722729470034503, + "loss": 0.0224, + "macro_f1": 0.3333333432674408, + "num_tokens": 2556550.0, + "repeat_count": 0.0, + "routers_loss": 0.004798175301402807, + "skip_count": 0.0, + "step": 1582, + "text_loss": 0.6559903025627136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.436747872028177, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.078125, + "learning_rate": 0.0009721712176624591, + "loss": 0.0242, + "macro_f1": 0.3333333432674408, + "num_tokens": 2559862.0, + "repeat_count": 0.0, + "routers_loss": 0.013764148578047752, + "skip_count": 0.0, + "step": 1584, + "text_loss": 0.2257535308599472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.446140299383622, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009720693073845667, + "loss": 0.032, + "macro_f1": 0.5492662787437439, + "num_tokens": 2562766.0, + "repeat_count": 0.0, + "routers_loss": 0.01937069371342659, + "skip_count": 2.0, + "step": 1586, + "text_loss": 0.178413525223732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.455532726739067, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009719672162088252, + "loss": 0.0306, + "macro_f1": 0.32098767161369324, + "num_tokens": 2566583.0, + "repeat_count": 1.0, + "routers_loss": 0.06224144622683525, + "skip_count": 0.0, + "step": 1588, + "text_loss": 0.3992367684841156 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 7.464925154094511, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.185546875, + "learning_rate": 0.0009718649441743559, + "loss": 0.0239, + "macro_f1": 0.9449735879898071, + "num_tokens": 2569516.0, + "repeat_count": 2.0, + "routers_loss": 0.06937911361455917, + "skip_count": 4.0, + "step": 1590, + "text_loss": 0.1945122629404068 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.00097176249132035, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2572418.0, + "repeat_count": 0.0, + "routers_loss": 0.0034326619934290648, + "skip_count": 0.0, + "step": 1592, + "text_loss": 0.6259906888008118 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009716598576860676, + "loss": 0.0278, + "macro_f1": 0.6666666865348816, + "num_tokens": 2575235.0, + "repeat_count": 1.0, + "routers_loss": 0.004557516425848007, + "skip_count": 0.0, + "step": 1594, + "text_loss": 0.6638736724853516 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 7.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009715570433108378, + "loss": 0.0198, + "macro_f1": 1.0, + "num_tokens": 2578157.0, + "repeat_count": 1.0, + "routers_loss": 0.015363055281341076, + "skip_count": 1.0, + "step": 1596, + "text_loss": 0.6530464887619019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009714540482340595, + "loss": 0.0268, + "macro_f1": 0.6666666865348816, + "num_tokens": 2581801.0, + "repeat_count": 1.0, + "routers_loss": 0.01257144846022129, + "skip_count": 0.0, + "step": 1598, + "text_loss": 0.5916110277175903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.5118872908717345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009713508724952006, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 2585204.0, + "repeat_count": 0.0, + "routers_loss": 0.003175645601004362, + "skip_count": 0.0, + "step": 1600, + "text_loss": 0.27901601791381836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0009712475161337981, + "loss": 0.0261, + "macro_f1": 0.3333333432674408, + "num_tokens": 2588286.0, + "repeat_count": 0.0, + "routers_loss": 0.004122321493923664, + "skip_count": 0.0, + "step": 1602, + "text_loss": 0.42420244216918945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009711439791894585, + "loss": 0.0341, + "macro_f1": 0.6666666865348816, + "num_tokens": 2591476.0, + "repeat_count": 0.0, + "routers_loss": 0.011215819045901299, + "skip_count": 1.0, + "step": 1604, + "text_loss": 0.5549933910369873 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.540064572938069, + "f1_execute": 0.9599999785423279, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.0703125, + "learning_rate": 0.0009710402617018574, + "loss": 0.0172, + "macro_f1": 0.8200000524520874, + "num_tokens": 2594336.0, + "repeat_count": 1.0, + "routers_loss": 0.02916567400097847, + "skip_count": 2.0, + "step": 1606, + "text_loss": 0.3263779282569885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009709363637107393, + "loss": 0.0209, + "macro_f1": 0.6666666865348816, + "num_tokens": 2597462.0, + "repeat_count": 0.0, + "routers_loss": 0.015897957608103752, + "skip_count": 1.0, + "step": 1608, + "text_loss": 0.20917139947414398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009708322852559184, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2601543.0, + "repeat_count": 0.0, + "routers_loss": 0.002211357234045863, + "skip_count": 0.0, + "step": 1610, + "text_loss": 0.450550377368927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.568241855004403, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009707280263772776, + "loss": 0.0277, + "macro_f1": 0.6666666865348816, + "num_tokens": 2604462.0, + "repeat_count": 0.0, + "routers_loss": 0.01615734025835991, + "skip_count": 2.0, + "step": 1612, + "text_loss": 0.6908381581306458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.577634282359847, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009706235871147688, + "loss": 0.0241, + "macro_f1": 0.5492662787437439, + "num_tokens": 2607484.0, + "repeat_count": 0.0, + "routers_loss": 0.022048067301511765, + "skip_count": 2.0, + "step": 1614, + "text_loss": 0.36691340804100037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.587026709715292, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10546875, + "learning_rate": 0.0009705189675084138, + "loss": 0.0176, + "macro_f1": 0.6666666865348816, + "num_tokens": 2610204.0, + "repeat_count": 0.0, + "routers_loss": 0.008503952994942665, + "skip_count": 1.0, + "step": 1616, + "text_loss": 0.5226598381996155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.596419137070737, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009704141675983029, + "loss": 0.0248, + "macro_f1": 0.3333333432674408, + "num_tokens": 2613128.0, + "repeat_count": 0.0, + "routers_loss": 0.0019020626787096262, + "skip_count": 0.0, + "step": 1618, + "text_loss": 0.6465088725090027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5714285969734192, + "avg_layers": 24.0, + "epoch": 7.6058115644261814, + "f1_execute": 0.9333333373069763, + "f1_repeat": 0.0, + "f1_skip": 0.7272727489471436, + "grad_norm": 0.107421875, + "learning_rate": 0.0009703091874245956, + "loss": 0.032, + "macro_f1": 0.5535354018211365, + "num_tokens": 2616360.0, + "repeat_count": 0.0, + "routers_loss": 0.11837691068649292, + "skip_count": 7.0, + "step": 1620, + "text_loss": 0.2987039089202881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009702040270275204, + "loss": 0.0181, + "macro_f1": 0.3333333432674408, + "num_tokens": 2619606.0, + "repeat_count": 0.0, + "routers_loss": 0.0065958453342318535, + "skip_count": 0.0, + "step": 1622, + "text_loss": 0.6262096166610718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.103515625, + "learning_rate": 0.000970098686447375, + "loss": 0.0257, + "macro_f1": 0.6666666865348816, + "num_tokens": 2622499.0, + "repeat_count": 0.0, + "routers_loss": 0.013632026500999928, + "skip_count": 1.0, + "step": 1624, + "text_loss": 0.2392602562904358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.633988846492516, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.125, + "learning_rate": 0.0009699931657245264, + "loss": 0.0245, + "macro_f1": 0.5492662787437439, + "num_tokens": 2626002.0, + "repeat_count": 0.0, + "routers_loss": 0.012147823348641396, + "skip_count": 2.0, + "step": 1626, + "text_loss": 0.4742976129055023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009698874648994098, + "loss": 0.0285, + "macro_f1": 1.0, + "num_tokens": 2629847.0, + "repeat_count": 1.0, + "routers_loss": 0.010692884214222431, + "skip_count": 3.0, + "step": 1628, + "text_loss": 0.5090685486793518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.6527737012034045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009697815840125304, + "loss": 0.0265, + "macro_f1": 0.3333333432674408, + "num_tokens": 2633529.0, + "repeat_count": 0.0, + "routers_loss": 0.011442207731306553, + "skip_count": 0.0, + "step": 1630, + "text_loss": 0.1874329298734665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2119140625, + "learning_rate": 0.0009696755231044618, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 2636321.0, + "repeat_count": 0.0, + "routers_loss": 0.0026681360322982073, + "skip_count": 0.0, + "step": 1632, + "text_loss": 0.7650400400161743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.671558555914294, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10498046875, + "learning_rate": 0.0009695692822158466, + "loss": 0.0242, + "macro_f1": 0.3272727429866791, + "num_tokens": 2638840.0, + "repeat_count": 1.0, + "routers_loss": 0.033965807408094406, + "skip_count": 0.0, + "step": 1634, + "text_loss": 0.6175784468650818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009694628613873968, + "loss": 0.018, + "macro_f1": 0.3333333432674408, + "num_tokens": 2641886.0, + "repeat_count": 0.0, + "routers_loss": 0.007568214554339647, + "skip_count": 0.0, + "step": 1636, + "text_loss": 0.43139931559562683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.690343410625183, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009693562606598929, + "loss": 0.025, + "macro_f1": 0.3333333432674408, + "num_tokens": 2645028.0, + "repeat_count": 0.0, + "routers_loss": 0.004973865579813719, + "skip_count": 0.0, + "step": 1638, + "text_loss": 0.6430339217185974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.699735837980628, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009692494800741844, + "loss": 0.0313, + "macro_f1": 0.3272727429866791, + "num_tokens": 2648209.0, + "repeat_count": 1.0, + "routers_loss": 0.049863800406455994, + "skip_count": 0.0, + "step": 1640, + "text_loss": 0.28138160705566406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.709128265336073, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.0009691425196711901, + "loss": 0.0398, + "macro_f1": 0.3272727429866791, + "num_tokens": 2651171.0, + "repeat_count": 0.0, + "routers_loss": 0.02112230286002159, + "skip_count": 0.0, + "step": 1642, + "text_loss": 0.3745322525501251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.718520692691517, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009690353794918971, + "loss": 0.0275, + "macro_f1": 0.3333333432674408, + "num_tokens": 2654093.0, + "repeat_count": 0.0, + "routers_loss": 0.0024304776452481747, + "skip_count": 0.0, + "step": 1644, + "text_loss": 0.4275154173374176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.000968928059577362, + "loss": 0.0244, + "macro_f1": 0.6666666865348816, + "num_tokens": 2657079.0, + "repeat_count": 0.0, + "routers_loss": 0.009320619516074657, + "skip_count": 1.0, + "step": 1646, + "text_loss": 0.46650025248527527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.737305547402407, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009688205599687099, + "loss": 0.0209, + "macro_f1": 0.3272727429866791, + "num_tokens": 2660951.0, + "repeat_count": 0.0, + "routers_loss": 0.011913162656128407, + "skip_count": 0.0, + "step": 1648, + "text_loss": 0.46644100546836853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.7466979747578515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009687128807071347, + "loss": 0.0284, + "macro_f1": 0.3333333432674408, + "num_tokens": 2663823.0, + "repeat_count": 0.0, + "routers_loss": 0.013754756189882755, + "skip_count": 0.0, + "step": 1650, + "text_loss": 0.40808847546577454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.0009686050218338996, + "loss": 0.0286, + "macro_f1": 0.3333333432674408, + "num_tokens": 2667079.0, + "repeat_count": 0.0, + "routers_loss": 0.009099726565182209, + "skip_count": 0.0, + "step": 1652, + "text_loss": 0.2389989197254181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009684969833903359, + "loss": 0.0283, + "macro_f1": 0.6666666865348816, + "num_tokens": 2670162.0, + "repeat_count": 0.0, + "routers_loss": 0.0034928603563457727, + "skip_count": 1.0, + "step": 1654, + "text_loss": 0.6930749416351318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.774875256824186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009683887654178445, + "loss": 0.0261, + "macro_f1": 0.6666666865348816, + "num_tokens": 2673031.0, + "repeat_count": 0.0, + "routers_loss": 0.008340462110936642, + "skip_count": 1.0, + "step": 1656, + "text_loss": 0.277752548456192 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009682803679578947, + "loss": 0.0259, + "macro_f1": 0.3333333432674408, + "num_tokens": 2676092.0, + "repeat_count": 0.0, + "routers_loss": 0.004337446764111519, + "skip_count": 0.0, + "step": 1658, + "text_loss": 0.5176776051521301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.7936601115350745, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009681717910520244, + "loss": 0.0242, + "macro_f1": 0.32098764181137085, + "num_tokens": 2679479.0, + "repeat_count": 0.0, + "routers_loss": 0.034611742943525314, + "skip_count": 2.0, + "step": 1660, + "text_loss": 0.21485982835292816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.80305253889052, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009680630347418406, + "loss": 0.022, + "macro_f1": 0.5492662787437439, + "num_tokens": 2683289.0, + "repeat_count": 0.0, + "routers_loss": 0.03297121450304985, + "skip_count": 2.0, + "step": 1662, + "text_loss": 0.33801013231277466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.812444966245964, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.000967954099069019, + "loss": 0.0411, + "macro_f1": 0.32098764181137085, + "num_tokens": 2685879.0, + "repeat_count": 1.0, + "routers_loss": 0.04551183059811592, + "skip_count": 1.0, + "step": 1664, + "text_loss": 0.41123488545417786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.821837393601409, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009678449840753038, + "loss": 0.0324, + "macro_f1": 0.32098764181137085, + "num_tokens": 2688910.0, + "repeat_count": 0.0, + "routers_loss": 0.05866450071334839, + "skip_count": 2.0, + "step": 1666, + "text_loss": 0.1740892380475998 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009677356898025082, + "loss": 0.023, + "macro_f1": 0.3333333432674408, + "num_tokens": 2691680.0, + "repeat_count": 0.0, + "routers_loss": 0.009243223816156387, + "skip_count": 0.0, + "step": 1668, + "text_loss": 0.2512350380420685 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.8406222483122985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.000967626216292514, + "loss": 0.0195, + "macro_f1": 0.3333333432674408, + "num_tokens": 2694895.0, + "repeat_count": 0.0, + "routers_loss": 0.005576452240347862, + "skip_count": 0.0, + "step": 1670, + "text_loss": 0.43294376134872437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 7.850014675667743, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.09130859375, + "learning_rate": 0.0009675165635872715, + "loss": 0.0306, + "macro_f1": 0.44705885648727417, + "num_tokens": 2697806.0, + "repeat_count": 0.0, + "routers_loss": 0.05372785031795502, + "skip_count": 3.0, + "step": 1672, + "text_loss": 0.1614082306623459 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009674067317288, + "loss": 0.0296, + "macro_f1": 0.6666666865348816, + "num_tokens": 2700529.0, + "repeat_count": 1.0, + "routers_loss": 0.018131591379642487, + "skip_count": 0.0, + "step": 1674, + "text_loss": 0.2093173861503601 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009672967207591869, + "loss": 0.0257, + "macro_f1": 0.3272727429866791, + "num_tokens": 2703650.0, + "repeat_count": 0.0, + "routers_loss": 0.0673515796661377, + "skip_count": 1.0, + "step": 1676, + "text_loss": 0.3029400110244751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.878191957734077, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009671865307205892, + "loss": 0.021, + "macro_f1": 0.32098767161369324, + "num_tokens": 2707615.0, + "repeat_count": 0.0, + "routers_loss": 0.03821169584989548, + "skip_count": 1.0, + "step": 1678, + "text_loss": 0.2262786477804184 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 7.8875843850895215, + "f1_execute": 0.9756097793579102, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009670761616552315, + "loss": 0.0465, + "macro_f1": 0.9615669250488281, + "num_tokens": 2710894.0, + "repeat_count": 2.0, + "routers_loss": 0.042625464498996735, + "skip_count": 6.0, + "step": 1680, + "text_loss": 0.29623574018478394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.896976812444966, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009669656136054074, + "loss": 0.0289, + "macro_f1": 0.3333333432674408, + "num_tokens": 2714330.0, + "repeat_count": 0.0, + "routers_loss": 0.0037571541033685207, + "skip_count": 0.0, + "step": 1682, + "text_loss": 0.7510389089584351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.906369239800411, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0009668548866134795, + "loss": 0.0256, + "macro_f1": 0.3333333432674408, + "num_tokens": 2717176.0, + "repeat_count": 0.0, + "routers_loss": 0.004142968449741602, + "skip_count": 0.0, + "step": 1684, + "text_loss": 0.3273485600948334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009667439807218783, + "loss": 0.0233, + "macro_f1": 0.6666666865348816, + "num_tokens": 2720628.0, + "repeat_count": 0.0, + "routers_loss": 0.008753842674195766, + "skip_count": 2.0, + "step": 1686, + "text_loss": 0.4314708709716797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.9251540945113, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009666328959731033, + "loss": 0.0211, + "macro_f1": 0.6603773832321167, + "num_tokens": 2723739.0, + "repeat_count": 1.0, + "routers_loss": 0.022674910724163055, + "skip_count": 1.0, + "step": 1688, + "text_loss": 0.25734150409698486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 7.934546521866745, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009665216324097222, + "loss": 0.0324, + "macro_f1": 0.5934640765190125, + "num_tokens": 2726644.0, + "repeat_count": 0.0, + "routers_loss": 0.03932750225067139, + "skip_count": 3.0, + "step": 1690, + "text_loss": 0.24511034786701202 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.94393894922219, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.0009664101900743714, + "loss": 0.0255, + "macro_f1": 0.3272727429866791, + "num_tokens": 2729662.0, + "repeat_count": 0.0, + "routers_loss": 0.012672754004597664, + "skip_count": 1.0, + "step": 1692, + "text_loss": 0.39431414008140564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.953331376577634, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.076171875, + "learning_rate": 0.000966298569009756, + "loss": 0.0231, + "macro_f1": 0.5492662787437439, + "num_tokens": 2732578.0, + "repeat_count": 0.0, + "routers_loss": 0.01548632513731718, + "skip_count": 2.0, + "step": 1694, + "text_loss": 0.12439999729394913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.962723803933079, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009661867692586494, + "loss": 0.0153, + "macro_f1": 0.32098764181137085, + "num_tokens": 2735887.0, + "repeat_count": 0.0, + "routers_loss": 0.05622401833534241, + "skip_count": 2.0, + "step": 1696, + "text_loss": 0.29024389386177063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.972116231288524, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.087890625, + "learning_rate": 0.0009660747908638933, + "loss": 0.0205, + "macro_f1": 0.3272727429866791, + "num_tokens": 2739293.0, + "repeat_count": 0.0, + "routers_loss": 0.041060201823711395, + "skip_count": 1.0, + "step": 1698, + "text_loss": 0.39461007714271545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.9815086586439685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1767578125, + "learning_rate": 0.0009659626338683981, + "loss": 0.0369, + "macro_f1": 0.3333333432674408, + "num_tokens": 2742468.0, + "repeat_count": 0.0, + "routers_loss": 0.007251353468745947, + "skip_count": 0.0, + "step": 1700, + "text_loss": 0.2751767635345459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.990901085999413, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009658502983151427, + "loss": 0.0186, + "macro_f1": 0.3272727429866791, + "num_tokens": 2745123.0, + "repeat_count": 0.0, + "routers_loss": 0.012847424484789371, + "skip_count": 1.0, + "step": 1702, + "text_loss": 0.4756404757499695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009657377842471742, + "loss": 0.0313, + "macro_f1": 0.6666666865348816, + "num_tokens": 2748016.0, + "repeat_count": 0.0, + "routers_loss": 0.007060411386191845, + "skip_count": 1.0, + "step": 1704, + "text_loss": 0.9571210145950317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.009392427355445, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10009765625, + "learning_rate": 0.0009656250917076081, + "loss": 0.0188, + "macro_f1": 0.5492662787437439, + "num_tokens": 2750717.0, + "repeat_count": 0.0, + "routers_loss": 0.016748681664466858, + "skip_count": 2.0, + "step": 1706, + "text_loss": 0.14542843401432037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0009655122207396285, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 2753635.0, + "repeat_count": 0.0, + "routers_loss": 0.013607042841613293, + "skip_count": 0.0, + "step": 1708, + "text_loss": 0.21836471557617188 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009653991713864878, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2756643.0, + "repeat_count": 0.0, + "routers_loss": 0.0012097888393327594, + "skip_count": 0.0, + "step": 1710, + "text_loss": 0.635187029838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1171875, + "learning_rate": 0.0009652859436915066, + "loss": 0.0231, + "macro_f1": 0.3333333432674408, + "num_tokens": 2759432.0, + "repeat_count": 0.0, + "routers_loss": 0.006196760106831789, + "skip_count": 0.0, + "step": 1712, + "text_loss": 0.5629420876502991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009651725376980743, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 2762538.0, + "repeat_count": 0.0, + "routers_loss": 0.0042513771913945675, + "skip_count": 0.0, + "step": 1714, + "text_loss": 0.39522525668144226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 8.056354564132668, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009650589534496479, + "loss": 0.0194, + "macro_f1": 0.8194444179534912, + "num_tokens": 2765571.0, + "repeat_count": 2.0, + "routers_loss": 0.03596706688404083, + "skip_count": 3.0, + "step": 1716, + "text_loss": 0.6252416968345642 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009649451909897532, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 2769206.0, + "repeat_count": 0.0, + "routers_loss": 0.0025788163766264915, + "skip_count": 0.0, + "step": 1718, + "text_loss": 0.8851634860038757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009648312503619843, + "loss": 0.0265, + "macro_f1": 0.3333333432674408, + "num_tokens": 2772488.0, + "repeat_count": 0.0, + "routers_loss": 0.004443451762199402, + "skip_count": 0.0, + "step": 1720, + "text_loss": 0.8568580746650696 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 8.084531846199003, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009647171316100034, + "loss": 0.0265, + "macro_f1": 0.9265305995941162, + "num_tokens": 2776482.0, + "repeat_count": 1.0, + "routers_loss": 0.022948263213038445, + "skip_count": 3.0, + "step": 1722, + "text_loss": 0.13431036472320557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009646028347775409, + "loss": 0.0204, + "macro_f1": 0.6666666865348816, + "num_tokens": 2778966.0, + "repeat_count": 0.0, + "routers_loss": 0.011328035034239292, + "skip_count": 1.0, + "step": 1724, + "text_loss": 0.2085491120815277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009644883599083958, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2781968.0, + "repeat_count": 0.0, + "routers_loss": 0.002208018908277154, + "skip_count": 0.0, + "step": 1726, + "text_loss": 0.4948323965072632 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.112709128265337, + "f1_execute": 0.9411764740943909, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009643737070464349, + "loss": 0.0158, + "macro_f1": 0.6470588445663452, + "num_tokens": 2784666.0, + "repeat_count": 1.0, + "routers_loss": 0.04391832649707794, + "skip_count": 2.0, + "step": 1728, + "text_loss": 0.39060094952583313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0009642588762355935, + "loss": 0.0212, + "macro_f1": 0.6666666865348816, + "num_tokens": 2787558.0, + "repeat_count": 0.0, + "routers_loss": 0.004497280344367027, + "skip_count": 1.0, + "step": 1730, + "text_loss": 0.34908708930015564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009641438675198748, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 2790474.0, + "repeat_count": 0.0, + "routers_loss": 0.00583475548774004, + "skip_count": 0.0, + "step": 1732, + "text_loss": 0.5720033049583435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0009640286809433508, + "loss": 0.0235, + "macro_f1": 0.3333333432674408, + "num_tokens": 2793272.0, + "repeat_count": 0.0, + "routers_loss": 0.007826375775039196, + "skip_count": 0.0, + "step": 1734, + "text_loss": 0.32181721925735474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009639133165501606, + "loss": 0.0192, + "macro_f1": 0.3333333432674408, + "num_tokens": 2797726.0, + "repeat_count": 0.0, + "routers_loss": 0.0019055595621466637, + "skip_count": 0.0, + "step": 1736, + "text_loss": 0.620936393737793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009637977743845124, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2800706.0, + "repeat_count": 0.0, + "routers_loss": 0.0028302327264100313, + "skip_count": 0.0, + "step": 1738, + "text_loss": 0.6473138332366943 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009636820544906823, + "loss": 0.0146, + "macro_f1": 1.0, + "num_tokens": 2803847.0, + "repeat_count": 1.0, + "routers_loss": 0.01105099730193615, + "skip_count": 2.0, + "step": 1740, + "text_loss": 0.4401201903820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.178456119753449, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009635661569130141, + "loss": 0.0195, + "macro_f1": 0.5934640765190125, + "num_tokens": 2807235.0, + "repeat_count": 0.0, + "routers_loss": 0.02619045600295067, + "skip_count": 3.0, + "step": 1742, + "text_loss": 0.459264874458313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.187848547108894, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009634500816959202, + "loss": 0.0162, + "macro_f1": 0.6666666865348816, + "num_tokens": 2810396.0, + "repeat_count": 0.0, + "routers_loss": 0.007915694266557693, + "skip_count": 2.0, + "step": 1744, + "text_loss": 0.5084020495414734 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009633338288838805, + "loss": 0.0271, + "macro_f1": 0.5492662787437439, + "num_tokens": 2813215.0, + "repeat_count": 2.0, + "routers_loss": 0.08364596217870712, + "skip_count": 0.0, + "step": 1746, + "text_loss": 0.27681824564933777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 30.0, + "epoch": 8.206633401819783, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009632173985214438, + "loss": 0.0156, + "macro_f1": 0.8817967176437378, + "num_tokens": 2816452.0, + "repeat_count": 3.0, + "routers_loss": 0.028805451467633247, + "skip_count": 2.0, + "step": 1748, + "text_loss": 0.4678419530391693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.216025829175228, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.000963100790653226, + "loss": 0.0188, + "macro_f1": 0.3272727429866791, + "num_tokens": 2819364.0, + "repeat_count": 0.0, + "routers_loss": 0.03056817688047886, + "skip_count": 1.0, + "step": 1750, + "text_loss": 0.3078109920024872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009629840053239116, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2823469.0, + "repeat_count": 0.0, + "routers_loss": 0.0019477814203128219, + "skip_count": 0.0, + "step": 1752, + "text_loss": 0.45501336455345154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.000962867042578253, + "loss": 0.0173, + "macro_f1": 0.3333333432674408, + "num_tokens": 2826716.0, + "repeat_count": 0.0, + "routers_loss": 0.0032963966950774193, + "skip_count": 0.0, + "step": 1754, + "text_loss": 0.49234694242477417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.244203111241562, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009627499024610707, + "loss": 0.0239, + "macro_f1": 0.3272727429866791, + "num_tokens": 2829733.0, + "repeat_count": 0.0, + "routers_loss": 0.010289114899933338, + "skip_count": 1.0, + "step": 1756, + "text_loss": 0.22335539758205414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.253595538597006, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009626325850172527, + "loss": 0.0174, + "macro_f1": 0.3272727429866791, + "num_tokens": 2833350.0, + "repeat_count": 0.0, + "routers_loss": 0.03249066323041916, + "skip_count": 1.0, + "step": 1758, + "text_loss": 0.6581931114196777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009625150902917555, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 2836558.0, + "repeat_count": 0.0, + "routers_loss": 0.00870000571012497, + "skip_count": 0.0, + "step": 1760, + "text_loss": 0.22938725352287292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009623974183296031, + "loss": 0.0192, + "macro_f1": 0.3333333432674408, + "num_tokens": 2840560.0, + "repeat_count": 0.0, + "routers_loss": 0.007767196744680405, + "skip_count": 0.0, + "step": 1762, + "text_loss": 0.24473799765110016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009622795691758876, + "loss": 0.0244, + "macro_f1": 0.3333333432674408, + "num_tokens": 2843548.0, + "repeat_count": 0.0, + "routers_loss": 0.0021693643648177385, + "skip_count": 0.0, + "step": 1764, + "text_loss": 0.3084608018398285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009621615428757693, + "loss": 0.0149, + "macro_f1": 0.3333333432674408, + "num_tokens": 2847076.0, + "repeat_count": 0.0, + "routers_loss": 0.0024727333802729845, + "skip_count": 0.0, + "step": 1766, + "text_loss": 0.5251734852790833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.000962043339474476, + "loss": 0.0194, + "macro_f1": 0.3333333432674408, + "num_tokens": 2849751.0, + "repeat_count": 0.0, + "routers_loss": 0.005174890160560608, + "skip_count": 0.0, + "step": 1768, + "text_loss": 0.4410129189491272 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0009619249590173032, + "loss": 0.016, + "macro_f1": 0.6666666865348816, + "num_tokens": 2853916.0, + "repeat_count": 0.0, + "routers_loss": 0.006785830482840538, + "skip_count": 2.0, + "step": 1770, + "text_loss": 0.550076425075531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.31934253008512, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06591796875, + "learning_rate": 0.0009618064015496149, + "loss": 0.0192, + "macro_f1": 0.5934640765190125, + "num_tokens": 2857372.0, + "repeat_count": 0.0, + "routers_loss": 0.021370256319642067, + "skip_count": 3.0, + "step": 1772, + "text_loss": 0.1988629847764969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0009616876671168423, + "loss": 0.0162, + "macro_f1": 0.6666666865348816, + "num_tokens": 2861028.0, + "repeat_count": 0.0, + "routers_loss": 0.004313841462135315, + "skip_count": 1.0, + "step": 1774, + "text_loss": 0.42581331729888916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009615687557644847, + "loss": 0.0268, + "macro_f1": 0.3333333432674408, + "num_tokens": 2864847.0, + "repeat_count": 0.0, + "routers_loss": 0.0025742491707205772, + "skip_count": 0.0, + "step": 1776, + "text_loss": 0.46510905027389526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009614496675381093, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 2867392.0, + "repeat_count": 0.0, + "routers_loss": 0.0016813480760902166, + "skip_count": 0.0, + "step": 1778, + "text_loss": 0.5922174453735352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009613304024833507, + "loss": 0.0166, + "macro_f1": 0.3333333432674408, + "num_tokens": 2871273.0, + "repeat_count": 0.0, + "routers_loss": 0.004948933608829975, + "skip_count": 0.0, + "step": 1780, + "text_loss": 0.6776977777481079 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009612109606459117, + "loss": 0.0186, + "macro_f1": 1.0, + "num_tokens": 2874172.0, + "repeat_count": 1.0, + "routers_loss": 0.016950147226452827, + "skip_count": 2.0, + "step": 1782, + "text_loss": 0.48758944869041443 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.375697094217786, + "f1_execute": 0.9599999785423279, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08251953125, + "learning_rate": 0.0009610913420715623, + "loss": 0.0237, + "macro_f1": 0.7644444704055786, + "num_tokens": 2877528.0, + "repeat_count": 2.0, + "routers_loss": 0.04880943149328232, + "skip_count": 1.0, + "step": 1784, + "text_loss": 0.4404778480529785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0009609715468061411, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2880627.0, + "repeat_count": 0.0, + "routers_loss": 0.004678630735725164, + "skip_count": 0.0, + "step": 1786, + "text_loss": 0.7295402884483337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009608515748955535, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2883333.0, + "repeat_count": 0.0, + "routers_loss": 0.0026695074047893286, + "skip_count": 0.0, + "step": 1788, + "text_loss": 0.9697831273078918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 8.40387437628412, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.107421875, + "learning_rate": 0.000960731426385773, + "loss": 0.0157, + "macro_f1": 0.4871794879436493, + "num_tokens": 2887444.0, + "repeat_count": 0.0, + "routers_loss": 0.029743613675236702, + "skip_count": 2.0, + "step": 1790, + "text_loss": 0.4737568199634552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.0009606111013228407, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 2890221.0, + "repeat_count": 0.0, + "routers_loss": 0.0016153788892552257, + "skip_count": 0.0, + "step": 1792, + "text_loss": 0.6693558096885681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.422659230995011, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009604905997528655, + "loss": 0.02, + "macro_f1": 0.3272727429866791, + "num_tokens": 2893262.0, + "repeat_count": 0.0, + "routers_loss": 0.01965433731675148, + "skip_count": 1.0, + "step": 1794, + "text_loss": 0.45227760076522827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.432051658350455, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009603699217220239, + "loss": 0.0117, + "macro_f1": 0.6601307392120361, + "num_tokens": 2896823.0, + "repeat_count": 1.0, + "routers_loss": 0.024017298594117165, + "skip_count": 2.0, + "step": 1796, + "text_loss": 0.48865509033203125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009602490672765597, + "loss": 0.0182, + "macro_f1": 0.3333333432674408, + "num_tokens": 2899707.0, + "repeat_count": 0.0, + "routers_loss": 0.0012420224957168102, + "skip_count": 0.0, + "step": 1798, + "text_loss": 0.43292415142059326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07861328125, + "learning_rate": 0.0009601280364627848, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 2902795.0, + "repeat_count": 0.0, + "routers_loss": 0.0020389219280332327, + "skip_count": 0.0, + "step": 1800, + "text_loss": 0.41021591424942017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009600068293270783, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 2905769.0, + "repeat_count": 0.0, + "routers_loss": 0.002006303984671831, + "skip_count": 0.0, + "step": 1802, + "text_loss": 0.46892106533050537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08740234375, + "learning_rate": 0.000959885445915887, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 2909475.0, + "repeat_count": 0.0, + "routers_loss": 0.003734810510650277, + "skip_count": 0.0, + "step": 1804, + "text_loss": 0.45364710688591003 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 8.479013795127678, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009597638862757254, + "loss": 0.0182, + "macro_f1": 0.8823530077934265, + "num_tokens": 2914348.0, + "repeat_count": 1.0, + "routers_loss": 0.038971323519945145, + "skip_count": 2.0, + "step": 1806, + "text_loss": 0.42913779616355896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.488406222483123, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009596421504531751, + "loss": 0.0249, + "macro_f1": 0.3272727429866791, + "num_tokens": 2917467.0, + "repeat_count": 1.0, + "routers_loss": 0.04800829663872719, + "skip_count": 0.0, + "step": 1808, + "text_loss": 0.17332297563552856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009595202384948858, + "loss": 0.0227, + "macro_f1": 0.6666666865348816, + "num_tokens": 2920223.0, + "repeat_count": 1.0, + "routers_loss": 0.009164143353700638, + "skip_count": 0.0, + "step": 1810, + "text_loss": 0.33740702271461487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009593981504475742, + "loss": 0.0275, + "macro_f1": 0.6666666865348816, + "num_tokens": 2923780.0, + "repeat_count": 0.0, + "routers_loss": 0.011236993595957756, + "skip_count": 2.0, + "step": 1812, + "text_loss": 0.1609916388988495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009592758863580248, + "loss": 0.0259, + "macro_f1": 0.5492662787437439, + "num_tokens": 2926259.0, + "repeat_count": 0.0, + "routers_loss": 0.019026532769203186, + "skip_count": 2.0, + "step": 1814, + "text_loss": 0.6460903882980347 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.525975931904902, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009591534462730894, + "loss": 0.0206, + "macro_f1": 0.5492662787437439, + "num_tokens": 2929173.0, + "repeat_count": 2.0, + "routers_loss": 0.0608333982527256, + "skip_count": 0.0, + "step": 1816, + "text_loss": 0.476126492023468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.000959030830239687, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 2932703.0, + "repeat_count": 0.0, + "routers_loss": 0.0093300249427557, + "skip_count": 0.0, + "step": 1818, + "text_loss": 0.5471875667572021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2001953125, + "learning_rate": 0.0009589080383048048, + "loss": 0.0235, + "macro_f1": 0.3333333432674408, + "num_tokens": 2936195.0, + "repeat_count": 0.0, + "routers_loss": 0.010434109717607498, + "skip_count": 0.0, + "step": 1820, + "text_loss": 0.5068115592002869 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009587850705154964, + "loss": 0.0291, + "macro_f1": 0.3333333432674408, + "num_tokens": 2939412.0, + "repeat_count": 0.0, + "routers_loss": 0.004347751382738352, + "skip_count": 0.0, + "step": 1822, + "text_loss": 0.4241984784603119 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.56354564132668, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0859375, + "learning_rate": 0.0009586619269188836, + "loss": 0.0224, + "macro_f1": 0.32098767161369324, + "num_tokens": 2942318.0, + "repeat_count": 0.0, + "routers_loss": 0.034238871186971664, + "skip_count": 1.0, + "step": 1824, + "text_loss": 0.2328975349664688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009585386075621553, + "loss": 0.027, + "macro_f1": 0.3333333432674408, + "num_tokens": 2945731.0, + "repeat_count": 0.0, + "routers_loss": 0.006097695790231228, + "skip_count": 0.0, + "step": 1826, + "text_loss": 0.22816994786262512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.582330496037569, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009584151124925676, + "loss": 0.0208, + "macro_f1": 0.3272727429866791, + "num_tokens": 2948944.0, + "repeat_count": 0.0, + "routers_loss": 0.007790776435285807, + "skip_count": 1.0, + "step": 1828, + "text_loss": 0.5009413361549377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009582914417574438, + "loss": 0.0145, + "macro_f1": 0.6666666865348816, + "num_tokens": 2951723.0, + "repeat_count": 0.0, + "routers_loss": 0.009144559502601624, + "skip_count": 2.0, + "step": 1830, + "text_loss": 0.1402502954006195 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0009581675954041751, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 2954726.0, + "repeat_count": 1.0, + "routers_loss": 0.006593191530555487, + "skip_count": 0.0, + "step": 1832, + "text_loss": 0.4871736466884613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009580435734802196, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 2957853.0, + "repeat_count": 0.0, + "routers_loss": 0.01241068821400404, + "skip_count": 0.0, + "step": 1834, + "text_loss": 0.30100154876708984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009579193760331027, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 2960783.0, + "repeat_count": 0.0, + "routers_loss": 0.002219218760728836, + "skip_count": 0.0, + "step": 1836, + "text_loss": 0.4961516559123993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.629292632814794, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009577950031104169, + "loss": 0.0166, + "macro_f1": 0.6601307392120361, + "num_tokens": 2963328.0, + "repeat_count": 1.0, + "routers_loss": 0.029363535344600677, + "skip_count": 2.0, + "step": 1838, + "text_loss": 0.42814353108406067 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 8.638685060170237, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009576704547598226, + "loss": 0.0257, + "macro_f1": 0.7795917987823486, + "num_tokens": 2966108.0, + "repeat_count": 1.0, + "routers_loss": 0.0579402856528759, + "skip_count": 4.0, + "step": 1840, + "text_loss": 0.20523512363433838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009575457310290463, + "loss": 0.0121, + "macro_f1": 0.3272727429866791, + "num_tokens": 2969137.0, + "repeat_count": 0.0, + "routers_loss": 0.008810589089989662, + "skip_count": 0.0, + "step": 1842, + "text_loss": 0.6199528574943542 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009574208319658831, + "loss": 0.0208, + "macro_f1": 0.6666666865348816, + "num_tokens": 2972407.0, + "repeat_count": 0.0, + "routers_loss": 0.0012295129708945751, + "skip_count": 1.0, + "step": 1844, + "text_loss": 0.66938316822052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 8.666862342236572, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1474609375, + "learning_rate": 0.000957295757618194, + "loss": 0.0152, + "macro_f1": 0.4871794879436493, + "num_tokens": 2976045.0, + "repeat_count": 0.0, + "routers_loss": 0.06162935495376587, + "skip_count": 2.0, + "step": 1846, + "text_loss": 0.5381782650947571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009571705080339079, + "loss": 0.0144, + "macro_f1": 0.3333333432674408, + "num_tokens": 2979025.0, + "repeat_count": 0.0, + "routers_loss": 0.003950524143874645, + "skip_count": 0.0, + "step": 1848, + "text_loss": 0.5831671357154846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.0009570450832610208, + "loss": 0.0209, + "macro_f1": 0.3333333432674408, + "num_tokens": 2982276.0, + "repeat_count": 0.0, + "routers_loss": 0.010354886762797832, + "skip_count": 0.0, + "step": 1850, + "text_loss": 0.27448201179504395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009569194833475956, + "loss": 0.0199, + "macro_f1": 0.3272727429866791, + "num_tokens": 2985691.0, + "repeat_count": 0.0, + "routers_loss": 0.010167439468204975, + "skip_count": 0.0, + "step": 1852, + "text_loss": 0.5264663696289062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.704432051658351, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1328125, + "learning_rate": 0.0009567937083417624, + "loss": 0.0194, + "macro_f1": 0.3272727429866791, + "num_tokens": 2989126.0, + "repeat_count": 0.0, + "routers_loss": 0.0371871180832386, + "skip_count": 1.0, + "step": 1854, + "text_loss": 0.2008018046617508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009566677582917185, + "loss": 0.0184, + "macro_f1": 0.3333333432674408, + "num_tokens": 2992814.0, + "repeat_count": 0.0, + "routers_loss": 0.010190588422119617, + "skip_count": 0.0, + "step": 1856, + "text_loss": 0.749717116355896 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.72321690636924, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009565416332457282, + "loss": 0.0132, + "macro_f1": 0.6538461446762085, + "num_tokens": 2995729.0, + "repeat_count": 1.0, + "routers_loss": 0.022285036742687225, + "skip_count": 1.0, + "step": 1858, + "text_loss": 0.5870219469070435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.732609333724685, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009564153332521228, + "loss": 0.0224, + "macro_f1": 0.3272727429866791, + "num_tokens": 2998812.0, + "repeat_count": 0.0, + "routers_loss": 0.011050296947360039, + "skip_count": 1.0, + "step": 1860, + "text_loss": 0.8444408774375916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0009562888583593005, + "loss": 0.0163, + "macro_f1": 0.3333333432674408, + "num_tokens": 3001799.0, + "repeat_count": 0.0, + "routers_loss": 0.007125461008399725, + "skip_count": 0.0, + "step": 1862, + "text_loss": 0.41510361433029175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009561622086157272, + "loss": 0.0236, + "macro_f1": 0.3333333432674408, + "num_tokens": 3005088.0, + "repeat_count": 0.0, + "routers_loss": 0.0049054501578211784, + "skip_count": 0.0, + "step": 1864, + "text_loss": 0.3801248073577881 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.000956035384069935, + "loss": 0.0238, + "macro_f1": 1.0, + "num_tokens": 3008178.0, + "repeat_count": 1.0, + "routers_loss": 0.005162427201867104, + "skip_count": 1.0, + "step": 1866, + "text_loss": 0.2687684893608093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0009559083847705233, + "loss": 0.0214, + "macro_f1": 0.3272727429866791, + "num_tokens": 3010923.0, + "repeat_count": 0.0, + "routers_loss": 0.028984658420085907, + "skip_count": 1.0, + "step": 1868, + "text_loss": 0.6277349591255188 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.779571470501908, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009557812107661584, + "loss": 0.0208, + "macro_f1": 1.0, + "num_tokens": 3015030.0, + "repeat_count": 1.0, + "routers_loss": 0.012200530618429184, + "skip_count": 1.0, + "step": 1870, + "text_loss": 0.6293368339538574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.788963897857352, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009556538621055739, + "loss": 0.0268, + "macro_f1": 0.3272727429866791, + "num_tokens": 3019067.0, + "repeat_count": 0.0, + "routers_loss": 0.06365182995796204, + "skip_count": 1.0, + "step": 1872, + "text_loss": 0.39046618342399597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009555263388375699, + "loss": 0.014, + "macro_f1": 0.6666666865348816, + "num_tokens": 3022166.0, + "repeat_count": 0.0, + "routers_loss": 0.0041703456081449986, + "skip_count": 1.0, + "step": 1874, + "text_loss": 0.42232340574264526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11572265625, + "learning_rate": 0.0009553986410110134, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3025865.0, + "repeat_count": 0.0, + "routers_loss": 0.005841755773872137, + "skip_count": 0.0, + "step": 1876, + "text_loss": 0.37600573897361755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.817141179923686, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009552707686748388, + "loss": 0.0219, + "macro_f1": 0.3272727429866791, + "num_tokens": 3029950.0, + "repeat_count": 0.0, + "routers_loss": 0.05165952071547508, + "skip_count": 1.0, + "step": 1878, + "text_loss": 0.33717799186706543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009551427218780467, + "loss": 0.0219, + "macro_f1": 0.6666666865348816, + "num_tokens": 3033649.0, + "repeat_count": 0.0, + "routers_loss": 0.020680008456110954, + "skip_count": 2.0, + "step": 1880, + "text_loss": 0.5011783838272095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.835926034634575, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009550145006697048, + "loss": 0.0217, + "macro_f1": 0.32098764181137085, + "num_tokens": 3036847.0, + "repeat_count": 0.0, + "routers_loss": 0.07626450061798096, + "skip_count": 2.0, + "step": 1882, + "text_loss": 0.3066408336162567 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009548861050989482, + "loss": 0.0136, + "macro_f1": 1.0, + "num_tokens": 3040353.0, + "repeat_count": 1.0, + "routers_loss": 0.010884666815400124, + "skip_count": 1.0, + "step": 1884, + "text_loss": 0.49779415130615234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009547575352149778, + "loss": 0.0213, + "macro_f1": 0.6666666865348816, + "num_tokens": 3043504.0, + "repeat_count": 0.0, + "routers_loss": 0.006704333238303661, + "skip_count": 2.0, + "step": 1886, + "text_loss": 0.12284614145755768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.86410331670091, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009546287910670621, + "loss": 0.0211, + "macro_f1": 0.5427350401878357, + "num_tokens": 3046422.0, + "repeat_count": 1.0, + "routers_loss": 0.04799000173807144, + "skip_count": 2.0, + "step": 1888, + "text_loss": 0.1824081838130951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009544998727045361, + "loss": 0.0306, + "macro_f1": 0.3333333432674408, + "num_tokens": 3049819.0, + "repeat_count": 0.0, + "routers_loss": 0.008139612153172493, + "skip_count": 0.0, + "step": 1890, + "text_loss": 0.18929053843021393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.8828881714118, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.09375, + "learning_rate": 0.0009543707801768015, + "loss": 0.0175, + "macro_f1": 0.5934640765190125, + "num_tokens": 3052766.0, + "repeat_count": 0.0, + "routers_loss": 0.02966771461069584, + "skip_count": 3.0, + "step": 1892, + "text_loss": 0.247748002409935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 25.0, + "epoch": 8.892280598767243, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009542415135333267, + "loss": 0.0193, + "macro_f1": 0.44705885648727417, + "num_tokens": 3056427.0, + "repeat_count": 0.0, + "routers_loss": 0.03637036308646202, + "skip_count": 2.0, + "step": 1894, + "text_loss": 0.2583999037742615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009541120728236472, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3059497.0, + "repeat_count": 0.0, + "routers_loss": 0.007026574574410915, + "skip_count": 0.0, + "step": 1896, + "text_loss": 0.5222375988960266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.076171875, + "learning_rate": 0.0009539824580973646, + "loss": 0.0219, + "macro_f1": 0.3333333432674408, + "num_tokens": 3062187.0, + "repeat_count": 0.0, + "routers_loss": 0.003449335927143693, + "skip_count": 0.0, + "step": 1898, + "text_loss": 0.5736427307128906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009538526694041477, + "loss": 0.0163, + "macro_f1": 0.3333333432674408, + "num_tokens": 3066100.0, + "repeat_count": 0.0, + "routers_loss": 0.0035463871899992228, + "skip_count": 0.0, + "step": 1900, + "text_loss": 0.5471583604812622 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009537227067937318, + "loss": 0.0233, + "macro_f1": 1.0, + "num_tokens": 3068737.0, + "repeat_count": 3.0, + "routers_loss": 0.00597514258697629, + "skip_count": 3.0, + "step": 1902, + "text_loss": 0.36644190549850464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.939242735544468, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.166015625, + "learning_rate": 0.0009535925703159186, + "loss": 0.0301, + "macro_f1": 0.32098764181137085, + "num_tokens": 3071686.0, + "repeat_count": 0.0, + "routers_loss": 0.025420479476451874, + "skip_count": 2.0, + "step": 1904, + "text_loss": 0.535789966583252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009534622600205769, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 3074954.0, + "repeat_count": 0.0, + "routers_loss": 0.014377486892044544, + "skip_count": 0.0, + "step": 1906, + "text_loss": 0.19009549915790558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009533317759576416, + "loss": 0.0197, + "macro_f1": 0.3333333432674408, + "num_tokens": 3077540.0, + "repeat_count": 0.0, + "routers_loss": 0.004848944488912821, + "skip_count": 0.0, + "step": 1908, + "text_loss": 0.5022001266479492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009532011181771148, + "loss": 0.0217, + "macro_f1": 0.6666666865348816, + "num_tokens": 3080445.0, + "repeat_count": 0.0, + "routers_loss": 0.009480170905590057, + "skip_count": 2.0, + "step": 1910, + "text_loss": 0.35135936737060547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0009530702867290644, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 3083657.0, + "repeat_count": 0.0, + "routers_loss": 0.0019353039097040892, + "skip_count": 0.0, + "step": 1912, + "text_loss": 0.5123994946479797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.986204872321691, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009529392816636256, + "loss": 0.0249, + "macro_f1": 0.3333333432674408, + "num_tokens": 3086837.0, + "repeat_count": 0.0, + "routers_loss": 0.0010921972570940852, + "skip_count": 0.0, + "step": 1914, + "text_loss": 0.44477662444114685 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19140625, + "learning_rate": 0.0009528081030309995, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 3089892.0, + "repeat_count": 0.0, + "routers_loss": 0.0018027103506028652, + "skip_count": 0.0, + "step": 1916, + "text_loss": 0.7356183528900146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009526767508814542, + "loss": 0.0236, + "macro_f1": 0.3333333432674408, + "num_tokens": 3093058.0, + "repeat_count": 0.0, + "routers_loss": 0.003243023296818137, + "skip_count": 0.0, + "step": 1918, + "text_loss": 0.48823556303977966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009525452252653239, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 3096404.0, + "repeat_count": 0.0, + "routers_loss": 0.009360014460980892, + "skip_count": 0.0, + "step": 1920, + "text_loss": 0.21498437225818634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.023481068388612, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.140625, + "learning_rate": 0.0009524135262330098, + "loss": 0.0224, + "macro_f1": 0.9265305995941162, + "num_tokens": 3099520.0, + "repeat_count": 1.0, + "routers_loss": 0.017444295808672905, + "skip_count": 3.0, + "step": 1922, + "text_loss": 0.27608850598335266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.032873495744056, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009522816538349789, + "loss": 0.0162, + "macro_f1": 0.5492662787437439, + "num_tokens": 3102956.0, + "repeat_count": 0.0, + "routers_loss": 0.06424452364444733, + "skip_count": 2.0, + "step": 1924, + "text_loss": 0.21558666229248047 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009521496081217651, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 3106565.0, + "repeat_count": 1.0, + "routers_loss": 0.002270506462082267, + "skip_count": 0.0, + "step": 1926, + "text_loss": 0.5641813278198242 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009520173891439684, + "loss": 0.0216, + "macro_f1": 0.6666666865348816, + "num_tokens": 3109314.0, + "repeat_count": 0.0, + "routers_loss": 0.011512448079884052, + "skip_count": 1.0, + "step": 1928, + "text_loss": 0.6351624727249146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009518849969522556, + "loss": 0.0198, + "macro_f1": 0.3333333432674408, + "num_tokens": 3112956.0, + "repeat_count": 0.0, + "routers_loss": 0.003883908037096262, + "skip_count": 0.0, + "step": 1930, + "text_loss": 0.35160085558891296 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009517524315973595, + "loss": 0.019, + "macro_f1": 1.0, + "num_tokens": 3115593.0, + "repeat_count": 1.0, + "routers_loss": 0.009479222819209099, + "skip_count": 3.0, + "step": 1932, + "text_loss": 0.2900560200214386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0009516196931300794, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3118516.0, + "repeat_count": 0.0, + "routers_loss": 0.017834696918725967, + "skip_count": 2.0, + "step": 1934, + "text_loss": 0.20094378292560577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009514867816012809, + "loss": 0.02, + "macro_f1": 0.3333333432674408, + "num_tokens": 3122242.0, + "repeat_count": 0.0, + "routers_loss": 0.0017964740982279181, + "skip_count": 0.0, + "step": 1936, + "text_loss": 0.6498590707778931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0009513536970618961, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 3125645.0, + "repeat_count": 0.0, + "routers_loss": 0.007437168620526791, + "skip_count": 2.0, + "step": 1938, + "text_loss": 0.25863033533096313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009512204395629232, + "loss": 0.0184, + "macro_f1": 0.6666666865348816, + "num_tokens": 3128740.0, + "repeat_count": 0.0, + "routers_loss": 0.0008759932243265212, + "skip_count": 1.0, + "step": 1940, + "text_loss": 0.5638351440429688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.117405341943059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009510870091554264, + "loss": 0.0153, + "macro_f1": 0.3272727429866791, + "num_tokens": 3131742.0, + "repeat_count": 1.0, + "routers_loss": 0.019906625151634216, + "skip_count": 0.0, + "step": 1942, + "text_loss": 0.8410717844963074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009509534058905369, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3134407.0, + "repeat_count": 0.0, + "routers_loss": 0.0009229081333614886, + "skip_count": 0.0, + "step": 1944, + "text_loss": 0.47506049275398254 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009508196298194517, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3137053.0, + "repeat_count": 0.0, + "routers_loss": 0.003630586201325059, + "skip_count": 0.0, + "step": 1946, + "text_loss": 0.32225799560546875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009506856809934338, + "loss": 0.0119, + "macro_f1": 0.3333333432674408, + "num_tokens": 3140943.0, + "repeat_count": 0.0, + "routers_loss": 0.007580445148050785, + "skip_count": 0.0, + "step": 1948, + "text_loss": 0.3120577931404114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009505515594638127, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3144298.0, + "repeat_count": 0.0, + "routers_loss": 0.004471861757338047, + "skip_count": 0.0, + "step": 1950, + "text_loss": 0.22052447497844696 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 9.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.09130859375, + "learning_rate": 0.0009504172652819843, + "loss": 0.023, + "macro_f1": 1.0, + "num_tokens": 3147069.0, + "repeat_count": 1.0, + "routers_loss": 0.009606664068996906, + "skip_count": 1.0, + "step": 1952, + "text_loss": 0.34773921966552734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009502827984994099, + "loss": 0.0148, + "macro_f1": 0.6666666865348816, + "num_tokens": 3149992.0, + "repeat_count": 0.0, + "routers_loss": 0.006443799939006567, + "skip_count": 1.0, + "step": 1954, + "text_loss": 0.6442171335220337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009501481591676177, + "loss": 0.0188, + "macro_f1": 0.3333333432674408, + "num_tokens": 3153167.0, + "repeat_count": 0.0, + "routers_loss": 0.003219039412215352, + "skip_count": 0.0, + "step": 1956, + "text_loss": 0.43369221687316895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.192544760786616, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.000950013347338202, + "loss": 0.0152, + "macro_f1": 0.3272727429866791, + "num_tokens": 3156590.0, + "repeat_count": 0.0, + "routers_loss": 0.025551019236445427, + "skip_count": 1.0, + "step": 1958, + "text_loss": 0.294479101896286 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009498783630628225, + "loss": 0.0158, + "macro_f1": 1.0, + "num_tokens": 3159451.0, + "repeat_count": 1.0, + "routers_loss": 0.013802438974380493, + "skip_count": 2.0, + "step": 1960, + "text_loss": 0.20888492465019226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.211329615497505, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009497432063932057, + "loss": 0.0137, + "macro_f1": 0.6601307392120361, + "num_tokens": 3162889.0, + "repeat_count": 1.0, + "routers_loss": 0.02852988988161087, + "skip_count": 2.0, + "step": 1962, + "text_loss": 0.5027125477790833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009496078773811437, + "loss": 0.0136, + "macro_f1": 0.6666666865348816, + "num_tokens": 3165979.0, + "repeat_count": 0.0, + "routers_loss": 0.01784522272646427, + "skip_count": 2.0, + "step": 1964, + "text_loss": 0.1696339100599289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.000949472376078495, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3168683.0, + "repeat_count": 0.0, + "routers_loss": 0.0017019887454807758, + "skip_count": 0.0, + "step": 1966, + "text_loss": 0.48905447125434875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.000949336702537184, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 3171968.0, + "repeat_count": 0.0, + "routers_loss": 0.004817947279661894, + "skip_count": 2.0, + "step": 1968, + "text_loss": 0.20984773337841034 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009492008568092007, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 3175947.0, + "repeat_count": 0.0, + "routers_loss": 0.0012963006738573313, + "skip_count": 0.0, + "step": 1970, + "text_loss": 0.5215106010437012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 9.258291752274728, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.044921875, + "learning_rate": 0.0009490648389466019, + "loss": 0.0135, + "macro_f1": 0.4871794879436493, + "num_tokens": 3179348.0, + "repeat_count": 0.0, + "routers_loss": 0.03950481489300728, + "skip_count": 2.0, + "step": 1972, + "text_loss": 0.24640929698944092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09326171875, + "learning_rate": 0.0009489286490015097, + "loss": 0.0183, + "macro_f1": 0.6666666865348816, + "num_tokens": 3182640.0, + "repeat_count": 0.0, + "routers_loss": 0.0043345349840819836, + "skip_count": 2.0, + "step": 1974, + "text_loss": 0.6362852454185486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009487922870261122, + "loss": 0.0155, + "macro_f1": 0.3333333432674408, + "num_tokens": 3185657.0, + "repeat_count": 0.0, + "routers_loss": 0.0015687479171901941, + "skip_count": 0.0, + "step": 1976, + "text_loss": 0.8977144360542297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009486557530726638, + "loss": 0.0139, + "macro_f1": 0.3333333432674408, + "num_tokens": 3188772.0, + "repeat_count": 0.0, + "routers_loss": 0.0010977238416671753, + "skip_count": 0.0, + "step": 1978, + "text_loss": 0.38512736558914185 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0009485190471934844, + "loss": 0.0196, + "macro_f1": 0.6666666865348816, + "num_tokens": 3193131.0, + "repeat_count": 2.0, + "routers_loss": 0.002264744369313121, + "skip_count": 0.0, + "step": 1980, + "text_loss": 0.4171289801597595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.305253889051952, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.00094838216944096, + "loss": 0.0219, + "macro_f1": 0.3272727429866791, + "num_tokens": 3196668.0, + "repeat_count": 0.0, + "routers_loss": 0.042320676147937775, + "skip_count": 1.0, + "step": 1982, + "text_loss": 0.19008000195026398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 9.314646316407396, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009482451198675424, + "loss": 0.0151, + "macro_f1": 0.32098767161369324, + "num_tokens": 3200282.0, + "repeat_count": 0.0, + "routers_loss": 0.01796630397439003, + "skip_count": 1.0, + "step": 1984, + "text_loss": 0.5009249448776245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0009481078985257494, + "loss": 0.0147, + "macro_f1": 0.6666666865348816, + "num_tokens": 3204439.0, + "repeat_count": 0.0, + "routers_loss": 0.01052347756922245, + "skip_count": 1.0, + "step": 1986, + "text_loss": 0.15319275856018066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.333431171118287, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009479705054681644, + "loss": 0.015, + "macro_f1": 0.3076923191547394, + "num_tokens": 3207590.0, + "repeat_count": 1.0, + "routers_loss": 0.09640293568372726, + "skip_count": 3.0, + "step": 1988, + "text_loss": 0.3654652535915375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.34282359847373, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009478329407474366, + "loss": 0.0183, + "macro_f1": 0.5492662787437439, + "num_tokens": 3211172.0, + "repeat_count": 0.0, + "routers_loss": 0.012670112773776054, + "skip_count": 1.0, + "step": 1990, + "text_loss": 0.5817596316337585 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.000947695204416281, + "loss": 0.0121, + "macro_f1": 0.6666666865348816, + "num_tokens": 3214050.0, + "repeat_count": 1.0, + "routers_loss": 0.005263707600533962, + "skip_count": 0.0, + "step": 1992, + "text_loss": 0.5985888242721558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.361608453184619, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009475572965274787, + "loss": 0.0144, + "macro_f1": 0.3272727429866791, + "num_tokens": 3217318.0, + "repeat_count": 1.0, + "routers_loss": 0.0682850033044815, + "skip_count": 0.0, + "step": 1994, + "text_loss": 0.316506564617157 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.000947419217133876, + "loss": 0.019, + "macro_f1": 0.6666666865348816, + "num_tokens": 3220012.0, + "repeat_count": 0.0, + "routers_loss": 0.008508823812007904, + "skip_count": 2.0, + "step": 1996, + "text_loss": 0.09665893763303757 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0009472809662883852, + "loss": 0.0155, + "macro_f1": 1.0, + "num_tokens": 3223019.0, + "repeat_count": 1.0, + "routers_loss": 0.01100847590714693, + "skip_count": 2.0, + "step": 1998, + "text_loss": 0.4938808083534241 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.389785735250953, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009471425440439844, + "loss": 0.0135, + "macro_f1": 0.8817967176437378, + "num_tokens": 3226013.0, + "repeat_count": 2.0, + "routers_loss": 0.04953207075595856, + "skip_count": 3.0, + "step": 2000, + "text_loss": 0.22258254885673523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 9.399178162606399, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009470039504537173, + "loss": 0.0186, + "macro_f1": 0.31446540355682373, + "num_tokens": 3230031.0, + "repeat_count": 0.0, + "routers_loss": 0.052884332835674286, + "skip_count": 2.0, + "step": 2002, + "text_loss": 0.1741616576910019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009468651855706931, + "loss": 0.0204, + "macro_f1": 0.6666666865348816, + "num_tokens": 3232991.0, + "repeat_count": 1.0, + "routers_loss": 0.008056716993451118, + "skip_count": 0.0, + "step": 2004, + "text_loss": 0.3173636198043823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009467262494480868, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3236390.0, + "repeat_count": 0.0, + "routers_loss": 0.0053409393876791, + "skip_count": 0.0, + "step": 2006, + "text_loss": 0.5806330442428589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.000946587142139139, + "loss": 0.0147, + "macro_f1": 0.3333333432674408, + "num_tokens": 3239267.0, + "repeat_count": 0.0, + "routers_loss": 0.0015652200672775507, + "skip_count": 0.0, + "step": 2008, + "text_loss": 0.6214317679405212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.436747872028178, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.000946447863697156, + "loss": 0.0151, + "macro_f1": 0.6601307392120361, + "num_tokens": 3242569.0, + "repeat_count": 1.0, + "routers_loss": 0.011673987843096256, + "skip_count": 2.0, + "step": 2010, + "text_loss": 0.532565712928772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0009463084141755093, + "loss": 0.0159, + "macro_f1": 0.3272727429866791, + "num_tokens": 3245669.0, + "repeat_count": 0.0, + "routers_loss": 0.028480790555477142, + "skip_count": 1.0, + "step": 2012, + "text_loss": 0.25210800766944885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009461687936276364, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3248751.0, + "repeat_count": 0.0, + "routers_loss": 0.007234727032482624, + "skip_count": 0.0, + "step": 2014, + "text_loss": 0.35922971367836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009460290021070402, + "loss": 0.0195, + "macro_f1": 0.6666666865348816, + "num_tokens": 3252614.0, + "repeat_count": 1.0, + "routers_loss": 0.014691276475787163, + "skip_count": 0.0, + "step": 2016, + "text_loss": 0.2747853398323059 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009458890396672888, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 3256374.0, + "repeat_count": 0.0, + "routers_loss": 0.002385235857218504, + "skip_count": 0.0, + "step": 2018, + "text_loss": 0.5268719792366028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 9.483710008805401, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009457489063620164, + "loss": 0.0133, + "macro_f1": 0.8823530077934265, + "num_tokens": 3259792.0, + "repeat_count": 1.0, + "routers_loss": 0.047268565744161606, + "skip_count": 2.0, + "step": 2020, + "text_loss": 0.7785539627075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.493102436160845, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009456086022449221, + "loss": 0.0218, + "macro_f1": 0.3272727429866791, + "num_tokens": 3262833.0, + "repeat_count": 0.0, + "routers_loss": 0.015878718346357346, + "skip_count": 1.0, + "step": 2022, + "text_loss": 0.42270028591156006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009454681273697711, + "loss": 0.0117, + "macro_f1": 0.3272727429866791, + "num_tokens": 3265718.0, + "repeat_count": 1.0, + "routers_loss": 0.030749641358852386, + "skip_count": 0.0, + "step": 2024, + "text_loss": 0.18668225407600403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0009453274817903931, + "loss": 0.012, + "macro_f1": 0.6666666865348816, + "num_tokens": 3268158.0, + "repeat_count": 0.0, + "routers_loss": 0.011538166552782059, + "skip_count": 1.0, + "step": 2026, + "text_loss": 0.34090787172317505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.000945186665560684, + "loss": 0.0218, + "macro_f1": 0.3333333432674408, + "num_tokens": 3271082.0, + "repeat_count": 0.0, + "routers_loss": 0.009527760557830334, + "skip_count": 0.0, + "step": 2028, + "text_loss": 0.2110334187746048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.530672145582624, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.000945045678734605, + "loss": 0.0175, + "macro_f1": 0.3144654333591461, + "num_tokens": 3273488.0, + "repeat_count": 0.0, + "routers_loss": 0.03317151218652725, + "skip_count": 3.0, + "step": 2030, + "text_loss": 0.2233227640390396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.540064572938068, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009449045213661822, + "loss": 0.0201, + "macro_f1": 0.3272727429866791, + "num_tokens": 3276646.0, + "repeat_count": 0.0, + "routers_loss": 0.018510591238737106, + "skip_count": 1.0, + "step": 2032, + "text_loss": 0.16100332140922546 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 9.549457000293513, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.1318359375, + "learning_rate": 0.0009447631935095077, + "loss": 0.0185, + "macro_f1": 0.9452888369560242, + "num_tokens": 3279441.0, + "repeat_count": 1.0, + "routers_loss": 0.028113311156630516, + "skip_count": 4.0, + "step": 2034, + "text_loss": 0.29208317399024963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009446216952187384, + "loss": 0.0164, + "macro_f1": 0.3333333432674408, + "num_tokens": 3282697.0, + "repeat_count": 0.0, + "routers_loss": 0.008379172533750534, + "skip_count": 0.0, + "step": 2036, + "text_loss": 0.16026398539543152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009444800265480967, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 3285574.0, + "repeat_count": 0.0, + "routers_loss": 0.00941354501992464, + "skip_count": 0.0, + "step": 2038, + "text_loss": 0.29523080587387085 + }, + { + "acc_repeat": 0.75, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.577634282359847, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.8571428656578064, + "f1_skip": 0.800000011920929, + "grad_norm": 0.076171875, + "learning_rate": 0.0009443381875518703, + "loss": 0.0197, + "macro_f1": 0.8600732684135437, + "num_tokens": 3289159.0, + "repeat_count": 4.0, + "routers_loss": 0.04974055662751198, + "skip_count": 6.0, + "step": 2040, + "text_loss": 0.23033179342746735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.587026709715293, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0009441961782844123, + "loss": 0.0146, + "macro_f1": 0.3272727429866791, + "num_tokens": 3293598.0, + "repeat_count": 0.0, + "routers_loss": 0.022241825237870216, + "skip_count": 1.0, + "step": 2042, + "text_loss": 0.8299165368080139 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009440539988001408, + "loss": 0.0159, + "macro_f1": 0.3333333432674408, + "num_tokens": 3296648.0, + "repeat_count": 0.0, + "routers_loss": 0.011019332334399223, + "skip_count": 0.0, + "step": 2044, + "text_loss": 0.18207129836082458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0009439116491535394, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 3300058.0, + "repeat_count": 0.0, + "routers_loss": 0.002889640862122178, + "skip_count": 0.0, + "step": 2046, + "text_loss": 0.7051978707313538 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 9.615203991781627, + "f1_execute": 0.9333333373069763, + "f1_repeat": 0.5, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.078125, + "learning_rate": 0.0009437691293991563, + "loss": 0.0192, + "macro_f1": 0.7634921073913574, + "num_tokens": 3303296.0, + "repeat_count": 3.0, + "routers_loss": 0.07741832733154297, + "skip_count": 4.0, + "step": 2048, + "text_loss": 0.15563532710075378 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.0009436264395916061, + "loss": 0.0209, + "macro_f1": 0.6666666865348816, + "num_tokens": 3306204.0, + "repeat_count": 0.0, + "routers_loss": 0.014225383289158344, + "skip_count": 2.0, + "step": 2050, + "text_loss": 0.18117287755012512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009434835797855672, + "loss": 0.0165, + "macro_f1": 0.3333333432674408, + "num_tokens": 3309444.0, + "repeat_count": 0.0, + "routers_loss": 0.0023932650219649076, + "skip_count": 0.0, + "step": 2052, + "text_loss": 0.4645874798297882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.643381273847961, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009433405500357839, + "loss": 0.0153, + "macro_f1": 0.3272727429866791, + "num_tokens": 3312488.0, + "repeat_count": 0.0, + "routers_loss": 0.03193361684679985, + "skip_count": 1.0, + "step": 2054, + "text_loss": 0.5291082859039307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009431973503970655, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 3315765.0, + "repeat_count": 0.0, + "routers_loss": 0.0020529816392809153, + "skip_count": 0.0, + "step": 2056, + "text_loss": 0.5877931118011475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.66216612855885, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009430539809242864, + "loss": 0.0185, + "macro_f1": 0.32098764181137085, + "num_tokens": 3318877.0, + "repeat_count": 2.0, + "routers_loss": 0.07907948642969131, + "skip_count": 0.0, + "step": 2058, + "text_loss": 0.3836737871170044 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009429104416723862, + "loss": 0.0163, + "macro_f1": 0.6666666865348816, + "num_tokens": 3322576.0, + "repeat_count": 2.0, + "routers_loss": 0.003006070153787732, + "skip_count": 0.0, + "step": 2060, + "text_loss": 0.3480920195579529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009427667326963689, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 3325974.0, + "repeat_count": 0.0, + "routers_loss": 0.005013179033994675, + "skip_count": 0.0, + "step": 2062, + "text_loss": 0.931358814239502 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009426228540513047, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 3329398.0, + "repeat_count": 0.0, + "routers_loss": 0.0059848143719136715, + "skip_count": 0.0, + "step": 2064, + "text_loss": 0.47568953037261963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009424788057923277, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3332029.0, + "repeat_count": 0.0, + "routers_loss": 0.00783882662653923, + "skip_count": 0.0, + "step": 2066, + "text_loss": 0.22887596487998962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.709128265336073, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009423345879746376, + "loss": 0.0128, + "macro_f1": 0.5492662787437439, + "num_tokens": 3334858.0, + "repeat_count": 0.0, + "routers_loss": 0.01866884157061577, + "skip_count": 2.0, + "step": 2068, + "text_loss": 0.17724967002868652 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.718520692691518, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.000942190200653499, + "loss": 0.0162, + "macro_f1": 0.32098764181137085, + "num_tokens": 3338094.0, + "repeat_count": 0.0, + "routers_loss": 0.028636593371629715, + "skip_count": 2.0, + "step": 2070, + "text_loss": 0.34344956278800964 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.727913120046962, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009420456438842413, + "loss": 0.0165, + "macro_f1": 0.5492662787437439, + "num_tokens": 3340526.0, + "repeat_count": 0.0, + "routers_loss": 0.023245645686984062, + "skip_count": 2.0, + "step": 2072, + "text_loss": 0.7276164293289185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.737305547402407, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11328125, + "learning_rate": 0.000941900917722259, + "loss": 0.0143, + "macro_f1": 0.3272727429866791, + "num_tokens": 3343303.0, + "repeat_count": 1.0, + "routers_loss": 0.01565689593553543, + "skip_count": 0.0, + "step": 2074, + "text_loss": 0.5665070414543152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009417560222230115, + "loss": 0.0245, + "macro_f1": 0.3333333432674408, + "num_tokens": 3346409.0, + "repeat_count": 0.0, + "routers_loss": 0.0035056080669164658, + "skip_count": 0.0, + "step": 2076, + "text_loss": 0.5112795233726501 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009416109574420229, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3349220.0, + "repeat_count": 0.0, + "routers_loss": 0.0027565446216613054, + "skip_count": 0.0, + "step": 2078, + "text_loss": 0.5240910053253174 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 9.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009414657234348823, + "loss": 0.0186, + "macro_f1": 1.0, + "num_tokens": 3352627.0, + "repeat_count": 3.0, + "routers_loss": 0.01652451977133751, + "skip_count": 2.0, + "step": 2080, + "text_loss": 1.0217112302780151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.774875256824185, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009413203202572438, + "loss": 0.0179, + "macro_f1": 0.32098764181137085, + "num_tokens": 3355392.0, + "repeat_count": 0.0, + "routers_loss": 0.1012420505285263, + "skip_count": 2.0, + "step": 2082, + "text_loss": 0.4085482358932495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08251953125, + "learning_rate": 0.000941174747964826, + "loss": 0.0154, + "macro_f1": 0.3333333432674408, + "num_tokens": 3358425.0, + "repeat_count": 0.0, + "routers_loss": 0.004962718114256859, + "skip_count": 0.0, + "step": 2084, + "text_loss": 0.5833504796028137 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.793660111535075, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.800000011920929, + "grad_norm": 0.11376953125, + "learning_rate": 0.0009410290066134124, + "loss": 0.0211, + "macro_f1": 0.8083333373069763, + "num_tokens": 3361925.0, + "repeat_count": 2.0, + "routers_loss": 0.07889176905155182, + "skip_count": 3.0, + "step": 2086, + "text_loss": 0.38126569986343384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.803052538890519, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009408830962588517, + "loss": 0.0195, + "macro_f1": 0.6601307392120361, + "num_tokens": 3365963.0, + "repeat_count": 1.0, + "routers_loss": 0.033715736120939255, + "skip_count": 2.0, + "step": 2088, + "text_loss": 0.23213914036750793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009407370169570567, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3369422.0, + "repeat_count": 0.0, + "routers_loss": 0.0014188943896442652, + "skip_count": 0.0, + "step": 2090, + "text_loss": 0.4648318886756897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.82183739360141, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009405907687640054, + "loss": 0.013, + "macro_f1": 0.3272727429866791, + "num_tokens": 3372506.0, + "repeat_count": 0.0, + "routers_loss": 0.015339684672653675, + "skip_count": 1.0, + "step": 2092, + "text_loss": 0.2563800811767578 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.831229820956853, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.0009404443517357404, + "loss": 0.0146, + "macro_f1": 0.542222261428833, + "num_tokens": 3375653.0, + "repeat_count": 4.0, + "routers_loss": 0.06562861055135727, + "skip_count": 0.0, + "step": 2094, + "text_loss": 0.797835111618042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.000940297765928369, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3379018.0, + "repeat_count": 0.0, + "routers_loss": 0.005745889153331518, + "skip_count": 0.0, + "step": 2096, + "text_loss": 0.4238114655017853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009401510113980631, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 3382855.0, + "repeat_count": 0.0, + "routers_loss": 0.0026634482201188803, + "skip_count": 0.0, + "step": 2098, + "text_loss": 0.4967166483402252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009400040882010592, + "loss": 0.0166, + "macro_f1": 0.3333333432674408, + "num_tokens": 3386386.0, + "repeat_count": 0.0, + "routers_loss": 0.0020642587915062904, + "skip_count": 0.0, + "step": 2100, + "text_loss": 0.44390562176704407 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0009398569963936589, + "loss": 0.017, + "macro_f1": 0.3272727429866791, + "num_tokens": 3389958.0, + "repeat_count": 0.0, + "routers_loss": 0.013722737319767475, + "skip_count": 1.0, + "step": 2102, + "text_loss": 0.7207565903663635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009397097360322276, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 3392892.0, + "repeat_count": 0.0, + "routers_loss": 0.002051608171314001, + "skip_count": 0.0, + "step": 2104, + "text_loss": 0.3196398913860321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.000939562307173196, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 3396636.0, + "repeat_count": 0.0, + "routers_loss": 0.007085663266479969, + "skip_count": 0.0, + "step": 2106, + "text_loss": 0.5663776397705078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.896976812444967, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11328125, + "learning_rate": 0.0009394147098730592, + "loss": 0.02, + "macro_f1": 0.5492662787437439, + "num_tokens": 3399475.0, + "repeat_count": 0.0, + "routers_loss": 0.019473131746053696, + "skip_count": 2.0, + "step": 2108, + "text_loss": 0.7708223462104797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0009392669441883767, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 3402350.0, + "repeat_count": 0.0, + "routers_loss": 0.0028328890912234783, + "skip_count": 0.0, + "step": 2110, + "text_loss": 0.5888006091117859 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009391190101757724, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3405561.0, + "repeat_count": 0.0, + "routers_loss": 0.023098422214388847, + "skip_count": 2.0, + "step": 2112, + "text_loss": 0.09865197539329529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.000938970907891935, + "loss": 0.0247, + "macro_f1": 0.3333333432674408, + "num_tokens": 3408513.0, + "repeat_count": 0.0, + "routers_loss": 0.002896632067859173, + "skip_count": 0.0, + "step": 2114, + "text_loss": 0.6613234281539917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009388226373936179, + "loss": 0.0211, + "macro_f1": 0.3333333432674408, + "num_tokens": 3411195.0, + "repeat_count": 0.0, + "routers_loss": 0.015814457088708878, + "skip_count": 0.0, + "step": 2116, + "text_loss": 0.17363053560256958 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.94393894922219, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009386741987376381, + "loss": 0.015, + "macro_f1": 0.6603773832321167, + "num_tokens": 3414875.0, + "repeat_count": 1.0, + "routers_loss": 0.02676783688366413, + "skip_count": 0.0, + "step": 2118, + "text_loss": 0.674056887626648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009385255919808778, + "loss": 0.0203, + "macro_f1": 0.6666666865348816, + "num_tokens": 3418410.0, + "repeat_count": 0.0, + "routers_loss": 0.01022857241332531, + "skip_count": 1.0, + "step": 2120, + "text_loss": 0.235092431306839 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.962723803933079, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009383768171802836, + "loss": 0.0244, + "macro_f1": 0.5492662787437439, + "num_tokens": 3421289.0, + "repeat_count": 0.0, + "routers_loss": 0.013572212308645248, + "skip_count": 2.0, + "step": 2122, + "text_loss": 0.5992844104766846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009382278743928659, + "loss": 0.0201, + "macro_f1": 0.6666666865348816, + "num_tokens": 3424781.0, + "repeat_count": 0.0, + "routers_loss": 0.0051873656921088696, + "skip_count": 2.0, + "step": 2124, + "text_loss": 0.29915499687194824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 9.981508658643968, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.07421875, + "learning_rate": 0.0009380787636757001, + "loss": 0.0155, + "macro_f1": 0.6122449040412903, + "num_tokens": 3427942.0, + "repeat_count": 0.0, + "routers_loss": 0.030079292133450508, + "skip_count": 4.0, + "step": 2126, + "text_loss": 0.24181491136550903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009379294850859256, + "loss": 0.0141, + "macro_f1": 0.3333333432674408, + "num_tokens": 3431314.0, + "repeat_count": 0.0, + "routers_loss": 0.002675612922757864, + "skip_count": 0.0, + "step": 2128, + "text_loss": 0.4669873118400574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009377800386807465, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 3435020.0, + "repeat_count": 0.0, + "routers_loss": 0.009334275498986244, + "skip_count": 0.0, + "step": 2130, + "text_loss": 0.6478219628334045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.134765625, + "learning_rate": 0.0009376304245174306, + "loss": 0.0137, + "macro_f1": 0.6000000238418579, + "num_tokens": 3438276.0, + "repeat_count": 1.0, + "routers_loss": 0.038227908313274384, + "skip_count": 2.0, + "step": 2132, + "text_loss": 0.4401201903820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0009374806426533104, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 3440938.0, + "repeat_count": 0.0, + "routers_loss": 0.006901399698108435, + "skip_count": 0.0, + "step": 2134, + "text_loss": 0.5948942303657532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009373306931457827, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3444028.0, + "repeat_count": 0.0, + "routers_loss": 0.0037061909679323435, + "skip_count": 0.0, + "step": 2136, + "text_loss": 0.5349751114845276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0009371805760523086, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 3448331.0, + "repeat_count": 0.0, + "routers_loss": 0.0025877030566334724, + "skip_count": 0.0, + "step": 2138, + "text_loss": 0.4591051936149597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.046962136777223, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009370302914304129, + "loss": 0.0144, + "macro_f1": 0.5934640765190125, + "num_tokens": 3451434.0, + "repeat_count": 0.0, + "routers_loss": 0.018742674961686134, + "skip_count": 3.0, + "step": 2140, + "text_loss": 0.23470863699913025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.056354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009368798393376851, + "loss": 0.0122, + "macro_f1": 0.3272727429866791, + "num_tokens": 3454375.0, + "repeat_count": 0.0, + "routers_loss": 0.02382594160735607, + "skip_count": 1.0, + "step": 2142, + "text_loss": 0.6077954769134521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.065746991488112, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05517578125, + "learning_rate": 0.0009367292198317787, + "loss": 0.0164, + "macro_f1": 0.5492662787437439, + "num_tokens": 3457591.0, + "repeat_count": 0.0, + "routers_loss": 0.03331060707569122, + "skip_count": 2.0, + "step": 2144, + "text_loss": 0.3691073954105377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009365784329704115, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 3460895.0, + "repeat_count": 0.0, + "routers_loss": 0.0016955457394942641, + "skip_count": 0.0, + "step": 2146, + "text_loss": 0.3947436511516571 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009364274788113651, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 3464101.0, + "repeat_count": 1.0, + "routers_loss": 0.006169239990413189, + "skip_count": 0.0, + "step": 2148, + "text_loss": 0.3348555266857147 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 10.093924273554446, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009362763574124858, + "loss": 0.019, + "macro_f1": 0.9265305995941162, + "num_tokens": 3467417.0, + "repeat_count": 3.0, + "routers_loss": 0.024033790454268456, + "skip_count": 1.0, + "step": 2150, + "text_loss": 0.496633380651474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0009361250688316829, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 3470917.0, + "repeat_count": 0.0, + "routers_loss": 0.0024986129719763994, + "skip_count": 0.0, + "step": 2152, + "text_loss": 0.6857671737670898 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0009359736131269312, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3473624.0, + "repeat_count": 0.0, + "routers_loss": 0.008183322846889496, + "skip_count": 1.0, + "step": 2154, + "text_loss": 0.13883116841316223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009358219903562684, + "loss": 0.0106, + "macro_f1": 0.6666666865348816, + "num_tokens": 3476472.0, + "repeat_count": 0.0, + "routers_loss": 0.011198793537914753, + "skip_count": 3.0, + "step": 2156, + "text_loss": 0.24243666231632233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009356702005777969, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 3479688.0, + "repeat_count": 0.0, + "routers_loss": 0.002520184963941574, + "skip_count": 0.0, + "step": 2158, + "text_loss": 0.6407818794250488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009355182438496825, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 3482598.0, + "repeat_count": 0.0, + "routers_loss": 0.0011065017897635698, + "skip_count": 0.0, + "step": 2160, + "text_loss": 0.7214245796203613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009353661202301557, + "loss": 0.0144, + "macro_f1": 0.3333333432674408, + "num_tokens": 3486271.0, + "repeat_count": 0.0, + "routers_loss": 0.0017824085662141442, + "skip_count": 0.0, + "step": 2162, + "text_loss": 0.5140969157218933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0009352138297775101, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 3489206.0, + "repeat_count": 0.0, + "routers_loss": 0.001542879967018962, + "skip_count": 0.0, + "step": 2164, + "text_loss": 0.7956416606903076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.000935061372550104, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 3492003.0, + "repeat_count": 0.0, + "routers_loss": 0.01420794241130352, + "skip_count": 3.0, + "step": 2166, + "text_loss": 0.27489882707595825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009349087486063594, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3494784.0, + "repeat_count": 0.0, + "routers_loss": 0.003614309709519148, + "skip_count": 1.0, + "step": 2168, + "text_loss": 0.2962227761745453 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.187848547108894, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009347559580047618, + "loss": 0.0175, + "macro_f1": 0.8814815282821655, + "num_tokens": 3497886.0, + "repeat_count": 2.0, + "routers_loss": 0.02122853323817253, + "skip_count": 4.0, + "step": 2170, + "text_loss": 0.5919580459594727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06396484375, + "learning_rate": 0.000934603000803861, + "loss": 0.0135, + "macro_f1": 0.5492662787437439, + "num_tokens": 3500939.0, + "repeat_count": 0.0, + "routers_loss": 0.02042219042778015, + "skip_count": 1.0, + "step": 2172, + "text_loss": 0.28722381591796875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009344498770622704, + "loss": 0.013, + "macro_f1": 0.3333333432674408, + "num_tokens": 3504852.0, + "repeat_count": 0.0, + "routers_loss": 0.004345106892287731, + "skip_count": 0.0, + "step": 2174, + "text_loss": 0.603236734867096 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009342965868386673, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 3508320.0, + "repeat_count": 0.0, + "routers_loss": 0.00368050136603415, + "skip_count": 0.0, + "step": 2176, + "text_loss": 0.6020491719245911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.000934143130191793, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 3511278.0, + "repeat_count": 0.0, + "routers_loss": 0.013425769284367561, + "skip_count": 0.0, + "step": 2178, + "text_loss": 0.5954724550247192 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060546875, + "learning_rate": 0.000933989507180452, + "loss": 0.0149, + "macro_f1": 0.3333333432674408, + "num_tokens": 3514361.0, + "repeat_count": 0.0, + "routers_loss": 0.002896249992772937, + "skip_count": 0.0, + "step": 2180, + "text_loss": 0.39175131916999817 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.244203111241562, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0009338357178635135, + "loss": 0.0147, + "macro_f1": 0.6603773832321167, + "num_tokens": 3517962.0, + "repeat_count": 1.0, + "routers_loss": 0.011538350023329258, + "skip_count": 1.0, + "step": 2182, + "text_loss": 0.4482830762863159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.253595538597006, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009336817622999093, + "loss": 0.011, + "macro_f1": 0.3272727429866791, + "num_tokens": 3521299.0, + "repeat_count": 1.0, + "routers_loss": 0.022787930443882942, + "skip_count": 0.0, + "step": 2184, + "text_loss": 0.35177817940711975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009335276405486357, + "loss": 0.0139, + "macro_f1": 0.3272727429866791, + "num_tokens": 3524611.0, + "repeat_count": 0.0, + "routers_loss": 0.011597735807299614, + "skip_count": 1.0, + "step": 2186, + "text_loss": 0.24868851900100708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009333733526687524, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 3528012.0, + "repeat_count": 0.0, + "routers_loss": 0.014253967441618443, + "skip_count": 0.0, + "step": 2188, + "text_loss": 0.3970910310745239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.000933218898719383, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 3530908.0, + "repeat_count": 0.0, + "routers_loss": 0.001659149187617004, + "skip_count": 0.0, + "step": 2190, + "text_loss": 0.7618573307991028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009330642787597141, + "loss": 0.0159, + "macro_f1": 0.3333333432674408, + "num_tokens": 3533993.0, + "repeat_count": 0.0, + "routers_loss": 0.005574346985667944, + "skip_count": 0.0, + "step": 2192, + "text_loss": 0.16470147669315338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009329094928489969, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3537310.0, + "repeat_count": 0.0, + "routers_loss": 0.0026400673668831587, + "skip_count": 0.0, + "step": 2194, + "text_loss": 0.3400416374206543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009327545410465452, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3540045.0, + "repeat_count": 0.0, + "routers_loss": 0.008448398672044277, + "skip_count": 3.0, + "step": 2196, + "text_loss": 0.3110542297363281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.31934253008512, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009325994234117372, + "loss": 0.0122, + "macro_f1": 0.32098764181137085, + "num_tokens": 3544097.0, + "repeat_count": 0.0, + "routers_loss": 0.037553198635578156, + "skip_count": 2.0, + "step": 2198, + "text_loss": 0.36126700043678284 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.09716796875, + "learning_rate": 0.000932444140004014, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3547054.0, + "repeat_count": 1.0, + "routers_loss": 0.006464479025453329, + "skip_count": 0.0, + "step": 2200, + "text_loss": 0.4947047233581543 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009322886908828805, + "loss": 0.0138, + "macro_f1": 0.6666666865348816, + "num_tokens": 3549903.0, + "repeat_count": 1.0, + "routers_loss": 0.005384812597185373, + "skip_count": 0.0, + "step": 2202, + "text_loss": 0.5923738479614258 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009321330761079052, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 3553745.0, + "repeat_count": 0.0, + "routers_loss": 0.015346619300544262, + "skip_count": 2.0, + "step": 2204, + "text_loss": 0.1904175877571106 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.356912239506897, + "f1_execute": 0.9268292784690857, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06494140625, + "learning_rate": 0.00093197729573872, + "loss": 0.0203, + "macro_f1": 0.8422764539718628, + "num_tokens": 3557235.0, + "repeat_count": 3.0, + "routers_loss": 0.1207597479224205, + "skip_count": 6.0, + "step": 2206, + "text_loss": 0.3904837667942047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0009318213498350202, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3560795.0, + "repeat_count": 0.0, + "routers_loss": 0.003334777895361185, + "skip_count": 0.0, + "step": 2208, + "text_loss": 0.4268290102481842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0009316652384565645, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3563754.0, + "repeat_count": 0.0, + "routers_loss": 0.004230072256177664, + "skip_count": 0.0, + "step": 2210, + "text_loss": 0.40049710869789124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046875, + "learning_rate": 0.0009315089616631751, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 3567173.0, + "repeat_count": 0.0, + "routers_loss": 0.0006645230459980667, + "skip_count": 0.0, + "step": 2212, + "text_loss": 0.42568323016166687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009313525195147376, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3570831.0, + "repeat_count": 0.0, + "routers_loss": 0.0097877848893404, + "skip_count": 0.0, + "step": 2214, + "text_loss": 0.45808279514312744 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 10.40387437628412, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.076171875, + "learning_rate": 0.000931195912071201, + "loss": 0.0187, + "macro_f1": 0.7018141150474548, + "num_tokens": 3573745.0, + "repeat_count": 2.0, + "routers_loss": 0.07351134717464447, + "skip_count": 3.0, + "step": 2216, + "text_loss": 0.285696804523468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009310391393925775, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 3576785.0, + "repeat_count": 0.0, + "routers_loss": 0.0033160944003611803, + "skip_count": 0.0, + "step": 2218, + "text_loss": 0.17516443133354187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.422659230995011, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.047119140625, + "learning_rate": 0.0009308822015389424, + "loss": 0.0241, + "macro_f1": 0.5427350401878357, + "num_tokens": 3580695.0, + "repeat_count": 1.0, + "routers_loss": 0.052930232137441635, + "skip_count": 1.0, + "step": 2220, + "text_loss": 0.5918155908584595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 10.432051658350455, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.072265625, + "learning_rate": 0.0009307250985704352, + "loss": 0.0128, + "macro_f1": 0.6122449040412903, + "num_tokens": 3583729.0, + "repeat_count": 0.0, + "routers_loss": 0.025454653427004814, + "skip_count": 4.0, + "step": 2222, + "text_loss": 0.2652169466018677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0009305678305472575, + "loss": 0.0158, + "macro_f1": 0.3333333432674408, + "num_tokens": 3586775.0, + "repeat_count": 0.0, + "routers_loss": 0.011279845610260963, + "skip_count": 0.0, + "step": 2224, + "text_loss": 0.3511691987514496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.000930410397529675, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 3589676.0, + "repeat_count": 0.0, + "routers_loss": 0.002700264798477292, + "skip_count": 0.0, + "step": 2226, + "text_loss": 0.24045433104038239 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.000930252799578016, + "loss": 0.0146, + "macro_f1": 1.0, + "num_tokens": 3593242.0, + "repeat_count": 1.0, + "routers_loss": 0.00826631672680378, + "skip_count": 2.0, + "step": 2228, + "text_loss": 0.3777645528316498 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.469621367772234, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009300950367526728, + "loss": 0.0131, + "macro_f1": 0.8820862174034119, + "num_tokens": 3596807.0, + "repeat_count": 2.0, + "routers_loss": 0.036221496760845184, + "skip_count": 2.0, + "step": 2230, + "text_loss": 0.502962589263916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009299371091141001, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3600150.0, + "repeat_count": 0.0, + "routers_loss": 0.006449893582612276, + "skip_count": 0.0, + "step": 2232, + "text_loss": 0.20256924629211426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009297790167228161, + "loss": 0.012, + "macro_f1": 0.6666666865348816, + "num_tokens": 3602988.0, + "repeat_count": 0.0, + "routers_loss": 0.007872486487030983, + "skip_count": 2.0, + "step": 2234, + "text_loss": 0.42476826906204224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.497798649838568, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009296207596394022, + "loss": 0.0101, + "macro_f1": 0.32098764181137085, + "num_tokens": 3606071.0, + "repeat_count": 0.0, + "routers_loss": 0.027397040277719498, + "skip_count": 2.0, + "step": 2236, + "text_loss": 0.23432791233062744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009294623379245028, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 3609389.0, + "repeat_count": 0.0, + "routers_loss": 0.01042645052075386, + "skip_count": 0.0, + "step": 2238, + "text_loss": 0.16665785014629364 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009293037516388252, + "loss": 0.0161, + "macro_f1": 0.3333333432674408, + "num_tokens": 3612105.0, + "repeat_count": 0.0, + "routers_loss": 0.0012458425480872393, + "skip_count": 0.0, + "step": 2240, + "text_loss": 0.59421306848526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 10.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009291450008431404, + "loss": 0.0185, + "macro_f1": 1.0, + "num_tokens": 3615439.0, + "repeat_count": 1.0, + "routers_loss": 0.005781981628388166, + "skip_count": 1.0, + "step": 2242, + "text_loss": 0.510798454284668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 10.535368359260346, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009289860855982814, + "loss": 0.0166, + "macro_f1": 0.4871794879436493, + "num_tokens": 3618842.0, + "repeat_count": 0.0, + "routers_loss": 0.031195320188999176, + "skip_count": 3.0, + "step": 2244, + "text_loss": 0.7574363350868225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0009288270059651454, + "loss": 0.0133, + "macro_f1": 0.3333333432674408, + "num_tokens": 3621823.0, + "repeat_count": 0.0, + "routers_loss": 0.001746491645462811, + "skip_count": 0.0, + "step": 2246, + "text_loss": 0.5125683546066284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.554153213971237, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.220703125, + "learning_rate": 0.0009286677620046918, + "loss": 0.0159, + "macro_f1": 0.5492662787437439, + "num_tokens": 3624502.0, + "repeat_count": 0.0, + "routers_loss": 0.03792348504066467, + "skip_count": 2.0, + "step": 2248, + "text_loss": 0.7533677220344543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009285083537779429, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3627057.0, + "repeat_count": 0.0, + "routers_loss": 0.0009684451506473124, + "skip_count": 0.0, + "step": 2250, + "text_loss": 0.2219279706478119 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.572938068682125, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009283487813459845, + "loss": 0.0148, + "macro_f1": 0.5492662787437439, + "num_tokens": 3629720.0, + "repeat_count": 0.0, + "routers_loss": 0.022757573053240776, + "skip_count": 2.0, + "step": 2252, + "text_loss": 0.6903313994407654 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009281890447699652, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 3633234.0, + "repeat_count": 1.0, + "routers_loss": 0.003613058477640152, + "skip_count": 0.0, + "step": 2254, + "text_loss": 0.6278893351554871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0009280291441110961, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3636289.0, + "repeat_count": 0.0, + "routers_loss": 0.006214062683284283, + "skip_count": 0.0, + "step": 2256, + "text_loss": 0.3011114001274109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.60111535074846, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.041015625, + "learning_rate": 0.0009278690794306517, + "loss": 0.014, + "macro_f1": 0.5492662787437439, + "num_tokens": 3640251.0, + "repeat_count": 0.0, + "routers_loss": 0.052556321024894714, + "skip_count": 2.0, + "step": 2258, + "text_loss": 0.19894185662269592 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 10.610507778103903, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.08251953125, + "learning_rate": 0.0009277088507899689, + "loss": 0.0163, + "macro_f1": 0.9452888369560242, + "num_tokens": 3643527.0, + "repeat_count": 4.0, + "routers_loss": 0.0572301521897316, + "skip_count": 1.0, + "step": 2260, + "text_loss": 0.5593410134315491 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009275484582504475, + "loss": 0.0104, + "macro_f1": 0.3333333432674408, + "num_tokens": 3646959.0, + "repeat_count": 0.0, + "routers_loss": 0.008010074496269226, + "skip_count": 0.0, + "step": 2262, + "text_loss": 0.2128177285194397 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 10.629292632814794, + "f1_execute": 0.95652174949646, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.800000011920929, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009273879018735505, + "loss": 0.0138, + "macro_f1": 0.8521739840507507, + "num_tokens": 3651298.0, + "repeat_count": 3.0, + "routers_loss": 0.035729870200157166, + "skip_count": 3.0, + "step": 2264, + "text_loss": 0.2987811267375946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009272271817208031, + "loss": 0.0182, + "macro_f1": 0.3333333432674408, + "num_tokens": 3655609.0, + "repeat_count": 0.0, + "routers_loss": 0.002379779238253832, + "skip_count": 0.0, + "step": 2266, + "text_loss": 0.6024088263511658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009270662978537939, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 3658444.0, + "repeat_count": 0.0, + "routers_loss": 0.008943650871515274, + "skip_count": 0.0, + "step": 2268, + "text_loss": 0.1741207242012024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 10.657469914881126, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009269052503341736, + "loss": 0.0161, + "macro_f1": 0.6595745086669922, + "num_tokens": 3662282.0, + "repeat_count": 1.0, + "routers_loss": 0.030201267451047897, + "skip_count": 4.0, + "step": 2270, + "text_loss": 0.7300035953521729 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0009267440392236562, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 3665531.0, + "repeat_count": 0.0, + "routers_loss": 0.0026635683607310057, + "skip_count": 0.0, + "step": 2272, + "text_loss": 0.31535038352012634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009265826645840178, + "loss": 0.0151, + "macro_f1": 0.3333333432674408, + "num_tokens": 3668407.0, + "repeat_count": 0.0, + "routers_loss": 0.004258926957845688, + "skip_count": 0.0, + "step": 2274, + "text_loss": 0.7272579073905945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 10.68564719694746, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.125, + "learning_rate": 0.0009264211264770976, + "loss": 0.0154, + "macro_f1": 0.6122449040412903, + "num_tokens": 3671503.0, + "repeat_count": 0.0, + "routers_loss": 0.038987524807453156, + "skip_count": 4.0, + "step": 2276, + "text_loss": 0.7488982677459717 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009262594249647975, + "loss": 0.0164, + "macro_f1": 0.6666666865348816, + "num_tokens": 3674107.0, + "repeat_count": 0.0, + "routers_loss": 0.007211760152131319, + "skip_count": 1.0, + "step": 2278, + "text_loss": 0.1992369294166565 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 10.704432051658351, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0009260975601090815, + "loss": 0.0112, + "macro_f1": 0.9446290731430054, + "num_tokens": 3677184.0, + "repeat_count": 4.0, + "routers_loss": 0.02538592554628849, + "skip_count": 3.0, + "step": 2280, + "text_loss": 0.46402135491371155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009259355319719768, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 3680683.0, + "repeat_count": 0.0, + "routers_loss": 0.0038464947137981653, + "skip_count": 0.0, + "step": 2282, + "text_loss": 0.5804527401924133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009257733406155726, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3683928.0, + "repeat_count": 0.0, + "routers_loss": 0.004841136280447245, + "skip_count": 0.0, + "step": 2284, + "text_loss": 0.4834538400173187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009256109861020212, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 3687101.0, + "repeat_count": 0.0, + "routers_loss": 0.002191900508478284, + "skip_count": 0.0, + "step": 2286, + "text_loss": 0.8199604749679565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.742001761080129, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0927734375, + "learning_rate": 0.000925448468493537, + "loss": 0.0162, + "macro_f1": 0.5427350401878357, + "num_tokens": 3690490.0, + "repeat_count": 1.0, + "routers_loss": 0.03488675877451897, + "skip_count": 2.0, + "step": 2288, + "text_loss": 0.33263635635375977 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009252857878523971, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 3694109.0, + "repeat_count": 1.0, + "routers_loss": 0.002897309372201562, + "skip_count": 0.0, + "step": 2290, + "text_loss": 0.47494807839393616 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.000925122944240941, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3697233.0, + "repeat_count": 0.0, + "routers_loss": 0.01842675730586052, + "skip_count": 2.0, + "step": 2292, + "text_loss": 0.14693495631217957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.770179043146463, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.045654296875, + "learning_rate": 0.0009249599377215707, + "loss": 0.0146, + "macro_f1": 0.5866667032241821, + "num_tokens": 3700376.0, + "repeat_count": 1.0, + "routers_loss": 0.04169808700680733, + "skip_count": 3.0, + "step": 2294, + "text_loss": 0.38051268458366394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.779571470501908, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0009247967683567507, + "loss": 0.0112, + "macro_f1": 0.3272727429866791, + "num_tokens": 3703212.0, + "repeat_count": 0.0, + "routers_loss": 0.012183113023638725, + "skip_count": 1.0, + "step": 2296, + "text_loss": 0.23789077997207642 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 10.788963897857352, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05712890625, + "learning_rate": 0.0009246334362090077, + "loss": 0.0137, + "macro_f1": 0.8823530077934265, + "num_tokens": 3706490.0, + "repeat_count": 1.0, + "routers_loss": 0.01880069635808468, + "skip_count": 2.0, + "step": 2298, + "text_loss": 0.29067978262901306 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.798356325212797, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.000924469941340931, + "loss": 0.0173, + "macro_f1": 0.3272727429866791, + "num_tokens": 3709804.0, + "repeat_count": 1.0, + "routers_loss": 0.027359159663319588, + "skip_count": 0.0, + "step": 2300, + "text_loss": 0.67828369140625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.000924306283815172, + "loss": 0.0153, + "macro_f1": 0.3333333432674408, + "num_tokens": 3712824.0, + "repeat_count": 0.0, + "routers_loss": 0.003152279881760478, + "skip_count": 0.0, + "step": 2302, + "text_loss": 0.8333184719085693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.817141179923686, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0703125, + "learning_rate": 0.0009241424636944445, + "loss": 0.0159, + "macro_f1": 0.5492662787437439, + "num_tokens": 3715385.0, + "repeat_count": 0.0, + "routers_loss": 0.0442950464785099, + "skip_count": 2.0, + "step": 2304, + "text_loss": 0.41893699765205383 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 10.826533607279131, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009239784810415249, + "loss": 0.0137, + "macro_f1": 0.8823530077934265, + "num_tokens": 3719080.0, + "repeat_count": 1.0, + "routers_loss": 0.015729321166872978, + "skip_count": 2.0, + "step": 2306, + "text_loss": 0.13360483944416046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.835926034634575, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009238143359192514, + "loss": 0.0136, + "macro_f1": 0.5934640765190125, + "num_tokens": 3722439.0, + "repeat_count": 0.0, + "routers_loss": 0.028816604986786842, + "skip_count": 3.0, + "step": 2308, + "text_loss": 0.39594101905822754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.000923650028390525, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3725092.0, + "repeat_count": 0.0, + "routers_loss": 0.0036455015651881695, + "skip_count": 2.0, + "step": 2310, + "text_loss": 0.6169708371162415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009234855585183086, + "loss": 0.014, + "macro_f1": 0.6666666865348816, + "num_tokens": 3728412.0, + "repeat_count": 0.0, + "routers_loss": 0.007565604057163, + "skip_count": 1.0, + "step": 2312, + "text_loss": 0.21257059276103973 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 10.86410331670091, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0517578125, + "learning_rate": 0.0009233209263656273, + "loss": 0.0184, + "macro_f1": 0.9262410998344421, + "num_tokens": 3731467.0, + "repeat_count": 2.0, + "routers_loss": 0.02510629966855049, + "skip_count": 3.0, + "step": 2314, + "text_loss": 0.21639840304851532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0009231561319955684, + "loss": 0.0154, + "macro_f1": 0.3333333432674408, + "num_tokens": 3734906.0, + "repeat_count": 0.0, + "routers_loss": 0.00872227642685175, + "skip_count": 0.0, + "step": 2316, + "text_loss": 0.35639774799346924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009229911754712815, + "loss": 0.0176, + "macro_f1": 0.3333333432674408, + "num_tokens": 3737943.0, + "repeat_count": 0.0, + "routers_loss": 0.004695790819823742, + "skip_count": 0.0, + "step": 2318, + "text_loss": 0.5269573330879211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.892280598767243, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0009228260568559781, + "loss": 0.0115, + "macro_f1": 0.3272727429866791, + "num_tokens": 3741833.0, + "repeat_count": 1.0, + "routers_loss": 0.0217357836663723, + "skip_count": 0.0, + "step": 2320, + "text_loss": 0.5110208988189697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.901673026122689, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.0009226607762129322, + "loss": 0.0201, + "macro_f1": 0.32098764181137085, + "num_tokens": 3744642.0, + "repeat_count": 1.0, + "routers_loss": 0.05595960095524788, + "skip_count": 1.0, + "step": 2322, + "text_loss": 0.6291998624801636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0009224953336054796, + "loss": 0.0161, + "macro_f1": 0.3333333432674408, + "num_tokens": 3748127.0, + "repeat_count": 0.0, + "routers_loss": 0.0071634589694440365, + "skip_count": 0.0, + "step": 2324, + "text_loss": 0.7404762506484985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.000922329729097018, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3751373.0, + "repeat_count": 0.0, + "routers_loss": 0.0011676300782710314, + "skip_count": 0.0, + "step": 2326, + "text_loss": 0.2915459871292114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009221639627510075, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3754518.0, + "repeat_count": 0.0, + "routers_loss": 0.01039792038500309, + "skip_count": 0.0, + "step": 2328, + "text_loss": 0.22066321969032288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009219980346309702, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 3757621.0, + "repeat_count": 0.0, + "routers_loss": 0.0032070958986878395, + "skip_count": 0.0, + "step": 2330, + "text_loss": 0.5558560490608215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.076171875, + "learning_rate": 0.0009218319448004899, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 3760885.0, + "repeat_count": 0.0, + "routers_loss": 0.007085457909852266, + "skip_count": 0.0, + "step": 2332, + "text_loss": 0.4348253607749939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009216656933232129, + "loss": 0.016, + "macro_f1": 0.6666666865348816, + "num_tokens": 3764462.0, + "repeat_count": 0.0, + "routers_loss": 0.005504854489117861, + "skip_count": 1.0, + "step": 2334, + "text_loss": 0.35828644037246704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0009214992802628463, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3767159.0, + "repeat_count": 0.0, + "routers_loss": 0.0013970810687169433, + "skip_count": 0.0, + "step": 2336, + "text_loss": 0.2956557869911194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009213327056831607, + "loss": 0.0181, + "macro_f1": 0.3272727429866791, + "num_tokens": 3770408.0, + "repeat_count": 0.0, + "routers_loss": 0.0427570566534996, + "skip_count": 1.0, + "step": 2338, + "text_loss": 0.14883014559745789 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.986204872321691, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0009211659696479875, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 3773474.0, + "repeat_count": 0.0, + "routers_loss": 0.0011273405980318785, + "skip_count": 0.0, + "step": 2340, + "text_loss": 0.26011669635772705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.00092099907222122, + "loss": 0.0148, + "macro_f1": 0.3333333432674408, + "num_tokens": 3776909.0, + "repeat_count": 0.0, + "routers_loss": 0.0016178421210497618, + "skip_count": 0.0, + "step": 2342, + "text_loss": 0.49078530073165894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.000920832013466814, + "loss": 0.0129, + "macro_f1": 0.3333333432674408, + "num_tokens": 3780741.0, + "repeat_count": 0.0, + "routers_loss": 0.005510095041245222, + "skip_count": 0.0, + "step": 2344, + "text_loss": 0.4870249927043915 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0009206647934487866, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3784673.0, + "repeat_count": 1.0, + "routers_loss": 0.0047357892617583275, + "skip_count": 0.0, + "step": 2346, + "text_loss": 0.3251725733280182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0009204974122312167, + "loss": 0.0142, + "macro_f1": 0.6666666865348816, + "num_tokens": 3787503.0, + "repeat_count": 0.0, + "routers_loss": 0.00795028731226921, + "skip_count": 1.0, + "step": 2348, + "text_loss": 0.18282145261764526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0009203298698782452, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 3790528.0, + "repeat_count": 1.0, + "routers_loss": 0.0009506374481134117, + "skip_count": 0.0, + "step": 2350, + "text_loss": 0.4093080461025238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009201621664540747, + "loss": 0.0155, + "macro_f1": 0.6666666865348816, + "num_tokens": 3794134.0, + "repeat_count": 1.0, + "routers_loss": 0.005159572698175907, + "skip_count": 0.0, + "step": 2352, + "text_loss": 0.5451981425285339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009199943020229694, + "loss": 0.0148, + "macro_f1": 0.3333333432674408, + "num_tokens": 3797414.0, + "repeat_count": 0.0, + "routers_loss": 0.002356168581172824, + "skip_count": 0.0, + "step": 2354, + "text_loss": 0.3070453405380249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009198262766492554, + "loss": 0.0141, + "macro_f1": 0.6666666865348816, + "num_tokens": 3800094.0, + "repeat_count": 0.0, + "routers_loss": 0.0051761893555521965, + "skip_count": 1.0, + "step": 2356, + "text_loss": 0.5880904197692871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.00091965809039732, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3803280.0, + "repeat_count": 0.0, + "routers_loss": 0.0025952060241252184, + "skip_count": 0.0, + "step": 2358, + "text_loss": 0.5210731625556946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009194897433316127, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 3805866.0, + "repeat_count": 0.0, + "routers_loss": 0.0042560105212032795, + "skip_count": 2.0, + "step": 2360, + "text_loss": 0.6472984552383423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009193212355166446, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3808952.0, + "repeat_count": 0.0, + "routers_loss": 0.0026232977397739887, + "skip_count": 0.0, + "step": 2362, + "text_loss": 0.450063556432724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009191525670169881, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3812080.0, + "repeat_count": 0.0, + "routers_loss": 0.0034355956595391035, + "skip_count": 0.0, + "step": 2364, + "text_loss": 0.49727216362953186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.000918983737897277, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 3815282.0, + "repeat_count": 0.0, + "routers_loss": 0.0055653867311775684, + "skip_count": 1.0, + "step": 2366, + "text_loss": 0.6336377859115601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0009188147482222071, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 3818106.0, + "repeat_count": 2.0, + "routers_loss": 0.011016021482646465, + "skip_count": 2.0, + "step": 2368, + "text_loss": 0.22513329982757568 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009186455980565358, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 3821228.0, + "repeat_count": 1.0, + "routers_loss": 0.014039464294910431, + "skip_count": 0.0, + "step": 2370, + "text_loss": 0.21331638097763062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009184762874650816, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 3825048.0, + "repeat_count": 0.0, + "routers_loss": 0.001088051125407219, + "skip_count": 0.0, + "step": 2372, + "text_loss": 0.6031543612480164 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009183068165127245, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 3828781.0, + "repeat_count": 0.0, + "routers_loss": 0.006263940595090389, + "skip_count": 1.0, + "step": 2374, + "text_loss": 0.6249601244926453 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009181371852644062, + "loss": 0.0133, + "macro_f1": 0.6666666865348816, + "num_tokens": 3832507.0, + "repeat_count": 1.0, + "routers_loss": 0.001987969037145376, + "skip_count": 0.0, + "step": 2376, + "text_loss": 0.37972065806388855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009179673937851299, + "loss": 0.0158, + "macro_f1": 0.6666666865348816, + "num_tokens": 3835644.0, + "repeat_count": 0.0, + "routers_loss": 0.007635094691067934, + "skip_count": 1.0, + "step": 2378, + "text_loss": 0.46319663524627686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009177974421399598, + "loss": 0.0137, + "macro_f1": 0.6666666865348816, + "num_tokens": 3838700.0, + "repeat_count": 0.0, + "routers_loss": 0.01617279462516308, + "skip_count": 2.0, + "step": 2380, + "text_loss": 0.32141056656837463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009176273303940217, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 3841953.0, + "repeat_count": 0.0, + "routers_loss": 0.0022273799404501915, + "skip_count": 2.0, + "step": 2382, + "text_loss": 0.5908139944076538 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.192544760786616, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009174570586125026, + "loss": 0.0122, + "macro_f1": 0.32098767161369324, + "num_tokens": 3845763.0, + "repeat_count": 1.0, + "routers_loss": 0.030915161594748497, + "skip_count": 0.0, + "step": 2384, + "text_loss": 0.41400137543678284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0009172866268606513, + "loss": 0.0122, + "macro_f1": 0.6666666865348816, + "num_tokens": 3848984.0, + "repeat_count": 0.0, + "routers_loss": 0.010480951517820358, + "skip_count": 2.0, + "step": 2386, + "text_loss": 0.2560874819755554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009171160352037775, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3852118.0, + "repeat_count": 0.0, + "routers_loss": 0.00809961836785078, + "skip_count": 1.0, + "step": 2388, + "text_loss": 0.28236693143844604 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009169452837072521, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 3855314.0, + "repeat_count": 1.0, + "routers_loss": 0.005569872446358204, + "skip_count": 1.0, + "step": 2390, + "text_loss": 0.4578137695789337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009167743724365073, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 3858301.0, + "repeat_count": 0.0, + "routers_loss": 0.0038610948249697685, + "skip_count": 1.0, + "step": 2392, + "text_loss": 0.14082716405391693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009166033014570368, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3861296.0, + "repeat_count": 0.0, + "routers_loss": 0.0017607157351449132, + "skip_count": 0.0, + "step": 2394, + "text_loss": 0.384442001581192 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 11.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009164320708343954, + "loss": 0.0131, + "macro_f1": 0.6666666865348816, + "num_tokens": 3863985.0, + "repeat_count": 2.0, + "routers_loss": 0.009627950377762318, + "skip_count": 0.0, + "step": 2396, + "text_loss": 0.6969521045684814 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009162606806341989, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 3866636.0, + "repeat_count": 0.0, + "routers_loss": 0.006915586534887552, + "skip_count": 0.0, + "step": 2398, + "text_loss": 0.48069697618484497 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0009160891309221242, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 3870867.0, + "repeat_count": 1.0, + "routers_loss": 0.0013031222624704242, + "skip_count": 0.0, + "step": 2400, + "text_loss": 0.3882075846195221 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.277076606985618, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009159174217639096, + "loss": 0.0112, + "macro_f1": 0.5427350401878357, + "num_tokens": 3873663.0, + "repeat_count": 2.0, + "routers_loss": 0.06621067970991135, + "skip_count": 1.0, + "step": 2402, + "text_loss": 0.5740041136741638 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0009157455532253547, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 3876788.0, + "repeat_count": 1.0, + "routers_loss": 0.005957918707281351, + "skip_count": 0.0, + "step": 2404, + "text_loss": 0.26025933027267456 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 11.295861461696507, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009155735253723191, + "loss": 0.0126, + "macro_f1": 0.9452888369560242, + "num_tokens": 3879942.0, + "repeat_count": 1.0, + "routers_loss": 0.039429809898138046, + "skip_count": 4.0, + "step": 2406, + "text_loss": 1.1349908113479614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009154013382707251, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 3882682.0, + "repeat_count": 0.0, + "routers_loss": 0.0012570557883009315, + "skip_count": 0.0, + "step": 2408, + "text_loss": 0.5611135363578796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0009152289919865543, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3886425.0, + "repeat_count": 0.0, + "routers_loss": 0.0017455556662753224, + "skip_count": 0.0, + "step": 2410, + "text_loss": 0.7523751854896545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0009150564865858506, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3889273.0, + "repeat_count": 0.0, + "routers_loss": 0.011178011074662209, + "skip_count": 1.0, + "step": 2412, + "text_loss": 0.26942551136016846 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 11.333431171118287, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009148838221347182, + "loss": 0.0107, + "macro_f1": 0.5934640765190125, + "num_tokens": 3892199.0, + "repeat_count": 3.0, + "routers_loss": 0.019628092646598816, + "skip_count": 0.0, + "step": 2414, + "text_loss": 0.5492315888404846 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0009147109986993225, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 3895362.0, + "repeat_count": 1.0, + "routers_loss": 0.012255983427166939, + "skip_count": 0.0, + "step": 2416, + "text_loss": 0.23798216879367828 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009145380163458899, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 3898476.0, + "repeat_count": 0.0, + "routers_loss": 0.007018954027444124, + "skip_count": 0.0, + "step": 2418, + "text_loss": 0.1923145055770874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0009143648751407074, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 3901817.0, + "repeat_count": 0.0, + "routers_loss": 0.0008574824314564466, + "skip_count": 0.0, + "step": 2420, + "text_loss": 0.4001806974411011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.371000880540064, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11328125, + "learning_rate": 0.0009141915751501231, + "loss": 0.0102, + "macro_f1": 0.5492662787437439, + "num_tokens": 3905461.0, + "repeat_count": 0.0, + "routers_loss": 0.01572350226342678, + "skip_count": 2.0, + "step": 2422, + "text_loss": 0.19519129395484924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0009140181164405458, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3908878.0, + "repeat_count": 0.0, + "routers_loss": 0.0005503420252352953, + "skip_count": 0.0, + "step": 2424, + "text_loss": 0.6937088370323181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009138444990784454, + "loss": 0.013, + "macro_f1": 0.3333333432674408, + "num_tokens": 3912053.0, + "repeat_count": 0.0, + "routers_loss": 0.007556677330285311, + "skip_count": 0.0, + "step": 2426, + "text_loss": 0.35431069135665894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.000913670723130352, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 3915192.0, + "repeat_count": 0.0, + "routers_loss": 0.0013609991874545813, + "skip_count": 0.0, + "step": 2428, + "text_loss": 0.5171207189559937 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009134967886628573, + "loss": 0.0115, + "macro_f1": 1.0, + "num_tokens": 3917927.0, + "repeat_count": 2.0, + "routers_loss": 0.010895746760070324, + "skip_count": 2.0, + "step": 2430, + "text_loss": 0.2852934002876282 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.417963017317287, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009133226957426133, + "loss": 0.0132, + "macro_f1": 0.5492662787437439, + "num_tokens": 3921460.0, + "repeat_count": 2.0, + "routers_loss": 0.04196908697485924, + "skip_count": 0.0, + "step": 2432, + "text_loss": 0.4864770770072937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009131484444363324, + "loss": 0.0155, + "macro_f1": 0.3333333432674408, + "num_tokens": 3924662.0, + "repeat_count": 0.0, + "routers_loss": 0.004484197124838829, + "skip_count": 0.0, + "step": 2434, + "text_loss": 0.7568684220314026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0009129740348107882, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3927337.0, + "repeat_count": 0.0, + "routers_loss": 0.004351360257714987, + "skip_count": 2.0, + "step": 2436, + "text_loss": 0.5953161716461182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 11.446140299383622, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.04736328125, + "learning_rate": 0.0009127994669328151, + "loss": 0.0085, + "macro_f1": 0.6122449040412903, + "num_tokens": 3930407.0, + "repeat_count": 0.0, + "routers_loss": 0.01664198748767376, + "skip_count": 4.0, + "step": 2438, + "text_loss": 0.5320524573326111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009126247408693071, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 3933184.0, + "repeat_count": 0.0, + "routers_loss": 0.0017819046042859554, + "skip_count": 1.0, + "step": 2440, + "text_loss": 0.6051273345947266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009124498566872204, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 3936620.0, + "repeat_count": 0.0, + "routers_loss": 0.005519696045666933, + "skip_count": 0.0, + "step": 2442, + "text_loss": 0.12987950444221497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.474317581449956, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009122748144535704, + "loss": 0.0111, + "macro_f1": 0.32098764181137085, + "num_tokens": 3940010.0, + "repeat_count": 0.0, + "routers_loss": 0.04543351009488106, + "skip_count": 2.0, + "step": 2444, + "text_loss": 0.4642033576965332 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009120996142354338, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3943135.0, + "repeat_count": 0.0, + "routers_loss": 0.00550565542653203, + "skip_count": 0.0, + "step": 2446, + "text_loss": 0.5697627067565918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009119242560999477, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3946650.0, + "repeat_count": 0.0, + "routers_loss": 0.008842485956847668, + "skip_count": 0.0, + "step": 2448, + "text_loss": 0.17046524584293365 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0009117487401143095, + "loss": 0.0154, + "macro_f1": 0.6666666865348816, + "num_tokens": 3949470.0, + "repeat_count": 1.0, + "routers_loss": 0.005900127813220024, + "skip_count": 0.0, + "step": 2450, + "text_loss": 0.37260866165161133 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0009115730663457773, + "loss": 0.0137, + "macro_f1": 1.0, + "num_tokens": 3952546.0, + "repeat_count": 1.0, + "routers_loss": 0.003409258322790265, + "skip_count": 1.0, + "step": 2452, + "text_loss": 0.5308008193969727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009113972348616698, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 3955817.0, + "repeat_count": 0.0, + "routers_loss": 0.010098597034811974, + "skip_count": 1.0, + "step": 2454, + "text_loss": 0.39226648211479187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 11.530672145582624, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009112212457293658, + "loss": 0.0102, + "macro_f1": 0.3272727429866791, + "num_tokens": 3958911.0, + "repeat_count": 0.0, + "routers_loss": 0.08184818178415298, + "skip_count": 0.0, + "step": 2456, + "text_loss": 0.45411455631256104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0009110450990163047, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 3962584.0, + "repeat_count": 0.0, + "routers_loss": 0.0009352223132736981, + "skip_count": 0.0, + "step": 2458, + "text_loss": 0.47292324900627136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0009108687947899863, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 3965597.0, + "repeat_count": 1.0, + "routers_loss": 0.008150188252329826, + "skip_count": 2.0, + "step": 2460, + "text_loss": 0.33208340406417847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.558849427648958, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009106923331179707, + "loss": 0.0125, + "macro_f1": 0.5492662787437439, + "num_tokens": 3968664.0, + "repeat_count": 0.0, + "routers_loss": 0.050999004393815994, + "skip_count": 2.0, + "step": 2462, + "text_loss": 0.2459995150566101 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009105157140678782, + "loss": 0.0126, + "macro_f1": 0.6666666865348816, + "num_tokens": 3971772.0, + "repeat_count": 0.0, + "routers_loss": 0.006196586415171623, + "skip_count": 1.0, + "step": 2464, + "text_loss": 0.23956991732120514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.577634282359847, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009103389377073896, + "loss": 0.01, + "macro_f1": 0.3333333432674408, + "num_tokens": 3976224.0, + "repeat_count": 0.0, + "routers_loss": 0.008181816898286343, + "skip_count": 0.0, + "step": 2466, + "text_loss": 0.3235875070095062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.0009101620041042462, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3978876.0, + "repeat_count": 0.0, + "routers_loss": 0.0015451472718268633, + "skip_count": 0.0, + "step": 2468, + "text_loss": 0.4038759469985962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.596419137070736, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09130859375, + "learning_rate": 0.000909984913326249, + "loss": 0.0131, + "macro_f1": 0.3272727429866791, + "num_tokens": 3981992.0, + "repeat_count": 0.0, + "routers_loss": 0.021785033866763115, + "skip_count": 1.0, + "step": 2470, + "text_loss": 0.6346460580825806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009098076654412595, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 3984560.0, + "repeat_count": 0.0, + "routers_loss": 0.0011462471447885036, + "skip_count": 0.0, + "step": 2472, + "text_loss": 0.3449646532535553 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009096302605171996, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 3987548.0, + "repeat_count": 0.0, + "routers_loss": 0.0014367027906700969, + "skip_count": 0.0, + "step": 2474, + "text_loss": 0.5918350219726562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0009094526986220513, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 3990727.0, + "repeat_count": 0.0, + "routers_loss": 0.0008977655088528991, + "skip_count": 0.0, + "step": 2476, + "text_loss": 0.463350385427475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.633988846492516, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0009092749798238563, + "loss": 0.015, + "macro_f1": 0.3272727429866791, + "num_tokens": 3993757.0, + "repeat_count": 1.0, + "routers_loss": 0.016712551936507225, + "skip_count": 0.0, + "step": 2478, + "text_loss": 0.5621229410171509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.643381273847961, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.000909097104190717, + "loss": 0.0172, + "macro_f1": 0.32098764181137085, + "num_tokens": 3997259.0, + "repeat_count": 0.0, + "routers_loss": 0.04134179651737213, + "skip_count": 2.0, + "step": 2480, + "text_loss": 0.375476598739624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0009089190717907956, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4000563.0, + "repeat_count": 0.0, + "routers_loss": 0.003462378401309252, + "skip_count": 0.0, + "step": 2482, + "text_loss": 0.5553798675537109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009087408826923146, + "loss": 0.0182, + "macro_f1": 0.6666666865348816, + "num_tokens": 4004065.0, + "repeat_count": 0.0, + "routers_loss": 0.008057428523898125, + "skip_count": 2.0, + "step": 2484, + "text_loss": 0.4329465329647064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009085625369635564, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4007119.0, + "repeat_count": 0.0, + "routers_loss": 0.005759050603955984, + "skip_count": 0.0, + "step": 2486, + "text_loss": 0.501268744468689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.680950983269739, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009083840346728631, + "loss": 0.0122, + "macro_f1": 0.3272727429866791, + "num_tokens": 4010547.0, + "repeat_count": 1.0, + "routers_loss": 0.020763102918863297, + "skip_count": 0.0, + "step": 2488, + "text_loss": 0.480196475982666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0009082053758886374, + "loss": 0.0117, + "macro_f1": 0.6666666865348816, + "num_tokens": 4014600.0, + "repeat_count": 0.0, + "routers_loss": 0.005801836494356394, + "skip_count": 1.0, + "step": 2490, + "text_loss": 0.18249782919883728 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009080265606793416, + "loss": 0.0128, + "macro_f1": 1.0, + "num_tokens": 4017964.0, + "repeat_count": 1.0, + "routers_loss": 0.004226063843816519, + "skip_count": 1.0, + "step": 2492, + "text_loss": 0.6573076248168945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.000907847589113498, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 4020694.0, + "repeat_count": 0.0, + "routers_loss": 0.004281101748347282, + "skip_count": 2.0, + "step": 2494, + "text_loss": 0.3944586217403412 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.000907668461259689, + "loss": 0.0152, + "macro_f1": 0.6666666865348816, + "num_tokens": 4023757.0, + "repeat_count": 0.0, + "routers_loss": 0.008786370046436787, + "skip_count": 1.0, + "step": 2496, + "text_loss": 0.6452898979187012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009074891771865566, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 4026601.0, + "repeat_count": 0.0, + "routers_loss": 0.005209595896303654, + "skip_count": 0.0, + "step": 2498, + "text_loss": 0.9633619785308838 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 11.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0009073097369628028, + "loss": 0.013, + "macro_f1": 1.0, + "num_tokens": 4030321.0, + "repeat_count": 3.0, + "routers_loss": 0.00860709697008133, + "skip_count": 1.0, + "step": 2500, + "text_loss": 0.48566827178001404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009071301406571893, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 4033234.0, + "repeat_count": 0.0, + "routers_loss": 0.0035277456045150757, + "skip_count": 0.0, + "step": 2502, + "text_loss": 0.3771554231643677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000906950388338538, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 4036417.0, + "repeat_count": 0.0, + "routers_loss": 0.0013424850767478347, + "skip_count": 0.0, + "step": 2504, + "text_loss": 0.8962806463241577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009067704800757301, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4039564.0, + "repeat_count": 0.0, + "routers_loss": 0.0010423909407109022, + "skip_count": 0.0, + "step": 2506, + "text_loss": 0.43170279264450073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.774875256824185, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.000906590415937707, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 4043212.0, + "repeat_count": 0.0, + "routers_loss": 0.021780289709568024, + "skip_count": 1.0, + "step": 2508, + "text_loss": 0.41495826840400696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0009064101959934696, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4046687.0, + "repeat_count": 0.0, + "routers_loss": 0.007261929102241993, + "skip_count": 1.0, + "step": 2510, + "text_loss": 0.21821187436580658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0009062298203120783, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 4050735.0, + "repeat_count": 0.0, + "routers_loss": 0.007447180338203907, + "skip_count": 2.0, + "step": 2512, + "text_loss": 0.1818767935037613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.803052538890519, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0009060492889626535, + "loss": 0.0142, + "macro_f1": 0.3272727429866791, + "num_tokens": 4054426.0, + "repeat_count": 1.0, + "routers_loss": 0.0718490406870842, + "skip_count": 0.0, + "step": 2514, + "text_loss": 0.22798970341682434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009058686020143753, + "loss": 0.0183, + "macro_f1": 0.3333333432674408, + "num_tokens": 4057615.0, + "repeat_count": 0.0, + "routers_loss": 0.0052676633931696415, + "skip_count": 0.0, + "step": 2516, + "text_loss": 0.1712338626384735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0009056877595364832, + "loss": 0.0137, + "macro_f1": 0.3333333432674408, + "num_tokens": 4060338.0, + "repeat_count": 0.0, + "routers_loss": 0.0018052728846669197, + "skip_count": 0.0, + "step": 2518, + "text_loss": 0.6811438798904419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009055067615982761, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4062887.0, + "repeat_count": 0.0, + "routers_loss": 0.0009029926732182503, + "skip_count": 0.0, + "step": 2520, + "text_loss": 0.5480356812477112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009053256082691133, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 4065357.0, + "repeat_count": 0.0, + "routers_loss": 0.0027515271212905645, + "skip_count": 0.0, + "step": 2522, + "text_loss": 0.5234101414680481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009051442996184127, + "loss": 0.0174, + "macro_f1": 0.3333333432674408, + "num_tokens": 4068111.0, + "repeat_count": 0.0, + "routers_loss": 0.002199822571128607, + "skip_count": 0.0, + "step": 2524, + "text_loss": 0.2418575882911682 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009049628357156521, + "loss": 0.0143, + "macro_f1": 0.6666666865348816, + "num_tokens": 4071284.0, + "repeat_count": 0.0, + "routers_loss": 0.006303096655756235, + "skip_count": 2.0, + "step": 2526, + "text_loss": 0.7948065996170044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.868799530378633, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000904781216630369, + "loss": 0.0068, + "macro_f1": 0.6601307392120361, + "num_tokens": 4074750.0, + "repeat_count": 1.0, + "routers_loss": 0.01791904680430889, + "skip_count": 2.0, + "step": 2528, + "text_loss": 0.809726357460022 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009045994424321602, + "loss": 0.0102, + "macro_f1": 1.0, + "num_tokens": 4078617.0, + "repeat_count": 2.0, + "routers_loss": 0.016553178429603577, + "skip_count": 2.0, + "step": 2530, + "text_loss": 0.8755000829696655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0009044175131906817, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 4080936.0, + "repeat_count": 0.0, + "routers_loss": 0.00884837657213211, + "skip_count": 0.0, + "step": 2532, + "text_loss": 0.795871913433075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.896976812444967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009042354289756491, + "loss": 0.0122, + "macro_f1": 0.3333333432674408, + "num_tokens": 4084459.0, + "repeat_count": 0.0, + "routers_loss": 0.0024387789890170097, + "skip_count": 0.0, + "step": 2534, + "text_loss": 0.18875400722026825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009040531898568379, + "loss": 0.0171, + "macro_f1": 0.3333333432674408, + "num_tokens": 4088464.0, + "repeat_count": 0.0, + "routers_loss": 0.00491489190608263, + "skip_count": 0.0, + "step": 2536, + "text_loss": 0.334369033575058 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.091796875, + "learning_rate": 0.000903870795904082, + "loss": 0.0145, + "macro_f1": 0.6666666865348816, + "num_tokens": 4091659.0, + "repeat_count": 0.0, + "routers_loss": 0.004592662677168846, + "skip_count": 2.0, + "step": 2538, + "text_loss": 0.21298295259475708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.925154094511301, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0458984375, + "learning_rate": 0.000903688247187275, + "loss": 0.0137, + "macro_f1": 0.5492662787437439, + "num_tokens": 4095496.0, + "repeat_count": 0.0, + "routers_loss": 0.011647242121398449, + "skip_count": 2.0, + "step": 2540, + "text_loss": 0.2985081672668457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009035055437763704, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4098663.0, + "repeat_count": 0.0, + "routers_loss": 0.0021238960325717926, + "skip_count": 0.0, + "step": 2542, + "text_loss": 0.35359489917755127 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0009033226857413803, + "loss": 0.0163, + "macro_f1": 0.6666666865348816, + "num_tokens": 4101588.0, + "repeat_count": 1.0, + "routers_loss": 0.0024701557122170925, + "skip_count": 0.0, + "step": 2544, + "text_loss": 1.1577601432800293 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.000903139673152376, + "loss": 0.012, + "macro_f1": 0.3333333432674408, + "num_tokens": 4104643.0, + "repeat_count": 0.0, + "routers_loss": 0.002499542199075222, + "skip_count": 0.0, + "step": 2546, + "text_loss": 1.0173401832580566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0009029565060794885, + "loss": 0.0165, + "macro_f1": 0.3333333432674408, + "num_tokens": 4109247.0, + "repeat_count": 0.0, + "routers_loss": 0.0034200598020106554, + "skip_count": 0.0, + "step": 2548, + "text_loss": 0.5690504312515259 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.972116231288524, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009027731845929079, + "loss": 0.0155, + "macro_f1": 0.8823530077934265, + "num_tokens": 4112597.0, + "repeat_count": 1.0, + "routers_loss": 0.015981333330273628, + "skip_count": 1.0, + "step": 2550, + "text_loss": 0.294549822807312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.981508658643968, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06103515625, + "learning_rate": 0.0009025897087628829, + "loss": 0.0064, + "macro_f1": 0.5492662787437439, + "num_tokens": 4115844.0, + "repeat_count": 0.0, + "routers_loss": 0.02606951631605625, + "skip_count": 2.0, + "step": 2552, + "text_loss": 0.22692419588565826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009024060786597222, + "loss": 0.0202, + "macro_f1": 0.3333333432674408, + "num_tokens": 4118634.0, + "repeat_count": 0.0, + "routers_loss": 0.001026194542646408, + "skip_count": 0.0, + "step": 2554, + "text_loss": 0.6807059645652771 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.000902222294353793, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4122024.0, + "repeat_count": 0.0, + "routers_loss": 0.001974924933165312, + "skip_count": 0.0, + "step": 2556, + "text_loss": 0.7373668551445007 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009020383559155219, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 4124803.0, + "repeat_count": 1.0, + "routers_loss": 0.004662613850086927, + "skip_count": 2.0, + "step": 2558, + "text_loss": 0.21808166801929474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0009018542634153943, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 4127680.0, + "repeat_count": 0.0, + "routers_loss": 0.006881687790155411, + "skip_count": 0.0, + "step": 2560, + "text_loss": 0.25192978978157043 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009016700169239551, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 4130431.0, + "repeat_count": 1.0, + "routers_loss": 0.005977808032184839, + "skip_count": 1.0, + "step": 2562, + "text_loss": 0.4700816869735718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009014856165118075, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 4133535.0, + "repeat_count": 0.0, + "routers_loss": 0.007005698047578335, + "skip_count": 1.0, + "step": 2564, + "text_loss": 0.6558199524879456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0009013010622496144, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 4136534.0, + "repeat_count": 0.0, + "routers_loss": 0.007262171246111393, + "skip_count": 0.0, + "step": 2566, + "text_loss": 0.2565421462059021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 12.056354564132668, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009011163542080971, + "loss": 0.0088, + "macro_f1": 0.5934640765190125, + "num_tokens": 4139762.0, + "repeat_count": 0.0, + "routers_loss": 0.05431923270225525, + "skip_count": 3.0, + "step": 2568, + "text_loss": 0.19896510243415833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0009009314924580363, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4143398.0, + "repeat_count": 0.0, + "routers_loss": 0.003667369019240141, + "skip_count": 0.0, + "step": 2570, + "text_loss": 0.6581419110298157 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0009007464770702712, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4146248.0, + "repeat_count": 0.0, + "routers_loss": 0.00132099783513695, + "skip_count": 0.0, + "step": 2572, + "text_loss": 0.5316711068153381 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0009005613081157002, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 4149455.0, + "repeat_count": 0.0, + "routers_loss": 0.0020061524119228125, + "skip_count": 0.0, + "step": 2574, + "text_loss": 0.5400773882865906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0009003759856652802, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 4152774.0, + "repeat_count": 0.0, + "routers_loss": 0.002621434163302183, + "skip_count": 1.0, + "step": 2576, + "text_loss": 0.3672606945037842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009001905097900273, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 4155835.0, + "repeat_count": 0.0, + "routers_loss": 0.005290219560265541, + "skip_count": 0.0, + "step": 2578, + "text_loss": 0.8159038424491882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0009000048805610161, + "loss": 0.0119, + "macro_f1": 0.3333333432674408, + "num_tokens": 4158874.0, + "repeat_count": 0.0, + "routers_loss": 0.0013576085912063718, + "skip_count": 0.0, + "step": 2580, + "text_loss": 0.5518951416015625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.00089981909804938, + "loss": 0.0143, + "macro_f1": 0.3333333432674408, + "num_tokens": 4162076.0, + "repeat_count": 0.0, + "routers_loss": 0.0021483441814780235, + "skip_count": 0.0, + "step": 2582, + "text_loss": 0.43552228808403015 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 12.131493982976226, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.068359375, + "learning_rate": 0.0008996331623263114, + "loss": 0.0117, + "macro_f1": 0.7795917987823486, + "num_tokens": 4165041.0, + "repeat_count": 1.0, + "routers_loss": 0.0544300302863121, + "skip_count": 4.0, + "step": 2584, + "text_loss": 0.24812501668930054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0008994470734630611, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 4168290.0, + "repeat_count": 0.0, + "routers_loss": 0.0017150711501017213, + "skip_count": 0.0, + "step": 2586, + "text_loss": 0.6392097473144531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0008992608315309388, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4171310.0, + "repeat_count": 0.0, + "routers_loss": 0.0046473173424601555, + "skip_count": 2.0, + "step": 2588, + "text_loss": 0.6534156799316406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.15967126504256, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0008990744366013125, + "loss": 0.0105, + "macro_f1": 0.3144654333591461, + "num_tokens": 4174042.0, + "repeat_count": 2.0, + "routers_loss": 0.060913100838661194, + "skip_count": 1.0, + "step": 2590, + "text_loss": 0.5365690588951111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 12.169063692398003, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008988878887456093, + "loss": 0.0118, + "macro_f1": 0.6051587462425232, + "num_tokens": 4177666.0, + "repeat_count": 1.0, + "routers_loss": 0.06268956512212753, + "skip_count": 4.0, + "step": 2592, + "text_loss": 0.226226806640625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.178456119753449, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008987011880353149, + "loss": 0.0089, + "macro_f1": 0.32098764181137085, + "num_tokens": 4180490.0, + "repeat_count": 0.0, + "routers_loss": 0.030141465365886688, + "skip_count": 2.0, + "step": 2594, + "text_loss": 0.2581401765346527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.187848547108894, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008985143345419729, + "loss": 0.0082, + "macro_f1": 0.5492662787437439, + "num_tokens": 4183300.0, + "repeat_count": 0.0, + "routers_loss": 0.018745863810181618, + "skip_count": 2.0, + "step": 2596, + "text_loss": 0.7778542637825012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.064453125, + "learning_rate": 0.0008983273283371862, + "loss": 0.0096, + "macro_f1": 0.5492662787437439, + "num_tokens": 4186535.0, + "repeat_count": 0.0, + "routers_loss": 0.026792079210281372, + "skip_count": 2.0, + "step": 2598, + "text_loss": 0.34700271487236023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008981401694926159, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4189082.0, + "repeat_count": 0.0, + "routers_loss": 0.001914160675369203, + "skip_count": 0.0, + "step": 2600, + "text_loss": 0.6879339218139648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0008979528580799815, + "loss": 0.0136, + "macro_f1": 0.6666666865348816, + "num_tokens": 4192330.0, + "repeat_count": 0.0, + "routers_loss": 0.007978348061442375, + "skip_count": 2.0, + "step": 2602, + "text_loss": 0.3524550497531891 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 12.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008977653941710613, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 4196117.0, + "repeat_count": 2.0, + "routers_loss": 0.0035376469604671, + "skip_count": 0.0, + "step": 2604, + "text_loss": 0.42356348037719727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.0008975777778376916, + "loss": 0.0156, + "macro_f1": 0.6666666865348816, + "num_tokens": 4200423.0, + "repeat_count": 0.0, + "routers_loss": 0.008262477815151215, + "skip_count": 1.0, + "step": 2606, + "text_loss": 0.5272893905639648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.244203111241562, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0008973900091517675, + "loss": 0.0114, + "macro_f1": 0.3272727429866791, + "num_tokens": 4203257.0, + "repeat_count": 0.0, + "routers_loss": 0.022957922890782356, + "skip_count": 1.0, + "step": 2608, + "text_loss": 0.2713734805583954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.253595538597006, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.000897202088185242, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 4206243.0, + "repeat_count": 0.0, + "routers_loss": 0.006623407825827599, + "skip_count": 2.0, + "step": 2610, + "text_loss": 0.5920525789260864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008970140150101274, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4209264.0, + "repeat_count": 0.0, + "routers_loss": 0.0008602747693657875, + "skip_count": 0.0, + "step": 2612, + "text_loss": 0.33421996235847473 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0008968257896984932, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 4212058.0, + "repeat_count": 0.0, + "routers_loss": 0.0024653903674334288, + "skip_count": 1.0, + "step": 2614, + "text_loss": 0.37923356890678406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008966374123224677, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4214929.0, + "repeat_count": 0.0, + "routers_loss": 0.010878405533730984, + "skip_count": 0.0, + "step": 2616, + "text_loss": 0.4350503981113434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.291165248018785, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0008964488829542376, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 4219170.0, + "repeat_count": 0.0, + "routers_loss": 0.02864212542772293, + "skip_count": 1.0, + "step": 2618, + "text_loss": 0.26250728964805603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008962602016660478, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4222077.0, + "repeat_count": 0.0, + "routers_loss": 0.010444172658026218, + "skip_count": 2.0, + "step": 2620, + "text_loss": 0.4718937575817108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008960713685302011, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4225383.0, + "repeat_count": 0.0, + "routers_loss": 0.006409442983567715, + "skip_count": 1.0, + "step": 2622, + "text_loss": 0.30420538783073425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.31934253008512, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0008958823836190588, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 4228349.0, + "repeat_count": 0.0, + "routers_loss": 0.009996986016631126, + "skip_count": 1.0, + "step": 2624, + "text_loss": 0.5392362475395203 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0008956932470050404, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 4232007.0, + "repeat_count": 0.0, + "routers_loss": 0.0014383369125425816, + "skip_count": 0.0, + "step": 2626, + "text_loss": 0.7112401127815247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0008955039587606233, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 4235122.0, + "repeat_count": 0.0, + "routers_loss": 0.00781513936817646, + "skip_count": 3.0, + "step": 2628, + "text_loss": 0.17802883684635162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 12.347519812151454, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008953145189583429, + "loss": 0.0126, + "macro_f1": 0.542222261428833, + "num_tokens": 4238248.0, + "repeat_count": 0.0, + "routers_loss": 0.062252625823020935, + "skip_count": 4.0, + "step": 2630, + "text_loss": 0.5551572442054749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008951249276707933, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4241042.0, + "repeat_count": 0.0, + "routers_loss": 0.0011421777307987213, + "skip_count": 0.0, + "step": 2632, + "text_loss": 0.7092233896255493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0008949351849706261, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4243939.0, + "repeat_count": 0.0, + "routers_loss": 0.0032689040526747704, + "skip_count": 0.0, + "step": 2634, + "text_loss": 0.19925718009471893 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008947452909305509, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 4247535.0, + "repeat_count": 1.0, + "routers_loss": 0.002066014800220728, + "skip_count": 0.0, + "step": 2636, + "text_loss": 0.5249715447425842 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 12.385089521573232, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.09326171875, + "learning_rate": 0.0008945552456233356, + "loss": 0.0169, + "macro_f1": 0.8820862174034119, + "num_tokens": 4251441.0, + "repeat_count": 2.0, + "routers_loss": 0.029332537204027176, + "skip_count": 2.0, + "step": 2638, + "text_loss": 0.19229578971862793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.078125, + "learning_rate": 0.0008943650491218058, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4254314.0, + "repeat_count": 0.0, + "routers_loss": 0.0075911120511591434, + "skip_count": 0.0, + "step": 2640, + "text_loss": 0.27059751749038696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008941747014988453, + "loss": 0.0156, + "macro_f1": 0.3333333432674408, + "num_tokens": 4257442.0, + "repeat_count": 0.0, + "routers_loss": 0.009030844084918499, + "skip_count": 0.0, + "step": 2642, + "text_loss": 0.36747801303863525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.123046875, + "learning_rate": 0.0008939842028273956, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 4260386.0, + "repeat_count": 0.0, + "routers_loss": 0.007844001986086369, + "skip_count": 1.0, + "step": 2644, + "text_loss": 0.6397647857666016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.422659230995011, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008937935531804562, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 4263516.0, + "repeat_count": 0.0, + "routers_loss": 0.0018789108144119382, + "skip_count": 0.0, + "step": 2646, + "text_loss": 0.4795534908771515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.432051658350455, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0008936027526310844, + "loss": 0.0098, + "macro_f1": 0.3272727429866791, + "num_tokens": 4266744.0, + "repeat_count": 0.0, + "routers_loss": 0.0348590686917305, + "skip_count": 1.0, + "step": 2648, + "text_loss": 0.27691999077796936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.000893411801252395, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4269766.0, + "repeat_count": 0.0, + "routers_loss": 0.004543309565633535, + "skip_count": 1.0, + "step": 2650, + "text_loss": 0.18867231905460358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008932206991175615, + "loss": 0.0141, + "macro_f1": 0.6666666865348816, + "num_tokens": 4273513.0, + "repeat_count": 0.0, + "routers_loss": 0.0035277456045150757, + "skip_count": 1.0, + "step": 2652, + "text_loss": 0.45613357424736023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008930294462998143, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4276878.0, + "repeat_count": 1.0, + "routers_loss": 0.011337592266499996, + "skip_count": 0.0, + "step": 2654, + "text_loss": 0.24733254313468933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0008928380428724419, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4279915.0, + "repeat_count": 0.0, + "routers_loss": 0.0010295971296727657, + "skip_count": 1.0, + "step": 2656, + "text_loss": 0.41722849011421204 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008926464889087903, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 4282888.0, + "repeat_count": 0.0, + "routers_loss": 0.0017198545392602682, + "skip_count": 2.0, + "step": 2658, + "text_loss": 0.738322377204895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0008924547844822634, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4285805.0, + "repeat_count": 0.0, + "routers_loss": 0.001339946174994111, + "skip_count": 0.0, + "step": 2660, + "text_loss": 0.4802379906177521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.000892262929666323, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4290282.0, + "repeat_count": 0.0, + "routers_loss": 0.0022340165451169014, + "skip_count": 0.0, + "step": 2662, + "text_loss": 0.6503544449806213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008920709245344878, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4294106.0, + "repeat_count": 0.0, + "routers_loss": 0.005288850050419569, + "skip_count": 1.0, + "step": 2664, + "text_loss": 0.12312037497758865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0008918787691603347, + "loss": 0.0121, + "macro_f1": 0.6666666865348816, + "num_tokens": 4298013.0, + "repeat_count": 0.0, + "routers_loss": 0.004259659443050623, + "skip_count": 1.0, + "step": 2666, + "text_loss": 0.3070000112056732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.000891686463617498, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 4300799.0, + "repeat_count": 0.0, + "routers_loss": 0.009489355608820915, + "skip_count": 1.0, + "step": 2668, + "text_loss": 0.18535588681697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008914940079796696, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4304641.0, + "repeat_count": 0.0, + "routers_loss": 0.0025417013093829155, + "skip_count": 0.0, + "step": 2670, + "text_loss": 0.482585072517395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008913014023205988, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4307462.0, + "repeat_count": 0.0, + "routers_loss": 0.006371749565005302, + "skip_count": 0.0, + "step": 2672, + "text_loss": 0.7064456939697266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008911086467140925, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4310396.0, + "repeat_count": 0.0, + "routers_loss": 0.0027512952219694853, + "skip_count": 0.0, + "step": 2674, + "text_loss": 0.23532851040363312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05712890625, + "learning_rate": 0.000890915741234015, + "loss": 0.0133, + "macro_f1": 0.6666666865348816, + "num_tokens": 4314781.0, + "repeat_count": 0.0, + "routers_loss": 0.008253013715147972, + "skip_count": 1.0, + "step": 2676, + "text_loss": 0.30950358510017395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008907226859542879, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4317988.0, + "repeat_count": 0.0, + "routers_loss": 0.005409995559602976, + "skip_count": 2.0, + "step": 2678, + "text_loss": 0.4930732846260071 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0008905294809488907, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 4321014.0, + "repeat_count": 1.0, + "routers_loss": 0.0029942214023321867, + "skip_count": 1.0, + "step": 2680, + "text_loss": 0.6224040389060974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0008903361262918595, + "loss": 0.0115, + "macro_f1": 0.6666666865348816, + "num_tokens": 4324268.0, + "repeat_count": 0.0, + "routers_loss": 0.008411120623350143, + "skip_count": 1.0, + "step": 2682, + "text_loss": 0.16296671330928802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05126953125, + "learning_rate": 0.0008901426220572884, + "loss": 0.0138, + "macro_f1": 1.0, + "num_tokens": 4327494.0, + "repeat_count": 2.0, + "routers_loss": 0.01039006095379591, + "skip_count": 4.0, + "step": 2684, + "text_loss": 0.43866512179374695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0008899489683193286, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 4330936.0, + "repeat_count": 0.0, + "routers_loss": 0.0009329111780971289, + "skip_count": 0.0, + "step": 2686, + "text_loss": 0.44250962138175964 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0008897551651521885, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 4334123.0, + "repeat_count": 0.0, + "routers_loss": 0.003197216661646962, + "skip_count": 0.0, + "step": 2688, + "text_loss": 0.48313501477241516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.629292632814794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09716796875, + "learning_rate": 0.0008895612126301339, + "loss": 0.0157, + "macro_f1": 0.3333333432674408, + "num_tokens": 4337610.0, + "repeat_count": 0.0, + "routers_loss": 0.0033548236824572086, + "skip_count": 0.0, + "step": 2690, + "text_loss": 0.4715327322483063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0008893671108274877, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 4341026.0, + "repeat_count": 0.0, + "routers_loss": 0.0024757643695920706, + "skip_count": 0.0, + "step": 2692, + "text_loss": 0.43402785062789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008891728598186302, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 4344422.0, + "repeat_count": 0.0, + "routers_loss": 0.003317243419587612, + "skip_count": 0.0, + "step": 2694, + "text_loss": 0.8498559594154358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 12.657469914881126, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008889784596779986, + "loss": 0.009, + "macro_f1": 0.5934640765190125, + "num_tokens": 4347507.0, + "repeat_count": 0.0, + "routers_loss": 0.01577926240861416, + "skip_count": 3.0, + "step": 2696, + "text_loss": 0.5646669864654541 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11328125, + "learning_rate": 0.0008887839104800876, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4350414.0, + "repeat_count": 0.0, + "routers_loss": 0.002953822258859873, + "skip_count": 0.0, + "step": 2698, + "text_loss": 0.5145012140274048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008885892122994486, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 4354110.0, + "repeat_count": 0.0, + "routers_loss": 0.005849295295774937, + "skip_count": 0.0, + "step": 2700, + "text_loss": 0.580982506275177 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008883943652106903, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 4357323.0, + "repeat_count": 1.0, + "routers_loss": 0.012347398325800896, + "skip_count": 2.0, + "step": 2702, + "text_loss": 0.2234988808631897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0008881993692884787, + "loss": 0.0128, + "macro_f1": 0.6666666865348816, + "num_tokens": 4360228.0, + "repeat_count": 0.0, + "routers_loss": 0.003574999049305916, + "skip_count": 1.0, + "step": 2704, + "text_loss": 0.4261806607246399 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.704432051658351, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008880042246075365, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4363905.0, + "repeat_count": 0.0, + "routers_loss": 0.0031574300955981016, + "skip_count": 0.0, + "step": 2706, + "text_loss": 0.691118061542511 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008878089312426433, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 4366736.0, + "repeat_count": 0.0, + "routers_loss": 0.003195564029738307, + "skip_count": 0.0, + "step": 2708, + "text_loss": 0.613926112651825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6000000238418579, + "avg_layers": 25.0, + "epoch": 12.72321690636924, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.0, + "f1_skip": 0.75, + "grad_norm": 0.054443359375, + "learning_rate": 0.0008876134892686363, + "loss": 0.011, + "macro_f1": 0.5694444179534912, + "num_tokens": 4370146.0, + "repeat_count": 0.0, + "routers_loss": 0.038784291595220566, + "skip_count": 5.0, + "step": 2710, + "text_loss": 0.2723451852798462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.000887417898760409, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 4373653.0, + "repeat_count": 0.0, + "routers_loss": 0.0006457131239585578, + "skip_count": 0.0, + "step": 2712, + "text_loss": 0.31667640805244446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.742001761080129, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10498046875, + "learning_rate": 0.000887222159792912, + "loss": 0.0155, + "macro_f1": 0.6603773832321167, + "num_tokens": 4376993.0, + "repeat_count": 1.0, + "routers_loss": 0.045078590512275696, + "skip_count": 1.0, + "step": 2714, + "text_loss": 0.5872798562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0008870262724411528, + "loss": 0.012, + "macro_f1": 0.3333333432674408, + "num_tokens": 4380160.0, + "repeat_count": 0.0, + "routers_loss": 0.003628545207902789, + "skip_count": 0.0, + "step": 2716, + "text_loss": 0.7468157410621643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.760786615791018, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0008868302367801962, + "loss": 0.0118, + "macro_f1": 0.6598639488220215, + "num_tokens": 4383100.0, + "repeat_count": 1.0, + "routers_loss": 0.05404464527964592, + "skip_count": 3.0, + "step": 2718, + "text_loss": 0.2970244884490967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008866340528851629, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4386700.0, + "repeat_count": 0.0, + "routers_loss": 0.007000274024903774, + "skip_count": 0.0, + "step": 2720, + "text_loss": 0.34521186351776123 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 12.779571470501908, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052978515625, + "learning_rate": 0.0008864377208312313, + "loss": 0.0082, + "macro_f1": 0.8823530077934265, + "num_tokens": 4390299.0, + "repeat_count": 1.0, + "routers_loss": 0.02025366574525833, + "skip_count": 2.0, + "step": 2722, + "text_loss": 1.0536936521530151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.788963897857352, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.000886241240693636, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 4393353.0, + "repeat_count": 0.0, + "routers_loss": 0.00251673418097198, + "skip_count": 0.0, + "step": 2724, + "text_loss": 0.5678093433380127 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008860446125476686, + "loss": 0.0135, + "macro_f1": 0.6666666865348816, + "num_tokens": 4396446.0, + "repeat_count": 1.0, + "routers_loss": 0.009532532654702663, + "skip_count": 0.0, + "step": 2726, + "text_loss": 0.23775041103363037 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.091796875, + "learning_rate": 0.0008858478364686776, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 4399977.0, + "repeat_count": 1.0, + "routers_loss": 0.008062181062996387, + "skip_count": 0.0, + "step": 2728, + "text_loss": 0.18888695538043976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.817141179923686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008856509125320678, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 4404406.0, + "repeat_count": 0.0, + "routers_loss": 0.0007731119985692203, + "skip_count": 0.0, + "step": 2730, + "text_loss": 0.47331541776657104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008854538408133006, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 4407165.0, + "repeat_count": 0.0, + "routers_loss": 0.003115242812782526, + "skip_count": 1.0, + "step": 2732, + "text_loss": 0.491370290517807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008852566213878947, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4410101.0, + "repeat_count": 0.0, + "routers_loss": 0.0008958528051152825, + "skip_count": 0.0, + "step": 2734, + "text_loss": 0.42188262939453125 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0008850592543314246, + "loss": 0.0118, + "macro_f1": 1.0, + "num_tokens": 4413015.0, + "repeat_count": 1.0, + "routers_loss": 0.01139112375676632, + "skip_count": 1.0, + "step": 2736, + "text_loss": 0.4716498553752899 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.854710889345466, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0008848617397195218, + "loss": 0.0084, + "macro_f1": 0.6603773832321167, + "num_tokens": 4416404.0, + "repeat_count": 1.0, + "routers_loss": 0.01609630137681961, + "skip_count": 1.0, + "step": 2738, + "text_loss": 0.19490821659564972 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008846640776278745, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 4419408.0, + "repeat_count": 0.0, + "routers_loss": 0.001489170710556209, + "skip_count": 0.0, + "step": 2740, + "text_loss": 0.6443108320236206 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0008844662681322269, + "loss": 0.0144, + "macro_f1": 0.6666666865348816, + "num_tokens": 4422067.0, + "repeat_count": 1.0, + "routers_loss": 0.0014755792217329144, + "skip_count": 0.0, + "step": 2742, + "text_loss": 0.9150356650352478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0008842683113083801, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 4425647.0, + "repeat_count": 0.0, + "routers_loss": 0.008962674997746944, + "skip_count": 1.0, + "step": 2744, + "text_loss": 0.7103227972984314 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.892280598767243, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0008840702072321915, + "loss": 0.0104, + "macro_f1": 0.6598639488220215, + "num_tokens": 4428855.0, + "repeat_count": 1.0, + "routers_loss": 0.02554207295179367, + "skip_count": 3.0, + "step": 2746, + "text_loss": 0.27141591906547546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0008838719559795751, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4432838.0, + "repeat_count": 0.0, + "routers_loss": 0.0011747616808861494, + "skip_count": 0.0, + "step": 2748, + "text_loss": 0.4007738530635834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.911065453478134, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008836735576265009, + "loss": 0.0073, + "macro_f1": 0.5492662787437439, + "num_tokens": 4435793.0, + "repeat_count": 0.0, + "routers_loss": 0.017564335837960243, + "skip_count": 2.0, + "step": 2750, + "text_loss": 0.5972410440444946 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0008834750122489956, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 4438871.0, + "repeat_count": 1.0, + "routers_loss": 0.007004009559750557, + "skip_count": 0.0, + "step": 2752, + "text_loss": 0.2294853925704956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008832763199231423, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 4441846.0, + "repeat_count": 0.0, + "routers_loss": 0.0014562139986082911, + "skip_count": 0.0, + "step": 2754, + "text_loss": 0.722432017326355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.939242735544468, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0008830774807250802, + "loss": 0.013, + "macro_f1": 0.3272727429866791, + "num_tokens": 4444786.0, + "repeat_count": 1.0, + "routers_loss": 0.024773593991994858, + "skip_count": 0.0, + "step": 2756, + "text_loss": 0.507905125617981 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 12.948635162899912, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008828784947310049, + "loss": 0.0129, + "macro_f1": 0.8823530077934265, + "num_tokens": 4448442.0, + "repeat_count": 1.0, + "routers_loss": 0.04959975928068161, + "skip_count": 2.0, + "step": 2758, + "text_loss": 0.3617522418498993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.000882679362017168, + "loss": 0.0149, + "macro_f1": 1.0, + "num_tokens": 4451401.0, + "repeat_count": 1.0, + "routers_loss": 0.005783245898783207, + "skip_count": 2.0, + "step": 2760, + "text_loss": 0.49187400937080383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0008824800826598778, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 4454537.0, + "repeat_count": 0.0, + "routers_loss": 0.00656260596588254, + "skip_count": 0.0, + "step": 2762, + "text_loss": 0.6823583245277405 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0008822806567354983, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 4457706.0, + "repeat_count": 1.0, + "routers_loss": 0.005298966076225042, + "skip_count": 0.0, + "step": 2764, + "text_loss": 0.554322361946106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.986204872321691, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008820810843204501, + "loss": 0.0096, + "macro_f1": 0.3272727429866791, + "num_tokens": 4460710.0, + "repeat_count": 0.0, + "routers_loss": 0.03164982795715332, + "skip_count": 1.0, + "step": 2766, + "text_loss": 0.1656961441040039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0008818813654912095, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 4464001.0, + "repeat_count": 0.0, + "routers_loss": 0.000715116853825748, + "skip_count": 0.0, + "step": 2768, + "text_loss": 0.5818144083023071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008816815003243093, + "loss": 0.0133, + "macro_f1": 0.3333333432674408, + "num_tokens": 4467364.0, + "repeat_count": 0.0, + "routers_loss": 0.002851625671610236, + "skip_count": 0.0, + "step": 2770, + "text_loss": 0.6068631410598755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008814814888963383, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 4470681.0, + "repeat_count": 0.0, + "routers_loss": 0.004729873035103083, + "skip_count": 1.0, + "step": 2772, + "text_loss": 0.5386646389961243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.000881281331283941, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 4473734.0, + "repeat_count": 0.0, + "routers_loss": 0.0031853127293288708, + "skip_count": 1.0, + "step": 2774, + "text_loss": 0.5695263147354126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008810810275638182, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4478404.0, + "repeat_count": 0.0, + "routers_loss": 0.0008977465913631022, + "skip_count": 0.0, + "step": 2776, + "text_loss": 0.4750773310661316 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008808805778127269, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4481287.0, + "repeat_count": 0.0, + "routers_loss": 0.00469845999032259, + "skip_count": 0.0, + "step": 2778, + "text_loss": 0.14078612625598907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 13.051658350454945, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008806799821074796, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 4483929.0, + "repeat_count": 0.0, + "routers_loss": 0.01789761893451214, + "skip_count": 2.0, + "step": 2780, + "text_loss": 0.2167191207408905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008804792405249451, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 4487468.0, + "repeat_count": 0.0, + "routers_loss": 0.001018838956952095, + "skip_count": 0.0, + "step": 2782, + "text_loss": 0.5424665212631226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 13.070443205165835, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.07373046875, + "learning_rate": 0.000880278353142048, + "loss": 0.0077, + "macro_f1": 0.8200000524520874, + "num_tokens": 4490942.0, + "repeat_count": 1.0, + "routers_loss": 0.03260354697704315, + "skip_count": 3.0, + "step": 2784, + "text_loss": 0.20994654297828674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008800773200357683, + "loss": 0.0122, + "macro_f1": 0.3333333432674408, + "num_tokens": 4493986.0, + "repeat_count": 0.0, + "routers_loss": 0.003019835101440549, + "skip_count": 0.0, + "step": 2786, + "text_loss": 0.5709528923034668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008798761412831429, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 4498232.0, + "repeat_count": 0.0, + "routers_loss": 0.00285192858427763, + "skip_count": 0.0, + "step": 2788, + "text_loss": 0.5103896260261536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0008796748169612634, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4501231.0, + "repeat_count": 0.0, + "routers_loss": 0.0012469831854104996, + "skip_count": 0.0, + "step": 2790, + "text_loss": 0.43669697642326355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0008794733471472778, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4504208.0, + "repeat_count": 0.0, + "routers_loss": 0.011512776836752892, + "skip_count": 1.0, + "step": 2792, + "text_loss": 0.2299770563840866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008792717319183899, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 4507013.0, + "repeat_count": 0.0, + "routers_loss": 0.00834917277097702, + "skip_count": 0.0, + "step": 2794, + "text_loss": 0.2130603939294815 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008790699713518587, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 4510286.0, + "repeat_count": 0.0, + "routers_loss": 0.008616939187049866, + "skip_count": 2.0, + "step": 2796, + "text_loss": 0.4377101957798004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0008788680655249994, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4513762.0, + "repeat_count": 0.0, + "routers_loss": 0.003408568911254406, + "skip_count": 0.0, + "step": 2798, + "text_loss": 0.435138463973999 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008786660145151826, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4516696.0, + "repeat_count": 1.0, + "routers_loss": 0.0029398901388049126, + "skip_count": 0.0, + "step": 2800, + "text_loss": 0.3195655047893524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008784638183998348, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4519760.0, + "repeat_count": 0.0, + "routers_loss": 0.0013777425047010183, + "skip_count": 0.0, + "step": 2802, + "text_loss": 0.8129430413246155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0008782614772564379, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4522106.0, + "repeat_count": 0.0, + "routers_loss": 0.0031694830395281315, + "skip_count": 0.0, + "step": 2804, + "text_loss": 0.18083660304546356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0008780589911625293, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4525743.0, + "repeat_count": 0.0, + "routers_loss": 0.002161208540201187, + "skip_count": 0.0, + "step": 2806, + "text_loss": 0.8228182792663574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0008778563601957021, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 4529573.0, + "repeat_count": 0.0, + "routers_loss": 0.0028444856870919466, + "skip_count": 1.0, + "step": 2808, + "text_loss": 0.3715563118457794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008776535844336049, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4532452.0, + "repeat_count": 0.0, + "routers_loss": 0.003807213855907321, + "skip_count": 0.0, + "step": 2810, + "text_loss": 0.6012523174285889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0008774506639539417, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 4536077.0, + "repeat_count": 0.0, + "routers_loss": 0.006698979996144772, + "skip_count": 0.0, + "step": 2812, + "text_loss": 0.27097949385643005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0008772475988344722, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 4539057.0, + "repeat_count": 0.0, + "routers_loss": 0.004849409218877554, + "skip_count": 1.0, + "step": 2814, + "text_loss": 1.026973843574524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 13.22072204285295, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008770443891530109, + "loss": 0.0115, + "macro_f1": 0.5934640765190125, + "num_tokens": 4542253.0, + "repeat_count": 0.0, + "routers_loss": 0.019148651510477066, + "skip_count": 3.0, + "step": 2816, + "text_loss": 0.2717585563659668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.230114470208395, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008768410349874286, + "loss": 0.0098, + "macro_f1": 0.6601307392120361, + "num_tokens": 4545047.0, + "repeat_count": 1.0, + "routers_loss": 0.02231316640973091, + "skip_count": 2.0, + "step": 2818, + "text_loss": 0.274346262216568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008766375364156508, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 4548371.0, + "repeat_count": 0.0, + "routers_loss": 0.008014129474759102, + "skip_count": 2.0, + "step": 2820, + "text_loss": 0.22850871086120605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008764338935156586, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4551276.0, + "repeat_count": 0.0, + "routers_loss": 0.0014544493751600385, + "skip_count": 0.0, + "step": 2822, + "text_loss": 0.6308462023735046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000876230106365488, + "loss": 0.0123, + "macro_f1": 0.6666666865348816, + "num_tokens": 4554143.0, + "repeat_count": 0.0, + "routers_loss": 0.00818584579974413, + "skip_count": 3.0, + "step": 2824, + "text_loss": 0.3484207093715668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0008760261750432312, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 4557256.0, + "repeat_count": 0.0, + "routers_loss": 0.006275608204305172, + "skip_count": 3.0, + "step": 2826, + "text_loss": 0.1927330046892166 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008758220996270348, + "loss": 0.0103, + "macro_f1": 1.0, + "num_tokens": 4560202.0, + "repeat_count": 2.0, + "routers_loss": 0.0055974251590669155, + "skip_count": 2.0, + "step": 2828, + "text_loss": 0.7796496748924255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008756178801951007, + "loss": 0.0129, + "macro_f1": 0.3333333432674408, + "num_tokens": 4563508.0, + "repeat_count": 0.0, + "routers_loss": 0.0019799957517534494, + "skip_count": 0.0, + "step": 2830, + "text_loss": 0.49633297324180603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008754135168256865, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4566776.0, + "repeat_count": 0.0, + "routers_loss": 0.004538947716355324, + "skip_count": 0.0, + "step": 2832, + "text_loss": 0.5346745252609253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008752090095971044, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 4569787.0, + "repeat_count": 0.0, + "routers_loss": 0.001663343166001141, + "skip_count": 0.0, + "step": 2834, + "text_loss": 0.5524004697799683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.000875004358587722, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 4572813.0, + "repeat_count": 0.0, + "routers_loss": 0.0022988212294876575, + "skip_count": 0.0, + "step": 2836, + "text_loss": 0.4232870042324066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.000874799563875962, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 4575563.0, + "repeat_count": 0.0, + "routers_loss": 0.007781553082168102, + "skip_count": 1.0, + "step": 2838, + "text_loss": 0.19239822030067444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 13.333431171118287, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03515625, + "learning_rate": 0.0008745946255403021, + "loss": 0.0072, + "macro_f1": 0.5492662787437439, + "num_tokens": 4578117.0, + "repeat_count": 0.0, + "routers_loss": 0.01872488670051098, + "skip_count": 2.0, + "step": 2840, + "text_loss": 0.2148810178041458 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0008743895436592749, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 4582330.0, + "repeat_count": 1.0, + "routers_loss": 0.005634195636957884, + "skip_count": 1.0, + "step": 2842, + "text_loss": 0.4929640591144562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0008741843183114685, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4585765.0, + "repeat_count": 0.0, + "routers_loss": 0.0008928569150157273, + "skip_count": 0.0, + "step": 2844, + "text_loss": 0.32702967524528503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008739789495755253, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4589000.0, + "repeat_count": 0.0, + "routers_loss": 0.014715569093823433, + "skip_count": 4.0, + "step": 2846, + "text_loss": 0.25125816464424133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008737734375301433, + "loss": 0.0135, + "macro_f1": 0.3333333432674408, + "num_tokens": 4592391.0, + "repeat_count": 0.0, + "routers_loss": 0.0017551190685480833, + "skip_count": 0.0, + "step": 2848, + "text_loss": 0.6595172882080078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0008735677822540749, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 4596662.0, + "repeat_count": 0.0, + "routers_loss": 0.0006456313421949744, + "skip_count": 0.0, + "step": 2850, + "text_loss": 0.6290773153305054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0008733619838261276, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 4599682.0, + "repeat_count": 0.0, + "routers_loss": 0.00765060493722558, + "skip_count": 2.0, + "step": 2852, + "text_loss": 0.3268161416053772 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008731560423251637, + "loss": 0.01, + "macro_f1": 1.0, + "num_tokens": 4603324.0, + "repeat_count": 1.0, + "routers_loss": 0.01161442045122385, + "skip_count": 2.0, + "step": 2854, + "text_loss": 0.3029932975769043 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 13.408570589961844, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008729499578301005, + "loss": 0.0098, + "macro_f1": 0.9555556178092957, + "num_tokens": 4606975.0, + "repeat_count": 1.0, + "routers_loss": 0.02055389992892742, + "skip_count": 5.0, + "step": 2856, + "text_loss": 0.6268532872200012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.00087274373041991, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4609629.0, + "repeat_count": 0.0, + "routers_loss": 0.0013911726418882608, + "skip_count": 0.0, + "step": 2858, + "text_loss": 0.534355640411377 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008725373601736188, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 4612913.0, + "repeat_count": 2.0, + "routers_loss": 0.01010701060295105, + "skip_count": 0.0, + "step": 2860, + "text_loss": 0.3391380310058594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0008723308471703085, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4616718.0, + "repeat_count": 0.0, + "routers_loss": 0.005969462916254997, + "skip_count": 1.0, + "step": 2862, + "text_loss": 0.47250816226005554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.446140299383622, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008721241914891152, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4619680.0, + "repeat_count": 0.0, + "routers_loss": 0.0027780034579336643, + "skip_count": 0.0, + "step": 2864, + "text_loss": 0.3249278664588928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008719173932092295, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 4622700.0, + "repeat_count": 0.0, + "routers_loss": 0.0015912104863673449, + "skip_count": 0.0, + "step": 2866, + "text_loss": 0.7789985537528992 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05126953125, + "learning_rate": 0.0008717104524098973, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4626637.0, + "repeat_count": 0.0, + "routers_loss": 0.0036539011634886265, + "skip_count": 0.0, + "step": 2868, + "text_loss": 0.619088351726532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0008715033691704187, + "loss": 0.0118, + "macro_f1": 0.6666666865348816, + "num_tokens": 4629863.0, + "repeat_count": 0.0, + "routers_loss": 0.008402476087212563, + "skip_count": 1.0, + "step": 2870, + "text_loss": 0.5550018548965454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008712961435701479, + "loss": 0.0161, + "macro_f1": 0.6666666865348816, + "num_tokens": 4632657.0, + "repeat_count": 0.0, + "routers_loss": 0.01400839351117611, + "skip_count": 1.0, + "step": 2872, + "text_loss": 0.17368625104427338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008710887756884947, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 4635885.0, + "repeat_count": 0.0, + "routers_loss": 0.0014573842054232955, + "skip_count": 0.0, + "step": 2874, + "text_loss": 0.5138643383979797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008708812656049225, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 4639341.0, + "repeat_count": 0.0, + "routers_loss": 0.002810224425047636, + "skip_count": 1.0, + "step": 2876, + "text_loss": 0.70310378074646 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 13.511887290871735, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008706736133989497, + "loss": 0.0105, + "macro_f1": 0.9449735879898071, + "num_tokens": 4642163.0, + "repeat_count": 2.0, + "routers_loss": 0.029783209785819054, + "skip_count": 4.0, + "step": 2878, + "text_loss": 0.26898008584976196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008704658191501491, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4645858.0, + "repeat_count": 0.0, + "routers_loss": 0.0009193966398015618, + "skip_count": 0.0, + "step": 2880, + "text_loss": 0.6047570705413818 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.530672145582624, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008702578829381475, + "loss": 0.0131, + "macro_f1": 0.8814815282821655, + "num_tokens": 4649237.0, + "repeat_count": 2.0, + "routers_loss": 0.05698608607053757, + "skip_count": 4.0, + "step": 2882, + "text_loss": 0.10695219784975052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0008700498048426269, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4652362.0, + "repeat_count": 0.0, + "routers_loss": 0.0011786938412114978, + "skip_count": 0.0, + "step": 2884, + "text_loss": 0.4442957937717438 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.549457000293513, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008698415849433229, + "loss": 0.0092, + "macro_f1": 0.5492662787437439, + "num_tokens": 4655616.0, + "repeat_count": 2.0, + "routers_loss": 0.02142646163702011, + "skip_count": 0.0, + "step": 2886, + "text_loss": 0.5820964574813843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008696332233200262, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 4659294.0, + "repeat_count": 0.0, + "routers_loss": 0.004038636106997728, + "skip_count": 0.0, + "step": 2888, + "text_loss": 0.11847645789384842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008694247200525806, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4662512.0, + "repeat_count": 0.0, + "routers_loss": 0.0013256469974294305, + "skip_count": 0.0, + "step": 2890, + "text_loss": 0.4873582720756531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.577634282359847, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008692160752208856, + "loss": 0.0129, + "macro_f1": 0.3272727429866791, + "num_tokens": 4666190.0, + "repeat_count": 0.0, + "routers_loss": 0.04477972164750099, + "skip_count": 1.0, + "step": 2892, + "text_loss": 0.44243401288986206 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.0008690072889048941, + "loss": 0.0127, + "macro_f1": 1.0, + "num_tokens": 4668884.0, + "repeat_count": 1.0, + "routers_loss": 0.004407547414302826, + "skip_count": 2.0, + "step": 2894, + "text_loss": 0.6847127079963684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008687983611846133, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4672093.0, + "repeat_count": 0.0, + "routers_loss": 0.005245382897555828, + "skip_count": 1.0, + "step": 2896, + "text_loss": 0.25583332777023315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008685892921401049, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4674917.0, + "repeat_count": 0.0, + "routers_loss": 0.0010470855049788952, + "skip_count": 0.0, + "step": 2898, + "text_loss": 0.41998377442359924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008683800818514844, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4677739.0, + "repeat_count": 0.0, + "routers_loss": 0.009026622399687767, + "skip_count": 2.0, + "step": 2900, + "text_loss": 0.303053081035614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.0008681707303989215, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4680721.0, + "repeat_count": 0.0, + "routers_loss": 0.004500916693359613, + "skip_count": 0.0, + "step": 2902, + "text_loss": 0.5573288798332214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0008679612378626404, + "loss": 0.0098, + "macro_f1": 0.6666666865348816, + "num_tokens": 4683339.0, + "repeat_count": 0.0, + "routers_loss": 0.005047840531915426, + "skip_count": 1.0, + "step": 2904, + "text_loss": 0.321353554725647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.643381273847961, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008677516043229187, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 4686453.0, + "repeat_count": 0.0, + "routers_loss": 0.010256914421916008, + "skip_count": 1.0, + "step": 2906, + "text_loss": 0.4300784468650818 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008675418298600883, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 4689645.0, + "repeat_count": 1.0, + "routers_loss": 0.0022669637110084295, + "skip_count": 0.0, + "step": 2908, + "text_loss": 0.5064885020256042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008673319145545358, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4692320.0, + "repeat_count": 0.0, + "routers_loss": 0.0011188550852239132, + "skip_count": 0.0, + "step": 2910, + "text_loss": 0.7114819884300232 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008671218584867003, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 4695116.0, + "repeat_count": 0.0, + "routers_loss": 0.002966561820358038, + "skip_count": 2.0, + "step": 2912, + "text_loss": 0.5662392973899841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0008669116617370762, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4698040.0, + "repeat_count": 0.0, + "routers_loss": 0.0012894890969619155, + "skip_count": 0.0, + "step": 2914, + "text_loss": 0.718977689743042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0008667013243862111, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 4700963.0, + "repeat_count": 0.0, + "routers_loss": 0.0007232456118799746, + "skip_count": 0.0, + "step": 2916, + "text_loss": 0.3447718024253845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.699735837980628, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.000866490846514707, + "loss": 0.0075, + "macro_f1": 0.3272727429866791, + "num_tokens": 4704471.0, + "repeat_count": 1.0, + "routers_loss": 0.015166680328547955, + "skip_count": 0.0, + "step": 2918, + "text_loss": 0.454946368932724 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.000866280228203219, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 4707238.0, + "repeat_count": 1.0, + "routers_loss": 0.0061312485486269, + "skip_count": 1.0, + "step": 2920, + "text_loss": 0.721788227558136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008660694695324564, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 4711323.0, + "repeat_count": 0.0, + "routers_loss": 0.00169933564029634, + "skip_count": 0.0, + "step": 2922, + "text_loss": 0.7562121748924255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008658585705831829, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 4714417.0, + "repeat_count": 0.0, + "routers_loss": 0.0022731393110007048, + "skip_count": 0.0, + "step": 2924, + "text_loss": 0.5726147890090942 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.737305547402407, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0008656475314362148, + "loss": 0.0131, + "macro_f1": 0.8817967176437378, + "num_tokens": 4717445.0, + "repeat_count": 2.0, + "routers_loss": 0.06477782875299454, + "skip_count": 3.0, + "step": 2926, + "text_loss": 0.4505867660045624 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 13.74669797475785, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.06396484375, + "learning_rate": 0.0008654363521724229, + "loss": 0.0129, + "macro_f1": 0.9449735879898071, + "num_tokens": 4722253.0, + "repeat_count": 2.0, + "routers_loss": 0.027405790984630585, + "skip_count": 4.0, + "step": 2928, + "text_loss": 0.24767601490020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0008652250328727315, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 4725465.0, + "repeat_count": 0.0, + "routers_loss": 0.006544729229062796, + "skip_count": 2.0, + "step": 2930, + "text_loss": 0.4478724002838135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008650135736181184, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 4729213.0, + "repeat_count": 1.0, + "routers_loss": 0.0055119614116847515, + "skip_count": 0.0, + "step": 2932, + "text_loss": 0.6749323010444641 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008648019744896154, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 4732280.0, + "repeat_count": 0.0, + "routers_loss": 0.008374541997909546, + "skip_count": 0.0, + "step": 2934, + "text_loss": 0.4647359251976013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.78426768417963, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0008645902355683077, + "loss": 0.0091, + "macro_f1": 0.6595745086669922, + "num_tokens": 4736244.0, + "repeat_count": 1.0, + "routers_loss": 0.068686343729496, + "skip_count": 4.0, + "step": 2936, + "text_loss": 0.5356017351150513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008643783569353339, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4739810.0, + "repeat_count": 2.0, + "routers_loss": 0.017954571172595024, + "skip_count": 0.0, + "step": 2938, + "text_loss": 0.3145926296710968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.803052538890519, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.0008641663386718863, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4742720.0, + "repeat_count": 0.0, + "routers_loss": 0.006261351052671671, + "skip_count": 1.0, + "step": 2940, + "text_loss": 0.3200613856315613 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008639541808592109, + "loss": 0.0093, + "macro_f1": 1.0, + "num_tokens": 4745870.0, + "repeat_count": 1.0, + "routers_loss": 0.0025341357104480267, + "skip_count": 1.0, + "step": 2942, + "text_loss": 0.5020416378974915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0008637418835786067, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4748943.0, + "repeat_count": 0.0, + "routers_loss": 0.008970048278570175, + "skip_count": 2.0, + "step": 2944, + "text_loss": 0.14517110586166382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008635294469114265, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 4751360.0, + "repeat_count": 0.0, + "routers_loss": 0.002133632078766823, + "skip_count": 0.0, + "step": 2946, + "text_loss": 0.5367856025695801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0008633168709390766, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4754403.0, + "repeat_count": 0.0, + "routers_loss": 0.0011866620043292642, + "skip_count": 0.0, + "step": 2948, + "text_loss": 0.38302522897720337 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0008631041557430163, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 4757867.0, + "repeat_count": 2.0, + "routers_loss": 0.0026854004245251417, + "skip_count": 0.0, + "step": 2950, + "text_loss": 0.43433454632759094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0008628913014047585, + "loss": 0.0102, + "macro_f1": 0.3333333432674408, + "num_tokens": 4761171.0, + "repeat_count": 0.0, + "routers_loss": 0.002433479530736804, + "skip_count": 0.0, + "step": 2952, + "text_loss": 0.4725971519947052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008626783080058696, + "loss": 0.0066, + "macro_f1": 0.3272727429866791, + "num_tokens": 4764752.0, + "repeat_count": 1.0, + "routers_loss": 0.017182493582367897, + "skip_count": 0.0, + "step": 2954, + "text_loss": 0.460641473531723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0008624651756279687, + "loss": 0.0198, + "macro_f1": 0.3333333432674408, + "num_tokens": 4767453.0, + "repeat_count": 0.0, + "routers_loss": 0.0018134774873033166, + "skip_count": 0.0, + "step": 2956, + "text_loss": 0.4091459810733795 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.887584385089522, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.000862251904352729, + "loss": 0.0108, + "macro_f1": 0.9259259104728699, + "num_tokens": 4771110.0, + "repeat_count": 3.0, + "routers_loss": 0.0365753099322319, + "skip_count": 3.0, + "step": 2958, + "text_loss": 0.22408585250377655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.896976812444967, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.000862038494261876, + "loss": 0.0109, + "macro_f1": 0.3272727429866791, + "num_tokens": 4774464.0, + "repeat_count": 0.0, + "routers_loss": 0.024343067780137062, + "skip_count": 1.0, + "step": 2960, + "text_loss": 0.16483014822006226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008618249454371891, + "loss": 0.01, + "macro_f1": 0.3333333432674408, + "num_tokens": 4777894.0, + "repeat_count": 0.0, + "routers_loss": 0.0008310087723657489, + "skip_count": 0.0, + "step": 2962, + "text_loss": 0.5573428869247437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008616112579605006, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4781116.0, + "repeat_count": 0.0, + "routers_loss": 0.0065494864247739315, + "skip_count": 0.0, + "step": 2964, + "text_loss": 0.18816794455051422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0008613974319136957, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 4784886.0, + "repeat_count": 0.0, + "routers_loss": 0.0019726944155991077, + "skip_count": 0.0, + "step": 2966, + "text_loss": 0.5097305774688721 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0008611834673787134, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 4787563.0, + "repeat_count": 0.0, + "routers_loss": 0.006327496841549873, + "skip_count": 0.0, + "step": 2968, + "text_loss": 0.6953814029693604 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.94393894922219, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.5, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0008609693644375449, + "loss": 0.0086, + "macro_f1": 0.8200000524520874, + "num_tokens": 4790421.0, + "repeat_count": 3.0, + "routers_loss": 0.042896661907434464, + "skip_count": 1.0, + "step": 2970, + "text_loss": 0.2573051154613495 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 13.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.14453125, + "learning_rate": 0.000860755123172235, + "loss": 0.0096, + "macro_f1": 1.0, + "num_tokens": 4793786.0, + "repeat_count": 2.0, + "routers_loss": 0.013228793628513813, + "skip_count": 1.0, + "step": 2972, + "text_loss": 0.46614497900009155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008605407436648815, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4796864.0, + "repeat_count": 0.0, + "routers_loss": 0.007294759154319763, + "skip_count": 2.0, + "step": 2974, + "text_loss": 0.21555091440677643 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008603262259976348, + "loss": 0.0129, + "macro_f1": 1.0, + "num_tokens": 4800080.0, + "repeat_count": 1.0, + "routers_loss": 0.0024024227168411016, + "skip_count": 5.0, + "step": 2976, + "text_loss": 0.7855485081672668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0008601115702526987, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4802899.0, + "repeat_count": 0.0, + "routers_loss": 0.001433031284250319, + "skip_count": 0.0, + "step": 2978, + "text_loss": 0.6777765154838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008598967765123293, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 4805835.0, + "repeat_count": 0.0, + "routers_loss": 0.003073975909501314, + "skip_count": 0.0, + "step": 2980, + "text_loss": 0.5926910638809204 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 14.0, + "f1_execute": 0.9333333373069763, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008596818448588364, + "loss": 0.0139, + "macro_f1": 0.8666667342185974, + "num_tokens": 4809028.0, + "repeat_count": 1.0, + "routers_loss": 0.06438573449850082, + "skip_count": 6.0, + "step": 2982, + "text_loss": 0.23975612223148346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.009392427355445, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0008594667753745821, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 4812831.0, + "repeat_count": 0.0, + "routers_loss": 0.014817612245678902, + "skip_count": 1.0, + "step": 2984, + "text_loss": 0.17292268574237823 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.018784854710889, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0008592515681419813, + "loss": 0.0078, + "macro_f1": 0.5492662787437439, + "num_tokens": 4816005.0, + "repeat_count": 2.0, + "routers_loss": 0.025407327339053154, + "skip_count": 0.0, + "step": 2986, + "text_loss": 0.6403061151504517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0008590362232435018, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4818901.0, + "repeat_count": 0.0, + "routers_loss": 0.006826757453382015, + "skip_count": 0.0, + "step": 2988, + "text_loss": 0.2572069466114044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008588207407616644, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 4823120.0, + "repeat_count": 0.0, + "routers_loss": 0.0009054148104041815, + "skip_count": 0.0, + "step": 2990, + "text_loss": 0.4827076196670532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0008586051207790422, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 4825774.0, + "repeat_count": 0.0, + "routers_loss": 0.0012294676853343844, + "skip_count": 0.0, + "step": 2992, + "text_loss": 0.40157821774482727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 14.056354564132668, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052734375, + "learning_rate": 0.0008583893633782612, + "loss": 0.0084, + "macro_f1": 0.5492662787437439, + "num_tokens": 4828841.0, + "repeat_count": 0.0, + "routers_loss": 0.011474622413516045, + "skip_count": 2.0, + "step": 2994, + "text_loss": 0.14842072129249573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.058837890625, + "learning_rate": 0.0008581734686419999, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4831458.0, + "repeat_count": 0.0, + "routers_loss": 0.009154081344604492, + "skip_count": 2.0, + "step": 2996, + "text_loss": 0.365400105714798 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00085795743665299, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4834609.0, + "repeat_count": 0.0, + "routers_loss": 0.002899336162954569, + "skip_count": 0.0, + "step": 2998, + "text_loss": 0.5574684143066406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008577412674940152, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4838324.0, + "repeat_count": 0.0, + "routers_loss": 0.0034664268605411053, + "skip_count": 0.0, + "step": 3000, + "text_loss": 0.6752855777740479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008575249612479117, + "loss": 0.0127, + "macro_f1": 0.6666666865348816, + "num_tokens": 4841877.0, + "repeat_count": 0.0, + "routers_loss": 0.0036425739526748657, + "skip_count": 2.0, + "step": 3002, + "text_loss": 0.6332980394363403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.0008573085179975685, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4845840.0, + "repeat_count": 0.0, + "routers_loss": 0.0013783496106043458, + "skip_count": 0.0, + "step": 3004, + "text_loss": 0.4219617545604706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008570919378259274, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4848766.0, + "repeat_count": 0.0, + "routers_loss": 0.004823608323931694, + "skip_count": 1.0, + "step": 3006, + "text_loss": 0.7987180948257446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000856875220815982, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4852310.0, + "repeat_count": 0.0, + "routers_loss": 0.0014760984340682626, + "skip_count": 0.0, + "step": 3008, + "text_loss": 0.35592713952064514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008566583670507788, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4856146.0, + "repeat_count": 0.0, + "routers_loss": 0.0031717263627797365, + "skip_count": 1.0, + "step": 3010, + "text_loss": 0.19379083812236786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008564413766134164, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 4859386.0, + "repeat_count": 0.0, + "routers_loss": 0.003361492184922099, + "skip_count": 0.0, + "step": 3012, + "text_loss": 0.39129266142845154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0008562242495870463, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4862661.0, + "repeat_count": 0.0, + "routers_loss": 0.0010563990799710155, + "skip_count": 0.0, + "step": 3014, + "text_loss": 0.5966938734054565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0008560069860548716, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 4865410.0, + "repeat_count": 0.0, + "routers_loss": 0.001233913702890277, + "skip_count": 0.0, + "step": 3016, + "text_loss": 0.3386077880859375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008557895861001484, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 4868931.0, + "repeat_count": 0.0, + "routers_loss": 0.0018066301709041, + "skip_count": 0.0, + "step": 3018, + "text_loss": 0.5222050547599792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008555720498061845, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4873492.0, + "repeat_count": 0.0, + "routers_loss": 0.0050385501235723495, + "skip_count": 1.0, + "step": 3020, + "text_loss": 0.4558849334716797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.187848547108894, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008553543772563403, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 4877026.0, + "repeat_count": 0.0, + "routers_loss": 0.004828717093914747, + "skip_count": 0.0, + "step": 3022, + "text_loss": 0.36598992347717285 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 14.197240974464338, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.06103515625, + "learning_rate": 0.0008551365685340285, + "loss": 0.0084, + "macro_f1": 0.9555556178092957, + "num_tokens": 4879655.0, + "repeat_count": 1.0, + "routers_loss": 0.02049369551241398, + "skip_count": 5.0, + "step": 3024, + "text_loss": 0.5069093704223633 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 14.206633401819783, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008549186237227138, + "loss": 0.0088, + "macro_f1": 0.8823530077934265, + "num_tokens": 4882606.0, + "repeat_count": 1.0, + "routers_loss": 0.03947242721915245, + "skip_count": 2.0, + "step": 3026, + "text_loss": 0.2600715458393097 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 14.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0008547005429059128, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 4885246.0, + "repeat_count": 2.0, + "routers_loss": 0.0026363315992057323, + "skip_count": 0.0, + "step": 3028, + "text_loss": 0.37642326951026917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008544823261671948, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 4888109.0, + "repeat_count": 0.0, + "routers_loss": 0.003858231008052826, + "skip_count": 0.0, + "step": 3030, + "text_loss": 0.5875385999679565 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 14.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0008542639735901804, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 4891168.0, + "repeat_count": 1.0, + "routers_loss": 0.004789089784026146, + "skip_count": 1.0, + "step": 3032, + "text_loss": 0.6417325139045715 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.244203111241562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008540454852585434, + "loss": 0.0115, + "macro_f1": 0.6666666865348816, + "num_tokens": 4894355.0, + "repeat_count": 0.0, + "routers_loss": 0.007334680762141943, + "skip_count": 2.0, + "step": 3034, + "text_loss": 0.23697198927402496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 14.253595538597006, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008538268612560084, + "loss": 0.0058, + "macro_f1": 0.4871794879436493, + "num_tokens": 4897543.0, + "repeat_count": 0.0, + "routers_loss": 0.022096361964941025, + "skip_count": 3.0, + "step": 3036, + "text_loss": 0.1989550143480301 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0008536081016663527, + "loss": 0.0101, + "macro_f1": 1.0, + "num_tokens": 4900752.0, + "repeat_count": 1.0, + "routers_loss": 0.0037680594250559807, + "skip_count": 2.0, + "step": 3038, + "text_loss": 0.5001366138458252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008533892065734055, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4903581.0, + "repeat_count": 0.0, + "routers_loss": 0.0032373068388551474, + "skip_count": 1.0, + "step": 3040, + "text_loss": 0.5019411444664001 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008531701760610476, + "loss": 0.0121, + "macro_f1": 1.0, + "num_tokens": 4907108.0, + "repeat_count": 1.0, + "routers_loss": 0.0078013185411691666, + "skip_count": 2.0, + "step": 3042, + "text_loss": 0.3460627794265747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 14.291165248018785, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.04833984375, + "learning_rate": 0.000852951010213212, + "loss": 0.0089, + "macro_f1": 0.8200000524520874, + "num_tokens": 4911269.0, + "repeat_count": 1.0, + "routers_loss": 0.03576689213514328, + "skip_count": 3.0, + "step": 3044, + "text_loss": 0.268994003534317 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 14.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0008527317091138835, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 4914203.0, + "repeat_count": 1.0, + "routers_loss": 0.0032140621915459633, + "skip_count": 1.0, + "step": 3046, + "text_loss": 0.9998719692230225 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008525122728470987, + "loss": 0.0102, + "macro_f1": 1.0, + "num_tokens": 4918562.0, + "repeat_count": 1.0, + "routers_loss": 0.008559177629649639, + "skip_count": 3.0, + "step": 3048, + "text_loss": 0.3062439560890198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0008522927014969459, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 4921940.0, + "repeat_count": 0.0, + "routers_loss": 0.008735597133636475, + "skip_count": 2.0, + "step": 3050, + "text_loss": 0.3637430965900421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0008520729951475652, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 4925416.0, + "repeat_count": 0.0, + "routers_loss": 0.0012709591537714005, + "skip_count": 0.0, + "step": 3052, + "text_loss": 0.542036235332489 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008518531538831488, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4928695.0, + "repeat_count": 0.0, + "routers_loss": 0.0010660928674042225, + "skip_count": 1.0, + "step": 3054, + "text_loss": 0.43144503235816956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.059326171875, + "learning_rate": 0.00085163317778794, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4931504.0, + "repeat_count": 0.0, + "routers_loss": 0.004558971151709557, + "skip_count": 2.0, + "step": 3056, + "text_loss": 0.5257010459899902 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008514130669462341, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4934935.0, + "repeat_count": 0.0, + "routers_loss": 0.010774781927466393, + "skip_count": 2.0, + "step": 3058, + "text_loss": 0.26061776280403137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.366304666862343, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008511928214423782, + "loss": 0.0103, + "macro_f1": 0.6601307392120361, + "num_tokens": 4938047.0, + "repeat_count": 1.0, + "routers_loss": 0.014763157814741135, + "skip_count": 2.0, + "step": 3060, + "text_loss": 0.2856905460357666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0008509724413607705, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 4941041.0, + "repeat_count": 1.0, + "routers_loss": 0.004613345488905907, + "skip_count": 0.0, + "step": 3062, + "text_loss": 0.2870287001132965 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008507519267858612, + "loss": 0.015, + "macro_f1": 1.0, + "num_tokens": 4944708.0, + "repeat_count": 1.0, + "routers_loss": 0.008584189228713512, + "skip_count": 2.0, + "step": 3064, + "text_loss": 0.15828095376491547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0008505312778021519, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 4948295.0, + "repeat_count": 0.0, + "routers_loss": 0.0014670816017314792, + "skip_count": 0.0, + "step": 3066, + "text_loss": 0.36697930097579956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0008503104944941958, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 4951983.0, + "repeat_count": 0.0, + "routers_loss": 0.005348859820514917, + "skip_count": 2.0, + "step": 3068, + "text_loss": 0.21612997353076935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008500895769465972, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 4955023.0, + "repeat_count": 0.0, + "routers_loss": 0.0013203793205320835, + "skip_count": 0.0, + "step": 3070, + "text_loss": 0.9757798314094543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.422659230995011, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008498685252440124, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 4957600.0, + "repeat_count": 0.0, + "routers_loss": 0.006907356437295675, + "skip_count": 0.0, + "step": 3072, + "text_loss": 0.356107234954834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.432051658350455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0008496473394711487, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 4960746.0, + "repeat_count": 0.0, + "routers_loss": 0.0027704904787242413, + "skip_count": 1.0, + "step": 3074, + "text_loss": 0.6812908053398132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0008494260197127649, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 4963845.0, + "repeat_count": 0.0, + "routers_loss": 0.0036796489730477333, + "skip_count": 2.0, + "step": 3076, + "text_loss": 0.7215370535850525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0008492045660536712, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 4966887.0, + "repeat_count": 0.0, + "routers_loss": 0.0037137691397219896, + "skip_count": 1.0, + "step": 3078, + "text_loss": 0.8700299859046936 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 14.460228940416789, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008489829785787291, + "loss": 0.0078, + "macro_f1": 0.8823530077934265, + "num_tokens": 4969859.0, + "repeat_count": 1.0, + "routers_loss": 0.016492314636707306, + "skip_count": 2.0, + "step": 3080, + "text_loss": 0.6520360112190247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008487612573728513, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4972628.0, + "repeat_count": 0.0, + "routers_loss": 0.004022917244583368, + "skip_count": 2.0, + "step": 3082, + "text_loss": 0.17498187720775604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008485394025210016, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 4975475.0, + "repeat_count": 0.0, + "routers_loss": 0.009141159243881702, + "skip_count": 1.0, + "step": 3084, + "text_loss": 0.5975366234779358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008483174141081956, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4978858.0, + "repeat_count": 0.0, + "routers_loss": 0.0031561285723000765, + "skip_count": 0.0, + "step": 3086, + "text_loss": 0.18748866021633148 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008480952922194991, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4982142.0, + "repeat_count": 0.0, + "routers_loss": 0.0007894713780842721, + "skip_count": 0.0, + "step": 3088, + "text_loss": 0.42083197832107544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008478730369400302, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4984872.0, + "repeat_count": 0.0, + "routers_loss": 0.0005908289458602667, + "skip_count": 0.0, + "step": 3090, + "text_loss": 0.45337188243865967 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0008476506483549573, + "loss": 0.0101, + "macro_f1": 1.0, + "num_tokens": 4988137.0, + "repeat_count": 1.0, + "routers_loss": 0.0016509373672306538, + "skip_count": 2.0, + "step": 3092, + "text_loss": 0.6397262811660767 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0008474281265495002, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 4991164.0, + "repeat_count": 0.0, + "routers_loss": 0.004088304936885834, + "skip_count": 1.0, + "step": 3094, + "text_loss": 0.18352322280406952 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008472054716089295, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 4993876.0, + "repeat_count": 0.0, + "routers_loss": 0.005200014915317297, + "skip_count": 0.0, + "step": 3096, + "text_loss": 0.2776511013507843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.544760786615791, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0008469826836185673, + "loss": 0.01, + "macro_f1": 0.6601307392120361, + "num_tokens": 4997068.0, + "repeat_count": 1.0, + "routers_loss": 0.012686059810221195, + "skip_count": 2.0, + "step": 3098, + "text_loss": 0.23209233582019806 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008467597626637858, + "loss": 0.0074, + "macro_f1": 1.0, + "num_tokens": 5000038.0, + "repeat_count": 1.0, + "routers_loss": 0.006401528604328632, + "skip_count": 2.0, + "step": 3100, + "text_loss": 0.45936745405197144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008465367088300093, + "loss": 0.0075, + "macro_f1": 0.3272727429866791, + "num_tokens": 5002870.0, + "repeat_count": 0.0, + "routers_loss": 0.016640547662973404, + "skip_count": 1.0, + "step": 3102, + "text_loss": 0.44502779841423035 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0008463135222027124, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 5006357.0, + "repeat_count": 0.0, + "routers_loss": 0.008411331102252007, + "skip_count": 2.0, + "step": 3104, + "text_loss": 0.3414570391178131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008460902028674204, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5009059.0, + "repeat_count": 0.0, + "routers_loss": 0.0010406570509076118, + "skip_count": 0.0, + "step": 3106, + "text_loss": 0.5931221842765808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0008458667509097098, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 5012327.0, + "repeat_count": 0.0, + "routers_loss": 0.001959054498001933, + "skip_count": 0.0, + "step": 3108, + "text_loss": 0.5191171169281006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008456431664152078, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 5015472.0, + "repeat_count": 0.0, + "routers_loss": 0.000994380097836256, + "skip_count": 0.0, + "step": 3110, + "text_loss": 0.4455361068248749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0008454194494695923, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 5018901.0, + "repeat_count": 0.0, + "routers_loss": 0.0037662344984710217, + "skip_count": 0.0, + "step": 3112, + "text_loss": 0.5335362553596497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 14.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0008451956001585923, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5022520.0, + "repeat_count": 0.0, + "routers_loss": 0.008664715103805065, + "skip_count": 3.0, + "step": 3114, + "text_loss": 0.16230148077011108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.629292632814794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.000844971618567987, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 5025505.0, + "repeat_count": 0.0, + "routers_loss": 0.0015904927859082818, + "skip_count": 0.0, + "step": 3116, + "text_loss": 0.6989432573318481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008447475047836068, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 5028767.0, + "repeat_count": 0.0, + "routers_loss": 0.005853322334587574, + "skip_count": 1.0, + "step": 3118, + "text_loss": 0.31420737504959106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 14.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008445232588913325, + "loss": 0.0115, + "macro_f1": 0.3272727429866791, + "num_tokens": 5032577.0, + "repeat_count": 0.0, + "routers_loss": 0.012760105542838573, + "skip_count": 0.0, + "step": 3120, + "text_loss": 0.5534627437591553 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008442988809770953, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 5035381.0, + "repeat_count": 0.0, + "routers_loss": 0.0022257440723478794, + "skip_count": 0.0, + "step": 3122, + "text_loss": 0.42492759227752686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008440743711268775, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 5038743.0, + "repeat_count": 0.0, + "routers_loss": 0.004648433532565832, + "skip_count": 0.0, + "step": 3124, + "text_loss": 0.16404685378074646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008438497294267117, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5041492.0, + "repeat_count": 0.0, + "routers_loss": 0.006313877180218697, + "skip_count": 0.0, + "step": 3126, + "text_loss": 0.23191484808921814 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0008436249559626807, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 5043955.0, + "repeat_count": 1.0, + "routers_loss": 0.0036270488053560257, + "skip_count": 0.0, + "step": 3128, + "text_loss": 0.5782018303871155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008434000508209187, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5047571.0, + "repeat_count": 0.0, + "routers_loss": 0.003809858812019229, + "skip_count": 1.0, + "step": 3130, + "text_loss": 0.7129825949668884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.704432051658351, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008431750140876092, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 5051608.0, + "repeat_count": 0.0, + "routers_loss": 0.0022369057405740023, + "skip_count": 0.0, + "step": 3132, + "text_loss": 0.4433445930480957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.000842949845848987, + "loss": 0.0135, + "macro_f1": 0.32098764181137085, + "num_tokens": 5054656.0, + "repeat_count": 0.0, + "routers_loss": 0.0425117202103138, + "skip_count": 2.0, + "step": 3134, + "text_loss": 0.38721024990081787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0008427245461913368, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 5059108.0, + "repeat_count": 0.0, + "routers_loss": 0.0018077283166348934, + "skip_count": 0.0, + "step": 3136, + "text_loss": 0.7496368885040283 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.12109375, + "learning_rate": 0.0008424991152009941, + "loss": 0.0111, + "macro_f1": 1.0, + "num_tokens": 5062371.0, + "repeat_count": 1.0, + "routers_loss": 0.008801834657788277, + "skip_count": 2.0, + "step": 3138, + "text_loss": 0.5337086319923401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 14.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0008422735529643444, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5065593.0, + "repeat_count": 0.0, + "routers_loss": 0.00548676960170269, + "skip_count": 3.0, + "step": 3140, + "text_loss": 0.2561623156070709 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0008420478595678233, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5068271.0, + "repeat_count": 0.0, + "routers_loss": 0.006389956455677748, + "skip_count": 0.0, + "step": 3142, + "text_loss": 0.15605193376541138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0008418220350979175, + "loss": 0.0128, + "macro_f1": 1.0, + "num_tokens": 5071358.0, + "repeat_count": 1.0, + "routers_loss": 0.012387622147798538, + "skip_count": 2.0, + "step": 3144, + "text_loss": 0.3085838258266449 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008415960796411628, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5075584.0, + "repeat_count": 0.0, + "routers_loss": 0.00311864772811532, + "skip_count": 1.0, + "step": 3146, + "text_loss": 0.4786977469921112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.779571470501908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0008413699932841461, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5078388.0, + "repeat_count": 0.0, + "routers_loss": 0.0030679800547659397, + "skip_count": 0.0, + "step": 3148, + "text_loss": 0.5222916603088379 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.788963897857352, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008411437761135039, + "loss": 0.011, + "macro_f1": 1.0, + "num_tokens": 5081584.0, + "repeat_count": 1.0, + "routers_loss": 0.012907958589494228, + "skip_count": 2.0, + "step": 3150, + "text_loss": 0.5369884371757507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0008409174282159232, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 5084450.0, + "repeat_count": 0.0, + "routers_loss": 0.012314042076468468, + "skip_count": 2.0, + "step": 3152, + "text_loss": 0.25685277581214905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.000840690949678141, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 5087865.0, + "repeat_count": 1.0, + "routers_loss": 0.00899206381291151, + "skip_count": 0.0, + "step": 3154, + "text_loss": 0.1717093288898468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.817141179923686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0008404643405869441, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 5090857.0, + "repeat_count": 0.0, + "routers_loss": 0.0013312003575265408, + "skip_count": 0.0, + "step": 3156, + "text_loss": 0.27446436882019043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0008402376010291695, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 5093917.0, + "repeat_count": 0.0, + "routers_loss": 0.002653320087119937, + "skip_count": 0.0, + "step": 3158, + "text_loss": 0.4237489402294159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008400107310917045, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5096656.0, + "repeat_count": 0.0, + "routers_loss": 0.012976993806660175, + "skip_count": 2.0, + "step": 3160, + "text_loss": 0.42361980676651 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.000839783730861486, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5099582.0, + "repeat_count": 0.0, + "routers_loss": 0.006936746649444103, + "skip_count": 2.0, + "step": 3162, + "text_loss": 0.26656073331832886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008395566004255008, + "loss": 0.0127, + "macro_f1": 0.6666666865348816, + "num_tokens": 5102908.0, + "repeat_count": 0.0, + "routers_loss": 0.006619359832257032, + "skip_count": 1.0, + "step": 3164, + "text_loss": 0.590774416923523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0008393293398707858, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 5105829.0, + "repeat_count": 0.0, + "routers_loss": 0.010120268911123276, + "skip_count": 2.0, + "step": 3166, + "text_loss": 0.605930507183075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008391019492844275, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 5109850.0, + "repeat_count": 0.0, + "routers_loss": 0.004940980114042759, + "skip_count": 2.0, + "step": 3168, + "text_loss": 0.12973152101039886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0008388744287535627, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 5113353.0, + "repeat_count": 0.0, + "routers_loss": 0.0031777634285390377, + "skip_count": 1.0, + "step": 3170, + "text_loss": 0.18577200174331665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0008386467783653775, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 5116421.0, + "repeat_count": 0.0, + "routers_loss": 0.005431659985333681, + "skip_count": 0.0, + "step": 3172, + "text_loss": 0.2302747517824173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 14.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.000838418998207108, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5119457.0, + "repeat_count": 0.0, + "routers_loss": 0.0077286697924137115, + "skip_count": 4.0, + "step": 3174, + "text_loss": 0.19606637954711914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0008381910883660399, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5123201.0, + "repeat_count": 0.0, + "routers_loss": 0.003982985392212868, + "skip_count": 0.0, + "step": 3176, + "text_loss": 0.716376006603241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0008379630489295089, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 5126035.0, + "repeat_count": 0.0, + "routers_loss": 0.005626026075333357, + "skip_count": 1.0, + "step": 3178, + "text_loss": 0.5144625902175903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008377348799849, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 5129179.0, + "repeat_count": 0.0, + "routers_loss": 0.015458245761692524, + "skip_count": 2.0, + "step": 3180, + "text_loss": 0.29887503385543823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 14.939242735544468, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008375065816196479, + "loss": 0.0086, + "macro_f1": 0.5492662787437439, + "num_tokens": 5132149.0, + "repeat_count": 0.0, + "routers_loss": 0.012210468761622906, + "skip_count": 2.0, + "step": 3182, + "text_loss": 0.8981851935386658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008372781539212371, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5135287.0, + "repeat_count": 0.0, + "routers_loss": 0.0052537876181304455, + "skip_count": 0.0, + "step": 3184, + "text_loss": 0.4245666563510895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0008370495969772014, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5138589.0, + "repeat_count": 0.0, + "routers_loss": 0.012873421423137188, + "skip_count": 2.0, + "step": 3186, + "text_loss": 0.40581050515174866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 14.9674200176108, + "f1_execute": 0.95652174949646, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0008368209108751244, + "loss": 0.0127, + "macro_f1": 0.6521739363670349, + "num_tokens": 5141635.0, + "repeat_count": 2.0, + "routers_loss": 0.07720445841550827, + "skip_count": 4.0, + "step": 3188, + "text_loss": 0.3755173981189728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0008365920957026389, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5144728.0, + "repeat_count": 0.0, + "routers_loss": 0.001440995605662465, + "skip_count": 0.0, + "step": 3190, + "text_loss": 0.5067034363746643 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.986204872321691, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008363631515474275, + "loss": 0.0089, + "macro_f1": 0.6538461446762085, + "num_tokens": 5147963.0, + "repeat_count": 1.0, + "routers_loss": 0.018752984702587128, + "skip_count": 2.0, + "step": 3192, + "text_loss": 0.20224551856517792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0008361340784972217, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 5151184.0, + "repeat_count": 0.0, + "routers_loss": 0.0005360354552976787, + "skip_count": 0.0, + "step": 3194, + "text_loss": 0.4588058292865753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008359048766398031, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5153889.0, + "repeat_count": 0.0, + "routers_loss": 0.0009184491937048733, + "skip_count": 1.0, + "step": 3196, + "text_loss": 0.2980220317840576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.000835675546063002, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5156758.0, + "repeat_count": 0.0, + "routers_loss": 0.001252970308996737, + "skip_count": 0.0, + "step": 3198, + "text_loss": 0.6775755882263184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008354460868546985, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 5160247.0, + "repeat_count": 0.0, + "routers_loss": 0.0037315806839615107, + "skip_count": 0.0, + "step": 3200, + "text_loss": 0.35867011547088623 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0008352164991028217, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 5163456.0, + "repeat_count": 1.0, + "routers_loss": 0.001497485558502376, + "skip_count": 0.0, + "step": 3202, + "text_loss": 0.690290093421936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008349867828953501, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 5166139.0, + "repeat_count": 0.0, + "routers_loss": 0.001051135826855898, + "skip_count": 0.0, + "step": 3204, + "text_loss": 0.3340415954589844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008347569383203113, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 5169009.0, + "repeat_count": 0.0, + "routers_loss": 0.0010544003453105688, + "skip_count": 0.0, + "step": 3206, + "text_loss": 0.8584878444671631 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008345269654657823, + "loss": 0.0085, + "macro_f1": 1.0, + "num_tokens": 5172618.0, + "repeat_count": 1.0, + "routers_loss": 0.007312417030334473, + "skip_count": 1.0, + "step": 3208, + "text_loss": 0.19500218331813812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008342968644198892, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 5175857.0, + "repeat_count": 0.0, + "routers_loss": 0.00276504410430789, + "skip_count": 0.0, + "step": 3210, + "text_loss": 0.5446314215660095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0008340666352708068, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 5178585.0, + "repeat_count": 0.0, + "routers_loss": 0.002669303445145488, + "skip_count": 0.0, + "step": 3212, + "text_loss": 0.3687484860420227 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008338362781067596, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5181777.0, + "repeat_count": 0.0, + "routers_loss": 0.0031585274264216423, + "skip_count": 0.0, + "step": 3214, + "text_loss": 0.27325859665870667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.000833605793016021, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 5184312.0, + "repeat_count": 0.0, + "routers_loss": 0.008807534351944923, + "skip_count": 2.0, + "step": 3216, + "text_loss": 0.4466548562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008333751800869133, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5187497.0, + "repeat_count": 0.0, + "routers_loss": 0.003171310294419527, + "skip_count": 0.0, + "step": 3218, + "text_loss": 0.5423526763916016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0008331444394078076, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 5190982.0, + "repeat_count": 0.0, + "routers_loss": 0.0016481258207932115, + "skip_count": 2.0, + "step": 3220, + "text_loss": 0.48984917998313904 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000832913571067124, + "loss": 0.0107, + "macro_f1": 1.0, + "num_tokens": 5194044.0, + "repeat_count": 1.0, + "routers_loss": 0.003957313951104879, + "skip_count": 1.0, + "step": 3222, + "text_loss": 0.4533331096172333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008326825751533322, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5197092.0, + "repeat_count": 0.0, + "routers_loss": 0.0016904744552448392, + "skip_count": 0.0, + "step": 3224, + "text_loss": 0.5538802742958069 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0008324514517549501, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5199941.0, + "repeat_count": 0.0, + "routers_loss": 0.005608258303254843, + "skip_count": 1.0, + "step": 3226, + "text_loss": 0.416242778301239 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 15.154975051364836, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008322202009605444, + "loss": 0.0072, + "macro_f1": 0.8823530077934265, + "num_tokens": 5202618.0, + "repeat_count": 1.0, + "routers_loss": 0.020965175703167915, + "skip_count": 2.0, + "step": 3228, + "text_loss": 0.17496295273303986 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 15.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008319888228587311, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 5206414.0, + "repeat_count": 1.0, + "routers_loss": 0.021259209141135216, + "skip_count": 5.0, + "step": 3230, + "text_loss": 0.22471418976783752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0008317573175381745, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 5209768.0, + "repeat_count": 0.0, + "routers_loss": 0.0018647604156285524, + "skip_count": 0.0, + "step": 3232, + "text_loss": 0.4415269196033478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008315256850875881, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5213257.0, + "repeat_count": 0.0, + "routers_loss": 0.002345515415072441, + "skip_count": 0.0, + "step": 3234, + "text_loss": 0.347247838973999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 15.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008312939255957336, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 5215800.0, + "repeat_count": 0.0, + "routers_loss": 0.007112892810255289, + "skip_count": 3.0, + "step": 3236, + "text_loss": 0.31091734766960144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008310620391514219, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5219205.0, + "repeat_count": 0.0, + "routers_loss": 0.00432228296995163, + "skip_count": 0.0, + "step": 3238, + "text_loss": 0.3421775996685028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0008308300258435124, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 5222422.0, + "repeat_count": 0.0, + "routers_loss": 0.0076514314860105515, + "skip_count": 2.0, + "step": 3240, + "text_loss": 0.22378318011760712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008305978857609128, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 5225625.0, + "repeat_count": 0.0, + "routers_loss": 0.0007617069641128182, + "skip_count": 0.0, + "step": 3242, + "text_loss": 0.5880323648452759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0008303656189925799, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5229113.0, + "repeat_count": 0.0, + "routers_loss": 0.0017418119823560119, + "skip_count": 0.0, + "step": 3244, + "text_loss": 0.3302813768386841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008301332256275183, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5232061.0, + "repeat_count": 0.0, + "routers_loss": 0.0026667986530810595, + "skip_count": 0.0, + "step": 3246, + "text_loss": 0.5679706335067749 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0008299007057547821, + "loss": 0.0106, + "macro_f1": 1.0, + "num_tokens": 5235279.0, + "repeat_count": 1.0, + "routers_loss": 0.011016624979674816, + "skip_count": 2.0, + "step": 3248, + "text_loss": 0.5081504583358765 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008296680594634731, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 5239655.0, + "repeat_count": 1.0, + "routers_loss": 0.005492044147104025, + "skip_count": 0.0, + "step": 3250, + "text_loss": 0.14675180613994598 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0008294352868427418, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5243579.0, + "repeat_count": 0.0, + "routers_loss": 0.00404445780441165, + "skip_count": 1.0, + "step": 3252, + "text_loss": 0.4201085865497589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0008292023879817871, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 5247059.0, + "repeat_count": 0.0, + "routers_loss": 0.006886140909045935, + "skip_count": 1.0, + "step": 3254, + "text_loss": 0.2289208322763443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008289693629698564, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 5249940.0, + "repeat_count": 0.0, + "routers_loss": 0.0005736657767556608, + "skip_count": 0.0, + "step": 3256, + "text_loss": 0.5670450925827026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.295861461696507, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0008287362118962452, + "loss": 0.006, + "macro_f1": 0.3272727429866791, + "num_tokens": 5253580.0, + "repeat_count": 0.0, + "routers_loss": 0.011349895037710667, + "skip_count": 1.0, + "step": 3258, + "text_loss": 0.5042323470115662 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0008285029348502973, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5257080.0, + "repeat_count": 0.0, + "routers_loss": 0.0013626761501654983, + "skip_count": 0.0, + "step": 3260, + "text_loss": 0.3227672874927521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0008282695319214053, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5259951.0, + "repeat_count": 0.0, + "routers_loss": 0.00471635302528739, + "skip_count": 0.0, + "step": 3262, + "text_loss": 0.20773714780807495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008280360031990093, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 5263314.0, + "repeat_count": 0.0, + "routers_loss": 0.010472415015101433, + "skip_count": 2.0, + "step": 3264, + "text_loss": 0.34397366642951965 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.333431171118287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.000827802348772598, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 5267358.0, + "repeat_count": 0.0, + "routers_loss": 0.0007814752752892673, + "skip_count": 0.0, + "step": 3266, + "text_loss": 0.747342586517334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008275685687317084, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5270400.0, + "repeat_count": 0.0, + "routers_loss": 0.000902949133887887, + "skip_count": 0.0, + "step": 3268, + "text_loss": 0.43782034516334534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008273346631659252, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5273147.0, + "repeat_count": 0.0, + "routers_loss": 0.00043462219764478505, + "skip_count": 0.0, + "step": 3270, + "text_loss": 0.6358205080032349 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008271006321648816, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 5277638.0, + "repeat_count": 0.0, + "routers_loss": 0.002211218234151602, + "skip_count": 0.0, + "step": 3272, + "text_loss": 0.20220105350017548 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008268664758182589, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5280638.0, + "repeat_count": 1.0, + "routers_loss": 0.010536720044910908, + "skip_count": 0.0, + "step": 3274, + "text_loss": 0.7579061388969421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008266321942157859, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5283847.0, + "repeat_count": 0.0, + "routers_loss": 0.0017158017726615071, + "skip_count": 0.0, + "step": 3276, + "text_loss": 0.669302761554718 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.389785735250953, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008263977874472399, + "loss": 0.0088, + "macro_f1": 0.9544159770011902, + "num_tokens": 5286627.0, + "repeat_count": 5.0, + "routers_loss": 0.011220700107514858, + "skip_count": 4.0, + "step": 3278, + "text_loss": 0.8703984022140503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008261632556024461, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5289766.0, + "repeat_count": 0.0, + "routers_loss": 0.0020442772656679153, + "skip_count": 0.0, + "step": 3280, + "text_loss": 0.5009346008300781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.0008259285987712774, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5293010.0, + "repeat_count": 0.0, + "routers_loss": 0.005645765457302332, + "skip_count": 0.0, + "step": 3282, + "text_loss": 0.2546011209487915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008256938170436549, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 5296732.0, + "repeat_count": 0.0, + "routers_loss": 0.0027385836001485586, + "skip_count": 2.0, + "step": 3284, + "text_loss": 0.5244000554084778 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008254589105095473, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 5299926.0, + "repeat_count": 1.0, + "routers_loss": 0.007451715879142284, + "skip_count": 1.0, + "step": 3286, + "text_loss": 0.28979742527008057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0008252238792589711, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5303006.0, + "repeat_count": 0.0, + "routers_loss": 0.004805843345820904, + "skip_count": 2.0, + "step": 3288, + "text_loss": 0.5131978392601013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000824988723381991, + "loss": 0.0091, + "macro_f1": 0.3272727429866791, + "num_tokens": 5306953.0, + "repeat_count": 0.0, + "routers_loss": 0.010639613494277, + "skip_count": 1.0, + "step": 3290, + "text_loss": 0.4901447296142578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 15.455532726739067, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008247534429687191, + "loss": 0.007, + "macro_f1": 0.5492662787437439, + "num_tokens": 5310516.0, + "repeat_count": 0.0, + "routers_loss": 0.013625577092170715, + "skip_count": 2.0, + "step": 3292, + "text_loss": 0.2124534696340561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008245180381093152, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 5313959.0, + "repeat_count": 0.0, + "routers_loss": 0.004958513658493757, + "skip_count": 1.0, + "step": 3294, + "text_loss": 0.46682238578796387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008242825088939867, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5316609.0, + "repeat_count": 0.0, + "routers_loss": 0.003962756600230932, + "skip_count": 0.0, + "step": 3296, + "text_loss": 0.7010108232498169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008240468554129892, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5319638.0, + "repeat_count": 0.0, + "routers_loss": 0.0006996620795689523, + "skip_count": 0.0, + "step": 3298, + "text_loss": 0.4966355860233307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0008238110777566255, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 5323019.0, + "repeat_count": 0.0, + "routers_loss": 0.0016031896229833364, + "skip_count": 0.0, + "step": 3300, + "text_loss": 0.38668957352638245 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0008235751760152459, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 5326099.0, + "repeat_count": 2.0, + "routers_loss": 0.00344281829893589, + "skip_count": 2.0, + "step": 3302, + "text_loss": 0.5330720543861389 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008233391502792484, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5328993.0, + "repeat_count": 0.0, + "routers_loss": 0.007886730134487152, + "skip_count": 1.0, + "step": 3304, + "text_loss": 0.5470269322395325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008231030006390786, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 5331554.0, + "repeat_count": 0.0, + "routers_loss": 0.008180000819265842, + "skip_count": 1.0, + "step": 3306, + "text_loss": 0.4023340344429016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0008228667271852294, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 5335712.0, + "repeat_count": 0.0, + "routers_loss": 0.0002942821884062141, + "skip_count": 0.0, + "step": 3308, + "text_loss": 0.5306711792945862 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008226303300082414, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5338701.0, + "repeat_count": 0.0, + "routers_loss": 0.0006134595023468137, + "skip_count": 0.0, + "step": 3310, + "text_loss": 0.5906263589859009 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0008223938091987022, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5342274.0, + "repeat_count": 0.0, + "routers_loss": 0.0016656654188409448, + "skip_count": 0.0, + "step": 3312, + "text_loss": 0.5201764106750488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008221571648472472, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5345185.0, + "repeat_count": 0.0, + "routers_loss": 0.0038612703792750835, + "skip_count": 0.0, + "step": 3314, + "text_loss": 0.36633720993995667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.568241855004402, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008219203970445589, + "loss": 0.011, + "macro_f1": 0.3272727429866791, + "num_tokens": 5348804.0, + "repeat_count": 0.0, + "routers_loss": 0.009782899171113968, + "skip_count": 1.0, + "step": 3316, + "text_loss": 0.3117460012435913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.577634282359847, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008216835058813672, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 5351896.0, + "repeat_count": 0.0, + "routers_loss": 0.007713229861110449, + "skip_count": 0.0, + "step": 3318, + "text_loss": 0.253496378660202 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008214464914484492, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 5355058.0, + "repeat_count": 0.0, + "routers_loss": 0.006227815989404917, + "skip_count": 2.0, + "step": 3320, + "text_loss": 0.32693132758140564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008212093538366292, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 5358365.0, + "repeat_count": 0.0, + "routers_loss": 0.002601418411359191, + "skip_count": 0.0, + "step": 3322, + "text_loss": 0.40394455194473267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 15.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000820972093136779, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5360981.0, + "repeat_count": 0.0, + "routers_loss": 0.005545300897210836, + "skip_count": 3.0, + "step": 3324, + "text_loss": 0.6758295893669128 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0008207347094398172, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 5364018.0, + "repeat_count": 1.0, + "routers_loss": 0.001924700103700161, + "skip_count": 0.0, + "step": 3326, + "text_loss": 0.5196860432624817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0008204972028367097, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5366986.0, + "repeat_count": 0.0, + "routers_loss": 0.012254828587174416, + "skip_count": 1.0, + "step": 3328, + "text_loss": 0.24661913514137268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0008202595734184694, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5371463.0, + "repeat_count": 0.0, + "routers_loss": 0.005094083491712809, + "skip_count": 0.0, + "step": 3330, + "text_loss": 0.2525769770145416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.643381273847961, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008200218212761566, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 5374823.0, + "repeat_count": 1.0, + "routers_loss": 0.0025883198250085115, + "skip_count": 0.0, + "step": 3332, + "text_loss": 0.21849912405014038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.000819783946500878, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5377640.0, + "repeat_count": 0.0, + "routers_loss": 0.008240507915616035, + "skip_count": 0.0, + "step": 3334, + "text_loss": 0.2662734091281891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 15.66216612855885, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.050537109375, + "learning_rate": 0.000819545949183788, + "loss": 0.01, + "macro_f1": 0.5934640765190125, + "num_tokens": 5380593.0, + "repeat_count": 0.0, + "routers_loss": 0.038378193974494934, + "skip_count": 3.0, + "step": 3336, + "text_loss": 0.2431795746088028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008193078294160874, + "loss": 0.0097, + "macro_f1": 1.0, + "num_tokens": 5384487.0, + "repeat_count": 1.0, + "routers_loss": 0.005926199723035097, + "skip_count": 1.0, + "step": 3338, + "text_loss": 0.5663705468177795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0008190695872890242, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5387511.0, + "repeat_count": 0.0, + "routers_loss": 0.010842559859156609, + "skip_count": 2.0, + "step": 3340, + "text_loss": 0.11517292261123657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008188312228938933, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 5390698.0, + "repeat_count": 0.0, + "routers_loss": 0.001304097007960081, + "skip_count": 0.0, + "step": 3342, + "text_loss": 0.4827076196670532 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008185927363220363, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5393778.0, + "repeat_count": 1.0, + "routers_loss": 0.005354117136448622, + "skip_count": 0.0, + "step": 3344, + "text_loss": 0.44467049837112427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008183541276648418, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5396925.0, + "repeat_count": 0.0, + "routers_loss": 0.004800073802471161, + "skip_count": 2.0, + "step": 3346, + "text_loss": 0.2032834142446518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0008181153970137449, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 5400522.0, + "repeat_count": 0.0, + "routers_loss": 0.0021674633026123047, + "skip_count": 0.0, + "step": 3348, + "text_loss": 0.4507528841495514 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.727913120046962, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0008178765444602278, + "loss": 0.0117, + "macro_f1": 0.8820862174034119, + "num_tokens": 5403526.0, + "repeat_count": 2.0, + "routers_loss": 0.04263930395245552, + "skip_count": 2.0, + "step": 3350, + "text_loss": 0.3606615960597992 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008176375700958194, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5407127.0, + "repeat_count": 1.0, + "routers_loss": 0.006953123956918716, + "skip_count": 0.0, + "step": 3352, + "text_loss": 0.2290353775024414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008173984740120948, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 5410829.0, + "repeat_count": 0.0, + "routers_loss": 0.0014363783411681652, + "skip_count": 0.0, + "step": 3354, + "text_loss": 0.4220392405986786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0008171592563006762, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5414152.0, + "repeat_count": 0.0, + "routers_loss": 0.00202389364130795, + "skip_count": 1.0, + "step": 3356, + "text_loss": 0.37729766964912415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008169199170532323, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 5417312.0, + "repeat_count": 0.0, + "routers_loss": 0.006253739818930626, + "skip_count": 2.0, + "step": 3358, + "text_loss": 0.1304289996623993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0008166804563614785, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 5421227.0, + "repeat_count": 2.0, + "routers_loss": 0.01622140221297741, + "skip_count": 2.0, + "step": 3360, + "text_loss": 0.298664391040802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0008164408743171763, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 5424646.0, + "repeat_count": 1.0, + "routers_loss": 0.0037176944315433502, + "skip_count": 2.0, + "step": 3362, + "text_loss": 0.12147632241249084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008162011710121339, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 5427897.0, + "repeat_count": 0.0, + "routers_loss": 0.0020403533708304167, + "skip_count": 1.0, + "step": 3364, + "text_loss": 0.2656533420085907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.803052538890519, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008159613465382066, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5430474.0, + "repeat_count": 0.0, + "routers_loss": 0.0018634048756211996, + "skip_count": 0.0, + "step": 3366, + "text_loss": 0.9133086204528809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0008157214009872951, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5433113.0, + "repeat_count": 0.0, + "routers_loss": 0.012944488786160946, + "skip_count": 2.0, + "step": 3368, + "text_loss": 0.24352453649044037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05712890625, + "learning_rate": 0.0008154813344513472, + "loss": 0.0143, + "macro_f1": 0.6666666865348816, + "num_tokens": 5436259.0, + "repeat_count": 0.0, + "routers_loss": 0.002347963862121105, + "skip_count": 2.0, + "step": 3370, + "text_loss": 0.7601244449615479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0008152411470223568, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 5439126.0, + "repeat_count": 0.0, + "routers_loss": 0.0016609140438959002, + "skip_count": 0.0, + "step": 3372, + "text_loss": 0.5551947355270386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008150008387923643, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 5442739.0, + "repeat_count": 0.0, + "routers_loss": 0.008321396075189114, + "skip_count": 0.0, + "step": 3374, + "text_loss": 0.25028282403945923 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 15.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.000814760409853456, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 5445247.0, + "repeat_count": 2.0, + "routers_loss": 0.009738070890307426, + "skip_count": 1.0, + "step": 3376, + "text_loss": 0.37271201610565186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008145198602977651, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5449044.0, + "repeat_count": 0.0, + "routers_loss": 0.0028421466704458, + "skip_count": 0.0, + "step": 3378, + "text_loss": 0.1458655595779419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.868799530378633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0008142791902174701, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 5453063.0, + "repeat_count": 0.0, + "routers_loss": 0.0015170135302469134, + "skip_count": 0.0, + "step": 3380, + "text_loss": 0.5548722743988037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0008140383997047966, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 5455814.0, + "repeat_count": 0.0, + "routers_loss": 0.0022444510832428932, + "skip_count": 1.0, + "step": 3382, + "text_loss": 0.8034513592720032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000813797488852016, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5459392.0, + "repeat_count": 0.0, + "routers_loss": 0.00038578867679461837, + "skip_count": 0.0, + "step": 3384, + "text_loss": 0.6940088868141174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.896976812444967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008135564577514458, + "loss": 0.011, + "macro_f1": 0.3333333432674408, + "num_tokens": 5462413.0, + "repeat_count": 0.0, + "routers_loss": 0.0019727381877601147, + "skip_count": 0.0, + "step": 3386, + "text_loss": 0.5124650597572327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0008133153064954495, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 5465552.0, + "repeat_count": 0.0, + "routers_loss": 0.0019896167796105146, + "skip_count": 0.0, + "step": 3388, + "text_loss": 0.4292517900466919 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008130740351764367, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 5468573.0, + "repeat_count": 1.0, + "routers_loss": 0.0030118159484118223, + "skip_count": 1.0, + "step": 3390, + "text_loss": 0.48903173208236694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.000812832643886863, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5471547.0, + "repeat_count": 0.0, + "routers_loss": 0.005084246397018433, + "skip_count": 2.0, + "step": 3392, + "text_loss": 0.35789889097213745 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008125911327192299, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5474331.0, + "repeat_count": 0.0, + "routers_loss": 0.0008874498889781535, + "skip_count": 0.0, + "step": 3394, + "text_loss": 0.6267408728599548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008123495017660851, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5477633.0, + "repeat_count": 0.0, + "routers_loss": 0.001794386887922883, + "skip_count": 0.0, + "step": 3396, + "text_loss": 0.3701885938644409 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008121077511200221, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5481277.0, + "repeat_count": 0.0, + "routers_loss": 0.002140481723472476, + "skip_count": 0.0, + "step": 3398, + "text_loss": 0.6362857818603516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.00081186588087368, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 5484237.0, + "repeat_count": 0.0, + "routers_loss": 0.000867189432028681, + "skip_count": 0.0, + "step": 3400, + "text_loss": 1.0847382545471191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008116238911197442, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5487423.0, + "repeat_count": 0.0, + "routers_loss": 0.0029817656613886356, + "skip_count": 0.0, + "step": 3402, + "text_loss": 0.3813740313053131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008113817819509454, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5490155.0, + "repeat_count": 0.0, + "routers_loss": 0.0035141287371516228, + "skip_count": 0.0, + "step": 3404, + "text_loss": 0.2113083451986313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0008111395534600603, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5493415.0, + "repeat_count": 0.0, + "routers_loss": 0.003317659953609109, + "skip_count": 0.0, + "step": 3406, + "text_loss": 0.5869330167770386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008108972057399114, + "loss": 0.0123, + "macro_f1": 0.6666666865348816, + "num_tokens": 5496032.0, + "repeat_count": 0.0, + "routers_loss": 0.003833734430372715, + "skip_count": 2.0, + "step": 3408, + "text_loss": 0.2938928008079529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11328125, + "learning_rate": 0.0008106547388833669, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 5498890.0, + "repeat_count": 0.0, + "routers_loss": 0.002622978063300252, + "skip_count": 1.0, + "step": 3410, + "text_loss": 0.3130980432033539 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008104121529833402, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 5502010.0, + "repeat_count": 1.0, + "routers_loss": 0.007447598036378622, + "skip_count": 0.0, + "step": 3412, + "text_loss": 0.4413072466850281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.000810169448132791, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5505212.0, + "repeat_count": 0.0, + "routers_loss": 0.0031087708193808794, + "skip_count": 1.0, + "step": 3414, + "text_loss": 0.2910428047180176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.037569709421778, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008099266244247243, + "loss": 0.0082, + "macro_f1": 0.3272727429866791, + "num_tokens": 5508755.0, + "repeat_count": 0.0, + "routers_loss": 0.02510393038392067, + "skip_count": 1.0, + "step": 3416, + "text_loss": 0.33022749423980713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008096836819521903, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5512034.0, + "repeat_count": 0.0, + "routers_loss": 0.0020537273958325386, + "skip_count": 1.0, + "step": 3418, + "text_loss": 0.4731218218803406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0008094406208082853, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5515707.0, + "repeat_count": 0.0, + "routers_loss": 0.004218162503093481, + "skip_count": 2.0, + "step": 3420, + "text_loss": 0.23429590463638306 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 16.065746991488112, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0869140625, + "learning_rate": 0.0008091974410861507, + "loss": 0.0069, + "macro_f1": 0.9265305995941162, + "num_tokens": 5518436.0, + "repeat_count": 1.0, + "routers_loss": 0.013488355092704296, + "skip_count": 3.0, + "step": 3422, + "text_loss": 0.45768749713897705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008089541428789733, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5522368.0, + "repeat_count": 0.0, + "routers_loss": 0.0010335417464375496, + "skip_count": 1.0, + "step": 3424, + "text_loss": 0.43423423171043396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0008087107262799855, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 5526061.0, + "repeat_count": 0.0, + "routers_loss": 0.002134323585778475, + "skip_count": 0.0, + "step": 3426, + "text_loss": 0.4031757414340973 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.0008084671913824651, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5529284.0, + "repeat_count": 0.0, + "routers_loss": 0.0097216060385108, + "skip_count": 2.0, + "step": 3428, + "text_loss": 0.2836039960384369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.000808223538279735, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 5532159.0, + "repeat_count": 0.0, + "routers_loss": 0.001684269867837429, + "skip_count": 0.0, + "step": 3430, + "text_loss": 0.5804527401924133 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008079797670651637, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 5536050.0, + "repeat_count": 1.0, + "routers_loss": 0.013918434269726276, + "skip_count": 1.0, + "step": 3432, + "text_loss": 0.31325826048851013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008077358778321647, + "loss": 0.011, + "macro_f1": 0.3333333432674408, + "num_tokens": 5538885.0, + "repeat_count": 0.0, + "routers_loss": 0.0007751787197776139, + "skip_count": 0.0, + "step": 3434, + "text_loss": 0.783108115196228 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.131493982976224, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008074918706741966, + "loss": 0.0063, + "macro_f1": 0.9262410998344421, + "num_tokens": 5541909.0, + "repeat_count": 3.0, + "routers_loss": 0.021819550544023514, + "skip_count": 2.0, + "step": 3436, + "text_loss": 0.6558083295822144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.14088641033167, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0008072477456847638, + "loss": 0.0057, + "macro_f1": 0.3272727429866791, + "num_tokens": 5545101.0, + "repeat_count": 1.0, + "routers_loss": 0.03309348225593567, + "skip_count": 0.0, + "step": 3438, + "text_loss": 0.9877075552940369 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008070035029574151, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 5548971.0, + "repeat_count": 1.0, + "routers_loss": 0.008696741424500942, + "skip_count": 1.0, + "step": 3440, + "text_loss": 0.24766330420970917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.000806759142585745, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 5552174.0, + "repeat_count": 0.0, + "routers_loss": 0.004240929149091244, + "skip_count": 3.0, + "step": 3442, + "text_loss": 0.37255001068115234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008065146646633927, + "loss": 0.0088, + "macro_f1": 0.6666666865348816, + "num_tokens": 5555005.0, + "repeat_count": 0.0, + "routers_loss": 0.014345484785735607, + "skip_count": 1.0, + "step": 3444, + "text_loss": 0.26157206296920776 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008062700692840428, + "loss": 0.0083, + "macro_f1": 1.0, + "num_tokens": 5559127.0, + "repeat_count": 1.0, + "routers_loss": 0.008315163664519787, + "skip_count": 2.0, + "step": 3446, + "text_loss": 0.21971040964126587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 16.187848547108892, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008060253565414246, + "loss": 0.009, + "macro_f1": 0.5934640765190125, + "num_tokens": 5562254.0, + "repeat_count": 0.0, + "routers_loss": 0.009582413360476494, + "skip_count": 3.0, + "step": 3448, + "text_loss": 0.6758295893669128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0008057805265293124, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 5565515.0, + "repeat_count": 0.0, + "routers_loss": 0.002429503947496414, + "skip_count": 0.0, + "step": 3450, + "text_loss": 0.696592390537262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008055355793415257, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5568392.0, + "repeat_count": 0.0, + "routers_loss": 0.0007724192109890282, + "skip_count": 0.0, + "step": 3452, + "text_loss": 0.7092870473861694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008052905150719285, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 5571090.0, + "repeat_count": 0.0, + "routers_loss": 0.0010859938338398933, + "skip_count": 0.0, + "step": 3454, + "text_loss": 0.6593860387802124 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008050453338144301, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 5574552.0, + "repeat_count": 1.0, + "routers_loss": 0.0030258705373853445, + "skip_count": 1.0, + "step": 3456, + "text_loss": 0.3479384481906891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008048000356629844, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 5577484.0, + "repeat_count": 0.0, + "routers_loss": 0.005052885971963406, + "skip_count": 2.0, + "step": 3458, + "text_loss": 0.21858671307563782 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0008045546207115901, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 5581605.0, + "repeat_count": 1.0, + "routers_loss": 0.009976249188184738, + "skip_count": 3.0, + "step": 3460, + "text_loss": 0.16868001222610474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0008043090890542904, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5584994.0, + "repeat_count": 0.0, + "routers_loss": 0.00270817126147449, + "skip_count": 0.0, + "step": 3462, + "text_loss": 0.785690426826477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008040634407851739, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 5588067.0, + "repeat_count": 0.0, + "routers_loss": 0.0018436965765431523, + "skip_count": 0.0, + "step": 3464, + "text_loss": 0.5006644129753113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0008038176759983731, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5590789.0, + "repeat_count": 0.0, + "routers_loss": 0.008516279980540276, + "skip_count": 2.0, + "step": 3466, + "text_loss": 0.20963478088378906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0008035717947880659, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 5593472.0, + "repeat_count": 0.0, + "routers_loss": 0.0016293043736368418, + "skip_count": 0.0, + "step": 3468, + "text_loss": 0.7376078963279724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0008033257972484742, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5596108.0, + "repeat_count": 0.0, + "routers_loss": 0.002364142332226038, + "skip_count": 0.0, + "step": 3470, + "text_loss": 0.5156455039978027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008030796834738649, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5599103.0, + "repeat_count": 0.0, + "routers_loss": 0.008872323669493198, + "skip_count": 0.0, + "step": 3472, + "text_loss": 0.2996419668197632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008028334535585491, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5602410.0, + "repeat_count": 0.0, + "routers_loss": 0.011508257128298283, + "skip_count": 3.0, + "step": 3474, + "text_loss": 0.25438693165779114 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0008025871075968827, + "loss": 0.0106, + "macro_f1": 1.0, + "num_tokens": 5605424.0, + "repeat_count": 2.0, + "routers_loss": 0.017225435003638268, + "skip_count": 2.0, + "step": 3476, + "text_loss": 0.2549574077129364 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.328734957440563, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008023406456832657, + "loss": 0.0111, + "macro_f1": 0.9262410998344421, + "num_tokens": 5608266.0, + "repeat_count": 3.0, + "routers_loss": 0.039165645837783813, + "skip_count": 2.0, + "step": 3478, + "text_loss": 0.1797947734594345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0008020940679121429, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5611471.0, + "repeat_count": 0.0, + "routers_loss": 0.0009718866203911602, + "skip_count": 0.0, + "step": 3480, + "text_loss": 0.8267702460289001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008018473743780036, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5615046.0, + "repeat_count": 0.0, + "routers_loss": 0.006087122485041618, + "skip_count": 2.0, + "step": 3482, + "text_loss": 0.7267677187919617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000801600565175381, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5618350.0, + "repeat_count": 0.0, + "routers_loss": 0.0007539413054473698, + "skip_count": 0.0, + "step": 3484, + "text_loss": 0.5910211801528931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008013536403988529, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 5621381.0, + "repeat_count": 0.0, + "routers_loss": 0.0008076327503658831, + "skip_count": 0.0, + "step": 3486, + "text_loss": 0.30616798996925354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 16.375697094217788, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008011066001430412, + "loss": 0.0086, + "macro_f1": 0.6122449040412903, + "num_tokens": 5624617.0, + "repeat_count": 0.0, + "routers_loss": 0.023835813626646996, + "skip_count": 4.0, + "step": 3488, + "text_loss": 0.3376443088054657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008008594445026122, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 5627989.0, + "repeat_count": 0.0, + "routers_loss": 0.004226419143378735, + "skip_count": 2.0, + "step": 3490, + "text_loss": 0.8185343146324158 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.394481948928675, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008006121735722767, + "loss": 0.0084, + "macro_f1": 0.32098764181137085, + "num_tokens": 5632286.0, + "repeat_count": 0.0, + "routers_loss": 0.0366671048104763, + "skip_count": 2.0, + "step": 3492, + "text_loss": 0.2209547609090805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008003647874467892, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 5635368.0, + "repeat_count": 1.0, + "routers_loss": 0.012956378981471062, + "skip_count": 0.0, + "step": 3494, + "text_loss": 0.20468664169311523 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0008001172862209485, + "loss": 0.0103, + "macro_f1": 0.6666666865348816, + "num_tokens": 5638440.0, + "repeat_count": 1.0, + "routers_loss": 0.0017375422175973654, + "skip_count": 0.0, + "step": 3496, + "text_loss": 0.6647221446037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 16.42265923099501, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0007998696699895976, + "loss": 0.0091, + "macro_f1": 0.6592592597007751, + "num_tokens": 5641996.0, + "repeat_count": 1.0, + "routers_loss": 0.025240756571292877, + "skip_count": 5.0, + "step": 3498, + "text_loss": 0.23892143368721008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0007996219388476236, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5645071.0, + "repeat_count": 0.0, + "routers_loss": 0.007436830550432205, + "skip_count": 1.0, + "step": 3500, + "text_loss": 0.7580804228782654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0007993740928899571, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 5648175.0, + "repeat_count": 0.0, + "routers_loss": 0.001126602990552783, + "skip_count": 0.0, + "step": 3502, + "text_loss": 0.5281378626823425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0007991261322115737, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 5650973.0, + "repeat_count": 0.0, + "routers_loss": 0.0007907263352535665, + "skip_count": 0.0, + "step": 3504, + "text_loss": 0.25220927596092224 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.000798878056907492, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 5654252.0, + "repeat_count": 2.0, + "routers_loss": 0.006263538729399443, + "skip_count": 2.0, + "step": 3506, + "text_loss": 0.46569153666496277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0007986298670727752, + "loss": 0.0098, + "macro_f1": 0.6666666865348816, + "num_tokens": 5657229.0, + "repeat_count": 0.0, + "routers_loss": 0.004049144219607115, + "skip_count": 3.0, + "step": 3508, + "text_loss": 0.15174436569213867 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 16.479013795127678, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0791015625, + "learning_rate": 0.0007983815628025301, + "loss": 0.0074, + "macro_f1": 0.9262410998344421, + "num_tokens": 5659974.0, + "repeat_count": 2.0, + "routers_loss": 0.0471976138651371, + "skip_count": 3.0, + "step": 3510, + "text_loss": 0.39072203636169434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.488406222483125, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000798133144191907, + "loss": 0.0082, + "macro_f1": 0.3272727429866791, + "num_tokens": 5662893.0, + "repeat_count": 0.0, + "routers_loss": 0.04030488431453705, + "skip_count": 1.0, + "step": 3512, + "text_loss": 0.3562147617340088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0007978846113361009, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 5666476.0, + "repeat_count": 0.0, + "routers_loss": 0.007475079502910376, + "skip_count": 1.0, + "step": 3514, + "text_loss": 0.26518192887306213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0007976359643303497, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 5669647.0, + "repeat_count": 0.0, + "routers_loss": 0.00558585487306118, + "skip_count": 2.0, + "step": 3516, + "text_loss": 0.29284560680389404 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007973872032699354, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 5673491.0, + "repeat_count": 1.0, + "routers_loss": 0.0026981087867170572, + "skip_count": 1.0, + "step": 3518, + "text_loss": 0.35089045763015747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.000797138328250184, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 5676529.0, + "repeat_count": 1.0, + "routers_loss": 0.0027328627184033394, + "skip_count": 0.0, + "step": 3520, + "text_loss": 0.41077399253845215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 16.535368359260346, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007968893393664646, + "loss": 0.01, + "macro_f1": 0.6592592597007751, + "num_tokens": 5679987.0, + "repeat_count": 1.0, + "routers_loss": 0.02695014327764511, + "skip_count": 5.0, + "step": 3522, + "text_loss": 0.44942837953567505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007966402367141903, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 5683185.0, + "repeat_count": 0.0, + "routers_loss": 0.00817026849836111, + "skip_count": 2.0, + "step": 3524, + "text_loss": 0.14528048038482666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0007963910203888176, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 5686544.0, + "repeat_count": 0.0, + "routers_loss": 0.0021973433904349804, + "skip_count": 0.0, + "step": 3526, + "text_loss": 0.22358648478984833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0007961416904858469, + "loss": 0.0078, + "macro_f1": 0.3272727429866791, + "num_tokens": 5689579.0, + "repeat_count": 0.0, + "routers_loss": 0.033712416887283325, + "skip_count": 1.0, + "step": 3528, + "text_loss": 0.3083649277687073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007958922471008217, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5692869.0, + "repeat_count": 0.0, + "routers_loss": 0.011182719841599464, + "skip_count": 2.0, + "step": 3530, + "text_loss": 0.21288011968135834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0007956426903293292, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5696007.0, + "repeat_count": 0.0, + "routers_loss": 0.0015808293828740716, + "skip_count": 0.0, + "step": 3532, + "text_loss": 0.6068631410598755 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.591722923393014, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007953930202670001, + "loss": 0.0062, + "macro_f1": 0.5492662787437439, + "num_tokens": 5699474.0, + "repeat_count": 2.0, + "routers_loss": 0.03205178305506706, + "skip_count": 0.0, + "step": 3534, + "text_loss": 0.4317135512828827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007951432370095084, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 5703483.0, + "repeat_count": 0.0, + "routers_loss": 0.003518853336572647, + "skip_count": 0.0, + "step": 3536, + "text_loss": 0.5432273149490356 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007948933406525715, + "loss": 0.01, + "macro_f1": 1.0, + "num_tokens": 5707301.0, + "repeat_count": 1.0, + "routers_loss": 0.004982157610356808, + "skip_count": 1.0, + "step": 3538, + "text_loss": 0.40061065554618835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0007946433312919502, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5710847.0, + "repeat_count": 0.0, + "routers_loss": 0.003067734418436885, + "skip_count": 0.0, + "step": 3540, + "text_loss": 0.5396234393119812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 16.629292632814792, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007943932090234486, + "loss": 0.0097, + "macro_f1": 0.5492662787437439, + "num_tokens": 5713683.0, + "repeat_count": 0.0, + "routers_loss": 0.03728383034467697, + "skip_count": 2.0, + "step": 3542, + "text_loss": 0.18310914933681488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007941429739429138, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 5716397.0, + "repeat_count": 0.0, + "routers_loss": 0.0025092530995607376, + "skip_count": 3.0, + "step": 3544, + "text_loss": 0.5806207060813904 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007938926261462366, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5719984.0, + "repeat_count": 0.0, + "routers_loss": 0.002493767999112606, + "skip_count": 0.0, + "step": 3546, + "text_loss": 0.38606807589530945 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 16.657469914881126, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05078125, + "learning_rate": 0.0007936421657293507, + "loss": 0.0094, + "macro_f1": 0.8823530077934265, + "num_tokens": 5723571.0, + "repeat_count": 1.0, + "routers_loss": 0.014810923486948013, + "skip_count": 2.0, + "step": 3548, + "text_loss": 0.49558472633361816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0007933915927882327, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5726405.0, + "repeat_count": 0.0, + "routers_loss": 0.00152928801253438, + "skip_count": 0.0, + "step": 3550, + "text_loss": 0.8674797415733337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000793140907418903, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5729955.0, + "repeat_count": 0.0, + "routers_loss": 0.005522782914340496, + "skip_count": 2.0, + "step": 3552, + "text_loss": 0.3274473249912262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007928901097174248, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5733030.0, + "repeat_count": 0.0, + "routers_loss": 0.009207013063132763, + "skip_count": 2.0, + "step": 3554, + "text_loss": 0.18237128853797913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0007926391997799039, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5735978.0, + "repeat_count": 0.0, + "routers_loss": 0.00695531303063035, + "skip_count": 0.0, + "step": 3556, + "text_loss": 0.3266434967517853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007923881777024898, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 5738901.0, + "repeat_count": 0.0, + "routers_loss": 0.002743212040513754, + "skip_count": 1.0, + "step": 3558, + "text_loss": 0.4971913695335388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0007921370435813741, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5741946.0, + "repeat_count": 1.0, + "routers_loss": 0.007037297356873751, + "skip_count": 0.0, + "step": 3560, + "text_loss": 0.5645473599433899 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007918857975127924, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5744987.0, + "repeat_count": 0.0, + "routers_loss": 0.0030746585689485073, + "skip_count": 0.0, + "step": 3562, + "text_loss": 0.17717665433883667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0007916344395930224, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 5747837.0, + "repeat_count": 0.0, + "routers_loss": 0.004522138275206089, + "skip_count": 0.0, + "step": 3564, + "text_loss": 0.7676118612289429 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.000791382969918385, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5750716.0, + "repeat_count": 0.0, + "routers_loss": 0.0026240211445838213, + "skip_count": 0.0, + "step": 3566, + "text_loss": 0.4975173771381378 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.751394188435572, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.000791131388585244, + "loss": 0.011, + "macro_f1": 0.8820862174034119, + "num_tokens": 5754368.0, + "repeat_count": 2.0, + "routers_loss": 0.021831991150975227, + "skip_count": 2.0, + "step": 3568, + "text_loss": 0.9670342206954956 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0007908796956900055, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5757076.0, + "repeat_count": 1.0, + "routers_loss": 0.0017586691537871957, + "skip_count": 0.0, + "step": 3570, + "text_loss": 0.3057977259159088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.000790627891329119, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5760613.0, + "repeat_count": 0.0, + "routers_loss": 0.005515786819159985, + "skip_count": 0.0, + "step": 3572, + "text_loss": 0.5860086679458618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007903759755990763, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 5763557.0, + "repeat_count": 0.0, + "routers_loss": 0.004096484277397394, + "skip_count": 0.0, + "step": 3574, + "text_loss": 0.17175781726837158 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.000790123948596412, + "loss": 0.0119, + "macro_f1": 0.6666666865348816, + "num_tokens": 5767430.0, + "repeat_count": 1.0, + "routers_loss": 0.005216122139245272, + "skip_count": 0.0, + "step": 3576, + "text_loss": 0.7520374059677124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0007898718104177031, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 5770175.0, + "repeat_count": 0.0, + "routers_loss": 0.0037980107590556145, + "skip_count": 0.0, + "step": 3578, + "text_loss": 0.18117885291576385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007896195611595699, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5773032.0, + "repeat_count": 0.0, + "routers_loss": 0.003672175807878375, + "skip_count": 2.0, + "step": 3580, + "text_loss": 0.7241058349609375 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0007893672009186744, + "loss": 0.0083, + "macro_f1": 1.0, + "num_tokens": 5776077.0, + "repeat_count": 1.0, + "routers_loss": 0.01229850109666586, + "skip_count": 3.0, + "step": 3582, + "text_loss": 0.29140418767929077 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007891147297917216, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5779088.0, + "repeat_count": 1.0, + "routers_loss": 0.0035251814406365156, + "skip_count": 0.0, + "step": 3584, + "text_loss": 0.1727485954761505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.000788862147875459, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 5782201.0, + "repeat_count": 0.0, + "routers_loss": 0.004725661128759384, + "skip_count": 2.0, + "step": 3586, + "text_loss": 0.43512848019599915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0007886094552666765, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5785039.0, + "repeat_count": 0.0, + "routers_loss": 0.005632172804325819, + "skip_count": 0.0, + "step": 3588, + "text_loss": 0.3534786105155945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0007883566520622062, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 5788017.0, + "repeat_count": 0.0, + "routers_loss": 0.006249965168535709, + "skip_count": 1.0, + "step": 3590, + "text_loss": 0.2089710384607315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0007881037383589229, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 5791168.0, + "repeat_count": 0.0, + "routers_loss": 0.0013797614956274629, + "skip_count": 0.0, + "step": 3592, + "text_loss": 0.4349329471588135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0007878507142537436, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 5793927.0, + "repeat_count": 0.0, + "routers_loss": 0.0019719740375876427, + "skip_count": 1.0, + "step": 3594, + "text_loss": 0.6087368726730347 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007875975798436274, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 5797214.0, + "repeat_count": 1.0, + "routers_loss": 0.0037070370744913816, + "skip_count": 0.0, + "step": 3596, + "text_loss": 0.4258122444152832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0007873443352255764, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5800691.0, + "repeat_count": 0.0, + "routers_loss": 0.008431311696767807, + "skip_count": 0.0, + "step": 3598, + "text_loss": 0.6006711721420288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0007870909804966337, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5804712.0, + "repeat_count": 0.0, + "routers_loss": 0.0017720256000757217, + "skip_count": 0.0, + "step": 3600, + "text_loss": 0.6055042743682861 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.911065453478134, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0007868375157538861, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 5807670.0, + "repeat_count": 1.0, + "routers_loss": 0.010697763413190842, + "skip_count": 0.0, + "step": 3602, + "text_loss": 0.8039056658744812 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0007865839410944611, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5810880.0, + "repeat_count": 1.0, + "routers_loss": 0.0030022128485143185, + "skip_count": 0.0, + "step": 3604, + "text_loss": 0.596110463142395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0007863302566155295, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5814171.0, + "repeat_count": 0.0, + "routers_loss": 0.006257854867726564, + "skip_count": 2.0, + "step": 3606, + "text_loss": 0.5700319409370422 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0007860764624143031, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 5817607.0, + "repeat_count": 1.0, + "routers_loss": 0.004838473163545132, + "skip_count": 0.0, + "step": 3608, + "text_loss": 0.8319530487060547 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 16.94863516289991, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08154296875, + "learning_rate": 0.0007858225585880369, + "loss": 0.0067, + "macro_f1": 0.8823530077934265, + "num_tokens": 5821452.0, + "repeat_count": 1.0, + "routers_loss": 0.02173662930727005, + "skip_count": 2.0, + "step": 3610, + "text_loss": 0.3738477826118469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007855685452340269, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5824683.0, + "repeat_count": 0.0, + "routers_loss": 0.0032719180453568697, + "skip_count": 0.0, + "step": 3612, + "text_loss": 0.4054839015007019 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.967420017610802, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007853144224496118, + "loss": 0.0093, + "macro_f1": 0.3272727429866791, + "num_tokens": 5827860.0, + "repeat_count": 1.0, + "routers_loss": 0.032171256840229034, + "skip_count": 0.0, + "step": 3614, + "text_loss": 0.18112395703792572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0007850601903321716, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5831651.0, + "repeat_count": 0.0, + "routers_loss": 0.013230946846306324, + "skip_count": 1.0, + "step": 3616, + "text_loss": 0.2698844075202942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.000784805848979129, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5834369.0, + "repeat_count": 0.0, + "routers_loss": 0.00162619655020535, + "skip_count": 0.0, + "step": 3618, + "text_loss": 0.2430931180715561 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0007845513984879477, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 5838102.0, + "repeat_count": 1.0, + "routers_loss": 0.002781603019684553, + "skip_count": 0.0, + "step": 3620, + "text_loss": 0.4968300759792328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007842968389561337, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 5841029.0, + "repeat_count": 0.0, + "routers_loss": 0.0023873315658420324, + "skip_count": 0.0, + "step": 3622, + "text_loss": 0.5842974781990051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0007840421704812346, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 5845158.0, + "repeat_count": 0.0, + "routers_loss": 0.00400173757225275, + "skip_count": 1.0, + "step": 3624, + "text_loss": 0.8312450647354126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00078378739316084, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 5849175.0, + "repeat_count": 0.0, + "routers_loss": 0.0004974664188921452, + "skip_count": 0.0, + "step": 3626, + "text_loss": 0.48637253046035767 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 17.032873495744056, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.10693359375, + "learning_rate": 0.000783532507092581, + "loss": 0.0079, + "macro_f1": 0.9555556178092957, + "num_tokens": 5852020.0, + "repeat_count": 1.0, + "routers_loss": 0.02555239573121071, + "skip_count": 5.0, + "step": 3628, + "text_loss": 0.5407033562660217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007832775123741306, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5854873.0, + "repeat_count": 0.0, + "routers_loss": 0.0025962977670133114, + "skip_count": 0.0, + "step": 3630, + "text_loss": 0.618230938911438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.000783022409103203, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5858086.0, + "repeat_count": 0.0, + "routers_loss": 0.0029271875973790884, + "skip_count": 0.0, + "step": 3632, + "text_loss": 0.21259798109531403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007827671973775542, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 5860886.0, + "repeat_count": 0.0, + "routers_loss": 0.004102068953216076, + "skip_count": 0.0, + "step": 3634, + "text_loss": 0.4991208016872406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0007825118772949819, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5864291.0, + "repeat_count": 0.0, + "routers_loss": 0.0023497689981013536, + "skip_count": 1.0, + "step": 3636, + "text_loss": 0.3878401517868042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0007822564489533255, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 5867155.0, + "repeat_count": 0.0, + "routers_loss": 0.007680345326662064, + "skip_count": 2.0, + "step": 3638, + "text_loss": 0.6132124066352844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0007820009124504653, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5870325.0, + "repeat_count": 0.0, + "routers_loss": 0.0008242831099778414, + "skip_count": 0.0, + "step": 3640, + "text_loss": 0.3552473187446594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.098620487232168, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007817452678843236, + "loss": 0.0073, + "macro_f1": 0.6601307392120361, + "num_tokens": 5873301.0, + "repeat_count": 1.0, + "routers_loss": 0.023831043392419815, + "skip_count": 2.0, + "step": 3642, + "text_loss": 0.18363867700099945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0007814895153528635, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5876225.0, + "repeat_count": 0.0, + "routers_loss": 0.001999989850446582, + "skip_count": 0.0, + "step": 3644, + "text_loss": 0.17581747472286224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0007812336549540903, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5879501.0, + "repeat_count": 0.0, + "routers_loss": 0.001098626758903265, + "skip_count": 0.0, + "step": 3646, + "text_loss": 0.5040884613990784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.126797769298502, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007809776867860499, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 5882608.0, + "repeat_count": 0.0, + "routers_loss": 0.012210183776915073, + "skip_count": 1.0, + "step": 3648, + "text_loss": 0.27114811539649963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00078072161094683, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 5886106.0, + "repeat_count": 0.0, + "routers_loss": 0.005191771313548088, + "skip_count": 2.0, + "step": 3650, + "text_loss": 0.5167917609214783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0007804654275345591, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5889122.0, + "repeat_count": 0.0, + "routers_loss": 0.0016411367105320096, + "skip_count": 1.0, + "step": 3652, + "text_loss": 0.7691274285316467 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.154975051364836, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0007802091366474074, + "loss": 0.005, + "macro_f1": 0.8823530077934265, + "num_tokens": 5892313.0, + "repeat_count": 2.0, + "routers_loss": 0.015627093613147736, + "skip_count": 1.0, + "step": 3654, + "text_loss": 0.4646325409412384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007799527383835858, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5895577.0, + "repeat_count": 0.0, + "routers_loss": 0.0009879748104140162, + "skip_count": 0.0, + "step": 3656, + "text_loss": 0.5587969422340393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007796962328413469, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5898546.0, + "repeat_count": 0.0, + "routers_loss": 0.004864919930696487, + "skip_count": 0.0, + "step": 3658, + "text_loss": 0.6981375813484192 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007794396201189839, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 5901618.0, + "repeat_count": 1.0, + "routers_loss": 0.006617432460188866, + "skip_count": 2.0, + "step": 3660, + "text_loss": 0.22521957755088806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.192544760786618, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007791829003148312, + "loss": 0.0098, + "macro_f1": 0.6601307392120361, + "num_tokens": 5904540.0, + "repeat_count": 1.0, + "routers_loss": 0.0782252699136734, + "skip_count": 2.0, + "step": 3662, + "text_loss": 0.2649642825126648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0007789260735272647, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 5907827.0, + "repeat_count": 0.0, + "routers_loss": 0.0012057392159476876, + "skip_count": 0.0, + "step": 3664, + "text_loss": 0.6943771243095398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.0007786691398547005, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 5911163.0, + "repeat_count": 0.0, + "routers_loss": 0.007476957980543375, + "skip_count": 2.0, + "step": 3666, + "text_loss": 0.1502683162689209 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007784120993955962, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5913948.0, + "repeat_count": 1.0, + "routers_loss": 0.004082011990249157, + "skip_count": 0.0, + "step": 3668, + "text_loss": 0.4127517640590668 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 17.230114470208395, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007781549522484503, + "loss": 0.0066, + "macro_f1": 0.9265305995941162, + "num_tokens": 5917360.0, + "repeat_count": 3.0, + "routers_loss": 0.027505695819854736, + "skip_count": 1.0, + "step": 3670, + "text_loss": 0.23892618715763092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007778976985118018, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 5920524.0, + "repeat_count": 0.0, + "routers_loss": 0.0024977331049740314, + "skip_count": 2.0, + "step": 3672, + "text_loss": 0.5076471567153931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0007776403382842312, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5923632.0, + "repeat_count": 0.0, + "routers_loss": 0.0015700991498306394, + "skip_count": 0.0, + "step": 3674, + "text_loss": 0.6287924647331238 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.25829175227473, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.0007773828716643591, + "loss": 0.0085, + "macro_f1": 0.3272727429866791, + "num_tokens": 5926438.0, + "repeat_count": 1.0, + "routers_loss": 0.05108916014432907, + "skip_count": 0.0, + "step": 3676, + "text_loss": 0.26517006754875183 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007771252987508474, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5930081.0, + "repeat_count": 0.0, + "routers_loss": 0.003439917229115963, + "skip_count": 0.0, + "step": 3678, + "text_loss": 0.5189079642295837 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0007768676196423984, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 5933463.0, + "repeat_count": 1.0, + "routers_loss": 0.001935846172273159, + "skip_count": 1.0, + "step": 3680, + "text_loss": 0.6703575849533081 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 17.286469034341064, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007766098344377553, + "loss": 0.0082, + "macro_f1": 0.31446540355682373, + "num_tokens": 5937098.0, + "repeat_count": 0.0, + "routers_loss": 0.0384826585650444, + "skip_count": 2.0, + "step": 3682, + "text_loss": 0.6424444913864136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0007763519432357018, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 5940436.0, + "repeat_count": 0.0, + "routers_loss": 0.0008654671837575734, + "skip_count": 0.0, + "step": 3684, + "text_loss": 0.4189988672733307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0007760939461350623, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 5943731.0, + "repeat_count": 0.0, + "routers_loss": 0.007468715775758028, + "skip_count": 2.0, + "step": 3686, + "text_loss": 0.2875453233718872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007758358432347019, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5946707.0, + "repeat_count": 0.0, + "routers_loss": 0.001252831774763763, + "skip_count": 0.0, + "step": 3688, + "text_loss": 0.5093055367469788 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007755776346335259, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 5949833.0, + "repeat_count": 0.0, + "routers_loss": 0.001680848654359579, + "skip_count": 0.0, + "step": 3690, + "text_loss": 0.4031114876270294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0007753193204304807, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 5953095.0, + "repeat_count": 0.0, + "routers_loss": 0.0047258250415325165, + "skip_count": 2.0, + "step": 3692, + "text_loss": 0.17632785439491272 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007750609007245524, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 5955971.0, + "repeat_count": 2.0, + "routers_loss": 0.001980359200388193, + "skip_count": 4.0, + "step": 3694, + "text_loss": 0.3423727750778198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0007748023756147679, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 5958948.0, + "repeat_count": 0.0, + "routers_loss": 0.00511702848598361, + "skip_count": 0.0, + "step": 3696, + "text_loss": 0.28279972076416016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007745437452001949, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 5961819.0, + "repeat_count": 0.0, + "routers_loss": 0.0005220443126745522, + "skip_count": 0.0, + "step": 3698, + "text_loss": 0.4793325662612915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.371000880540066, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007742850095799408, + "loss": 0.0084, + "macro_f1": 0.3272727429866791, + "num_tokens": 5964625.0, + "repeat_count": 1.0, + "routers_loss": 0.06411020457744598, + "skip_count": 0.0, + "step": 3700, + "text_loss": 0.2825184464454651 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0007740261688531536, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 5967134.0, + "repeat_count": 0.0, + "routers_loss": 0.004408109001815319, + "skip_count": 3.0, + "step": 3702, + "text_loss": 0.690429151058197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0007737672231190215, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 5969831.0, + "repeat_count": 0.0, + "routers_loss": 0.0006747521692886949, + "skip_count": 0.0, + "step": 3704, + "text_loss": 0.32556024193763733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007735081724767732, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 5973015.0, + "repeat_count": 0.0, + "routers_loss": 0.0020414739847183228, + "skip_count": 0.0, + "step": 3706, + "text_loss": 0.5876469612121582 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0007732490170256769, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 5975778.0, + "repeat_count": 1.0, + "routers_loss": 0.005610425490885973, + "skip_count": 0.0, + "step": 3708, + "text_loss": 0.2968577444553375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007729897568650422, + "loss": 0.0097, + "macro_f1": 0.3333333432674408, + "num_tokens": 5979115.0, + "repeat_count": 0.0, + "routers_loss": 0.001248046406544745, + "skip_count": 0.0, + "step": 3710, + "text_loss": 0.626361608505249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0007727303920942176, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 5982213.0, + "repeat_count": 0.0, + "routers_loss": 0.005791695322841406, + "skip_count": 2.0, + "step": 3712, + "text_loss": 0.4133484661579132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 17.436747872028178, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08740234375, + "learning_rate": 0.0007724709228125922, + "loss": 0.0105, + "macro_f1": 0.5492662787437439, + "num_tokens": 5984930.0, + "repeat_count": 0.0, + "routers_loss": 0.02114664763212204, + "skip_count": 2.0, + "step": 3714, + "text_loss": 0.4646461308002472 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0007722113491195952, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 5988017.0, + "repeat_count": 2.0, + "routers_loss": 0.005913930479437113, + "skip_count": 5.0, + "step": 3716, + "text_loss": 0.15474505722522736 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0007719516711146957, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 5991562.0, + "repeat_count": 0.0, + "routers_loss": 0.0075925313867628574, + "skip_count": 2.0, + "step": 3718, + "text_loss": 0.5293686985969543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.000771691888897403, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 5994675.0, + "repeat_count": 0.0, + "routers_loss": 0.0012335237115621567, + "skip_count": 0.0, + "step": 3720, + "text_loss": 0.5210637450218201 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0007714320025672657, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 5999070.0, + "repeat_count": 0.0, + "routers_loss": 0.010582062415778637, + "skip_count": 2.0, + "step": 3722, + "text_loss": 0.2783571779727936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.4837100088054, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.000771172012223873, + "loss": 0.0078, + "macro_f1": 0.6598639488220215, + "num_tokens": 6002702.0, + "repeat_count": 1.0, + "routers_loss": 0.015008784830570221, + "skip_count": 3.0, + "step": 3724, + "text_loss": 0.358705073595047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007709119179668538, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6005517.0, + "repeat_count": 0.0, + "routers_loss": 0.00111615180503577, + "skip_count": 0.0, + "step": 3726, + "text_loss": 0.45202162861824036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 17.50249486351629, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007706517198958764, + "loss": 0.0096, + "macro_f1": 0.6595745086669922, + "num_tokens": 6009111.0, + "repeat_count": 1.0, + "routers_loss": 0.05215252563357353, + "skip_count": 4.0, + "step": 3728, + "text_loss": 0.20360413193702698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007703914181106497, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6012989.0, + "repeat_count": 0.0, + "routers_loss": 0.010039499960839748, + "skip_count": 3.0, + "step": 3730, + "text_loss": 0.20334361493587494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.52127971822718, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0007701310127109211, + "loss": 0.0062, + "macro_f1": 0.3272727429866791, + "num_tokens": 6016420.0, + "repeat_count": 0.0, + "routers_loss": 0.01090205181390047, + "skip_count": 1.0, + "step": 3732, + "text_loss": 0.47959551215171814 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 17.530672145582624, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007698705037964791, + "loss": 0.0076, + "macro_f1": 0.6225374937057495, + "num_tokens": 6019551.0, + "repeat_count": 0.0, + "routers_loss": 0.02677762135863304, + "skip_count": 5.0, + "step": 3734, + "text_loss": 0.2621438801288605 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.000769609891467151, + "loss": 0.0119, + "macro_f1": 0.6666666865348816, + "num_tokens": 6022262.0, + "repeat_count": 1.0, + "routers_loss": 0.00460716662928462, + "skip_count": 0.0, + "step": 3736, + "text_loss": 0.3433022201061249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0007693491758228037, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6025723.0, + "repeat_count": 0.0, + "routers_loss": 0.0036111194640398026, + "skip_count": 2.0, + "step": 3738, + "text_loss": 0.38703784346580505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007690883569633442, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6028652.0, + "repeat_count": 0.0, + "routers_loss": 0.003299296135082841, + "skip_count": 0.0, + "step": 3740, + "text_loss": 0.24203069508075714 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0007688274349887188, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 6032280.0, + "repeat_count": 0.0, + "routers_loss": 0.003173880511894822, + "skip_count": 0.0, + "step": 3742, + "text_loss": 0.2827291488647461 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0007685664099989131, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6035111.0, + "repeat_count": 0.0, + "routers_loss": 0.0008576177642680705, + "skip_count": 0.0, + "step": 3744, + "text_loss": 0.43613526225090027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007683052820939524, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6038428.0, + "repeat_count": 0.0, + "routers_loss": 0.004335585981607437, + "skip_count": 2.0, + "step": 3746, + "text_loss": 1.0385624170303345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007680440513739015, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6041185.0, + "repeat_count": 0.0, + "routers_loss": 0.0008210531086660922, + "skip_count": 0.0, + "step": 3748, + "text_loss": 0.7070431709289551 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0007677827179388646, + "loss": 0.0089, + "macro_f1": 1.0, + "num_tokens": 6046333.0, + "repeat_count": 1.0, + "routers_loss": 0.003778942162171006, + "skip_count": 1.0, + "step": 3750, + "text_loss": 0.3682238757610321 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08984375, + "learning_rate": 0.000767521281888985, + "loss": 0.009, + "macro_f1": 1.0, + "num_tokens": 6049528.0, + "repeat_count": 1.0, + "routers_loss": 0.002767334459349513, + "skip_count": 1.0, + "step": 3752, + "text_loss": 0.7619418501853943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0007672597433244455, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 6053202.0, + "repeat_count": 0.0, + "routers_loss": 0.004796457476913929, + "skip_count": 2.0, + "step": 3754, + "text_loss": 0.4157083034515381 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0007669981023454682, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 6056609.0, + "repeat_count": 0.0, + "routers_loss": 0.0013067846884950995, + "skip_count": 0.0, + "step": 3756, + "text_loss": 0.4529118537902832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007667363590523142, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 6060504.0, + "repeat_count": 0.0, + "routers_loss": 0.0010285493917763233, + "skip_count": 0.0, + "step": 3758, + "text_loss": 0.8363246321678162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0007664745135452844, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 6063526.0, + "repeat_count": 0.0, + "routers_loss": 0.006289863493293524, + "skip_count": 3.0, + "step": 3760, + "text_loss": 0.5313657522201538 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0007662125659247183, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6067147.0, + "repeat_count": 0.0, + "routers_loss": 0.0028537956532090902, + "skip_count": 0.0, + "step": 3762, + "text_loss": 0.5668109059333801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007659505162909949, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6070350.0, + "repeat_count": 0.0, + "routers_loss": 0.0026814753655344248, + "skip_count": 0.0, + "step": 3764, + "text_loss": 0.4983512759208679 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0007656883647445318, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 6073091.0, + "repeat_count": 0.0, + "routers_loss": 0.005981382913887501, + "skip_count": 1.0, + "step": 3766, + "text_loss": 0.30372318625450134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0007654261113857863, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6076244.0, + "repeat_count": 0.0, + "routers_loss": 0.000803640519734472, + "skip_count": 0.0, + "step": 3768, + "text_loss": 0.6100738048553467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0007651637563152539, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 6078936.0, + "repeat_count": 0.0, + "routers_loss": 0.0013324898900464177, + "skip_count": 0.0, + "step": 3770, + "text_loss": 0.4733821153640747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0007649012996334701, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 6081951.0, + "repeat_count": 1.0, + "routers_loss": 0.0021543330512940884, + "skip_count": 0.0, + "step": 3772, + "text_loss": 0.6794875860214233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007646387414410085, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 6085165.0, + "repeat_count": 0.0, + "routers_loss": 0.0005426189745776355, + "skip_count": 0.0, + "step": 3774, + "text_loss": 0.5886107683181763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007643760818384819, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6088370.0, + "repeat_count": 0.0, + "routers_loss": 0.002537576947361231, + "skip_count": 0.0, + "step": 3776, + "text_loss": 0.23591920733451843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007641133209265423, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6092319.0, + "repeat_count": 0.0, + "routers_loss": 0.002613696036860347, + "skip_count": 0.0, + "step": 3778, + "text_loss": 0.3217754662036896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0007638504588058796, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 6095799.0, + "repeat_count": 0.0, + "routers_loss": 0.0007219464750960469, + "skip_count": 0.0, + "step": 3780, + "text_loss": 0.4276983141899109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0007635874955772234, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6098789.0, + "repeat_count": 0.0, + "routers_loss": 0.005965052172541618, + "skip_count": 3.0, + "step": 3782, + "text_loss": 0.30936646461486816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0007633244313413417, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6101631.0, + "repeat_count": 0.0, + "routers_loss": 0.0007469559786841273, + "skip_count": 0.0, + "step": 3784, + "text_loss": 0.44460123777389526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007630612661990412, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 6105097.0, + "repeat_count": 0.0, + "routers_loss": 0.004300760570913553, + "skip_count": 1.0, + "step": 3786, + "text_loss": 0.41950157284736633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007627980002511672, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6107847.0, + "repeat_count": 0.0, + "routers_loss": 0.0023050960153341293, + "skip_count": 1.0, + "step": 3788, + "text_loss": 0.48561373353004456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007625346335986039, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6110546.0, + "repeat_count": 0.0, + "routers_loss": 0.0018124044872820377, + "skip_count": 0.0, + "step": 3790, + "text_loss": 0.20882295072078705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007622711663422735, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6113600.0, + "repeat_count": 0.0, + "routers_loss": 0.0007613401976414025, + "skip_count": 0.0, + "step": 3792, + "text_loss": 0.31751760840415955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007620075985831375, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 6116916.0, + "repeat_count": 0.0, + "routers_loss": 0.005452962126582861, + "skip_count": 2.0, + "step": 3794, + "text_loss": 0.3246645927429199 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 17.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007617439304221956, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 6120056.0, + "repeat_count": 2.0, + "routers_loss": 0.0043787881731987, + "skip_count": 0.0, + "step": 3796, + "text_loss": 0.4859195947647095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0007614801619604856, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6122668.0, + "repeat_count": 0.0, + "routers_loss": 0.0033891722559928894, + "skip_count": 0.0, + "step": 3798, + "text_loss": 0.48194369673728943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0007612162932990845, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6126792.0, + "repeat_count": 0.0, + "routers_loss": 0.001883238204754889, + "skip_count": 0.0, + "step": 3800, + "text_loss": 0.3740062117576599 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007609523245391068, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 6129801.0, + "repeat_count": 0.0, + "routers_loss": 0.00882677361369133, + "skip_count": 2.0, + "step": 3802, + "text_loss": 0.5759486556053162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007606882557817062, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6133613.0, + "repeat_count": 0.0, + "routers_loss": 0.009537030011415482, + "skip_count": 2.0, + "step": 3804, + "text_loss": 0.3217554986476898 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0007604240871280742, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6137784.0, + "repeat_count": 0.0, + "routers_loss": 0.0023913346230983734, + "skip_count": 0.0, + "step": 3806, + "text_loss": 0.3718445599079132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.878191957734078, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007601598186794407, + "loss": 0.0081, + "macro_f1": 0.6603773832321167, + "num_tokens": 6141356.0, + "repeat_count": 1.0, + "routers_loss": 0.033796411007642746, + "skip_count": 1.0, + "step": 3808, + "text_loss": 0.2717749774456024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000759895450537074, + "loss": 0.01, + "macro_f1": 0.6666666865348816, + "num_tokens": 6144448.0, + "repeat_count": 0.0, + "routers_loss": 0.0037919918540865183, + "skip_count": 2.0, + "step": 3810, + "text_loss": 0.5935076475143433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007596309828022803, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6147526.0, + "repeat_count": 0.0, + "routers_loss": 0.0008182782912626863, + "skip_count": 0.0, + "step": 3812, + "text_loss": 0.449336439371109 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0007593664155764044, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6150620.0, + "repeat_count": 1.0, + "routers_loss": 0.001734903547912836, + "skip_count": 0.0, + "step": 3814, + "text_loss": 0.6647221446037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.915761667155856, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0007591017489608286, + "loss": 0.0088, + "macro_f1": 0.3272727429866791, + "num_tokens": 6153714.0, + "repeat_count": 1.0, + "routers_loss": 0.04721754416823387, + "skip_count": 0.0, + "step": 3816, + "text_loss": 0.25481200218200684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007588369830569738, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6156974.0, + "repeat_count": 0.0, + "routers_loss": 0.0002484306460246444, + "skip_count": 0.0, + "step": 3818, + "text_loss": 0.7195295691490173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007585721179662988, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6159660.0, + "repeat_count": 0.0, + "routers_loss": 0.0051363613456487656, + "skip_count": 2.0, + "step": 3820, + "text_loss": 0.5073586702346802 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007583071537903005, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6163146.0, + "repeat_count": 0.0, + "routers_loss": 0.006719176657497883, + "skip_count": 0.0, + "step": 3822, + "text_loss": 0.6950558423995972 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0007580420906305136, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 6166257.0, + "repeat_count": 1.0, + "routers_loss": 0.00871267355978489, + "skip_count": 3.0, + "step": 3824, + "text_loss": 0.2549148201942444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0007577769285885109, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 6169624.0, + "repeat_count": 0.0, + "routers_loss": 0.0015642556827515364, + "skip_count": 0.0, + "step": 3826, + "text_loss": 0.3720305860042572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0007575116677659029, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6172673.0, + "repeat_count": 0.0, + "routers_loss": 0.0011551049537956715, + "skip_count": 0.0, + "step": 3828, + "text_loss": 0.6819429397583008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0007572463082643377, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 6175414.0, + "repeat_count": 0.0, + "routers_loss": 0.0008922060951590538, + "skip_count": 0.0, + "step": 3830, + "text_loss": 0.5424665212631226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007569808501855023, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 6178701.0, + "repeat_count": 0.0, + "routers_loss": 0.004167596809566021, + "skip_count": 1.0, + "step": 3832, + "text_loss": 0.4429764151573181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.00075671529363112, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 6183036.0, + "repeat_count": 0.0, + "routers_loss": 0.0008732969872653484, + "skip_count": 0.0, + "step": 3834, + "text_loss": 0.8015334010124207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007564496387029531, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 6186325.0, + "repeat_count": 0.0, + "routers_loss": 0.0021374202333390713, + "skip_count": 1.0, + "step": 3836, + "text_loss": 0.4233771562576294 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000756183885502801, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6189919.0, + "repeat_count": 1.0, + "routers_loss": 0.004017227329313755, + "skip_count": 0.0, + "step": 3838, + "text_loss": 0.33691394329071045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.0007559180341325005, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6193412.0, + "repeat_count": 0.0, + "routers_loss": 0.0013120946241542697, + "skip_count": 0.0, + "step": 3840, + "text_loss": 0.14970099925994873 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 18.037569709421778, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007556520846939265, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 6196588.0, + "repeat_count": 0.0, + "routers_loss": 0.011793316341936588, + "skip_count": 2.0, + "step": 3842, + "text_loss": 0.2714047133922577 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0007553860372889914, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 6200841.0, + "repeat_count": 1.0, + "routers_loss": 0.019968654960393906, + "skip_count": 4.0, + "step": 3844, + "text_loss": 0.23680976033210754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 18.05635456413267, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052490234375, + "learning_rate": 0.0007551198920196452, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 6203797.0, + "repeat_count": 0.0, + "routers_loss": 0.013615630567073822, + "skip_count": 2.0, + "step": 3846, + "text_loss": 0.25839608907699585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.000754853648987875, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6206790.0, + "repeat_count": 0.0, + "routers_loss": 0.002420815173536539, + "skip_count": 1.0, + "step": 3848, + "text_loss": 0.5358025431632996 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 18.07513941884356, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007545873082957057, + "loss": 0.0072, + "macro_f1": 0.9265305995941162, + "num_tokens": 6209791.0, + "repeat_count": 1.0, + "routers_loss": 0.018236197531223297, + "skip_count": 3.0, + "step": 3850, + "text_loss": 0.1463700383901596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0007543208700451998, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6212792.0, + "repeat_count": 0.0, + "routers_loss": 0.006242573726922274, + "skip_count": 3.0, + "step": 3852, + "text_loss": 0.9441591501235962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.093924273554446, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007540543343384565, + "loss": 0.0062, + "macro_f1": 0.3272727429866791, + "num_tokens": 6215747.0, + "repeat_count": 0.0, + "routers_loss": 0.01451140083372593, + "skip_count": 1.0, + "step": 3854, + "text_loss": 0.41610902547836304 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007537877012776132, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6218593.0, + "repeat_count": 0.0, + "routers_loss": 0.00037674361374229193, + "skip_count": 0.0, + "step": 3856, + "text_loss": 0.6048852205276489 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0007535209709648439, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 6221315.0, + "repeat_count": 1.0, + "routers_loss": 0.005776284262537956, + "skip_count": 3.0, + "step": 3858, + "text_loss": 0.35627537965774536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0007532541435023605, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 6225012.0, + "repeat_count": 0.0, + "routers_loss": 0.0009280376834794879, + "skip_count": 0.0, + "step": 3860, + "text_loss": 0.6440183520317078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0007529872189924114, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6227650.0, + "repeat_count": 0.0, + "routers_loss": 0.0009876530384644866, + "skip_count": 0.0, + "step": 3862, + "text_loss": 0.35507893562316895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.14088641033167, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0007527201975372827, + "loss": 0.0045, + "macro_f1": 0.6603773832321167, + "num_tokens": 6230557.0, + "repeat_count": 1.0, + "routers_loss": 0.013780162669718266, + "skip_count": 1.0, + "step": 3864, + "text_loss": 0.38958442211151123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007524530792392977, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 6233371.0, + "repeat_count": 0.0, + "routers_loss": 0.004849869292229414, + "skip_count": 3.0, + "step": 3866, + "text_loss": 0.3826720714569092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0007521858642008163, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6236770.0, + "repeat_count": 0.0, + "routers_loss": 0.008618295192718506, + "skip_count": 1.0, + "step": 3868, + "text_loss": 0.3596078157424927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0007519185525242363, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6239661.0, + "repeat_count": 0.0, + "routers_loss": 0.0013421972980722785, + "skip_count": 0.0, + "step": 3870, + "text_loss": 0.5585550665855408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0007516511443119916, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 6242459.0, + "repeat_count": 0.0, + "routers_loss": 0.0038009448908269405, + "skip_count": 1.0, + "step": 3872, + "text_loss": 0.4418395757675171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007513836396665534, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 6245489.0, + "repeat_count": 1.0, + "routers_loss": 0.002785376040264964, + "skip_count": 2.0, + "step": 3874, + "text_loss": 0.551510751247406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0007511160386904305, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6249014.0, + "repeat_count": 0.0, + "routers_loss": 0.0021424589212983847, + "skip_count": 1.0, + "step": 3876, + "text_loss": 1.0502676963806152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0007508483414861679, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6252357.0, + "repeat_count": 0.0, + "routers_loss": 0.0085759861394763, + "skip_count": 1.0, + "step": 3878, + "text_loss": 0.49212515354156494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007505805481563477, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6254975.0, + "repeat_count": 0.0, + "routers_loss": 0.0010723904706537724, + "skip_count": 0.0, + "step": 3880, + "text_loss": 0.7022985816001892 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0007503126588035887, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 6258001.0, + "repeat_count": 1.0, + "routers_loss": 0.012809890322387218, + "skip_count": 2.0, + "step": 3882, + "text_loss": 0.1829151213169098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007500446735305466, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 6261795.0, + "repeat_count": 0.0, + "routers_loss": 0.0026790346018970013, + "skip_count": 1.0, + "step": 3884, + "text_loss": 0.20436066389083862 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.000749776592439914, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 6265585.0, + "repeat_count": 1.0, + "routers_loss": 0.005243788007646799, + "skip_count": 2.0, + "step": 3886, + "text_loss": 0.4479229748249054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00074950841563442, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 6269039.0, + "repeat_count": 0.0, + "routers_loss": 0.007998534478247166, + "skip_count": 1.0, + "step": 3888, + "text_loss": 0.2154676914215088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0007492401432168303, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6272315.0, + "repeat_count": 0.0, + "routers_loss": 0.004648822825402021, + "skip_count": 1.0, + "step": 3890, + "text_loss": 0.3375042676925659 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.272380393307895, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007489717752899477, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 6275342.0, + "repeat_count": 0.0, + "routers_loss": 0.012154200114309788, + "skip_count": 1.0, + "step": 3892, + "text_loss": 0.1964082419872284 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.000748703311956611, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6278700.0, + "repeat_count": 1.0, + "routers_loss": 0.004610476549714804, + "skip_count": 2.0, + "step": 3894, + "text_loss": 0.26545581221580505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0007484347533196961, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 6281864.0, + "repeat_count": 0.0, + "routers_loss": 0.0075586591847240925, + "skip_count": 2.0, + "step": 3896, + "text_loss": 0.3106999397277832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0007481660994821151, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6284676.0, + "repeat_count": 0.0, + "routers_loss": 0.007845268584787846, + "skip_count": 1.0, + "step": 3898, + "text_loss": 0.4094304144382477 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007478973505468165, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 6287470.0, + "repeat_count": 1.0, + "routers_loss": 0.011116391979157925, + "skip_count": 2.0, + "step": 3900, + "text_loss": 0.1838909536600113 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007476285066167857, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 6290432.0, + "repeat_count": 1.0, + "routers_loss": 0.004599364474415779, + "skip_count": 0.0, + "step": 3902, + "text_loss": 0.25872838497161865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0007473595677950439, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 6293557.0, + "repeat_count": 0.0, + "routers_loss": 0.0016367282951250672, + "skip_count": 1.0, + "step": 3904, + "text_loss": 0.5272360444068909 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007470905341846492, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 6295979.0, + "repeat_count": 0.0, + "routers_loss": 0.0004760588926728815, + "skip_count": 0.0, + "step": 3906, + "text_loss": 0.666959822177887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007468214058886956, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6299215.0, + "repeat_count": 0.0, + "routers_loss": 0.000524883100297302, + "skip_count": 0.0, + "step": 3908, + "text_loss": 0.5144801139831543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007465521830103137, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6302320.0, + "repeat_count": 0.0, + "routers_loss": 0.0016085522947832942, + "skip_count": 0.0, + "step": 3910, + "text_loss": 0.14342890679836273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007462828656526702, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6305212.0, + "repeat_count": 0.0, + "routers_loss": 0.002720315707847476, + "skip_count": 2.0, + "step": 3912, + "text_loss": 0.31109121441841125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0007460134539189681, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 6308964.0, + "repeat_count": 0.0, + "routers_loss": 0.0010418406454846263, + "skip_count": 1.0, + "step": 3914, + "text_loss": 0.5662030577659607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0007457439479124459, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 6313195.0, + "repeat_count": 0.0, + "routers_loss": 0.0020303844939917326, + "skip_count": 0.0, + "step": 3916, + "text_loss": 0.6358339190483093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007454743477363797, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 6315949.0, + "repeat_count": 0.0, + "routers_loss": 0.0006592223653569818, + "skip_count": 0.0, + "step": 3918, + "text_loss": 0.35648423433303833 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.403874376284122, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007452046534940803, + "loss": 0.0075, + "macro_f1": 0.6603773832321167, + "num_tokens": 6319024.0, + "repeat_count": 1.0, + "routers_loss": 0.024555351585149765, + "skip_count": 1.0, + "step": 3920, + "text_loss": 0.21955153346061707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0007449348652888952, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6321633.0, + "repeat_count": 0.0, + "routers_loss": 0.003606822807341814, + "skip_count": 1.0, + "step": 3922, + "text_loss": 0.6079489588737488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007446649832242075, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 6325209.0, + "repeat_count": 0.0, + "routers_loss": 0.0035831446293741465, + "skip_count": 1.0, + "step": 3924, + "text_loss": 0.2774808406829834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0007443950074034368, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6327822.0, + "repeat_count": 0.0, + "routers_loss": 0.006809544749557972, + "skip_count": 2.0, + "step": 3926, + "text_loss": 0.48236769437789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.4414440857059, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0007441249379300381, + "loss": 0.007, + "macro_f1": 0.6601307392120361, + "num_tokens": 6331662.0, + "repeat_count": 1.0, + "routers_loss": 0.023832591250538826, + "skip_count": 2.0, + "step": 3928, + "text_loss": 0.7287537455558777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007438547749075028, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 6335801.0, + "repeat_count": 1.0, + "routers_loss": 0.011755098588764668, + "skip_count": 3.0, + "step": 3930, + "text_loss": 0.17253030836582184 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0007435845184393577, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6338747.0, + "repeat_count": 1.0, + "routers_loss": 0.005972472485154867, + "skip_count": 0.0, + "step": 3932, + "text_loss": 0.6400216817855835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007433141686291657, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 6342772.0, + "repeat_count": 0.0, + "routers_loss": 0.0030393085908144712, + "skip_count": 1.0, + "step": 3934, + "text_loss": 0.6865074038505554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0007430437255805252, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6345957.0, + "repeat_count": 0.0, + "routers_loss": 0.0006984061910770833, + "skip_count": 0.0, + "step": 3936, + "text_loss": 0.40398702025413513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0007427731893970706, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 6349162.0, + "repeat_count": 1.0, + "routers_loss": 0.005219762213528156, + "skip_count": 0.0, + "step": 3938, + "text_loss": 0.5951031446456909 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007425025601824717, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 6352655.0, + "repeat_count": 0.0, + "routers_loss": 0.015575960278511047, + "skip_count": 3.0, + "step": 3940, + "text_loss": 0.26689088344573975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007422318380404346, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6355890.0, + "repeat_count": 0.0, + "routers_loss": 0.0012208883417770267, + "skip_count": 0.0, + "step": 3942, + "text_loss": 0.570725679397583 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0007419610230746999, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6358891.0, + "repeat_count": 1.0, + "routers_loss": 0.0029412026051431894, + "skip_count": 0.0, + "step": 3944, + "text_loss": 0.5521301031112671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007416901153890448, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6361586.0, + "repeat_count": 0.0, + "routers_loss": 0.0010283910669386387, + "skip_count": 0.0, + "step": 3946, + "text_loss": 0.4046417772769928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0007414191150872818, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6364954.0, + "repeat_count": 0.0, + "routers_loss": 0.008222512900829315, + "skip_count": 2.0, + "step": 3948, + "text_loss": 0.2803446352481842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007411480222732583, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6367660.0, + "repeat_count": 0.0, + "routers_loss": 0.001304348581470549, + "skip_count": 0.0, + "step": 3950, + "text_loss": 0.45553359389305115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007408768370508576, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6371585.0, + "repeat_count": 0.0, + "routers_loss": 0.0016345062758773565, + "skip_count": 0.0, + "step": 3952, + "text_loss": 0.25424402952194214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007406055595239986, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6374365.0, + "repeat_count": 0.0, + "routers_loss": 0.0005097290268167853, + "skip_count": 0.0, + "step": 3954, + "text_loss": 0.5856026411056519 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0007403341897966356, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6377335.0, + "repeat_count": 0.0, + "routers_loss": 0.002482263371348381, + "skip_count": 1.0, + "step": 3956, + "text_loss": 0.5145615339279175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0007400627279727574, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 6380799.0, + "repeat_count": 0.0, + "routers_loss": 0.0011743451468646526, + "skip_count": 0.0, + "step": 3958, + "text_loss": 0.31868961453437805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007397911741563892, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6383963.0, + "repeat_count": 1.0, + "routers_loss": 0.009861881844699383, + "skip_count": 0.0, + "step": 3960, + "text_loss": 0.21192194521427155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007395195284515905, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 6387410.0, + "repeat_count": 1.0, + "routers_loss": 0.004189098719507456, + "skip_count": 0.0, + "step": 3962, + "text_loss": 0.5809708833694458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007392477909624567, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6390670.0, + "repeat_count": 0.0, + "routers_loss": 0.001853612600825727, + "skip_count": 0.0, + "step": 3964, + "text_loss": 0.48985618352890015 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0007389759617931182, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6393609.0, + "repeat_count": 1.0, + "routers_loss": 0.003303771372884512, + "skip_count": 0.0, + "step": 3966, + "text_loss": 0.28729453682899475 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.629292632814792, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007387040410477404, + "loss": 0.0058, + "macro_f1": 0.9452888369560242, + "num_tokens": 6396608.0, + "repeat_count": 1.0, + "routers_loss": 0.01791577786207199, + "skip_count": 4.0, + "step": 3968, + "text_loss": 0.30386820435523987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0007384320288305235, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6399793.0, + "repeat_count": 0.0, + "routers_loss": 0.0005771282012574375, + "skip_count": 0.0, + "step": 3970, + "text_loss": 0.47285011410713196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0007381599252457037, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6403365.0, + "repeat_count": 0.0, + "routers_loss": 0.003010645741596818, + "skip_count": 0.0, + "step": 3972, + "text_loss": 0.5313063859939575 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000737887730397551, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6406205.0, + "repeat_count": 1.0, + "routers_loss": 0.006457438692450523, + "skip_count": 0.0, + "step": 3974, + "text_loss": 0.2323843240737915 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007376154443903713, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6409552.0, + "repeat_count": 1.0, + "routers_loss": 0.010693981312215328, + "skip_count": 0.0, + "step": 3976, + "text_loss": 0.6304101943969727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.676254769592017, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007373430673285051, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 6412386.0, + "repeat_count": 1.0, + "routers_loss": 0.03116440214216709, + "skip_count": 0.0, + "step": 3978, + "text_loss": 0.23448467254638672 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.68564719694746, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007370705993163278, + "loss": 0.0111, + "macro_f1": 0.3272727429866791, + "num_tokens": 6416054.0, + "repeat_count": 1.0, + "routers_loss": 0.011973714455962181, + "skip_count": 0.0, + "step": 3980, + "text_loss": 0.6371755599975586 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007367980404582497, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 6419238.0, + "repeat_count": 1.0, + "routers_loss": 0.005117347463965416, + "skip_count": 2.0, + "step": 3982, + "text_loss": 0.19822923839092255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0007365253908587158, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 6422122.0, + "repeat_count": 0.0, + "routers_loss": 0.0010648667812347412, + "skip_count": 0.0, + "step": 3984, + "text_loss": 0.566700279712677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0007362526506222058, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6425313.0, + "repeat_count": 0.0, + "routers_loss": 0.005726494826376438, + "skip_count": 0.0, + "step": 3986, + "text_loss": 0.6568437814712524 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007359798198532343, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 6428422.0, + "repeat_count": 1.0, + "routers_loss": 0.004504100419580936, + "skip_count": 0.0, + "step": 3988, + "text_loss": 0.598754346370697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007357068986563509, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 6431512.0, + "repeat_count": 0.0, + "routers_loss": 0.0019837068393826485, + "skip_count": 1.0, + "step": 3990, + "text_loss": 0.7152895927429199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007354338871361393, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6434358.0, + "repeat_count": 0.0, + "routers_loss": 0.0026031541638076305, + "skip_count": 1.0, + "step": 3992, + "text_loss": 0.4986513555049896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.000735160785397218, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6438175.0, + "repeat_count": 0.0, + "routers_loss": 0.0024831905029714108, + "skip_count": 2.0, + "step": 3994, + "text_loss": 0.4406205713748932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007348875935442401, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6441228.0, + "repeat_count": 0.0, + "routers_loss": 0.0008635876583866775, + "skip_count": 0.0, + "step": 3996, + "text_loss": 0.48884135484695435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007346143116818932, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6444318.0, + "repeat_count": 0.0, + "routers_loss": 0.004007008858025074, + "skip_count": 0.0, + "step": 3998, + "text_loss": 0.6669428944587708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0007343409399148994, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6448317.0, + "repeat_count": 0.0, + "routers_loss": 0.0031380734872072935, + "skip_count": 0.0, + "step": 4000, + "text_loss": 0.6468493938446045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0007340674783480154, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 6451673.0, + "repeat_count": 0.0, + "routers_loss": 0.004996029660105705, + "skip_count": 0.0, + "step": 4002, + "text_loss": 0.28135430812835693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.798356325212797, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007337939270860323, + "loss": 0.009, + "macro_f1": 0.3272727429866791, + "num_tokens": 6456372.0, + "repeat_count": 1.0, + "routers_loss": 0.03784399852156639, + "skip_count": 0.0, + "step": 4004, + "text_loss": 0.41668644547462463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007335202862337753, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6459047.0, + "repeat_count": 0.0, + "routers_loss": 0.0011750755365937948, + "skip_count": 0.0, + "step": 4006, + "text_loss": 0.6853910684585571 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.817141179923688, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.05908203125, + "learning_rate": 0.000733246555896104, + "loss": 0.0062, + "macro_f1": 0.9452888369560242, + "num_tokens": 6462390.0, + "repeat_count": 1.0, + "routers_loss": 0.01630394533276558, + "skip_count": 4.0, + "step": 4008, + "text_loss": 0.7110592126846313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0007329727361779124, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6466057.0, + "repeat_count": 0.0, + "routers_loss": 0.0052404399029910564, + "skip_count": 2.0, + "step": 4010, + "text_loss": 0.13856995105743408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.000732698827184129, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6468878.0, + "repeat_count": 0.0, + "routers_loss": 0.002138581359758973, + "skip_count": 0.0, + "step": 4012, + "text_loss": 0.3999565839767456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000732424829019716, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6472364.0, + "repeat_count": 0.0, + "routers_loss": 0.0037466560024768114, + "skip_count": 0.0, + "step": 4014, + "text_loss": 0.28161346912384033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007321507417896699, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 6475379.0, + "repeat_count": 0.0, + "routers_loss": 0.0010469373082742095, + "skip_count": 0.0, + "step": 4016, + "text_loss": 1.0490952730178833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0007318765655990218, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 6478585.0, + "repeat_count": 0.0, + "routers_loss": 0.009968385100364685, + "skip_count": 2.0, + "step": 4018, + "text_loss": 0.31696680188179016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0007316023005528362, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 6484153.0, + "repeat_count": 0.0, + "routers_loss": 0.002349073765799403, + "skip_count": 1.0, + "step": 4020, + "text_loss": 0.30981555581092834 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.8828881714118, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0007313279467562124, + "loss": 0.0053, + "macro_f1": 0.9452888369560242, + "num_tokens": 6487029.0, + "repeat_count": 1.0, + "routers_loss": 0.011854278855025768, + "skip_count": 4.0, + "step": 4022, + "text_loss": 0.9689550399780273 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007310535043142829, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 6490315.0, + "repeat_count": 1.0, + "routers_loss": 0.00908346101641655, + "skip_count": 3.0, + "step": 4024, + "text_loss": 0.1705625057220459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0007307789733322146, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 6493921.0, + "repeat_count": 0.0, + "routers_loss": 0.0007360641611739993, + "skip_count": 0.0, + "step": 4026, + "text_loss": 0.6252996325492859 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.087890625, + "learning_rate": 0.0007305043539152083, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6496689.0, + "repeat_count": 0.0, + "routers_loss": 0.0017757206223905087, + "skip_count": 0.0, + "step": 4028, + "text_loss": 0.40533265471458435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000730229646168499, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6500090.0, + "repeat_count": 0.0, + "routers_loss": 0.0022657213266938925, + "skip_count": 0.0, + "step": 4030, + "text_loss": 0.25954708456993103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007299548501973548, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6503023.0, + "repeat_count": 0.0, + "routers_loss": 0.0021747269202023745, + "skip_count": 0.0, + "step": 4032, + "text_loss": 0.6223418712615967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 18.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0007296799661070782, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6506382.0, + "repeat_count": 0.0, + "routers_loss": 0.006400502752512693, + "skip_count": 4.0, + "step": 4034, + "text_loss": 0.6873653531074524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.94863516289991, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0007294049940030055, + "loss": 0.0065, + "macro_f1": 0.3272727429866791, + "num_tokens": 6509194.0, + "repeat_count": 0.0, + "routers_loss": 0.0197185929864645, + "skip_count": 1.0, + "step": 4036, + "text_loss": 0.16156800091266632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007291299339905059, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6512271.0, + "repeat_count": 0.0, + "routers_loss": 0.0009541353792883456, + "skip_count": 0.0, + "step": 4038, + "text_loss": 0.5038442015647888 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007288547861749838, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 6516403.0, + "repeat_count": 0.0, + "routers_loss": 0.008226391859352589, + "skip_count": 2.0, + "step": 4040, + "text_loss": 0.3706657588481903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007285795506618758, + "loss": 0.0063, + "macro_f1": 0.3272727429866791, + "num_tokens": 6519310.0, + "repeat_count": 0.0, + "routers_loss": 0.017001887783408165, + "skip_count": 1.0, + "step": 4042, + "text_loss": 0.24296723306179047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0007283042275566528, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 6521979.0, + "repeat_count": 0.0, + "routers_loss": 0.01666323095560074, + "skip_count": 2.0, + "step": 4044, + "text_loss": 0.36904850602149963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0007280288169648192, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 6524976.0, + "repeat_count": 0.0, + "routers_loss": 0.0007593175978399813, + "skip_count": 0.0, + "step": 4046, + "text_loss": 0.7312731146812439 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 19.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0007277533189919127, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 6528638.0, + "repeat_count": 1.0, + "routers_loss": 0.005652119871228933, + "skip_count": 1.0, + "step": 4048, + "text_loss": 0.23326151072978973 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007274777337435046, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6532193.0, + "repeat_count": 0.0, + "routers_loss": 0.010509157553315163, + "skip_count": 2.0, + "step": 4050, + "text_loss": 0.23918013274669647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007272020613251999, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 6534994.0, + "repeat_count": 0.0, + "routers_loss": 0.002153293928131461, + "skip_count": 0.0, + "step": 4052, + "text_loss": 0.5890526175498962 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0007269263018426367, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 6537469.0, + "repeat_count": 1.0, + "routers_loss": 0.0018494052346795797, + "skip_count": 2.0, + "step": 4054, + "text_loss": 0.36058738827705383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0007266504554014866, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6541271.0, + "repeat_count": 0.0, + "routers_loss": 0.0007579320226795971, + "skip_count": 0.0, + "step": 4056, + "text_loss": 0.4089007079601288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.051658350454947, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007263745221074545, + "loss": 0.0086, + "macro_f1": 0.6601307392120361, + "num_tokens": 6544293.0, + "repeat_count": 1.0, + "routers_loss": 0.06202420964837074, + "skip_count": 2.0, + "step": 4058, + "text_loss": 0.2226305454969406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 19.06105077781039, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007260985020662784, + "loss": 0.0049, + "macro_f1": 0.5934640765190125, + "num_tokens": 6547640.0, + "repeat_count": 0.0, + "routers_loss": 0.044639844447374344, + "skip_count": 3.0, + "step": 4060, + "text_loss": 0.23004353046417236 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.0007258223953837298, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 6550840.0, + "repeat_count": 1.0, + "routers_loss": 0.004215611144900322, + "skip_count": 0.0, + "step": 4062, + "text_loss": 0.2891770601272583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0007255462021656132, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6554122.0, + "repeat_count": 0.0, + "routers_loss": 0.0011056234361603856, + "skip_count": 0.0, + "step": 4064, + "text_loss": 0.7485370635986328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007252699225177666, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6557138.0, + "repeat_count": 0.0, + "routers_loss": 0.008258933201432228, + "skip_count": 2.0, + "step": 4066, + "text_loss": 0.25219282507896423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007249935565460606, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6560654.0, + "repeat_count": 0.0, + "routers_loss": 0.005102175287902355, + "skip_count": 0.0, + "step": 4068, + "text_loss": 0.5553314089775085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007247171043563994, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6563814.0, + "repeat_count": 0.0, + "routers_loss": 0.01283820066601038, + "skip_count": 2.0, + "step": 4070, + "text_loss": 0.15729956328868866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0007244405660547199, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6567060.0, + "repeat_count": 0.0, + "routers_loss": 0.0009684927063062787, + "skip_count": 0.0, + "step": 4072, + "text_loss": 0.3725031912326813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01953125, + "learning_rate": 0.000724163941746992, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6571608.0, + "repeat_count": 0.0, + "routers_loss": 0.0007890827837400138, + "skip_count": 0.0, + "step": 4074, + "text_loss": 0.8438301682472229 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 19.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0007238872315392189, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 6575214.0, + "repeat_count": 1.0, + "routers_loss": 0.0040600355714559555, + "skip_count": 1.0, + "step": 4076, + "text_loss": 0.5923112034797668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0007236104355374363, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 6578383.0, + "repeat_count": 0.0, + "routers_loss": 0.0024899677373468876, + "skip_count": 2.0, + "step": 4078, + "text_loss": 0.20302526652812958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.000723333553847713, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6582175.0, + "repeat_count": 0.0, + "routers_loss": 0.006120906211435795, + "skip_count": 2.0, + "step": 4080, + "text_loss": 0.5400223731994629 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0007230565865761504, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 6585516.0, + "repeat_count": 0.0, + "routers_loss": 0.0029941233806312084, + "skip_count": 0.0, + "step": 4082, + "text_loss": 0.19460804760456085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0007227795338288831, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 6588266.0, + "repeat_count": 0.0, + "routers_loss": 0.009357884526252747, + "skip_count": 2.0, + "step": 4084, + "text_loss": 0.35237613320350647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007225023957120782, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 6591009.0, + "repeat_count": 0.0, + "routers_loss": 0.0023083325941115618, + "skip_count": 2.0, + "step": 4086, + "text_loss": 0.4336731433868408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0007222251723319356, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 6594472.0, + "repeat_count": 0.0, + "routers_loss": 0.0008416616474278271, + "skip_count": 0.0, + "step": 4088, + "text_loss": 0.6390535831451416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0007219478637946877, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6597477.0, + "repeat_count": 0.0, + "routers_loss": 0.004390760324895382, + "skip_count": 1.0, + "step": 4090, + "text_loss": 0.525839626789093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0007216704702065997, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 6600431.0, + "repeat_count": 0.0, + "routers_loss": 0.0010311100631952286, + "skip_count": 0.0, + "step": 4092, + "text_loss": 0.5310423374176025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0007213929916739695, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 6603899.0, + "repeat_count": 0.0, + "routers_loss": 0.0032497600186616182, + "skip_count": 1.0, + "step": 4094, + "text_loss": 0.2775326073169708 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000721115428303127, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 6606544.0, + "repeat_count": 1.0, + "routers_loss": 0.004692315589636564, + "skip_count": 3.0, + "step": 4096, + "text_loss": 0.6667124032974243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007208377802004353, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6610097.0, + "repeat_count": 0.0, + "routers_loss": 0.0007263485458679497, + "skip_count": 0.0, + "step": 4098, + "text_loss": 0.6916406750679016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007205600474722897, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6613836.0, + "repeat_count": 0.0, + "routers_loss": 0.0017989488551393151, + "skip_count": 0.0, + "step": 4100, + "text_loss": 0.5257929563522339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000720282230225118, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6616780.0, + "repeat_count": 0.0, + "routers_loss": 0.0011308686807751656, + "skip_count": 1.0, + "step": 4102, + "text_loss": 0.4410906732082367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0007200043285653799, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6620110.0, + "repeat_count": 0.0, + "routers_loss": 0.002058265497907996, + "skip_count": 2.0, + "step": 4104, + "text_loss": 0.8581191897392273 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007197263425995681, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 6622585.0, + "repeat_count": 1.0, + "routers_loss": 0.0017528717871755362, + "skip_count": 0.0, + "step": 4106, + "text_loss": 0.5000449419021606 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0007194482724342075, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6626356.0, + "repeat_count": 0.0, + "routers_loss": 0.0021995846182107925, + "skip_count": 0.0, + "step": 4108, + "text_loss": 0.401346892118454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007191701181758547, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6629738.0, + "repeat_count": 0.0, + "routers_loss": 0.0014869922306388617, + "skip_count": 0.0, + "step": 4110, + "text_loss": 0.9598422050476074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0007188918799310993, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 6632807.0, + "repeat_count": 0.0, + "routers_loss": 0.0012853415682911873, + "skip_count": 0.0, + "step": 4112, + "text_loss": 0.3996548354625702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0007186135578065627, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6636227.0, + "repeat_count": 0.0, + "routers_loss": 0.0009887361666187644, + "skip_count": 0.0, + "step": 4114, + "text_loss": 0.4127283990383148 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007183351519088982, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6639443.0, + "repeat_count": 0.0, + "routers_loss": 0.006282114889472723, + "skip_count": 1.0, + "step": 4116, + "text_loss": 0.20028606057167053 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.333431171118285, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0007180566623447917, + "loss": 0.0114, + "macro_f1": 0.6603773832321167, + "num_tokens": 6642127.0, + "repeat_count": 1.0, + "routers_loss": 0.008101986721158028, + "skip_count": 0.0, + "step": 4118, + "text_loss": 0.763931155204773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0007177780892209607, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6645376.0, + "repeat_count": 0.0, + "routers_loss": 0.001953610684722662, + "skip_count": 0.0, + "step": 4120, + "text_loss": 0.42317715287208557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007174994326441551, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6648150.0, + "repeat_count": 0.0, + "routers_loss": 0.003279355587437749, + "skip_count": 0.0, + "step": 4122, + "text_loss": 0.19656142592430115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007172206927211567, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6650935.0, + "repeat_count": 0.0, + "routers_loss": 0.0032076311763375998, + "skip_count": 0.0, + "step": 4124, + "text_loss": 0.13608409464359283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0007169418695587791, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6654464.0, + "repeat_count": 0.0, + "routers_loss": 0.004065621178597212, + "skip_count": 2.0, + "step": 4126, + "text_loss": 0.4882086217403412 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007166629632638678, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6657749.0, + "repeat_count": 0.0, + "routers_loss": 0.0009243001695722342, + "skip_count": 0.0, + "step": 4128, + "text_loss": 0.31632331013679504 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0007163839739433003, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6660997.0, + "repeat_count": 0.0, + "routers_loss": 0.0018459554994478822, + "skip_count": 0.0, + "step": 4130, + "text_loss": 0.6123947501182556 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.399178162606397, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0007161049017039857, + "loss": 0.0073, + "macro_f1": 0.8820862174034119, + "num_tokens": 6663542.0, + "repeat_count": 2.0, + "routers_loss": 0.030032536014914513, + "skip_count": 2.0, + "step": 4132, + "text_loss": 0.6985659003257751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0007158257466528652, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6666178.0, + "repeat_count": 0.0, + "routers_loss": 0.0013813833938911557, + "skip_count": 0.0, + "step": 4134, + "text_loss": 0.38380664587020874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0007155465088969114, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 6668852.0, + "repeat_count": 0.0, + "routers_loss": 0.00513424864038825, + "skip_count": 3.0, + "step": 4136, + "text_loss": 0.49724283814430237 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0007152671885431288, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 6671430.0, + "repeat_count": 0.0, + "routers_loss": 0.0005165594047866762, + "skip_count": 0.0, + "step": 4138, + "text_loss": 0.666959822177887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0007149877856985535, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6675215.0, + "repeat_count": 0.0, + "routers_loss": 0.001685218419879675, + "skip_count": 0.0, + "step": 4140, + "text_loss": 0.3127259612083435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.000714708300470253, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6678505.0, + "repeat_count": 0.0, + "routers_loss": 0.004025314934551716, + "skip_count": 0.0, + "step": 4142, + "text_loss": 0.3179470896720886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007144287329653269, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 6681127.0, + "repeat_count": 1.0, + "routers_loss": 0.005965690594166517, + "skip_count": 0.0, + "step": 4144, + "text_loss": 0.3862907886505127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.464925154094512, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007141490832909058, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 6683968.0, + "repeat_count": 0.0, + "routers_loss": 0.012896374799311161, + "skip_count": 1.0, + "step": 4146, + "text_loss": 0.48156118392944336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007138693515541519, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6687196.0, + "repeat_count": 0.0, + "routers_loss": 0.0006367767928168178, + "skip_count": 1.0, + "step": 4148, + "text_loss": 0.676702082157135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0007135895378622592, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 6689972.0, + "repeat_count": 0.0, + "routers_loss": 0.004532640799880028, + "skip_count": 3.0, + "step": 4150, + "text_loss": 0.5865558981895447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.493102436160846, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007133096423224526, + "loss": 0.0081, + "macro_f1": 0.3272727429866791, + "num_tokens": 6693568.0, + "repeat_count": 1.0, + "routers_loss": 0.0377078577876091, + "skip_count": 0.0, + "step": 4152, + "text_loss": 0.2790502607822418 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0007130296650419885, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6696468.0, + "repeat_count": 0.0, + "routers_loss": 0.004455826710909605, + "skip_count": 1.0, + "step": 4154, + "text_loss": 0.5869500041007996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0007127496061281551, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6699307.0, + "repeat_count": 0.0, + "routers_loss": 0.001998464809730649, + "skip_count": 0.0, + "step": 4156, + "text_loss": 0.6931945085525513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 19.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007124694656882713, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 6702647.0, + "repeat_count": 3.0, + "routers_loss": 0.004117495380342007, + "skip_count": 0.0, + "step": 4158, + "text_loss": 0.4325876832008362 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0007121892438296874, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6705964.0, + "repeat_count": 0.0, + "routers_loss": 0.0014713290147483349, + "skip_count": 0.0, + "step": 4160, + "text_loss": 0.3672060966491699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007119089406597849, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6710182.0, + "repeat_count": 0.0, + "routers_loss": 0.0037311650812625885, + "skip_count": 1.0, + "step": 4162, + "text_loss": 0.6643805503845215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007116285562859767, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6713410.0, + "repeat_count": 0.0, + "routers_loss": 0.006017287727445364, + "skip_count": 0.0, + "step": 4164, + "text_loss": 0.4606415927410126 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.55884942764896, + "f1_execute": 0.9545454382896423, + "f1_repeat": 0.5, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007113480908157065, + "loss": 0.0108, + "macro_f1": 0.8181818723678589, + "num_tokens": 6716056.0, + "repeat_count": 3.0, + "routers_loss": 0.08640352636575699, + "skip_count": 4.0, + "step": 4166, + "text_loss": 0.3139408528804779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0007110675443564491, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6719497.0, + "repeat_count": 0.0, + "routers_loss": 0.0012731150491163135, + "skip_count": 0.0, + "step": 4168, + "text_loss": 0.7283861637115479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007107869170157108, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 6722297.0, + "repeat_count": 0.0, + "routers_loss": 0.0021509863436222076, + "skip_count": 2.0, + "step": 4170, + "text_loss": 0.5767703056335449 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.000710506208901028, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6725762.0, + "repeat_count": 0.0, + "routers_loss": 0.00257494836114347, + "skip_count": 1.0, + "step": 4172, + "text_loss": 0.33571913838386536 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.000710225420119969, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 6728436.0, + "repeat_count": 1.0, + "routers_loss": 0.00943201594054699, + "skip_count": 3.0, + "step": 4174, + "text_loss": 0.6849368810653687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0007099445507801323, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6731427.0, + "repeat_count": 0.0, + "routers_loss": 0.01046718005090952, + "skip_count": 2.0, + "step": 4176, + "text_loss": 0.3346157670021057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007096636009891477, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6734800.0, + "repeat_count": 0.0, + "routers_loss": 0.0007813365664333105, + "skip_count": 0.0, + "step": 4178, + "text_loss": 0.49989959597587585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.000709382570854676, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6738244.0, + "repeat_count": 0.0, + "routers_loss": 0.002825600327923894, + "skip_count": 0.0, + "step": 4180, + "text_loss": 0.15744923055171967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007091014604844078, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6741695.0, + "repeat_count": 0.0, + "routers_loss": 0.0017124463338404894, + "skip_count": 0.0, + "step": 4182, + "text_loss": 0.3752405643463135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0007088202699860655, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 6744882.0, + "repeat_count": 1.0, + "routers_loss": 0.005134924780577421, + "skip_count": 3.0, + "step": 4184, + "text_loss": 0.18534569442272186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.000708538999467402, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6747811.0, + "repeat_count": 0.0, + "routers_loss": 0.002371585462242365, + "skip_count": 1.0, + "step": 4186, + "text_loss": 0.6251029968261719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007082576490362004, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6750765.0, + "repeat_count": 0.0, + "routers_loss": 0.002088436856865883, + "skip_count": 0.0, + "step": 4188, + "text_loss": 0.35471436381340027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.000707976218800275, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6754021.0, + "repeat_count": 0.0, + "routers_loss": 0.0012272283202037215, + "skip_count": 0.0, + "step": 4190, + "text_loss": 0.5737302899360657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0007076947088674701, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6756793.0, + "repeat_count": 0.0, + "routers_loss": 0.0026050808373838663, + "skip_count": 0.0, + "step": 4192, + "text_loss": 0.526336669921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.000707413119345661, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 6760221.0, + "repeat_count": 0.0, + "routers_loss": 0.0013151296880096197, + "skip_count": 0.0, + "step": 4194, + "text_loss": 0.5678895711898804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0007071314503427532, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6763721.0, + "repeat_count": 0.0, + "routers_loss": 0.001528652966953814, + "skip_count": 0.0, + "step": 4196, + "text_loss": 0.7640175223350525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0007068497019666829, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6768581.0, + "repeat_count": 0.0, + "routers_loss": 0.0019202446565032005, + "skip_count": 0.0, + "step": 4198, + "text_loss": 0.41878414154052734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0007065678743254167, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6772758.0, + "repeat_count": 0.0, + "routers_loss": 0.004667408298701048, + "skip_count": 1.0, + "step": 4200, + "text_loss": 0.3550313413143158 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 19.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0007062859675269513, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6776671.0, + "repeat_count": 3.0, + "routers_loss": 0.00568761583417654, + "skip_count": 0.0, + "step": 4202, + "text_loss": 0.1707649976015091 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007060039816793141, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6780284.0, + "repeat_count": 0.0, + "routers_loss": 0.0030401297844946384, + "skip_count": 0.0, + "step": 4204, + "text_loss": 0.2686377167701721 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 19.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007057219168905625, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 6783525.0, + "repeat_count": 1.0, + "routers_loss": 0.003353122156113386, + "skip_count": 5.0, + "step": 4206, + "text_loss": 0.5235374569892883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.000705439773268784, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6787691.0, + "repeat_count": 0.0, + "routers_loss": 0.0016532237641513348, + "skip_count": 1.0, + "step": 4208, + "text_loss": 0.5002681612968445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007051575509220972, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 6790833.0, + "repeat_count": 0.0, + "routers_loss": 0.0011808308772742748, + "skip_count": 0.0, + "step": 4210, + "text_loss": 0.7251001596450806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0007048752499586497, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 6794260.0, + "repeat_count": 0.0, + "routers_loss": 0.006246297620236874, + "skip_count": 2.0, + "step": 4212, + "text_loss": 0.2430499643087387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.00070459287048662, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6797413.0, + "repeat_count": 0.0, + "routers_loss": 0.0012964420020580292, + "skip_count": 0.0, + "step": 4214, + "text_loss": 0.48889362812042236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0007043104126142163, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6800815.0, + "repeat_count": 0.0, + "routers_loss": 0.0018109704833477736, + "skip_count": 0.0, + "step": 4216, + "text_loss": 0.5617026686668396 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 19.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0007040278764496771, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 6803937.0, + "repeat_count": 2.0, + "routers_loss": 0.0028699536342173815, + "skip_count": 1.0, + "step": 4218, + "text_loss": 0.548405647277832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007037452621012708, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6806946.0, + "repeat_count": 0.0, + "routers_loss": 0.0007951617590151727, + "skip_count": 0.0, + "step": 4220, + "text_loss": 0.5702725648880005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0007034625696772958, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6810083.0, + "repeat_count": 0.0, + "routers_loss": 0.003436052706092596, + "skip_count": 2.0, + "step": 4222, + "text_loss": 0.3898725211620331 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00070317979928608, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6812845.0, + "repeat_count": 0.0, + "routers_loss": 0.0005070401239208877, + "skip_count": 0.0, + "step": 4224, + "text_loss": 0.5244157910346985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.840622248312297, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000702896951035982, + "loss": 0.0101, + "macro_f1": 0.3272727429866791, + "num_tokens": 6815801.0, + "repeat_count": 0.0, + "routers_loss": 0.01560303382575512, + "skip_count": 1.0, + "step": 4226, + "text_loss": 0.26503118872642517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007026140250353896, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 6819464.0, + "repeat_count": 0.0, + "routers_loss": 0.009310240857303143, + "skip_count": 2.0, + "step": 4228, + "text_loss": 0.15597499907016754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0007023310213927208, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6822657.0, + "repeat_count": 0.0, + "routers_loss": 0.005309136584401131, + "skip_count": 0.0, + "step": 4230, + "text_loss": 0.5271651148796082 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046875, + "learning_rate": 0.0007020479402164226, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 6825661.0, + "repeat_count": 0.0, + "routers_loss": 0.005936166271567345, + "skip_count": 2.0, + "step": 4232, + "text_loss": 0.6105108857154846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007017647816149727, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6828688.0, + "repeat_count": 0.0, + "routers_loss": 0.001653556595556438, + "skip_count": 0.0, + "step": 4234, + "text_loss": 0.6966437101364136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.000701481545696878, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 6831850.0, + "repeat_count": 0.0, + "routers_loss": 0.0013501866487786174, + "skip_count": 0.0, + "step": 4236, + "text_loss": 1.259678840637207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0007011982325706747, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6834862.0, + "repeat_count": 0.0, + "routers_loss": 0.008970130234956741, + "skip_count": 1.0, + "step": 4238, + "text_loss": 0.24906545877456665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007009148423449292, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6838148.0, + "repeat_count": 0.0, + "routers_loss": 0.0026013399474322796, + "skip_count": 0.0, + "step": 4240, + "text_loss": 0.291467547416687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.915761667155856, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0007006313751282371, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 6841142.0, + "repeat_count": 0.0, + "routers_loss": 0.021415632218122482, + "skip_count": 1.0, + "step": 4242, + "text_loss": 0.507606029510498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007003478310292236, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6844042.0, + "repeat_count": 0.0, + "routers_loss": 0.0023636550176888704, + "skip_count": 0.0, + "step": 4244, + "text_loss": 0.11626995354890823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.934546521866746, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0007000642101565433, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 6847359.0, + "repeat_count": 1.0, + "routers_loss": 0.025154776871204376, + "skip_count": 0.0, + "step": 4246, + "text_loss": 0.42898693680763245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0006997805126188803, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6850443.0, + "repeat_count": 0.0, + "routers_loss": 0.00540317315608263, + "skip_count": 0.0, + "step": 4248, + "text_loss": 0.18085283041000366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000699496738524948, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 6853495.0, + "repeat_count": 0.0, + "routers_loss": 0.0014433214673772454, + "skip_count": 0.0, + "step": 4250, + "text_loss": 0.5524004697799683 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006992128879834891, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 6856774.0, + "repeat_count": 1.0, + "routers_loss": 0.013381492346525192, + "skip_count": 3.0, + "step": 4252, + "text_loss": 0.19605717062950134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006989289611032758, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 6860313.0, + "repeat_count": 0.0, + "routers_loss": 0.007140172645449638, + "skip_count": 1.0, + "step": 4254, + "text_loss": 0.3182447552680969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006986449579931091, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6863683.0, + "repeat_count": 0.0, + "routers_loss": 0.006486213766038418, + "skip_count": 1.0, + "step": 4256, + "text_loss": 0.19250160455703735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006983608787618201, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6867609.0, + "repeat_count": 0.0, + "routers_loss": 0.001465818495489657, + "skip_count": 0.0, + "step": 4258, + "text_loss": 0.5912898182868958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.000698076723518268, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6870040.0, + "repeat_count": 0.0, + "routers_loss": 0.0031106441747397184, + "skip_count": 0.0, + "step": 4260, + "text_loss": 0.13542121648788452 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006977924923713418, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6873441.0, + "repeat_count": 0.0, + "routers_loss": 0.0005377951893024147, + "skip_count": 0.0, + "step": 4262, + "text_loss": 0.352464497089386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006975081854299594, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 6876637.0, + "repeat_count": 0.0, + "routers_loss": 0.007052485831081867, + "skip_count": 0.0, + "step": 4264, + "text_loss": 0.5023844242095947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0006972238028030678, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6879928.0, + "repeat_count": 0.0, + "routers_loss": 0.0013608322478830814, + "skip_count": 0.0, + "step": 4266, + "text_loss": 0.8664718270301819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0006969393445996429, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6883425.0, + "repeat_count": 0.0, + "routers_loss": 0.0007607188890688121, + "skip_count": 0.0, + "step": 4268, + "text_loss": 0.5131992101669312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006966548109286897, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6886790.0, + "repeat_count": 0.0, + "routers_loss": 0.00035804163780994713, + "skip_count": 0.0, + "step": 4270, + "text_loss": 0.5352054834365845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.000696370201899242, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6889747.0, + "repeat_count": 0.0, + "routers_loss": 0.004451376851648092, + "skip_count": 1.0, + "step": 4272, + "text_loss": 0.47865036129951477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006960855176203623, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6892604.0, + "repeat_count": 0.0, + "routers_loss": 0.0015342880506068468, + "skip_count": 0.0, + "step": 4274, + "text_loss": 0.36278650164604187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0006958007582011425, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6895563.0, + "repeat_count": 0.0, + "routers_loss": 0.0022974940948188305, + "skip_count": 2.0, + "step": 4276, + "text_loss": 0.6695618629455566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006955159237507027, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6898591.0, + "repeat_count": 0.0, + "routers_loss": 0.00859096460044384, + "skip_count": 1.0, + "step": 4278, + "text_loss": 0.44284722208976746 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0006952310143781921, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6903119.0, + "repeat_count": 1.0, + "routers_loss": 0.007919861935079098, + "skip_count": 3.0, + "step": 4280, + "text_loss": 0.5006136298179626 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0006949460301927886, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6906394.0, + "repeat_count": 0.0, + "routers_loss": 0.0008476210059598088, + "skip_count": 0.0, + "step": 4282, + "text_loss": 0.8153555989265442 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.0006946609713036985, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 6909136.0, + "repeat_count": 0.0, + "routers_loss": 0.006711610127240419, + "skip_count": 2.0, + "step": 4284, + "text_loss": 0.43136683106422424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0006943758378201571, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 6912734.0, + "repeat_count": 0.0, + "routers_loss": 0.0038677838165313005, + "skip_count": 0.0, + "step": 4286, + "text_loss": 0.2693749964237213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0006940906298514278, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6915838.0, + "repeat_count": 0.0, + "routers_loss": 0.0012188015971332788, + "skip_count": 0.0, + "step": 4288, + "text_loss": 0.5809219479560852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0006938053475068031, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6919225.0, + "repeat_count": 0.0, + "routers_loss": 0.001955829095095396, + "skip_count": 0.0, + "step": 4290, + "text_loss": 0.5116089582443237 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006935199908956037, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 6922495.0, + "repeat_count": 1.0, + "routers_loss": 0.0035709093790501356, + "skip_count": 0.0, + "step": 4292, + "text_loss": 0.2745901644229889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0006932345601271786, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 6925317.0, + "repeat_count": 0.0, + "routers_loss": 0.0005745319649577141, + "skip_count": 0.0, + "step": 4294, + "text_loss": 0.6039219498634338 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 20.169063692398005, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0006929490553109056, + "loss": 0.0107, + "macro_f1": 0.9247862696647644, + "num_tokens": 6928054.0, + "repeat_count": 3.0, + "routers_loss": 0.061689916998147964, + "skip_count": 6.0, + "step": 4296, + "text_loss": 0.3904837667942047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006926634765561907, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 6931348.0, + "repeat_count": 0.0, + "routers_loss": 0.002007248578593135, + "skip_count": 0.0, + "step": 4298, + "text_loss": 0.5170742273330688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000692377823972468, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6934411.0, + "repeat_count": 0.0, + "routers_loss": 0.0005786226247437298, + "skip_count": 0.0, + "step": 4300, + "text_loss": 0.8032443523406982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.19724097446434, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006920920976692004, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 6938153.0, + "repeat_count": 1.0, + "routers_loss": 0.024602646008133888, + "skip_count": 0.0, + "step": 4302, + "text_loss": 0.446534663438797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006918062977558784, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6940731.0, + "repeat_count": 0.0, + "routers_loss": 0.005759815219789743, + "skip_count": 2.0, + "step": 4304, + "text_loss": 0.15479247272014618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006915204243420214, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6943246.0, + "repeat_count": 0.0, + "routers_loss": 0.005315347574651241, + "skip_count": 1.0, + "step": 4306, + "text_loss": 0.22127842903137207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006912344775371765, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6947197.0, + "repeat_count": 0.0, + "routers_loss": 0.0012061651796102524, + "skip_count": 0.0, + "step": 4308, + "text_loss": 0.7058854103088379 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006909484574509191, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6951817.0, + "repeat_count": 0.0, + "routers_loss": 0.0029203309677541256, + "skip_count": 0.0, + "step": 4310, + "text_loss": 0.6014000773429871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0006906623641928525, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6955094.0, + "repeat_count": 0.0, + "routers_loss": 0.005703397560864687, + "skip_count": 2.0, + "step": 4312, + "text_loss": 0.5923848152160645 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0006903761978726084, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 6958127.0, + "repeat_count": 1.0, + "routers_loss": 0.004489895887672901, + "skip_count": 2.0, + "step": 4314, + "text_loss": 0.36911651492118835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.000690089958599846, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 6960871.0, + "repeat_count": 0.0, + "routers_loss": 0.003871412482112646, + "skip_count": 2.0, + "step": 4316, + "text_loss": 0.442545086145401 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.000689803646484253, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6963980.0, + "repeat_count": 1.0, + "routers_loss": 0.008667866699397564, + "skip_count": 2.0, + "step": 4318, + "text_loss": 0.1987489014863968 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0006895172616355446, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6967132.0, + "repeat_count": 1.0, + "routers_loss": 0.00843339879065752, + "skip_count": 0.0, + "step": 4320, + "text_loss": 0.48267918825149536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006892308041634639, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6969971.0, + "repeat_count": 0.0, + "routers_loss": 0.0004312851815484464, + "skip_count": 0.0, + "step": 4322, + "text_loss": 0.3662732243537903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006889442741777822, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6973114.0, + "repeat_count": 0.0, + "routers_loss": 0.004588035400956869, + "skip_count": 3.0, + "step": 4324, + "text_loss": 0.6707104444503784 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.309950102729672, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0006886576717882982, + "loss": 0.0057, + "macro_f1": 0.8817967176437378, + "num_tokens": 6976013.0, + "repeat_count": 2.0, + "routers_loss": 0.0687296912074089, + "skip_count": 3.0, + "step": 4326, + "text_loss": 0.1662217676639557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006883709971048384, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6979200.0, + "repeat_count": 0.0, + "routers_loss": 0.002950174268335104, + "skip_count": 0.0, + "step": 4328, + "text_loss": 0.21168152987957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006880842502372572, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6982640.0, + "repeat_count": 0.0, + "routers_loss": 0.0032158740796148777, + "skip_count": 0.0, + "step": 4330, + "text_loss": 0.26790961623191833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0006877974312954365, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6985917.0, + "repeat_count": 0.0, + "routers_loss": 0.0005083635332994163, + "skip_count": 0.0, + "step": 4332, + "text_loss": 0.9736502170562744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.347519812151454, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.000687510540389286, + "loss": 0.0053, + "macro_f1": 0.32098764181137085, + "num_tokens": 6988388.0, + "repeat_count": 0.0, + "routers_loss": 0.03473830223083496, + "skip_count": 2.0, + "step": 4334, + "text_loss": 0.21662230789661407 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006872235776287425, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6991360.0, + "repeat_count": 0.0, + "routers_loss": 0.002206524135544896, + "skip_count": 0.0, + "step": 4336, + "text_loss": 0.6026972532272339 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0006869365431237711, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6995080.0, + "repeat_count": 1.0, + "routers_loss": 0.000969731598161161, + "skip_count": 0.0, + "step": 4338, + "text_loss": 0.5833017230033875 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.375697094217788, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006866494369843635, + "loss": 0.0054, + "macro_f1": 0.8820862174034119, + "num_tokens": 6998526.0, + "repeat_count": 2.0, + "routers_loss": 0.013962293043732643, + "skip_count": 2.0, + "step": 4340, + "text_loss": 0.41465985774993896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0006863622593205397, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 7001494.0, + "repeat_count": 0.0, + "routers_loss": 0.0064964210614562035, + "skip_count": 3.0, + "step": 4342, + "text_loss": 0.3774271011352539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 20.394481948928675, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006860750102423464, + "loss": 0.0062, + "macro_f1": 0.6589147448539734, + "num_tokens": 7005544.0, + "repeat_count": 1.0, + "routers_loss": 0.023250726982951164, + "skip_count": 6.0, + "step": 4344, + "text_loss": 0.2732464373111725 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0006857876898598582, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 7008847.0, + "repeat_count": 0.0, + "routers_loss": 0.0038170060142874718, + "skip_count": 2.0, + "step": 4346, + "text_loss": 0.29610875248908997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0006855002982831769, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7012577.0, + "repeat_count": 0.0, + "routers_loss": 0.0012856025714427233, + "skip_count": 0.0, + "step": 4348, + "text_loss": 0.6098502278327942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0006852128356224314, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7015650.0, + "repeat_count": 0.0, + "routers_loss": 0.008162742480635643, + "skip_count": 1.0, + "step": 4350, + "text_loss": 0.20868146419525146 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.432051658350456, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.0006849253019877778, + "loss": 0.0074, + "macro_f1": 0.8817967176437378, + "num_tokens": 7019925.0, + "repeat_count": 2.0, + "routers_loss": 0.023544032126665115, + "skip_count": 3.0, + "step": 4352, + "text_loss": 0.628226101398468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0006846376974893996, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 7023130.0, + "repeat_count": 0.0, + "routers_loss": 0.004982319660484791, + "skip_count": 2.0, + "step": 4354, + "text_loss": 0.7037544250488281 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0006843500222375074, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7026422.0, + "repeat_count": 1.0, + "routers_loss": 0.004015266429632902, + "skip_count": 0.0, + "step": 4356, + "text_loss": 0.22352729737758636 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 20.46022894041679, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006840622763423391, + "loss": 0.0071, + "macro_f1": 0.9449735879898071, + "num_tokens": 7029077.0, + "repeat_count": 2.0, + "routers_loss": 0.021162014454603195, + "skip_count": 4.0, + "step": 4358, + "text_loss": 0.2431403249502182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006837744599141591, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7032582.0, + "repeat_count": 0.0, + "routers_loss": 0.0007044129306450486, + "skip_count": 0.0, + "step": 4360, + "text_loss": 0.26667487621307373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0006834865730632594, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7035642.0, + "repeat_count": 0.0, + "routers_loss": 0.0067853196524083614, + "skip_count": 1.0, + "step": 4362, + "text_loss": 0.20965275168418884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006831986158999588, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7038601.0, + "repeat_count": 0.0, + "routers_loss": 0.00899333506822586, + "skip_count": 2.0, + "step": 4364, + "text_loss": 0.26860126852989197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.000682910588534603, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7042274.0, + "repeat_count": 0.0, + "routers_loss": 0.0019194348715245724, + "skip_count": 0.0, + "step": 4366, + "text_loss": 0.14046810567378998 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0006826224910775647, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 7045268.0, + "repeat_count": 1.0, + "routers_loss": 0.006915684789419174, + "skip_count": 3.0, + "step": 4368, + "text_loss": 0.5900366306304932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0006823343236392432, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7049407.0, + "repeat_count": 0.0, + "routers_loss": 0.001678116386756301, + "skip_count": 0.0, + "step": 4370, + "text_loss": 0.7868026494979858 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.000682046086330065, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7052783.0, + "repeat_count": 0.0, + "routers_loss": 0.0003459530707914382, + "skip_count": 0.0, + "step": 4372, + "text_loss": 0.6349637508392334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0006817577792604831, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7055757.0, + "repeat_count": 0.0, + "routers_loss": 0.0011729507241398096, + "skip_count": 0.0, + "step": 4374, + "text_loss": 0.43258991837501526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0006814694025409773, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 7058684.0, + "repeat_count": 0.0, + "routers_loss": 0.0006664610700681806, + "skip_count": 0.0, + "step": 4376, + "text_loss": 0.5307940244674683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.091796875, + "learning_rate": 0.0006811809562820542, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 7061902.0, + "repeat_count": 0.0, + "routers_loss": 0.004595907870680094, + "skip_count": 2.0, + "step": 4378, + "text_loss": 0.5830042362213135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0006808924405942467, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7065100.0, + "repeat_count": 0.0, + "routers_loss": 0.0032026609405875206, + "skip_count": 0.0, + "step": 4380, + "text_loss": 0.20797798037528992 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0006806038555881148, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 7068556.0, + "repeat_count": 1.0, + "routers_loss": 0.0024626904632896185, + "skip_count": 0.0, + "step": 4382, + "text_loss": 0.5791074633598328 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006803152013742448, + "loss": 0.0075, + "macro_f1": 1.0, + "num_tokens": 7071284.0, + "repeat_count": 1.0, + "routers_loss": 0.010723610408604145, + "skip_count": 2.0, + "step": 4384, + "text_loss": 0.13227243721485138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006800264780632495, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7074428.0, + "repeat_count": 1.0, + "routers_loss": 0.0011231007520109415, + "skip_count": 0.0, + "step": 4386, + "text_loss": 0.4360627233982086 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 20.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0006797376857657681, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 7078313.0, + "repeat_count": 2.0, + "routers_loss": 0.008419238030910492, + "skip_count": 1.0, + "step": 4388, + "text_loss": 0.5183924436569214 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006794488245924664, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 7081258.0, + "repeat_count": 1.0, + "routers_loss": 0.006582668516784906, + "skip_count": 3.0, + "step": 4390, + "text_loss": 0.2797473669052124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006791598946540368, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 7084527.0, + "repeat_count": 0.0, + "routers_loss": 0.00557357631623745, + "skip_count": 2.0, + "step": 4392, + "text_loss": 0.39495575428009033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0006788708960611975, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 7087675.0, + "repeat_count": 0.0, + "routers_loss": 0.007155992556363344, + "skip_count": 0.0, + "step": 4394, + "text_loss": 0.3785299062728882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.0006785818289246934, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7090171.0, + "repeat_count": 0.0, + "routers_loss": 0.0009265039698220789, + "skip_count": 0.0, + "step": 4396, + "text_loss": 0.42634522914886475 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 20.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006782926933552955, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 7092529.0, + "repeat_count": 1.0, + "routers_loss": 0.008679097518324852, + "skip_count": 7.0, + "step": 4398, + "text_loss": 0.4283660054206848 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006780034894638014, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7095141.0, + "repeat_count": 0.0, + "routers_loss": 0.002363949315622449, + "skip_count": 0.0, + "step": 4400, + "text_loss": 0.481539249420166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.000677714217361034, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7098208.0, + "repeat_count": 0.0, + "routers_loss": 0.004005146212875843, + "skip_count": 3.0, + "step": 4402, + "text_loss": 0.6443291902542114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006774248771578435, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7101681.0, + "repeat_count": 0.0, + "routers_loss": 0.0026864963583648205, + "skip_count": 0.0, + "step": 4404, + "text_loss": 0.16315312683582306 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 20.68564719694746, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006771354689651054, + "loss": 0.005, + "macro_f1": 0.9449735879898071, + "num_tokens": 7104719.0, + "repeat_count": 2.0, + "routers_loss": 0.02719845622777939, + "skip_count": 4.0, + "step": 4406, + "text_loss": 0.37855592370033264 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0006768459928937213, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7108697.0, + "repeat_count": 0.0, + "routers_loss": 0.010488593950867653, + "skip_count": 0.0, + "step": 4408, + "text_loss": 0.23133711516857147 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0006765564490546193, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7111426.0, + "repeat_count": 1.0, + "routers_loss": 0.0013637891970574856, + "skip_count": 0.0, + "step": 4410, + "text_loss": 0.41399383544921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0006762668375587528, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7114241.0, + "repeat_count": 0.0, + "routers_loss": 0.000900395680218935, + "skip_count": 0.0, + "step": 4412, + "text_loss": 0.6460412740707397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0006759771585171016, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7117031.0, + "repeat_count": 0.0, + "routers_loss": 0.0024001260753721, + "skip_count": 0.0, + "step": 4414, + "text_loss": 0.7645824551582336 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006756874120406714, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 7120766.0, + "repeat_count": 3.0, + "routers_loss": 0.005034091416746378, + "skip_count": 4.0, + "step": 4416, + "text_loss": 0.31753066182136536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0006753975982404934, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7125243.0, + "repeat_count": 0.0, + "routers_loss": 0.002483269665390253, + "skip_count": 0.0, + "step": 4418, + "text_loss": 0.5304268002510071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.751394188435572, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0006751077172276249, + "loss": 0.0052, + "macro_f1": 0.3272727429866791, + "num_tokens": 7127795.0, + "repeat_count": 0.0, + "routers_loss": 0.02676006779074669, + "skip_count": 1.0, + "step": 4420, + "text_loss": 0.22011354565620422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.000674817769113149, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7130837.0, + "repeat_count": 0.0, + "routers_loss": 0.003267093561589718, + "skip_count": 2.0, + "step": 4422, + "text_loss": 0.2906076908111572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 20.770179043146463, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.027099609375, + "learning_rate": 0.000674527754008174, + "loss": 0.0045, + "macro_f1": 0.5934640765190125, + "num_tokens": 7135090.0, + "repeat_count": 0.0, + "routers_loss": 0.022510390728712082, + "skip_count": 3.0, + "step": 4424, + "text_loss": 0.2544902563095093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006742376720238345, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 7138751.0, + "repeat_count": 0.0, + "routers_loss": 0.0011178571730852127, + "skip_count": 0.0, + "step": 4426, + "text_loss": 0.6811438798904419 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 20.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006739475232712904, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 7141762.0, + "repeat_count": 2.0, + "routers_loss": 0.005595206283032894, + "skip_count": 1.0, + "step": 4428, + "text_loss": 0.38743990659713745 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0006736573078617272, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7145235.0, + "repeat_count": 0.0, + "routers_loss": 0.002793942578136921, + "skip_count": 2.0, + "step": 4430, + "text_loss": 0.21894219517707825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0006733670259063561, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 7149042.0, + "repeat_count": 0.0, + "routers_loss": 0.006146818865090609, + "skip_count": 3.0, + "step": 4432, + "text_loss": 0.17822015285491943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 20.817141179923688, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006730766775164136, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 7152166.0, + "repeat_count": 0.0, + "routers_loss": 0.026045087724924088, + "skip_count": 2.0, + "step": 4434, + "text_loss": 0.2910420000553131 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 20.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0006727862628031618, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7155506.0, + "repeat_count": 2.0, + "routers_loss": 0.0022973387967795134, + "skip_count": 0.0, + "step": 4436, + "text_loss": 0.3502544164657593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0006724957818778882, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7158739.0, + "repeat_count": 0.0, + "routers_loss": 0.002357073128223419, + "skip_count": 1.0, + "step": 4438, + "text_loss": 0.26200664043426514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0006722052348519054, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 7161776.0, + "repeat_count": 0.0, + "routers_loss": 0.0005521026905626059, + "skip_count": 0.0, + "step": 4440, + "text_loss": 0.3922915458679199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000671914621836552, + "loss": 0.0106, + "macro_f1": 0.6666666865348816, + "num_tokens": 7164763.0, + "repeat_count": 0.0, + "routers_loss": 0.007691344246268272, + "skip_count": 2.0, + "step": 4442, + "text_loss": 0.6021351218223572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000671623942943191, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7167924.0, + "repeat_count": 0.0, + "routers_loss": 0.0032181134447455406, + "skip_count": 0.0, + "step": 4444, + "text_loss": 0.23639555275440216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.873495744056356, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0006713331982832113, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 7170743.0, + "repeat_count": 1.0, + "routers_loss": 0.024979131296277046, + "skip_count": 0.0, + "step": 4446, + "text_loss": 0.4957772493362427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006710423879680271, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7174660.0, + "repeat_count": 0.0, + "routers_loss": 0.002571308286860585, + "skip_count": 0.0, + "step": 4448, + "text_loss": 0.47968071699142456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000670751512109077, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7177965.0, + "repeat_count": 0.0, + "routers_loss": 0.00212799571454525, + "skip_count": 0.0, + "step": 4450, + "text_loss": 0.6550716161727905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006704605708178252, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 7181512.0, + "repeat_count": 0.0, + "routers_loss": 0.004176430404186249, + "skip_count": 1.0, + "step": 4452, + "text_loss": 0.36959558725357056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0006701695642057613, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7184555.0, + "repeat_count": 0.0, + "routers_loss": 0.0010968588758260012, + "skip_count": 0.0, + "step": 4454, + "text_loss": 0.6686749458312988 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0006698784923843993, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7187474.0, + "repeat_count": 0.0, + "routers_loss": 0.0014241471653804183, + "skip_count": 0.0, + "step": 4456, + "text_loss": 0.6147221922874451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006695873554652784, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7190649.0, + "repeat_count": 0.0, + "routers_loss": 0.008801907300949097, + "skip_count": 0.0, + "step": 4458, + "text_loss": 0.26381927728652954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006692961535599634, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 7193961.0, + "repeat_count": 0.0, + "routers_loss": 0.009027508087456226, + "skip_count": 1.0, + "step": 4460, + "text_loss": 0.1926470547914505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006690048867800427, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7197456.0, + "repeat_count": 0.0, + "routers_loss": 0.0022697453387081623, + "skip_count": 0.0, + "step": 4462, + "text_loss": 0.6736721992492676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006687135552371305, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7200290.0, + "repeat_count": 0.0, + "routers_loss": 0.006747903767973185, + "skip_count": 1.0, + "step": 4464, + "text_loss": 0.2026437371969223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006684221590428657, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7203320.0, + "repeat_count": 0.0, + "routers_loss": 0.0011565096210688353, + "skip_count": 0.0, + "step": 4466, + "text_loss": 0.7587730288505554 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.976812444966246, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0006681306983089121, + "loss": 0.0083, + "macro_f1": 0.8820862174034119, + "num_tokens": 7206411.0, + "repeat_count": 2.0, + "routers_loss": 0.023645581677556038, + "skip_count": 2.0, + "step": 4468, + "text_loss": 0.8981561660766602 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006678391731469575, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 7209421.0, + "repeat_count": 0.0, + "routers_loss": 0.0035848666448146105, + "skip_count": 0.0, + "step": 4470, + "text_loss": 0.1522839516401291 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 20.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006675475836687152, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 7212267.0, + "repeat_count": 1.0, + "routers_loss": 0.005046425387263298, + "skip_count": 1.0, + "step": 4472, + "text_loss": 0.46007999777793884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006672559299859228, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7215195.0, + "repeat_count": 0.0, + "routers_loss": 0.0019333874806761742, + "skip_count": 0.0, + "step": 4474, + "text_loss": 1.0859547853469849 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006669642122103423, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7217941.0, + "repeat_count": 0.0, + "routers_loss": 0.0005401032394729555, + "skip_count": 0.0, + "step": 4476, + "text_loss": 0.9754356145858765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.023481068388612, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006666724304537611, + "loss": 0.0053, + "macro_f1": 0.3272727429866791, + "num_tokens": 7222494.0, + "repeat_count": 1.0, + "routers_loss": 0.015569722279906273, + "skip_count": 0.0, + "step": 4478, + "text_loss": 0.2896423637866974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006663805848279898, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7225292.0, + "repeat_count": 0.0, + "routers_loss": 0.0020135147497057915, + "skip_count": 0.0, + "step": 4480, + "text_loss": 0.8492724299430847 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006660886754448648, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 7229184.0, + "repeat_count": 1.0, + "routers_loss": 0.002355351345613599, + "skip_count": 0.0, + "step": 4482, + "text_loss": 0.189764603972435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0006657967024162459, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7232906.0, + "repeat_count": 0.0, + "routers_loss": 0.003044391982257366, + "skip_count": 0.0, + "step": 4484, + "text_loss": 0.4239847660064697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006655046658540179, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 7235996.0, + "repeat_count": 0.0, + "routers_loss": 0.00602696230635047, + "skip_count": 2.0, + "step": 4486, + "text_loss": 0.217103973031044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0169677734375, + "learning_rate": 0.0006652125658700896, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 7238882.0, + "repeat_count": 0.0, + "routers_loss": 0.001470155781134963, + "skip_count": 1.0, + "step": 4488, + "text_loss": 0.6090770363807678 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006649204025763945, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 7241815.0, + "repeat_count": 1.0, + "routers_loss": 0.008737480267882347, + "skip_count": 2.0, + "step": 4490, + "text_loss": 0.48314425349235535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.0006646281760848902, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 7244848.0, + "repeat_count": 0.0, + "routers_loss": 0.0008257135050371289, + "skip_count": 0.0, + "step": 4492, + "text_loss": 0.5884748101234436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006643358865075581, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7247930.0, + "repeat_count": 0.0, + "routers_loss": 0.0016262239078059793, + "skip_count": 0.0, + "step": 4494, + "text_loss": 0.21444730460643768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0006640435339564042, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7251776.0, + "repeat_count": 0.0, + "routers_loss": 0.001315156347118318, + "skip_count": 0.0, + "step": 4496, + "text_loss": 0.6890370845794678 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006637511185434588, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 7255070.0, + "repeat_count": 1.0, + "routers_loss": 0.007614497095346451, + "skip_count": 3.0, + "step": 4498, + "text_loss": 0.516417920589447 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 21.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006634586403807758, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 7258115.0, + "repeat_count": 3.0, + "routers_loss": 0.004906686954200268, + "skip_count": 2.0, + "step": 4500, + "text_loss": 0.577463686466217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.13619019665395, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006631660995804334, + "loss": 0.0067, + "macro_f1": 0.6601307392120361, + "num_tokens": 7260769.0, + "repeat_count": 1.0, + "routers_loss": 0.013337121345102787, + "skip_count": 2.0, + "step": 4502, + "text_loss": 0.37124839425086975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006628734962545339, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7263908.0, + "repeat_count": 0.0, + "routers_loss": 0.0023418180644512177, + "skip_count": 0.0, + "step": 4504, + "text_loss": 0.17937727272510529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0006625808305152033, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7267391.0, + "repeat_count": 0.0, + "routers_loss": 0.0006556165171787143, + "skip_count": 0.0, + "step": 4506, + "text_loss": 0.45344987511634827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006622881024745919, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 7271402.0, + "repeat_count": 0.0, + "routers_loss": 0.0021988123189657927, + "skip_count": 0.0, + "step": 4508, + "text_loss": 0.5842905640602112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006619953122448734, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 7274354.0, + "repeat_count": 0.0, + "routers_loss": 0.00774174090474844, + "skip_count": 2.0, + "step": 4510, + "text_loss": 0.27159228920936584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0006617024599382456, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7277378.0, + "repeat_count": 0.0, + "routers_loss": 0.0006942499312572181, + "skip_count": 0.0, + "step": 4512, + "text_loss": 0.4464176297187805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0006614095456669302, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7280526.0, + "repeat_count": 0.0, + "routers_loss": 0.003003394464030862, + "skip_count": 0.0, + "step": 4514, + "text_loss": 0.31188079714775085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006611165695431725, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7283916.0, + "repeat_count": 0.0, + "routers_loss": 0.0006948060472495854, + "skip_count": 0.0, + "step": 4516, + "text_loss": 0.5266574025154114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006608235316792413, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7286843.0, + "repeat_count": 0.0, + "routers_loss": 0.0014080886030569673, + "skip_count": 0.0, + "step": 4518, + "text_loss": 0.5880120396614075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006605304321874295, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7289940.0, + "repeat_count": 0.0, + "routers_loss": 0.0016894340515136719, + "skip_count": 0.0, + "step": 4520, + "text_loss": 0.6623797416687012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006602372711800531, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7292869.0, + "repeat_count": 0.0, + "routers_loss": 0.003522444050759077, + "skip_count": 0.0, + "step": 4522, + "text_loss": 0.5488807559013367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006599440487694521, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7296618.0, + "repeat_count": 0.0, + "routers_loss": 0.0011981099378317595, + "skip_count": 0.0, + "step": 4524, + "text_loss": 0.4128517210483551 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.248899324919282, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00065965076506799, + "loss": 0.0047, + "macro_f1": 0.9262410998344421, + "num_tokens": 7300481.0, + "repeat_count": 3.0, + "routers_loss": 0.010548194870352745, + "skip_count": 2.0, + "step": 4526, + "text_loss": 0.26450902223587036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006593574201880536, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7303272.0, + "repeat_count": 0.0, + "routers_loss": 0.005642973352223635, + "skip_count": 1.0, + "step": 4528, + "text_loss": 0.35269856452941895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000659064014242053, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 7306615.0, + "repeat_count": 0.0, + "routers_loss": 0.004171932581812143, + "skip_count": 1.0, + "step": 4530, + "text_loss": 0.18814080953598022 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006587705473424223, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 7310368.0, + "repeat_count": 0.0, + "routers_loss": 0.002289367141202092, + "skip_count": 2.0, + "step": 4532, + "text_loss": 0.7363705635070801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000658477019601618, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 7313788.0, + "repeat_count": 0.0, + "routers_loss": 0.004440625663846731, + "skip_count": 1.0, + "step": 4534, + "text_loss": 0.8126176595687866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006581834311321211, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 7317864.0, + "repeat_count": 0.0, + "routers_loss": 0.0013160990783944726, + "skip_count": 2.0, + "step": 4536, + "text_loss": 0.7015916109085083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.000657889782046435, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7320693.0, + "repeat_count": 0.0, + "routers_loss": 0.0032275544945150614, + "skip_count": 2.0, + "step": 4538, + "text_loss": 0.6481677293777466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.314646316407398, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0006575960724570865, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 7324335.0, + "repeat_count": 0.0, + "routers_loss": 0.009769129566848278, + "skip_count": 1.0, + "step": 4540, + "text_loss": 0.22194676101207733 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006573023024766258, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 7327431.0, + "repeat_count": 2.0, + "routers_loss": 0.0036973082460463047, + "skip_count": 4.0, + "step": 4542, + "text_loss": 0.475127637386322 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.000657008472217626, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7330262.0, + "repeat_count": 0.0, + "routers_loss": 0.0007046440150588751, + "skip_count": 0.0, + "step": 4544, + "text_loss": 0.2649917006492615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0006567145817926836, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7333110.0, + "repeat_count": 0.0, + "routers_loss": 0.0026714997366070747, + "skip_count": 0.0, + "step": 4546, + "text_loss": 0.5490524768829346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0006564206313144175, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7336101.0, + "repeat_count": 0.0, + "routers_loss": 0.006552211008965969, + "skip_count": 0.0, + "step": 4548, + "text_loss": 0.14098678529262543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006561266208954707, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 7339435.0, + "repeat_count": 0.0, + "routers_loss": 0.0035560601390898228, + "skip_count": 2.0, + "step": 4550, + "text_loss": 0.20412275195121765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006558325506485081, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7342609.0, + "repeat_count": 0.0, + "routers_loss": 0.0020106974989175797, + "skip_count": 1.0, + "step": 4552, + "text_loss": 0.6184256076812744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0006555384206862183, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 7345614.0, + "repeat_count": 0.0, + "routers_loss": 0.0014235252747312188, + "skip_count": 0.0, + "step": 4554, + "text_loss": 1.0108838081359863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.389785735250953, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0006552442311213121, + "loss": 0.0041, + "macro_f1": 0.3272727429866791, + "num_tokens": 7348957.0, + "repeat_count": 1.0, + "routers_loss": 0.01703745685517788, + "skip_count": 0.0, + "step": 4556, + "text_loss": 0.21315747499465942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 21.399178162606397, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006549499820665237, + "loss": 0.0077, + "macro_f1": 0.5934640765190125, + "num_tokens": 7352724.0, + "repeat_count": 0.0, + "routers_loss": 0.013315381482243538, + "skip_count": 3.0, + "step": 4558, + "text_loss": 0.34369465708732605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00065465567363461, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7356592.0, + "repeat_count": 0.0, + "routers_loss": 0.0017354936571791768, + "skip_count": 0.0, + "step": 4560, + "text_loss": 0.6267461180686951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0006543613059383503, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7359774.0, + "repeat_count": 0.0, + "routers_loss": 0.011646085418760777, + "skip_count": 2.0, + "step": 4562, + "text_loss": 0.4400193989276886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006540668790905471, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7362765.0, + "repeat_count": 0.0, + "routers_loss": 0.0019345436012372375, + "skip_count": 0.0, + "step": 4564, + "text_loss": 0.49204275012016296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006537723932040251, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7366337.0, + "repeat_count": 0.0, + "routers_loss": 0.00562885170802474, + "skip_count": 1.0, + "step": 4566, + "text_loss": 0.22566382586956024 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006534778483916319, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 7369851.0, + "repeat_count": 2.0, + "routers_loss": 0.005508176051080227, + "skip_count": 2.0, + "step": 4568, + "text_loss": 0.8057850003242493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006531832447662377, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7373918.0, + "repeat_count": 0.0, + "routers_loss": 0.006460923235863447, + "skip_count": 2.0, + "step": 4570, + "text_loss": 0.5141497254371643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006528885824407351, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7376674.0, + "repeat_count": 0.0, + "routers_loss": 0.0032120654359459877, + "skip_count": 0.0, + "step": 4572, + "text_loss": 0.1281338930130005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0006525938615280394, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 7379791.0, + "repeat_count": 0.0, + "routers_loss": 0.00443810923025012, + "skip_count": 0.0, + "step": 4574, + "text_loss": 0.268352210521698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.000652299082141088, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 7382886.0, + "repeat_count": 0.0, + "routers_loss": 0.008284369483590126, + "skip_count": 2.0, + "step": 4576, + "text_loss": 0.30193832516670227 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.493102436160846, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006520042443928411, + "loss": 0.0068, + "macro_f1": 0.8823530077934265, + "num_tokens": 7386036.0, + "repeat_count": 2.0, + "routers_loss": 0.03383317217230797, + "skip_count": 1.0, + "step": 4578, + "text_loss": 0.23106542229652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.000651709348396281, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7388908.0, + "repeat_count": 0.0, + "routers_loss": 0.0017075951909646392, + "skip_count": 1.0, + "step": 4580, + "text_loss": 0.386099249124527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006514143942644124, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7392004.0, + "repeat_count": 0.0, + "routers_loss": 0.009516917169094086, + "skip_count": 1.0, + "step": 4582, + "text_loss": 0.3162059485912323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0006511193821102623, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 7395538.0, + "repeat_count": 0.0, + "routers_loss": 0.0031392278615385294, + "skip_count": 0.0, + "step": 4584, + "text_loss": 0.5536221861839294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006508243120468799, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7398461.0, + "repeat_count": 0.0, + "routers_loss": 0.0014138511614874005, + "skip_count": 0.0, + "step": 4586, + "text_loss": 0.7934318780899048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006505291841873367, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7401611.0, + "repeat_count": 0.0, + "routers_loss": 0.0005265916115604341, + "skip_count": 0.0, + "step": 4588, + "text_loss": 0.4569905698299408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.000650233998644726, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7404641.0, + "repeat_count": 0.0, + "routers_loss": 0.0024988956283777952, + "skip_count": 0.0, + "step": 4590, + "text_loss": 0.49998772144317627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0006499387555321636, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7407574.0, + "repeat_count": 0.0, + "routers_loss": 0.004110113717615604, + "skip_count": 1.0, + "step": 4592, + "text_loss": 0.5679413676261902 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006496434549627874, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7410806.0, + "repeat_count": 0.0, + "routers_loss": 0.0032845588866621256, + "skip_count": 0.0, + "step": 4594, + "text_loss": 0.35515281558036804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006493480970497568, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7413402.0, + "repeat_count": 0.0, + "routers_loss": 0.010577172972261906, + "skip_count": 1.0, + "step": 4596, + "text_loss": 0.26111698150634766 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006490526819062537, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 7417236.0, + "repeat_count": 1.0, + "routers_loss": 0.002054794691503048, + "skip_count": 2.0, + "step": 4598, + "text_loss": 0.6480993628501892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0006487572096454818, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7420278.0, + "repeat_count": 0.0, + "routers_loss": 0.0017989084590226412, + "skip_count": 0.0, + "step": 4600, + "text_loss": 0.4935401678085327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006484616803806665, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7423866.0, + "repeat_count": 0.0, + "routers_loss": 0.006671485956758261, + "skip_count": 1.0, + "step": 4602, + "text_loss": 0.15030258893966675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0006481660942250552, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7426884.0, + "repeat_count": 0.0, + "routers_loss": 0.008334980346262455, + "skip_count": 3.0, + "step": 4604, + "text_loss": 0.29933279752731323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006478704512919173, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 7431017.0, + "repeat_count": 0.0, + "routers_loss": 0.011923984624445438, + "skip_count": 3.0, + "step": 4606, + "text_loss": 0.35141825675964355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0006475747516945432, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7434406.0, + "repeat_count": 0.0, + "routers_loss": 0.0031092462595552206, + "skip_count": 3.0, + "step": 4608, + "text_loss": 0.21021464467048645 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.000647278995546246, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7437204.0, + "repeat_count": 1.0, + "routers_loss": 0.0006713552866131067, + "skip_count": 0.0, + "step": 4610, + "text_loss": 0.4052635431289673 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006469831829603598, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7439741.0, + "repeat_count": 0.0, + "routers_loss": 0.0022583482787013054, + "skip_count": 2.0, + "step": 4612, + "text_loss": 0.5443860292434692 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006466873140502407, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7443619.0, + "repeat_count": 0.0, + "routers_loss": 0.004187075886875391, + "skip_count": 2.0, + "step": 4614, + "text_loss": 0.30709847807884216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006463913889292661, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7446696.0, + "repeat_count": 0.0, + "routers_loss": 0.008314833045005798, + "skip_count": 0.0, + "step": 4616, + "text_loss": 0.22949637472629547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006460954077108353, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7450377.0, + "repeat_count": 0.0, + "routers_loss": 0.001277514616958797, + "skip_count": 0.0, + "step": 4618, + "text_loss": 0.37715134024620056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006457993705083684, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 7453271.0, + "repeat_count": 0.0, + "routers_loss": 0.0022756033577024937, + "skip_count": 2.0, + "step": 4620, + "text_loss": 0.7373883128166199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0006455032774353078, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7456492.0, + "repeat_count": 0.0, + "routers_loss": 0.0039057908579707146, + "skip_count": 2.0, + "step": 4622, + "text_loss": 0.5058769583702087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0006452071286051169, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 7459619.0, + "repeat_count": 0.0, + "routers_loss": 0.0019458672031760216, + "skip_count": 0.0, + "step": 4624, + "text_loss": 0.5110082030296326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0006449109241312802, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 7462552.0, + "repeat_count": 0.0, + "routers_loss": 0.0002716891176532954, + "skip_count": 1.0, + "step": 4626, + "text_loss": 0.6197522878646851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006446146641273042, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7466769.0, + "repeat_count": 0.0, + "routers_loss": 0.0037578947376459837, + "skip_count": 2.0, + "step": 4628, + "text_loss": 0.1653924286365509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.000644318348706716, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7470216.0, + "repeat_count": 0.0, + "routers_loss": 0.0012791058979928493, + "skip_count": 0.0, + "step": 4630, + "text_loss": 0.7114694118499756 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0006440219779830643, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 7472975.0, + "repeat_count": 0.0, + "routers_loss": 0.00736592011526227, + "skip_count": 2.0, + "step": 4632, + "text_loss": 0.26601463556289673 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000643725552069919, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7475672.0, + "repeat_count": 0.0, + "routers_loss": 0.00045455715735442936, + "skip_count": 0.0, + "step": 4634, + "text_loss": 0.5028402805328369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0006434290710808711, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7478850.0, + "repeat_count": 0.0, + "routers_loss": 0.004247233271598816, + "skip_count": 2.0, + "step": 4636, + "text_loss": 0.12746070325374603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 21.774875256824185, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04052734375, + "learning_rate": 0.0006431325351295324, + "loss": 0.0083, + "macro_f1": 0.5427350401878357, + "num_tokens": 7481747.0, + "repeat_count": 1.0, + "routers_loss": 0.047564394772052765, + "skip_count": 2.0, + "step": 4638, + "text_loss": 0.24056802690029144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0006428359443295362, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7484885.0, + "repeat_count": 0.0, + "routers_loss": 0.0011175100225955248, + "skip_count": 0.0, + "step": 4640, + "text_loss": 0.6265338063240051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 21.793660111535075, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006425392987945369, + "loss": 0.0086, + "macro_f1": 0.5492662787437439, + "num_tokens": 7487973.0, + "repeat_count": 0.0, + "routers_loss": 0.016879938542842865, + "skip_count": 2.0, + "step": 4642, + "text_loss": 0.2523447275161743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 21.80305253889052, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.032958984375, + "learning_rate": 0.0006422425986382093, + "loss": 0.0055, + "macro_f1": 0.5934640765190125, + "num_tokens": 7491024.0, + "repeat_count": 0.0, + "routers_loss": 0.018616504967212677, + "skip_count": 3.0, + "step": 4644, + "text_loss": 0.38890624046325684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.812444966245963, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0006419458439742496, + "loss": 0.0056, + "macro_f1": 0.3272727429866791, + "num_tokens": 7494199.0, + "repeat_count": 0.0, + "routers_loss": 0.023129139095544815, + "skip_count": 1.0, + "step": 4646, + "text_loss": 0.4060848355293274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006416490349163747, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 7497287.0, + "repeat_count": 0.0, + "routers_loss": 0.0018601802876219153, + "skip_count": 0.0, + "step": 4648, + "text_loss": 0.3387545943260193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006413521715783225, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 7500598.0, + "repeat_count": 0.0, + "routers_loss": 0.0017482215771451592, + "skip_count": 0.0, + "step": 4650, + "text_loss": 0.4290996193885803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.840622248312297, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0006410552540738514, + "loss": 0.007, + "macro_f1": 0.3272727429866791, + "num_tokens": 7503252.0, + "repeat_count": 1.0, + "routers_loss": 0.0420118011534214, + "skip_count": 0.0, + "step": 4652, + "text_loss": 0.439496248960495 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.000640758282516741, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 7506382.0, + "repeat_count": 1.0, + "routers_loss": 0.0017782216891646385, + "skip_count": 1.0, + "step": 4654, + "text_loss": 0.8513308167457581 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006404612570207911, + "loss": 0.0102, + "macro_f1": 0.3272727429866791, + "num_tokens": 7510423.0, + "repeat_count": 0.0, + "routers_loss": 0.010385853238403797, + "skip_count": 0.0, + "step": 4656, + "text_loss": 0.7159742712974548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006401641776998223, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7513394.0, + "repeat_count": 0.0, + "routers_loss": 0.0011917101219296455, + "skip_count": 0.0, + "step": 4658, + "text_loss": 0.6165401339530945 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006398670446676766, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7516828.0, + "repeat_count": 3.0, + "routers_loss": 0.008860073052346706, + "skip_count": 4.0, + "step": 4660, + "text_loss": 0.923275887966156 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0006395698580382153, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7519764.0, + "repeat_count": 0.0, + "routers_loss": 0.000505418807733804, + "skip_count": 0.0, + "step": 4662, + "text_loss": 0.6143050789833069 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006392726179253212, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7522390.0, + "repeat_count": 0.0, + "routers_loss": 0.004020806401968002, + "skip_count": 1.0, + "step": 4664, + "text_loss": 0.6935067176818848 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0006389753244428972, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 7525821.0, + "repeat_count": 1.0, + "routers_loss": 0.00957963801920414, + "skip_count": 2.0, + "step": 4666, + "text_loss": 0.3350338637828827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.915761667155856, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0006386779777048666, + "loss": 0.0063, + "macro_f1": 0.6601307392120361, + "num_tokens": 7529513.0, + "repeat_count": 1.0, + "routers_loss": 0.020673364400863647, + "skip_count": 2.0, + "step": 4668, + "text_loss": 0.47800472378730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006383805778251735, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7533450.0, + "repeat_count": 0.0, + "routers_loss": 0.007217096630483866, + "skip_count": 1.0, + "step": 4670, + "text_loss": 0.4506106972694397 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006380831249177817, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 7536287.0, + "repeat_count": 1.0, + "routers_loss": 0.007001714315265417, + "skip_count": 0.0, + "step": 4672, + "text_loss": 0.4081715941429138 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0006377856190966762, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 7539442.0, + "repeat_count": 0.0, + "routers_loss": 0.0015112817054614425, + "skip_count": 0.0, + "step": 4674, + "text_loss": 0.21451139450073242 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0006374880604758615, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 7542594.0, + "repeat_count": 0.0, + "routers_loss": 0.007311929017305374, + "skip_count": 2.0, + "step": 4676, + "text_loss": 0.14785248041152954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006371904491693626, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7545780.0, + "repeat_count": 0.0, + "routers_loss": 0.007489737123250961, + "skip_count": 1.0, + "step": 4678, + "text_loss": 0.2248108983039856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006368927852912247, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 7548287.0, + "repeat_count": 1.0, + "routers_loss": 0.009772555902600288, + "skip_count": 1.0, + "step": 4680, + "text_loss": 0.1566995233297348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006365950689555133, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7551424.0, + "repeat_count": 0.0, + "routers_loss": 0.002134992741048336, + "skip_count": 0.0, + "step": 4682, + "text_loss": 0.7322417497634888 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006362973002763139, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7554182.0, + "repeat_count": 1.0, + "routers_loss": 0.008511497639119625, + "skip_count": 4.0, + "step": 4684, + "text_loss": 0.24387991428375244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0006359994793677319, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 7557044.0, + "repeat_count": 0.0, + "routers_loss": 0.004151526838541031, + "skip_count": 2.0, + "step": 4686, + "text_loss": 0.6139411330223083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006357016063438928, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7560231.0, + "repeat_count": 0.0, + "routers_loss": 0.0009724601986818016, + "skip_count": 0.0, + "step": 4688, + "text_loss": 0.7875718474388123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0006354036813189421, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7562953.0, + "repeat_count": 0.0, + "routers_loss": 0.0008926765876822174, + "skip_count": 0.0, + "step": 4690, + "text_loss": 0.5195512771606445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006351057044070455, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 7566137.0, + "repeat_count": 0.0, + "routers_loss": 0.0031294538639485836, + "skip_count": 0.0, + "step": 4692, + "text_loss": 0.7288873195648193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0006348076757223877, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 7569073.0, + "repeat_count": 0.0, + "routers_loss": 0.0015065820189192891, + "skip_count": 2.0, + "step": 4694, + "text_loss": 0.7242236137390137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0006345095953791746, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7573025.0, + "repeat_count": 0.0, + "routers_loss": 0.0005603441968560219, + "skip_count": 0.0, + "step": 4696, + "text_loss": 0.34443899989128113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0006342114634916307, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7576546.0, + "repeat_count": 0.0, + "routers_loss": 0.0011047758162021637, + "skip_count": 0.0, + "step": 4698, + "text_loss": 0.4892682731151581 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0006339132801740008, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 7580711.0, + "repeat_count": 0.0, + "routers_loss": 0.0019803126342594624, + "skip_count": 2.0, + "step": 4700, + "text_loss": 0.4479489028453827 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0006336150455405494, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 7583385.0, + "repeat_count": 1.0, + "routers_loss": 0.0005326359532773495, + "skip_count": 0.0, + "step": 4702, + "text_loss": 0.627504825592041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006333167597055604, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 7586584.0, + "repeat_count": 0.0, + "routers_loss": 0.0005587987834587693, + "skip_count": 0.0, + "step": 4704, + "text_loss": 0.43891432881355286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0006330184227833376, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 7590408.0, + "repeat_count": 0.0, + "routers_loss": 0.007053783163428307, + "skip_count": 2.0, + "step": 4706, + "text_loss": 0.19946859776973724 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006327200348882043, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7593857.0, + "repeat_count": 1.0, + "routers_loss": 0.0009479080326855183, + "skip_count": 0.0, + "step": 4708, + "text_loss": 0.7973214387893677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006324215961345032, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7596429.0, + "repeat_count": 0.0, + "routers_loss": 0.0012403312139213085, + "skip_count": 0.0, + "step": 4710, + "text_loss": 0.48477989435195923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006321231066365966, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7599618.0, + "repeat_count": 0.0, + "routers_loss": 0.0005520360427908599, + "skip_count": 0.0, + "step": 4712, + "text_loss": 0.44222453236579895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006318245665088665, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 7603180.0, + "repeat_count": 0.0, + "routers_loss": 0.0015553623670712113, + "skip_count": 0.0, + "step": 4714, + "text_loss": 0.5132410526275635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0006315259758657138, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 7606457.0, + "repeat_count": 0.0, + "routers_loss": 0.004210884217172861, + "skip_count": 1.0, + "step": 4716, + "text_loss": 0.39850690960884094 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0006312273348215589, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 7609317.0, + "repeat_count": 1.0, + "routers_loss": 0.001220117206685245, + "skip_count": 0.0, + "step": 4718, + "text_loss": 0.3509018123149872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006309286434908419, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 7613076.0, + "repeat_count": 0.0, + "routers_loss": 0.007768960203975439, + "skip_count": 2.0, + "step": 4720, + "text_loss": 0.33361560106277466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006306299019880217, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7616242.0, + "repeat_count": 0.0, + "routers_loss": 0.006226699333637953, + "skip_count": 0.0, + "step": 4722, + "text_loss": 0.23661087453365326 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.17845611975345, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006303311104275766, + "loss": 0.0073, + "macro_f1": 0.6603773832321167, + "num_tokens": 7619069.0, + "repeat_count": 1.0, + "routers_loss": 0.015590761788189411, + "skip_count": 1.0, + "step": 4724, + "text_loss": 0.23373056948184967 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006300322689240041, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 7622581.0, + "repeat_count": 1.0, + "routers_loss": 0.006862971931695938, + "skip_count": 2.0, + "step": 4726, + "text_loss": 0.8301828503608704 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0006297333775918209, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 7625566.0, + "repeat_count": 1.0, + "routers_loss": 0.006256614346057177, + "skip_count": 1.0, + "step": 4728, + "text_loss": 0.3756707012653351 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0006294344365455626, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 7629047.0, + "repeat_count": 1.0, + "routers_loss": 0.009151885285973549, + "skip_count": 2.0, + "step": 4730, + "text_loss": 0.33362850546836853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006291354458997841, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7631847.0, + "repeat_count": 0.0, + "routers_loss": 0.0009307434665970504, + "skip_count": 0.0, + "step": 4732, + "text_loss": 0.4572524130344391 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0006288364057690591, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7635181.0, + "repeat_count": 0.0, + "routers_loss": 0.00041220212006010115, + "skip_count": 0.0, + "step": 4734, + "text_loss": 0.40211325883865356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0006285373162679804, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7637752.0, + "repeat_count": 0.0, + "routers_loss": 0.0006696670898236334, + "skip_count": 2.0, + "step": 4736, + "text_loss": 0.7588053345680237 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 22.24420311124156, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006282381775111597, + "loss": 0.0081, + "macro_f1": 0.9449735879898071, + "num_tokens": 7640719.0, + "repeat_count": 4.0, + "routers_loss": 0.016283133998513222, + "skip_count": 2.0, + "step": 4738, + "text_loss": 0.5697863101959229 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0006279389896132274, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7643524.0, + "repeat_count": 0.0, + "routers_loss": 0.00763951288536191, + "skip_count": 3.0, + "step": 4740, + "text_loss": 0.548592209815979 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.26298796595245, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006276397526888329, + "loss": 0.0094, + "macro_f1": 0.925203263759613, + "num_tokens": 7646919.0, + "repeat_count": 3.0, + "routers_loss": 0.038590483367443085, + "skip_count": 5.0, + "step": 4742, + "text_loss": 0.27226054668426514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0006273404668526443, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7650404.0, + "repeat_count": 0.0, + "routers_loss": 0.0012555639259517193, + "skip_count": 0.0, + "step": 4744, + "text_loss": 0.47892290353775024 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0006270411322193488, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7652942.0, + "repeat_count": 1.0, + "routers_loss": 0.0015356402145698667, + "skip_count": 0.0, + "step": 4746, + "text_loss": 0.5515767931938171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0006267417489036517, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7656269.0, + "repeat_count": 0.0, + "routers_loss": 0.005182140972465277, + "skip_count": 0.0, + "step": 4748, + "text_loss": 0.3496028184890747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0006264423170202773, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7658664.0, + "repeat_count": 0.0, + "routers_loss": 0.004144361708313227, + "skip_count": 0.0, + "step": 4750, + "text_loss": 0.2786032557487488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0006261428366839685, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7661471.0, + "repeat_count": 0.0, + "routers_loss": 0.00035335420398041606, + "skip_count": 0.0, + "step": 4752, + "text_loss": 0.4838487505912781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0006258433080094868, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7664593.0, + "repeat_count": 0.0, + "routers_loss": 0.0103341368958354, + "skip_count": 2.0, + "step": 4754, + "text_loss": 0.24325360357761383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0006255437311116119, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 7667573.0, + "repeat_count": 0.0, + "routers_loss": 0.014633853919804096, + "skip_count": 2.0, + "step": 4756, + "text_loss": 0.21569855511188507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0006252441061051426, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7671171.0, + "repeat_count": 0.0, + "routers_loss": 0.004900569561868906, + "skip_count": 0.0, + "step": 4758, + "text_loss": 0.12832018733024597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006249444331048955, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 7673932.0, + "repeat_count": 0.0, + "routers_loss": 0.0020371589343994856, + "skip_count": 0.0, + "step": 4760, + "text_loss": 0.38652482628822327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.000624644712225706, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7677396.0, + "repeat_count": 0.0, + "routers_loss": 0.0028059002943336964, + "skip_count": 2.0, + "step": 4762, + "text_loss": 0.7937633395195007 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0006243449435824276, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7680392.0, + "repeat_count": 0.0, + "routers_loss": 0.0007225095760077238, + "skip_count": 0.0, + "step": 4764, + "text_loss": 0.5690395832061768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006240451272899321, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7684121.0, + "repeat_count": 0.0, + "routers_loss": 0.002052050782367587, + "skip_count": 1.0, + "step": 4766, + "text_loss": 0.5321336984634399 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006237452634631099, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 7687236.0, + "repeat_count": 1.0, + "routers_loss": 0.0039039517287164927, + "skip_count": 0.0, + "step": 4768, + "text_loss": 0.30823320150375366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 22.394481948928675, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0006234453522168694, + "loss": 0.0084, + "macro_f1": 0.5492662787437439, + "num_tokens": 7690355.0, + "repeat_count": 0.0, + "routers_loss": 0.014570238068699837, + "skip_count": 2.0, + "step": 4770, + "text_loss": 0.21501587331295013 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 22.403874376284122, + "f1_execute": 0.949999988079071, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.04541015625, + "learning_rate": 0.000623145393666137, + "loss": 0.0069, + "macro_f1": 0.886363685131073, + "num_tokens": 7693559.0, + "repeat_count": 3.0, + "routers_loss": 0.061707716435194016, + "skip_count": 6.0, + "step": 4772, + "text_loss": 0.24371100962162018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006228453879258576, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 7696422.0, + "repeat_count": 0.0, + "routers_loss": 0.005053870379924774, + "skip_count": 2.0, + "step": 4774, + "text_loss": 0.237778440117836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0006225453351109934, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 7700460.0, + "repeat_count": 0.0, + "routers_loss": 0.0017990898340940475, + "skip_count": 0.0, + "step": 4776, + "text_loss": 0.612456738948822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.000622245235336526, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7703330.0, + "repeat_count": 0.0, + "routers_loss": 0.004507021512836218, + "skip_count": 2.0, + "step": 4778, + "text_loss": 0.36898812651634216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006219450887174537, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7707243.0, + "repeat_count": 0.0, + "routers_loss": 0.006295828148722649, + "skip_count": 1.0, + "step": 4780, + "text_loss": 0.14474599063396454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006216448953687932, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7711121.0, + "repeat_count": 0.0, + "routers_loss": 0.005049831233918667, + "skip_count": 0.0, + "step": 4782, + "text_loss": 0.4696790277957916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006213446554055795, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7714889.0, + "repeat_count": 0.0, + "routers_loss": 0.0006010758224874735, + "skip_count": 0.0, + "step": 4784, + "text_loss": 0.46253830194473267 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 22.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006210443689428649, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 7718420.0, + "repeat_count": 3.0, + "routers_loss": 0.006691234186291695, + "skip_count": 1.0, + "step": 4786, + "text_loss": 0.579987645149231 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00062074403609572, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7721720.0, + "repeat_count": 0.0, + "routers_loss": 0.001864895923063159, + "skip_count": 0.0, + "step": 4788, + "text_loss": 0.325242817401886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0006204436569792324, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 7724916.0, + "repeat_count": 0.0, + "routers_loss": 0.00202955212444067, + "skip_count": 0.0, + "step": 4790, + "text_loss": 0.49637556076049805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006201432317085083, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 7728081.0, + "repeat_count": 1.0, + "routers_loss": 0.0037843603640794754, + "skip_count": 0.0, + "step": 4792, + "text_loss": 0.38812628388404846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0006198427603986711, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7731457.0, + "repeat_count": 0.0, + "routers_loss": 0.012036679312586784, + "skip_count": 3.0, + "step": 4794, + "text_loss": 0.2996312379837036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0006195422431648623, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 7734595.0, + "repeat_count": 0.0, + "routers_loss": 0.0008874868508428335, + "skip_count": 1.0, + "step": 4796, + "text_loss": 0.3203189969062805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0006192416801222403, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 7737565.0, + "repeat_count": 1.0, + "routers_loss": 0.0032894534524530172, + "skip_count": 1.0, + "step": 4798, + "text_loss": 0.3283322751522064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0006189410713859815, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 7740439.0, + "repeat_count": 0.0, + "routers_loss": 0.009667043574154377, + "skip_count": 2.0, + "step": 4800, + "text_loss": 0.25219282507896423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 22.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006186404170712797, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 7743813.0, + "repeat_count": 0.0, + "routers_loss": 0.012643060646951199, + "skip_count": 4.0, + "step": 4802, + "text_loss": 0.22567439079284668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006183397172933462, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7747182.0, + "repeat_count": 0.0, + "routers_loss": 0.002678517485037446, + "skip_count": 0.0, + "step": 4804, + "text_loss": 0.19188879430294037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0006180389721674101, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 7750735.0, + "repeat_count": 0.0, + "routers_loss": 0.0013385121710598469, + "skip_count": 0.0, + "step": 4806, + "text_loss": 0.5860441327095032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000617738181808717, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7753843.0, + "repeat_count": 0.0, + "routers_loss": 0.0034869094379246235, + "skip_count": 1.0, + "step": 4808, + "text_loss": 0.4366260766983032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0006174373463325306, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7757039.0, + "repeat_count": 0.0, + "routers_loss": 0.0013648992171511054, + "skip_count": 0.0, + "step": 4810, + "text_loss": 0.5217258334159851 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0006171364658541314, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 7760016.0, + "repeat_count": 1.0, + "routers_loss": 0.0038017008919268847, + "skip_count": 2.0, + "step": 4812, + "text_loss": 0.8130963444709778 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0006168355404888177, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 7762961.0, + "repeat_count": 0.0, + "routers_loss": 0.006867518648505211, + "skip_count": 2.0, + "step": 4814, + "text_loss": 0.17822521924972534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006165345703519043, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7766399.0, + "repeat_count": 0.0, + "routers_loss": 0.0004653502255678177, + "skip_count": 0.0, + "step": 4816, + "text_loss": 0.5316070914268494 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006162335555587238, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 7769039.0, + "repeat_count": 1.0, + "routers_loss": 0.0016906452365219593, + "skip_count": 1.0, + "step": 4818, + "text_loss": 0.5680997967720032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0006159324962246257, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7772768.0, + "repeat_count": 0.0, + "routers_loss": 0.002541248919442296, + "skip_count": 0.0, + "step": 4820, + "text_loss": 0.6169226169586182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006156313924649762, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7775545.0, + "repeat_count": 0.0, + "routers_loss": 0.008644679561257362, + "skip_count": 2.0, + "step": 4822, + "text_loss": 0.2211475968360901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0006153302443951589, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7778837.0, + "repeat_count": 0.0, + "routers_loss": 0.0041346061043441296, + "skip_count": 2.0, + "step": 4824, + "text_loss": 0.5369775891304016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0006150290521305746, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 7782309.0, + "repeat_count": 0.0, + "routers_loss": 0.0012756052892655134, + "skip_count": 0.0, + "step": 4826, + "text_loss": 0.5294989943504333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.666862342236573, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006147278157866403, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 7785565.0, + "repeat_count": 0.0, + "routers_loss": 0.029718991369009018, + "skip_count": 1.0, + "step": 4828, + "text_loss": 0.6920449733734131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006144265354787906, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7788218.0, + "repeat_count": 0.0, + "routers_loss": 0.004829924553632736, + "skip_count": 0.0, + "step": 4830, + "text_loss": 0.17072243988513947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0006141252113224767, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7790788.0, + "repeat_count": 0.0, + "routers_loss": 0.00254037044942379, + "skip_count": 0.0, + "step": 4832, + "text_loss": 0.20075996220111847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01519775390625, + "learning_rate": 0.0006138238434331666, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7793913.0, + "repeat_count": 0.0, + "routers_loss": 0.0004426188243087381, + "skip_count": 0.0, + "step": 4834, + "text_loss": 0.695742130279541 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.000613522431926345, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 7796932.0, + "repeat_count": 1.0, + "routers_loss": 0.005176798906177282, + "skip_count": 3.0, + "step": 4836, + "text_loss": 0.4910822808742523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0006132209769175132, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7800686.0, + "repeat_count": 0.0, + "routers_loss": 0.004120545461773872, + "skip_count": 0.0, + "step": 4838, + "text_loss": 0.3701378405094147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0006129194785221894, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7804765.0, + "repeat_count": 0.0, + "routers_loss": 0.0043835826218128204, + "skip_count": 0.0, + "step": 4840, + "text_loss": 0.343635618686676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006126179368559086, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 7807498.0, + "repeat_count": 0.0, + "routers_loss": 0.001394893741235137, + "skip_count": 1.0, + "step": 4842, + "text_loss": 0.47756674885749817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.000612316352034222, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7810784.0, + "repeat_count": 0.0, + "routers_loss": 0.0031262130942195654, + "skip_count": 2.0, + "step": 4844, + "text_loss": 0.13077901303768158 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.751394188435572, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0006120147241726972, + "loss": 0.0081, + "macro_f1": 0.8823530077934265, + "num_tokens": 7814754.0, + "repeat_count": 2.0, + "routers_loss": 0.016139274463057518, + "skip_count": 1.0, + "step": 4846, + "text_loss": 0.18850074708461761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0006117130533869189, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7818245.0, + "repeat_count": 0.0, + "routers_loss": 0.0009124451316893101, + "skip_count": 0.0, + "step": 4848, + "text_loss": 0.42503559589385986 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006114113397924878, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7822214.0, + "repeat_count": 0.0, + "routers_loss": 0.0015132242115214467, + "skip_count": 0.0, + "step": 4850, + "text_loss": 0.16767354309558868 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006111095835050212, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 7825019.0, + "repeat_count": 2.0, + "routers_loss": 0.006253300234675407, + "skip_count": 2.0, + "step": 4852, + "text_loss": 0.44826745986938477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0006108077846401524, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 7828113.0, + "repeat_count": 0.0, + "routers_loss": 0.0024391328915953636, + "skip_count": 0.0, + "step": 4854, + "text_loss": 0.2009880244731903 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006105059433135317, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 7831177.0, + "repeat_count": 1.0, + "routers_loss": 0.0020866121631115675, + "skip_count": 1.0, + "step": 4856, + "text_loss": 0.7082528471946716 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0006102040596408251, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 7834485.0, + "repeat_count": 0.0, + "routers_loss": 0.004373365081846714, + "skip_count": 1.0, + "step": 4858, + "text_loss": 0.2541539669036865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006099021337377148, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7837749.0, + "repeat_count": 0.0, + "routers_loss": 0.004309024661779404, + "skip_count": 0.0, + "step": 4860, + "text_loss": 0.3163885176181793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 22.82653360727913, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.049072265625, + "learning_rate": 0.0006096001657198995, + "loss": 0.0065, + "macro_f1": 0.6122449040412903, + "num_tokens": 7840979.0, + "repeat_count": 0.0, + "routers_loss": 0.023044804111123085, + "skip_count": 4.0, + "step": 4862, + "text_loss": 0.49609798192977905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0006092981557030941, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 7844905.0, + "repeat_count": 1.0, + "routers_loss": 0.010683654807507992, + "skip_count": 3.0, + "step": 4864, + "text_loss": 0.16866883635520935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006089961038030291, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 7847800.0, + "repeat_count": 0.0, + "routers_loss": 0.0011224723421037197, + "skip_count": 0.0, + "step": 4866, + "text_loss": 0.5093055367469788 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0006086940101354515, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7850983.0, + "repeat_count": 0.0, + "routers_loss": 0.003944621421396732, + "skip_count": 1.0, + "step": 4868, + "text_loss": 0.5753747224807739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 22.86410331670091, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0006083918748161244, + "loss": 0.0069, + "macro_f1": 0.5492662787437439, + "num_tokens": 7855041.0, + "repeat_count": 0.0, + "routers_loss": 0.02532145567238331, + "skip_count": 2.0, + "step": 4870, + "text_loss": 0.8082366585731506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006080896979608262, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7858058.0, + "repeat_count": 0.0, + "routers_loss": 0.0007558314246125519, + "skip_count": 0.0, + "step": 4872, + "text_loss": 0.6476574540138245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.000607787479685352, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7861223.0, + "repeat_count": 0.0, + "routers_loss": 0.0009224560926668346, + "skip_count": 0.0, + "step": 4874, + "text_loss": 0.5012133717536926 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006074852201055121, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7864180.0, + "repeat_count": 0.0, + "routers_loss": 0.0028308273758739233, + "skip_count": 0.0, + "step": 4876, + "text_loss": 0.7447214722633362 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0006071829193371331, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7866726.0, + "repeat_count": 0.0, + "routers_loss": 0.0021505290642380714, + "skip_count": 0.0, + "step": 4878, + "text_loss": 0.5444929599761963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006068805774960573, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7870166.0, + "repeat_count": 0.0, + "routers_loss": 0.0021109723020344973, + "skip_count": 0.0, + "step": 4880, + "text_loss": 0.3577263355255127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0006065781946981425, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7873028.0, + "repeat_count": 0.0, + "routers_loss": 0.0027144821360707283, + "skip_count": 0.0, + "step": 4882, + "text_loss": 0.28464797139167786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006062757710592624, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7876747.0, + "repeat_count": 0.0, + "routers_loss": 0.0004638207610696554, + "skip_count": 0.0, + "step": 4884, + "text_loss": 0.381534606218338 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006059733066953066, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 7879524.0, + "repeat_count": 1.0, + "routers_loss": 0.002225410658866167, + "skip_count": 2.0, + "step": 4886, + "text_loss": 0.5167883634567261 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006056708017221796, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 7882809.0, + "repeat_count": 0.0, + "routers_loss": 0.00419368501752615, + "skip_count": 1.0, + "step": 4888, + "text_loss": 0.22688335180282593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000605368256255802, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7886310.0, + "repeat_count": 0.0, + "routers_loss": 0.0017340193735435605, + "skip_count": 1.0, + "step": 4890, + "text_loss": 1.0128135681152344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0006050656704121098, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 7889483.0, + "repeat_count": 0.0, + "routers_loss": 0.0016647159354761243, + "skip_count": 0.0, + "step": 4892, + "text_loss": 0.2213262915611267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006047630443070547, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7892615.0, + "repeat_count": 0.0, + "routers_loss": 0.0038971947506070137, + "skip_count": 3.0, + "step": 4894, + "text_loss": 0.45751357078552246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0006044603780566032, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 7895747.0, + "repeat_count": 1.0, + "routers_loss": 0.0036852145567536354, + "skip_count": 1.0, + "step": 4896, + "text_loss": 0.13489919900894165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0006041576717767379, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7899155.0, + "repeat_count": 0.0, + "routers_loss": 0.007661987561732531, + "skip_count": 1.0, + "step": 4898, + "text_loss": 0.281853586435318 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006038549255834563, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7901667.0, + "repeat_count": 2.0, + "routers_loss": 0.01836695335805416, + "skip_count": 5.0, + "step": 4900, + "text_loss": 0.24879895150661469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.000603552139592771, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7904506.0, + "repeat_count": 0.0, + "routers_loss": 0.0011829182039946318, + "skip_count": 0.0, + "step": 4902, + "text_loss": 0.7550268769264221 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 23.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006032493139207106, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7907316.0, + "repeat_count": 1.0, + "routers_loss": 0.0022891140542924404, + "skip_count": 0.0, + "step": 4904, + "text_loss": 0.37596020102500916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0006029464486833186, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 7911283.0, + "repeat_count": 0.0, + "routers_loss": 0.001990227960050106, + "skip_count": 0.0, + "step": 4906, + "text_loss": 0.5879577994346619 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0006026435439966531, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 7913907.0, + "repeat_count": 0.0, + "routers_loss": 0.0026039890944957733, + "skip_count": 1.0, + "step": 4908, + "text_loss": 0.41484713554382324 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0006023405999767879, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7916772.0, + "repeat_count": 0.0, + "routers_loss": 0.009183229878544807, + "skip_count": 1.0, + "step": 4910, + "text_loss": 0.20732562243938446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0006020376167398116, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7919346.0, + "repeat_count": 0.0, + "routers_loss": 0.005508727394044399, + "skip_count": 1.0, + "step": 4912, + "text_loss": 0.41416165232658386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 23.070443205165834, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0006017345944018284, + "loss": 0.0051, + "macro_f1": 0.3272727429866791, + "num_tokens": 7922404.0, + "repeat_count": 0.0, + "routers_loss": 0.008651934564113617, + "skip_count": 0.0, + "step": 4914, + "text_loss": 0.4290519952774048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0006014315330789563, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 7925165.0, + "repeat_count": 0.0, + "routers_loss": 0.003601635340601206, + "skip_count": 1.0, + "step": 4916, + "text_loss": 0.8447931408882141 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006011284328873296, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 7928146.0, + "repeat_count": 1.0, + "routers_loss": 0.0049415635876357555, + "skip_count": 2.0, + "step": 4918, + "text_loss": 0.32237401604652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0006008252939430967, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7931163.0, + "repeat_count": 0.0, + "routers_loss": 0.0024150956887751818, + "skip_count": 0.0, + "step": 4920, + "text_loss": 0.2251713126897812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.108012914587615, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006005221163624209, + "loss": 0.0057, + "macro_f1": 0.3272727429866791, + "num_tokens": 7934084.0, + "repeat_count": 1.0, + "routers_loss": 0.03181030973792076, + "skip_count": 0.0, + "step": 4922, + "text_loss": 0.4962928593158722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.0006002189002614806, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 7937021.0, + "repeat_count": 0.0, + "routers_loss": 0.00227518193423748, + "skip_count": 2.0, + "step": 4924, + "text_loss": 0.34440335631370544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005999156457564685, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 7940205.0, + "repeat_count": 0.0, + "routers_loss": 0.004331593867391348, + "skip_count": 1.0, + "step": 4926, + "text_loss": 0.14114083349704742 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005996123529635925, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7945174.0, + "repeat_count": 0.0, + "routers_loss": 0.000612895586527884, + "skip_count": 0.0, + "step": 4928, + "text_loss": 0.3895469009876251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.145582624009393, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000599309021999075, + "loss": 0.006, + "macro_f1": 0.3272727429866791, + "num_tokens": 7948716.0, + "repeat_count": 0.0, + "routers_loss": 0.02319233864545822, + "skip_count": 1.0, + "step": 4930, + "text_loss": 0.38103172183036804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005990056529791528, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7952497.0, + "repeat_count": 0.0, + "routers_loss": 0.003423231653869152, + "skip_count": 0.0, + "step": 4932, + "text_loss": 0.30447322130203247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017822265625, + "learning_rate": 0.0005987022460200778, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7955578.0, + "repeat_count": 0.0, + "routers_loss": 0.0007005351362749934, + "skip_count": 0.0, + "step": 4934, + "text_loss": 0.49621838331222534 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.173759906075727, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0005983988012381159, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 7958741.0, + "repeat_count": 2.0, + "routers_loss": 0.03962617367506027, + "skip_count": 1.0, + "step": 4936, + "text_loss": 0.1920493096113205 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.0005980953187495476, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 7962236.0, + "repeat_count": 0.0, + "routers_loss": 0.0026006060652434826, + "skip_count": 3.0, + "step": 4938, + "text_loss": 0.5286803841590881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0005977917986706681, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7965631.0, + "repeat_count": 0.0, + "routers_loss": 0.005010952707380056, + "skip_count": 0.0, + "step": 4940, + "text_loss": 0.3507745563983917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005974882411177871, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7968516.0, + "repeat_count": 0.0, + "routers_loss": 0.0023964287247508764, + "skip_count": 0.0, + "step": 4942, + "text_loss": 0.9110504388809204 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000597184646207228, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7971310.0, + "repeat_count": 0.0, + "routers_loss": 0.0026230409275740385, + "skip_count": 1.0, + "step": 4944, + "text_loss": 0.4131232798099518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005968810140553292, + "loss": 0.0102, + "macro_f1": 0.3333333432674408, + "num_tokens": 7974809.0, + "repeat_count": 0.0, + "routers_loss": 0.0007397596491500735, + "skip_count": 0.0, + "step": 4946, + "text_loss": 0.5130466222763062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0005965773447784431, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7977800.0, + "repeat_count": 0.0, + "routers_loss": 0.0009955473942682147, + "skip_count": 0.0, + "step": 4948, + "text_loss": 0.5366153717041016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01373291015625, + "learning_rate": 0.0005962736384929362, + "loss": 0.0026, + "macro_f1": 0.3333333432674408, + "num_tokens": 7981027.0, + "repeat_count": 0.0, + "routers_loss": 0.0049227322451770306, + "skip_count": 0.0, + "step": 4950, + "text_loss": 0.17266370356082916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0005959698953151895, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7983580.0, + "repeat_count": 0.0, + "routers_loss": 0.0009975163266062737, + "skip_count": 0.0, + "step": 4952, + "text_loss": 0.2474549114704132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0005956661153615979, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7986711.0, + "repeat_count": 0.0, + "routers_loss": 0.0006475782720372081, + "skip_count": 0.0, + "step": 4954, + "text_loss": 0.5748327970504761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0005953622987485703, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7990194.0, + "repeat_count": 0.0, + "routers_loss": 0.001449751085601747, + "skip_count": 0.0, + "step": 4956, + "text_loss": 0.5163559317588806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0005950584455925301, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7993050.0, + "repeat_count": 0.0, + "routers_loss": 0.0017087773885577917, + "skip_count": 0.0, + "step": 4958, + "text_loss": 0.15892620384693146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0005947545560099142, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 7996383.0, + "repeat_count": 0.0, + "routers_loss": 0.0044417232275009155, + "skip_count": 0.0, + "step": 4960, + "text_loss": 0.48022928833961487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 23.295861461696507, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031982421875, + "learning_rate": 0.0005944506301171734, + "loss": 0.0066, + "macro_f1": 0.5492662787437439, + "num_tokens": 7999843.0, + "repeat_count": 0.0, + "routers_loss": 0.010093312710523605, + "skip_count": 2.0, + "step": 4962, + "text_loss": 0.5050316452980042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005941466680307732, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8003504.0, + "repeat_count": 0.0, + "routers_loss": 0.009699694812297821, + "skip_count": 0.0, + "step": 4964, + "text_loss": 0.30474427342414856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 23.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0005938426698671922, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 8007427.0, + "repeat_count": 1.0, + "routers_loss": 0.0016759657301008701, + "skip_count": 0.0, + "step": 4966, + "text_loss": 0.25060293078422546 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005935386357429232, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8010265.0, + "repeat_count": 2.0, + "routers_loss": 0.006916914135217667, + "skip_count": 3.0, + "step": 4968, + "text_loss": 0.49084481596946716 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0005932345657744723, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 8013733.0, + "repeat_count": 1.0, + "routers_loss": 0.017182426527142525, + "skip_count": 5.0, + "step": 4970, + "text_loss": 0.2705717980861664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00059293046007836, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8017068.0, + "repeat_count": 0.0, + "routers_loss": 0.008485594764351845, + "skip_count": 2.0, + "step": 4972, + "text_loss": 0.18570218980312347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0005926263187711201, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8020185.0, + "repeat_count": 0.0, + "routers_loss": 0.0021750847809016705, + "skip_count": 2.0, + "step": 4974, + "text_loss": 0.4457069933414459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0005923221419693001, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 8023038.0, + "repeat_count": 0.0, + "routers_loss": 0.0020193420350551605, + "skip_count": 0.0, + "step": 4976, + "text_loss": 0.7394505143165588 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.0005920179297894613, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8026236.0, + "repeat_count": 0.0, + "routers_loss": 0.001450369250960648, + "skip_count": 1.0, + "step": 4978, + "text_loss": 0.5914503335952759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.000591713682348178, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8028765.0, + "repeat_count": 0.0, + "routers_loss": 0.0017808573320508003, + "skip_count": 0.0, + "step": 4980, + "text_loss": 0.19231407344341278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005914093997620388, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8032043.0, + "repeat_count": 0.0, + "routers_loss": 0.0018225493840873241, + "skip_count": 0.0, + "step": 4982, + "text_loss": 0.3567875325679779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005911050821476449, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8035086.0, + "repeat_count": 0.0, + "routers_loss": 0.0016285666497424245, + "skip_count": 0.0, + "step": 4984, + "text_loss": 0.34609633684158325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0005908007296216119, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8038193.0, + "repeat_count": 0.0, + "routers_loss": 0.0014699801104143262, + "skip_count": 0.0, + "step": 4986, + "text_loss": 0.4492359757423401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.000590496342300568, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8041099.0, + "repeat_count": 0.0, + "routers_loss": 0.002442725468426943, + "skip_count": 0.0, + "step": 4988, + "text_loss": 0.5162975788116455 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005901919203011548, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8044350.0, + "repeat_count": 0.0, + "routers_loss": 0.008624207228422165, + "skip_count": 2.0, + "step": 4990, + "text_loss": 0.2533033490180969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005898874637400279, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8047467.0, + "repeat_count": 0.0, + "routers_loss": 0.0015421364223584533, + "skip_count": 0.0, + "step": 4992, + "text_loss": 0.4890289306640625 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0005895829727338552, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 8050626.0, + "repeat_count": 1.0, + "routers_loss": 0.0024516626726835966, + "skip_count": 2.0, + "step": 4994, + "text_loss": 0.50797039270401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005892784473993184, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 8053386.0, + "repeat_count": 0.0, + "routers_loss": 0.0018553845584392548, + "skip_count": 2.0, + "step": 4996, + "text_loss": 0.628828763961792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.000588973887853112, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8055941.0, + "repeat_count": 0.0, + "routers_loss": 0.004258487373590469, + "skip_count": 0.0, + "step": 4998, + "text_loss": 0.2643229067325592 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.474317581449956, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005886692942119441, + "loss": 0.0062, + "macro_f1": 0.8820862174034119, + "num_tokens": 8058638.0, + "repeat_count": 2.0, + "routers_loss": 0.019064312800765038, + "skip_count": 2.0, + "step": 5000, + "text_loss": 0.4925006031990051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005883646665925353, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 8062097.0, + "repeat_count": 0.0, + "routers_loss": 0.0007969749276526272, + "skip_count": 0.0, + "step": 5002, + "text_loss": 0.49412909150123596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005880600051116196, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8065202.0, + "repeat_count": 0.0, + "routers_loss": 0.005813780706375837, + "skip_count": 2.0, + "step": 5004, + "text_loss": 0.5681346654891968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0005877553098859439, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8068574.0, + "repeat_count": 0.0, + "routers_loss": 0.005012941546738148, + "skip_count": 0.0, + "step": 5006, + "text_loss": 0.2682424485683441 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005874505810322678, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 8071834.0, + "repeat_count": 0.0, + "routers_loss": 0.005859757773578167, + "skip_count": 3.0, + "step": 5008, + "text_loss": 0.6460036039352417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.000587145818667364, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 8074687.0, + "repeat_count": 0.0, + "routers_loss": 0.002868571551516652, + "skip_count": 2.0, + "step": 5010, + "text_loss": 0.2405751347541809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005868410229080181, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8077617.0, + "repeat_count": 0.0, + "routers_loss": 0.0021759893279522657, + "skip_count": 1.0, + "step": 5012, + "text_loss": 0.7455595135688782 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005865361938710286, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8080734.0, + "repeat_count": 0.0, + "routers_loss": 0.0008311949786730111, + "skip_count": 0.0, + "step": 5014, + "text_loss": 0.44876906275749207 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 23.549457000293515, + "f1_execute": 0.9756097793579102, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.0390625, + "learning_rate": 0.0005862313316732063, + "loss": 0.0054, + "macro_f1": 0.9615669250488281, + "num_tokens": 8085092.0, + "repeat_count": 2.0, + "routers_loss": 0.012511664070189, + "skip_count": 6.0, + "step": 5016, + "text_loss": 0.26010942459106445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.000585926436431375, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 8088333.0, + "repeat_count": 0.0, + "routers_loss": 0.0035441694781184196, + "skip_count": 0.0, + "step": 5018, + "text_loss": 0.28225192427635193 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 23.568241855004402, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005856215082623711, + "loss": 0.0093, + "macro_f1": 0.8823530077934265, + "num_tokens": 8091298.0, + "repeat_count": 1.0, + "routers_loss": 0.023543989285826683, + "skip_count": 2.0, + "step": 5020, + "text_loss": 0.5757577419281006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0005853165472830439, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8094361.0, + "repeat_count": 0.0, + "routers_loss": 0.003124240320175886, + "skip_count": 0.0, + "step": 5022, + "text_loss": 0.4021305739879608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0005850115536102546, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8097514.0, + "repeat_count": 0.0, + "routers_loss": 0.008170558139681816, + "skip_count": 1.0, + "step": 5024, + "text_loss": 0.18926584720611572 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0005847065273608777, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 8100525.0, + "repeat_count": 1.0, + "routers_loss": 0.02127663604915142, + "skip_count": 5.0, + "step": 5026, + "text_loss": 0.18827557563781738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0005844014686517998, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8104016.0, + "repeat_count": 0.0, + "routers_loss": 0.00272122910246253, + "skip_count": 0.0, + "step": 5028, + "text_loss": 0.15534701943397522 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 23.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005840963775999199, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8106697.0, + "repeat_count": 5.0, + "routers_loss": 0.008979840204119682, + "skip_count": 4.0, + "step": 5030, + "text_loss": 0.8123718500137329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005837912543221493, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 8110986.0, + "repeat_count": 0.0, + "routers_loss": 0.005006929859519005, + "skip_count": 0.0, + "step": 5032, + "text_loss": 0.26128846406936646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0005834860989354121, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 8114010.0, + "repeat_count": 0.0, + "routers_loss": 0.0005531277856789529, + "skip_count": 0.0, + "step": 5034, + "text_loss": 0.5100266933441162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.64338127384796, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0005831809115566442, + "loss": 0.0073, + "macro_f1": 0.6538461446762085, + "num_tokens": 8117168.0, + "repeat_count": 2.0, + "routers_loss": 0.04978533461689949, + "skip_count": 1.0, + "step": 5036, + "text_loss": 0.41049885749816895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0005828756923027941, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8119900.0, + "repeat_count": 0.0, + "routers_loss": 0.0006322385743260384, + "skip_count": 0.0, + "step": 5038, + "text_loss": 0.5584380626678467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0005825704412908225, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8123928.0, + "repeat_count": 0.0, + "routers_loss": 0.001000594231300056, + "skip_count": 0.0, + "step": 5040, + "text_loss": 0.6460791230201721 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005822651586377019, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 8127926.0, + "repeat_count": 0.0, + "routers_loss": 0.011595834977924824, + "skip_count": 2.0, + "step": 5042, + "text_loss": 0.3131820261478424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0005819598444604173, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 8131092.0, + "repeat_count": 0.0, + "routers_loss": 0.004449303261935711, + "skip_count": 3.0, + "step": 5044, + "text_loss": 0.2774372696876526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0005816544988759658, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8134051.0, + "repeat_count": 0.0, + "routers_loss": 0.0007877505850046873, + "skip_count": 0.0, + "step": 5046, + "text_loss": 0.39496293663978577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0005813491220013563, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 8138725.0, + "repeat_count": 0.0, + "routers_loss": 0.002868623472750187, + "skip_count": 0.0, + "step": 5048, + "text_loss": 0.3779948651790619 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0005810437139536098, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 8141913.0, + "repeat_count": 2.0, + "routers_loss": 0.006244937423616648, + "skip_count": 4.0, + "step": 5050, + "text_loss": 0.4512978494167328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0005807382748497592, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 8146193.0, + "repeat_count": 0.0, + "routers_loss": 0.0011013929033651948, + "skip_count": 0.0, + "step": 5052, + "text_loss": 0.6194499731063843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0005804328048068493, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8149701.0, + "repeat_count": 0.0, + "routers_loss": 0.005505079869180918, + "skip_count": 1.0, + "step": 5054, + "text_loss": 0.2932305335998535 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005801273039419368, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 8152861.0, + "repeat_count": 1.0, + "routers_loss": 0.0057641929015517235, + "skip_count": 1.0, + "step": 5056, + "text_loss": 0.2631317973136902 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005798217723720904, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 8155843.0, + "repeat_count": 1.0, + "routers_loss": 0.0021671492140740156, + "skip_count": 5.0, + "step": 5058, + "text_loss": 0.2889988422393799 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0005795162102143902, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8158812.0, + "repeat_count": 0.0, + "routers_loss": 0.004476628266274929, + "skip_count": 1.0, + "step": 5060, + "text_loss": 0.48028868436813354 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005792106175859283, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 8162719.0, + "repeat_count": 1.0, + "routers_loss": 0.0038497636560350657, + "skip_count": 3.0, + "step": 5062, + "text_loss": 0.4559471607208252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0005789049946038083, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8165692.0, + "repeat_count": 0.0, + "routers_loss": 0.004451582673937082, + "skip_count": 0.0, + "step": 5064, + "text_loss": 0.3782602548599243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005785993413851456, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8168900.0, + "repeat_count": 0.0, + "routers_loss": 0.002951978938654065, + "skip_count": 0.0, + "step": 5066, + "text_loss": 0.32392629981040955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.000578293658047067, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8171661.0, + "repeat_count": 0.0, + "routers_loss": 0.011171254329383373, + "skip_count": 2.0, + "step": 5068, + "text_loss": 0.24492619931697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0005779879447067109, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 8175075.0, + "repeat_count": 0.0, + "routers_loss": 0.0016067599644884467, + "skip_count": 0.0, + "step": 5070, + "text_loss": 0.7738823294639587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.000577682201481227, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8178515.0, + "repeat_count": 0.0, + "routers_loss": 0.009113503620028496, + "skip_count": 1.0, + "step": 5072, + "text_loss": 0.2082248032093048 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0005773764284877774, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8181790.0, + "repeat_count": 1.0, + "routers_loss": 0.007332196459174156, + "skip_count": 1.0, + "step": 5074, + "text_loss": 0.4557662904262543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0005770706258435342, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8184854.0, + "repeat_count": 0.0, + "routers_loss": 0.0016252279747277498, + "skip_count": 0.0, + "step": 5076, + "text_loss": 0.2888098657131195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0005767647936656818, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8187860.0, + "repeat_count": 0.0, + "routers_loss": 0.003406575648114085, + "skip_count": 0.0, + "step": 5078, + "text_loss": 0.6533790230751038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005764589320714158, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 8191683.0, + "repeat_count": 0.0, + "routers_loss": 0.0006520140450447798, + "skip_count": 0.0, + "step": 5080, + "text_loss": 0.6903796195983887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0005761530411779426, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8195109.0, + "repeat_count": 0.0, + "routers_loss": 0.01188349537551403, + "skip_count": 1.0, + "step": 5082, + "text_loss": 0.20460398495197296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0005758471211024804, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 8198340.0, + "repeat_count": 0.0, + "routers_loss": 0.004826809279620647, + "skip_count": 3.0, + "step": 5084, + "text_loss": 0.2203969657421112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.0005755411719622584, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8200882.0, + "repeat_count": 0.0, + "routers_loss": 0.0019170823507010937, + "skip_count": 0.0, + "step": 5086, + "text_loss": 0.6744595170021057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005752351938745167, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 8203777.0, + "repeat_count": 0.0, + "routers_loss": 0.002110893838107586, + "skip_count": 1.0, + "step": 5088, + "text_loss": 0.4137859046459198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.000574929186956507, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 8207627.0, + "repeat_count": 0.0, + "routers_loss": 0.0018580821342766285, + "skip_count": 1.0, + "step": 5090, + "text_loss": 0.4830456078052521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.906369239800412, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0005746231513254912, + "loss": 0.0066, + "macro_f1": 0.3272727429866791, + "num_tokens": 8210263.0, + "repeat_count": 1.0, + "routers_loss": 0.0194723978638649, + "skip_count": 0.0, + "step": 5092, + "text_loss": 0.17383277416229248 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005743170870987433, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 8214166.0, + "repeat_count": 0.0, + "routers_loss": 0.006944256369024515, + "skip_count": 2.0, + "step": 5094, + "text_loss": 0.20003484189510345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0005740109943935472, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8217545.0, + "repeat_count": 0.0, + "routers_loss": 0.002044794149696827, + "skip_count": 1.0, + "step": 5096, + "text_loss": 0.5117167830467224 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0005737048733271986, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 8220673.0, + "repeat_count": 1.0, + "routers_loss": 0.009966124780476093, + "skip_count": 2.0, + "step": 5098, + "text_loss": 0.2705996036529541 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005733987240170035, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8223796.0, + "repeat_count": 0.0, + "routers_loss": 0.0009675708715803921, + "skip_count": 0.0, + "step": 5100, + "text_loss": 0.7016357183456421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0005730925465802788, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8227048.0, + "repeat_count": 0.0, + "routers_loss": 0.0009548200177960098, + "skip_count": 0.0, + "step": 5102, + "text_loss": 0.30823078751564026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005727863411343526, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8229971.0, + "repeat_count": 0.0, + "routers_loss": 0.0005767418188042939, + "skip_count": 0.0, + "step": 5104, + "text_loss": 0.6897505521774292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0005724801077965629, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8232758.0, + "repeat_count": 0.0, + "routers_loss": 0.009297889657318592, + "skip_count": 3.0, + "step": 5106, + "text_loss": 0.21293514966964722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.981508658643968, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005721738466842592, + "loss": 0.0079, + "macro_f1": 0.3272727429866791, + "num_tokens": 8238154.0, + "repeat_count": 1.0, + "routers_loss": 0.013964693062007427, + "skip_count": 0.0, + "step": 5108, + "text_loss": 0.7273620367050171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 23.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005718675579148014, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8240818.0, + "repeat_count": 3.0, + "routers_loss": 0.007218098267912865, + "skip_count": 1.0, + "step": 5110, + "text_loss": 0.5607150793075562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005715612416055598, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 8244048.0, + "repeat_count": 0.0, + "routers_loss": 0.007558444049209356, + "skip_count": 2.0, + "step": 5112, + "text_loss": 0.23694385588169098 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.009392427355444, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005712548978739154, + "loss": 0.0072, + "macro_f1": 0.6603773832321167, + "num_tokens": 8247240.0, + "repeat_count": 1.0, + "routers_loss": 0.015726923942565918, + "skip_count": 1.0, + "step": 5114, + "text_loss": 0.6032099723815918 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.01878485471089, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0005709485268372598, + "loss": 0.0046, + "macro_f1": 0.9262410998344421, + "num_tokens": 8250585.0, + "repeat_count": 3.0, + "routers_loss": 0.011148860678076744, + "skip_count": 2.0, + "step": 5116, + "text_loss": 0.6825997233390808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005706421286129948, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 8254240.0, + "repeat_count": 0.0, + "routers_loss": 0.006977916229516268, + "skip_count": 0.0, + "step": 5118, + "text_loss": 0.2532844543457031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0005703357033185328, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 8257133.0, + "repeat_count": 0.0, + "routers_loss": 0.006415650714188814, + "skip_count": 2.0, + "step": 5120, + "text_loss": 0.6132124066352844 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005700292510712967, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 8261076.0, + "repeat_count": 1.0, + "routers_loss": 0.0044475216418504715, + "skip_count": 1.0, + "step": 5122, + "text_loss": 0.4277699887752533 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005697227719887194, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8264607.0, + "repeat_count": 0.0, + "routers_loss": 0.005743155721575022, + "skip_count": 2.0, + "step": 5124, + "text_loss": 0.2570968270301819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005694162661882444, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8267992.0, + "repeat_count": 0.0, + "routers_loss": 0.0007581565878354013, + "skip_count": 0.0, + "step": 5126, + "text_loss": 0.5850184559822083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005691097337873252, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 8271010.0, + "repeat_count": 0.0, + "routers_loss": 0.0036611228715628386, + "skip_count": 0.0, + "step": 5128, + "text_loss": 0.660999059677124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0005688031749034258, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 8273638.0, + "repeat_count": 0.0, + "routers_loss": 0.0039906189776957035, + "skip_count": 0.0, + "step": 5130, + "text_loss": 0.5839648246765137 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0005684965896540198, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8276504.0, + "repeat_count": 1.0, + "routers_loss": 0.007539632264524698, + "skip_count": 3.0, + "step": 5132, + "text_loss": 0.27675092220306396 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 24.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005681899781565915, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 8279977.0, + "repeat_count": 2.0, + "routers_loss": 0.0026953567285090685, + "skip_count": 0.0, + "step": 5134, + "text_loss": 0.532974123954773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.000567883340528635, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 8282781.0, + "repeat_count": 0.0, + "routers_loss": 0.005754240322858095, + "skip_count": 1.0, + "step": 5136, + "text_loss": 0.31100207567214966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005675766768876542, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8286533.0, + "repeat_count": 0.0, + "routers_loss": 0.0051517849788069725, + "skip_count": 0.0, + "step": 5138, + "text_loss": 0.5734741687774658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005672699873511635, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 8289858.0, + "repeat_count": 0.0, + "routers_loss": 0.0025852699764072895, + "skip_count": 2.0, + "step": 5140, + "text_loss": 0.37045374512672424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005669632720366868, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8293038.0, + "repeat_count": 0.0, + "routers_loss": 0.0038520018570125103, + "skip_count": 0.0, + "step": 5142, + "text_loss": 0.25952374935150146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005666565310617577, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8295717.0, + "repeat_count": 0.0, + "routers_loss": 0.00026914477348327637, + "skip_count": 0.0, + "step": 5144, + "text_loss": 0.32531213760375977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0005663497645439203, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8299750.0, + "repeat_count": 0.0, + "routers_loss": 0.0055860537104308605, + "skip_count": 2.0, + "step": 5146, + "text_loss": 0.2520618438720703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005660429726007279, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8303075.0, + "repeat_count": 0.0, + "routers_loss": 0.004446739796549082, + "skip_count": 1.0, + "step": 5148, + "text_loss": 0.43672287464141846 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.17845611975345, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.07080078125, + "learning_rate": 0.000565736155349744, + "loss": 0.0076, + "macro_f1": 0.8814815282821655, + "num_tokens": 8306268.0, + "repeat_count": 2.0, + "routers_loss": 0.046915046870708466, + "skip_count": 4.0, + "step": 5150, + "text_loss": 0.35405927896499634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 24.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005654293129085412, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8310480.0, + "repeat_count": 0.0, + "routers_loss": 0.010549088008701801, + "skip_count": 4.0, + "step": 5152, + "text_loss": 0.3523249626159668 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005651224453947023, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8313367.0, + "repeat_count": 1.0, + "routers_loss": 0.002893900265917182, + "skip_count": 0.0, + "step": 5154, + "text_loss": 0.4503810703754425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0005648155529258195, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8318006.0, + "repeat_count": 0.0, + "routers_loss": 0.0018450213829055429, + "skip_count": 0.0, + "step": 5156, + "text_loss": 0.5687127113342285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0005645086356194943, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8320646.0, + "repeat_count": 0.0, + "routers_loss": 0.0026727779768407345, + "skip_count": 0.0, + "step": 5158, + "text_loss": 0.38920050859451294 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0005642016935933385, + "loss": 0.0035, + "macro_f1": 1.0, + "num_tokens": 8323915.0, + "repeat_count": 1.0, + "routers_loss": 0.00611621281132102, + "skip_count": 2.0, + "step": 5160, + "text_loss": 0.3003547787666321 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 24.234810683886117, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0005638947269649726, + "loss": 0.0063, + "macro_f1": 0.9619450569152832, + "num_tokens": 8327073.0, + "repeat_count": 1.0, + "routers_loss": 0.028447439894080162, + "skip_count": 6.0, + "step": 5162, + "text_loss": 0.24053414165973663 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0005635877358520268, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8330388.0, + "repeat_count": 0.0, + "routers_loss": 0.0013072624569758773, + "skip_count": 0.0, + "step": 5164, + "text_loss": 0.43772217631340027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005632807203721406, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 8333241.0, + "repeat_count": 0.0, + "routers_loss": 0.0009456822881475091, + "skip_count": 0.0, + "step": 5166, + "text_loss": 0.5217573046684265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.000562973680642963, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8337257.0, + "repeat_count": 0.0, + "routers_loss": 0.0023840824142098427, + "skip_count": 0.0, + "step": 5168, + "text_loss": 0.31814974546432495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0005626666167821521, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 8340143.0, + "repeat_count": 0.0, + "routers_loss": 0.0020231492817401886, + "skip_count": 3.0, + "step": 5170, + "text_loss": 0.5478505492210388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0162353515625, + "learning_rate": 0.0005623595289073755, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 8343566.0, + "repeat_count": 1.0, + "routers_loss": 0.01070715207606554, + "skip_count": 2.0, + "step": 5172, + "text_loss": 0.23213914036750793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005620524171363099, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8346836.0, + "repeat_count": 0.0, + "routers_loss": 0.003720001084730029, + "skip_count": 3.0, + "step": 5174, + "text_loss": 0.5114789009094238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005617452815866409, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 8349726.0, + "repeat_count": 1.0, + "routers_loss": 0.003322509117424488, + "skip_count": 1.0, + "step": 5176, + "text_loss": 0.4894506335258484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005614381223760635, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 8352478.0, + "repeat_count": 0.0, + "routers_loss": 0.00028752797516062856, + "skip_count": 0.0, + "step": 5178, + "text_loss": 0.6418307423591614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005611309396222817, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 8355766.0, + "repeat_count": 0.0, + "routers_loss": 0.0028724796138703823, + "skip_count": 0.0, + "step": 5180, + "text_loss": 0.23635952174663544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.328734957440563, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005608237334430085, + "loss": 0.0068, + "macro_f1": 0.6601307392120361, + "num_tokens": 8358888.0, + "repeat_count": 1.0, + "routers_loss": 0.058520980179309845, + "skip_count": 2.0, + "step": 5182, + "text_loss": 0.23434793949127197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1015625, + "learning_rate": 0.000560516503955966, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8361761.0, + "repeat_count": 0.0, + "routers_loss": 0.0021356395445764065, + "skip_count": 1.0, + "step": 5184, + "text_loss": 0.40855672955513 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000560209251278885, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 8364376.0, + "repeat_count": 0.0, + "routers_loss": 0.0016185789136216044, + "skip_count": 0.0, + "step": 5186, + "text_loss": 0.6265131831169128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005599019755295053, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8367769.0, + "repeat_count": 0.0, + "routers_loss": 0.0031490204855799675, + "skip_count": 2.0, + "step": 5188, + "text_loss": 0.4716353118419647 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0005595946768255756, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8370705.0, + "repeat_count": 1.0, + "routers_loss": 0.003500689286738634, + "skip_count": 0.0, + "step": 5190, + "text_loss": 0.5467679500579834 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0005592873552848532, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 8374217.0, + "repeat_count": 2.0, + "routers_loss": 0.010764475911855698, + "skip_count": 3.0, + "step": 5192, + "text_loss": 0.4345340132713318 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 24.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005589800110251045, + "loss": 0.0087, + "macro_f1": 1.0, + "num_tokens": 8378182.0, + "repeat_count": 2.0, + "routers_loss": 0.0010365343187004328, + "skip_count": 1.0, + "step": 5194, + "text_loss": 0.46722909808158875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005586726441641044, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8381227.0, + "repeat_count": 0.0, + "routers_loss": 0.006349093746393919, + "skip_count": 2.0, + "step": 5196, + "text_loss": 0.35410359501838684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0005583652548196362, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8384886.0, + "repeat_count": 0.0, + "routers_loss": 0.00038166221929714084, + "skip_count": 0.0, + "step": 5198, + "text_loss": 0.5950250625610352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005580578431094924, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8388939.0, + "repeat_count": 0.0, + "routers_loss": 0.0023578559048473835, + "skip_count": 2.0, + "step": 5200, + "text_loss": 0.6553771495819092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005577504091514735, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 8391629.0, + "repeat_count": 0.0, + "routers_loss": 0.0010771085508167744, + "skip_count": 0.0, + "step": 5202, + "text_loss": 0.4441985785961151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.000557442953063389, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8394440.0, + "repeat_count": 0.0, + "routers_loss": 0.005844325292855501, + "skip_count": 3.0, + "step": 5204, + "text_loss": 0.5807011723518372 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0005571354749630564, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8397731.0, + "repeat_count": 0.0, + "routers_loss": 0.006837233901023865, + "skip_count": 1.0, + "step": 5206, + "text_loss": 0.27780941128730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.000556827974968302, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8400859.0, + "repeat_count": 0.0, + "routers_loss": 0.007656649220734835, + "skip_count": 3.0, + "step": 5208, + "text_loss": 0.4746324121952057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0005565204531969606, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8404164.0, + "repeat_count": 0.0, + "routers_loss": 0.0028129038400948048, + "skip_count": 1.0, + "step": 5210, + "text_loss": 0.8513513803482056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0005562129097668746, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8407196.0, + "repeat_count": 0.0, + "routers_loss": 0.00492360582575202, + "skip_count": 1.0, + "step": 5212, + "text_loss": 0.12255420535802841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005559053447958958, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8410633.0, + "repeat_count": 0.0, + "routers_loss": 0.0020713545382022858, + "skip_count": 0.0, + "step": 5214, + "text_loss": 0.6878522634506226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0005555977584018833, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8413414.0, + "repeat_count": 0.0, + "routers_loss": 0.0007216963567771018, + "skip_count": 0.0, + "step": 5216, + "text_loss": 0.845878541469574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0005552901507027048, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8416817.0, + "repeat_count": 0.0, + "routers_loss": 0.002400130731984973, + "skip_count": 1.0, + "step": 5218, + "text_loss": 0.16753672063350677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0005549825218162365, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 8419617.0, + "repeat_count": 0.0, + "routers_loss": 0.004563181661069393, + "skip_count": 0.0, + "step": 5220, + "text_loss": 0.26107168197631836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.000554674871860362, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 8422686.0, + "repeat_count": 1.0, + "routers_loss": 0.006413881666958332, + "skip_count": 1.0, + "step": 5222, + "text_loss": 0.6333847045898438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005543672009529734, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 8425571.0, + "repeat_count": 0.0, + "routers_loss": 0.0057656955905258656, + "skip_count": 3.0, + "step": 5224, + "text_loss": 0.4552212357521057 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0005540595092119709, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 8429038.0, + "repeat_count": 2.0, + "routers_loss": 0.011755156330764294, + "skip_count": 2.0, + "step": 5226, + "text_loss": 0.16597330570220947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0005537517967552626, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8432117.0, + "repeat_count": 0.0, + "routers_loss": 0.0007519085193052888, + "skip_count": 0.0, + "step": 5228, + "text_loss": 0.6283590197563171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.000553444063700764, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 8435176.0, + "repeat_count": 0.0, + "routers_loss": 0.003066456411033869, + "skip_count": 0.0, + "step": 5230, + "text_loss": 0.2360922247171402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.0005531363101663998, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8438515.0, + "repeat_count": 0.0, + "routers_loss": 0.002865589689463377, + "skip_count": 0.0, + "step": 5232, + "text_loss": 0.8075396418571472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005528285362701011, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 8441731.0, + "repeat_count": 0.0, + "routers_loss": 0.0012521179160103202, + "skip_count": 0.0, + "step": 5234, + "text_loss": 0.584335446357727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0005525207421298077, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8444535.0, + "repeat_count": 0.0, + "routers_loss": 0.005398475099354982, + "skip_count": 3.0, + "step": 5236, + "text_loss": 0.22711622714996338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005522129278634669, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 8448337.0, + "repeat_count": 0.0, + "routers_loss": 0.002957914723083377, + "skip_count": 1.0, + "step": 5238, + "text_loss": 0.3157515823841095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0005519050935890335, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8451530.0, + "repeat_count": 0.0, + "routers_loss": 0.007757039275020361, + "skip_count": 3.0, + "step": 5240, + "text_loss": 0.2815830111503601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.610507778103905, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005515972394244704, + "loss": 0.0063, + "macro_f1": 0.6603773832321167, + "num_tokens": 8454171.0, + "repeat_count": 1.0, + "routers_loss": 0.021602008491754532, + "skip_count": 1.0, + "step": 5242, + "text_loss": 0.6024490594863892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005512893654877478, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8457544.0, + "repeat_count": 0.0, + "routers_loss": 0.006062488537281752, + "skip_count": 0.0, + "step": 5244, + "text_loss": 0.550110936164856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0005509814718968435, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 8460135.0, + "repeat_count": 0.0, + "routers_loss": 0.002793943975120783, + "skip_count": 0.0, + "step": 5246, + "text_loss": 0.4361286163330078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0005506735587697433, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8463516.0, + "repeat_count": 0.0, + "routers_loss": 0.0016669550677761436, + "skip_count": 0.0, + "step": 5248, + "text_loss": 0.4642958641052246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0005503656262244395, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8466406.0, + "repeat_count": 0.0, + "routers_loss": 0.0006051387754268944, + "skip_count": 0.0, + "step": 5250, + "text_loss": 0.3445641100406647 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 24.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005500576743789329, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 8468838.0, + "repeat_count": 2.0, + "routers_loss": 0.00654293829575181, + "skip_count": 1.0, + "step": 5252, + "text_loss": 0.2842808663845062 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.666862342236573, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005497497033512309, + "loss": 0.0077, + "macro_f1": 0.8817967176437378, + "num_tokens": 8471815.0, + "repeat_count": 2.0, + "routers_loss": 0.03845973685383797, + "skip_count": 3.0, + "step": 5254, + "text_loss": 0.2597215175628662 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 24.676254769592017, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0005494417132593487, + "loss": 0.0047, + "macro_f1": 0.9452888369560242, + "num_tokens": 8475202.0, + "repeat_count": 1.0, + "routers_loss": 0.02252381667494774, + "skip_count": 4.0, + "step": 5256, + "text_loss": 0.32269927859306335 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0005491337042213088, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8478650.0, + "repeat_count": 0.0, + "routers_loss": 0.01232751365751028, + "skip_count": 2.0, + "step": 5258, + "text_loss": 0.6523372530937195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005488256763551408, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8481724.0, + "repeat_count": 0.0, + "routers_loss": 0.0028322834987193346, + "skip_count": 0.0, + "step": 5260, + "text_loss": 0.4212580621242523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.0005485176297788814, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 8485833.0, + "repeat_count": 0.0, + "routers_loss": 0.002623105887323618, + "skip_count": 2.0, + "step": 5262, + "text_loss": 0.16906329989433289 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005482095646105748, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 8489089.0, + "repeat_count": 1.0, + "routers_loss": 0.0007179114618338645, + "skip_count": 0.0, + "step": 5264, + "text_loss": 0.4523872137069702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005479014809682721, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 8492905.0, + "repeat_count": 0.0, + "routers_loss": 0.005234059412032366, + "skip_count": 0.0, + "step": 5266, + "text_loss": 0.207139790058136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0005475933789700314, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 8495480.0, + "repeat_count": 0.0, + "routers_loss": 0.0023258263245224953, + "skip_count": 0.0, + "step": 5268, + "text_loss": 0.18060965836048126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005472852587339183, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8499070.0, + "repeat_count": 0.0, + "routers_loss": 0.0013497259933501482, + "skip_count": 0.0, + "step": 5270, + "text_loss": 0.7460769414901733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0005469771203780048, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 8502886.0, + "repeat_count": 0.0, + "routers_loss": 0.0003589815751183778, + "skip_count": 0.0, + "step": 5272, + "text_loss": 0.48119160532951355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005466689640203701, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8506646.0, + "repeat_count": 0.0, + "routers_loss": 0.006619705818593502, + "skip_count": 1.0, + "step": 5274, + "text_loss": 0.15656520426273346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005463607897791005, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 8509450.0, + "repeat_count": 0.0, + "routers_loss": 0.002992175053805113, + "skip_count": 1.0, + "step": 5276, + "text_loss": 0.486930251121521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0005460525977722886, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8512851.0, + "repeat_count": 0.0, + "routers_loss": 0.0027784097474068403, + "skip_count": 0.0, + "step": 5278, + "text_loss": 0.19654682278633118 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0005457443881180345, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8516858.0, + "repeat_count": 0.0, + "routers_loss": 0.0017648129723966122, + "skip_count": 0.0, + "step": 5280, + "text_loss": 0.580982506275177 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005454361609344444, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 8519912.0, + "repeat_count": 2.0, + "routers_loss": 0.010817649774253368, + "skip_count": 3.0, + "step": 5282, + "text_loss": 0.2644204795360565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000545127916339632, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8522396.0, + "repeat_count": 0.0, + "routers_loss": 0.001453282660804689, + "skip_count": 0.0, + "step": 5284, + "text_loss": 0.5014839172363281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005448196544517168, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8525326.0, + "repeat_count": 0.0, + "routers_loss": 0.006645771209150553, + "skip_count": 2.0, + "step": 5286, + "text_loss": 0.2983154058456421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005445113753888254, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8528611.0, + "repeat_count": 0.0, + "routers_loss": 0.0005447337171062827, + "skip_count": 0.0, + "step": 5288, + "text_loss": 0.43598243594169617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.000544203079269091, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8531571.0, + "repeat_count": 0.0, + "routers_loss": 0.0026976624503731728, + "skip_count": 0.0, + "step": 5290, + "text_loss": 0.6454944610595703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005438947662106533, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 8534565.0, + "repeat_count": 0.0, + "routers_loss": 0.002217630622908473, + "skip_count": 0.0, + "step": 5292, + "text_loss": 0.742935836315155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 24.854710889345466, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.029052734375, + "learning_rate": 0.0005435864363316584, + "loss": 0.0073, + "macro_f1": 0.8820862174034119, + "num_tokens": 8537581.0, + "repeat_count": 2.0, + "routers_loss": 0.030740609392523766, + "skip_count": 2.0, + "step": 5294, + "text_loss": 0.48913639783859253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005432780897502588, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8541271.0, + "repeat_count": 0.0, + "routers_loss": 0.005306888837367296, + "skip_count": 1.0, + "step": 5296, + "text_loss": 0.5820846557617188 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0005429697265846137, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8545052.0, + "repeat_count": 1.0, + "routers_loss": 0.002255369909107685, + "skip_count": 0.0, + "step": 5298, + "text_loss": 0.565483808517456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005426613469528881, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8548605.0, + "repeat_count": 0.0, + "routers_loss": 0.0010787079809233546, + "skip_count": 0.0, + "step": 5300, + "text_loss": 0.40154510736465454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000542352950973254, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8552581.0, + "repeat_count": 0.0, + "routers_loss": 0.0017972089117392898, + "skip_count": 0.0, + "step": 5302, + "text_loss": 0.5430748462677002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.0005420445387638891, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 8556360.0, + "repeat_count": 0.0, + "routers_loss": 0.0016180560924112797, + "skip_count": 2.0, + "step": 5304, + "text_loss": 0.544040322303772 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0005417361104429777, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 8559264.0, + "repeat_count": 1.0, + "routers_loss": 0.012688961811363697, + "skip_count": 2.0, + "step": 5306, + "text_loss": 0.2018517404794693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0005414276661287101, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8562169.0, + "repeat_count": 0.0, + "routers_loss": 0.0012141643092036247, + "skip_count": 0.0, + "step": 5308, + "text_loss": 0.5685747265815735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059326171875, + "learning_rate": 0.0005411192059392826, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 8565231.0, + "repeat_count": 0.0, + "routers_loss": 0.0015626107342541218, + "skip_count": 0.0, + "step": 5310, + "text_loss": 0.8073471784591675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0005408107299928979, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8568122.0, + "repeat_count": 0.0, + "routers_loss": 0.004773529712110758, + "skip_count": 0.0, + "step": 5312, + "text_loss": 0.22583355009555817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0005405022384077644, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8571056.0, + "repeat_count": 0.0, + "routers_loss": 0.0025621228851377964, + "skip_count": 1.0, + "step": 5314, + "text_loss": 0.25274428725242615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005401937313020967, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 8574300.0, + "repeat_count": 0.0, + "routers_loss": 0.009726752527058125, + "skip_count": 2.0, + "step": 5316, + "text_loss": 0.3283393979072571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 24.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005398852087941155, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8577424.0, + "repeat_count": 0.0, + "routers_loss": 0.012483839876949787, + "skip_count": 4.0, + "step": 5318, + "text_loss": 0.1876130849123001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.000539576671002047, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 8580309.0, + "repeat_count": 0.0, + "routers_loss": 0.0009830677881836891, + "skip_count": 0.0, + "step": 5320, + "text_loss": 0.6955490708351135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046875, + "learning_rate": 0.0005392681180441235, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8583399.0, + "repeat_count": 0.0, + "routers_loss": 0.0010819481685757637, + "skip_count": 0.0, + "step": 5322, + "text_loss": 0.4708341956138611 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.000538959550038583, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8586259.0, + "repeat_count": 0.0, + "routers_loss": 0.005763369146734476, + "skip_count": 0.0, + "step": 5324, + "text_loss": 0.20463642477989197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005386509671036695, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8589067.0, + "repeat_count": 0.0, + "routers_loss": 0.0006229027640074492, + "skip_count": 0.0, + "step": 5326, + "text_loss": 0.6819888353347778 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 25.014088641033165, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.03466796875, + "learning_rate": 0.0005383423693576325, + "loss": 0.0087, + "macro_f1": 0.9619450569152832, + "num_tokens": 8592837.0, + "repeat_count": 1.0, + "routers_loss": 0.030066559091210365, + "skip_count": 6.0, + "step": 5328, + "text_loss": 0.24606549739837646 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0005380337569187272, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8596293.0, + "repeat_count": 1.0, + "routers_loss": 0.007445990107953548, + "skip_count": 0.0, + "step": 5330, + "text_loss": 0.16730253398418427 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 25.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0005377251299052145, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8599360.0, + "repeat_count": 1.0, + "routers_loss": 0.004563331138342619, + "skip_count": 1.0, + "step": 5332, + "text_loss": 0.6856988668441772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0005374164884353608, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8602376.0, + "repeat_count": 0.0, + "routers_loss": 0.0015491938684135675, + "skip_count": 0.0, + "step": 5334, + "text_loss": 1.3248854875564575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005371078326274382, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8605400.0, + "repeat_count": 0.0, + "routers_loss": 0.0016098044579848647, + "skip_count": 0.0, + "step": 5336, + "text_loss": 0.747150182723999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 25.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0005367991625997243, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 8608100.0, + "repeat_count": 0.0, + "routers_loss": 0.0034471298567950726, + "skip_count": 3.0, + "step": 5338, + "text_loss": 0.6443291902542114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005364904784705015, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8611768.0, + "repeat_count": 0.0, + "routers_loss": 0.007947597652673721, + "skip_count": 1.0, + "step": 5340, + "text_loss": 0.7768037915229797 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 25.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0005361817803580588, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 8614424.0, + "repeat_count": 2.0, + "routers_loss": 0.009964234195649624, + "skip_count": 2.0, + "step": 5342, + "text_loss": 0.22826914489269257 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005358730683806896, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8617826.0, + "repeat_count": 0.0, + "routers_loss": 0.0014116480015218258, + "skip_count": 0.0, + "step": 5344, + "text_loss": 0.49022090435028076 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 25.098620487232168, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005355643426566929, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 8621220.0, + "repeat_count": 1.0, + "routers_loss": 0.013940622098743916, + "skip_count": 2.0, + "step": 5346, + "text_loss": 0.26819515228271484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000535255603304373, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8623957.0, + "repeat_count": 0.0, + "routers_loss": 0.0032230091746896505, + "skip_count": 2.0, + "step": 5348, + "text_loss": 0.46905452013015747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005349468504420395, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8626760.0, + "repeat_count": 0.0, + "routers_loss": 0.002631337149068713, + "skip_count": 1.0, + "step": 5350, + "text_loss": 0.5312309861183167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005346380841880068, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8630207.0, + "repeat_count": 0.0, + "routers_loss": 0.004526057746261358, + "skip_count": 2.0, + "step": 5352, + "text_loss": 0.5810666084289551 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0005343293046605949, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8633241.0, + "repeat_count": 0.0, + "routers_loss": 0.0023941127583384514, + "skip_count": 0.0, + "step": 5354, + "text_loss": 0.18468725681304932 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0005340205119781288, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8636215.0, + "repeat_count": 1.0, + "routers_loss": 0.0017020340310409665, + "skip_count": 0.0, + "step": 5356, + "text_loss": 0.6665788888931274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005337117062589383, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 8639326.0, + "repeat_count": 0.0, + "routers_loss": 0.004964717663824558, + "skip_count": 2.0, + "step": 5358, + "text_loss": 0.19770404696464539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005334028876213585, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8642157.0, + "repeat_count": 0.0, + "routers_loss": 0.006587155628949404, + "skip_count": 0.0, + "step": 5360, + "text_loss": 0.2295130044221878 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0005330940561837291, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8645355.0, + "repeat_count": 0.0, + "routers_loss": 0.0006586945964954793, + "skip_count": 0.0, + "step": 5362, + "text_loss": 0.2701159417629242 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005327852120643947, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8648911.0, + "repeat_count": 1.0, + "routers_loss": 0.0014281768817454576, + "skip_count": 0.0, + "step": 5364, + "text_loss": 0.8957229852676392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0005324763553817053, + "loss": 0.0027, + "macro_f1": 0.3333333432674408, + "num_tokens": 8652037.0, + "repeat_count": 0.0, + "routers_loss": 0.0005899337120354176, + "skip_count": 0.0, + "step": 5366, + "text_loss": 0.38642236590385437 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 25.20193718814206, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005321674862540154, + "loss": 0.0058, + "macro_f1": 0.9265305995941162, + "num_tokens": 8655381.0, + "repeat_count": 3.0, + "routers_loss": 0.024511313065886497, + "skip_count": 1.0, + "step": 5368, + "text_loss": 0.6439879536628723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000531858604799684, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 8658476.0, + "repeat_count": 0.0, + "routers_loss": 0.0012558114249259233, + "skip_count": 0.0, + "step": 5370, + "text_loss": 0.3227672874927521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0005315497111370752, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8661982.0, + "repeat_count": 0.0, + "routers_loss": 0.0013541636290028691, + "skip_count": 0.0, + "step": 5372, + "text_loss": 0.6375321745872498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.230114470208395, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.051513671875, + "learning_rate": 0.0005312408053845575, + "loss": 0.0052, + "macro_f1": 0.5492662787437439, + "num_tokens": 8665071.0, + "repeat_count": 0.0, + "routers_loss": 0.010432626120746136, + "skip_count": 2.0, + "step": 5374, + "text_loss": 0.536924421787262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005309318876605042, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8668411.0, + "repeat_count": 0.0, + "routers_loss": 0.004450209904462099, + "skip_count": 1.0, + "step": 5376, + "text_loss": 0.2643466889858246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005306229580832933, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 8672088.0, + "repeat_count": 1.0, + "routers_loss": 0.011189920827746391, + "skip_count": 3.0, + "step": 5378, + "text_loss": 0.8259533047676086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.000530314016771307, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8675206.0, + "repeat_count": 0.0, + "routers_loss": 0.0020095291547477245, + "skip_count": 0.0, + "step": 5380, + "text_loss": 0.31364113092422485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.267684179630173, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0005300050638429324, + "loss": 0.0078, + "macro_f1": 0.3272727429866791, + "num_tokens": 8678289.0, + "repeat_count": 0.0, + "routers_loss": 0.010738557204604149, + "skip_count": 1.0, + "step": 5382, + "text_loss": 0.19013966619968414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0005296960994165607, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8681555.0, + "repeat_count": 0.0, + "routers_loss": 0.0018534278497099876, + "skip_count": 1.0, + "step": 5384, + "text_loss": 0.762248694896698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0005293871236105877, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 8684413.0, + "repeat_count": 0.0, + "routers_loss": 0.009143726900219917, + "skip_count": 2.0, + "step": 5386, + "text_loss": 0.19994212687015533 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 25.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005290781365434134, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8687450.0, + "repeat_count": 2.0, + "routers_loss": 0.002034468576312065, + "skip_count": 0.0, + "step": 5388, + "text_loss": 0.5519160628318787 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0005287691383334425, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8690651.0, + "repeat_count": 1.0, + "routers_loss": 0.006834167055785656, + "skip_count": 0.0, + "step": 5390, + "text_loss": 0.5439304709434509 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0005284601290990832, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8693929.0, + "repeat_count": 1.0, + "routers_loss": 0.0022327799815684557, + "skip_count": 0.0, + "step": 5392, + "text_loss": 0.24108269810676575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0005281511089587491, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 8696727.0, + "repeat_count": 0.0, + "routers_loss": 0.002669565612450242, + "skip_count": 0.0, + "step": 5394, + "text_loss": 0.8659077286720276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005278420780308568, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8700934.0, + "repeat_count": 0.0, + "routers_loss": 0.007252473384141922, + "skip_count": 0.0, + "step": 5396, + "text_loss": 0.5592793226242065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005275330364338276, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 8704449.0, + "repeat_count": 0.0, + "routers_loss": 0.001793015981093049, + "skip_count": 0.0, + "step": 5398, + "text_loss": 0.5211784243583679 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 25.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0005272239842860868, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 8707384.0, + "repeat_count": 5.0, + "routers_loss": 0.00963665172457695, + "skip_count": 4.0, + "step": 5400, + "text_loss": 0.6092788577079773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.36160845318462, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03515625, + "learning_rate": 0.0005269149217060642, + "loss": 0.0059, + "macro_f1": 0.5492662787437439, + "num_tokens": 8710453.0, + "repeat_count": 0.0, + "routers_loss": 0.01758105307817459, + "skip_count": 2.0, + "step": 5402, + "text_loss": 0.3423936069011688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0005266058488121926, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8713514.0, + "repeat_count": 0.0, + "routers_loss": 0.0025636721402406693, + "skip_count": 1.0, + "step": 5404, + "text_loss": 0.484171986579895 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.38039330789551, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0005262967657229095, + "loss": 0.0064, + "macro_f1": 0.9255813956260681, + "num_tokens": 8717051.0, + "repeat_count": 3.0, + "routers_loss": 0.022406045347452164, + "skip_count": 4.0, + "step": 5406, + "text_loss": 0.23368191719055176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005259876725566563, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8719987.0, + "repeat_count": 0.0, + "routers_loss": 0.004114408977329731, + "skip_count": 2.0, + "step": 5408, + "text_loss": 0.20237496495246887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.000525678569431878, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 8723258.0, + "repeat_count": 0.0, + "routers_loss": 0.006741158664226532, + "skip_count": 2.0, + "step": 5410, + "text_loss": 0.7969435453414917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.0005253694564670233, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 8726294.0, + "repeat_count": 0.0, + "routers_loss": 0.0034468702506273985, + "skip_count": 0.0, + "step": 5412, + "text_loss": 0.5533816814422607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000525060333780545, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8729603.0, + "repeat_count": 0.0, + "routers_loss": 0.01086533535271883, + "skip_count": 2.0, + "step": 5414, + "text_loss": 0.31856611371040344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005247512014908998, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 8733423.0, + "repeat_count": 0.0, + "routers_loss": 0.00512756546959281, + "skip_count": 6.0, + "step": 5416, + "text_loss": 0.6710903644561768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0005244420597165472, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 8736457.0, + "repeat_count": 0.0, + "routers_loss": 0.0026201079599559307, + "skip_count": 0.0, + "step": 5418, + "text_loss": 0.6469964981079102 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0005241329085759514, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 8739617.0, + "repeat_count": 0.0, + "routers_loss": 0.004130818881094456, + "skip_count": 0.0, + "step": 5420, + "text_loss": 0.4868837296962738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005238237481875795, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8742653.0, + "repeat_count": 0.0, + "routers_loss": 0.003171122632920742, + "skip_count": 0.0, + "step": 5422, + "text_loss": 0.12026242166757584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0005235145786699021, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 8745835.0, + "repeat_count": 0.0, + "routers_loss": 0.0008553664083592594, + "skip_count": 0.0, + "step": 5424, + "text_loss": 0.601640522480011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005232054001413941, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8749006.0, + "repeat_count": 0.0, + "routers_loss": 0.0006958908052183688, + "skip_count": 0.0, + "step": 5426, + "text_loss": 0.7083519101142883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0005228962127205329, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 8752493.0, + "repeat_count": 0.0, + "routers_loss": 0.0012221037177368999, + "skip_count": 1.0, + "step": 5428, + "text_loss": 0.3949109613895416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005225870165257997, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 8755294.0, + "repeat_count": 1.0, + "routers_loss": 0.003924673888832331, + "skip_count": 2.0, + "step": 5430, + "text_loss": 0.7487186789512634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005222778116756793, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8758043.0, + "repeat_count": 0.0, + "routers_loss": 0.002388258930295706, + "skip_count": 0.0, + "step": 5432, + "text_loss": 0.4092858135700226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0005219685982886594, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 8760618.0, + "repeat_count": 1.0, + "routers_loss": 0.0045886957086622715, + "skip_count": 0.0, + "step": 5434, + "text_loss": 0.5889580249786377 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0005216593764832311, + "loss": 0.0074, + "macro_f1": 1.0, + "num_tokens": 8764269.0, + "repeat_count": 1.0, + "routers_loss": 0.00704155582934618, + "skip_count": 2.0, + "step": 5436, + "text_loss": 0.2634117007255554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005213501463778889, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8767142.0, + "repeat_count": 0.0, + "routers_loss": 0.00368728069588542, + "skip_count": 2.0, + "step": 5438, + "text_loss": 0.3512301445007324 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0005210409080911304, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8770239.0, + "repeat_count": 0.0, + "routers_loss": 0.0012925115879625082, + "skip_count": 0.0, + "step": 5440, + "text_loss": 0.9330073595046997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005207316617414561, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8772927.0, + "repeat_count": 0.0, + "routers_loss": 0.005604506935924292, + "skip_count": 0.0, + "step": 5442, + "text_loss": 0.23477613925933838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.55884942764896, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0005204224074473701, + "loss": 0.0049, + "macro_f1": 0.6601307392120361, + "num_tokens": 8776451.0, + "repeat_count": 1.0, + "routers_loss": 0.010945434682071209, + "skip_count": 2.0, + "step": 5444, + "text_loss": 0.6184295415878296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0005201131453273789, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 8779481.0, + "repeat_count": 0.0, + "routers_loss": 0.0024414353538304567, + "skip_count": 0.0, + "step": 5446, + "text_loss": 0.16186967492103577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.57763428235985, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005198038754999926, + "loss": 0.0052, + "macro_f1": 0.3272727429866791, + "num_tokens": 8782425.0, + "repeat_count": 1.0, + "routers_loss": 0.013872416689991951, + "skip_count": 0.0, + "step": 5448, + "text_loss": 0.42294546961784363 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0005194945980837237, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8785466.0, + "repeat_count": 0.0, + "routers_loss": 0.0006147907115519047, + "skip_count": 0.0, + "step": 5450, + "text_loss": 0.6285432577133179 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005191853131970881, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8788461.0, + "repeat_count": 0.0, + "routers_loss": 0.0010585964191704988, + "skip_count": 0.0, + "step": 5452, + "text_loss": 0.6032317876815796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005188760209586044, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8791572.0, + "repeat_count": 0.0, + "routers_loss": 0.005267909727990627, + "skip_count": 1.0, + "step": 5454, + "text_loss": 0.3015609681606293 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005185667214867937, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8794697.0, + "repeat_count": 0.0, + "routers_loss": 0.000532392121385783, + "skip_count": 0.0, + "step": 5456, + "text_loss": 0.9596265554428101 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0005182574149001805, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8797880.0, + "repeat_count": 0.0, + "routers_loss": 0.0007176774088293314, + "skip_count": 0.0, + "step": 5458, + "text_loss": 0.5599364638328552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0005179481013172912, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8801995.0, + "repeat_count": 0.0, + "routers_loss": 0.0022756673861294985, + "skip_count": 0.0, + "step": 5460, + "text_loss": 0.47327280044555664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005176387808566558, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8805138.0, + "repeat_count": 0.0, + "routers_loss": 0.0025084633380174637, + "skip_count": 0.0, + "step": 5462, + "text_loss": 0.26674970984458923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0005173294536368061, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 8808102.0, + "repeat_count": 0.0, + "routers_loss": 0.0008814680040813982, + "skip_count": 0.0, + "step": 5464, + "text_loss": 0.5981299877166748 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005170201197762773, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8811431.0, + "repeat_count": 0.0, + "routers_loss": 0.0005443177651613951, + "skip_count": 0.0, + "step": 5466, + "text_loss": 1.037438988685608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0005167107793936065, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8814256.0, + "repeat_count": 0.0, + "routers_loss": 0.000494555220939219, + "skip_count": 0.0, + "step": 5468, + "text_loss": 0.5005733966827393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005164014326073333, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 8817024.0, + "repeat_count": 0.0, + "routers_loss": 0.004793747793883085, + "skip_count": 2.0, + "step": 5470, + "text_loss": 0.6999614834785461 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005160920795360002, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 8819892.0, + "repeat_count": 0.0, + "routers_loss": 0.0020966180600225925, + "skip_count": 0.0, + "step": 5472, + "text_loss": 0.5536707043647766 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0005157827202981521, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8822928.0, + "repeat_count": 0.0, + "routers_loss": 0.0020367507822811604, + "skip_count": 0.0, + "step": 5474, + "text_loss": 0.43655988574028015 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005154733550123356, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8825842.0, + "repeat_count": 0.0, + "routers_loss": 0.0020070383325219154, + "skip_count": 0.0, + "step": 5476, + "text_loss": 0.48149657249450684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0005151639837971004, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8829534.0, + "repeat_count": 0.0, + "routers_loss": 0.0016327418852597475, + "skip_count": 0.0, + "step": 5478, + "text_loss": 0.6693689227104187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000514854606770998, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8833177.0, + "repeat_count": 0.0, + "routers_loss": 0.0012691980227828026, + "skip_count": 0.0, + "step": 5480, + "text_loss": 0.44926801323890686 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005145452240525822, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8836933.0, + "repeat_count": 1.0, + "routers_loss": 0.0007724820752628148, + "skip_count": 0.0, + "step": 5482, + "text_loss": 0.5759884119033813 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005142358357604092, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 8840093.0, + "repeat_count": 1.0, + "routers_loss": 0.008331702090799809, + "skip_count": 7.0, + "step": 5484, + "text_loss": 0.47393685579299927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0005139264420130368, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8843918.0, + "repeat_count": 0.0, + "routers_loss": 0.003124477108940482, + "skip_count": 2.0, + "step": 5486, + "text_loss": 0.5298711061477661 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005136170429290259, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8846558.0, + "repeat_count": 0.0, + "routers_loss": 0.0034127775579690933, + "skip_count": 2.0, + "step": 5488, + "text_loss": 0.43582668900489807 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005133076386269383, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8849724.0, + "repeat_count": 1.0, + "routers_loss": 0.0018056259723380208, + "skip_count": 0.0, + "step": 5490, + "text_loss": 0.8116800785064697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.784267684179632, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0005129982292253384, + "loss": 0.0063, + "macro_f1": 0.6589147448539734, + "num_tokens": 8852447.0, + "repeat_count": 1.0, + "routers_loss": 0.021452350541949272, + "skip_count": 6.0, + "step": 5492, + "text_loss": 0.31878748536109924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0005126888148427927, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8855886.0, + "repeat_count": 0.0, + "routers_loss": 0.0026911941822618246, + "skip_count": 0.0, + "step": 5494, + "text_loss": 0.4021807909011841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.80305253889052, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.025634765625, + "learning_rate": 0.0005123793955978693, + "loss": 0.007, + "macro_f1": 0.5492662787437439, + "num_tokens": 8859378.0, + "repeat_count": 0.0, + "routers_loss": 0.019764510914683342, + "skip_count": 2.0, + "step": 5496, + "text_loss": 0.21608132123947144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0005120699716091379, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 8862310.0, + "repeat_count": 0.0, + "routers_loss": 0.0008988190093077719, + "skip_count": 0.0, + "step": 5498, + "text_loss": 0.34666743874549866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005117605429951707, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8865166.0, + "repeat_count": 0.0, + "routers_loss": 0.011137975379824638, + "skip_count": 2.0, + "step": 5500, + "text_loss": 0.25385144352912903 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 25.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005114511098745412, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8869923.0, + "repeat_count": 1.0, + "routers_loss": 0.006476947572082281, + "skip_count": 4.0, + "step": 5502, + "text_loss": 0.4503856301307678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.000511141672365825, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8872451.0, + "repeat_count": 0.0, + "routers_loss": 0.0022727579344063997, + "skip_count": 0.0, + "step": 5504, + "text_loss": 0.7522464990615845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005108322305875987, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8875968.0, + "repeat_count": 0.0, + "routers_loss": 0.0020014268811792135, + "skip_count": 0.0, + "step": 5506, + "text_loss": 0.30184176564216614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0005105227846584414, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8879705.0, + "repeat_count": 0.0, + "routers_loss": 0.001179999322630465, + "skip_count": 0.0, + "step": 5508, + "text_loss": 0.6187804937362671 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0005102133346969329, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8883535.0, + "repeat_count": 1.0, + "routers_loss": 0.002946492750197649, + "skip_count": 0.0, + "step": 5510, + "text_loss": 0.5961501002311707 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005099038808216555, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 8886683.0, + "repeat_count": 1.0, + "routers_loss": 0.004532935563474894, + "skip_count": 3.0, + "step": 5512, + "text_loss": 0.38462957739830017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0005095944231511922, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 8891049.0, + "repeat_count": 0.0, + "routers_loss": 0.00917842984199524, + "skip_count": 2.0, + "step": 5514, + "text_loss": 0.27541956305503845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0005092849618041279, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 8893604.0, + "repeat_count": 0.0, + "routers_loss": 0.0008756510796956718, + "skip_count": 0.0, + "step": 5516, + "text_loss": 0.681315541267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005089754968990487, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8898072.0, + "repeat_count": 0.0, + "routers_loss": 0.0008704439387656748, + "skip_count": 1.0, + "step": 5518, + "text_loss": 0.5060005187988281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005086660285545422, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8901539.0, + "repeat_count": 0.0, + "routers_loss": 0.004750201944261789, + "skip_count": 1.0, + "step": 5520, + "text_loss": 0.6008047461509705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.000508356556889197, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8904525.0, + "repeat_count": 0.0, + "routers_loss": 0.0026552649214863777, + "skip_count": 0.0, + "step": 5522, + "text_loss": 0.4539012908935547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005080470820216037, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8907624.0, + "repeat_count": 0.0, + "routers_loss": 0.002621029270812869, + "skip_count": 1.0, + "step": 5524, + "text_loss": 0.20088370144367218 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 25.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005077376040703533, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 8910515.0, + "repeat_count": 3.0, + "routers_loss": 0.0028921898920089006, + "skip_count": 0.0, + "step": 5526, + "text_loss": 0.6575983166694641 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8888888955116272, + "avg_layers": 21.0, + "epoch": 25.953331376577633, + "f1_execute": 0.9729729890823364, + "f1_repeat": 1.0, + "f1_skip": 0.9411765336990356, + "grad_norm": 0.02734375, + "learning_rate": 0.0005074281231540384, + "loss": 0.0076, + "macro_f1": 0.9713832139968872, + "num_tokens": 8914419.0, + "repeat_count": 1.0, + "routers_loss": 0.024232301861047745, + "skip_count": 9.0, + "step": 5528, + "text_loss": 0.5435594916343689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005071186393912527, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8917543.0, + "repeat_count": 0.0, + "routers_loss": 0.003731841454282403, + "skip_count": 2.0, + "step": 5530, + "text_loss": 0.5152071118354797 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005068091529005909, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 8920728.0, + "repeat_count": 1.0, + "routers_loss": 0.005905418191105127, + "skip_count": 0.0, + "step": 5532, + "text_loss": 0.29741042852401733 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000506499663800649, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 8924112.0, + "repeat_count": 1.0, + "routers_loss": 0.0021933517418801785, + "skip_count": 0.0, + "step": 5534, + "text_loss": 0.45704230666160583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 25.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005061901722100235, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 8927323.0, + "repeat_count": 0.0, + "routers_loss": 0.009227502159774303, + "skip_count": 4.0, + "step": 5536, + "text_loss": 0.1968434453010559 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0005058806782473125, + "loss": 0.0053, + "macro_f1": 0.6601307392120361, + "num_tokens": 8931052.0, + "repeat_count": 1.0, + "routers_loss": 0.02054760232567787, + "skip_count": 2.0, + "step": 5538, + "text_loss": 0.23851273953914642 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0005055711820311144, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8934215.0, + "repeat_count": 0.0, + "routers_loss": 0.0008434011251665652, + "skip_count": 0.0, + "step": 5540, + "text_loss": 0.85942542552948 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005052616836800288, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8937173.0, + "repeat_count": 0.0, + "routers_loss": 0.011105241253972054, + "skip_count": 4.0, + "step": 5542, + "text_loss": 0.2614556849002838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005049521833126561, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8940553.0, + "repeat_count": 0.0, + "routers_loss": 0.0006273435428738594, + "skip_count": 0.0, + "step": 5544, + "text_loss": 0.6430498957633972 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005046426810475976, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8943753.0, + "repeat_count": 0.0, + "routers_loss": 0.0023464353289455175, + "skip_count": 1.0, + "step": 5546, + "text_loss": 0.7015808820724487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0005043331770034547, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 8947149.0, + "repeat_count": 0.0, + "routers_loss": 0.0016024730866774917, + "skip_count": 1.0, + "step": 5548, + "text_loss": 0.5875257253646851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005040236712988304, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8950374.0, + "repeat_count": 0.0, + "routers_loss": 0.004096277989447117, + "skip_count": 0.0, + "step": 5550, + "text_loss": 0.1712338626384735 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0005037141640523275, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8953256.0, + "repeat_count": 1.0, + "routers_loss": 0.00441550649702549, + "skip_count": 0.0, + "step": 5552, + "text_loss": 0.16560404002666473 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005034046553825501, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 8956845.0, + "repeat_count": 4.0, + "routers_loss": 0.011712636798620224, + "skip_count": 6.0, + "step": 5554, + "text_loss": 0.24278216063976288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005030951454081023, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8961165.0, + "repeat_count": 0.0, + "routers_loss": 0.00235542468726635, + "skip_count": 1.0, + "step": 5556, + "text_loss": 0.17214511334896088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.093924273554446, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0005027856342475888, + "loss": 0.0037, + "macro_f1": 0.3272727429866791, + "num_tokens": 8965262.0, + "repeat_count": 0.0, + "routers_loss": 0.0160827673971653, + "skip_count": 1.0, + "step": 5558, + "text_loss": 0.40229740738868713 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0005024761220196151, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 8968278.0, + "repeat_count": 1.0, + "routers_loss": 0.004786997567862272, + "skip_count": 0.0, + "step": 5560, + "text_loss": 0.24828575551509857 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0005021666088427868, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8971443.0, + "repeat_count": 1.0, + "routers_loss": 0.0015378865646198392, + "skip_count": 0.0, + "step": 5562, + "text_loss": 0.7269657254219055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.0005018570948357099, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8975312.0, + "repeat_count": 0.0, + "routers_loss": 0.0015218508196994662, + "skip_count": 0.0, + "step": 5564, + "text_loss": 0.5198811292648315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0005015475801169908, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 8977951.0, + "repeat_count": 0.0, + "routers_loss": 0.008865317329764366, + "skip_count": 1.0, + "step": 5566, + "text_loss": 0.1541406810283661 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005012380648052359, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8981325.0, + "repeat_count": 1.0, + "routers_loss": 0.0055318837985396385, + "skip_count": 0.0, + "step": 5568, + "text_loss": 0.510314404964447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005009285490190523, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8984661.0, + "repeat_count": 0.0, + "routers_loss": 0.0035060355439782143, + "skip_count": 0.0, + "step": 5570, + "text_loss": 0.29421761631965637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000500619032877047, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8987573.0, + "repeat_count": 0.0, + "routers_loss": 0.0050126477144658566, + "skip_count": 2.0, + "step": 5572, + "text_loss": 0.1984361708164215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005003095164978271, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 8991136.0, + "repeat_count": 0.0, + "routers_loss": 0.0019407360814511776, + "skip_count": 0.0, + "step": 5574, + "text_loss": 0.42751404643058777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8994198.0, + "repeat_count": 0.0, + "routers_loss": 0.0029819176997989416, + "skip_count": 2.0, + "step": 5576, + "text_loss": 0.20589640736579895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004996904835021729, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 8997907.0, + "repeat_count": 0.0, + "routers_loss": 0.000878945691511035, + "skip_count": 1.0, + "step": 5578, + "text_loss": 0.2801406979560852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000499380967122953, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9001141.0, + "repeat_count": 0.0, + "routers_loss": 0.005223734769970179, + "skip_count": 1.0, + "step": 5580, + "text_loss": 0.20542480051517487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004990714509809478, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9004794.0, + "repeat_count": 0.0, + "routers_loss": 0.0015868612099438906, + "skip_count": 0.0, + "step": 5582, + "text_loss": 0.32094934582710266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 26.216025829175226, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004987619351947643, + "loss": 0.0064, + "macro_f1": 0.6122449040412903, + "num_tokens": 9009250.0, + "repeat_count": 0.0, + "routers_loss": 0.031923454254865646, + "skip_count": 4.0, + "step": 5584, + "text_loss": 0.609201967716217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0004984524198830095, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9013254.0, + "repeat_count": 0.0, + "routers_loss": 0.0033124545589089394, + "skip_count": 0.0, + "step": 5586, + "text_loss": 0.3698650300502777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0004981429051642903, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9016598.0, + "repeat_count": 0.0, + "routers_loss": 0.0017190382350236177, + "skip_count": 1.0, + "step": 5588, + "text_loss": 0.5306026935577393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.24420311124156, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004978333911572132, + "loss": 0.0059, + "macro_f1": 0.3272727429866791, + "num_tokens": 9019558.0, + "repeat_count": 0.0, + "routers_loss": 0.02051064372062683, + "skip_count": 1.0, + "step": 5590, + "text_loss": 0.23494470119476318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0004975238779803849, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 9023024.0, + "repeat_count": 0.0, + "routers_loss": 0.0010489600244909525, + "skip_count": 0.0, + "step": 5592, + "text_loss": 0.579275906085968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0004972143657524112, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9026161.0, + "repeat_count": 0.0, + "routers_loss": 0.0012039231369271874, + "skip_count": 0.0, + "step": 5594, + "text_loss": 0.5776295065879822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0004969048545918978, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9028814.0, + "repeat_count": 0.0, + "routers_loss": 0.0010212450288236141, + "skip_count": 1.0, + "step": 5596, + "text_loss": 0.6816855669021606 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 26.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00049659534461745, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9032243.0, + "repeat_count": 2.0, + "routers_loss": 0.0024297661148011684, + "skip_count": 0.0, + "step": 5598, + "text_loss": 0.743188202381134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0004962858359476726, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 9035493.0, + "repeat_count": 0.0, + "routers_loss": 0.002151754219084978, + "skip_count": 0.0, + "step": 5600, + "text_loss": 0.5213983654975891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004959763287011698, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 9038213.0, + "repeat_count": 0.0, + "routers_loss": 0.0028108188416808844, + "skip_count": 2.0, + "step": 5602, + "text_loss": 0.5128397345542908 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004956668229965454, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9041152.0, + "repeat_count": 0.0, + "routers_loss": 0.004022551700472832, + "skip_count": 2.0, + "step": 5604, + "text_loss": 0.15361636877059937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0004953573189524026, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9044503.0, + "repeat_count": 0.0, + "routers_loss": 0.0010689410846680403, + "skip_count": 1.0, + "step": 5606, + "text_loss": 0.6454885005950928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0004950478166873439, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 9047742.0, + "repeat_count": 0.0, + "routers_loss": 0.0025760293938219547, + "skip_count": 0.0, + "step": 5608, + "text_loss": 0.7654000520706177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0004947383163199713, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 9050349.0, + "repeat_count": 0.0, + "routers_loss": 0.0009846165776252747, + "skip_count": 0.0, + "step": 5610, + "text_loss": 0.41533342003822327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0004944288179688858, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 9053667.0, + "repeat_count": 0.0, + "routers_loss": 0.0017193946987390518, + "skip_count": 1.0, + "step": 5612, + "text_loss": 1.0172475576400757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004941193217526875, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9056777.0, + "repeat_count": 0.0, + "routers_loss": 0.0026750199031084776, + "skip_count": 0.0, + "step": 5614, + "text_loss": 0.17584927380084991 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 26.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004938098277899765, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 9060609.0, + "repeat_count": 1.0, + "routers_loss": 0.005259076599031687, + "skip_count": 1.0, + "step": 5616, + "text_loss": 0.5522297024726868 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004935003361993511, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9063633.0, + "repeat_count": 0.0, + "routers_loss": 0.0006837095716036856, + "skip_count": 0.0, + "step": 5618, + "text_loss": 0.5212588310241699 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.38508952157323, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0004931908470994091, + "loss": 0.0059, + "macro_f1": 0.6603773832321167, + "num_tokens": 9067777.0, + "repeat_count": 1.0, + "routers_loss": 0.01067375484853983, + "skip_count": 1.0, + "step": 5620, + "text_loss": 0.5515062808990479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 26.394481948928675, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.019775390625, + "learning_rate": 0.0004928813606087474, + "loss": 0.0043, + "macro_f1": 0.5934640765190125, + "num_tokens": 9070938.0, + "repeat_count": 0.0, + "routers_loss": 0.016635602340102196, + "skip_count": 3.0, + "step": 5622, + "text_loss": 0.3225076198577881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004925718768459617, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9074050.0, + "repeat_count": 0.0, + "routers_loss": 0.002216119086369872, + "skip_count": 0.0, + "step": 5624, + "text_loss": 0.32438889145851135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0004922623959296469, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 9076785.0, + "repeat_count": 1.0, + "routers_loss": 0.012125075794756413, + "skip_count": 5.0, + "step": 5626, + "text_loss": 0.39563658833503723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0004919529179783965, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9080239.0, + "repeat_count": 0.0, + "routers_loss": 0.0026486809365451336, + "skip_count": 0.0, + "step": 5628, + "text_loss": 0.5401569604873657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0004916434431108031, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9083935.0, + "repeat_count": 0.0, + "routers_loss": 0.0011849761940538883, + "skip_count": 0.0, + "step": 5630, + "text_loss": 0.4798774719238281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.000491333971445458, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9087174.0, + "repeat_count": 0.0, + "routers_loss": 0.002799210138618946, + "skip_count": 0.0, + "step": 5632, + "text_loss": 0.22488386929035187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004910245031009515, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 9089803.0, + "repeat_count": 0.0, + "routers_loss": 0.00139117450453341, + "skip_count": 0.0, + "step": 5634, + "text_loss": 0.6237335205078125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0004907150381958723, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9093075.0, + "repeat_count": 0.0, + "routers_loss": 0.006503603886812925, + "skip_count": 1.0, + "step": 5636, + "text_loss": 0.18781614303588867 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0004904055768488077, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9096355.0, + "repeat_count": 0.0, + "routers_loss": 0.0009764843271113932, + "skip_count": 0.0, + "step": 5638, + "text_loss": 0.6821450591087341 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004900961191783445, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9098994.0, + "repeat_count": 1.0, + "routers_loss": 0.00693159457296133, + "skip_count": 3.0, + "step": 5640, + "text_loss": 0.214790940284729 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0004897866653030671, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9102048.0, + "repeat_count": 0.0, + "routers_loss": 0.002469591563567519, + "skip_count": 0.0, + "step": 5642, + "text_loss": 0.1556607335805893 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004894772153415588, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9105379.0, + "repeat_count": 0.0, + "routers_loss": 0.0004824921488761902, + "skip_count": 0.0, + "step": 5644, + "text_loss": 0.499972403049469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004891677694124013, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9108240.0, + "repeat_count": 0.0, + "routers_loss": 0.0029356612358242273, + "skip_count": 1.0, + "step": 5646, + "text_loss": 0.5169754028320312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0174560546875, + "learning_rate": 0.0004888583276341751, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 9111381.0, + "repeat_count": 0.0, + "routers_loss": 0.009489183314144611, + "skip_count": 1.0, + "step": 5648, + "text_loss": 0.23630797863006592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.017822265625, + "learning_rate": 0.0004885488901254588, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9114015.0, + "repeat_count": 0.0, + "routers_loss": 0.004154495894908905, + "skip_count": 1.0, + "step": 5650, + "text_loss": 0.3345947563648224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0004882394570048294, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9117044.0, + "repeat_count": 0.0, + "routers_loss": 0.0018865863094106317, + "skip_count": 0.0, + "step": 5652, + "text_loss": 0.32814112305641174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0004879300283908623, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9120035.0, + "repeat_count": 0.0, + "routers_loss": 0.0035278978757560253, + "skip_count": 1.0, + "step": 5654, + "text_loss": 0.4081386625766754 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00048762060440213096, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 9122955.0, + "repeat_count": 1.0, + "routers_loss": 0.0053498269990086555, + "skip_count": 0.0, + "step": 5656, + "text_loss": 0.31027838587760925 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004873111851572075, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9125635.0, + "repeat_count": 0.0, + "routers_loss": 0.004556098487228155, + "skip_count": 0.0, + "step": 5658, + "text_loss": 0.25703540444374084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004870017707746617, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 9128906.0, + "repeat_count": 0.0, + "routers_loss": 0.0031165245454758406, + "skip_count": 2.0, + "step": 5660, + "text_loss": 0.20663656294345856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004866923613730617, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 9132030.0, + "repeat_count": 1.0, + "routers_loss": 0.004887583665549755, + "skip_count": 2.0, + "step": 5662, + "text_loss": 0.6062649488449097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004863829570709741, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 9135274.0, + "repeat_count": 0.0, + "routers_loss": 0.0021857863757759333, + "skip_count": 0.0, + "step": 5664, + "text_loss": 0.49644309282302856 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.601115350748458, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0004860735579869631, + "loss": 0.0088, + "macro_f1": 0.925203263759613, + "num_tokens": 9139735.0, + "repeat_count": 3.0, + "routers_loss": 0.05413912236690521, + "skip_count": 5.0, + "step": 5666, + "text_loss": 0.25161290168762207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00048576416423959097, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9142419.0, + "repeat_count": 0.0, + "routers_loss": 0.002229376696050167, + "skip_count": 0.0, + "step": 5668, + "text_loss": 0.5332949161529541 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0004854547759474179, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 9145443.0, + "repeat_count": 1.0, + "routers_loss": 0.005968933925032616, + "skip_count": 4.0, + "step": 5670, + "text_loss": 0.5282154083251953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.629292632814792, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0004851453932290021, + "loss": 0.0085, + "macro_f1": 0.3272727429866791, + "num_tokens": 9147754.0, + "repeat_count": 0.0, + "routers_loss": 0.04015754163265228, + "skip_count": 1.0, + "step": 5672, + "text_loss": 0.8564629554748535 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.63868506017024, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00048483601620289974, + "loss": 0.0058, + "macro_f1": 0.8820862174034119, + "num_tokens": 9151714.0, + "repeat_count": 2.0, + "routers_loss": 0.019172413274645805, + "skip_count": 2.0, + "step": 5674, + "text_loss": 0.4149441123008728 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0004845266449876645, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9154524.0, + "repeat_count": 1.0, + "routers_loss": 0.005025535821914673, + "skip_count": 0.0, + "step": 5676, + "text_loss": 0.26525792479515076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.000484217279701848, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9158546.0, + "repeat_count": 0.0, + "routers_loss": 0.0012200147612020373, + "skip_count": 0.0, + "step": 5678, + "text_loss": 0.5532271862030029 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0004839079204639998, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9161003.0, + "repeat_count": 0.0, + "routers_loss": 0.0013485675444826484, + "skip_count": 1.0, + "step": 5680, + "text_loss": 0.36826151609420776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0004835985673926668, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9164741.0, + "repeat_count": 0.0, + "routers_loss": 0.00532014574855566, + "skip_count": 2.0, + "step": 5682, + "text_loss": 0.16154609620571136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0004832892206063938, + "loss": 0.0075, + "macro_f1": 1.0, + "num_tokens": 9168079.0, + "repeat_count": 2.0, + "routers_loss": 0.007782323285937309, + "skip_count": 3.0, + "step": 5684, + "text_loss": 0.4323575496673584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.0004829798802237228, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9171352.0, + "repeat_count": 0.0, + "routers_loss": 0.0024159469176083803, + "skip_count": 2.0, + "step": 5686, + "text_loss": 0.3163119852542877 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.000482670546363194, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 9175197.0, + "repeat_count": 0.0, + "routers_loss": 0.002455134643241763, + "skip_count": 0.0, + "step": 5688, + "text_loss": 0.59735506772995 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.713824479013795, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0004823612191433443, + "loss": 0.0042, + "macro_f1": 0.8820862174034119, + "num_tokens": 9177648.0, + "repeat_count": 2.0, + "routers_loss": 0.015524548478424549, + "skip_count": 2.0, + "step": 5690, + "text_loss": 0.759812593460083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00048205189868270887, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 9180694.0, + "repeat_count": 0.0, + "routers_loss": 0.002112736226990819, + "skip_count": 2.0, + "step": 5692, + "text_loss": 0.3516882061958313 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 26.732609333724685, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.025146484375, + "learning_rate": 0.00048174258509981973, + "loss": 0.0063, + "macro_f1": 0.9262410998344421, + "num_tokens": 9183502.0, + "repeat_count": 2.0, + "routers_loss": 0.03100527822971344, + "skip_count": 3.0, + "step": 5694, + "text_loss": 0.3722715973854065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004814332785132064, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9186417.0, + "repeat_count": 0.0, + "routers_loss": 0.009176591411232948, + "skip_count": 2.0, + "step": 5696, + "text_loss": 0.33363673090934753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.751394188435572, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004811239790413958, + "loss": 0.0076, + "macro_f1": 0.3272727429866791, + "num_tokens": 9189478.0, + "repeat_count": 0.0, + "routers_loss": 0.023586507886648178, + "skip_count": 1.0, + "step": 5698, + "text_loss": 0.19698107242584229 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00048081468680291194, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9192115.0, + "repeat_count": 0.0, + "routers_loss": 0.005083440337330103, + "skip_count": 1.0, + "step": 5700, + "text_loss": 0.3476336896419525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004805054019162764, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9195176.0, + "repeat_count": 0.0, + "routers_loss": 0.007766073569655418, + "skip_count": 1.0, + "step": 5702, + "text_loss": 0.27114811539649963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0004801961245000076, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9199091.0, + "repeat_count": 0.0, + "routers_loss": 0.0009058842551894486, + "skip_count": 0.0, + "step": 5704, + "text_loss": 0.6249846816062927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0004798868546726212, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9202003.0, + "repeat_count": 0.0, + "routers_loss": 0.005479823332279921, + "skip_count": 0.0, + "step": 5706, + "text_loss": 0.47223609685897827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0166015625, + "learning_rate": 0.00047957759255263014, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9205277.0, + "repeat_count": 0.0, + "routers_loss": 0.001055705244652927, + "skip_count": 0.0, + "step": 5708, + "text_loss": 0.677215576171875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00047926833825854377, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9208844.0, + "repeat_count": 0.0, + "routers_loss": 0.003291431115940213, + "skip_count": 2.0, + "step": 5710, + "text_loss": 0.12439999729394913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0004789590919088696, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 9211619.0, + "repeat_count": 0.0, + "routers_loss": 0.005120242480188608, + "skip_count": 2.0, + "step": 5712, + "text_loss": 0.5771954655647278 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0004786498536221111, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 9214914.0, + "repeat_count": 1.0, + "routers_loss": 0.004877795465290546, + "skip_count": 2.0, + "step": 5714, + "text_loss": 0.6432198882102966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.00047834062351676893, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 9218186.0, + "repeat_count": 0.0, + "routers_loss": 0.0026507999282330275, + "skip_count": 0.0, + "step": 5716, + "text_loss": 0.23814935982227325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00047803140171134075, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9221754.0, + "repeat_count": 0.0, + "routers_loss": 0.002605629386380315, + "skip_count": 1.0, + "step": 5718, + "text_loss": 0.2910388708114624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0004777221883243208, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9224502.0, + "repeat_count": 0.0, + "routers_loss": 0.0048494706861674786, + "skip_count": 3.0, + "step": 5720, + "text_loss": 0.6195104122161865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004774129834742004, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 9227350.0, + "repeat_count": 0.0, + "routers_loss": 0.003092368133366108, + "skip_count": 0.0, + "step": 5722, + "text_loss": 0.35447990894317627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00047710378727946725, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 9230166.0, + "repeat_count": 0.0, + "routers_loss": 0.012780336663126945, + "skip_count": 2.0, + "step": 5724, + "text_loss": 0.27581867575645447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00047679459985860604, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9233029.0, + "repeat_count": 0.0, + "routers_loss": 0.005429140292108059, + "skip_count": 1.0, + "step": 5726, + "text_loss": 0.2636827826499939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00047648542133009794, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9236317.0, + "repeat_count": 0.0, + "routers_loss": 0.0023909916635602713, + "skip_count": 0.0, + "step": 5728, + "text_loss": 0.4801979064941406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00047617625181242077, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9239796.0, + "repeat_count": 0.0, + "routers_loss": 0.003603481687605381, + "skip_count": 0.0, + "step": 5730, + "text_loss": 0.8374754786491394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0004758670914240488, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9243489.0, + "repeat_count": 0.0, + "routers_loss": 0.004478964954614639, + "skip_count": 2.0, + "step": 5732, + "text_loss": 0.3870154917240143 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000475557940283453, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9246758.0, + "repeat_count": 0.0, + "routers_loss": 0.00312575395219028, + "skip_count": 1.0, + "step": 5734, + "text_loss": 0.42341071367263794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00047524879850910026, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 9250053.0, + "repeat_count": 0.0, + "routers_loss": 0.010855631902813911, + "skip_count": 4.0, + "step": 5736, + "text_loss": 0.25729796290397644 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0004749396662194549, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 9253691.0, + "repeat_count": 0.0, + "routers_loss": 0.0009250419097952545, + "skip_count": 0.0, + "step": 5738, + "text_loss": 0.6151770949363708 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0004746305435329767, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 9256866.0, + "repeat_count": 1.0, + "routers_loss": 0.007521102204918861, + "skip_count": 3.0, + "step": 5740, + "text_loss": 0.3094986379146576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004743214305681221, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9259790.0, + "repeat_count": 0.0, + "routers_loss": 0.0022241887636482716, + "skip_count": 1.0, + "step": 5742, + "text_loss": 0.5418204069137573 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00047401232744334376, + "loss": 0.0071, + "macro_f1": 1.0, + "num_tokens": 9263205.0, + "repeat_count": 1.0, + "routers_loss": 0.008611299097537994, + "skip_count": 2.0, + "step": 5744, + "text_loss": 0.35824623703956604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 26.976812444966246, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004737032342770906, + "loss": 0.0062, + "macro_f1": 0.5492662787437439, + "num_tokens": 9266126.0, + "repeat_count": 0.0, + "routers_loss": 0.010788857005536556, + "skip_count": 2.0, + "step": 5746, + "text_loss": 0.2172674983739853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0004733941511878074, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9269308.0, + "repeat_count": 0.0, + "routers_loss": 0.005309196189045906, + "skip_count": 2.0, + "step": 5748, + "text_loss": 0.1696814000606537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.00047308507829393594, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 9272801.0, + "repeat_count": 0.0, + "routers_loss": 0.009940510615706444, + "skip_count": 2.0, + "step": 5750, + "text_loss": 0.24295592308044434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00047277601571391314, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9276197.0, + "repeat_count": 0.0, + "routers_loss": 0.000687236781232059, + "skip_count": 0.0, + "step": 5752, + "text_loss": 0.8511804342269897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.014088641033165, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00047246696356617254, + "loss": 0.0059, + "macro_f1": 0.6603773832321167, + "num_tokens": 9278965.0, + "repeat_count": 1.0, + "routers_loss": 0.009816894307732582, + "skip_count": 1.0, + "step": 5754, + "text_loss": 0.45420053601264954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0004721579219691434, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9282076.0, + "repeat_count": 0.0, + "routers_loss": 0.0015747188590466976, + "skip_count": 0.0, + "step": 5756, + "text_loss": 0.21671754121780396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0004718488910412511, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9285465.0, + "repeat_count": 0.0, + "routers_loss": 0.008654040284454823, + "skip_count": 2.0, + "step": 5758, + "text_loss": 0.25920194387435913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00047153987090091674, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9288156.0, + "repeat_count": 0.0, + "routers_loss": 0.0011430777376517653, + "skip_count": 0.0, + "step": 5760, + "text_loss": 0.7655444741249084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004712308616665576, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 9291529.0, + "repeat_count": 0.0, + "routers_loss": 0.003674200503155589, + "skip_count": 2.0, + "step": 5762, + "text_loss": 0.269486665725708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0004709218634565866, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9294699.0, + "repeat_count": 0.0, + "routers_loss": 0.003249827306717634, + "skip_count": 1.0, + "step": 5764, + "text_loss": 0.5073734521865845 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00047061287638941235, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 9297863.0, + "repeat_count": 1.0, + "routers_loss": 0.002763139782473445, + "skip_count": 2.0, + "step": 5766, + "text_loss": 0.2572014033794403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00047030390058343935, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9301124.0, + "repeat_count": 0.0, + "routers_loss": 0.007100266870111227, + "skip_count": 3.0, + "step": 5768, + "text_loss": 0.4147387742996216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0004699949361570676, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 9304330.0, + "repeat_count": 0.0, + "routers_loss": 0.005467240232974291, + "skip_count": 1.0, + "step": 5770, + "text_loss": 0.21510964632034302 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.000469685983228693, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9306882.0, + "repeat_count": 0.0, + "routers_loss": 0.003167890477925539, + "skip_count": 0.0, + "step": 5772, + "text_loss": 0.45717427134513855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.108012914587615, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00046937704191670675, + "loss": 0.0057, + "macro_f1": 0.6601307392120361, + "num_tokens": 9309767.0, + "repeat_count": 1.0, + "routers_loss": 0.014881107024848461, + "skip_count": 2.0, + "step": 5774, + "text_loss": 0.3464985191822052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004690681123394959, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9313045.0, + "repeat_count": 0.0, + "routers_loss": 0.00379011663608253, + "skip_count": 2.0, + "step": 5776, + "text_loss": 0.33194616436958313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00046875919461544265, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 9315736.0, + "repeat_count": 0.0, + "routers_loss": 0.0016733441734686494, + "skip_count": 0.0, + "step": 5778, + "text_loss": 0.5009998679161072 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00046845028886292493, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9318456.0, + "repeat_count": 0.0, + "routers_loss": 0.005318894516676664, + "skip_count": 1.0, + "step": 5780, + "text_loss": 0.17702752351760864 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.145582624009393, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.044921875, + "learning_rate": 0.00046814139520031615, + "loss": 0.006, + "macro_f1": 0.8820862174034119, + "num_tokens": 9323152.0, + "repeat_count": 2.0, + "routers_loss": 0.01133672520518303, + "skip_count": 2.0, + "step": 5782, + "text_loss": 0.2886650860309601 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0004678325137459845, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9326318.0, + "repeat_count": 0.0, + "routers_loss": 0.002458433620631695, + "skip_count": 0.0, + "step": 5784, + "text_loss": 0.5832745432853699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0004675236446182946, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9329779.0, + "repeat_count": 0.0, + "routers_loss": 0.0005402310052886605, + "skip_count": 0.0, + "step": 5786, + "text_loss": 0.5699237585067749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00046721478793560525, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 9333360.0, + "repeat_count": 0.0, + "routers_loss": 0.0002638917067088187, + "skip_count": 0.0, + "step": 5788, + "text_loss": 0.6555714011192322 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00046690594381627106, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9336498.0, + "repeat_count": 0.0, + "routers_loss": 0.003998351749032736, + "skip_count": 2.0, + "step": 5790, + "text_loss": 0.2076750248670578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00046659711237864157, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9339724.0, + "repeat_count": 0.0, + "routers_loss": 0.0045847659930586815, + "skip_count": 1.0, + "step": 5792, + "text_loss": 0.22027169167995453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.00046628829374106167, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 9342835.0, + "repeat_count": 0.0, + "routers_loss": 0.0014064523857086897, + "skip_count": 1.0, + "step": 5794, + "text_loss": 0.5120179057121277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004659794880218712, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9346757.0, + "repeat_count": 0.0, + "routers_loss": 0.0011155207175761461, + "skip_count": 1.0, + "step": 5796, + "text_loss": 0.6415372490882874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004656706953394051, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 9349652.0, + "repeat_count": 0.0, + "routers_loss": 0.0020385095849633217, + "skip_count": 0.0, + "step": 5798, + "text_loss": 0.5410398840904236 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0004653619158119933, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 9354286.0, + "repeat_count": 1.0, + "routers_loss": 0.0012847178149968386, + "skip_count": 0.0, + "step": 5800, + "text_loss": 0.4386860728263855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.00046505314955796074, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9357682.0, + "repeat_count": 0.0, + "routers_loss": 0.0035008061677217484, + "skip_count": 2.0, + "step": 5802, + "text_loss": 0.13655950129032135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00046474439669562715, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9361058.0, + "repeat_count": 0.0, + "routers_loss": 0.0020033426117151976, + "skip_count": 1.0, + "step": 5804, + "text_loss": 0.6293444037437439 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00046443565734330714, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9364173.0, + "repeat_count": 0.0, + "routers_loss": 0.0004935986362397671, + "skip_count": 0.0, + "step": 5806, + "text_loss": 0.2923166751861572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004641269316193104, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9366980.0, + "repeat_count": 0.0, + "routers_loss": 0.001654456602409482, + "skip_count": 0.0, + "step": 5808, + "text_loss": 0.7273373007774353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0004638182196419411, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 9370581.0, + "repeat_count": 0.0, + "routers_loss": 0.0017011919990181923, + "skip_count": 0.0, + "step": 5810, + "text_loss": 0.6029995083808899 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 27.286469034341064, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.038330078125, + "learning_rate": 0.0004635095215294984, + "loss": 0.0072, + "macro_f1": 0.9265305995941162, + "num_tokens": 9374233.0, + "repeat_count": 1.0, + "routers_loss": 0.01361197978258133, + "skip_count": 3.0, + "step": 5812, + "text_loss": 0.14051523804664612 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00046320083740027584, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 9377217.0, + "repeat_count": 0.0, + "routers_loss": 0.004597014281898737, + "skip_count": 0.0, + "step": 5814, + "text_loss": 0.2766880691051483 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 27.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00046289216737256184, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 9380336.0, + "repeat_count": 3.0, + "routers_loss": 0.006628422066569328, + "skip_count": 1.0, + "step": 5816, + "text_loss": 0.8092381954193115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0004625835115646393, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9382968.0, + "repeat_count": 0.0, + "routers_loss": 0.002737772185355425, + "skip_count": 0.0, + "step": 5818, + "text_loss": 0.22090643644332886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 27.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0004622748700947856, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 9386203.0, + "repeat_count": 1.0, + "routers_loss": 0.004552177153527737, + "skip_count": 1.0, + "step": 5820, + "text_loss": 0.42869850993156433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0004619662430812729, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9388968.0, + "repeat_count": 0.0, + "routers_loss": 0.003149240743368864, + "skip_count": 2.0, + "step": 5822, + "text_loss": 0.45137661695480347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0004616576306423677, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 9392487.0, + "repeat_count": 0.0, + "routers_loss": 0.0008133690571412444, + "skip_count": 0.0, + "step": 5824, + "text_loss": 0.638685941696167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0004613490328963307, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 9395665.0, + "repeat_count": 0.0, + "routers_loss": 0.00042717234464362264, + "skip_count": 0.0, + "step": 5826, + "text_loss": 0.8134317398071289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00046104044996141716, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9398831.0, + "repeat_count": 0.0, + "routers_loss": 0.0084775285795331, + "skip_count": 2.0, + "step": 5828, + "text_loss": 0.19263958930969238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0004607318819558768, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 9403118.0, + "repeat_count": 1.0, + "routers_loss": 0.0030239911284297705, + "skip_count": 0.0, + "step": 5830, + "text_loss": 0.45556432008743286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 27.38039330789551, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0458984375, + "learning_rate": 0.00046042332899795313, + "loss": 0.0075, + "macro_f1": 0.5492662787437439, + "num_tokens": 9406206.0, + "repeat_count": 0.0, + "routers_loss": 0.026389889419078827, + "skip_count": 2.0, + "step": 5832, + "text_loss": 0.26458361744880676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0004601147912058845, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 9409806.0, + "repeat_count": 0.0, + "routers_loss": 0.0013476534513756633, + "skip_count": 0.0, + "step": 5834, + "text_loss": 0.7443689107894897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004598062686979033, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9412737.0, + "repeat_count": 0.0, + "routers_loss": 0.004275512881577015, + "skip_count": 1.0, + "step": 5836, + "text_loss": 0.2808683514595032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00045949776159223563, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9415818.0, + "repeat_count": 0.0, + "routers_loss": 0.0027225434314459562, + "skip_count": 0.0, + "step": 5838, + "text_loss": 0.6283587217330933 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0004591892700071022, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 9419119.0, + "repeat_count": 1.0, + "routers_loss": 0.01574302278459072, + "skip_count": 2.0, + "step": 5840, + "text_loss": 0.33239027857780457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00045888079406071746, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 9422257.0, + "repeat_count": 0.0, + "routers_loss": 0.0007227854221127927, + "skip_count": 0.0, + "step": 5842, + "text_loss": 0.6658740043640137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00045857233387129, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9425071.0, + "repeat_count": 0.0, + "routers_loss": 0.0020696306601166725, + "skip_count": 2.0, + "step": 5844, + "text_loss": 0.5773820877075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0004582638895570224, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9427980.0, + "repeat_count": 0.0, + "routers_loss": 0.0019764541648328304, + "skip_count": 0.0, + "step": 5846, + "text_loss": 0.3388919532299042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.455532726739065, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.000457955461236111, + "loss": 0.0058, + "macro_f1": 0.3272727429866791, + "num_tokens": 9430733.0, + "repeat_count": 1.0, + "routers_loss": 0.04235004261136055, + "skip_count": 0.0, + "step": 5848, + "text_loss": 0.44346582889556885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004576470490267462, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 9433347.0, + "repeat_count": 0.0, + "routers_loss": 0.000801609072368592, + "skip_count": 0.0, + "step": 5850, + "text_loss": 0.5825944542884827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0004573386530471121, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9436172.0, + "repeat_count": 0.0, + "routers_loss": 0.0018224078230559826, + "skip_count": 2.0, + "step": 5852, + "text_loss": 0.8111652135848999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004570302734153866, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9439040.0, + "repeat_count": 0.0, + "routers_loss": 0.006614950485527515, + "skip_count": 2.0, + "step": 5854, + "text_loss": 0.31270334124565125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0004567219102497412, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9442138.0, + "repeat_count": 0.0, + "routers_loss": 0.0012984242057427764, + "skip_count": 0.0, + "step": 5856, + "text_loss": 0.6126856803894043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004564135636683416, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9445600.0, + "repeat_count": 0.0, + "routers_loss": 0.0008388847345486283, + "skip_count": 0.0, + "step": 5858, + "text_loss": 0.8526380658149719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046875, + "learning_rate": 0.0004561052337893467, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 9449609.0, + "repeat_count": 0.0, + "routers_loss": 0.008125773631036282, + "skip_count": 2.0, + "step": 5860, + "text_loss": 0.2843833863735199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.000455796920730909, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9452756.0, + "repeat_count": 0.0, + "routers_loss": 0.0019371749367564917, + "skip_count": 0.0, + "step": 5862, + "text_loss": 0.5293750166893005 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0004554886246111746, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9455467.0, + "repeat_count": 1.0, + "routers_loss": 0.005594742484390736, + "skip_count": 2.0, + "step": 5864, + "text_loss": 0.572329044342041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004551803455482833, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9458953.0, + "repeat_count": 0.0, + "routers_loss": 0.005960086826235056, + "skip_count": 3.0, + "step": 5866, + "text_loss": 0.19459208846092224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00045487208366036807, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 9462130.0, + "repeat_count": 0.0, + "routers_loss": 0.0034781871363520622, + "skip_count": 1.0, + "step": 5868, + "text_loss": 0.20467053353786469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00045456383906555554, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9465590.0, + "repeat_count": 0.0, + "routers_loss": 0.0012246103724464774, + "skip_count": 0.0, + "step": 5870, + "text_loss": 0.6086251735687256 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00045425561188196565, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9468092.0, + "repeat_count": 0.0, + "routers_loss": 0.002874316181987524, + "skip_count": 1.0, + "step": 5872, + "text_loss": 0.3430633544921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004539474022277115, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9471433.0, + "repeat_count": 0.0, + "routers_loss": 0.004340244457125664, + "skip_count": 2.0, + "step": 5874, + "text_loss": 0.28219133615493774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0004536392102208997, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9474363.0, + "repeat_count": 0.0, + "routers_loss": 0.0007322742021642625, + "skip_count": 0.0, + "step": 5876, + "text_loss": 0.7305856943130493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0004533310359796299, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9478469.0, + "repeat_count": 0.0, + "routers_loss": 0.0018631393322721124, + "skip_count": 0.0, + "step": 5878, + "text_loss": 0.5821442604064941 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 27.60581156442618, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0004530228796219952, + "loss": 0.0088, + "macro_f1": 0.9262410998344421, + "num_tokens": 9481200.0, + "repeat_count": 2.0, + "routers_loss": 0.026109615340828896, + "skip_count": 3.0, + "step": 5880, + "text_loss": 0.3962891101837158 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00045271474126608167, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9484200.0, + "repeat_count": 0.0, + "routers_loss": 0.0004716445691883564, + "skip_count": 0.0, + "step": 5882, + "text_loss": 0.31901776790618896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004524066210299685, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 9488939.0, + "repeat_count": 0.0, + "routers_loss": 0.0003797562967520207, + "skip_count": 0.0, + "step": 5884, + "text_loss": 0.3992912471294403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004520985190317279, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 9492010.0, + "repeat_count": 0.0, + "routers_loss": 0.005681614391505718, + "skip_count": 1.0, + "step": 5886, + "text_loss": 0.5318995118141174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0004517904353894253, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9494770.0, + "repeat_count": 0.0, + "routers_loss": 0.0021422000136226416, + "skip_count": 0.0, + "step": 5888, + "text_loss": 0.435088187456131 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.652773701203404, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0004514823702211187, + "loss": 0.0052, + "macro_f1": 0.8820862174034119, + "num_tokens": 9497327.0, + "repeat_count": 2.0, + "routers_loss": 0.01593884639441967, + "skip_count": 2.0, + "step": 5890, + "text_loss": 0.5068450570106506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.662166128558848, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00045117432364485927, + "loss": 0.0075, + "macro_f1": 0.6601307392120361, + "num_tokens": 9500488.0, + "repeat_count": 1.0, + "routers_loss": 0.0729660913348198, + "skip_count": 2.0, + "step": 5892, + "text_loss": 0.42718732357025146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.00045086629577869127, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9503593.0, + "repeat_count": 0.0, + "routers_loss": 0.007092897780239582, + "skip_count": 2.0, + "step": 5894, + "text_loss": 0.4264345169067383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.00045055828674065134, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9507188.0, + "repeat_count": 0.0, + "routers_loss": 0.004088073968887329, + "skip_count": 2.0, + "step": 5896, + "text_loss": 0.20932413637638092 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00045025029664876926, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9510126.0, + "repeat_count": 1.0, + "routers_loss": 0.0026970503386110067, + "skip_count": 0.0, + "step": 5898, + "text_loss": 0.47661110758781433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0164794921875, + "learning_rate": 0.0004499423256210673, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9513891.0, + "repeat_count": 0.0, + "routers_loss": 0.003428407246246934, + "skip_count": 0.0, + "step": 5900, + "text_loss": 0.18232668936252594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00044963437377556066, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9516718.0, + "repeat_count": 0.0, + "routers_loss": 0.0020270352251827717, + "skip_count": 0.0, + "step": 5902, + "text_loss": 0.16833586990833282 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.000449326441230257, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9520248.0, + "repeat_count": 0.0, + "routers_loss": 0.0019144838443025947, + "skip_count": 0.0, + "step": 5904, + "text_loss": 0.44434574246406555 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00044901852810315634, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9523651.0, + "repeat_count": 0.0, + "routers_loss": 0.0044578867964446545, + "skip_count": 2.0, + "step": 5906, + "text_loss": 0.1248839721083641 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0004487106345122522, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9527235.0, + "repeat_count": 0.0, + "routers_loss": 0.000827222247608006, + "skip_count": 0.0, + "step": 5908, + "text_loss": 0.6052893996238708 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.74669797475785, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004484027605755296, + "loss": 0.0065, + "macro_f1": 0.5492662787437439, + "num_tokens": 9530407.0, + "repeat_count": 2.0, + "routers_loss": 0.029739778488874435, + "skip_count": 0.0, + "step": 5910, + "text_loss": 0.7625715732574463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.00044809490641096653, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 9533229.0, + "repeat_count": 0.0, + "routers_loss": 0.0025658784434199333, + "skip_count": 0.0, + "step": 5912, + "text_loss": 0.27842655777931213 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 27.76548282946874, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.042724609375, + "learning_rate": 0.00044778707213653324, + "loss": 0.0069, + "macro_f1": 0.9265305995941162, + "num_tokens": 9537397.0, + "repeat_count": 1.0, + "routers_loss": 0.010157953947782516, + "skip_count": 3.0, + "step": 5914, + "text_loss": 0.45196083188056946 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004474792578701924, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9540564.0, + "repeat_count": 3.0, + "routers_loss": 0.011994685977697372, + "skip_count": 5.0, + "step": 5916, + "text_loss": 0.22617442905902863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000447171463729899, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9543602.0, + "repeat_count": 0.0, + "routers_loss": 0.0022214490454643965, + "skip_count": 0.0, + "step": 5918, + "text_loss": 0.5089073777198792 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004468636898336003, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 9546829.0, + "repeat_count": 1.0, + "routers_loss": 0.009353389963507652, + "skip_count": 2.0, + "step": 5920, + "text_loss": 0.7560386657714844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.00044655593629923596, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 9550259.0, + "repeat_count": 0.0, + "routers_loss": 0.005637963302433491, + "skip_count": 0.0, + "step": 5922, + "text_loss": 0.17084793746471405 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00044624820324473766, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9554376.0, + "repeat_count": 1.0, + "routers_loss": 0.008556432090699673, + "skip_count": 2.0, + "step": 5924, + "text_loss": 0.5906872749328613 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004459404907880292, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9558348.0, + "repeat_count": 1.0, + "routers_loss": 0.0016659445827826858, + "skip_count": 0.0, + "step": 5926, + "text_loss": 0.8197194933891296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.00044563279904702674, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9561139.0, + "repeat_count": 0.0, + "routers_loss": 0.01341368816792965, + "skip_count": 3.0, + "step": 5928, + "text_loss": 0.3264874815940857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.000445325128139638, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9564387.0, + "repeat_count": 0.0, + "routers_loss": 0.005023977253586054, + "skip_count": 2.0, + "step": 5930, + "text_loss": 0.9055862426757812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004450174781837635, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9567053.0, + "repeat_count": 0.0, + "routers_loss": 0.0006051476229913533, + "skip_count": 0.0, + "step": 5932, + "text_loss": 0.6908539533615112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0004447098492972951, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9570036.0, + "repeat_count": 0.0, + "routers_loss": 0.003152312943711877, + "skip_count": 0.0, + "step": 5934, + "text_loss": 0.6321061849594116 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0004444022415981167, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 9574146.0, + "repeat_count": 0.0, + "routers_loss": 0.004859412554651499, + "skip_count": 1.0, + "step": 5936, + "text_loss": 0.5905604958534241 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 27.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.00044409465520410426, + "loss": 0.0071, + "macro_f1": 1.0, + "num_tokens": 9577071.0, + "repeat_count": 1.0, + "routers_loss": 0.004376287572085857, + "skip_count": 1.0, + "step": 5938, + "text_loss": 0.6928377747535706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00044378709023312535, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9580537.0, + "repeat_count": 0.0, + "routers_loss": 0.004038849379867315, + "skip_count": 1.0, + "step": 5940, + "text_loss": 0.2686770558357239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0004434795468030396, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9583225.0, + "repeat_count": 0.0, + "routers_loss": 0.005459951236844063, + "skip_count": 2.0, + "step": 5942, + "text_loss": 0.16855180263519287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.000443172025031698, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 9586018.0, + "repeat_count": 0.0, + "routers_loss": 0.0032985717989504337, + "skip_count": 2.0, + "step": 5944, + "text_loss": 0.20335732400417328 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004428645250369437, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 9589321.0, + "repeat_count": 1.0, + "routers_loss": 0.003573323367163539, + "skip_count": 0.0, + "step": 5946, + "text_loss": 0.6318653225898743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00044255704693661117, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9592518.0, + "repeat_count": 0.0, + "routers_loss": 0.002226749900728464, + "skip_count": 0.0, + "step": 5948, + "text_loss": 0.5320658683776855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004422495908485265, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9595664.0, + "repeat_count": 0.0, + "routers_loss": 0.0007805621717125177, + "skip_count": 0.0, + "step": 5950, + "text_loss": 0.6330106258392334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0004419421568905077, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9598885.0, + "repeat_count": 0.0, + "routers_loss": 0.0017050127498805523, + "skip_count": 0.0, + "step": 5952, + "text_loss": 0.6098045706748962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00044163474518036375, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9603021.0, + "repeat_count": 0.0, + "routers_loss": 0.0025974081363528967, + "skip_count": 0.0, + "step": 5954, + "text_loss": 0.2655932903289795 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.00044132735583589567, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 9605841.0, + "repeat_count": 1.0, + "routers_loss": 0.010364850051701069, + "skip_count": 2.0, + "step": 5956, + "text_loss": 0.3028552532196045 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.015869140625, + "learning_rate": 0.00044101998897489553, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 9608810.0, + "repeat_count": 1.0, + "routers_loss": 0.0015063622267916799, + "skip_count": 0.0, + "step": 5958, + "text_loss": 0.5602094531059265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 27.981508658643968, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.02880859375, + "learning_rate": 0.00044071264471514683, + "loss": 0.0051, + "macro_f1": 0.5934640765190125, + "num_tokens": 9611995.0, + "repeat_count": 0.0, + "routers_loss": 0.011538165621459484, + "skip_count": 3.0, + "step": 5960, + "text_loss": 0.14332173764705658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00044040532317442455, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 9615434.0, + "repeat_count": 0.0, + "routers_loss": 0.004693889059126377, + "skip_count": 0.0, + "step": 5962, + "text_loss": 0.334369033575058 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00044009802447049474, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9618056.0, + "repeat_count": 1.0, + "routers_loss": 0.0045085870660841465, + "skip_count": 1.0, + "step": 5964, + "text_loss": 0.8163170218467712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00043979074872111507, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9621428.0, + "repeat_count": 0.0, + "routers_loss": 0.0018220023484900594, + "skip_count": 0.0, + "step": 5966, + "text_loss": 0.2513850927352905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0004394834960440341, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 9625433.0, + "repeat_count": 4.0, + "routers_loss": 0.007051277905702591, + "skip_count": 5.0, + "step": 5968, + "text_loss": 0.6263421177864075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.00043917626655699154, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 9629508.0, + "repeat_count": 0.0, + "routers_loss": 0.0006454752874560654, + "skip_count": 0.0, + "step": 5970, + "text_loss": 0.645618736743927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0004388690603777184, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9632504.0, + "repeat_count": 0.0, + "routers_loss": 0.004847112577408552, + "skip_count": 1.0, + "step": 5972, + "text_loss": 0.47306978702545166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00043856187762393665, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 9636685.0, + "repeat_count": 0.0, + "routers_loss": 0.0006580828921869397, + "skip_count": 0.0, + "step": 5974, + "text_loss": 0.42226532101631165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0004382547184133593, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 9639958.0, + "repeat_count": 0.0, + "routers_loss": 0.002188180573284626, + "skip_count": 0.0, + "step": 5976, + "text_loss": 0.4456600248813629 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004379475828636901, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 9643228.0, + "repeat_count": 1.0, + "routers_loss": 0.0017135308589786291, + "skip_count": 2.0, + "step": 5978, + "text_loss": 0.6295822262763977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004376404710926244, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 9646746.0, + "repeat_count": 0.0, + "routers_loss": 0.0008841048111207783, + "skip_count": 0.0, + "step": 5980, + "text_loss": 0.5102712512016296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00043733338321784784, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9649452.0, + "repeat_count": 0.0, + "routers_loss": 0.0006229099817574024, + "skip_count": 0.0, + "step": 5982, + "text_loss": 0.6944046020507812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000437026319357037, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9652700.0, + "repeat_count": 0.0, + "routers_loss": 0.005293759983032942, + "skip_count": 2.0, + "step": 5984, + "text_loss": 0.6748214960098267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00043671927962785946, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 9655825.0, + "repeat_count": 0.0, + "routers_loss": 0.0013537590857595205, + "skip_count": 0.0, + "step": 5986, + "text_loss": 1.000306248664856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004364122641479733, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9658713.0, + "repeat_count": 0.0, + "routers_loss": 0.004548195283859968, + "skip_count": 0.0, + "step": 5988, + "text_loss": 0.24580086767673492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 28.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004361052730350275, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9661535.0, + "repeat_count": 0.0, + "routers_loss": 0.011149964295327663, + "skip_count": 4.0, + "step": 5990, + "text_loss": 0.5737863779067993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00043579830640666154, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 9664406.0, + "repeat_count": 1.0, + "routers_loss": 0.003783488878980279, + "skip_count": 1.0, + "step": 5992, + "text_loss": 0.7836558222770691 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.00043549136438050573, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 9669050.0, + "repeat_count": 0.0, + "routers_loss": 0.0050374288111925125, + "skip_count": 1.0, + "step": 5994, + "text_loss": 0.13072487711906433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.00043518444707418076, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9672698.0, + "repeat_count": 0.0, + "routers_loss": 0.004047670867294073, + "skip_count": 2.0, + "step": 5996, + "text_loss": 0.4748993217945099 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00043487755460529796, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9676159.0, + "repeat_count": 0.0, + "routers_loss": 0.008628991432487965, + "skip_count": 2.0, + "step": 5998, + "text_loss": 0.1921990066766739 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00043457068709145904, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 9679528.0, + "repeat_count": 3.0, + "routers_loss": 0.01094671618193388, + "skip_count": 3.0, + "step": 6000, + "text_loss": 0.3651769459247589 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 28.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00043426384465025604, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 9682677.0, + "repeat_count": 2.0, + "routers_loss": 0.0011284075444564223, + "skip_count": 0.0, + "step": 6002, + "text_loss": 0.28305181860923767 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.000433957027399272, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9685310.0, + "repeat_count": 0.0, + "routers_loss": 0.0030473743099719286, + "skip_count": 1.0, + "step": 6004, + "text_loss": 0.3650054931640625 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00043365023545607965, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9687944.0, + "repeat_count": 1.0, + "routers_loss": 0.011621905490756035, + "skip_count": 2.0, + "step": 6006, + "text_loss": 0.5409000515937805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004333434689382423, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 9690932.0, + "repeat_count": 0.0, + "routers_loss": 0.0005297541501931846, + "skip_count": 0.0, + "step": 6008, + "text_loss": 0.4311029314994812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.216025829175226, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.00043303672796331336, + "loss": 0.0058, + "macro_f1": 0.3272727429866791, + "num_tokens": 9693972.0, + "repeat_count": 1.0, + "routers_loss": 0.06166421249508858, + "skip_count": 0.0, + "step": 6010, + "text_loss": 0.2658997178077698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00043273001264883655, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 9697712.0, + "repeat_count": 0.0, + "routers_loss": 0.0018419031985104084, + "skip_count": 0.0, + "step": 6012, + "text_loss": 0.5813497304916382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004324233231123458, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 9700746.0, + "repeat_count": 0.0, + "routers_loss": 0.003635555040091276, + "skip_count": 0.0, + "step": 6014, + "text_loss": 0.24211904406547546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 28.24420311124156, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.038330078125, + "learning_rate": 0.0004321166594713651, + "loss": 0.0048, + "macro_f1": 0.5492662787437439, + "num_tokens": 9704087.0, + "repeat_count": 0.0, + "routers_loss": 0.021067705005407333, + "skip_count": 2.0, + "step": 6016, + "text_loss": 0.5908042788505554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00043181002184340857, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 9708695.0, + "repeat_count": 0.0, + "routers_loss": 0.0008712753187865019, + "skip_count": 0.0, + "step": 6018, + "text_loss": 0.7788549661636353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.26298796595245, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0004315034103459803, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 9711631.0, + "repeat_count": 1.0, + "routers_loss": 0.03231092542409897, + "skip_count": 0.0, + "step": 6020, + "text_loss": 0.6127741932868958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0004311968250965743, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9715526.0, + "repeat_count": 0.0, + "routers_loss": 0.0020149527117609978, + "skip_count": 2.0, + "step": 6022, + "text_loss": 0.49970078468322754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0004308902662126748, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9718475.0, + "repeat_count": 0.0, + "routers_loss": 0.0031795913819223642, + "skip_count": 0.0, + "step": 6024, + "text_loss": 0.3254713714122772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.291165248018785, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00043058373381175567, + "loss": 0.004, + "macro_f1": 0.3272727429866791, + "num_tokens": 9722194.0, + "repeat_count": 0.0, + "routers_loss": 0.0148378387093544, + "skip_count": 1.0, + "step": 6026, + "text_loss": 0.17670343816280365 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0004302772280112806, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 9725489.0, + "repeat_count": 1.0, + "routers_loss": 0.005742347799241543, + "skip_count": 2.0, + "step": 6028, + "text_loss": 0.26184776425361633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00042997074892870335, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9729416.0, + "repeat_count": 0.0, + "routers_loss": 0.0023561837151646614, + "skip_count": 0.0, + "step": 6030, + "text_loss": 0.3026008605957031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0004296642966814673, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9732559.0, + "repeat_count": 0.0, + "routers_loss": 0.0010108393616974354, + "skip_count": 1.0, + "step": 6032, + "text_loss": 0.43198078870773315 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00042935787138700525, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 9736324.0, + "repeat_count": 2.0, + "routers_loss": 0.005443581845611334, + "skip_count": 2.0, + "step": 6034, + "text_loss": 0.24883155524730682 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0004290514731627403, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 9739630.0, + "repeat_count": 1.0, + "routers_loss": 0.010645060800015926, + "skip_count": 2.0, + "step": 6036, + "text_loss": 0.24207182228565216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018798828125, + "learning_rate": 0.0004287451021260846, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9742221.0, + "repeat_count": 0.0, + "routers_loss": 0.0008162845042534173, + "skip_count": 0.0, + "step": 6038, + "text_loss": 0.33018553256988525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0004284387583944403, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9744925.0, + "repeat_count": 0.0, + "routers_loss": 0.003782407147809863, + "skip_count": 1.0, + "step": 6040, + "text_loss": 0.6600399613380432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0004281324420851987, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9748103.0, + "repeat_count": 0.0, + "routers_loss": 0.0009834285592660308, + "skip_count": 0.0, + "step": 6042, + "text_loss": 0.6402350664138794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0004278261533157409, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 9751128.0, + "repeat_count": 0.0, + "routers_loss": 0.004100334830582142, + "skip_count": 2.0, + "step": 6044, + "text_loss": 0.1545136719942093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0004275198922034372, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 9754140.0, + "repeat_count": 0.0, + "routers_loss": 0.0017166603356599808, + "skip_count": 1.0, + "step": 6046, + "text_loss": 0.5875935554504395 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00042721365886564766, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 9756945.0, + "repeat_count": 1.0, + "routers_loss": 0.00915827602148056, + "skip_count": 2.0, + "step": 6048, + "text_loss": 0.3885214328765869 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00042690745341972134, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9759738.0, + "repeat_count": 0.0, + "routers_loss": 0.0057020667009055614, + "skip_count": 2.0, + "step": 6050, + "text_loss": 0.3107164204120636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.00042660127598299647, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9762987.0, + "repeat_count": 0.0, + "routers_loss": 0.004196313209831715, + "skip_count": 2.0, + "step": 6052, + "text_loss": 0.3073577582836151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00042629512667280135, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 9765828.0, + "repeat_count": 0.0, + "routers_loss": 0.0023119752295315266, + "skip_count": 1.0, + "step": 6054, + "text_loss": 0.8228643536567688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004259890056064527, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 9769129.0, + "repeat_count": 0.0, + "routers_loss": 0.0021007524337619543, + "skip_count": 1.0, + "step": 6056, + "text_loss": 0.8334706425666809 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0004256829129012568, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 9771821.0, + "repeat_count": 1.0, + "routers_loss": 0.00671970471739769, + "skip_count": 2.0, + "step": 6058, + "text_loss": 0.17845536768436432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00042537684867450875, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 9774566.0, + "repeat_count": 0.0, + "routers_loss": 0.0014770646812394261, + "skip_count": 0.0, + "step": 6060, + "text_loss": 0.4445459246635437 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.46022894041679, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00042507081304349315, + "loss": 0.0067, + "macro_f1": 0.5492662787437439, + "num_tokens": 9777909.0, + "repeat_count": 2.0, + "routers_loss": 0.014822427183389664, + "skip_count": 0.0, + "step": 6062, + "text_loss": 0.45526158809661865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004247648061254833, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9781159.0, + "repeat_count": 0.0, + "routers_loss": 0.00568385748192668, + "skip_count": 1.0, + "step": 6064, + "text_loss": 0.18535588681697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.479013795127678, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00042445882803774173, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 9784960.0, + "repeat_count": 1.0, + "routers_loss": 0.0179694052785635, + "skip_count": 0.0, + "step": 6066, + "text_loss": 0.23591181635856628 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00042415287889751966, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9787941.0, + "repeat_count": 0.0, + "routers_loss": 0.0019039154285565019, + "skip_count": 0.0, + "step": 6068, + "text_loss": 0.9447930455207825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0004238469588220575, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9791096.0, + "repeat_count": 0.0, + "routers_loss": 0.004039563238620758, + "skip_count": 0.0, + "step": 6070, + "text_loss": 0.3134256601333618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00042354106792858446, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 9794082.0, + "repeat_count": 0.0, + "routers_loss": 0.0018352365586906672, + "skip_count": 0.0, + "step": 6072, + "text_loss": 0.5681536197662354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.00042323520633431833, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 9797303.0, + "repeat_count": 0.0, + "routers_loss": 0.0019325513858348131, + "skip_count": 0.0, + "step": 6074, + "text_loss": 0.2835809290409088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00042292937415646574, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 9800435.0, + "repeat_count": 0.0, + "routers_loss": 0.002513401210308075, + "skip_count": 0.0, + "step": 6076, + "text_loss": 0.1931663602590561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00042262357151222265, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9803873.0, + "repeat_count": 0.0, + "routers_loss": 0.004864581860601902, + "skip_count": 0.0, + "step": 6078, + "text_loss": 0.25809767842292786 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004223177985187728, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9806438.0, + "repeat_count": 1.0, + "routers_loss": 0.004932792857289314, + "skip_count": 0.0, + "step": 6080, + "text_loss": 0.6409249305725098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00042201205529328925, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9809400.0, + "repeat_count": 0.0, + "routers_loss": 0.00590938376262784, + "skip_count": 1.0, + "step": 6082, + "text_loss": 0.31158050894737244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00042170634195293314, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9813246.0, + "repeat_count": 0.0, + "routers_loss": 0.006805860437452793, + "skip_count": 0.0, + "step": 6084, + "text_loss": 0.32945963740348816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004214006586148545, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 9816513.0, + "repeat_count": 0.0, + "routers_loss": 0.0010186503641307354, + "skip_count": 0.0, + "step": 6086, + "text_loss": 0.48659923672676086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0004210950053961917, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9819908.0, + "repeat_count": 0.0, + "routers_loss": 0.00402973173186183, + "skip_count": 1.0, + "step": 6088, + "text_loss": 0.6249601244926453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00042078938241407174, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9822950.0, + "repeat_count": 0.0, + "routers_loss": 0.00236532068811357, + "skip_count": 1.0, + "step": 6090, + "text_loss": 0.26589256525039673 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0004204837897856098, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 9826493.0, + "repeat_count": 1.0, + "routers_loss": 0.003072192659601569, + "skip_count": 2.0, + "step": 6092, + "text_loss": 0.5216912627220154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004201782276279096, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9829698.0, + "repeat_count": 0.0, + "routers_loss": 0.0027553171385079622, + "skip_count": 1.0, + "step": 6094, + "text_loss": 0.40127676725387573 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.61990020545935, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00041987269605806325, + "loss": 0.0045, + "macro_f1": 0.9442509412765503, + "num_tokens": 9833719.0, + "repeat_count": 4.0, + "routers_loss": 0.013845407404005527, + "skip_count": 4.0, + "step": 6096, + "text_loss": 0.23114071786403656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0004195671951931509, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 9838235.0, + "repeat_count": 0.0, + "routers_loss": 0.0019887303933501244, + "skip_count": 2.0, + "step": 6098, + "text_loss": 0.7467341423034668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004192617251502409, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 9840867.0, + "repeat_count": 0.0, + "routers_loss": 0.0007213905337266624, + "skip_count": 0.0, + "step": 6100, + "text_loss": 0.6283472180366516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00041895628604639036, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 9843827.0, + "repeat_count": 0.0, + "routers_loss": 0.003863139310851693, + "skip_count": 1.0, + "step": 6102, + "text_loss": 0.3602744936943054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00041865087799864374, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 9846939.0, + "repeat_count": 0.0, + "routers_loss": 0.0013336286647245288, + "skip_count": 0.0, + "step": 6104, + "text_loss": 0.4182434678077698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0004183455011240341, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 9849827.0, + "repeat_count": 0.0, + "routers_loss": 0.00038455065805464983, + "skip_count": 0.0, + "step": 6106, + "text_loss": 0.7122722864151001 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 28.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004180401555395826, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 9853487.0, + "repeat_count": 3.0, + "routers_loss": 0.0038226440083235502, + "skip_count": 1.0, + "step": 6108, + "text_loss": 0.2521185576915741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004177348413622981, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 9856321.0, + "repeat_count": 0.0, + "routers_loss": 0.0015809801407158375, + "skip_count": 0.0, + "step": 6110, + "text_loss": 0.423979252576828 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004174295587091776, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 9859238.0, + "repeat_count": 0.0, + "routers_loss": 0.0007586454739794135, + "skip_count": 0.0, + "step": 6112, + "text_loss": 0.4720100462436676 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00041712430769720593, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 9862282.0, + "repeat_count": 1.0, + "routers_loss": 0.0045816488564014435, + "skip_count": 1.0, + "step": 6114, + "text_loss": 0.279577374458313 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0004168190884433559, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 9865394.0, + "repeat_count": 1.0, + "routers_loss": 0.004728195257484913, + "skip_count": 1.0, + "step": 6116, + "text_loss": 0.3826395571231842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 28.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.0004165139010645881, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9869165.0, + "repeat_count": 0.0, + "routers_loss": 0.006160226184874773, + "skip_count": 3.0, + "step": 6118, + "text_loss": 0.4668935537338257 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 28.732609333724685, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.04736328125, + "learning_rate": 0.0004162087456778509, + "loss": 0.0074, + "macro_f1": 0.9619450569152832, + "num_tokens": 9872381.0, + "repeat_count": 1.0, + "routers_loss": 0.027831824496388435, + "skip_count": 6.0, + "step": 6120, + "text_loss": 0.28708913922309875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004159036224000804, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9875668.0, + "repeat_count": 0.0, + "routers_loss": 0.0030764432158321142, + "skip_count": 1.0, + "step": 6122, + "text_loss": 0.37078607082366943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004155985313482002, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9878533.0, + "repeat_count": 0.0, + "routers_loss": 0.00043521137558855116, + "skip_count": 0.0, + "step": 6124, + "text_loss": 0.34975379705429077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00041529347263912224, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 9881478.0, + "repeat_count": 0.0, + "routers_loss": 0.0016251741908490658, + "skip_count": 0.0, + "step": 6126, + "text_loss": 0.39166271686553955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00041498844638974535, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 9884252.0, + "repeat_count": 1.0, + "routers_loss": 0.019553523510694504, + "skip_count": 0.0, + "step": 6128, + "text_loss": 0.2309480905532837 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0004146834527169562, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 9887485.0, + "repeat_count": 1.0, + "routers_loss": 0.0036251386627554893, + "skip_count": 0.0, + "step": 6130, + "text_loss": 0.4464457631111145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00041437849173762894, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9890711.0, + "repeat_count": 0.0, + "routers_loss": 0.0008515548543073237, + "skip_count": 0.0, + "step": 6132, + "text_loss": 0.5012133717536926 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0004140735635686251, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9894458.0, + "repeat_count": 1.0, + "routers_loss": 0.001084602321498096, + "skip_count": 0.0, + "step": 6134, + "text_loss": 0.32015663385391235 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004137686683267938, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 9897634.0, + "repeat_count": 0.0, + "routers_loss": 0.0025203595869243145, + "skip_count": 0.0, + "step": 6136, + "text_loss": 0.15804508328437805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0004134638061289715, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9901157.0, + "repeat_count": 0.0, + "routers_loss": 0.0029381231870502234, + "skip_count": 0.0, + "step": 6138, + "text_loss": 0.14375236630439758 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0004131589770919819, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 9903958.0, + "repeat_count": 0.0, + "routers_loss": 0.002789110178127885, + "skip_count": 0.0, + "step": 6140, + "text_loss": 0.2474033683538437 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004128541813326361, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 9906799.0, + "repeat_count": 2.0, + "routers_loss": 0.010770512744784355, + "skip_count": 3.0, + "step": 6142, + "text_loss": 0.2304249256849289 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0004125494189677325, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 9909286.0, + "repeat_count": 1.0, + "routers_loss": 0.003122122259810567, + "skip_count": 0.0, + "step": 6144, + "text_loss": 0.3781827688217163 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.00041224469011405643, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 9912416.0, + "repeat_count": 1.0, + "routers_loss": 0.008443298749625683, + "skip_count": 1.0, + "step": 6146, + "text_loss": 0.3004767596721649 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0004119399948883806, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9915290.0, + "repeat_count": 0.0, + "routers_loss": 0.0033219947945326567, + "skip_count": 1.0, + "step": 6148, + "text_loss": 0.748744547367096 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0004116353334074647, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 9918493.0, + "repeat_count": 1.0, + "routers_loss": 0.005501769948750734, + "skip_count": 0.0, + "step": 6150, + "text_loss": 0.330759733915329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.000411330705788056, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 9921027.0, + "repeat_count": 0.0, + "routers_loss": 0.0013694261433556676, + "skip_count": 0.0, + "step": 6152, + "text_loss": 0.43070924282073975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0164794921875, + "learning_rate": 0.000411026112146888, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9924303.0, + "repeat_count": 0.0, + "routers_loss": 0.00046192589798010886, + "skip_count": 0.0, + "step": 6154, + "text_loss": 0.5674887895584106 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004107215526006817, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9927065.0, + "repeat_count": 1.0, + "routers_loss": 0.004311304073780775, + "skip_count": 0.0, + "step": 6156, + "text_loss": 0.16138267517089844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004104170272661449, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9930713.0, + "repeat_count": 0.0, + "routers_loss": 0.0035845425445586443, + "skip_count": 0.0, + "step": 6158, + "text_loss": 0.18728356063365936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00041011253625997227, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 9934393.0, + "repeat_count": 0.0, + "routers_loss": 0.00247366214171052, + "skip_count": 0.0, + "step": 6160, + "text_loss": 0.3624019920825958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0004098080796988452, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 9937457.0, + "repeat_count": 0.0, + "routers_loss": 0.003240241203457117, + "skip_count": 0.0, + "step": 6162, + "text_loss": 0.12348521500825882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.0004095036576994321, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 9940523.0, + "repeat_count": 0.0, + "routers_loss": 0.001985874492675066, + "skip_count": 1.0, + "step": 6164, + "text_loss": 0.2688066363334656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 28.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.00040919927037838815, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9943802.0, + "repeat_count": 0.0, + "routers_loss": 0.004264154937118292, + "skip_count": 3.0, + "step": 6166, + "text_loss": 0.49316367506980896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.00040889491785235513, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 9946649.0, + "repeat_count": 0.0, + "routers_loss": 0.002545441733673215, + "skip_count": 0.0, + "step": 6168, + "text_loss": 0.4079313576221466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004085906002379614, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9949800.0, + "repeat_count": 0.0, + "routers_loss": 0.0009590961271896958, + "skip_count": 0.0, + "step": 6170, + "text_loss": 0.6166561245918274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004082863176518221, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9954008.0, + "repeat_count": 0.0, + "routers_loss": 0.003795337164774537, + "skip_count": 2.0, + "step": 6172, + "text_loss": 0.4791361689567566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0004079820702105388, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 9957153.0, + "repeat_count": 0.0, + "routers_loss": 0.0015634822193533182, + "skip_count": 0.0, + "step": 6174, + "text_loss": 0.7208777666091919 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.995597299677137, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004076778580306999, + "loss": 0.0056, + "macro_f1": 0.8820862174034119, + "num_tokens": 9960060.0, + "repeat_count": 2.0, + "routers_loss": 0.03223998099565506, + "skip_count": 2.0, + "step": 6176, + "text_loss": 0.6617992520332336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00040737368122887983, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9963396.0, + "repeat_count": 0.0, + "routers_loss": 0.0033978577703237534, + "skip_count": 0.0, + "step": 6178, + "text_loss": 0.7339215278625488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00040706953992164, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9966364.0, + "repeat_count": 0.0, + "routers_loss": 0.0005358994239941239, + "skip_count": 0.0, + "step": 6180, + "text_loss": 0.44187214970588684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00040676543422552767, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9969813.0, + "repeat_count": 0.0, + "routers_loss": 0.0018544091144576669, + "skip_count": 1.0, + "step": 6182, + "text_loss": 0.6244927048683167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004064613642570769, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9973015.0, + "repeat_count": 0.0, + "routers_loss": 0.005692692007869482, + "skip_count": 0.0, + "step": 6184, + "text_loss": 0.18860043585300446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00040615733013280784, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 9976201.0, + "repeat_count": 0.0, + "routers_loss": 0.0018737476784735918, + "skip_count": 0.0, + "step": 6186, + "text_loss": 0.21189232170581818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00040585333196922687, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9979711.0, + "repeat_count": 0.0, + "routers_loss": 0.011945146135985851, + "skip_count": 2.0, + "step": 6188, + "text_loss": 0.2628154456615448 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00040554936988282663, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9983003.0, + "repeat_count": 0.0, + "routers_loss": 0.0036045778542757034, + "skip_count": 1.0, + "step": 6190, + "text_loss": 0.5926038026809692 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0004052454439900861, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9986841.0, + "repeat_count": 0.0, + "routers_loss": 0.004170368425548077, + "skip_count": 0.0, + "step": 6192, + "text_loss": 0.3088737726211548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00040494155440747015, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9989596.0, + "repeat_count": 0.0, + "routers_loss": 0.002254750579595566, + "skip_count": 2.0, + "step": 6194, + "text_loss": 0.6309700012207031 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.089228059876724, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00040463770125142987, + "loss": 0.0087, + "macro_f1": 0.8814815282821655, + "num_tokens": 9992789.0, + "repeat_count": 2.0, + "routers_loss": 0.04092822223901749, + "skip_count": 4.0, + "step": 6196, + "text_loss": 0.09625697880983353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00040433388463840213, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 9995782.0, + "repeat_count": 0.0, + "routers_loss": 0.00029065192211419344, + "skip_count": 0.0, + "step": 6198, + "text_loss": 0.5600258111953735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0004040301046848105, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 9998712.0, + "repeat_count": 0.0, + "routers_loss": 0.0005865268758498132, + "skip_count": 0.0, + "step": 6200, + "text_loss": 0.6426429748535156 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 29.11740534194306, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0283203125, + "learning_rate": 0.0004037263615070638, + "loss": 0.0078, + "macro_f1": 0.9265305995941162, + "num_tokens": 10002020.0, + "repeat_count": 1.0, + "routers_loss": 0.025357060134410858, + "skip_count": 3.0, + "step": 6202, + "text_loss": 0.25125735998153687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000403422655221557, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10005381.0, + "repeat_count": 0.0, + "routers_loss": 0.003139561740681529, + "skip_count": 1.0, + "step": 6204, + "text_loss": 0.3639419376850128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00040311898594467085, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 10008348.0, + "repeat_count": 0.0, + "routers_loss": 0.004091196693480015, + "skip_count": 2.0, + "step": 6206, + "text_loss": 0.1602363884449005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00040281535379277204, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10011171.0, + "repeat_count": 0.0, + "routers_loss": 0.005771483760327101, + "skip_count": 0.0, + "step": 6208, + "text_loss": 0.5593504905700684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.000402511758882213, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10014374.0, + "repeat_count": 0.0, + "routers_loss": 0.005212264601141214, + "skip_count": 1.0, + "step": 6210, + "text_loss": 0.15668229758739471 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0004022082013293319, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 10017327.0, + "repeat_count": 0.0, + "routers_loss": 0.0027585842180997133, + "skip_count": 1.0, + "step": 6212, + "text_loss": 0.21188466250896454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.173759906075727, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00040190468125045255, + "loss": 0.0061, + "macro_f1": 0.3272727429866791, + "num_tokens": 10020518.0, + "repeat_count": 0.0, + "routers_loss": 0.013210589066147804, + "skip_count": 1.0, + "step": 6214, + "text_loss": 0.2551073729991913 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 29.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.00040160119876188436, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10023799.0, + "repeat_count": 1.0, + "routers_loss": 0.001590219559147954, + "skip_count": 0.0, + "step": 6216, + "text_loss": 0.5634782314300537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0004012977539799224, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 10027107.0, + "repeat_count": 0.0, + "routers_loss": 0.003917343448847532, + "skip_count": 0.0, + "step": 6218, + "text_loss": 0.6412819027900696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0004009943470208473, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 10030460.0, + "repeat_count": 0.0, + "routers_loss": 0.00874288845807314, + "skip_count": 2.0, + "step": 6220, + "text_loss": 0.13269923627376556 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.211329615497505, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.000400690978000925, + "loss": 0.0075, + "macro_f1": 0.8817967176437378, + "num_tokens": 10034086.0, + "repeat_count": 2.0, + "routers_loss": 0.03736349940299988, + "skip_count": 3.0, + "step": 6222, + "text_loss": 0.4956454336643219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004003876470364075, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10037312.0, + "repeat_count": 0.0, + "routers_loss": 0.008481289260089397, + "skip_count": 2.0, + "step": 6224, + "text_loss": 0.2148810178041458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0152587890625, + "learning_rate": 0.0004000843542435315, + "loss": 0.0028, + "macro_f1": 0.3333333432674408, + "num_tokens": 10040393.0, + "repeat_count": 0.0, + "routers_loss": 0.002235144842416048, + "skip_count": 0.0, + "step": 6226, + "text_loss": 0.17645306885242462 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 29.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0003997810997385195, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 10044386.0, + "repeat_count": 1.0, + "routers_loss": 0.004541373811662197, + "skip_count": 0.0, + "step": 6228, + "text_loss": 0.5098661184310913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00039947788363757915, + "loss": 0.0088, + "macro_f1": 0.6666666865348816, + "num_tokens": 10049046.0, + "repeat_count": 0.0, + "routers_loss": 0.0019183673430234194, + "skip_count": 1.0, + "step": 6230, + "text_loss": 0.6953724026679993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00039917470605690334, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 10051787.0, + "repeat_count": 2.0, + "routers_loss": 0.0032311067916452885, + "skip_count": 4.0, + "step": 6232, + "text_loss": 0.475127637386322 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 29.267684179630173, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00039887156711267043, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 10055396.0, + "repeat_count": 2.0, + "routers_loss": 0.03247373178601265, + "skip_count": 0.0, + "step": 6234, + "text_loss": 0.4239100515842438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.00039856846692104363, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10058395.0, + "repeat_count": 0.0, + "routers_loss": 0.006287421099841595, + "skip_count": 3.0, + "step": 6236, + "text_loss": 0.24084535241127014 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.016357421875, + "learning_rate": 0.0003982654055981718, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10061302.0, + "repeat_count": 1.0, + "routers_loss": 0.0008686117362231016, + "skip_count": 1.0, + "step": 6238, + "text_loss": 0.4740419089794159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0003979623832601884, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10065318.0, + "repeat_count": 0.0, + "routers_loss": 0.0037686119321733713, + "skip_count": 2.0, + "step": 6240, + "text_loss": 0.43965795636177063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0003976594000232123, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10068291.0, + "repeat_count": 0.0, + "routers_loss": 0.005804901942610741, + "skip_count": 0.0, + "step": 6242, + "text_loss": 0.24424348771572113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.00039735645600334714, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 10071645.0, + "repeat_count": 0.0, + "routers_loss": 0.002001055981963873, + "skip_count": 1.0, + "step": 6244, + "text_loss": 0.6524377465248108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0003970535513166815, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 10075136.0, + "repeat_count": 0.0, + "routers_loss": 0.001252001617103815, + "skip_count": 0.0, + "step": 6246, + "text_loss": 0.22803714871406555 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0003967506860792893, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 10078230.0, + "repeat_count": 0.0, + "routers_loss": 0.004913780372589827, + "skip_count": 1.0, + "step": 6248, + "text_loss": 0.9835516214370728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.000396447860407229, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 10080852.0, + "repeat_count": 0.0, + "routers_loss": 0.0037437966093420982, + "skip_count": 2.0, + "step": 6250, + "text_loss": 0.4021640121936798 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.00039614507441654393, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10084139.0, + "repeat_count": 0.0, + "routers_loss": 0.005433002021163702, + "skip_count": 2.0, + "step": 6252, + "text_loss": 0.23060470819473267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00039584232822326224, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10088501.0, + "repeat_count": 0.0, + "routers_loss": 0.0007705377647653222, + "skip_count": 0.0, + "step": 6254, + "text_loss": 0.5994830131530762 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0003955396219433969, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10091506.0, + "repeat_count": 0.0, + "routers_loss": 0.0012310115853324533, + "skip_count": 0.0, + "step": 6256, + "text_loss": 0.4639038145542145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0003952369556929455, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10096236.0, + "repeat_count": 0.0, + "routers_loss": 0.008964627049863338, + "skip_count": 2.0, + "step": 6258, + "text_loss": 0.24845287203788757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003949343295878903, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 10099213.0, + "repeat_count": 0.0, + "routers_loss": 0.0033088945783674717, + "skip_count": 0.0, + "step": 6260, + "text_loss": 0.6527073979377747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 29.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00039463174374419817, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10103160.0, + "repeat_count": 2.0, + "routers_loss": 0.003462672932073474, + "skip_count": 1.0, + "step": 6262, + "text_loss": 0.4209299683570862 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00039432919827782066, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 10105881.0, + "repeat_count": 2.0, + "routers_loss": 0.0027124532498419285, + "skip_count": 2.0, + "step": 6264, + "text_loss": 0.4442266821861267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00039402669330469367, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 10108596.0, + "repeat_count": 0.0, + "routers_loss": 0.005055282264947891, + "skip_count": 2.0, + "step": 6266, + "text_loss": 0.3331456780433655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00039372422894073765, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10111673.0, + "repeat_count": 0.0, + "routers_loss": 0.0009340311517007649, + "skip_count": 0.0, + "step": 6268, + "text_loss": 0.7664456367492676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.00039342180530185745, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10116141.0, + "repeat_count": 0.0, + "routers_loss": 0.00032052272581495345, + "skip_count": 0.0, + "step": 6270, + "text_loss": 0.47610244154930115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00039311942250394274, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 10119151.0, + "repeat_count": 0.0, + "routers_loss": 0.0015820999396964908, + "skip_count": 0.0, + "step": 6272, + "text_loss": 0.3815282881259918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0003928170806628669, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10122684.0, + "repeat_count": 0.0, + "routers_loss": 0.0007423736387863755, + "skip_count": 0.0, + "step": 6274, + "text_loss": 0.4630914628505707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00039251477989448797, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10126751.0, + "repeat_count": 0.0, + "routers_loss": 0.0006216703332029283, + "skip_count": 0.0, + "step": 6276, + "text_loss": 0.4342454671859741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.00039221252031464816, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10129784.0, + "repeat_count": 0.0, + "routers_loss": 0.004239698871970177, + "skip_count": 3.0, + "step": 6278, + "text_loss": 0.24661089479923248 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 29.4837100088054, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0003919103020391738, + "loss": 0.006, + "macro_f1": 0.8803418874740601, + "num_tokens": 10133066.0, + "repeat_count": 2.0, + "routers_loss": 0.027879100292921066, + "skip_count": 7.0, + "step": 6280, + "text_loss": 0.4705188274383545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00039160812518387574, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 10136860.0, + "repeat_count": 0.0, + "routers_loss": 0.002533538034185767, + "skip_count": 0.0, + "step": 6282, + "text_loss": 0.1953880786895752 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00039130598986454845, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 10140066.0, + "repeat_count": 1.0, + "routers_loss": 0.002462630858644843, + "skip_count": 2.0, + "step": 6284, + "text_loss": 0.378487765789032 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.000391003896196971, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 10143646.0, + "repeat_count": 1.0, + "routers_loss": 0.011922914534807205, + "skip_count": 1.0, + "step": 6286, + "text_loss": 0.2467316836118698 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00039070184429690607, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 10146507.0, + "repeat_count": 1.0, + "routers_loss": 0.0059767309576272964, + "skip_count": 1.0, + "step": 6288, + "text_loss": 0.9603674411773682 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0003903998342801006, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10149301.0, + "repeat_count": 1.0, + "routers_loss": 0.0030056277755647898, + "skip_count": 2.0, + "step": 6290, + "text_loss": 0.36631715297698975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00039009786626228543, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 10152158.0, + "repeat_count": 0.0, + "routers_loss": 0.005298118572682142, + "skip_count": 3.0, + "step": 6292, + "text_loss": 0.2876455783843994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003897959403591751, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 10155852.0, + "repeat_count": 0.0, + "routers_loss": 0.004937763791531324, + "skip_count": 2.0, + "step": 6294, + "text_loss": 0.14649681746959686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0003894940566864683, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 10159164.0, + "repeat_count": 0.0, + "routers_loss": 0.0021474575623869896, + "skip_count": 0.0, + "step": 6296, + "text_loss": 0.5694304704666138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 29.568241855004402, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08251953125, + "learning_rate": 0.00038919221535984753, + "loss": 0.0073, + "macro_f1": 0.875, + "num_tokens": 10161806.0, + "repeat_count": 1.0, + "routers_loss": 0.040340203791856766, + "skip_count": 3.0, + "step": 6298, + "text_loss": 0.1574537754058838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.00038889041649497894, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10165669.0, + "repeat_count": 0.0, + "routers_loss": 0.0028486931696534157, + "skip_count": 0.0, + "step": 6300, + "text_loss": 0.9158071279525757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0003885886602075123, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10168945.0, + "repeat_count": 0.0, + "routers_loss": 0.006565484683960676, + "skip_count": 2.0, + "step": 6302, + "text_loss": 0.3530846834182739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.00038828694661308116, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10171914.0, + "repeat_count": 0.0, + "routers_loss": 0.0009084723424166441, + "skip_count": 0.0, + "step": 6304, + "text_loss": 0.4603337347507477 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0003879852758273029, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 10175737.0, + "repeat_count": 1.0, + "routers_loss": 0.004121702630072832, + "skip_count": 2.0, + "step": 6306, + "text_loss": 0.5294032096862793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00038768364796577814, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10178543.0, + "repeat_count": 0.0, + "routers_loss": 0.0013208909658715129, + "skip_count": 0.0, + "step": 6308, + "text_loss": 0.41084006428718567 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 29.62459641913707, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00038738206314409144, + "loss": 0.0079, + "macro_f1": 0.9247862696647644, + "num_tokens": 10181880.0, + "repeat_count": 3.0, + "routers_loss": 0.03674180060625076, + "skip_count": 6.0, + "step": 6310, + "text_loss": 0.6920746564865112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0003870805214778106, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10185173.0, + "repeat_count": 0.0, + "routers_loss": 0.00221974472515285, + "skip_count": 2.0, + "step": 6312, + "text_loss": 0.1376657634973526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0003867790230824869, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10188642.0, + "repeat_count": 0.0, + "routers_loss": 0.001809283159673214, + "skip_count": 0.0, + "step": 6314, + "text_loss": 0.5220870971679688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0003864775680736552, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 10191750.0, + "repeat_count": 0.0, + "routers_loss": 0.0013956360053271055, + "skip_count": 0.0, + "step": 6316, + "text_loss": 0.4109838902950287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00038617615656683356, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 10194578.0, + "repeat_count": 0.0, + "routers_loss": 0.002947692759335041, + "skip_count": 2.0, + "step": 6318, + "text_loss": 0.4818590581417084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0003858747886775232, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10197131.0, + "repeat_count": 0.0, + "routers_loss": 0.0008140999125316739, + "skip_count": 2.0, + "step": 6320, + "text_loss": 0.4004709720611572 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.68095098326974, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0003855734645212093, + "loss": 0.0089, + "macro_f1": 0.8820862174034119, + "num_tokens": 10199965.0, + "repeat_count": 2.0, + "routers_loss": 0.013056626543402672, + "skip_count": 2.0, + "step": 6322, + "text_loss": 0.3367139995098114 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00038527218421335977, + "loss": 0.0087, + "macro_f1": 1.0, + "num_tokens": 10203184.0, + "repeat_count": 1.0, + "routers_loss": 0.0038112467154860497, + "skip_count": 2.0, + "step": 6324, + "text_loss": 0.5747989416122437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0003849709478694255, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 10206436.0, + "repeat_count": 0.0, + "routers_loss": 0.001232540002092719, + "skip_count": 0.0, + "step": 6326, + "text_loss": 0.4981732964515686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00038466975560484115, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 10209889.0, + "repeat_count": 0.0, + "routers_loss": 0.004343799781054258, + "skip_count": 0.0, + "step": 6328, + "text_loss": 0.2160186469554901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.000384368607535024, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10212520.0, + "repeat_count": 0.0, + "routers_loss": 0.0014161963481456041, + "skip_count": 1.0, + "step": 6330, + "text_loss": 0.3556232154369354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0003840675037753745, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10215456.0, + "repeat_count": 0.0, + "routers_loss": 0.0014989010524004698, + "skip_count": 0.0, + "step": 6332, + "text_loss": 0.8510926961898804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003837664444412762, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10218558.0, + "repeat_count": 0.0, + "routers_loss": 0.006702739745378494, + "skip_count": 0.0, + "step": 6334, + "text_loss": 0.3995226323604584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0003834654296480958, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10221862.0, + "repeat_count": 0.0, + "routers_loss": 0.00826781615614891, + "skip_count": 2.0, + "step": 6336, + "text_loss": 0.3534671664237976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0003831644595111825, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10224820.0, + "repeat_count": 0.0, + "routers_loss": 0.002143894787877798, + "skip_count": 0.0, + "step": 6338, + "text_loss": 0.20216144621372223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 29.76548282946874, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04736328125, + "learning_rate": 0.0003828635341458687, + "loss": 0.0064, + "macro_f1": 0.5492662787437439, + "num_tokens": 10227479.0, + "repeat_count": 0.0, + "routers_loss": 0.012319118715822697, + "skip_count": 2.0, + "step": 6340, + "text_loss": 0.26248639822006226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0003825626536674697, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10231347.0, + "repeat_count": 0.0, + "routers_loss": 0.00334449321962893, + "skip_count": 0.0, + "step": 6342, + "text_loss": 0.6357201337814331 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.000382261818191283, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10234347.0, + "repeat_count": 0.0, + "routers_loss": 0.0027788348961621523, + "skip_count": 0.0, + "step": 6344, + "text_loss": 0.2813846468925476 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00038196102783258996, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 10237105.0, + "repeat_count": 0.0, + "routers_loss": 0.001545077539049089, + "skip_count": 0.0, + "step": 6346, + "text_loss": 0.47612661123275757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0003816602827066537, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 10240249.0, + "repeat_count": 0.0, + "routers_loss": 0.005602670833468437, + "skip_count": 2.0, + "step": 6348, + "text_loss": 0.18197228014469147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0003813595829287204, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 10243417.0, + "repeat_count": 0.0, + "routers_loss": 0.0004317959537729621, + "skip_count": 0.0, + "step": 6350, + "text_loss": 0.3818575143814087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.0003810589286140186, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 10246824.0, + "repeat_count": 0.0, + "routers_loss": 0.002225276781246066, + "skip_count": 0.0, + "step": 6352, + "text_loss": 0.14129821956157684 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 29.831229820956853, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0003807583198777599, + "loss": 0.0062, + "macro_f1": 0.9265305995941162, + "num_tokens": 10249836.0, + "repeat_count": 3.0, + "routers_loss": 0.02445496805012226, + "skip_count": 1.0, + "step": 6354, + "text_loss": 0.3237064480781555 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00038045775683513786, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10252900.0, + "repeat_count": 0.0, + "routers_loss": 0.0009264222462661564, + "skip_count": 0.0, + "step": 6356, + "text_loss": 0.6777551174163818 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 29.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0003801572396013289, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 10255526.0, + "repeat_count": 1.0, + "routers_loss": 0.007189550437033176, + "skip_count": 5.0, + "step": 6358, + "text_loss": 0.25438982248306274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00037985676829149187, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10258865.0, + "repeat_count": 0.0, + "routers_loss": 0.0014201018493622541, + "skip_count": 0.0, + "step": 6360, + "text_loss": 0.5063154101371765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0003795563430207678, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 10261677.0, + "repeat_count": 0.0, + "routers_loss": 0.0035477925557643175, + "skip_count": 3.0, + "step": 6362, + "text_loss": 0.4815357029438019 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.878191957734078, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0003792559639042803, + "loss": 0.0049, + "macro_f1": 0.3272727429866791, + "num_tokens": 10264805.0, + "repeat_count": 0.0, + "routers_loss": 0.013723359443247318, + "skip_count": 1.0, + "step": 6364, + "text_loss": 0.5563676357269287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0003789556310571351, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 10267885.0, + "repeat_count": 0.0, + "routers_loss": 0.0028159532230347395, + "skip_count": 0.0, + "step": 6366, + "text_loss": 0.7284183502197266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0003786553445944204, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10270934.0, + "repeat_count": 0.0, + "routers_loss": 0.0005918835522606969, + "skip_count": 0.0, + "step": 6368, + "text_loss": 0.7387746572494507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0003783551046312067, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 10273818.0, + "repeat_count": 0.0, + "routers_loss": 0.0011416864581406116, + "skip_count": 0.0, + "step": 6370, + "text_loss": 0.5360285043716431 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 29.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.00037805491128254645, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 10276494.0, + "repeat_count": 2.0, + "routers_loss": 0.002382483799010515, + "skip_count": 1.0, + "step": 6372, + "text_loss": 0.7536854147911072 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.00037775476466347414, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 10279719.0, + "repeat_count": 0.0, + "routers_loss": 0.0021104486659169197, + "skip_count": 1.0, + "step": 6374, + "text_loss": 0.6807253956794739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0003774546648890066, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 10283000.0, + "repeat_count": 0.0, + "routers_loss": 0.003148776013404131, + "skip_count": 2.0, + "step": 6376, + "text_loss": 0.30774110555648804 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0003771546120741426, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 10285666.0, + "repeat_count": 1.0, + "routers_loss": 0.007700880523771048, + "skip_count": 1.0, + "step": 6378, + "text_loss": 0.4476076364517212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003768546063338631, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10289127.0, + "repeat_count": 0.0, + "routers_loss": 0.0023625255562365055, + "skip_count": 1.0, + "step": 6380, + "text_loss": 0.4350969195365906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.0003765546477831307, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10292485.0, + "repeat_count": 0.0, + "routers_loss": 0.001428726245649159, + "skip_count": 0.0, + "step": 6382, + "text_loss": 0.49078530073165894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0003762547365368902, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 10295361.0, + "repeat_count": 0.0, + "routers_loss": 0.0027160397730767727, + "skip_count": 2.0, + "step": 6384, + "text_loss": 0.3476370573043823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00037595487271006807, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10298717.0, + "repeat_count": 0.0, + "routers_loss": 0.002456068294122815, + "skip_count": 0.0, + "step": 6386, + "text_loss": 0.3634916841983795 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 29.99090108599941, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.021240234375, + "learning_rate": 0.0003756550564175727, + "loss": 0.0049, + "macro_f1": 0.9265305995941162, + "num_tokens": 10302102.0, + "repeat_count": 1.0, + "routers_loss": 0.02546076290309429, + "skip_count": 3.0, + "step": 6388, + "text_loss": 0.2422582060098648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00037535528777429426, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10305060.0, + "repeat_count": 0.0, + "routers_loss": 0.001045907847583294, + "skip_count": 0.0, + "step": 6390, + "text_loss": 0.5563194155693054 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0003750555668951045, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 10307903.0, + "repeat_count": 1.0, + "routers_loss": 0.007391332648694515, + "skip_count": 2.0, + "step": 6392, + "text_loss": 0.3423991799354553 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00037475589389485744, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 10311396.0, + "repeat_count": 1.0, + "routers_loss": 0.0029360291082412004, + "skip_count": 1.0, + "step": 6394, + "text_loss": 0.9877024292945862 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00037445626888838807, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10314250.0, + "repeat_count": 0.0, + "routers_loss": 0.0014932662015780807, + "skip_count": 0.0, + "step": 6396, + "text_loss": 0.3978523313999176 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 30.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003741566919905133, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 10316894.0, + "repeat_count": 1.0, + "routers_loss": 0.007003722712397575, + "skip_count": 5.0, + "step": 6398, + "text_loss": 0.2945566475391388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00037385716331603155, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 10319603.0, + "repeat_count": 1.0, + "routers_loss": 0.006710570305585861, + "skip_count": 1.0, + "step": 6400, + "text_loss": 0.2984389662742615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.00037355768297972275, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 10322670.0, + "repeat_count": 0.0, + "routers_loss": 0.00048738415353000164, + "skip_count": 0.0, + "step": 6402, + "text_loss": 0.483262300491333 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00037325825109634837, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 10326280.0, + "repeat_count": 1.0, + "routers_loss": 0.001625525183044374, + "skip_count": 1.0, + "step": 6404, + "text_loss": 0.42678722739219666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0003729588677806513, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10329008.0, + "repeat_count": 0.0, + "routers_loss": 0.004408636130392551, + "skip_count": 0.0, + "step": 6406, + "text_loss": 0.2264070063829422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0003726595331473557, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 10332533.0, + "repeat_count": 0.0, + "routers_loss": 0.0038099216762930155, + "skip_count": 2.0, + "step": 6408, + "text_loss": 0.6670092940330505 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0003723602473111672, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10335643.0, + "repeat_count": 1.0, + "routers_loss": 0.003097689710557461, + "skip_count": 0.0, + "step": 6410, + "text_loss": 0.45228812098503113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.00037206101038677274, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 10338522.0, + "repeat_count": 0.0, + "routers_loss": 0.005268602631986141, + "skip_count": 1.0, + "step": 6412, + "text_loss": 0.7288079857826233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0003717618224888405, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 10341516.0, + "repeat_count": 0.0, + "routers_loss": 0.004640138708055019, + "skip_count": 2.0, + "step": 6414, + "text_loss": 0.22850871086120605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00037146268373201954, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10344831.0, + "repeat_count": 0.0, + "routers_loss": 0.0006379318656399846, + "skip_count": 0.0, + "step": 6416, + "text_loss": 0.7864460945129395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0003711635942309408, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 10348499.0, + "repeat_count": 0.0, + "routers_loss": 0.0004005273221991956, + "skip_count": 0.0, + "step": 6418, + "text_loss": 0.605839192867279 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0157470703125, + "learning_rate": 0.0003708645541002159, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10351722.0, + "repeat_count": 0.0, + "routers_loss": 0.001061634044162929, + "skip_count": 0.0, + "step": 6420, + "text_loss": 0.8226510286331177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 30.150278837687114, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0003705655634544374, + "loss": 0.0052, + "macro_f1": 0.5492662787437439, + "num_tokens": 10355275.0, + "repeat_count": 0.0, + "routers_loss": 0.013980664312839508, + "skip_count": 2.0, + "step": 6422, + "text_loss": 0.2709597647190094 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0003702666224081792, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10359702.0, + "repeat_count": 1.0, + "routers_loss": 0.0013196271611377597, + "skip_count": 0.0, + "step": 6424, + "text_loss": 0.6451483368873596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00036996773107599604, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10363364.0, + "repeat_count": 0.0, + "routers_loss": 0.0028023163322359324, + "skip_count": 1.0, + "step": 6426, + "text_loss": 0.2770799398422241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01373291015625, + "learning_rate": 0.0003696688895724235, + "loss": 0.0029, + "macro_f1": 0.3333333432674408, + "num_tokens": 10366554.0, + "repeat_count": 0.0, + "routers_loss": 0.0011023655533790588, + "skip_count": 0.0, + "step": 6428, + "text_loss": 0.5466503500938416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0003693700980119784, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10369733.0, + "repeat_count": 0.0, + "routers_loss": 0.00230707717128098, + "skip_count": 0.0, + "step": 6430, + "text_loss": 0.45667049288749695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00036907135650915824, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 10373382.0, + "repeat_count": 0.0, + "routers_loss": 0.0036784098483622074, + "skip_count": 2.0, + "step": 6432, + "text_loss": 0.13856995105743408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00036877266517844115, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 10376202.0, + "repeat_count": 0.0, + "routers_loss": 0.0008461157558485866, + "skip_count": 0.0, + "step": 6434, + "text_loss": 0.27238601446151733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.0003684740241342863, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 10380748.0, + "repeat_count": 0.0, + "routers_loss": 0.0052765593864023685, + "skip_count": 0.0, + "step": 6436, + "text_loss": 0.6182295083999634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.00036817543349113355, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 10386148.0, + "repeat_count": 1.0, + "routers_loss": 0.005562922917306423, + "skip_count": 2.0, + "step": 6438, + "text_loss": 0.5591027140617371 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0003678768933634033, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10389385.0, + "repeat_count": 0.0, + "routers_loss": 0.0008686366491019726, + "skip_count": 0.0, + "step": 6440, + "text_loss": 0.5158660411834717 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0003675784038654968, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 10391893.0, + "repeat_count": 0.0, + "routers_loss": 0.0022222092375159264, + "skip_count": 1.0, + "step": 6442, + "text_loss": 0.2865697741508484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0003672799651117958, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 10395082.0, + "repeat_count": 0.0, + "routers_loss": 0.0030799773521721363, + "skip_count": 2.0, + "step": 6444, + "text_loss": 0.21298295259475708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003669815772166625, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10398015.0, + "repeat_count": 0.0, + "routers_loss": 0.0035721305757761, + "skip_count": 3.0, + "step": 6446, + "text_loss": 0.5286803841590881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 30.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.00036668324029443975, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 10400749.0, + "repeat_count": 0.0, + "routers_loss": 0.00741040613502264, + "skip_count": 4.0, + "step": 6448, + "text_loss": 0.3922366201877594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0003663849544594507, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 10404439.0, + "repeat_count": 0.0, + "routers_loss": 0.002974750241264701, + "skip_count": 2.0, + "step": 6450, + "text_loss": 0.21894219517707825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.00036608671982599927, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10408476.0, + "repeat_count": 0.0, + "routers_loss": 0.004810616374015808, + "skip_count": 0.0, + "step": 6452, + "text_loss": 0.3928622305393219 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0003657885365083694, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 10411533.0, + "repeat_count": 1.0, + "routers_loss": 0.005527745466679335, + "skip_count": 0.0, + "step": 6454, + "text_loss": 0.22816279530525208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.00036549040462082556, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 10414501.0, + "repeat_count": 0.0, + "routers_loss": 0.0021297158673405647, + "skip_count": 0.0, + "step": 6456, + "text_loss": 0.20487719774246216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 30.31934253008512, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003651923242776124, + "loss": 0.0082, + "macro_f1": 0.6592592597007751, + "num_tokens": 10418296.0, + "repeat_count": 1.0, + "routers_loss": 0.046412210911512375, + "skip_count": 5.0, + "step": 6458, + "text_loss": 0.2890419065952301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00036489429559295484, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10421211.0, + "repeat_count": 0.0, + "routers_loss": 0.004002603702247143, + "skip_count": 0.0, + "step": 6460, + "text_loss": 0.23165544867515564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003645963186810581, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 10424231.0, + "repeat_count": 0.0, + "routers_loss": 0.003480088198557496, + "skip_count": 1.0, + "step": 6462, + "text_loss": 0.6286683082580566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003642983936561075, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 10427387.0, + "repeat_count": 0.0, + "routers_loss": 0.009358933195471764, + "skip_count": 2.0, + "step": 6464, + "text_loss": 0.3258316218852997 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.356912239506897, + "f1_execute": 0.9729729890823364, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00036400052063226816, + "loss": 0.0048, + "macro_f1": 0.9539539813995361, + "num_tokens": 10430813.0, + "repeat_count": 5.0, + "routers_loss": 0.03567950055003166, + "skip_count": 5.0, + "step": 6466, + "text_loss": 0.7278715968132019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00036370269972368615, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 10434175.0, + "repeat_count": 1.0, + "routers_loss": 0.00226925453171134, + "skip_count": 2.0, + "step": 6468, + "text_loss": 0.5652450919151306 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0174560546875, + "learning_rate": 0.0003634049310444867, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10437393.0, + "repeat_count": 0.0, + "routers_loss": 0.0013644809368997812, + "skip_count": 0.0, + "step": 6470, + "text_loss": 0.5985191464424133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0003631072147087753, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 10440412.0, + "repeat_count": 0.0, + "routers_loss": 0.0003114990540780127, + "skip_count": 0.0, + "step": 6472, + "text_loss": 0.5588209629058838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00036280955083063747, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 10443471.0, + "repeat_count": 0.0, + "routers_loss": 0.0005486322334036231, + "skip_count": 0.0, + "step": 6474, + "text_loss": 0.6969016194343567 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00036251193952413865, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 10446548.0, + "repeat_count": 1.0, + "routers_loss": 0.008256378583610058, + "skip_count": 2.0, + "step": 6476, + "text_loss": 0.27083566784858704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0003622143809033239, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10449478.0, + "repeat_count": 0.0, + "routers_loss": 0.001008771825581789, + "skip_count": 0.0, + "step": 6478, + "text_loss": 0.1689433604478836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00036191687508221827, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 10453017.0, + "repeat_count": 1.0, + "routers_loss": 0.0014678959269076586, + "skip_count": 0.0, + "step": 6480, + "text_loss": 0.9571998715400696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0003616194221748267, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10456061.0, + "repeat_count": 0.0, + "routers_loss": 0.001516164978966117, + "skip_count": 0.0, + "step": 6482, + "text_loss": 0.5750429034233093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0003613220222951335, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10459130.0, + "repeat_count": 0.0, + "routers_loss": 0.0031315975356847048, + "skip_count": 0.0, + "step": 6484, + "text_loss": 0.47120073437690735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0003610246755571029, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10462190.0, + "repeat_count": 0.0, + "routers_loss": 0.0006079549202695489, + "skip_count": 0.0, + "step": 6486, + "text_loss": 0.8426173329353333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000360727382074679, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10465233.0, + "repeat_count": 0.0, + "routers_loss": 0.00596054969355464, + "skip_count": 0.0, + "step": 6488, + "text_loss": 0.18435880541801453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.469621367772234, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00036043014196178463, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 10468135.0, + "repeat_count": 0.0, + "routers_loss": 0.008584967814385891, + "skip_count": 1.0, + "step": 6490, + "text_loss": 0.3827758729457855 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.00036013295533232344, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10471032.0, + "repeat_count": 2.0, + "routers_loss": 0.005076571833342314, + "skip_count": 5.0, + "step": 6492, + "text_loss": 0.1215854063630104 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 30.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0003598358223001776, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 10474779.0, + "repeat_count": 3.0, + "routers_loss": 0.005972118582576513, + "skip_count": 0.0, + "step": 6494, + "text_loss": 0.22768665850162506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0003595387429792091, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 10478015.0, + "repeat_count": 0.0, + "routers_loss": 0.004733685404062271, + "skip_count": 1.0, + "step": 6496, + "text_loss": 0.5013535618782043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00035924171748325916, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10481113.0, + "repeat_count": 0.0, + "routers_loss": 0.01148980576545, + "skip_count": 2.0, + "step": 6498, + "text_loss": 0.3281762897968292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0003589447459261487, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10484049.0, + "repeat_count": 0.0, + "routers_loss": 0.007726775947958231, + "skip_count": 2.0, + "step": 6500, + "text_loss": 0.46294569969177246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00035864782842167763, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 10487443.0, + "repeat_count": 1.0, + "routers_loss": 0.0013331319205462933, + "skip_count": 0.0, + "step": 6502, + "text_loss": 0.5122153759002686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.00035835096508362544, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10490535.0, + "repeat_count": 0.0, + "routers_loss": 0.0011629529763013124, + "skip_count": 0.0, + "step": 6504, + "text_loss": 0.40683525800704956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00035805415602575054, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10493575.0, + "repeat_count": 0.0, + "routers_loss": 0.004780632443726063, + "skip_count": 0.0, + "step": 6506, + "text_loss": 0.37263134121894836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00035775740136179075, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10496193.0, + "repeat_count": 0.0, + "routers_loss": 0.0018355643842369318, + "skip_count": 0.0, + "step": 6508, + "text_loss": 0.2074306458234787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00035746070120546314, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10500135.0, + "repeat_count": 0.0, + "routers_loss": 0.004067617934197187, + "skip_count": 1.0, + "step": 6510, + "text_loss": 0.26313406229019165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00035716405567046383, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10503533.0, + "repeat_count": 0.0, + "routers_loss": 0.005438363179564476, + "skip_count": 0.0, + "step": 6512, + "text_loss": 0.3448122441768646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.00035686746487046767, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 10506207.0, + "repeat_count": 0.0, + "routers_loss": 0.0012895528925582767, + "skip_count": 0.0, + "step": 6514, + "text_loss": 0.43096476793289185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0003565709289191291, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10509257.0, + "repeat_count": 0.0, + "routers_loss": 0.003141741268336773, + "skip_count": 0.0, + "step": 6516, + "text_loss": 0.22349724173545837 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0003562744479300811, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10512554.0, + "repeat_count": 0.0, + "routers_loss": 0.0005669888923875988, + "skip_count": 0.0, + "step": 6518, + "text_loss": 0.5319190621376038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00035597802201693587, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10515720.0, + "repeat_count": 0.0, + "routers_loss": 0.0020814717281609774, + "skip_count": 0.0, + "step": 6520, + "text_loss": 0.20216144621372223 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0003556816512932841, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 10518517.0, + "repeat_count": 2.0, + "routers_loss": 0.010716461576521397, + "skip_count": 3.0, + "step": 6522, + "text_loss": 0.15843836963176727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.0003553853358726959, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 10521414.0, + "repeat_count": 0.0, + "routers_loss": 0.0014748790999874473, + "skip_count": 0.0, + "step": 6524, + "text_loss": 0.393892377614975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00035508907586871984, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10524210.0, + "repeat_count": 0.0, + "routers_loss": 0.0004757299611810595, + "skip_count": 0.0, + "step": 6526, + "text_loss": 0.2557907700538635 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00035479287139488327, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 10527327.0, + "repeat_count": 1.0, + "routers_loss": 0.002445317106321454, + "skip_count": 0.0, + "step": 6528, + "text_loss": 0.48338422179222107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0003544967225646922, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 10530363.0, + "repeat_count": 0.0, + "routers_loss": 0.0015845977468416095, + "skip_count": 0.0, + "step": 6530, + "text_loss": 0.6474354267120361 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.00035420062949163166, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 10533444.0, + "repeat_count": 0.0, + "routers_loss": 0.002190655330196023, + "skip_count": 0.0, + "step": 6532, + "text_loss": 0.3789777457714081 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0003539045922891649, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10536711.0, + "repeat_count": 0.0, + "routers_loss": 0.00317079434171319, + "skip_count": 0.0, + "step": 6534, + "text_loss": 0.25758084654808044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00035360861107073394, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 10539849.0, + "repeat_count": 0.0, + "routers_loss": 0.0010938458144664764, + "skip_count": 0.0, + "step": 6536, + "text_loss": 0.9821014404296875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0003533126859497592, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10543004.0, + "repeat_count": 0.0, + "routers_loss": 0.003071998478844762, + "skip_count": 2.0, + "step": 6538, + "text_loss": 0.6314182281494141 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0003530168170396401, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 10545965.0, + "repeat_count": 0.0, + "routers_loss": 0.006067665759474039, + "skip_count": 2.0, + "step": 6540, + "text_loss": 0.5021927356719971 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0167236328125, + "learning_rate": 0.000352721004453754, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 10549188.0, + "repeat_count": 0.0, + "routers_loss": 0.0019109295681118965, + "skip_count": 0.0, + "step": 6542, + "text_loss": 0.3008780777454376 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00035242524830545683, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10552298.0, + "repeat_count": 0.0, + "routers_loss": 0.007457790896296501, + "skip_count": 3.0, + "step": 6544, + "text_loss": 0.5675695538520813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0003521295487080829, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 10555123.0, + "repeat_count": 0.0, + "routers_loss": 0.007243642583489418, + "skip_count": 1.0, + "step": 6546, + "text_loss": 0.17955881357192993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00035183390577494476, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 10559653.0, + "repeat_count": 0.0, + "routers_loss": 0.004024330526590347, + "skip_count": 0.0, + "step": 6548, + "text_loss": 0.2634682357311249 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.017578125, + "learning_rate": 0.0003515383196193336, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10563770.0, + "repeat_count": 1.0, + "routers_loss": 0.010837121866643429, + "skip_count": 0.0, + "step": 6550, + "text_loss": 0.1608252227306366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0003512427903545183, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10567117.0, + "repeat_count": 0.0, + "routers_loss": 0.003473864868283272, + "skip_count": 0.0, + "step": 6552, + "text_loss": 0.231611430644989 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0003509473180937464, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10570622.0, + "repeat_count": 0.0, + "routers_loss": 0.004441239405423403, + "skip_count": 1.0, + "step": 6554, + "text_loss": 0.3193909227848053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0003506519029502433, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10573411.0, + "repeat_count": 0.0, + "routers_loss": 0.0008821079391054809, + "skip_count": 0.0, + "step": 6556, + "text_loss": 0.4478783905506134 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0003503565450372128, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 10576422.0, + "repeat_count": 1.0, + "routers_loss": 0.0014448441797867417, + "skip_count": 0.0, + "step": 6558, + "text_loss": 0.46065983176231384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0003500612444678365, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 10579879.0, + "repeat_count": 0.0, + "routers_loss": 0.007939066737890244, + "skip_count": 1.0, + "step": 6560, + "text_loss": 0.3299395740032196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000349766001355274, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 10583067.0, + "repeat_count": 0.0, + "routers_loss": 0.010073966346681118, + "skip_count": 2.0, + "step": 6562, + "text_loss": 0.278255820274353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.00034947081581266335, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 10586276.0, + "repeat_count": 0.0, + "routers_loss": 0.0062315030954778194, + "skip_count": 1.0, + "step": 6564, + "text_loss": 0.22706018388271332 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003491756879531201, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10589257.0, + "repeat_count": 3.0, + "routers_loss": 0.0023778853937983513, + "skip_count": 4.0, + "step": 6566, + "text_loss": 0.5567800998687744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0003488806178897377, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 10592163.0, + "repeat_count": 0.0, + "routers_loss": 0.0004184350254945457, + "skip_count": 0.0, + "step": 6568, + "text_loss": 0.4027897119522095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003485856057355876, + "loss": 0.0027, + "macro_f1": 0.6666666865348816, + "num_tokens": 10595326.0, + "repeat_count": 0.0, + "routers_loss": 0.0035254736430943012, + "skip_count": 1.0, + "step": 6570, + "text_loss": 0.3044572174549103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000348290651603719, + "loss": 0.0029, + "macro_f1": 0.3333333432674408, + "num_tokens": 10598236.0, + "repeat_count": 0.0, + "routers_loss": 0.0030894684605300426, + "skip_count": 0.0, + "step": 6572, + "text_loss": 0.23021161556243896 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.00034799575560715896, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 10601653.0, + "repeat_count": 1.0, + "routers_loss": 0.0036557347048074007, + "skip_count": 0.0, + "step": 6574, + "text_loss": 0.5437754392623901 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0003477009178589121, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10604581.0, + "repeat_count": 2.0, + "routers_loss": 0.021344119682908058, + "skip_count": 4.0, + "step": 6576, + "text_loss": 0.29078927636146545 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0003474061384719608, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10607676.0, + "repeat_count": 1.0, + "routers_loss": 0.0037169242277741432, + "skip_count": 1.0, + "step": 6578, + "text_loss": 1.1790896654129028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0003471114175592649, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 10611269.0, + "repeat_count": 2.0, + "routers_loss": 0.005873420741409063, + "skip_count": 4.0, + "step": 6580, + "text_loss": 0.36204129457473755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0003468167552337624, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 10614335.0, + "repeat_count": 1.0, + "routers_loss": 0.01030842587351799, + "skip_count": 2.0, + "step": 6582, + "text_loss": 0.20400437712669373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.00034652215160836826, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 10617565.0, + "repeat_count": 0.0, + "routers_loss": 0.0025721401907503605, + "skip_count": 0.0, + "step": 6584, + "text_loss": 0.44676345586776733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00034622760679597507, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10620706.0, + "repeat_count": 0.0, + "routers_loss": 0.005751762073487043, + "skip_count": 1.0, + "step": 6586, + "text_loss": 0.4733653664588928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00034593312090945306, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 10623916.0, + "repeat_count": 0.0, + "routers_loss": 0.0029759553726762533, + "skip_count": 3.0, + "step": 6588, + "text_loss": 0.49876922369003296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0003456386940616498, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10628093.0, + "repeat_count": 0.0, + "routers_loss": 0.0010031822603195906, + "skip_count": 0.0, + "step": 6590, + "text_loss": 0.42708611488342285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00034534432636539004, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10631739.0, + "repeat_count": 0.0, + "routers_loss": 0.0014793311711400747, + "skip_count": 0.0, + "step": 6592, + "text_loss": 0.18193726241588593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0003450500179334762, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10634862.0, + "repeat_count": 0.0, + "routers_loss": 0.0059733521193265915, + "skip_count": 2.0, + "step": 6594, + "text_loss": 0.28596529364585876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.967420017610802, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0003447557688786879, + "loss": 0.0043, + "macro_f1": 0.3272727429866791, + "num_tokens": 10637758.0, + "repeat_count": 0.0, + "routers_loss": 0.0076768649742007256, + "skip_count": 1.0, + "step": 6596, + "text_loss": 0.39428210258483887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00034446157931378185, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10640440.0, + "repeat_count": 0.0, + "routers_loss": 0.0015128811355680227, + "skip_count": 0.0, + "step": 6598, + "text_loss": 0.45584383606910706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.00034416744935149193, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 10643600.0, + "repeat_count": 0.0, + "routers_loss": 0.000757391273509711, + "skip_count": 0.0, + "step": 6600, + "text_loss": 0.503209114074707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0003438733791045294, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10646907.0, + "repeat_count": 0.0, + "routers_loss": 0.0025944956578314304, + "skip_count": 2.0, + "step": 6602, + "text_loss": 0.4370735287666321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00034357936868558255, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10649995.0, + "repeat_count": 0.0, + "routers_loss": 0.0006543452036567032, + "skip_count": 0.0, + "step": 6604, + "text_loss": 0.4125586748123169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00034328541820731663, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10653251.0, + "repeat_count": 0.0, + "routers_loss": 0.00027016724925488234, + "skip_count": 1.0, + "step": 6606, + "text_loss": 0.7309898734092712 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 31.023481068388612, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.020751953125, + "learning_rate": 0.00034299152778237413, + "loss": 0.0062, + "macro_f1": 0.8823530077934265, + "num_tokens": 10657229.0, + "repeat_count": 1.0, + "routers_loss": 0.01905548945069313, + "skip_count": 2.0, + "step": 6608, + "text_loss": 0.42367079854011536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0003426976975233744, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10660524.0, + "repeat_count": 0.0, + "routers_loss": 0.0004718089767266065, + "skip_count": 0.0, + "step": 6610, + "text_loss": 0.6613664627075195 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00034240392754291343, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 10663908.0, + "repeat_count": 1.0, + "routers_loss": 0.0027069442439824343, + "skip_count": 0.0, + "step": 6612, + "text_loss": 0.859471321105957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.000342110217953565, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10667814.0, + "repeat_count": 0.0, + "routers_loss": 0.0015497280983254313, + "skip_count": 0.0, + "step": 6614, + "text_loss": 0.18337638676166534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0003418165688678788, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10671630.0, + "repeat_count": 0.0, + "routers_loss": 0.0013396464055404067, + "skip_count": 0.0, + "step": 6616, + "text_loss": 0.860016405582428 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 31.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0003415229803983819, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 10675308.0, + "repeat_count": 0.0, + "routers_loss": 0.007542039267718792, + "skip_count": 3.0, + "step": 6618, + "text_loss": 0.15481022000312805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0003412294526575779, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 10678092.0, + "repeat_count": 0.0, + "routers_loss": 0.002029839437454939, + "skip_count": 2.0, + "step": 6620, + "text_loss": 0.5121933221817017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00034093598575794706, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10681382.0, + "repeat_count": 0.0, + "routers_loss": 0.0013001341139897704, + "skip_count": 0.0, + "step": 6622, + "text_loss": 0.4555061161518097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00034064257981194655, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 10684255.0, + "repeat_count": 0.0, + "routers_loss": 0.0007926415419206023, + "skip_count": 0.0, + "step": 6624, + "text_loss": 0.7298227548599243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003403492349320101, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 10686904.0, + "repeat_count": 0.0, + "routers_loss": 0.0021080176811665297, + "skip_count": 1.0, + "step": 6626, + "text_loss": 0.45434215664863586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.000340055951230548, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10690311.0, + "repeat_count": 0.0, + "routers_loss": 0.004011874087154865, + "skip_count": 0.0, + "step": 6628, + "text_loss": 0.15496443212032318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00033976272881994707, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10693395.0, + "repeat_count": 0.0, + "routers_loss": 0.0031893099658191204, + "skip_count": 2.0, + "step": 6630, + "text_loss": 0.5291517972946167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003394695678125708, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 10697046.0, + "repeat_count": 0.0, + "routers_loss": 0.0033124347683042288, + "skip_count": 1.0, + "step": 6632, + "text_loss": 0.2893230617046356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00033917646832075886, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10700111.0, + "repeat_count": 0.0, + "routers_loss": 0.002547801472246647, + "skip_count": 0.0, + "step": 6634, + "text_loss": 0.10363512486219406 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 31.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0003388834304568275, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 10703939.0, + "repeat_count": 2.0, + "routers_loss": 0.0019040531478822231, + "skip_count": 0.0, + "step": 6636, + "text_loss": 0.5185034275054932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00033859045433306975, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 10707187.0, + "repeat_count": 0.0, + "routers_loss": 0.0074104927480220795, + "skip_count": 2.0, + "step": 6638, + "text_loss": 0.1618153154850006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0003382975400617543, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 10710029.0, + "repeat_count": 0.0, + "routers_loss": 0.0013861875049769878, + "skip_count": 1.0, + "step": 6640, + "text_loss": 0.6674485206604004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0003380046877551266, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10713318.0, + "repeat_count": 0.0, + "routers_loss": 0.0034452753607183695, + "skip_count": 0.0, + "step": 6642, + "text_loss": 0.39299124479293823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0003377118975254082, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 10716130.0, + "repeat_count": 0.0, + "routers_loss": 0.006802885327488184, + "skip_count": 2.0, + "step": 6644, + "text_loss": 0.12942606210708618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.20193718814206, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003374191694847968, + "loss": 0.0052, + "macro_f1": 0.6601307392120361, + "num_tokens": 10719400.0, + "repeat_count": 1.0, + "routers_loss": 0.03718209266662598, + "skip_count": 2.0, + "step": 6646, + "text_loss": 0.34327754378318787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0003371265037454663, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 10722108.0, + "repeat_count": 0.0, + "routers_loss": 0.006016947794705629, + "skip_count": 2.0, + "step": 6648, + "text_loss": 0.15644726157188416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.220722042852948, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00033683390041956663, + "loss": 0.0075, + "macro_f1": 0.6601307392120361, + "num_tokens": 10725709.0, + "repeat_count": 1.0, + "routers_loss": 0.04308273270726204, + "skip_count": 2.0, + "step": 6650, + "text_loss": 0.1875772923231125 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 31.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003365413596192243, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 10728717.0, + "repeat_count": 2.0, + "routers_loss": 0.006372809875756502, + "skip_count": 1.0, + "step": 6652, + "text_loss": 0.4948291778564453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00033624888145654137, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10732082.0, + "repeat_count": 0.0, + "routers_loss": 0.0014530479675158858, + "skip_count": 0.0, + "step": 6654, + "text_loss": 0.44932305812835693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00033595646604359585, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10734663.0, + "repeat_count": 0.0, + "routers_loss": 0.001924810465425253, + "skip_count": 0.0, + "step": 6656, + "text_loss": 0.45626893639564514 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00033566411349244206, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10737470.0, + "repeat_count": 1.0, + "routers_loss": 0.0040014320984482765, + "skip_count": 0.0, + "step": 6658, + "text_loss": 0.2700682580471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00033537182391510996, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10740228.0, + "repeat_count": 0.0, + "routers_loss": 0.0008573737577535212, + "skip_count": 0.0, + "step": 6660, + "text_loss": 0.5626822113990784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003350795974236055, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 10742883.0, + "repeat_count": 0.0, + "routers_loss": 0.011166860349476337, + "skip_count": 1.0, + "step": 6662, + "text_loss": 0.23357805609703064 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 31.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00033478743412991037, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 10746459.0, + "repeat_count": 1.0, + "routers_loss": 0.01719980500638485, + "skip_count": 6.0, + "step": 6664, + "text_loss": 0.150017648935318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00033449533414598223, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 10749984.0, + "repeat_count": 0.0, + "routers_loss": 0.0038280142471194267, + "skip_count": 2.0, + "step": 6666, + "text_loss": 0.6312657594680786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00033420329758375423, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 10752792.0, + "repeat_count": 0.0, + "routers_loss": 0.0007688060286454856, + "skip_count": 1.0, + "step": 6668, + "text_loss": 0.6794863939285278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00033391132455513537, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10756125.0, + "repeat_count": 0.0, + "routers_loss": 0.003196930279955268, + "skip_count": 2.0, + "step": 6670, + "text_loss": 0.22897565364837646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0003336194151720102, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 10759296.0, + "repeat_count": 0.0, + "routers_loss": 0.0026212623342871666, + "skip_count": 0.0, + "step": 6672, + "text_loss": 0.5236268639564514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0003333275695462391, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10762574.0, + "repeat_count": 0.0, + "routers_loss": 0.007855101488530636, + "skip_count": 2.0, + "step": 6674, + "text_loss": 0.2971038818359375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0003330357877896577, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10765758.0, + "repeat_count": 0.0, + "routers_loss": 0.004191791173070669, + "skip_count": 2.0, + "step": 6676, + "text_loss": 0.17358586192131042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0003327440700140774, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 10769396.0, + "repeat_count": 0.0, + "routers_loss": 0.004101858474314213, + "skip_count": 1.0, + "step": 6678, + "text_loss": 0.28932204842567444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.000332452416331285, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 10772605.0, + "repeat_count": 0.0, + "routers_loss": 0.0008305918308906257, + "skip_count": 0.0, + "step": 6680, + "text_loss": 0.47090092301368713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0003321608268530427, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 10776576.0, + "repeat_count": 0.0, + "routers_loss": 0.003022305201739073, + "skip_count": 1.0, + "step": 6682, + "text_loss": 0.4467788338661194 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00033186930169108795, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10779648.0, + "repeat_count": 1.0, + "routers_loss": 0.0021474999375641346, + "skip_count": 0.0, + "step": 6684, + "text_loss": 0.6249470710754395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.00033157784095713417, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 10782665.0, + "repeat_count": 0.0, + "routers_loss": 0.0025120675563812256, + "skip_count": 1.0, + "step": 6686, + "text_loss": 0.6763803958892822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0003312864447628695, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 10785789.0, + "repeat_count": 0.0, + "routers_loss": 0.0013111691223457456, + "skip_count": 1.0, + "step": 6688, + "text_loss": 0.6609058380126953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00033099511321995744, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 10788846.0, + "repeat_count": 0.0, + "routers_loss": 0.0012354454956948757, + "skip_count": 0.0, + "step": 6690, + "text_loss": 0.4421829283237457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0003307038464400368, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10791611.0, + "repeat_count": 0.0, + "routers_loss": 0.0035219944547861814, + "skip_count": 2.0, + "step": 6692, + "text_loss": 0.16222824156284332 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00033041264453472153, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10794868.0, + "repeat_count": 1.0, + "routers_loss": 0.0007216202793642879, + "skip_count": 0.0, + "step": 6694, + "text_loss": 0.37388721108436584 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 31.436747872028178, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0003301215076156008, + "loss": 0.0063, + "macro_f1": 0.8803418874740601, + "num_tokens": 10797737.0, + "repeat_count": 2.0, + "routers_loss": 0.025403080508112907, + "skip_count": 7.0, + "step": 6696, + "text_loss": 0.5086690187454224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0003298304357942389, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 10800972.0, + "repeat_count": 0.0, + "routers_loss": 0.010532539337873459, + "skip_count": 2.0, + "step": 6698, + "text_loss": 0.22500646114349365 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00032953942918217494, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 10803654.0, + "repeat_count": 0.0, + "routers_loss": 0.0009591903653927147, + "skip_count": 0.0, + "step": 6700, + "text_loss": 0.6256277561187744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0003292484878909232, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10807506.0, + "repeat_count": 0.0, + "routers_loss": 0.003801517654210329, + "skip_count": 2.0, + "step": 6702, + "text_loss": 0.522081196308136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00032895761203197317, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 10810163.0, + "repeat_count": 0.0, + "routers_loss": 0.002608039416372776, + "skip_count": 2.0, + "step": 6704, + "text_loss": 0.3600201904773712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00032866680171678874, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10813202.0, + "repeat_count": 0.0, + "routers_loss": 0.0026464913971722126, + "skip_count": 0.0, + "step": 6706, + "text_loss": 0.2513798773288727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00032837605705680895, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 10816484.0, + "repeat_count": 0.0, + "routers_loss": 0.0027157769072800875, + "skip_count": 0.0, + "step": 6708, + "text_loss": 0.34391456842422485 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0003280853781634481, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 10819794.0, + "repeat_count": 1.0, + "routers_loss": 0.0016086180694401264, + "skip_count": 1.0, + "step": 6710, + "text_loss": 0.6535179615020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003277947651480946, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10823033.0, + "repeat_count": 0.0, + "routers_loss": 0.002368347719311714, + "skip_count": 0.0, + "step": 6712, + "text_loss": 0.5596423745155334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0003275042181221119, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 10826276.0, + "repeat_count": 0.0, + "routers_loss": 0.003124286886304617, + "skip_count": 0.0, + "step": 6714, + "text_loss": 0.6584402322769165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0003272137371968382, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10828846.0, + "repeat_count": 0.0, + "routers_loss": 0.0006088328082114458, + "skip_count": 0.0, + "step": 6716, + "text_loss": 0.4602710008621216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.00032692332248358645, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10832025.0, + "repeat_count": 0.0, + "routers_loss": 0.002511275466531515, + "skip_count": 2.0, + "step": 6718, + "text_loss": 0.42790886759757996 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.000326632974093644, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 10835110.0, + "repeat_count": 1.0, + "routers_loss": 0.01076667383313179, + "skip_count": 0.0, + "step": 6720, + "text_loss": 0.5659847855567932 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0003263426921382728, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 10838279.0, + "repeat_count": 2.0, + "routers_loss": 0.004973042290657759, + "skip_count": 2.0, + "step": 6722, + "text_loss": 0.675341010093689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.00032605247672870964, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 10841381.0, + "repeat_count": 0.0, + "routers_loss": 0.0013990222942084074, + "skip_count": 0.0, + "step": 6724, + "text_loss": 0.5389315485954285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00032576232797616554, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 10844583.0, + "repeat_count": 0.0, + "routers_loss": 0.003186358604580164, + "skip_count": 1.0, + "step": 6726, + "text_loss": 0.5603348016738892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003254722459918261, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 10847670.0, + "repeat_count": 0.0, + "routers_loss": 0.001443870598450303, + "skip_count": 0.0, + "step": 6728, + "text_loss": 0.6922405362129211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0003251822308868512, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 10851479.0, + "repeat_count": 0.0, + "routers_loss": 0.004294445738196373, + "skip_count": 0.0, + "step": 6730, + "text_loss": 0.7145437002182007 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.00032489228277237514, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10854489.0, + "repeat_count": 0.0, + "routers_loss": 0.0032078945077955723, + "skip_count": 0.0, + "step": 6732, + "text_loss": 0.4077773094177246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.00032460240175950664, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 10856954.0, + "repeat_count": 1.0, + "routers_loss": 0.0038214854430407286, + "skip_count": 2.0, + "step": 6734, + "text_loss": 0.32071781158447266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0003243125879593286, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10860016.0, + "repeat_count": 0.0, + "routers_loss": 0.0013407845981419086, + "skip_count": 0.0, + "step": 6736, + "text_loss": 0.45335495471954346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0003240228414828984, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 10863021.0, + "repeat_count": 0.0, + "routers_loss": 0.0010989385191351175, + "skip_count": 0.0, + "step": 6738, + "text_loss": 0.562619149684906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0003237331624412473, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 10866548.0, + "repeat_count": 0.0, + "routers_loss": 0.006139552686363459, + "skip_count": 0.0, + "step": 6740, + "text_loss": 0.14510060846805573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00032344355094538087, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10869402.0, + "repeat_count": 0.0, + "routers_loss": 0.004785746335983276, + "skip_count": 0.0, + "step": 6742, + "text_loss": 0.5655979514122009 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00032315400710627876, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 10874165.0, + "repeat_count": 0.0, + "routers_loss": 0.0052397786639630795, + "skip_count": 0.0, + "step": 6744, + "text_loss": 0.4785873591899872 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 31.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0003228645310348948, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 10876919.0, + "repeat_count": 3.0, + "routers_loss": 0.00460197776556015, + "skip_count": 1.0, + "step": 6746, + "text_loss": 0.5683879256248474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0003225751228421566, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10880179.0, + "repeat_count": 0.0, + "routers_loss": 0.0032690472435206175, + "skip_count": 0.0, + "step": 6748, + "text_loss": 0.5268497467041016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.00032228578263896607, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 10883711.0, + "repeat_count": 0.0, + "routers_loss": 0.0036305058747529984, + "skip_count": 0.0, + "step": 6750, + "text_loss": 0.16675594449043274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0003219965105361989, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10887041.0, + "repeat_count": 0.0, + "routers_loss": 0.002453352091833949, + "skip_count": 1.0, + "step": 6752, + "text_loss": 0.7010246515274048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.00032170730664470465, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10890053.0, + "repeat_count": 0.0, + "routers_loss": 0.0020381701178848743, + "skip_count": 0.0, + "step": 6754, + "text_loss": 0.46637895703315735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003214181710753069, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10893501.0, + "repeat_count": 0.0, + "routers_loss": 0.004525696858763695, + "skip_count": 0.0, + "step": 6756, + "text_loss": 0.1768684983253479 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003211291039388026, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10896480.0, + "repeat_count": 1.0, + "routers_loss": 0.0038154330104589462, + "skip_count": 0.0, + "step": 6758, + "text_loss": 0.7908347845077515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.00032084010534596326, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 10899158.0, + "repeat_count": 0.0, + "routers_loss": 0.004711449146270752, + "skip_count": 2.0, + "step": 6760, + "text_loss": 0.37209007143974304 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0003205511754075335, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 10901791.0, + "repeat_count": 1.0, + "routers_loss": 0.0025003373157233, + "skip_count": 1.0, + "step": 6762, + "text_loss": 0.8081201314926147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 31.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00032026231423423204, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10904817.0, + "repeat_count": 0.0, + "routers_loss": 0.007387075573205948, + "skip_count": 3.0, + "step": 6764, + "text_loss": 0.30355480313301086 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.76548282946874, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0003199735219367507, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 10908018.0, + "repeat_count": 2.0, + "routers_loss": 0.04275592789053917, + "skip_count": 0.0, + "step": 6766, + "text_loss": 0.26562029123306274 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.774875256824185, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0003196847986257553, + "loss": 0.008, + "macro_f1": 0.9255813956260681, + "num_tokens": 10911264.0, + "repeat_count": 3.0, + "routers_loss": 0.034824032336473465, + "skip_count": 4.0, + "step": 6768, + "text_loss": 0.2761698067188263 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00031939614441188523, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10915964.0, + "repeat_count": 0.0, + "routers_loss": 0.0011179742868989706, + "skip_count": 0.0, + "step": 6770, + "text_loss": 0.4107927083969116 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00031910755940575344, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 10918678.0, + "repeat_count": 0.0, + "routers_loss": 0.0011521469568833709, + "skip_count": 0.0, + "step": 6772, + "text_loss": 0.43064895272254944 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.000318819043717946, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10921757.0, + "repeat_count": 1.0, + "routers_loss": 0.002861087443307042, + "skip_count": 1.0, + "step": 6774, + "text_loss": 0.5945150852203369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0003185305974590229, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 10924767.0, + "repeat_count": 0.0, + "routers_loss": 0.0011365334503352642, + "skip_count": 0.0, + "step": 6776, + "text_loss": 0.36615172028541565 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0003182422207395171, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10927750.0, + "repeat_count": 1.0, + "routers_loss": 0.0034391419030725956, + "skip_count": 0.0, + "step": 6778, + "text_loss": 0.17081251740455627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003179539136699351, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10930817.0, + "repeat_count": 0.0, + "routers_loss": 0.004941808991134167, + "skip_count": 2.0, + "step": 6780, + "text_loss": 0.7683762311935425 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 31.840622248312297, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.038330078125, + "learning_rate": 0.00031766567636075675, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 10933882.0, + "repeat_count": 1.0, + "routers_loss": 0.017502857372164726, + "skip_count": 2.0, + "step": 6782, + "text_loss": 0.38010457158088684 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003173775089224353, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 10936909.0, + "repeat_count": 1.0, + "routers_loss": 0.0035372809506952763, + "skip_count": 2.0, + "step": 6784, + "text_loss": 0.5760656595230103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00031708941146539707, + "loss": 0.0061, + "macro_f1": 0.3272727429866791, + "num_tokens": 10940032.0, + "repeat_count": 1.0, + "routers_loss": 0.02229934185743332, + "skip_count": 0.0, + "step": 6786, + "text_loss": 0.5767728090286255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00031680138410004123, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 10943217.0, + "repeat_count": 0.0, + "routers_loss": 0.0028649091254919767, + "skip_count": 1.0, + "step": 6788, + "text_loss": 0.9756367802619934 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00031651342693674066, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 10947847.0, + "repeat_count": 0.0, + "routers_loss": 0.0039158593863248825, + "skip_count": 2.0, + "step": 6790, + "text_loss": 0.2504335045814514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.000316225540085841, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 10950879.0, + "repeat_count": 0.0, + "routers_loss": 0.0022091215942054987, + "skip_count": 0.0, + "step": 6792, + "text_loss": 0.525842547416687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00031593772365766105, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 10954960.0, + "repeat_count": 0.0, + "routers_loss": 0.0006841494468972087, + "skip_count": 0.0, + "step": 6794, + "text_loss": 0.6383582353591919 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.906369239800412, + "f1_execute": 0.9729729890823364, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003156499777624926, + "loss": 0.006, + "macro_f1": 0.9539539813995361, + "num_tokens": 10958278.0, + "repeat_count": 5.0, + "routers_loss": 0.03810702636837959, + "skip_count": 5.0, + "step": 6796, + "text_loss": 0.5901661515235901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.0003153623025106005, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 10962412.0, + "repeat_count": 0.0, + "routers_loss": 0.00046833412488922477, + "skip_count": 0.0, + "step": 6798, + "text_loss": 0.42693984508514404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00031507469801222233, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 10966037.0, + "repeat_count": 0.0, + "routers_loss": 0.006818041671067476, + "skip_count": 2.0, + "step": 6800, + "text_loss": 0.5326262712478638 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00031478716437756876, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10969369.0, + "repeat_count": 0.0, + "routers_loss": 0.0029889161232858896, + "skip_count": 0.0, + "step": 6802, + "text_loss": 0.49028220772743225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0003144997017168232, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10972016.0, + "repeat_count": 0.0, + "routers_loss": 0.0038266500923782587, + "skip_count": 2.0, + "step": 6804, + "text_loss": 0.43391722440719604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0003142123101401417, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10975153.0, + "repeat_count": 0.0, + "routers_loss": 0.0005866789724677801, + "skip_count": 0.0, + "step": 6806, + "text_loss": 0.5888382196426392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00031392498975765353, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 10977881.0, + "repeat_count": 0.0, + "routers_loss": 0.002122384263202548, + "skip_count": 0.0, + "step": 6808, + "text_loss": 0.30313390493392944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0003136377406794604, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 10982025.0, + "repeat_count": 0.0, + "routers_loss": 0.0005535652744583786, + "skip_count": 0.0, + "step": 6810, + "text_loss": 0.5788959264755249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003133505630156365, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 10985419.0, + "repeat_count": 0.0, + "routers_loss": 0.010623604990541935, + "skip_count": 2.0, + "step": 6812, + "text_loss": 0.18577243387699127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.00031306345687622905, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10989116.0, + "repeat_count": 0.0, + "routers_loss": 0.0004721239674836397, + "skip_count": 0.0, + "step": 6814, + "text_loss": 0.4818301200866699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0167236328125, + "learning_rate": 0.0003127764223712575, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10992064.0, + "repeat_count": 0.0, + "routers_loss": 0.0004238430701661855, + "skip_count": 0.0, + "step": 6816, + "text_loss": 0.7482771277427673 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003124894596107141, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10994903.0, + "repeat_count": 1.0, + "routers_loss": 0.005224394146353006, + "skip_count": 2.0, + "step": 6818, + "text_loss": 0.186603844165802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.00031220256870456356, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 10998692.0, + "repeat_count": 1.0, + "routers_loss": 0.0021751862950623035, + "skip_count": 2.0, + "step": 6820, + "text_loss": 0.45633986592292786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 32.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00031191574976274284, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11001284.0, + "repeat_count": 0.0, + "routers_loss": 0.004747046157717705, + "skip_count": 4.0, + "step": 6822, + "text_loss": 0.5651670694351196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003116290028951617, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 11004293.0, + "repeat_count": 0.0, + "routers_loss": 0.0008316585444845259, + "skip_count": 0.0, + "step": 6824, + "text_loss": 0.3167279362678528 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.000311342328211702, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11007080.0, + "repeat_count": 0.0, + "routers_loss": 0.0004732926026917994, + "skip_count": 0.0, + "step": 6826, + "text_loss": 0.49171411991119385 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000311055725822218, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 11010078.0, + "repeat_count": 1.0, + "routers_loss": 0.004238729365170002, + "skip_count": 0.0, + "step": 6828, + "text_loss": 0.21484950184822083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0003107691958365361, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 11013368.0, + "repeat_count": 0.0, + "routers_loss": 0.0029175232630223036, + "skip_count": 2.0, + "step": 6830, + "text_loss": 0.3718266189098358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003104827383644555, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11016704.0, + "repeat_count": 0.0, + "routers_loss": 0.00191891985014081, + "skip_count": 0.0, + "step": 6832, + "text_loss": 0.28772637248039246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00031019635351574705, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 11019651.0, + "repeat_count": 0.0, + "routers_loss": 0.004300855100154877, + "skip_count": 2.0, + "step": 6834, + "text_loss": 0.6583508849143982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.000309910041400154, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11023847.0, + "repeat_count": 0.0, + "routers_loss": 0.00037701442488469183, + "skip_count": 0.0, + "step": 6836, + "text_loss": 0.36090534925460815 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 32.10331670090989, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0003096238021273917, + "loss": 0.0077, + "macro_f1": 0.9265305995941162, + "num_tokens": 11027804.0, + "repeat_count": 1.0, + "routers_loss": 0.03601725772023201, + "skip_count": 3.0, + "step": 6838, + "text_loss": 0.24180401861667633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.11270912826534, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00030933763580714757, + "loss": 0.0052, + "macro_f1": 0.6601307392120361, + "num_tokens": 11030778.0, + "repeat_count": 1.0, + "routers_loss": 0.023780640214681625, + "skip_count": 2.0, + "step": 6840, + "text_loss": 0.4978102743625641 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00030905154254908104, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 11034863.0, + "repeat_count": 1.0, + "routers_loss": 0.00565778324380517, + "skip_count": 0.0, + "step": 6842, + "text_loss": 0.558772623538971 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00030876552246282356, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11038488.0, + "repeat_count": 0.0, + "routers_loss": 0.010575232096016407, + "skip_count": 0.0, + "step": 6844, + "text_loss": 0.2955974340438843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0003084795756579787, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11041796.0, + "repeat_count": 0.0, + "routers_loss": 0.0015910190995782614, + "skip_count": 0.0, + "step": 6846, + "text_loss": 0.5009704828262329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0003081937022441217, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 11045141.0, + "repeat_count": 0.0, + "routers_loss": 0.0008034126949496567, + "skip_count": 0.0, + "step": 6848, + "text_loss": 0.3965311646461487 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 32.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0003079079023307999, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11047814.0, + "repeat_count": 2.0, + "routers_loss": 0.00810160581022501, + "skip_count": 0.0, + "step": 6850, + "text_loss": 0.24341927468776703 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0003076221760275321, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 11051330.0, + "repeat_count": 1.0, + "routers_loss": 0.006590691395103931, + "skip_count": 0.0, + "step": 6852, + "text_loss": 0.5887606739997864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00030733652344380936, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11055006.0, + "repeat_count": 0.0, + "routers_loss": 0.0005845054984092712, + "skip_count": 0.0, + "step": 6854, + "text_loss": 0.6621366739273071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003070509446890944, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 11058470.0, + "repeat_count": 0.0, + "routers_loss": 0.0041051446460187435, + "skip_count": 1.0, + "step": 6856, + "text_loss": 0.31603100895881653 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0003067654398728214, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 11061620.0, + "repeat_count": 1.0, + "routers_loss": 0.001603201380930841, + "skip_count": 0.0, + "step": 6858, + "text_loss": 0.5167516469955444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.00030648000910439636, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11064727.0, + "repeat_count": 0.0, + "routers_loss": 0.0024816282093524933, + "skip_count": 0.0, + "step": 6860, + "text_loss": 0.5869330167770386 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00030619465249319693, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11068208.0, + "repeat_count": 1.0, + "routers_loss": 0.003121294779703021, + "skip_count": 0.0, + "step": 6862, + "text_loss": 0.3920222818851471 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0003059093701485722, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11071315.0, + "repeat_count": 0.0, + "routers_loss": 0.0033239589538425207, + "skip_count": 1.0, + "step": 6864, + "text_loss": 0.4201887845993042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00030562416217984296, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 11074144.0, + "repeat_count": 0.0, + "routers_loss": 0.0016117560444399714, + "skip_count": 0.0, + "step": 6866, + "text_loss": 0.5283045172691345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0003053390286963015, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11077152.0, + "repeat_count": 0.0, + "routers_loss": 0.003879208816215396, + "skip_count": 0.0, + "step": 6868, + "text_loss": 0.16188788414001465 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00030505396980721143, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 11080200.0, + "repeat_count": 0.0, + "routers_loss": 0.007632353343069553, + "skip_count": 1.0, + "step": 6870, + "text_loss": 0.25986847281455994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00030476898562180793, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 11083356.0, + "repeat_count": 0.0, + "routers_loss": 0.004322016146034002, + "skip_count": 2.0, + "step": 6872, + "text_loss": 0.49556297063827515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0003044840762492974, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 11086354.0, + "repeat_count": 0.0, + "routers_loss": 0.0031272871419787407, + "skip_count": 2.0, + "step": 6874, + "text_loss": 0.1658666580915451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003041992417988577, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11088850.0, + "repeat_count": 0.0, + "routers_loss": 0.005371398758143187, + "skip_count": 2.0, + "step": 6876, + "text_loss": 0.22437214851379395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003039144823796378, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 11091784.0, + "repeat_count": 0.0, + "routers_loss": 0.0025086402893066406, + "skip_count": 0.0, + "step": 6878, + "text_loss": 0.7293354868888855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0003036297981007581, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11095204.0, + "repeat_count": 0.0, + "routers_loss": 0.015590827912092209, + "skip_count": 1.0, + "step": 6880, + "text_loss": 0.6406328678131104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0003033451890713103, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11098367.0, + "repeat_count": 0.0, + "routers_loss": 0.0013142531970515847, + "skip_count": 0.0, + "step": 6882, + "text_loss": 0.5209086537361145 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 32.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003030606554003571, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 11101047.0, + "repeat_count": 2.0, + "routers_loss": 0.0018484699539840221, + "skip_count": 0.0, + "step": 6884, + "text_loss": 0.743188202381134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00030277619719693217, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11104269.0, + "repeat_count": 0.0, + "routers_loss": 0.0016667681047692895, + "skip_count": 0.0, + "step": 6886, + "text_loss": 0.7918420433998108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0003024918145700406, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 11107248.0, + "repeat_count": 0.0, + "routers_loss": 0.0008098077378235757, + "skip_count": 0.0, + "step": 6888, + "text_loss": 0.3871288299560547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0003022075076286582, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 11111204.0, + "repeat_count": 0.0, + "routers_loss": 0.002324736909940839, + "skip_count": 0.0, + "step": 6890, + "text_loss": 0.3722921907901764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003019232764817321, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 11114363.0, + "repeat_count": 0.0, + "routers_loss": 0.00254769716411829, + "skip_count": 0.0, + "step": 6892, + "text_loss": 0.418519526720047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00030163912123818006, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11117718.0, + "repeat_count": 0.0, + "routers_loss": 0.000547234492842108, + "skip_count": 0.0, + "step": 6894, + "text_loss": 0.6087009310722351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0003013550420068909, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 11120437.0, + "repeat_count": 0.0, + "routers_loss": 0.00015221568173728883, + "skip_count": 0.0, + "step": 6896, + "text_loss": 0.6013991832733154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 32.385089521573235, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.046142578125, + "learning_rate": 0.00030107103889672436, + "loss": 0.0085, + "macro_f1": 0.5492662787437439, + "num_tokens": 11123708.0, + "repeat_count": 0.0, + "routers_loss": 0.024048971012234688, + "skip_count": 2.0, + "step": 6898, + "text_loss": 0.3612423837184906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0003007871120165111, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 11127294.0, + "repeat_count": 0.0, + "routers_loss": 0.0013236473314464092, + "skip_count": 0.0, + "step": 6900, + "text_loss": 0.5277031064033508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00030050326147505226, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11130270.0, + "repeat_count": 0.0, + "routers_loss": 0.0028277861420065165, + "skip_count": 0.0, + "step": 6902, + "text_loss": 0.5726971626281738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0003002194873811197, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11132955.0, + "repeat_count": 0.0, + "routers_loss": 0.0022369837388396263, + "skip_count": 0.0, + "step": 6904, + "text_loss": 0.18510448932647705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00029993578984345673, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 11136387.0, + "repeat_count": 0.0, + "routers_loss": 0.0038351211696863174, + "skip_count": 0.0, + "step": 6906, + "text_loss": 0.28313153982162476 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0002996521689707764, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 11139740.0, + "repeat_count": 0.0, + "routers_loss": 0.00032925375853665173, + "skip_count": 0.0, + "step": 6908, + "text_loss": 0.7315025329589844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0002993686248717629, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 11142587.0, + "repeat_count": 0.0, + "routers_loss": 0.002886304398998618, + "skip_count": 0.0, + "step": 6910, + "text_loss": 0.677378237247467 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00029908515765507084, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 11145415.0, + "repeat_count": 1.0, + "routers_loss": 0.0038471966981887817, + "skip_count": 0.0, + "step": 6912, + "text_loss": 0.5207083225250244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0002988017674293254, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 11148524.0, + "repeat_count": 0.0, + "routers_loss": 0.0023522782139480114, + "skip_count": 0.0, + "step": 6914, + "text_loss": 0.42507871985435486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0002985184543031222, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11152069.0, + "repeat_count": 0.0, + "routers_loss": 0.0012464249739423394, + "skip_count": 0.0, + "step": 6916, + "text_loss": 0.5694169998168945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.0002982352183850274, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 11155675.0, + "repeat_count": 0.0, + "routers_loss": 0.00828156154602766, + "skip_count": 2.0, + "step": 6918, + "text_loss": 0.22304373979568481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.00029795205978357754, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11158555.0, + "repeat_count": 0.0, + "routers_loss": 0.0019234733190387487, + "skip_count": 0.0, + "step": 6920, + "text_loss": 0.5519064664840698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0002976689786072795, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11161407.0, + "repeat_count": 0.0, + "routers_loss": 0.0003542431222740561, + "skip_count": 0.0, + "step": 6922, + "text_loss": 0.6748810410499573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0002973859749646104, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11166007.0, + "repeat_count": 0.0, + "routers_loss": 0.0004024899681098759, + "skip_count": 0.0, + "step": 6924, + "text_loss": 0.6613664627075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 32.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.000297103048964018, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 11169007.0, + "repeat_count": 0.0, + "routers_loss": 0.005519595462828875, + "skip_count": 3.0, + "step": 6926, + "text_loss": 0.3815552592277527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00029682020071392, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11172939.0, + "repeat_count": 0.0, + "routers_loss": 0.0016999440267682076, + "skip_count": 0.0, + "step": 6928, + "text_loss": 0.6727893352508545 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.535368359260346, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0002965374303227044, + "loss": 0.0055, + "macro_f1": 0.5492662787437439, + "num_tokens": 11176232.0, + "repeat_count": 2.0, + "routers_loss": 0.030950307846069336, + "skip_count": 0.0, + "step": 6930, + "text_loss": 0.5577763915061951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00029625473789872923, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11179775.0, + "repeat_count": 0.0, + "routers_loss": 0.00525702815502882, + "skip_count": 1.0, + "step": 6932, + "text_loss": 0.5860039591789246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.000295972123550323, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 11183262.0, + "repeat_count": 1.0, + "routers_loss": 0.0048187971115112305, + "skip_count": 2.0, + "step": 6934, + "text_loss": 0.7328732013702393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.016357421875, + "learning_rate": 0.00029568958738578364, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11186591.0, + "repeat_count": 0.0, + "routers_loss": 0.0015159632312133908, + "skip_count": 0.0, + "step": 6936, + "text_loss": 0.40563541650772095 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 32.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.017333984375, + "learning_rate": 0.0002954071295133801, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 11190056.0, + "repeat_count": 1.0, + "routers_loss": 0.011282073333859444, + "skip_count": 1.0, + "step": 6938, + "text_loss": 0.15986496210098267 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0002951247500413504, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 11193504.0, + "repeat_count": 3.0, + "routers_loss": 0.010220487602055073, + "skip_count": 5.0, + "step": 6940, + "text_loss": 0.2604432702064514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0002948424490779029, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 11196725.0, + "repeat_count": 0.0, + "routers_loss": 0.002620660001412034, + "skip_count": 1.0, + "step": 6942, + "text_loss": 0.48028868436813354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00029456022673121597, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11199303.0, + "repeat_count": 0.0, + "routers_loss": 0.00042651945841498673, + "skip_count": 0.0, + "step": 6944, + "text_loss": 0.5135554671287537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0002942780831094377, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11202319.0, + "repeat_count": 0.0, + "routers_loss": 0.005366047378629446, + "skip_count": 2.0, + "step": 6946, + "text_loss": 0.2809196710586548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0002939960183206861, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 11205622.0, + "repeat_count": 0.0, + "routers_loss": 0.0033479216508567333, + "skip_count": 0.0, + "step": 6948, + "text_loss": 0.2013140618801117 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00029371403247304887, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 11208637.0, + "repeat_count": 1.0, + "routers_loss": 0.0013508419506251812, + "skip_count": 0.0, + "step": 6950, + "text_loss": 0.4427332580089569 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0002934321256745833, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11211618.0, + "repeat_count": 0.0, + "routers_loss": 0.0020944071002304554, + "skip_count": 0.0, + "step": 6952, + "text_loss": 0.5406652688980103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00029315029803331704, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11214432.0, + "repeat_count": 0.0, + "routers_loss": 0.0012655078899115324, + "skip_count": 0.0, + "step": 6954, + "text_loss": 0.7720552086830139 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.00029286854965724686, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 11218127.0, + "repeat_count": 0.0, + "routers_loss": 0.009041395038366318, + "skip_count": 0.0, + "step": 6956, + "text_loss": 0.258109986782074 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 32.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0002925868806543391, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 11221440.0, + "repeat_count": 1.0, + "routers_loss": 0.0034558263141661882, + "skip_count": 1.0, + "step": 6958, + "text_loss": 0.5378029942512512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00029230529113253, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 11225391.0, + "repeat_count": 0.0, + "routers_loss": 0.005263930186629295, + "skip_count": 2.0, + "step": 6960, + "text_loss": 0.3616539537906647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0002920237811997251, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 11228648.0, + "repeat_count": 0.0, + "routers_loss": 0.003730480559170246, + "skip_count": 1.0, + "step": 6962, + "text_loss": 0.46682238578796387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.00029174235096379963, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11231828.0, + "repeat_count": 0.0, + "routers_loss": 0.004831735976040363, + "skip_count": 1.0, + "step": 6964, + "text_loss": 0.5718355178833008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 32.70443205165835, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.046875, + "learning_rate": 0.0002914610005325981, + "loss": 0.0102, + "macro_f1": 0.5492662787437439, + "num_tokens": 11234984.0, + "repeat_count": 0.0, + "routers_loss": 0.03880132734775543, + "skip_count": 2.0, + "step": 6966, + "text_loss": 0.3139013946056366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0002911797300139345, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 11239153.0, + "repeat_count": 0.0, + "routers_loss": 0.0006673726020380855, + "skip_count": 0.0, + "step": 6968, + "text_loss": 0.6040399074554443 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00029089853951559235, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11242178.0, + "repeat_count": 1.0, + "routers_loss": 0.0028971200808882713, + "skip_count": 0.0, + "step": 6970, + "text_loss": 0.304967999458313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00029061742914532427, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11245865.0, + "repeat_count": 0.0, + "routers_loss": 0.0010410466929897666, + "skip_count": 0.0, + "step": 6972, + "text_loss": 0.47892290353775024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0002903363990108524, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11248806.0, + "repeat_count": 0.0, + "routers_loss": 0.002133697969838977, + "skip_count": 0.0, + "step": 6974, + "text_loss": 0.2561415433883667 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 32.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0002900554492198677, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 11251807.0, + "repeat_count": 2.0, + "routers_loss": 0.002402493730187416, + "skip_count": 0.0, + "step": 6976, + "text_loss": 0.652428388595581 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0002897745798800311, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 11254615.0, + "repeat_count": 1.0, + "routers_loss": 0.006423915736377239, + "skip_count": 0.0, + "step": 6978, + "text_loss": 0.22414511442184448 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.000289493791098972, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 11257721.0, + "repeat_count": 0.0, + "routers_loss": 0.002536606043577194, + "skip_count": 0.0, + "step": 6980, + "text_loss": 0.1328018754720688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00028921308298428933, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11260840.0, + "repeat_count": 0.0, + "routers_loss": 0.000745086173992604, + "skip_count": 0.0, + "step": 6982, + "text_loss": 0.61724853515625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0002889324556435509, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 11264279.0, + "repeat_count": 0.0, + "routers_loss": 0.005258981604129076, + "skip_count": 0.0, + "step": 6984, + "text_loss": 0.1664455235004425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00028865190918429356, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 11268096.0, + "repeat_count": 0.0, + "routers_loss": 0.0008756023598834872, + "skip_count": 0.0, + "step": 6986, + "text_loss": 0.45111921429634094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.00028837144371402336, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11270611.0, + "repeat_count": 0.0, + "routers_loss": 0.0008175788098014891, + "skip_count": 0.0, + "step": 6988, + "text_loss": 0.5332239270210266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00028809105934021517, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11273826.0, + "repeat_count": 0.0, + "routers_loss": 0.003494064789265394, + "skip_count": 0.0, + "step": 6990, + "text_loss": 0.20264241099357605 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.82653360727913, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0002878107561703127, + "loss": 0.0056, + "macro_f1": 0.8817967176437378, + "num_tokens": 11276917.0, + "repeat_count": 2.0, + "routers_loss": 0.025257345288991928, + "skip_count": 3.0, + "step": 6992, + "text_loss": 0.18000070750713348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.835926034634575, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0002875305343117289, + "loss": 0.0044, + "macro_f1": 0.6603773832321167, + "num_tokens": 11279637.0, + "repeat_count": 1.0, + "routers_loss": 0.019206687808036804, + "skip_count": 1.0, + "step": 6994, + "text_loss": 0.5872798562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00028725039387184504, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11282717.0, + "repeat_count": 0.0, + "routers_loss": 0.009358765557408333, + "skip_count": 1.0, + "step": 6996, + "text_loss": 0.3412095904350281 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 32.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00028697033495801163, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 11285433.0, + "repeat_count": 1.0, + "routers_loss": 0.0038775671273469925, + "skip_count": 1.0, + "step": 6998, + "text_loss": 0.4316727817058563 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0002866903576775475, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11288414.0, + "repeat_count": 1.0, + "routers_loss": 0.004292591474950314, + "skip_count": 0.0, + "step": 7000, + "text_loss": 0.45106515288352966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.873495744056356, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046875, + "learning_rate": 0.0002864104621377409, + "loss": 0.007, + "macro_f1": 0.6601307392120361, + "num_tokens": 11291811.0, + "repeat_count": 1.0, + "routers_loss": 0.02195967361330986, + "skip_count": 2.0, + "step": 7002, + "text_loss": 0.29841285943984985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0002861306484458481, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11295179.0, + "repeat_count": 0.0, + "routers_loss": 0.0010119527578353882, + "skip_count": 0.0, + "step": 7004, + "text_loss": 0.5218569040298462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00028585091670909436, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11298182.0, + "repeat_count": 0.0, + "routers_loss": 0.002615996403619647, + "skip_count": 0.0, + "step": 7006, + "text_loss": 0.20382621884346008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00028557126703467316, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 11301262.0, + "repeat_count": 0.0, + "routers_loss": 0.002726050792261958, + "skip_count": 0.0, + "step": 7008, + "text_loss": 0.26718559861183167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0002852916995297471, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 11304590.0, + "repeat_count": 0.0, + "routers_loss": 0.0005590448854491115, + "skip_count": 0.0, + "step": 7010, + "text_loss": 0.5392091274261475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00028501221430144667, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11307690.0, + "repeat_count": 0.0, + "routers_loss": 0.004541353322565556, + "skip_count": 2.0, + "step": 7012, + "text_loss": 0.16159705817699432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.00028473281145687137, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11310866.0, + "repeat_count": 0.0, + "routers_loss": 0.0029630991630256176, + "skip_count": 1.0, + "step": 7014, + "text_loss": 0.9148072600364685 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 32.93924273554447, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0302734375, + "learning_rate": 0.0002844534911030888, + "loss": 0.0067, + "macro_f1": 0.9262410998344421, + "num_tokens": 11314517.0, + "repeat_count": 2.0, + "routers_loss": 0.023258809000253677, + "skip_count": 3.0, + "step": 7016, + "text_loss": 0.3853590488433838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.94863516289991, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060546875, + "learning_rate": 0.000284174253347135, + "loss": 0.0064, + "macro_f1": 0.3272727429866791, + "num_tokens": 11317526.0, + "repeat_count": 0.0, + "routers_loss": 0.010060093365609646, + "skip_count": 1.0, + "step": 7018, + "text_loss": 0.3412325382232666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00028389509829601444, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 11321684.0, + "repeat_count": 0.0, + "routers_loss": 0.0016713893273845315, + "skip_count": 0.0, + "step": 7020, + "text_loss": 0.9049796462059021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00028361602605670003, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 11324709.0, + "repeat_count": 0.0, + "routers_loss": 0.004167001228779554, + "skip_count": 2.0, + "step": 7022, + "text_loss": 0.24364058673381805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 32.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00028333703673613224, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 11327449.0, + "repeat_count": 0.0, + "routers_loss": 0.0027954576071351767, + "skip_count": 4.0, + "step": 7024, + "text_loss": 0.2872125506401062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00028305813044122096, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11330846.0, + "repeat_count": 0.0, + "routers_loss": 0.004644687287509441, + "skip_count": 0.0, + "step": 7026, + "text_loss": 0.1717570424079895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.00028277930727884336, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11333575.0, + "repeat_count": 0.0, + "routers_loss": 0.00557848671451211, + "skip_count": 2.0, + "step": 7028, + "text_loss": 0.3501792550086975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00028250056735584496, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11336899.0, + "repeat_count": 0.0, + "routers_loss": 0.0005694970604963601, + "skip_count": 0.0, + "step": 7030, + "text_loss": 0.5541794300079346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00028222191077903946, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11340163.0, + "repeat_count": 0.0, + "routers_loss": 0.0032896639313548803, + "skip_count": 0.0, + "step": 7032, + "text_loss": 0.5618721842765808 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00028194333765520853, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11343494.0, + "repeat_count": 1.0, + "routers_loss": 0.005377276800572872, + "skip_count": 0.0, + "step": 7034, + "text_loss": 0.325153648853302 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.00028166484809110206, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11346126.0, + "repeat_count": 0.0, + "routers_loss": 0.001204605447128415, + "skip_count": 0.0, + "step": 7036, + "text_loss": 0.5016651749610901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.00028138644219343736, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 11348879.0, + "repeat_count": 0.0, + "routers_loss": 0.005026837810873985, + "skip_count": 2.0, + "step": 7038, + "text_loss": 0.2430499643087387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00028110812006890064, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 11352457.0, + "repeat_count": 0.0, + "routers_loss": 0.0019850607495754957, + "skip_count": 0.0, + "step": 7040, + "text_loss": 0.42376917600631714 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.00028082988182414524, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 11356602.0, + "repeat_count": 1.0, + "routers_loss": 0.003362950636073947, + "skip_count": 2.0, + "step": 7042, + "text_loss": 0.4165397882461548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0002805517275657926, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 11359451.0, + "repeat_count": 0.0, + "routers_loss": 0.0019725612364709377, + "skip_count": 1.0, + "step": 7044, + "text_loss": 0.5597621202468872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0002802736574004319, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 11363614.0, + "repeat_count": 0.0, + "routers_loss": 0.0013963640667498112, + "skip_count": 0.0, + "step": 7046, + "text_loss": 0.6112356185913086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00027999567143462015, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11367015.0, + "repeat_count": 0.0, + "routers_loss": 0.0005658161826431751, + "skip_count": 0.0, + "step": 7048, + "text_loss": 0.4920886754989624 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 33.09862048723217, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00027971776977488193, + "loss": 0.0064, + "macro_f1": 0.925203263759613, + "num_tokens": 11370489.0, + "repeat_count": 3.0, + "routers_loss": 0.03657131269574165, + "skip_count": 5.0, + "step": 7050, + "text_loss": 0.28003939986228943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.00027943995252771017, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 11373614.0, + "repeat_count": 0.0, + "routers_loss": 0.004096088465303183, + "skip_count": 2.0, + "step": 7052, + "text_loss": 0.3145081400871277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00027916221979956457, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 11377631.0, + "repeat_count": 0.0, + "routers_loss": 0.0009888096246868372, + "skip_count": 0.0, + "step": 7054, + "text_loss": 0.4898056983947754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.126797769298506, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00027888457169687297, + "loss": 0.0065, + "macro_f1": 0.6603773832321167, + "num_tokens": 11380620.0, + "repeat_count": 1.0, + "routers_loss": 0.013347696512937546, + "skip_count": 1.0, + "step": 7056, + "text_loss": 0.7011964917182922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.00027860700832603056, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11383297.0, + "repeat_count": 0.0, + "routers_loss": 0.000849733711220324, + "skip_count": 1.0, + "step": 7058, + "text_loss": 0.4007014334201813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0002783295297934003, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 11386460.0, + "repeat_count": 0.0, + "routers_loss": 0.001546313869766891, + "skip_count": 1.0, + "step": 7060, + "text_loss": 0.3992713689804077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0002780521362053123, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11389605.0, + "repeat_count": 0.0, + "routers_loss": 0.001045585609972477, + "skip_count": 0.0, + "step": 7062, + "text_loss": 0.4440680146217346 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.00027777482766806446, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 11392105.0, + "repeat_count": 1.0, + "routers_loss": 0.00752411549910903, + "skip_count": 0.0, + "step": 7064, + "text_loss": 0.20152349770069122 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 33.17375990607572, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.031982421875, + "learning_rate": 0.0002774976042879218, + "loss": 0.0088, + "macro_f1": 0.5934640765190125, + "num_tokens": 11396142.0, + "repeat_count": 0.0, + "routers_loss": 0.019917849451303482, + "skip_count": 3.0, + "step": 7066, + "text_loss": 0.24365149438381195 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.00027722046617111696, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 11398827.0, + "repeat_count": 1.0, + "routers_loss": 0.0015933843096718192, + "skip_count": 0.0, + "step": 7068, + "text_loss": 0.31948477029800415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00027694341342384977, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11402623.0, + "repeat_count": 0.0, + "routers_loss": 0.0018986845389008522, + "skip_count": 2.0, + "step": 7070, + "text_loss": 0.47721394896507263 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00027666644615228727, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 11405628.0, + "repeat_count": 0.0, + "routers_loss": 0.002975719515234232, + "skip_count": 1.0, + "step": 7072, + "text_loss": 0.3972358703613281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0002763895644625637, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 11409468.0, + "repeat_count": 0.0, + "routers_loss": 0.005657708737999201, + "skip_count": 1.0, + "step": 7074, + "text_loss": 0.6004229187965393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0002761127684607811, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 11412572.0, + "repeat_count": 0.0, + "routers_loss": 0.0038351903203874826, + "skip_count": 2.0, + "step": 7076, + "text_loss": 1.0837591886520386 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.00027583605825300795, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 11416831.0, + "repeat_count": 2.0, + "routers_loss": 0.005529445596039295, + "skip_count": 2.0, + "step": 7078, + "text_loss": 0.575986921787262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00027555943394528014, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11420557.0, + "repeat_count": 0.0, + "routers_loss": 0.006243749521672726, + "skip_count": 0.0, + "step": 7080, + "text_loss": 0.606263279914856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.248899324919286, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00027528289564360064, + "loss": 0.0058, + "macro_f1": 0.6603773832321167, + "num_tokens": 11423471.0, + "repeat_count": 1.0, + "routers_loss": 0.031515009701251984, + "skip_count": 1.0, + "step": 7082, + "text_loss": 0.19393208622932434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0002750064434539394, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11426732.0, + "repeat_count": 0.0, + "routers_loss": 0.0005052287015132606, + "skip_count": 0.0, + "step": 7084, + "text_loss": 0.7202399969100952 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00027473007748223357, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11429391.0, + "repeat_count": 0.0, + "routers_loss": 0.005099403206259012, + "skip_count": 1.0, + "step": 7086, + "text_loss": 0.20651355385780334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00027445379783438685, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11432161.0, + "repeat_count": 0.0, + "routers_loss": 0.001447655027732253, + "skip_count": 0.0, + "step": 7088, + "text_loss": 0.34758952260017395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00027417760461627037, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11435417.0, + "repeat_count": 0.0, + "routers_loss": 0.000808655982837081, + "skip_count": 0.0, + "step": 7090, + "text_loss": 0.7414838671684265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00027390149793372177, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 11438313.0, + "repeat_count": 0.0, + "routers_loss": 0.005151710007339716, + "skip_count": 0.0, + "step": 7092, + "text_loss": 0.17792417109012604 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.00027362547789254574, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 11441681.0, + "repeat_count": 1.0, + "routers_loss": 0.0037353152874857187, + "skip_count": 3.0, + "step": 7094, + "text_loss": 0.5577781796455383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0002733495445985135, + "loss": 0.0026, + "macro_f1": 0.3333333432674408, + "num_tokens": 11444521.0, + "repeat_count": 0.0, + "routers_loss": 0.00038075417978689075, + "skip_count": 0.0, + "step": 7096, + "text_loss": 0.5052862167358398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.32403874376284, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0002730736981573632, + "loss": 0.0033, + "macro_f1": 0.3272727429866791, + "num_tokens": 11448481.0, + "repeat_count": 0.0, + "routers_loss": 0.007313522044569254, + "skip_count": 1.0, + "step": 7098, + "text_loss": 0.5869139432907104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0002727979386748001, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11452164.0, + "repeat_count": 0.0, + "routers_loss": 0.0020673887338489294, + "skip_count": 0.0, + "step": 7100, + "text_loss": 0.4354212284088135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0002725222662564954, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11455995.0, + "repeat_count": 0.0, + "routers_loss": 0.0008315460290759802, + "skip_count": 0.0, + "step": 7102, + "text_loss": 0.8714128732681274 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.35221602582917, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0002722466810080874, + "loss": 0.0053, + "macro_f1": 0.6603773832321167, + "num_tokens": 11458828.0, + "repeat_count": 1.0, + "routers_loss": 0.010913078673183918, + "skip_count": 1.0, + "step": 7104, + "text_loss": 0.6226683855056763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.36160845318462, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0002719711830351809, + "loss": 0.0076, + "macro_f1": 0.6603773832321167, + "num_tokens": 11462448.0, + "repeat_count": 1.0, + "routers_loss": 0.040428292006254196, + "skip_count": 1.0, + "step": 7106, + "text_loss": 0.2543688118457794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00027169577244334726, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 11465796.0, + "repeat_count": 0.0, + "routers_loss": 0.004473939072340727, + "skip_count": 1.0, + "step": 7108, + "text_loss": 0.12356872111558914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.00027142044933812424, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11469176.0, + "repeat_count": 0.0, + "routers_loss": 0.0017961655976250768, + "skip_count": 0.0, + "step": 7110, + "text_loss": 0.6800211668014526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0002711452138250162, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 11471983.0, + "repeat_count": 2.0, + "routers_loss": 0.003279087832197547, + "skip_count": 2.0, + "step": 7112, + "text_loss": 0.340279757976532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.3991781626064, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00027087006600949403, + "loss": 0.0065, + "macro_f1": 0.6603773832321167, + "num_tokens": 11475656.0, + "repeat_count": 1.0, + "routers_loss": 0.017024178057909012, + "skip_count": 1.0, + "step": 7114, + "text_loss": 0.3556337058544159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0002705950059969948, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 11479410.0, + "repeat_count": 0.0, + "routers_loss": 0.015487123280763626, + "skip_count": 1.0, + "step": 7116, + "text_loss": 0.4404350817203522 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.00027032003389292194, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 11483302.0, + "repeat_count": 0.0, + "routers_loss": 0.0011217560386285186, + "skip_count": 0.0, + "step": 7118, + "text_loss": 0.46771445870399475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0002700451498026454, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11486212.0, + "repeat_count": 0.0, + "routers_loss": 0.0010832607513293624, + "skip_count": 0.0, + "step": 7120, + "text_loss": 0.6795281767845154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00026977035383150106, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 11489320.0, + "repeat_count": 0.0, + "routers_loss": 0.002290027216076851, + "skip_count": 1.0, + "step": 7122, + "text_loss": 0.5304523706436157 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 33.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00026949564608479164, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 11492056.0, + "repeat_count": 2.0, + "routers_loss": 0.009950211271643639, + "skip_count": 6.0, + "step": 7124, + "text_loss": 0.21328973770141602 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 33.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0002692210266677855, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 11495165.0, + "repeat_count": 0.0, + "routers_loss": 0.0079165268689394, + "skip_count": 3.0, + "step": 7126, + "text_loss": 0.19840657711029053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00026894649568571724, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11497636.0, + "repeat_count": 0.0, + "routers_loss": 0.0013852717820554972, + "skip_count": 0.0, + "step": 7128, + "text_loss": 0.3360055088996887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00026867205324378776, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 11500806.0, + "repeat_count": 0.0, + "routers_loss": 0.0010151927126571536, + "skip_count": 0.0, + "step": 7130, + "text_loss": 0.6827390193939209 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00026839769944716373, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11504187.0, + "repeat_count": 0.0, + "routers_loss": 0.001110393786802888, + "skip_count": 0.0, + "step": 7132, + "text_loss": 0.5081584453582764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.49310243616085, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0002681234344009783, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 11507900.0, + "repeat_count": 0.0, + "routers_loss": 0.010587670840322971, + "skip_count": 1.0, + "step": 7134, + "text_loss": 0.28684356808662415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00026784925821033014, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 11510627.0, + "repeat_count": 0.0, + "routers_loss": 0.006658690981566906, + "skip_count": 0.0, + "step": 7136, + "text_loss": 0.24232104420661926 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.00026757517098028417, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 11513304.0, + "repeat_count": 0.0, + "routers_loss": 0.0014556109672412276, + "skip_count": 0.0, + "step": 7138, + "text_loss": 0.4718358516693115 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 33.52127971822718, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00026730117281587116, + "loss": 0.0062, + "macro_f1": 0.9265305995941162, + "num_tokens": 11516593.0, + "repeat_count": 1.0, + "routers_loss": 0.01590067707002163, + "skip_count": 3.0, + "step": 7140, + "text_loss": 0.2810344696044922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00026702726382208774, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 11519776.0, + "repeat_count": 0.0, + "routers_loss": 0.0014479428064078093, + "skip_count": 0.0, + "step": 7142, + "text_loss": 0.48876339197158813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00026675344410389623, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11522499.0, + "repeat_count": 0.0, + "routers_loss": 0.003729258431121707, + "skip_count": 2.0, + "step": 7144, + "text_loss": 0.5350890755653381 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0002664797137662248, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 11525220.0, + "repeat_count": 1.0, + "routers_loss": 0.0015156447188928723, + "skip_count": 1.0, + "step": 7146, + "text_loss": 0.5742373466491699 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00026620607291396773, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 11527926.0, + "repeat_count": 2.0, + "routers_loss": 0.004842780064791441, + "skip_count": 2.0, + "step": 7148, + "text_loss": 0.4994547665119171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.00026593252165198455, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 11531622.0, + "repeat_count": 0.0, + "routers_loss": 0.0026556351222097874, + "skip_count": 0.0, + "step": 7150, + "text_loss": 0.1567893922328949 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00026565906008510064, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11535191.0, + "repeat_count": 0.0, + "routers_loss": 0.008135059848427773, + "skip_count": 1.0, + "step": 7152, + "text_loss": 0.289173424243927 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.000265385688318107, + "loss": 0.0083, + "macro_f1": 1.0, + "num_tokens": 11539060.0, + "repeat_count": 1.0, + "routers_loss": 0.0020754633005708456, + "skip_count": 1.0, + "step": 7154, + "text_loss": 0.35089045763015747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0002651124064557602, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11541662.0, + "repeat_count": 1.0, + "routers_loss": 0.0023738413583487272, + "skip_count": 0.0, + "step": 7156, + "text_loss": 0.5026801228523254 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00026483921460278227, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 11544763.0, + "repeat_count": 0.0, + "routers_loss": 0.003311366541311145, + "skip_count": 1.0, + "step": 7158, + "text_loss": 0.22975654900074005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0002645661128638609, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 11547649.0, + "repeat_count": 0.0, + "routers_loss": 0.0008209354127757251, + "skip_count": 0.0, + "step": 7160, + "text_loss": 0.32840636372566223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00026429310134364926, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 11550648.0, + "repeat_count": 0.0, + "routers_loss": 0.0028574815951287746, + "skip_count": 0.0, + "step": 7162, + "text_loss": 0.23239612579345703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.00026402018014676584, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 11553790.0, + "repeat_count": 0.0, + "routers_loss": 0.005469404626637697, + "skip_count": 1.0, + "step": 7164, + "text_loss": 0.22877025604248047 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0002637473493777943, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 11556802.0, + "repeat_count": 1.0, + "routers_loss": 0.0032242932356894016, + "skip_count": 2.0, + "step": 7166, + "text_loss": 0.6376226544380188 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.00026347460914128443, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 11559607.0, + "repeat_count": 1.0, + "routers_loss": 0.0040627880953252316, + "skip_count": 2.0, + "step": 7168, + "text_loss": 0.6879657506942749 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.00026320195954175043, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 11562677.0, + "repeat_count": 2.0, + "routers_loss": 0.020494163036346436, + "skip_count": 4.0, + "step": 7170, + "text_loss": 0.3710069954395294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.00026292940068367224, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11565948.0, + "repeat_count": 0.0, + "routers_loss": 0.002662271959707141, + "skip_count": 0.0, + "step": 7172, + "text_loss": 0.15041157603263855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00026265693267149494, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 11568836.0, + "repeat_count": 0.0, + "routers_loss": 0.0039914860390126705, + "skip_count": 1.0, + "step": 7174, + "text_loss": 0.5372130870819092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.00026238455560962884, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11572542.0, + "repeat_count": 0.0, + "routers_loss": 0.0034708199091255665, + "skip_count": 0.0, + "step": 7176, + "text_loss": 0.2956286072731018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.00026211226960244914, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11575352.0, + "repeat_count": 0.0, + "routers_loss": 0.007794995326548815, + "skip_count": 2.0, + "step": 7178, + "text_loss": 0.3691073954105377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0002618400747542964, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 11579110.0, + "repeat_count": 0.0, + "routers_loss": 0.0009694626205600798, + "skip_count": 0.0, + "step": 7180, + "text_loss": 0.6523211598396301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0002615679711694764, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 11582476.0, + "repeat_count": 0.0, + "routers_loss": 0.004227840341627598, + "skip_count": 1.0, + "step": 7182, + "text_loss": 0.1997286081314087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.00026129595895225965, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 11585685.0, + "repeat_count": 0.0, + "routers_loss": 0.00126146269030869, + "skip_count": 0.0, + "step": 7184, + "text_loss": 0.486299604177475 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 33.73730554740241, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0002610240382068818, + "loss": 0.006, + "macro_f1": 0.8814815282821655, + "num_tokens": 11588804.0, + "repeat_count": 2.0, + "routers_loss": 0.04553814232349396, + "skip_count": 4.0, + "step": 7186, + "text_loss": 0.1622236669063568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.00026075220903754324, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11591822.0, + "repeat_count": 0.0, + "routers_loss": 0.002460496500134468, + "skip_count": 2.0, + "step": 7188, + "text_loss": 0.5573232173919678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0002604804715484095, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 11594899.0, + "repeat_count": 0.0, + "routers_loss": 0.006854622159153223, + "skip_count": 1.0, + "step": 7190, + "text_loss": 0.4753095507621765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00026020882584361094, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 11598333.0, + "repeat_count": 0.0, + "routers_loss": 0.001945660449564457, + "skip_count": 1.0, + "step": 7192, + "text_loss": 0.8912903666496277 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 31.0, + "epoch": 33.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0002599372720272426, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 11601814.0, + "repeat_count": 4.0, + "routers_loss": 0.005749753676354885, + "skip_count": 1.0, + "step": 7194, + "text_loss": 0.6041871905326843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0002596658102033643, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 11604661.0, + "repeat_count": 0.0, + "routers_loss": 0.0025942171923816204, + "skip_count": 1.0, + "step": 7196, + "text_loss": 0.4760607182979584 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 33.793660111535075, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00025939444047600114, + "loss": 0.0075, + "macro_f1": 0.8807588815689087, + "num_tokens": 11608459.0, + "repeat_count": 2.0, + "routers_loss": 0.020141327753663063, + "skip_count": 6.0, + "step": 7198, + "text_loss": 0.6670252084732056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0002591231629491423, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 11611489.0, + "repeat_count": 0.0, + "routers_loss": 0.005721202120184898, + "skip_count": 1.0, + "step": 7200, + "text_loss": 0.31318753957748413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00025885197772674174, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11615234.0, + "repeat_count": 0.0, + "routers_loss": 0.0027279339265078306, + "skip_count": 1.0, + "step": 7202, + "text_loss": 0.25728851556777954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00025858088491271825, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11618892.0, + "repeat_count": 0.0, + "routers_loss": 0.0006987092201597989, + "skip_count": 0.0, + "step": 7204, + "text_loss": 0.5504243969917297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00025830988461095504, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11622237.0, + "repeat_count": 0.0, + "routers_loss": 0.0029056845232844353, + "skip_count": 0.0, + "step": 7206, + "text_loss": 0.5319080948829651 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.0002580389769253001, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 11624713.0, + "repeat_count": 4.0, + "routers_loss": 0.007346974220126867, + "skip_count": 5.0, + "step": 7208, + "text_loss": 0.8925374746322632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0002577681619595655, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11628689.0, + "repeat_count": 0.0, + "routers_loss": 0.0004166684520896524, + "skip_count": 0.0, + "step": 7210, + "text_loss": 0.37282413244247437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.00025749743981752824, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 11631581.0, + "repeat_count": 0.0, + "routers_loss": 0.013194780796766281, + "skip_count": 2.0, + "step": 7212, + "text_loss": 0.220115065574646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0002572268106029295, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 11634503.0, + "repeat_count": 0.0, + "routers_loss": 0.0009112557163462043, + "skip_count": 0.0, + "step": 7214, + "text_loss": 0.5631879568099976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00025695627441947496, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 11637790.0, + "repeat_count": 0.0, + "routers_loss": 0.011178883723914623, + "skip_count": 2.0, + "step": 7216, + "text_loss": 0.24482154846191406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.887584385089525, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00025668583137083447, + "loss": 0.0047, + "macro_f1": 0.32098764181137085, + "num_tokens": 11640806.0, + "repeat_count": 0.0, + "routers_loss": 0.01877705194056034, + "skip_count": 2.0, + "step": 7218, + "text_loss": 0.2229214459657669 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0002564154815606422, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11644479.0, + "repeat_count": 0.0, + "routers_loss": 0.0030277224723249674, + "skip_count": 0.0, + "step": 7220, + "text_loss": 0.6025711894035339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00025614522509249715, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11647340.0, + "repeat_count": 0.0, + "routers_loss": 0.002354414900764823, + "skip_count": 1.0, + "step": 7222, + "text_loss": 0.6497155427932739 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0002558750620699618, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 11650433.0, + "repeat_count": 1.0, + "routers_loss": 0.009801039472222328, + "skip_count": 2.0, + "step": 7224, + "text_loss": 0.32049307227134705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0002556049925965632, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11654451.0, + "repeat_count": 0.0, + "routers_loss": 0.002949854824692011, + "skip_count": 0.0, + "step": 7226, + "text_loss": 0.17923395335674286 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00025533501677579254, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 11657440.0, + "repeat_count": 1.0, + "routers_loss": 0.0032915703486651182, + "skip_count": 1.0, + "step": 7228, + "text_loss": 0.60064297914505 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0002550651347111049, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 11660599.0, + "repeat_count": 1.0, + "routers_loss": 0.00594533933326602, + "skip_count": 1.0, + "step": 7230, + "text_loss": 0.32829397916793823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00025479534650591976, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 11663387.0, + "repeat_count": 0.0, + "routers_loss": 0.0014214308466762304, + "skip_count": 0.0, + "step": 7232, + "text_loss": 0.7317177653312683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00025452565226362036, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11666729.0, + "repeat_count": 0.0, + "routers_loss": 0.0056374757550656796, + "skip_count": 2.0, + "step": 7234, + "text_loss": 0.3394623398780823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 33.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0152587890625, + "learning_rate": 0.00025425605208755406, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11669871.0, + "repeat_count": 0.0, + "routers_loss": 0.006422565318644047, + "skip_count": 3.0, + "step": 7236, + "text_loss": 0.1725512444972992 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0002539865460810322, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 11673008.0, + "repeat_count": 1.0, + "routers_loss": 0.0023537934757769108, + "skip_count": 0.0, + "step": 7238, + "text_loss": 0.8873519897460938 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00025371713434733, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 11675988.0, + "repeat_count": 0.0, + "routers_loss": 0.0026300614699721336, + "skip_count": 1.0, + "step": 7240, + "text_loss": 0.4877084195613861 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 34.0, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.028076171875, + "learning_rate": 0.0002534478169896864, + "loss": 0.0052, + "macro_f1": 0.9265305995941162, + "num_tokens": 11679068.0, + "repeat_count": 1.0, + "routers_loss": 0.019549336284399033, + "skip_count": 3.0, + "step": 7242, + "text_loss": 0.15101417899131775 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0002531785941113044, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11682205.0, + "repeat_count": 0.0, + "routers_loss": 0.007769173942506313, + "skip_count": 1.0, + "step": 7244, + "text_loss": 0.4035153090953827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0002529094658153508, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 11685162.0, + "repeat_count": 0.0, + "routers_loss": 0.003636054927483201, + "skip_count": 0.0, + "step": 7246, + "text_loss": 0.21048080921173096 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.00025264043220495606, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 11688512.0, + "repeat_count": 0.0, + "routers_loss": 0.0013363865436986089, + "skip_count": 0.0, + "step": 7248, + "text_loss": 0.6582038402557373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00025237149338321437, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 11691753.0, + "repeat_count": 0.0, + "routers_loss": 0.0005587349878624082, + "skip_count": 0.0, + "step": 7250, + "text_loss": 0.6899203658103943 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0002521026494531835, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11694689.0, + "repeat_count": 1.0, + "routers_loss": 0.006221035961061716, + "skip_count": 0.0, + "step": 7252, + "text_loss": 0.17377600073814392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.000251833900517885, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 11697950.0, + "repeat_count": 0.0, + "routers_loss": 0.004368607886135578, + "skip_count": 1.0, + "step": 7254, + "text_loss": 0.4147649109363556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.000251565246680304, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11701214.0, + "repeat_count": 0.0, + "routers_loss": 0.0038269520737230778, + "skip_count": 2.0, + "step": 7256, + "text_loss": 0.42076823115348816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00025129668804338906, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11703935.0, + "repeat_count": 0.0, + "routers_loss": 0.0011755652958527207, + "skip_count": 0.0, + "step": 7258, + "text_loss": 0.5484340190887451 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.084531846199, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00025102822471005247, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 11706818.0, + "repeat_count": 1.0, + "routers_loss": 0.00735129788517952, + "skip_count": 2.0, + "step": 7260, + "text_loss": 0.29214802384376526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00025075985678316983, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 11709979.0, + "repeat_count": 1.0, + "routers_loss": 0.0011552777141332626, + "skip_count": 0.0, + "step": 7262, + "text_loss": 0.6514551639556885 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 34.10331670090989, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0002504915843655802, + "loss": 0.0067, + "macro_f1": 0.8814815282821655, + "num_tokens": 11714075.0, + "repeat_count": 2.0, + "routers_loss": 0.01438678614795208, + "skip_count": 4.0, + "step": 7264, + "text_loss": 0.5144859552383423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0002502234075600862, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11717610.0, + "repeat_count": 0.0, + "routers_loss": 0.0027831171173602343, + "skip_count": 0.0, + "step": 7266, + "text_loss": 0.6494308114051819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00024995532646945336, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 11721415.0, + "repeat_count": 0.0, + "routers_loss": 0.0012327058939263225, + "skip_count": 0.0, + "step": 7268, + "text_loss": 0.5111991763114929 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 34.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0002496873411964113, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 11724488.0, + "repeat_count": 2.0, + "routers_loss": 0.003060065908357501, + "skip_count": 1.0, + "step": 7270, + "text_loss": 0.5780492424964905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0002494194518436523, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 11727708.0, + "repeat_count": 0.0, + "routers_loss": 0.001369593315757811, + "skip_count": 0.0, + "step": 7272, + "text_loss": 0.3151950240135193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00024915165851383203, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 11730897.0, + "repeat_count": 0.0, + "routers_loss": 0.005724756047129631, + "skip_count": 0.0, + "step": 7274, + "text_loss": 0.5267965197563171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00024888396130956947, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 11733870.0, + "repeat_count": 1.0, + "routers_loss": 0.010036137886345387, + "skip_count": 0.0, + "step": 7276, + "text_loss": 0.5330777168273926 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00024861636033344657, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11737413.0, + "repeat_count": 0.0, + "routers_loss": 0.008341848850250244, + "skip_count": 2.0, + "step": 7278, + "text_loss": 0.25949522852897644 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0002483488556880087, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 11740691.0, + "repeat_count": 1.0, + "routers_loss": 0.008208763785660267, + "skip_count": 2.0, + "step": 7280, + "text_loss": 0.1867891401052475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.000248081447475764, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 11743715.0, + "repeat_count": 0.0, + "routers_loss": 0.0038434381131082773, + "skip_count": 0.0, + "step": 7282, + "text_loss": 0.4835410416126251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0002478141357991838, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11746818.0, + "repeat_count": 0.0, + "routers_loss": 0.0019067893736064434, + "skip_count": 0.0, + "step": 7284, + "text_loss": 0.5959038734436035 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00024754692076070256, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 11750160.0, + "repeat_count": 0.0, + "routers_loss": 0.007199060171842575, + "skip_count": 0.0, + "step": 7286, + "text_loss": 0.5068115592002869 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0002472798024627175, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11752836.0, + "repeat_count": 0.0, + "routers_loss": 0.0014214382972568274, + "skip_count": 0.0, + "step": 7288, + "text_loss": 0.5742631554603577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0002470127810075889, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11756276.0, + "repeat_count": 0.0, + "routers_loss": 0.0018025166355073452, + "skip_count": 0.0, + "step": 7290, + "text_loss": 0.6616888642311096 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00024674585649763983, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 11760235.0, + "repeat_count": 1.0, + "routers_loss": 0.0024077212437987328, + "skip_count": 0.0, + "step": 7292, + "text_loss": 0.7984768748283386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.00024647902903515614, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 11763430.0, + "repeat_count": 0.0, + "routers_loss": 0.007843999192118645, + "skip_count": 1.0, + "step": 7294, + "text_loss": 0.1943647861480713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0002462122987223869, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 11766583.0, + "repeat_count": 0.0, + "routers_loss": 0.0019727738108485937, + "skip_count": 0.0, + "step": 7296, + "text_loss": 0.43924200534820557 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6000000238418579, + "avg_layers": 27.0, + "epoch": 34.26298796595245, + "f1_execute": 0.9545454382896423, + "f1_repeat": 1.0, + "f1_skip": 0.75, + "grad_norm": 0.041015625, + "learning_rate": 0.0002459456656615436, + "loss": 0.0069, + "macro_f1": 0.9015151858329773, + "num_tokens": 11770360.0, + "repeat_count": 2.0, + "routers_loss": 0.04594529792666435, + "skip_count": 5.0, + "step": 7298, + "text_loss": 0.32582250237464905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0002456791299548004, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 11773239.0, + "repeat_count": 1.0, + "routers_loss": 0.0011880286037921906, + "skip_count": 0.0, + "step": 7300, + "text_loss": 0.7723727226257324 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00024541269170429435, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11776945.0, + "repeat_count": 0.0, + "routers_loss": 0.0010577787179499865, + "skip_count": 0.0, + "step": 7302, + "text_loss": 0.8173839449882507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0002451463510121252, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11780121.0, + "repeat_count": 0.0, + "routers_loss": 0.0019757342524826527, + "skip_count": 0.0, + "step": 7304, + "text_loss": 0.4015064239501953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000244880107980355, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 11783172.0, + "repeat_count": 0.0, + "routers_loss": 0.002577328821644187, + "skip_count": 0.0, + "step": 7306, + "text_loss": 0.5465171933174133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.00024461396271100876, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 11788608.0, + "repeat_count": 0.0, + "routers_loss": 0.004162502940744162, + "skip_count": 0.0, + "step": 7308, + "text_loss": 0.2419646978378296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0002443479153060735, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 11791912.0, + "repeat_count": 0.0, + "routers_loss": 0.003301614662632346, + "skip_count": 0.0, + "step": 7310, + "text_loss": 0.2568489909172058 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00024408196586749964, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11794849.0, + "repeat_count": 0.0, + "routers_loss": 0.0019893983844667673, + "skip_count": 0.0, + "step": 7312, + "text_loss": 0.7044196128845215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0002438161144971992, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11797587.0, + "repeat_count": 0.0, + "routers_loss": 0.006637922488152981, + "skip_count": 1.0, + "step": 7314, + "text_loss": 0.6863232254981995 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.000243550361297047, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11800173.0, + "repeat_count": 0.0, + "routers_loss": 0.003078785724937916, + "skip_count": 2.0, + "step": 7316, + "text_loss": 0.2868897616863251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00024328470636888005, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11802889.0, + "repeat_count": 0.0, + "routers_loss": 0.0011882453691214323, + "skip_count": 0.0, + "step": 7318, + "text_loss": 0.5522798299789429 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0002430191498144979, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11805607.0, + "repeat_count": 0.0, + "routers_loss": 0.0008720619371160865, + "skip_count": 0.0, + "step": 7320, + "text_loss": 0.5531370639801025 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00024275369173566236, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11808838.0, + "repeat_count": 1.0, + "routers_loss": 0.003213440766558051, + "skip_count": 0.0, + "step": 7322, + "text_loss": 0.5252627730369568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.00024248833223409715, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 11811965.0, + "repeat_count": 0.0, + "routers_loss": 0.004736232105642557, + "skip_count": 1.0, + "step": 7324, + "text_loss": 0.6033701300621033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.00024222307141148907, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11814832.0, + "repeat_count": 0.0, + "routers_loss": 0.0007559265359304845, + "skip_count": 0.0, + "step": 7326, + "text_loss": 0.5607737302780151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.00024195790936948626, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11818802.0, + "repeat_count": 0.0, + "routers_loss": 0.005338212475180626, + "skip_count": 2.0, + "step": 7328, + "text_loss": 0.20618735253810883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 34.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0002416928462096994, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 11821998.0, + "repeat_count": 0.0, + "routers_loss": 0.001919696107506752, + "skip_count": 3.0, + "step": 7330, + "text_loss": 0.42486369609832764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00024142788203370107, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 11824505.0, + "repeat_count": 0.0, + "routers_loss": 0.0013797834981232882, + "skip_count": 0.0, + "step": 7332, + "text_loss": 0.48403388261795044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.43205165835045, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00024116301694302621, + "loss": 0.0053, + "macro_f1": 0.3272727429866791, + "num_tokens": 11828504.0, + "repeat_count": 0.0, + "routers_loss": 0.008978237397968769, + "skip_count": 1.0, + "step": 7334, + "text_loss": 0.43872755765914917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00024089825103917152, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 11831171.0, + "repeat_count": 0.0, + "routers_loss": 0.004589964635670185, + "skip_count": 1.0, + "step": 7336, + "text_loss": 0.5126842260360718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00024063358442359572, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11834387.0, + "repeat_count": 0.0, + "routers_loss": 0.002857893006876111, + "skip_count": 0.0, + "step": 7338, + "text_loss": 0.7521272301673889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0002403690171977197, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 11838693.0, + "repeat_count": 0.0, + "routers_loss": 0.0009023012826219201, + "skip_count": 0.0, + "step": 7340, + "text_loss": 0.6335242390632629 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00024010454946292586, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11841882.0, + "repeat_count": 1.0, + "routers_loss": 0.010992717929184437, + "skip_count": 0.0, + "step": 7342, + "text_loss": 0.64045649766922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0002398401813205592, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11845181.0, + "repeat_count": 0.0, + "routers_loss": 0.002247930970042944, + "skip_count": 2.0, + "step": 7344, + "text_loss": 0.31022098660469055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00023957591287192577, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 11848537.0, + "repeat_count": 0.0, + "routers_loss": 0.003184020286425948, + "skip_count": 2.0, + "step": 7346, + "text_loss": 0.5709269642829895 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.00023931174421829376, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 11851437.0, + "repeat_count": 2.0, + "routers_loss": 0.006582654081285, + "skip_count": 4.0, + "step": 7348, + "text_loss": 0.3547070026397705 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.00023904767546089318, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 11854161.0, + "repeat_count": 1.0, + "routers_loss": 0.0022124287206679583, + "skip_count": 0.0, + "step": 7350, + "text_loss": 0.6984702348709106 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00023878370670091565, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 11856811.0, + "repeat_count": 1.0, + "routers_loss": 0.0029868825804442167, + "skip_count": 0.0, + "step": 7352, + "text_loss": 0.25389090180397034 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.00023851983803951444, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 11860110.0, + "repeat_count": 0.0, + "routers_loss": 0.0028468978125602007, + "skip_count": 1.0, + "step": 7354, + "text_loss": 0.5729252099990845 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00023825606957780454, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 11863058.0, + "repeat_count": 1.0, + "routers_loss": 0.003115740604698658, + "skip_count": 2.0, + "step": 7356, + "text_loss": 0.60753333568573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00023799240141686258, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 11865865.0, + "repeat_count": 0.0, + "routers_loss": 0.0022254586219787598, + "skip_count": 0.0, + "step": 7358, + "text_loss": 0.2568866014480591 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00023772883365772658, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11869133.0, + "repeat_count": 0.0, + "routers_loss": 0.0017388637643307447, + "skip_count": 0.0, + "step": 7360, + "text_loss": 0.7657097578048706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00023746536640139633, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11872988.0, + "repeat_count": 0.0, + "routers_loss": 0.002158832037821412, + "skip_count": 0.0, + "step": 7362, + "text_loss": 0.19717472791671753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00023720199974883294, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11875810.0, + "repeat_count": 0.0, + "routers_loss": 0.001037398586049676, + "skip_count": 0.0, + "step": 7364, + "text_loss": 0.47334593534469604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 34.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00023693873380095876, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 11878558.0, + "repeat_count": 0.0, + "routers_loss": 0.011853457428514957, + "skip_count": 5.0, + "step": 7366, + "text_loss": 0.2567826211452484 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.00023667556865865824, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 11881473.0, + "repeat_count": 1.0, + "routers_loss": 0.0015339091187343001, + "skip_count": 0.0, + "step": 7368, + "text_loss": 0.40981143712997437 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00023641250442277655, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 11885033.0, + "repeat_count": 1.0, + "routers_loss": 0.010062574408948421, + "skip_count": 0.0, + "step": 7370, + "text_loss": 0.3183043301105499 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.00023614954119412042, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11889136.0, + "repeat_count": 0.0, + "routers_loss": 0.0010769609361886978, + "skip_count": 0.0, + "step": 7372, + "text_loss": 0.5279555916786194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 34.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.00023588667907345785, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11893102.0, + "repeat_count": 0.0, + "routers_loss": 0.0032862431835383177, + "skip_count": 3.0, + "step": 7374, + "text_loss": 0.5425930023193359 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 34.629292632814796, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0341796875, + "learning_rate": 0.00023562391816151808, + "loss": 0.0057, + "macro_f1": 0.5934640765190125, + "num_tokens": 11895841.0, + "repeat_count": 0.0, + "routers_loss": 0.02405562624335289, + "skip_count": 3.0, + "step": 7376, + "text_loss": 0.26054954528808594 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00023536125855899153, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 11899594.0, + "repeat_count": 1.0, + "routers_loss": 0.008315852843225002, + "skip_count": 3.0, + "step": 7378, + "text_loss": 0.19068174064159393 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 34.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00023509870036652998, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 11902843.0, + "repeat_count": 1.0, + "routers_loss": 0.006180883850902319, + "skip_count": 4.0, + "step": 7380, + "text_loss": 0.18461982905864716 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00023483624368474614, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 11905786.0, + "repeat_count": 0.0, + "routers_loss": 0.0008856299100443721, + "skip_count": 0.0, + "step": 7382, + "text_loss": 0.5216618180274963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.66686234223657, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00023457388861421397, + "loss": 0.0059, + "macro_f1": 0.32098764181137085, + "num_tokens": 11908706.0, + "repeat_count": 1.0, + "routers_loss": 0.04762765392661095, + "skip_count": 1.0, + "step": 7384, + "text_loss": 0.25329193472862244 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 34.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.00023431163525546833, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 11911862.0, + "repeat_count": 1.0, + "routers_loss": 0.000989250373095274, + "skip_count": 1.0, + "step": 7386, + "text_loss": 0.2657507658004761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.0002340494837090053, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 11915483.0, + "repeat_count": 0.0, + "routers_loss": 0.0008857969660311937, + "skip_count": 0.0, + "step": 7388, + "text_loss": 0.5136669874191284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00023378743407528164, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 11918778.0, + "repeat_count": 0.0, + "routers_loss": 0.0041572838090360165, + "skip_count": 1.0, + "step": 7390, + "text_loss": 0.5212553143501282 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00023352548645471556, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11921916.0, + "repeat_count": 0.0, + "routers_loss": 0.0010537431808188558, + "skip_count": 0.0, + "step": 7392, + "text_loss": 0.48122525215148926 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00023326364094768576, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11924273.0, + "repeat_count": 1.0, + "routers_loss": 0.004077036865055561, + "skip_count": 0.0, + "step": 7394, + "text_loss": 0.2128690630197525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00023300189765453194, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11927424.0, + "repeat_count": 0.0, + "routers_loss": 0.005371362902224064, + "skip_count": 2.0, + "step": 7396, + "text_loss": 0.19448284804821014 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00023274025667555464, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11930919.0, + "repeat_count": 0.0, + "routers_loss": 0.002137752715498209, + "skip_count": 0.0, + "step": 7398, + "text_loss": 0.7537064552307129 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.00023247871811101512, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11933680.0, + "repeat_count": 0.0, + "routers_loss": 0.0002398790093138814, + "skip_count": 0.0, + "step": 7400, + "text_loss": 0.5589297413825989 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.751394188435576, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00023221728206113546, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 11937090.0, + "repeat_count": 0.0, + "routers_loss": 0.019718777388334274, + "skip_count": 1.0, + "step": 7402, + "text_loss": 0.8014751672744751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0002319559486260985, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11940581.0, + "repeat_count": 0.0, + "routers_loss": 0.001230534864589572, + "skip_count": 0.0, + "step": 7404, + "text_loss": 0.5218383073806763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0002316947179060477, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11943832.0, + "repeat_count": 0.0, + "routers_loss": 0.0016393321566283703, + "skip_count": 0.0, + "step": 7406, + "text_loss": 0.17122556269168854 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.00023143359000108704, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11947025.0, + "repeat_count": 0.0, + "routers_loss": 0.005269679240882397, + "skip_count": 2.0, + "step": 7408, + "text_loss": 0.2015499323606491 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00023117256501128136, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 11950077.0, + "repeat_count": 1.0, + "routers_loss": 0.005140089895576239, + "skip_count": 2.0, + "step": 7410, + "text_loss": 0.39068636298179626 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00023091164303665592, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11953800.0, + "repeat_count": 0.0, + "routers_loss": 0.005578748416155577, + "skip_count": 0.0, + "step": 7412, + "text_loss": 0.18851874768733978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.00023065082417719624, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 11956383.0, + "repeat_count": 0.0, + "routers_loss": 0.0006410991190932691, + "skip_count": 0.0, + "step": 7414, + "text_loss": 0.5663703083992004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 34.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0002303901085328491, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11959554.0, + "repeat_count": 0.0, + "routers_loss": 0.0005902954144403338, + "skip_count": 5.0, + "step": 7416, + "text_loss": 0.5225661993026733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0002301294962035209, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 11962582.0, + "repeat_count": 0.0, + "routers_loss": 0.00045644037891179323, + "skip_count": 0.0, + "step": 7418, + "text_loss": 0.40572360157966614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0002298689872890789, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 11965649.0, + "repeat_count": 0.0, + "routers_loss": 0.01017778366804123, + "skip_count": 2.0, + "step": 7420, + "text_loss": 0.12190715968608856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00022960858188935052, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 11968850.0, + "repeat_count": 0.0, + "routers_loss": 0.0008010792662389576, + "skip_count": 0.0, + "step": 7422, + "text_loss": 0.5606820583343506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0002293482801041236, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 11972064.0, + "repeat_count": 0.0, + "routers_loss": 0.001889281440526247, + "skip_count": 0.0, + "step": 7424, + "text_loss": 0.44142210483551025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00022908808203314635, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 11975466.0, + "repeat_count": 0.0, + "routers_loss": 0.00647713290527463, + "skip_count": 2.0, + "step": 7426, + "text_loss": 0.23273423314094543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0002288279877761271, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 11979875.0, + "repeat_count": 0.0, + "routers_loss": 0.004027119372040033, + "skip_count": 0.0, + "step": 7428, + "text_loss": 0.5608086585998535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0002285679974327345, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11982808.0, + "repeat_count": 0.0, + "routers_loss": 0.0009015435934998095, + "skip_count": 0.0, + "step": 7430, + "text_loss": 0.3976539373397827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0002283081111025973, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 11985978.0, + "repeat_count": 0.0, + "routers_loss": 0.00047143330448307097, + "skip_count": 0.0, + "step": 7432, + "text_loss": 0.4280148446559906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00022804832888530447, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11988925.0, + "repeat_count": 0.0, + "routers_loss": 0.0004895820748060942, + "skip_count": 0.0, + "step": 7434, + "text_loss": 0.5137463808059692 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000227788650880405, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11991631.0, + "repeat_count": 0.0, + "routers_loss": 0.0008349024574272335, + "skip_count": 0.0, + "step": 7436, + "text_loss": 0.4306720197200775 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00022752907718740807, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 11995476.0, + "repeat_count": 0.0, + "routers_loss": 0.0038723985198885202, + "skip_count": 0.0, + "step": 7438, + "text_loss": 0.6413722038269043 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.00022726960790578248, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 11998846.0, + "repeat_count": 1.0, + "routers_loss": 0.004433541093021631, + "skip_count": 0.0, + "step": 7440, + "text_loss": 0.6424159407615662 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 23.0, + "epoch": 34.93924273554447, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.036376953125, + "learning_rate": 0.0002270102431349579, + "loss": 0.0062, + "macro_f1": 0.6289562582969666, + "num_tokens": 12002228.0, + "repeat_count": 0.0, + "routers_loss": 0.023979803547263145, + "skip_count": 6.0, + "step": 7442, + "text_loss": 0.16657918691635132 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 34.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00022675098297432307, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 12005003.0, + "repeat_count": 3.0, + "routers_loss": 0.005645833443850279, + "skip_count": 1.0, + "step": 7444, + "text_loss": 0.6388722658157349 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00022649182752322705, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 12007657.0, + "repeat_count": 0.0, + "routers_loss": 0.001629356062039733, + "skip_count": 2.0, + "step": 7446, + "text_loss": 0.35670006275177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00022623277688097864, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 12010652.0, + "repeat_count": 0.0, + "routers_loss": 0.006375396624207497, + "skip_count": 2.0, + "step": 7448, + "text_loss": 0.24273613095283508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0002259738311468466, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 12014042.0, + "repeat_count": 0.0, + "routers_loss": 0.003734540194272995, + "skip_count": 0.0, + "step": 7450, + "text_loss": 0.4262580871582031 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 34.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0002257149904200592, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 12016987.0, + "repeat_count": 1.0, + "routers_loss": 0.0027926203329116106, + "skip_count": 1.0, + "step": 7452, + "text_loss": 0.366216778755188 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.00022545625479980508, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 12021584.0, + "repeat_count": 0.0, + "routers_loss": 0.0008985420572571456, + "skip_count": 0.0, + "step": 7454, + "text_loss": 0.533937394618988 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00022519762438523205, + "loss": 0.0029, + "macro_f1": 0.6666666865348816, + "num_tokens": 12024142.0, + "repeat_count": 0.0, + "routers_loss": 0.005394646432250738, + "skip_count": 1.0, + "step": 7456, + "text_loss": 0.2401239275932312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0002249390992754477, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 12027262.0, + "repeat_count": 0.0, + "routers_loss": 0.00275063537992537, + "skip_count": 0.0, + "step": 7458, + "text_loss": 0.21824975311756134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00022468067956951944, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 12030528.0, + "repeat_count": 0.0, + "routers_loss": 0.0008951274212449789, + "skip_count": 1.0, + "step": 7460, + "text_loss": 0.610903263092041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00022442236536647408, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 12033699.0, + "repeat_count": 0.0, + "routers_loss": 0.004062872380018234, + "skip_count": 2.0, + "step": 7462, + "text_loss": 0.26921433210372925 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00022416415676529823, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 12037402.0, + "repeat_count": 0.0, + "routers_loss": 0.0023089025635272264, + "skip_count": 1.0, + "step": 7464, + "text_loss": 0.4746153950691223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00022390605386493756, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12041129.0, + "repeat_count": 0.0, + "routers_loss": 0.0021355501376092434, + "skip_count": 2.0, + "step": 7466, + "text_loss": 0.4265538454055786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00022364805676429816, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 12044356.0, + "repeat_count": 0.0, + "routers_loss": 0.0061582159250974655, + "skip_count": 1.0, + "step": 7468, + "text_loss": 0.12020833045244217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00022339016556224467, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 12047158.0, + "repeat_count": 0.0, + "routers_loss": 0.003753372235223651, + "skip_count": 1.0, + "step": 7470, + "text_loss": 0.6406939625740051 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 35.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00022313238035760158, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 12050149.0, + "repeat_count": 1.0, + "routers_loss": 0.005371729377657175, + "skip_count": 5.0, + "step": 7472, + "text_loss": 0.5184400677680969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0002228747012491526, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12053560.0, + "repeat_count": 0.0, + "routers_loss": 0.000824139395263046, + "skip_count": 0.0, + "step": 7474, + "text_loss": 0.32644152641296387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0002226171283356409, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12056309.0, + "repeat_count": 0.0, + "routers_loss": 0.0044801668263971806, + "skip_count": 1.0, + "step": 7476, + "text_loss": 0.7027081847190857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.00022235966171576887, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 12059191.0, + "repeat_count": 0.0, + "routers_loss": 0.007496353704482317, + "skip_count": 2.0, + "step": 7478, + "text_loss": 0.28705671429634094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0002221023014881982, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 12062365.0, + "repeat_count": 0.0, + "routers_loss": 0.0018641395727172494, + "skip_count": 1.0, + "step": 7480, + "text_loss": 0.715477466583252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.00022184504775154984, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12065508.0, + "repeat_count": 0.0, + "routers_loss": 0.0005825075786560774, + "skip_count": 0.0, + "step": 7482, + "text_loss": 0.7481293678283691 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00022158790060440394, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 12068043.0, + "repeat_count": 0.0, + "routers_loss": 0.0028906071092933416, + "skip_count": 0.0, + "step": 7484, + "text_loss": 0.6151962876319885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00022133086014529968, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 12070897.0, + "repeat_count": 0.0, + "routers_loss": 0.0030862605199217796, + "skip_count": 1.0, + "step": 7486, + "text_loss": 0.4923575222492218 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00022107392647273527, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 12074644.0, + "repeat_count": 0.0, + "routers_loss": 0.0011101154377683997, + "skip_count": 0.0, + "step": 7488, + "text_loss": 0.5217859148979187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00022081709968516867, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 12077718.0, + "repeat_count": 0.0, + "routers_loss": 0.004303969442844391, + "skip_count": 0.0, + "step": 7490, + "text_loss": 0.18933317065238953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00022056037988101612, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12080509.0, + "repeat_count": 0.0, + "routers_loss": 0.0019941304344683886, + "skip_count": 1.0, + "step": 7492, + "text_loss": 0.6760565042495728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.00022030376715865313, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 12083580.0, + "repeat_count": 0.0, + "routers_loss": 0.0017090907786041498, + "skip_count": 0.0, + "step": 7494, + "text_loss": 0.4140956401824951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0002200472616164142, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12086923.0, + "repeat_count": 0.0, + "routers_loss": 0.005131757352501154, + "skip_count": 1.0, + "step": 7496, + "text_loss": 0.43287888169288635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00021979086335259269, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 12090003.0, + "repeat_count": 0.0, + "routers_loss": 0.0007472267607226968, + "skip_count": 0.0, + "step": 7498, + "text_loss": 0.6692602038383484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00021953457246544095, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 12092936.0, + "repeat_count": 0.0, + "routers_loss": 0.0012374494690448046, + "skip_count": 0.0, + "step": 7500, + "text_loss": 0.5170100331306458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00021927838905317016, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 12096395.0, + "repeat_count": 0.0, + "routers_loss": 0.006784295197576284, + "skip_count": 2.0, + "step": 7502, + "text_loss": 0.340880811214447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00021902231321395017, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 12099743.0, + "repeat_count": 0.0, + "routers_loss": 0.0058755455538630486, + "skip_count": 1.0, + "step": 7504, + "text_loss": 0.5299809575080872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00021876634504590985, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12103121.0, + "repeat_count": 0.0, + "routers_loss": 0.010622406378388405, + "skip_count": 2.0, + "step": 7506, + "text_loss": 0.1817338913679123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 35.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00021851048464713662, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 12105883.0, + "repeat_count": 0.0, + "routers_loss": 0.004382388666272163, + "skip_count": 3.0, + "step": 7508, + "text_loss": 0.5718557834625244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00021825473211567665, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 12108936.0, + "repeat_count": 0.0, + "routers_loss": 0.001638208981603384, + "skip_count": 0.0, + "step": 7510, + "text_loss": 0.4684678316116333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00021799908754953468, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 12112060.0, + "repeat_count": 0.0, + "routers_loss": 0.0007894381997175515, + "skip_count": 2.0, + "step": 7512, + "text_loss": 0.5146099328994751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00021774355104667455, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 12115636.0, + "repeat_count": 0.0, + "routers_loss": 0.01400370616465807, + "skip_count": 2.0, + "step": 7514, + "text_loss": 0.19512294232845306 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 35.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00021748812270501805, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 12119116.0, + "repeat_count": 0.0, + "routers_loss": 0.005261222366243601, + "skip_count": 3.0, + "step": 7516, + "text_loss": 0.17316904664039612 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.0002172328026224459, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12122070.0, + "repeat_count": 0.0, + "routers_loss": 0.01021486520767212, + "skip_count": 2.0, + "step": 7518, + "text_loss": 0.2777172029018402 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00021697759089679713, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 12125386.0, + "repeat_count": 2.0, + "routers_loss": 0.005217147525399923, + "skip_count": 2.0, + "step": 7520, + "text_loss": 0.49744322896003723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00021672248762586948, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 12128753.0, + "repeat_count": 0.0, + "routers_loss": 0.003868246916681528, + "skip_count": 0.0, + "step": 7522, + "text_loss": 0.4209211468696594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 35.32403874376284, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00021646749290741895, + "loss": 0.009, + "macro_f1": 0.6598639488220215, + "num_tokens": 12132425.0, + "repeat_count": 1.0, + "routers_loss": 0.044205982238054276, + "skip_count": 3.0, + "step": 7524, + "text_loss": 0.4180344343185425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00021621260683916005, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 12135740.0, + "repeat_count": 0.0, + "routers_loss": 0.0032584366854280233, + "skip_count": 2.0, + "step": 7526, + "text_loss": 0.21219655871391296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.00021595782951876552, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 12139239.0, + "repeat_count": 0.0, + "routers_loss": 0.002418758114799857, + "skip_count": 2.0, + "step": 7528, + "text_loss": 0.40800613164901733 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0002157031610438665, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 12142572.0, + "repeat_count": 1.0, + "routers_loss": 0.005265383515506983, + "skip_count": 1.0, + "step": 7530, + "text_loss": 0.7539705634117126 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0002154486015120525, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 12145737.0, + "repeat_count": 1.0, + "routers_loss": 0.006648020353168249, + "skip_count": 2.0, + "step": 7532, + "text_loss": 0.7824432253837585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.371000880540066, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0002151941510208712, + "loss": 0.0049, + "macro_f1": 0.3272727429866791, + "num_tokens": 12149376.0, + "repeat_count": 1.0, + "routers_loss": 0.01692759431898594, + "skip_count": 0.0, + "step": 7534, + "text_loss": 0.4476291239261627 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0002149398096678283, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12152191.0, + "repeat_count": 1.0, + "routers_loss": 0.013883143663406372, + "skip_count": 0.0, + "step": 7536, + "text_loss": 0.14996720850467682 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.058837890625, + "learning_rate": 0.00021468557755038826, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 12155084.0, + "repeat_count": 2.0, + "routers_loss": 0.009390740655362606, + "skip_count": 2.0, + "step": 7538, + "text_loss": 0.23685340583324432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0002144314547659731, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 12159366.0, + "repeat_count": 0.0, + "routers_loss": 0.0025363171007484198, + "skip_count": 0.0, + "step": 7540, + "text_loss": 0.6687407493591309 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.00021417744141196315, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12162545.0, + "repeat_count": 0.0, + "routers_loss": 0.004230613354593515, + "skip_count": 1.0, + "step": 7542, + "text_loss": 0.24885894358158112 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.01953125, + "learning_rate": 0.00021392353758569694, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 12165381.0, + "repeat_count": 1.0, + "routers_loss": 0.008058524690568447, + "skip_count": 0.0, + "step": 7544, + "text_loss": 0.15833988785743713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0002136697433844707, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 12168304.0, + "repeat_count": 0.0, + "routers_loss": 0.0018041770672425628, + "skip_count": 0.0, + "step": 7546, + "text_loss": 0.6046217083930969 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.00021341605890553894, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 12171040.0, + "repeat_count": 1.0, + "routers_loss": 0.008584463968873024, + "skip_count": 2.0, + "step": 7548, + "text_loss": 0.3001522719860077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.00021316248424611408, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 12174702.0, + "repeat_count": 0.0, + "routers_loss": 0.0010506469989195466, + "skip_count": 0.0, + "step": 7550, + "text_loss": 0.2998376488685608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0162353515625, + "learning_rate": 0.00021290901950336627, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 12178388.0, + "repeat_count": 0.0, + "routers_loss": 0.0012753128539770842, + "skip_count": 0.0, + "step": 7552, + "text_loss": 0.8125656843185425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.00021265566477442384, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 12181863.0, + "repeat_count": 0.0, + "routers_loss": 0.004343052394688129, + "skip_count": 2.0, + "step": 7554, + "text_loss": 0.14004671573638916 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00021240242015637268, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12185485.0, + "repeat_count": 1.0, + "routers_loss": 0.0005794052849523723, + "skip_count": 0.0, + "step": 7556, + "text_loss": 0.7116519808769226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.4837100088054, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.00021214928574625664, + "loss": 0.0063, + "macro_f1": 0.3272727429866791, + "num_tokens": 12188914.0, + "repeat_count": 1.0, + "routers_loss": 0.01066325418651104, + "skip_count": 0.0, + "step": 7558, + "text_loss": 0.4664429724216461 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.49310243616085, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00021189626164107718, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 12193042.0, + "repeat_count": 0.0, + "routers_loss": 0.0011769415577873588, + "skip_count": 0.0, + "step": 7560, + "text_loss": 0.672637403011322 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.00021164334793779388, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 12195675.0, + "repeat_count": 1.0, + "routers_loss": 0.008653911761939526, + "skip_count": 1.0, + "step": 7562, + "text_loss": 0.5301182866096497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00021139054473332357, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 12198638.0, + "repeat_count": 0.0, + "routers_loss": 0.0058176578022539616, + "skip_count": 0.0, + "step": 7564, + "text_loss": 0.1889677792787552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.000211137852124541, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 12202312.0, + "repeat_count": 0.0, + "routers_loss": 0.0004154018242843449, + "skip_count": 0.0, + "step": 7566, + "text_loss": 0.3610386848449707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00021088527020827848, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 12205112.0, + "repeat_count": 0.0, + "routers_loss": 0.0014722816413268447, + "skip_count": 0.0, + "step": 7568, + "text_loss": 0.15214823186397552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.0002106327990813257, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 12208103.0, + "repeat_count": 0.0, + "routers_loss": 0.0015596678713336587, + "skip_count": 0.0, + "step": 7570, + "text_loss": 0.5034125447273254 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.00021038043884043022, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12211208.0, + "repeat_count": 1.0, + "routers_loss": 0.007482443004846573, + "skip_count": 0.0, + "step": 7572, + "text_loss": 0.6760116219520569 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00021012818958229696, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 12214463.0, + "repeat_count": 0.0, + "routers_loss": 0.003875598544254899, + "skip_count": 2.0, + "step": 7574, + "text_loss": 0.3278147876262665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.00020987605140358824, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12218199.0, + "repeat_count": 0.0, + "routers_loss": 0.007918627932667732, + "skip_count": 2.0, + "step": 7576, + "text_loss": 0.23850615322589874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.00020962402440092388, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12221151.0, + "repeat_count": 0.0, + "routers_loss": 0.005424308590590954, + "skip_count": 1.0, + "step": 7578, + "text_loss": 0.5670642256736755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0002093721086708812, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 12224789.0, + "repeat_count": 1.0, + "routers_loss": 0.0066504343412816525, + "skip_count": 1.0, + "step": 7580, + "text_loss": 0.30404478311538696 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00020912030430999452, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 12228134.0, + "repeat_count": 1.0, + "routers_loss": 0.008815597742795944, + "skip_count": 0.0, + "step": 7582, + "text_loss": 0.32522889971733093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 35.60581156442618, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05126953125, + "learning_rate": 0.0002088686114147561, + "loss": 0.0098, + "macro_f1": 0.5492662787437439, + "num_tokens": 12231335.0, + "repeat_count": 0.0, + "routers_loss": 0.03785836696624756, + "skip_count": 2.0, + "step": 7584, + "text_loss": 0.6277920603752136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00020861703008161504, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 12234619.0, + "repeat_count": 0.0, + "routers_loss": 0.0016183801926672459, + "skip_count": 0.0, + "step": 7586, + "text_loss": 0.38319316506385803 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.00020836556040697767, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 12237296.0, + "repeat_count": 1.0, + "routers_loss": 0.013077575713396072, + "skip_count": 1.0, + "step": 7588, + "text_loss": 0.297571063041687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00020811420248720769, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 12240633.0, + "repeat_count": 0.0, + "routers_loss": 0.002858756808564067, + "skip_count": 0.0, + "step": 7590, + "text_loss": 0.2506035268306732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.000207862956418626, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12244118.0, + "repeat_count": 0.0, + "routers_loss": 0.0032624071463942528, + "skip_count": 1.0, + "step": 7592, + "text_loss": 0.19843827188014984 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056640625, + "learning_rate": 0.00020761182229751045, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 12247367.0, + "repeat_count": 1.0, + "routers_loss": 0.005885142367333174, + "skip_count": 3.0, + "step": 7594, + "text_loss": 0.3347153067588806 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 35.66216612855885, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00020736080022009602, + "loss": 0.0088, + "macro_f1": 0.9452888369560242, + "num_tokens": 12250487.0, + "repeat_count": 1.0, + "routers_loss": 0.021491389721632004, + "skip_count": 4.0, + "step": 7596, + "text_loss": 0.6777212619781494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 35.671558555914295, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.00020710989028257514, + "loss": 0.0061, + "macro_f1": 0.6595745086669922, + "num_tokens": 12253834.0, + "repeat_count": 1.0, + "routers_loss": 0.014164486899971962, + "skip_count": 4.0, + "step": 7598, + "text_loss": 0.741127610206604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0002068590925810968, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 12257289.0, + "repeat_count": 0.0, + "routers_loss": 0.0012773120542988181, + "skip_count": 0.0, + "step": 7600, + "text_loss": 0.5336982607841492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0002066084072117672, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 12260825.0, + "repeat_count": 0.0, + "routers_loss": 0.013102042488753796, + "skip_count": 2.0, + "step": 7602, + "text_loss": 0.30410775542259216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00020635783427064942, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12264609.0, + "repeat_count": 0.0, + "routers_loss": 0.002602101070806384, + "skip_count": 0.0, + "step": 7604, + "text_loss": 0.29835572838783264 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00020610737385376348, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12267537.0, + "repeat_count": 0.0, + "routers_loss": 0.0053265830501914024, + "skip_count": 0.0, + "step": 7606, + "text_loss": 0.2095658779144287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00020585702605708628, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 12271175.0, + "repeat_count": 0.0, + "routers_loss": 0.000614096992649138, + "skip_count": 0.0, + "step": 7608, + "text_loss": 0.8146751523017883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00020560679097655137, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12274067.0, + "repeat_count": 0.0, + "routers_loss": 0.0013201923575252295, + "skip_count": 0.0, + "step": 7610, + "text_loss": 0.40818271040916443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0002053566687080497, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 12276946.0, + "repeat_count": 0.0, + "routers_loss": 0.004304401110857725, + "skip_count": 1.0, + "step": 7612, + "text_loss": 0.7063660025596619 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0002051066593474284, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 12279760.0, + "repeat_count": 0.0, + "routers_loss": 0.0032060579396784306, + "skip_count": 1.0, + "step": 7614, + "text_loss": 0.23671887814998627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00020485676299049154, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 12282737.0, + "repeat_count": 0.0, + "routers_loss": 0.005103024188429117, + "skip_count": 2.0, + "step": 7616, + "text_loss": 0.17571020126342773 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00020460697973299986, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 12286290.0, + "repeat_count": 1.0, + "routers_loss": 0.007189507596194744, + "skip_count": 1.0, + "step": 7618, + "text_loss": 0.30872994661331177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0002043573096706708, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 12289458.0, + "repeat_count": 0.0, + "routers_loss": 0.0010217712260782719, + "skip_count": 0.0, + "step": 7620, + "text_loss": 0.5155487060546875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0002041077528991784, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 12292846.0, + "repeat_count": 0.0, + "routers_loss": 0.0022399788722395897, + "skip_count": 1.0, + "step": 7622, + "text_loss": 0.717949390411377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0002038583095141532, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 12295673.0, + "repeat_count": 0.0, + "routers_loss": 0.0018168877577409148, + "skip_count": 0.0, + "step": 7624, + "text_loss": 0.560361385345459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.00020360897961118246, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 12298624.0, + "repeat_count": 0.0, + "routers_loss": 0.0008487844606861472, + "skip_count": 0.0, + "step": 7626, + "text_loss": 0.6391524076461792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00020335976328580984, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 12302136.0, + "repeat_count": 0.0, + "routers_loss": 0.0006127831293269992, + "skip_count": 0.0, + "step": 7628, + "text_loss": 0.5932226777076721 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.00020311066063353556, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 12305152.0, + "repeat_count": 0.0, + "routers_loss": 0.0018765819258987904, + "skip_count": 0.0, + "step": 7630, + "text_loss": 0.37831631302833557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00020286167174981618, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 12307771.0, + "repeat_count": 0.0, + "routers_loss": 0.0025384656619280577, + "skip_count": 0.0, + "step": 7632, + "text_loss": 0.34806445240974426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0002026127967300645, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12310921.0, + "repeat_count": 0.0, + "routers_loss": 0.008239032700657845, + "skip_count": 2.0, + "step": 7634, + "text_loss": 0.34859901666641235 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00020236403566965027, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12314200.0, + "repeat_count": 0.0, + "routers_loss": 0.0029505928978323936, + "skip_count": 2.0, + "step": 7636, + "text_loss": 0.2647531032562256 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0002021153886638991, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 12319221.0, + "repeat_count": 1.0, + "routers_loss": 0.0014016951899975538, + "skip_count": 0.0, + "step": 7638, + "text_loss": 0.42428603768348694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 35.86879953037863, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04248046875, + "learning_rate": 0.00020186685580809288, + "loss": 0.0059, + "macro_f1": 0.5492662787437439, + "num_tokens": 12322204.0, + "repeat_count": 0.0, + "routers_loss": 0.01761031709611416, + "skip_count": 2.0, + "step": 7640, + "text_loss": 0.25929757952690125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.00020161843719746997, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 12324750.0, + "repeat_count": 0.0, + "routers_loss": 0.0023674629628658295, + "skip_count": 0.0, + "step": 7642, + "text_loss": 0.567159116268158 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0002013701329272248, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 12327933.0, + "repeat_count": 0.0, + "routers_loss": 0.004534341394901276, + "skip_count": 0.0, + "step": 7644, + "text_loss": 0.4765215516090393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00020112194309250797, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12330847.0, + "repeat_count": 0.0, + "routers_loss": 0.003144246758893132, + "skip_count": 2.0, + "step": 7646, + "text_loss": 0.39837369322776794 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.00020087386778842642, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 12333782.0, + "repeat_count": 1.0, + "routers_loss": 0.008137194439768791, + "skip_count": 1.0, + "step": 7648, + "text_loss": 0.42175763845443726 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00020062590711004296, + "loss": 0.0034, + "macro_f1": 1.0, + "num_tokens": 12336837.0, + "repeat_count": 1.0, + "routers_loss": 0.006499455776065588, + "skip_count": 1.0, + "step": 7650, + "text_loss": 0.18695278465747833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.00020037806115237667, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12340414.0, + "repeat_count": 0.0, + "routers_loss": 0.001548365456983447, + "skip_count": 0.0, + "step": 7652, + "text_loss": 0.1981094628572464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00020013033001040255, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 12343209.0, + "repeat_count": 0.0, + "routers_loss": 0.008136926218867302, + "skip_count": 2.0, + "step": 7654, + "text_loss": 0.2231602668762207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00019988271377905165, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12346158.0, + "repeat_count": 0.0, + "routers_loss": 0.00370375020429492, + "skip_count": 1.0, + "step": 7656, + "text_loss": 0.4809921383857727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 35.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00019963521255321077, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 12349279.0, + "repeat_count": 0.0, + "routers_loss": 0.00690054427832365, + "skip_count": 3.0, + "step": 7658, + "text_loss": 0.40473970770835876 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0001993878264277233, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 12352848.0, + "repeat_count": 1.0, + "routers_loss": 0.004367961548268795, + "skip_count": 1.0, + "step": 7660, + "text_loss": 0.3646799921989441 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.00019914055549738775, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 12356737.0, + "repeat_count": 0.0, + "routers_loss": 0.000662159756757319, + "skip_count": 0.0, + "step": 7662, + "text_loss": 0.3703214228153229 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0001988933998569589, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 12360085.0, + "repeat_count": 0.0, + "routers_loss": 0.0023262565955519676, + "skip_count": 0.0, + "step": 7664, + "text_loss": 0.12910836935043335 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0001986463596011473, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12363296.0, + "repeat_count": 0.0, + "routers_loss": 0.002686078194528818, + "skip_count": 1.0, + "step": 7666, + "text_loss": 0.39628392457962036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00019839943482461914, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 12366072.0, + "repeat_count": 0.0, + "routers_loss": 0.007100159768015146, + "skip_count": 1.0, + "step": 7668, + "text_loss": 0.6588287949562073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00019815262562199648, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12368940.0, + "repeat_count": 0.0, + "routers_loss": 0.004194926470518112, + "skip_count": 0.0, + "step": 7670, + "text_loss": 0.36411619186401367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00019790593208785713, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12372031.0, + "repeat_count": 0.0, + "routers_loss": 0.0041313013061881065, + "skip_count": 0.0, + "step": 7672, + "text_loss": 0.23270413279533386 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019765935431673444, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12375115.0, + "repeat_count": 1.0, + "routers_loss": 0.003343774238601327, + "skip_count": 0.0, + "step": 7674, + "text_loss": 0.1686355322599411 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 36.03756970942178, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.038330078125, + "learning_rate": 0.00019741289240311755, + "loss": 0.0058, + "macro_f1": 0.6122449040412903, + "num_tokens": 12379089.0, + "repeat_count": 0.0, + "routers_loss": 0.021328814327716827, + "skip_count": 4.0, + "step": 7676, + "text_loss": 0.9312577247619629 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00019716654644145104, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12383115.0, + "repeat_count": 0.0, + "routers_loss": 0.0004511173174250871, + "skip_count": 0.0, + "step": 7678, + "text_loss": 0.3305695056915283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.00019692031652613522, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 12386064.0, + "repeat_count": 0.0, + "routers_loss": 0.006190002430230379, + "skip_count": 0.0, + "step": 7680, + "text_loss": 0.4829687178134918 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 36.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00019667420275152575, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 12389743.0, + "repeat_count": 2.0, + "routers_loss": 0.004575030412524939, + "skip_count": 1.0, + "step": 7682, + "text_loss": 0.5751548409461975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0001964282052119341, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12392481.0, + "repeat_count": 0.0, + "routers_loss": 0.002718796720728278, + "skip_count": 0.0, + "step": 7684, + "text_loss": 0.5349925756454468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0001961823240016269, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 12395207.0, + "repeat_count": 0.0, + "routers_loss": 0.0027528523933142424, + "skip_count": 0.0, + "step": 7686, + "text_loss": 0.5322592258453369 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00019593655921482624, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12398232.0, + "repeat_count": 1.0, + "routers_loss": 0.008105970919132233, + "skip_count": 0.0, + "step": 7688, + "text_loss": 0.3192061185836792 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.10331670090989, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00019569091094570967, + "loss": 0.0069, + "macro_f1": 0.6603773832321167, + "num_tokens": 12400862.0, + "repeat_count": 1.0, + "routers_loss": 0.024075545370578766, + "skip_count": 1.0, + "step": 7690, + "text_loss": 0.3189752697944641 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 36.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0001954453792884101, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 12404039.0, + "repeat_count": 0.0, + "routers_loss": 0.007513802964240313, + "skip_count": 3.0, + "step": 7692, + "text_loss": 0.5985093712806702 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0001951999643370157, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 12407085.0, + "repeat_count": 1.0, + "routers_loss": 0.009606506675481796, + "skip_count": 2.0, + "step": 7694, + "text_loss": 0.2050790935754776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00019495466618556996, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 12411377.0, + "repeat_count": 0.0, + "routers_loss": 0.0007978329667821527, + "skip_count": 0.0, + "step": 7696, + "text_loss": 0.4705570638179779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00019470948492807154, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 12414427.0, + "repeat_count": 0.0, + "routers_loss": 0.0010737364646047354, + "skip_count": 0.0, + "step": 7698, + "text_loss": 0.6105324029922485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00019446442065847448, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12417442.0, + "repeat_count": 0.0, + "routers_loss": 0.001762967323884368, + "skip_count": 0.0, + "step": 7700, + "text_loss": 0.5638618469238281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00019421947347068774, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12420862.0, + "repeat_count": 0.0, + "routers_loss": 0.0015798417152836919, + "skip_count": 0.0, + "step": 7702, + "text_loss": 0.1939864307641983 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00019397464345857562, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 12423876.0, + "repeat_count": 0.0, + "routers_loss": 0.005659835878759623, + "skip_count": 1.0, + "step": 7704, + "text_loss": 0.20829300582408905 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 36.17845611975345, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.00019372993071595723, + "loss": 0.0072, + "macro_f1": 0.9449735879898071, + "num_tokens": 12427639.0, + "repeat_count": 4.0, + "routers_loss": 0.018665846437215805, + "skip_count": 2.0, + "step": 7706, + "text_loss": 0.47913849353790283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00019348533533660727, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 12431520.0, + "repeat_count": 0.0, + "routers_loss": 0.0006690093432553113, + "skip_count": 0.0, + "step": 7708, + "text_loss": 0.494870662689209 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00019324085741425511, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 12434213.0, + "repeat_count": 0.0, + "routers_loss": 0.004067352041602135, + "skip_count": 1.0, + "step": 7710, + "text_loss": 0.7631711959838867 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.00019299649704258504, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 12437437.0, + "repeat_count": 2.0, + "routers_loss": 0.01157623715698719, + "skip_count": 0.0, + "step": 7712, + "text_loss": 0.3145926296710968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0001927522543152364, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 12440507.0, + "repeat_count": 0.0, + "routers_loss": 0.001888492377474904, + "skip_count": 0.0, + "step": 7714, + "text_loss": 0.576301097869873 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019250812932580352, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 12443484.0, + "repeat_count": 0.0, + "routers_loss": 0.00042988534551113844, + "skip_count": 0.0, + "step": 7716, + "text_loss": 0.5716445446014404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00019226412216783557, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 12446460.0, + "repeat_count": 0.0, + "routers_loss": 0.005063199903815985, + "skip_count": 1.0, + "step": 7718, + "text_loss": 0.2700924873352051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0001920202329348365, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 12449346.0, + "repeat_count": 0.0, + "routers_loss": 0.0010775640839710832, + "skip_count": 0.0, + "step": 7720, + "text_loss": 0.5162558555603027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00019177646172026513, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12452680.0, + "repeat_count": 0.0, + "routers_loss": 0.0014514096546918154, + "skip_count": 0.0, + "step": 7722, + "text_loss": 0.5753642916679382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.00019153280861753497, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12455348.0, + "repeat_count": 0.0, + "routers_loss": 0.002202774863690138, + "skip_count": 1.0, + "step": 7724, + "text_loss": 0.5751997232437134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00019128927372001454, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 12458098.0, + "repeat_count": 0.0, + "routers_loss": 0.005171069409698248, + "skip_count": 0.0, + "step": 7726, + "text_loss": 0.22252975404262543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00019104585712102678, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 12460958.0, + "repeat_count": 0.0, + "routers_loss": 0.0041033923625946045, + "skip_count": 0.0, + "step": 7728, + "text_loss": 0.18611937761306763 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00019080255891384945, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 12463596.0, + "repeat_count": 1.0, + "routers_loss": 0.0012201941572129726, + "skip_count": 0.0, + "step": 7730, + "text_loss": 0.47347909212112427 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 36.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0001905593791917148, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 12467021.0, + "repeat_count": 2.0, + "routers_loss": 0.005837214644998312, + "skip_count": 2.0, + "step": 7732, + "text_loss": 0.2055564969778061 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.00019031631804780974, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 12469743.0, + "repeat_count": 0.0, + "routers_loss": 0.0010269953636452556, + "skip_count": 0.0, + "step": 7734, + "text_loss": 0.45995602011680603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00019007337557527582, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 12473082.0, + "repeat_count": 0.0, + "routers_loss": 0.00436213007196784, + "skip_count": 1.0, + "step": 7736, + "text_loss": 0.4515823721885681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00018983055186720888, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 12476100.0, + "repeat_count": 0.0, + "routers_loss": 0.003051829058676958, + "skip_count": 2.0, + "step": 7738, + "text_loss": 0.12298467755317688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0001895878470166597, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 12480231.0, + "repeat_count": 0.0, + "routers_loss": 0.008164191618561745, + "skip_count": 2.0, + "step": 7740, + "text_loss": 0.17456457018852234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.347519812151454, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.00018934526111663314, + "loss": 0.0069, + "macro_f1": 0.3272727429866791, + "num_tokens": 12483894.0, + "repeat_count": 0.0, + "routers_loss": 0.008653721772134304, + "skip_count": 1.0, + "step": 7742, + "text_loss": 0.7125775814056396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 36.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.00018910279426008857, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 12488077.0, + "repeat_count": 0.0, + "routers_loss": 0.005024447571486235, + "skip_count": 6.0, + "step": 7744, + "text_loss": 0.833778977394104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.00018886044653993966, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 12490999.0, + "repeat_count": 0.0, + "routers_loss": 0.002690888475626707, + "skip_count": 0.0, + "step": 7746, + "text_loss": 0.15594039857387543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00018861821804905466, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 12494765.0, + "repeat_count": 0.0, + "routers_loss": 0.006087568122893572, + "skip_count": 0.0, + "step": 7748, + "text_loss": 0.2696777880191803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00018837610888025586, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12497741.0, + "repeat_count": 0.0, + "routers_loss": 0.0014629303477704525, + "skip_count": 0.0, + "step": 7750, + "text_loss": 0.6801294684410095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11865234375, + "learning_rate": 0.00018813411912631996, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 12500585.0, + "repeat_count": 0.0, + "routers_loss": 0.001163579523563385, + "skip_count": 0.0, + "step": 7752, + "text_loss": 0.41069695353507996 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00018789224887997796, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12503579.0, + "repeat_count": 2.0, + "routers_loss": 0.009436148218810558, + "skip_count": 0.0, + "step": 7754, + "text_loss": 0.6993107795715332 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00018765049823391472, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 12506698.0, + "repeat_count": 1.0, + "routers_loss": 0.002098206663504243, + "skip_count": 2.0, + "step": 7756, + "text_loss": 0.5704247951507568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018740886728077, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 12509869.0, + "repeat_count": 0.0, + "routers_loss": 0.002066673245280981, + "skip_count": 1.0, + "step": 7758, + "text_loss": 0.7605635523796082 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.00018716735611313707, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 12513433.0, + "repeat_count": 0.0, + "routers_loss": 0.0023439819924533367, + "skip_count": 1.0, + "step": 7760, + "text_loss": 0.4746153950691223 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.441444085705896, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.00018692596482356333, + "loss": 0.0057, + "macro_f1": 0.9255813956260681, + "num_tokens": 12516817.0, + "repeat_count": 3.0, + "routers_loss": 0.039019811898469925, + "skip_count": 4.0, + "step": 7762, + "text_loss": 0.3105330467224121 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00018668469350455048, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12519357.0, + "repeat_count": 0.0, + "routers_loss": 0.002269966993480921, + "skip_count": 0.0, + "step": 7764, + "text_loss": 0.3700210452079773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00018644354224855414, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 12522072.0, + "repeat_count": 0.0, + "routers_loss": 0.001265842467546463, + "skip_count": 0.0, + "step": 7766, + "text_loss": 0.6737633943557739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00018620251114798386, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 12524999.0, + "repeat_count": 0.0, + "routers_loss": 0.006547329016029835, + "skip_count": 1.0, + "step": 7768, + "text_loss": 0.24906545877456665 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0001859616002952033, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 12527785.0, + "repeat_count": 2.0, + "routers_loss": 0.010791841894388199, + "skip_count": 3.0, + "step": 7770, + "text_loss": 0.3069820702075958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0001857208097825299, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12530801.0, + "repeat_count": 0.0, + "routers_loss": 0.00492103723809123, + "skip_count": 2.0, + "step": 7772, + "text_loss": 0.2524295151233673 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0001854801397022351, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12533919.0, + "repeat_count": 0.0, + "routers_loss": 0.001942967064678669, + "skip_count": 0.0, + "step": 7774, + "text_loss": 0.7855241894721985 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 36.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00018523959014654407, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 12537265.0, + "repeat_count": 2.0, + "routers_loss": 0.00987488217651844, + "skip_count": 2.0, + "step": 7776, + "text_loss": 0.2767317593097687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.00018499916120763582, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12539695.0, + "repeat_count": 0.0, + "routers_loss": 0.0054283770732581615, + "skip_count": 1.0, + "step": 7778, + "text_loss": 0.43287888169288635 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00018475885297764306, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 12542881.0, + "repeat_count": 2.0, + "routers_loss": 0.00797359924763441, + "skip_count": 0.0, + "step": 7780, + "text_loss": 0.3738224506378174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0001845186655486527, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 12546530.0, + "repeat_count": 0.0, + "routers_loss": 0.0045951665379107, + "skip_count": 0.0, + "step": 7782, + "text_loss": 0.2511517107486725 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 36.54476078661579, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00018427859901270482, + "loss": 0.0055, + "macro_f1": 0.9452888369560242, + "num_tokens": 12549439.0, + "repeat_count": 1.0, + "routers_loss": 0.02312052994966507, + "skip_count": 4.0, + "step": 7784, + "text_loss": 0.3837030827999115 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 36.55415321397123, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.059814453125, + "learning_rate": 0.00018403865346179344, + "loss": 0.0066, + "macro_f1": 0.9265305995941162, + "num_tokens": 12553211.0, + "repeat_count": 1.0, + "routers_loss": 0.014698561280965805, + "skip_count": 3.0, + "step": 7786, + "text_loss": 0.510159432888031 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 36.563545641326684, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.00018379882898786603, + "loss": 0.0075, + "macro_f1": 0.8803418874740601, + "num_tokens": 12556497.0, + "repeat_count": 2.0, + "routers_loss": 0.023926246911287308, + "skip_count": 7.0, + "step": 7788, + "text_loss": 0.44811317324638367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00018355912568282384, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 12559778.0, + "repeat_count": 0.0, + "routers_loss": 0.0011187797645106912, + "skip_count": 0.0, + "step": 7790, + "text_loss": 0.32099616527557373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00018331954363852166, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 12562610.0, + "repeat_count": 0.0, + "routers_loss": 0.0005356677575036883, + "skip_count": 0.0, + "step": 7792, + "text_loss": 0.9754356145858765 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0001830800829467677, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12565886.0, + "repeat_count": 2.0, + "routers_loss": 0.0017101728590205312, + "skip_count": 0.0, + "step": 7794, + "text_loss": 0.4234761595726013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.00018284074369932386, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12568728.0, + "repeat_count": 0.0, + "routers_loss": 0.0012841494753956795, + "skip_count": 0.0, + "step": 7796, + "text_loss": 0.41109147667884827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0001826015259879053, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 12572231.0, + "repeat_count": 0.0, + "routers_loss": 0.0022388407960534096, + "skip_count": 0.0, + "step": 7798, + "text_loss": 0.5459926128387451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00018236242990418074, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 12574968.0, + "repeat_count": 0.0, + "routers_loss": 0.0019992550369352102, + "skip_count": 0.0, + "step": 7800, + "text_loss": 0.5028481483459473 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0001821234555397722, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12579074.0, + "repeat_count": 0.0, + "routers_loss": 0.002936388598755002, + "skip_count": 2.0, + "step": 7802, + "text_loss": 0.2377086579799652 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00018188460298625503, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12581912.0, + "repeat_count": 1.0, + "routers_loss": 0.0026762608904391527, + "skip_count": 0.0, + "step": 7804, + "text_loss": 0.13887254893779755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 36.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00018164587233515824, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 12585020.0, + "repeat_count": 3.0, + "routers_loss": 0.003901638789102435, + "skip_count": 1.0, + "step": 7806, + "text_loss": 0.35454171895980835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00018140726367796373, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 12588310.0, + "repeat_count": 0.0, + "routers_loss": 0.0031358697451651096, + "skip_count": 2.0, + "step": 7808, + "text_loss": 0.3567306697368622 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00018116877710610673, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12591735.0, + "repeat_count": 0.0, + "routers_loss": 0.002310588024556637, + "skip_count": 1.0, + "step": 7810, + "text_loss": 0.45357072353363037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00018093041271097582, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12595232.0, + "repeat_count": 0.0, + "routers_loss": 0.005600228440016508, + "skip_count": 2.0, + "step": 7812, + "text_loss": 0.4179847836494446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.685647196947464, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00018069217058391267, + "loss": 0.006, + "macro_f1": 0.6603773832321167, + "num_tokens": 12598367.0, + "repeat_count": 1.0, + "routers_loss": 0.04015933722257614, + "skip_count": 1.0, + "step": 7814, + "text_loss": 0.17874565720558167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.00018045405081621214, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 12601864.0, + "repeat_count": 0.0, + "routers_loss": 0.005119446665048599, + "skip_count": 1.0, + "step": 7816, + "text_loss": 0.6867854595184326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00018021605349912207, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 12605268.0, + "repeat_count": 0.0, + "routers_loss": 0.0005990012432448566, + "skip_count": 0.0, + "step": 7818, + "text_loss": 0.9084970355033875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00017997817872384358, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 12608093.0, + "repeat_count": 0.0, + "routers_loss": 0.008712377399206161, + "skip_count": 1.0, + "step": 7820, + "text_loss": 0.19413328170776367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00017974042658153066, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 12611001.0, + "repeat_count": 0.0, + "routers_loss": 0.007535711396485567, + "skip_count": 1.0, + "step": 7822, + "text_loss": 0.2672932744026184 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0001795027971632905, + "loss": 0.0042, + "macro_f1": 1.0, + "num_tokens": 12614584.0, + "repeat_count": 1.0, + "routers_loss": 0.006770546548068523, + "skip_count": 3.0, + "step": 7824, + "text_loss": 0.22805163264274597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00017926529056018297, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 12617519.0, + "repeat_count": 0.0, + "routers_loss": 0.0010458873584866524, + "skip_count": 0.0, + "step": 7826, + "text_loss": 0.385499507188797 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00017902790686322102, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12621566.0, + "repeat_count": 1.0, + "routers_loss": 0.00634258147329092, + "skip_count": 0.0, + "step": 7828, + "text_loss": 0.8044118285179138 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 36.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00017879064616337076, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 12624751.0, + "repeat_count": 0.0, + "routers_loss": 0.0053052278235554695, + "skip_count": 3.0, + "step": 7830, + "text_loss": 0.264322966337204 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.00017855350855155088, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 12628478.0, + "repeat_count": 0.0, + "routers_loss": 0.0028291696216911077, + "skip_count": 0.0, + "step": 7832, + "text_loss": 0.20611460506916046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00017831649411863287, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 12632027.0, + "repeat_count": 0.0, + "routers_loss": 0.0009586421074345708, + "skip_count": 1.0, + "step": 7834, + "text_loss": 0.4119716286659241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00017807960295544118, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 12635144.0, + "repeat_count": 0.0, + "routers_loss": 0.012304541654884815, + "skip_count": 2.0, + "step": 7836, + "text_loss": 0.28647977113723755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0001778428351527529, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 12638719.0, + "repeat_count": 0.0, + "routers_loss": 0.005212076939642429, + "skip_count": 2.0, + "step": 7838, + "text_loss": 0.630459189414978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0001776061908012979, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 12642119.0, + "repeat_count": 0.0, + "routers_loss": 0.00183707510586828, + "skip_count": 0.0, + "step": 7840, + "text_loss": 0.5905961990356445 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0001773696699917588, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 12645077.0, + "repeat_count": 1.0, + "routers_loss": 0.0058263009414076805, + "skip_count": 0.0, + "step": 7842, + "text_loss": 0.41949576139450073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00017713327281477077, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12648964.0, + "repeat_count": 0.0, + "routers_loss": 0.001586507773026824, + "skip_count": 0.0, + "step": 7844, + "text_loss": 0.5048848390579224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00017689699936092163, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 12651934.0, + "repeat_count": 0.0, + "routers_loss": 0.002397194504737854, + "skip_count": 0.0, + "step": 7846, + "text_loss": 0.23879878222942352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 36.84531846199002, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0001766608497207518, + "loss": 0.0054, + "macro_f1": 0.5492662787437439, + "num_tokens": 12654907.0, + "repeat_count": 0.0, + "routers_loss": 0.016742069274187088, + "skip_count": 2.0, + "step": 7848, + "text_loss": 0.23400072753429413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0001764248239847544, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 12658765.0, + "repeat_count": 0.0, + "routers_loss": 0.007037387229502201, + "skip_count": 2.0, + "step": 7850, + "text_loss": 0.26165497303009033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 36.86410331670091, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.017822265625, + "learning_rate": 0.00017618892224337463, + "loss": 0.0044, + "macro_f1": 0.5492662787437439, + "num_tokens": 12662024.0, + "repeat_count": 0.0, + "routers_loss": 0.017352160066366196, + "skip_count": 2.0, + "step": 7852, + "text_loss": 0.23813043534755707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 36.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017595314458701084, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 12665751.0, + "repeat_count": 0.0, + "routers_loss": 0.005349365528672934, + "skip_count": 3.0, + "step": 7854, + "text_loss": 0.14920757710933685 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00017571749110601337, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12668823.0, + "repeat_count": 0.0, + "routers_loss": 0.0037689812015742064, + "skip_count": 2.0, + "step": 7856, + "text_loss": 0.2198697030544281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00017548196189068506, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12672367.0, + "repeat_count": 0.0, + "routers_loss": 0.0006363615393638611, + "skip_count": 0.0, + "step": 7858, + "text_loss": 0.5338839888572693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00017524655703128112, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 12675217.0, + "repeat_count": 0.0, + "routers_loss": 0.002691479865461588, + "skip_count": 0.0, + "step": 7860, + "text_loss": 0.17463763058185577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00017501127661800908, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12678796.0, + "repeat_count": 0.0, + "routers_loss": 0.002262329449877143, + "skip_count": 0.0, + "step": 7862, + "text_loss": 0.4637797474861145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.00017477612074102899, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12681631.0, + "repeat_count": 0.0, + "routers_loss": 0.00115531450137496, + "skip_count": 0.0, + "step": 7864, + "text_loss": 0.6089238524436951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00017454108949045295, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 12685647.0, + "repeat_count": 0.0, + "routers_loss": 0.00260268640704453, + "skip_count": 0.0, + "step": 7866, + "text_loss": 0.5876018404960632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.93924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.00017430618295634514, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 12688995.0, + "repeat_count": 0.0, + "routers_loss": 0.002731681102886796, + "skip_count": 0.0, + "step": 7868, + "text_loss": 0.35076001286506653 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 36.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00017407140122872262, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 12692100.0, + "repeat_count": 1.0, + "routers_loss": 0.003314645728096366, + "skip_count": 1.0, + "step": 7870, + "text_loss": 0.5313478112220764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.958027590255355, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017383674439755393, + "loss": 0.0069, + "macro_f1": 0.3272727429866791, + "num_tokens": 12695117.0, + "repeat_count": 0.0, + "routers_loss": 0.010385016910731792, + "skip_count": 1.0, + "step": 7872, + "text_loss": 0.5092368125915527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00017360221255276016, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 12697678.0, + "repeat_count": 0.0, + "routers_loss": 0.001273582922294736, + "skip_count": 0.0, + "step": 7874, + "text_loss": 0.5282881855964661 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.00017336780578421418, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 12702132.0, + "repeat_count": 0.0, + "routers_loss": 0.0007510313298553228, + "skip_count": 0.0, + "step": 7876, + "text_loss": 0.49093571305274963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001731335241817412, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 12705413.0, + "repeat_count": 0.0, + "routers_loss": 0.005138787440955639, + "skip_count": 2.0, + "step": 7878, + "text_loss": 0.7503541111946106 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001728993678351184, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 12708310.0, + "repeat_count": 2.0, + "routers_loss": 0.004379773512482643, + "skip_count": 0.0, + "step": 7880, + "text_loss": 0.5942456126213074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0001726653368340747, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 12711043.0, + "repeat_count": 0.0, + "routers_loss": 0.005271450616419315, + "skip_count": 2.0, + "step": 7882, + "text_loss": 0.348360538482666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00017243143126829163, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 12714473.0, + "repeat_count": 1.0, + "routers_loss": 0.0015764752170071006, + "skip_count": 1.0, + "step": 7884, + "text_loss": 0.45971861481666565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.000172197651227402, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12717832.0, + "repeat_count": 0.0, + "routers_loss": 0.00040649910806678236, + "skip_count": 0.0, + "step": 7886, + "text_loss": 0.5996841788291931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00017196399680099078, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 12720479.0, + "repeat_count": 0.0, + "routers_loss": 0.00473182974383235, + "skip_count": 2.0, + "step": 7888, + "text_loss": 0.40346208214759827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00017173046807859483, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 12723104.0, + "repeat_count": 0.0, + "routers_loss": 0.0020138369873166084, + "skip_count": 0.0, + "step": 7890, + "text_loss": 0.6878634095191956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.05165835045494, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0001714970651497027, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 12725967.0, + "repeat_count": 0.0, + "routers_loss": 0.008381367661058903, + "skip_count": 1.0, + "step": 7892, + "text_loss": 0.9161711931228638 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00017126378810375498, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 12728819.0, + "repeat_count": 1.0, + "routers_loss": 0.0037658829241991043, + "skip_count": 0.0, + "step": 7894, + "text_loss": 0.4447716772556305 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00017103063703014372, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 12731806.0, + "repeat_count": 0.0, + "routers_loss": 0.0022742559667676687, + "skip_count": 0.0, + "step": 7896, + "text_loss": 0.9140825867652893 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00017079761201821298, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 12734649.0, + "repeat_count": 0.0, + "routers_loss": 0.002157264854758978, + "skip_count": 0.0, + "step": 7898, + "text_loss": 0.268303781747818 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.0001705647131572583, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 12737889.0, + "repeat_count": 1.0, + "routers_loss": 0.01064873393625021, + "skip_count": 1.0, + "step": 7900, + "text_loss": 0.36009490489959717 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00017033194053652685, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 12740821.0, + "repeat_count": 1.0, + "routers_loss": 0.0062920586206018925, + "skip_count": 0.0, + "step": 7902, + "text_loss": 0.5301805138587952 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.00017009929424521782, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 12743876.0, + "repeat_count": 1.0, + "routers_loss": 0.0033694824669510126, + "skip_count": 1.0, + "step": 7904, + "text_loss": 1.026949167251587 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.117405341943055, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00016986677437248155, + "loss": 0.0071, + "macro_f1": 0.8817967176437378, + "num_tokens": 12747623.0, + "repeat_count": 2.0, + "routers_loss": 0.05076088383793831, + "skip_count": 3.0, + "step": 7906, + "text_loss": 0.33465588092803955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00016963438100742014, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 12751255.0, + "repeat_count": 0.0, + "routers_loss": 0.0005921403644606471, + "skip_count": 0.0, + "step": 7908, + "text_loss": 0.3498881757259369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.00016940211423908713, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 12754297.0, + "repeat_count": 0.0, + "routers_loss": 0.004132566973567009, + "skip_count": 0.0, + "step": 7910, + "text_loss": 0.2874198853969574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0001691699741564876, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12756969.0, + "repeat_count": 0.0, + "routers_loss": 0.0024724705144762993, + "skip_count": 1.0, + "step": 7912, + "text_loss": 0.10593545436859131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016893796084857806, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 12760261.0, + "repeat_count": 0.0, + "routers_loss": 0.002991671208292246, + "skip_count": 0.0, + "step": 7914, + "text_loss": 0.1331545114517212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00016870607440426643, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12762971.0, + "repeat_count": 0.0, + "routers_loss": 0.0018167285015806556, + "skip_count": 0.0, + "step": 7916, + "text_loss": 0.496826171875 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00016847431491241207, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 12765949.0, + "repeat_count": 1.0, + "routers_loss": 0.0033364067785441875, + "skip_count": 0.0, + "step": 7918, + "text_loss": 0.43522849678993225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0001682426824618256, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 12769201.0, + "repeat_count": 0.0, + "routers_loss": 0.001313596498221159, + "skip_count": 0.0, + "step": 7920, + "text_loss": 0.8691539168357849 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.19254476078662, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.00016801117714126908, + "loss": 0.0108, + "macro_f1": 0.6603773832321167, + "num_tokens": 12773308.0, + "repeat_count": 1.0, + "routers_loss": 0.02579287625849247, + "skip_count": 1.0, + "step": 7922, + "text_loss": 0.275301069021225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00016777979903945568, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 12776166.0, + "repeat_count": 0.0, + "routers_loss": 0.010501758195459843, + "skip_count": 1.0, + "step": 7924, + "text_loss": 0.32124993205070496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0001675485482450499, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 12779965.0, + "repeat_count": 0.0, + "routers_loss": 0.0063389060087502, + "skip_count": 2.0, + "step": 7926, + "text_loss": 0.2527695894241333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00016731742484666774, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 12783019.0, + "repeat_count": 0.0, + "routers_loss": 0.002796935848891735, + "skip_count": 0.0, + "step": 7928, + "text_loss": 0.18767669796943665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0001670864289328759, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 12786291.0, + "repeat_count": 0.0, + "routers_loss": 0.007973561994731426, + "skip_count": 2.0, + "step": 7930, + "text_loss": 0.29628485441207886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00016685556059219253, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 12789566.0, + "repeat_count": 4.0, + "routers_loss": 0.011405733413994312, + "skip_count": 6.0, + "step": 7932, + "text_loss": 0.16635073721408844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00016662481991308682, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 12792533.0, + "repeat_count": 0.0, + "routers_loss": 0.0012368770549073815, + "skip_count": 1.0, + "step": 7934, + "text_loss": 0.4196353852748871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000166394206983979, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 12795619.0, + "repeat_count": 0.0, + "routers_loss": 0.0036002211272716522, + "skip_count": 1.0, + "step": 7936, + "text_loss": 0.17559808492660522 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00016616372189324035, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 12799702.0, + "repeat_count": 1.0, + "routers_loss": 0.0039332108572125435, + "skip_count": 0.0, + "step": 7938, + "text_loss": 0.603410542011261 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00016593336472919324, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 12802704.0, + "repeat_count": 0.0, + "routers_loss": 0.0008303318754769862, + "skip_count": 0.0, + "step": 7940, + "text_loss": 0.5331749320030212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.28646903434106, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00016570313558011098, + "loss": 0.0058, + "macro_f1": 0.6601307392120361, + "num_tokens": 12805630.0, + "repeat_count": 1.0, + "routers_loss": 0.05092398822307587, + "skip_count": 2.0, + "step": 7942, + "text_loss": 0.17398510873317719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00016547303453421774, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 12809065.0, + "repeat_count": 0.0, + "routers_loss": 0.0006886976188980043, + "skip_count": 0.0, + "step": 7944, + "text_loss": 0.3419797718524933 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.00016524306167968878, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 12812641.0, + "repeat_count": 1.0, + "routers_loss": 0.005634502973407507, + "skip_count": 3.0, + "step": 7946, + "text_loss": 0.5877651572227478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00016501321710465005, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 12815527.0, + "repeat_count": 0.0, + "routers_loss": 0.0020598487462848425, + "skip_count": 0.0, + "step": 7948, + "text_loss": 0.3558528423309326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0001647835008971783, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12819103.0, + "repeat_count": 0.0, + "routers_loss": 0.005946476943790913, + "skip_count": 2.0, + "step": 7950, + "text_loss": 0.5800213813781738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00016455391314530154, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12822423.0, + "repeat_count": 0.0, + "routers_loss": 0.010360358282923698, + "skip_count": 2.0, + "step": 7952, + "text_loss": 0.278255820274353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00016432445393699802, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 12826180.0, + "repeat_count": 0.0, + "routers_loss": 0.003017681185156107, + "skip_count": 0.0, + "step": 7954, + "text_loss": 0.1571389138698578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00016409512336019698, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12829196.0, + "repeat_count": 0.0, + "routers_loss": 0.0008854938205331564, + "skip_count": 0.0, + "step": 7956, + "text_loss": 0.2776578366756439 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.00016386592150277834, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 12831983.0, + "repeat_count": 0.0, + "routers_loss": 0.0023990103509277105, + "skip_count": 0.0, + "step": 7958, + "text_loss": 0.46686989068984985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 37.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0001636368484525727, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 12834889.0, + "repeat_count": 0.0, + "routers_loss": 0.009835032746195793, + "skip_count": 5.0, + "step": 7960, + "text_loss": 0.22224856913089752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00016340790429736118, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 12837950.0, + "repeat_count": 0.0, + "routers_loss": 0.0018618656322360039, + "skip_count": 0.0, + "step": 7962, + "text_loss": 0.5101882815361023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.00016317908912487578, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 12840981.0, + "repeat_count": 1.0, + "routers_loss": 0.001275144051760435, + "skip_count": 1.0, + "step": 7964, + "text_loss": 0.40567103028297424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00016295040302279873, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12844044.0, + "repeat_count": 0.0, + "routers_loss": 0.003117429558187723, + "skip_count": 2.0, + "step": 7966, + "text_loss": 0.6888198852539062 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00016272184607876312, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 12847350.0, + "repeat_count": 2.0, + "routers_loss": 0.006585797294974327, + "skip_count": 4.0, + "step": 7968, + "text_loss": 0.19813506305217743 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0001624934183803523, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 12850285.0, + "repeat_count": 1.0, + "routers_loss": 0.0043576788157224655, + "skip_count": 1.0, + "step": 7970, + "text_loss": 0.6108269691467285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 37.427355444672735, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00016226512001510024, + "loss": 0.0039, + "macro_f1": 0.5492662787437439, + "num_tokens": 12853993.0, + "repeat_count": 0.0, + "routers_loss": 0.011879517696797848, + "skip_count": 2.0, + "step": 7972, + "text_loss": 0.42478689551353455 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00016203695107049117, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 12857022.0, + "repeat_count": 0.0, + "routers_loss": 0.0016375730047002435, + "skip_count": 0.0, + "step": 7974, + "text_loss": 0.5130020976066589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0001618089116339601, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 12860764.0, + "repeat_count": 0.0, + "routers_loss": 0.0006649247952736914, + "skip_count": 0.0, + "step": 7976, + "text_loss": 1.0629136562347412 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.455532726739065, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00016158100179289208, + "loss": 0.0062, + "macro_f1": 0.6603773832321167, + "num_tokens": 12864066.0, + "repeat_count": 1.0, + "routers_loss": 0.03140667825937271, + "skip_count": 1.0, + "step": 7978, + "text_loss": 0.4241345226764679 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 37.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0001613532216346226, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 12867555.0, + "repeat_count": 0.0, + "routers_loss": 0.010257012210786343, + "skip_count": 4.0, + "step": 7980, + "text_loss": 0.6085613369941711 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0001611255712464374, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 12871415.0, + "repeat_count": 0.0, + "routers_loss": 0.00783725269138813, + "skip_count": 1.0, + "step": 7982, + "text_loss": 0.15661844611167908 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.017578125, + "learning_rate": 0.00016089805071557256, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 12874195.0, + "repeat_count": 1.0, + "routers_loss": 0.0027650597039610147, + "skip_count": 2.0, + "step": 7984, + "text_loss": 0.4938865005970001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.49310243616085, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016067066012921439, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 12878084.0, + "repeat_count": 1.0, + "routers_loss": 0.04647083953022957, + "skip_count": 0.0, + "step": 7986, + "text_loss": 0.2973119020462036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.00016044339957449938, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 12881182.0, + "repeat_count": 0.0, + "routers_loss": 0.002192265819758177, + "skip_count": 0.0, + "step": 7988, + "text_loss": 0.2623208165168762 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00016021626913851418, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 12884028.0, + "repeat_count": 0.0, + "routers_loss": 0.0023096329532563686, + "skip_count": 0.0, + "step": 7990, + "text_loss": 0.3752247989177704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.52127971822718, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00015998926890829562, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 12887759.0, + "repeat_count": 0.0, + "routers_loss": 0.03038526326417923, + "skip_count": 1.0, + "step": 7992, + "text_loss": 0.2609226405620575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0001597623989708306, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12890976.0, + "repeat_count": 0.0, + "routers_loss": 0.0015199477784335613, + "skip_count": 0.0, + "step": 7994, + "text_loss": 0.6512867212295532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00015953565941305615, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12894112.0, + "repeat_count": 0.0, + "routers_loss": 0.0024166766088455915, + "skip_count": 0.0, + "step": 7996, + "text_loss": 0.5539866089820862 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0001593090503218591, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 12896857.0, + "repeat_count": 1.0, + "routers_loss": 0.005081235896795988, + "skip_count": 2.0, + "step": 7998, + "text_loss": 0.6631022691726685 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00015908257178407682, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12900075.0, + "repeat_count": 1.0, + "routers_loss": 0.0024711282458156347, + "skip_count": 0.0, + "step": 8000, + "text_loss": 0.3309785723686218 + } + ], + "logging_steps": 2, + "max_steps": 10650, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.190256153613376e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-8000/training_args.bin b/checkpoint-8000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a3d3ae372faf14539639f54454aa52b6ee730c4a --- /dev/null +++ b/checkpoint-8000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8 +size 5880 diff --git a/checkpoint-9000/chat_template.jinja b/checkpoint-9000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/checkpoint-9000/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-9000/config.json b/checkpoint-9000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3552bd1c531626bd125241ad5dfcd7fb677462cd --- /dev/null +++ b/checkpoint-9000/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 3072, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.55.2", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-9000/generation_config.json b/checkpoint-9000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b513e54e3195b917260c9a8a04c9f3683f19de35 --- /dev/null +++ b/checkpoint-9000/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.55.2" +} diff --git a/checkpoint-9000/model-00001-of-00002.safetensors b/checkpoint-9000/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..08a01e1ba553cdcb2222f034a209861d7b54e284 --- /dev/null +++ b/checkpoint-9000/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13cbd6d16e927a0c5bad54102514e6e18b4a47b3a6eb911e39d678d328d19f55 +size 4965799096 diff --git a/checkpoint-9000/model-00002-of-00002.safetensors b/checkpoint-9000/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7333cc42a74d0b5a354c0c15686ac66e00c1248f --- /dev/null +++ b/checkpoint-9000/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d2b8b5751f71689f1422482cfee12ae69e2f62f671bc0a2ea938bb95dd5a346 +size 1481790520 diff --git a/checkpoint-9000/model.safetensors.index.json b/checkpoint-9000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..21bb567761d75ade0c0eef6495c450697dd3ff18 --- /dev/null +++ b/checkpoint-9000/model.safetensors.index.json @@ -0,0 +1,374 @@ +{ + "metadata": { + "total_parameters": 3223774292, + "total_size": 6447548584 + }, + "weight_map": { + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.routers.0.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.0.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.0.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.0.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.1.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.1.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.1.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.1.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.10.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.10.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.10.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.10.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.11.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.11.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.11.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.11.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.12.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.12.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.12.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.12.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.13.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.13.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.13.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.13.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.14.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.14.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.14.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.14.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.15.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.15.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.15.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.15.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.16.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.16.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.16.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.16.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.17.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.17.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.17.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.17.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.18.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.18.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.18.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.18.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.19.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.19.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.19.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.19.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.2.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.2.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.2.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.2.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.20.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.20.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.20.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.20.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.21.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.21.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.21.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.21.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.22.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.22.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.22.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.22.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.23.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.23.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.23.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.23.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.24.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.24.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.24.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.24.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.25.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.25.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.25.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.25.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.26.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.26.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.26.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.26.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.27.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.27.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.27.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.27.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.3.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.3.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.3.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.3.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.4.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.4.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.4.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.4.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.5.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.5.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.5.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.5.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.6.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.6.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.6.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.6.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.7.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.7.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.7.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.7.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.8.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.8.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.8.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.8.linear2.weight": "model-00002-of-00002.safetensors", + "model.routers.9.linear1.bias": "model-00002-of-00002.safetensors", + "model.routers.9.linear1.weight": "model-00002-of-00002.safetensors", + "model.routers.9.linear2.bias": "model-00002-of-00002.safetensors", + "model.routers.9.linear2.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-9000/optimizer.pt b/checkpoint-9000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8c63c7969e758e7834df865b6fb04233cf3bbad --- /dev/null +++ b/checkpoint-9000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ba10c8a0a40c8075aa9d547f0b108b838b141d27c7970415af94a1602783b98 +size 44191162 diff --git a/checkpoint-9000/rng_state.pth b/checkpoint-9000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4b4b686a589dca06ea5efcd35a2ac40c089a9881 --- /dev/null +++ b/checkpoint-9000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:181ce3e2aecd11b7beeabbdbd7d59126216a3702cd0496a6ee8850d8e39aa35b +size 14244 diff --git a/checkpoint-9000/scheduler.pt b/checkpoint-9000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8dd5a088ed5512c15368458ef82dc1272b72b927 --- /dev/null +++ b/checkpoint-9000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:411953085dfdae8f05d9fb0c8c040757d224be06bbb93644b08ef4b3e4cb5321 +size 1064 diff --git a/checkpoint-9000/special_tokens_map.json b/checkpoint-9000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..165b36bc2293dda9a2fb3c0daf6577d9eba9df7a --- /dev/null +++ b/checkpoint-9000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|finetune_right_pad_id|>" +} diff --git a/checkpoint-9000/tokenizer.json b/checkpoint-9000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-9000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-9000/tokenizer_config.json b/checkpoint-9000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c68051fe3c4d23234a59316bc52d21f6e3a4182c --- /dev/null +++ b/checkpoint-9000/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-9000/trainer_state.json b/checkpoint-9000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..91c8f619d0b3e4c34f7aa4503260cbd3e6629537 --- /dev/null +++ b/checkpoint-9000/trainer_state.json @@ -0,0 +1,85534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 42.253595538597004, + "eval_steps": 500, + "global_step": 9000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.009392427355444672, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.25, + "learning_rate": 2e-06, + "loss": 0.4974, + "macro_f1": 0.23255813121795654, + "num_tokens": 3175.0, + "repeat_count": 0.0, + "routers_loss": 0.4339469373226166, + "skip_count": 0.0, + "step": 2, + "text_loss": 0.3330848515033722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 23.0, + "epoch": 0.018784854710889344, + "f1_execute": 0.7272726893424988, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.8359375, + "learning_rate": 6e-06, + "loss": 0.4988, + "macro_f1": 0.24242423474788666, + "num_tokens": 5816.0, + "repeat_count": 0.0, + "routers_loss": 0.4511934816837311, + "skip_count": 1.0, + "step": 4, + "text_loss": 0.4571273922920227 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.02817728206633402, + "f1_execute": 0.6666666865348816, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.234375, + "learning_rate": 1e-05, + "loss": 0.5113, + "macro_f1": 0.222222238779068, + "num_tokens": 9739.0, + "repeat_count": 0.0, + "routers_loss": 0.49306994676589966, + "skip_count": 0.0, + "step": 6, + "text_loss": 0.41060560941696167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.03756970942177869, + "f1_execute": 0.5641025900840759, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7265625, + "learning_rate": 1.4e-05, + "loss": 0.4766, + "macro_f1": 0.18803420662879944, + "num_tokens": 12869.0, + "repeat_count": 1.0, + "routers_loss": 0.48872503638267517, + "skip_count": 2.0, + "step": 8, + "text_loss": 0.36678561568260193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.046962136777223364, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.78125, + "learning_rate": 1.8e-05, + "loss": 0.4806, + "macro_f1": 0.23255813121795654, + "num_tokens": 15845.0, + "repeat_count": 0.0, + "routers_loss": 0.45077216625213623, + "skip_count": 0.0, + "step": 10, + "text_loss": 0.5597779154777527 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 0.05635456413266804, + "f1_execute": 0.7179487347602844, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.20000000298023224, + "grad_norm": 1.5390625, + "learning_rate": 2.2e-05, + "loss": 0.4557, + "macro_f1": 0.40122103691101074, + "num_tokens": 19353.0, + "repeat_count": 2.0, + "routers_loss": 0.4130440056324005, + "skip_count": 3.0, + "step": 12, + "text_loss": 0.2056603729724884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.06574699148811271, + "f1_execute": 0.6976743936538696, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 2.4375, + "learning_rate": 2.6e-05, + "loss": 0.5129, + "macro_f1": 0.23255813121795654, + "num_tokens": 22675.0, + "repeat_count": 0.0, + "routers_loss": 0.4582902193069458, + "skip_count": 0.0, + "step": 14, + "text_loss": 0.32989829778671265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 0.07513941884355738, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.0, + "f1_skip": 0.2222222238779068, + "grad_norm": 1.7421875, + "learning_rate": 3e-05, + "loss": 0.4729, + "macro_f1": 0.3017163574695587, + "num_tokens": 26022.0, + "repeat_count": 0.0, + "routers_loss": 0.42910993099212646, + "skip_count": 1.0, + "step": 16, + "text_loss": 0.1353905349969864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.08453184619900206, + "f1_execute": 0.7555555105209351, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.4765625, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.4274, + "macro_f1": 0.2518518567085266, + "num_tokens": 29251.0, + "repeat_count": 0.0, + "routers_loss": 0.3990713059902191, + "skip_count": 0.0, + "step": 18, + "text_loss": 0.3806765377521515 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.09392427355444673, + "f1_execute": 0.6829268336296082, + "f1_repeat": 0.2857142984867096, + "f1_skip": 0.0, + "grad_norm": 1.3125, + "learning_rate": 3.8e-05, + "loss": 0.4261, + "macro_f1": 0.3228803873062134, + "num_tokens": 32545.0, + "repeat_count": 1.0, + "routers_loss": 0.40146592259407043, + "skip_count": 0.0, + "step": 20, + "text_loss": 0.25648367404937744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.1033167009098914, + "f1_execute": 0.7272727489471436, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.625, + "learning_rate": 4.2000000000000004e-05, + "loss": 0.404, + "macro_f1": 0.24242424964904785, + "num_tokens": 36560.0, + "repeat_count": 0.0, + "routers_loss": 0.372715026140213, + "skip_count": 0.0, + "step": 22, + "text_loss": 0.2799522578716278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.11270912826533608, + "f1_execute": 0.7555555105209351, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.6328125, + "learning_rate": 4.6e-05, + "loss": 0.4218, + "macro_f1": 0.2518518567085266, + "num_tokens": 39597.0, + "repeat_count": 0.0, + "routers_loss": 0.4504941403865814, + "skip_count": 0.0, + "step": 24, + "text_loss": 0.6635695695877075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.12210155562078075, + "f1_execute": 0.8085106015205383, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.7109375, + "learning_rate": 5e-05, + "loss": 0.3886, + "macro_f1": 0.26950353384017944, + "num_tokens": 43080.0, + "repeat_count": 0.0, + "routers_loss": 0.3498791456222534, + "skip_count": 0.0, + "step": 26, + "text_loss": 0.7035041451454163 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.13149398297622542, + "f1_execute": 0.8085106015205383, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.34375, + "learning_rate": 5.4e-05, + "loss": 0.3724, + "macro_f1": 0.26950353384017944, + "num_tokens": 46406.0, + "repeat_count": 0.0, + "routers_loss": 0.31265875697135925, + "skip_count": 0.0, + "step": 28, + "text_loss": 0.6388277411460876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.1408864103316701, + "f1_execute": 0.8571428060531616, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.2578125, + "learning_rate": 5.800000000000001e-05, + "loss": 0.341, + "macro_f1": 0.2857142686843872, + "num_tokens": 49966.0, + "repeat_count": 0.0, + "routers_loss": 0.3200918138027191, + "skip_count": 2.0, + "step": 30, + "text_loss": 0.17372547090053558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.15027883768711475, + "f1_execute": 0.8571428060531616, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.4140625, + "learning_rate": 6.2e-05, + "loss": 0.3207, + "macro_f1": 0.2857142686843872, + "num_tokens": 53378.0, + "repeat_count": 1.0, + "routers_loss": 0.32304447889328003, + "skip_count": 1.0, + "step": 32, + "text_loss": 0.18196581304073334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.15967126504255943, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.46875, + "learning_rate": 6.6e-05, + "loss": 0.3304, + "macro_f1": 0.3006536364555359, + "num_tokens": 56933.0, + "repeat_count": 0.0, + "routers_loss": 0.24814388155937195, + "skip_count": 0.0, + "step": 34, + "text_loss": 0.28823015093803406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.16906369239800412, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.1171875, + "learning_rate": 7.000000000000001e-05, + "loss": 0.2778, + "macro_f1": 0.3006536066532135, + "num_tokens": 60744.0, + "repeat_count": 1.0, + "routers_loss": 0.22411039471626282, + "skip_count": 0.0, + "step": 36, + "text_loss": 0.5260357856750488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.17845611975344877, + "f1_execute": 0.8571428656578064, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.484375, + "learning_rate": 7.4e-05, + "loss": 0.2738, + "macro_f1": 0.2857142984867096, + "num_tokens": 64900.0, + "repeat_count": 0.0, + "routers_loss": 0.44355395436286926, + "skip_count": 0.0, + "step": 38, + "text_loss": 0.5382097363471985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.18784854710889345, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 1.3828125, + "learning_rate": 7.8e-05, + "loss": 0.2137, + "macro_f1": 0.3076923191547394, + "num_tokens": 68000.0, + "repeat_count": 0.0, + "routers_loss": 0.202330082654953, + "skip_count": 0.0, + "step": 40, + "text_loss": 0.5946118831634521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 25.0, + "epoch": 0.19724097446433814, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.78125, + "learning_rate": 8.2e-05, + "loss": 0.21, + "macro_f1": 0.3144654333591461, + "num_tokens": 70529.0, + "repeat_count": 0.0, + "routers_loss": 0.18023855984210968, + "skip_count": 0.0, + "step": 42, + "text_loss": 0.5550904273986816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2066334018197828, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.609375, + "learning_rate": 8.599999999999999e-05, + "loss": 0.1918, + "macro_f1": 0.32098764181137085, + "num_tokens": 73427.0, + "repeat_count": 2.0, + "routers_loss": 0.2101590931415558, + "skip_count": 0.0, + "step": 44, + "text_loss": 0.4636923372745514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.21602582917522747, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.53125, + "learning_rate": 8.999999999999999e-05, + "loss": 0.1881, + "macro_f1": 0.3333333432674408, + "num_tokens": 76472.0, + "repeat_count": 0.0, + "routers_loss": 0.11800424009561539, + "skip_count": 0.0, + "step": 46, + "text_loss": 0.4187001883983612 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.22541825653067216, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.953125, + "learning_rate": 9.400000000000001e-05, + "loss": 0.1446, + "macro_f1": 0.3272727429866791, + "num_tokens": 79124.0, + "repeat_count": 1.0, + "routers_loss": 0.11632519960403442, + "skip_count": 0.0, + "step": 48, + "text_loss": 0.2253919243812561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 0.2348106838861168, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.58984375, + "learning_rate": 9.800000000000001e-05, + "loss": 0.1543, + "macro_f1": 0.32098767161369324, + "num_tokens": 81980.0, + "repeat_count": 1.0, + "routers_loss": 0.09669367223978043, + "skip_count": 0.0, + "step": 50, + "text_loss": 0.6053179502487183 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.2442031112415615, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.8515625, + "learning_rate": 0.000102, + "loss": 0.1393, + "macro_f1": 0.32098764181137085, + "num_tokens": 85236.0, + "repeat_count": 0.0, + "routers_loss": 0.12471720576286316, + "skip_count": 0.0, + "step": 52, + "text_loss": 0.6027331948280334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2535955385970062, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.421875, + "learning_rate": 0.000106, + "loss": 0.1473, + "macro_f1": 0.32098764181137085, + "num_tokens": 88238.0, + "repeat_count": 0.0, + "routers_loss": 0.1376056969165802, + "skip_count": 2.0, + "step": 54, + "text_loss": 0.2861751616001129 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.26298796595245083, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.35546875, + "learning_rate": 0.00011, + "loss": 0.1082, + "macro_f1": 0.3333333432674408, + "num_tokens": 91056.0, + "repeat_count": 0.0, + "routers_loss": 0.07449393719434738, + "skip_count": 0.0, + "step": 56, + "text_loss": 0.48106974363327026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 0.2723803933078955, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000114, + "loss": 0.1123, + "macro_f1": 0.32098764181137085, + "num_tokens": 94987.0, + "repeat_count": 0.0, + "routers_loss": 0.07064720243215561, + "skip_count": 0.0, + "step": 58, + "text_loss": 0.3554874658584595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.2817728206633402, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.5390625, + "learning_rate": 0.000118, + "loss": 0.1234, + "macro_f1": 0.32098764181137085, + "num_tokens": 97909.0, + "repeat_count": 0.0, + "routers_loss": 0.16835889220237732, + "skip_count": 2.0, + "step": 60, + "text_loss": 0.5475804805755615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.29116524801878485, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000122, + "loss": 0.1224, + "macro_f1": 0.3333333432674408, + "num_tokens": 101043.0, + "repeat_count": 0.0, + "routers_loss": 0.06127442046999931, + "skip_count": 0.0, + "step": 62, + "text_loss": 0.5966938734054565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3005576753742295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000126, + "loss": 0.0931, + "macro_f1": 0.3333333432674408, + "num_tokens": 104103.0, + "repeat_count": 0.0, + "routers_loss": 0.047825805842876434, + "skip_count": 0.0, + "step": 64, + "text_loss": 0.5480486750602722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3099501027296742, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.00013000000000000002, + "loss": 0.1088, + "macro_f1": 0.3006536364555359, + "num_tokens": 107009.0, + "repeat_count": 1.0, + "routers_loss": 0.275174081325531, + "skip_count": 4.0, + "step": 66, + "text_loss": 0.41714492440223694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.31934253008511887, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.000134, + "loss": 0.1123, + "macro_f1": 0.3333333432674408, + "num_tokens": 110486.0, + "repeat_count": 0.0, + "routers_loss": 0.029025178402662277, + "skip_count": 0.0, + "step": 68, + "text_loss": 0.6775627732276917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3287349574405635, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.314453125, + "learning_rate": 0.00013800000000000002, + "loss": 0.1049, + "macro_f1": 0.3272727429866791, + "num_tokens": 113878.0, + "repeat_count": 0.0, + "routers_loss": 0.10141710191965103, + "skip_count": 1.0, + "step": 70, + "text_loss": 0.6678873896598816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.33812738479600823, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.248046875, + "learning_rate": 0.00014199999999999998, + "loss": 0.1119, + "macro_f1": 0.3272727429866791, + "num_tokens": 116989.0, + "repeat_count": 0.0, + "routers_loss": 0.08002066612243652, + "skip_count": 1.0, + "step": 72, + "text_loss": 0.405692994594574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3475198121514529, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1787109375, + "learning_rate": 0.000146, + "loss": 0.0944, + "macro_f1": 0.3144654333591461, + "num_tokens": 119883.0, + "repeat_count": 0.0, + "routers_loss": 0.1867009848356247, + "skip_count": 3.0, + "step": 74, + "text_loss": 0.44616150856018066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.35691223950689754, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.333984375, + "learning_rate": 0.00015, + "loss": 0.1003, + "macro_f1": 0.32098764181137085, + "num_tokens": 123325.0, + "repeat_count": 0.0, + "routers_loss": 0.07042168825864792, + "skip_count": 2.0, + "step": 76, + "text_loss": 0.11340200901031494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.36630466686234225, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26171875, + "learning_rate": 0.000154, + "loss": 0.1066, + "macro_f1": 0.32098764181137085, + "num_tokens": 126131.0, + "repeat_count": 0.0, + "routers_loss": 0.11535373330116272, + "skip_count": 2.0, + "step": 78, + "text_loss": 0.3269135355949402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3756970942177869, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.000158, + "loss": 0.0891, + "macro_f1": 0.3272727429866791, + "num_tokens": 130349.0, + "repeat_count": 0.0, + "routers_loss": 0.09497501701116562, + "skip_count": 1.0, + "step": 80, + "text_loss": 0.15273472666740417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.38508952157323156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000162, + "loss": 0.0929, + "macro_f1": 0.3333333432674408, + "num_tokens": 133607.0, + "repeat_count": 0.0, + "routers_loss": 0.030639523640275, + "skip_count": 0.0, + "step": 82, + "text_loss": 0.282884806394577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.3944819489286763, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.00016600000000000002, + "loss": 0.1254, + "macro_f1": 0.3272727429866791, + "num_tokens": 136694.0, + "repeat_count": 0.0, + "routers_loss": 0.07906441390514374, + "skip_count": 1.0, + "step": 84, + "text_loss": 0.459094375371933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.40387437628412093, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.00017, + "loss": 0.1071, + "macro_f1": 0.3144654333591461, + "num_tokens": 139966.0, + "repeat_count": 1.0, + "routers_loss": 0.1124570444226265, + "skip_count": 2.0, + "step": 86, + "text_loss": 0.29985448718070984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4132668036395656, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.000174, + "loss": 0.1031, + "macro_f1": 0.32098764181137085, + "num_tokens": 142788.0, + "repeat_count": 2.0, + "routers_loss": 0.1966402679681778, + "skip_count": 0.0, + "step": 88, + "text_loss": 0.6435291767120361 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4226592309950103, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.349609375, + "learning_rate": 0.000178, + "loss": 0.0963, + "macro_f1": 0.3333333432674408, + "num_tokens": 146192.0, + "repeat_count": 0.0, + "routers_loss": 0.0325632207095623, + "skip_count": 0.0, + "step": 90, + "text_loss": 0.35170626640319824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.43205165835045495, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2265625, + "learning_rate": 0.000182, + "loss": 0.1073, + "macro_f1": 0.32098764181137085, + "num_tokens": 149792.0, + "repeat_count": 1.0, + "routers_loss": 0.15115146338939667, + "skip_count": 1.0, + "step": 92, + "text_loss": 0.83159339427948 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4414440857058996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.000186, + "loss": 0.1073, + "macro_f1": 0.3333333432674408, + "num_tokens": 152766.0, + "repeat_count": 0.0, + "routers_loss": 0.043313540518283844, + "skip_count": 0.0, + "step": 94, + "text_loss": 0.49707934260368347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4508365130613443, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019, + "loss": 0.0947, + "macro_f1": 0.3333333432674408, + "num_tokens": 156112.0, + "repeat_count": 0.0, + "routers_loss": 0.032021280378103256, + "skip_count": 0.0, + "step": 96, + "text_loss": 0.27608928084373474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.46022894041678897, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2099609375, + "learning_rate": 0.000194, + "loss": 0.0846, + "macro_f1": 0.3076923191547394, + "num_tokens": 159454.0, + "repeat_count": 2.0, + "routers_loss": 0.24473154544830322, + "skip_count": 2.0, + "step": 98, + "text_loss": 0.6026689410209656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.4696213677722336, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.00019800000000000002, + "loss": 0.1028, + "macro_f1": 0.32098764181137085, + "num_tokens": 163661.0, + "repeat_count": 0.0, + "routers_loss": 0.11468276381492615, + "skip_count": 2.0, + "step": 100, + "text_loss": 0.46733155846595764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.47901379512767833, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000202, + "loss": 0.1089, + "macro_f1": 0.3333333432674408, + "num_tokens": 167134.0, + "repeat_count": 0.0, + "routers_loss": 0.021144939586520195, + "skip_count": 0.0, + "step": 102, + "text_loss": 0.6362994909286499 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.488406222483123, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000206, + "loss": 0.0621, + "macro_f1": 0.3272727429866791, + "num_tokens": 170433.0, + "repeat_count": 0.0, + "routers_loss": 0.06594710797071457, + "skip_count": 1.0, + "step": 104, + "text_loss": 0.4515477120876312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.49779864983856764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.00021, + "loss": 0.0929, + "macro_f1": 0.3333333432674408, + "num_tokens": 173387.0, + "repeat_count": 0.0, + "routers_loss": 0.032923027873039246, + "skip_count": 0.0, + "step": 106, + "text_loss": 0.6638453006744385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5071910771940124, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.240234375, + "learning_rate": 0.000214, + "loss": 0.0883, + "macro_f1": 0.3272727429866791, + "num_tokens": 176170.0, + "repeat_count": 1.0, + "routers_loss": 0.08034781366586685, + "skip_count": 0.0, + "step": 108, + "text_loss": 1.186936855316162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.516583504549457, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000218, + "loss": 0.0794, + "macro_f1": 0.3272727429866791, + "num_tokens": 179877.0, + "repeat_count": 0.0, + "routers_loss": 0.07814185321331024, + "skip_count": 1.0, + "step": 110, + "text_loss": 0.5488709211349487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5259759319049017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000222, + "loss": 0.0946, + "macro_f1": 0.3333333432674408, + "num_tokens": 182726.0, + "repeat_count": 0.0, + "routers_loss": 0.01884695515036583, + "skip_count": 0.0, + "step": 112, + "text_loss": 0.5195863842964172 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5353683592603463, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19921875, + "learning_rate": 0.00022600000000000002, + "loss": 0.0974, + "macro_f1": 0.32098764181137085, + "num_tokens": 185624.0, + "repeat_count": 0.0, + "routers_loss": 0.09657823294401169, + "skip_count": 2.0, + "step": 114, + "text_loss": 0.43858134746551514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3046875, + "learning_rate": 0.00023, + "loss": 0.0753, + "macro_f1": 0.3333333432674408, + "num_tokens": 188155.0, + "repeat_count": 0.0, + "routers_loss": 0.01463601179420948, + "skip_count": 0.0, + "step": 116, + "text_loss": 0.392981618642807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5541532139712357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.439453125, + "learning_rate": 0.00023400000000000002, + "loss": 0.0843, + "macro_f1": 0.3333333432674408, + "num_tokens": 190970.0, + "repeat_count": 0.0, + "routers_loss": 0.03859659656882286, + "skip_count": 0.0, + "step": 118, + "text_loss": 0.309179425239563 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5635456413266804, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2255859375, + "learning_rate": 0.00023799999999999998, + "loss": 0.053, + "macro_f1": 0.3333333432674408, + "num_tokens": 193988.0, + "repeat_count": 0.0, + "routers_loss": 0.019092386588454247, + "skip_count": 0.0, + "step": 120, + "text_loss": 0.48543134331703186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.572938068682125, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.35546875, + "learning_rate": 0.000242, + "loss": 0.1203, + "macro_f1": 0.3272727429866791, + "num_tokens": 196475.0, + "repeat_count": 0.0, + "routers_loss": 0.0619138665497303, + "skip_count": 1.0, + "step": 122, + "text_loss": 0.4615364074707031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5823304960375697, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1875, + "learning_rate": 0.000246, + "loss": 0.1002, + "macro_f1": 0.3272727429866791, + "num_tokens": 200045.0, + "repeat_count": 1.0, + "routers_loss": 0.09752107411623001, + "skip_count": 0.0, + "step": 124, + "text_loss": 0.15802054107189178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.5917229233930144, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.00025, + "loss": 0.0773, + "macro_f1": 0.3333333432674408, + "num_tokens": 203214.0, + "repeat_count": 0.0, + "routers_loss": 0.02896115928888321, + "skip_count": 0.0, + "step": 126, + "text_loss": 0.4543360471725464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.601115350748459, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.4296875, + "learning_rate": 0.000254, + "loss": 0.0973, + "macro_f1": 0.3333333432674408, + "num_tokens": 206168.0, + "repeat_count": 0.0, + "routers_loss": 0.011423567309975624, + "skip_count": 0.0, + "step": 128, + "text_loss": 0.4730179011821747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6105077781039038, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.365234375, + "learning_rate": 0.00025800000000000004, + "loss": 0.099, + "macro_f1": 0.3333333432674408, + "num_tokens": 209907.0, + "repeat_count": 0.0, + "routers_loss": 0.01957600563764572, + "skip_count": 0.0, + "step": 130, + "text_loss": 0.45122358202934265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6199002054593484, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.000262, + "loss": 0.0868, + "macro_f1": 0.3272727429866791, + "num_tokens": 213521.0, + "repeat_count": 0.0, + "routers_loss": 0.04882373288273811, + "skip_count": 1.0, + "step": 132, + "text_loss": 0.4341491758823395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6292926328147931, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.000266, + "loss": 0.0834, + "macro_f1": 0.3333333432674408, + "num_tokens": 216484.0, + "repeat_count": 0.0, + "routers_loss": 0.016083380207419395, + "skip_count": 0.0, + "step": 134, + "text_loss": 0.46990111470222473 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6386850601702377, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.00027, + "loss": 0.0863, + "macro_f1": 0.3333333432674408, + "num_tokens": 219398.0, + "repeat_count": 0.0, + "routers_loss": 0.01733536459505558, + "skip_count": 0.0, + "step": 136, + "text_loss": 0.4455361068248749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6480774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.00027400000000000005, + "loss": 0.0997, + "macro_f1": 0.3333333432674408, + "num_tokens": 222430.0, + "repeat_count": 0.0, + "routers_loss": 0.01332803163677454, + "skip_count": 0.0, + "step": 138, + "text_loss": 0.47699397802352905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.657469914881127, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.333984375, + "learning_rate": 0.00027800000000000004, + "loss": 0.0922, + "macro_f1": 0.3144654333591461, + "num_tokens": 225458.0, + "repeat_count": 1.0, + "routers_loss": 0.14924728870391846, + "skip_count": 2.0, + "step": 140, + "text_loss": 0.5858222842216492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6668623422365718, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.00028199999999999997, + "loss": 0.0798, + "macro_f1": 0.3144654333591461, + "num_tokens": 229365.0, + "repeat_count": 1.0, + "routers_loss": 0.1860177218914032, + "skip_count": 2.0, + "step": 142, + "text_loss": 0.5003137588500977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6762547695920165, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.00028599999999999996, + "loss": 0.054, + "macro_f1": 0.32098764181137085, + "num_tokens": 231787.0, + "repeat_count": 1.0, + "routers_loss": 0.16498211026191711, + "skip_count": 1.0, + "step": 144, + "text_loss": 0.5026470422744751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6856471969474611, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.00029, + "loss": 0.0936, + "macro_f1": 0.32098764181137085, + "num_tokens": 235014.0, + "repeat_count": 1.0, + "routers_loss": 0.11801310628652573, + "skip_count": 1.0, + "step": 146, + "text_loss": 0.611888587474823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.6950396243029058, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000294, + "loss": 0.0878, + "macro_f1": 0.3333333432674408, + "num_tokens": 238210.0, + "repeat_count": 0.0, + "routers_loss": 0.02422776259481907, + "skip_count": 0.0, + "step": 148, + "text_loss": 0.2876914143562317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7044320516583504, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.000298, + "loss": 0.0858, + "macro_f1": 0.32098764181137085, + "num_tokens": 241582.0, + "repeat_count": 0.0, + "routers_loss": 0.07282499223947525, + "skip_count": 2.0, + "step": 150, + "text_loss": 0.3919292390346527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7138244790137951, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.37890625, + "learning_rate": 0.000302, + "loss": 0.0797, + "macro_f1": 0.32098764181137085, + "num_tokens": 244621.0, + "repeat_count": 1.0, + "routers_loss": 0.20659038424491882, + "skip_count": 1.0, + "step": 152, + "text_loss": 0.4294498860836029 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7232169063692399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1787109375, + "learning_rate": 0.000306, + "loss": 0.072, + "macro_f1": 0.3333333432674408, + "num_tokens": 247833.0, + "repeat_count": 0.0, + "routers_loss": 0.02428400330245495, + "skip_count": 0.0, + "step": 154, + "text_loss": 0.5930765867233276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7326093337246845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.00031, + "loss": 0.0772, + "macro_f1": 0.3333333432674408, + "num_tokens": 251349.0, + "repeat_count": 0.0, + "routers_loss": 0.0167869683355093, + "skip_count": 0.0, + "step": 156, + "text_loss": 0.41063904762268066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7420017610801292, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.000314, + "loss": 0.0821, + "macro_f1": 0.3333333432674408, + "num_tokens": 254886.0, + "repeat_count": 0.0, + "routers_loss": 0.02531604655086994, + "skip_count": 0.0, + "step": 158, + "text_loss": 0.6739020347595215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7513941884355738, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.201171875, + "learning_rate": 0.00031800000000000003, + "loss": 0.09, + "macro_f1": 0.3333333432674408, + "num_tokens": 258260.0, + "repeat_count": 0.0, + "routers_loss": 0.017772775143384933, + "skip_count": 0.0, + "step": 160, + "text_loss": 0.46873849630355835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7607866157910185, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.224609375, + "learning_rate": 0.000322, + "loss": 0.0893, + "macro_f1": 0.3272727429866791, + "num_tokens": 261846.0, + "repeat_count": 0.0, + "routers_loss": 0.034902360290288925, + "skip_count": 1.0, + "step": 162, + "text_loss": 0.3727971017360687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7701790431464631, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000326, + "loss": 0.076, + "macro_f1": 0.3333333432674408, + "num_tokens": 264348.0, + "repeat_count": 0.0, + "routers_loss": 0.013553355820477009, + "skip_count": 0.0, + "step": 164, + "text_loss": 0.5798237323760986 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7795714705019078, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.408203125, + "learning_rate": 0.00033, + "loss": 0.0926, + "macro_f1": 0.32098764181137085, + "num_tokens": 267479.0, + "repeat_count": 1.0, + "routers_loss": 0.13571743667125702, + "skip_count": 1.0, + "step": 166, + "text_loss": 0.8084776997566223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7889638978573525, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2431640625, + "learning_rate": 0.00033400000000000004, + "loss": 0.0817, + "macro_f1": 0.32098764181137085, + "num_tokens": 270268.0, + "repeat_count": 2.0, + "routers_loss": 0.19884146749973297, + "skip_count": 0.0, + "step": 168, + "text_loss": 0.7366134524345398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.7983563252127972, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.00033800000000000003, + "loss": 0.1022, + "macro_f1": 0.32098764181137085, + "num_tokens": 273518.0, + "repeat_count": 1.0, + "routers_loss": 0.15469175577163696, + "skip_count": 1.0, + "step": 170, + "text_loss": 0.27204006910324097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8077487525682419, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000342, + "loss": 0.0865, + "macro_f1": 0.32098764181137085, + "num_tokens": 277210.0, + "repeat_count": 0.0, + "routers_loss": 0.08603330701589584, + "skip_count": 2.0, + "step": 172, + "text_loss": 0.7137667536735535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8171411799236865, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000346, + "loss": 0.0902, + "macro_f1": 0.3076923191547394, + "num_tokens": 280389.0, + "repeat_count": 0.0, + "routers_loss": 0.17851492762565613, + "skip_count": 4.0, + "step": 174, + "text_loss": 0.5148105621337891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8265336072791312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.00035, + "loss": 0.0853, + "macro_f1": 0.3333333432674408, + "num_tokens": 283501.0, + "repeat_count": 0.0, + "routers_loss": 0.021331604570150375, + "skip_count": 0.0, + "step": 176, + "text_loss": 0.301013320684433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8359260346345758, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000354, + "loss": 0.0911, + "macro_f1": 0.32098764181137085, + "num_tokens": 287154.0, + "repeat_count": 0.0, + "routers_loss": 0.057273946702480316, + "skip_count": 2.0, + "step": 178, + "text_loss": 0.4740981459617615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8453184619900206, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.240234375, + "learning_rate": 0.000358, + "loss": 0.0904, + "macro_f1": 0.3272727429866791, + "num_tokens": 289929.0, + "repeat_count": 0.0, + "routers_loss": 0.04116598889231682, + "skip_count": 1.0, + "step": 180, + "text_loss": 0.4838573932647705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8547108893454652, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.14453125, + "learning_rate": 0.000362, + "loss": 0.0991, + "macro_f1": 0.3333333432674408, + "num_tokens": 294293.0, + "repeat_count": 0.0, + "routers_loss": 0.027111956849694252, + "skip_count": 0.0, + "step": 182, + "text_loss": 0.7495553493499756 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8641033167009099, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.158203125, + "learning_rate": 0.000366, + "loss": 0.1038, + "macro_f1": 0.3333333432674408, + "num_tokens": 297730.0, + "repeat_count": 0.0, + "routers_loss": 0.019166452810168266, + "skip_count": 0.0, + "step": 184, + "text_loss": 0.534831166267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 0.8734957440563546, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2236328125, + "learning_rate": 0.00037, + "loss": 0.0784, + "macro_f1": 0.5427350401878357, + "num_tokens": 300593.0, + "repeat_count": 1.0, + "routers_loss": 0.2349659502506256, + "skip_count": 2.0, + "step": 186, + "text_loss": 0.3549048602581024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8828881714117992, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2041015625, + "learning_rate": 0.000374, + "loss": 0.0827, + "macro_f1": 0.3076923191547394, + "num_tokens": 303456.0, + "repeat_count": 2.0, + "routers_loss": 0.22502389550209045, + "skip_count": 2.0, + "step": 188, + "text_loss": 0.8837642073631287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.8922805987672439, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000378, + "loss": 0.1085, + "macro_f1": 0.3272727429866791, + "num_tokens": 306241.0, + "repeat_count": 1.0, + "routers_loss": 0.12291611731052399, + "skip_count": 0.0, + "step": 190, + "text_loss": 0.73353511095047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9016730261226886, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000382, + "loss": 0.0969, + "macro_f1": 0.3272727429866791, + "num_tokens": 310606.0, + "repeat_count": 0.0, + "routers_loss": 0.055988848209381104, + "skip_count": 1.0, + "step": 192, + "text_loss": 0.6261917352676392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9110654534781333, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.34375, + "learning_rate": 0.000386, + "loss": 0.1055, + "macro_f1": 0.3144654333591461, + "num_tokens": 313564.0, + "repeat_count": 0.0, + "routers_loss": 0.12363404780626297, + "skip_count": 3.0, + "step": 194, + "text_loss": 0.2790874242782593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9204578808335779, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.00039000000000000005, + "loss": 0.0964, + "macro_f1": 0.3076923191547394, + "num_tokens": 316958.0, + "repeat_count": 2.0, + "routers_loss": 0.2718356251716614, + "skip_count": 2.0, + "step": 196, + "text_loss": 0.14428086578845978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9298503081890226, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2021484375, + "learning_rate": 0.00039400000000000004, + "loss": 0.0917, + "macro_f1": 0.32098764181137085, + "num_tokens": 320103.0, + "repeat_count": 0.0, + "routers_loss": 0.07188102602958679, + "skip_count": 2.0, + "step": 198, + "text_loss": 0.27155816555023193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9392427355444672, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.201171875, + "learning_rate": 0.000398, + "loss": 0.0809, + "macro_f1": 0.32098764181137085, + "num_tokens": 323566.0, + "repeat_count": 1.0, + "routers_loss": 0.18038256466388702, + "skip_count": 1.0, + "step": 200, + "text_loss": 0.8453494310379028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9486351628999119, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.000402, + "loss": 0.0801, + "macro_f1": 0.3333333432674408, + "num_tokens": 326385.0, + "repeat_count": 0.0, + "routers_loss": 0.014639763161540031, + "skip_count": 0.0, + "step": 202, + "text_loss": 0.5733131766319275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9580275902553567, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21875, + "learning_rate": 0.00040600000000000006, + "loss": 0.104, + "macro_f1": 0.3333333432674408, + "num_tokens": 329266.0, + "repeat_count": 0.0, + "routers_loss": 0.015269627794623375, + "skip_count": 0.0, + "step": 204, + "text_loss": 0.7355639934539795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9674200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.00041, + "loss": 0.0833, + "macro_f1": 0.3333333432674408, + "num_tokens": 332984.0, + "repeat_count": 0.0, + "routers_loss": 0.018046971410512924, + "skip_count": 0.0, + "step": 206, + "text_loss": 0.587641179561615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.000414, + "loss": 0.0588, + "macro_f1": 0.3272727429866791, + "num_tokens": 335739.0, + "repeat_count": 1.0, + "routers_loss": 0.12791286408901215, + "skip_count": 0.0, + "step": 208, + "text_loss": 0.6538406610488892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9862048723216906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.24609375, + "learning_rate": 0.00041799999999999997, + "loss": 0.0732, + "macro_f1": 0.3272727429866791, + "num_tokens": 338966.0, + "repeat_count": 0.0, + "routers_loss": 0.050490595400333405, + "skip_count": 1.0, + "step": 210, + "text_loss": 0.4188295602798462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 0.9955972996771353, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.271484375, + "learning_rate": 0.000422, + "loss": 0.0588, + "macro_f1": 0.3144654333591461, + "num_tokens": 342063.0, + "repeat_count": 0.0, + "routers_loss": 0.11652113497257233, + "skip_count": 3.0, + "step": 212, + "text_loss": 0.21822240948677063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0046962136777224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.000426, + "loss": 0.0621, + "macro_f1": 0.3333333432674408, + "num_tokens": 344887.0, + "repeat_count": 0.0, + "routers_loss": 0.023898238316178322, + "skip_count": 0.0, + "step": 214, + "text_loss": 0.24692800641059875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.014088641033167, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3671875, + "learning_rate": 0.00043, + "loss": 0.1005, + "macro_f1": 0.3272727429866791, + "num_tokens": 348700.0, + "repeat_count": 1.0, + "routers_loss": 0.06414655596017838, + "skip_count": 0.0, + "step": 216, + "text_loss": 0.4744548797607422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0234810683886117, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.00043400000000000003, + "loss": 0.0753, + "macro_f1": 0.32098764181137085, + "num_tokens": 351507.0, + "repeat_count": 1.0, + "routers_loss": 0.11702914535999298, + "skip_count": 1.0, + "step": 218, + "text_loss": 0.5614864826202393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0328734957440564, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000438, + "loss": 0.0792, + "macro_f1": 0.3333333432674408, + "num_tokens": 354484.0, + "repeat_count": 0.0, + "routers_loss": 0.014991643838584423, + "skip_count": 0.0, + "step": 220, + "text_loss": 0.47209832072257996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.042265923099501, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.251953125, + "learning_rate": 0.000442, + "loss": 0.106, + "macro_f1": 0.3272727429866791, + "num_tokens": 357954.0, + "repeat_count": 0.0, + "routers_loss": 0.04747112840414047, + "skip_count": 1.0, + "step": 222, + "text_loss": 0.2968728244304657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0516583504549457, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.40234375, + "learning_rate": 0.000446, + "loss": 0.0853, + "macro_f1": 0.32098764181137085, + "num_tokens": 360547.0, + "repeat_count": 0.0, + "routers_loss": 0.06754162162542343, + "skip_count": 2.0, + "step": 224, + "text_loss": 0.2364148646593094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0610507778103904, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.00045000000000000004, + "loss": 0.1016, + "macro_f1": 0.3272727429866791, + "num_tokens": 364529.0, + "repeat_count": 0.0, + "routers_loss": 0.07830183953046799, + "skip_count": 1.0, + "step": 226, + "text_loss": 0.4787476360797882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.00045400000000000003, + "loss": 0.0792, + "macro_f1": 0.3333333432674408, + "num_tokens": 367683.0, + "repeat_count": 0.0, + "routers_loss": 0.015735948458313942, + "skip_count": 0.0, + "step": 228, + "text_loss": 0.37148505449295044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.000458, + "loss": 0.0995, + "macro_f1": 0.3333333432674408, + "num_tokens": 371402.0, + "repeat_count": 0.0, + "routers_loss": 0.013354359194636345, + "skip_count": 0.0, + "step": 230, + "text_loss": 0.7464763522148132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.0892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.000462, + "loss": 0.0731, + "macro_f1": 0.3333333432674408, + "num_tokens": 374587.0, + "repeat_count": 0.0, + "routers_loss": 0.013763721100986004, + "skip_count": 0.0, + "step": 232, + "text_loss": 0.8754443526268005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.098620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3984375, + "learning_rate": 0.00046600000000000005, + "loss": 0.0861, + "macro_f1": 0.3333333432674408, + "num_tokens": 377513.0, + "repeat_count": 0.0, + "routers_loss": 0.010075435042381287, + "skip_count": 0.0, + "step": 234, + "text_loss": 0.31534913182258606 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1080129145876136, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.00047, + "loss": 0.0791, + "macro_f1": 0.3272727429866791, + "num_tokens": 380736.0, + "repeat_count": 0.0, + "routers_loss": 0.059825167059898376, + "skip_count": 1.0, + "step": 236, + "text_loss": 0.5936337113380432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1174053419430585, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000474, + "loss": 0.0514, + "macro_f1": 0.32098764181137085, + "num_tokens": 383236.0, + "repeat_count": 0.0, + "routers_loss": 0.09134846180677414, + "skip_count": 2.0, + "step": 238, + "text_loss": 0.5976157784461975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1267977692985032, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.208984375, + "learning_rate": 0.00047799999999999996, + "loss": 0.0858, + "macro_f1": 0.32098764181137085, + "num_tokens": 385778.0, + "repeat_count": 1.0, + "routers_loss": 0.11989791691303253, + "skip_count": 1.0, + "step": 240, + "text_loss": 0.3554210960865021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1361901966539478, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.000482, + "loss": 0.0734, + "macro_f1": 0.3333333432674408, + "num_tokens": 388777.0, + "repeat_count": 0.0, + "routers_loss": 0.013591105118393898, + "skip_count": 0.0, + "step": 242, + "text_loss": 0.4829460382461548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1455826240093925, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12060546875, + "learning_rate": 0.000486, + "loss": 0.0625, + "macro_f1": 0.32098764181137085, + "num_tokens": 391797.0, + "repeat_count": 0.0, + "routers_loss": 0.0920003354549408, + "skip_count": 2.0, + "step": 244, + "text_loss": 0.3085818886756897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1549750513648371, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.00049, + "loss": 0.0501, + "macro_f1": 0.3333333432674408, + "num_tokens": 396485.0, + "repeat_count": 0.0, + "routers_loss": 0.0129330949857831, + "skip_count": 0.0, + "step": 246, + "text_loss": 0.42803969979286194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1643674787202818, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.296875, + "learning_rate": 0.000494, + "loss": 0.0945, + "macro_f1": 0.3144654333591461, + "num_tokens": 399923.0, + "repeat_count": 0.0, + "routers_loss": 0.10677755624055862, + "skip_count": 3.0, + "step": 248, + "text_loss": 0.2908555567264557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1737599060757264, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.000498, + "loss": 0.0812, + "macro_f1": 0.3144654333591461, + "num_tokens": 403647.0, + "repeat_count": 0.0, + "routers_loss": 0.1504337340593338, + "skip_count": 3.0, + "step": 250, + "text_loss": 0.333095908164978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.183152333431171, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.22265625, + "learning_rate": 0.0005020000000000001, + "loss": 0.0828, + "macro_f1": 0.32098764181137085, + "num_tokens": 409147.0, + "repeat_count": 0.0, + "routers_loss": 0.06503184884786606, + "skip_count": 2.0, + "step": 252, + "text_loss": 0.16117942333221436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.1925447607866158, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.287109375, + "learning_rate": 0.000506, + "loss": 0.0995, + "macro_f1": 0.3333333432674408, + "num_tokens": 412072.0, + "repeat_count": 0.0, + "routers_loss": 0.016280122101306915, + "skip_count": 0.0, + "step": 254, + "text_loss": 0.4217492640018463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2019371881420604, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.00051, + "loss": 0.0803, + "macro_f1": 0.3144654333591461, + "num_tokens": 415052.0, + "repeat_count": 2.0, + "routers_loss": 0.2117508500814438, + "skip_count": 1.0, + "step": 256, + "text_loss": 0.5795308947563171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.211329615497505, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2421875, + "learning_rate": 0.000514, + "loss": 0.0668, + "macro_f1": 0.3272727429866791, + "num_tokens": 418099.0, + "repeat_count": 1.0, + "routers_loss": 0.15002092719078064, + "skip_count": 0.0, + "step": 258, + "text_loss": 0.4840938448905945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2207220428529497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.000518, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 422526.0, + "repeat_count": 0.0, + "routers_loss": 0.012834074907004833, + "skip_count": 0.0, + "step": 260, + "text_loss": 0.36141225695610046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2301144702083944, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.000522, + "loss": 0.085, + "macro_f1": 0.3076923191547394, + "num_tokens": 425765.0, + "repeat_count": 2.0, + "routers_loss": 0.23808011412620544, + "skip_count": 2.0, + "step": 262, + "text_loss": 0.27572691440582275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2395068975638392, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000526, + "loss": 0.0708, + "macro_f1": 0.3272727429866791, + "num_tokens": 429048.0, + "repeat_count": 0.0, + "routers_loss": 0.055687375366687775, + "skip_count": 1.0, + "step": 264, + "text_loss": 0.37020301818847656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.248899324919284, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005300000000000001, + "loss": 0.0839, + "macro_f1": 0.3272727429866791, + "num_tokens": 431784.0, + "repeat_count": 0.0, + "routers_loss": 0.0872957780957222, + "skip_count": 1.0, + "step": 266, + "text_loss": 0.5937283039093018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2582917522747286, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.0005340000000000001, + "loss": 0.0733, + "macro_f1": 0.32098764181137085, + "num_tokens": 434297.0, + "repeat_count": 2.0, + "routers_loss": 0.23507654666900635, + "skip_count": 0.0, + "step": 268, + "text_loss": 0.3367372453212738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2676841796301732, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005380000000000001, + "loss": 0.0708, + "macro_f1": 0.32098764181137085, + "num_tokens": 437586.0, + "repeat_count": 0.0, + "routers_loss": 0.12860390543937683, + "skip_count": 2.0, + "step": 270, + "text_loss": 0.7149854302406311 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2770766069856179, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005420000000000001, + "loss": 0.1072, + "macro_f1": 0.3272727429866791, + "num_tokens": 440649.0, + "repeat_count": 0.0, + "routers_loss": 0.044308312237262726, + "skip_count": 1.0, + "step": 272, + "text_loss": 0.26778292655944824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.2864690343410625, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.44921875, + "learning_rate": 0.000546, + "loss": 0.0938, + "macro_f1": 0.3144654333591461, + "num_tokens": 443907.0, + "repeat_count": 0.0, + "routers_loss": 0.11514109373092651, + "skip_count": 3.0, + "step": 274, + "text_loss": 0.23578761518001556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 1.2958614616965072, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2578125, + "learning_rate": 0.00055, + "loss": 0.0932, + "macro_f1": 0.5492662787437439, + "num_tokens": 447147.0, + "repeat_count": 0.0, + "routers_loss": 0.055705297738313675, + "skip_count": 2.0, + "step": 276, + "text_loss": 0.2513524889945984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3052538890519518, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.29296875, + "learning_rate": 0.000554, + "loss": 0.0667, + "macro_f1": 0.32098764181137085, + "num_tokens": 450032.0, + "repeat_count": 0.0, + "routers_loss": 0.13778971135616302, + "skip_count": 2.0, + "step": 278, + "text_loss": 0.4857243597507477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3146463164073965, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.000558, + "loss": 0.0672, + "macro_f1": 0.3272727429866791, + "num_tokens": 453195.0, + "repeat_count": 1.0, + "routers_loss": 0.0700262188911438, + "skip_count": 0.0, + "step": 280, + "text_loss": 0.7589789628982544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3240387437628411, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.0005620000000000001, + "loss": 0.0603, + "macro_f1": 0.3144654333591461, + "num_tokens": 455942.0, + "repeat_count": 1.0, + "routers_loss": 0.11706235259771347, + "skip_count": 2.0, + "step": 282, + "text_loss": 0.4783432185649872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3334311711182858, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.265625, + "learning_rate": 0.000566, + "loss": 0.0793, + "macro_f1": 0.3272727429866791, + "num_tokens": 458932.0, + "repeat_count": 0.0, + "routers_loss": 0.07073967158794403, + "skip_count": 1.0, + "step": 284, + "text_loss": 0.7117193937301636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3428235984737307, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.00057, + "loss": 0.0915, + "macro_f1": 0.3272727429866791, + "num_tokens": 462650.0, + "repeat_count": 0.0, + "routers_loss": 0.05301115661859512, + "skip_count": 1.0, + "step": 286, + "text_loss": 0.4175460636615753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.352216025829175, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000574, + "loss": 0.0675, + "macro_f1": 0.3272727429866791, + "num_tokens": 466290.0, + "repeat_count": 0.0, + "routers_loss": 0.06356479972600937, + "skip_count": 1.0, + "step": 288, + "text_loss": 0.5832946300506592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.36160845318462, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28515625, + "learning_rate": 0.000578, + "loss": 0.0805, + "macro_f1": 0.3006536066532135, + "num_tokens": 469296.0, + "repeat_count": 1.0, + "routers_loss": 0.21032999455928802, + "skip_count": 3.0, + "step": 290, + "text_loss": 0.36023473739624023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3710008805400646, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.27734375, + "learning_rate": 0.0005819999999999999, + "loss": 0.0685, + "macro_f1": 0.32098764181137085, + "num_tokens": 472272.0, + "repeat_count": 1.0, + "routers_loss": 0.08062280714511871, + "skip_count": 1.0, + "step": 292, + "text_loss": 0.37197956442832947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3803933078955093, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0005859999999999999, + "loss": 0.0878, + "macro_f1": 0.32098764181137085, + "num_tokens": 475864.0, + "repeat_count": 0.0, + "routers_loss": 0.05023600533604622, + "skip_count": 2.0, + "step": 294, + "text_loss": 0.4765273630619049 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.389785735250954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2177734375, + "learning_rate": 0.00059, + "loss": 0.0728, + "macro_f1": 0.3333333432674408, + "num_tokens": 478916.0, + "repeat_count": 0.0, + "routers_loss": 0.011689410544931889, + "skip_count": 0.0, + "step": 296, + "text_loss": 0.5878773927688599 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.3991781626063986, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000594, + "loss": 0.0727, + "macro_f1": 0.3333333432674408, + "num_tokens": 482369.0, + "repeat_count": 0.0, + "routers_loss": 0.010772093199193478, + "skip_count": 0.0, + "step": 298, + "text_loss": 0.4424116313457489 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4085705899618433, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.181640625, + "learning_rate": 0.000598, + "loss": 0.0787, + "macro_f1": 0.3076923191547394, + "num_tokens": 486049.0, + "repeat_count": 2.0, + "routers_loss": 0.23482851684093475, + "skip_count": 2.0, + "step": 300, + "text_loss": 0.21217775344848633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.417963017317288, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.000602, + "loss": 0.073, + "macro_f1": 0.3076923191547394, + "num_tokens": 488683.0, + "repeat_count": 1.0, + "routers_loss": 0.18843084573745728, + "skip_count": 3.0, + "step": 302, + "text_loss": 0.2109498232603073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4273554446727326, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.279296875, + "learning_rate": 0.000606, + "loss": 0.0945, + "macro_f1": 0.3144654333591461, + "num_tokens": 492010.0, + "repeat_count": 0.0, + "routers_loss": 0.17861786484718323, + "skip_count": 3.0, + "step": 304, + "text_loss": 0.8446305394172668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4367478720281772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.00061, + "loss": 0.0827, + "macro_f1": 0.3333333432674408, + "num_tokens": 494764.0, + "repeat_count": 0.0, + "routers_loss": 0.014124520123004913, + "skip_count": 0.0, + "step": 306, + "text_loss": 0.742735743522644 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4461402993836219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.000614, + "loss": 0.1071, + "macro_f1": 0.3333333432674408, + "num_tokens": 497820.0, + "repeat_count": 0.0, + "routers_loss": 0.017968112602829933, + "skip_count": 0.0, + "step": 308, + "text_loss": 0.28305482864379883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4555327267390665, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1689453125, + "learning_rate": 0.0006180000000000001, + "loss": 0.0775, + "macro_f1": 0.32098764181137085, + "num_tokens": 500694.0, + "repeat_count": 0.0, + "routers_loss": 0.08593655377626419, + "skip_count": 2.0, + "step": 310, + "text_loss": 0.3496848940849304 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4649251540945114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19140625, + "learning_rate": 0.000622, + "loss": 0.061, + "macro_f1": 0.3333333432674408, + "num_tokens": 503871.0, + "repeat_count": 0.0, + "routers_loss": 0.016449492424726486, + "skip_count": 0.0, + "step": 312, + "text_loss": 0.6691372990608215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4743175814499558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.000626, + "loss": 0.0815, + "macro_f1": 0.3333333432674408, + "num_tokens": 506730.0, + "repeat_count": 0.0, + "routers_loss": 0.014532964676618576, + "skip_count": 0.0, + "step": 314, + "text_loss": 0.6118118166923523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4837100088054007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2216796875, + "learning_rate": 0.00063, + "loss": 0.0742, + "macro_f1": 0.3333333432674408, + "num_tokens": 510323.0, + "repeat_count": 0.0, + "routers_loss": 0.013093139044940472, + "skip_count": 0.0, + "step": 316, + "text_loss": 0.38126271963119507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.4931024361608454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.400390625, + "learning_rate": 0.000634, + "loss": 0.0915, + "macro_f1": 0.3333333432674408, + "num_tokens": 514075.0, + "repeat_count": 0.0, + "routers_loss": 0.008627045899629593, + "skip_count": 0.0, + "step": 318, + "text_loss": 0.5983037948608398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000638, + "loss": 0.1008, + "macro_f1": 0.3272727429866791, + "num_tokens": 517418.0, + "repeat_count": 0.0, + "routers_loss": 0.04561378434300423, + "skip_count": 1.0, + "step": 320, + "text_loss": 0.767257034778595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.5118872908717347, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.259765625, + "learning_rate": 0.000642, + "loss": 0.0926, + "macro_f1": 0.3272727429866791, + "num_tokens": 520443.0, + "repeat_count": 0.0, + "routers_loss": 0.024372953921556473, + "skip_count": 0.0, + "step": 322, + "text_loss": 0.6572105884552002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5212797182271793, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30078125, + "learning_rate": 0.000646, + "loss": 0.0822, + "macro_f1": 0.3272727429866791, + "num_tokens": 523317.0, + "repeat_count": 1.0, + "routers_loss": 0.08099937438964844, + "skip_count": 0.0, + "step": 324, + "text_loss": 0.205499529838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.530672145582624, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.0006500000000000001, + "loss": 0.0809, + "macro_f1": 0.32098767161369324, + "num_tokens": 526355.0, + "repeat_count": 0.0, + "routers_loss": 0.0657225176692009, + "skip_count": 1.0, + "step": 326, + "text_loss": 0.2587239742279053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5400645729380686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.111328125, + "learning_rate": 0.0006540000000000001, + "loss": 0.0779, + "macro_f1": 0.3333333432674408, + "num_tokens": 529689.0, + "repeat_count": 0.0, + "routers_loss": 0.01849208027124405, + "skip_count": 0.0, + "step": 328, + "text_loss": 0.2172023057937622 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5494570002935135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1845703125, + "learning_rate": 0.0006580000000000001, + "loss": 0.0758, + "macro_f1": 0.3333333432674408, + "num_tokens": 532603.0, + "repeat_count": 0.0, + "routers_loss": 0.016184113919734955, + "skip_count": 0.0, + "step": 330, + "text_loss": 0.5980568528175354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.000662, + "loss": 0.0439, + "macro_f1": 0.3333333432674408, + "num_tokens": 536056.0, + "repeat_count": 0.0, + "routers_loss": 0.01303898449987173, + "skip_count": 0.0, + "step": 332, + "text_loss": 0.5421966314315796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 1.5682418550044028, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.296875, + "learning_rate": 0.000666, + "loss": 0.0963, + "macro_f1": 0.465986430644989, + "num_tokens": 539231.0, + "repeat_count": 3.0, + "routers_loss": 0.3075675964355469, + "skip_count": 3.0, + "step": 334, + "text_loss": 0.19719554483890533 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5776342823598473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.00067, + "loss": 0.0706, + "macro_f1": 0.3333333432674408, + "num_tokens": 542038.0, + "repeat_count": 0.0, + "routers_loss": 0.009116224013268948, + "skip_count": 0.0, + "step": 336, + "text_loss": 0.3407036066055298 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5870267097152921, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2421875, + "learning_rate": 0.000674, + "loss": 0.0768, + "macro_f1": 0.3333333432674408, + "num_tokens": 545019.0, + "repeat_count": 0.0, + "routers_loss": 0.021463042125105858, + "skip_count": 0.0, + "step": 338, + "text_loss": 0.24486012756824493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.5964191370707366, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.0006780000000000001, + "loss": 0.0889, + "macro_f1": 0.3333333432674408, + "num_tokens": 548036.0, + "repeat_count": 0.0, + "routers_loss": 0.01857556402683258, + "skip_count": 0.0, + "step": 340, + "text_loss": 0.28140124678611755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6058115644261814, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0006820000000000001, + "loss": 0.0617, + "macro_f1": 0.3006536364555359, + "num_tokens": 551419.0, + "repeat_count": 2.0, + "routers_loss": 0.27090007066726685, + "skip_count": 3.0, + "step": 342, + "text_loss": 0.20690307021141052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.615203991781626, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3046875, + "learning_rate": 0.0006860000000000001, + "loss": 0.1047, + "macro_f1": 0.32098764181137085, + "num_tokens": 554037.0, + "repeat_count": 0.0, + "routers_loss": 0.09231195598840714, + "skip_count": 2.0, + "step": 344, + "text_loss": 0.4479128420352936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6245964191370708, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.00069, + "loss": 0.0883, + "macro_f1": 0.3333333432674408, + "num_tokens": 556672.0, + "repeat_count": 0.0, + "routers_loss": 0.00935924518853426, + "skip_count": 0.0, + "step": 346, + "text_loss": 0.6377320289611816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6339888464925154, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.000694, + "loss": 0.0781, + "macro_f1": 0.32098764181137085, + "num_tokens": 559756.0, + "repeat_count": 0.0, + "routers_loss": 0.17641772329807281, + "skip_count": 2.0, + "step": 348, + "text_loss": 0.6097636222839355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 1.64338127384796, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.30078125, + "learning_rate": 0.0006979999999999999, + "loss": 0.0616, + "macro_f1": 0.5492662787437439, + "num_tokens": 563415.0, + "repeat_count": 0.0, + "routers_loss": 0.06240406632423401, + "skip_count": 2.0, + "step": 350, + "text_loss": 0.5291631817817688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6527737012034047, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.296875, + "learning_rate": 0.0007019999999999999, + "loss": 0.1026, + "macro_f1": 0.3333333432674408, + "num_tokens": 566357.0, + "repeat_count": 0.0, + "routers_loss": 0.012269247323274612, + "skip_count": 0.0, + "step": 352, + "text_loss": 0.5170195698738098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6621661285588494, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0007059999999999999, + "loss": 0.0815, + "macro_f1": 0.32098764181137085, + "num_tokens": 569449.0, + "repeat_count": 0.0, + "routers_loss": 0.07515309751033783, + "skip_count": 2.0, + "step": 354, + "text_loss": 0.34507250785827637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6715585559142943, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.00071, + "loss": 0.0791, + "macro_f1": 0.3144654333591461, + "num_tokens": 572761.0, + "repeat_count": 1.0, + "routers_loss": 0.20768006145954132, + "skip_count": 2.0, + "step": 356, + "text_loss": 0.3158532381057739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.6809509832697387, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.000714, + "loss": 0.0682, + "macro_f1": 0.3333333432674408, + "num_tokens": 575909.0, + "repeat_count": 0.0, + "routers_loss": 0.025329967960715294, + "skip_count": 0.0, + "step": 358, + "text_loss": 0.21455390751361847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.6903434106251836, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.000718, + "loss": 0.0775, + "macro_f1": 0.32098767161369324, + "num_tokens": 579186.0, + "repeat_count": 1.0, + "routers_loss": 0.07676175981760025, + "skip_count": 0.0, + "step": 360, + "text_loss": 0.61895352602005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.699735837980628, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000722, + "loss": 0.0781, + "macro_f1": 0.32098767161369324, + "num_tokens": 582437.0, + "repeat_count": 0.0, + "routers_loss": 0.08070661872625351, + "skip_count": 1.0, + "step": 362, + "text_loss": 0.20557661354541779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7091282653360729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2216796875, + "learning_rate": 0.000726, + "loss": 0.11, + "macro_f1": 0.3333333432674408, + "num_tokens": 586096.0, + "repeat_count": 0.0, + "routers_loss": 0.015891313552856445, + "skip_count": 0.0, + "step": 364, + "text_loss": 0.597991943359375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7185206926915173, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.00073, + "loss": 0.0573, + "macro_f1": 0.3076923191547394, + "num_tokens": 589520.0, + "repeat_count": 1.0, + "routers_loss": 0.12844261527061462, + "skip_count": 3.0, + "step": 366, + "text_loss": 0.2944789230823517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7279131200469622, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.000734, + "loss": 0.1005, + "macro_f1": 0.3333333432674408, + "num_tokens": 592691.0, + "repeat_count": 0.0, + "routers_loss": 0.02382199838757515, + "skip_count": 0.0, + "step": 368, + "text_loss": 0.23989969491958618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7373055474024068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.000738, + "loss": 0.0661, + "macro_f1": 0.3333333432674408, + "num_tokens": 596004.0, + "repeat_count": 0.0, + "routers_loss": 0.018812084570527077, + "skip_count": 0.0, + "step": 370, + "text_loss": 0.22111408412456512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7466979747578515, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.000742, + "loss": 0.0666, + "macro_f1": 0.3272727429866791, + "num_tokens": 599087.0, + "repeat_count": 0.0, + "routers_loss": 0.08290331065654755, + "skip_count": 1.0, + "step": 372, + "text_loss": 0.2567356526851654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7560904021132961, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2412109375, + "learning_rate": 0.000746, + "loss": 0.0941, + "macro_f1": 0.32098764181137085, + "num_tokens": 602330.0, + "repeat_count": 1.0, + "routers_loss": 0.11482042074203491, + "skip_count": 1.0, + "step": 374, + "text_loss": 0.7217292785644531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.7654828294687408, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2265625, + "learning_rate": 0.00075, + "loss": 0.0728, + "macro_f1": 0.3272727429866791, + "num_tokens": 605503.0, + "repeat_count": 1.0, + "routers_loss": 0.11849870532751083, + "skip_count": 0.0, + "step": 376, + "text_loss": 0.5122153759002686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 1.7748752568241855, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2333984375, + "learning_rate": 0.000754, + "loss": 0.0835, + "macro_f1": 0.32098767161369324, + "num_tokens": 608505.0, + "repeat_count": 0.0, + "routers_loss": 0.07090992480516434, + "skip_count": 1.0, + "step": 378, + "text_loss": 0.2204965502023697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.78426768417963, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.000758, + "loss": 0.0794, + "macro_f1": 0.3272727429866791, + "num_tokens": 611193.0, + "repeat_count": 0.0, + "routers_loss": 0.03812089189887047, + "skip_count": 1.0, + "step": 380, + "text_loss": 0.44909021258354187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.793660111535075, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1689453125, + "learning_rate": 0.000762, + "loss": 0.0882, + "macro_f1": 0.3272727429866791, + "num_tokens": 614231.0, + "repeat_count": 1.0, + "routers_loss": 0.10270529240369797, + "skip_count": 0.0, + "step": 382, + "text_loss": 0.13624964654445648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8030525388905194, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.330078125, + "learning_rate": 0.0007660000000000001, + "loss": 0.1107, + "macro_f1": 0.32098764181137085, + "num_tokens": 617090.0, + "repeat_count": 1.0, + "routers_loss": 0.11624004691839218, + "skip_count": 1.0, + "step": 384, + "text_loss": 0.7314052581787109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8124449662459643, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0007700000000000001, + "loss": 0.0628, + "macro_f1": 0.32098764181137085, + "num_tokens": 620596.0, + "repeat_count": 0.0, + "routers_loss": 0.07114322483539581, + "skip_count": 2.0, + "step": 386, + "text_loss": 0.503322958946228 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8218373936014087, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.0007740000000000001, + "loss": 0.0829, + "macro_f1": 0.32098764181137085, + "num_tokens": 624108.0, + "repeat_count": 0.0, + "routers_loss": 0.06061873584985733, + "skip_count": 2.0, + "step": 388, + "text_loss": 0.11481904983520508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8312298209568536, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2099609375, + "learning_rate": 0.000778, + "loss": 0.0791, + "macro_f1": 0.3006536364555359, + "num_tokens": 626895.0, + "repeat_count": 1.0, + "routers_loss": 0.2921771705150604, + "skip_count": 4.0, + "step": 390, + "text_loss": 0.3069624602794647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8406222483122983, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30859375, + "learning_rate": 0.000782, + "loss": 0.0605, + "macro_f1": 0.3076923191547394, + "num_tokens": 630204.0, + "repeat_count": 0.0, + "routers_loss": 0.202707901597023, + "skip_count": 4.0, + "step": 392, + "text_loss": 0.6022785305976868 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.29296875, + "learning_rate": 0.000786, + "loss": 0.0877, + "macro_f1": 0.3333333432674408, + "num_tokens": 634373.0, + "repeat_count": 0.0, + "routers_loss": 0.0221510399132967, + "skip_count": 0.0, + "step": 394, + "text_loss": 0.26787394285202026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8594071030231876, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.37890625, + "learning_rate": 0.00079, + "loss": 0.0805, + "macro_f1": 0.32098764181137085, + "num_tokens": 637442.0, + "repeat_count": 2.0, + "routers_loss": 0.12636390328407288, + "skip_count": 0.0, + "step": 396, + "text_loss": 0.2799781560897827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8687995303786322, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2080078125, + "learning_rate": 0.0007940000000000001, + "loss": 0.0724, + "macro_f1": 0.32098764181137085, + "num_tokens": 641231.0, + "repeat_count": 0.0, + "routers_loss": 0.07933453470468521, + "skip_count": 2.0, + "step": 398, + "text_loss": 0.2507784366607666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8781919577340769, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.0007980000000000001, + "loss": 0.0909, + "macro_f1": 0.3272727429866791, + "num_tokens": 644560.0, + "repeat_count": 1.0, + "routers_loss": 0.10324911028146744, + "skip_count": 0.0, + "step": 400, + "text_loss": 0.7756280303001404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8875843850895215, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0008020000000000001, + "loss": 0.0783, + "macro_f1": 0.3144654333591461, + "num_tokens": 647393.0, + "repeat_count": 1.0, + "routers_loss": 0.18546262383460999, + "skip_count": 2.0, + "step": 402, + "text_loss": 0.5013328194618225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.8969768124449664, + "f1_execute": 0.8571428656578064, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0008060000000000001, + "loss": 0.0787, + "macro_f1": 0.2857142984867096, + "num_tokens": 650355.0, + "repeat_count": 3.0, + "routers_loss": 0.3280293643474579, + "skip_count": 4.0, + "step": 404, + "text_loss": 0.2842077314853668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9063692398004108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.0008100000000000001, + "loss": 0.0901, + "macro_f1": 0.3333333432674408, + "num_tokens": 654280.0, + "repeat_count": 0.0, + "routers_loss": 0.02623247355222702, + "skip_count": 0.0, + "step": 406, + "text_loss": 0.46742817759513855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9157616671558557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.0008139999999999999, + "loss": 0.0945, + "macro_f1": 0.3333333432674408, + "num_tokens": 657568.0, + "repeat_count": 0.0, + "routers_loss": 0.009744114242494106, + "skip_count": 0.0, + "step": 408, + "text_loss": 0.7168047428131104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9251540945113002, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.0008179999999999999, + "loss": 0.1065, + "macro_f1": 0.32098764181137085, + "num_tokens": 660593.0, + "repeat_count": 0.0, + "routers_loss": 0.07591600716114044, + "skip_count": 2.0, + "step": 410, + "text_loss": 0.449823260307312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0008219999999999999, + "loss": 0.0795, + "macro_f1": 0.3333333432674408, + "num_tokens": 663916.0, + "repeat_count": 0.0, + "routers_loss": 0.02076602540910244, + "skip_count": 0.0, + "step": 412, + "text_loss": 0.4764713943004608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9439389492221895, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.000826, + "loss": 0.0836, + "macro_f1": 0.3272727429866791, + "num_tokens": 667502.0, + "repeat_count": 0.0, + "routers_loss": 0.049170155078172684, + "skip_count": 1.0, + "step": 414, + "text_loss": 0.30333325266838074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9533313765776343, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.00083, + "loss": 0.1021, + "macro_f1": 0.3272727429866791, + "num_tokens": 670510.0, + "repeat_count": 1.0, + "routers_loss": 0.15554003417491913, + "skip_count": 0.0, + "step": 416, + "text_loss": 0.3691870868206024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000834, + "loss": 0.1013, + "macro_f1": 0.3333333432674408, + "num_tokens": 674761.0, + "repeat_count": 0.0, + "routers_loss": 0.024516675621271133, + "skip_count": 0.0, + "step": 418, + "text_loss": 0.32850381731987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9721162312885236, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.000838, + "loss": 0.0649, + "macro_f1": 0.3333333432674408, + "num_tokens": 678055.0, + "repeat_count": 0.0, + "routers_loss": 0.011026890948414803, + "skip_count": 0.0, + "step": 420, + "text_loss": 0.6637290716171265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.9815086586439683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000842, + "loss": 0.0771, + "macro_f1": 0.3272727429866791, + "num_tokens": 680979.0, + "repeat_count": 0.0, + "routers_loss": 0.07451887428760529, + "skip_count": 1.0, + "step": 422, + "text_loss": 0.27131685614585876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 1.990901085999413, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000846, + "loss": 0.0714, + "macro_f1": 0.32098764181137085, + "num_tokens": 684144.0, + "repeat_count": 1.0, + "routers_loss": 0.11341800540685654, + "skip_count": 1.0, + "step": 424, + "text_loss": 0.652126669883728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.00085, + "loss": 0.0754, + "macro_f1": 0.3272727429866791, + "num_tokens": 687004.0, + "repeat_count": 1.0, + "routers_loss": 0.08985847979784012, + "skip_count": 0.0, + "step": 426, + "text_loss": 0.2589428424835205 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23828125, + "learning_rate": 0.000854, + "loss": 0.0866, + "macro_f1": 0.3333333432674408, + "num_tokens": 689702.0, + "repeat_count": 0.0, + "routers_loss": 0.011355436407029629, + "skip_count": 0.0, + "step": 428, + "text_loss": 0.8909716010093689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0187848547108893, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.000858, + "loss": 0.0623, + "macro_f1": 0.3333333432674408, + "num_tokens": 692698.0, + "repeat_count": 0.0, + "routers_loss": 0.013788948766887188, + "skip_count": 0.0, + "step": 430, + "text_loss": 0.19141142070293427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.028177282066334, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.000862, + "loss": 0.0499, + "macro_f1": 0.32098764181137085, + "num_tokens": 696007.0, + "repeat_count": 0.0, + "routers_loss": 0.07998392730951309, + "skip_count": 2.0, + "step": 432, + "text_loss": 0.1611809879541397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0375697094217786, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.000866, + "loss": 0.0541, + "macro_f1": 0.32098764181137085, + "num_tokens": 700271.0, + "repeat_count": 0.0, + "routers_loss": 0.06988382339477539, + "skip_count": 2.0, + "step": 434, + "text_loss": 0.37254223227500916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0469621367772235, + "f1_execute": 0.8333333730697632, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.00087, + "loss": 0.0834, + "macro_f1": 0.2777777910232544, + "num_tokens": 703519.0, + "repeat_count": 3.0, + "routers_loss": 0.28240787982940674, + "skip_count": 5.0, + "step": 436, + "text_loss": 0.29636648297309875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.056354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.423828125, + "learning_rate": 0.000874, + "loss": 0.0657, + "macro_f1": 0.3333333432674408, + "num_tokens": 706826.0, + "repeat_count": 0.0, + "routers_loss": 0.013924967497587204, + "skip_count": 0.0, + "step": 438, + "text_loss": 0.20867908000946045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.065746991488113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.000878, + "loss": 0.0657, + "macro_f1": 0.3333333432674408, + "num_tokens": 710530.0, + "repeat_count": 0.0, + "routers_loss": 0.01170142088085413, + "skip_count": 0.0, + "step": 440, + "text_loss": 0.7273373007774353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.0751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.000882, + "loss": 0.076, + "macro_f1": 0.3333333432674408, + "num_tokens": 713503.0, + "repeat_count": 0.0, + "routers_loss": 0.011930872686207294, + "skip_count": 0.0, + "step": 442, + "text_loss": 0.39314430952072144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.084531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0008860000000000001, + "loss": 0.0592, + "macro_f1": 0.3333333432674408, + "num_tokens": 716582.0, + "repeat_count": 0.0, + "routers_loss": 0.008630385622382164, + "skip_count": 0.0, + "step": 444, + "text_loss": 0.5925271511077881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.0939242735544465, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0008900000000000001, + "loss": 0.0811, + "macro_f1": 0.3006536066532135, + "num_tokens": 719941.0, + "repeat_count": 3.0, + "routers_loss": 0.3015584945678711, + "skip_count": 1.0, + "step": 446, + "text_loss": 0.5059905052185059 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.1033167009098914, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.000894, + "loss": 0.0822, + "macro_f1": 0.31446540355682373, + "num_tokens": 723113.0, + "repeat_count": 1.0, + "routers_loss": 0.10897493362426758, + "skip_count": 1.0, + "step": 448, + "text_loss": 0.19616436958312988 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.112709128265336, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.33984375, + "learning_rate": 0.000898, + "loss": 0.0782, + "macro_f1": 0.32098764181137085, + "num_tokens": 726193.0, + "repeat_count": 0.0, + "routers_loss": 0.07236456125974655, + "skip_count": 2.0, + "step": 450, + "text_loss": 0.1773054152727127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1221015556207807, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3203125, + "learning_rate": 0.000902, + "loss": 0.058, + "macro_f1": 0.3272727429866791, + "num_tokens": 729275.0, + "repeat_count": 1.0, + "routers_loss": 0.08184371143579483, + "skip_count": 0.0, + "step": 452, + "text_loss": 0.4927310049533844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1314939829762256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.000906, + "loss": 0.0607, + "macro_f1": 0.3333333432674408, + "num_tokens": 731948.0, + "repeat_count": 0.0, + "routers_loss": 0.014033539220690727, + "skip_count": 0.0, + "step": 454, + "text_loss": 0.4745742678642273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.00091, + "loss": 0.0651, + "macro_f1": 0.3333333432674408, + "num_tokens": 735351.0, + "repeat_count": 0.0, + "routers_loss": 0.0071774693205952644, + "skip_count": 0.0, + "step": 456, + "text_loss": 0.18523462116718292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.150278837687115, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.400390625, + "learning_rate": 0.0009140000000000001, + "loss": 0.0738, + "macro_f1": 0.5492662787437439, + "num_tokens": 738587.0, + "repeat_count": 0.0, + "routers_loss": 0.07781517505645752, + "skip_count": 2.0, + "step": 458, + "text_loss": 0.3459635376930237 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 26.0, + "epoch": 2.1596712650425594, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0009180000000000001, + "loss": 0.0723, + "macro_f1": 0.3076923191547394, + "num_tokens": 741779.0, + "repeat_count": 0.0, + "routers_loss": 0.09529037028551102, + "skip_count": 2.0, + "step": 460, + "text_loss": 0.20197433233261108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1690636923980042, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.0009220000000000001, + "loss": 0.0519, + "macro_f1": 0.3333333432674408, + "num_tokens": 745355.0, + "repeat_count": 0.0, + "routers_loss": 0.009765669703483582, + "skip_count": 0.0, + "step": 462, + "text_loss": 0.7031404376029968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1784561197534487, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009260000000000001, + "loss": 0.0527, + "macro_f1": 0.3272727429866791, + "num_tokens": 748628.0, + "repeat_count": 0.0, + "routers_loss": 0.03344850242137909, + "skip_count": 1.0, + "step": 464, + "text_loss": 0.21274663507938385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.1878485471088935, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.00093, + "loss": 0.0534, + "macro_f1": 0.3076923191547394, + "num_tokens": 751472.0, + "repeat_count": 2.0, + "routers_loss": 0.1354292333126068, + "skip_count": 2.0, + "step": 466, + "text_loss": 0.5350717306137085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.197240974464338, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.000934, + "loss": 0.0598, + "macro_f1": 0.3272727429866791, + "num_tokens": 754479.0, + "repeat_count": 0.0, + "routers_loss": 0.056420840322971344, + "skip_count": 1.0, + "step": 468, + "text_loss": 0.28153330087661743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.206633401819783, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.234375, + "learning_rate": 0.0009379999999999999, + "loss": 0.0597, + "macro_f1": 0.31446540355682373, + "num_tokens": 757872.0, + "repeat_count": 1.0, + "routers_loss": 0.1622387170791626, + "skip_count": 1.0, + "step": 470, + "text_loss": 0.22956843674182892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2160258291752273, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.5, + "learning_rate": 0.000942, + "loss": 0.0953, + "macro_f1": 0.32098764181137085, + "num_tokens": 760468.0, + "repeat_count": 0.0, + "routers_loss": 0.05146972835063934, + "skip_count": 2.0, + "step": 472, + "text_loss": 0.4513966739177704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.225418256530672, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.000946, + "loss": 0.0592, + "macro_f1": 0.3272727429866791, + "num_tokens": 763519.0, + "repeat_count": 1.0, + "routers_loss": 0.09022669494152069, + "skip_count": 0.0, + "step": 474, + "text_loss": 0.25758957862854004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.234810683886117, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.00095, + "loss": 0.0498, + "macro_f1": 0.3272727429866791, + "num_tokens": 767391.0, + "repeat_count": 0.0, + "routers_loss": 0.03044828027486801, + "skip_count": 1.0, + "step": 476, + "text_loss": 0.21366681158542633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2442031112415615, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.291015625, + "learning_rate": 0.000954, + "loss": 0.0802, + "macro_f1": 0.3272727429866791, + "num_tokens": 770338.0, + "repeat_count": 0.0, + "routers_loss": 0.10397060960531235, + "skip_count": 1.0, + "step": 478, + "text_loss": 1.0396177768707275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.2535955385970063, + "f1_execute": 0.8571429252624512, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.267578125, + "learning_rate": 0.000958, + "loss": 0.1099, + "macro_f1": 0.285714328289032, + "num_tokens": 773699.0, + "repeat_count": 2.0, + "routers_loss": 0.22604143619537354, + "skip_count": 4.0, + "step": 480, + "text_loss": 0.2570283114910126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.2629879659524508, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.146484375, + "learning_rate": 0.000962, + "loss": 0.0667, + "macro_f1": 0.32098767161369324, + "num_tokens": 777473.0, + "repeat_count": 0.0, + "routers_loss": 0.048258859664201736, + "skip_count": 1.0, + "step": 482, + "text_loss": 0.2540103495121002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.2723803933078957, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000966, + "loss": 0.0592, + "macro_f1": 0.3333333432674408, + "num_tokens": 780833.0, + "repeat_count": 0.0, + "routers_loss": 0.023018671199679375, + "skip_count": 0.0, + "step": 484, + "text_loss": 0.38524550199508667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.28177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.314453125, + "learning_rate": 0.0009699999999999999, + "loss": 0.0709, + "macro_f1": 0.3272727429866791, + "num_tokens": 783656.0, + "repeat_count": 0.0, + "routers_loss": 0.044845327734947205, + "skip_count": 1.0, + "step": 486, + "text_loss": 0.5859048366546631 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000974, + "loss": 0.0615, + "macro_f1": 0.3333333432674408, + "num_tokens": 787173.0, + "repeat_count": 0.0, + "routers_loss": 0.010898692533373833, + "skip_count": 0.0, + "step": 488, + "text_loss": 0.3456067442893982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3005576753742294, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000978, + "loss": 0.0796, + "macro_f1": 0.32098764181137085, + "num_tokens": 790395.0, + "repeat_count": 0.0, + "routers_loss": 0.06497956812381744, + "skip_count": 2.0, + "step": 490, + "text_loss": 0.3751123249530792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3099501027296743, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.000982, + "loss": 0.0772, + "macro_f1": 0.3272727429866791, + "num_tokens": 793137.0, + "repeat_count": 0.0, + "routers_loss": 0.07763728499412537, + "skip_count": 1.0, + "step": 492, + "text_loss": 0.43296709656715393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3193425300851187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009860000000000001, + "loss": 0.0819, + "macro_f1": 0.3333333432674408, + "num_tokens": 796497.0, + "repeat_count": 0.0, + "routers_loss": 0.02127906307578087, + "skip_count": 0.0, + "step": 494, + "text_loss": 0.4841311275959015 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3287349574405636, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2138671875, + "learning_rate": 0.00099, + "loss": 0.073, + "macro_f1": 0.3272727429866791, + "num_tokens": 799361.0, + "repeat_count": 1.0, + "routers_loss": 0.09518691152334213, + "skip_count": 0.0, + "step": 496, + "text_loss": 0.5094487071037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.3381273847960085, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.130859375, + "learning_rate": 0.000994, + "loss": 0.0789, + "macro_f1": 0.5492662787437439, + "num_tokens": 802629.0, + "repeat_count": 0.0, + "routers_loss": 0.0563947930932045, + "skip_count": 2.0, + "step": 498, + "text_loss": 0.42783617973327637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.347519812151453, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.000998, + "loss": 0.0476, + "macro_f1": 0.3272727429866791, + "num_tokens": 805881.0, + "repeat_count": 1.0, + "routers_loss": 0.10570426285266876, + "skip_count": 0.0, + "step": 500, + "text_loss": 0.28395503759384155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.3569122395068973, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009999999760498814, + "loss": 0.0849, + "macro_f1": 0.5492662787437439, + "num_tokens": 809283.0, + "repeat_count": 0.0, + "routers_loss": 0.031202208250761032, + "skip_count": 2.0, + "step": 502, + "text_loss": 0.32970911264419556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.366304666862342, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009999997844489475, + "loss": 0.0574, + "macro_f1": 0.3272727429866791, + "num_tokens": 812440.0, + "repeat_count": 0.0, + "routers_loss": 0.07647835463285446, + "skip_count": 1.0, + "step": 504, + "text_loss": 0.4901447296142578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.375697094217787, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25, + "learning_rate": 0.000999999401247153, + "loss": 0.0668, + "macro_f1": 0.32098764181137085, + "num_tokens": 815716.0, + "repeat_count": 0.0, + "routers_loss": 0.08515176922082901, + "skip_count": 2.0, + "step": 506, + "text_loss": 0.6157599687576294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.0009999988264446445, + "loss": 0.0686, + "macro_f1": 0.3333333432674408, + "num_tokens": 819086.0, + "repeat_count": 0.0, + "routers_loss": 0.00946938619017601, + "skip_count": 0.0, + "step": 508, + "text_loss": 0.5053519010543823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.3944819489286764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009999980600416424, + "loss": 0.0574, + "macro_f1": 0.3333333432674408, + "num_tokens": 822268.0, + "repeat_count": 0.0, + "routers_loss": 0.01058756373822689, + "skip_count": 0.0, + "step": 510, + "text_loss": 0.5570021867752075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.403874376284121, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.000999997102038441, + "loss": 0.0678, + "macro_f1": 0.3333333432674408, + "num_tokens": 825728.0, + "repeat_count": 0.0, + "routers_loss": 0.008705209009349346, + "skip_count": 0.0, + "step": 512, + "text_loss": 0.6519040465354919 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4132668036395657, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.220703125, + "learning_rate": 0.0009999959524354064, + "loss": 0.083, + "macro_f1": 0.3272727429866791, + "num_tokens": 829459.0, + "repeat_count": 0.0, + "routers_loss": 0.04024193435907364, + "skip_count": 1.0, + "step": 514, + "text_loss": 0.5290043950080872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.25390625, + "learning_rate": 0.00099999461123298, + "loss": 0.0727, + "macro_f1": 0.3333333432674408, + "num_tokens": 832291.0, + "repeat_count": 0.0, + "routers_loss": 0.015742862597107887, + "skip_count": 0.0, + "step": 516, + "text_loss": 0.7910057902336121 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.432051658350455, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.000999993078431675, + "loss": 0.0759, + "macro_f1": 0.3076923191547394, + "num_tokens": 835399.0, + "repeat_count": 1.0, + "routers_loss": 0.16753782331943512, + "skip_count": 3.0, + "step": 518, + "text_loss": 0.45196083188056946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.4414440857058994, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.236328125, + "learning_rate": 0.0009999913540320792, + "loss": 0.0968, + "macro_f1": 0.31446540355682373, + "num_tokens": 838993.0, + "repeat_count": 0.0, + "routers_loss": 0.09357143193483353, + "skip_count": 2.0, + "step": 520, + "text_loss": 0.5499435663223267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.4508365130613443, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2451171875, + "learning_rate": 0.0009999894380348536, + "loss": 0.0821, + "macro_f1": 0.5492662787437439, + "num_tokens": 842652.0, + "repeat_count": 0.0, + "routers_loss": 0.056803856045007706, + "skip_count": 2.0, + "step": 522, + "text_loss": 0.197520449757576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.4602289404167887, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.2333984375, + "learning_rate": 0.000999987330440732, + "loss": 0.0725, + "macro_f1": 0.4871794879436493, + "num_tokens": 847061.0, + "repeat_count": 0.0, + "routers_loss": 0.08962195366621017, + "skip_count": 3.0, + "step": 524, + "text_loss": 0.27509039640426636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4696213677722336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000999985031250522, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 850780.0, + "repeat_count": 0.0, + "routers_loss": 0.022930558770895004, + "skip_count": 0.0, + "step": 526, + "text_loss": 0.13291706144809723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.4790137951276785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.0009999825404651053, + "loss": 0.0614, + "macro_f1": 0.3333333432674408, + "num_tokens": 853886.0, + "repeat_count": 0.0, + "routers_loss": 0.017097990959882736, + "skip_count": 0.0, + "step": 528, + "text_loss": 0.21706295013427734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.212890625, + "learning_rate": 0.0009999798580854356, + "loss": 0.0724, + "macro_f1": 0.3333333432674408, + "num_tokens": 857364.0, + "repeat_count": 0.0, + "routers_loss": 0.02831801027059555, + "skip_count": 0.0, + "step": 530, + "text_loss": 0.9035662412643433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.000999976984112541, + "loss": 0.0674, + "macro_f1": 0.3333333432674408, + "num_tokens": 860661.0, + "repeat_count": 0.0, + "routers_loss": 0.019671892747282982, + "skip_count": 0.0, + "step": 532, + "text_loss": 0.8354863524436951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.5071910771940122, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.2890625, + "learning_rate": 0.0009999739185475231, + "loss": 0.0963, + "macro_f1": 0.47333335876464844, + "num_tokens": 864124.0, + "repeat_count": 2.0, + "routers_loss": 0.21383361518383026, + "skip_count": 3.0, + "step": 534, + "text_loss": 0.23422949016094208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.516583504549457, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0009999706613915565, + "loss": 0.0598, + "macro_f1": 0.32098767161369324, + "num_tokens": 866976.0, + "repeat_count": 0.0, + "routers_loss": 0.07158871740102768, + "skip_count": 1.0, + "step": 536, + "text_loss": 0.11800774186849594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5259759319049016, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.0009999672126458894, + "loss": 0.0822, + "macro_f1": 0.3272727429866791, + "num_tokens": 870549.0, + "repeat_count": 0.0, + "routers_loss": 0.08185924589633942, + "skip_count": 1.0, + "step": 538, + "text_loss": 0.19232480227947235 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5353683592603464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.000999963572311843, + "loss": 0.0604, + "macro_f1": 0.3333333432674408, + "num_tokens": 873733.0, + "repeat_count": 0.0, + "routers_loss": 0.01633382774889469, + "skip_count": 0.0, + "step": 540, + "text_loss": 0.3725031912326813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.544760786615791, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009999597403908128, + "loss": 0.0761, + "macro_f1": 0.3272727429866791, + "num_tokens": 877099.0, + "repeat_count": 0.0, + "routers_loss": 0.0782657191157341, + "skip_count": 1.0, + "step": 542, + "text_loss": 0.17589199542999268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 2.5541532139712357, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, + "learning_rate": 0.0009999557168842669, + "loss": 0.0716, + "macro_f1": 0.5492662787437439, + "num_tokens": 879883.0, + "repeat_count": 0.0, + "routers_loss": 0.05275818333029747, + "skip_count": 2.0, + "step": 544, + "text_loss": 0.26448264718055725 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.56354564132668, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2490234375, + "learning_rate": 0.0009999515017937468, + "loss": 0.071, + "macro_f1": 0.32098764181137085, + "num_tokens": 882223.0, + "repeat_count": 0.0, + "routers_loss": 0.09335892647504807, + "skip_count": 2.0, + "step": 546, + "text_loss": 0.208544060587883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.572938068682125, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.376953125, + "learning_rate": 0.0009999470951208684, + "loss": 0.0855, + "macro_f1": 0.32098764181137085, + "num_tokens": 885241.0, + "repeat_count": 2.0, + "routers_loss": 0.22983254492282867, + "skip_count": 0.0, + "step": 548, + "text_loss": 0.6612338423728943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.58233049603757, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.00099994249686732, + "loss": 0.0786, + "macro_f1": 0.3272727429866791, + "num_tokens": 887897.0, + "repeat_count": 1.0, + "routers_loss": 0.12858282029628754, + "skip_count": 0.0, + "step": 550, + "text_loss": 0.4673548936843872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.5917229233930144, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009999377070348638, + "loss": 0.0944, + "macro_f1": 0.3333333432674408, + "num_tokens": 891224.0, + "repeat_count": 0.0, + "routers_loss": 0.017421770840883255, + "skip_count": 0.0, + "step": 552, + "text_loss": 0.6419258117675781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.601115350748459, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.000999932725625335, + "loss": 0.0791, + "macro_f1": 0.32098764181137085, + "num_tokens": 894578.0, + "repeat_count": 0.0, + "routers_loss": 0.07890026271343231, + "skip_count": 2.0, + "step": 554, + "text_loss": 0.5970752239227295 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.6105077781039037, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.0009999275526406427, + "loss": 0.0796, + "macro_f1": 0.31446540355682373, + "num_tokens": 897145.0, + "repeat_count": 1.0, + "routers_loss": 0.09836960583925247, + "skip_count": 1.0, + "step": 556, + "text_loss": 0.752425491809845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6199002054593485, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1875, + "learning_rate": 0.0009999221880827693, + "loss": 0.0882, + "macro_f1": 0.3333333432674408, + "num_tokens": 900565.0, + "repeat_count": 0.0, + "routers_loss": 0.017694659531116486, + "skip_count": 0.0, + "step": 558, + "text_loss": 0.195619136095047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.629292632814793, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2021484375, + "learning_rate": 0.0009999166319537703, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 903506.0, + "repeat_count": 0.0, + "routers_loss": 0.019375264644622803, + "skip_count": 0.0, + "step": 560, + "text_loss": 0.4603337347507477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 2.638685060170238, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.146484375, + "learning_rate": 0.0009999108842557748, + "loss": 0.0953, + "macro_f1": 0.4871794879436493, + "num_tokens": 906380.0, + "repeat_count": 0.0, + "routers_loss": 0.12013207376003265, + "skip_count": 3.0, + "step": 562, + "text_loss": 0.6279402375221252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6480774875256823, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009999049449909854, + "loss": 0.0799, + "macro_f1": 0.3272727429866791, + "num_tokens": 909116.0, + "repeat_count": 0.0, + "routers_loss": 0.06441342830657959, + "skip_count": 1.0, + "step": 564, + "text_loss": 0.23741699755191803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.657469914881127, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009998988141616781, + "loss": 0.064, + "macro_f1": 0.32098767161369324, + "num_tokens": 912189.0, + "repeat_count": 0.0, + "routers_loss": 0.08309414982795715, + "skip_count": 1.0, + "step": 566, + "text_loss": 0.27780941128730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6668623422365716, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009998924917702023, + "loss": 0.0876, + "macro_f1": 0.3272727429866791, + "num_tokens": 916279.0, + "repeat_count": 1.0, + "routers_loss": 0.07197169959545135, + "skip_count": 0.0, + "step": 568, + "text_loss": 0.6371755599975586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6762547695920165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2255859375, + "learning_rate": 0.0009998859778189806, + "loss": 0.0706, + "macro_f1": 0.3333333432674408, + "num_tokens": 919490.0, + "repeat_count": 0.0, + "routers_loss": 0.008022273890674114, + "skip_count": 0.0, + "step": 570, + "text_loss": 0.6028938889503479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.6856471969474613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.000999879272310509, + "loss": 0.084, + "macro_f1": 0.3333333432674408, + "num_tokens": 923694.0, + "repeat_count": 0.0, + "routers_loss": 0.01634674146771431, + "skip_count": 0.0, + "step": 572, + "text_loss": 0.7177054286003113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.0009998723752473574, + "loss": 0.0716, + "macro_f1": 0.3272727429866791, + "num_tokens": 926933.0, + "repeat_count": 0.0, + "routers_loss": 0.060559045523405075, + "skip_count": 1.0, + "step": 574, + "text_loss": 0.5203254818916321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.185546875, + "learning_rate": 0.0009998652866321687, + "loss": 0.0801, + "macro_f1": 0.3333333432674408, + "num_tokens": 929832.0, + "repeat_count": 0.0, + "routers_loss": 0.011485611088573933, + "skip_count": 0.0, + "step": 576, + "text_loss": 0.6147452592849731 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.713824479013795, + "f1_execute": 0.8799999952316284, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.000999858006467659, + "loss": 0.0649, + "macro_f1": 0.29333335161209106, + "num_tokens": 933266.0, + "repeat_count": 2.0, + "routers_loss": 0.2929030954837799, + "skip_count": 4.0, + "step": 578, + "text_loss": 0.1720666140317917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.72321690636924, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.24609375, + "learning_rate": 0.0009998505347566186, + "loss": 0.0782, + "macro_f1": 0.32098764181137085, + "num_tokens": 937545.0, + "repeat_count": 0.0, + "routers_loss": 0.053780000656843185, + "skip_count": 2.0, + "step": 580, + "text_loss": 0.3258405327796936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7326093337246844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.00099984287150191, + "loss": 0.0582, + "macro_f1": 0.3333333432674408, + "num_tokens": 941001.0, + "repeat_count": 0.0, + "routers_loss": 0.02637636847794056, + "skip_count": 0.0, + "step": 582, + "text_loss": 0.23762771487236023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7420017610801293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009998350167064705, + "loss": 0.0672, + "macro_f1": 0.3333333432674408, + "num_tokens": 943989.0, + "repeat_count": 0.0, + "routers_loss": 0.01637580618262291, + "skip_count": 0.0, + "step": 584, + "text_loss": 0.7460582852363586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7513941884355737, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009998269703733096, + "loss": 0.0686, + "macro_f1": 0.3272727429866791, + "num_tokens": 947245.0, + "repeat_count": 1.0, + "routers_loss": 0.13934117555618286, + "skip_count": 0.0, + "step": 586, + "text_loss": 0.5284690260887146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7607866157910186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.13671875, + "learning_rate": 0.0009998187325055106, + "loss": 0.0667, + "macro_f1": 0.3333333432674408, + "num_tokens": 950116.0, + "repeat_count": 0.0, + "routers_loss": 0.02138397842645645, + "skip_count": 0.0, + "step": 588, + "text_loss": 0.3920256197452545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009998103031062305, + "loss": 0.0778, + "macro_f1": 0.3333333432674408, + "num_tokens": 953277.0, + "repeat_count": 0.0, + "routers_loss": 0.007098200265318155, + "skip_count": 0.0, + "step": 590, + "text_loss": 0.7472905516624451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.779571470501908, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.318359375, + "learning_rate": 0.0009998016821786994, + "loss": 0.0872, + "macro_f1": 0.32098764181137085, + "num_tokens": 958229.0, + "repeat_count": 1.0, + "routers_loss": 0.07946522533893585, + "skip_count": 1.0, + "step": 592, + "text_loss": 0.5506448745727539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.7889638978573528, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.000999792869726221, + "loss": 0.0523, + "macro_f1": 0.3272727429866791, + "num_tokens": 961016.0, + "repeat_count": 0.0, + "routers_loss": 0.0850791186094284, + "skip_count": 1.0, + "step": 594, + "text_loss": 0.3824431002140045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009997838657521717, + "loss": 0.0632, + "macro_f1": 0.3333333432674408, + "num_tokens": 963847.0, + "repeat_count": 0.0, + "routers_loss": 0.016370445489883423, + "skip_count": 0.0, + "step": 596, + "text_loss": 0.2139475792646408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.8077487525682416, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009997746702600026, + "loss": 0.0702, + "macro_f1": 0.307692289352417, + "num_tokens": 966619.0, + "repeat_count": 0.0, + "routers_loss": 0.1310746818780899, + "skip_count": 3.0, + "step": 598, + "text_loss": 0.3651018440723419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8171411799236865, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23828125, + "learning_rate": 0.0009997652832532372, + "loss": 0.0792, + "macro_f1": 0.3272727429866791, + "num_tokens": 970418.0, + "repeat_count": 1.0, + "routers_loss": 0.14303378760814667, + "skip_count": 0.0, + "step": 600, + "text_loss": 0.7094736099243164 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8265336072791314, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009997557047354722, + "loss": 0.0531, + "macro_f1": 0.3272727429866791, + "num_tokens": 973491.0, + "repeat_count": 0.0, + "routers_loss": 0.03334212675690651, + "skip_count": 1.0, + "step": 602, + "text_loss": 0.4812237024307251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 2.835926034634576, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.0009997459347103783, + "loss": 0.0956, + "macro_f1": 0.3272727429866791, + "num_tokens": 976672.0, + "repeat_count": 0.0, + "routers_loss": 0.02831871062517166, + "skip_count": 0.0, + "step": 604, + "text_loss": 0.21737146377563477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8453184619900207, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009997359731816998, + "loss": 0.0646, + "macro_f1": 0.3333333432674408, + "num_tokens": 979898.0, + "repeat_count": 0.0, + "routers_loss": 0.017968013882637024, + "skip_count": 0.0, + "step": 606, + "text_loss": 0.5458008050918579 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.854710889345465, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.224609375, + "learning_rate": 0.0009997258201532536, + "loss": 0.0751, + "macro_f1": 0.3333333432674408, + "num_tokens": 982811.0, + "repeat_count": 0.0, + "routers_loss": 0.016256732866168022, + "skip_count": 0.0, + "step": 608, + "text_loss": 0.8643257021903992 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009997154756289303, + "loss": 0.0561, + "macro_f1": 0.3333333432674408, + "num_tokens": 985245.0, + "repeat_count": 0.0, + "routers_loss": 0.021214161068201065, + "skip_count": 0.0, + "step": 610, + "text_loss": 0.2204967886209488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8734957440563544, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.000999704939612694, + "loss": 0.0636, + "macro_f1": 0.3006536364555359, + "num_tokens": 988539.0, + "repeat_count": 3.0, + "routers_loss": 0.23249399662017822, + "skip_count": 2.0, + "step": 612, + "text_loss": 0.32489025592803955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8828881714117993, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009996942121085824, + "loss": 0.0445, + "macro_f1": 0.3333333432674408, + "num_tokens": 991660.0, + "repeat_count": 0.0, + "routers_loss": 0.010706410743296146, + "skip_count": 0.0, + "step": 614, + "text_loss": 0.4551754891872406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.8922805987672437, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3671875, + "learning_rate": 0.000999683293120706, + "loss": 0.1016, + "macro_f1": 0.3333333432674408, + "num_tokens": 994828.0, + "repeat_count": 0.0, + "routers_loss": 0.006676184479147196, + "skip_count": 0.0, + "step": 616, + "text_loss": 0.6212068200111389 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9016730261226886, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.408203125, + "learning_rate": 0.0009996721826532491, + "loss": 0.0976, + "macro_f1": 0.3076923191547394, + "num_tokens": 997951.0, + "repeat_count": 2.0, + "routers_loss": 0.2148125320672989, + "skip_count": 2.0, + "step": 618, + "text_loss": 0.26514527201652527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.911065453478133, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1904296875, + "learning_rate": 0.000999660880710469, + "loss": 0.0909, + "macro_f1": 0.3333333432674408, + "num_tokens": 1001139.0, + "repeat_count": 0.0, + "routers_loss": 0.022332455962896347, + "skip_count": 0.0, + "step": 620, + "text_loss": 0.26131340861320496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.920457880833578, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009996493872966971, + "loss": 0.0732, + "macro_f1": 0.3272727429866791, + "num_tokens": 1003678.0, + "repeat_count": 1.0, + "routers_loss": 0.08348730951547623, + "skip_count": 0.0, + "step": 622, + "text_loss": 0.19151706993579865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009996377024163374, + "loss": 0.0822, + "macro_f1": 0.3333333432674408, + "num_tokens": 1007082.0, + "repeat_count": 0.0, + "routers_loss": 0.028577150776982307, + "skip_count": 0.0, + "step": 624, + "text_loss": 0.305387407541275 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9392427355444672, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0009996258260738676, + "loss": 0.0892, + "macro_f1": 0.3272727429866791, + "num_tokens": 1010064.0, + "repeat_count": 1.0, + "routers_loss": 0.08312026411294937, + "skip_count": 0.0, + "step": 626, + "text_loss": 0.49436143040657043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9486351628999117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009996137582738388, + "loss": 0.0591, + "macro_f1": 0.3333333432674408, + "num_tokens": 1013462.0, + "repeat_count": 0.0, + "routers_loss": 0.013337327167391777, + "skip_count": 0.0, + "step": 628, + "text_loss": 0.6515294313430786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9580275902553566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000999601499020875, + "loss": 0.0537, + "macro_f1": 0.3333333432674408, + "num_tokens": 1016246.0, + "repeat_count": 0.0, + "routers_loss": 0.029126765206456184, + "skip_count": 0.0, + "step": 630, + "text_loss": 0.18834827840328217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9674200176108014, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009995890483196746, + "loss": 0.0602, + "macro_f1": 0.3272727429866791, + "num_tokens": 1019286.0, + "repeat_count": 0.0, + "routers_loss": 0.054844800382852554, + "skip_count": 1.0, + "step": 632, + "text_loss": 0.6988179087638855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.322265625, + "learning_rate": 0.0009995764061750086, + "loss": 0.0767, + "macro_f1": 0.3333333432674408, + "num_tokens": 1022207.0, + "repeat_count": 0.0, + "routers_loss": 0.010095693171024323, + "skip_count": 0.0, + "step": 634, + "text_loss": 0.558451771736145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.9862048723216907, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.000999563572591721, + "loss": 0.0521, + "macro_f1": 0.32098764181137085, + "num_tokens": 1025319.0, + "repeat_count": 1.0, + "routers_loss": 0.0698433518409729, + "skip_count": 1.0, + "step": 636, + "text_loss": 0.5961872935295105 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 2.995597299677135, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009995505475747302, + "loss": 0.0849, + "macro_f1": 0.3272727429866791, + "num_tokens": 1028362.0, + "repeat_count": 0.0, + "routers_loss": 0.040211405605077744, + "skip_count": 1.0, + "step": 638, + "text_loss": 0.546863317489624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.004696213677722, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.0009995373311290272, + "loss": 0.0709, + "macro_f1": 0.3144654333591461, + "num_tokens": 1032199.0, + "repeat_count": 2.0, + "routers_loss": 0.1457643061876297, + "skip_count": 1.0, + "step": 640, + "text_loss": 0.2137298285961151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009995239232596764, + "loss": 0.0545, + "macro_f1": 0.3333333432674408, + "num_tokens": 1035801.0, + "repeat_count": 0.0, + "routers_loss": 0.011394930072128773, + "skip_count": 0.0, + "step": 642, + "text_loss": 0.43054503202438354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0234810683886115, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009995103239718163, + "loss": 0.0665, + "macro_f1": 0.3333333432674408, + "num_tokens": 1039223.0, + "repeat_count": 0.0, + "routers_loss": 0.00997432041913271, + "skip_count": 0.0, + "step": 644, + "text_loss": 0.7749615907669067 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0328734957440564, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.0009994965332706573, + "loss": 0.0755, + "macro_f1": 0.3144654333591461, + "num_tokens": 1042154.0, + "repeat_count": 3.0, + "routers_loss": 0.10589150339365005, + "skip_count": 0.0, + "step": 646, + "text_loss": 0.7812211513519287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.042265923099501, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.0009994825511614846, + "loss": 0.0383, + "macro_f1": 0.3272727429866791, + "num_tokens": 1045250.0, + "repeat_count": 0.0, + "routers_loss": 0.0748734176158905, + "skip_count": 1.0, + "step": 648, + "text_loss": 0.844803512096405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0516583504549457, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1220703125, + "learning_rate": 0.0009994683776496562, + "loss": 0.0433, + "macro_f1": 0.3272727429866791, + "num_tokens": 1048446.0, + "repeat_count": 0.0, + "routers_loss": 0.03742415830492973, + "skip_count": 1.0, + "step": 650, + "text_loss": 0.2098839282989502 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0610507778103906, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009994540127406034, + "loss": 0.0591, + "macro_f1": 0.32098764181137085, + "num_tokens": 1051840.0, + "repeat_count": 0.0, + "routers_loss": 0.06025516986846924, + "skip_count": 2.0, + "step": 652, + "text_loss": 0.27727583050727844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.070443205165835, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.181640625, + "learning_rate": 0.0009994394564398306, + "loss": 0.0519, + "macro_f1": 0.521541953086853, + "num_tokens": 1055142.0, + "repeat_count": 4.0, + "routers_loss": 0.22807340323925018, + "skip_count": 2.0, + "step": 654, + "text_loss": 0.9672397971153259 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009994247087529158, + "loss": 0.0618, + "macro_f1": 0.3333333432674408, + "num_tokens": 1057698.0, + "repeat_count": 0.0, + "routers_loss": 0.01348950993269682, + "skip_count": 0.0, + "step": 656, + "text_loss": 0.6375506520271301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.0892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.0009994097696855106, + "loss": 0.0412, + "macro_f1": 0.3333333432674408, + "num_tokens": 1060624.0, + "repeat_count": 0.0, + "routers_loss": 0.009649243205785751, + "skip_count": 0.0, + "step": 658, + "text_loss": 0.5315385460853577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.098620487232169, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2041015625, + "learning_rate": 0.0009993946392433395, + "loss": 0.0609, + "macro_f1": 0.307692289352417, + "num_tokens": 1065076.0, + "repeat_count": 0.0, + "routers_loss": 0.1250980943441391, + "skip_count": 3.0, + "step": 660, + "text_loss": 0.25780341029167175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1080129145876136, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009993793174322006, + "loss": 0.0471, + "macro_f1": 0.3333333432674408, + "num_tokens": 1068365.0, + "repeat_count": 0.0, + "routers_loss": 0.011544390581548214, + "skip_count": 0.0, + "step": 662, + "text_loss": 0.34876301884651184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1174053419430585, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009993638042579654, + "loss": 0.0473, + "macro_f1": 0.3272727429866791, + "num_tokens": 1071693.0, + "repeat_count": 0.0, + "routers_loss": 0.03777370601892471, + "skip_count": 1.0, + "step": 664, + "text_loss": 0.21811571717262268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.126797769298503, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.203125, + "learning_rate": 0.0009993480997265783, + "loss": 0.0475, + "macro_f1": 0.5492662787437439, + "num_tokens": 1074733.0, + "repeat_count": 0.0, + "routers_loss": 0.049949806183576584, + "skip_count": 2.0, + "step": 666, + "text_loss": 0.38410288095474243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.136190196653948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10302734375, + "learning_rate": 0.0009993322038440572, + "loss": 0.0605, + "macro_f1": 0.3333333432674408, + "num_tokens": 1077993.0, + "repeat_count": 0.0, + "routers_loss": 0.0247171800583601, + "skip_count": 0.0, + "step": 668, + "text_loss": 0.25576895475387573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1455826240093923, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.216796875, + "learning_rate": 0.000999316116616494, + "loss": 0.0619, + "macro_f1": 0.3333333432674408, + "num_tokens": 1080491.0, + "repeat_count": 0.0, + "routers_loss": 0.008118715137243271, + "skip_count": 0.0, + "step": 670, + "text_loss": 0.6269792914390564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.154975051364837, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009992998380500527, + "loss": 0.0462, + "macro_f1": 0.3272727429866791, + "num_tokens": 1083817.0, + "repeat_count": 0.0, + "routers_loss": 0.03366057574748993, + "skip_count": 1.0, + "step": 672, + "text_loss": 0.26891493797302246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1643674787202816, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009992833681509716, + "loss": 0.0529, + "macro_f1": 0.3333333432674408, + "num_tokens": 1087368.0, + "repeat_count": 0.0, + "routers_loss": 0.020552074536681175, + "skip_count": 0.0, + "step": 674, + "text_loss": 0.14421936869621277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.1737599060757264, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.18359375, + "learning_rate": 0.0009992667069255619, + "loss": 0.0696, + "macro_f1": 0.31446540355682373, + "num_tokens": 1090452.0, + "repeat_count": 0.0, + "routers_loss": 0.06937336176633835, + "skip_count": 2.0, + "step": 676, + "text_loss": 0.24999259412288666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.1831523334311713, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08740234375, + "learning_rate": 0.0009992498543802085, + "loss": 0.0588, + "macro_f1": 0.3272727429866791, + "num_tokens": 1093996.0, + "repeat_count": 1.0, + "routers_loss": 0.0380021296441555, + "skip_count": 0.0, + "step": 678, + "text_loss": 0.42473849654197693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 3.1925447607866158, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.2119140625, + "learning_rate": 0.0009992328105213688, + "loss": 0.0411, + "macro_f1": 0.4400000274181366, + "num_tokens": 1096837.0, + "repeat_count": 1.0, + "routers_loss": 0.20885063707828522, + "skip_count": 4.0, + "step": 680, + "text_loss": 0.3829527199268341 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.2019371881420606, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009992155753555747, + "loss": 0.0722, + "macro_f1": 0.5492662787437439, + "num_tokens": 1100320.0, + "repeat_count": 0.0, + "routers_loss": 0.018230699002742767, + "skip_count": 2.0, + "step": 682, + "text_loss": 0.6190969944000244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.211329615497505, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30859375, + "learning_rate": 0.0009991981488894303, + "loss": 0.0681, + "macro_f1": 0.32098767161369324, + "num_tokens": 1103682.0, + "repeat_count": 0.0, + "routers_loss": 0.05550144240260124, + "skip_count": 1.0, + "step": 684, + "text_loss": 0.44418027997016907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.22072204285295, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2158203125, + "learning_rate": 0.0009991805311296133, + "loss": 0.0507, + "macro_f1": 0.32098764181137085, + "num_tokens": 1106427.0, + "repeat_count": 0.0, + "routers_loss": 0.07990608364343643, + "skip_count": 2.0, + "step": 686, + "text_loss": 0.5577231645584106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2301144702083944, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009991627220828753, + "loss": 0.0568, + "macro_f1": 0.32098764181137085, + "num_tokens": 1109314.0, + "repeat_count": 0.0, + "routers_loss": 0.05167485028505325, + "skip_count": 2.0, + "step": 688, + "text_loss": 0.27325430512428284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.2395068975638392, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009991447217560408, + "loss": 0.0521, + "macro_f1": 0.5492662787437439, + "num_tokens": 1112748.0, + "repeat_count": 0.0, + "routers_loss": 0.04621964320540428, + "skip_count": 2.0, + "step": 690, + "text_loss": 0.5288321375846863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.2488993249192837, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.000999126530156007, + "loss": 0.0499, + "macro_f1": 0.307692289352417, + "num_tokens": 1116965.0, + "repeat_count": 1.0, + "routers_loss": 0.11950276792049408, + "skip_count": 2.0, + "step": 692, + "text_loss": 0.14215624332427979 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2582917522747286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.0009991081472897454, + "loss": 0.0722, + "macro_f1": 0.3333333432674408, + "num_tokens": 1120570.0, + "repeat_count": 0.0, + "routers_loss": 0.01905500330030918, + "skip_count": 0.0, + "step": 694, + "text_loss": 0.41862696409225464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.267684179630173, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009990895731643002, + "loss": 0.0464, + "macro_f1": 0.3272727429866791, + "num_tokens": 1124009.0, + "repeat_count": 1.0, + "routers_loss": 0.06974572688341141, + "skip_count": 0.0, + "step": 696, + "text_loss": 0.41160130500793457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.277076606985618, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.000999070807786789, + "loss": 0.0531, + "macro_f1": 0.3272727429866791, + "num_tokens": 1127370.0, + "repeat_count": 1.0, + "routers_loss": 0.07055293023586273, + "skip_count": 0.0, + "step": 698, + "text_loss": 0.48068273067474365 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.2864690343410627, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.197265625, + "learning_rate": 0.000999051851164403, + "loss": 0.0619, + "macro_f1": 0.32098764181137085, + "num_tokens": 1130234.0, + "repeat_count": 1.0, + "routers_loss": 0.12506946921348572, + "skip_count": 1.0, + "step": 700, + "text_loss": 0.47925490140914917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1943359375, + "learning_rate": 0.000999032703304406, + "loss": 0.0674, + "macro_f1": 0.3333333432674408, + "num_tokens": 1132874.0, + "repeat_count": 0.0, + "routers_loss": 0.00809287466108799, + "skip_count": 0.0, + "step": 702, + "text_loss": 0.47433632612228394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.305253889051952, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009990133642141358, + "loss": 0.0497, + "macro_f1": 0.5492662787437439, + "num_tokens": 1136011.0, + "repeat_count": 0.0, + "routers_loss": 0.0319170281291008, + "skip_count": 2.0, + "step": 704, + "text_loss": 0.6574832201004028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3146463164073965, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.33984375, + "learning_rate": 0.000998993833901003, + "loss": 0.0619, + "macro_f1": 0.32098764181137085, + "num_tokens": 1139674.0, + "repeat_count": 0.0, + "routers_loss": 0.09850362688302994, + "skip_count": 2.0, + "step": 706, + "text_loss": 0.7660127282142639 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3240387437628414, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009989741123724919, + "loss": 0.0574, + "macro_f1": 0.3333333432674408, + "num_tokens": 1143558.0, + "repeat_count": 0.0, + "routers_loss": 0.006673311349004507, + "skip_count": 0.0, + "step": 708, + "text_loss": 0.5976111888885498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009989541996361594, + "loss": 0.045, + "macro_f1": 0.3333333432674408, + "num_tokens": 1146122.0, + "repeat_count": 0.0, + "routers_loss": 0.004988791421055794, + "skip_count": 0.0, + "step": 710, + "text_loss": 0.5256119966506958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3428235984737307, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009989340956996367, + "loss": 0.0528, + "macro_f1": 0.3333333432674408, + "num_tokens": 1149546.0, + "repeat_count": 0.0, + "routers_loss": 0.0067769973538815975, + "skip_count": 0.0, + "step": 712, + "text_loss": 0.5040497779846191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.352216025829175, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.26953125, + "learning_rate": 0.0009989138005706273, + "loss": 0.0735, + "macro_f1": 0.32098764181137085, + "num_tokens": 1153195.0, + "repeat_count": 0.0, + "routers_loss": 0.09899546951055527, + "skip_count": 2.0, + "step": 714, + "text_loss": 0.20803412795066833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.000998893314256908, + "loss": 0.064, + "macro_f1": 0.3333333432674408, + "num_tokens": 1157081.0, + "repeat_count": 0.0, + "routers_loss": 0.010492355562746525, + "skip_count": 0.0, + "step": 716, + "text_loss": 0.23077639937400818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3710008805400644, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009988726367663298, + "loss": 0.0539, + "macro_f1": 0.3333333432674408, + "num_tokens": 1160079.0, + "repeat_count": 0.0, + "routers_loss": 0.01063773687928915, + "skip_count": 0.0, + "step": 718, + "text_loss": 0.6085864901542664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3803933078955093, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009988517681068163, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1163249.0, + "repeat_count": 1.0, + "routers_loss": 0.05981874838471413, + "skip_count": 0.0, + "step": 720, + "text_loss": 0.4047050476074219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.3897857352509537, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009988307082863638, + "loss": 0.0361, + "macro_f1": 0.3333333432674408, + "num_tokens": 1166259.0, + "repeat_count": 0.0, + "routers_loss": 0.009750043973326683, + "skip_count": 0.0, + "step": 722, + "text_loss": 0.5306474566459656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.3991781626063986, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.240234375, + "learning_rate": 0.0009988094573130434, + "loss": 0.063, + "macro_f1": 0.5359477400779724, + "num_tokens": 1168887.0, + "repeat_count": 2.0, + "routers_loss": 0.18601104617118835, + "skip_count": 2.0, + "step": 724, + "text_loss": 0.53528892993927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.408570589961843, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009987880151949974, + "loss": 0.0496, + "macro_f1": 0.3272727429866791, + "num_tokens": 1172625.0, + "repeat_count": 0.0, + "routers_loss": 0.02845010720193386, + "skip_count": 1.0, + "step": 726, + "text_loss": 0.4760453701019287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.417963017317288, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.2177734375, + "learning_rate": 0.0009987663819404434, + "loss": 0.06, + "macro_f1": 0.5492662787437439, + "num_tokens": 1176580.0, + "repeat_count": 0.0, + "routers_loss": 0.017596980556845665, + "skip_count": 2.0, + "step": 728, + "text_loss": 0.5146099328994751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.427355444672733, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000998744557557671, + "loss": 0.0484, + "macro_f1": 0.3272727429866791, + "num_tokens": 1179804.0, + "repeat_count": 0.0, + "routers_loss": 0.0625474750995636, + "skip_count": 1.0, + "step": 730, + "text_loss": 0.27738022804260254 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.436747872028177, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.0009987225420550433, + "loss": 0.0796, + "macro_f1": 0.307692289352417, + "num_tokens": 1182658.0, + "repeat_count": 1.0, + "routers_loss": 0.16188351809978485, + "skip_count": 2.0, + "step": 732, + "text_loss": 0.23231445252895355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2001953125, + "learning_rate": 0.0009987003354409965, + "loss": 0.0626, + "macro_f1": 0.3272727429866791, + "num_tokens": 1185451.0, + "repeat_count": 0.0, + "routers_loss": 0.02391529455780983, + "skip_count": 0.0, + "step": 734, + "text_loss": 0.4496627151966095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.4555327267390665, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.234375, + "learning_rate": 0.0009986779377240405, + "loss": 0.0513, + "macro_f1": 0.32098767161369324, + "num_tokens": 1188666.0, + "repeat_count": 0.0, + "routers_loss": 0.08435963839292526, + "skip_count": 1.0, + "step": 736, + "text_loss": 0.4950787127017975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.4649251540945114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1220703125, + "learning_rate": 0.000998655348912758, + "loss": 0.0515, + "macro_f1": 0.3333333432674408, + "num_tokens": 1193035.0, + "repeat_count": 0.0, + "routers_loss": 0.01648722216486931, + "skip_count": 0.0, + "step": 738, + "text_loss": 0.24761848151683807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.0009986325690158051, + "loss": 0.0435, + "macro_f1": 0.3333333432674408, + "num_tokens": 1196840.0, + "repeat_count": 0.0, + "routers_loss": 0.013143910095095634, + "skip_count": 0.0, + "step": 740, + "text_loss": 0.15662719309329987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.4837100088054007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009986095980419113, + "loss": 0.0757, + "macro_f1": 0.3333333432674408, + "num_tokens": 1200573.0, + "repeat_count": 0.0, + "routers_loss": 0.026706280186772346, + "skip_count": 0.0, + "step": 742, + "text_loss": 0.16725164651870728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.493102436160845, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1982421875, + "learning_rate": 0.0009985864359998787, + "loss": 0.0795, + "macro_f1": 0.3006536364555359, + "num_tokens": 1203589.0, + "repeat_count": 2.0, + "routers_loss": 0.28607678413391113, + "skip_count": 3.0, + "step": 744, + "text_loss": 0.6350882053375244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009985630828985835, + "loss": 0.0572, + "macro_f1": 0.3272727429866791, + "num_tokens": 1206422.0, + "repeat_count": 0.0, + "routers_loss": 0.05685260891914368, + "skip_count": 1.0, + "step": 746, + "text_loss": 0.33779552578926086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.5118872908717345, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009985395387469742, + "loss": 0.0458, + "macro_f1": 0.5492662787437439, + "num_tokens": 1211588.0, + "repeat_count": 0.0, + "routers_loss": 0.0437830351293087, + "skip_count": 2.0, + "step": 748, + "text_loss": 0.28664472699165344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5212797182271793, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009985158035540735, + "loss": 0.0714, + "macro_f1": 0.32098764181137085, + "num_tokens": 1214580.0, + "repeat_count": 2.0, + "routers_loss": 0.07074898481369019, + "skip_count": 0.0, + "step": 750, + "text_loss": 0.3939313292503357 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.0009984918773289762, + "loss": 0.0699, + "macro_f1": 0.3333333432674408, + "num_tokens": 1217388.0, + "repeat_count": 0.0, + "routers_loss": 0.009757856838405132, + "skip_count": 0.0, + "step": 752, + "text_loss": 0.37641215324401855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5400645729380686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009984677600808512, + "loss": 0.054, + "macro_f1": 0.3333333432674408, + "num_tokens": 1219960.0, + "repeat_count": 0.0, + "routers_loss": 0.02515069581568241, + "skip_count": 0.0, + "step": 754, + "text_loss": 0.155938982963562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5494570002935135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.30078125, + "learning_rate": 0.0009984434518189405, + "loss": 0.0764, + "macro_f1": 0.3333333432674408, + "num_tokens": 1223234.0, + "repeat_count": 0.0, + "routers_loss": 0.025766927748918533, + "skip_count": 0.0, + "step": 756, + "text_loss": 0.691118061542511 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 3.558849427648958, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009984189525525584, + "loss": 0.0451, + "macro_f1": 0.5359477400779724, + "num_tokens": 1225764.0, + "repeat_count": 2.0, + "routers_loss": 0.1782722771167755, + "skip_count": 2.0, + "step": 758, + "text_loss": 0.3592209219932556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.568241855004403, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.0009983942622910935, + "loss": 0.0659, + "macro_f1": 0.3333333432674408, + "num_tokens": 1230097.0, + "repeat_count": 0.0, + "routers_loss": 0.00825568474829197, + "skip_count": 0.0, + "step": 760, + "text_loss": 0.4646475315093994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5776342823598473, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009983693810440074, + "loss": 0.0477, + "macro_f1": 0.32098764181137085, + "num_tokens": 1233140.0, + "repeat_count": 0.0, + "routers_loss": 0.04156976938247681, + "skip_count": 2.0, + "step": 762, + "text_loss": 0.298682302236557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.587026709715292, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.3515625, + "learning_rate": 0.000998344308820834, + "loss": 0.0666, + "macro_f1": 0.3272727429866791, + "num_tokens": 1236305.0, + "repeat_count": 0.0, + "routers_loss": 0.05697929114103317, + "skip_count": 1.0, + "step": 764, + "text_loss": 0.5249121189117432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.5964191370707366, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.18359375, + "learning_rate": 0.0009983190456311817, + "loss": 0.0592, + "macro_f1": 0.3144654333591461, + "num_tokens": 1239673.0, + "repeat_count": 0.0, + "routers_loss": 0.09547408670186996, + "skip_count": 3.0, + "step": 766, + "text_loss": 0.41277334094047546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 3.6058115644261814, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.185546875, + "learning_rate": 0.000998293591484731, + "loss": 0.0484, + "macro_f1": 0.5492662787437439, + "num_tokens": 1242292.0, + "repeat_count": 0.0, + "routers_loss": 0.030693158507347107, + "skip_count": 2.0, + "step": 768, + "text_loss": 0.1583656519651413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000998267946391236, + "loss": 0.051, + "macro_f1": 0.3333333432674408, + "num_tokens": 1244661.0, + "repeat_count": 0.0, + "routers_loss": 0.01211300864815712, + "skip_count": 0.0, + "step": 770, + "text_loss": 0.4629349112510681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6245964191370708, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009982421103605238, + "loss": 0.0441, + "macro_f1": 0.32098764181137085, + "num_tokens": 1248688.0, + "repeat_count": 0.0, + "routers_loss": 0.0665968507528305, + "skip_count": 2.0, + "step": 772, + "text_loss": 0.4019293785095215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6339888464925156, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2890625, + "learning_rate": 0.000998216083402495, + "loss": 0.0613, + "macro_f1": 0.32098764181137085, + "num_tokens": 1251395.0, + "repeat_count": 0.0, + "routers_loss": 0.07186859846115112, + "skip_count": 2.0, + "step": 774, + "text_loss": 0.4659276604652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.302734375, + "learning_rate": 0.0009981898655271235, + "loss": 0.0488, + "macro_f1": 0.3333333432674408, + "num_tokens": 1254888.0, + "repeat_count": 0.0, + "routers_loss": 0.007823926396667957, + "skip_count": 0.0, + "step": 776, + "text_loss": 0.5160359740257263 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 3.6527737012034045, + "f1_execute": 0.9130434989929199, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009981634567444557, + "loss": 0.0775, + "macro_f1": 0.590062141418457, + "num_tokens": 1258250.0, + "repeat_count": 3.0, + "routers_loss": 0.24624499678611755, + "skip_count": 4.0, + "step": 778, + "text_loss": 0.29319918155670166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6621661285588494, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.0009981368570646115, + "loss": 0.0885, + "macro_f1": 0.3272727429866791, + "num_tokens": 1260916.0, + "repeat_count": 0.0, + "routers_loss": 0.030730176717042923, + "skip_count": 1.0, + "step": 780, + "text_loss": 0.624981164932251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6715585559142943, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009981100664977838, + "loss": 0.0699, + "macro_f1": 0.3333333432674408, + "num_tokens": 1264004.0, + "repeat_count": 0.0, + "routers_loss": 0.006829176563769579, + "skip_count": 0.0, + "step": 782, + "text_loss": 0.6137266159057617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6809509832697387, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009980830850542391, + "loss": 0.058, + "macro_f1": 0.3333333432674408, + "num_tokens": 1267130.0, + "repeat_count": 0.0, + "routers_loss": 0.018471000716090202, + "skip_count": 0.0, + "step": 784, + "text_loss": 0.15213175117969513 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.6903434106251836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2353515625, + "learning_rate": 0.0009980559127443166, + "loss": 0.052, + "macro_f1": 0.3333333432674408, + "num_tokens": 1271129.0, + "repeat_count": 0.0, + "routers_loss": 0.007903140969574451, + "skip_count": 0.0, + "step": 786, + "text_loss": 0.5768613219261169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.699735837980628, + "f1_execute": 0.923076868057251, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.000998028549578429, + "loss": 0.0719, + "macro_f1": 0.307692289352417, + "num_tokens": 1274232.0, + "repeat_count": 0.0, + "routers_loss": 0.06737866252660751, + "skip_count": 3.0, + "step": 788, + "text_loss": 0.2877073585987091 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.709128265336073, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009980009955670615, + "loss": 0.0698, + "macro_f1": 0.3144654333591461, + "num_tokens": 1277193.0, + "repeat_count": 0.0, + "routers_loss": 0.10194934904575348, + "skip_count": 3.0, + "step": 790, + "text_loss": 0.11860492825508118 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7185206926915173, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.000997973250720773, + "loss": 0.0552, + "macro_f1": 0.32098764181137085, + "num_tokens": 1280960.0, + "repeat_count": 0.0, + "routers_loss": 0.10297708213329315, + "skip_count": 2.0, + "step": 792, + "text_loss": 0.13477706909179688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.727913120046962, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009979453150501954, + "loss": 0.0663, + "macro_f1": 0.32098764181137085, + "num_tokens": 1284611.0, + "repeat_count": 1.0, + "routers_loss": 0.06122037023305893, + "skip_count": 1.0, + "step": 794, + "text_loss": 0.40569379925727844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.737305547402407, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000997917188566034, + "loss": 0.062, + "macro_f1": 0.32098764181137085, + "num_tokens": 1287834.0, + "repeat_count": 0.0, + "routers_loss": 0.061135001480579376, + "skip_count": 2.0, + "step": 796, + "text_loss": 0.2829287648200989 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7466979747578515, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.109375, + "learning_rate": 0.0009978888712790664, + "loss": 0.0654, + "macro_f1": 0.3272727429866791, + "num_tokens": 1291666.0, + "repeat_count": 0.0, + "routers_loss": 0.04841872677206993, + "skip_count": 1.0, + "step": 798, + "text_loss": 1.011757254600525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.20000000298023224, + "avg_layers": 27.0, + "epoch": 3.756090402113296, + "f1_execute": 0.8979591727256775, + "f1_repeat": 0.0, + "f1_skip": 0.3333333134651184, + "grad_norm": 0.14453125, + "learning_rate": 0.0009978603632001444, + "loss": 0.0636, + "macro_f1": 0.4104308485984802, + "num_tokens": 1294627.0, + "repeat_count": 1.0, + "routers_loss": 0.15698759257793427, + "skip_count": 5.0, + "step": 800, + "text_loss": 0.4457623362541199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0009978316643401916, + "loss": 0.0688, + "macro_f1": 0.3333333432674408, + "num_tokens": 1297711.0, + "repeat_count": 0.0, + "routers_loss": 0.018952010199427605, + "skip_count": 0.0, + "step": 802, + "text_loss": 0.2069481462240219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.7748752568241857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.14453125, + "learning_rate": 0.0009978027747102062, + "loss": 0.0479, + "macro_f1": 0.3333333432674408, + "num_tokens": 1300569.0, + "repeat_count": 0.0, + "routers_loss": 0.014538386836647987, + "skip_count": 0.0, + "step": 804, + "text_loss": 0.4983852505683899 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.78426768417963, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2109375, + "learning_rate": 0.0009977736943212584, + "loss": 0.0721, + "macro_f1": 0.32098764181137085, + "num_tokens": 1303969.0, + "repeat_count": 0.0, + "routers_loss": 0.11164087057113647, + "skip_count": 2.0, + "step": 806, + "text_loss": 0.2910642921924591 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.793660111535075, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.000997744423184492, + "loss": 0.0424, + "macro_f1": 0.3272727429866791, + "num_tokens": 1307263.0, + "repeat_count": 0.0, + "routers_loss": 0.06073406711220741, + "skip_count": 1.0, + "step": 808, + "text_loss": 0.18831779062747955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 3.8030525388905194, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.26171875, + "learning_rate": 0.0009977149613111236, + "loss": 0.0486, + "macro_f1": 0.4400000274181366, + "num_tokens": 1309953.0, + "repeat_count": 1.0, + "routers_loss": 0.11035524308681488, + "skip_count": 4.0, + "step": 810, + "text_loss": 0.7872759699821472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8124449662459643, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009976853087124433, + "loss": 0.0536, + "macro_f1": 0.3333333432674408, + "num_tokens": 1313243.0, + "repeat_count": 0.0, + "routers_loss": 0.021804286167025566, + "skip_count": 0.0, + "step": 812, + "text_loss": 0.22349292039871216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.8218373936014087, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28125, + "learning_rate": 0.0009976554653998138, + "loss": 0.0612, + "macro_f1": 0.31446540355682373, + "num_tokens": 1316165.0, + "repeat_count": 0.0, + "routers_loss": 0.10715524107217789, + "skip_count": 2.0, + "step": 814, + "text_loss": 0.18035532534122467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8312298209568536, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000997625431384671, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1319206.0, + "repeat_count": 0.0, + "routers_loss": 0.007173649035394192, + "skip_count": 0.0, + "step": 816, + "text_loss": 0.48928648233413696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8406222483122985, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009975952066785243, + "loss": 0.0655, + "macro_f1": 0.3006536364555359, + "num_tokens": 1322549.0, + "repeat_count": 1.0, + "routers_loss": 0.22308112680912018, + "skip_count": 4.0, + "step": 818, + "text_loss": 0.5211259722709656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009975647912929557, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1325213.0, + "repeat_count": 0.0, + "routers_loss": 0.00998698640614748, + "skip_count": 0.0, + "step": 820, + "text_loss": 0.7117052674293518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8594071030231873, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.0009975341852396205, + "loss": 0.0723, + "macro_f1": 0.32098764181137085, + "num_tokens": 1328383.0, + "repeat_count": 0.0, + "routers_loss": 0.07454588264226913, + "skip_count": 2.0, + "step": 822, + "text_loss": 0.34539610147476196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8687995303786322, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009975033885302469, + "loss": 0.0604, + "macro_f1": 0.3333333432674408, + "num_tokens": 1331406.0, + "repeat_count": 0.0, + "routers_loss": 0.009157589636743069, + "skip_count": 0.0, + "step": 824, + "text_loss": 0.7484824657440186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.878191957734077, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009974724011766363, + "loss": 0.0474, + "macro_f1": 0.3272727429866791, + "num_tokens": 1334410.0, + "repeat_count": 1.0, + "routers_loss": 0.17149391770362854, + "skip_count": 0.0, + "step": 826, + "text_loss": 0.5913820266723633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8875843850895215, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009974412231906632, + "loss": 0.058, + "macro_f1": 0.32098764181137085, + "num_tokens": 1337653.0, + "repeat_count": 1.0, + "routers_loss": 0.09743282198905945, + "skip_count": 1.0, + "step": 828, + "text_loss": 0.2505693733692169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.8969768124449664, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009974098545842748, + "loss": 0.0638, + "macro_f1": 0.3272727429866791, + "num_tokens": 1340860.0, + "repeat_count": 0.0, + "routers_loss": 0.041490405797958374, + "skip_count": 1.0, + "step": 830, + "text_loss": 0.5585370063781738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 3.906369239800411, + "f1_execute": 0.9019607901573181, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009973782953694918, + "loss": 0.0746, + "macro_f1": 0.3006536066532135, + "num_tokens": 1344232.0, + "repeat_count": 1.0, + "routers_loss": 0.16080693900585175, + "skip_count": 3.0, + "step": 832, + "text_loss": 0.4782734513282776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9157616671558557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.000997346545558408, + "loss": 0.0522, + "macro_f1": 0.3333333432674408, + "num_tokens": 1347667.0, + "repeat_count": 0.0, + "routers_loss": 0.01173500344157219, + "skip_count": 0.0, + "step": 834, + "text_loss": 0.25036177039146423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.0009973146051631895, + "loss": 0.0522, + "macro_f1": 0.3333333432674408, + "num_tokens": 1350707.0, + "repeat_count": 0.0, + "routers_loss": 0.011477196589112282, + "skip_count": 0.0, + "step": 836, + "text_loss": 0.5482863187789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1650390625, + "learning_rate": 0.0009972824741960764, + "loss": 0.0536, + "macro_f1": 0.3333333432674408, + "num_tokens": 1353704.0, + "repeat_count": 0.0, + "routers_loss": 0.010528896935284138, + "skip_count": 0.0, + "step": 838, + "text_loss": 0.6732596158981323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9439389492221895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1181640625, + "learning_rate": 0.000997250152669381, + "loss": 0.0573, + "macro_f1": 0.3333333432674408, + "num_tokens": 1356608.0, + "repeat_count": 0.0, + "routers_loss": 0.010678744874894619, + "skip_count": 0.0, + "step": 840, + "text_loss": 0.5479338765144348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9533313765776343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.181640625, + "learning_rate": 0.000997217640595489, + "loss": 0.0631, + "macro_f1": 0.3333333432674408, + "num_tokens": 1359809.0, + "repeat_count": 0.0, + "routers_loss": 0.00835978239774704, + "skip_count": 0.0, + "step": 842, + "text_loss": 0.42543259263038635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9627238039330788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009971849379868593, + "loss": 0.0653, + "macro_f1": 0.3333333432674408, + "num_tokens": 1362201.0, + "repeat_count": 0.0, + "routers_loss": 0.009930923581123352, + "skip_count": 0.0, + "step": 844, + "text_loss": 0.720462441444397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.9721162312885236, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009971520448560235, + "loss": 0.0615, + "macro_f1": 0.3272727429866791, + "num_tokens": 1365790.0, + "repeat_count": 0.0, + "routers_loss": 0.06344373524188995, + "skip_count": 1.0, + "step": 846, + "text_loss": 0.8423607349395752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 3.9815086586439685, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.16796875, + "learning_rate": 0.000997118961215586, + "loss": 0.0674, + "macro_f1": 0.4533333480358124, + "num_tokens": 1368387.0, + "repeat_count": 1.0, + "routers_loss": 0.14688406884670258, + "skip_count": 3.0, + "step": 848, + "text_loss": 0.3933577537536621 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 3.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.263671875, + "learning_rate": 0.000997085687078225, + "loss": 0.0518, + "macro_f1": 0.3333333432674408, + "num_tokens": 1371189.0, + "repeat_count": 0.0, + "routers_loss": 0.009953443892300129, + "skip_count": 0.0, + "step": 850, + "text_loss": 0.41469162702560425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.0, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009970522224566909, + "loss": 0.0555, + "macro_f1": 0.32098767161369324, + "num_tokens": 1374008.0, + "repeat_count": 0.0, + "routers_loss": 0.048870690166950226, + "skip_count": 1.0, + "step": 852, + "text_loss": 0.613615870475769 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.009392427355444, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.283203125, + "learning_rate": 0.0009970185673638075, + "loss": 0.0629, + "macro_f1": 0.32098764181137085, + "num_tokens": 1376662.0, + "repeat_count": 1.0, + "routers_loss": 0.06865929812192917, + "skip_count": 1.0, + "step": 854, + "text_loss": 0.4392736256122589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 4.01878485471089, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.162109375, + "learning_rate": 0.0009969847218124716, + "loss": 0.0506, + "macro_f1": 0.5492662787437439, + "num_tokens": 1380049.0, + "repeat_count": 0.0, + "routers_loss": 0.02382219396531582, + "skip_count": 1.0, + "step": 856, + "text_loss": 0.19115346670150757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.028177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009969506858156527, + "loss": 0.0344, + "macro_f1": 0.3272727429866791, + "num_tokens": 1383008.0, + "repeat_count": 0.0, + "routers_loss": 0.03907281160354614, + "skip_count": 1.0, + "step": 858, + "text_loss": 0.34842637181282043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.037569709421779, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12060546875, + "learning_rate": 0.0009969164593863935, + "loss": 0.0365, + "macro_f1": 0.3333333432674408, + "num_tokens": 1387051.0, + "repeat_count": 0.0, + "routers_loss": 0.007645803038030863, + "skip_count": 0.0, + "step": 860, + "text_loss": 0.3810436725616455 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.046962136777223, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009968820425378098, + "loss": 0.0463, + "macro_f1": 0.3272727429866791, + "num_tokens": 1390244.0, + "repeat_count": 1.0, + "routers_loss": 0.04435238987207413, + "skip_count": 0.0, + "step": 862, + "text_loss": 0.34853485226631165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.056354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.28515625, + "learning_rate": 0.00099684743528309, + "loss": 0.0424, + "macro_f1": 0.3333333432674408, + "num_tokens": 1392976.0, + "repeat_count": 0.0, + "routers_loss": 0.006071661598980427, + "skip_count": 0.0, + "step": 864, + "text_loss": 0.6395178437232971 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.065746991488113, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009968126376354958, + "loss": 0.0477, + "macro_f1": 0.5492662787437439, + "num_tokens": 1396061.0, + "repeat_count": 0.0, + "routers_loss": 0.05011235550045967, + "skip_count": 2.0, + "step": 866, + "text_loss": 0.09103966504335403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.075139418843557, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009967776496083616, + "loss": 0.0509, + "macro_f1": 0.3272727429866791, + "num_tokens": 1398993.0, + "repeat_count": 1.0, + "routers_loss": 0.03979124873876572, + "skip_count": 0.0, + "step": 868, + "text_loss": 0.27257058024406433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.084531846199002, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.14453125, + "learning_rate": 0.000996742471215095, + "loss": 0.0516, + "macro_f1": 0.5492662787437439, + "num_tokens": 1402080.0, + "repeat_count": 0.0, + "routers_loss": 0.030823837965726852, + "skip_count": 2.0, + "step": 870, + "text_loss": 0.7047103047370911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.093924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009967071024691763, + "loss": 0.0461, + "macro_f1": 0.3333333432674408, + "num_tokens": 1404890.0, + "repeat_count": 0.0, + "routers_loss": 0.009721715934574604, + "skip_count": 0.0, + "step": 872, + "text_loss": 0.959106981754303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.000996671543384159, + "loss": 0.05, + "macro_f1": 0.3333333432674408, + "num_tokens": 1407853.0, + "repeat_count": 0.0, + "routers_loss": 0.006025883834809065, + "skip_count": 0.0, + "step": 874, + "text_loss": 0.47571972012519836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.112709128265336, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.0009966357939736692, + "loss": 0.0416, + "macro_f1": 0.3272727429866791, + "num_tokens": 1410723.0, + "repeat_count": 0.0, + "routers_loss": 0.025964925065636635, + "skip_count": 0.0, + "step": 876, + "text_loss": 0.4964611530303955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.122101555620781, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009965998542514065, + "loss": 0.0415, + "macro_f1": 0.32098764181137085, + "num_tokens": 1414008.0, + "repeat_count": 0.0, + "routers_loss": 0.09509637206792831, + "skip_count": 2.0, + "step": 878, + "text_loss": 0.621494710445404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 4.131493982976226, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009965637242311427, + "loss": 0.0472, + "macro_f1": 0.542222261428833, + "num_tokens": 1417447.0, + "repeat_count": 0.0, + "routers_loss": 0.02520318515598774, + "skip_count": 4.0, + "step": 880, + "text_loss": 0.40209758281707764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 25.0, + "epoch": 4.14088641033167, + "f1_execute": 0.936170220375061, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.263671875, + "learning_rate": 0.000996527403926723, + "loss": 0.0495, + "macro_f1": 0.5342789888381958, + "num_tokens": 1419905.0, + "repeat_count": 0.0, + "routers_loss": 0.13183781504631042, + "skip_count": 6.0, + "step": 882, + "text_loss": 0.642185389995575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.1502788376871145, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009964908933520655, + "loss": 0.0375, + "macro_f1": 0.3333333432674408, + "num_tokens": 1423436.0, + "repeat_count": 0.0, + "routers_loss": 0.009429510682821274, + "skip_count": 0.0, + "step": 884, + "text_loss": 0.48232755064964294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.15967126504256, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1669921875, + "learning_rate": 0.0009964541925211613, + "loss": 0.0349, + "macro_f1": 0.32098764181137085, + "num_tokens": 1426842.0, + "repeat_count": 0.0, + "routers_loss": 0.07629609107971191, + "skip_count": 2.0, + "step": 886, + "text_loss": 0.16620934009552002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.169063692398004, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009964173014480738, + "loss": 0.0348, + "macro_f1": 0.5492662787437439, + "num_tokens": 1430430.0, + "repeat_count": 0.0, + "routers_loss": 0.036814019083976746, + "skip_count": 2.0, + "step": 888, + "text_loss": 0.4866008758544922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009963802201469398, + "loss": 0.0476, + "macro_f1": 0.3333333432674408, + "num_tokens": 1433821.0, + "repeat_count": 0.0, + "routers_loss": 0.0041250260546803474, + "skip_count": 0.0, + "step": 890, + "text_loss": 0.578216552734375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.187848547108893, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2373046875, + "learning_rate": 0.0009963429486319693, + "loss": 0.0463, + "macro_f1": 0.32098764181137085, + "num_tokens": 1436976.0, + "repeat_count": 0.0, + "routers_loss": 0.06213559955358505, + "skip_count": 2.0, + "step": 892, + "text_loss": 0.221701517701149 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 4.197240974464338, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.361328125, + "learning_rate": 0.0009963054869174446, + "loss": 0.0313, + "macro_f1": 0.4871794879436493, + "num_tokens": 1440397.0, + "repeat_count": 0.0, + "routers_loss": 0.07532428950071335, + "skip_count": 2.0, + "step": 894, + "text_loss": 0.6922838091850281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.206633401819783, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009962678350177209, + "loss": 0.0472, + "macro_f1": 0.3272727429866791, + "num_tokens": 1443604.0, + "repeat_count": 0.0, + "routers_loss": 0.0419243648648262, + "skip_count": 1.0, + "step": 896, + "text_loss": 0.22092342376708984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.216025829175227, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009962299929472268, + "loss": 0.034, + "macro_f1": 0.32098764181137085, + "num_tokens": 1446257.0, + "repeat_count": 2.0, + "routers_loss": 0.10849297791719437, + "skip_count": 0.0, + "step": 898, + "text_loss": 0.26394811272621155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.000996191960720463, + "loss": 0.0394, + "macro_f1": 0.3333333432674408, + "num_tokens": 1449669.0, + "repeat_count": 0.0, + "routers_loss": 0.0092767970636487, + "skip_count": 0.0, + "step": 900, + "text_loss": 0.5338577628135681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.234810683886117, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009961537383520042, + "loss": 0.0354, + "macro_f1": 0.3272727429866791, + "num_tokens": 1452450.0, + "repeat_count": 1.0, + "routers_loss": 0.02985367365181446, + "skip_count": 0.0, + "step": 902, + "text_loss": 0.5875228047370911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.2442031112415615, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009961153258564966, + "loss": 0.0378, + "macro_f1": 0.3144654333591461, + "num_tokens": 1456909.0, + "repeat_count": 0.0, + "routers_loss": 0.06794842332601547, + "skip_count": 3.0, + "step": 904, + "text_loss": 0.40959444642066956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.253595538597006, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009960767232486604, + "loss": 0.0476, + "macro_f1": 0.3333333432674408, + "num_tokens": 1461712.0, + "repeat_count": 0.0, + "routers_loss": 0.0023562447167932987, + "skip_count": 0.0, + "step": 906, + "text_loss": 0.3932875096797943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.000996037930543288, + "loss": 0.0505, + "macro_f1": 0.3272727429866791, + "num_tokens": 1464817.0, + "repeat_count": 0.0, + "routers_loss": 0.03880339860916138, + "skip_count": 1.0, + "step": 908, + "text_loss": 0.17482402920722961 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.272380393307896, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2119140625, + "learning_rate": 0.000995998947755245, + "loss": 0.0479, + "macro_f1": 0.3272727429866791, + "num_tokens": 1467810.0, + "repeat_count": 0.0, + "routers_loss": 0.01736828312277794, + "skip_count": 1.0, + "step": 910, + "text_loss": 0.4140470325946808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009959597748994695, + "loss": 0.0752, + "macro_f1": 0.3333333432674408, + "num_tokens": 1470802.0, + "repeat_count": 0.0, + "routers_loss": 0.011824851855635643, + "skip_count": 0.0, + "step": 912, + "text_loss": 0.7153383493423462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.2911652480187845, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009959204119909726, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1474539.0, + "repeat_count": 0.0, + "routers_loss": 0.025456594303250313, + "skip_count": 0.0, + "step": 914, + "text_loss": 0.42812058329582214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009958808590448385, + "loss": 0.0489, + "macro_f1": 0.3333333432674408, + "num_tokens": 1477552.0, + "repeat_count": 0.0, + "routers_loss": 0.006795851048082113, + "skip_count": 0.0, + "step": 916, + "text_loss": 0.5402814149856567 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009958411160762234, + "loss": 0.039, + "macro_f1": 0.3333333432674408, + "num_tokens": 1482547.0, + "repeat_count": 0.0, + "routers_loss": 0.015615932643413544, + "skip_count": 0.0, + "step": 918, + "text_loss": 0.3836168050765991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.319342530085119, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009958011831003577, + "loss": 0.0448, + "macro_f1": 0.3272727429866791, + "num_tokens": 1485807.0, + "repeat_count": 0.0, + "routers_loss": 0.043541423976421356, + "skip_count": 1.0, + "step": 920, + "text_loss": 0.4333936274051666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 4.328734957440563, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1337890625, + "learning_rate": 0.000995761060132543, + "loss": 0.0418, + "macro_f1": 0.6538461446762085, + "num_tokens": 1488941.0, + "repeat_count": 1.0, + "routers_loss": 0.05866432189941406, + "skip_count": 2.0, + "step": 922, + "text_loss": 0.4106994867324829 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.3381273847960085, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009957207471881552, + "loss": 0.0531, + "macro_f1": 0.5492662787437439, + "num_tokens": 1492026.0, + "repeat_count": 0.0, + "routers_loss": 0.02714901603758335, + "skip_count": 2.0, + "step": 924, + "text_loss": 0.542091429233551 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.347519812151453, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.0009956802442826415, + "loss": 0.0386, + "macro_f1": 0.3272727429866791, + "num_tokens": 1494543.0, + "repeat_count": 1.0, + "routers_loss": 0.0563737191259861, + "skip_count": 0.0, + "step": 926, + "text_loss": 0.47209203243255615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.356912239506897, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009956395514315235, + "loss": 0.0496, + "macro_f1": 0.3272727429866791, + "num_tokens": 1497831.0, + "repeat_count": 1.0, + "routers_loss": 0.03285066783428192, + "skip_count": 0.0, + "step": 928, + "text_loss": 0.6628931164741516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.366304666862343, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009955986686503943, + "loss": 0.0466, + "macro_f1": 0.3272727429866791, + "num_tokens": 1501375.0, + "repeat_count": 0.0, + "routers_loss": 0.024297121912240982, + "skip_count": 1.0, + "step": 930, + "text_loss": 0.495676189661026 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 4.375697094217787, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009955575959549202, + "loss": 0.0424, + "macro_f1": 0.7795917987823486, + "num_tokens": 1504363.0, + "repeat_count": 1.0, + "routers_loss": 0.12196464836597443, + "skip_count": 4.0, + "step": 932, + "text_loss": 0.26123273372650146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.0009955163333608408, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 1507178.0, + "repeat_count": 0.0, + "routers_loss": 0.012947078794240952, + "skip_count": 0.0, + "step": 934, + "text_loss": 0.32552677392959595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.394481948928676, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009954748808839674, + "loss": 0.0379, + "macro_f1": 0.3333333432674408, + "num_tokens": 1509910.0, + "repeat_count": 0.0, + "routers_loss": 0.008946365676820278, + "skip_count": 0.0, + "step": 936, + "text_loss": 0.533141016960144 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.403874376284121, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000995433238540185, + "loss": 0.0466, + "macro_f1": 0.6538461446762085, + "num_tokens": 1512826.0, + "repeat_count": 1.0, + "routers_loss": 0.029975678771734238, + "skip_count": 1.0, + "step": 938, + "text_loss": 0.2953577935695648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.413266803639566, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009953914063454512, + "loss": 0.0497, + "macro_f1": 0.3144654333591461, + "num_tokens": 1517230.0, + "repeat_count": 1.0, + "routers_loss": 0.0889134630560875, + "skip_count": 2.0, + "step": 940, + "text_loss": 0.5368834733963013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.000995349384315796, + "loss": 0.0413, + "macro_f1": 0.3333333432674408, + "num_tokens": 1519876.0, + "repeat_count": 0.0, + "routers_loss": 0.013458753935992718, + "skip_count": 0.0, + "step": 942, + "text_loss": 0.2005518227815628 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.432051658350455, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.000995307172467322, + "loss": 0.0444, + "macro_f1": 0.31446540355682373, + "num_tokens": 1522998.0, + "repeat_count": 1.0, + "routers_loss": 0.08850377053022385, + "skip_count": 1.0, + "step": 944, + "text_loss": 0.227926567196846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.4414440857059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009952647708162054, + "loss": 0.0503, + "macro_f1": 0.3272727429866791, + "num_tokens": 1527100.0, + "repeat_count": 0.0, + "routers_loss": 0.03199794515967369, + "skip_count": 1.0, + "step": 946, + "text_loss": 0.4859686493873596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.450836513061344, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009952221793786942, + "loss": 0.0354, + "macro_f1": 0.3333333432674408, + "num_tokens": 1530028.0, + "repeat_count": 0.0, + "routers_loss": 0.006507779937237501, + "skip_count": 0.0, + "step": 948, + "text_loss": 0.6855354905128479 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.460228940416789, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009951793981711097, + "loss": 0.0584, + "macro_f1": 0.6538461446762085, + "num_tokens": 1533254.0, + "repeat_count": 1.0, + "routers_loss": 0.06175103038549423, + "skip_count": 1.0, + "step": 950, + "text_loss": 0.7590400576591492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.469621367772234, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009951364272098458, + "loss": 0.0295, + "macro_f1": 0.5492662787437439, + "num_tokens": 1536239.0, + "repeat_count": 0.0, + "routers_loss": 0.03773383051156998, + "skip_count": 2.0, + "step": 952, + "text_loss": 0.669784665107727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.4790137951276785, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009950932665113688, + "loss": 0.0507, + "macro_f1": 0.32098764181137085, + "num_tokens": 1539682.0, + "repeat_count": 0.0, + "routers_loss": 0.07280613481998444, + "skip_count": 2.0, + "step": 954, + "text_loss": 0.3365570902824402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009950499160922184, + "loss": 0.0541, + "macro_f1": 0.3333333432674408, + "num_tokens": 1542875.0, + "repeat_count": 0.0, + "routers_loss": 0.01770266517996788, + "skip_count": 0.0, + "step": 956, + "text_loss": 0.0921545997262001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.497798649838567, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09375, + "learning_rate": 0.000995006375969006, + "loss": 0.0473, + "macro_f1": 0.3272727429866791, + "num_tokens": 1547135.0, + "repeat_count": 1.0, + "routers_loss": 0.07672002166509628, + "skip_count": 0.0, + "step": 958, + "text_loss": 0.5887606739997864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.507191077194013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009949626461584165, + "loss": 0.043, + "macro_f1": 0.3333333432674408, + "num_tokens": 1550100.0, + "repeat_count": 0.0, + "routers_loss": 0.006247182376682758, + "skip_count": 0.0, + "step": 960, + "text_loss": 0.5777931213378906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.119140625, + "learning_rate": 0.0009949187266772076, + "loss": 0.0366, + "macro_f1": 0.5492662787437439, + "num_tokens": 1553192.0, + "repeat_count": 0.0, + "routers_loss": 0.030319908633828163, + "skip_count": 2.0, + "step": 962, + "text_loss": 0.2370252162218094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.5259759319049016, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009948746175422088, + "loss": 0.0511, + "macro_f1": 0.3333333432674408, + "num_tokens": 1556318.0, + "repeat_count": 0.0, + "routers_loss": 0.006004320923238993, + "skip_count": 0.0, + "step": 964, + "text_loss": 0.6271032094955444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000994830318770323, + "loss": 0.0514, + "macro_f1": 0.3333333432674408, + "num_tokens": 1559195.0, + "repeat_count": 0.0, + "routers_loss": 0.011544366367161274, + "skip_count": 0.0, + "step": 966, + "text_loss": 0.47256720066070557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 4.544760786615791, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009947858303785255, + "loss": 0.0374, + "macro_f1": 0.6603773832321167, + "num_tokens": 1561813.0, + "repeat_count": 1.0, + "routers_loss": 0.05258861929178238, + "skip_count": 1.0, + "step": 968, + "text_loss": 0.7703132629394531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.554153213971236, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.0009947411523838648, + "loss": 0.0453, + "macro_f1": 0.3333333432674408, + "num_tokens": 1564634.0, + "repeat_count": 0.0, + "routers_loss": 0.011216280050575733, + "skip_count": 0.0, + "step": 970, + "text_loss": 0.4666804075241089 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0009946962848034608, + "loss": 0.0696, + "macro_f1": 0.3333333432674408, + "num_tokens": 1567959.0, + "repeat_count": 0.0, + "routers_loss": 0.009387624450027943, + "skip_count": 0.0, + "step": 972, + "text_loss": 0.4067264199256897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.5729380686821255, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.203125, + "learning_rate": 0.0009946512276545075, + "loss": 0.0397, + "macro_f1": 0.3272727429866791, + "num_tokens": 1571221.0, + "repeat_count": 1.0, + "routers_loss": 0.041713520884513855, + "skip_count": 0.0, + "step": 974, + "text_loss": 0.5242366194725037 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 4.58233049603757, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.228515625, + "learning_rate": 0.0009946059809542705, + "loss": 0.0487, + "macro_f1": 0.7644445300102234, + "num_tokens": 1575033.0, + "repeat_count": 2.0, + "routers_loss": 0.05748331546783447, + "skip_count": 2.0, + "step": 976, + "text_loss": 0.5704690217971802 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 4.591722923393014, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009945605447200887, + "loss": 0.0445, + "macro_f1": 0.3272727429866791, + "num_tokens": 1579050.0, + "repeat_count": 0.0, + "routers_loss": 0.016765203326940536, + "skip_count": 0.0, + "step": 978, + "text_loss": 0.4804173707962036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.601115350748459, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009945149189693732, + "loss": 0.0406, + "macro_f1": 0.5492662787437439, + "num_tokens": 1582967.0, + "repeat_count": 0.0, + "routers_loss": 0.021518222987651825, + "skip_count": 2.0, + "step": 980, + "text_loss": 0.4138598144054413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.610507778103904, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009944691037196078, + "loss": 0.0456, + "macro_f1": 0.3333333432674408, + "num_tokens": 1586282.0, + "repeat_count": 0.0, + "routers_loss": 0.012246460653841496, + "skip_count": 0.0, + "step": 982, + "text_loss": 0.22561736404895782 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 4.6199002054593485, + "f1_execute": 0.930232584476471, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.8000000715255737, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009944230989883491, + "loss": 0.0456, + "macro_f1": 0.7989664077758789, + "num_tokens": 1589279.0, + "repeat_count": 2.0, + "routers_loss": 0.09344895929098129, + "skip_count": 5.0, + "step": 984, + "text_loss": 0.4416656494140625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.629292632814793, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.111328125, + "learning_rate": 0.0009943769047932264, + "loss": 0.0404, + "macro_f1": 0.5359477400779724, + "num_tokens": 1592398.0, + "repeat_count": 2.0, + "routers_loss": 0.08916857838630676, + "skip_count": 2.0, + "step": 986, + "text_loss": 0.5536438822746277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.638685060170237, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15234375, + "learning_rate": 0.000994330521151941, + "loss": 0.039, + "macro_f1": 0.32098764181137085, + "num_tokens": 1596213.0, + "repeat_count": 1.0, + "routers_loss": 0.06114347651600838, + "skip_count": 1.0, + "step": 988, + "text_loss": 0.5835405588150024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.000994283948082267, + "loss": 0.0573, + "macro_f1": 0.3333333432674408, + "num_tokens": 1598827.0, + "repeat_count": 0.0, + "routers_loss": 0.0017335431184619665, + "skip_count": 0.0, + "step": 990, + "text_loss": 0.5857380032539368 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.657469914881127, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009942371856020522, + "loss": 0.0341, + "macro_f1": 0.3333333432674408, + "num_tokens": 1602915.0, + "repeat_count": 0.0, + "routers_loss": 0.014606470242142677, + "skip_count": 0.0, + "step": 992, + "text_loss": 0.6939892768859863 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 4.666862342236572, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009941902337292155, + "loss": 0.06, + "macro_f1": 0.6598639488220215, + "num_tokens": 1605776.0, + "repeat_count": 3.0, + "routers_loss": 0.06297315657138824, + "skip_count": 1.0, + "step": 994, + "text_loss": 0.37616831064224243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.676254769592017, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009941430924817487, + "loss": 0.0572, + "macro_f1": 0.5492662787437439, + "num_tokens": 1609856.0, + "repeat_count": 0.0, + "routers_loss": 0.03297794610261917, + "skip_count": 2.0, + "step": 996, + "text_loss": 0.2098303586244583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.685647196947461, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.000994095761877717, + "loss": 0.0499, + "macro_f1": 0.3333333432674408, + "num_tokens": 1612904.0, + "repeat_count": 0.0, + "routers_loss": 0.012901155278086662, + "skip_count": 0.0, + "step": 998, + "text_loss": 0.20103533565998077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.259765625, + "learning_rate": 0.000994048241935257, + "loss": 0.0535, + "macro_f1": 0.3272727429866791, + "num_tokens": 1615540.0, + "repeat_count": 0.0, + "routers_loss": 0.020434845238924026, + "skip_count": 0.0, + "step": 1000, + "text_loss": 0.32709044218063354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.70443205165835, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1669921875, + "learning_rate": 0.0009940005326725789, + "loss": 0.0453, + "macro_f1": 0.32098764181137085, + "num_tokens": 1618786.0, + "repeat_count": 0.0, + "routers_loss": 0.07831378281116486, + "skip_count": 2.0, + "step": 1002, + "text_loss": 0.5789632797241211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21875, + "learning_rate": 0.0009939526341079647, + "loss": 0.0511, + "macro_f1": 0.32098764181137085, + "num_tokens": 1621736.0, + "repeat_count": 2.0, + "routers_loss": 0.04863874986767769, + "skip_count": 0.0, + "step": 1004, + "text_loss": 0.6128849387168884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009939045462597693, + "loss": 0.0538, + "macro_f1": 0.3333333432674408, + "num_tokens": 1624649.0, + "repeat_count": 0.0, + "routers_loss": 0.00677989237010479, + "skip_count": 0.0, + "step": 1006, + "text_loss": 0.6168264150619507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.732609333724684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009938562691464202, + "loss": 0.0524, + "macro_f1": 0.3333333432674408, + "num_tokens": 1627700.0, + "repeat_count": 0.0, + "routers_loss": 0.019490402191877365, + "skip_count": 0.0, + "step": 1008, + "text_loss": 0.17463822662830353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.000993807802786417, + "loss": 0.0475, + "macro_f1": 0.3333333432674408, + "num_tokens": 1630714.0, + "repeat_count": 0.0, + "routers_loss": 0.0019022391643375158, + "skip_count": 0.0, + "step": 1010, + "text_loss": 0.5675593018531799 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 4.751394188435574, + "f1_execute": 0.9599999785423279, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1640625, + "learning_rate": 0.0009937591471983322, + "loss": 0.0501, + "macro_f1": 0.7644444704055786, + "num_tokens": 1633770.0, + "repeat_count": 1.0, + "routers_loss": 0.042485643178224564, + "skip_count": 2.0, + "step": 1012, + "text_loss": 0.42387229204177856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.760786615791019, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009937103024008109, + "loss": 0.0545, + "macro_f1": 0.3272727429866791, + "num_tokens": 1637120.0, + "repeat_count": 0.0, + "routers_loss": 0.09427817165851593, + "skip_count": 1.0, + "step": 1014, + "text_loss": 0.49511051177978516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009936612684125702, + "loss": 0.0503, + "macro_f1": 0.3333333432674408, + "num_tokens": 1640165.0, + "repeat_count": 0.0, + "routers_loss": 0.005106127820909023, + "skip_count": 0.0, + "step": 1016, + "text_loss": 0.5398799180984497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.7795714705019074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2734375, + "learning_rate": 0.0009936120452524004, + "loss": 0.0506, + "macro_f1": 0.3333333432674408, + "num_tokens": 1643251.0, + "repeat_count": 0.0, + "routers_loss": 0.016914300620555878, + "skip_count": 0.0, + "step": 1018, + "text_loss": 0.20882178843021393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.788963897857353, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1962890625, + "learning_rate": 0.0009935626329391637, + "loss": 0.0537, + "macro_f1": 0.32098764181137085, + "num_tokens": 1646560.0, + "repeat_count": 0.0, + "routers_loss": 0.13481520116329193, + "skip_count": 2.0, + "step": 1020, + "text_loss": 0.5719883441925049 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 4.798356325212797, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009935130314917948, + "loss": 0.0602, + "macro_f1": 0.5492662787437439, + "num_tokens": 1649538.0, + "repeat_count": 0.0, + "routers_loss": 0.07700438797473907, + "skip_count": 2.0, + "step": 1022, + "text_loss": 0.1303367167711258 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.807748752568242, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009934632409293015, + "loss": 0.0611, + "macro_f1": 0.32098764181137085, + "num_tokens": 1652397.0, + "repeat_count": 1.0, + "routers_loss": 0.11416907608509064, + "skip_count": 1.0, + "step": 1024, + "text_loss": 0.24076920747756958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.817141179923686, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.306640625, + "learning_rate": 0.0009934132612707631, + "loss": 0.0507, + "macro_f1": 0.31446540355682373, + "num_tokens": 1654938.0, + "repeat_count": 0.0, + "routers_loss": 0.09484589844942093, + "skip_count": 2.0, + "step": 1026, + "text_loss": 0.1652517318725586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009933630925353324, + "loss": 0.0395, + "macro_f1": 0.3333333432674408, + "num_tokens": 1658536.0, + "repeat_count": 0.0, + "routers_loss": 0.00741987070068717, + "skip_count": 0.0, + "step": 1028, + "text_loss": 0.49296700954437256 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.835926034634576, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1845703125, + "learning_rate": 0.0009933127347422337, + "loss": 0.0602, + "macro_f1": 0.32098764181137085, + "num_tokens": 1661446.0, + "repeat_count": 0.0, + "routers_loss": 0.08399344235658646, + "skip_count": 2.0, + "step": 1030, + "text_loss": 0.22363591194152832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.158203125, + "learning_rate": 0.0009932621879107648, + "loss": 0.0475, + "macro_f1": 0.3333333432674408, + "num_tokens": 1664612.0, + "repeat_count": 0.0, + "routers_loss": 0.0031781597062945366, + "skip_count": 0.0, + "step": 1032, + "text_loss": 0.36083245277404785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.854710889345466, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2275390625, + "learning_rate": 0.000993211452060295, + "loss": 0.042, + "macro_f1": 0.3272727429866791, + "num_tokens": 1667467.0, + "repeat_count": 0.0, + "routers_loss": 0.03595469892024994, + "skip_count": 1.0, + "step": 1034, + "text_loss": 0.16372856497764587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.86410331670091, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.189453125, + "learning_rate": 0.000993160527210266, + "loss": 0.061, + "macro_f1": 0.3144654333591461, + "num_tokens": 1670675.0, + "repeat_count": 3.0, + "routers_loss": 0.1597205102443695, + "skip_count": 0.0, + "step": 1036, + "text_loss": 0.6049913763999939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2197265625, + "learning_rate": 0.000993109413380193, + "loss": 0.0562, + "macro_f1": 0.3333333432674408, + "num_tokens": 1673477.0, + "repeat_count": 0.0, + "routers_loss": 0.009756010957062244, + "skip_count": 0.0, + "step": 1038, + "text_loss": 0.7034620642662048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 4.882888171411799, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.0009930581105896624, + "loss": 0.0559, + "macro_f1": 0.3272727429866791, + "num_tokens": 1676809.0, + "repeat_count": 0.0, + "routers_loss": 0.020718922838568687, + "skip_count": 0.0, + "step": 1040, + "text_loss": 0.2814720571041107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.892280598767244, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1923828125, + "learning_rate": 0.0009930066188583338, + "loss": 0.0445, + "macro_f1": 0.32098764181137085, + "num_tokens": 1679398.0, + "repeat_count": 1.0, + "routers_loss": 0.04755603149533272, + "skip_count": 1.0, + "step": 1042, + "text_loss": 0.5445759296417236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.0009929549382059388, + "loss": 0.0509, + "macro_f1": 0.3333333432674408, + "num_tokens": 1682269.0, + "repeat_count": 0.0, + "routers_loss": 0.01040949858725071, + "skip_count": 0.0, + "step": 1044, + "text_loss": 0.2876914143562317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.911065453478133, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009929030686522816, + "loss": 0.0363, + "macro_f1": 0.3333333432674408, + "num_tokens": 1685428.0, + "repeat_count": 0.0, + "routers_loss": 0.008158888667821884, + "skip_count": 0.0, + "step": 1046, + "text_loss": 0.49053525924682617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.9204578808335775, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009928510102172386, + "loss": 0.0498, + "macro_f1": 0.3333333432674408, + "num_tokens": 1688252.0, + "repeat_count": 0.0, + "routers_loss": 0.005102572031319141, + "skip_count": 0.0, + "step": 1048, + "text_loss": 0.5274341106414795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009927987629207587, + "loss": 0.0564, + "macro_f1": 0.3333333432674408, + "num_tokens": 1691289.0, + "repeat_count": 0.0, + "routers_loss": 0.016768503934144974, + "skip_count": 0.0, + "step": 1050, + "text_loss": 0.9935035109519958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.939242735544467, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009927463267828634, + "loss": 0.0488, + "macro_f1": 0.3333333432674408, + "num_tokens": 1694148.0, + "repeat_count": 0.0, + "routers_loss": 0.010905829258263111, + "skip_count": 0.0, + "step": 1052, + "text_loss": 0.20895758271217346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.948635162899912, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.000992693701823646, + "loss": 0.0624, + "macro_f1": 0.3272727429866791, + "num_tokens": 1698543.0, + "repeat_count": 1.0, + "routers_loss": 0.10533971339464188, + "skip_count": 0.0, + "step": 1054, + "text_loss": 0.5776236653327942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.958027590255357, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009926408880632726, + "loss": 0.0556, + "macro_f1": 0.3272727429866791, + "num_tokens": 1702460.0, + "repeat_count": 0.0, + "routers_loss": 0.026313411071896553, + "skip_count": 1.0, + "step": 1056, + "text_loss": 0.34990596771240234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.967420017610801, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009925878855219818, + "loss": 0.0391, + "macro_f1": 0.3333333432674408, + "num_tokens": 1705686.0, + "repeat_count": 0.0, + "routers_loss": 0.007763393223285675, + "skip_count": 0.0, + "step": 1058, + "text_loss": 0.4980163276195526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.177734375, + "learning_rate": 0.000992534694220084, + "loss": 0.0613, + "macro_f1": 0.3272727429866791, + "num_tokens": 1708739.0, + "repeat_count": 0.0, + "routers_loss": 0.03998444974422455, + "skip_count": 1.0, + "step": 1060, + "text_loss": 0.29092350602149963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.98620487232169, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.000992481314177962, + "loss": 0.0312, + "macro_f1": 0.32098764181137085, + "num_tokens": 1711903.0, + "repeat_count": 1.0, + "routers_loss": 0.06966045498847961, + "skip_count": 1.0, + "step": 1062, + "text_loss": 0.6267179250717163 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 4.995597299677136, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.244140625, + "learning_rate": 0.0009924277454160717, + "loss": 0.0548, + "macro_f1": 0.3272727429866791, + "num_tokens": 1715974.0, + "repeat_count": 0.0, + "routers_loss": 0.05536063387989998, + "skip_count": 1.0, + "step": 1064, + "text_loss": 0.5813798904418945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.004696213677723, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009923739879549402, + "loss": 0.0423, + "macro_f1": 0.3333333432674408, + "num_tokens": 1718828.0, + "repeat_count": 0.0, + "routers_loss": 0.020993782207369804, + "skip_count": 0.0, + "step": 1066, + "text_loss": 0.22665327787399292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009923200418151677, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 1722419.0, + "repeat_count": 0.0, + "routers_loss": 0.007351701147854328, + "skip_count": 0.0, + "step": 1068, + "text_loss": 0.5796169638633728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.0234810683886115, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009922659070174264, + "loss": 0.0452, + "macro_f1": 0.3272727429866791, + "num_tokens": 1725663.0, + "repeat_count": 1.0, + "routers_loss": 0.026033315807580948, + "skip_count": 0.0, + "step": 1070, + "text_loss": 0.25742828845977783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009922115835824612, + "loss": 0.041, + "macro_f1": 0.3333333432674408, + "num_tokens": 1729239.0, + "repeat_count": 0.0, + "routers_loss": 0.0118600158020854, + "skip_count": 0.0, + "step": 1072, + "text_loss": 0.21630282700061798 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009921570715310884, + "loss": 0.0364, + "macro_f1": 0.6666666865348816, + "num_tokens": 1732507.0, + "repeat_count": 1.0, + "routers_loss": 0.016118815168738365, + "skip_count": 0.0, + "step": 1074, + "text_loss": 0.5639925003051758 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.051658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009921023708841974, + "loss": 0.0407, + "macro_f1": 0.3333333432674408, + "num_tokens": 1736182.0, + "repeat_count": 0.0, + "routers_loss": 0.004275390412658453, + "skip_count": 0.0, + "step": 1076, + "text_loss": 0.5758615136146545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009920474816627496, + "loss": 0.037, + "macro_f1": 0.3333333432674408, + "num_tokens": 1739559.0, + "repeat_count": 0.0, + "routers_loss": 0.01299292128533125, + "skip_count": 0.0, + "step": 1078, + "text_loss": 0.18221625685691833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.0704432051658355, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009919924038877788, + "loss": 0.0343, + "macro_f1": 0.32098764181137085, + "num_tokens": 1742890.0, + "repeat_count": 0.0, + "routers_loss": 0.038295745849609375, + "skip_count": 2.0, + "step": 1080, + "text_loss": 0.17354349792003632 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 5.07983563252128, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009919371375803905, + "loss": 0.0455, + "macro_f1": 0.8194444179534912, + "num_tokens": 1746433.0, + "repeat_count": 2.0, + "routers_loss": 0.04052971675992012, + "skip_count": 3.0, + "step": 1082, + "text_loss": 0.2250112146139145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009918816827617632, + "loss": 0.0353, + "macro_f1": 0.3333333432674408, + "num_tokens": 1750802.0, + "repeat_count": 0.0, + "routers_loss": 0.009114136919379234, + "skip_count": 0.0, + "step": 1084, + "text_loss": 0.2526719272136688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.098620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.000991826039453147, + "loss": 0.0392, + "macro_f1": 0.3333333432674408, + "num_tokens": 1754272.0, + "repeat_count": 0.0, + "routers_loss": 0.004904678091406822, + "skip_count": 0.0, + "step": 1086, + "text_loss": 0.7308789491653442 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 5.108012914587614, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.138671875, + "learning_rate": 0.000991770207675865, + "loss": 0.0327, + "macro_f1": 0.6666666865348816, + "num_tokens": 1757231.0, + "repeat_count": 0.0, + "routers_loss": 0.02129189297556877, + "skip_count": 2.0, + "step": 1088, + "text_loss": 0.21764220297336578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.1174053419430585, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009917141874513113, + "loss": 0.0315, + "macro_f1": 0.3333333432674408, + "num_tokens": 1760003.0, + "repeat_count": 0.0, + "routers_loss": 0.01310618408024311, + "skip_count": 0.0, + "step": 1090, + "text_loss": 0.33892181515693665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.126797769298503, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.171875, + "learning_rate": 0.0009916579788009537, + "loss": 0.0457, + "macro_f1": 0.5492662787437439, + "num_tokens": 1763052.0, + "repeat_count": 0.0, + "routers_loss": 0.02059309557080269, + "skip_count": 2.0, + "step": 1092, + "text_loss": 0.6551769375801086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.136190196653947, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10546875, + "learning_rate": 0.0009916015817463312, + "loss": 0.0385, + "macro_f1": 0.5492662787437439, + "num_tokens": 1766655.0, + "repeat_count": 0.0, + "routers_loss": 0.0274797435849905, + "skip_count": 2.0, + "step": 1094, + "text_loss": 0.3984372019767761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.000991544996309055, + "loss": 0.0271, + "macro_f1": 0.3333333432674408, + "num_tokens": 1769997.0, + "repeat_count": 0.0, + "routers_loss": 0.01437368243932724, + "skip_count": 0.0, + "step": 1096, + "text_loss": 0.4203338921070099 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.154975051364837, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.000991488222510809, + "loss": 0.0292, + "macro_f1": 0.3333333432674408, + "num_tokens": 1773130.0, + "repeat_count": 0.0, + "routers_loss": 0.001382062560878694, + "skip_count": 0.0, + "step": 1098, + "text_loss": 0.43132516741752625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.164367478720282, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.123046875, + "learning_rate": 0.000991431260373349, + "loss": 0.0329, + "macro_f1": 0.3144654333591461, + "num_tokens": 1775682.0, + "repeat_count": 1.0, + "routers_loss": 0.1115434318780899, + "skip_count": 2.0, + "step": 1100, + "text_loss": 0.3218227028846741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.111328125, + "learning_rate": 0.000991374109918503, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 1778407.0, + "repeat_count": 0.0, + "routers_loss": 0.009529678151011467, + "skip_count": 0.0, + "step": 1102, + "text_loss": 0.17183731496334076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.183152333431171, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1142578125, + "learning_rate": 0.000991316771168171, + "loss": 0.044, + "macro_f1": 0.5492662787437439, + "num_tokens": 1781518.0, + "repeat_count": 0.0, + "routers_loss": 0.018668074160814285, + "skip_count": 2.0, + "step": 1104, + "text_loss": 1.1324785947799683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.192544760786616, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.125, + "learning_rate": 0.0009912592441443258, + "loss": 0.0411, + "macro_f1": 0.3272727429866791, + "num_tokens": 1784878.0, + "repeat_count": 0.0, + "routers_loss": 0.04145100712776184, + "skip_count": 1.0, + "step": 1106, + "text_loss": 0.6082063317298889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.20193718814206, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009912015288690112, + "loss": 0.0421, + "macro_f1": 0.3272727429866791, + "num_tokens": 1788978.0, + "repeat_count": 0.0, + "routers_loss": 0.021450644358992577, + "skip_count": 1.0, + "step": 1108, + "text_loss": 0.5597621202468872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.2113296154975055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009911436253643444, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 1792321.0, + "repeat_count": 0.0, + "routers_loss": 0.017405325546860695, + "skip_count": 0.0, + "step": 1110, + "text_loss": 0.2560598850250244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2294921875, + "learning_rate": 0.0009910855336525137, + "loss": 0.0383, + "macro_f1": 0.3333333432674408, + "num_tokens": 1795182.0, + "repeat_count": 0.0, + "routers_loss": 0.007162237539887428, + "skip_count": 0.0, + "step": 1112, + "text_loss": 0.3438240587711334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.230114470208394, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.115234375, + "learning_rate": 0.00099102725375578, + "loss": 0.0326, + "macro_f1": 0.480392187833786, + "num_tokens": 1798987.0, + "repeat_count": 1.0, + "routers_loss": 0.11149197816848755, + "skip_count": 3.0, + "step": 1114, + "text_loss": 0.20455503463745117 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.239506897563839, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009909687856964767, + "loss": 0.035, + "macro_f1": 0.3006536364555359, + "num_tokens": 1802064.0, + "repeat_count": 2.0, + "routers_loss": 0.12679415941238403, + "skip_count": 3.0, + "step": 1116, + "text_loss": 0.11996729671955109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.248899324919284, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009909101294970082, + "loss": 0.0365, + "macro_f1": 0.5492662787437439, + "num_tokens": 1805412.0, + "repeat_count": 0.0, + "routers_loss": 0.05108053982257843, + "skip_count": 2.0, + "step": 1118, + "text_loss": 0.13224145770072937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.258291752274729, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.123046875, + "learning_rate": 0.0009908512851798522, + "loss": 0.0455, + "macro_f1": 0.6603773832321167, + "num_tokens": 1808196.0, + "repeat_count": 1.0, + "routers_loss": 0.02131766639649868, + "skip_count": 1.0, + "step": 1120, + "text_loss": 0.7824069261550903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.0009907922527675576, + "loss": 0.0405, + "macro_f1": 0.3333333432674408, + "num_tokens": 1811622.0, + "repeat_count": 0.0, + "routers_loss": 0.006226244382560253, + "skip_count": 0.0, + "step": 1122, + "text_loss": 0.5419743061065674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.277076606985618, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12890625, + "learning_rate": 0.000990733032282746, + "loss": 0.0535, + "macro_f1": 0.5492662787437439, + "num_tokens": 1814628.0, + "repeat_count": 0.0, + "routers_loss": 0.03088250942528248, + "skip_count": 2.0, + "step": 1124, + "text_loss": 0.37100958824157715 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.286469034341063, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.000990673623748111, + "loss": 0.0348, + "macro_f1": 0.32098767161369324, + "num_tokens": 1817205.0, + "repeat_count": 0.0, + "routers_loss": 0.05495348572731018, + "skip_count": 1.0, + "step": 1126, + "text_loss": 0.20241330564022064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.295861461696507, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.0927734375, + "learning_rate": 0.0009906140271864173, + "loss": 0.0433, + "macro_f1": 0.4871794879436493, + "num_tokens": 1820141.0, + "repeat_count": 0.0, + "routers_loss": 0.037809282541275024, + "skip_count": 2.0, + "step": 1128, + "text_loss": 0.32965806126594543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.305253889051952, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009905542426205032, + "loss": 0.0348, + "macro_f1": 0.32098767161369324, + "num_tokens": 1824011.0, + "repeat_count": 0.0, + "routers_loss": 0.03320181369781494, + "skip_count": 1.0, + "step": 1130, + "text_loss": 0.36329755187034607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.314646316407397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009904942700732777, + "loss": 0.0335, + "macro_f1": 0.3333333432674408, + "num_tokens": 1826873.0, + "repeat_count": 0.0, + "routers_loss": 0.004102326463907957, + "skip_count": 0.0, + "step": 1132, + "text_loss": 0.6692602038383484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.324038743762841, + "f1_execute": 0.8799999952316284, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.0009904341095677226, + "loss": 0.03, + "macro_f1": 0.29333335161209106, + "num_tokens": 1830103.0, + "repeat_count": 2.0, + "routers_loss": 0.2376193106174469, + "skip_count": 4.0, + "step": 1134, + "text_loss": 0.19212862849235535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.0009903737611268919, + "loss": 0.0445, + "macro_f1": 0.3333333432674408, + "num_tokens": 1833201.0, + "repeat_count": 0.0, + "routers_loss": 0.005253395065665245, + "skip_count": 0.0, + "step": 1136, + "text_loss": 0.6773360371589661 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.34282359847373, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009903132247739107, + "loss": 0.0305, + "macro_f1": 0.3076923191547394, + "num_tokens": 1836045.0, + "repeat_count": 1.0, + "routers_loss": 0.14382585883140564, + "skip_count": 3.0, + "step": 1138, + "text_loss": 0.2882297933101654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.3522160258291755, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.150390625, + "learning_rate": 0.0009902525005319766, + "loss": 0.04, + "macro_f1": 0.5427350401878357, + "num_tokens": 1839721.0, + "repeat_count": 1.0, + "routers_loss": 0.04033960774540901, + "skip_count": 2.0, + "step": 1140, + "text_loss": 0.7172559499740601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12109375, + "learning_rate": 0.0009901915884243597, + "loss": 0.0351, + "macro_f1": 0.6666666865348816, + "num_tokens": 1842614.0, + "repeat_count": 1.0, + "routers_loss": 0.005162308923900127, + "skip_count": 0.0, + "step": 1142, + "text_loss": 0.42892804741859436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.371000880540064, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009901304884744014, + "loss": 0.0386, + "macro_f1": 0.3144654333591461, + "num_tokens": 1845444.0, + "repeat_count": 1.0, + "routers_loss": 0.10117656737565994, + "skip_count": 2.0, + "step": 1144, + "text_loss": 0.20806430280208588 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.380393307895509, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009900692007055152, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 1848558.0, + "repeat_count": 0.0, + "routers_loss": 0.014107038266956806, + "skip_count": 0.0, + "step": 1146, + "text_loss": 0.5355974435806274 + }, + { + "acc_repeat": 0.25, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 5.389785735250954, + "f1_execute": 0.9166666865348816, + "f1_repeat": 0.4000000059604645, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.16015625, + "learning_rate": 0.000990007725141187, + "loss": 0.0449, + "macro_f1": 0.6611111164093018, + "num_tokens": 1852723.0, + "repeat_count": 4.0, + "routers_loss": 0.15537866950035095, + "skip_count": 2.0, + "step": 1148, + "text_loss": 0.6388513445854187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1181640625, + "learning_rate": 0.0009899460618049741, + "loss": 0.0397, + "macro_f1": 0.3333333432674408, + "num_tokens": 1856181.0, + "repeat_count": 0.0, + "routers_loss": 0.011800912208855152, + "skip_count": 0.0, + "step": 1150, + "text_loss": 0.6113069653511047 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 5.408570589961843, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.000989884210720506, + "loss": 0.0331, + "macro_f1": 0.6666666865348816, + "num_tokens": 1859685.0, + "repeat_count": 2.0, + "routers_loss": 0.022900646552443504, + "skip_count": 0.0, + "step": 1152, + "text_loss": 0.25718021392822266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.4179630173172875, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009898221719114844, + "loss": 0.0354, + "macro_f1": 0.3272727429866791, + "num_tokens": 1862505.0, + "repeat_count": 0.0, + "routers_loss": 0.026814989745616913, + "skip_count": 1.0, + "step": 1154, + "text_loss": 0.5426549911499023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009897599454016823, + "loss": 0.0401, + "macro_f1": 0.3333333432674408, + "num_tokens": 1866266.0, + "repeat_count": 0.0, + "routers_loss": 0.0032623792067170143, + "skip_count": 0.0, + "step": 1156, + "text_loss": 0.37752896547317505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.436747872028177, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07080078125, + "learning_rate": 0.0009896975312149454, + "loss": 0.0369, + "macro_f1": 0.3333333432674408, + "num_tokens": 1870216.0, + "repeat_count": 0.0, + "routers_loss": 0.015617577359080315, + "skip_count": 0.0, + "step": 1158, + "text_loss": 0.18207129836082458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009896349293751906, + "loss": 0.0423, + "macro_f1": 0.3272727429866791, + "num_tokens": 1873338.0, + "repeat_count": 0.0, + "routers_loss": 0.02250153198838234, + "skip_count": 1.0, + "step": 1160, + "text_loss": 0.548884391784668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.455532726739067, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009895721399064072, + "loss": 0.0388, + "macro_f1": 0.32098764181137085, + "num_tokens": 1876470.0, + "repeat_count": 1.0, + "routers_loss": 0.055204521864652634, + "skip_count": 1.0, + "step": 1162, + "text_loss": 0.48052409291267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.464925154094511, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009895091628326564, + "loss": 0.0293, + "macro_f1": 0.3333333432674408, + "num_tokens": 1879354.0, + "repeat_count": 0.0, + "routers_loss": 0.009093789383769035, + "skip_count": 0.0, + "step": 1164, + "text_loss": 0.3908069431781769 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.474317581449956, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.000989445998178071, + "loss": 0.0323, + "macro_f1": 0.3272727429866791, + "num_tokens": 1881941.0, + "repeat_count": 0.0, + "routers_loss": 0.015086972154676914, + "skip_count": 1.0, + "step": 1166, + "text_loss": 0.4884725511074066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.4837100088054, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009893826459668558, + "loss": 0.0386, + "macro_f1": 0.3144654333591461, + "num_tokens": 1885374.0, + "repeat_count": 0.0, + "routers_loss": 0.06587666273117065, + "skip_count": 3.0, + "step": 1168, + "text_loss": 0.12760137021541595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0009893191062232873, + "loss": 0.0322, + "macro_f1": 0.3333333432674408, + "num_tokens": 1888612.0, + "repeat_count": 0.0, + "routers_loss": 0.006088624242693186, + "skip_count": 0.0, + "step": 1170, + "text_loss": 0.4821319580078125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009892553789717143, + "loss": 0.0389, + "macro_f1": 0.3333333432674408, + "num_tokens": 1891463.0, + "repeat_count": 0.0, + "routers_loss": 0.010113578289747238, + "skip_count": 0.0, + "step": 1172, + "text_loss": 0.3613642454147339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.5118872908717345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009891914642365573, + "loss": 0.0404, + "macro_f1": 0.3333333432674408, + "num_tokens": 1894230.0, + "repeat_count": 0.0, + "routers_loss": 0.004947459790855646, + "skip_count": 0.0, + "step": 1174, + "text_loss": 0.5037549138069153 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.521279718227179, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009891273620423083, + "loss": 0.0428, + "macro_f1": 0.3272727429866791, + "num_tokens": 1897294.0, + "repeat_count": 1.0, + "routers_loss": 0.026075217872858047, + "skip_count": 0.0, + "step": 1176, + "text_loss": 0.32558977603912354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.530672145582624, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009890630724135314, + "loss": 0.0351, + "macro_f1": 0.3272727429866791, + "num_tokens": 1901553.0, + "repeat_count": 0.0, + "routers_loss": 0.06650999188423157, + "skip_count": 1.0, + "step": 1178, + "text_loss": 0.23473620414733887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.540064572938069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009889985953748625, + "loss": 0.0268, + "macro_f1": 0.6666666865348816, + "num_tokens": 1904556.0, + "repeat_count": 0.0, + "routers_loss": 0.010361116379499435, + "skip_count": 1.0, + "step": 1180, + "text_loss": 0.6927042007446289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.0009889339309510094, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 1908053.0, + "repeat_count": 0.0, + "routers_loss": 0.013286533765494823, + "skip_count": 0.0, + "step": 1182, + "text_loss": 0.19977325201034546 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 5.558849427648958, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009888690791667518, + "loss": 0.0204, + "macro_f1": 0.7018141150474548, + "num_tokens": 1911754.0, + "repeat_count": 2.0, + "routers_loss": 0.11920545995235443, + "skip_count": 3.0, + "step": 1184, + "text_loss": 0.4072858691215515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.568241855004403, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009888040400469408, + "loss": 0.0391, + "macro_f1": 0.3272727429866791, + "num_tokens": 1914862.0, + "repeat_count": 0.0, + "routers_loss": 0.03652849420905113, + "skip_count": 1.0, + "step": 1186, + "text_loss": 0.2654043138027191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.577634282359847, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1689453125, + "learning_rate": 0.0009887388136164996, + "loss": 0.0336, + "macro_f1": 0.5492662787437439, + "num_tokens": 1918542.0, + "repeat_count": 0.0, + "routers_loss": 0.03991910070180893, + "skip_count": 2.0, + "step": 1188, + "text_loss": 0.21130657196044922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.587026709715292, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.000988673399900423, + "loss": 0.0429, + "macro_f1": 0.3272727429866791, + "num_tokens": 1921589.0, + "repeat_count": 0.0, + "routers_loss": 0.014900135807693005, + "skip_count": 0.0, + "step": 1190, + "text_loss": 0.5519335865974426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.596419137070737, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1884765625, + "learning_rate": 0.0009886077989237777, + "loss": 0.0405, + "macro_f1": 0.3272727429866791, + "num_tokens": 1924320.0, + "repeat_count": 0.0, + "routers_loss": 0.06271552294492722, + "skip_count": 1.0, + "step": 1192, + "text_loss": 0.213813915848732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 5.6058115644261814, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.1875, + "learning_rate": 0.000988542010711702, + "loss": 0.0342, + "macro_f1": 0.6225374937057495, + "num_tokens": 1927178.0, + "repeat_count": 0.0, + "routers_loss": 0.03081391751766205, + "skip_count": 5.0, + "step": 1194, + "text_loss": 0.7524349093437195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.255859375, + "learning_rate": 0.0009884760352894064, + "loss": 0.0518, + "macro_f1": 0.3333333432674408, + "num_tokens": 1930216.0, + "repeat_count": 0.0, + "routers_loss": 0.008556773886084557, + "skip_count": 0.0, + "step": 1196, + "text_loss": 0.28230375051498413 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.62459641913707, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009884098726821726, + "loss": 0.0472, + "macro_f1": 0.4871794879436493, + "num_tokens": 1933312.0, + "repeat_count": 3.0, + "routers_loss": 0.05344727262854576, + "skip_count": 0.0, + "step": 1198, + "text_loss": 0.5509607195854187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.633988846492516, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1298828125, + "learning_rate": 0.000988343522915354, + "loss": 0.0441, + "macro_f1": 0.480392187833786, + "num_tokens": 1936160.0, + "repeat_count": 1.0, + "routers_loss": 0.07324771583080292, + "skip_count": 3.0, + "step": 1200, + "text_loss": 0.30565372109413147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 25.0, + "epoch": 5.64338127384796, + "f1_execute": 0.8936169743537903, + "f1_repeat": 0.0, + "f1_skip": 0.444444477558136, + "grad_norm": 0.2470703125, + "learning_rate": 0.0009882769860143764, + "loss": 0.0317, + "macro_f1": 0.4460204839706421, + "num_tokens": 1939266.0, + "repeat_count": 0.0, + "routers_loss": 0.18620699644088745, + "skip_count": 6.0, + "step": 1202, + "text_loss": 0.976121723651886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.6527737012034045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.000988210262004737, + "loss": 0.0474, + "macro_f1": 0.6666666865348816, + "num_tokens": 1942173.0, + "repeat_count": 0.0, + "routers_loss": 0.007703613489866257, + "skip_count": 1.0, + "step": 1204, + "text_loss": 0.5647401809692383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.66216612855885, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1484375, + "learning_rate": 0.0009881433509120036, + "loss": 0.0376, + "macro_f1": 0.5492662787437439, + "num_tokens": 1945071.0, + "repeat_count": 0.0, + "routers_loss": 0.02162683941423893, + "skip_count": 2.0, + "step": 1206, + "text_loss": 0.24229218065738678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.671558555914294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009880762527618176, + "loss": 0.0383, + "macro_f1": 0.3333333432674408, + "num_tokens": 1949060.0, + "repeat_count": 0.0, + "routers_loss": 0.017667081207036972, + "skip_count": 0.0, + "step": 1208, + "text_loss": 0.4035970866680145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.0009880089675798908, + "loss": 0.0367, + "macro_f1": 0.3333333432674408, + "num_tokens": 1951698.0, + "repeat_count": 0.0, + "routers_loss": 0.006405784282833338, + "skip_count": 0.0, + "step": 1210, + "text_loss": 0.5319879055023193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.690343410625183, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009879414953920071, + "loss": 0.0294, + "macro_f1": 0.3333333432674408, + "num_tokens": 1955266.0, + "repeat_count": 0.0, + "routers_loss": 0.009859707206487656, + "skip_count": 0.0, + "step": 1212, + "text_loss": 0.6687407493591309 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.699735837980628, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.130859375, + "learning_rate": 0.0009878738362240219, + "loss": 0.045, + "macro_f1": 0.5492662787437439, + "num_tokens": 1958538.0, + "repeat_count": 0.0, + "routers_loss": 0.030890554189682007, + "skip_count": 2.0, + "step": 1214, + "text_loss": 0.20820017158985138 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 5.709128265336073, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.1806640625, + "learning_rate": 0.000987805990101862, + "loss": 0.0317, + "macro_f1": 0.47333335876464844, + "num_tokens": 1961419.0, + "repeat_count": 2.0, + "routers_loss": 0.10383198410272598, + "skip_count": 2.0, + "step": 1216, + "text_loss": 0.8664976358413696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.718520692691517, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009877379570515268, + "loss": 0.0366, + "macro_f1": 0.3333333432674408, + "num_tokens": 1964836.0, + "repeat_count": 0.0, + "routers_loss": 0.013376163318753242, + "skip_count": 0.0, + "step": 1218, + "text_loss": 0.4223395884037018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0859375, + "learning_rate": 0.0009876697370990865, + "loss": 0.0343, + "macro_f1": 0.3333333432674408, + "num_tokens": 1967620.0, + "repeat_count": 0.0, + "routers_loss": 0.008577900938689709, + "skip_count": 0.0, + "step": 1220, + "text_loss": 0.4789901375770569 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009876013302706828, + "loss": 0.049, + "macro_f1": 0.3333333432674408, + "num_tokens": 1971100.0, + "repeat_count": 0.0, + "routers_loss": 0.004730266984552145, + "skip_count": 0.0, + "step": 1222, + "text_loss": 0.6799837946891785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.7466979747578515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009875327365925295, + "loss": 0.0341, + "macro_f1": 0.3333333432674408, + "num_tokens": 1974408.0, + "repeat_count": 0.0, + "routers_loss": 0.010849526152014732, + "skip_count": 0.0, + "step": 1224, + "text_loss": 0.18967926502227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.756090402113296, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009874639560909118, + "loss": 0.0498, + "macro_f1": 0.32098767161369324, + "num_tokens": 1977046.0, + "repeat_count": 0.0, + "routers_loss": 0.04841252416372299, + "skip_count": 1.0, + "step": 1226, + "text_loss": 0.6133310198783875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.765482829468741, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.0009873949887921867, + "loss": 0.0402, + "macro_f1": 0.3272727429866791, + "num_tokens": 1980330.0, + "repeat_count": 0.0, + "routers_loss": 0.029638588428497314, + "skip_count": 1.0, + "step": 1228, + "text_loss": 0.15649555623531342 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.774875256824186, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009873258347227823, + "loss": 0.0331, + "macro_f1": 0.3272727429866791, + "num_tokens": 1983173.0, + "repeat_count": 0.0, + "routers_loss": 0.009955910965800285, + "skip_count": 0.0, + "step": 1230, + "text_loss": 0.4741005599498749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009872564939091989, + "loss": 0.0342, + "macro_f1": 0.3333333432674408, + "num_tokens": 1986825.0, + "repeat_count": 0.0, + "routers_loss": 0.010205300524830818, + "skip_count": 0.0, + "step": 1232, + "text_loss": 0.5315462350845337 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5714285969734192, + "avg_layers": 25.0, + "epoch": 5.7936601115350745, + "f1_execute": 0.9302325248718262, + "f1_repeat": 1.0, + "f1_skip": 0.7272727489471436, + "grad_norm": 0.11865234375, + "learning_rate": 0.0009871869663780077, + "loss": 0.0336, + "macro_f1": 0.8858351111412048, + "num_tokens": 1990448.0, + "repeat_count": 1.0, + "routers_loss": 0.09120134264230728, + "skip_count": 7.0, + "step": 1234, + "text_loss": 0.6187508702278137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 5.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.125, + "learning_rate": 0.0009871172521558522, + "loss": 0.0475, + "macro_f1": 0.6666666865348816, + "num_tokens": 1993474.0, + "repeat_count": 0.0, + "routers_loss": 0.016188839450478554, + "skip_count": 1.0, + "step": 1236, + "text_loss": 0.20783066749572754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 5.812444966245964, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.216796875, + "learning_rate": 0.0009870473512694465, + "loss": 0.0373, + "macro_f1": 0.5934640765190125, + "num_tokens": 1996536.0, + "repeat_count": 0.0, + "routers_loss": 0.05046704784035683, + "skip_count": 3.0, + "step": 1238, + "text_loss": 0.247748002409935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.821837393601409, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.09033203125, + "learning_rate": 0.0009869772637455772, + "loss": 0.0251, + "macro_f1": 0.4871794879436493, + "num_tokens": 1999530.0, + "repeat_count": 0.0, + "routers_loss": 0.044926248490810394, + "skip_count": 2.0, + "step": 1240, + "text_loss": 0.26001980900764465 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 5.831229820956853, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1513671875, + "learning_rate": 0.000986906989611102, + "loss": 0.0446, + "macro_f1": 0.3272727429866791, + "num_tokens": 2002782.0, + "repeat_count": 0.0, + "routers_loss": 0.025911526754498482, + "skip_count": 0.0, + "step": 1242, + "text_loss": 0.9009982943534851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.8406222483122985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009868365288929492, + "loss": 0.0371, + "macro_f1": 0.3333333432674408, + "num_tokens": 2005331.0, + "repeat_count": 0.0, + "routers_loss": 0.0043760035187006, + "skip_count": 0.0, + "step": 1244, + "text_loss": 0.5547386407852173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.850014675667743, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009867658816181206, + "loss": 0.0374, + "macro_f1": 0.3333333432674408, + "num_tokens": 2008115.0, + "repeat_count": 0.0, + "routers_loss": 0.009227181784808636, + "skip_count": 0.0, + "step": 1246, + "text_loss": 1.0067731142044067 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.126953125, + "learning_rate": 0.000986695047813688, + "loss": 0.0261, + "macro_f1": 0.3272727429866791, + "num_tokens": 2011137.0, + "repeat_count": 1.0, + "routers_loss": 0.023822437971830368, + "skip_count": 0.0, + "step": 1248, + "text_loss": 0.30058956146240234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 5.868799530378633, + "f1_execute": 0.9200000166893005, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009866240275067948, + "loss": 0.044, + "macro_f1": 0.47333335876464844, + "num_tokens": 2014159.0, + "repeat_count": 2.0, + "routers_loss": 0.21523773670196533, + "skip_count": 3.0, + "step": 1250, + "text_loss": 0.39072203636169434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 5.878191957734077, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009865528207246563, + "loss": 0.0351, + "macro_f1": 0.5492662787437439, + "num_tokens": 2017731.0, + "repeat_count": 0.0, + "routers_loss": 0.06184682995080948, + "skip_count": 2.0, + "step": 1252, + "text_loss": 0.35751575231552124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.8875843850895215, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.166015625, + "learning_rate": 0.000986481427494559, + "loss": 0.0336, + "macro_f1": 0.3333333432674408, + "num_tokens": 2020485.0, + "repeat_count": 0.0, + "routers_loss": 0.007573372684419155, + "skip_count": 0.0, + "step": 1254, + "text_loss": 0.4061077833175659 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.896976812444966, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1708984375, + "learning_rate": 0.000986409847843861, + "loss": 0.0382, + "macro_f1": 0.3272727429866791, + "num_tokens": 2024149.0, + "repeat_count": 1.0, + "routers_loss": 0.07447971403598785, + "skip_count": 0.0, + "step": 1256, + "text_loss": 0.41876497864723206 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.906369239800411, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000986338081799992, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 2026545.0, + "repeat_count": 0.0, + "routers_loss": 0.006609147880226374, + "skip_count": 0.0, + "step": 1258, + "text_loss": 0.4673794209957123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.915761667155856, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009862661293904523, + "loss": 0.0498, + "macro_f1": 0.32098764181137085, + "num_tokens": 2029581.0, + "repeat_count": 0.0, + "routers_loss": 0.10624702274799347, + "skip_count": 2.0, + "step": 1260, + "text_loss": 0.3483233153820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009861939906428145, + "loss": 0.0525, + "macro_f1": 0.3333333432674408, + "num_tokens": 2033936.0, + "repeat_count": 0.0, + "routers_loss": 0.007944886572659016, + "skip_count": 0.0, + "step": 1262, + "text_loss": 0.16362667083740234 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009861216655847225, + "loss": 0.0376, + "macro_f1": 0.6666666865348816, + "num_tokens": 2037876.0, + "repeat_count": 1.0, + "routers_loss": 0.007004092447459698, + "skip_count": 0.0, + "step": 1264, + "text_loss": 0.43228110671043396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.94393894922219, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009860491542438912, + "loss": 0.047, + "macro_f1": 0.3272727429866791, + "num_tokens": 2040842.0, + "repeat_count": 0.0, + "routers_loss": 0.026916226372122765, + "skip_count": 1.0, + "step": 1266, + "text_loss": 0.5901188850402832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.953331376577634, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.000985976456648107, + "loss": 0.0353, + "macro_f1": 0.3333333432674408, + "num_tokens": 2043890.0, + "repeat_count": 0.0, + "routers_loss": 0.007325216196477413, + "skip_count": 0.0, + "step": 1268, + "text_loss": 0.8780109882354736 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 5.962723803933079, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.10205078125, + "learning_rate": 0.000985903572825228, + "loss": 0.0306, + "macro_f1": 0.4871794879436493, + "num_tokens": 2048848.0, + "repeat_count": 0.0, + "routers_loss": 0.05007527023553848, + "skip_count": 2.0, + "step": 1270, + "text_loss": 0.5863722562789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 5.972116231288524, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.173828125, + "learning_rate": 0.000985830502803183, + "loss": 0.0396, + "macro_f1": 0.3272727429866791, + "num_tokens": 2051561.0, + "repeat_count": 0.0, + "routers_loss": 0.023995524272322655, + "skip_count": 0.0, + "step": 1272, + "text_loss": 0.7460709810256958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.9815086586439685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009857572466099732, + "loss": 0.0431, + "macro_f1": 0.3333333432674408, + "num_tokens": 2054752.0, + "repeat_count": 0.0, + "routers_loss": 0.006928362417966127, + "skip_count": 0.0, + "step": 1274, + "text_loss": 0.5130293369293213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 5.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.162109375, + "learning_rate": 0.0009856838042736698, + "loss": 0.0501, + "macro_f1": 0.3333333432674408, + "num_tokens": 2058151.0, + "repeat_count": 0.0, + "routers_loss": 0.006969396956264973, + "skip_count": 0.0, + "step": 1276, + "text_loss": 0.5911393761634827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009856101758224166, + "loss": 0.0441, + "macro_f1": 0.3333333432674408, + "num_tokens": 2061012.0, + "repeat_count": 0.0, + "routers_loss": 0.003499418031424284, + "skip_count": 0.0, + "step": 1278, + "text_loss": 0.25347545742988586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.000985536361284428, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2064597.0, + "repeat_count": 0.0, + "routers_loss": 0.007856054231524467, + "skip_count": 0.0, + "step": 1280, + "text_loss": 0.7476963400840759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.01878485471089, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009854623606879898, + "loss": 0.0245, + "macro_f1": 0.3272727429866791, + "num_tokens": 2067972.0, + "repeat_count": 0.0, + "routers_loss": 0.02617792971432209, + "skip_count": 1.0, + "step": 1282, + "text_loss": 0.5775872468948364 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.028177282066334, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.000985388174061459, + "loss": 0.0356, + "macro_f1": 0.32098767161369324, + "num_tokens": 2071812.0, + "repeat_count": 0.0, + "routers_loss": 0.035979997366666794, + "skip_count": 1.0, + "step": 1284, + "text_loss": 0.2933400869369507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.037569709421779, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0009853138014332646, + "loss": 0.0273, + "macro_f1": 0.3333333432674408, + "num_tokens": 2074868.0, + "repeat_count": 0.0, + "routers_loss": 0.005142854526638985, + "skip_count": 0.0, + "step": 1286, + "text_loss": 0.29085102677345276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.0009852392428319058, + "loss": 0.0306, + "macro_f1": 0.3333333432674408, + "num_tokens": 2078225.0, + "repeat_count": 0.0, + "routers_loss": 0.0032799106556922197, + "skip_count": 0.0, + "step": 1288, + "text_loss": 0.7293626070022583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 6.056354564132668, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009851644982859537, + "loss": 0.0273, + "macro_f1": 0.480392187833786, + "num_tokens": 2081495.0, + "repeat_count": 1.0, + "routers_loss": 0.12224318832159042, + "skip_count": 3.0, + "step": 1290, + "text_loss": 0.26125892996788025 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.065746991488113, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009850895678240508, + "loss": 0.0283, + "macro_f1": 0.6666666865348816, + "num_tokens": 2084390.0, + "repeat_count": 1.0, + "routers_loss": 0.010662888176739216, + "skip_count": 0.0, + "step": 1292, + "text_loss": 0.3510764539241791 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.075139418843557, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1689453125, + "learning_rate": 0.0009850144514749104, + "loss": 0.0332, + "macro_f1": 0.5492662787437439, + "num_tokens": 2087210.0, + "repeat_count": 0.0, + "routers_loss": 0.01979079470038414, + "skip_count": 2.0, + "step": 1294, + "text_loss": 0.40202176570892334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.084531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.000984939149267317, + "loss": 0.0253, + "macro_f1": 0.6666666865348816, + "num_tokens": 2090777.0, + "repeat_count": 0.0, + "routers_loss": 0.005172552540898323, + "skip_count": 1.0, + "step": 1296, + "text_loss": 0.5275651216506958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.093924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009848636612301272, + "loss": 0.0299, + "macro_f1": 0.3333333432674408, + "num_tokens": 2094248.0, + "repeat_count": 0.0, + "routers_loss": 0.0029599082190543413, + "skip_count": 0.0, + "step": 1298, + "text_loss": 0.4517653286457062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0009847879873922675, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 2097139.0, + "repeat_count": 0.0, + "routers_loss": 0.011455860920250416, + "skip_count": 0.0, + "step": 1300, + "text_loss": 0.16888445615768433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.112709128265336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.0009847121277827366, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 2100415.0, + "repeat_count": 0.0, + "routers_loss": 0.008091195486485958, + "skip_count": 0.0, + "step": 1302, + "text_loss": 0.40061676502227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.122101555620781, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.000984636082430604, + "loss": 0.0285, + "macro_f1": 0.3333333432674408, + "num_tokens": 2103285.0, + "repeat_count": 0.0, + "routers_loss": 0.009593960829079151, + "skip_count": 0.0, + "step": 1304, + "text_loss": 0.7211073637008667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.107421875, + "learning_rate": 0.0009845598513650103, + "loss": 0.0231, + "macro_f1": 0.3333333432674408, + "num_tokens": 2106255.0, + "repeat_count": 0.0, + "routers_loss": 0.0023068038281053305, + "skip_count": 0.0, + "step": 1306, + "text_loss": 0.7077119946479797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.171875, + "learning_rate": 0.0009844834346151674, + "loss": 0.043, + "macro_f1": 0.3333333432674408, + "num_tokens": 2109305.0, + "repeat_count": 0.0, + "routers_loss": 0.007703019306063652, + "skip_count": 0.0, + "step": 1308, + "text_loss": 0.3534316122531891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.1502788376871145, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009844068322103585, + "loss": 0.0287, + "macro_f1": 0.3272727429866791, + "num_tokens": 2112216.0, + "repeat_count": 0.0, + "routers_loss": 0.023549847304821014, + "skip_count": 1.0, + "step": 1310, + "text_loss": 0.6792599558830261 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009843300441799378, + "loss": 0.0211, + "macro_f1": 0.3333333432674408, + "num_tokens": 2114925.0, + "repeat_count": 0.0, + "routers_loss": 0.007605871185660362, + "skip_count": 0.0, + "step": 1312, + "text_loss": 0.1571389138698578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.169063692398004, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009842530705533304, + "loss": 0.0253, + "macro_f1": 0.3272727429866791, + "num_tokens": 2117744.0, + "repeat_count": 0.0, + "routers_loss": 0.014964760281145573, + "skip_count": 0.0, + "step": 1314, + "text_loss": 0.7840361595153809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.000984175911360033, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2120848.0, + "repeat_count": 0.0, + "routers_loss": 0.004663798492401838, + "skip_count": 0.0, + "step": 1316, + "text_loss": 0.536246120929718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.187848547108893, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1201171875, + "learning_rate": 0.000984098566629613, + "loss": 0.0288, + "macro_f1": 0.5492662787437439, + "num_tokens": 2123651.0, + "repeat_count": 0.0, + "routers_loss": 0.022852955386042595, + "skip_count": 2.0, + "step": 1318, + "text_loss": 0.43372172117233276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.197240974464338, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009840210363917087, + "loss": 0.0216, + "macro_f1": 0.3333333432674408, + "num_tokens": 2128011.0, + "repeat_count": 0.0, + "routers_loss": 0.012578422203660011, + "skip_count": 0.0, + "step": 1320, + "text_loss": 0.28190380334854126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009839433206760306, + "loss": 0.0204, + "macro_f1": 0.3333333432674408, + "num_tokens": 2131035.0, + "repeat_count": 0.0, + "routers_loss": 0.006863643880933523, + "skip_count": 0.0, + "step": 1322, + "text_loss": 0.6340444087982178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.216025829175227, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1796875, + "learning_rate": 0.0009838654195123589, + "loss": 0.0243, + "macro_f1": 0.3333333432674408, + "num_tokens": 2133856.0, + "repeat_count": 0.0, + "routers_loss": 0.00468854233622551, + "skip_count": 0.0, + "step": 1324, + "text_loss": 0.5138425827026367 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009837873329305458, + "loss": 0.0396, + "macro_f1": 0.6666666865348816, + "num_tokens": 2136451.0, + "repeat_count": 1.0, + "routers_loss": 0.005731126759201288, + "skip_count": 0.0, + "step": 1326, + "text_loss": 0.742124617099762 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.17578125, + "learning_rate": 0.000983709060960514, + "loss": 0.0416, + "macro_f1": 0.3333333432674408, + "num_tokens": 2139496.0, + "repeat_count": 0.0, + "routers_loss": 0.0056343949399888515, + "skip_count": 0.0, + "step": 1328, + "text_loss": 0.7317464351654053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.2442031112415615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009836306036322576, + "loss": 0.0312, + "macro_f1": 0.3333333432674408, + "num_tokens": 2143120.0, + "repeat_count": 0.0, + "routers_loss": 0.005127966403961182, + "skip_count": 0.0, + "step": 1330, + "text_loss": 0.538652241230011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 6.253595538597006, + "f1_execute": 0.9130434989929199, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009835519609758415, + "loss": 0.0301, + "macro_f1": 0.590062141418457, + "num_tokens": 2145807.0, + "repeat_count": 3.0, + "routers_loss": 0.1673707216978073, + "skip_count": 4.0, + "step": 1332, + "text_loss": 0.3498198091983795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009834731330214017, + "loss": 0.0293, + "macro_f1": 0.3272727429866791, + "num_tokens": 2148397.0, + "repeat_count": 1.0, + "routers_loss": 0.04026653990149498, + "skip_count": 0.0, + "step": 1334, + "text_loss": 0.8153424859046936 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 27.0, + "epoch": 6.272380393307896, + "f1_execute": 0.8999999761581421, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.8000000715255737, + "grad_norm": 0.16015625, + "learning_rate": 0.0009833941197991455, + "loss": 0.0329, + "macro_f1": 0.7888889312744141, + "num_tokens": 2152226.0, + "repeat_count": 2.0, + "routers_loss": 0.05481519177556038, + "skip_count": 5.0, + "step": 1336, + "text_loss": 0.7802760004997253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.28177282066334, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009833149213393506, + "loss": 0.0304, + "macro_f1": 0.3272727429866791, + "num_tokens": 2156023.0, + "repeat_count": 0.0, + "routers_loss": 0.01760484278202057, + "skip_count": 0.0, + "step": 1338, + "text_loss": 0.19721226394176483 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.2911652480187845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.000983235537672366, + "loss": 0.0256, + "macro_f1": 0.3333333432674408, + "num_tokens": 2160037.0, + "repeat_count": 0.0, + "routers_loss": 0.013206037692725658, + "skip_count": 0.0, + "step": 1340, + "text_loss": 0.5003817081451416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.000983155968828612, + "loss": 0.0315, + "macro_f1": 0.6666666865348816, + "num_tokens": 2163910.0, + "repeat_count": 1.0, + "routers_loss": 0.01256406120955944, + "skip_count": 0.0, + "step": 1342, + "text_loss": 0.5996923446655273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.309950102729674, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009830762148385793, + "loss": 0.0313, + "macro_f1": 0.3272727429866791, + "num_tokens": 2166921.0, + "repeat_count": 0.0, + "routers_loss": 0.015086234547197819, + "skip_count": 1.0, + "step": 1344, + "text_loss": 0.45356282591819763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.319342530085119, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0009829962757328297, + "loss": 0.0223, + "macro_f1": 0.32098764181137085, + "num_tokens": 2170135.0, + "repeat_count": 0.0, + "routers_loss": 0.07909081131219864, + "skip_count": 2.0, + "step": 1346, + "text_loss": 0.2874644994735718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 6.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009829161515419959, + "loss": 0.0246, + "macro_f1": 0.6666666865348816, + "num_tokens": 2173029.0, + "repeat_count": 0.0, + "routers_loss": 0.013569854199886322, + "skip_count": 2.0, + "step": 1348, + "text_loss": 0.25533875823020935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.3381273847960085, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009828358422967823, + "loss": 0.0226, + "macro_f1": 0.32098764181137085, + "num_tokens": 2176605.0, + "repeat_count": 1.0, + "routers_loss": 0.08111091703176498, + "skip_count": 1.0, + "step": 1350, + "text_loss": 0.32827726006507874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 6.347519812151453, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.091796875, + "learning_rate": 0.0009827553480279627, + "loss": 0.03, + "macro_f1": 0.5427350401878357, + "num_tokens": 2179406.0, + "repeat_count": 0.0, + "routers_loss": 0.026550088077783585, + "skip_count": 2.0, + "step": 1352, + "text_loss": 0.2966301143169403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009826746687663832, + "loss": 0.0301, + "macro_f1": 0.3333333432674408, + "num_tokens": 2182353.0, + "repeat_count": 0.0, + "routers_loss": 0.003914554137736559, + "skip_count": 0.0, + "step": 1354, + "text_loss": 0.7596251964569092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 6.366304666862343, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0859375, + "learning_rate": 0.0009825938045429602, + "loss": 0.0324, + "macro_f1": 0.5866667032241821, + "num_tokens": 2185786.0, + "repeat_count": 1.0, + "routers_loss": 0.059612665325403214, + "skip_count": 3.0, + "step": 1356, + "text_loss": 0.12325898557901382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.375697094217787, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10009765625, + "learning_rate": 0.0009825127553886807, + "loss": 0.0375, + "macro_f1": 0.3333333432674408, + "num_tokens": 2190157.0, + "repeat_count": 0.0, + "routers_loss": 0.0071132429875433445, + "skip_count": 0.0, + "step": 1358, + "text_loss": 0.9287898540496826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.3850895215732315, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009824315213346033, + "loss": 0.0348, + "macro_f1": 0.3333333432674408, + "num_tokens": 2193077.0, + "repeat_count": 0.0, + "routers_loss": 0.009611099027097225, + "skip_count": 0.0, + "step": 1360, + "text_loss": 0.20427259802818298 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.394481948928676, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009823501024118569, + "loss": 0.0285, + "macro_f1": 0.3333333432674408, + "num_tokens": 2196494.0, + "repeat_count": 0.0, + "routers_loss": 0.006913455203175545, + "skip_count": 0.0, + "step": 1362, + "text_loss": 0.574759840965271 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.403874376284121, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009822684986516411, + "loss": 0.0245, + "macro_f1": 0.3333333432674408, + "num_tokens": 2199839.0, + "repeat_count": 0.0, + "routers_loss": 0.009208920411765575, + "skip_count": 0.0, + "step": 1364, + "text_loss": 0.42422571778297424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.413266803639566, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.000982186710085227, + "loss": 0.0208, + "macro_f1": 0.32098764181137085, + "num_tokens": 2203212.0, + "repeat_count": 1.0, + "routers_loss": 0.059975091367959976, + "skip_count": 1.0, + "step": 1366, + "text_loss": 0.29213017225265503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.25, + "avg_layers": 27.0, + "epoch": 6.42265923099501, + "f1_execute": 0.9411765336990356, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.181640625, + "learning_rate": 0.0009821047367439561, + "loss": 0.0358, + "macro_f1": 0.44705885648727417, + "num_tokens": 2206240.0, + "repeat_count": 0.0, + "routers_loss": 0.048244867473840714, + "skip_count": 4.0, + "step": 1368, + "text_loss": 0.3072395324707031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.432051658350455, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009820225786592405, + "loss": 0.0375, + "macro_f1": 0.3272727429866791, + "num_tokens": 2209903.0, + "repeat_count": 1.0, + "routers_loss": 0.026068156585097313, + "skip_count": 0.0, + "step": 1370, + "text_loss": 0.5961400270462036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.4414440857059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.109375, + "learning_rate": 0.0009819402358625634, + "loss": 0.0366, + "macro_f1": 0.3272727429866791, + "num_tokens": 2213439.0, + "repeat_count": 0.0, + "routers_loss": 0.022615568712353706, + "skip_count": 1.0, + "step": 1372, + "text_loss": 0.19375644624233246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.450836513061344, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.000981857708385479, + "loss": 0.0346, + "macro_f1": 0.3333333432674408, + "num_tokens": 2216457.0, + "repeat_count": 0.0, + "routers_loss": 0.005855285096913576, + "skip_count": 0.0, + "step": 1374, + "text_loss": 0.5123368501663208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.460228940416789, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009817749962596114, + "loss": 0.0249, + "macro_f1": 0.3272727429866791, + "num_tokens": 2219975.0, + "repeat_count": 1.0, + "routers_loss": 0.0651634931564331, + "skip_count": 0.0, + "step": 1376, + "text_loss": 0.5999220609664917 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009816920995166568, + "loss": 0.0371, + "macro_f1": 0.6666666865348816, + "num_tokens": 2222833.0, + "repeat_count": 1.0, + "routers_loss": 0.011408994905650616, + "skip_count": 0.0, + "step": 1378, + "text_loss": 0.5323230624198914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.4790137951276785, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.205078125, + "learning_rate": 0.0009816090181883807, + "loss": 0.0313, + "macro_f1": 0.32098764181137085, + "num_tokens": 2225842.0, + "repeat_count": 0.0, + "routers_loss": 0.039720915257930756, + "skip_count": 2.0, + "step": 1380, + "text_loss": 0.23363439738750458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009815257523066204, + "loss": 0.0249, + "macro_f1": 0.3333333432674408, + "num_tokens": 2229430.0, + "repeat_count": 0.0, + "routers_loss": 0.002765297656878829, + "skip_count": 0.0, + "step": 1382, + "text_loss": 0.718977689743042 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.497798649838567, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.130859375, + "learning_rate": 0.0009814423019032835, + "loss": 0.0396, + "macro_f1": 0.5492662787437439, + "num_tokens": 2232594.0, + "repeat_count": 2.0, + "routers_loss": 0.05362323671579361, + "skip_count": 0.0, + "step": 1384, + "text_loss": 0.6392166614532471 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.507191077194013, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009813586670103483, + "loss": 0.0426, + "macro_f1": 0.6603773832321167, + "num_tokens": 2236327.0, + "repeat_count": 1.0, + "routers_loss": 0.031728316098451614, + "skip_count": 1.0, + "step": 1386, + "text_loss": 0.5951619148254395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.126953125, + "learning_rate": 0.0009812748476598638, + "loss": 0.031, + "macro_f1": 0.5492662787437439, + "num_tokens": 2239746.0, + "repeat_count": 0.0, + "routers_loss": 0.03981253132224083, + "skip_count": 2.0, + "step": 1388, + "text_loss": 0.22756551206111908 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.5259759319049016, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009811908438839498, + "loss": 0.0331, + "macro_f1": 0.5492662787437439, + "num_tokens": 2242786.0, + "repeat_count": 0.0, + "routers_loss": 0.04617162421345711, + "skip_count": 2.0, + "step": 1390, + "text_loss": 0.3233799934387207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.535368359260346, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.154296875, + "learning_rate": 0.000981106655714797, + "loss": 0.0358, + "macro_f1": 0.3272727429866791, + "num_tokens": 2245696.0, + "repeat_count": 0.0, + "routers_loss": 0.046828847378492355, + "skip_count": 1.0, + "step": 1392, + "text_loss": 0.24273279309272766 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.544760786615791, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009810222831846656, + "loss": 0.0307, + "macro_f1": 0.5492662787437439, + "num_tokens": 2249326.0, + "repeat_count": 0.0, + "routers_loss": 0.010921589098870754, + "skip_count": 2.0, + "step": 1394, + "text_loss": 0.3921460807323456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.554153213971236, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009809377263258882, + "loss": 0.0315, + "macro_f1": 0.32098767161369324, + "num_tokens": 2253393.0, + "repeat_count": 0.0, + "routers_loss": 0.04564022272825241, + "skip_count": 1.0, + "step": 1396, + "text_loss": 0.582602858543396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.000980852985170867, + "loss": 0.0328, + "macro_f1": 0.3272727429866791, + "num_tokens": 2256626.0, + "repeat_count": 0.0, + "routers_loss": 0.013289985246956348, + "skip_count": 0.0, + "step": 1398, + "text_loss": 0.41031694412231445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.5729380686821255, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009807680597520745, + "loss": 0.0264, + "macro_f1": 0.3333333432674408, + "num_tokens": 2259326.0, + "repeat_count": 0.0, + "routers_loss": 0.0065213534981012344, + "skip_count": 0.0, + "step": 1400, + "text_loss": 0.2888098657131195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.58233049603757, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.23046875, + "learning_rate": 0.0009806829501020546, + "loss": 0.0358, + "macro_f1": 0.3272727429866791, + "num_tokens": 2262344.0, + "repeat_count": 0.0, + "routers_loss": 0.04199840500950813, + "skip_count": 1.0, + "step": 1402, + "text_loss": 0.31973034143447876 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.591722923393014, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009805976562534215, + "loss": 0.0317, + "macro_f1": 0.6603773832321167, + "num_tokens": 2266354.0, + "repeat_count": 1.0, + "routers_loss": 0.015434930101037025, + "skip_count": 1.0, + "step": 1404, + "text_loss": 0.508630633354187 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 6.601115350748459, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009805121782388599, + "loss": 0.0339, + "macro_f1": 0.6533333659172058, + "num_tokens": 2269660.0, + "repeat_count": 2.0, + "routers_loss": 0.0720924660563469, + "skip_count": 2.0, + "step": 1406, + "text_loss": 0.40927737951278687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.610507778103904, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009804265160911253, + "loss": 0.0266, + "macro_f1": 0.5492662787437439, + "num_tokens": 2273335.0, + "repeat_count": 0.0, + "routers_loss": 0.02400495670735836, + "skip_count": 2.0, + "step": 1408, + "text_loss": 0.1777762621641159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.6199002054593485, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2314453125, + "learning_rate": 0.0009803406698430433, + "loss": 0.0371, + "macro_f1": 0.3272727429866791, + "num_tokens": 2277107.0, + "repeat_count": 0.0, + "routers_loss": 0.02560107782483101, + "skip_count": 1.0, + "step": 1410, + "text_loss": 0.17955881357192993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.629292632814793, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009802546395275104, + "loss": 0.0349, + "macro_f1": 0.3333333432674408, + "num_tokens": 2281638.0, + "repeat_count": 0.0, + "routers_loss": 0.006655813194811344, + "skip_count": 0.0, + "step": 1412, + "text_loss": 0.20882295072078705 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 6.638685060170237, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.08740234375, + "learning_rate": 0.000980168425177494, + "loss": 0.0342, + "macro_f1": 0.8200000524520874, + "num_tokens": 2284876.0, + "repeat_count": 1.0, + "routers_loss": 0.06325097382068634, + "skip_count": 3.0, + "step": 1414, + "text_loss": 0.26035264134407043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.000980082026826031, + "loss": 0.0315, + "macro_f1": 0.3272727429866791, + "num_tokens": 2288938.0, + "repeat_count": 1.0, + "routers_loss": 0.013436575420200825, + "skip_count": 0.0, + "step": 1416, + "text_loss": 0.5502325892448425 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.657469914881127, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0009799954445062296, + "loss": 0.0193, + "macro_f1": 0.6603773832321167, + "num_tokens": 2292317.0, + "repeat_count": 1.0, + "routers_loss": 0.011264479719102383, + "skip_count": 1.0, + "step": 1418, + "text_loss": 0.48075684905052185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 6.666862342236572, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009799086782512686, + "loss": 0.0292, + "macro_f1": 0.5492662787437439, + "num_tokens": 2295935.0, + "repeat_count": 0.0, + "routers_loss": 0.02833271212875843, + "skip_count": 2.0, + "step": 1420, + "text_loss": 0.18221206963062286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09375, + "learning_rate": 0.0009798217280943967, + "loss": 0.0356, + "macro_f1": 0.6666666865348816, + "num_tokens": 2298927.0, + "repeat_count": 0.0, + "routers_loss": 0.009208574891090393, + "skip_count": 1.0, + "step": 1422, + "text_loss": 0.48686322569847107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.685647196947461, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0009797345940689335, + "loss": 0.0267, + "macro_f1": 0.3272727429866791, + "num_tokens": 2301541.0, + "repeat_count": 0.0, + "routers_loss": 0.015011847950518131, + "skip_count": 0.0, + "step": 1424, + "text_loss": 0.49446266889572144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.4000000059604645, + "avg_layers": 26.0, + "epoch": 6.695039624302906, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.0, + "f1_skip": 0.5714285969734192, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009796472762082687, + "loss": 0.0338, + "macro_f1": 0.5034013986587524, + "num_tokens": 2304589.0, + "repeat_count": 0.0, + "routers_loss": 0.05912091210484505, + "skip_count": 5.0, + "step": 1426, + "text_loss": 0.23945684731006622 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.70443205165835, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.000979559774545863, + "loss": 0.0405, + "macro_f1": 0.3272727429866791, + "num_tokens": 2307860.0, + "repeat_count": 0.0, + "routers_loss": 0.021242303773760796, + "skip_count": 1.0, + "step": 1428, + "text_loss": 0.531273365020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.000979472089115247, + "loss": 0.0276, + "macro_f1": 0.32098764181137085, + "num_tokens": 2311581.0, + "repeat_count": 0.0, + "routers_loss": 0.02768544852733612, + "skip_count": 2.0, + "step": 1430, + "text_loss": 0.2497459501028061 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.000979384219950022, + "loss": 0.0346, + "macro_f1": 0.3333333432674408, + "num_tokens": 2314639.0, + "repeat_count": 0.0, + "routers_loss": 0.008678150363266468, + "skip_count": 0.0, + "step": 1432, + "text_loss": 0.6579355001449585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.732609333724684, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08056640625, + "learning_rate": 0.0009792961670838595, + "loss": 0.0362, + "macro_f1": 0.3272727429866791, + "num_tokens": 2317927.0, + "repeat_count": 1.0, + "routers_loss": 0.03325597569346428, + "skip_count": 0.0, + "step": 1434, + "text_loss": 0.5209436416625977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.742001761080129, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009792079305505016, + "loss": 0.0306, + "macro_f1": 0.3272727429866791, + "num_tokens": 2321065.0, + "repeat_count": 1.0, + "routers_loss": 0.019228918477892876, + "skip_count": 0.0, + "step": 1436, + "text_loss": 0.41087067127227783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10986328125, + "learning_rate": 0.000979119510383761, + "loss": 0.0371, + "macro_f1": 0.3333333432674408, + "num_tokens": 2323714.0, + "repeat_count": 0.0, + "routers_loss": 0.017071325331926346, + "skip_count": 0.0, + "step": 1438, + "text_loss": 0.21490029990673065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.760786615791019, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2060546875, + "learning_rate": 0.00097903090661752, + "loss": 0.0309, + "macro_f1": 0.3333333432674408, + "num_tokens": 2326454.0, + "repeat_count": 0.0, + "routers_loss": 0.00991755723953247, + "skip_count": 0.0, + "step": 1440, + "text_loss": 0.23847346007823944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.232421875, + "learning_rate": 0.000978942119285732, + "loss": 0.0404, + "macro_f1": 0.3272727429866791, + "num_tokens": 2329462.0, + "repeat_count": 0.0, + "routers_loss": 0.04908733069896698, + "skip_count": 1.0, + "step": 1442, + "text_loss": 0.23343028128147125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.7795714705019074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009788531484224204, + "loss": 0.0264, + "macro_f1": 0.3333333432674408, + "num_tokens": 2332146.0, + "repeat_count": 0.0, + "routers_loss": 0.0032628148328512907, + "skip_count": 0.0, + "step": 1444, + "text_loss": 0.47423800826072693 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 6.788963897857353, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009787639940616788, + "loss": 0.0405, + "macro_f1": 0.7018141150474548, + "num_tokens": 2335738.0, + "repeat_count": 1.0, + "routers_loss": 0.14336998760700226, + "skip_count": 3.0, + "step": 1446, + "text_loss": 0.21837592124938965 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.189453125, + "learning_rate": 0.0009786746562376717, + "loss": 0.0241, + "macro_f1": 0.6666666865348816, + "num_tokens": 2338488.0, + "repeat_count": 0.0, + "routers_loss": 0.010542908683419228, + "skip_count": 1.0, + "step": 1448, + "text_loss": 1.0614757537841797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.807748752568242, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009785851349846334, + "loss": 0.0268, + "macro_f1": 0.3333333432674408, + "num_tokens": 2342074.0, + "repeat_count": 0.0, + "routers_loss": 0.005998016335070133, + "skip_count": 0.0, + "step": 1450, + "text_loss": 0.4269719421863556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 6.817141179923686, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009784954303368686, + "loss": 0.0384, + "macro_f1": 0.44705885648727417, + "num_tokens": 2345838.0, + "repeat_count": 0.0, + "routers_loss": 0.0959126204252243, + "skip_count": 3.0, + "step": 1452, + "text_loss": 0.3315916955471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1005859375, + "learning_rate": 0.0009784055423287521, + "loss": 0.0218, + "macro_f1": 0.3333333432674408, + "num_tokens": 2348939.0, + "repeat_count": 0.0, + "routers_loss": 0.0025467623490840197, + "skip_count": 0.0, + "step": 1454, + "text_loss": 0.6162732839584351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.835926034634576, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009783154709947293, + "loss": 0.0256, + "macro_f1": 0.3272727429866791, + "num_tokens": 2352232.0, + "repeat_count": 0.0, + "routers_loss": 0.01860538125038147, + "skip_count": 1.0, + "step": 1456, + "text_loss": 0.23928768932819366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.84531846199002, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009782252163693158, + "loss": 0.0201, + "macro_f1": 0.3272727429866791, + "num_tokens": 2355159.0, + "repeat_count": 0.0, + "routers_loss": 0.04412713274359703, + "skip_count": 1.0, + "step": 1458, + "text_loss": 0.3371323347091675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.21484375, + "learning_rate": 0.0009781347784870973, + "loss": 0.0379, + "macro_f1": 0.3333333432674408, + "num_tokens": 2358175.0, + "repeat_count": 0.0, + "routers_loss": 0.006809141952544451, + "skip_count": 0.0, + "step": 1460, + "text_loss": 0.547267735004425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.86410331670091, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009780441573827296, + "loss": 0.03, + "macro_f1": 0.3076923191547394, + "num_tokens": 2360991.0, + "repeat_count": 0.0, + "routers_loss": 0.08924390375614166, + "skip_count": 4.0, + "step": 1462, + "text_loss": 0.7026563882827759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1865234375, + "learning_rate": 0.000977953353090939, + "loss": 0.0272, + "macro_f1": 0.3333333432674408, + "num_tokens": 2363894.0, + "repeat_count": 0.0, + "routers_loss": 0.021858472377061844, + "skip_count": 0.0, + "step": 1464, + "text_loss": 0.2718065083026886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.882888171411799, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009778623656465219, + "loss": 0.0338, + "macro_f1": 0.32098764181137085, + "num_tokens": 2367265.0, + "repeat_count": 0.0, + "routers_loss": 0.044781096279621124, + "skip_count": 0.0, + "step": 1466, + "text_loss": 0.5008095502853394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.892280598767244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009777711950843448, + "loss": 0.0212, + "macro_f1": 0.3333333432674408, + "num_tokens": 2370186.0, + "repeat_count": 0.0, + "routers_loss": 0.0040459707379341125, + "skip_count": 0.0, + "step": 1468, + "text_loss": 0.5242461562156677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 6.901673026122689, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009776798414393446, + "loss": 0.0279, + "macro_f1": 0.6598639488220215, + "num_tokens": 2373314.0, + "repeat_count": 1.0, + "routers_loss": 0.0708528608083725, + "skip_count": 3.0, + "step": 1470, + "text_loss": 0.2821732461452484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.911065453478133, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1328125, + "learning_rate": 0.0009775883047465279, + "loss": 0.0414, + "macro_f1": 0.31446540355682373, + "num_tokens": 2376435.0, + "repeat_count": 1.0, + "routers_loss": 0.0290578193962574, + "skip_count": 1.0, + "step": 1472, + "text_loss": 0.8438440561294556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.9204578808335775, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10546875, + "learning_rate": 0.000977496585040972, + "loss": 0.0373, + "macro_f1": 0.3333333432674408, + "num_tokens": 2380244.0, + "repeat_count": 0.0, + "routers_loss": 0.010360375046730042, + "skip_count": 0.0, + "step": 1474, + "text_loss": 0.4356135427951813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 6.929850308189023, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.000977404682357824, + "loss": 0.0294, + "macro_f1": 0.3272727429866791, + "num_tokens": 2383498.0, + "repeat_count": 0.0, + "routers_loss": 0.023518972098827362, + "skip_count": 0.0, + "step": 1476, + "text_loss": 0.25195425748825073 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 6.939242735544467, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.000977312596732301, + "loss": 0.0375, + "macro_f1": 0.9544159770011902, + "num_tokens": 2386414.0, + "repeat_count": 5.0, + "routers_loss": 0.08190606534481049, + "skip_count": 4.0, + "step": 1478, + "text_loss": 0.6586798429489136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 6.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10546875, + "learning_rate": 0.0009772203281996905, + "loss": 0.0336, + "macro_f1": 1.0, + "num_tokens": 2389399.0, + "repeat_count": 1.0, + "routers_loss": 0.016441475600004196, + "skip_count": 2.0, + "step": 1480, + "text_loss": 0.3671986758708954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009771278767953502, + "loss": 0.0357, + "macro_f1": 0.3333333432674408, + "num_tokens": 2392400.0, + "repeat_count": 0.0, + "routers_loss": 0.019211363047361374, + "skip_count": 0.0, + "step": 1482, + "text_loss": 0.27418580651283264 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.967420017610801, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009770352425547072, + "loss": 0.0292, + "macro_f1": 0.3333333432674408, + "num_tokens": 2395123.0, + "repeat_count": 0.0, + "routers_loss": 0.015800386667251587, + "skip_count": 0.0, + "step": 1484, + "text_loss": 0.19896622002124786 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 6.976812444966246, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.5, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009769424255132596, + "loss": 0.0256, + "macro_f1": 0.4871794879436493, + "num_tokens": 2397359.0, + "repeat_count": 3.0, + "routers_loss": 0.06670158356428146, + "skip_count": 0.0, + "step": 1486, + "text_loss": 0.4229799509048462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.98620487232169, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1162109375, + "learning_rate": 0.0009768494257065747, + "loss": 0.0218, + "macro_f1": 0.3272727429866791, + "num_tokens": 2400387.0, + "repeat_count": 0.0, + "routers_loss": 0.011144762858748436, + "skip_count": 1.0, + "step": 1488, + "text_loss": 0.4264226257801056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 6.995597299677136, + "f1_execute": 0.9019608497619629, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0009767562431702904, + "loss": 0.0387, + "macro_f1": 0.3006536364555359, + "num_tokens": 2403241.0, + "repeat_count": 2.0, + "routers_loss": 0.12339717149734497, + "skip_count": 3.0, + "step": 1490, + "text_loss": 0.2850193977355957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.004696213677723, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0009766628779401142, + "loss": 0.0215, + "macro_f1": 0.6666666865348816, + "num_tokens": 2406087.0, + "repeat_count": 0.0, + "routers_loss": 0.008174685761332512, + "skip_count": 1.0, + "step": 1492, + "text_loss": 0.6756544709205627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.000976569330051824, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 2409312.0, + "repeat_count": 0.0, + "routers_loss": 0.0021256296895444393, + "skip_count": 0.0, + "step": 1494, + "text_loss": 0.4789894223213196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.0234810683886115, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009764755995412677, + "loss": 0.0193, + "macro_f1": 0.3333333432674408, + "num_tokens": 2412758.0, + "repeat_count": 0.0, + "routers_loss": 0.003944927826523781, + "skip_count": 0.0, + "step": 1496, + "text_loss": 0.5157490968704224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.032873495744056, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009763816864443627, + "loss": 0.0239, + "macro_f1": 0.3272727429866791, + "num_tokens": 2416079.0, + "repeat_count": 1.0, + "routers_loss": 0.03893325850367546, + "skip_count": 0.0, + "step": 1498, + "text_loss": 0.28045418858528137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009762875907970968, + "loss": 0.0199, + "macro_f1": 0.3333333432674408, + "num_tokens": 2420340.0, + "repeat_count": 0.0, + "routers_loss": 0.0017725443467497826, + "skip_count": 0.0, + "step": 1500, + "text_loss": 0.35550856590270996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.051658350454946, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009761933126355277, + "loss": 0.0245, + "macro_f1": 0.3272727429866791, + "num_tokens": 2424735.0, + "repeat_count": 0.0, + "routers_loss": 0.01393749937415123, + "skip_count": 1.0, + "step": 1502, + "text_loss": 0.38840189576148987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009760988519957828, + "loss": 0.0249, + "macro_f1": 0.6666666865348816, + "num_tokens": 2428132.0, + "repeat_count": 0.0, + "routers_loss": 0.01687910407781601, + "skip_count": 2.0, + "step": 1504, + "text_loss": 0.3031681478023529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.0704432051658355, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009760042089140598, + "loss": 0.0193, + "macro_f1": 0.3144654333591461, + "num_tokens": 2431592.0, + "repeat_count": 1.0, + "routers_loss": 0.04704280197620392, + "skip_count": 2.0, + "step": 1506, + "text_loss": 0.16355200111865997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009759093834266259, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 2434236.0, + "repeat_count": 0.0, + "routers_loss": 0.0016075772000476718, + "skip_count": 0.0, + "step": 1508, + "text_loss": 0.6080073118209839 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009758143755698186, + "loss": 0.015, + "macro_f1": 0.3333333432674408, + "num_tokens": 2437170.0, + "repeat_count": 0.0, + "routers_loss": 0.008451299741864204, + "skip_count": 0.0, + "step": 1510, + "text_loss": 0.22100484371185303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 7.098620487232169, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009757191853800449, + "loss": 0.0227, + "macro_f1": 0.5866667032241821, + "num_tokens": 2441187.0, + "repeat_count": 1.0, + "routers_loss": 0.046565692871809006, + "skip_count": 3.0, + "step": 1512, + "text_loss": 0.25098952651023865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.108012914587614, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.000975623812893782, + "loss": 0.0276, + "macro_f1": 0.3272727429866791, + "num_tokens": 2444664.0, + "repeat_count": 0.0, + "routers_loss": 0.02872578240931034, + "skip_count": 1.0, + "step": 1514, + "text_loss": 0.4952253997325897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.1174053419430585, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1142578125, + "learning_rate": 0.0009755282581475768, + "loss": 0.0233, + "macro_f1": 0.3333333432674408, + "num_tokens": 2447748.0, + "repeat_count": 0.0, + "routers_loss": 0.002055214950814843, + "skip_count": 0.0, + "step": 1516, + "text_loss": 0.7465500831604004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.126797769298503, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10302734375, + "learning_rate": 0.000975432521178046, + "loss": 0.0216, + "macro_f1": 0.3272727429866791, + "num_tokens": 2450834.0, + "repeat_count": 1.0, + "routers_loss": 0.04498551785945892, + "skip_count": 0.0, + "step": 1518, + "text_loss": 0.28144413232803345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009753366020218763, + "loss": 0.0234, + "macro_f1": 0.3333333432674408, + "num_tokens": 2454233.0, + "repeat_count": 0.0, + "routers_loss": 0.003669742727652192, + "skip_count": 0.0, + "step": 1520, + "text_loss": 0.5667551755905151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009752405007158238, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2457331.0, + "repeat_count": 0.0, + "routers_loss": 0.010455607436597347, + "skip_count": 0.0, + "step": 1522, + "text_loss": 0.19575810432434082 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 7.154975051364837, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009751442172967151, + "loss": 0.0193, + "macro_f1": 0.8823530077934265, + "num_tokens": 2459935.0, + "repeat_count": 2.0, + "routers_loss": 0.025189083069562912, + "skip_count": 1.0, + "step": 1524, + "text_loss": 0.45453405380249023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.164367478720282, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.000975047751801446, + "loss": 0.0187, + "macro_f1": 0.3272727429866791, + "num_tokens": 2463008.0, + "repeat_count": 0.0, + "routers_loss": 0.012297490611672401, + "skip_count": 0.0, + "step": 1526, + "text_loss": 0.31437572836875916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009749511042669823, + "loss": 0.0233, + "macro_f1": 0.3333333432674408, + "num_tokens": 2466475.0, + "repeat_count": 0.0, + "routers_loss": 0.011026266030967236, + "skip_count": 0.0, + "step": 1528, + "text_loss": 0.46604859828948975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.183152333431171, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009748542747303595, + "loss": 0.0182, + "macro_f1": 0.3272727429866791, + "num_tokens": 2469320.0, + "repeat_count": 0.0, + "routers_loss": 0.011934996582567692, + "skip_count": 1.0, + "step": 1530, + "text_loss": 0.7764923572540283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009747572632286827, + "loss": 0.0203, + "macro_f1": 0.3333333432674408, + "num_tokens": 2472468.0, + "repeat_count": 0.0, + "routers_loss": 0.005786920432001352, + "skip_count": 0.0, + "step": 1532, + "text_loss": 0.3555782437324524 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009746600697991271, + "loss": 0.02, + "macro_f1": 0.6666666865348816, + "num_tokens": 2475736.0, + "repeat_count": 1.0, + "routers_loss": 0.0026990731712430716, + "skip_count": 0.0, + "step": 1534, + "text_loss": 0.49561792612075806 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 7.2113296154975055, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0556640625, + "learning_rate": 0.0009745626944789375, + "loss": 0.0204, + "macro_f1": 0.8823530077934265, + "num_tokens": 2478887.0, + "repeat_count": 1.0, + "routers_loss": 0.020221207290887833, + "skip_count": 2.0, + "step": 1536, + "text_loss": 0.5375416278839111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.22072204285295, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12158203125, + "learning_rate": 0.0009744651373054279, + "loss": 0.0286, + "macro_f1": 0.3272727429866791, + "num_tokens": 2481293.0, + "repeat_count": 0.0, + "routers_loss": 0.03131086751818657, + "skip_count": 1.0, + "step": 1538, + "text_loss": 0.5241039395332336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 7.230114470208394, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.08984375, + "learning_rate": 0.0009743673983159828, + "loss": 0.0241, + "macro_f1": 0.6122449040412903, + "num_tokens": 2484403.0, + "repeat_count": 0.0, + "routers_loss": 0.04448170214891434, + "skip_count": 4.0, + "step": 1540, + "text_loss": 0.7465724349021912 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009742694775480557, + "loss": 0.0265, + "macro_f1": 0.6666666865348816, + "num_tokens": 2487952.0, + "repeat_count": 0.0, + "routers_loss": 0.007171491626650095, + "skip_count": 1.0, + "step": 1542, + "text_loss": 0.2877117097377777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009741713750391703, + "loss": 0.0171, + "macro_f1": 0.6666666865348816, + "num_tokens": 2490815.0, + "repeat_count": 1.0, + "routers_loss": 0.004559285007417202, + "skip_count": 0.0, + "step": 1544, + "text_loss": 0.6097800135612488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.258291752274729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009740730908269193, + "loss": 0.0174, + "macro_f1": 0.3333333432674408, + "num_tokens": 2494727.0, + "repeat_count": 0.0, + "routers_loss": 0.005271553061902523, + "skip_count": 0.0, + "step": 1546, + "text_loss": 0.5431114435195923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009739746249489658, + "loss": 0.0239, + "macro_f1": 0.3333333432674408, + "num_tokens": 2499266.0, + "repeat_count": 0.0, + "routers_loss": 0.0015409323386847973, + "skip_count": 0.0, + "step": 1548, + "text_loss": 0.4702678322792053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.277076606985618, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1171875, + "learning_rate": 0.0009738759774430417, + "loss": 0.0216, + "macro_f1": 0.32098764181137085, + "num_tokens": 2502273.0, + "repeat_count": 1.0, + "routers_loss": 0.030183158814907074, + "skip_count": 1.0, + "step": 1550, + "text_loss": 0.3239189088344574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.286469034341063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009737771483469493, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 2507624.0, + "repeat_count": 0.0, + "routers_loss": 0.005410848651081324, + "skip_count": 0.0, + "step": 1552, + "text_loss": 0.4014642834663391 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009736781376985598, + "loss": 0.0168, + "macro_f1": 0.6666666865348816, + "num_tokens": 2510366.0, + "repeat_count": 0.0, + "routers_loss": 0.0066976165398955345, + "skip_count": 1.0, + "step": 1554, + "text_loss": 0.5924848914146423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.13671875, + "learning_rate": 0.0009735789455358144, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 2513317.0, + "repeat_count": 0.0, + "routers_loss": 0.002763477386906743, + "skip_count": 0.0, + "step": 1556, + "text_loss": 0.3222943842411041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.314646316407397, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009734795718967237, + "loss": 0.0283, + "macro_f1": 0.32098764181137085, + "num_tokens": 2516628.0, + "repeat_count": 0.0, + "routers_loss": 0.061566028743982315, + "skip_count": 2.0, + "step": 1558, + "text_loss": 0.3249334692955017 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009733800168193679, + "loss": 0.0228, + "macro_f1": 1.0, + "num_tokens": 2519424.0, + "repeat_count": 2.0, + "routers_loss": 0.017976421862840652, + "skip_count": 4.0, + "step": 1560, + "text_loss": 0.3341919481754303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.333431171118286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1826171875, + "learning_rate": 0.0009732802803418966, + "loss": 0.023, + "macro_f1": 0.3333333432674408, + "num_tokens": 2522922.0, + "repeat_count": 0.0, + "routers_loss": 0.002525332849472761, + "skip_count": 0.0, + "step": 1562, + "text_loss": 0.3176332712173462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.34282359847373, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07861328125, + "learning_rate": 0.0009731803625025292, + "loss": 0.0196, + "macro_f1": 0.3272727429866791, + "num_tokens": 2525811.0, + "repeat_count": 0.0, + "routers_loss": 0.015524424612522125, + "skip_count": 1.0, + "step": 1564, + "text_loss": 0.532774031162262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.3522160258291755, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10205078125, + "learning_rate": 0.0009730802633395541, + "loss": 0.0257, + "macro_f1": 0.6603773832321167, + "num_tokens": 2529157.0, + "repeat_count": 1.0, + "routers_loss": 0.08138631284236908, + "skip_count": 1.0, + "step": 1566, + "text_loss": 0.529487133026123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009729799828913298, + "loss": 0.0223, + "macro_f1": 0.3333333432674408, + "num_tokens": 2532249.0, + "repeat_count": 0.0, + "routers_loss": 0.0035867292899638414, + "skip_count": 0.0, + "step": 1568, + "text_loss": 0.503160297870636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.371000880540064, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009728795211962838, + "loss": 0.0259, + "macro_f1": 0.5492662787437439, + "num_tokens": 2535904.0, + "repeat_count": 0.0, + "routers_loss": 0.02987455204129219, + "skip_count": 2.0, + "step": 1570, + "text_loss": 0.9170270562171936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.380393307895509, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11865234375, + "learning_rate": 0.0009727788782929131, + "loss": 0.0273, + "macro_f1": 0.3272727429866791, + "num_tokens": 2538943.0, + "repeat_count": 1.0, + "routers_loss": 0.04676021635532379, + "skip_count": 0.0, + "step": 1572, + "text_loss": 0.29146310687065125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.389785735250954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009726780542197844, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 2541805.0, + "repeat_count": 0.0, + "routers_loss": 0.002127803163602948, + "skip_count": 0.0, + "step": 1574, + "text_loss": 1.0126502513885498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.142578125, + "learning_rate": 0.0009725770490155338, + "loss": 0.0262, + "macro_f1": 0.3333333432674408, + "num_tokens": 2546213.0, + "repeat_count": 0.0, + "routers_loss": 0.007609677035361528, + "skip_count": 0.0, + "step": 1576, + "text_loss": 0.190168559551239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.408570589961843, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009724758627188665, + "loss": 0.0356, + "macro_f1": 0.3272727429866791, + "num_tokens": 2549554.0, + "repeat_count": 0.0, + "routers_loss": 0.033554721623659134, + "skip_count": 1.0, + "step": 1578, + "text_loss": 0.2977406084537506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.4179630173172875, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.140625, + "learning_rate": 0.0009723744953685572, + "loss": 0.028, + "macro_f1": 0.3272727429866791, + "num_tokens": 2552785.0, + "repeat_count": 1.0, + "routers_loss": 0.027864238247275352, + "skip_count": 0.0, + "step": 1580, + "text_loss": 0.2700682580471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19921875, + "learning_rate": 0.0009722729470034503, + "loss": 0.0224, + "macro_f1": 0.3333333432674408, + "num_tokens": 2556550.0, + "repeat_count": 0.0, + "routers_loss": 0.004798175301402807, + "skip_count": 0.0, + "step": 1582, + "text_loss": 0.6559903025627136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.436747872028177, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.078125, + "learning_rate": 0.0009721712176624591, + "loss": 0.0242, + "macro_f1": 0.3333333432674408, + "num_tokens": 2559862.0, + "repeat_count": 0.0, + "routers_loss": 0.013764148578047752, + "skip_count": 0.0, + "step": 1584, + "text_loss": 0.2257535308599472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.446140299383622, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10986328125, + "learning_rate": 0.0009720693073845667, + "loss": 0.032, + "macro_f1": 0.5492662787437439, + "num_tokens": 2562766.0, + "repeat_count": 0.0, + "routers_loss": 0.01937069371342659, + "skip_count": 2.0, + "step": 1586, + "text_loss": 0.178413525223732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.455532726739067, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.150390625, + "learning_rate": 0.0009719672162088252, + "loss": 0.0306, + "macro_f1": 0.32098767161369324, + "num_tokens": 2566583.0, + "repeat_count": 1.0, + "routers_loss": 0.06224144622683525, + "skip_count": 0.0, + "step": 1588, + "text_loss": 0.3992367684841156 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 7.464925154094511, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.185546875, + "learning_rate": 0.0009718649441743559, + "loss": 0.0239, + "macro_f1": 0.9449735879898071, + "num_tokens": 2569516.0, + "repeat_count": 2.0, + "routers_loss": 0.06937911361455917, + "skip_count": 4.0, + "step": 1590, + "text_loss": 0.1945122629404068 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.00097176249132035, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2572418.0, + "repeat_count": 0.0, + "routers_loss": 0.0034326619934290648, + "skip_count": 0.0, + "step": 1592, + "text_loss": 0.6259906888008118 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009716598576860676, + "loss": 0.0278, + "macro_f1": 0.6666666865348816, + "num_tokens": 2575235.0, + "repeat_count": 1.0, + "routers_loss": 0.004557516425848007, + "skip_count": 0.0, + "step": 1594, + "text_loss": 0.6638736724853516 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 7.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009715570433108378, + "loss": 0.0198, + "macro_f1": 1.0, + "num_tokens": 2578157.0, + "repeat_count": 1.0, + "routers_loss": 0.015363055281341076, + "skip_count": 1.0, + "step": 1596, + "text_loss": 0.6530464887619019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009714540482340595, + "loss": 0.0268, + "macro_f1": 0.6666666865348816, + "num_tokens": 2581801.0, + "repeat_count": 1.0, + "routers_loss": 0.01257144846022129, + "skip_count": 0.0, + "step": 1598, + "text_loss": 0.5916110277175903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.5118872908717345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009713508724952006, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 2585204.0, + "repeat_count": 0.0, + "routers_loss": 0.003175645601004362, + "skip_count": 0.0, + "step": 1600, + "text_loss": 0.27901601791381836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0009712475161337981, + "loss": 0.0261, + "macro_f1": 0.3333333432674408, + "num_tokens": 2588286.0, + "repeat_count": 0.0, + "routers_loss": 0.004122321493923664, + "skip_count": 0.0, + "step": 1602, + "text_loss": 0.42420244216918945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009711439791894585, + "loss": 0.0341, + "macro_f1": 0.6666666865348816, + "num_tokens": 2591476.0, + "repeat_count": 0.0, + "routers_loss": 0.011215819045901299, + "skip_count": 1.0, + "step": 1604, + "text_loss": 0.5549933910369873 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.540064572938069, + "f1_execute": 0.9599999785423279, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.0703125, + "learning_rate": 0.0009710402617018574, + "loss": 0.0172, + "macro_f1": 0.8200000524520874, + "num_tokens": 2594336.0, + "repeat_count": 1.0, + "routers_loss": 0.02916567400097847, + "skip_count": 2.0, + "step": 1606, + "text_loss": 0.3263779282569885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009709363637107393, + "loss": 0.0209, + "macro_f1": 0.6666666865348816, + "num_tokens": 2597462.0, + "repeat_count": 0.0, + "routers_loss": 0.015897957608103752, + "skip_count": 1.0, + "step": 1608, + "text_loss": 0.20917139947414398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009708322852559184, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2601543.0, + "repeat_count": 0.0, + "routers_loss": 0.002211357234045863, + "skip_count": 0.0, + "step": 1610, + "text_loss": 0.450550377368927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.568241855004403, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.0009707280263772776, + "loss": 0.0277, + "macro_f1": 0.6666666865348816, + "num_tokens": 2604462.0, + "repeat_count": 0.0, + "routers_loss": 0.01615734025835991, + "skip_count": 2.0, + "step": 1612, + "text_loss": 0.6908381581306458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.577634282359847, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009706235871147688, + "loss": 0.0241, + "macro_f1": 0.5492662787437439, + "num_tokens": 2607484.0, + "repeat_count": 0.0, + "routers_loss": 0.022048067301511765, + "skip_count": 2.0, + "step": 1614, + "text_loss": 0.36691340804100037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.587026709715292, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10546875, + "learning_rate": 0.0009705189675084138, + "loss": 0.0176, + "macro_f1": 0.6666666865348816, + "num_tokens": 2610204.0, + "repeat_count": 0.0, + "routers_loss": 0.008503952994942665, + "skip_count": 1.0, + "step": 1616, + "text_loss": 0.5226598381996155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.596419137070737, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009704141675983029, + "loss": 0.0248, + "macro_f1": 0.3333333432674408, + "num_tokens": 2613128.0, + "repeat_count": 0.0, + "routers_loss": 0.0019020626787096262, + "skip_count": 0.0, + "step": 1618, + "text_loss": 0.6465088725090027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5714285969734192, + "avg_layers": 24.0, + "epoch": 7.6058115644261814, + "f1_execute": 0.9333333373069763, + "f1_repeat": 0.0, + "f1_skip": 0.7272727489471436, + "grad_norm": 0.107421875, + "learning_rate": 0.0009703091874245956, + "loss": 0.032, + "macro_f1": 0.5535354018211365, + "num_tokens": 2616360.0, + "repeat_count": 0.0, + "routers_loss": 0.11837691068649292, + "skip_count": 7.0, + "step": 1620, + "text_loss": 0.2987039089202881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.615203991781626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009702040270275204, + "loss": 0.0181, + "macro_f1": 0.3333333432674408, + "num_tokens": 2619606.0, + "repeat_count": 0.0, + "routers_loss": 0.0065958453342318535, + "skip_count": 0.0, + "step": 1622, + "text_loss": 0.6262096166610718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.103515625, + "learning_rate": 0.000970098686447375, + "loss": 0.0257, + "macro_f1": 0.6666666865348816, + "num_tokens": 2622499.0, + "repeat_count": 0.0, + "routers_loss": 0.013632026500999928, + "skip_count": 1.0, + "step": 1624, + "text_loss": 0.2392602562904358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.633988846492516, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.125, + "learning_rate": 0.0009699931657245264, + "loss": 0.0245, + "macro_f1": 0.5492662787437439, + "num_tokens": 2626002.0, + "repeat_count": 0.0, + "routers_loss": 0.012147823348641396, + "skip_count": 2.0, + "step": 1626, + "text_loss": 0.4742976129055023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009698874648994098, + "loss": 0.0285, + "macro_f1": 1.0, + "num_tokens": 2629847.0, + "repeat_count": 1.0, + "routers_loss": 0.010692884214222431, + "skip_count": 3.0, + "step": 1628, + "text_loss": 0.5090685486793518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.6527737012034045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009697815840125304, + "loss": 0.0265, + "macro_f1": 0.3333333432674408, + "num_tokens": 2633529.0, + "repeat_count": 0.0, + "routers_loss": 0.011442207731306553, + "skip_count": 0.0, + "step": 1630, + "text_loss": 0.1874329298734665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2119140625, + "learning_rate": 0.0009696755231044618, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 2636321.0, + "repeat_count": 0.0, + "routers_loss": 0.0026681360322982073, + "skip_count": 0.0, + "step": 1632, + "text_loss": 0.7650400400161743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.671558555914294, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10498046875, + "learning_rate": 0.0009695692822158466, + "loss": 0.0242, + "macro_f1": 0.3272727429866791, + "num_tokens": 2638840.0, + "repeat_count": 1.0, + "routers_loss": 0.033965807408094406, + "skip_count": 0.0, + "step": 1634, + "text_loss": 0.6175784468650818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009694628613873968, + "loss": 0.018, + "macro_f1": 0.3333333432674408, + "num_tokens": 2641886.0, + "repeat_count": 0.0, + "routers_loss": 0.007568214554339647, + "skip_count": 0.0, + "step": 1636, + "text_loss": 0.43139931559562683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.690343410625183, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.193359375, + "learning_rate": 0.0009693562606598929, + "loss": 0.025, + "macro_f1": 0.3333333432674408, + "num_tokens": 2645028.0, + "repeat_count": 0.0, + "routers_loss": 0.004973865579813719, + "skip_count": 0.0, + "step": 1638, + "text_loss": 0.6430339217185974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.699735837980628, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009692494800741844, + "loss": 0.0313, + "macro_f1": 0.3272727429866791, + "num_tokens": 2648209.0, + "repeat_count": 1.0, + "routers_loss": 0.049863800406455994, + "skip_count": 0.0, + "step": 1640, + "text_loss": 0.28138160705566406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.709128265336073, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.0009691425196711901, + "loss": 0.0398, + "macro_f1": 0.3272727429866791, + "num_tokens": 2651171.0, + "repeat_count": 0.0, + "routers_loss": 0.02112230286002159, + "skip_count": 0.0, + "step": 1642, + "text_loss": 0.3745322525501251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.718520692691517, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009690353794918971, + "loss": 0.0275, + "macro_f1": 0.3333333432674408, + "num_tokens": 2654093.0, + "repeat_count": 0.0, + "routers_loss": 0.0024304776452481747, + "skip_count": 0.0, + "step": 1644, + "text_loss": 0.4275154173374176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.000968928059577362, + "loss": 0.0244, + "macro_f1": 0.6666666865348816, + "num_tokens": 2657079.0, + "repeat_count": 0.0, + "routers_loss": 0.009320619516074657, + "skip_count": 1.0, + "step": 1646, + "text_loss": 0.46650025248527527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.737305547402407, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009688205599687099, + "loss": 0.0209, + "macro_f1": 0.3272727429866791, + "num_tokens": 2660951.0, + "repeat_count": 0.0, + "routers_loss": 0.011913162656128407, + "skip_count": 0.0, + "step": 1648, + "text_loss": 0.46644100546836853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.7466979747578515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009687128807071347, + "loss": 0.0284, + "macro_f1": 0.3333333432674408, + "num_tokens": 2663823.0, + "repeat_count": 0.0, + "routers_loss": 0.013754756189882755, + "skip_count": 0.0, + "step": 1650, + "text_loss": 0.40808847546577454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.103515625, + "learning_rate": 0.0009686050218338996, + "loss": 0.0286, + "macro_f1": 0.3333333432674408, + "num_tokens": 2667079.0, + "repeat_count": 0.0, + "routers_loss": 0.009099726565182209, + "skip_count": 0.0, + "step": 1652, + "text_loss": 0.2389989197254181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009684969833903359, + "loss": 0.0283, + "macro_f1": 0.6666666865348816, + "num_tokens": 2670162.0, + "repeat_count": 0.0, + "routers_loss": 0.0034928603563457727, + "skip_count": 1.0, + "step": 1654, + "text_loss": 0.6930749416351318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.774875256824186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009683887654178445, + "loss": 0.0261, + "macro_f1": 0.6666666865348816, + "num_tokens": 2673031.0, + "repeat_count": 0.0, + "routers_loss": 0.008340462110936642, + "skip_count": 1.0, + "step": 1656, + "text_loss": 0.277752548456192 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009682803679578947, + "loss": 0.0259, + "macro_f1": 0.3333333432674408, + "num_tokens": 2676092.0, + "repeat_count": 0.0, + "routers_loss": 0.004337446764111519, + "skip_count": 0.0, + "step": 1658, + "text_loss": 0.5176776051521301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.7936601115350745, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009681717910520244, + "loss": 0.0242, + "macro_f1": 0.32098764181137085, + "num_tokens": 2679479.0, + "repeat_count": 0.0, + "routers_loss": 0.034611742943525314, + "skip_count": 2.0, + "step": 1660, + "text_loss": 0.21485982835292816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.80305253889052, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009680630347418406, + "loss": 0.022, + "macro_f1": 0.5492662787437439, + "num_tokens": 2683289.0, + "repeat_count": 0.0, + "routers_loss": 0.03297121450304985, + "skip_count": 2.0, + "step": 1662, + "text_loss": 0.33801013231277466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.812444966245964, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1728515625, + "learning_rate": 0.000967954099069019, + "loss": 0.0411, + "macro_f1": 0.32098764181137085, + "num_tokens": 2685879.0, + "repeat_count": 1.0, + "routers_loss": 0.04551183059811592, + "skip_count": 1.0, + "step": 1664, + "text_loss": 0.41123488545417786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.821837393601409, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009678449840753038, + "loss": 0.0324, + "macro_f1": 0.32098764181137085, + "num_tokens": 2688910.0, + "repeat_count": 0.0, + "routers_loss": 0.05866450071334839, + "skip_count": 2.0, + "step": 1666, + "text_loss": 0.1740892380475998 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009677356898025082, + "loss": 0.023, + "macro_f1": 0.3333333432674408, + "num_tokens": 2691680.0, + "repeat_count": 0.0, + "routers_loss": 0.009243223816156387, + "skip_count": 0.0, + "step": 1668, + "text_loss": 0.2512350380420685 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.8406222483122985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.000967626216292514, + "loss": 0.0195, + "macro_f1": 0.3333333432674408, + "num_tokens": 2694895.0, + "repeat_count": 0.0, + "routers_loss": 0.005576452240347862, + "skip_count": 0.0, + "step": 1670, + "text_loss": 0.43294376134872437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 26.0, + "epoch": 7.850014675667743, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.09130859375, + "learning_rate": 0.0009675165635872715, + "loss": 0.0306, + "macro_f1": 0.44705885648727417, + "num_tokens": 2697806.0, + "repeat_count": 0.0, + "routers_loss": 0.05372785031795502, + "skip_count": 3.0, + "step": 1672, + "text_loss": 0.1614082306623459 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 7.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009674067317288, + "loss": 0.0296, + "macro_f1": 0.6666666865348816, + "num_tokens": 2700529.0, + "repeat_count": 1.0, + "routers_loss": 0.018131591379642487, + "skip_count": 0.0, + "step": 1674, + "text_loss": 0.2093173861503601 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009672967207591869, + "loss": 0.0257, + "macro_f1": 0.3272727429866791, + "num_tokens": 2703650.0, + "repeat_count": 0.0, + "routers_loss": 0.0673515796661377, + "skip_count": 1.0, + "step": 1676, + "text_loss": 0.3029400110244751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 7.878191957734077, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009671865307205892, + "loss": 0.021, + "macro_f1": 0.32098767161369324, + "num_tokens": 2707615.0, + "repeat_count": 0.0, + "routers_loss": 0.03821169584989548, + "skip_count": 1.0, + "step": 1678, + "text_loss": 0.2262786477804184 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 7.8875843850895215, + "f1_execute": 0.9756097793579102, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.1396484375, + "learning_rate": 0.0009670761616552315, + "loss": 0.0465, + "macro_f1": 0.9615669250488281, + "num_tokens": 2710894.0, + "repeat_count": 2.0, + "routers_loss": 0.042625464498996735, + "skip_count": 6.0, + "step": 1680, + "text_loss": 0.29623574018478394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.896976812444966, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.169921875, + "learning_rate": 0.0009669656136054074, + "loss": 0.0289, + "macro_f1": 0.3333333432674408, + "num_tokens": 2714330.0, + "repeat_count": 0.0, + "routers_loss": 0.0037571541033685207, + "skip_count": 0.0, + "step": 1682, + "text_loss": 0.7510389089584351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.906369239800411, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0009668548866134795, + "loss": 0.0256, + "macro_f1": 0.3333333432674408, + "num_tokens": 2717176.0, + "repeat_count": 0.0, + "routers_loss": 0.004142968449741602, + "skip_count": 0.0, + "step": 1684, + "text_loss": 0.3273485600948334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 7.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009667439807218783, + "loss": 0.0233, + "macro_f1": 0.6666666865348816, + "num_tokens": 2720628.0, + "repeat_count": 0.0, + "routers_loss": 0.008753842674195766, + "skip_count": 2.0, + "step": 1686, + "text_loss": 0.4314708709716797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 7.9251540945113, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009666328959731033, + "loss": 0.0211, + "macro_f1": 0.6603773832321167, + "num_tokens": 2723739.0, + "repeat_count": 1.0, + "routers_loss": 0.022674910724163055, + "skip_count": 1.0, + "step": 1688, + "text_loss": 0.25734150409698486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 7.934546521866745, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009665216324097222, + "loss": 0.0324, + "macro_f1": 0.5934640765190125, + "num_tokens": 2726644.0, + "repeat_count": 0.0, + "routers_loss": 0.03932750225067139, + "skip_count": 3.0, + "step": 1690, + "text_loss": 0.24511034786701202 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.94393894922219, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09765625, + "learning_rate": 0.0009664101900743714, + "loss": 0.0255, + "macro_f1": 0.3272727429866791, + "num_tokens": 2729662.0, + "repeat_count": 0.0, + "routers_loss": 0.012672754004597664, + "skip_count": 1.0, + "step": 1692, + "text_loss": 0.39431414008140564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 7.953331376577634, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.076171875, + "learning_rate": 0.000966298569009756, + "loss": 0.0231, + "macro_f1": 0.5492662787437439, + "num_tokens": 2732578.0, + "repeat_count": 0.0, + "routers_loss": 0.01548632513731718, + "skip_count": 2.0, + "step": 1694, + "text_loss": 0.12439999729394913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.962723803933079, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009661867692586494, + "loss": 0.0153, + "macro_f1": 0.32098764181137085, + "num_tokens": 2735887.0, + "repeat_count": 0.0, + "routers_loss": 0.05622401833534241, + "skip_count": 2.0, + "step": 1696, + "text_loss": 0.29024389386177063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.972116231288524, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.087890625, + "learning_rate": 0.0009660747908638933, + "loss": 0.0205, + "macro_f1": 0.3272727429866791, + "num_tokens": 2739293.0, + "repeat_count": 0.0, + "routers_loss": 0.041060201823711395, + "skip_count": 1.0, + "step": 1698, + "text_loss": 0.39461007714271545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.9815086586439685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1767578125, + "learning_rate": 0.0009659626338683981, + "loss": 0.0369, + "macro_f1": 0.3333333432674408, + "num_tokens": 2742468.0, + "repeat_count": 0.0, + "routers_loss": 0.007251353468745947, + "skip_count": 0.0, + "step": 1700, + "text_loss": 0.2751767635345459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 7.990901085999413, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009658502983151427, + "loss": 0.0186, + "macro_f1": 0.3272727429866791, + "num_tokens": 2745123.0, + "repeat_count": 0.0, + "routers_loss": 0.012847424484789371, + "skip_count": 1.0, + "step": 1702, + "text_loss": 0.4756404757499695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009657377842471742, + "loss": 0.0313, + "macro_f1": 0.6666666865348816, + "num_tokens": 2748016.0, + "repeat_count": 0.0, + "routers_loss": 0.007060411386191845, + "skip_count": 1.0, + "step": 1704, + "text_loss": 0.9571210145950317 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.009392427355445, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10009765625, + "learning_rate": 0.0009656250917076081, + "loss": 0.0188, + "macro_f1": 0.5492662787437439, + "num_tokens": 2750717.0, + "repeat_count": 0.0, + "routers_loss": 0.016748681664466858, + "skip_count": 2.0, + "step": 1706, + "text_loss": 0.14542843401432037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0009655122207396285, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 2753635.0, + "repeat_count": 0.0, + "routers_loss": 0.013607042841613293, + "skip_count": 0.0, + "step": 1708, + "text_loss": 0.21836471557617188 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009653991713864878, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2756643.0, + "repeat_count": 0.0, + "routers_loss": 0.0012097888393327594, + "skip_count": 0.0, + "step": 1710, + "text_loss": 0.635187029838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1171875, + "learning_rate": 0.0009652859436915066, + "loss": 0.0231, + "macro_f1": 0.3333333432674408, + "num_tokens": 2759432.0, + "repeat_count": 0.0, + "routers_loss": 0.006196760106831789, + "skip_count": 0.0, + "step": 1712, + "text_loss": 0.5629420876502991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009651725376980743, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 2762538.0, + "repeat_count": 0.0, + "routers_loss": 0.0042513771913945675, + "skip_count": 0.0, + "step": 1714, + "text_loss": 0.39522525668144226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 29.0, + "epoch": 8.056354564132668, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009650589534496479, + "loss": 0.0194, + "macro_f1": 0.8194444179534912, + "num_tokens": 2765571.0, + "repeat_count": 2.0, + "routers_loss": 0.03596706688404083, + "skip_count": 3.0, + "step": 1716, + "text_loss": 0.6252416968345642 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009649451909897532, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 2769206.0, + "repeat_count": 0.0, + "routers_loss": 0.0025788163766264915, + "skip_count": 0.0, + "step": 1718, + "text_loss": 0.8851634860038757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.0009648312503619843, + "loss": 0.0265, + "macro_f1": 0.3333333432674408, + "num_tokens": 2772488.0, + "repeat_count": 0.0, + "routers_loss": 0.004443451762199402, + "skip_count": 0.0, + "step": 1720, + "text_loss": 0.8568580746650696 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 8.084531846199003, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1552734375, + "learning_rate": 0.0009647171316100034, + "loss": 0.0265, + "macro_f1": 0.9265305995941162, + "num_tokens": 2776482.0, + "repeat_count": 1.0, + "routers_loss": 0.022948263213038445, + "skip_count": 3.0, + "step": 1722, + "text_loss": 0.13431036472320557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1572265625, + "learning_rate": 0.0009646028347775409, + "loss": 0.0204, + "macro_f1": 0.6666666865348816, + "num_tokens": 2778966.0, + "repeat_count": 0.0, + "routers_loss": 0.011328035034239292, + "skip_count": 1.0, + "step": 1724, + "text_loss": 0.2085491120815277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08984375, + "learning_rate": 0.0009644883599083958, + "loss": 0.0238, + "macro_f1": 0.3333333432674408, + "num_tokens": 2781968.0, + "repeat_count": 0.0, + "routers_loss": 0.002208018908277154, + "skip_count": 0.0, + "step": 1726, + "text_loss": 0.4948323965072632 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.112709128265337, + "f1_execute": 0.9411764740943909, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009643737070464349, + "loss": 0.0158, + "macro_f1": 0.6470588445663452, + "num_tokens": 2784666.0, + "repeat_count": 1.0, + "routers_loss": 0.04391832649707794, + "skip_count": 2.0, + "step": 1728, + "text_loss": 0.39060094952583313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0009642588762355935, + "loss": 0.0212, + "macro_f1": 0.6666666865348816, + "num_tokens": 2787558.0, + "repeat_count": 0.0, + "routers_loss": 0.004497280344367027, + "skip_count": 1.0, + "step": 1730, + "text_loss": 0.34908708930015564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009641438675198748, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 2790474.0, + "repeat_count": 0.0, + "routers_loss": 0.00583475548774004, + "skip_count": 0.0, + "step": 1732, + "text_loss": 0.5720033049583435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0009640286809433508, + "loss": 0.0235, + "macro_f1": 0.3333333432674408, + "num_tokens": 2793272.0, + "repeat_count": 0.0, + "routers_loss": 0.007826375775039196, + "skip_count": 0.0, + "step": 1734, + "text_loss": 0.32181721925735474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009639133165501606, + "loss": 0.0192, + "macro_f1": 0.3333333432674408, + "num_tokens": 2797726.0, + "repeat_count": 0.0, + "routers_loss": 0.0019055595621466637, + "skip_count": 0.0, + "step": 1736, + "text_loss": 0.620936393737793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.134765625, + "learning_rate": 0.0009637977743845124, + "loss": 0.0229, + "macro_f1": 0.3333333432674408, + "num_tokens": 2800706.0, + "repeat_count": 0.0, + "routers_loss": 0.0028302327264100313, + "skip_count": 0.0, + "step": 1738, + "text_loss": 0.6473138332366943 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009636820544906823, + "loss": 0.0146, + "macro_f1": 1.0, + "num_tokens": 2803847.0, + "repeat_count": 1.0, + "routers_loss": 0.01105099730193615, + "skip_count": 2.0, + "step": 1740, + "text_loss": 0.4401201903820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.178456119753449, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009635661569130141, + "loss": 0.0195, + "macro_f1": 0.5934640765190125, + "num_tokens": 2807235.0, + "repeat_count": 0.0, + "routers_loss": 0.02619045600295067, + "skip_count": 3.0, + "step": 1742, + "text_loss": 0.459264874458313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.187848547108894, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009634500816959202, + "loss": 0.0162, + "macro_f1": 0.6666666865348816, + "num_tokens": 2810396.0, + "repeat_count": 0.0, + "routers_loss": 0.007915694266557693, + "skip_count": 2.0, + "step": 1744, + "text_loss": 0.5084020495414734 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.1748046875, + "learning_rate": 0.0009633338288838805, + "loss": 0.0271, + "macro_f1": 0.5492662787437439, + "num_tokens": 2813215.0, + "repeat_count": 2.0, + "routers_loss": 0.08364596217870712, + "skip_count": 0.0, + "step": 1746, + "text_loss": 0.27681824564933777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 30.0, + "epoch": 8.206633401819783, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009632173985214438, + "loss": 0.0156, + "macro_f1": 0.8817967176437378, + "num_tokens": 2816452.0, + "repeat_count": 3.0, + "routers_loss": 0.028805451467633247, + "skip_count": 2.0, + "step": 1748, + "text_loss": 0.4678419530391693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.216025829175228, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.000963100790653226, + "loss": 0.0188, + "macro_f1": 0.3272727429866791, + "num_tokens": 2819364.0, + "repeat_count": 0.0, + "routers_loss": 0.03056817688047886, + "skip_count": 1.0, + "step": 1750, + "text_loss": 0.3078109920024872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009629840053239116, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2823469.0, + "repeat_count": 0.0, + "routers_loss": 0.0019477814203128219, + "skip_count": 0.0, + "step": 1752, + "text_loss": 0.45501336455345154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.000962867042578253, + "loss": 0.0173, + "macro_f1": 0.3333333432674408, + "num_tokens": 2826716.0, + "repeat_count": 0.0, + "routers_loss": 0.0032963966950774193, + "skip_count": 0.0, + "step": 1754, + "text_loss": 0.49234694242477417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.244203111241562, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009627499024610707, + "loss": 0.0239, + "macro_f1": 0.3272727429866791, + "num_tokens": 2829733.0, + "repeat_count": 0.0, + "routers_loss": 0.010289114899933338, + "skip_count": 1.0, + "step": 1756, + "text_loss": 0.22335539758205414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.253595538597006, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009626325850172527, + "loss": 0.0174, + "macro_f1": 0.3272727429866791, + "num_tokens": 2833350.0, + "repeat_count": 0.0, + "routers_loss": 0.03249066323041916, + "skip_count": 1.0, + "step": 1758, + "text_loss": 0.6581931114196777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009625150902917555, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 2836558.0, + "repeat_count": 0.0, + "routers_loss": 0.00870000571012497, + "skip_count": 0.0, + "step": 1760, + "text_loss": 0.22938725352287292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009623974183296031, + "loss": 0.0192, + "macro_f1": 0.3333333432674408, + "num_tokens": 2840560.0, + "repeat_count": 0.0, + "routers_loss": 0.007767196744680405, + "skip_count": 0.0, + "step": 1762, + "text_loss": 0.24473799765110016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009622795691758876, + "loss": 0.0244, + "macro_f1": 0.3333333432674408, + "num_tokens": 2843548.0, + "repeat_count": 0.0, + "routers_loss": 0.0021693643648177385, + "skip_count": 0.0, + "step": 1764, + "text_loss": 0.3084608018398285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009621615428757693, + "loss": 0.0149, + "macro_f1": 0.3333333432674408, + "num_tokens": 2847076.0, + "repeat_count": 0.0, + "routers_loss": 0.0024727333802729845, + "skip_count": 0.0, + "step": 1766, + "text_loss": 0.5251734852790833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.000962043339474476, + "loss": 0.0194, + "macro_f1": 0.3333333432674408, + "num_tokens": 2849751.0, + "repeat_count": 0.0, + "routers_loss": 0.005174890160560608, + "skip_count": 0.0, + "step": 1768, + "text_loss": 0.4410129189491272 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0009619249590173032, + "loss": 0.016, + "macro_f1": 0.6666666865348816, + "num_tokens": 2853916.0, + "repeat_count": 0.0, + "routers_loss": 0.006785830482840538, + "skip_count": 2.0, + "step": 1770, + "text_loss": 0.550076425075531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.31934253008512, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06591796875, + "learning_rate": 0.0009618064015496149, + "loss": 0.0192, + "macro_f1": 0.5934640765190125, + "num_tokens": 2857372.0, + "repeat_count": 0.0, + "routers_loss": 0.021370256319642067, + "skip_count": 3.0, + "step": 1772, + "text_loss": 0.1988629847764969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0009616876671168423, + "loss": 0.0162, + "macro_f1": 0.6666666865348816, + "num_tokens": 2861028.0, + "repeat_count": 0.0, + "routers_loss": 0.004313841462135315, + "skip_count": 1.0, + "step": 1774, + "text_loss": 0.42581331729888916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009615687557644847, + "loss": 0.0268, + "macro_f1": 0.3333333432674408, + "num_tokens": 2864847.0, + "repeat_count": 0.0, + "routers_loss": 0.0025742491707205772, + "skip_count": 0.0, + "step": 1776, + "text_loss": 0.46510905027389526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009614496675381093, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 2867392.0, + "repeat_count": 0.0, + "routers_loss": 0.0016813480760902166, + "skip_count": 0.0, + "step": 1778, + "text_loss": 0.5922174453735352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009613304024833507, + "loss": 0.0166, + "macro_f1": 0.3333333432674408, + "num_tokens": 2871273.0, + "repeat_count": 0.0, + "routers_loss": 0.004948933608829975, + "skip_count": 0.0, + "step": 1780, + "text_loss": 0.6776977777481079 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009612109606459117, + "loss": 0.0186, + "macro_f1": 1.0, + "num_tokens": 2874172.0, + "repeat_count": 1.0, + "routers_loss": 0.016950147226452827, + "skip_count": 2.0, + "step": 1782, + "text_loss": 0.48758944869041443 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.375697094217786, + "f1_execute": 0.9599999785423279, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08251953125, + "learning_rate": 0.0009610913420715623, + "loss": 0.0237, + "macro_f1": 0.7644444704055786, + "num_tokens": 2877528.0, + "repeat_count": 2.0, + "routers_loss": 0.04880943149328232, + "skip_count": 1.0, + "step": 1784, + "text_loss": 0.4404778480529785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0009609715468061411, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2880627.0, + "repeat_count": 0.0, + "routers_loss": 0.004678630735725164, + "skip_count": 0.0, + "step": 1786, + "text_loss": 0.7295402884483337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009608515748955535, + "loss": 0.0205, + "macro_f1": 0.3333333432674408, + "num_tokens": 2883333.0, + "repeat_count": 0.0, + "routers_loss": 0.0026695074047893286, + "skip_count": 0.0, + "step": 1788, + "text_loss": 0.9697831273078918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 8.40387437628412, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.107421875, + "learning_rate": 0.000960731426385773, + "loss": 0.0157, + "macro_f1": 0.4871794879436493, + "num_tokens": 2887444.0, + "repeat_count": 0.0, + "routers_loss": 0.029743613675236702, + "skip_count": 2.0, + "step": 1790, + "text_loss": 0.4737568199634552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.0009606111013228407, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 2890221.0, + "repeat_count": 0.0, + "routers_loss": 0.0016153788892552257, + "skip_count": 0.0, + "step": 1792, + "text_loss": 0.6693558096885681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.422659230995011, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009604905997528655, + "loss": 0.02, + "macro_f1": 0.3272727429866791, + "num_tokens": 2893262.0, + "repeat_count": 0.0, + "routers_loss": 0.01965433731675148, + "skip_count": 1.0, + "step": 1794, + "text_loss": 0.45227760076522827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.432051658350455, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009603699217220239, + "loss": 0.0117, + "macro_f1": 0.6601307392120361, + "num_tokens": 2896823.0, + "repeat_count": 1.0, + "routers_loss": 0.024017298594117165, + "skip_count": 2.0, + "step": 1796, + "text_loss": 0.48865509033203125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009602490672765597, + "loss": 0.0182, + "macro_f1": 0.3333333432674408, + "num_tokens": 2899707.0, + "repeat_count": 0.0, + "routers_loss": 0.0012420224957168102, + "skip_count": 0.0, + "step": 1798, + "text_loss": 0.43292415142059326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07861328125, + "learning_rate": 0.0009601280364627848, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 2902795.0, + "repeat_count": 0.0, + "routers_loss": 0.0020389219280332327, + "skip_count": 0.0, + "step": 1800, + "text_loss": 0.41021591424942017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009600068293270783, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 2905769.0, + "repeat_count": 0.0, + "routers_loss": 0.002006303984671831, + "skip_count": 0.0, + "step": 1802, + "text_loss": 0.46892106533050537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08740234375, + "learning_rate": 0.000959885445915887, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 2909475.0, + "repeat_count": 0.0, + "routers_loss": 0.003734810510650277, + "skip_count": 0.0, + "step": 1804, + "text_loss": 0.45364710688591003 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 8.479013795127678, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009597638862757254, + "loss": 0.0182, + "macro_f1": 0.8823530077934265, + "num_tokens": 2914348.0, + "repeat_count": 1.0, + "routers_loss": 0.038971323519945145, + "skip_count": 2.0, + "step": 1806, + "text_loss": 0.42913779616355896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.488406222483123, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009596421504531751, + "loss": 0.0249, + "macro_f1": 0.3272727429866791, + "num_tokens": 2917467.0, + "repeat_count": 1.0, + "routers_loss": 0.04800829663872719, + "skip_count": 0.0, + "step": 1808, + "text_loss": 0.17332297563552856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1083984375, + "learning_rate": 0.0009595202384948858, + "loss": 0.0227, + "macro_f1": 0.6666666865348816, + "num_tokens": 2920223.0, + "repeat_count": 1.0, + "routers_loss": 0.009164143353700638, + "skip_count": 0.0, + "step": 1810, + "text_loss": 0.33740702271461487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009593981504475742, + "loss": 0.0275, + "macro_f1": 0.6666666865348816, + "num_tokens": 2923780.0, + "repeat_count": 0.0, + "routers_loss": 0.011236993595957756, + "skip_count": 2.0, + "step": 1812, + "text_loss": 0.1609916388988495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.516583504549457, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009592758863580248, + "loss": 0.0259, + "macro_f1": 0.5492662787437439, + "num_tokens": 2926259.0, + "repeat_count": 0.0, + "routers_loss": 0.019026532769203186, + "skip_count": 2.0, + "step": 1814, + "text_loss": 0.6460903882980347 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.525975931904902, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009591534462730894, + "loss": 0.0206, + "macro_f1": 0.5492662787437439, + "num_tokens": 2929173.0, + "repeat_count": 2.0, + "routers_loss": 0.0608333982527256, + "skip_count": 0.0, + "step": 1816, + "text_loss": 0.476126492023468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.000959030830239687, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 2932703.0, + "repeat_count": 0.0, + "routers_loss": 0.0093300249427557, + "skip_count": 0.0, + "step": 1818, + "text_loss": 0.5471875667572021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.2001953125, + "learning_rate": 0.0009589080383048048, + "loss": 0.0235, + "macro_f1": 0.3333333432674408, + "num_tokens": 2936195.0, + "repeat_count": 0.0, + "routers_loss": 0.010434109717607498, + "skip_count": 0.0, + "step": 1820, + "text_loss": 0.5068115592002869 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009587850705154964, + "loss": 0.0291, + "macro_f1": 0.3333333432674408, + "num_tokens": 2939412.0, + "repeat_count": 0.0, + "routers_loss": 0.004347751382738352, + "skip_count": 0.0, + "step": 1822, + "text_loss": 0.4241984784603119 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.56354564132668, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0859375, + "learning_rate": 0.0009586619269188836, + "loss": 0.0224, + "macro_f1": 0.32098767161369324, + "num_tokens": 2942318.0, + "repeat_count": 0.0, + "routers_loss": 0.034238871186971664, + "skip_count": 1.0, + "step": 1824, + "text_loss": 0.2328975349664688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009585386075621553, + "loss": 0.027, + "macro_f1": 0.3333333432674408, + "num_tokens": 2945731.0, + "repeat_count": 0.0, + "routers_loss": 0.006097695790231228, + "skip_count": 0.0, + "step": 1826, + "text_loss": 0.22816994786262512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.582330496037569, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009584151124925676, + "loss": 0.0208, + "macro_f1": 0.3272727429866791, + "num_tokens": 2948944.0, + "repeat_count": 0.0, + "routers_loss": 0.007790776435285807, + "skip_count": 1.0, + "step": 1828, + "text_loss": 0.5009413361549377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009582914417574438, + "loss": 0.0145, + "macro_f1": 0.6666666865348816, + "num_tokens": 2951723.0, + "repeat_count": 0.0, + "routers_loss": 0.009144559502601624, + "skip_count": 2.0, + "step": 1830, + "text_loss": 0.1402502954006195 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 8.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0009581675954041751, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 2954726.0, + "repeat_count": 1.0, + "routers_loss": 0.006593191530555487, + "skip_count": 0.0, + "step": 1832, + "text_loss": 0.4871736466884613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009580435734802196, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 2957853.0, + "repeat_count": 0.0, + "routers_loss": 0.01241068821400404, + "skip_count": 0.0, + "step": 1834, + "text_loss": 0.30100154876708984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009579193760331027, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 2960783.0, + "repeat_count": 0.0, + "routers_loss": 0.002219218760728836, + "skip_count": 0.0, + "step": 1836, + "text_loss": 0.4961516559123993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.629292632814794, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009577950031104169, + "loss": 0.0166, + "macro_f1": 0.6601307392120361, + "num_tokens": 2963328.0, + "repeat_count": 1.0, + "routers_loss": 0.029363535344600677, + "skip_count": 2.0, + "step": 1838, + "text_loss": 0.42814353108406067 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 8.638685060170237, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009576704547598226, + "loss": 0.0257, + "macro_f1": 0.7795917987823486, + "num_tokens": 2966108.0, + "repeat_count": 1.0, + "routers_loss": 0.0579402856528759, + "skip_count": 4.0, + "step": 1840, + "text_loss": 0.20523512363433838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009575457310290463, + "loss": 0.0121, + "macro_f1": 0.3272727429866791, + "num_tokens": 2969137.0, + "repeat_count": 0.0, + "routers_loss": 0.008810589089989662, + "skip_count": 0.0, + "step": 1842, + "text_loss": 0.6199528574943542 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009574208319658831, + "loss": 0.0208, + "macro_f1": 0.6666666865348816, + "num_tokens": 2972407.0, + "repeat_count": 0.0, + "routers_loss": 0.0012295129708945751, + "skip_count": 1.0, + "step": 1844, + "text_loss": 0.66938316822052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 8.666862342236572, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.1474609375, + "learning_rate": 0.000957295757618194, + "loss": 0.0152, + "macro_f1": 0.4871794879436493, + "num_tokens": 2976045.0, + "repeat_count": 0.0, + "routers_loss": 0.06162935495376587, + "skip_count": 2.0, + "step": 1846, + "text_loss": 0.5381782650947571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009571705080339079, + "loss": 0.0144, + "macro_f1": 0.3333333432674408, + "num_tokens": 2979025.0, + "repeat_count": 0.0, + "routers_loss": 0.003950524143874645, + "skip_count": 0.0, + "step": 1848, + "text_loss": 0.5831671357154846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.0009570450832610208, + "loss": 0.0209, + "macro_f1": 0.3333333432674408, + "num_tokens": 2982276.0, + "repeat_count": 0.0, + "routers_loss": 0.010354886762797832, + "skip_count": 0.0, + "step": 1850, + "text_loss": 0.27448201179504395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 8.695039624302906, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009569194833475956, + "loss": 0.0199, + "macro_f1": 0.3272727429866791, + "num_tokens": 2985691.0, + "repeat_count": 0.0, + "routers_loss": 0.010167439468204975, + "skip_count": 0.0, + "step": 1852, + "text_loss": 0.5264663696289062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.704432051658351, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1328125, + "learning_rate": 0.0009567937083417624, + "loss": 0.0194, + "macro_f1": 0.3272727429866791, + "num_tokens": 2989126.0, + "repeat_count": 0.0, + "routers_loss": 0.0371871180832386, + "skip_count": 1.0, + "step": 1854, + "text_loss": 0.2008018046617508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009566677582917185, + "loss": 0.0184, + "macro_f1": 0.3333333432674408, + "num_tokens": 2992814.0, + "repeat_count": 0.0, + "routers_loss": 0.010190588422119617, + "skip_count": 0.0, + "step": 1856, + "text_loss": 0.749717116355896 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.72321690636924, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009565416332457282, + "loss": 0.0132, + "macro_f1": 0.6538461446762085, + "num_tokens": 2995729.0, + "repeat_count": 1.0, + "routers_loss": 0.022285036742687225, + "skip_count": 1.0, + "step": 1858, + "text_loss": 0.5870219469070435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.732609333724685, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009564153332521228, + "loss": 0.0224, + "macro_f1": 0.3272727429866791, + "num_tokens": 2998812.0, + "repeat_count": 0.0, + "routers_loss": 0.011050296947360039, + "skip_count": 1.0, + "step": 1860, + "text_loss": 0.8444408774375916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0009562888583593005, + "loss": 0.0163, + "macro_f1": 0.3333333432674408, + "num_tokens": 3001799.0, + "repeat_count": 0.0, + "routers_loss": 0.007125461008399725, + "skip_count": 0.0, + "step": 1862, + "text_loss": 0.41510361433029175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009561622086157272, + "loss": 0.0236, + "macro_f1": 0.3333333432674408, + "num_tokens": 3005088.0, + "repeat_count": 0.0, + "routers_loss": 0.0049054501578211784, + "skip_count": 0.0, + "step": 1864, + "text_loss": 0.3801248073577881 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.000956035384069935, + "loss": 0.0238, + "macro_f1": 1.0, + "num_tokens": 3008178.0, + "repeat_count": 1.0, + "routers_loss": 0.005162427201867104, + "skip_count": 1.0, + "step": 1866, + "text_loss": 0.2687684893608093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0009559083847705233, + "loss": 0.0214, + "macro_f1": 0.3272727429866791, + "num_tokens": 3010923.0, + "repeat_count": 0.0, + "routers_loss": 0.028984658420085907, + "skip_count": 1.0, + "step": 1868, + "text_loss": 0.6277349591255188 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.779571470501908, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009557812107661584, + "loss": 0.0208, + "macro_f1": 1.0, + "num_tokens": 3015030.0, + "repeat_count": 1.0, + "routers_loss": 0.012200530618429184, + "skip_count": 1.0, + "step": 1870, + "text_loss": 0.6293368339538574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.788963897857352, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009556538621055739, + "loss": 0.0268, + "macro_f1": 0.3272727429866791, + "num_tokens": 3019067.0, + "repeat_count": 0.0, + "routers_loss": 0.06365182995796204, + "skip_count": 1.0, + "step": 1872, + "text_loss": 0.39046618342399597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 8.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.115234375, + "learning_rate": 0.0009555263388375699, + "loss": 0.014, + "macro_f1": 0.6666666865348816, + "num_tokens": 3022166.0, + "repeat_count": 0.0, + "routers_loss": 0.0041703456081449986, + "skip_count": 1.0, + "step": 1874, + "text_loss": 0.42232340574264526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11572265625, + "learning_rate": 0.0009553986410110134, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3025865.0, + "repeat_count": 0.0, + "routers_loss": 0.005841755773872137, + "skip_count": 0.0, + "step": 1876, + "text_loss": 0.37600573897361755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.817141179923686, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09228515625, + "learning_rate": 0.0009552707686748388, + "loss": 0.0219, + "macro_f1": 0.3272727429866791, + "num_tokens": 3029950.0, + "repeat_count": 0.0, + "routers_loss": 0.05165952071547508, + "skip_count": 1.0, + "step": 1878, + "text_loss": 0.33717799186706543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009551427218780467, + "loss": 0.0219, + "macro_f1": 0.6666666865348816, + "num_tokens": 3033649.0, + "repeat_count": 0.0, + "routers_loss": 0.020680008456110954, + "skip_count": 2.0, + "step": 1880, + "text_loss": 0.5011783838272095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.835926034634575, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.15625, + "learning_rate": 0.0009550145006697048, + "loss": 0.0217, + "macro_f1": 0.32098764181137085, + "num_tokens": 3036847.0, + "repeat_count": 0.0, + "routers_loss": 0.07626450061798096, + "skip_count": 2.0, + "step": 1882, + "text_loss": 0.3066408336162567 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009548861050989482, + "loss": 0.0136, + "macro_f1": 1.0, + "num_tokens": 3040353.0, + "repeat_count": 1.0, + "routers_loss": 0.010884666815400124, + "skip_count": 1.0, + "step": 1884, + "text_loss": 0.49779415130615234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009547575352149778, + "loss": 0.0213, + "macro_f1": 0.6666666865348816, + "num_tokens": 3043504.0, + "repeat_count": 0.0, + "routers_loss": 0.006704333238303661, + "skip_count": 2.0, + "step": 1886, + "text_loss": 0.12284614145755768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 8.86410331670091, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11474609375, + "learning_rate": 0.0009546287910670621, + "loss": 0.0211, + "macro_f1": 0.5427350401878357, + "num_tokens": 3046422.0, + "repeat_count": 1.0, + "routers_loss": 0.04799000173807144, + "skip_count": 2.0, + "step": 1888, + "text_loss": 0.1824081838130951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1484375, + "learning_rate": 0.0009544998727045361, + "loss": 0.0306, + "macro_f1": 0.3333333432674408, + "num_tokens": 3049819.0, + "repeat_count": 0.0, + "routers_loss": 0.008139612153172493, + "skip_count": 0.0, + "step": 1890, + "text_loss": 0.18929053843021393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 8.8828881714118, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.09375, + "learning_rate": 0.0009543707801768015, + "loss": 0.0175, + "macro_f1": 0.5934640765190125, + "num_tokens": 3052766.0, + "repeat_count": 0.0, + "routers_loss": 0.02966771461069584, + "skip_count": 3.0, + "step": 1892, + "text_loss": 0.247748002409935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 25.0, + "epoch": 8.892280598767243, + "f1_execute": 0.9411764740943909, + "f1_repeat": 0.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009542415135333267, + "loss": 0.0193, + "macro_f1": 0.44705885648727417, + "num_tokens": 3056427.0, + "repeat_count": 0.0, + "routers_loss": 0.03637036308646202, + "skip_count": 2.0, + "step": 1894, + "text_loss": 0.2583999037742615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009541120728236472, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3059497.0, + "repeat_count": 0.0, + "routers_loss": 0.007026574574410915, + "skip_count": 0.0, + "step": 1896, + "text_loss": 0.5222375988960266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.076171875, + "learning_rate": 0.0009539824580973646, + "loss": 0.0219, + "macro_f1": 0.3333333432674408, + "num_tokens": 3062187.0, + "repeat_count": 0.0, + "routers_loss": 0.003449335927143693, + "skip_count": 0.0, + "step": 1898, + "text_loss": 0.5736427307128906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009538526694041477, + "loss": 0.0163, + "macro_f1": 0.3333333432674408, + "num_tokens": 3066100.0, + "repeat_count": 0.0, + "routers_loss": 0.0035463871899992228, + "skip_count": 0.0, + "step": 1900, + "text_loss": 0.5471583604812622 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 8.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009537227067937318, + "loss": 0.0233, + "macro_f1": 1.0, + "num_tokens": 3068737.0, + "repeat_count": 3.0, + "routers_loss": 0.00597514258697629, + "skip_count": 3.0, + "step": 1902, + "text_loss": 0.36644190549850464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.939242735544468, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.166015625, + "learning_rate": 0.0009535925703159186, + "loss": 0.0301, + "macro_f1": 0.32098764181137085, + "num_tokens": 3071686.0, + "repeat_count": 0.0, + "routers_loss": 0.025420479476451874, + "skip_count": 2.0, + "step": 1904, + "text_loss": 0.535789966583252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009534622600205769, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 3074954.0, + "repeat_count": 0.0, + "routers_loss": 0.014377486892044544, + "skip_count": 0.0, + "step": 1906, + "text_loss": 0.19009549915790558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0009533317759576416, + "loss": 0.0197, + "macro_f1": 0.3333333432674408, + "num_tokens": 3077540.0, + "repeat_count": 0.0, + "routers_loss": 0.004848944488912821, + "skip_count": 0.0, + "step": 1908, + "text_loss": 0.5022001266479492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 8.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009532011181771148, + "loss": 0.0217, + "macro_f1": 0.6666666865348816, + "num_tokens": 3080445.0, + "repeat_count": 0.0, + "routers_loss": 0.009480170905590057, + "skip_count": 2.0, + "step": 1910, + "text_loss": 0.35135936737060547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0009530702867290644, + "loss": 0.0185, + "macro_f1": 0.3333333432674408, + "num_tokens": 3083657.0, + "repeat_count": 0.0, + "routers_loss": 0.0019353039097040892, + "skip_count": 0.0, + "step": 1912, + "text_loss": 0.5123994946479797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.986204872321691, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009529392816636256, + "loss": 0.0249, + "macro_f1": 0.3333333432674408, + "num_tokens": 3086837.0, + "repeat_count": 0.0, + "routers_loss": 0.0010921972570940852, + "skip_count": 0.0, + "step": 1914, + "text_loss": 0.44477662444114685 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 8.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.19140625, + "learning_rate": 0.0009528081030309995, + "loss": 0.0351, + "macro_f1": 0.3333333432674408, + "num_tokens": 3089892.0, + "repeat_count": 0.0, + "routers_loss": 0.0018027103506028652, + "skip_count": 0.0, + "step": 1916, + "text_loss": 0.7356183528900146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009526767508814542, + "loss": 0.0236, + "macro_f1": 0.3333333432674408, + "num_tokens": 3093058.0, + "repeat_count": 0.0, + "routers_loss": 0.003243023296818137, + "skip_count": 0.0, + "step": 1918, + "text_loss": 0.48823556303977966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009525452252653239, + "loss": 0.0175, + "macro_f1": 0.3333333432674408, + "num_tokens": 3096404.0, + "repeat_count": 0.0, + "routers_loss": 0.009360014460980892, + "skip_count": 0.0, + "step": 1920, + "text_loss": 0.21498437225818634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.023481068388612, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.140625, + "learning_rate": 0.0009524135262330098, + "loss": 0.0224, + "macro_f1": 0.9265305995941162, + "num_tokens": 3099520.0, + "repeat_count": 1.0, + "routers_loss": 0.017444295808672905, + "skip_count": 3.0, + "step": 1922, + "text_loss": 0.27608850598335266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.032873495744056, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009522816538349789, + "loss": 0.0162, + "macro_f1": 0.5492662787437439, + "num_tokens": 3102956.0, + "repeat_count": 0.0, + "routers_loss": 0.06424452364444733, + "skip_count": 2.0, + "step": 1924, + "text_loss": 0.21558666229248047 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009521496081217651, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 3106565.0, + "repeat_count": 1.0, + "routers_loss": 0.002270506462082267, + "skip_count": 0.0, + "step": 1926, + "text_loss": 0.5641813278198242 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009520173891439684, + "loss": 0.0216, + "macro_f1": 0.6666666865348816, + "num_tokens": 3109314.0, + "repeat_count": 0.0, + "routers_loss": 0.011512448079884052, + "skip_count": 1.0, + "step": 1928, + "text_loss": 0.6351624727249146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009518849969522556, + "loss": 0.0198, + "macro_f1": 0.3333333432674408, + "num_tokens": 3112956.0, + "repeat_count": 0.0, + "routers_loss": 0.003883908037096262, + "skip_count": 0.0, + "step": 1930, + "text_loss": 0.35160085558891296 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009517524315973595, + "loss": 0.019, + "macro_f1": 1.0, + "num_tokens": 3115593.0, + "repeat_count": 1.0, + "routers_loss": 0.009479222819209099, + "skip_count": 3.0, + "step": 1932, + "text_loss": 0.2900560200214386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0009516196931300794, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3118516.0, + "repeat_count": 0.0, + "routers_loss": 0.017834696918725967, + "skip_count": 2.0, + "step": 1934, + "text_loss": 0.20094378292560577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12890625, + "learning_rate": 0.0009514867816012809, + "loss": 0.02, + "macro_f1": 0.3333333432674408, + "num_tokens": 3122242.0, + "repeat_count": 0.0, + "routers_loss": 0.0017964740982279181, + "skip_count": 0.0, + "step": 1936, + "text_loss": 0.6498590707778931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0009513536970618961, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 3125645.0, + "repeat_count": 0.0, + "routers_loss": 0.007437168620526791, + "skip_count": 2.0, + "step": 1938, + "text_loss": 0.25863033533096313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009512204395629232, + "loss": 0.0184, + "macro_f1": 0.6666666865348816, + "num_tokens": 3128740.0, + "repeat_count": 0.0, + "routers_loss": 0.0008759932243265212, + "skip_count": 1.0, + "step": 1940, + "text_loss": 0.5638351440429688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.117405341943059, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009510870091554264, + "loss": 0.0153, + "macro_f1": 0.3272727429866791, + "num_tokens": 3131742.0, + "repeat_count": 1.0, + "routers_loss": 0.019906625151634216, + "skip_count": 0.0, + "step": 1942, + "text_loss": 0.8410717844963074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009509534058905369, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3134407.0, + "repeat_count": 0.0, + "routers_loss": 0.0009229081333614886, + "skip_count": 0.0, + "step": 1944, + "text_loss": 0.47506049275398254 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009508196298194517, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3137053.0, + "repeat_count": 0.0, + "routers_loss": 0.003630586201325059, + "skip_count": 0.0, + "step": 1946, + "text_loss": 0.32225799560546875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009506856809934338, + "loss": 0.0119, + "macro_f1": 0.3333333432674408, + "num_tokens": 3140943.0, + "repeat_count": 0.0, + "routers_loss": 0.007580445148050785, + "skip_count": 0.0, + "step": 1948, + "text_loss": 0.3120577931404114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009505515594638127, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3144298.0, + "repeat_count": 0.0, + "routers_loss": 0.004471861757338047, + "skip_count": 0.0, + "step": 1950, + "text_loss": 0.22052447497844696 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 9.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.09130859375, + "learning_rate": 0.0009504172652819843, + "loss": 0.023, + "macro_f1": 1.0, + "num_tokens": 3147069.0, + "repeat_count": 1.0, + "routers_loss": 0.009606664068996906, + "skip_count": 1.0, + "step": 1952, + "text_loss": 0.34773921966552734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009502827984994099, + "loss": 0.0148, + "macro_f1": 0.6666666865348816, + "num_tokens": 3149992.0, + "repeat_count": 0.0, + "routers_loss": 0.006443799939006567, + "skip_count": 1.0, + "step": 1954, + "text_loss": 0.6442171335220337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009501481591676177, + "loss": 0.0188, + "macro_f1": 0.3333333432674408, + "num_tokens": 3153167.0, + "repeat_count": 0.0, + "routers_loss": 0.003219039412215352, + "skip_count": 0.0, + "step": 1956, + "text_loss": 0.43369221687316895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.192544760786616, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.000950013347338202, + "loss": 0.0152, + "macro_f1": 0.3272727429866791, + "num_tokens": 3156590.0, + "repeat_count": 0.0, + "routers_loss": 0.025551019236445427, + "skip_count": 1.0, + "step": 1958, + "text_loss": 0.294479101896286 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009498783630628225, + "loss": 0.0158, + "macro_f1": 1.0, + "num_tokens": 3159451.0, + "repeat_count": 1.0, + "routers_loss": 0.013802438974380493, + "skip_count": 2.0, + "step": 1960, + "text_loss": 0.20888492465019226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.211329615497505, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009497432063932057, + "loss": 0.0137, + "macro_f1": 0.6601307392120361, + "num_tokens": 3162889.0, + "repeat_count": 1.0, + "routers_loss": 0.02852988988161087, + "skip_count": 2.0, + "step": 1962, + "text_loss": 0.5027125477790833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009496078773811437, + "loss": 0.0136, + "macro_f1": 0.6666666865348816, + "num_tokens": 3165979.0, + "repeat_count": 0.0, + "routers_loss": 0.01784522272646427, + "skip_count": 2.0, + "step": 1964, + "text_loss": 0.1696339100599289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.000949472376078495, + "loss": 0.016, + "macro_f1": 0.3333333432674408, + "num_tokens": 3168683.0, + "repeat_count": 0.0, + "routers_loss": 0.0017019887454807758, + "skip_count": 0.0, + "step": 1966, + "text_loss": 0.48905447125434875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.000949336702537184, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 3171968.0, + "repeat_count": 0.0, + "routers_loss": 0.004817947279661894, + "skip_count": 2.0, + "step": 1968, + "text_loss": 0.20984773337841034 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009492008568092007, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 3175947.0, + "repeat_count": 0.0, + "routers_loss": 0.0012963006738573313, + "skip_count": 0.0, + "step": 1970, + "text_loss": 0.5215106010437012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 9.258291752274728, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.044921875, + "learning_rate": 0.0009490648389466019, + "loss": 0.0135, + "macro_f1": 0.4871794879436493, + "num_tokens": 3179348.0, + "repeat_count": 0.0, + "routers_loss": 0.03950481489300728, + "skip_count": 2.0, + "step": 1972, + "text_loss": 0.24640929698944092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09326171875, + "learning_rate": 0.0009489286490015097, + "loss": 0.0183, + "macro_f1": 0.6666666865348816, + "num_tokens": 3182640.0, + "repeat_count": 0.0, + "routers_loss": 0.0043345349840819836, + "skip_count": 2.0, + "step": 1974, + "text_loss": 0.6362852454185486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009487922870261122, + "loss": 0.0155, + "macro_f1": 0.3333333432674408, + "num_tokens": 3185657.0, + "repeat_count": 0.0, + "routers_loss": 0.0015687479171901941, + "skip_count": 0.0, + "step": 1976, + "text_loss": 0.8977144360542297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009486557530726638, + "loss": 0.0139, + "macro_f1": 0.3333333432674408, + "num_tokens": 3188772.0, + "repeat_count": 0.0, + "routers_loss": 0.0010977238416671753, + "skip_count": 0.0, + "step": 1978, + "text_loss": 0.38512736558914185 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0009485190471934844, + "loss": 0.0196, + "macro_f1": 0.6666666865348816, + "num_tokens": 3193131.0, + "repeat_count": 2.0, + "routers_loss": 0.002264744369313121, + "skip_count": 0.0, + "step": 1980, + "text_loss": 0.4171289801597595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.305253889051952, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09033203125, + "learning_rate": 0.00094838216944096, + "loss": 0.0219, + "macro_f1": 0.3272727429866791, + "num_tokens": 3196668.0, + "repeat_count": 0.0, + "routers_loss": 0.042320676147937775, + "skip_count": 1.0, + "step": 1982, + "text_loss": 0.19008000195026398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 9.314646316407396, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009482451198675424, + "loss": 0.0151, + "macro_f1": 0.32098767161369324, + "num_tokens": 3200282.0, + "repeat_count": 0.0, + "routers_loss": 0.01796630397439003, + "skip_count": 1.0, + "step": 1984, + "text_loss": 0.5009249448776245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0009481078985257494, + "loss": 0.0147, + "macro_f1": 0.6666666865348816, + "num_tokens": 3204439.0, + "repeat_count": 0.0, + "routers_loss": 0.01052347756922245, + "skip_count": 1.0, + "step": 1986, + "text_loss": 0.15319275856018066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.333431171118287, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009479705054681644, + "loss": 0.015, + "macro_f1": 0.3076923191547394, + "num_tokens": 3207590.0, + "repeat_count": 1.0, + "routers_loss": 0.09640293568372726, + "skip_count": 3.0, + "step": 1988, + "text_loss": 0.3654652535915375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.34282359847373, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009478329407474366, + "loss": 0.0183, + "macro_f1": 0.5492662787437439, + "num_tokens": 3211172.0, + "repeat_count": 0.0, + "routers_loss": 0.012670112773776054, + "skip_count": 1.0, + "step": 1990, + "text_loss": 0.5817596316337585 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.000947695204416281, + "loss": 0.0121, + "macro_f1": 0.6666666865348816, + "num_tokens": 3214050.0, + "repeat_count": 1.0, + "routers_loss": 0.005263707600533962, + "skip_count": 0.0, + "step": 1992, + "text_loss": 0.5985888242721558 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.361608453184619, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009475572965274787, + "loss": 0.0144, + "macro_f1": 0.3272727429866791, + "num_tokens": 3217318.0, + "repeat_count": 1.0, + "routers_loss": 0.0682850033044815, + "skip_count": 0.0, + "step": 1994, + "text_loss": 0.316506564617157 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.000947419217133876, + "loss": 0.019, + "macro_f1": 0.6666666865348816, + "num_tokens": 3220012.0, + "repeat_count": 0.0, + "routers_loss": 0.008508823812007904, + "skip_count": 2.0, + "step": 1996, + "text_loss": 0.09665893763303757 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0009472809662883852, + "loss": 0.0155, + "macro_f1": 1.0, + "num_tokens": 3223019.0, + "repeat_count": 1.0, + "routers_loss": 0.01100847590714693, + "skip_count": 2.0, + "step": 1998, + "text_loss": 0.4938808083534241 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.389785735250953, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009471425440439844, + "loss": 0.0135, + "macro_f1": 0.8817967176437378, + "num_tokens": 3226013.0, + "repeat_count": 2.0, + "routers_loss": 0.04953207075595856, + "skip_count": 3.0, + "step": 2000, + "text_loss": 0.22258254885673523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 9.399178162606399, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009470039504537173, + "loss": 0.0186, + "macro_f1": 0.31446540355682373, + "num_tokens": 3230031.0, + "repeat_count": 0.0, + "routers_loss": 0.052884332835674286, + "skip_count": 2.0, + "step": 2002, + "text_loss": 0.1741616576910019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009468651855706931, + "loss": 0.0204, + "macro_f1": 0.6666666865348816, + "num_tokens": 3232991.0, + "repeat_count": 1.0, + "routers_loss": 0.008056716993451118, + "skip_count": 0.0, + "step": 2004, + "text_loss": 0.3173636198043823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009467262494480868, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3236390.0, + "repeat_count": 0.0, + "routers_loss": 0.0053409393876791, + "skip_count": 0.0, + "step": 2006, + "text_loss": 0.5806330442428589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.000946587142139139, + "loss": 0.0147, + "macro_f1": 0.3333333432674408, + "num_tokens": 3239267.0, + "repeat_count": 0.0, + "routers_loss": 0.0015652200672775507, + "skip_count": 0.0, + "step": 2008, + "text_loss": 0.6214317679405212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.436747872028178, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.000946447863697156, + "loss": 0.0151, + "macro_f1": 0.6601307392120361, + "num_tokens": 3242569.0, + "repeat_count": 1.0, + "routers_loss": 0.011673987843096256, + "skip_count": 2.0, + "step": 2010, + "text_loss": 0.532565712928772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0009463084141755093, + "loss": 0.0159, + "macro_f1": 0.3272727429866791, + "num_tokens": 3245669.0, + "repeat_count": 0.0, + "routers_loss": 0.028480790555477142, + "skip_count": 1.0, + "step": 2012, + "text_loss": 0.25210800766944885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009461687936276364, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3248751.0, + "repeat_count": 0.0, + "routers_loss": 0.007234727032482624, + "skip_count": 0.0, + "step": 2014, + "text_loss": 0.35922971367836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 9.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009460290021070402, + "loss": 0.0195, + "macro_f1": 0.6666666865348816, + "num_tokens": 3252614.0, + "repeat_count": 1.0, + "routers_loss": 0.014691276475787163, + "skip_count": 0.0, + "step": 2016, + "text_loss": 0.2747853398323059 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009458890396672888, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 3256374.0, + "repeat_count": 0.0, + "routers_loss": 0.002385235857218504, + "skip_count": 0.0, + "step": 2018, + "text_loss": 0.5268719792366028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 9.483710008805401, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009457489063620164, + "loss": 0.0133, + "macro_f1": 0.8823530077934265, + "num_tokens": 3259792.0, + "repeat_count": 1.0, + "routers_loss": 0.047268565744161606, + "skip_count": 2.0, + "step": 2020, + "text_loss": 0.7785539627075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.493102436160845, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009456086022449221, + "loss": 0.0218, + "macro_f1": 0.3272727429866791, + "num_tokens": 3262833.0, + "repeat_count": 0.0, + "routers_loss": 0.015878718346357346, + "skip_count": 1.0, + "step": 2022, + "text_loss": 0.42270028591156006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.50249486351629, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08935546875, + "learning_rate": 0.0009454681273697711, + "loss": 0.0117, + "macro_f1": 0.3272727429866791, + "num_tokens": 3265718.0, + "repeat_count": 1.0, + "routers_loss": 0.030749641358852386, + "skip_count": 0.0, + "step": 2024, + "text_loss": 0.18668225407600403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0009453274817903931, + "loss": 0.012, + "macro_f1": 0.6666666865348816, + "num_tokens": 3268158.0, + "repeat_count": 0.0, + "routers_loss": 0.011538166552782059, + "skip_count": 1.0, + "step": 2026, + "text_loss": 0.34090787172317505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.000945186665560684, + "loss": 0.0218, + "macro_f1": 0.3333333432674408, + "num_tokens": 3271082.0, + "repeat_count": 0.0, + "routers_loss": 0.009527760557830334, + "skip_count": 0.0, + "step": 2028, + "text_loss": 0.2110334187746048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.530672145582624, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.119140625, + "learning_rate": 0.000945045678734605, + "loss": 0.0175, + "macro_f1": 0.3144654333591461, + "num_tokens": 3273488.0, + "repeat_count": 0.0, + "routers_loss": 0.03317151218652725, + "skip_count": 3.0, + "step": 2030, + "text_loss": 0.2233227640390396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.540064572938068, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009449045213661822, + "loss": 0.0201, + "macro_f1": 0.3272727429866791, + "num_tokens": 3276646.0, + "repeat_count": 0.0, + "routers_loss": 0.018510591238737106, + "skip_count": 1.0, + "step": 2032, + "text_loss": 0.16100332140922546 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 9.549457000293513, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.1318359375, + "learning_rate": 0.0009447631935095077, + "loss": 0.0185, + "macro_f1": 0.9452888369560242, + "num_tokens": 3279441.0, + "repeat_count": 1.0, + "routers_loss": 0.028113311156630516, + "skip_count": 4.0, + "step": 2034, + "text_loss": 0.29208317399024963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009446216952187384, + "loss": 0.0164, + "macro_f1": 0.3333333432674408, + "num_tokens": 3282697.0, + "repeat_count": 0.0, + "routers_loss": 0.008379172533750534, + "skip_count": 0.0, + "step": 2036, + "text_loss": 0.16026398539543152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009444800265480967, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 3285574.0, + "repeat_count": 0.0, + "routers_loss": 0.00941354501992464, + "skip_count": 0.0, + "step": 2038, + "text_loss": 0.29523080587387085 + }, + { + "acc_repeat": 0.75, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.577634282359847, + "f1_execute": 0.9230769276618958, + "f1_repeat": 0.8571428656578064, + "f1_skip": 0.800000011920929, + "grad_norm": 0.076171875, + "learning_rate": 0.0009443381875518703, + "loss": 0.0197, + "macro_f1": 0.8600732684135437, + "num_tokens": 3289159.0, + "repeat_count": 4.0, + "routers_loss": 0.04974055662751198, + "skip_count": 6.0, + "step": 2040, + "text_loss": 0.23033179342746735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.587026709715293, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0009441961782844123, + "loss": 0.0146, + "macro_f1": 0.3272727429866791, + "num_tokens": 3293598.0, + "repeat_count": 0.0, + "routers_loss": 0.022241825237870216, + "skip_count": 1.0, + "step": 2042, + "text_loss": 0.8299165368080139 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009440539988001408, + "loss": 0.0159, + "macro_f1": 0.3333333432674408, + "num_tokens": 3296648.0, + "repeat_count": 0.0, + "routers_loss": 0.011019332334399223, + "skip_count": 0.0, + "step": 2044, + "text_loss": 0.18207129836082458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0009439116491535394, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 3300058.0, + "repeat_count": 0.0, + "routers_loss": 0.002889640862122178, + "skip_count": 0.0, + "step": 2046, + "text_loss": 0.7051978707313538 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 9.615203991781627, + "f1_execute": 0.9333333373069763, + "f1_repeat": 0.5, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.078125, + "learning_rate": 0.0009437691293991563, + "loss": 0.0192, + "macro_f1": 0.7634921073913574, + "num_tokens": 3303296.0, + "repeat_count": 3.0, + "routers_loss": 0.07741832733154297, + "skip_count": 4.0, + "step": 2048, + "text_loss": 0.15563532710075378 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.0009436264395916061, + "loss": 0.0209, + "macro_f1": 0.6666666865348816, + "num_tokens": 3306204.0, + "repeat_count": 0.0, + "routers_loss": 0.014225383289158344, + "skip_count": 2.0, + "step": 2050, + "text_loss": 0.18117287755012512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009434835797855672, + "loss": 0.0165, + "macro_f1": 0.3333333432674408, + "num_tokens": 3309444.0, + "repeat_count": 0.0, + "routers_loss": 0.0023932650219649076, + "skip_count": 0.0, + "step": 2052, + "text_loss": 0.4645874798297882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.643381273847961, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009433405500357839, + "loss": 0.0153, + "macro_f1": 0.3272727429866791, + "num_tokens": 3312488.0, + "repeat_count": 0.0, + "routers_loss": 0.03193361684679985, + "skip_count": 1.0, + "step": 2054, + "text_loss": 0.5291082859039307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009431973503970655, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 3315765.0, + "repeat_count": 0.0, + "routers_loss": 0.0020529816392809153, + "skip_count": 0.0, + "step": 2056, + "text_loss": 0.5877931118011475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.66216612855885, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009430539809242864, + "loss": 0.0185, + "macro_f1": 0.32098764181137085, + "num_tokens": 3318877.0, + "repeat_count": 2.0, + "routers_loss": 0.07907948642969131, + "skip_count": 0.0, + "step": 2058, + "text_loss": 0.3836737871170044 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009429104416723862, + "loss": 0.0163, + "macro_f1": 0.6666666865348816, + "num_tokens": 3322576.0, + "repeat_count": 2.0, + "routers_loss": 0.003006070153787732, + "skip_count": 0.0, + "step": 2060, + "text_loss": 0.3480920195579529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009427667326963689, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 3325974.0, + "repeat_count": 0.0, + "routers_loss": 0.005013179033994675, + "skip_count": 0.0, + "step": 2062, + "text_loss": 0.931358814239502 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0009426228540513047, + "loss": 0.0206, + "macro_f1": 0.3333333432674408, + "num_tokens": 3329398.0, + "repeat_count": 0.0, + "routers_loss": 0.0059848143719136715, + "skip_count": 0.0, + "step": 2064, + "text_loss": 0.47568953037261963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009424788057923277, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3332029.0, + "repeat_count": 0.0, + "routers_loss": 0.00783882662653923, + "skip_count": 0.0, + "step": 2066, + "text_loss": 0.22887596487998962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.709128265336073, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009423345879746376, + "loss": 0.0128, + "macro_f1": 0.5492662787437439, + "num_tokens": 3334858.0, + "repeat_count": 0.0, + "routers_loss": 0.01866884157061577, + "skip_count": 2.0, + "step": 2068, + "text_loss": 0.17724967002868652 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.718520692691518, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.000942190200653499, + "loss": 0.0162, + "macro_f1": 0.32098764181137085, + "num_tokens": 3338094.0, + "repeat_count": 0.0, + "routers_loss": 0.028636593371629715, + "skip_count": 2.0, + "step": 2070, + "text_loss": 0.34344956278800964 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.727913120046962, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009420456438842413, + "loss": 0.0165, + "macro_f1": 0.5492662787437439, + "num_tokens": 3340526.0, + "repeat_count": 0.0, + "routers_loss": 0.023245645686984062, + "skip_count": 2.0, + "step": 2072, + "text_loss": 0.7276164293289185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.737305547402407, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11328125, + "learning_rate": 0.000941900917722259, + "loss": 0.0143, + "macro_f1": 0.3272727429866791, + "num_tokens": 3343303.0, + "repeat_count": 1.0, + "routers_loss": 0.01565689593553543, + "skip_count": 0.0, + "step": 2074, + "text_loss": 0.5665070414543152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009417560222230115, + "loss": 0.0245, + "macro_f1": 0.3333333432674408, + "num_tokens": 3346409.0, + "repeat_count": 0.0, + "routers_loss": 0.0035056080669164658, + "skip_count": 0.0, + "step": 2076, + "text_loss": 0.5112795233726501 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009416109574420229, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3349220.0, + "repeat_count": 0.0, + "routers_loss": 0.0027565446216613054, + "skip_count": 0.0, + "step": 2078, + "text_loss": 0.5240910053253174 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 9.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009414657234348823, + "loss": 0.0186, + "macro_f1": 1.0, + "num_tokens": 3352627.0, + "repeat_count": 3.0, + "routers_loss": 0.01652451977133751, + "skip_count": 2.0, + "step": 2080, + "text_loss": 1.0217112302780151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.774875256824185, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1630859375, + "learning_rate": 0.0009413203202572438, + "loss": 0.0179, + "macro_f1": 0.32098764181137085, + "num_tokens": 3355392.0, + "repeat_count": 0.0, + "routers_loss": 0.1012420505285263, + "skip_count": 2.0, + "step": 2082, + "text_loss": 0.4085482358932495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08251953125, + "learning_rate": 0.000941174747964826, + "loss": 0.0154, + "macro_f1": 0.3333333432674408, + "num_tokens": 3358425.0, + "repeat_count": 0.0, + "routers_loss": 0.004962718114256859, + "skip_count": 0.0, + "step": 2084, + "text_loss": 0.5833504796028137 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 9.793660111535075, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.800000011920929, + "grad_norm": 0.11376953125, + "learning_rate": 0.0009410290066134124, + "loss": 0.0211, + "macro_f1": 0.8083333373069763, + "num_tokens": 3361925.0, + "repeat_count": 2.0, + "routers_loss": 0.07889176905155182, + "skip_count": 3.0, + "step": 2086, + "text_loss": 0.38126569986343384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.803052538890519, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009408830962588517, + "loss": 0.0195, + "macro_f1": 0.6601307392120361, + "num_tokens": 3365963.0, + "repeat_count": 1.0, + "routers_loss": 0.033715736120939255, + "skip_count": 2.0, + "step": 2088, + "text_loss": 0.23213914036750793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009407370169570567, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3369422.0, + "repeat_count": 0.0, + "routers_loss": 0.0014188943896442652, + "skip_count": 0.0, + "step": 2090, + "text_loss": 0.4648318886756897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.82183739360141, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009405907687640054, + "loss": 0.013, + "macro_f1": 0.3272727429866791, + "num_tokens": 3372506.0, + "repeat_count": 0.0, + "routers_loss": 0.015339684672653675, + "skip_count": 1.0, + "step": 2092, + "text_loss": 0.2563800811767578 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 9.831229820956853, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.0009404443517357404, + "loss": 0.0146, + "macro_f1": 0.542222261428833, + "num_tokens": 3375653.0, + "repeat_count": 4.0, + "routers_loss": 0.06562861055135727, + "skip_count": 0.0, + "step": 2094, + "text_loss": 0.797835111618042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.000940297765928369, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 3379018.0, + "repeat_count": 0.0, + "routers_loss": 0.005745889153331518, + "skip_count": 0.0, + "step": 2096, + "text_loss": 0.4238114655017853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009401510113980631, + "loss": 0.0207, + "macro_f1": 0.3333333432674408, + "num_tokens": 3382855.0, + "repeat_count": 0.0, + "routers_loss": 0.0026634482201188803, + "skip_count": 0.0, + "step": 2098, + "text_loss": 0.4967166483402252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009400040882010592, + "loss": 0.0166, + "macro_f1": 0.3333333432674408, + "num_tokens": 3386386.0, + "repeat_count": 0.0, + "routers_loss": 0.0020642587915062904, + "skip_count": 0.0, + "step": 2100, + "text_loss": 0.44390562176704407 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0009398569963936589, + "loss": 0.017, + "macro_f1": 0.3272727429866791, + "num_tokens": 3389958.0, + "repeat_count": 0.0, + "routers_loss": 0.013722737319767475, + "skip_count": 1.0, + "step": 2102, + "text_loss": 0.7207565903663635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0009397097360322276, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 3392892.0, + "repeat_count": 0.0, + "routers_loss": 0.002051608171314001, + "skip_count": 0.0, + "step": 2104, + "text_loss": 0.3196398913860321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.000939562307173196, + "loss": 0.022, + "macro_f1": 0.3333333432674408, + "num_tokens": 3396636.0, + "repeat_count": 0.0, + "routers_loss": 0.007085663266479969, + "skip_count": 0.0, + "step": 2106, + "text_loss": 0.5663776397705078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.896976812444967, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11328125, + "learning_rate": 0.0009394147098730592, + "loss": 0.02, + "macro_f1": 0.5492662787437439, + "num_tokens": 3399475.0, + "repeat_count": 0.0, + "routers_loss": 0.019473131746053696, + "skip_count": 2.0, + "step": 2108, + "text_loss": 0.7708223462104797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0009392669441883767, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 3402350.0, + "repeat_count": 0.0, + "routers_loss": 0.0028328890912234783, + "skip_count": 0.0, + "step": 2110, + "text_loss": 0.5888006091117859 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10693359375, + "learning_rate": 0.0009391190101757724, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3405561.0, + "repeat_count": 0.0, + "routers_loss": 0.023098422214388847, + "skip_count": 2.0, + "step": 2112, + "text_loss": 0.09865197539329529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.000938970907891935, + "loss": 0.0247, + "macro_f1": 0.3333333432674408, + "num_tokens": 3408513.0, + "repeat_count": 0.0, + "routers_loss": 0.002896632067859173, + "skip_count": 0.0, + "step": 2114, + "text_loss": 0.6613234281539917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0947265625, + "learning_rate": 0.0009388226373936179, + "loss": 0.0211, + "macro_f1": 0.3333333432674408, + "num_tokens": 3411195.0, + "repeat_count": 0.0, + "routers_loss": 0.015814457088708878, + "skip_count": 0.0, + "step": 2116, + "text_loss": 0.17363053560256958 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.94393894922219, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.12451171875, + "learning_rate": 0.0009386741987376381, + "loss": 0.015, + "macro_f1": 0.6603773832321167, + "num_tokens": 3414875.0, + "repeat_count": 1.0, + "routers_loss": 0.02676783688366413, + "skip_count": 0.0, + "step": 2118, + "text_loss": 0.674056887626648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 9.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009385255919808778, + "loss": 0.0203, + "macro_f1": 0.6666666865348816, + "num_tokens": 3418410.0, + "repeat_count": 0.0, + "routers_loss": 0.01022857241332531, + "skip_count": 1.0, + "step": 2120, + "text_loss": 0.235092431306839 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 9.962723803933079, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0888671875, + "learning_rate": 0.0009383768171802836, + "loss": 0.0244, + "macro_f1": 0.5492662787437439, + "num_tokens": 3421289.0, + "repeat_count": 0.0, + "routers_loss": 0.013572212308645248, + "skip_count": 2.0, + "step": 2122, + "text_loss": 0.5992844104766846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 9.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0009382278743928659, + "loss": 0.0201, + "macro_f1": 0.6666666865348816, + "num_tokens": 3424781.0, + "repeat_count": 0.0, + "routers_loss": 0.0051873656921088696, + "skip_count": 2.0, + "step": 2124, + "text_loss": 0.29915499687194824 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 9.981508658643968, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.07421875, + "learning_rate": 0.0009380787636757001, + "loss": 0.0155, + "macro_f1": 0.6122449040412903, + "num_tokens": 3427942.0, + "repeat_count": 0.0, + "routers_loss": 0.030079292133450508, + "skip_count": 4.0, + "step": 2126, + "text_loss": 0.24181491136550903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 9.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009379294850859256, + "loss": 0.0141, + "macro_f1": 0.3333333432674408, + "num_tokens": 3431314.0, + "repeat_count": 0.0, + "routers_loss": 0.002675612922757864, + "skip_count": 0.0, + "step": 2128, + "text_loss": 0.4669873118400574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10595703125, + "learning_rate": 0.0009377800386807465, + "loss": 0.0177, + "macro_f1": 0.3333333432674408, + "num_tokens": 3435020.0, + "repeat_count": 0.0, + "routers_loss": 0.009334275498986244, + "skip_count": 0.0, + "step": 2130, + "text_loss": 0.6478219628334045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.134765625, + "learning_rate": 0.0009376304245174306, + "loss": 0.0137, + "macro_f1": 0.6000000238418579, + "num_tokens": 3438276.0, + "repeat_count": 1.0, + "routers_loss": 0.038227908313274384, + "skip_count": 2.0, + "step": 2132, + "text_loss": 0.4401201903820038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0009374806426533104, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 3440938.0, + "repeat_count": 0.0, + "routers_loss": 0.006901399698108435, + "skip_count": 0.0, + "step": 2134, + "text_loss": 0.5948942303657532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009373306931457827, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3444028.0, + "repeat_count": 0.0, + "routers_loss": 0.0037061909679323435, + "skip_count": 0.0, + "step": 2136, + "text_loss": 0.5349751114845276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0009371805760523086, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 3448331.0, + "repeat_count": 0.0, + "routers_loss": 0.0025877030566334724, + "skip_count": 0.0, + "step": 2138, + "text_loss": 0.4591051936149597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.046962136777223, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009370302914304129, + "loss": 0.0144, + "macro_f1": 0.5934640765190125, + "num_tokens": 3451434.0, + "repeat_count": 0.0, + "routers_loss": 0.018742674961686134, + "skip_count": 3.0, + "step": 2140, + "text_loss": 0.23470863699913025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.056354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009368798393376851, + "loss": 0.0122, + "macro_f1": 0.3272727429866791, + "num_tokens": 3454375.0, + "repeat_count": 0.0, + "routers_loss": 0.02382594160735607, + "skip_count": 1.0, + "step": 2142, + "text_loss": 0.6077954769134521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.065746991488112, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05517578125, + "learning_rate": 0.0009367292198317787, + "loss": 0.0164, + "macro_f1": 0.5492662787437439, + "num_tokens": 3457591.0, + "repeat_count": 0.0, + "routers_loss": 0.03331060707569122, + "skip_count": 2.0, + "step": 2144, + "text_loss": 0.3691073954105377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009365784329704115, + "loss": 0.0186, + "macro_f1": 0.3333333432674408, + "num_tokens": 3460895.0, + "repeat_count": 0.0, + "routers_loss": 0.0016955457394942641, + "skip_count": 0.0, + "step": 2146, + "text_loss": 0.3947436511516571 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009364274788113651, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 3464101.0, + "repeat_count": 1.0, + "routers_loss": 0.006169239990413189, + "skip_count": 0.0, + "step": 2148, + "text_loss": 0.3348555266857147 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 10.093924273554446, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009362763574124858, + "loss": 0.019, + "macro_f1": 0.9265305995941162, + "num_tokens": 3467417.0, + "repeat_count": 3.0, + "routers_loss": 0.024033790454268456, + "skip_count": 1.0, + "step": 2150, + "text_loss": 0.496633380651474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0009361250688316829, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 3470917.0, + "repeat_count": 0.0, + "routers_loss": 0.0024986129719763994, + "skip_count": 0.0, + "step": 2152, + "text_loss": 0.6857671737670898 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0009359736131269312, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3473624.0, + "repeat_count": 0.0, + "routers_loss": 0.008183322846889496, + "skip_count": 1.0, + "step": 2154, + "text_loss": 0.13883116841316223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009358219903562684, + "loss": 0.0106, + "macro_f1": 0.6666666865348816, + "num_tokens": 3476472.0, + "repeat_count": 0.0, + "routers_loss": 0.011198793537914753, + "skip_count": 3.0, + "step": 2156, + "text_loss": 0.24243666231632233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009356702005777969, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 3479688.0, + "repeat_count": 0.0, + "routers_loss": 0.002520184963941574, + "skip_count": 0.0, + "step": 2158, + "text_loss": 0.6407818794250488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009355182438496825, + "loss": 0.0142, + "macro_f1": 0.3333333432674408, + "num_tokens": 3482598.0, + "repeat_count": 0.0, + "routers_loss": 0.0011065017897635698, + "skip_count": 0.0, + "step": 2160, + "text_loss": 0.7214245796203613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009353661202301557, + "loss": 0.0144, + "macro_f1": 0.3333333432674408, + "num_tokens": 3486271.0, + "repeat_count": 0.0, + "routers_loss": 0.0017824085662141442, + "skip_count": 0.0, + "step": 2162, + "text_loss": 0.5140969157218933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0009352138297775101, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 3489206.0, + "repeat_count": 0.0, + "routers_loss": 0.001542879967018962, + "skip_count": 0.0, + "step": 2164, + "text_loss": 0.7956416606903076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.000935061372550104, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 3492003.0, + "repeat_count": 0.0, + "routers_loss": 0.01420794241130352, + "skip_count": 3.0, + "step": 2166, + "text_loss": 0.27489882707595825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009349087486063594, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3494784.0, + "repeat_count": 0.0, + "routers_loss": 0.003614309709519148, + "skip_count": 1.0, + "step": 2168, + "text_loss": 0.2962227761745453 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.187848547108894, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009347559580047618, + "loss": 0.0175, + "macro_f1": 0.8814815282821655, + "num_tokens": 3497886.0, + "repeat_count": 2.0, + "routers_loss": 0.02122853323817253, + "skip_count": 4.0, + "step": 2170, + "text_loss": 0.5919580459594727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06396484375, + "learning_rate": 0.000934603000803861, + "loss": 0.0135, + "macro_f1": 0.5492662787437439, + "num_tokens": 3500939.0, + "repeat_count": 0.0, + "routers_loss": 0.02042219042778015, + "skip_count": 1.0, + "step": 2172, + "text_loss": 0.28722381591796875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009344498770622704, + "loss": 0.013, + "macro_f1": 0.3333333432674408, + "num_tokens": 3504852.0, + "repeat_count": 0.0, + "routers_loss": 0.004345106892287731, + "skip_count": 0.0, + "step": 2174, + "text_loss": 0.603236734867096 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1064453125, + "learning_rate": 0.0009342965868386673, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 3508320.0, + "repeat_count": 0.0, + "routers_loss": 0.00368050136603415, + "skip_count": 0.0, + "step": 2176, + "text_loss": 0.6020491719245911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.000934143130191793, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 3511278.0, + "repeat_count": 0.0, + "routers_loss": 0.013425769284367561, + "skip_count": 0.0, + "step": 2178, + "text_loss": 0.5954724550247192 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060546875, + "learning_rate": 0.000933989507180452, + "loss": 0.0149, + "macro_f1": 0.3333333432674408, + "num_tokens": 3514361.0, + "repeat_count": 0.0, + "routers_loss": 0.002896249992772937, + "skip_count": 0.0, + "step": 2180, + "text_loss": 0.39175131916999817 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.244203111241562, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0009338357178635135, + "loss": 0.0147, + "macro_f1": 0.6603773832321167, + "num_tokens": 3517962.0, + "repeat_count": 1.0, + "routers_loss": 0.011538350023329258, + "skip_count": 1.0, + "step": 2182, + "text_loss": 0.4482830762863159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.253595538597006, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0009336817622999093, + "loss": 0.011, + "macro_f1": 0.3272727429866791, + "num_tokens": 3521299.0, + "repeat_count": 1.0, + "routers_loss": 0.022787930443882942, + "skip_count": 0.0, + "step": 2184, + "text_loss": 0.35177817940711975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.262987965952451, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009335276405486357, + "loss": 0.0139, + "macro_f1": 0.3272727429866791, + "num_tokens": 3524611.0, + "repeat_count": 0.0, + "routers_loss": 0.011597735807299614, + "skip_count": 1.0, + "step": 2186, + "text_loss": 0.24868851900100708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0009333733526687524, + "loss": 0.0196, + "macro_f1": 0.3333333432674408, + "num_tokens": 3528012.0, + "repeat_count": 0.0, + "routers_loss": 0.014253967441618443, + "skip_count": 0.0, + "step": 2188, + "text_loss": 0.3970910310745239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.000933218898719383, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 3530908.0, + "repeat_count": 0.0, + "routers_loss": 0.001659149187617004, + "skip_count": 0.0, + "step": 2190, + "text_loss": 0.7618573307991028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009330642787597141, + "loss": 0.0159, + "macro_f1": 0.3333333432674408, + "num_tokens": 3533993.0, + "repeat_count": 0.0, + "routers_loss": 0.005574346985667944, + "skip_count": 0.0, + "step": 2192, + "text_loss": 0.16470147669315338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009329094928489969, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3537310.0, + "repeat_count": 0.0, + "routers_loss": 0.0026400673668831587, + "skip_count": 0.0, + "step": 2194, + "text_loss": 0.3400416374206543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 10.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009327545410465452, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3540045.0, + "repeat_count": 0.0, + "routers_loss": 0.008448398672044277, + "skip_count": 3.0, + "step": 2196, + "text_loss": 0.3110542297363281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.31934253008512, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009325994234117372, + "loss": 0.0122, + "macro_f1": 0.32098764181137085, + "num_tokens": 3544097.0, + "repeat_count": 0.0, + "routers_loss": 0.037553198635578156, + "skip_count": 2.0, + "step": 2198, + "text_loss": 0.36126700043678284 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.09716796875, + "learning_rate": 0.000932444140004014, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3547054.0, + "repeat_count": 1.0, + "routers_loss": 0.006464479025453329, + "skip_count": 0.0, + "step": 2200, + "text_loss": 0.4947047233581543 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1015625, + "learning_rate": 0.0009322886908828805, + "loss": 0.0138, + "macro_f1": 0.6666666865348816, + "num_tokens": 3549903.0, + "repeat_count": 1.0, + "routers_loss": 0.005384812597185373, + "skip_count": 0.0, + "step": 2202, + "text_loss": 0.5923738479614258 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009321330761079052, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 3553745.0, + "repeat_count": 0.0, + "routers_loss": 0.015346619300544262, + "skip_count": 2.0, + "step": 2204, + "text_loss": 0.1904175877571106 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.356912239506897, + "f1_execute": 0.9268292784690857, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06494140625, + "learning_rate": 0.00093197729573872, + "loss": 0.0203, + "macro_f1": 0.8422764539718628, + "num_tokens": 3557235.0, + "repeat_count": 3.0, + "routers_loss": 0.1207597479224205, + "skip_count": 6.0, + "step": 2206, + "text_loss": 0.3904837667942047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0009318213498350202, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3560795.0, + "repeat_count": 0.0, + "routers_loss": 0.003334777895361185, + "skip_count": 0.0, + "step": 2208, + "text_loss": 0.4268290102481842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0009316652384565645, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3563754.0, + "repeat_count": 0.0, + "routers_loss": 0.004230072256177664, + "skip_count": 0.0, + "step": 2210, + "text_loss": 0.40049710869789124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046875, + "learning_rate": 0.0009315089616631751, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 3567173.0, + "repeat_count": 0.0, + "routers_loss": 0.0006645230459980667, + "skip_count": 0.0, + "step": 2212, + "text_loss": 0.42568323016166687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0009313525195147376, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3570831.0, + "repeat_count": 0.0, + "routers_loss": 0.0097877848893404, + "skip_count": 0.0, + "step": 2214, + "text_loss": 0.45808279514312744 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 10.40387437628412, + "f1_execute": 0.9387754797935486, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.5, + "grad_norm": 0.076171875, + "learning_rate": 0.000931195912071201, + "loss": 0.0187, + "macro_f1": 0.7018141150474548, + "num_tokens": 3573745.0, + "repeat_count": 2.0, + "routers_loss": 0.07351134717464447, + "skip_count": 3.0, + "step": 2216, + "text_loss": 0.285696804523468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009310391393925775, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 3576785.0, + "repeat_count": 0.0, + "routers_loss": 0.0033160944003611803, + "skip_count": 0.0, + "step": 2218, + "text_loss": 0.17516443133354187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.422659230995011, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.047119140625, + "learning_rate": 0.0009308822015389424, + "loss": 0.0241, + "macro_f1": 0.5427350401878357, + "num_tokens": 3580695.0, + "repeat_count": 1.0, + "routers_loss": 0.052930232137441635, + "skip_count": 1.0, + "step": 2220, + "text_loss": 0.5918155908584595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 10.432051658350455, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.072265625, + "learning_rate": 0.0009307250985704352, + "loss": 0.0128, + "macro_f1": 0.6122449040412903, + "num_tokens": 3583729.0, + "repeat_count": 0.0, + "routers_loss": 0.025454653427004814, + "skip_count": 4.0, + "step": 2222, + "text_loss": 0.2652169466018677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0009305678305472575, + "loss": 0.0158, + "macro_f1": 0.3333333432674408, + "num_tokens": 3586775.0, + "repeat_count": 0.0, + "routers_loss": 0.011279845610260963, + "skip_count": 0.0, + "step": 2224, + "text_loss": 0.3511691987514496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10791015625, + "learning_rate": 0.000930410397529675, + "loss": 0.017, + "macro_f1": 0.3333333432674408, + "num_tokens": 3589676.0, + "repeat_count": 0.0, + "routers_loss": 0.002700264798477292, + "skip_count": 0.0, + "step": 2226, + "text_loss": 0.24045433104038239 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.000930252799578016, + "loss": 0.0146, + "macro_f1": 1.0, + "num_tokens": 3593242.0, + "repeat_count": 1.0, + "routers_loss": 0.00826631672680378, + "skip_count": 2.0, + "step": 2228, + "text_loss": 0.3777645528316498 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.469621367772234, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009300950367526728, + "loss": 0.0131, + "macro_f1": 0.8820862174034119, + "num_tokens": 3596807.0, + "repeat_count": 2.0, + "routers_loss": 0.036221496760845184, + "skip_count": 2.0, + "step": 2230, + "text_loss": 0.502962589263916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0009299371091141001, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3600150.0, + "repeat_count": 0.0, + "routers_loss": 0.006449893582612276, + "skip_count": 0.0, + "step": 2232, + "text_loss": 0.20256924629211426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009297790167228161, + "loss": 0.012, + "macro_f1": 0.6666666865348816, + "num_tokens": 3602988.0, + "repeat_count": 0.0, + "routers_loss": 0.007872486487030983, + "skip_count": 2.0, + "step": 2234, + "text_loss": 0.42476826906204224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.497798649838568, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009296207596394022, + "loss": 0.0101, + "macro_f1": 0.32098764181137085, + "num_tokens": 3606071.0, + "repeat_count": 0.0, + "routers_loss": 0.027397040277719498, + "skip_count": 2.0, + "step": 2236, + "text_loss": 0.23432791233062744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009294623379245028, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 3609389.0, + "repeat_count": 0.0, + "routers_loss": 0.01042645052075386, + "skip_count": 0.0, + "step": 2238, + "text_loss": 0.16665785014629364 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009293037516388252, + "loss": 0.0161, + "macro_f1": 0.3333333432674408, + "num_tokens": 3612105.0, + "repeat_count": 0.0, + "routers_loss": 0.0012458425480872393, + "skip_count": 0.0, + "step": 2240, + "text_loss": 0.59421306848526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 10.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009291450008431404, + "loss": 0.0185, + "macro_f1": 1.0, + "num_tokens": 3615439.0, + "repeat_count": 1.0, + "routers_loss": 0.005781981628388166, + "skip_count": 1.0, + "step": 2242, + "text_loss": 0.510798454284668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 10.535368359260346, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.0966796875, + "learning_rate": 0.0009289860855982814, + "loss": 0.0166, + "macro_f1": 0.4871794879436493, + "num_tokens": 3618842.0, + "repeat_count": 0.0, + "routers_loss": 0.031195320188999176, + "skip_count": 3.0, + "step": 2244, + "text_loss": 0.7574363350868225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0009288270059651454, + "loss": 0.0133, + "macro_f1": 0.3333333432674408, + "num_tokens": 3621823.0, + "repeat_count": 0.0, + "routers_loss": 0.001746491645462811, + "skip_count": 0.0, + "step": 2246, + "text_loss": 0.5125683546066284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.554153213971237, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.220703125, + "learning_rate": 0.0009286677620046918, + "loss": 0.0159, + "macro_f1": 0.5492662787437439, + "num_tokens": 3624502.0, + "repeat_count": 0.0, + "routers_loss": 0.03792348504066467, + "skip_count": 2.0, + "step": 2248, + "text_loss": 0.7533677220344543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0009285083537779429, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3627057.0, + "repeat_count": 0.0, + "routers_loss": 0.0009684451506473124, + "skip_count": 0.0, + "step": 2250, + "text_loss": 0.2219279706478119 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.572938068682125, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11767578125, + "learning_rate": 0.0009283487813459845, + "loss": 0.0148, + "macro_f1": 0.5492662787437439, + "num_tokens": 3629720.0, + "repeat_count": 0.0, + "routers_loss": 0.022757573053240776, + "skip_count": 2.0, + "step": 2252, + "text_loss": 0.6903313994407654 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009281890447699652, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 3633234.0, + "repeat_count": 1.0, + "routers_loss": 0.003613058477640152, + "skip_count": 0.0, + "step": 2254, + "text_loss": 0.6278893351554871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0009280291441110961, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3636289.0, + "repeat_count": 0.0, + "routers_loss": 0.006214062683284283, + "skip_count": 0.0, + "step": 2256, + "text_loss": 0.3011114001274109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.60111535074846, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.041015625, + "learning_rate": 0.0009278690794306517, + "loss": 0.014, + "macro_f1": 0.5492662787437439, + "num_tokens": 3640251.0, + "repeat_count": 0.0, + "routers_loss": 0.052556321024894714, + "skip_count": 2.0, + "step": 2258, + "text_loss": 0.19894185662269592 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 10.610507778103903, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.08251953125, + "learning_rate": 0.0009277088507899689, + "loss": 0.0163, + "macro_f1": 0.9452888369560242, + "num_tokens": 3643527.0, + "repeat_count": 4.0, + "routers_loss": 0.0572301521897316, + "skip_count": 1.0, + "step": 2260, + "text_loss": 0.5593410134315491 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009275484582504475, + "loss": 0.0104, + "macro_f1": 0.3333333432674408, + "num_tokens": 3646959.0, + "repeat_count": 0.0, + "routers_loss": 0.008010074496269226, + "skip_count": 0.0, + "step": 2262, + "text_loss": 0.2128177285194397 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 10.629292632814794, + "f1_execute": 0.95652174949646, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.800000011920929, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009273879018735505, + "loss": 0.0138, + "macro_f1": 0.8521739840507507, + "num_tokens": 3651298.0, + "repeat_count": 3.0, + "routers_loss": 0.035729870200157166, + "skip_count": 3.0, + "step": 2264, + "text_loss": 0.2987811267375946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1474609375, + "learning_rate": 0.0009272271817208031, + "loss": 0.0182, + "macro_f1": 0.3333333432674408, + "num_tokens": 3655609.0, + "repeat_count": 0.0, + "routers_loss": 0.002379779238253832, + "skip_count": 0.0, + "step": 2266, + "text_loss": 0.6024088263511658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009270662978537939, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 3658444.0, + "repeat_count": 0.0, + "routers_loss": 0.008943650871515274, + "skip_count": 0.0, + "step": 2268, + "text_loss": 0.1741207242012024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 10.657469914881126, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009269052503341736, + "loss": 0.0161, + "macro_f1": 0.6595745086669922, + "num_tokens": 3662282.0, + "repeat_count": 1.0, + "routers_loss": 0.030201267451047897, + "skip_count": 4.0, + "step": 2270, + "text_loss": 0.7300035953521729 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0009267440392236562, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 3665531.0, + "repeat_count": 0.0, + "routers_loss": 0.0026635683607310057, + "skip_count": 0.0, + "step": 2272, + "text_loss": 0.31535038352012634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009265826645840178, + "loss": 0.0151, + "macro_f1": 0.3333333432674408, + "num_tokens": 3668407.0, + "repeat_count": 0.0, + "routers_loss": 0.004258926957845688, + "skip_count": 0.0, + "step": 2274, + "text_loss": 0.7272579073905945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 10.68564719694746, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.125, + "learning_rate": 0.0009264211264770976, + "loss": 0.0154, + "macro_f1": 0.6122449040412903, + "num_tokens": 3671503.0, + "repeat_count": 0.0, + "routers_loss": 0.038987524807453156, + "skip_count": 4.0, + "step": 2276, + "text_loss": 0.7488982677459717 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009262594249647975, + "loss": 0.0164, + "macro_f1": 0.6666666865348816, + "num_tokens": 3674107.0, + "repeat_count": 0.0, + "routers_loss": 0.007211760152131319, + "skip_count": 1.0, + "step": 2278, + "text_loss": 0.1992369294166565 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 10.704432051658351, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0009260975601090815, + "loss": 0.0112, + "macro_f1": 0.9446290731430054, + "num_tokens": 3677184.0, + "repeat_count": 4.0, + "routers_loss": 0.02538592554628849, + "skip_count": 3.0, + "step": 2280, + "text_loss": 0.46402135491371155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0009259355319719768, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 3680683.0, + "repeat_count": 0.0, + "routers_loss": 0.0038464947137981653, + "skip_count": 0.0, + "step": 2282, + "text_loss": 0.5804527401924133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1611328125, + "learning_rate": 0.0009257733406155726, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3683928.0, + "repeat_count": 0.0, + "routers_loss": 0.004841136280447245, + "skip_count": 0.0, + "step": 2284, + "text_loss": 0.4834538400173187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009256109861020212, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 3687101.0, + "repeat_count": 0.0, + "routers_loss": 0.002191900508478284, + "skip_count": 0.0, + "step": 2286, + "text_loss": 0.8199604749679565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.742001761080129, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0927734375, + "learning_rate": 0.000925448468493537, + "loss": 0.0162, + "macro_f1": 0.5427350401878357, + "num_tokens": 3690490.0, + "repeat_count": 1.0, + "routers_loss": 0.03488675877451897, + "skip_count": 2.0, + "step": 2288, + "text_loss": 0.33263635635375977 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 10.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009252857878523971, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 3694109.0, + "repeat_count": 1.0, + "routers_loss": 0.002897309372201562, + "skip_count": 0.0, + "step": 2290, + "text_loss": 0.47494807839393616 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.000925122944240941, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 3697233.0, + "repeat_count": 0.0, + "routers_loss": 0.01842675730586052, + "skip_count": 2.0, + "step": 2292, + "text_loss": 0.14693495631217957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.770179043146463, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.045654296875, + "learning_rate": 0.0009249599377215707, + "loss": 0.0146, + "macro_f1": 0.5866667032241821, + "num_tokens": 3700376.0, + "repeat_count": 1.0, + "routers_loss": 0.04169808700680733, + "skip_count": 3.0, + "step": 2294, + "text_loss": 0.38051268458366394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.779571470501908, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0009247967683567507, + "loss": 0.0112, + "macro_f1": 0.3272727429866791, + "num_tokens": 3703212.0, + "repeat_count": 0.0, + "routers_loss": 0.012183113023638725, + "skip_count": 1.0, + "step": 2296, + "text_loss": 0.23789077997207642 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 10.788963897857352, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05712890625, + "learning_rate": 0.0009246334362090077, + "loss": 0.0137, + "macro_f1": 0.8823530077934265, + "num_tokens": 3706490.0, + "repeat_count": 1.0, + "routers_loss": 0.01880069635808468, + "skip_count": 2.0, + "step": 2298, + "text_loss": 0.29067978262901306 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.798356325212797, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.000924469941340931, + "loss": 0.0173, + "macro_f1": 0.3272727429866791, + "num_tokens": 3709804.0, + "repeat_count": 1.0, + "routers_loss": 0.027359159663319588, + "skip_count": 0.0, + "step": 2300, + "text_loss": 0.67828369140625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.000924306283815172, + "loss": 0.0153, + "macro_f1": 0.3333333432674408, + "num_tokens": 3712824.0, + "repeat_count": 0.0, + "routers_loss": 0.003152279881760478, + "skip_count": 0.0, + "step": 2302, + "text_loss": 0.8333184719085693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 10.817141179923686, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0703125, + "learning_rate": 0.0009241424636944445, + "loss": 0.0159, + "macro_f1": 0.5492662787437439, + "num_tokens": 3715385.0, + "repeat_count": 0.0, + "routers_loss": 0.0442950464785099, + "skip_count": 2.0, + "step": 2304, + "text_loss": 0.41893699765205383 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 10.826533607279131, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009239784810415249, + "loss": 0.0137, + "macro_f1": 0.8823530077934265, + "num_tokens": 3719080.0, + "repeat_count": 1.0, + "routers_loss": 0.015729321166872978, + "skip_count": 2.0, + "step": 2306, + "text_loss": 0.13360483944416046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 10.835926034634575, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009238143359192514, + "loss": 0.0136, + "macro_f1": 0.5934640765190125, + "num_tokens": 3722439.0, + "repeat_count": 0.0, + "routers_loss": 0.028816604986786842, + "skip_count": 3.0, + "step": 2308, + "text_loss": 0.39594101905822754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 10.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.000923650028390525, + "loss": 0.0166, + "macro_f1": 0.6666666865348816, + "num_tokens": 3725092.0, + "repeat_count": 0.0, + "routers_loss": 0.0036455015651881695, + "skip_count": 2.0, + "step": 2310, + "text_loss": 0.6169708371162415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009234855585183086, + "loss": 0.014, + "macro_f1": 0.6666666865348816, + "num_tokens": 3728412.0, + "repeat_count": 0.0, + "routers_loss": 0.007565604057163, + "skip_count": 1.0, + "step": 2312, + "text_loss": 0.21257059276103973 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 10.86410331670091, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0517578125, + "learning_rate": 0.0009233209263656273, + "loss": 0.0184, + "macro_f1": 0.9262410998344421, + "num_tokens": 3731467.0, + "repeat_count": 2.0, + "routers_loss": 0.02510629966855049, + "skip_count": 3.0, + "step": 2314, + "text_loss": 0.21639840304851532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0009231561319955684, + "loss": 0.0154, + "macro_f1": 0.3333333432674408, + "num_tokens": 3734906.0, + "repeat_count": 0.0, + "routers_loss": 0.00872227642685175, + "skip_count": 0.0, + "step": 2316, + "text_loss": 0.35639774799346924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009229911754712815, + "loss": 0.0176, + "macro_f1": 0.3333333432674408, + "num_tokens": 3737943.0, + "repeat_count": 0.0, + "routers_loss": 0.004695790819823742, + "skip_count": 0.0, + "step": 2318, + "text_loss": 0.5269573330879211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.892280598767243, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0009228260568559781, + "loss": 0.0115, + "macro_f1": 0.3272727429866791, + "num_tokens": 3741833.0, + "repeat_count": 1.0, + "routers_loss": 0.0217357836663723, + "skip_count": 0.0, + "step": 2320, + "text_loss": 0.5110208988189697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.901673026122689, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1953125, + "learning_rate": 0.0009226607762129322, + "loss": 0.0201, + "macro_f1": 0.32098764181137085, + "num_tokens": 3744642.0, + "repeat_count": 1.0, + "routers_loss": 0.05595960095524788, + "skip_count": 1.0, + "step": 2322, + "text_loss": 0.6291998624801636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0009224953336054796, + "loss": 0.0161, + "macro_f1": 0.3333333432674408, + "num_tokens": 3748127.0, + "repeat_count": 0.0, + "routers_loss": 0.0071634589694440365, + "skip_count": 0.0, + "step": 2324, + "text_loss": 0.7404762506484985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.000922329729097018, + "loss": 0.0169, + "macro_f1": 0.3333333432674408, + "num_tokens": 3751373.0, + "repeat_count": 0.0, + "routers_loss": 0.0011676300782710314, + "skip_count": 0.0, + "step": 2326, + "text_loss": 0.2915459871292114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009221639627510075, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 3754518.0, + "repeat_count": 0.0, + "routers_loss": 0.01039792038500309, + "skip_count": 0.0, + "step": 2328, + "text_loss": 0.22066321969032288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009219980346309702, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 3757621.0, + "repeat_count": 0.0, + "routers_loss": 0.0032070958986878395, + "skip_count": 0.0, + "step": 2330, + "text_loss": 0.5558560490608215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.076171875, + "learning_rate": 0.0009218319448004899, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 3760885.0, + "repeat_count": 0.0, + "routers_loss": 0.007085457909852266, + "skip_count": 0.0, + "step": 2332, + "text_loss": 0.4348253607749939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 10.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1103515625, + "learning_rate": 0.0009216656933232129, + "loss": 0.016, + "macro_f1": 0.6666666865348816, + "num_tokens": 3764462.0, + "repeat_count": 0.0, + "routers_loss": 0.005504854489117861, + "skip_count": 1.0, + "step": 2334, + "text_loss": 0.35828644037246704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0009214992802628463, + "loss": 0.0131, + "macro_f1": 0.3333333432674408, + "num_tokens": 3767159.0, + "repeat_count": 0.0, + "routers_loss": 0.0013970810687169433, + "skip_count": 0.0, + "step": 2336, + "text_loss": 0.2956557869911194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009213327056831607, + "loss": 0.0181, + "macro_f1": 0.3272727429866791, + "num_tokens": 3770408.0, + "repeat_count": 0.0, + "routers_loss": 0.0427570566534996, + "skip_count": 1.0, + "step": 2338, + "text_loss": 0.14883014559745789 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.986204872321691, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0009211659696479875, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 3773474.0, + "repeat_count": 0.0, + "routers_loss": 0.0011273405980318785, + "skip_count": 0.0, + "step": 2340, + "text_loss": 0.26011669635772705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 10.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.00092099907222122, + "loss": 0.0148, + "macro_f1": 0.3333333432674408, + "num_tokens": 3776909.0, + "repeat_count": 0.0, + "routers_loss": 0.0016178421210497618, + "skip_count": 0.0, + "step": 2342, + "text_loss": 0.49078530073165894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.000920832013466814, + "loss": 0.0129, + "macro_f1": 0.3333333432674408, + "num_tokens": 3780741.0, + "repeat_count": 0.0, + "routers_loss": 0.005510095041245222, + "skip_count": 0.0, + "step": 2344, + "text_loss": 0.4870249927043915 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0009206647934487866, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3784673.0, + "repeat_count": 1.0, + "routers_loss": 0.0047357892617583275, + "skip_count": 0.0, + "step": 2346, + "text_loss": 0.3251725733280182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0009204974122312167, + "loss": 0.0142, + "macro_f1": 0.6666666865348816, + "num_tokens": 3787503.0, + "repeat_count": 0.0, + "routers_loss": 0.00795028731226921, + "skip_count": 1.0, + "step": 2348, + "text_loss": 0.18282145261764526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0009203298698782452, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 3790528.0, + "repeat_count": 1.0, + "routers_loss": 0.0009506374481134117, + "skip_count": 0.0, + "step": 2350, + "text_loss": 0.4093080461025238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009201621664540747, + "loss": 0.0155, + "macro_f1": 0.6666666865348816, + "num_tokens": 3794134.0, + "repeat_count": 1.0, + "routers_loss": 0.005159572698175907, + "skip_count": 0.0, + "step": 2352, + "text_loss": 0.5451981425285339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009199943020229694, + "loss": 0.0148, + "macro_f1": 0.3333333432674408, + "num_tokens": 3797414.0, + "repeat_count": 0.0, + "routers_loss": 0.002356168581172824, + "skip_count": 0.0, + "step": 2354, + "text_loss": 0.3070453405380249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009198262766492554, + "loss": 0.0141, + "macro_f1": 0.6666666865348816, + "num_tokens": 3800094.0, + "repeat_count": 0.0, + "routers_loss": 0.0051761893555521965, + "skip_count": 1.0, + "step": 2356, + "text_loss": 0.5880904197692871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.00091965809039732, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3803280.0, + "repeat_count": 0.0, + "routers_loss": 0.0025952060241252184, + "skip_count": 0.0, + "step": 2358, + "text_loss": 0.5210731625556946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0009194897433316127, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 3805866.0, + "repeat_count": 0.0, + "routers_loss": 0.0042560105212032795, + "skip_count": 2.0, + "step": 2360, + "text_loss": 0.6472984552383423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009193212355166446, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3808952.0, + "repeat_count": 0.0, + "routers_loss": 0.0026232977397739887, + "skip_count": 0.0, + "step": 2362, + "text_loss": 0.450063556432724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009191525670169881, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3812080.0, + "repeat_count": 0.0, + "routers_loss": 0.0034355956595391035, + "skip_count": 0.0, + "step": 2364, + "text_loss": 0.49727216362953186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.000918983737897277, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 3815282.0, + "repeat_count": 0.0, + "routers_loss": 0.0055653867311775684, + "skip_count": 1.0, + "step": 2366, + "text_loss": 0.6336377859115601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0009188147482222071, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 3818106.0, + "repeat_count": 2.0, + "routers_loss": 0.011016021482646465, + "skip_count": 2.0, + "step": 2368, + "text_loss": 0.22513329982757568 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009186455980565358, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 3821228.0, + "repeat_count": 1.0, + "routers_loss": 0.014039464294910431, + "skip_count": 0.0, + "step": 2370, + "text_loss": 0.21331638097763062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009184762874650816, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 3825048.0, + "repeat_count": 0.0, + "routers_loss": 0.001088051125407219, + "skip_count": 0.0, + "step": 2372, + "text_loss": 0.6031543612480164 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.095703125, + "learning_rate": 0.0009183068165127245, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 3828781.0, + "repeat_count": 0.0, + "routers_loss": 0.006263940595090389, + "skip_count": 1.0, + "step": 2374, + "text_loss": 0.6249601244926453 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009181371852644062, + "loss": 0.0133, + "macro_f1": 0.6666666865348816, + "num_tokens": 3832507.0, + "repeat_count": 1.0, + "routers_loss": 0.001987969037145376, + "skip_count": 0.0, + "step": 2376, + "text_loss": 0.37972065806388855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009179673937851299, + "loss": 0.0158, + "macro_f1": 0.6666666865348816, + "num_tokens": 3835644.0, + "repeat_count": 0.0, + "routers_loss": 0.007635094691067934, + "skip_count": 1.0, + "step": 2378, + "text_loss": 0.46319663524627686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009177974421399598, + "loss": 0.0137, + "macro_f1": 0.6666666865348816, + "num_tokens": 3838700.0, + "repeat_count": 0.0, + "routers_loss": 0.01617279462516308, + "skip_count": 2.0, + "step": 2380, + "text_loss": 0.32141056656837463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009176273303940217, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 3841953.0, + "repeat_count": 0.0, + "routers_loss": 0.0022273799404501915, + "skip_count": 2.0, + "step": 2382, + "text_loss": 0.5908139944076538 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.192544760786616, + "f1_execute": 0.9629629850387573, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009174570586125026, + "loss": 0.0122, + "macro_f1": 0.32098767161369324, + "num_tokens": 3845763.0, + "repeat_count": 1.0, + "routers_loss": 0.030915161594748497, + "skip_count": 0.0, + "step": 2384, + "text_loss": 0.41400137543678284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0009172866268606513, + "loss": 0.0122, + "macro_f1": 0.6666666865348816, + "num_tokens": 3848984.0, + "repeat_count": 0.0, + "routers_loss": 0.010480951517820358, + "skip_count": 2.0, + "step": 2386, + "text_loss": 0.2560874819755554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009171160352037775, + "loss": 0.0124, + "macro_f1": 0.6666666865348816, + "num_tokens": 3852118.0, + "repeat_count": 0.0, + "routers_loss": 0.00809961836785078, + "skip_count": 1.0, + "step": 2388, + "text_loss": 0.28236693143844604 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009169452837072521, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 3855314.0, + "repeat_count": 1.0, + "routers_loss": 0.005569872446358204, + "skip_count": 1.0, + "step": 2390, + "text_loss": 0.4578137695789337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1123046875, + "learning_rate": 0.0009167743724365073, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 3858301.0, + "repeat_count": 0.0, + "routers_loss": 0.0038610948249697685, + "skip_count": 1.0, + "step": 2392, + "text_loss": 0.14082716405391693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009166033014570368, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3861296.0, + "repeat_count": 0.0, + "routers_loss": 0.0017607157351449132, + "skip_count": 0.0, + "step": 2394, + "text_loss": 0.384442001581192 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 11.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009164320708343954, + "loss": 0.0131, + "macro_f1": 0.6666666865348816, + "num_tokens": 3863985.0, + "repeat_count": 2.0, + "routers_loss": 0.009627950377762318, + "skip_count": 0.0, + "step": 2396, + "text_loss": 0.6969521045684814 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009162606806341989, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 3866636.0, + "repeat_count": 0.0, + "routers_loss": 0.006915586534887552, + "skip_count": 0.0, + "step": 2398, + "text_loss": 0.48069697618484497 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0009160891309221242, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 3870867.0, + "repeat_count": 1.0, + "routers_loss": 0.0013031222624704242, + "skip_count": 0.0, + "step": 2400, + "text_loss": 0.3882075846195221 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.277076606985618, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009159174217639096, + "loss": 0.0112, + "macro_f1": 0.5427350401878357, + "num_tokens": 3873663.0, + "repeat_count": 2.0, + "routers_loss": 0.06621067970991135, + "skip_count": 1.0, + "step": 2402, + "text_loss": 0.5740041136741638 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0009157455532253547, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 3876788.0, + "repeat_count": 1.0, + "routers_loss": 0.005957918707281351, + "skip_count": 0.0, + "step": 2404, + "text_loss": 0.26025933027267456 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 11.295861461696507, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.08642578125, + "learning_rate": 0.0009155735253723191, + "loss": 0.0126, + "macro_f1": 0.9452888369560242, + "num_tokens": 3879942.0, + "repeat_count": 1.0, + "routers_loss": 0.039429809898138046, + "skip_count": 4.0, + "step": 2406, + "text_loss": 1.1349908113479614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009154013382707251, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 3882682.0, + "repeat_count": 0.0, + "routers_loss": 0.0012570557883009315, + "skip_count": 0.0, + "step": 2408, + "text_loss": 0.5611135363578796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0009152289919865543, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 3886425.0, + "repeat_count": 0.0, + "routers_loss": 0.0017455556662753224, + "skip_count": 0.0, + "step": 2410, + "text_loss": 0.7523751854896545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0009150564865858506, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3889273.0, + "repeat_count": 0.0, + "routers_loss": 0.011178011074662209, + "skip_count": 1.0, + "step": 2412, + "text_loss": 0.26942551136016846 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 11.333431171118287, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009148838221347182, + "loss": 0.0107, + "macro_f1": 0.5934640765190125, + "num_tokens": 3892199.0, + "repeat_count": 3.0, + "routers_loss": 0.019628092646598816, + "skip_count": 0.0, + "step": 2414, + "text_loss": 0.5492315888404846 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0009147109986993225, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 3895362.0, + "repeat_count": 1.0, + "routers_loss": 0.012255983427166939, + "skip_count": 0.0, + "step": 2416, + "text_loss": 0.23798216879367828 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11669921875, + "learning_rate": 0.0009145380163458899, + "loss": 0.0178, + "macro_f1": 0.3333333432674408, + "num_tokens": 3898476.0, + "repeat_count": 0.0, + "routers_loss": 0.007018954027444124, + "skip_count": 0.0, + "step": 2418, + "text_loss": 0.1923145055770874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0009143648751407074, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 3901817.0, + "repeat_count": 0.0, + "routers_loss": 0.0008574824314564466, + "skip_count": 0.0, + "step": 2420, + "text_loss": 0.4001806974411011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.371000880540064, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.11328125, + "learning_rate": 0.0009141915751501231, + "loss": 0.0102, + "macro_f1": 0.5492662787437439, + "num_tokens": 3905461.0, + "repeat_count": 0.0, + "routers_loss": 0.01572350226342678, + "skip_count": 2.0, + "step": 2422, + "text_loss": 0.19519129395484924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0009140181164405458, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 3908878.0, + "repeat_count": 0.0, + "routers_loss": 0.0005503420252352953, + "skip_count": 0.0, + "step": 2424, + "text_loss": 0.6937088370323181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009138444990784454, + "loss": 0.013, + "macro_f1": 0.3333333432674408, + "num_tokens": 3912053.0, + "repeat_count": 0.0, + "routers_loss": 0.007556677330285311, + "skip_count": 0.0, + "step": 2426, + "text_loss": 0.35431069135665894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.000913670723130352, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 3915192.0, + "repeat_count": 0.0, + "routers_loss": 0.0013609991874545813, + "skip_count": 0.0, + "step": 2428, + "text_loss": 0.5171207189559937 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009134967886628573, + "loss": 0.0115, + "macro_f1": 1.0, + "num_tokens": 3917927.0, + "repeat_count": 2.0, + "routers_loss": 0.010895746760070324, + "skip_count": 2.0, + "step": 2430, + "text_loss": 0.2852934002876282 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.417963017317287, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009133226957426133, + "loss": 0.0132, + "macro_f1": 0.5492662787437439, + "num_tokens": 3921460.0, + "repeat_count": 2.0, + "routers_loss": 0.04196908697485924, + "skip_count": 0.0, + "step": 2432, + "text_loss": 0.4864770770072937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.0009131484444363324, + "loss": 0.0155, + "macro_f1": 0.3333333432674408, + "num_tokens": 3924662.0, + "repeat_count": 0.0, + "routers_loss": 0.004484197124838829, + "skip_count": 0.0, + "step": 2434, + "text_loss": 0.7568684220314026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0009129740348107882, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 3927337.0, + "repeat_count": 0.0, + "routers_loss": 0.004351360257714987, + "skip_count": 2.0, + "step": 2436, + "text_loss": 0.5953161716461182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 11.446140299383622, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.04736328125, + "learning_rate": 0.0009127994669328151, + "loss": 0.0085, + "macro_f1": 0.6122449040412903, + "num_tokens": 3930407.0, + "repeat_count": 0.0, + "routers_loss": 0.01664198748767376, + "skip_count": 4.0, + "step": 2438, + "text_loss": 0.5320524573326111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0009126247408693071, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 3933184.0, + "repeat_count": 0.0, + "routers_loss": 0.0017819046042859554, + "skip_count": 1.0, + "step": 2440, + "text_loss": 0.6051273345947266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0009124498566872204, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 3936620.0, + "repeat_count": 0.0, + "routers_loss": 0.005519696045666933, + "skip_count": 0.0, + "step": 2442, + "text_loss": 0.12987950444221497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.474317581449956, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009122748144535704, + "loss": 0.0111, + "macro_f1": 0.32098764181137085, + "num_tokens": 3940010.0, + "repeat_count": 0.0, + "routers_loss": 0.04543351009488106, + "skip_count": 2.0, + "step": 2444, + "text_loss": 0.4642033576965332 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0009120996142354338, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 3943135.0, + "repeat_count": 0.0, + "routers_loss": 0.00550565542653203, + "skip_count": 0.0, + "step": 2446, + "text_loss": 0.5697627067565918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009119242560999477, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 3946650.0, + "repeat_count": 0.0, + "routers_loss": 0.008842485956847668, + "skip_count": 0.0, + "step": 2448, + "text_loss": 0.17046524584293365 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0009117487401143095, + "loss": 0.0154, + "macro_f1": 0.6666666865348816, + "num_tokens": 3949470.0, + "repeat_count": 1.0, + "routers_loss": 0.005900127813220024, + "skip_count": 0.0, + "step": 2450, + "text_loss": 0.37260866165161133 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0009115730663457773, + "loss": 0.0137, + "macro_f1": 1.0, + "num_tokens": 3952546.0, + "repeat_count": 1.0, + "routers_loss": 0.003409258322790265, + "skip_count": 1.0, + "step": 2452, + "text_loss": 0.5308008193969727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009113972348616698, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 3955817.0, + "repeat_count": 0.0, + "routers_loss": 0.010098597034811974, + "skip_count": 1.0, + "step": 2454, + "text_loss": 0.39226648211479187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 11.530672145582624, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1640625, + "learning_rate": 0.0009112212457293658, + "loss": 0.0102, + "macro_f1": 0.3272727429866791, + "num_tokens": 3958911.0, + "repeat_count": 0.0, + "routers_loss": 0.08184818178415298, + "skip_count": 0.0, + "step": 2456, + "text_loss": 0.45411455631256104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0009110450990163047, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 3962584.0, + "repeat_count": 0.0, + "routers_loss": 0.0009352223132736981, + "skip_count": 0.0, + "step": 2458, + "text_loss": 0.47292324900627136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0009108687947899863, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 3965597.0, + "repeat_count": 1.0, + "routers_loss": 0.008150188252329826, + "skip_count": 2.0, + "step": 2460, + "text_loss": 0.33208340406417847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.558849427648958, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009106923331179707, + "loss": 0.0125, + "macro_f1": 0.5492662787437439, + "num_tokens": 3968664.0, + "repeat_count": 0.0, + "routers_loss": 0.050999004393815994, + "skip_count": 2.0, + "step": 2462, + "text_loss": 0.2459995150566101 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009105157140678782, + "loss": 0.0126, + "macro_f1": 0.6666666865348816, + "num_tokens": 3971772.0, + "repeat_count": 0.0, + "routers_loss": 0.006196586415171623, + "skip_count": 1.0, + "step": 2464, + "text_loss": 0.23956991732120514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.577634282359847, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009103389377073896, + "loss": 0.01, + "macro_f1": 0.3333333432674408, + "num_tokens": 3976224.0, + "repeat_count": 0.0, + "routers_loss": 0.008181816898286343, + "skip_count": 0.0, + "step": 2466, + "text_loss": 0.3235875070095062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.0009101620041042462, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 3978876.0, + "repeat_count": 0.0, + "routers_loss": 0.0015451472718268633, + "skip_count": 0.0, + "step": 2468, + "text_loss": 0.4038759469985962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.596419137070736, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09130859375, + "learning_rate": 0.000909984913326249, + "loss": 0.0131, + "macro_f1": 0.3272727429866791, + "num_tokens": 3981992.0, + "repeat_count": 0.0, + "routers_loss": 0.021785033866763115, + "skip_count": 1.0, + "step": 2470, + "text_loss": 0.6346460580825806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009098076654412595, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 3984560.0, + "repeat_count": 0.0, + "routers_loss": 0.0011462471447885036, + "skip_count": 0.0, + "step": 2472, + "text_loss": 0.3449646532535553 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009096302605171996, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 3987548.0, + "repeat_count": 0.0, + "routers_loss": 0.0014367027906700969, + "skip_count": 0.0, + "step": 2474, + "text_loss": 0.5918350219726562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0009094526986220513, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 3990727.0, + "repeat_count": 0.0, + "routers_loss": 0.0008977655088528991, + "skip_count": 0.0, + "step": 2476, + "text_loss": 0.463350385427475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.633988846492516, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0009092749798238563, + "loss": 0.015, + "macro_f1": 0.3272727429866791, + "num_tokens": 3993757.0, + "repeat_count": 1.0, + "routers_loss": 0.016712551936507225, + "skip_count": 0.0, + "step": 2478, + "text_loss": 0.5621229410171509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.643381273847961, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.000909097104190717, + "loss": 0.0172, + "macro_f1": 0.32098764181137085, + "num_tokens": 3997259.0, + "repeat_count": 0.0, + "routers_loss": 0.04134179651737213, + "skip_count": 2.0, + "step": 2480, + "text_loss": 0.375476598739624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0009089190717907956, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4000563.0, + "repeat_count": 0.0, + "routers_loss": 0.003462378401309252, + "skip_count": 0.0, + "step": 2482, + "text_loss": 0.5553798675537109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009087408826923146, + "loss": 0.0182, + "macro_f1": 0.6666666865348816, + "num_tokens": 4004065.0, + "repeat_count": 0.0, + "routers_loss": 0.008057428523898125, + "skip_count": 2.0, + "step": 2484, + "text_loss": 0.4329465329647064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009085625369635564, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4007119.0, + "repeat_count": 0.0, + "routers_loss": 0.005759050603955984, + "skip_count": 0.0, + "step": 2486, + "text_loss": 0.501268744468689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.680950983269739, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1240234375, + "learning_rate": 0.0009083840346728631, + "loss": 0.0122, + "macro_f1": 0.3272727429866791, + "num_tokens": 4010547.0, + "repeat_count": 1.0, + "routers_loss": 0.020763102918863297, + "skip_count": 0.0, + "step": 2488, + "text_loss": 0.480196475982666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0009082053758886374, + "loss": 0.0117, + "macro_f1": 0.6666666865348816, + "num_tokens": 4014600.0, + "repeat_count": 0.0, + "routers_loss": 0.005801836494356394, + "skip_count": 1.0, + "step": 2490, + "text_loss": 0.18249782919883728 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009080265606793416, + "loss": 0.0128, + "macro_f1": 1.0, + "num_tokens": 4017964.0, + "repeat_count": 1.0, + "routers_loss": 0.004226063843816519, + "skip_count": 1.0, + "step": 2492, + "text_loss": 0.6573076248168945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.000907847589113498, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 4020694.0, + "repeat_count": 0.0, + "routers_loss": 0.004281101748347282, + "skip_count": 2.0, + "step": 2494, + "text_loss": 0.3944586217403412 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.000907668461259689, + "loss": 0.0152, + "macro_f1": 0.6666666865348816, + "num_tokens": 4023757.0, + "repeat_count": 0.0, + "routers_loss": 0.008786370046436787, + "skip_count": 1.0, + "step": 2496, + "text_loss": 0.6452898979187012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0009074891771865566, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 4026601.0, + "repeat_count": 0.0, + "routers_loss": 0.005209595896303654, + "skip_count": 0.0, + "step": 2498, + "text_loss": 0.9633619785308838 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 11.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0009073097369628028, + "loss": 0.013, + "macro_f1": 1.0, + "num_tokens": 4030321.0, + "repeat_count": 3.0, + "routers_loss": 0.00860709697008133, + "skip_count": 1.0, + "step": 2500, + "text_loss": 0.48566827178001404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009071301406571893, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 4033234.0, + "repeat_count": 0.0, + "routers_loss": 0.0035277456045150757, + "skip_count": 0.0, + "step": 2502, + "text_loss": 0.3771554231643677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000906950388338538, + "loss": 0.0136, + "macro_f1": 0.3333333432674408, + "num_tokens": 4036417.0, + "repeat_count": 0.0, + "routers_loss": 0.0013424850767478347, + "skip_count": 0.0, + "step": 2504, + "text_loss": 0.8962806463241577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09912109375, + "learning_rate": 0.0009067704800757301, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4039564.0, + "repeat_count": 0.0, + "routers_loss": 0.0010423909407109022, + "skip_count": 0.0, + "step": 2506, + "text_loss": 0.43170279264450073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.774875256824185, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.000906590415937707, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 4043212.0, + "repeat_count": 0.0, + "routers_loss": 0.021780289709568024, + "skip_count": 1.0, + "step": 2508, + "text_loss": 0.41495826840400696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0009064101959934696, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4046687.0, + "repeat_count": 0.0, + "routers_loss": 0.007261929102241993, + "skip_count": 1.0, + "step": 2510, + "text_loss": 0.21821187436580658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0009062298203120783, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 4050735.0, + "repeat_count": 0.0, + "routers_loss": 0.007447180338203907, + "skip_count": 2.0, + "step": 2512, + "text_loss": 0.1818767935037613 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.803052538890519, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0009060492889626535, + "loss": 0.0142, + "macro_f1": 0.3272727429866791, + "num_tokens": 4054426.0, + "repeat_count": 1.0, + "routers_loss": 0.0718490406870842, + "skip_count": 0.0, + "step": 2514, + "text_loss": 0.22798970341682434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0009058686020143753, + "loss": 0.0183, + "macro_f1": 0.3333333432674408, + "num_tokens": 4057615.0, + "repeat_count": 0.0, + "routers_loss": 0.0052676633931696415, + "skip_count": 0.0, + "step": 2516, + "text_loss": 0.1712338626384735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0009056877595364832, + "loss": 0.0137, + "macro_f1": 0.3333333432674408, + "num_tokens": 4060338.0, + "repeat_count": 0.0, + "routers_loss": 0.0018052728846669197, + "skip_count": 0.0, + "step": 2518, + "text_loss": 0.6811438798904419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0009055067615982761, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4062887.0, + "repeat_count": 0.0, + "routers_loss": 0.0009029926732182503, + "skip_count": 0.0, + "step": 2520, + "text_loss": 0.5480356812477112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009053256082691133, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 4065357.0, + "repeat_count": 0.0, + "routers_loss": 0.0027515271212905645, + "skip_count": 0.0, + "step": 2522, + "text_loss": 0.5234101414680481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0009051442996184127, + "loss": 0.0174, + "macro_f1": 0.3333333432674408, + "num_tokens": 4068111.0, + "repeat_count": 0.0, + "routers_loss": 0.002199822571128607, + "skip_count": 0.0, + "step": 2524, + "text_loss": 0.2418575882911682 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009049628357156521, + "loss": 0.0143, + "macro_f1": 0.6666666865348816, + "num_tokens": 4071284.0, + "repeat_count": 0.0, + "routers_loss": 0.006303096655756235, + "skip_count": 2.0, + "step": 2526, + "text_loss": 0.7948065996170044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.868799530378633, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000904781216630369, + "loss": 0.0068, + "macro_f1": 0.6601307392120361, + "num_tokens": 4074750.0, + "repeat_count": 1.0, + "routers_loss": 0.01791904680430889, + "skip_count": 2.0, + "step": 2528, + "text_loss": 0.809726357460022 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 11.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009045994424321602, + "loss": 0.0102, + "macro_f1": 1.0, + "num_tokens": 4078617.0, + "repeat_count": 2.0, + "routers_loss": 0.016553178429603577, + "skip_count": 2.0, + "step": 2530, + "text_loss": 0.8755000829696655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0009044175131906817, + "loss": 0.0145, + "macro_f1": 0.3333333432674408, + "num_tokens": 4080936.0, + "repeat_count": 0.0, + "routers_loss": 0.00884837657213211, + "skip_count": 0.0, + "step": 2532, + "text_loss": 0.795871913433075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.896976812444967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009042354289756491, + "loss": 0.0122, + "macro_f1": 0.3333333432674408, + "num_tokens": 4084459.0, + "repeat_count": 0.0, + "routers_loss": 0.0024387789890170097, + "skip_count": 0.0, + "step": 2534, + "text_loss": 0.18875400722026825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0625, + "learning_rate": 0.0009040531898568379, + "loss": 0.0171, + "macro_f1": 0.3333333432674408, + "num_tokens": 4088464.0, + "repeat_count": 0.0, + "routers_loss": 0.00491489190608263, + "skip_count": 0.0, + "step": 2536, + "text_loss": 0.334369033575058 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 11.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.091796875, + "learning_rate": 0.000903870795904082, + "loss": 0.0145, + "macro_f1": 0.6666666865348816, + "num_tokens": 4091659.0, + "repeat_count": 0.0, + "routers_loss": 0.004592662677168846, + "skip_count": 2.0, + "step": 2538, + "text_loss": 0.21298295259475708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.925154094511301, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0458984375, + "learning_rate": 0.000903688247187275, + "loss": 0.0137, + "macro_f1": 0.5492662787437439, + "num_tokens": 4095496.0, + "repeat_count": 0.0, + "routers_loss": 0.011647242121398449, + "skip_count": 2.0, + "step": 2540, + "text_loss": 0.2985081672668457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009035055437763704, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4098663.0, + "repeat_count": 0.0, + "routers_loss": 0.0021238960325717926, + "skip_count": 0.0, + "step": 2542, + "text_loss": 0.35359489917755127 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 11.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0009033226857413803, + "loss": 0.0163, + "macro_f1": 0.6666666865348816, + "num_tokens": 4101588.0, + "repeat_count": 1.0, + "routers_loss": 0.0024701557122170925, + "skip_count": 0.0, + "step": 2544, + "text_loss": 1.1577601432800293 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.000903139673152376, + "loss": 0.012, + "macro_f1": 0.3333333432674408, + "num_tokens": 4104643.0, + "repeat_count": 0.0, + "routers_loss": 0.002499542199075222, + "skip_count": 0.0, + "step": 2546, + "text_loss": 1.0173401832580566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0009029565060794885, + "loss": 0.0165, + "macro_f1": 0.3333333432674408, + "num_tokens": 4109247.0, + "repeat_count": 0.0, + "routers_loss": 0.0034200598020106554, + "skip_count": 0.0, + "step": 2548, + "text_loss": 0.5690504312515259 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 11.972116231288524, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06884765625, + "learning_rate": 0.0009027731845929079, + "loss": 0.0155, + "macro_f1": 0.8823530077934265, + "num_tokens": 4112597.0, + "repeat_count": 1.0, + "routers_loss": 0.015981333330273628, + "skip_count": 1.0, + "step": 2550, + "text_loss": 0.294549822807312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 11.981508658643968, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.06103515625, + "learning_rate": 0.0009025897087628829, + "loss": 0.0064, + "macro_f1": 0.5492662787437439, + "num_tokens": 4115844.0, + "repeat_count": 0.0, + "routers_loss": 0.02606951631605625, + "skip_count": 2.0, + "step": 2552, + "text_loss": 0.22692419588565826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 11.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.080078125, + "learning_rate": 0.0009024060786597222, + "loss": 0.0202, + "macro_f1": 0.3333333432674408, + "num_tokens": 4118634.0, + "repeat_count": 0.0, + "routers_loss": 0.001026194542646408, + "skip_count": 0.0, + "step": 2554, + "text_loss": 0.6807059645652771 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.000902222294353793, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4122024.0, + "repeat_count": 0.0, + "routers_loss": 0.001974924933165312, + "skip_count": 0.0, + "step": 2556, + "text_loss": 0.7373668551445007 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.009392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009020383559155219, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 4124803.0, + "repeat_count": 1.0, + "routers_loss": 0.004662613850086927, + "skip_count": 2.0, + "step": 2558, + "text_loss": 0.21808166801929474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.018784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0009018542634153943, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 4127680.0, + "repeat_count": 0.0, + "routers_loss": 0.006881687790155411, + "skip_count": 0.0, + "step": 2560, + "text_loss": 0.25192978978157043 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009016700169239551, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 4130431.0, + "repeat_count": 1.0, + "routers_loss": 0.005977808032184839, + "skip_count": 1.0, + "step": 2562, + "text_loss": 0.4700816869735718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0009014856165118075, + "loss": 0.0153, + "macro_f1": 0.6666666865348816, + "num_tokens": 4133535.0, + "repeat_count": 0.0, + "routers_loss": 0.007005698047578335, + "skip_count": 1.0, + "step": 2564, + "text_loss": 0.6558199524879456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0009013010622496144, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 4136534.0, + "repeat_count": 0.0, + "routers_loss": 0.007262171246111393, + "skip_count": 0.0, + "step": 2566, + "text_loss": 0.2565421462059021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 12.056354564132668, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009011163542080971, + "loss": 0.0088, + "macro_f1": 0.5934640765190125, + "num_tokens": 4139762.0, + "repeat_count": 0.0, + "routers_loss": 0.05431923270225525, + "skip_count": 3.0, + "step": 2568, + "text_loss": 0.19896510243415833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0009009314924580363, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4143398.0, + "repeat_count": 0.0, + "routers_loss": 0.003667369019240141, + "skip_count": 0.0, + "step": 2570, + "text_loss": 0.6581419110298157 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0009007464770702712, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4146248.0, + "repeat_count": 0.0, + "routers_loss": 0.00132099783513695, + "skip_count": 0.0, + "step": 2572, + "text_loss": 0.5316711068153381 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0009005613081157002, + "loss": 0.0132, + "macro_f1": 0.3333333432674408, + "num_tokens": 4149455.0, + "repeat_count": 0.0, + "routers_loss": 0.0020061524119228125, + "skip_count": 0.0, + "step": 2574, + "text_loss": 0.5400773882865906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0009003759856652802, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 4152774.0, + "repeat_count": 0.0, + "routers_loss": 0.002621434163302183, + "skip_count": 1.0, + "step": 2576, + "text_loss": 0.3672606945037842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009001905097900273, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 4155835.0, + "repeat_count": 0.0, + "routers_loss": 0.005290219560265541, + "skip_count": 0.0, + "step": 2578, + "text_loss": 0.8159038424491882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0009000048805610161, + "loss": 0.0119, + "macro_f1": 0.3333333432674408, + "num_tokens": 4158874.0, + "repeat_count": 0.0, + "routers_loss": 0.0013576085912063718, + "skip_count": 0.0, + "step": 2580, + "text_loss": 0.5518951416015625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.138671875, + "learning_rate": 0.00089981909804938, + "loss": 0.0143, + "macro_f1": 0.3333333432674408, + "num_tokens": 4162076.0, + "repeat_count": 0.0, + "routers_loss": 0.0021483441814780235, + "skip_count": 0.0, + "step": 2582, + "text_loss": 0.43552228808403015 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.25, + "avg_layers": 28.0, + "epoch": 12.131493982976226, + "f1_execute": 0.9387754797935486, + "f1_repeat": 1.0, + "f1_skip": 0.4000000059604645, + "grad_norm": 0.068359375, + "learning_rate": 0.0008996331623263114, + "loss": 0.0117, + "macro_f1": 0.7795917987823486, + "num_tokens": 4165041.0, + "repeat_count": 1.0, + "routers_loss": 0.0544300302863121, + "skip_count": 4.0, + "step": 2584, + "text_loss": 0.24812501668930054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0008994470734630611, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 4168290.0, + "repeat_count": 0.0, + "routers_loss": 0.0017150711501017213, + "skip_count": 0.0, + "step": 2586, + "text_loss": 0.6392097473144531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0008992608315309388, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4171310.0, + "repeat_count": 0.0, + "routers_loss": 0.0046473173424601555, + "skip_count": 2.0, + "step": 2588, + "text_loss": 0.6534156799316406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.15967126504256, + "f1_execute": 0.943396270275116, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0008990744366013125, + "loss": 0.0105, + "macro_f1": 0.3144654333591461, + "num_tokens": 4174042.0, + "repeat_count": 2.0, + "routers_loss": 0.060913100838661194, + "skip_count": 1.0, + "step": 2590, + "text_loss": 0.5365690588951111 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 12.169063692398003, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008988878887456093, + "loss": 0.0118, + "macro_f1": 0.6051587462425232, + "num_tokens": 4177666.0, + "repeat_count": 1.0, + "routers_loss": 0.06268956512212753, + "skip_count": 4.0, + "step": 2592, + "text_loss": 0.226226806640625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.178456119753449, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008987011880353149, + "loss": 0.0089, + "macro_f1": 0.32098764181137085, + "num_tokens": 4180490.0, + "repeat_count": 0.0, + "routers_loss": 0.030141465365886688, + "skip_count": 2.0, + "step": 2594, + "text_loss": 0.2581401765346527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.187848547108894, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008985143345419729, + "loss": 0.0082, + "macro_f1": 0.5492662787437439, + "num_tokens": 4183300.0, + "repeat_count": 0.0, + "routers_loss": 0.018745863810181618, + "skip_count": 2.0, + "step": 2596, + "text_loss": 0.7778542637825012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.197240974464338, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.064453125, + "learning_rate": 0.0008983273283371862, + "loss": 0.0096, + "macro_f1": 0.5492662787437439, + "num_tokens": 4186535.0, + "repeat_count": 0.0, + "routers_loss": 0.026792079210281372, + "skip_count": 2.0, + "step": 2598, + "text_loss": 0.34700271487236023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008981401694926159, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4189082.0, + "repeat_count": 0.0, + "routers_loss": 0.001914160675369203, + "skip_count": 0.0, + "step": 2600, + "text_loss": 0.6879339218139648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0008979528580799815, + "loss": 0.0136, + "macro_f1": 0.6666666865348816, + "num_tokens": 4192330.0, + "repeat_count": 0.0, + "routers_loss": 0.007978348061442375, + "skip_count": 2.0, + "step": 2602, + "text_loss": 0.3524550497531891 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 12.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008977653941710613, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 4196117.0, + "repeat_count": 2.0, + "routers_loss": 0.0035376469604671, + "skip_count": 0.0, + "step": 2604, + "text_loss": 0.42356348037719727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.0008975777778376916, + "loss": 0.0156, + "macro_f1": 0.6666666865348816, + "num_tokens": 4200423.0, + "repeat_count": 0.0, + "routers_loss": 0.008262477815151215, + "skip_count": 1.0, + "step": 2606, + "text_loss": 0.5272893905639648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.244203111241562, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0008973900091517675, + "loss": 0.0114, + "macro_f1": 0.3272727429866791, + "num_tokens": 4203257.0, + "repeat_count": 0.0, + "routers_loss": 0.022957922890782356, + "skip_count": 1.0, + "step": 2608, + "text_loss": 0.2713734805583954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.253595538597006, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.000897202088185242, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 4206243.0, + "repeat_count": 0.0, + "routers_loss": 0.006623407825827599, + "skip_count": 2.0, + "step": 2610, + "text_loss": 0.5920525789260864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008970140150101274, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4209264.0, + "repeat_count": 0.0, + "routers_loss": 0.0008602747693657875, + "skip_count": 0.0, + "step": 2612, + "text_loss": 0.33421996235847473 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0008968257896984932, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 4212058.0, + "repeat_count": 0.0, + "routers_loss": 0.0024653903674334288, + "skip_count": 1.0, + "step": 2614, + "text_loss": 0.37923356890678406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008966374123224677, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4214929.0, + "repeat_count": 0.0, + "routers_loss": 0.010878405533730984, + "skip_count": 0.0, + "step": 2616, + "text_loss": 0.4350503981113434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.291165248018785, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0008964488829542376, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 4219170.0, + "repeat_count": 0.0, + "routers_loss": 0.02864212542772293, + "skip_count": 1.0, + "step": 2618, + "text_loss": 0.26250728964805603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008962602016660478, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4222077.0, + "repeat_count": 0.0, + "routers_loss": 0.010444172658026218, + "skip_count": 2.0, + "step": 2620, + "text_loss": 0.4718937575817108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008960713685302011, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4225383.0, + "repeat_count": 0.0, + "routers_loss": 0.006409442983567715, + "skip_count": 1.0, + "step": 2622, + "text_loss": 0.30420538783073425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.31934253008512, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0008958823836190588, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 4228349.0, + "repeat_count": 0.0, + "routers_loss": 0.009996986016631126, + "skip_count": 1.0, + "step": 2624, + "text_loss": 0.5392362475395203 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0008956932470050404, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 4232007.0, + "repeat_count": 0.0, + "routers_loss": 0.0014383369125425816, + "skip_count": 0.0, + "step": 2626, + "text_loss": 0.7112401127815247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0008955039587606233, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 4235122.0, + "repeat_count": 0.0, + "routers_loss": 0.00781513936817646, + "skip_count": 3.0, + "step": 2628, + "text_loss": 0.17802883684635162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 12.347519812151454, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008953145189583429, + "loss": 0.0126, + "macro_f1": 0.542222261428833, + "num_tokens": 4238248.0, + "repeat_count": 0.0, + "routers_loss": 0.062252625823020935, + "skip_count": 4.0, + "step": 2630, + "text_loss": 0.5551572442054749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008951249276707933, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4241042.0, + "repeat_count": 0.0, + "routers_loss": 0.0011421777307987213, + "skip_count": 0.0, + "step": 2632, + "text_loss": 0.7092233896255493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.366304666862343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0008949351849706261, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4243939.0, + "repeat_count": 0.0, + "routers_loss": 0.0032689040526747704, + "skip_count": 0.0, + "step": 2634, + "text_loss": 0.19925718009471893 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008947452909305509, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 4247535.0, + "repeat_count": 1.0, + "routers_loss": 0.002066014800220728, + "skip_count": 0.0, + "step": 2636, + "text_loss": 0.5249715447425842 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 12.385089521573232, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.09326171875, + "learning_rate": 0.0008945552456233356, + "loss": 0.0169, + "macro_f1": 0.8820862174034119, + "num_tokens": 4251441.0, + "repeat_count": 2.0, + "routers_loss": 0.029332537204027176, + "skip_count": 2.0, + "step": 2638, + "text_loss": 0.19229578971862793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.078125, + "learning_rate": 0.0008943650491218058, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4254314.0, + "repeat_count": 0.0, + "routers_loss": 0.0075911120511591434, + "skip_count": 0.0, + "step": 2640, + "text_loss": 0.27059751749038696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008941747014988453, + "loss": 0.0156, + "macro_f1": 0.3333333432674408, + "num_tokens": 4257442.0, + "repeat_count": 0.0, + "routers_loss": 0.009030844084918499, + "skip_count": 0.0, + "step": 2642, + "text_loss": 0.36747801303863525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.123046875, + "learning_rate": 0.0008939842028273956, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 4260386.0, + "repeat_count": 0.0, + "routers_loss": 0.007844001986086369, + "skip_count": 1.0, + "step": 2644, + "text_loss": 0.6397647857666016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.422659230995011, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008937935531804562, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 4263516.0, + "repeat_count": 0.0, + "routers_loss": 0.0018789108144119382, + "skip_count": 0.0, + "step": 2646, + "text_loss": 0.4795534908771515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.432051658350455, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0008936027526310844, + "loss": 0.0098, + "macro_f1": 0.3272727429866791, + "num_tokens": 4266744.0, + "repeat_count": 0.0, + "routers_loss": 0.0348590686917305, + "skip_count": 1.0, + "step": 2648, + "text_loss": 0.27691999077796936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.000893411801252395, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4269766.0, + "repeat_count": 0.0, + "routers_loss": 0.004543309565633535, + "skip_count": 1.0, + "step": 2650, + "text_loss": 0.18867231905460358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008932206991175615, + "loss": 0.0141, + "macro_f1": 0.6666666865348816, + "num_tokens": 4273513.0, + "repeat_count": 0.0, + "routers_loss": 0.0035277456045150757, + "skip_count": 1.0, + "step": 2652, + "text_loss": 0.45613357424736023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.460228940416789, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008930294462998143, + "loss": 0.015, + "macro_f1": 0.6666666865348816, + "num_tokens": 4276878.0, + "repeat_count": 1.0, + "routers_loss": 0.011337592266499996, + "skip_count": 0.0, + "step": 2654, + "text_loss": 0.24733254313468933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0869140625, + "learning_rate": 0.0008928380428724419, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4279915.0, + "repeat_count": 0.0, + "routers_loss": 0.0010295971296727657, + "skip_count": 1.0, + "step": 2656, + "text_loss": 0.41722849011421204 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008926464889087903, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 4282888.0, + "repeat_count": 0.0, + "routers_loss": 0.0017198545392602682, + "skip_count": 2.0, + "step": 2658, + "text_loss": 0.738322377204895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0008924547844822634, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4285805.0, + "repeat_count": 0.0, + "routers_loss": 0.001339946174994111, + "skip_count": 0.0, + "step": 2660, + "text_loss": 0.4802379906177521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.000892262929666323, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4290282.0, + "repeat_count": 0.0, + "routers_loss": 0.0022340165451169014, + "skip_count": 0.0, + "step": 2662, + "text_loss": 0.6503544449806213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008920709245344878, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4294106.0, + "repeat_count": 0.0, + "routers_loss": 0.005288850050419569, + "skip_count": 1.0, + "step": 2664, + "text_loss": 0.12312037497758865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0008918787691603347, + "loss": 0.0121, + "macro_f1": 0.6666666865348816, + "num_tokens": 4298013.0, + "repeat_count": 0.0, + "routers_loss": 0.004259659443050623, + "skip_count": 1.0, + "step": 2666, + "text_loss": 0.3070000112056732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.000891686463617498, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 4300799.0, + "repeat_count": 0.0, + "routers_loss": 0.009489355608820915, + "skip_count": 1.0, + "step": 2668, + "text_loss": 0.18535588681697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008914940079796696, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4304641.0, + "repeat_count": 0.0, + "routers_loss": 0.0025417013093829155, + "skip_count": 0.0, + "step": 2670, + "text_loss": 0.482585072517395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.544760786615791, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008913014023205988, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4307462.0, + "repeat_count": 0.0, + "routers_loss": 0.006371749565005302, + "skip_count": 0.0, + "step": 2672, + "text_loss": 0.7064456939697266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008911086467140925, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4310396.0, + "repeat_count": 0.0, + "routers_loss": 0.0027512952219694853, + "skip_count": 0.0, + "step": 2674, + "text_loss": 0.23532851040363312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05712890625, + "learning_rate": 0.000890915741234015, + "loss": 0.0133, + "macro_f1": 0.6666666865348816, + "num_tokens": 4314781.0, + "repeat_count": 0.0, + "routers_loss": 0.008253013715147972, + "skip_count": 1.0, + "step": 2676, + "text_loss": 0.30950358510017395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008907226859542879, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4317988.0, + "repeat_count": 0.0, + "routers_loss": 0.005409995559602976, + "skip_count": 2.0, + "step": 2678, + "text_loss": 0.4930732846260071 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0008905294809488907, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 4321014.0, + "repeat_count": 1.0, + "routers_loss": 0.0029942214023321867, + "skip_count": 1.0, + "step": 2680, + "text_loss": 0.6224040389060974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0008903361262918595, + "loss": 0.0115, + "macro_f1": 0.6666666865348816, + "num_tokens": 4324268.0, + "repeat_count": 0.0, + "routers_loss": 0.008411120623350143, + "skip_count": 1.0, + "step": 2682, + "text_loss": 0.16296671330928802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 12.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05126953125, + "learning_rate": 0.0008901426220572884, + "loss": 0.0138, + "macro_f1": 1.0, + "num_tokens": 4327494.0, + "repeat_count": 2.0, + "routers_loss": 0.01039006095379591, + "skip_count": 4.0, + "step": 2684, + "text_loss": 0.43866512179374695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0008899489683193286, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 4330936.0, + "repeat_count": 0.0, + "routers_loss": 0.0009329111780971289, + "skip_count": 0.0, + "step": 2686, + "text_loss": 0.44250962138175964 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0008897551651521885, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 4334123.0, + "repeat_count": 0.0, + "routers_loss": 0.003197216661646962, + "skip_count": 0.0, + "step": 2688, + "text_loss": 0.48313501477241516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.629292632814794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09716796875, + "learning_rate": 0.0008895612126301339, + "loss": 0.0157, + "macro_f1": 0.3333333432674408, + "num_tokens": 4337610.0, + "repeat_count": 0.0, + "routers_loss": 0.0033548236824572086, + "skip_count": 0.0, + "step": 2690, + "text_loss": 0.4715327322483063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0008893671108274877, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 4341026.0, + "repeat_count": 0.0, + "routers_loss": 0.0024757643695920706, + "skip_count": 0.0, + "step": 2692, + "text_loss": 0.43402785062789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008891728598186302, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 4344422.0, + "repeat_count": 0.0, + "routers_loss": 0.003317243419587612, + "skip_count": 0.0, + "step": 2694, + "text_loss": 0.8498559594154358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 12.657469914881126, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008889784596779986, + "loss": 0.009, + "macro_f1": 0.5934640765190125, + "num_tokens": 4347507.0, + "repeat_count": 0.0, + "routers_loss": 0.01577926240861416, + "skip_count": 3.0, + "step": 2696, + "text_loss": 0.5646669864654541 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11328125, + "learning_rate": 0.0008887839104800876, + "loss": 0.0124, + "macro_f1": 0.3333333432674408, + "num_tokens": 4350414.0, + "repeat_count": 0.0, + "routers_loss": 0.002953822258859873, + "skip_count": 0.0, + "step": 2698, + "text_loss": 0.5145012140274048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008885892122994486, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 4354110.0, + "repeat_count": 0.0, + "routers_loss": 0.005849295295774937, + "skip_count": 0.0, + "step": 2700, + "text_loss": 0.580982506275177 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008883943652106903, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 4357323.0, + "repeat_count": 1.0, + "routers_loss": 0.012347398325800896, + "skip_count": 2.0, + "step": 2702, + "text_loss": 0.2234988808631897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0008881993692884787, + "loss": 0.0128, + "macro_f1": 0.6666666865348816, + "num_tokens": 4360228.0, + "repeat_count": 0.0, + "routers_loss": 0.003574999049305916, + "skip_count": 1.0, + "step": 2704, + "text_loss": 0.4261806607246399 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.704432051658351, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008880042246075365, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4363905.0, + "repeat_count": 0.0, + "routers_loss": 0.0031574300955981016, + "skip_count": 0.0, + "step": 2706, + "text_loss": 0.691118061542511 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008878089312426433, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 4366736.0, + "repeat_count": 0.0, + "routers_loss": 0.003195564029738307, + "skip_count": 0.0, + "step": 2708, + "text_loss": 0.613926112651825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6000000238418579, + "avg_layers": 25.0, + "epoch": 12.72321690636924, + "f1_execute": 0.9583333134651184, + "f1_repeat": 0.0, + "f1_skip": 0.75, + "grad_norm": 0.054443359375, + "learning_rate": 0.0008876134892686363, + "loss": 0.011, + "macro_f1": 0.5694444179534912, + "num_tokens": 4370146.0, + "repeat_count": 0.0, + "routers_loss": 0.038784291595220566, + "skip_count": 5.0, + "step": 2710, + "text_loss": 0.2723451852798462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0830078125, + "learning_rate": 0.000887417898760409, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 4373653.0, + "repeat_count": 0.0, + "routers_loss": 0.0006457131239585578, + "skip_count": 0.0, + "step": 2712, + "text_loss": 0.31667640805244446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.742001761080129, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10498046875, + "learning_rate": 0.000887222159792912, + "loss": 0.0155, + "macro_f1": 0.6603773832321167, + "num_tokens": 4376993.0, + "repeat_count": 1.0, + "routers_loss": 0.045078590512275696, + "skip_count": 1.0, + "step": 2714, + "text_loss": 0.5872798562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0008870262724411528, + "loss": 0.012, + "macro_f1": 0.3333333432674408, + "num_tokens": 4380160.0, + "repeat_count": 0.0, + "routers_loss": 0.003628545207902789, + "skip_count": 0.0, + "step": 2716, + "text_loss": 0.7468157410621643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.760786615791018, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11181640625, + "learning_rate": 0.0008868302367801962, + "loss": 0.0118, + "macro_f1": 0.6598639488220215, + "num_tokens": 4383100.0, + "repeat_count": 1.0, + "routers_loss": 0.05404464527964592, + "skip_count": 3.0, + "step": 2718, + "text_loss": 0.2970244884490967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008866340528851629, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4386700.0, + "repeat_count": 0.0, + "routers_loss": 0.007000274024903774, + "skip_count": 0.0, + "step": 2720, + "text_loss": 0.34521186351776123 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 12.779571470501908, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052978515625, + "learning_rate": 0.0008864377208312313, + "loss": 0.0082, + "macro_f1": 0.8823530077934265, + "num_tokens": 4390299.0, + "repeat_count": 1.0, + "routers_loss": 0.02025366574525833, + "skip_count": 2.0, + "step": 2722, + "text_loss": 1.0536936521530151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.788963897857352, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.000886241240693636, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 4393353.0, + "repeat_count": 0.0, + "routers_loss": 0.00251673418097198, + "skip_count": 0.0, + "step": 2724, + "text_loss": 0.5678093433380127 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008860446125476686, + "loss": 0.0135, + "macro_f1": 0.6666666865348816, + "num_tokens": 4396446.0, + "repeat_count": 1.0, + "routers_loss": 0.009532532654702663, + "skip_count": 0.0, + "step": 2726, + "text_loss": 0.23775041103363037 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.091796875, + "learning_rate": 0.0008858478364686776, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 4399977.0, + "repeat_count": 1.0, + "routers_loss": 0.008062181062996387, + "skip_count": 0.0, + "step": 2728, + "text_loss": 0.18888695538043976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.817141179923686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008856509125320678, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 4404406.0, + "repeat_count": 0.0, + "routers_loss": 0.0007731119985692203, + "skip_count": 0.0, + "step": 2730, + "text_loss": 0.47331541776657104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008854538408133006, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 4407165.0, + "repeat_count": 0.0, + "routers_loss": 0.003115242812782526, + "skip_count": 1.0, + "step": 2732, + "text_loss": 0.491370290517807 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008852566213878947, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4410101.0, + "repeat_count": 0.0, + "routers_loss": 0.0008958528051152825, + "skip_count": 0.0, + "step": 2734, + "text_loss": 0.42188262939453125 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 12.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0008850592543314246, + "loss": 0.0118, + "macro_f1": 1.0, + "num_tokens": 4413015.0, + "repeat_count": 1.0, + "routers_loss": 0.01139112375676632, + "skip_count": 1.0, + "step": 2736, + "text_loss": 0.4716498553752899 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.854710889345466, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0008848617397195218, + "loss": 0.0084, + "macro_f1": 0.6603773832321167, + "num_tokens": 4416404.0, + "repeat_count": 1.0, + "routers_loss": 0.01609630137681961, + "skip_count": 1.0, + "step": 2738, + "text_loss": 0.19490821659564972 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008846640776278745, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 4419408.0, + "repeat_count": 0.0, + "routers_loss": 0.001489170710556209, + "skip_count": 0.0, + "step": 2740, + "text_loss": 0.6443108320236206 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0008844662681322269, + "loss": 0.0144, + "macro_f1": 0.6666666865348816, + "num_tokens": 4422067.0, + "repeat_count": 1.0, + "routers_loss": 0.0014755792217329144, + "skip_count": 0.0, + "step": 2742, + "text_loss": 0.9150356650352478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0008842683113083801, + "loss": 0.0149, + "macro_f1": 0.6666666865348816, + "num_tokens": 4425647.0, + "repeat_count": 0.0, + "routers_loss": 0.008962674997746944, + "skip_count": 1.0, + "step": 2744, + "text_loss": 0.7103227972984314 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 12.892280598767243, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0008840702072321915, + "loss": 0.0104, + "macro_f1": 0.6598639488220215, + "num_tokens": 4428855.0, + "repeat_count": 1.0, + "routers_loss": 0.02554207295179367, + "skip_count": 3.0, + "step": 2746, + "text_loss": 0.27141591906547546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0008838719559795751, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4432838.0, + "repeat_count": 0.0, + "routers_loss": 0.0011747616808861494, + "skip_count": 0.0, + "step": 2748, + "text_loss": 0.4007738530635834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 12.911065453478134, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008836735576265009, + "loss": 0.0073, + "macro_f1": 0.5492662787437439, + "num_tokens": 4435793.0, + "repeat_count": 0.0, + "routers_loss": 0.017564335837960243, + "skip_count": 2.0, + "step": 2750, + "text_loss": 0.5972410440444946 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0008834750122489956, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 4438871.0, + "repeat_count": 1.0, + "routers_loss": 0.007004009559750557, + "skip_count": 0.0, + "step": 2752, + "text_loss": 0.2294853925704956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008832763199231423, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 4441846.0, + "repeat_count": 0.0, + "routers_loss": 0.0014562139986082911, + "skip_count": 0.0, + "step": 2754, + "text_loss": 0.722432017326355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.939242735544468, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0008830774807250802, + "loss": 0.013, + "macro_f1": 0.3272727429866791, + "num_tokens": 4444786.0, + "repeat_count": 1.0, + "routers_loss": 0.024773593991994858, + "skip_count": 0.0, + "step": 2756, + "text_loss": 0.507905125617981 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 12.948635162899912, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008828784947310049, + "loss": 0.0129, + "macro_f1": 0.8823530077934265, + "num_tokens": 4448442.0, + "repeat_count": 1.0, + "routers_loss": 0.04959975928068161, + "skip_count": 2.0, + "step": 2758, + "text_loss": 0.3617522418498993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 12.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.1025390625, + "learning_rate": 0.000882679362017168, + "loss": 0.0149, + "macro_f1": 1.0, + "num_tokens": 4451401.0, + "repeat_count": 1.0, + "routers_loss": 0.005783245898783207, + "skip_count": 2.0, + "step": 2760, + "text_loss": 0.49187400937080383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0791015625, + "learning_rate": 0.0008824800826598778, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 4454537.0, + "repeat_count": 0.0, + "routers_loss": 0.00656260596588254, + "skip_count": 0.0, + "step": 2762, + "text_loss": 0.6823583245277405 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 12.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0008822806567354983, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 4457706.0, + "repeat_count": 1.0, + "routers_loss": 0.005298966076225042, + "skip_count": 0.0, + "step": 2764, + "text_loss": 0.554322361946106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.986204872321691, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008820810843204501, + "loss": 0.0096, + "macro_f1": 0.3272727429866791, + "num_tokens": 4460710.0, + "repeat_count": 0.0, + "routers_loss": 0.03164982795715332, + "skip_count": 1.0, + "step": 2766, + "text_loss": 0.1656961441040039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 12.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0008818813654912095, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 4464001.0, + "repeat_count": 0.0, + "routers_loss": 0.000715116853825748, + "skip_count": 0.0, + "step": 2768, + "text_loss": 0.5818144083023071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008816815003243093, + "loss": 0.0133, + "macro_f1": 0.3333333432674408, + "num_tokens": 4467364.0, + "repeat_count": 0.0, + "routers_loss": 0.002851625671610236, + "skip_count": 0.0, + "step": 2770, + "text_loss": 0.6068631410598755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008814814888963383, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 4470681.0, + "repeat_count": 0.0, + "routers_loss": 0.004729873035103083, + "skip_count": 1.0, + "step": 2772, + "text_loss": 0.5386646389961243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.000881281331283941, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 4473734.0, + "repeat_count": 0.0, + "routers_loss": 0.0031853127293288708, + "skip_count": 1.0, + "step": 2774, + "text_loss": 0.5695263147354126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008810810275638182, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4478404.0, + "repeat_count": 0.0, + "routers_loss": 0.0008977465913631022, + "skip_count": 0.0, + "step": 2776, + "text_loss": 0.4750773310661316 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008808805778127269, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4481287.0, + "repeat_count": 0.0, + "routers_loss": 0.00469845999032259, + "skip_count": 0.0, + "step": 2778, + "text_loss": 0.14078612625598907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 13.051658350454945, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008806799821074796, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 4483929.0, + "repeat_count": 0.0, + "routers_loss": 0.01789761893451214, + "skip_count": 2.0, + "step": 2780, + "text_loss": 0.2167191207408905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008804792405249451, + "loss": 0.0123, + "macro_f1": 0.3333333432674408, + "num_tokens": 4487468.0, + "repeat_count": 0.0, + "routers_loss": 0.001018838956952095, + "skip_count": 0.0, + "step": 2782, + "text_loss": 0.5424665212631226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 13.070443205165835, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.07373046875, + "learning_rate": 0.000880278353142048, + "loss": 0.0077, + "macro_f1": 0.8200000524520874, + "num_tokens": 4490942.0, + "repeat_count": 1.0, + "routers_loss": 0.03260354697704315, + "skip_count": 3.0, + "step": 2784, + "text_loss": 0.20994654297828674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008800773200357683, + "loss": 0.0122, + "macro_f1": 0.3333333432674408, + "num_tokens": 4493986.0, + "repeat_count": 0.0, + "routers_loss": 0.003019835101440549, + "skip_count": 0.0, + "step": 2786, + "text_loss": 0.5709528923034668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008798761412831429, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 4498232.0, + "repeat_count": 0.0, + "routers_loss": 0.00285192858427763, + "skip_count": 0.0, + "step": 2788, + "text_loss": 0.5103896260261536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0008796748169612634, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4501231.0, + "repeat_count": 0.0, + "routers_loss": 0.0012469831854104996, + "skip_count": 0.0, + "step": 2790, + "text_loss": 0.43669697642326355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0008794733471472778, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4504208.0, + "repeat_count": 0.0, + "routers_loss": 0.011512776836752892, + "skip_count": 1.0, + "step": 2792, + "text_loss": 0.2299770563840866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008792717319183899, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 4507013.0, + "repeat_count": 0.0, + "routers_loss": 0.00834917277097702, + "skip_count": 0.0, + "step": 2794, + "text_loss": 0.2130603939294815 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008790699713518587, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 4510286.0, + "repeat_count": 0.0, + "routers_loss": 0.008616939187049866, + "skip_count": 2.0, + "step": 2796, + "text_loss": 0.4377101957798004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0008788680655249994, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4513762.0, + "repeat_count": 0.0, + "routers_loss": 0.003408568911254406, + "skip_count": 0.0, + "step": 2798, + "text_loss": 0.435138463973999 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008786660145151826, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 4516696.0, + "repeat_count": 1.0, + "routers_loss": 0.0029398901388049126, + "skip_count": 0.0, + "step": 2800, + "text_loss": 0.3195655047893524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008784638183998348, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4519760.0, + "repeat_count": 0.0, + "routers_loss": 0.0013777425047010183, + "skip_count": 0.0, + "step": 2802, + "text_loss": 0.8129430413246155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0008782614772564379, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4522106.0, + "repeat_count": 0.0, + "routers_loss": 0.0031694830395281315, + "skip_count": 0.0, + "step": 2804, + "text_loss": 0.18083660304546356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0008780589911625293, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 4525743.0, + "repeat_count": 0.0, + "routers_loss": 0.002161208540201187, + "skip_count": 0.0, + "step": 2806, + "text_loss": 0.8228182792663574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0008778563601957021, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 4529573.0, + "repeat_count": 0.0, + "routers_loss": 0.0028444856870919466, + "skip_count": 1.0, + "step": 2808, + "text_loss": 0.3715563118457794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008776535844336049, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4532452.0, + "repeat_count": 0.0, + "routers_loss": 0.003807213855907321, + "skip_count": 0.0, + "step": 2810, + "text_loss": 0.6012523174285889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0008774506639539417, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 4536077.0, + "repeat_count": 0.0, + "routers_loss": 0.006698979996144772, + "skip_count": 0.0, + "step": 2812, + "text_loss": 0.27097949385643005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0008772475988344722, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 4539057.0, + "repeat_count": 0.0, + "routers_loss": 0.004849409218877554, + "skip_count": 1.0, + "step": 2814, + "text_loss": 1.026973843574524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 13.22072204285295, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008770443891530109, + "loss": 0.0115, + "macro_f1": 0.5934640765190125, + "num_tokens": 4542253.0, + "repeat_count": 0.0, + "routers_loss": 0.019148651510477066, + "skip_count": 3.0, + "step": 2816, + "text_loss": 0.2717585563659668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.230114470208395, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008768410349874286, + "loss": 0.0098, + "macro_f1": 0.6601307392120361, + "num_tokens": 4545047.0, + "repeat_count": 1.0, + "routers_loss": 0.02231316640973091, + "skip_count": 2.0, + "step": 2818, + "text_loss": 0.274346262216568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008766375364156508, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 4548371.0, + "repeat_count": 0.0, + "routers_loss": 0.008014129474759102, + "skip_count": 2.0, + "step": 2820, + "text_loss": 0.22850871086120605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008764338935156586, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4551276.0, + "repeat_count": 0.0, + "routers_loss": 0.0014544493751600385, + "skip_count": 0.0, + "step": 2822, + "text_loss": 0.6308462023735046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000876230106365488, + "loss": 0.0123, + "macro_f1": 0.6666666865348816, + "num_tokens": 4554143.0, + "repeat_count": 0.0, + "routers_loss": 0.00818584579974413, + "skip_count": 3.0, + "step": 2824, + "text_loss": 0.3484207093715668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0008760261750432312, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 4557256.0, + "repeat_count": 0.0, + "routers_loss": 0.006275608204305172, + "skip_count": 3.0, + "step": 2826, + "text_loss": 0.1927330046892166 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008758220996270348, + "loss": 0.0103, + "macro_f1": 1.0, + "num_tokens": 4560202.0, + "repeat_count": 2.0, + "routers_loss": 0.0055974251590669155, + "skip_count": 2.0, + "step": 2828, + "text_loss": 0.7796496748924255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008756178801951007, + "loss": 0.0129, + "macro_f1": 0.3333333432674408, + "num_tokens": 4563508.0, + "repeat_count": 0.0, + "routers_loss": 0.0019799957517534494, + "skip_count": 0.0, + "step": 2830, + "text_loss": 0.49633297324180603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008754135168256865, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4566776.0, + "repeat_count": 0.0, + "routers_loss": 0.004538947716355324, + "skip_count": 0.0, + "step": 2832, + "text_loss": 0.5346745252609253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008752090095971044, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 4569787.0, + "repeat_count": 0.0, + "routers_loss": 0.001663343166001141, + "skip_count": 0.0, + "step": 2834, + "text_loss": 0.5524004697799683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.000875004358587722, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 4572813.0, + "repeat_count": 0.0, + "routers_loss": 0.0022988212294876575, + "skip_count": 0.0, + "step": 2836, + "text_loss": 0.4232870042324066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.000874799563875962, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 4575563.0, + "repeat_count": 0.0, + "routers_loss": 0.007781553082168102, + "skip_count": 1.0, + "step": 2838, + "text_loss": 0.19239822030067444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 13.333431171118287, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03515625, + "learning_rate": 0.0008745946255403021, + "loss": 0.0072, + "macro_f1": 0.5492662787437439, + "num_tokens": 4578117.0, + "repeat_count": 0.0, + "routers_loss": 0.01872488670051098, + "skip_count": 2.0, + "step": 2840, + "text_loss": 0.2148810178041458 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0008743895436592749, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 4582330.0, + "repeat_count": 1.0, + "routers_loss": 0.005634195636957884, + "skip_count": 1.0, + "step": 2842, + "text_loss": 0.4929640591144562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0008741843183114685, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4585765.0, + "repeat_count": 0.0, + "routers_loss": 0.0008928569150157273, + "skip_count": 0.0, + "step": 2844, + "text_loss": 0.32702967524528503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008739789495755253, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4589000.0, + "repeat_count": 0.0, + "routers_loss": 0.014715569093823433, + "skip_count": 4.0, + "step": 2846, + "text_loss": 0.25125816464424133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008737734375301433, + "loss": 0.0135, + "macro_f1": 0.3333333432674408, + "num_tokens": 4592391.0, + "repeat_count": 0.0, + "routers_loss": 0.0017551190685480833, + "skip_count": 0.0, + "step": 2848, + "text_loss": 0.6595172882080078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0008735677822540749, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 4596662.0, + "repeat_count": 0.0, + "routers_loss": 0.0006456313421949744, + "skip_count": 0.0, + "step": 2850, + "text_loss": 0.6290773153305054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0008733619838261276, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 4599682.0, + "repeat_count": 0.0, + "routers_loss": 0.00765060493722558, + "skip_count": 2.0, + "step": 2852, + "text_loss": 0.3268161416053772 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008731560423251637, + "loss": 0.01, + "macro_f1": 1.0, + "num_tokens": 4603324.0, + "repeat_count": 1.0, + "routers_loss": 0.01161442045122385, + "skip_count": 2.0, + "step": 2854, + "text_loss": 0.3029932975769043 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 13.408570589961844, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008729499578301005, + "loss": 0.0098, + "macro_f1": 0.9555556178092957, + "num_tokens": 4606975.0, + "repeat_count": 1.0, + "routers_loss": 0.02055389992892742, + "skip_count": 5.0, + "step": 2856, + "text_loss": 0.6268532872200012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.00087274373041991, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 4609629.0, + "repeat_count": 0.0, + "routers_loss": 0.0013911726418882608, + "skip_count": 0.0, + "step": 2858, + "text_loss": 0.534355640411377 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008725373601736188, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 4612913.0, + "repeat_count": 2.0, + "routers_loss": 0.01010701060295105, + "skip_count": 0.0, + "step": 2860, + "text_loss": 0.3391380310058594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0008723308471703085, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4616718.0, + "repeat_count": 0.0, + "routers_loss": 0.005969462916254997, + "skip_count": 1.0, + "step": 2862, + "text_loss": 0.47250816226005554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.446140299383622, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008721241914891152, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4619680.0, + "repeat_count": 0.0, + "routers_loss": 0.0027780034579336643, + "skip_count": 0.0, + "step": 2864, + "text_loss": 0.3249278664588928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.455532726739067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008719173932092295, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 4622700.0, + "repeat_count": 0.0, + "routers_loss": 0.0015912104863673449, + "skip_count": 0.0, + "step": 2866, + "text_loss": 0.7789985537528992 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05126953125, + "learning_rate": 0.0008717104524098973, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 4626637.0, + "repeat_count": 0.0, + "routers_loss": 0.0036539011634886265, + "skip_count": 0.0, + "step": 2868, + "text_loss": 0.619088351726532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.10400390625, + "learning_rate": 0.0008715033691704187, + "loss": 0.0118, + "macro_f1": 0.6666666865348816, + "num_tokens": 4629863.0, + "repeat_count": 0.0, + "routers_loss": 0.008402476087212563, + "skip_count": 1.0, + "step": 2870, + "text_loss": 0.5550018548965454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008712961435701479, + "loss": 0.0161, + "macro_f1": 0.6666666865348816, + "num_tokens": 4632657.0, + "repeat_count": 0.0, + "routers_loss": 0.01400839351117611, + "skip_count": 1.0, + "step": 2872, + "text_loss": 0.17368625104427338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008710887756884947, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 4635885.0, + "repeat_count": 0.0, + "routers_loss": 0.0014573842054232955, + "skip_count": 0.0, + "step": 2874, + "text_loss": 0.5138643383979797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008708812656049225, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 4639341.0, + "repeat_count": 0.0, + "routers_loss": 0.002810224425047636, + "skip_count": 1.0, + "step": 2876, + "text_loss": 0.70310378074646 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 13.511887290871735, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008706736133989497, + "loss": 0.0105, + "macro_f1": 0.9449735879898071, + "num_tokens": 4642163.0, + "repeat_count": 2.0, + "routers_loss": 0.029783209785819054, + "skip_count": 4.0, + "step": 2878, + "text_loss": 0.26898008584976196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008704658191501491, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 4645858.0, + "repeat_count": 0.0, + "routers_loss": 0.0009193966398015618, + "skip_count": 0.0, + "step": 2880, + "text_loss": 0.6047570705413818 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 13.530672145582624, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008702578829381475, + "loss": 0.0131, + "macro_f1": 0.8814815282821655, + "num_tokens": 4649237.0, + "repeat_count": 2.0, + "routers_loss": 0.05698608607053757, + "skip_count": 4.0, + "step": 2882, + "text_loss": 0.10695219784975052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0008700498048426269, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4652362.0, + "repeat_count": 0.0, + "routers_loss": 0.0011786938412114978, + "skip_count": 0.0, + "step": 2884, + "text_loss": 0.4442957937717438 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.549457000293513, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008698415849433229, + "loss": 0.0092, + "macro_f1": 0.5492662787437439, + "num_tokens": 4655616.0, + "repeat_count": 2.0, + "routers_loss": 0.02142646163702011, + "skip_count": 0.0, + "step": 2886, + "text_loss": 0.5820964574813843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008696332233200262, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 4659294.0, + "repeat_count": 0.0, + "routers_loss": 0.004038636106997728, + "skip_count": 0.0, + "step": 2888, + "text_loss": 0.11847645789384842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008694247200525806, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 4662512.0, + "repeat_count": 0.0, + "routers_loss": 0.0013256469974294305, + "skip_count": 0.0, + "step": 2890, + "text_loss": 0.4873582720756531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.577634282359847, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008692160752208856, + "loss": 0.0129, + "macro_f1": 0.3272727429866791, + "num_tokens": 4666190.0, + "repeat_count": 0.0, + "routers_loss": 0.04477972164750099, + "skip_count": 1.0, + "step": 2892, + "text_loss": 0.44243401288986206 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.09521484375, + "learning_rate": 0.0008690072889048941, + "loss": 0.0127, + "macro_f1": 1.0, + "num_tokens": 4668884.0, + "repeat_count": 1.0, + "routers_loss": 0.004407547414302826, + "skip_count": 2.0, + "step": 2894, + "text_loss": 0.6847127079963684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008687983611846133, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4672093.0, + "repeat_count": 0.0, + "routers_loss": 0.005245382897555828, + "skip_count": 1.0, + "step": 2896, + "text_loss": 0.25583332777023315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008685892921401049, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4674917.0, + "repeat_count": 0.0, + "routers_loss": 0.0010470855049788952, + "skip_count": 0.0, + "step": 2898, + "text_loss": 0.41998377442359924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008683800818514844, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 4677739.0, + "repeat_count": 0.0, + "routers_loss": 0.009026622399687767, + "skip_count": 2.0, + "step": 2900, + "text_loss": 0.303053081035614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.09619140625, + "learning_rate": 0.0008681707303989215, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 4680721.0, + "repeat_count": 0.0, + "routers_loss": 0.004500916693359613, + "skip_count": 0.0, + "step": 2902, + "text_loss": 0.5573288798332214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0008679612378626404, + "loss": 0.0098, + "macro_f1": 0.6666666865348816, + "num_tokens": 4683339.0, + "repeat_count": 0.0, + "routers_loss": 0.005047840531915426, + "skip_count": 1.0, + "step": 2904, + "text_loss": 0.321353554725647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.643381273847961, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008677516043229187, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 4686453.0, + "repeat_count": 0.0, + "routers_loss": 0.010256914421916008, + "skip_count": 1.0, + "step": 2906, + "text_loss": 0.4300784468650818 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008675418298600883, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 4689645.0, + "repeat_count": 1.0, + "routers_loss": 0.0022669637110084295, + "skip_count": 0.0, + "step": 2908, + "text_loss": 0.5064885020256042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008673319145545358, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 4692320.0, + "repeat_count": 0.0, + "routers_loss": 0.0011188550852239132, + "skip_count": 0.0, + "step": 2910, + "text_loss": 0.7114819884300232 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008671218584867003, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 4695116.0, + "repeat_count": 0.0, + "routers_loss": 0.002966561820358038, + "skip_count": 2.0, + "step": 2912, + "text_loss": 0.5662392973899841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0008669116617370762, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 4698040.0, + "repeat_count": 0.0, + "routers_loss": 0.0012894890969619155, + "skip_count": 0.0, + "step": 2914, + "text_loss": 0.718977689743042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1552734375, + "learning_rate": 0.0008667013243862111, + "loss": 0.0162, + "macro_f1": 0.3333333432674408, + "num_tokens": 4700963.0, + "repeat_count": 0.0, + "routers_loss": 0.0007232456118799746, + "skip_count": 0.0, + "step": 2916, + "text_loss": 0.3447718024253845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.699735837980628, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.000866490846514707, + "loss": 0.0075, + "macro_f1": 0.3272727429866791, + "num_tokens": 4704471.0, + "repeat_count": 1.0, + "routers_loss": 0.015166680328547955, + "skip_count": 0.0, + "step": 2918, + "text_loss": 0.454946368932724 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.000866280228203219, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 4707238.0, + "repeat_count": 1.0, + "routers_loss": 0.0061312485486269, + "skip_count": 1.0, + "step": 2920, + "text_loss": 0.721788227558136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008660694695324564, + "loss": 0.0125, + "macro_f1": 0.3333333432674408, + "num_tokens": 4711323.0, + "repeat_count": 0.0, + "routers_loss": 0.00169933564029634, + "skip_count": 0.0, + "step": 2922, + "text_loss": 0.7562121748924255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.727913120046962, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008658585705831829, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 4714417.0, + "repeat_count": 0.0, + "routers_loss": 0.0022731393110007048, + "skip_count": 0.0, + "step": 2924, + "text_loss": 0.5726147890090942 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.737305547402407, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.068359375, + "learning_rate": 0.0008656475314362148, + "loss": 0.0131, + "macro_f1": 0.8817967176437378, + "num_tokens": 4717445.0, + "repeat_count": 2.0, + "routers_loss": 0.06477782875299454, + "skip_count": 3.0, + "step": 2926, + "text_loss": 0.4505867660045624 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 13.74669797475785, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.06396484375, + "learning_rate": 0.0008654363521724229, + "loss": 0.0129, + "macro_f1": 0.9449735879898071, + "num_tokens": 4722253.0, + "repeat_count": 2.0, + "routers_loss": 0.027405790984630585, + "skip_count": 4.0, + "step": 2928, + "text_loss": 0.24767601490020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0008652250328727315, + "loss": 0.0112, + "macro_f1": 0.6666666865348816, + "num_tokens": 4725465.0, + "repeat_count": 0.0, + "routers_loss": 0.006544729229062796, + "skip_count": 2.0, + "step": 2930, + "text_loss": 0.4478724002838135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 13.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008650135736181184, + "loss": 0.0134, + "macro_f1": 0.6666666865348816, + "num_tokens": 4729213.0, + "repeat_count": 1.0, + "routers_loss": 0.0055119614116847515, + "skip_count": 0.0, + "step": 2932, + "text_loss": 0.6749323010444641 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008648019744896154, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 4732280.0, + "repeat_count": 0.0, + "routers_loss": 0.008374541997909546, + "skip_count": 0.0, + "step": 2934, + "text_loss": 0.4647359251976013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.78426768417963, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0008645902355683077, + "loss": 0.0091, + "macro_f1": 0.6595745086669922, + "num_tokens": 4736244.0, + "repeat_count": 1.0, + "routers_loss": 0.068686343729496, + "skip_count": 4.0, + "step": 2936, + "text_loss": 0.5356017351150513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008643783569353339, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4739810.0, + "repeat_count": 2.0, + "routers_loss": 0.017954571172595024, + "skip_count": 0.0, + "step": 2938, + "text_loss": 0.3145926296710968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.803052538890519, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054443359375, + "learning_rate": 0.0008641663386718863, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4742720.0, + "repeat_count": 0.0, + "routers_loss": 0.006261351052671671, + "skip_count": 1.0, + "step": 2940, + "text_loss": 0.3200613856315613 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008639541808592109, + "loss": 0.0093, + "macro_f1": 1.0, + "num_tokens": 4745870.0, + "repeat_count": 1.0, + "routers_loss": 0.0025341357104480267, + "skip_count": 1.0, + "step": 2942, + "text_loss": 0.5020416378974915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0008637418835786067, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4748943.0, + "repeat_count": 0.0, + "routers_loss": 0.008970048278570175, + "skip_count": 2.0, + "step": 2944, + "text_loss": 0.14517110586166382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008635294469114265, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 4751360.0, + "repeat_count": 0.0, + "routers_loss": 0.002133632078766823, + "skip_count": 0.0, + "step": 2946, + "text_loss": 0.5367856025695801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08837890625, + "learning_rate": 0.0008633168709390766, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 4754403.0, + "repeat_count": 0.0, + "routers_loss": 0.0011866620043292642, + "skip_count": 0.0, + "step": 2948, + "text_loss": 0.38302522897720337 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 13.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0008631041557430163, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 4757867.0, + "repeat_count": 2.0, + "routers_loss": 0.0026854004245251417, + "skip_count": 0.0, + "step": 2950, + "text_loss": 0.43433454632759094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0008628913014047585, + "loss": 0.0102, + "macro_f1": 0.3333333432674408, + "num_tokens": 4761171.0, + "repeat_count": 0.0, + "routers_loss": 0.002433479530736804, + "skip_count": 0.0, + "step": 2952, + "text_loss": 0.4725971519947052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.868799530378633, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008626783080058696, + "loss": 0.0066, + "macro_f1": 0.3272727429866791, + "num_tokens": 4764752.0, + "repeat_count": 1.0, + "routers_loss": 0.017182493582367897, + "skip_count": 0.0, + "step": 2954, + "text_loss": 0.460641473531723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.12353515625, + "learning_rate": 0.0008624651756279687, + "loss": 0.0198, + "macro_f1": 0.3333333432674408, + "num_tokens": 4767453.0, + "repeat_count": 0.0, + "routers_loss": 0.0018134774873033166, + "skip_count": 0.0, + "step": 2956, + "text_loss": 0.4091459810733795 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 13.887584385089522, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.000862251904352729, + "loss": 0.0108, + "macro_f1": 0.9259259104728699, + "num_tokens": 4771110.0, + "repeat_count": 3.0, + "routers_loss": 0.0365753099322319, + "skip_count": 3.0, + "step": 2958, + "text_loss": 0.22408585250377655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.896976812444967, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.000862038494261876, + "loss": 0.0109, + "macro_f1": 0.3272727429866791, + "num_tokens": 4774464.0, + "repeat_count": 0.0, + "routers_loss": 0.024343067780137062, + "skip_count": 1.0, + "step": 2960, + "text_loss": 0.16483014822006226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008618249454371891, + "loss": 0.01, + "macro_f1": 0.3333333432674408, + "num_tokens": 4777894.0, + "repeat_count": 0.0, + "routers_loss": 0.0008310087723657489, + "skip_count": 0.0, + "step": 2962, + "text_loss": 0.5573428869247437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008616112579605006, + "loss": 0.0117, + "macro_f1": 0.3333333432674408, + "num_tokens": 4781116.0, + "repeat_count": 0.0, + "routers_loss": 0.0065494864247739315, + "skip_count": 0.0, + "step": 2964, + "text_loss": 0.18816794455051422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0008613974319136957, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 4784886.0, + "repeat_count": 0.0, + "routers_loss": 0.0019726944155991077, + "skip_count": 0.0, + "step": 2966, + "text_loss": 0.5097305774688721 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.0008611834673787134, + "loss": 0.0118, + "macro_f1": 0.3333333432674408, + "num_tokens": 4787563.0, + "repeat_count": 0.0, + "routers_loss": 0.006327496841549873, + "skip_count": 0.0, + "step": 2968, + "text_loss": 0.6953814029693604 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 13.94393894922219, + "f1_execute": 0.9600000381469727, + "f1_repeat": 0.5, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0008609693644375449, + "loss": 0.0086, + "macro_f1": 0.8200000524520874, + "num_tokens": 4790421.0, + "repeat_count": 3.0, + "routers_loss": 0.042896661907434464, + "skip_count": 1.0, + "step": 2970, + "text_loss": 0.2573051154613495 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 13.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.14453125, + "learning_rate": 0.000860755123172235, + "loss": 0.0096, + "macro_f1": 1.0, + "num_tokens": 4793786.0, + "repeat_count": 2.0, + "routers_loss": 0.013228793628513813, + "skip_count": 1.0, + "step": 2972, + "text_loss": 0.46614497900009155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 13.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008605407436648815, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4796864.0, + "repeat_count": 0.0, + "routers_loss": 0.007294759154319763, + "skip_count": 2.0, + "step": 2974, + "text_loss": 0.21555091440677643 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 13.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008603262259976348, + "loss": 0.0129, + "macro_f1": 1.0, + "num_tokens": 4800080.0, + "repeat_count": 1.0, + "routers_loss": 0.0024024227168411016, + "skip_count": 5.0, + "step": 2976, + "text_loss": 0.7855485081672668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0008601115702526987, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4802899.0, + "repeat_count": 0.0, + "routers_loss": 0.001433031284250319, + "skip_count": 0.0, + "step": 2978, + "text_loss": 0.6777765154838562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 13.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008598967765123293, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 4805835.0, + "repeat_count": 0.0, + "routers_loss": 0.003073975909501314, + "skip_count": 0.0, + "step": 2980, + "text_loss": 0.5926910638809204 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 26.0, + "epoch": 14.0, + "f1_execute": 0.9333333373069763, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008596818448588364, + "loss": 0.0139, + "macro_f1": 0.8666667342185974, + "num_tokens": 4809028.0, + "repeat_count": 1.0, + "routers_loss": 0.06438573449850082, + "skip_count": 6.0, + "step": 2982, + "text_loss": 0.23975612223148346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.009392427355445, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0008594667753745821, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 4812831.0, + "repeat_count": 0.0, + "routers_loss": 0.014817612245678902, + "skip_count": 1.0, + "step": 2984, + "text_loss": 0.17292268574237823 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.018784854710889, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.07421875, + "learning_rate": 0.0008592515681419813, + "loss": 0.0078, + "macro_f1": 0.5492662787437439, + "num_tokens": 4816005.0, + "repeat_count": 2.0, + "routers_loss": 0.025407327339053154, + "skip_count": 0.0, + "step": 2986, + "text_loss": 0.6403061151504517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0008590362232435018, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 4818901.0, + "repeat_count": 0.0, + "routers_loss": 0.006826757453382015, + "skip_count": 0.0, + "step": 2988, + "text_loss": 0.2572069466114044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008588207407616644, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 4823120.0, + "repeat_count": 0.0, + "routers_loss": 0.0009054148104041815, + "skip_count": 0.0, + "step": 2990, + "text_loss": 0.4827076196670532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.046962136777223, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0008586051207790422, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 4825774.0, + "repeat_count": 0.0, + "routers_loss": 0.0012294676853343844, + "skip_count": 0.0, + "step": 2992, + "text_loss": 0.40157821774482727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 14.056354564132668, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052734375, + "learning_rate": 0.0008583893633782612, + "loss": 0.0084, + "macro_f1": 0.5492662787437439, + "num_tokens": 4828841.0, + "repeat_count": 0.0, + "routers_loss": 0.011474622413516045, + "skip_count": 2.0, + "step": 2994, + "text_loss": 0.14842072129249573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.058837890625, + "learning_rate": 0.0008581734686419999, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4831458.0, + "repeat_count": 0.0, + "routers_loss": 0.009154081344604492, + "skip_count": 2.0, + "step": 2996, + "text_loss": 0.365400105714798 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.075139418843557, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00085795743665299, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4834609.0, + "repeat_count": 0.0, + "routers_loss": 0.002899336162954569, + "skip_count": 0.0, + "step": 2998, + "text_loss": 0.5574684143066406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008577412674940152, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 4838324.0, + "repeat_count": 0.0, + "routers_loss": 0.0034664268605411053, + "skip_count": 0.0, + "step": 3000, + "text_loss": 0.6752855777740479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008575249612479117, + "loss": 0.0127, + "macro_f1": 0.6666666865348816, + "num_tokens": 4841877.0, + "repeat_count": 0.0, + "routers_loss": 0.0036425739526748657, + "skip_count": 2.0, + "step": 3002, + "text_loss": 0.6332980394363403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.103316700909891, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.0008573085179975685, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 4845840.0, + "repeat_count": 0.0, + "routers_loss": 0.0013783496106043458, + "skip_count": 0.0, + "step": 3004, + "text_loss": 0.4219617545604706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008570919378259274, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 4848766.0, + "repeat_count": 0.0, + "routers_loss": 0.004823608323931694, + "skip_count": 1.0, + "step": 3006, + "text_loss": 0.7987180948257446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000856875220815982, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4852310.0, + "repeat_count": 0.0, + "routers_loss": 0.0014760984340682626, + "skip_count": 0.0, + "step": 3008, + "text_loss": 0.35592713952064514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.131493982976226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008566583670507788, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4856146.0, + "repeat_count": 0.0, + "routers_loss": 0.0031717263627797365, + "skip_count": 1.0, + "step": 3010, + "text_loss": 0.19379083812236786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.140886410331671, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008564413766134164, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 4859386.0, + "repeat_count": 0.0, + "routers_loss": 0.003361492184922099, + "skip_count": 0.0, + "step": 3012, + "text_loss": 0.39129266142845154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0008562242495870463, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4862661.0, + "repeat_count": 0.0, + "routers_loss": 0.0010563990799710155, + "skip_count": 0.0, + "step": 3014, + "text_loss": 0.5966938734054565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0008560069860548716, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 4865410.0, + "repeat_count": 0.0, + "routers_loss": 0.001233913702890277, + "skip_count": 0.0, + "step": 3016, + "text_loss": 0.3386077880859375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.169063692398003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008557895861001484, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 4868931.0, + "repeat_count": 0.0, + "routers_loss": 0.0018066301709041, + "skip_count": 0.0, + "step": 3018, + "text_loss": 0.5222050547599792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.178456119753449, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008555720498061845, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 4873492.0, + "repeat_count": 0.0, + "routers_loss": 0.0050385501235723495, + "skip_count": 1.0, + "step": 3020, + "text_loss": 0.4558849334716797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.187848547108894, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008553543772563403, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 4877026.0, + "repeat_count": 0.0, + "routers_loss": 0.004828717093914747, + "skip_count": 0.0, + "step": 3022, + "text_loss": 0.36598992347717285 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 14.197240974464338, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.06103515625, + "learning_rate": 0.0008551365685340285, + "loss": 0.0084, + "macro_f1": 0.9555556178092957, + "num_tokens": 4879655.0, + "repeat_count": 1.0, + "routers_loss": 0.02049369551241398, + "skip_count": 5.0, + "step": 3024, + "text_loss": 0.5069093704223633 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 14.206633401819783, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008549186237227138, + "loss": 0.0088, + "macro_f1": 0.8823530077934265, + "num_tokens": 4882606.0, + "repeat_count": 1.0, + "routers_loss": 0.03947242721915245, + "skip_count": 2.0, + "step": 3026, + "text_loss": 0.2600715458393097 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 14.216025829175228, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0008547005429059128, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 4885246.0, + "repeat_count": 2.0, + "routers_loss": 0.0026363315992057323, + "skip_count": 0.0, + "step": 3028, + "text_loss": 0.37642326951026917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.225418256530672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0008544823261671948, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 4888109.0, + "repeat_count": 0.0, + "routers_loss": 0.003858231008052826, + "skip_count": 0.0, + "step": 3030, + "text_loss": 0.5875385999679565 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 14.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0008542639735901804, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 4891168.0, + "repeat_count": 1.0, + "routers_loss": 0.004789089784026146, + "skip_count": 1.0, + "step": 3032, + "text_loss": 0.6417325139045715 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.244203111241562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008540454852585434, + "loss": 0.0115, + "macro_f1": 0.6666666865348816, + "num_tokens": 4894355.0, + "repeat_count": 0.0, + "routers_loss": 0.007334680762141943, + "skip_count": 2.0, + "step": 3034, + "text_loss": 0.23697198927402496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 27.0, + "epoch": 14.253595538597006, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.5, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008538268612560084, + "loss": 0.0058, + "macro_f1": 0.4871794879436493, + "num_tokens": 4897543.0, + "repeat_count": 0.0, + "routers_loss": 0.022096361964941025, + "skip_count": 3.0, + "step": 3036, + "text_loss": 0.1989550143480301 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.262987965952451, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0008536081016663527, + "loss": 0.0101, + "macro_f1": 1.0, + "num_tokens": 4900752.0, + "repeat_count": 1.0, + "routers_loss": 0.0037680594250559807, + "skip_count": 2.0, + "step": 3038, + "text_loss": 0.5001366138458252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008533892065734055, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 4903581.0, + "repeat_count": 0.0, + "routers_loss": 0.0032373068388551474, + "skip_count": 1.0, + "step": 3040, + "text_loss": 0.5019411444664001 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008531701760610476, + "loss": 0.0121, + "macro_f1": 1.0, + "num_tokens": 4907108.0, + "repeat_count": 1.0, + "routers_loss": 0.0078013185411691666, + "skip_count": 2.0, + "step": 3042, + "text_loss": 0.3460627794265747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.3333333432674408, + "avg_layers": 28.0, + "epoch": 14.291165248018785, + "f1_execute": 0.9600000381469727, + "f1_repeat": 1.0, + "f1_skip": 0.5, + "grad_norm": 0.04833984375, + "learning_rate": 0.000852951010213212, + "loss": 0.0089, + "macro_f1": 0.8200000524520874, + "num_tokens": 4911269.0, + "repeat_count": 1.0, + "routers_loss": 0.03576689213514328, + "skip_count": 3.0, + "step": 3044, + "text_loss": 0.268994003534317 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 14.300557675374229, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0008527317091138835, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 4914203.0, + "repeat_count": 1.0, + "routers_loss": 0.0032140621915459633, + "skip_count": 1.0, + "step": 3046, + "text_loss": 0.9998719692230225 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.309950102729674, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008525122728470987, + "loss": 0.0102, + "macro_f1": 1.0, + "num_tokens": 4918562.0, + "repeat_count": 1.0, + "routers_loss": 0.008559177629649639, + "skip_count": 3.0, + "step": 3048, + "text_loss": 0.3062439560890198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0008522927014969459, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 4921940.0, + "repeat_count": 0.0, + "routers_loss": 0.008735597133636475, + "skip_count": 2.0, + "step": 3050, + "text_loss": 0.3637430965900421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0008520729951475652, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 4925416.0, + "repeat_count": 0.0, + "routers_loss": 0.0012709591537714005, + "skip_count": 0.0, + "step": 3052, + "text_loss": 0.542036235332489 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.338127384796008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008518531538831488, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4928695.0, + "repeat_count": 0.0, + "routers_loss": 0.0010660928674042225, + "skip_count": 1.0, + "step": 3054, + "text_loss": 0.43144503235816956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.059326171875, + "learning_rate": 0.00085163317778794, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 4931504.0, + "repeat_count": 0.0, + "routers_loss": 0.004558971151709557, + "skip_count": 2.0, + "step": 3056, + "text_loss": 0.5257010459899902 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008514130669462341, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 4934935.0, + "repeat_count": 0.0, + "routers_loss": 0.010774781927466393, + "skip_count": 2.0, + "step": 3058, + "text_loss": 0.26061776280403137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.366304666862343, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008511928214423782, + "loss": 0.0103, + "macro_f1": 0.6601307392120361, + "num_tokens": 4938047.0, + "repeat_count": 1.0, + "routers_loss": 0.014763157814741135, + "skip_count": 2.0, + "step": 3060, + "text_loss": 0.2856905460357666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.375697094217786, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0008509724413607705, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 4941041.0, + "repeat_count": 1.0, + "routers_loss": 0.004613345488905907, + "skip_count": 0.0, + "step": 3062, + "text_loss": 0.2870287001132965 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.385089521573232, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008507519267858612, + "loss": 0.015, + "macro_f1": 1.0, + "num_tokens": 4944708.0, + "repeat_count": 1.0, + "routers_loss": 0.008584189228713512, + "skip_count": 2.0, + "step": 3064, + "text_loss": 0.15828095376491547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.394481948928677, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0008505312778021519, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 4948295.0, + "repeat_count": 0.0, + "routers_loss": 0.0014670816017314792, + "skip_count": 0.0, + "step": 3066, + "text_loss": 0.36697930097579956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0008503104944941958, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 4951983.0, + "repeat_count": 0.0, + "routers_loss": 0.005348859820514917, + "skip_count": 2.0, + "step": 3068, + "text_loss": 0.21612997353076935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008500895769465972, + "loss": 0.0111, + "macro_f1": 0.3333333432674408, + "num_tokens": 4955023.0, + "repeat_count": 0.0, + "routers_loss": 0.0013203793205320835, + "skip_count": 0.0, + "step": 3070, + "text_loss": 0.9757798314094543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.422659230995011, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008498685252440124, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 4957600.0, + "repeat_count": 0.0, + "routers_loss": 0.006907356437295675, + "skip_count": 0.0, + "step": 3072, + "text_loss": 0.356107234954834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.432051658350455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061279296875, + "learning_rate": 0.0008496473394711487, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 4960746.0, + "repeat_count": 0.0, + "routers_loss": 0.0027704904787242413, + "skip_count": 1.0, + "step": 3074, + "text_loss": 0.6812908053398132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0008494260197127649, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 4963845.0, + "repeat_count": 0.0, + "routers_loss": 0.0036796489730477333, + "skip_count": 2.0, + "step": 3076, + "text_loss": 0.7215370535850525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0008492045660536712, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 4966887.0, + "repeat_count": 0.0, + "routers_loss": 0.0037137691397219896, + "skip_count": 1.0, + "step": 3078, + "text_loss": 0.8700299859046936 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 14.460228940416789, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008489829785787291, + "loss": 0.0078, + "macro_f1": 0.8823530077934265, + "num_tokens": 4969859.0, + "repeat_count": 1.0, + "routers_loss": 0.016492314636707306, + "skip_count": 2.0, + "step": 3080, + "text_loss": 0.6520360112190247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008487612573728513, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 4972628.0, + "repeat_count": 0.0, + "routers_loss": 0.004022917244583368, + "skip_count": 2.0, + "step": 3082, + "text_loss": 0.17498187720775604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008485394025210016, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 4975475.0, + "repeat_count": 0.0, + "routers_loss": 0.009141159243881702, + "skip_count": 1.0, + "step": 3084, + "text_loss": 0.5975366234779358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.488406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008483174141081956, + "loss": 0.0113, + "macro_f1": 0.3333333432674408, + "num_tokens": 4978858.0, + "repeat_count": 0.0, + "routers_loss": 0.0031561285723000765, + "skip_count": 0.0, + "step": 3086, + "text_loss": 0.18748866021633148 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.497798649838568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008480952922194991, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 4982142.0, + "repeat_count": 0.0, + "routers_loss": 0.0007894713780842721, + "skip_count": 0.0, + "step": 3088, + "text_loss": 0.42083197832107544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008478730369400302, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 4984872.0, + "repeat_count": 0.0, + "routers_loss": 0.0005908289458602667, + "skip_count": 0.0, + "step": 3090, + "text_loss": 0.45337188243865967 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.516583504549457, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0008476506483549573, + "loss": 0.0101, + "macro_f1": 1.0, + "num_tokens": 4988137.0, + "repeat_count": 1.0, + "routers_loss": 0.0016509373672306538, + "skip_count": 2.0, + "step": 3092, + "text_loss": 0.6397262811660767 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0008474281265495002, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 4991164.0, + "repeat_count": 0.0, + "routers_loss": 0.004088304936885834, + "skip_count": 1.0, + "step": 3094, + "text_loss": 0.18352322280406952 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008472054716089295, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 4993876.0, + "repeat_count": 0.0, + "routers_loss": 0.005200014915317297, + "skip_count": 0.0, + "step": 3096, + "text_loss": 0.2776511013507843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.544760786615791, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0008469826836185673, + "loss": 0.01, + "macro_f1": 0.6601307392120361, + "num_tokens": 4997068.0, + "repeat_count": 1.0, + "routers_loss": 0.012686059810221195, + "skip_count": 2.0, + "step": 3098, + "text_loss": 0.23209233582019806 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008467597626637858, + "loss": 0.0074, + "macro_f1": 1.0, + "num_tokens": 5000038.0, + "repeat_count": 1.0, + "routers_loss": 0.006401528604328632, + "skip_count": 2.0, + "step": 3100, + "text_loss": 0.45936745405197144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008465367088300093, + "loss": 0.0075, + "macro_f1": 0.3272727429866791, + "num_tokens": 5002870.0, + "repeat_count": 0.0, + "routers_loss": 0.016640547662973404, + "skip_count": 1.0, + "step": 3102, + "text_loss": 0.44502779841423035 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.572938068682125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0008463135222027124, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 5006357.0, + "repeat_count": 0.0, + "routers_loss": 0.008411331102252007, + "skip_count": 2.0, + "step": 3104, + "text_loss": 0.3414570391178131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.582330496037569, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008460902028674204, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5009059.0, + "repeat_count": 0.0, + "routers_loss": 0.0010406570509076118, + "skip_count": 0.0, + "step": 3106, + "text_loss": 0.5931221842765808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0008458667509097098, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 5012327.0, + "repeat_count": 0.0, + "routers_loss": 0.001959054498001933, + "skip_count": 0.0, + "step": 3108, + "text_loss": 0.5191171169281006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.0008456431664152078, + "loss": 0.0127, + "macro_f1": 0.3333333432674408, + "num_tokens": 5015472.0, + "repeat_count": 0.0, + "routers_loss": 0.000994380097836256, + "skip_count": 0.0, + "step": 3110, + "text_loss": 0.4455361068248749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.610507778103903, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0008454194494695923, + "loss": 0.0109, + "macro_f1": 0.3333333432674408, + "num_tokens": 5018901.0, + "repeat_count": 0.0, + "routers_loss": 0.0037662344984710217, + "skip_count": 0.0, + "step": 3112, + "text_loss": 0.5335362553596497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 14.619900205459349, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0008451956001585923, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5022520.0, + "repeat_count": 0.0, + "routers_loss": 0.008664715103805065, + "skip_count": 3.0, + "step": 3114, + "text_loss": 0.16230148077011108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.629292632814794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.000844971618567987, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 5025505.0, + "repeat_count": 0.0, + "routers_loss": 0.0015904927859082818, + "skip_count": 0.0, + "step": 3116, + "text_loss": 0.6989432573318481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.638685060170237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008447475047836068, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 5028767.0, + "repeat_count": 0.0, + "routers_loss": 0.005853322334587574, + "skip_count": 1.0, + "step": 3118, + "text_loss": 0.31420737504959106 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 14.648077487525683, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008445232588913325, + "loss": 0.0115, + "macro_f1": 0.3272727429866791, + "num_tokens": 5032577.0, + "repeat_count": 0.0, + "routers_loss": 0.012760105542838573, + "skip_count": 0.0, + "step": 3120, + "text_loss": 0.5534627437591553 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008442988809770953, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 5035381.0, + "repeat_count": 0.0, + "routers_loss": 0.0022257440723478794, + "skip_count": 0.0, + "step": 3122, + "text_loss": 0.42492759227752686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.666862342236572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008440743711268775, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 5038743.0, + "repeat_count": 0.0, + "routers_loss": 0.004648433532565832, + "skip_count": 0.0, + "step": 3124, + "text_loss": 0.16404685378074646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008438497294267117, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5041492.0, + "repeat_count": 0.0, + "routers_loss": 0.006313877180218697, + "skip_count": 0.0, + "step": 3126, + "text_loss": 0.23191484808921814 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07666015625, + "learning_rate": 0.0008436249559626807, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 5043955.0, + "repeat_count": 1.0, + "routers_loss": 0.0036270488053560257, + "skip_count": 0.0, + "step": 3128, + "text_loss": 0.5782018303871155 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.695039624302906, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008434000508209187, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5047571.0, + "repeat_count": 0.0, + "routers_loss": 0.003809858812019229, + "skip_count": 1.0, + "step": 3130, + "text_loss": 0.7129825949668884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.704432051658351, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0008431750140876092, + "loss": 0.0128, + "macro_f1": 0.3333333432674408, + "num_tokens": 5051608.0, + "repeat_count": 0.0, + "routers_loss": 0.0022369057405740023, + "skip_count": 0.0, + "step": 3132, + "text_loss": 0.4433445930480957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.713824479013795, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.000842949845848987, + "loss": 0.0135, + "macro_f1": 0.32098764181137085, + "num_tokens": 5054656.0, + "repeat_count": 0.0, + "routers_loss": 0.0425117202103138, + "skip_count": 2.0, + "step": 3134, + "text_loss": 0.38721024990081787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0008427245461913368, + "loss": 0.0121, + "macro_f1": 0.3333333432674408, + "num_tokens": 5059108.0, + "repeat_count": 0.0, + "routers_loss": 0.0018077283166348934, + "skip_count": 0.0, + "step": 3136, + "text_loss": 0.7496368885040283 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.12109375, + "learning_rate": 0.0008424991152009941, + "loss": 0.0111, + "macro_f1": 1.0, + "num_tokens": 5062371.0, + "repeat_count": 1.0, + "routers_loss": 0.008801834657788277, + "skip_count": 2.0, + "step": 3138, + "text_loss": 0.5337086319923401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 14.742001761080129, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0008422735529643444, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5065593.0, + "repeat_count": 0.0, + "routers_loss": 0.00548676960170269, + "skip_count": 3.0, + "step": 3140, + "text_loss": 0.2561623156070709 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.751394188435574, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0008420478595678233, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5068271.0, + "repeat_count": 0.0, + "routers_loss": 0.006389956455677748, + "skip_count": 0.0, + "step": 3142, + "text_loss": 0.15605193376541138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.760786615791018, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0008418220350979175, + "loss": 0.0128, + "macro_f1": 1.0, + "num_tokens": 5071358.0, + "repeat_count": 1.0, + "routers_loss": 0.012387622147798538, + "skip_count": 2.0, + "step": 3144, + "text_loss": 0.3085838258266449 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008415960796411628, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5075584.0, + "repeat_count": 0.0, + "routers_loss": 0.00311864772811532, + "skip_count": 1.0, + "step": 3146, + "text_loss": 0.4786977469921112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.779571470501908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1591796875, + "learning_rate": 0.0008413699932841461, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5078388.0, + "repeat_count": 0.0, + "routers_loss": 0.0030679800547659397, + "skip_count": 0.0, + "step": 3148, + "text_loss": 0.5222916603088379 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.788963897857352, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008411437761135039, + "loss": 0.011, + "macro_f1": 1.0, + "num_tokens": 5081584.0, + "repeat_count": 1.0, + "routers_loss": 0.012907958589494228, + "skip_count": 2.0, + "step": 3150, + "text_loss": 0.5369884371757507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0008409174282159232, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 5084450.0, + "repeat_count": 0.0, + "routers_loss": 0.012314042076468468, + "skip_count": 2.0, + "step": 3152, + "text_loss": 0.25685277581214905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.807748752568243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.000840690949678141, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 5087865.0, + "repeat_count": 1.0, + "routers_loss": 0.00899206381291151, + "skip_count": 0.0, + "step": 3154, + "text_loss": 0.1717093288898468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.817141179923686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0008404643405869441, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 5090857.0, + "repeat_count": 0.0, + "routers_loss": 0.0013312003575265408, + "skip_count": 0.0, + "step": 3156, + "text_loss": 0.27446436882019043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.826533607279131, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1533203125, + "learning_rate": 0.0008402376010291695, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 5093917.0, + "repeat_count": 0.0, + "routers_loss": 0.002653320087119937, + "skip_count": 0.0, + "step": 3158, + "text_loss": 0.4237489402294159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008400107310917045, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5096656.0, + "repeat_count": 0.0, + "routers_loss": 0.012976993806660175, + "skip_count": 2.0, + "step": 3160, + "text_loss": 0.42361980676651 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.000839783730861486, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5099582.0, + "repeat_count": 0.0, + "routers_loss": 0.006936746649444103, + "skip_count": 2.0, + "step": 3162, + "text_loss": 0.26656073331832886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008395566004255008, + "loss": 0.0127, + "macro_f1": 0.6666666865348816, + "num_tokens": 5102908.0, + "repeat_count": 0.0, + "routers_loss": 0.006619359832257032, + "skip_count": 1.0, + "step": 3164, + "text_loss": 0.590774416923523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0008393293398707858, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 5105829.0, + "repeat_count": 0.0, + "routers_loss": 0.010120268911123276, + "skip_count": 2.0, + "step": 3166, + "text_loss": 0.605930507183075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.873495744056354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008391019492844275, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 5109850.0, + "repeat_count": 0.0, + "routers_loss": 0.004940980114042759, + "skip_count": 2.0, + "step": 3168, + "text_loss": 0.12973152101039886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0008388744287535627, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 5113353.0, + "repeat_count": 0.0, + "routers_loss": 0.0031777634285390377, + "skip_count": 1.0, + "step": 3170, + "text_loss": 0.18577200174331665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0008386467783653775, + "loss": 0.0103, + "macro_f1": 0.3333333432674408, + "num_tokens": 5116421.0, + "repeat_count": 0.0, + "routers_loss": 0.005431659985333681, + "skip_count": 0.0, + "step": 3172, + "text_loss": 0.2302747517824173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 14.901673026122689, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.000838418998207108, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5119457.0, + "repeat_count": 0.0, + "routers_loss": 0.0077286697924137115, + "skip_count": 4.0, + "step": 3174, + "text_loss": 0.19606637954711914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0008381910883660399, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5123201.0, + "repeat_count": 0.0, + "routers_loss": 0.003982985392212868, + "skip_count": 0.0, + "step": 3176, + "text_loss": 0.716376006603241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 14.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.09423828125, + "learning_rate": 0.0008379630489295089, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 5126035.0, + "repeat_count": 0.0, + "routers_loss": 0.005626026075333357, + "skip_count": 1.0, + "step": 3178, + "text_loss": 0.5144625902175903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.929850308189023, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008377348799849, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 5129179.0, + "repeat_count": 0.0, + "routers_loss": 0.015458245761692524, + "skip_count": 2.0, + "step": 3180, + "text_loss": 0.29887503385543823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 14.939242735544468, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.062255859375, + "learning_rate": 0.0008375065816196479, + "loss": 0.0086, + "macro_f1": 0.5492662787437439, + "num_tokens": 5132149.0, + "repeat_count": 0.0, + "routers_loss": 0.012210468761622906, + "skip_count": 2.0, + "step": 3182, + "text_loss": 0.8981851935386658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.948635162899912, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008372781539212371, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5135287.0, + "repeat_count": 0.0, + "routers_loss": 0.0052537876181304455, + "skip_count": 0.0, + "step": 3184, + "text_loss": 0.4245666563510895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 14.958027590255357, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0008370495969772014, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5138589.0, + "repeat_count": 0.0, + "routers_loss": 0.012873421423137188, + "skip_count": 2.0, + "step": 3186, + "text_loss": 0.40581050515174866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 14.9674200176108, + "f1_execute": 0.95652174949646, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07470703125, + "learning_rate": 0.0008368209108751244, + "loss": 0.0127, + "macro_f1": 0.6521739363670349, + "num_tokens": 5141635.0, + "repeat_count": 2.0, + "routers_loss": 0.07720445841550827, + "skip_count": 4.0, + "step": 3188, + "text_loss": 0.3755173981189728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0008365920957026389, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5144728.0, + "repeat_count": 0.0, + "routers_loss": 0.001440995605662465, + "skip_count": 0.0, + "step": 3190, + "text_loss": 0.5067034363746643 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 14.986204872321691, + "f1_execute": 0.9615384340286255, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008363631515474275, + "loss": 0.0089, + "macro_f1": 0.6538461446762085, + "num_tokens": 5147963.0, + "repeat_count": 1.0, + "routers_loss": 0.018752984702587128, + "skip_count": 2.0, + "step": 3192, + "text_loss": 0.20224551856517792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 14.995597299677135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0008361340784972217, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 5151184.0, + "repeat_count": 0.0, + "routers_loss": 0.0005360354552976787, + "skip_count": 0.0, + "step": 3194, + "text_loss": 0.4588058292865753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.004696213677722, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008359048766398031, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5153889.0, + "repeat_count": 0.0, + "routers_loss": 0.0009184491937048733, + "skip_count": 1.0, + "step": 3196, + "text_loss": 0.2980220317840576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.014088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.000835675546063002, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5156758.0, + "repeat_count": 0.0, + "routers_loss": 0.001252970308996737, + "skip_count": 0.0, + "step": 3198, + "text_loss": 0.6775755882263184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008354460868546985, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 5160247.0, + "repeat_count": 0.0, + "routers_loss": 0.0037315806839615107, + "skip_count": 0.0, + "step": 3200, + "text_loss": 0.35867011547088623 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0008352164991028217, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 5163456.0, + "repeat_count": 1.0, + "routers_loss": 0.001497485558502376, + "skip_count": 0.0, + "step": 3202, + "text_loss": 0.690290093421936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.042265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008349867828953501, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 5166139.0, + "repeat_count": 0.0, + "routers_loss": 0.001051135826855898, + "skip_count": 0.0, + "step": 3204, + "text_loss": 0.3340415954589844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.051658350454945, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0008347569383203113, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 5169009.0, + "repeat_count": 0.0, + "routers_loss": 0.0010544003453105688, + "skip_count": 0.0, + "step": 3206, + "text_loss": 0.8584878444671631 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008345269654657823, + "loss": 0.0085, + "macro_f1": 1.0, + "num_tokens": 5172618.0, + "repeat_count": 1.0, + "routers_loss": 0.007312417030334473, + "skip_count": 1.0, + "step": 3208, + "text_loss": 0.19500218331813812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.070443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008342968644198892, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 5175857.0, + "repeat_count": 0.0, + "routers_loss": 0.00276504410430789, + "skip_count": 0.0, + "step": 3210, + "text_loss": 0.5446314215660095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.079835632521279, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0008340666352708068, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 5178585.0, + "repeat_count": 0.0, + "routers_loss": 0.002669303445145488, + "skip_count": 0.0, + "step": 3212, + "text_loss": 0.3687484860420227 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0008338362781067596, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5181777.0, + "repeat_count": 0.0, + "routers_loss": 0.0031585274264216423, + "skip_count": 0.0, + "step": 3214, + "text_loss": 0.27325859665870667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.000833605793016021, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 5184312.0, + "repeat_count": 0.0, + "routers_loss": 0.008807534351944923, + "skip_count": 2.0, + "step": 3216, + "text_loss": 0.4466548562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.108012914587613, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008333751800869133, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5187497.0, + "repeat_count": 0.0, + "routers_loss": 0.003171310294419527, + "skip_count": 0.0, + "step": 3218, + "text_loss": 0.5423526763916016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.117405341943059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0008331444394078076, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 5190982.0, + "repeat_count": 0.0, + "routers_loss": 0.0016481258207932115, + "skip_count": 2.0, + "step": 3220, + "text_loss": 0.48984917998313904 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.126797769298504, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000832913571067124, + "loss": 0.0107, + "macro_f1": 1.0, + "num_tokens": 5194044.0, + "repeat_count": 1.0, + "routers_loss": 0.003957313951104879, + "skip_count": 1.0, + "step": 3222, + "text_loss": 0.4533331096172333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.136190196653947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008326825751533322, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5197092.0, + "repeat_count": 0.0, + "routers_loss": 0.0016904744552448392, + "skip_count": 0.0, + "step": 3224, + "text_loss": 0.5538802742958069 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0008324514517549501, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5199941.0, + "repeat_count": 0.0, + "routers_loss": 0.005608258303254843, + "skip_count": 1.0, + "step": 3226, + "text_loss": 0.416242778301239 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 15.154975051364836, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008322202009605444, + "loss": 0.0072, + "macro_f1": 0.8823530077934265, + "num_tokens": 5202618.0, + "repeat_count": 1.0, + "routers_loss": 0.020965175703167915, + "skip_count": 2.0, + "step": 3228, + "text_loss": 0.17496295273303986 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 15.164367478720282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008319888228587311, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 5206414.0, + "repeat_count": 1.0, + "routers_loss": 0.021259209141135216, + "skip_count": 5.0, + "step": 3230, + "text_loss": 0.22471418976783752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0008317573175381745, + "loss": 0.0115, + "macro_f1": 0.3333333432674408, + "num_tokens": 5209768.0, + "repeat_count": 0.0, + "routers_loss": 0.0018647604156285524, + "skip_count": 0.0, + "step": 3232, + "text_loss": 0.4415269196033478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008315256850875881, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5213257.0, + "repeat_count": 0.0, + "routers_loss": 0.002345515415072441, + "skip_count": 0.0, + "step": 3234, + "text_loss": 0.347247838973999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 15.192544760786616, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008312939255957336, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 5215800.0, + "repeat_count": 0.0, + "routers_loss": 0.007112892810255289, + "skip_count": 3.0, + "step": 3236, + "text_loss": 0.31091734766960144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.201937188142061, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008310620391514219, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5219205.0, + "repeat_count": 0.0, + "routers_loss": 0.00432228296995163, + "skip_count": 0.0, + "step": 3238, + "text_loss": 0.3421775996685028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0008308300258435124, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 5222422.0, + "repeat_count": 0.0, + "routers_loss": 0.0076514314860105515, + "skip_count": 2.0, + "step": 3240, + "text_loss": 0.22378318011760712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008305978857609128, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 5225625.0, + "repeat_count": 0.0, + "routers_loss": 0.0007617069641128182, + "skip_count": 0.0, + "step": 3242, + "text_loss": 0.5880323648452759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0008303656189925799, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5229113.0, + "repeat_count": 0.0, + "routers_loss": 0.0017418119823560119, + "skip_count": 0.0, + "step": 3244, + "text_loss": 0.3302813768386841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.239506897563839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008301332256275183, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5232061.0, + "repeat_count": 0.0, + "routers_loss": 0.0026667986530810595, + "skip_count": 0.0, + "step": 3246, + "text_loss": 0.5679706335067749 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.248899324919284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0008299007057547821, + "loss": 0.0106, + "macro_f1": 1.0, + "num_tokens": 5235279.0, + "repeat_count": 1.0, + "routers_loss": 0.011016624979674816, + "skip_count": 2.0, + "step": 3248, + "text_loss": 0.5081504583358765 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.258291752274728, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0008296680594634731, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 5239655.0, + "repeat_count": 1.0, + "routers_loss": 0.005492044147104025, + "skip_count": 0.0, + "step": 3250, + "text_loss": 0.14675180613994598 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0008294352868427418, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5243579.0, + "repeat_count": 0.0, + "routers_loss": 0.00404445780441165, + "skip_count": 1.0, + "step": 3252, + "text_loss": 0.4201085865497589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.277076606985618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0008292023879817871, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 5247059.0, + "repeat_count": 0.0, + "routers_loss": 0.006886140909045935, + "skip_count": 1.0, + "step": 3254, + "text_loss": 0.2289208322763443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.286469034341062, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008289693629698564, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 5249940.0, + "repeat_count": 0.0, + "routers_loss": 0.0005736657767556608, + "skip_count": 0.0, + "step": 3256, + "text_loss": 0.5670450925827026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.295861461696507, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0008287362118962452, + "loss": 0.006, + "macro_f1": 0.3272727429866791, + "num_tokens": 5253580.0, + "repeat_count": 0.0, + "routers_loss": 0.011349895037710667, + "skip_count": 1.0, + "step": 3258, + "text_loss": 0.5042323470115662 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.305253889051952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0008285029348502973, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5257080.0, + "repeat_count": 0.0, + "routers_loss": 0.0013626761501654983, + "skip_count": 0.0, + "step": 3260, + "text_loss": 0.3227672874927521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.314646316407396, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0008282695319214053, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5259951.0, + "repeat_count": 0.0, + "routers_loss": 0.00471635302528739, + "skip_count": 0.0, + "step": 3262, + "text_loss": 0.20773714780807495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.324038743762841, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008280360031990093, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 5263314.0, + "repeat_count": 0.0, + "routers_loss": 0.010472415015101433, + "skip_count": 2.0, + "step": 3264, + "text_loss": 0.34397366642951965 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.333431171118287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.000827802348772598, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 5267358.0, + "repeat_count": 0.0, + "routers_loss": 0.0007814752752892673, + "skip_count": 0.0, + "step": 3266, + "text_loss": 0.747342586517334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008275685687317084, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5270400.0, + "repeat_count": 0.0, + "routers_loss": 0.000902949133887887, + "skip_count": 0.0, + "step": 3268, + "text_loss": 0.43782034516334534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008273346631659252, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5273147.0, + "repeat_count": 0.0, + "routers_loss": 0.00043462219764478505, + "skip_count": 0.0, + "step": 3270, + "text_loss": 0.6358205080032349 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.361608453184619, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008271006321648816, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 5277638.0, + "repeat_count": 0.0, + "routers_loss": 0.002211218234151602, + "skip_count": 0.0, + "step": 3272, + "text_loss": 0.20220105350017548 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.371000880540064, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008268664758182589, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5280638.0, + "repeat_count": 1.0, + "routers_loss": 0.010536720044910908, + "skip_count": 0.0, + "step": 3274, + "text_loss": 0.7579061388969421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008266321942157859, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5283847.0, + "repeat_count": 0.0, + "routers_loss": 0.0017158017726615071, + "skip_count": 0.0, + "step": 3276, + "text_loss": 0.669302761554718 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.389785735250953, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008263977874472399, + "loss": 0.0088, + "macro_f1": 0.9544159770011902, + "num_tokens": 5286627.0, + "repeat_count": 5.0, + "routers_loss": 0.011220700107514858, + "skip_count": 4.0, + "step": 3278, + "text_loss": 0.8703984022140503 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.399178162606399, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008261632556024461, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5289766.0, + "repeat_count": 0.0, + "routers_loss": 0.0020442772656679153, + "skip_count": 0.0, + "step": 3280, + "text_loss": 0.5009346008300781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10107421875, + "learning_rate": 0.0008259285987712774, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5293010.0, + "repeat_count": 0.0, + "routers_loss": 0.005645765457302332, + "skip_count": 0.0, + "step": 3282, + "text_loss": 0.2546011209487915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008256938170436549, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 5296732.0, + "repeat_count": 0.0, + "routers_loss": 0.0027385836001485586, + "skip_count": 2.0, + "step": 3284, + "text_loss": 0.5244000554084778 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.427355444672733, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008254589105095473, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 5299926.0, + "repeat_count": 1.0, + "routers_loss": 0.007451715879142284, + "skip_count": 1.0, + "step": 3286, + "text_loss": 0.28979742527008057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0008252238792589711, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5303006.0, + "repeat_count": 0.0, + "routers_loss": 0.004805843345820904, + "skip_count": 2.0, + "step": 3288, + "text_loss": 0.5131978392601013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.446140299383622, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000824988723381991, + "loss": 0.0091, + "macro_f1": 0.3272727429866791, + "num_tokens": 5306953.0, + "repeat_count": 0.0, + "routers_loss": 0.010639613494277, + "skip_count": 1.0, + "step": 3290, + "text_loss": 0.4901447296142578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 15.455532726739067, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008247534429687191, + "loss": 0.007, + "macro_f1": 0.5492662787437439, + "num_tokens": 5310516.0, + "repeat_count": 0.0, + "routers_loss": 0.013625577092170715, + "skip_count": 2.0, + "step": 3292, + "text_loss": 0.2124534696340561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008245180381093152, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 5313959.0, + "repeat_count": 0.0, + "routers_loss": 0.004958513658493757, + "skip_count": 1.0, + "step": 3294, + "text_loss": 0.46682238578796387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008242825088939867, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5316609.0, + "repeat_count": 0.0, + "routers_loss": 0.003962756600230932, + "skip_count": 0.0, + "step": 3296, + "text_loss": 0.7010108232498169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.483710008805401, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0008240468554129892, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5319638.0, + "repeat_count": 0.0, + "routers_loss": 0.0006996620795689523, + "skip_count": 0.0, + "step": 3298, + "text_loss": 0.4966355860233307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.493102436160845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0008238110777566255, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 5323019.0, + "repeat_count": 0.0, + "routers_loss": 0.0016031896229833364, + "skip_count": 0.0, + "step": 3300, + "text_loss": 0.38668957352638245 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0008235751760152459, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 5326099.0, + "repeat_count": 2.0, + "routers_loss": 0.00344281829893589, + "skip_count": 2.0, + "step": 3302, + "text_loss": 0.5330720543861389 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.511887290871735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008233391502792484, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5328993.0, + "repeat_count": 0.0, + "routers_loss": 0.007886730134487152, + "skip_count": 1.0, + "step": 3304, + "text_loss": 0.5470269322395325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.521279718227179, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0008231030006390786, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 5331554.0, + "repeat_count": 0.0, + "routers_loss": 0.008180000819265842, + "skip_count": 1.0, + "step": 3306, + "text_loss": 0.4023340344429016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0008228667271852294, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 5335712.0, + "repeat_count": 0.0, + "routers_loss": 0.0002942821884062141, + "skip_count": 0.0, + "step": 3308, + "text_loss": 0.5306711792945862 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008226303300082414, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 5338701.0, + "repeat_count": 0.0, + "routers_loss": 0.0006134595023468137, + "skip_count": 0.0, + "step": 3310, + "text_loss": 0.5906263589859009 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.549457000293513, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0008223938091987022, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5342274.0, + "repeat_count": 0.0, + "routers_loss": 0.0016656654188409448, + "skip_count": 0.0, + "step": 3312, + "text_loss": 0.5201764106750488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.558849427648958, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008221571648472472, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5345185.0, + "repeat_count": 0.0, + "routers_loss": 0.0038612703792750835, + "skip_count": 0.0, + "step": 3314, + "text_loss": 0.36633720993995667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.568241855004402, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008219203970445589, + "loss": 0.011, + "macro_f1": 0.3272727429866791, + "num_tokens": 5348804.0, + "repeat_count": 0.0, + "routers_loss": 0.009782899171113968, + "skip_count": 1.0, + "step": 3316, + "text_loss": 0.3117460012435913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.577634282359847, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008216835058813672, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 5351896.0, + "repeat_count": 0.0, + "routers_loss": 0.007713229861110449, + "skip_count": 0.0, + "step": 3318, + "text_loss": 0.253496378660202 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008214464914484492, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 5355058.0, + "repeat_count": 0.0, + "routers_loss": 0.006227815989404917, + "skip_count": 2.0, + "step": 3320, + "text_loss": 0.32693132758140564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008212093538366292, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 5358365.0, + "repeat_count": 0.0, + "routers_loss": 0.002601418411359191, + "skip_count": 0.0, + "step": 3322, + "text_loss": 0.40394455194473267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 15.605811564426181, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000820972093136779, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5360981.0, + "repeat_count": 0.0, + "routers_loss": 0.005545300897210836, + "skip_count": 3.0, + "step": 3324, + "text_loss": 0.6758295893669128 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0008207347094398172, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 5364018.0, + "repeat_count": 1.0, + "routers_loss": 0.001924700103700161, + "skip_count": 0.0, + "step": 3326, + "text_loss": 0.5196860432624817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0008204972028367097, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5366986.0, + "repeat_count": 0.0, + "routers_loss": 0.012254828587174416, + "skip_count": 1.0, + "step": 3328, + "text_loss": 0.24661913514137268 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.633988846492516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0008202595734184694, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5371463.0, + "repeat_count": 0.0, + "routers_loss": 0.005094083491712809, + "skip_count": 0.0, + "step": 3330, + "text_loss": 0.2525769770145416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.643381273847961, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008200218212761566, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 5374823.0, + "repeat_count": 1.0, + "routers_loss": 0.0025883198250085115, + "skip_count": 0.0, + "step": 3332, + "text_loss": 0.21849912405014038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.000819783946500878, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5377640.0, + "repeat_count": 0.0, + "routers_loss": 0.008240507915616035, + "skip_count": 0.0, + "step": 3334, + "text_loss": 0.2662734091281891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 15.66216612855885, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.050537109375, + "learning_rate": 0.000819545949183788, + "loss": 0.01, + "macro_f1": 0.5934640765190125, + "num_tokens": 5380593.0, + "repeat_count": 0.0, + "routers_loss": 0.038378193974494934, + "skip_count": 3.0, + "step": 3336, + "text_loss": 0.2431795746088028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.671558555914293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008193078294160874, + "loss": 0.0097, + "macro_f1": 1.0, + "num_tokens": 5384487.0, + "repeat_count": 1.0, + "routers_loss": 0.005926199723035097, + "skip_count": 1.0, + "step": 3338, + "text_loss": 0.5663705468177795 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.680950983269739, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0008190695872890242, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5387511.0, + "repeat_count": 0.0, + "routers_loss": 0.010842559859156609, + "skip_count": 2.0, + "step": 3340, + "text_loss": 0.11517292261123657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.690343410625184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0008188312228938933, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 5390698.0, + "repeat_count": 0.0, + "routers_loss": 0.001304097007960081, + "skip_count": 0.0, + "step": 3342, + "text_loss": 0.4827076196670532 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.699735837980628, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008185927363220363, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5393778.0, + "repeat_count": 1.0, + "routers_loss": 0.005354117136448622, + "skip_count": 0.0, + "step": 3344, + "text_loss": 0.44467049837112427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008183541276648418, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5396925.0, + "repeat_count": 0.0, + "routers_loss": 0.004800073802471161, + "skip_count": 2.0, + "step": 3346, + "text_loss": 0.2032834142446518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.718520692691518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0008181153970137449, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 5400522.0, + "repeat_count": 0.0, + "routers_loss": 0.0021674633026123047, + "skip_count": 0.0, + "step": 3348, + "text_loss": 0.4507528841495514 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.727913120046962, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0008178765444602278, + "loss": 0.0117, + "macro_f1": 0.8820862174034119, + "num_tokens": 5403526.0, + "repeat_count": 2.0, + "routers_loss": 0.04263930395245552, + "skip_count": 2.0, + "step": 3350, + "text_loss": 0.3606615960597992 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 15.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008176375700958194, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5407127.0, + "repeat_count": 1.0, + "routers_loss": 0.006953123956918716, + "skip_count": 0.0, + "step": 3352, + "text_loss": 0.2290353775024414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008173984740120948, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 5410829.0, + "repeat_count": 0.0, + "routers_loss": 0.0014363783411681652, + "skip_count": 0.0, + "step": 3354, + "text_loss": 0.4220392405986786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.756090402113296, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0008171592563006762, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5414152.0, + "repeat_count": 0.0, + "routers_loss": 0.00202389364130795, + "skip_count": 1.0, + "step": 3356, + "text_loss": 0.37729766964912415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.765482829468741, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008169199170532323, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 5417312.0, + "repeat_count": 0.0, + "routers_loss": 0.006253739818930626, + "skip_count": 2.0, + "step": 3358, + "text_loss": 0.1304289996623993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0008166804563614785, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 5421227.0, + "repeat_count": 2.0, + "routers_loss": 0.01622140221297741, + "skip_count": 2.0, + "step": 3360, + "text_loss": 0.298664391040802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0008164408743171763, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 5424646.0, + "repeat_count": 1.0, + "routers_loss": 0.0037176944315433502, + "skip_count": 2.0, + "step": 3362, + "text_loss": 0.12147632241249084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008162011710121339, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 5427897.0, + "repeat_count": 0.0, + "routers_loss": 0.0020403533708304167, + "skip_count": 1.0, + "step": 3364, + "text_loss": 0.2656533420085907 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.803052538890519, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008159613465382066, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5430474.0, + "repeat_count": 0.0, + "routers_loss": 0.0018634048756211996, + "skip_count": 0.0, + "step": 3366, + "text_loss": 0.9133086204528809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.812444966245964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0008157214009872951, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 5433113.0, + "repeat_count": 0.0, + "routers_loss": 0.012944488786160946, + "skip_count": 2.0, + "step": 3368, + "text_loss": 0.24352453649044037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05712890625, + "learning_rate": 0.0008154813344513472, + "loss": 0.0143, + "macro_f1": 0.6666666865348816, + "num_tokens": 5436259.0, + "repeat_count": 0.0, + "routers_loss": 0.002347963862121105, + "skip_count": 2.0, + "step": 3370, + "text_loss": 0.7601244449615479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0008152411470223568, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 5439126.0, + "repeat_count": 0.0, + "routers_loss": 0.0016609140438959002, + "skip_count": 0.0, + "step": 3372, + "text_loss": 0.5551947355270386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.840622248312298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008150008387923643, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 5442739.0, + "repeat_count": 0.0, + "routers_loss": 0.008321396075189114, + "skip_count": 0.0, + "step": 3374, + "text_loss": 0.25028282403945923 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 15.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08544921875, + "learning_rate": 0.000814760409853456, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 5445247.0, + "repeat_count": 2.0, + "routers_loss": 0.009738070890307426, + "skip_count": 1.0, + "step": 3376, + "text_loss": 0.37271201610565186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008145198602977651, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5449044.0, + "repeat_count": 0.0, + "routers_loss": 0.0028421466704458, + "skip_count": 0.0, + "step": 3378, + "text_loss": 0.1458655595779419 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.868799530378633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11474609375, + "learning_rate": 0.0008142791902174701, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 5453063.0, + "repeat_count": 0.0, + "routers_loss": 0.0015170135302469134, + "skip_count": 0.0, + "step": 3380, + "text_loss": 0.5548722743988037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 15.878191957734076, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0008140383997047966, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 5455814.0, + "repeat_count": 0.0, + "routers_loss": 0.0022444510832428932, + "skip_count": 1.0, + "step": 3382, + "text_loss": 0.8034513592720032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.887584385089522, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000813797488852016, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5459392.0, + "repeat_count": 0.0, + "routers_loss": 0.00038578867679461837, + "skip_count": 0.0, + "step": 3384, + "text_loss": 0.6940088868141174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.896976812444967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008135564577514458, + "loss": 0.011, + "macro_f1": 0.3333333432674408, + "num_tokens": 5462413.0, + "repeat_count": 0.0, + "routers_loss": 0.0019727381877601147, + "skip_count": 0.0, + "step": 3386, + "text_loss": 0.5124650597572327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.099609375, + "learning_rate": 0.0008133153064954495, + "loss": 0.0107, + "macro_f1": 0.3333333432674408, + "num_tokens": 5465552.0, + "repeat_count": 0.0, + "routers_loss": 0.0019896167796105146, + "skip_count": 0.0, + "step": 3388, + "text_loss": 0.4292517900466919 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 15.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008130740351764367, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 5468573.0, + "repeat_count": 1.0, + "routers_loss": 0.0030118159484118223, + "skip_count": 1.0, + "step": 3390, + "text_loss": 0.48903173208236694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 15.925154094511301, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.000812832643886863, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5471547.0, + "repeat_count": 0.0, + "routers_loss": 0.005084246397018433, + "skip_count": 2.0, + "step": 3392, + "text_loss": 0.35789889097213745 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.934546521866745, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008125911327192299, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5474331.0, + "repeat_count": 0.0, + "routers_loss": 0.0008874498889781535, + "skip_count": 0.0, + "step": 3394, + "text_loss": 0.6267408728599548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008123495017660851, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5477633.0, + "repeat_count": 0.0, + "routers_loss": 0.001794386887922883, + "skip_count": 0.0, + "step": 3396, + "text_loss": 0.3701885938644409 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008121077511200221, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5481277.0, + "repeat_count": 0.0, + "routers_loss": 0.002140481723472476, + "skip_count": 0.0, + "step": 3398, + "text_loss": 0.6362857818603516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.962723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.00081186588087368, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 5484237.0, + "repeat_count": 0.0, + "routers_loss": 0.000867189432028681, + "skip_count": 0.0, + "step": 3400, + "text_loss": 1.0847382545471191 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0008116238911197442, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5487423.0, + "repeat_count": 0.0, + "routers_loss": 0.0029817656613886356, + "skip_count": 0.0, + "step": 3402, + "text_loss": 0.3813740313053131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008113817819509454, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5490155.0, + "repeat_count": 0.0, + "routers_loss": 0.0035141287371516228, + "skip_count": 0.0, + "step": 3404, + "text_loss": 0.2113083451986313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 15.990901085999413, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0008111395534600603, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5493415.0, + "repeat_count": 0.0, + "routers_loss": 0.003317659953609109, + "skip_count": 0.0, + "step": 3406, + "text_loss": 0.5869330167770386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008108972057399114, + "loss": 0.0123, + "macro_f1": 0.6666666865348816, + "num_tokens": 5496032.0, + "repeat_count": 0.0, + "routers_loss": 0.003833734430372715, + "skip_count": 2.0, + "step": 3408, + "text_loss": 0.2938928008079529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.11328125, + "learning_rate": 0.0008106547388833669, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 5498890.0, + "repeat_count": 0.0, + "routers_loss": 0.002622978063300252, + "skip_count": 1.0, + "step": 3410, + "text_loss": 0.3130980432033539 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008104121529833402, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 5502010.0, + "repeat_count": 1.0, + "routers_loss": 0.007447598036378622, + "skip_count": 0.0, + "step": 3412, + "text_loss": 0.4413072466850281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.000810169448132791, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5505212.0, + "repeat_count": 0.0, + "routers_loss": 0.0031087708193808794, + "skip_count": 1.0, + "step": 3414, + "text_loss": 0.2910428047180176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.037569709421778, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008099266244247243, + "loss": 0.0082, + "macro_f1": 0.3272727429866791, + "num_tokens": 5508755.0, + "repeat_count": 0.0, + "routers_loss": 0.02510393038392067, + "skip_count": 1.0, + "step": 3416, + "text_loss": 0.33022749423980713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0008096836819521903, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 5512034.0, + "repeat_count": 0.0, + "routers_loss": 0.0020537273958325386, + "skip_count": 1.0, + "step": 3418, + "text_loss": 0.4731218218803406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0008094406208082853, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5515707.0, + "repeat_count": 0.0, + "routers_loss": 0.004218162503093481, + "skip_count": 2.0, + "step": 3420, + "text_loss": 0.23429590463638306 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 16.065746991488112, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0869140625, + "learning_rate": 0.0008091974410861507, + "loss": 0.0069, + "macro_f1": 0.9265305995941162, + "num_tokens": 5518436.0, + "repeat_count": 1.0, + "routers_loss": 0.013488355092704296, + "skip_count": 3.0, + "step": 3422, + "text_loss": 0.45768749713897705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0008089541428789733, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 5522368.0, + "repeat_count": 0.0, + "routers_loss": 0.0010335417464375496, + "skip_count": 1.0, + "step": 3424, + "text_loss": 0.43423423171043396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0008087107262799855, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 5526061.0, + "repeat_count": 0.0, + "routers_loss": 0.002134323585778475, + "skip_count": 0.0, + "step": 3426, + "text_loss": 0.4031757414340973 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1318359375, + "learning_rate": 0.0008084671913824651, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5529284.0, + "repeat_count": 0.0, + "routers_loss": 0.0097216060385108, + "skip_count": 2.0, + "step": 3428, + "text_loss": 0.2836039960384369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.000808223538279735, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 5532159.0, + "repeat_count": 0.0, + "routers_loss": 0.001684269867837429, + "skip_count": 0.0, + "step": 3430, + "text_loss": 0.5804527401924133 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0008079797670651637, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 5536050.0, + "repeat_count": 1.0, + "routers_loss": 0.013918434269726276, + "skip_count": 1.0, + "step": 3432, + "text_loss": 0.31325826048851013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008077358778321647, + "loss": 0.011, + "macro_f1": 0.3333333432674408, + "num_tokens": 5538885.0, + "repeat_count": 0.0, + "routers_loss": 0.0007751787197776139, + "skip_count": 0.0, + "step": 3434, + "text_loss": 0.783108115196228 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.131493982976224, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008074918706741966, + "loss": 0.0063, + "macro_f1": 0.9262410998344421, + "num_tokens": 5541909.0, + "repeat_count": 3.0, + "routers_loss": 0.021819550544023514, + "skip_count": 2.0, + "step": 3436, + "text_loss": 0.6558083295822144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.14088641033167, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0008072477456847638, + "loss": 0.0057, + "macro_f1": 0.3272727429866791, + "num_tokens": 5545101.0, + "repeat_count": 1.0, + "routers_loss": 0.03309348225593567, + "skip_count": 0.0, + "step": 3438, + "text_loss": 0.9877075552940369 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008070035029574151, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 5548971.0, + "repeat_count": 1.0, + "routers_loss": 0.008696741424500942, + "skip_count": 1.0, + "step": 3440, + "text_loss": 0.24766330420970917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.000806759142585745, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 5552174.0, + "repeat_count": 0.0, + "routers_loss": 0.004240929149091244, + "skip_count": 3.0, + "step": 3442, + "text_loss": 0.37255001068115234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008065146646633927, + "loss": 0.0088, + "macro_f1": 0.6666666865348816, + "num_tokens": 5555005.0, + "repeat_count": 0.0, + "routers_loss": 0.014345484785735607, + "skip_count": 1.0, + "step": 3444, + "text_loss": 0.26157206296920776 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008062700692840428, + "loss": 0.0083, + "macro_f1": 1.0, + "num_tokens": 5559127.0, + "repeat_count": 1.0, + "routers_loss": 0.008315163664519787, + "skip_count": 2.0, + "step": 3446, + "text_loss": 0.21971040964126587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 16.187848547108892, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008060253565414246, + "loss": 0.009, + "macro_f1": 0.5934640765190125, + "num_tokens": 5562254.0, + "repeat_count": 0.0, + "routers_loss": 0.009582413360476494, + "skip_count": 3.0, + "step": 3448, + "text_loss": 0.6758295893669128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0008057805265293124, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 5565515.0, + "repeat_count": 0.0, + "routers_loss": 0.002429503947496414, + "skip_count": 0.0, + "step": 3450, + "text_loss": 0.696592390537262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0008055355793415257, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5568392.0, + "repeat_count": 0.0, + "routers_loss": 0.0007724192109890282, + "skip_count": 0.0, + "step": 3452, + "text_loss": 0.7092870473861694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0008052905150719285, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 5571090.0, + "repeat_count": 0.0, + "routers_loss": 0.0010859938338398933, + "skip_count": 0.0, + "step": 3454, + "text_loss": 0.6593860387802124 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008050453338144301, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 5574552.0, + "repeat_count": 1.0, + "routers_loss": 0.0030258705373853445, + "skip_count": 1.0, + "step": 3456, + "text_loss": 0.3479384481906891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008048000356629844, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 5577484.0, + "repeat_count": 0.0, + "routers_loss": 0.005052885971963406, + "skip_count": 2.0, + "step": 3458, + "text_loss": 0.21858671307563782 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0008045546207115901, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 5581605.0, + "repeat_count": 1.0, + "routers_loss": 0.009976249188184738, + "skip_count": 3.0, + "step": 3460, + "text_loss": 0.16868001222610474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0008043090890542904, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5584994.0, + "repeat_count": 0.0, + "routers_loss": 0.00270817126147449, + "skip_count": 0.0, + "step": 3462, + "text_loss": 0.785690426826477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0008040634407851739, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 5588067.0, + "repeat_count": 0.0, + "routers_loss": 0.0018436965765431523, + "skip_count": 0.0, + "step": 3464, + "text_loss": 0.5006644129753113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0008038176759983731, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5590789.0, + "repeat_count": 0.0, + "routers_loss": 0.008516279980540276, + "skip_count": 2.0, + "step": 3466, + "text_loss": 0.20963478088378906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0008035717947880659, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 5593472.0, + "repeat_count": 0.0, + "routers_loss": 0.0016293043736368418, + "skip_count": 0.0, + "step": 3468, + "text_loss": 0.7376078963279724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0008033257972484742, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 5596108.0, + "repeat_count": 0.0, + "routers_loss": 0.002364142332226038, + "skip_count": 0.0, + "step": 3470, + "text_loss": 0.5156455039978027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008030796834738649, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 5599103.0, + "repeat_count": 0.0, + "routers_loss": 0.008872323669493198, + "skip_count": 0.0, + "step": 3472, + "text_loss": 0.2996419668197632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008028334535585491, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 5602410.0, + "repeat_count": 0.0, + "routers_loss": 0.011508257128298283, + "skip_count": 3.0, + "step": 3474, + "text_loss": 0.25438693165779114 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0008025871075968827, + "loss": 0.0106, + "macro_f1": 1.0, + "num_tokens": 5605424.0, + "repeat_count": 2.0, + "routers_loss": 0.017225435003638268, + "skip_count": 2.0, + "step": 3476, + "text_loss": 0.2549574077129364 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.328734957440563, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0008023406456832657, + "loss": 0.0111, + "macro_f1": 0.9262410998344421, + "num_tokens": 5608266.0, + "repeat_count": 3.0, + "routers_loss": 0.039165645837783813, + "skip_count": 2.0, + "step": 3478, + "text_loss": 0.1797947734594345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0008020940679121429, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5611471.0, + "repeat_count": 0.0, + "routers_loss": 0.0009718866203911602, + "skip_count": 0.0, + "step": 3480, + "text_loss": 0.8267702460289001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008018473743780036, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5615046.0, + "repeat_count": 0.0, + "routers_loss": 0.006087122485041618, + "skip_count": 2.0, + "step": 3482, + "text_loss": 0.7267677187919617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000801600565175381, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 5618350.0, + "repeat_count": 0.0, + "routers_loss": 0.0007539413054473698, + "skip_count": 0.0, + "step": 3484, + "text_loss": 0.5910211801528931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008013536403988529, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 5621381.0, + "repeat_count": 0.0, + "routers_loss": 0.0008076327503658831, + "skip_count": 0.0, + "step": 3486, + "text_loss": 0.30616798996925354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 16.375697094217788, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008011066001430412, + "loss": 0.0086, + "macro_f1": 0.6122449040412903, + "num_tokens": 5624617.0, + "repeat_count": 0.0, + "routers_loss": 0.023835813626646996, + "skip_count": 4.0, + "step": 3488, + "text_loss": 0.3376443088054657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0008008594445026122, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 5627989.0, + "repeat_count": 0.0, + "routers_loss": 0.004226419143378735, + "skip_count": 2.0, + "step": 3490, + "text_loss": 0.8185343146324158 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.394481948928675, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008006121735722767, + "loss": 0.0084, + "macro_f1": 0.32098764181137085, + "num_tokens": 5632286.0, + "repeat_count": 0.0, + "routers_loss": 0.0366671048104763, + "skip_count": 2.0, + "step": 3492, + "text_loss": 0.2209547609090805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008003647874467892, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 5635368.0, + "repeat_count": 1.0, + "routers_loss": 0.012956378981471062, + "skip_count": 0.0, + "step": 3494, + "text_loss": 0.20468664169311523 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0008001172862209485, + "loss": 0.0103, + "macro_f1": 0.6666666865348816, + "num_tokens": 5638440.0, + "repeat_count": 1.0, + "routers_loss": 0.0017375422175973654, + "skip_count": 0.0, + "step": 3496, + "text_loss": 0.6647221446037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 16.42265923099501, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0007998696699895976, + "loss": 0.0091, + "macro_f1": 0.6592592597007751, + "num_tokens": 5641996.0, + "repeat_count": 1.0, + "routers_loss": 0.025240756571292877, + "skip_count": 5.0, + "step": 3498, + "text_loss": 0.23892143368721008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0007996219388476236, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5645071.0, + "repeat_count": 0.0, + "routers_loss": 0.007436830550432205, + "skip_count": 1.0, + "step": 3500, + "text_loss": 0.7580804228782654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0007993740928899571, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 5648175.0, + "repeat_count": 0.0, + "routers_loss": 0.001126602990552783, + "skip_count": 0.0, + "step": 3502, + "text_loss": 0.5281378626823425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0007991261322115737, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 5650973.0, + "repeat_count": 0.0, + "routers_loss": 0.0007907263352535665, + "skip_count": 0.0, + "step": 3504, + "text_loss": 0.25220927596092224 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.000798878056907492, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 5654252.0, + "repeat_count": 2.0, + "routers_loss": 0.006263538729399443, + "skip_count": 2.0, + "step": 3506, + "text_loss": 0.46569153666496277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0703125, + "learning_rate": 0.0007986298670727752, + "loss": 0.0098, + "macro_f1": 0.6666666865348816, + "num_tokens": 5657229.0, + "repeat_count": 0.0, + "routers_loss": 0.004049144219607115, + "skip_count": 3.0, + "step": 3508, + "text_loss": 0.15174436569213867 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 16.479013795127678, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0791015625, + "learning_rate": 0.0007983815628025301, + "loss": 0.0074, + "macro_f1": 0.9262410998344421, + "num_tokens": 5659974.0, + "repeat_count": 2.0, + "routers_loss": 0.0471976138651371, + "skip_count": 3.0, + "step": 3510, + "text_loss": 0.39072203636169434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.488406222483125, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000798133144191907, + "loss": 0.0082, + "macro_f1": 0.3272727429866791, + "num_tokens": 5662893.0, + "repeat_count": 0.0, + "routers_loss": 0.04030488431453705, + "skip_count": 1.0, + "step": 3512, + "text_loss": 0.3562147617340088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.0007978846113361009, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 5666476.0, + "repeat_count": 0.0, + "routers_loss": 0.007475079502910376, + "skip_count": 1.0, + "step": 3514, + "text_loss": 0.26518192887306213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.0007976359643303497, + "loss": 0.013, + "macro_f1": 0.6666666865348816, + "num_tokens": 5669647.0, + "repeat_count": 0.0, + "routers_loss": 0.00558585487306118, + "skip_count": 2.0, + "step": 3516, + "text_loss": 0.29284560680389404 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007973872032699354, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 5673491.0, + "repeat_count": 1.0, + "routers_loss": 0.0026981087867170572, + "skip_count": 1.0, + "step": 3518, + "text_loss": 0.35089045763015747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.000797138328250184, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 5676529.0, + "repeat_count": 1.0, + "routers_loss": 0.0027328627184033394, + "skip_count": 0.0, + "step": 3520, + "text_loss": 0.41077399253845215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 16.535368359260346, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007968893393664646, + "loss": 0.01, + "macro_f1": 0.6592592597007751, + "num_tokens": 5679987.0, + "repeat_count": 1.0, + "routers_loss": 0.02695014327764511, + "skip_count": 5.0, + "step": 3522, + "text_loss": 0.44942837953567505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007966402367141903, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 5683185.0, + "repeat_count": 0.0, + "routers_loss": 0.00817026849836111, + "skip_count": 2.0, + "step": 3524, + "text_loss": 0.14528048038482666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0007963910203888176, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 5686544.0, + "repeat_count": 0.0, + "routers_loss": 0.0021973433904349804, + "skip_count": 0.0, + "step": 3526, + "text_loss": 0.22358648478984833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.56354564132668, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0007961416904858469, + "loss": 0.0078, + "macro_f1": 0.3272727429866791, + "num_tokens": 5689579.0, + "repeat_count": 0.0, + "routers_loss": 0.033712416887283325, + "skip_count": 1.0, + "step": 3528, + "text_loss": 0.3083649277687073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007958922471008217, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5692869.0, + "repeat_count": 0.0, + "routers_loss": 0.011182719841599464, + "skip_count": 2.0, + "step": 3530, + "text_loss": 0.21288011968135834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0007956426903293292, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5696007.0, + "repeat_count": 0.0, + "routers_loss": 0.0015808293828740716, + "skip_count": 0.0, + "step": 3532, + "text_loss": 0.6068631410598755 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.591722923393014, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007953930202670001, + "loss": 0.0062, + "macro_f1": 0.5492662787437439, + "num_tokens": 5699474.0, + "repeat_count": 2.0, + "routers_loss": 0.03205178305506706, + "skip_count": 0.0, + "step": 3534, + "text_loss": 0.4317135512828827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007951432370095084, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 5703483.0, + "repeat_count": 0.0, + "routers_loss": 0.003518853336572647, + "skip_count": 0.0, + "step": 3536, + "text_loss": 0.5432273149490356 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 16.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007948933406525715, + "loss": 0.01, + "macro_f1": 1.0, + "num_tokens": 5707301.0, + "repeat_count": 1.0, + "routers_loss": 0.004982157610356808, + "skip_count": 1.0, + "step": 3538, + "text_loss": 0.40061065554618835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0007946433312919502, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5710847.0, + "repeat_count": 0.0, + "routers_loss": 0.003067734418436885, + "skip_count": 0.0, + "step": 3540, + "text_loss": 0.5396234393119812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 16.629292632814792, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007943932090234486, + "loss": 0.0097, + "macro_f1": 0.5492662787437439, + "num_tokens": 5713683.0, + "repeat_count": 0.0, + "routers_loss": 0.03728383034467697, + "skip_count": 2.0, + "step": 3542, + "text_loss": 0.18310914933681488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 16.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007941429739429138, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 5716397.0, + "repeat_count": 0.0, + "routers_loss": 0.0025092530995607376, + "skip_count": 3.0, + "step": 3544, + "text_loss": 0.5806207060813904 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007938926261462366, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5719984.0, + "repeat_count": 0.0, + "routers_loss": 0.002493767999112606, + "skip_count": 0.0, + "step": 3546, + "text_loss": 0.38606807589530945 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 16.657469914881126, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05078125, + "learning_rate": 0.0007936421657293507, + "loss": 0.0094, + "macro_f1": 0.8823530077934265, + "num_tokens": 5723571.0, + "repeat_count": 1.0, + "routers_loss": 0.014810923486948013, + "skip_count": 2.0, + "step": 3548, + "text_loss": 0.49558472633361816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0007933915927882327, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 5726405.0, + "repeat_count": 0.0, + "routers_loss": 0.00152928801253438, + "skip_count": 0.0, + "step": 3550, + "text_loss": 0.8674797415733337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000793140907418903, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 5729955.0, + "repeat_count": 0.0, + "routers_loss": 0.005522782914340496, + "skip_count": 2.0, + "step": 3552, + "text_loss": 0.3274473249912262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007928901097174248, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5733030.0, + "repeat_count": 0.0, + "routers_loss": 0.009207013063132763, + "skip_count": 2.0, + "step": 3554, + "text_loss": 0.18237128853797913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0007926391997799039, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5735978.0, + "repeat_count": 0.0, + "routers_loss": 0.00695531303063035, + "skip_count": 0.0, + "step": 3556, + "text_loss": 0.3266434967517853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007923881777024898, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 5738901.0, + "repeat_count": 0.0, + "routers_loss": 0.002743212040513754, + "skip_count": 1.0, + "step": 3558, + "text_loss": 0.4971913695335388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0007921370435813741, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 5741946.0, + "repeat_count": 1.0, + "routers_loss": 0.007037297356873751, + "skip_count": 0.0, + "step": 3560, + "text_loss": 0.5645473599433899 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007918857975127924, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5744987.0, + "repeat_count": 0.0, + "routers_loss": 0.0030746585689485073, + "skip_count": 0.0, + "step": 3562, + "text_loss": 0.17717665433883667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0007916344395930224, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 5747837.0, + "repeat_count": 0.0, + "routers_loss": 0.004522138275206089, + "skip_count": 0.0, + "step": 3564, + "text_loss": 0.7676118612289429 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.000791382969918385, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 5750716.0, + "repeat_count": 0.0, + "routers_loss": 0.0026240211445838213, + "skip_count": 0.0, + "step": 3566, + "text_loss": 0.4975173771381378 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.751394188435572, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.000791131388585244, + "loss": 0.011, + "macro_f1": 0.8820862174034119, + "num_tokens": 5754368.0, + "repeat_count": 2.0, + "routers_loss": 0.021831991150975227, + "skip_count": 2.0, + "step": 3568, + "text_loss": 0.9670342206954956 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0007908796956900055, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5757076.0, + "repeat_count": 1.0, + "routers_loss": 0.0017586691537871957, + "skip_count": 0.0, + "step": 3570, + "text_loss": 0.3057977259159088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.000790627891329119, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 5760613.0, + "repeat_count": 0.0, + "routers_loss": 0.005515786819159985, + "skip_count": 0.0, + "step": 3572, + "text_loss": 0.5860086679458618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007903759755990763, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 5763557.0, + "repeat_count": 0.0, + "routers_loss": 0.004096484277397394, + "skip_count": 0.0, + "step": 3574, + "text_loss": 0.17175781726837158 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.000790123948596412, + "loss": 0.0119, + "macro_f1": 0.6666666865348816, + "num_tokens": 5767430.0, + "repeat_count": 1.0, + "routers_loss": 0.005216122139245272, + "skip_count": 0.0, + "step": 3576, + "text_loss": 0.7520374059677124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0007898718104177031, + "loss": 0.0108, + "macro_f1": 0.3333333432674408, + "num_tokens": 5770175.0, + "repeat_count": 0.0, + "routers_loss": 0.0037980107590556145, + "skip_count": 0.0, + "step": 3578, + "text_loss": 0.18117885291576385 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007896195611595699, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 5773032.0, + "repeat_count": 0.0, + "routers_loss": 0.003672175807878375, + "skip_count": 2.0, + "step": 3580, + "text_loss": 0.7241058349609375 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0007893672009186744, + "loss": 0.0083, + "macro_f1": 1.0, + "num_tokens": 5776077.0, + "repeat_count": 1.0, + "routers_loss": 0.01229850109666586, + "skip_count": 3.0, + "step": 3582, + "text_loss": 0.29140418767929077 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007891147297917216, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 5779088.0, + "repeat_count": 1.0, + "routers_loss": 0.0035251814406365156, + "skip_count": 0.0, + "step": 3584, + "text_loss": 0.1727485954761505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055908203125, + "learning_rate": 0.000788862147875459, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 5782201.0, + "repeat_count": 0.0, + "routers_loss": 0.004725661128759384, + "skip_count": 2.0, + "step": 3586, + "text_loss": 0.43512848019599915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0007886094552666765, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5785039.0, + "repeat_count": 0.0, + "routers_loss": 0.005632172804325819, + "skip_count": 0.0, + "step": 3588, + "text_loss": 0.3534786105155945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0007883566520622062, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 5788017.0, + "repeat_count": 0.0, + "routers_loss": 0.006249965168535709, + "skip_count": 1.0, + "step": 3590, + "text_loss": 0.2089710384607315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0007881037383589229, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 5791168.0, + "repeat_count": 0.0, + "routers_loss": 0.0013797614956274629, + "skip_count": 0.0, + "step": 3592, + "text_loss": 0.4349329471588135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06982421875, + "learning_rate": 0.0007878507142537436, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 5793927.0, + "repeat_count": 0.0, + "routers_loss": 0.0019719740375876427, + "skip_count": 1.0, + "step": 3594, + "text_loss": 0.6087368726730347 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007875975798436274, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 5797214.0, + "repeat_count": 1.0, + "routers_loss": 0.0037070370744913816, + "skip_count": 0.0, + "step": 3596, + "text_loss": 0.4258122444152832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0007873443352255764, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5800691.0, + "repeat_count": 0.0, + "routers_loss": 0.008431311696767807, + "skip_count": 0.0, + "step": 3598, + "text_loss": 0.6006711721420288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0007870909804966337, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5804712.0, + "repeat_count": 0.0, + "routers_loss": 0.0017720256000757217, + "skip_count": 0.0, + "step": 3600, + "text_loss": 0.6055042743682861 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.911065453478134, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0007868375157538861, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 5807670.0, + "repeat_count": 1.0, + "routers_loss": 0.010697763413190842, + "skip_count": 0.0, + "step": 3602, + "text_loss": 0.8039056658744812 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0007865839410944611, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 5810880.0, + "repeat_count": 1.0, + "routers_loss": 0.0030022128485143185, + "skip_count": 0.0, + "step": 3604, + "text_loss": 0.596110463142395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 16.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0007863302566155295, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 5814171.0, + "repeat_count": 0.0, + "routers_loss": 0.006257854867726564, + "skip_count": 2.0, + "step": 3606, + "text_loss": 0.5700319409370422 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0007860764624143031, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 5817607.0, + "repeat_count": 1.0, + "routers_loss": 0.004838473163545132, + "skip_count": 0.0, + "step": 3608, + "text_loss": 0.8319530487060547 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 16.94863516289991, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08154296875, + "learning_rate": 0.0007858225585880369, + "loss": 0.0067, + "macro_f1": 0.8823530077934265, + "num_tokens": 5821452.0, + "repeat_count": 1.0, + "routers_loss": 0.02173662930727005, + "skip_count": 2.0, + "step": 3610, + "text_loss": 0.3738477826118469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007855685452340269, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 5824683.0, + "repeat_count": 0.0, + "routers_loss": 0.0032719180453568697, + "skip_count": 0.0, + "step": 3612, + "text_loss": 0.4054839015007019 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.967420017610802, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007853144224496118, + "loss": 0.0093, + "macro_f1": 0.3272727429866791, + "num_tokens": 5827860.0, + "repeat_count": 1.0, + "routers_loss": 0.032171256840229034, + "skip_count": 0.0, + "step": 3614, + "text_loss": 0.18112395703792572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 16.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0007850601903321716, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5831651.0, + "repeat_count": 0.0, + "routers_loss": 0.013230946846306324, + "skip_count": 1.0, + "step": 3616, + "text_loss": 0.2698844075202942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 16.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.000784805848979129, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5834369.0, + "repeat_count": 0.0, + "routers_loss": 0.00162619655020535, + "skip_count": 0.0, + "step": 3618, + "text_loss": 0.2430931180715561 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 16.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0007845513984879477, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 5838102.0, + "repeat_count": 1.0, + "routers_loss": 0.002781603019684553, + "skip_count": 0.0, + "step": 3620, + "text_loss": 0.4968300759792328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007842968389561337, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 5841029.0, + "repeat_count": 0.0, + "routers_loss": 0.0023873315658420324, + "skip_count": 0.0, + "step": 3622, + "text_loss": 0.5842974781990051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0007840421704812346, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 5845158.0, + "repeat_count": 0.0, + "routers_loss": 0.00400173757225275, + "skip_count": 1.0, + "step": 3624, + "text_loss": 0.8312450647354126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00078378739316084, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 5849175.0, + "repeat_count": 0.0, + "routers_loss": 0.0004974664188921452, + "skip_count": 0.0, + "step": 3626, + "text_loss": 0.48637253046035767 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 17.032873495744056, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.10693359375, + "learning_rate": 0.000783532507092581, + "loss": 0.0079, + "macro_f1": 0.9555556178092957, + "num_tokens": 5852020.0, + "repeat_count": 1.0, + "routers_loss": 0.02555239573121071, + "skip_count": 5.0, + "step": 3628, + "text_loss": 0.5407033562660217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007832775123741306, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 5854873.0, + "repeat_count": 0.0, + "routers_loss": 0.0025962977670133114, + "skip_count": 0.0, + "step": 3630, + "text_loss": 0.618230938911438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.000783022409103203, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 5858086.0, + "repeat_count": 0.0, + "routers_loss": 0.0029271875973790884, + "skip_count": 0.0, + "step": 3632, + "text_loss": 0.21259798109531403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007827671973775542, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 5860886.0, + "repeat_count": 0.0, + "routers_loss": 0.004102068953216076, + "skip_count": 0.0, + "step": 3634, + "text_loss": 0.4991208016872406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0007825118772949819, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 5864291.0, + "repeat_count": 0.0, + "routers_loss": 0.0023497689981013536, + "skip_count": 1.0, + "step": 3636, + "text_loss": 0.3878401517868042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0007822564489533255, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 5867155.0, + "repeat_count": 0.0, + "routers_loss": 0.007680345326662064, + "skip_count": 2.0, + "step": 3638, + "text_loss": 0.6132124066352844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.053466796875, + "learning_rate": 0.0007820009124504653, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5870325.0, + "repeat_count": 0.0, + "routers_loss": 0.0008242831099778414, + "skip_count": 0.0, + "step": 3640, + "text_loss": 0.3552473187446594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.098620487232168, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007817452678843236, + "loss": 0.0073, + "macro_f1": 0.6601307392120361, + "num_tokens": 5873301.0, + "repeat_count": 1.0, + "routers_loss": 0.023831043392419815, + "skip_count": 2.0, + "step": 3642, + "text_loss": 0.18363867700099945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0007814895153528635, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5876225.0, + "repeat_count": 0.0, + "routers_loss": 0.001999989850446582, + "skip_count": 0.0, + "step": 3644, + "text_loss": 0.17581747472286224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0007812336549540903, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 5879501.0, + "repeat_count": 0.0, + "routers_loss": 0.001098626758903265, + "skip_count": 0.0, + "step": 3646, + "text_loss": 0.5040884613990784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.126797769298502, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007809776867860499, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 5882608.0, + "repeat_count": 0.0, + "routers_loss": 0.012210183776915073, + "skip_count": 1.0, + "step": 3648, + "text_loss": 0.27114811539649963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00078072161094683, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 5886106.0, + "repeat_count": 0.0, + "routers_loss": 0.005191771313548088, + "skip_count": 2.0, + "step": 3650, + "text_loss": 0.5167917609214783 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0007804654275345591, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 5889122.0, + "repeat_count": 0.0, + "routers_loss": 0.0016411367105320096, + "skip_count": 1.0, + "step": 3652, + "text_loss": 0.7691274285316467 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.154975051364836, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0007802091366474074, + "loss": 0.005, + "macro_f1": 0.8823530077934265, + "num_tokens": 5892313.0, + "repeat_count": 2.0, + "routers_loss": 0.015627093613147736, + "skip_count": 1.0, + "step": 3654, + "text_loss": 0.4646325409412384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007799527383835858, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 5895577.0, + "repeat_count": 0.0, + "routers_loss": 0.0009879748104140162, + "skip_count": 0.0, + "step": 3656, + "text_loss": 0.5587969422340393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007796962328413469, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 5898546.0, + "repeat_count": 0.0, + "routers_loss": 0.004864919930696487, + "skip_count": 0.0, + "step": 3658, + "text_loss": 0.6981375813484192 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007794396201189839, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 5901618.0, + "repeat_count": 1.0, + "routers_loss": 0.006617432460188866, + "skip_count": 2.0, + "step": 3660, + "text_loss": 0.22521957755088806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.192544760786618, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007791829003148312, + "loss": 0.0098, + "macro_f1": 0.6601307392120361, + "num_tokens": 5904540.0, + "repeat_count": 1.0, + "routers_loss": 0.0782252699136734, + "skip_count": 2.0, + "step": 3662, + "text_loss": 0.2649642825126648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0007789260735272647, + "loss": 0.0114, + "macro_f1": 0.3333333432674408, + "num_tokens": 5907827.0, + "repeat_count": 0.0, + "routers_loss": 0.0012057392159476876, + "skip_count": 0.0, + "step": 3664, + "text_loss": 0.6943771243095398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.0007786691398547005, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 5911163.0, + "repeat_count": 0.0, + "routers_loss": 0.007476957980543375, + "skip_count": 2.0, + "step": 3666, + "text_loss": 0.1502683162689209 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007784120993955962, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 5913948.0, + "repeat_count": 1.0, + "routers_loss": 0.004082011990249157, + "skip_count": 0.0, + "step": 3668, + "text_loss": 0.4127517640590668 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 17.230114470208395, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007781549522484503, + "loss": 0.0066, + "macro_f1": 0.9265305995941162, + "num_tokens": 5917360.0, + "repeat_count": 3.0, + "routers_loss": 0.027505695819854736, + "skip_count": 1.0, + "step": 3670, + "text_loss": 0.23892618715763092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007778976985118018, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 5920524.0, + "repeat_count": 0.0, + "routers_loss": 0.0024977331049740314, + "skip_count": 2.0, + "step": 3672, + "text_loss": 0.5076471567153931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0007776403382842312, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 5923632.0, + "repeat_count": 0.0, + "routers_loss": 0.0015700991498306394, + "skip_count": 0.0, + "step": 3674, + "text_loss": 0.6287924647331238 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.25829175227473, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05810546875, + "learning_rate": 0.0007773828716643591, + "loss": 0.0085, + "macro_f1": 0.3272727429866791, + "num_tokens": 5926438.0, + "repeat_count": 1.0, + "routers_loss": 0.05108916014432907, + "skip_count": 0.0, + "step": 3676, + "text_loss": 0.26517006754875183 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007771252987508474, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 5930081.0, + "repeat_count": 0.0, + "routers_loss": 0.003439917229115963, + "skip_count": 0.0, + "step": 3678, + "text_loss": 0.5189079642295837 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0007768676196423984, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 5933463.0, + "repeat_count": 1.0, + "routers_loss": 0.001935846172273159, + "skip_count": 1.0, + "step": 3680, + "text_loss": 0.6703575849533081 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 17.286469034341064, + "f1_execute": 0.9433962106704712, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007766098344377553, + "loss": 0.0082, + "macro_f1": 0.31446540355682373, + "num_tokens": 5937098.0, + "repeat_count": 0.0, + "routers_loss": 0.0384826585650444, + "skip_count": 2.0, + "step": 3682, + "text_loss": 0.6424444913864136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0007763519432357018, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 5940436.0, + "repeat_count": 0.0, + "routers_loss": 0.0008654671837575734, + "skip_count": 0.0, + "step": 3684, + "text_loss": 0.4189988672733307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0007760939461350623, + "loss": 0.0111, + "macro_f1": 0.6666666865348816, + "num_tokens": 5943731.0, + "repeat_count": 0.0, + "routers_loss": 0.007468715775758028, + "skip_count": 2.0, + "step": 3686, + "text_loss": 0.2875453233718872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0007758358432347019, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 5946707.0, + "repeat_count": 0.0, + "routers_loss": 0.001252831774763763, + "skip_count": 0.0, + "step": 3688, + "text_loss": 0.5093055367469788 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007755776346335259, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 5949833.0, + "repeat_count": 0.0, + "routers_loss": 0.001680848654359579, + "skip_count": 0.0, + "step": 3690, + "text_loss": 0.4031114876270294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0007753193204304807, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 5953095.0, + "repeat_count": 0.0, + "routers_loss": 0.0047258250415325165, + "skip_count": 2.0, + "step": 3692, + "text_loss": 0.17632785439491272 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007750609007245524, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 5955971.0, + "repeat_count": 2.0, + "routers_loss": 0.001980359200388193, + "skip_count": 4.0, + "step": 3694, + "text_loss": 0.3423727750778198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0007748023756147679, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 5958948.0, + "repeat_count": 0.0, + "routers_loss": 0.00511702848598361, + "skip_count": 0.0, + "step": 3696, + "text_loss": 0.28279972076416016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007745437452001949, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 5961819.0, + "repeat_count": 0.0, + "routers_loss": 0.0005220443126745522, + "skip_count": 0.0, + "step": 3698, + "text_loss": 0.4793325662612915 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.371000880540066, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007742850095799408, + "loss": 0.0084, + "macro_f1": 0.3272727429866791, + "num_tokens": 5964625.0, + "repeat_count": 1.0, + "routers_loss": 0.06411020457744598, + "skip_count": 0.0, + "step": 3700, + "text_loss": 0.2825184464454651 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0751953125, + "learning_rate": 0.0007740261688531536, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 5967134.0, + "repeat_count": 0.0, + "routers_loss": 0.004408109001815319, + "skip_count": 3.0, + "step": 3702, + "text_loss": 0.690429151058197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0007737672231190215, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 5969831.0, + "repeat_count": 0.0, + "routers_loss": 0.0006747521692886949, + "skip_count": 0.0, + "step": 3704, + "text_loss": 0.32556024193763733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007735081724767732, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 5973015.0, + "repeat_count": 0.0, + "routers_loss": 0.0020414739847183228, + "skip_count": 0.0, + "step": 3706, + "text_loss": 0.5876469612121582 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.072265625, + "learning_rate": 0.0007732490170256769, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 5975778.0, + "repeat_count": 1.0, + "routers_loss": 0.005610425490885973, + "skip_count": 0.0, + "step": 3708, + "text_loss": 0.2968577444553375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007729897568650422, + "loss": 0.0097, + "macro_f1": 0.3333333432674408, + "num_tokens": 5979115.0, + "repeat_count": 0.0, + "routers_loss": 0.001248046406544745, + "skip_count": 0.0, + "step": 3710, + "text_loss": 0.626361608505249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0007727303920942176, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 5982213.0, + "repeat_count": 0.0, + "routers_loss": 0.005791695322841406, + "skip_count": 2.0, + "step": 3712, + "text_loss": 0.4133484661579132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 17.436747872028178, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08740234375, + "learning_rate": 0.0007724709228125922, + "loss": 0.0105, + "macro_f1": 0.5492662787437439, + "num_tokens": 5984930.0, + "repeat_count": 0.0, + "routers_loss": 0.02114664763212204, + "skip_count": 2.0, + "step": 3714, + "text_loss": 0.4646461308002472 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0007722113491195952, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 5988017.0, + "repeat_count": 2.0, + "routers_loss": 0.005913930479437113, + "skip_count": 5.0, + "step": 3716, + "text_loss": 0.15474505722522736 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0007719516711146957, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 5991562.0, + "repeat_count": 0.0, + "routers_loss": 0.0075925313867628574, + "skip_count": 2.0, + "step": 3718, + "text_loss": 0.5293686985969543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.000771691888897403, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 5994675.0, + "repeat_count": 0.0, + "routers_loss": 0.0012335237115621567, + "skip_count": 0.0, + "step": 3720, + "text_loss": 0.5210637450218201 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0771484375, + "learning_rate": 0.0007714320025672657, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 5999070.0, + "repeat_count": 0.0, + "routers_loss": 0.010582062415778637, + "skip_count": 2.0, + "step": 3722, + "text_loss": 0.2783571779727936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.4837100088054, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.000771172012223873, + "loss": 0.0078, + "macro_f1": 0.6598639488220215, + "num_tokens": 6002702.0, + "repeat_count": 1.0, + "routers_loss": 0.015008784830570221, + "skip_count": 3.0, + "step": 3724, + "text_loss": 0.358705073595047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007709119179668538, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6005517.0, + "repeat_count": 0.0, + "routers_loss": 0.00111615180503577, + "skip_count": 0.0, + "step": 3726, + "text_loss": 0.45202162861824036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 17.50249486351629, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007706517198958764, + "loss": 0.0096, + "macro_f1": 0.6595745086669922, + "num_tokens": 6009111.0, + "repeat_count": 1.0, + "routers_loss": 0.05215252563357353, + "skip_count": 4.0, + "step": 3728, + "text_loss": 0.20360413193702698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007703914181106497, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6012989.0, + "repeat_count": 0.0, + "routers_loss": 0.010039499960839748, + "skip_count": 3.0, + "step": 3730, + "text_loss": 0.20334361493587494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.52127971822718, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0007701310127109211, + "loss": 0.0062, + "macro_f1": 0.3272727429866791, + "num_tokens": 6016420.0, + "repeat_count": 0.0, + "routers_loss": 0.01090205181390047, + "skip_count": 1.0, + "step": 3732, + "text_loss": 0.47959551215171814 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 17.530672145582624, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007698705037964791, + "loss": 0.0076, + "macro_f1": 0.6225374937057495, + "num_tokens": 6019551.0, + "repeat_count": 0.0, + "routers_loss": 0.02677762135863304, + "skip_count": 5.0, + "step": 3734, + "text_loss": 0.2621438801288605 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.000769609891467151, + "loss": 0.0119, + "macro_f1": 0.6666666865348816, + "num_tokens": 6022262.0, + "repeat_count": 1.0, + "routers_loss": 0.00460716662928462, + "skip_count": 0.0, + "step": 3736, + "text_loss": 0.3433022201061249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0007693491758228037, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6025723.0, + "repeat_count": 0.0, + "routers_loss": 0.0036111194640398026, + "skip_count": 2.0, + "step": 3738, + "text_loss": 0.38703784346580505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007690883569633442, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6028652.0, + "repeat_count": 0.0, + "routers_loss": 0.003299296135082841, + "skip_count": 0.0, + "step": 3740, + "text_loss": 0.24203069508075714 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0007688274349887188, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 6032280.0, + "repeat_count": 0.0, + "routers_loss": 0.003173880511894822, + "skip_count": 0.0, + "step": 3742, + "text_loss": 0.2827291488647461 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0007685664099989131, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6035111.0, + "repeat_count": 0.0, + "routers_loss": 0.0008576177642680705, + "skip_count": 0.0, + "step": 3744, + "text_loss": 0.43613526225090027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007683052820939524, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6038428.0, + "repeat_count": 0.0, + "routers_loss": 0.004335585981607437, + "skip_count": 2.0, + "step": 3746, + "text_loss": 1.0385624170303345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007680440513739015, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6041185.0, + "repeat_count": 0.0, + "routers_loss": 0.0008210531086660922, + "skip_count": 0.0, + "step": 3748, + "text_loss": 0.7070431709289551 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0007677827179388646, + "loss": 0.0089, + "macro_f1": 1.0, + "num_tokens": 6046333.0, + "repeat_count": 1.0, + "routers_loss": 0.003778942162171006, + "skip_count": 1.0, + "step": 3750, + "text_loss": 0.3682238757610321 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 17.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08984375, + "learning_rate": 0.000767521281888985, + "loss": 0.009, + "macro_f1": 1.0, + "num_tokens": 6049528.0, + "repeat_count": 1.0, + "routers_loss": 0.002767334459349513, + "skip_count": 1.0, + "step": 3752, + "text_loss": 0.7619418501853943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0007672597433244455, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 6053202.0, + "repeat_count": 0.0, + "routers_loss": 0.004796457476913929, + "skip_count": 2.0, + "step": 3754, + "text_loss": 0.4157083034515381 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0007669981023454682, + "loss": 0.0126, + "macro_f1": 0.3333333432674408, + "num_tokens": 6056609.0, + "repeat_count": 0.0, + "routers_loss": 0.0013067846884950995, + "skip_count": 0.0, + "step": 3756, + "text_loss": 0.4529118537902832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007667363590523142, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 6060504.0, + "repeat_count": 0.0, + "routers_loss": 0.0010285493917763233, + "skip_count": 0.0, + "step": 3758, + "text_loss": 0.8363246321678162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0007664745135452844, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 6063526.0, + "repeat_count": 0.0, + "routers_loss": 0.006289863493293524, + "skip_count": 3.0, + "step": 3760, + "text_loss": 0.5313657522201538 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.0007662125659247183, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6067147.0, + "repeat_count": 0.0, + "routers_loss": 0.0028537956532090902, + "skip_count": 0.0, + "step": 3762, + "text_loss": 0.5668109059333801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007659505162909949, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6070350.0, + "repeat_count": 0.0, + "routers_loss": 0.0026814753655344248, + "skip_count": 0.0, + "step": 3764, + "text_loss": 0.4983512759208679 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056884765625, + "learning_rate": 0.0007656883647445318, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 6073091.0, + "repeat_count": 0.0, + "routers_loss": 0.005981382913887501, + "skip_count": 1.0, + "step": 3766, + "text_loss": 0.30372318625450134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0007654261113857863, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6076244.0, + "repeat_count": 0.0, + "routers_loss": 0.000803640519734472, + "skip_count": 0.0, + "step": 3768, + "text_loss": 0.6100738048553467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0007651637563152539, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 6078936.0, + "repeat_count": 0.0, + "routers_loss": 0.0013324898900464177, + "skip_count": 0.0, + "step": 3770, + "text_loss": 0.4733821153640747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0007649012996334701, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 6081951.0, + "repeat_count": 1.0, + "routers_loss": 0.0021543330512940884, + "skip_count": 0.0, + "step": 3772, + "text_loss": 0.6794875860214233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007646387414410085, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 6085165.0, + "repeat_count": 0.0, + "routers_loss": 0.0005426189745776355, + "skip_count": 0.0, + "step": 3774, + "text_loss": 0.5886107683181763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007643760818384819, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6088370.0, + "repeat_count": 0.0, + "routers_loss": 0.002537576947361231, + "skip_count": 0.0, + "step": 3776, + "text_loss": 0.23591920733451843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007641133209265423, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6092319.0, + "repeat_count": 0.0, + "routers_loss": 0.002613696036860347, + "skip_count": 0.0, + "step": 3778, + "text_loss": 0.3217754662036896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0007638504588058796, + "loss": 0.0105, + "macro_f1": 0.3333333432674408, + "num_tokens": 6095799.0, + "repeat_count": 0.0, + "routers_loss": 0.0007219464750960469, + "skip_count": 0.0, + "step": 3780, + "text_loss": 0.4276983141899109 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 17.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0007635874955772234, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6098789.0, + "repeat_count": 0.0, + "routers_loss": 0.005965052172541618, + "skip_count": 3.0, + "step": 3782, + "text_loss": 0.30936646461486816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.0007633244313413417, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6101631.0, + "repeat_count": 0.0, + "routers_loss": 0.0007469559786841273, + "skip_count": 0.0, + "step": 3784, + "text_loss": 0.44460123777389526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007630612661990412, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 6105097.0, + "repeat_count": 0.0, + "routers_loss": 0.004300760570913553, + "skip_count": 1.0, + "step": 3786, + "text_loss": 0.41950157284736633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007627980002511672, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6107847.0, + "repeat_count": 0.0, + "routers_loss": 0.0023050960153341293, + "skip_count": 1.0, + "step": 3788, + "text_loss": 0.48561373353004456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0007625346335986039, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6110546.0, + "repeat_count": 0.0, + "routers_loss": 0.0018124044872820377, + "skip_count": 0.0, + "step": 3790, + "text_loss": 0.20882295072078705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007622711663422735, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6113600.0, + "repeat_count": 0.0, + "routers_loss": 0.0007613401976414025, + "skip_count": 0.0, + "step": 3792, + "text_loss": 0.31751760840415955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007620075985831375, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 6116916.0, + "repeat_count": 0.0, + "routers_loss": 0.005452962126582861, + "skip_count": 2.0, + "step": 3794, + "text_loss": 0.3246645927429199 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 17.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007617439304221956, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 6120056.0, + "repeat_count": 2.0, + "routers_loss": 0.0043787881731987, + "skip_count": 0.0, + "step": 3796, + "text_loss": 0.4859195947647095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0007614801619604856, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6122668.0, + "repeat_count": 0.0, + "routers_loss": 0.0033891722559928894, + "skip_count": 0.0, + "step": 3798, + "text_loss": 0.48194369673728943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0007612162932990845, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6126792.0, + "repeat_count": 0.0, + "routers_loss": 0.001883238204754889, + "skip_count": 0.0, + "step": 3800, + "text_loss": 0.3740062117576599 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007609523245391068, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 6129801.0, + "repeat_count": 0.0, + "routers_loss": 0.00882677361369133, + "skip_count": 2.0, + "step": 3802, + "text_loss": 0.5759486556053162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007606882557817062, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6133613.0, + "repeat_count": 0.0, + "routers_loss": 0.009537030011415482, + "skip_count": 2.0, + "step": 3804, + "text_loss": 0.3217554986476898 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0007604240871280742, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6137784.0, + "repeat_count": 0.0, + "routers_loss": 0.0023913346230983734, + "skip_count": 0.0, + "step": 3806, + "text_loss": 0.3718445599079132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.878191957734078, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007601598186794407, + "loss": 0.0081, + "macro_f1": 0.6603773832321167, + "num_tokens": 6141356.0, + "repeat_count": 1.0, + "routers_loss": 0.033796411007642746, + "skip_count": 1.0, + "step": 3808, + "text_loss": 0.2717749774456024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000759895450537074, + "loss": 0.01, + "macro_f1": 0.6666666865348816, + "num_tokens": 6144448.0, + "repeat_count": 0.0, + "routers_loss": 0.0037919918540865183, + "skip_count": 2.0, + "step": 3810, + "text_loss": 0.5935076475143433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007596309828022803, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6147526.0, + "repeat_count": 0.0, + "routers_loss": 0.0008182782912626863, + "skip_count": 0.0, + "step": 3812, + "text_loss": 0.449336439371109 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 17.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0007593664155764044, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6150620.0, + "repeat_count": 1.0, + "routers_loss": 0.001734903547912836, + "skip_count": 0.0, + "step": 3814, + "text_loss": 0.6647221446037292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.915761667155856, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0007591017489608286, + "loss": 0.0088, + "macro_f1": 0.3272727429866791, + "num_tokens": 6153714.0, + "repeat_count": 1.0, + "routers_loss": 0.04721754416823387, + "skip_count": 0.0, + "step": 3816, + "text_loss": 0.25481200218200684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007588369830569738, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6156974.0, + "repeat_count": 0.0, + "routers_loss": 0.0002484306460246444, + "skip_count": 0.0, + "step": 3818, + "text_loss": 0.7195295691490173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007585721179662988, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6159660.0, + "repeat_count": 0.0, + "routers_loss": 0.0051363613456487656, + "skip_count": 2.0, + "step": 3820, + "text_loss": 0.5073586702346802 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0007583071537903005, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6163146.0, + "repeat_count": 0.0, + "routers_loss": 0.006719176657497883, + "skip_count": 0.0, + "step": 3822, + "text_loss": 0.6950558423995972 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 17.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0007580420906305136, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 6166257.0, + "repeat_count": 1.0, + "routers_loss": 0.00871267355978489, + "skip_count": 3.0, + "step": 3824, + "text_loss": 0.2549148201942444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0007577769285885109, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 6169624.0, + "repeat_count": 0.0, + "routers_loss": 0.0015642556827515364, + "skip_count": 0.0, + "step": 3826, + "text_loss": 0.3720305860042572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0007575116677659029, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6172673.0, + "repeat_count": 0.0, + "routers_loss": 0.0011551049537956715, + "skip_count": 0.0, + "step": 3828, + "text_loss": 0.6819429397583008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 17.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0007572463082643377, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 6175414.0, + "repeat_count": 0.0, + "routers_loss": 0.0008922060951590538, + "skip_count": 0.0, + "step": 3830, + "text_loss": 0.5424665212631226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 17.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007569808501855023, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 6178701.0, + "repeat_count": 0.0, + "routers_loss": 0.004167596809566021, + "skip_count": 1.0, + "step": 3832, + "text_loss": 0.4429764151573181 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.00075671529363112, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 6183036.0, + "repeat_count": 0.0, + "routers_loss": 0.0008732969872653484, + "skip_count": 0.0, + "step": 3834, + "text_loss": 0.8015334010124207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007564496387029531, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 6186325.0, + "repeat_count": 0.0, + "routers_loss": 0.0021374202333390713, + "skip_count": 1.0, + "step": 3836, + "text_loss": 0.4233771562576294 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.000756183885502801, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6189919.0, + "repeat_count": 1.0, + "routers_loss": 0.004017227329313755, + "skip_count": 0.0, + "step": 3838, + "text_loss": 0.33691394329071045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.0007559180341325005, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6193412.0, + "repeat_count": 0.0, + "routers_loss": 0.0013120946241542697, + "skip_count": 0.0, + "step": 3840, + "text_loss": 0.14970099925994873 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 18.037569709421778, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007556520846939265, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 6196588.0, + "repeat_count": 0.0, + "routers_loss": 0.011793316341936588, + "skip_count": 2.0, + "step": 3842, + "text_loss": 0.2714047133922577 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0007553860372889914, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 6200841.0, + "repeat_count": 1.0, + "routers_loss": 0.019968654960393906, + "skip_count": 4.0, + "step": 3844, + "text_loss": 0.23680976033210754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 18.05635456413267, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.052490234375, + "learning_rate": 0.0007551198920196452, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 6203797.0, + "repeat_count": 0.0, + "routers_loss": 0.013615630567073822, + "skip_count": 2.0, + "step": 3846, + "text_loss": 0.25839608907699585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0546875, + "learning_rate": 0.000754853648987875, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6206790.0, + "repeat_count": 0.0, + "routers_loss": 0.002420815173536539, + "skip_count": 1.0, + "step": 3848, + "text_loss": 0.5358025431632996 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 18.07513941884356, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007545873082957057, + "loss": 0.0072, + "macro_f1": 0.9265305995941162, + "num_tokens": 6209791.0, + "repeat_count": 1.0, + "routers_loss": 0.018236197531223297, + "skip_count": 3.0, + "step": 3850, + "text_loss": 0.1463700383901596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0007543208700451998, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6212792.0, + "repeat_count": 0.0, + "routers_loss": 0.006242573726922274, + "skip_count": 3.0, + "step": 3852, + "text_loss": 0.9441591501235962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.093924273554446, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007540543343384565, + "loss": 0.0062, + "macro_f1": 0.3272727429866791, + "num_tokens": 6215747.0, + "repeat_count": 0.0, + "routers_loss": 0.01451140083372593, + "skip_count": 1.0, + "step": 3854, + "text_loss": 0.41610902547836304 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007537877012776132, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6218593.0, + "repeat_count": 0.0, + "routers_loss": 0.00037674361374229193, + "skip_count": 0.0, + "step": 3856, + "text_loss": 0.6048852205276489 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0007535209709648439, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 6221315.0, + "repeat_count": 1.0, + "routers_loss": 0.005776284262537956, + "skip_count": 3.0, + "step": 3858, + "text_loss": 0.35627537965774536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0007532541435023605, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 6225012.0, + "repeat_count": 0.0, + "routers_loss": 0.0009280376834794879, + "skip_count": 0.0, + "step": 3860, + "text_loss": 0.6440183520317078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0007529872189924114, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6227650.0, + "repeat_count": 0.0, + "routers_loss": 0.0009876530384644866, + "skip_count": 0.0, + "step": 3862, + "text_loss": 0.35507893562316895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.14088641033167, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0007527201975372827, + "loss": 0.0045, + "macro_f1": 0.6603773832321167, + "num_tokens": 6230557.0, + "repeat_count": 1.0, + "routers_loss": 0.013780162669718266, + "skip_count": 1.0, + "step": 3864, + "text_loss": 0.38958442211151123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007524530792392977, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 6233371.0, + "repeat_count": 0.0, + "routers_loss": 0.004849869292229414, + "skip_count": 3.0, + "step": 3866, + "text_loss": 0.3826720714569092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0007521858642008163, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 6236770.0, + "repeat_count": 0.0, + "routers_loss": 0.008618295192718506, + "skip_count": 1.0, + "step": 3868, + "text_loss": 0.3596078157424927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0007519185525242363, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6239661.0, + "repeat_count": 0.0, + "routers_loss": 0.0013421972980722785, + "skip_count": 0.0, + "step": 3870, + "text_loss": 0.5585550665855408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0007516511443119916, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 6242459.0, + "repeat_count": 0.0, + "routers_loss": 0.0038009448908269405, + "skip_count": 1.0, + "step": 3872, + "text_loss": 0.4418395757675171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007513836396665534, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 6245489.0, + "repeat_count": 1.0, + "routers_loss": 0.002785376040264964, + "skip_count": 2.0, + "step": 3874, + "text_loss": 0.551510751247406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0007511160386904305, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6249014.0, + "repeat_count": 0.0, + "routers_loss": 0.0021424589212983847, + "skip_count": 1.0, + "step": 3876, + "text_loss": 1.0502676963806152 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0007508483414861679, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6252357.0, + "repeat_count": 0.0, + "routers_loss": 0.0085759861394763, + "skip_count": 1.0, + "step": 3878, + "text_loss": 0.49212515354156494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007505805481563477, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6254975.0, + "repeat_count": 0.0, + "routers_loss": 0.0010723904706537724, + "skip_count": 0.0, + "step": 3880, + "text_loss": 0.7022985816001892 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0007503126588035887, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 6258001.0, + "repeat_count": 1.0, + "routers_loss": 0.012809890322387218, + "skip_count": 2.0, + "step": 3882, + "text_loss": 0.1829151213169098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007500446735305466, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 6261795.0, + "repeat_count": 0.0, + "routers_loss": 0.0026790346018970013, + "skip_count": 1.0, + "step": 3884, + "text_loss": 0.20436066389083862 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.000749776592439914, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 6265585.0, + "repeat_count": 1.0, + "routers_loss": 0.005243788007646799, + "skip_count": 2.0, + "step": 3886, + "text_loss": 0.4479229748249054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00074950841563442, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 6269039.0, + "repeat_count": 0.0, + "routers_loss": 0.007998534478247166, + "skip_count": 1.0, + "step": 3888, + "text_loss": 0.2154676914215088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0007492401432168303, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6272315.0, + "repeat_count": 0.0, + "routers_loss": 0.004648822825402021, + "skip_count": 1.0, + "step": 3890, + "text_loss": 0.3375042676925659 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.272380393307895, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007489717752899477, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 6275342.0, + "repeat_count": 0.0, + "routers_loss": 0.012154200114309788, + "skip_count": 1.0, + "step": 3892, + "text_loss": 0.1964082419872284 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.000748703311956611, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6278700.0, + "repeat_count": 1.0, + "routers_loss": 0.004610476549714804, + "skip_count": 2.0, + "step": 3894, + "text_loss": 0.26545581221580505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0007484347533196961, + "loss": 0.0105, + "macro_f1": 0.6666666865348816, + "num_tokens": 6281864.0, + "repeat_count": 0.0, + "routers_loss": 0.0075586591847240925, + "skip_count": 2.0, + "step": 3896, + "text_loss": 0.3106999397277832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0007481660994821151, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6284676.0, + "repeat_count": 0.0, + "routers_loss": 0.007845268584787846, + "skip_count": 1.0, + "step": 3898, + "text_loss": 0.4094304144382477 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007478973505468165, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 6287470.0, + "repeat_count": 1.0, + "routers_loss": 0.011116391979157925, + "skip_count": 2.0, + "step": 3900, + "text_loss": 0.1838909536600113 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0007476285066167857, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 6290432.0, + "repeat_count": 1.0, + "routers_loss": 0.004599364474415779, + "skip_count": 0.0, + "step": 3902, + "text_loss": 0.25872838497161865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0007473595677950439, + "loss": 0.0109, + "macro_f1": 0.6666666865348816, + "num_tokens": 6293557.0, + "repeat_count": 0.0, + "routers_loss": 0.0016367282951250672, + "skip_count": 1.0, + "step": 3904, + "text_loss": 0.5272360444068909 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007470905341846492, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 6295979.0, + "repeat_count": 0.0, + "routers_loss": 0.0004760588926728815, + "skip_count": 0.0, + "step": 3906, + "text_loss": 0.666959822177887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007468214058886956, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6299215.0, + "repeat_count": 0.0, + "routers_loss": 0.000524883100297302, + "skip_count": 0.0, + "step": 3908, + "text_loss": 0.5144801139831543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0007465521830103137, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6302320.0, + "repeat_count": 0.0, + "routers_loss": 0.0016085522947832942, + "skip_count": 0.0, + "step": 3910, + "text_loss": 0.14342890679836273 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007462828656526702, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6305212.0, + "repeat_count": 0.0, + "routers_loss": 0.002720315707847476, + "skip_count": 2.0, + "step": 3912, + "text_loss": 0.31109121441841125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.0007460134539189681, + "loss": 0.0114, + "macro_f1": 0.6666666865348816, + "num_tokens": 6308964.0, + "repeat_count": 0.0, + "routers_loss": 0.0010418406454846263, + "skip_count": 1.0, + "step": 3914, + "text_loss": 0.5662030577659607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0007457439479124459, + "loss": 0.0134, + "macro_f1": 0.3333333432674408, + "num_tokens": 6313195.0, + "repeat_count": 0.0, + "routers_loss": 0.0020303844939917326, + "skip_count": 0.0, + "step": 3916, + "text_loss": 0.6358339190483093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007454743477363797, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 6315949.0, + "repeat_count": 0.0, + "routers_loss": 0.0006592223653569818, + "skip_count": 0.0, + "step": 3918, + "text_loss": 0.35648423433303833 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.403874376284122, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007452046534940803, + "loss": 0.0075, + "macro_f1": 0.6603773832321167, + "num_tokens": 6319024.0, + "repeat_count": 1.0, + "routers_loss": 0.024555351585149765, + "skip_count": 1.0, + "step": 3920, + "text_loss": 0.21955153346061707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0007449348652888952, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6321633.0, + "repeat_count": 0.0, + "routers_loss": 0.003606822807341814, + "skip_count": 1.0, + "step": 3922, + "text_loss": 0.6079489588737488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007446649832242075, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 6325209.0, + "repeat_count": 0.0, + "routers_loss": 0.0035831446293741465, + "skip_count": 1.0, + "step": 3924, + "text_loss": 0.2774808406829834 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0007443950074034368, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6327822.0, + "repeat_count": 0.0, + "routers_loss": 0.006809544749557972, + "skip_count": 2.0, + "step": 3926, + "text_loss": 0.48236769437789917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.4414440857059, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0007441249379300381, + "loss": 0.007, + "macro_f1": 0.6601307392120361, + "num_tokens": 6331662.0, + "repeat_count": 1.0, + "routers_loss": 0.023832591250538826, + "skip_count": 2.0, + "step": 3928, + "text_loss": 0.7287537455558777 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0007438547749075028, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 6335801.0, + "repeat_count": 1.0, + "routers_loss": 0.011755098588764668, + "skip_count": 3.0, + "step": 3930, + "text_loss": 0.17253030836582184 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0007435845184393577, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6338747.0, + "repeat_count": 1.0, + "routers_loss": 0.005972472485154867, + "skip_count": 0.0, + "step": 3932, + "text_loss": 0.6400216817855835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007433141686291657, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 6342772.0, + "repeat_count": 0.0, + "routers_loss": 0.0030393085908144712, + "skip_count": 1.0, + "step": 3934, + "text_loss": 0.6865074038505554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0007430437255805252, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6345957.0, + "repeat_count": 0.0, + "routers_loss": 0.0006984061910770833, + "skip_count": 0.0, + "step": 3936, + "text_loss": 0.40398702025413513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.0007427731893970706, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 6349162.0, + "repeat_count": 1.0, + "routers_loss": 0.005219762213528156, + "skip_count": 0.0, + "step": 3938, + "text_loss": 0.5951031446456909 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 18.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007425025601824717, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 6352655.0, + "repeat_count": 0.0, + "routers_loss": 0.015575960278511047, + "skip_count": 3.0, + "step": 3940, + "text_loss": 0.26689088344573975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007422318380404346, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6355890.0, + "repeat_count": 0.0, + "routers_loss": 0.0012208883417770267, + "skip_count": 0.0, + "step": 3942, + "text_loss": 0.570725679397583 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0007419610230746999, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6358891.0, + "repeat_count": 1.0, + "routers_loss": 0.0029412026051431894, + "skip_count": 0.0, + "step": 3944, + "text_loss": 0.5521301031112671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0007416901153890448, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6361586.0, + "repeat_count": 0.0, + "routers_loss": 0.0010283910669386387, + "skip_count": 0.0, + "step": 3946, + "text_loss": 0.4046417772769928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0007414191150872818, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6364954.0, + "repeat_count": 0.0, + "routers_loss": 0.008222512900829315, + "skip_count": 2.0, + "step": 3948, + "text_loss": 0.2803446352481842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007411480222732583, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6367660.0, + "repeat_count": 0.0, + "routers_loss": 0.001304348581470549, + "skip_count": 0.0, + "step": 3950, + "text_loss": 0.45553359389305115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007408768370508576, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6371585.0, + "repeat_count": 0.0, + "routers_loss": 0.0016345062758773565, + "skip_count": 0.0, + "step": 3952, + "text_loss": 0.25424402952194214 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007406055595239986, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6374365.0, + "repeat_count": 0.0, + "routers_loss": 0.0005097290268167853, + "skip_count": 0.0, + "step": 3954, + "text_loss": 0.5856026411056519 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.060546875, + "learning_rate": 0.0007403341897966356, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6377335.0, + "repeat_count": 0.0, + "routers_loss": 0.002482263371348381, + "skip_count": 1.0, + "step": 3956, + "text_loss": 0.5145615339279175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0007400627279727574, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 6380799.0, + "repeat_count": 0.0, + "routers_loss": 0.0011743451468646526, + "skip_count": 0.0, + "step": 3958, + "text_loss": 0.31868961453437805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007397911741563892, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 6383963.0, + "repeat_count": 1.0, + "routers_loss": 0.009861881844699383, + "skip_count": 0.0, + "step": 3960, + "text_loss": 0.21192194521427155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007395195284515905, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 6387410.0, + "repeat_count": 1.0, + "routers_loss": 0.004189098719507456, + "skip_count": 0.0, + "step": 3962, + "text_loss": 0.5809708833694458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007392477909624567, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6390670.0, + "repeat_count": 0.0, + "routers_loss": 0.001853612600825727, + "skip_count": 0.0, + "step": 3964, + "text_loss": 0.48985618352890015 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0007389759617931182, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6393609.0, + "repeat_count": 1.0, + "routers_loss": 0.003303771372884512, + "skip_count": 0.0, + "step": 3966, + "text_loss": 0.28729453682899475 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.629292632814792, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007387040410477404, + "loss": 0.0058, + "macro_f1": 0.9452888369560242, + "num_tokens": 6396608.0, + "repeat_count": 1.0, + "routers_loss": 0.01791577786207199, + "skip_count": 4.0, + "step": 3968, + "text_loss": 0.30386820435523987 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0007384320288305235, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6399793.0, + "repeat_count": 0.0, + "routers_loss": 0.0005771282012574375, + "skip_count": 0.0, + "step": 3970, + "text_loss": 0.47285011410713196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0007381599252457037, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6403365.0, + "repeat_count": 0.0, + "routers_loss": 0.003010645741596818, + "skip_count": 0.0, + "step": 3972, + "text_loss": 0.5313063859939575 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000737887730397551, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6406205.0, + "repeat_count": 1.0, + "routers_loss": 0.006457438692450523, + "skip_count": 0.0, + "step": 3974, + "text_loss": 0.2323843240737915 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007376154443903713, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6409552.0, + "repeat_count": 1.0, + "routers_loss": 0.010693981312215328, + "skip_count": 0.0, + "step": 3976, + "text_loss": 0.6304101943969727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.676254769592017, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007373430673285051, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 6412386.0, + "repeat_count": 1.0, + "routers_loss": 0.03116440214216709, + "skip_count": 0.0, + "step": 3978, + "text_loss": 0.23448467254638672 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.68564719694746, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007370705993163278, + "loss": 0.0111, + "macro_f1": 0.3272727429866791, + "num_tokens": 6416054.0, + "repeat_count": 1.0, + "routers_loss": 0.011973714455962181, + "skip_count": 0.0, + "step": 3980, + "text_loss": 0.6371755599975586 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007367980404582497, + "loss": 0.0105, + "macro_f1": 1.0, + "num_tokens": 6419238.0, + "repeat_count": 1.0, + "routers_loss": 0.005117347463965416, + "skip_count": 2.0, + "step": 3982, + "text_loss": 0.19822923839092255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0007365253908587158, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 6422122.0, + "repeat_count": 0.0, + "routers_loss": 0.0010648667812347412, + "skip_count": 0.0, + "step": 3984, + "text_loss": 0.566700279712677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0007362526506222058, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6425313.0, + "repeat_count": 0.0, + "routers_loss": 0.005726494826376438, + "skip_count": 0.0, + "step": 3986, + "text_loss": 0.6568437814712524 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 18.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0007359798198532343, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 6428422.0, + "repeat_count": 1.0, + "routers_loss": 0.004504100419580936, + "skip_count": 0.0, + "step": 3988, + "text_loss": 0.598754346370697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007357068986563509, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 6431512.0, + "repeat_count": 0.0, + "routers_loss": 0.0019837068393826485, + "skip_count": 1.0, + "step": 3990, + "text_loss": 0.7152895927429199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0007354338871361393, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6434358.0, + "repeat_count": 0.0, + "routers_loss": 0.0026031541638076305, + "skip_count": 1.0, + "step": 3992, + "text_loss": 0.4986513555049896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.000735160785397218, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6438175.0, + "repeat_count": 0.0, + "routers_loss": 0.0024831905029714108, + "skip_count": 2.0, + "step": 3994, + "text_loss": 0.4406205713748932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007348875935442401, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6441228.0, + "repeat_count": 0.0, + "routers_loss": 0.0008635876583866775, + "skip_count": 0.0, + "step": 3996, + "text_loss": 0.48884135484695435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007346143116818932, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6444318.0, + "repeat_count": 0.0, + "routers_loss": 0.004007008858025074, + "skip_count": 0.0, + "step": 3998, + "text_loss": 0.6669428944587708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.08203125, + "learning_rate": 0.0007343409399148994, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6448317.0, + "repeat_count": 0.0, + "routers_loss": 0.0031380734872072935, + "skip_count": 0.0, + "step": 4000, + "text_loss": 0.6468493938446045 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0007340674783480154, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 6451673.0, + "repeat_count": 0.0, + "routers_loss": 0.004996029660105705, + "skip_count": 0.0, + "step": 4002, + "text_loss": 0.28135430812835693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.798356325212797, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007337939270860323, + "loss": 0.009, + "macro_f1": 0.3272727429866791, + "num_tokens": 6456372.0, + "repeat_count": 1.0, + "routers_loss": 0.03784399852156639, + "skip_count": 0.0, + "step": 4004, + "text_loss": 0.41668644547462463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007335202862337753, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6459047.0, + "repeat_count": 0.0, + "routers_loss": 0.0011750755365937948, + "skip_count": 0.0, + "step": 4006, + "text_loss": 0.6853910684585571 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.817141179923688, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.05908203125, + "learning_rate": 0.000733246555896104, + "loss": 0.0062, + "macro_f1": 0.9452888369560242, + "num_tokens": 6462390.0, + "repeat_count": 1.0, + "routers_loss": 0.01630394533276558, + "skip_count": 4.0, + "step": 4008, + "text_loss": 0.7110592126846313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0007329727361779124, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6466057.0, + "repeat_count": 0.0, + "routers_loss": 0.0052404399029910564, + "skip_count": 2.0, + "step": 4010, + "text_loss": 0.13856995105743408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.000732698827184129, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6468878.0, + "repeat_count": 0.0, + "routers_loss": 0.002138581359758973, + "skip_count": 0.0, + "step": 4012, + "text_loss": 0.3999565839767456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000732424829019716, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6472364.0, + "repeat_count": 0.0, + "routers_loss": 0.0037466560024768114, + "skip_count": 0.0, + "step": 4014, + "text_loss": 0.28161346912384033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0007321507417896699, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 6475379.0, + "repeat_count": 0.0, + "routers_loss": 0.0010469373082742095, + "skip_count": 0.0, + "step": 4016, + "text_loss": 1.0490952730178833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0007318765655990218, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 6478585.0, + "repeat_count": 0.0, + "routers_loss": 0.009968385100364685, + "skip_count": 2.0, + "step": 4018, + "text_loss": 0.31696680188179016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 18.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0007316023005528362, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 6484153.0, + "repeat_count": 0.0, + "routers_loss": 0.002349073765799403, + "skip_count": 1.0, + "step": 4020, + "text_loss": 0.30981555581092834 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 18.8828881714118, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0007313279467562124, + "loss": 0.0053, + "macro_f1": 0.9452888369560242, + "num_tokens": 6487029.0, + "repeat_count": 1.0, + "routers_loss": 0.011854278855025768, + "skip_count": 4.0, + "step": 4022, + "text_loss": 0.9689550399780273 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007310535043142829, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 6490315.0, + "repeat_count": 1.0, + "routers_loss": 0.00908346101641655, + "skip_count": 3.0, + "step": 4024, + "text_loss": 0.1705625057220459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0007307789733322146, + "loss": 0.0094, + "macro_f1": 0.3333333432674408, + "num_tokens": 6493921.0, + "repeat_count": 0.0, + "routers_loss": 0.0007360641611739993, + "skip_count": 0.0, + "step": 4026, + "text_loss": 0.6252996325492859 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.087890625, + "learning_rate": 0.0007305043539152083, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6496689.0, + "repeat_count": 0.0, + "routers_loss": 0.0017757206223905087, + "skip_count": 0.0, + "step": 4028, + "text_loss": 0.40533265471458435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000730229646168499, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6500090.0, + "repeat_count": 0.0, + "routers_loss": 0.0022657213266938925, + "skip_count": 0.0, + "step": 4030, + "text_loss": 0.25954708456993103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007299548501973548, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 6503023.0, + "repeat_count": 0.0, + "routers_loss": 0.0021747269202023745, + "skip_count": 0.0, + "step": 4032, + "text_loss": 0.6223418712615967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 18.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0007296799661070782, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6506382.0, + "repeat_count": 0.0, + "routers_loss": 0.006400502752512693, + "skip_count": 4.0, + "step": 4034, + "text_loss": 0.6873653531074524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.94863516289991, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0007294049940030055, + "loss": 0.0065, + "macro_f1": 0.3272727429866791, + "num_tokens": 6509194.0, + "repeat_count": 0.0, + "routers_loss": 0.0197185929864645, + "skip_count": 1.0, + "step": 4036, + "text_loss": 0.16156800091266632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007291299339905059, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6512271.0, + "repeat_count": 0.0, + "routers_loss": 0.0009541353792883456, + "skip_count": 0.0, + "step": 4038, + "text_loss": 0.5038442015647888 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007288547861749838, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 6516403.0, + "repeat_count": 0.0, + "routers_loss": 0.008226391859352589, + "skip_count": 2.0, + "step": 4040, + "text_loss": 0.3706657588481903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.976812444966246, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007285795506618758, + "loss": 0.0063, + "macro_f1": 0.3272727429866791, + "num_tokens": 6519310.0, + "repeat_count": 0.0, + "routers_loss": 0.017001887783408165, + "skip_count": 1.0, + "step": 4042, + "text_loss": 0.24296723306179047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 18.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0007283042275566528, + "loss": 0.0125, + "macro_f1": 0.6666666865348816, + "num_tokens": 6521979.0, + "repeat_count": 0.0, + "routers_loss": 0.01666323095560074, + "skip_count": 2.0, + "step": 4044, + "text_loss": 0.36904850602149963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 18.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0007280288169648192, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 6524976.0, + "repeat_count": 0.0, + "routers_loss": 0.0007593175978399813, + "skip_count": 0.0, + "step": 4046, + "text_loss": 0.7312731146812439 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 19.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0007277533189919127, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 6528638.0, + "repeat_count": 1.0, + "routers_loss": 0.005652119871228933, + "skip_count": 1.0, + "step": 4048, + "text_loss": 0.23326151072978973 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007274777337435046, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6532193.0, + "repeat_count": 0.0, + "routers_loss": 0.010509157553315163, + "skip_count": 2.0, + "step": 4050, + "text_loss": 0.23918013274669647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0007272020613251999, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 6534994.0, + "repeat_count": 0.0, + "routers_loss": 0.002153293928131461, + "skip_count": 0.0, + "step": 4052, + "text_loss": 0.5890526175498962 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0007269263018426367, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 6537469.0, + "repeat_count": 1.0, + "routers_loss": 0.0018494052346795797, + "skip_count": 2.0, + "step": 4054, + "text_loss": 0.36058738827705383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0007266504554014866, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6541271.0, + "repeat_count": 0.0, + "routers_loss": 0.0007579320226795971, + "skip_count": 0.0, + "step": 4056, + "text_loss": 0.4089007079601288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.051658350454947, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007263745221074545, + "loss": 0.0086, + "macro_f1": 0.6601307392120361, + "num_tokens": 6544293.0, + "repeat_count": 1.0, + "routers_loss": 0.06202420964837074, + "skip_count": 2.0, + "step": 4058, + "text_loss": 0.2226305454969406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 19.06105077781039, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0007260985020662784, + "loss": 0.0049, + "macro_f1": 0.5934640765190125, + "num_tokens": 6547640.0, + "repeat_count": 0.0, + "routers_loss": 0.044639844447374344, + "skip_count": 3.0, + "step": 4060, + "text_loss": 0.23004353046417236 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.0007258223953837298, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 6550840.0, + "repeat_count": 1.0, + "routers_loss": 0.004215611144900322, + "skip_count": 0.0, + "step": 4062, + "text_loss": 0.2891770601272583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0007255462021656132, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6554122.0, + "repeat_count": 0.0, + "routers_loss": 0.0011056234361603856, + "skip_count": 0.0, + "step": 4064, + "text_loss": 0.7485370635986328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007252699225177666, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6557138.0, + "repeat_count": 0.0, + "routers_loss": 0.008258933201432228, + "skip_count": 2.0, + "step": 4066, + "text_loss": 0.25219282507896423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007249935565460606, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6560654.0, + "repeat_count": 0.0, + "routers_loss": 0.005102175287902355, + "skip_count": 0.0, + "step": 4068, + "text_loss": 0.5553314089775085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0007247171043563994, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 6563814.0, + "repeat_count": 0.0, + "routers_loss": 0.01283820066601038, + "skip_count": 2.0, + "step": 4070, + "text_loss": 0.15729956328868866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0007244405660547199, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 6567060.0, + "repeat_count": 0.0, + "routers_loss": 0.0009684927063062787, + "skip_count": 0.0, + "step": 4072, + "text_loss": 0.3725031912326813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01953125, + "learning_rate": 0.000724163941746992, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6571608.0, + "repeat_count": 0.0, + "routers_loss": 0.0007890827837400138, + "skip_count": 0.0, + "step": 4074, + "text_loss": 0.8438301682472229 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 19.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0007238872315392189, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 6575214.0, + "repeat_count": 1.0, + "routers_loss": 0.0040600355714559555, + "skip_count": 1.0, + "step": 4076, + "text_loss": 0.5923112034797668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0007236104355374363, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 6578383.0, + "repeat_count": 0.0, + "routers_loss": 0.0024899677373468876, + "skip_count": 2.0, + "step": 4078, + "text_loss": 0.20302526652812958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05517578125, + "learning_rate": 0.000723333553847713, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6582175.0, + "repeat_count": 0.0, + "routers_loss": 0.006120906211435795, + "skip_count": 2.0, + "step": 4080, + "text_loss": 0.5400223731994629 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.0007230565865761504, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 6585516.0, + "repeat_count": 0.0, + "routers_loss": 0.0029941233806312084, + "skip_count": 0.0, + "step": 4082, + "text_loss": 0.19460804760456085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.0007227795338288831, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 6588266.0, + "repeat_count": 0.0, + "routers_loss": 0.009357884526252747, + "skip_count": 2.0, + "step": 4084, + "text_loss": 0.35237613320350647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007225023957120782, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 6591009.0, + "repeat_count": 0.0, + "routers_loss": 0.0023083325941115618, + "skip_count": 2.0, + "step": 4086, + "text_loss": 0.4336731433868408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0007222251723319356, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 6594472.0, + "repeat_count": 0.0, + "routers_loss": 0.0008416616474278271, + "skip_count": 0.0, + "step": 4088, + "text_loss": 0.6390535831451416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0007219478637946877, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6597477.0, + "repeat_count": 0.0, + "routers_loss": 0.004390760324895382, + "skip_count": 1.0, + "step": 4090, + "text_loss": 0.525839626789093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0007216704702065997, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 6600431.0, + "repeat_count": 0.0, + "routers_loss": 0.0010311100631952286, + "skip_count": 0.0, + "step": 4092, + "text_loss": 0.5310423374176025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0007213929916739695, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 6603899.0, + "repeat_count": 0.0, + "routers_loss": 0.0032497600186616182, + "skip_count": 1.0, + "step": 4094, + "text_loss": 0.2775326073169708 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000721115428303127, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 6606544.0, + "repeat_count": 1.0, + "routers_loss": 0.004692315589636564, + "skip_count": 3.0, + "step": 4096, + "text_loss": 0.6667124032974243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007208377802004353, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6610097.0, + "repeat_count": 0.0, + "routers_loss": 0.0007263485458679497, + "skip_count": 0.0, + "step": 4098, + "text_loss": 0.6916406750679016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0007205600474722897, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6613836.0, + "repeat_count": 0.0, + "routers_loss": 0.0017989488551393151, + "skip_count": 0.0, + "step": 4100, + "text_loss": 0.5257929563522339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000720282230225118, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6616780.0, + "repeat_count": 0.0, + "routers_loss": 0.0011308686807751656, + "skip_count": 1.0, + "step": 4102, + "text_loss": 0.4410906732082367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0007200043285653799, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6620110.0, + "repeat_count": 0.0, + "routers_loss": 0.002058265497907996, + "skip_count": 2.0, + "step": 4104, + "text_loss": 0.8581191897392273 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007197263425995681, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 6622585.0, + "repeat_count": 1.0, + "routers_loss": 0.0017528717871755362, + "skip_count": 0.0, + "step": 4106, + "text_loss": 0.5000449419021606 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0007194482724342075, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6626356.0, + "repeat_count": 0.0, + "routers_loss": 0.0021995846182107925, + "skip_count": 0.0, + "step": 4108, + "text_loss": 0.401346892118454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007191701181758547, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6629738.0, + "repeat_count": 0.0, + "routers_loss": 0.0014869922306388617, + "skip_count": 0.0, + "step": 4110, + "text_loss": 0.9598422050476074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0007188918799310993, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 6632807.0, + "repeat_count": 0.0, + "routers_loss": 0.0012853415682911873, + "skip_count": 0.0, + "step": 4112, + "text_loss": 0.3996548354625702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0007186135578065627, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6636227.0, + "repeat_count": 0.0, + "routers_loss": 0.0009887361666187644, + "skip_count": 0.0, + "step": 4114, + "text_loss": 0.4127283990383148 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007183351519088982, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6639443.0, + "repeat_count": 0.0, + "routers_loss": 0.006282114889472723, + "skip_count": 1.0, + "step": 4116, + "text_loss": 0.20028606057167053 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.333431171118285, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0007180566623447917, + "loss": 0.0114, + "macro_f1": 0.6603773832321167, + "num_tokens": 6642127.0, + "repeat_count": 1.0, + "routers_loss": 0.008101986721158028, + "skip_count": 0.0, + "step": 4118, + "text_loss": 0.763931155204773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0007177780892209607, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6645376.0, + "repeat_count": 0.0, + "routers_loss": 0.001953610684722662, + "skip_count": 0.0, + "step": 4120, + "text_loss": 0.42317715287208557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007174994326441551, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6648150.0, + "repeat_count": 0.0, + "routers_loss": 0.003279355587437749, + "skip_count": 0.0, + "step": 4122, + "text_loss": 0.19656142592430115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007172206927211567, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6650935.0, + "repeat_count": 0.0, + "routers_loss": 0.0032076311763375998, + "skip_count": 0.0, + "step": 4124, + "text_loss": 0.13608409464359283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0007169418695587791, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 6654464.0, + "repeat_count": 0.0, + "routers_loss": 0.004065621178597212, + "skip_count": 2.0, + "step": 4126, + "text_loss": 0.4882086217403412 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0007166629632638678, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6657749.0, + "repeat_count": 0.0, + "routers_loss": 0.0009243001695722342, + "skip_count": 0.0, + "step": 4128, + "text_loss": 0.31632331013679504 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0007163839739433003, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6660997.0, + "repeat_count": 0.0, + "routers_loss": 0.0018459554994478822, + "skip_count": 0.0, + "step": 4130, + "text_loss": 0.6123947501182556 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.399178162606397, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0007161049017039857, + "loss": 0.0073, + "macro_f1": 0.8820862174034119, + "num_tokens": 6663542.0, + "repeat_count": 2.0, + "routers_loss": 0.030032536014914513, + "skip_count": 2.0, + "step": 4132, + "text_loss": 0.6985659003257751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0007158257466528652, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6666178.0, + "repeat_count": 0.0, + "routers_loss": 0.0013813833938911557, + "skip_count": 0.0, + "step": 4134, + "text_loss": 0.38380664587020874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0007155465088969114, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 6668852.0, + "repeat_count": 0.0, + "routers_loss": 0.00513424864038825, + "skip_count": 3.0, + "step": 4136, + "text_loss": 0.49724283814430237 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0007152671885431288, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 6671430.0, + "repeat_count": 0.0, + "routers_loss": 0.0005165594047866762, + "skip_count": 0.0, + "step": 4138, + "text_loss": 0.666959822177887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0007149877856985535, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 6675215.0, + "repeat_count": 0.0, + "routers_loss": 0.001685218419879675, + "skip_count": 0.0, + "step": 4140, + "text_loss": 0.3127259612083435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.000714708300470253, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 6678505.0, + "repeat_count": 0.0, + "routers_loss": 0.004025314934551716, + "skip_count": 0.0, + "step": 4142, + "text_loss": 0.3179470896720886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 19.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007144287329653269, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 6681127.0, + "repeat_count": 1.0, + "routers_loss": 0.005965690594166517, + "skip_count": 0.0, + "step": 4144, + "text_loss": 0.3862907886505127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.464925154094512, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007141490832909058, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 6683968.0, + "repeat_count": 0.0, + "routers_loss": 0.012896374799311161, + "skip_count": 1.0, + "step": 4146, + "text_loss": 0.48156118392944336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007138693515541519, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6687196.0, + "repeat_count": 0.0, + "routers_loss": 0.0006367767928168178, + "skip_count": 1.0, + "step": 4148, + "text_loss": 0.676702082157135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0007135895378622592, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 6689972.0, + "repeat_count": 0.0, + "routers_loss": 0.004532640799880028, + "skip_count": 3.0, + "step": 4150, + "text_loss": 0.5865558981895447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.493102436160846, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007133096423224526, + "loss": 0.0081, + "macro_f1": 0.3272727429866791, + "num_tokens": 6693568.0, + "repeat_count": 1.0, + "routers_loss": 0.0377078577876091, + "skip_count": 0.0, + "step": 4152, + "text_loss": 0.2790502607822418 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0007130296650419885, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 6696468.0, + "repeat_count": 0.0, + "routers_loss": 0.004455826710909605, + "skip_count": 1.0, + "step": 4154, + "text_loss": 0.5869500041007996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0007127496061281551, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6699307.0, + "repeat_count": 0.0, + "routers_loss": 0.001998464809730649, + "skip_count": 0.0, + "step": 4156, + "text_loss": 0.6931945085525513 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 19.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0007124694656882713, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 6702647.0, + "repeat_count": 3.0, + "routers_loss": 0.004117495380342007, + "skip_count": 0.0, + "step": 4158, + "text_loss": 0.4325876832008362 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0007121892438296874, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 6705964.0, + "repeat_count": 0.0, + "routers_loss": 0.0014713290147483349, + "skip_count": 0.0, + "step": 4160, + "text_loss": 0.3672060966491699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007119089406597849, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 6710182.0, + "repeat_count": 0.0, + "routers_loss": 0.0037311650812625885, + "skip_count": 1.0, + "step": 4162, + "text_loss": 0.6643805503845215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007116285562859767, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 6713410.0, + "repeat_count": 0.0, + "routers_loss": 0.006017287727445364, + "skip_count": 0.0, + "step": 4164, + "text_loss": 0.4606415927410126 + }, + { + "acc_repeat": 0.3333333432674408, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 19.55884942764896, + "f1_execute": 0.9545454382896423, + "f1_repeat": 0.5, + "f1_skip": 1.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007113480908157065, + "loss": 0.0108, + "macro_f1": 0.8181818723678589, + "num_tokens": 6716056.0, + "repeat_count": 3.0, + "routers_loss": 0.08640352636575699, + "skip_count": 4.0, + "step": 4166, + "text_loss": 0.3139408528804779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0007110675443564491, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6719497.0, + "repeat_count": 0.0, + "routers_loss": 0.0012731150491163135, + "skip_count": 0.0, + "step": 4168, + "text_loss": 0.7283861637115479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0007107869170157108, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 6722297.0, + "repeat_count": 0.0, + "routers_loss": 0.0021509863436222076, + "skip_count": 2.0, + "step": 4170, + "text_loss": 0.5767703056335449 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.000710506208901028, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 6725762.0, + "repeat_count": 0.0, + "routers_loss": 0.00257494836114347, + "skip_count": 1.0, + "step": 4172, + "text_loss": 0.33571913838386536 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.000710225420119969, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 6728436.0, + "repeat_count": 1.0, + "routers_loss": 0.00943201594054699, + "skip_count": 3.0, + "step": 4174, + "text_loss": 0.6849368810653687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0007099445507801323, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 6731427.0, + "repeat_count": 0.0, + "routers_loss": 0.01046718005090952, + "skip_count": 2.0, + "step": 4176, + "text_loss": 0.3346157670021057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007096636009891477, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6734800.0, + "repeat_count": 0.0, + "routers_loss": 0.0007813365664333105, + "skip_count": 0.0, + "step": 4178, + "text_loss": 0.49989959597587585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.000709382570854676, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6738244.0, + "repeat_count": 0.0, + "routers_loss": 0.002825600327923894, + "skip_count": 0.0, + "step": 4180, + "text_loss": 0.15744923055171967 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007091014604844078, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6741695.0, + "repeat_count": 0.0, + "routers_loss": 0.0017124463338404894, + "skip_count": 0.0, + "step": 4182, + "text_loss": 0.3752405643463135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0007088202699860655, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 6744882.0, + "repeat_count": 1.0, + "routers_loss": 0.005134924780577421, + "skip_count": 3.0, + "step": 4184, + "text_loss": 0.18534569442272186 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.000708538999467402, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6747811.0, + "repeat_count": 0.0, + "routers_loss": 0.002371585462242365, + "skip_count": 1.0, + "step": 4186, + "text_loss": 0.6251029968261719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0007082576490362004, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6750765.0, + "repeat_count": 0.0, + "routers_loss": 0.002088436856865883, + "skip_count": 0.0, + "step": 4188, + "text_loss": 0.35471436381340027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.000707976218800275, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 6754021.0, + "repeat_count": 0.0, + "routers_loss": 0.0012272283202037215, + "skip_count": 0.0, + "step": 4190, + "text_loss": 0.5737302899360657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07763671875, + "learning_rate": 0.0007076947088674701, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6756793.0, + "repeat_count": 0.0, + "routers_loss": 0.0026050808373838663, + "skip_count": 0.0, + "step": 4192, + "text_loss": 0.526336669921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.000707413119345661, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 6760221.0, + "repeat_count": 0.0, + "routers_loss": 0.0013151296880096197, + "skip_count": 0.0, + "step": 4194, + "text_loss": 0.5678895711898804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0007071314503427532, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 6763721.0, + "repeat_count": 0.0, + "routers_loss": 0.001528652966953814, + "skip_count": 0.0, + "step": 4196, + "text_loss": 0.7640175223350525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0007068497019666829, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 6768581.0, + "repeat_count": 0.0, + "routers_loss": 0.0019202446565032005, + "skip_count": 0.0, + "step": 4198, + "text_loss": 0.41878414154052734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0007065678743254167, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 6772758.0, + "repeat_count": 0.0, + "routers_loss": 0.004667408298701048, + "skip_count": 1.0, + "step": 4200, + "text_loss": 0.3550313413143158 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 19.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0007062859675269513, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6776671.0, + "repeat_count": 3.0, + "routers_loss": 0.00568761583417654, + "skip_count": 0.0, + "step": 4202, + "text_loss": 0.1707649976015091 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0007060039816793141, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 6780284.0, + "repeat_count": 0.0, + "routers_loss": 0.0030401297844946384, + "skip_count": 0.0, + "step": 4204, + "text_loss": 0.2686377167701721 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 19.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007057219168905625, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 6783525.0, + "repeat_count": 1.0, + "routers_loss": 0.003353122156113386, + "skip_count": 5.0, + "step": 4206, + "text_loss": 0.5235374569892883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.000705439773268784, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6787691.0, + "repeat_count": 0.0, + "routers_loss": 0.0016532237641513348, + "skip_count": 1.0, + "step": 4208, + "text_loss": 0.5002681612968445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0007051575509220972, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 6790833.0, + "repeat_count": 0.0, + "routers_loss": 0.0011808308772742748, + "skip_count": 0.0, + "step": 4210, + "text_loss": 0.7251001596450806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0007048752499586497, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 6794260.0, + "repeat_count": 0.0, + "routers_loss": 0.006246297620236874, + "skip_count": 2.0, + "step": 4212, + "text_loss": 0.2430499643087387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.00070459287048662, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6797413.0, + "repeat_count": 0.0, + "routers_loss": 0.0012964420020580292, + "skip_count": 0.0, + "step": 4214, + "text_loss": 0.48889362812042236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0007043104126142163, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6800815.0, + "repeat_count": 0.0, + "routers_loss": 0.0018109704833477736, + "skip_count": 0.0, + "step": 4216, + "text_loss": 0.5617026686668396 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 19.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0007040278764496771, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 6803937.0, + "repeat_count": 2.0, + "routers_loss": 0.0028699536342173815, + "skip_count": 1.0, + "step": 4218, + "text_loss": 0.548405647277832 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0007037452621012708, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6806946.0, + "repeat_count": 0.0, + "routers_loss": 0.0007951617590151727, + "skip_count": 0.0, + "step": 4220, + "text_loss": 0.5702725648880005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0007034625696772958, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6810083.0, + "repeat_count": 0.0, + "routers_loss": 0.003436052706092596, + "skip_count": 2.0, + "step": 4222, + "text_loss": 0.3898725211620331 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00070317979928608, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6812845.0, + "repeat_count": 0.0, + "routers_loss": 0.0005070401239208877, + "skip_count": 0.0, + "step": 4224, + "text_loss": 0.5244157910346985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.840622248312297, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.000702896951035982, + "loss": 0.0101, + "macro_f1": 0.3272727429866791, + "num_tokens": 6815801.0, + "repeat_count": 0.0, + "routers_loss": 0.01560303382575512, + "skip_count": 1.0, + "step": 4226, + "text_loss": 0.26503118872642517 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0007026140250353896, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 6819464.0, + "repeat_count": 0.0, + "routers_loss": 0.009310240857303143, + "skip_count": 2.0, + "step": 4228, + "text_loss": 0.15597499907016754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0007023310213927208, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 6822657.0, + "repeat_count": 0.0, + "routers_loss": 0.005309136584401131, + "skip_count": 0.0, + "step": 4230, + "text_loss": 0.5271651148796082 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046875, + "learning_rate": 0.0007020479402164226, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 6825661.0, + "repeat_count": 0.0, + "routers_loss": 0.005936166271567345, + "skip_count": 2.0, + "step": 4232, + "text_loss": 0.6105108857154846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007017647816149727, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6828688.0, + "repeat_count": 0.0, + "routers_loss": 0.001653556595556438, + "skip_count": 0.0, + "step": 4234, + "text_loss": 0.6966437101364136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.000701481545696878, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 6831850.0, + "repeat_count": 0.0, + "routers_loss": 0.0013501866487786174, + "skip_count": 0.0, + "step": 4236, + "text_loss": 1.259678840637207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.059814453125, + "learning_rate": 0.0007011982325706747, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6834862.0, + "repeat_count": 0.0, + "routers_loss": 0.008970130234956741, + "skip_count": 1.0, + "step": 4238, + "text_loss": 0.24906545877456665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007009148423449292, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6838148.0, + "repeat_count": 0.0, + "routers_loss": 0.0026013399474322796, + "skip_count": 0.0, + "step": 4240, + "text_loss": 0.291467547416687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.915761667155856, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0007006313751282371, + "loss": 0.0094, + "macro_f1": 0.3272727429866791, + "num_tokens": 6841142.0, + "repeat_count": 0.0, + "routers_loss": 0.021415632218122482, + "skip_count": 1.0, + "step": 4242, + "text_loss": 0.507606029510498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0007003478310292236, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 6844042.0, + "repeat_count": 0.0, + "routers_loss": 0.0023636550176888704, + "skip_count": 0.0, + "step": 4244, + "text_loss": 0.11626995354890823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.934546521866746, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0007000642101565433, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 6847359.0, + "repeat_count": 1.0, + "routers_loss": 0.025154776871204376, + "skip_count": 0.0, + "step": 4246, + "text_loss": 0.42898693680763245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0006997805126188803, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6850443.0, + "repeat_count": 0.0, + "routers_loss": 0.00540317315608263, + "skip_count": 0.0, + "step": 4248, + "text_loss": 0.18085283041000366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.000699496738524948, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 6853495.0, + "repeat_count": 0.0, + "routers_loss": 0.0014433214673772454, + "skip_count": 0.0, + "step": 4250, + "text_loss": 0.5524004697799683 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 19.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006992128879834891, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 6856774.0, + "repeat_count": 1.0, + "routers_loss": 0.013381492346525192, + "skip_count": 3.0, + "step": 4252, + "text_loss": 0.19605717062950134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006989289611032758, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 6860313.0, + "repeat_count": 0.0, + "routers_loss": 0.007140172645449638, + "skip_count": 1.0, + "step": 4254, + "text_loss": 0.3182447552680969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 19.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006986449579931091, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6863683.0, + "repeat_count": 0.0, + "routers_loss": 0.006486213766038418, + "skip_count": 1.0, + "step": 4256, + "text_loss": 0.19250160455703735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 19.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006983608787618201, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 6867609.0, + "repeat_count": 0.0, + "routers_loss": 0.001465818495489657, + "skip_count": 0.0, + "step": 4258, + "text_loss": 0.5912898182868958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.000698076723518268, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 6870040.0, + "repeat_count": 0.0, + "routers_loss": 0.0031106441747397184, + "skip_count": 0.0, + "step": 4260, + "text_loss": 0.13542121648788452 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006977924923713418, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6873441.0, + "repeat_count": 0.0, + "routers_loss": 0.0005377951893024147, + "skip_count": 0.0, + "step": 4262, + "text_loss": 0.352464497089386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006975081854299594, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 6876637.0, + "repeat_count": 0.0, + "routers_loss": 0.007052485831081867, + "skip_count": 0.0, + "step": 4264, + "text_loss": 0.5023844242095947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0006972238028030678, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 6879928.0, + "repeat_count": 0.0, + "routers_loss": 0.0013608322478830814, + "skip_count": 0.0, + "step": 4266, + "text_loss": 0.8664718270301819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0006969393445996429, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6883425.0, + "repeat_count": 0.0, + "routers_loss": 0.0007607188890688121, + "skip_count": 0.0, + "step": 4268, + "text_loss": 0.5131992101669312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006966548109286897, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 6886790.0, + "repeat_count": 0.0, + "routers_loss": 0.00035804163780994713, + "skip_count": 0.0, + "step": 4270, + "text_loss": 0.5352054834365845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.000696370201899242, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6889747.0, + "repeat_count": 0.0, + "routers_loss": 0.004451376851648092, + "skip_count": 1.0, + "step": 4272, + "text_loss": 0.47865036129951477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006960855176203623, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6892604.0, + "repeat_count": 0.0, + "routers_loss": 0.0015342880506068468, + "skip_count": 0.0, + "step": 4274, + "text_loss": 0.36278650164604187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0006958007582011425, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 6895563.0, + "repeat_count": 0.0, + "routers_loss": 0.0022974940948188305, + "skip_count": 2.0, + "step": 4276, + "text_loss": 0.6695618629455566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006955159237507027, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 6898591.0, + "repeat_count": 0.0, + "routers_loss": 0.00859096460044384, + "skip_count": 1.0, + "step": 4278, + "text_loss": 0.44284722208976746 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0006952310143781921, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6903119.0, + "repeat_count": 1.0, + "routers_loss": 0.007919861935079098, + "skip_count": 3.0, + "step": 4280, + "text_loss": 0.5006136298179626 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0006949460301927886, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6906394.0, + "repeat_count": 0.0, + "routers_loss": 0.0008476210059598088, + "skip_count": 0.0, + "step": 4282, + "text_loss": 0.8153555989265442 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.0006946609713036985, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 6909136.0, + "repeat_count": 0.0, + "routers_loss": 0.006711610127240419, + "skip_count": 2.0, + "step": 4284, + "text_loss": 0.43136683106422424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0006943758378201571, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 6912734.0, + "repeat_count": 0.0, + "routers_loss": 0.0038677838165313005, + "skip_count": 0.0, + "step": 4286, + "text_loss": 0.2693749964237213 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0006940906298514278, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 6915838.0, + "repeat_count": 0.0, + "routers_loss": 0.0012188015971332788, + "skip_count": 0.0, + "step": 4288, + "text_loss": 0.5809219479560852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0006938053475068031, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 6919225.0, + "repeat_count": 0.0, + "routers_loss": 0.001955829095095396, + "skip_count": 0.0, + "step": 4290, + "text_loss": 0.5116089582443237 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006935199908956037, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 6922495.0, + "repeat_count": 1.0, + "routers_loss": 0.0035709093790501356, + "skip_count": 0.0, + "step": 4292, + "text_loss": 0.2745901644229889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0006932345601271786, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 6925317.0, + "repeat_count": 0.0, + "routers_loss": 0.0005745319649577141, + "skip_count": 0.0, + "step": 4294, + "text_loss": 0.6039219498634338 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 20.169063692398005, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0693359375, + "learning_rate": 0.0006929490553109056, + "loss": 0.0107, + "macro_f1": 0.9247862696647644, + "num_tokens": 6928054.0, + "repeat_count": 3.0, + "routers_loss": 0.061689916998147964, + "skip_count": 6.0, + "step": 4296, + "text_loss": 0.3904837667942047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006926634765561907, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 6931348.0, + "repeat_count": 0.0, + "routers_loss": 0.002007248578593135, + "skip_count": 0.0, + "step": 4298, + "text_loss": 0.5170742273330688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000692377823972468, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 6934411.0, + "repeat_count": 0.0, + "routers_loss": 0.0005786226247437298, + "skip_count": 0.0, + "step": 4300, + "text_loss": 0.8032443523406982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.19724097446434, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006920920976692004, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 6938153.0, + "repeat_count": 1.0, + "routers_loss": 0.024602646008133888, + "skip_count": 0.0, + "step": 4302, + "text_loss": 0.446534663438797 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006918062977558784, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6940731.0, + "repeat_count": 0.0, + "routers_loss": 0.005759815219789743, + "skip_count": 2.0, + "step": 4304, + "text_loss": 0.15479247272014618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006915204243420214, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 6943246.0, + "repeat_count": 0.0, + "routers_loss": 0.005315347574651241, + "skip_count": 1.0, + "step": 4306, + "text_loss": 0.22127842903137207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006912344775371765, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 6947197.0, + "repeat_count": 0.0, + "routers_loss": 0.0012061651796102524, + "skip_count": 0.0, + "step": 4308, + "text_loss": 0.7058854103088379 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006909484574509191, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 6951817.0, + "repeat_count": 0.0, + "routers_loss": 0.0029203309677541256, + "skip_count": 0.0, + "step": 4310, + "text_loss": 0.6014000773429871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0006906623641928525, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 6955094.0, + "repeat_count": 0.0, + "routers_loss": 0.005703397560864687, + "skip_count": 2.0, + "step": 4312, + "text_loss": 0.5923848152160645 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.08154296875, + "learning_rate": 0.0006903761978726084, + "loss": 0.0073, + "macro_f1": 1.0, + "num_tokens": 6958127.0, + "repeat_count": 1.0, + "routers_loss": 0.004489895887672901, + "skip_count": 2.0, + "step": 4314, + "text_loss": 0.36911651492118835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.000690089958599846, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 6960871.0, + "repeat_count": 0.0, + "routers_loss": 0.003871412482112646, + "skip_count": 2.0, + "step": 4316, + "text_loss": 0.442545086145401 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.000689803646484253, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 6963980.0, + "repeat_count": 1.0, + "routers_loss": 0.008667866699397564, + "skip_count": 2.0, + "step": 4318, + "text_loss": 0.1987489014863968 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0006895172616355446, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 6967132.0, + "repeat_count": 1.0, + "routers_loss": 0.00843339879065752, + "skip_count": 0.0, + "step": 4320, + "text_loss": 0.48267918825149536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006892308041634639, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 6969971.0, + "repeat_count": 0.0, + "routers_loss": 0.0004312851815484464, + "skip_count": 0.0, + "step": 4322, + "text_loss": 0.3662732243537903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006889442741777822, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 6973114.0, + "repeat_count": 0.0, + "routers_loss": 0.004588035400956869, + "skip_count": 3.0, + "step": 4324, + "text_loss": 0.6707104444503784 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.309950102729672, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0006886576717882982, + "loss": 0.0057, + "macro_f1": 0.8817967176437378, + "num_tokens": 6976013.0, + "repeat_count": 2.0, + "routers_loss": 0.0687296912074089, + "skip_count": 3.0, + "step": 4326, + "text_loss": 0.1662217676639557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006883709971048384, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6979200.0, + "repeat_count": 0.0, + "routers_loss": 0.002950174268335104, + "skip_count": 0.0, + "step": 4328, + "text_loss": 0.21168152987957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006880842502372572, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 6982640.0, + "repeat_count": 0.0, + "routers_loss": 0.0032158740796148777, + "skip_count": 0.0, + "step": 4330, + "text_loss": 0.26790961623191833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0006877974312954365, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 6985917.0, + "repeat_count": 0.0, + "routers_loss": 0.0005083635332994163, + "skip_count": 0.0, + "step": 4332, + "text_loss": 0.9736502170562744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.347519812151454, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.000687510540389286, + "loss": 0.0053, + "macro_f1": 0.32098764181137085, + "num_tokens": 6988388.0, + "repeat_count": 0.0, + "routers_loss": 0.03473830223083496, + "skip_count": 2.0, + "step": 4334, + "text_loss": 0.21662230789661407 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006872235776287425, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 6991360.0, + "repeat_count": 0.0, + "routers_loss": 0.002206524135544896, + "skip_count": 0.0, + "step": 4336, + "text_loss": 0.6026972532272339 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0006869365431237711, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 6995080.0, + "repeat_count": 1.0, + "routers_loss": 0.000969731598161161, + "skip_count": 0.0, + "step": 4338, + "text_loss": 0.5833017230033875 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.375697094217788, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006866494369843635, + "loss": 0.0054, + "macro_f1": 0.8820862174034119, + "num_tokens": 6998526.0, + "repeat_count": 2.0, + "routers_loss": 0.013962293043732643, + "skip_count": 2.0, + "step": 4340, + "text_loss": 0.41465985774993896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0006863622593205397, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 7001494.0, + "repeat_count": 0.0, + "routers_loss": 0.0064964210614562035, + "skip_count": 3.0, + "step": 4342, + "text_loss": 0.3774271011352539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 20.394481948928675, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006860750102423464, + "loss": 0.0062, + "macro_f1": 0.6589147448539734, + "num_tokens": 7005544.0, + "repeat_count": 1.0, + "routers_loss": 0.023250726982951164, + "skip_count": 6.0, + "step": 4344, + "text_loss": 0.2732464373111725 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0006857876898598582, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 7008847.0, + "repeat_count": 0.0, + "routers_loss": 0.0038170060142874718, + "skip_count": 2.0, + "step": 4346, + "text_loss": 0.29610875248908997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0006855002982831769, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7012577.0, + "repeat_count": 0.0, + "routers_loss": 0.0012856025714427233, + "skip_count": 0.0, + "step": 4348, + "text_loss": 0.6098502278327942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0006852128356224314, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7015650.0, + "repeat_count": 0.0, + "routers_loss": 0.008162742480635643, + "skip_count": 1.0, + "step": 4350, + "text_loss": 0.20868146419525146 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.432051658350456, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.0006849253019877778, + "loss": 0.0074, + "macro_f1": 0.8817967176437378, + "num_tokens": 7019925.0, + "repeat_count": 2.0, + "routers_loss": 0.023544032126665115, + "skip_count": 3.0, + "step": 4352, + "text_loss": 0.628226101398468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0006846376974893996, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 7023130.0, + "repeat_count": 0.0, + "routers_loss": 0.004982319660484791, + "skip_count": 2.0, + "step": 4354, + "text_loss": 0.7037544250488281 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0006843500222375074, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7026422.0, + "repeat_count": 1.0, + "routers_loss": 0.004015266429632902, + "skip_count": 0.0, + "step": 4356, + "text_loss": 0.22352729737758636 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 20.46022894041679, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006840622763423391, + "loss": 0.0071, + "macro_f1": 0.9449735879898071, + "num_tokens": 7029077.0, + "repeat_count": 2.0, + "routers_loss": 0.021162014454603195, + "skip_count": 4.0, + "step": 4358, + "text_loss": 0.2431403249502182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006837744599141591, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7032582.0, + "repeat_count": 0.0, + "routers_loss": 0.0007044129306450486, + "skip_count": 0.0, + "step": 4360, + "text_loss": 0.26667487621307373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0006834865730632594, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7035642.0, + "repeat_count": 0.0, + "routers_loss": 0.0067853196524083614, + "skip_count": 1.0, + "step": 4362, + "text_loss": 0.20965275168418884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006831986158999588, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7038601.0, + "repeat_count": 0.0, + "routers_loss": 0.00899333506822586, + "skip_count": 2.0, + "step": 4364, + "text_loss": 0.26860126852989197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.000682910588534603, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7042274.0, + "repeat_count": 0.0, + "routers_loss": 0.0019194348715245724, + "skip_count": 0.0, + "step": 4366, + "text_loss": 0.14046810567378998 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0006826224910775647, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 7045268.0, + "repeat_count": 1.0, + "routers_loss": 0.006915684789419174, + "skip_count": 3.0, + "step": 4368, + "text_loss": 0.5900366306304932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0006823343236392432, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7049407.0, + "repeat_count": 0.0, + "routers_loss": 0.001678116386756301, + "skip_count": 0.0, + "step": 4370, + "text_loss": 0.7868026494979858 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.000682046086330065, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7052783.0, + "repeat_count": 0.0, + "routers_loss": 0.0003459530707914382, + "skip_count": 0.0, + "step": 4372, + "text_loss": 0.6349637508392334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0006817577792604831, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7055757.0, + "repeat_count": 0.0, + "routers_loss": 0.0011729507241398096, + "skip_count": 0.0, + "step": 4374, + "text_loss": 0.43258991837501526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0006814694025409773, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 7058684.0, + "repeat_count": 0.0, + "routers_loss": 0.0006664610700681806, + "skip_count": 0.0, + "step": 4376, + "text_loss": 0.5307940244674683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.091796875, + "learning_rate": 0.0006811809562820542, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 7061902.0, + "repeat_count": 0.0, + "routers_loss": 0.004595907870680094, + "skip_count": 2.0, + "step": 4378, + "text_loss": 0.5830042362213135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0006808924405942467, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7065100.0, + "repeat_count": 0.0, + "routers_loss": 0.0032026609405875206, + "skip_count": 0.0, + "step": 4380, + "text_loss": 0.20797798037528992 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0006806038555881148, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 7068556.0, + "repeat_count": 1.0, + "routers_loss": 0.0024626904632896185, + "skip_count": 0.0, + "step": 4382, + "text_loss": 0.5791074633598328 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006803152013742448, + "loss": 0.0075, + "macro_f1": 1.0, + "num_tokens": 7071284.0, + "repeat_count": 1.0, + "routers_loss": 0.010723610408604145, + "skip_count": 2.0, + "step": 4384, + "text_loss": 0.13227243721485138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006800264780632495, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7074428.0, + "repeat_count": 1.0, + "routers_loss": 0.0011231007520109415, + "skip_count": 0.0, + "step": 4386, + "text_loss": 0.4360627233982086 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 20.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0006797376857657681, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 7078313.0, + "repeat_count": 2.0, + "routers_loss": 0.008419238030910492, + "skip_count": 1.0, + "step": 4388, + "text_loss": 0.5183924436569214 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006794488245924664, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 7081258.0, + "repeat_count": 1.0, + "routers_loss": 0.006582668516784906, + "skip_count": 3.0, + "step": 4390, + "text_loss": 0.2797473669052124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006791598946540368, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 7084527.0, + "repeat_count": 0.0, + "routers_loss": 0.00557357631623745, + "skip_count": 2.0, + "step": 4392, + "text_loss": 0.39495575428009033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0006788708960611975, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 7087675.0, + "repeat_count": 0.0, + "routers_loss": 0.007155992556363344, + "skip_count": 0.0, + "step": 4394, + "text_loss": 0.3785299062728882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.0006785818289246934, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7090171.0, + "repeat_count": 0.0, + "routers_loss": 0.0009265039698220789, + "skip_count": 0.0, + "step": 4396, + "text_loss": 0.42634522914886475 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 20.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006782926933552955, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 7092529.0, + "repeat_count": 1.0, + "routers_loss": 0.008679097518324852, + "skip_count": 7.0, + "step": 4398, + "text_loss": 0.4283660054206848 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006780034894638014, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7095141.0, + "repeat_count": 0.0, + "routers_loss": 0.002363949315622449, + "skip_count": 0.0, + "step": 4400, + "text_loss": 0.481539249420166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.000677714217361034, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7098208.0, + "repeat_count": 0.0, + "routers_loss": 0.004005146212875843, + "skip_count": 3.0, + "step": 4402, + "text_loss": 0.6443291902542114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006774248771578435, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7101681.0, + "repeat_count": 0.0, + "routers_loss": 0.0026864963583648205, + "skip_count": 0.0, + "step": 4404, + "text_loss": 0.16315312683582306 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 27.0, + "epoch": 20.68564719694746, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006771354689651054, + "loss": 0.005, + "macro_f1": 0.9449735879898071, + "num_tokens": 7104719.0, + "repeat_count": 2.0, + "routers_loss": 0.02719845622777939, + "skip_count": 4.0, + "step": 4406, + "text_loss": 0.37855592370033264 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0006768459928937213, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7108697.0, + "repeat_count": 0.0, + "routers_loss": 0.010488593950867653, + "skip_count": 0.0, + "step": 4408, + "text_loss": 0.23133711516857147 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 20.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0006765564490546193, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7111426.0, + "repeat_count": 1.0, + "routers_loss": 0.0013637891970574856, + "skip_count": 0.0, + "step": 4410, + "text_loss": 0.41399383544921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0732421875, + "learning_rate": 0.0006762668375587528, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7114241.0, + "repeat_count": 0.0, + "routers_loss": 0.000900395680218935, + "skip_count": 0.0, + "step": 4412, + "text_loss": 0.6460412740707397 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0006759771585171016, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7117031.0, + "repeat_count": 0.0, + "routers_loss": 0.0024001260753721, + "skip_count": 0.0, + "step": 4414, + "text_loss": 0.7645824551582336 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006756874120406714, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 7120766.0, + "repeat_count": 3.0, + "routers_loss": 0.005034091416746378, + "skip_count": 4.0, + "step": 4416, + "text_loss": 0.31753066182136536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0006753975982404934, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7125243.0, + "repeat_count": 0.0, + "routers_loss": 0.002483269665390253, + "skip_count": 0.0, + "step": 4418, + "text_loss": 0.5304268002510071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.751394188435572, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0006751077172276249, + "loss": 0.0052, + "macro_f1": 0.3272727429866791, + "num_tokens": 7127795.0, + "repeat_count": 0.0, + "routers_loss": 0.02676006779074669, + "skip_count": 1.0, + "step": 4420, + "text_loss": 0.22011354565620422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.000674817769113149, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7130837.0, + "repeat_count": 0.0, + "routers_loss": 0.003267093561589718, + "skip_count": 2.0, + "step": 4422, + "text_loss": 0.2906076908111572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 20.770179043146463, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.027099609375, + "learning_rate": 0.000674527754008174, + "loss": 0.0045, + "macro_f1": 0.5934640765190125, + "num_tokens": 7135090.0, + "repeat_count": 0.0, + "routers_loss": 0.022510390728712082, + "skip_count": 3.0, + "step": 4424, + "text_loss": 0.2544902563095093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006742376720238345, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 7138751.0, + "repeat_count": 0.0, + "routers_loss": 0.0011178571730852127, + "skip_count": 0.0, + "step": 4426, + "text_loss": 0.6811438798904419 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 20.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006739475232712904, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 7141762.0, + "repeat_count": 2.0, + "routers_loss": 0.005595206283032894, + "skip_count": 1.0, + "step": 4428, + "text_loss": 0.38743990659713745 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0006736573078617272, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7145235.0, + "repeat_count": 0.0, + "routers_loss": 0.002793942578136921, + "skip_count": 2.0, + "step": 4430, + "text_loss": 0.21894219517707825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 20.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0006733670259063561, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 7149042.0, + "repeat_count": 0.0, + "routers_loss": 0.006146818865090609, + "skip_count": 3.0, + "step": 4432, + "text_loss": 0.17822015285491943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 20.817141179923688, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006730766775164136, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 7152166.0, + "repeat_count": 0.0, + "routers_loss": 0.026045087724924088, + "skip_count": 2.0, + "step": 4434, + "text_loss": 0.2910420000553131 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 20.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0006727862628031618, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7155506.0, + "repeat_count": 2.0, + "routers_loss": 0.0022973387967795134, + "skip_count": 0.0, + "step": 4436, + "text_loss": 0.3502544164657593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0006724957818778882, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7158739.0, + "repeat_count": 0.0, + "routers_loss": 0.002357073128223419, + "skip_count": 1.0, + "step": 4438, + "text_loss": 0.26200664043426514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0006722052348519054, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 7161776.0, + "repeat_count": 0.0, + "routers_loss": 0.0005521026905626059, + "skip_count": 0.0, + "step": 4440, + "text_loss": 0.3922915458679199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 20.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000671914621836552, + "loss": 0.0106, + "macro_f1": 0.6666666865348816, + "num_tokens": 7164763.0, + "repeat_count": 0.0, + "routers_loss": 0.007691344246268272, + "skip_count": 2.0, + "step": 4442, + "text_loss": 0.6021351218223572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000671623942943191, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7167924.0, + "repeat_count": 0.0, + "routers_loss": 0.0032181134447455406, + "skip_count": 0.0, + "step": 4444, + "text_loss": 0.23639555275440216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.873495744056356, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0006713331982832113, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 7170743.0, + "repeat_count": 1.0, + "routers_loss": 0.024979131296277046, + "skip_count": 0.0, + "step": 4446, + "text_loss": 0.4957772493362427 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006710423879680271, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7174660.0, + "repeat_count": 0.0, + "routers_loss": 0.002571308286860585, + "skip_count": 0.0, + "step": 4448, + "text_loss": 0.47968071699142456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000670751512109077, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7177965.0, + "repeat_count": 0.0, + "routers_loss": 0.00212799571454525, + "skip_count": 0.0, + "step": 4450, + "text_loss": 0.6550716161727905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006704605708178252, + "loss": 0.0107, + "macro_f1": 0.6666666865348816, + "num_tokens": 7181512.0, + "repeat_count": 0.0, + "routers_loss": 0.004176430404186249, + "skip_count": 1.0, + "step": 4452, + "text_loss": 0.36959558725357056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0006701695642057613, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7184555.0, + "repeat_count": 0.0, + "routers_loss": 0.0010968588758260012, + "skip_count": 0.0, + "step": 4454, + "text_loss": 0.6686749458312988 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0006698784923843993, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7187474.0, + "repeat_count": 0.0, + "routers_loss": 0.0014241471653804183, + "skip_count": 0.0, + "step": 4456, + "text_loss": 0.6147221922874451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006695873554652784, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7190649.0, + "repeat_count": 0.0, + "routers_loss": 0.008801907300949097, + "skip_count": 0.0, + "step": 4458, + "text_loss": 0.26381927728652954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006692961535599634, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 7193961.0, + "repeat_count": 0.0, + "routers_loss": 0.009027508087456226, + "skip_count": 1.0, + "step": 4460, + "text_loss": 0.1926470547914505 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006690048867800427, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7197456.0, + "repeat_count": 0.0, + "routers_loss": 0.0022697453387081623, + "skip_count": 0.0, + "step": 4462, + "text_loss": 0.6736721992492676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006687135552371305, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7200290.0, + "repeat_count": 0.0, + "routers_loss": 0.006747903767973185, + "skip_count": 1.0, + "step": 4464, + "text_loss": 0.2026437371969223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006684221590428657, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7203320.0, + "repeat_count": 0.0, + "routers_loss": 0.0011565096210688353, + "skip_count": 0.0, + "step": 4466, + "text_loss": 0.7587730288505554 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 20.976812444966246, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0006681306983089121, + "loss": 0.0083, + "macro_f1": 0.8820862174034119, + "num_tokens": 7206411.0, + "repeat_count": 2.0, + "routers_loss": 0.023645581677556038, + "skip_count": 2.0, + "step": 4468, + "text_loss": 0.8981561660766602 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 20.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006678391731469575, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 7209421.0, + "repeat_count": 0.0, + "routers_loss": 0.0035848666448146105, + "skip_count": 0.0, + "step": 4470, + "text_loss": 0.1522839516401291 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 20.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006675475836687152, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 7212267.0, + "repeat_count": 1.0, + "routers_loss": 0.005046425387263298, + "skip_count": 1.0, + "step": 4472, + "text_loss": 0.46007999777793884 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006672559299859228, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7215195.0, + "repeat_count": 0.0, + "routers_loss": 0.0019333874806761742, + "skip_count": 0.0, + "step": 4474, + "text_loss": 1.0859547853469849 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006669642122103423, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7217941.0, + "repeat_count": 0.0, + "routers_loss": 0.0005401032394729555, + "skip_count": 0.0, + "step": 4476, + "text_loss": 0.9754356145858765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.023481068388612, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006666724304537611, + "loss": 0.0053, + "macro_f1": 0.3272727429866791, + "num_tokens": 7222494.0, + "repeat_count": 1.0, + "routers_loss": 0.015569722279906273, + "skip_count": 0.0, + "step": 4478, + "text_loss": 0.2896423637866974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006663805848279898, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7225292.0, + "repeat_count": 0.0, + "routers_loss": 0.0020135147497057915, + "skip_count": 0.0, + "step": 4480, + "text_loss": 0.8492724299430847 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006660886754448648, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 7229184.0, + "repeat_count": 1.0, + "routers_loss": 0.002355351345613599, + "skip_count": 0.0, + "step": 4482, + "text_loss": 0.189764603972435 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0006657967024162459, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7232906.0, + "repeat_count": 0.0, + "routers_loss": 0.003044391982257366, + "skip_count": 0.0, + "step": 4484, + "text_loss": 0.4239847660064697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006655046658540179, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 7235996.0, + "repeat_count": 0.0, + "routers_loss": 0.00602696230635047, + "skip_count": 2.0, + "step": 4486, + "text_loss": 0.217103973031044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0169677734375, + "learning_rate": 0.0006652125658700896, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 7238882.0, + "repeat_count": 0.0, + "routers_loss": 0.001470155781134963, + "skip_count": 1.0, + "step": 4488, + "text_loss": 0.6090770363807678 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006649204025763945, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 7241815.0, + "repeat_count": 1.0, + "routers_loss": 0.008737480267882347, + "skip_count": 2.0, + "step": 4490, + "text_loss": 0.48314425349235535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.0006646281760848902, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 7244848.0, + "repeat_count": 0.0, + "routers_loss": 0.0008257135050371289, + "skip_count": 0.0, + "step": 4492, + "text_loss": 0.5884748101234436 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006643358865075581, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7247930.0, + "repeat_count": 0.0, + "routers_loss": 0.0016262239078059793, + "skip_count": 0.0, + "step": 4494, + "text_loss": 0.21444730460643768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0006640435339564042, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7251776.0, + "repeat_count": 0.0, + "routers_loss": 0.001315156347118318, + "skip_count": 0.0, + "step": 4496, + "text_loss": 0.6890370845794678 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006637511185434588, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 7255070.0, + "repeat_count": 1.0, + "routers_loss": 0.007614497095346451, + "skip_count": 3.0, + "step": 4498, + "text_loss": 0.516417920589447 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 21.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0006634586403807758, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 7258115.0, + "repeat_count": 3.0, + "routers_loss": 0.004906686954200268, + "skip_count": 2.0, + "step": 4500, + "text_loss": 0.577463686466217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.13619019665395, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006631660995804334, + "loss": 0.0067, + "macro_f1": 0.6601307392120361, + "num_tokens": 7260769.0, + "repeat_count": 1.0, + "routers_loss": 0.013337121345102787, + "skip_count": 2.0, + "step": 4502, + "text_loss": 0.37124839425086975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006628734962545339, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7263908.0, + "repeat_count": 0.0, + "routers_loss": 0.0023418180644512177, + "skip_count": 0.0, + "step": 4504, + "text_loss": 0.17937727272510529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0006625808305152033, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7267391.0, + "repeat_count": 0.0, + "routers_loss": 0.0006556165171787143, + "skip_count": 0.0, + "step": 4506, + "text_loss": 0.45344987511634827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006622881024745919, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 7271402.0, + "repeat_count": 0.0, + "routers_loss": 0.0021988123189657927, + "skip_count": 0.0, + "step": 4508, + "text_loss": 0.5842905640602112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006619953122448734, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 7274354.0, + "repeat_count": 0.0, + "routers_loss": 0.00774174090474844, + "skip_count": 2.0, + "step": 4510, + "text_loss": 0.27159228920936584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0006617024599382456, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7277378.0, + "repeat_count": 0.0, + "routers_loss": 0.0006942499312572181, + "skip_count": 0.0, + "step": 4512, + "text_loss": 0.4464176297187805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0006614095456669302, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7280526.0, + "repeat_count": 0.0, + "routers_loss": 0.003003394464030862, + "skip_count": 0.0, + "step": 4514, + "text_loss": 0.31188079714775085 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006611165695431725, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7283916.0, + "repeat_count": 0.0, + "routers_loss": 0.0006948060472495854, + "skip_count": 0.0, + "step": 4516, + "text_loss": 0.5266574025154114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006608235316792413, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7286843.0, + "repeat_count": 0.0, + "routers_loss": 0.0014080886030569673, + "skip_count": 0.0, + "step": 4518, + "text_loss": 0.5880120396614075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006605304321874295, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7289940.0, + "repeat_count": 0.0, + "routers_loss": 0.0016894340515136719, + "skip_count": 0.0, + "step": 4520, + "text_loss": 0.6623797416687012 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006602372711800531, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7292869.0, + "repeat_count": 0.0, + "routers_loss": 0.003522444050759077, + "skip_count": 0.0, + "step": 4522, + "text_loss": 0.5488807559013367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006599440487694521, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 7296618.0, + "repeat_count": 0.0, + "routers_loss": 0.0011981099378317595, + "skip_count": 0.0, + "step": 4524, + "text_loss": 0.4128517210483551 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.248899324919282, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00065965076506799, + "loss": 0.0047, + "macro_f1": 0.9262410998344421, + "num_tokens": 7300481.0, + "repeat_count": 3.0, + "routers_loss": 0.010548194870352745, + "skip_count": 2.0, + "step": 4526, + "text_loss": 0.26450902223587036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006593574201880536, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7303272.0, + "repeat_count": 0.0, + "routers_loss": 0.005642973352223635, + "skip_count": 1.0, + "step": 4528, + "text_loss": 0.35269856452941895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000659064014242053, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 7306615.0, + "repeat_count": 0.0, + "routers_loss": 0.004171932581812143, + "skip_count": 1.0, + "step": 4530, + "text_loss": 0.18814080953598022 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006587705473424223, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 7310368.0, + "repeat_count": 0.0, + "routers_loss": 0.002289367141202092, + "skip_count": 2.0, + "step": 4532, + "text_loss": 0.7363705635070801 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000658477019601618, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 7313788.0, + "repeat_count": 0.0, + "routers_loss": 0.004440625663846731, + "skip_count": 1.0, + "step": 4534, + "text_loss": 0.8126176595687866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006581834311321211, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 7317864.0, + "repeat_count": 0.0, + "routers_loss": 0.0013160990783944726, + "skip_count": 2.0, + "step": 4536, + "text_loss": 0.7015916109085083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.000657889782046435, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7320693.0, + "repeat_count": 0.0, + "routers_loss": 0.0032275544945150614, + "skip_count": 2.0, + "step": 4538, + "text_loss": 0.6481677293777466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.314646316407398, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0006575960724570865, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 7324335.0, + "repeat_count": 0.0, + "routers_loss": 0.009769129566848278, + "skip_count": 1.0, + "step": 4540, + "text_loss": 0.22194676101207733 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006573023024766258, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 7327431.0, + "repeat_count": 2.0, + "routers_loss": 0.0036973082460463047, + "skip_count": 4.0, + "step": 4542, + "text_loss": 0.475127637386322 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.000657008472217626, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7330262.0, + "repeat_count": 0.0, + "routers_loss": 0.0007046440150588751, + "skip_count": 0.0, + "step": 4544, + "text_loss": 0.2649917006492615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0006567145817926836, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7333110.0, + "repeat_count": 0.0, + "routers_loss": 0.0026714997366070747, + "skip_count": 0.0, + "step": 4546, + "text_loss": 0.5490524768829346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0006564206313144175, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7336101.0, + "repeat_count": 0.0, + "routers_loss": 0.006552211008965969, + "skip_count": 0.0, + "step": 4548, + "text_loss": 0.14098678529262543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0006561266208954707, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 7339435.0, + "repeat_count": 0.0, + "routers_loss": 0.0035560601390898228, + "skip_count": 2.0, + "step": 4550, + "text_loss": 0.20412275195121765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006558325506485081, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7342609.0, + "repeat_count": 0.0, + "routers_loss": 0.0020106974989175797, + "skip_count": 1.0, + "step": 4552, + "text_loss": 0.6184256076812744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050537109375, + "learning_rate": 0.0006555384206862183, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 7345614.0, + "repeat_count": 0.0, + "routers_loss": 0.0014235252747312188, + "skip_count": 0.0, + "step": 4554, + "text_loss": 1.0108838081359863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.389785735250953, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0006552442311213121, + "loss": 0.0041, + "macro_f1": 0.3272727429866791, + "num_tokens": 7348957.0, + "repeat_count": 1.0, + "routers_loss": 0.01703745685517788, + "skip_count": 0.0, + "step": 4556, + "text_loss": 0.21315747499465942 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 21.399178162606397, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0006549499820665237, + "loss": 0.0077, + "macro_f1": 0.5934640765190125, + "num_tokens": 7352724.0, + "repeat_count": 0.0, + "routers_loss": 0.013315381482243538, + "skip_count": 3.0, + "step": 4558, + "text_loss": 0.34369465708732605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00065465567363461, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7356592.0, + "repeat_count": 0.0, + "routers_loss": 0.0017354936571791768, + "skip_count": 0.0, + "step": 4560, + "text_loss": 0.6267461180686951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0006543613059383503, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7359774.0, + "repeat_count": 0.0, + "routers_loss": 0.011646085418760777, + "skip_count": 2.0, + "step": 4562, + "text_loss": 0.4400193989276886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006540668790905471, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7362765.0, + "repeat_count": 0.0, + "routers_loss": 0.0019345436012372375, + "skip_count": 0.0, + "step": 4564, + "text_loss": 0.49204275012016296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006537723932040251, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7366337.0, + "repeat_count": 0.0, + "routers_loss": 0.00562885170802474, + "skip_count": 1.0, + "step": 4566, + "text_loss": 0.22566382586956024 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006534778483916319, + "loss": 0.0084, + "macro_f1": 1.0, + "num_tokens": 7369851.0, + "repeat_count": 2.0, + "routers_loss": 0.005508176051080227, + "skip_count": 2.0, + "step": 4568, + "text_loss": 0.8057850003242493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006531832447662377, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7373918.0, + "repeat_count": 0.0, + "routers_loss": 0.006460923235863447, + "skip_count": 2.0, + "step": 4570, + "text_loss": 0.5141497254371643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006528885824407351, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7376674.0, + "repeat_count": 0.0, + "routers_loss": 0.0032120654359459877, + "skip_count": 0.0, + "step": 4572, + "text_loss": 0.1281338930130005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052490234375, + "learning_rate": 0.0006525938615280394, + "loss": 0.0116, + "macro_f1": 0.3333333432674408, + "num_tokens": 7379791.0, + "repeat_count": 0.0, + "routers_loss": 0.00443810923025012, + "skip_count": 0.0, + "step": 4574, + "text_loss": 0.268352210521698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.000652299082141088, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 7382886.0, + "repeat_count": 0.0, + "routers_loss": 0.008284369483590126, + "skip_count": 2.0, + "step": 4576, + "text_loss": 0.30193832516670227 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.493102436160846, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006520042443928411, + "loss": 0.0068, + "macro_f1": 0.8823530077934265, + "num_tokens": 7386036.0, + "repeat_count": 2.0, + "routers_loss": 0.03383317217230797, + "skip_count": 1.0, + "step": 4578, + "text_loss": 0.23106542229652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.000651709348396281, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7388908.0, + "repeat_count": 0.0, + "routers_loss": 0.0017075951909646392, + "skip_count": 1.0, + "step": 4580, + "text_loss": 0.386099249124527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006514143942644124, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7392004.0, + "repeat_count": 0.0, + "routers_loss": 0.009516917169094086, + "skip_count": 1.0, + "step": 4582, + "text_loss": 0.3162059485912323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051513671875, + "learning_rate": 0.0006511193821102623, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 7395538.0, + "repeat_count": 0.0, + "routers_loss": 0.0031392278615385294, + "skip_count": 0.0, + "step": 4584, + "text_loss": 0.5536221861839294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006508243120468799, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7398461.0, + "repeat_count": 0.0, + "routers_loss": 0.0014138511614874005, + "skip_count": 0.0, + "step": 4586, + "text_loss": 0.7934318780899048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006505291841873367, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7401611.0, + "repeat_count": 0.0, + "routers_loss": 0.0005265916115604341, + "skip_count": 0.0, + "step": 4588, + "text_loss": 0.4569905698299408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.000650233998644726, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7404641.0, + "repeat_count": 0.0, + "routers_loss": 0.0024988956283777952, + "skip_count": 0.0, + "step": 4590, + "text_loss": 0.49998772144317627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0006499387555321636, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7407574.0, + "repeat_count": 0.0, + "routers_loss": 0.004110113717615604, + "skip_count": 1.0, + "step": 4592, + "text_loss": 0.5679413676261902 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006496434549627874, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7410806.0, + "repeat_count": 0.0, + "routers_loss": 0.0032845588866621256, + "skip_count": 0.0, + "step": 4594, + "text_loss": 0.35515281558036804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006493480970497568, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7413402.0, + "repeat_count": 0.0, + "routers_loss": 0.010577172972261906, + "skip_count": 1.0, + "step": 4596, + "text_loss": 0.26111698150634766 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006490526819062537, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 7417236.0, + "repeat_count": 1.0, + "routers_loss": 0.002054794691503048, + "skip_count": 2.0, + "step": 4598, + "text_loss": 0.6480993628501892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07958984375, + "learning_rate": 0.0006487572096454818, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7420278.0, + "repeat_count": 0.0, + "routers_loss": 0.0017989084590226412, + "skip_count": 0.0, + "step": 4600, + "text_loss": 0.4935401678085327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006484616803806665, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7423866.0, + "repeat_count": 0.0, + "routers_loss": 0.006671485956758261, + "skip_count": 1.0, + "step": 4602, + "text_loss": 0.15030258893966675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0006481660942250552, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7426884.0, + "repeat_count": 0.0, + "routers_loss": 0.008334980346262455, + "skip_count": 3.0, + "step": 4604, + "text_loss": 0.29933279752731323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006478704512919173, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 7431017.0, + "repeat_count": 0.0, + "routers_loss": 0.011923984624445438, + "skip_count": 3.0, + "step": 4606, + "text_loss": 0.35141825675964355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0006475747516945432, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7434406.0, + "repeat_count": 0.0, + "routers_loss": 0.0031092462595552206, + "skip_count": 3.0, + "step": 4608, + "text_loss": 0.21021464467048645 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.000647278995546246, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7437204.0, + "repeat_count": 1.0, + "routers_loss": 0.0006713552866131067, + "skip_count": 0.0, + "step": 4610, + "text_loss": 0.4052635431289673 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006469831829603598, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7439741.0, + "repeat_count": 0.0, + "routers_loss": 0.0022583482787013054, + "skip_count": 2.0, + "step": 4612, + "text_loss": 0.5443860292434692 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006466873140502407, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 7443619.0, + "repeat_count": 0.0, + "routers_loss": 0.004187075886875391, + "skip_count": 2.0, + "step": 4614, + "text_loss": 0.30709847807884216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006463913889292661, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 7446696.0, + "repeat_count": 0.0, + "routers_loss": 0.008314833045005798, + "skip_count": 0.0, + "step": 4616, + "text_loss": 0.22949637472629547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006460954077108353, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7450377.0, + "repeat_count": 0.0, + "routers_loss": 0.001277514616958797, + "skip_count": 0.0, + "step": 4618, + "text_loss": 0.37715134024620056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006457993705083684, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 7453271.0, + "repeat_count": 0.0, + "routers_loss": 0.0022756033577024937, + "skip_count": 2.0, + "step": 4620, + "text_loss": 0.7373883128166199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0006455032774353078, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7456492.0, + "repeat_count": 0.0, + "routers_loss": 0.0039057908579707146, + "skip_count": 2.0, + "step": 4622, + "text_loss": 0.5058769583702087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0006452071286051169, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 7459619.0, + "repeat_count": 0.0, + "routers_loss": 0.0019458672031760216, + "skip_count": 0.0, + "step": 4624, + "text_loss": 0.5110082030296326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0006449109241312802, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 7462552.0, + "repeat_count": 0.0, + "routers_loss": 0.0002716891176532954, + "skip_count": 1.0, + "step": 4626, + "text_loss": 0.6197522878646851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006446146641273042, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7466769.0, + "repeat_count": 0.0, + "routers_loss": 0.0037578947376459837, + "skip_count": 2.0, + "step": 4628, + "text_loss": 0.1653924286365509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.000644318348706716, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 7470216.0, + "repeat_count": 0.0, + "routers_loss": 0.0012791058979928493, + "skip_count": 0.0, + "step": 4630, + "text_loss": 0.7114694118499756 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0006440219779830643, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 7472975.0, + "repeat_count": 0.0, + "routers_loss": 0.00736592011526227, + "skip_count": 2.0, + "step": 4632, + "text_loss": 0.26601463556289673 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000643725552069919, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7475672.0, + "repeat_count": 0.0, + "routers_loss": 0.00045455715735442936, + "skip_count": 0.0, + "step": 4634, + "text_loss": 0.5028402805328369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0006434290710808711, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7478850.0, + "repeat_count": 0.0, + "routers_loss": 0.004247233271598816, + "skip_count": 2.0, + "step": 4636, + "text_loss": 0.12746070325374603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 21.774875256824185, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04052734375, + "learning_rate": 0.0006431325351295324, + "loss": 0.0083, + "macro_f1": 0.5427350401878357, + "num_tokens": 7481747.0, + "repeat_count": 1.0, + "routers_loss": 0.047564394772052765, + "skip_count": 2.0, + "step": 4638, + "text_loss": 0.24056802690029144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0006428359443295362, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7484885.0, + "repeat_count": 0.0, + "routers_loss": 0.0011175100225955248, + "skip_count": 0.0, + "step": 4640, + "text_loss": 0.6265338063240051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 21.793660111535075, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006425392987945369, + "loss": 0.0086, + "macro_f1": 0.5492662787437439, + "num_tokens": 7487973.0, + "repeat_count": 0.0, + "routers_loss": 0.016879938542842865, + "skip_count": 2.0, + "step": 4642, + "text_loss": 0.2523447275161743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 21.80305253889052, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.032958984375, + "learning_rate": 0.0006422425986382093, + "loss": 0.0055, + "macro_f1": 0.5934640765190125, + "num_tokens": 7491024.0, + "repeat_count": 0.0, + "routers_loss": 0.018616504967212677, + "skip_count": 3.0, + "step": 4644, + "text_loss": 0.38890624046325684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.812444966245963, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0006419458439742496, + "loss": 0.0056, + "macro_f1": 0.3272727429866791, + "num_tokens": 7494199.0, + "repeat_count": 0.0, + "routers_loss": 0.023129139095544815, + "skip_count": 1.0, + "step": 4646, + "text_loss": 0.4060848355293274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006416490349163747, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 7497287.0, + "repeat_count": 0.0, + "routers_loss": 0.0018601802876219153, + "skip_count": 0.0, + "step": 4648, + "text_loss": 0.3387545943260193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0006413521715783225, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 7500598.0, + "repeat_count": 0.0, + "routers_loss": 0.0017482215771451592, + "skip_count": 0.0, + "step": 4650, + "text_loss": 0.4290996193885803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.840622248312297, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0006410552540738514, + "loss": 0.007, + "macro_f1": 0.3272727429866791, + "num_tokens": 7503252.0, + "repeat_count": 1.0, + "routers_loss": 0.0420118011534214, + "skip_count": 0.0, + "step": 4652, + "text_loss": 0.439496248960495 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.000640758282516741, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 7506382.0, + "repeat_count": 1.0, + "routers_loss": 0.0017782216891646385, + "skip_count": 1.0, + "step": 4654, + "text_loss": 0.8513308167457581 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006404612570207911, + "loss": 0.0102, + "macro_f1": 0.3272727429866791, + "num_tokens": 7510423.0, + "repeat_count": 0.0, + "routers_loss": 0.010385853238403797, + "skip_count": 0.0, + "step": 4656, + "text_loss": 0.7159742712974548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006401641776998223, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7513394.0, + "repeat_count": 0.0, + "routers_loss": 0.0011917101219296455, + "skip_count": 0.0, + "step": 4658, + "text_loss": 0.6165401339530945 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006398670446676766, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7516828.0, + "repeat_count": 3.0, + "routers_loss": 0.008860073052346706, + "skip_count": 4.0, + "step": 4660, + "text_loss": 0.923275887966156 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0006395698580382153, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7519764.0, + "repeat_count": 0.0, + "routers_loss": 0.000505418807733804, + "skip_count": 0.0, + "step": 4662, + "text_loss": 0.6143050789833069 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006392726179253212, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7522390.0, + "repeat_count": 0.0, + "routers_loss": 0.004020806401968002, + "skip_count": 1.0, + "step": 4664, + "text_loss": 0.6935067176818848 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0006389753244428972, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 7525821.0, + "repeat_count": 1.0, + "routers_loss": 0.00957963801920414, + "skip_count": 2.0, + "step": 4666, + "text_loss": 0.3350338637828827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.915761667155856, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0006386779777048666, + "loss": 0.0063, + "macro_f1": 0.6601307392120361, + "num_tokens": 7529513.0, + "repeat_count": 1.0, + "routers_loss": 0.020673364400863647, + "skip_count": 2.0, + "step": 4668, + "text_loss": 0.47800472378730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006383805778251735, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 7533450.0, + "repeat_count": 0.0, + "routers_loss": 0.007217096630483866, + "skip_count": 1.0, + "step": 4670, + "text_loss": 0.4506106972694397 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 21.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0006380831249177817, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 7536287.0, + "repeat_count": 1.0, + "routers_loss": 0.007001714315265417, + "skip_count": 0.0, + "step": 4672, + "text_loss": 0.4081715941429138 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0006377856190966762, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 7539442.0, + "repeat_count": 0.0, + "routers_loss": 0.0015112817054614425, + "skip_count": 0.0, + "step": 4674, + "text_loss": 0.21451139450073242 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 21.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0006374880604758615, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 7542594.0, + "repeat_count": 0.0, + "routers_loss": 0.007311929017305374, + "skip_count": 2.0, + "step": 4676, + "text_loss": 0.14785248041152954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 21.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0006371904491693626, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7545780.0, + "repeat_count": 0.0, + "routers_loss": 0.007489737123250961, + "skip_count": 1.0, + "step": 4678, + "text_loss": 0.2248108983039856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 21.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006368927852912247, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 7548287.0, + "repeat_count": 1.0, + "routers_loss": 0.009772555902600288, + "skip_count": 1.0, + "step": 4680, + "text_loss": 0.1566995233297348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 21.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006365950689555133, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7551424.0, + "repeat_count": 0.0, + "routers_loss": 0.002134992741048336, + "skip_count": 0.0, + "step": 4682, + "text_loss": 0.7322417497634888 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 21.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0006362973002763139, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7554182.0, + "repeat_count": 1.0, + "routers_loss": 0.008511497639119625, + "skip_count": 4.0, + "step": 4684, + "text_loss": 0.24387991428375244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.0006359994793677319, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 7557044.0, + "repeat_count": 0.0, + "routers_loss": 0.004151526838541031, + "skip_count": 2.0, + "step": 4686, + "text_loss": 0.6139411330223083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006357016063438928, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7560231.0, + "repeat_count": 0.0, + "routers_loss": 0.0009724601986818016, + "skip_count": 0.0, + "step": 4688, + "text_loss": 0.7875718474388123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0006354036813189421, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7562953.0, + "repeat_count": 0.0, + "routers_loss": 0.0008926765876822174, + "skip_count": 0.0, + "step": 4690, + "text_loss": 0.5195512771606445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006351057044070455, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 7566137.0, + "repeat_count": 0.0, + "routers_loss": 0.0031294538639485836, + "skip_count": 0.0, + "step": 4692, + "text_loss": 0.7288873195648193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0006348076757223877, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 7569073.0, + "repeat_count": 0.0, + "routers_loss": 0.0015065820189192891, + "skip_count": 2.0, + "step": 4694, + "text_loss": 0.7242236137390137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0006345095953791746, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7573025.0, + "repeat_count": 0.0, + "routers_loss": 0.0005603441968560219, + "skip_count": 0.0, + "step": 4696, + "text_loss": 0.34443899989128113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0006342114634916307, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 7576546.0, + "repeat_count": 0.0, + "routers_loss": 0.0011047758162021637, + "skip_count": 0.0, + "step": 4698, + "text_loss": 0.4892682731151581 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0006339132801740008, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 7580711.0, + "repeat_count": 0.0, + "routers_loss": 0.0019803126342594624, + "skip_count": 2.0, + "step": 4700, + "text_loss": 0.4479489028453827 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.0006336150455405494, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 7583385.0, + "repeat_count": 1.0, + "routers_loss": 0.0005326359532773495, + "skip_count": 0.0, + "step": 4702, + "text_loss": 0.627504825592041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0006333167597055604, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 7586584.0, + "repeat_count": 0.0, + "routers_loss": 0.0005587987834587693, + "skip_count": 0.0, + "step": 4704, + "text_loss": 0.43891432881355286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0006330184227833376, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 7590408.0, + "repeat_count": 0.0, + "routers_loss": 0.007053783163428307, + "skip_count": 2.0, + "step": 4706, + "text_loss": 0.19946859776973724 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0006327200348882043, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7593857.0, + "repeat_count": 1.0, + "routers_loss": 0.0009479080326855183, + "skip_count": 0.0, + "step": 4708, + "text_loss": 0.7973214387893677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006324215961345032, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7596429.0, + "repeat_count": 0.0, + "routers_loss": 0.0012403312139213085, + "skip_count": 0.0, + "step": 4710, + "text_loss": 0.48477989435195923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006321231066365966, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7599618.0, + "repeat_count": 0.0, + "routers_loss": 0.0005520360427908599, + "skip_count": 0.0, + "step": 4712, + "text_loss": 0.44222453236579895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006318245665088665, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 7603180.0, + "repeat_count": 0.0, + "routers_loss": 0.0015553623670712113, + "skip_count": 0.0, + "step": 4714, + "text_loss": 0.5132410526275635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0006315259758657138, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 7606457.0, + "repeat_count": 0.0, + "routers_loss": 0.004210884217172861, + "skip_count": 1.0, + "step": 4716, + "text_loss": 0.39850690960884094 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0006312273348215589, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 7609317.0, + "repeat_count": 1.0, + "routers_loss": 0.001220117206685245, + "skip_count": 0.0, + "step": 4718, + "text_loss": 0.3509018123149872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006309286434908419, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 7613076.0, + "repeat_count": 0.0, + "routers_loss": 0.007768960203975439, + "skip_count": 2.0, + "step": 4720, + "text_loss": 0.33361560106277466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006306299019880217, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7616242.0, + "repeat_count": 0.0, + "routers_loss": 0.006226699333637953, + "skip_count": 0.0, + "step": 4722, + "text_loss": 0.23661087453365326 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.17845611975345, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006303311104275766, + "loss": 0.0073, + "macro_f1": 0.6603773832321167, + "num_tokens": 7619069.0, + "repeat_count": 1.0, + "routers_loss": 0.015590761788189411, + "skip_count": 1.0, + "step": 4724, + "text_loss": 0.23373056948184967 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006300322689240041, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 7622581.0, + "repeat_count": 1.0, + "routers_loss": 0.006862971931695938, + "skip_count": 2.0, + "step": 4726, + "text_loss": 0.8301828503608704 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0006297333775918209, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 7625566.0, + "repeat_count": 1.0, + "routers_loss": 0.006256614346057177, + "skip_count": 1.0, + "step": 4728, + "text_loss": 0.3756707012653351 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0006294344365455626, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 7629047.0, + "repeat_count": 1.0, + "routers_loss": 0.009151885285973549, + "skip_count": 2.0, + "step": 4730, + "text_loss": 0.33362850546836853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006291354458997841, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 7631847.0, + "repeat_count": 0.0, + "routers_loss": 0.0009307434665970504, + "skip_count": 0.0, + "step": 4732, + "text_loss": 0.4572524130344391 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0006288364057690591, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7635181.0, + "repeat_count": 0.0, + "routers_loss": 0.00041220212006010115, + "skip_count": 0.0, + "step": 4734, + "text_loss": 0.40211325883865356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0006285373162679804, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7637752.0, + "repeat_count": 0.0, + "routers_loss": 0.0006696670898236334, + "skip_count": 2.0, + "step": 4736, + "text_loss": 0.7588053345680237 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 22.24420311124156, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006282381775111597, + "loss": 0.0081, + "macro_f1": 0.9449735879898071, + "num_tokens": 7640719.0, + "repeat_count": 4.0, + "routers_loss": 0.016283133998513222, + "skip_count": 2.0, + "step": 4738, + "text_loss": 0.5697863101959229 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0006279389896132274, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 7643524.0, + "repeat_count": 0.0, + "routers_loss": 0.00763951288536191, + "skip_count": 3.0, + "step": 4740, + "text_loss": 0.548592209815979 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.26298796595245, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006276397526888329, + "loss": 0.0094, + "macro_f1": 0.925203263759613, + "num_tokens": 7646919.0, + "repeat_count": 3.0, + "routers_loss": 0.038590483367443085, + "skip_count": 5.0, + "step": 4742, + "text_loss": 0.27226054668426514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0006273404668526443, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7650404.0, + "repeat_count": 0.0, + "routers_loss": 0.0012555639259517193, + "skip_count": 0.0, + "step": 4744, + "text_loss": 0.47892290353775024 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0006270411322193488, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7652942.0, + "repeat_count": 1.0, + "routers_loss": 0.0015356402145698667, + "skip_count": 0.0, + "step": 4746, + "text_loss": 0.5515767931938171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0006267417489036517, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 7656269.0, + "repeat_count": 0.0, + "routers_loss": 0.005182140972465277, + "skip_count": 0.0, + "step": 4748, + "text_loss": 0.3496028184890747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.0006264423170202773, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7658664.0, + "repeat_count": 0.0, + "routers_loss": 0.004144361708313227, + "skip_count": 0.0, + "step": 4750, + "text_loss": 0.2786032557487488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0006261428366839685, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 7661471.0, + "repeat_count": 0.0, + "routers_loss": 0.00035335420398041606, + "skip_count": 0.0, + "step": 4752, + "text_loss": 0.4838487505912781 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0006258433080094868, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 7664593.0, + "repeat_count": 0.0, + "routers_loss": 0.0103341368958354, + "skip_count": 2.0, + "step": 4754, + "text_loss": 0.24325360357761383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0006255437311116119, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 7667573.0, + "repeat_count": 0.0, + "routers_loss": 0.014633853919804096, + "skip_count": 2.0, + "step": 4756, + "text_loss": 0.21569855511188507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0006252441061051426, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7671171.0, + "repeat_count": 0.0, + "routers_loss": 0.004900569561868906, + "skip_count": 0.0, + "step": 4758, + "text_loss": 0.12832018733024597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006249444331048955, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 7673932.0, + "repeat_count": 0.0, + "routers_loss": 0.0020371589343994856, + "skip_count": 0.0, + "step": 4760, + "text_loss": 0.38652482628822327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.000624644712225706, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7677396.0, + "repeat_count": 0.0, + "routers_loss": 0.0028059002943336964, + "skip_count": 2.0, + "step": 4762, + "text_loss": 0.7937633395195007 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0006243449435824276, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 7680392.0, + "repeat_count": 0.0, + "routers_loss": 0.0007225095760077238, + "skip_count": 0.0, + "step": 4764, + "text_loss": 0.5690395832061768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006240451272899321, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 7684121.0, + "repeat_count": 0.0, + "routers_loss": 0.002052050782367587, + "skip_count": 1.0, + "step": 4766, + "text_loss": 0.5321336984634399 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006237452634631099, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 7687236.0, + "repeat_count": 1.0, + "routers_loss": 0.0039039517287164927, + "skip_count": 0.0, + "step": 4768, + "text_loss": 0.30823320150375366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 22.394481948928675, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0006234453522168694, + "loss": 0.0084, + "macro_f1": 0.5492662787437439, + "num_tokens": 7690355.0, + "repeat_count": 0.0, + "routers_loss": 0.014570238068699837, + "skip_count": 2.0, + "step": 4770, + "text_loss": 0.21501587331295013 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 22.403874376284122, + "f1_execute": 0.949999988079071, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.04541015625, + "learning_rate": 0.000623145393666137, + "loss": 0.0069, + "macro_f1": 0.886363685131073, + "num_tokens": 7693559.0, + "repeat_count": 3.0, + "routers_loss": 0.061707716435194016, + "skip_count": 6.0, + "step": 4772, + "text_loss": 0.24371100962162018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0006228453879258576, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 7696422.0, + "repeat_count": 0.0, + "routers_loss": 0.005053870379924774, + "skip_count": 2.0, + "step": 4774, + "text_loss": 0.237778440117836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0006225453351109934, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 7700460.0, + "repeat_count": 0.0, + "routers_loss": 0.0017990898340940475, + "skip_count": 0.0, + "step": 4776, + "text_loss": 0.612456738948822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.000622245235336526, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7703330.0, + "repeat_count": 0.0, + "routers_loss": 0.004507021512836218, + "skip_count": 2.0, + "step": 4778, + "text_loss": 0.36898812651634216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006219450887174537, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7707243.0, + "repeat_count": 0.0, + "routers_loss": 0.006295828148722649, + "skip_count": 1.0, + "step": 4780, + "text_loss": 0.14474599063396454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006216448953687932, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7711121.0, + "repeat_count": 0.0, + "routers_loss": 0.005049831233918667, + "skip_count": 0.0, + "step": 4782, + "text_loss": 0.4696790277957916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0006213446554055795, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7714889.0, + "repeat_count": 0.0, + "routers_loss": 0.0006010758224874735, + "skip_count": 0.0, + "step": 4784, + "text_loss": 0.46253830194473267 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 22.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006210443689428649, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 7718420.0, + "repeat_count": 3.0, + "routers_loss": 0.006691234186291695, + "skip_count": 1.0, + "step": 4786, + "text_loss": 0.579987645149231 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00062074403609572, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 7721720.0, + "repeat_count": 0.0, + "routers_loss": 0.001864895923063159, + "skip_count": 0.0, + "step": 4788, + "text_loss": 0.325242817401886 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0006204436569792324, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 7724916.0, + "repeat_count": 0.0, + "routers_loss": 0.00202955212444067, + "skip_count": 0.0, + "step": 4790, + "text_loss": 0.49637556076049805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 22.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006201432317085083, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 7728081.0, + "repeat_count": 1.0, + "routers_loss": 0.0037843603640794754, + "skip_count": 0.0, + "step": 4792, + "text_loss": 0.38812628388404846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0006198427603986711, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 7731457.0, + "repeat_count": 0.0, + "routers_loss": 0.012036679312586784, + "skip_count": 3.0, + "step": 4794, + "text_loss": 0.2996312379837036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0006195422431648623, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 7734595.0, + "repeat_count": 0.0, + "routers_loss": 0.0008874868508428335, + "skip_count": 1.0, + "step": 4796, + "text_loss": 0.3203189969062805 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0006192416801222403, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 7737565.0, + "repeat_count": 1.0, + "routers_loss": 0.0032894534524530172, + "skip_count": 1.0, + "step": 4798, + "text_loss": 0.3283322751522064 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.053955078125, + "learning_rate": 0.0006189410713859815, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 7740439.0, + "repeat_count": 0.0, + "routers_loss": 0.009667043574154377, + "skip_count": 2.0, + "step": 4800, + "text_loss": 0.25219282507896423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 22.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006186404170712797, + "loss": 0.0093, + "macro_f1": 0.6666666865348816, + "num_tokens": 7743813.0, + "repeat_count": 0.0, + "routers_loss": 0.012643060646951199, + "skip_count": 4.0, + "step": 4802, + "text_loss": 0.22567439079284668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006183397172933462, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 7747182.0, + "repeat_count": 0.0, + "routers_loss": 0.002678517485037446, + "skip_count": 0.0, + "step": 4804, + "text_loss": 0.19188879430294037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0006180389721674101, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 7750735.0, + "repeat_count": 0.0, + "routers_loss": 0.0013385121710598469, + "skip_count": 0.0, + "step": 4806, + "text_loss": 0.5860441327095032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000617738181808717, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7753843.0, + "repeat_count": 0.0, + "routers_loss": 0.0034869094379246235, + "skip_count": 1.0, + "step": 4808, + "text_loss": 0.4366260766983032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0478515625, + "learning_rate": 0.0006174373463325306, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7757039.0, + "repeat_count": 0.0, + "routers_loss": 0.0013648992171511054, + "skip_count": 0.0, + "step": 4810, + "text_loss": 0.5217258334159851 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0006171364658541314, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 7760016.0, + "repeat_count": 1.0, + "routers_loss": 0.0038017008919268847, + "skip_count": 2.0, + "step": 4812, + "text_loss": 0.8130963444709778 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0006168355404888177, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 7762961.0, + "repeat_count": 0.0, + "routers_loss": 0.006867518648505211, + "skip_count": 2.0, + "step": 4814, + "text_loss": 0.17822521924972534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006165345703519043, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7766399.0, + "repeat_count": 0.0, + "routers_loss": 0.0004653502255678177, + "skip_count": 0.0, + "step": 4816, + "text_loss": 0.5316070914268494 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006162335555587238, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 7769039.0, + "repeat_count": 1.0, + "routers_loss": 0.0016906452365219593, + "skip_count": 1.0, + "step": 4818, + "text_loss": 0.5680997967720032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05615234375, + "learning_rate": 0.0006159324962246257, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 7772768.0, + "repeat_count": 0.0, + "routers_loss": 0.002541248919442296, + "skip_count": 0.0, + "step": 4820, + "text_loss": 0.6169226169586182 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0006156313924649762, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 7775545.0, + "repeat_count": 0.0, + "routers_loss": 0.008644679561257362, + "skip_count": 2.0, + "step": 4822, + "text_loss": 0.2211475968360901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0006153302443951589, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7778837.0, + "repeat_count": 0.0, + "routers_loss": 0.0041346061043441296, + "skip_count": 2.0, + "step": 4824, + "text_loss": 0.5369775891304016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0006150290521305746, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 7782309.0, + "repeat_count": 0.0, + "routers_loss": 0.0012756052892655134, + "skip_count": 0.0, + "step": 4826, + "text_loss": 0.5294989943504333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.666862342236573, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006147278157866403, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 7785565.0, + "repeat_count": 0.0, + "routers_loss": 0.029718991369009018, + "skip_count": 1.0, + "step": 4828, + "text_loss": 0.6920449733734131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0006144265354787906, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7788218.0, + "repeat_count": 0.0, + "routers_loss": 0.004829924553632736, + "skip_count": 0.0, + "step": 4830, + "text_loss": 0.17072243988513947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0006141252113224767, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7790788.0, + "repeat_count": 0.0, + "routers_loss": 0.00254037044942379, + "skip_count": 0.0, + "step": 4832, + "text_loss": 0.20075996220111847 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01519775390625, + "learning_rate": 0.0006138238434331666, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 7793913.0, + "repeat_count": 0.0, + "routers_loss": 0.0004426188243087381, + "skip_count": 0.0, + "step": 4834, + "text_loss": 0.695742130279541 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.000613522431926345, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 7796932.0, + "repeat_count": 1.0, + "routers_loss": 0.005176798906177282, + "skip_count": 3.0, + "step": 4836, + "text_loss": 0.4910822808742523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0006132209769175132, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7800686.0, + "repeat_count": 0.0, + "routers_loss": 0.004120545461773872, + "skip_count": 0.0, + "step": 4838, + "text_loss": 0.3701378405094147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0006129194785221894, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7804765.0, + "repeat_count": 0.0, + "routers_loss": 0.0043835826218128204, + "skip_count": 0.0, + "step": 4840, + "text_loss": 0.343635618686676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006126179368559086, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 7807498.0, + "repeat_count": 0.0, + "routers_loss": 0.001394893741235137, + "skip_count": 1.0, + "step": 4842, + "text_loss": 0.47756674885749817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.000612316352034222, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7810784.0, + "repeat_count": 0.0, + "routers_loss": 0.0031262130942195654, + "skip_count": 2.0, + "step": 4844, + "text_loss": 0.13077901303768158 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.751394188435572, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.058349609375, + "learning_rate": 0.0006120147241726972, + "loss": 0.0081, + "macro_f1": 0.8823530077934265, + "num_tokens": 7814754.0, + "repeat_count": 2.0, + "routers_loss": 0.016139274463057518, + "skip_count": 1.0, + "step": 4846, + "text_loss": 0.18850074708461761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0006117130533869189, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 7818245.0, + "repeat_count": 0.0, + "routers_loss": 0.0009124451316893101, + "skip_count": 0.0, + "step": 4848, + "text_loss": 0.42503559589385986 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006114113397924878, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 7822214.0, + "repeat_count": 0.0, + "routers_loss": 0.0015132242115214467, + "skip_count": 0.0, + "step": 4850, + "text_loss": 0.16767354309558868 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006111095835050212, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 7825019.0, + "repeat_count": 2.0, + "routers_loss": 0.006253300234675407, + "skip_count": 2.0, + "step": 4852, + "text_loss": 0.44826745986938477 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0006108077846401524, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 7828113.0, + "repeat_count": 0.0, + "routers_loss": 0.0024391328915953636, + "skip_count": 0.0, + "step": 4854, + "text_loss": 0.2009880244731903 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0006105059433135317, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 7831177.0, + "repeat_count": 1.0, + "routers_loss": 0.0020866121631115675, + "skip_count": 1.0, + "step": 4856, + "text_loss": 0.7082528471946716 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0006102040596408251, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 7834485.0, + "repeat_count": 0.0, + "routers_loss": 0.004373365081846714, + "skip_count": 1.0, + "step": 4858, + "text_loss": 0.2541539669036865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0006099021337377148, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7837749.0, + "repeat_count": 0.0, + "routers_loss": 0.004309024661779404, + "skip_count": 0.0, + "step": 4860, + "text_loss": 0.3163885176181793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 22.82653360727913, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.049072265625, + "learning_rate": 0.0006096001657198995, + "loss": 0.0065, + "macro_f1": 0.6122449040412903, + "num_tokens": 7840979.0, + "repeat_count": 0.0, + "routers_loss": 0.023044804111123085, + "skip_count": 4.0, + "step": 4862, + "text_loss": 0.49609798192977905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 22.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0006092981557030941, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 7844905.0, + "repeat_count": 1.0, + "routers_loss": 0.010683654807507992, + "skip_count": 3.0, + "step": 4864, + "text_loss": 0.16866883635520935 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0006089961038030291, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 7847800.0, + "repeat_count": 0.0, + "routers_loss": 0.0011224723421037197, + "skip_count": 0.0, + "step": 4866, + "text_loss": 0.5093055367469788 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0006086940101354515, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 7850983.0, + "repeat_count": 0.0, + "routers_loss": 0.003944621421396732, + "skip_count": 1.0, + "step": 4868, + "text_loss": 0.5753747224807739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 22.86410331670091, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0006083918748161244, + "loss": 0.0069, + "macro_f1": 0.5492662787437439, + "num_tokens": 7855041.0, + "repeat_count": 0.0, + "routers_loss": 0.02532145567238331, + "skip_count": 2.0, + "step": 4870, + "text_loss": 0.8082366585731506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0006080896979608262, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 7858058.0, + "repeat_count": 0.0, + "routers_loss": 0.0007558314246125519, + "skip_count": 0.0, + "step": 4872, + "text_loss": 0.6476574540138245 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.000607787479685352, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 7861223.0, + "repeat_count": 0.0, + "routers_loss": 0.0009224560926668346, + "skip_count": 0.0, + "step": 4874, + "text_loss": 0.5012133717536926 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0006074852201055121, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 7864180.0, + "repeat_count": 0.0, + "routers_loss": 0.0028308273758739233, + "skip_count": 0.0, + "step": 4876, + "text_loss": 0.7447214722633362 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.0006071829193371331, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 7866726.0, + "repeat_count": 0.0, + "routers_loss": 0.0021505290642380714, + "skip_count": 0.0, + "step": 4878, + "text_loss": 0.5444929599761963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006068805774960573, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7870166.0, + "repeat_count": 0.0, + "routers_loss": 0.0021109723020344973, + "skip_count": 0.0, + "step": 4880, + "text_loss": 0.3577263355255127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0006065781946981425, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7873028.0, + "repeat_count": 0.0, + "routers_loss": 0.0027144821360707283, + "skip_count": 0.0, + "step": 4882, + "text_loss": 0.28464797139167786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006062757710592624, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7876747.0, + "repeat_count": 0.0, + "routers_loss": 0.0004638207610696554, + "skip_count": 0.0, + "step": 4884, + "text_loss": 0.381534606218338 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006059733066953066, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 7879524.0, + "repeat_count": 1.0, + "routers_loss": 0.002225410658866167, + "skip_count": 2.0, + "step": 4886, + "text_loss": 0.5167883634567261 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0006056708017221796, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 7882809.0, + "repeat_count": 0.0, + "routers_loss": 0.00419368501752615, + "skip_count": 1.0, + "step": 4888, + "text_loss": 0.22688335180282593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000605368256255802, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 7886310.0, + "repeat_count": 0.0, + "routers_loss": 0.0017340193735435605, + "skip_count": 1.0, + "step": 4890, + "text_loss": 1.0128135681152344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 22.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0712890625, + "learning_rate": 0.0006050656704121098, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 7889483.0, + "repeat_count": 0.0, + "routers_loss": 0.0016647159354761243, + "skip_count": 0.0, + "step": 4892, + "text_loss": 0.2213262915611267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 22.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006047630443070547, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 7892615.0, + "repeat_count": 0.0, + "routers_loss": 0.0038971947506070137, + "skip_count": 3.0, + "step": 4894, + "text_loss": 0.45751357078552246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 22.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0006044603780566032, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 7895747.0, + "repeat_count": 1.0, + "routers_loss": 0.0036852145567536354, + "skip_count": 1.0, + "step": 4896, + "text_loss": 0.13489919900894165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 22.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0006041576717767379, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 7899155.0, + "repeat_count": 0.0, + "routers_loss": 0.007661987561732531, + "skip_count": 1.0, + "step": 4898, + "text_loss": 0.281853586435318 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0006038549255834563, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 7901667.0, + "repeat_count": 2.0, + "routers_loss": 0.01836695335805416, + "skip_count": 5.0, + "step": 4900, + "text_loss": 0.24879895150661469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.000603552139592771, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7904506.0, + "repeat_count": 0.0, + "routers_loss": 0.0011829182039946318, + "skip_count": 0.0, + "step": 4902, + "text_loss": 0.7550268769264221 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 23.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006032493139207106, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 7907316.0, + "repeat_count": 1.0, + "routers_loss": 0.0022891140542924404, + "skip_count": 0.0, + "step": 4904, + "text_loss": 0.37596020102500916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0006029464486833186, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 7911283.0, + "repeat_count": 0.0, + "routers_loss": 0.001990227960050106, + "skip_count": 0.0, + "step": 4906, + "text_loss": 0.5879577994346619 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0006026435439966531, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 7913907.0, + "repeat_count": 0.0, + "routers_loss": 0.0026039890944957733, + "skip_count": 1.0, + "step": 4908, + "text_loss": 0.41484713554382324 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0006023405999767879, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 7916772.0, + "repeat_count": 0.0, + "routers_loss": 0.009183229878544807, + "skip_count": 1.0, + "step": 4910, + "text_loss": 0.20732562243938446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0006020376167398116, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 7919346.0, + "repeat_count": 0.0, + "routers_loss": 0.005508727394044399, + "skip_count": 1.0, + "step": 4912, + "text_loss": 0.41416165232658386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 27.0, + "epoch": 23.070443205165834, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0006017345944018284, + "loss": 0.0051, + "macro_f1": 0.3272727429866791, + "num_tokens": 7922404.0, + "repeat_count": 0.0, + "routers_loss": 0.008651934564113617, + "skip_count": 0.0, + "step": 4914, + "text_loss": 0.4290519952774048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0006014315330789563, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 7925165.0, + "repeat_count": 0.0, + "routers_loss": 0.003601635340601206, + "skip_count": 1.0, + "step": 4916, + "text_loss": 0.8447931408882141 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006011284328873296, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 7928146.0, + "repeat_count": 1.0, + "routers_loss": 0.0049415635876357555, + "skip_count": 2.0, + "step": 4918, + "text_loss": 0.32237401604652405 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0006008252939430967, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 7931163.0, + "repeat_count": 0.0, + "routers_loss": 0.0024150956887751818, + "skip_count": 0.0, + "step": 4920, + "text_loss": 0.2251713126897812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.108012914587615, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006005221163624209, + "loss": 0.0057, + "macro_f1": 0.3272727429866791, + "num_tokens": 7934084.0, + "repeat_count": 1.0, + "routers_loss": 0.03181030973792076, + "skip_count": 0.0, + "step": 4922, + "text_loss": 0.4962928593158722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.0006002189002614806, + "loss": 0.0089, + "macro_f1": 0.6666666865348816, + "num_tokens": 7937021.0, + "repeat_count": 0.0, + "routers_loss": 0.00227518193423748, + "skip_count": 2.0, + "step": 4924, + "text_loss": 0.34440335631370544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005999156457564685, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 7940205.0, + "repeat_count": 0.0, + "routers_loss": 0.004331593867391348, + "skip_count": 1.0, + "step": 4926, + "text_loss": 0.14114083349704742 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005996123529635925, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 7945174.0, + "repeat_count": 0.0, + "routers_loss": 0.000612895586527884, + "skip_count": 0.0, + "step": 4928, + "text_loss": 0.3895469009876251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.145582624009393, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000599309021999075, + "loss": 0.006, + "macro_f1": 0.3272727429866791, + "num_tokens": 7948716.0, + "repeat_count": 0.0, + "routers_loss": 0.02319233864545822, + "skip_count": 1.0, + "step": 4930, + "text_loss": 0.38103172183036804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005990056529791528, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 7952497.0, + "repeat_count": 0.0, + "routers_loss": 0.003423231653869152, + "skip_count": 0.0, + "step": 4932, + "text_loss": 0.30447322130203247 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017822265625, + "learning_rate": 0.0005987022460200778, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 7955578.0, + "repeat_count": 0.0, + "routers_loss": 0.0007005351362749934, + "skip_count": 0.0, + "step": 4934, + "text_loss": 0.49621838331222534 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.173759906075727, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0005983988012381159, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 7958741.0, + "repeat_count": 2.0, + "routers_loss": 0.03962617367506027, + "skip_count": 1.0, + "step": 4936, + "text_loss": 0.1920493096113205 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.0005980953187495476, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 7962236.0, + "repeat_count": 0.0, + "routers_loss": 0.0026006060652434826, + "skip_count": 3.0, + "step": 4938, + "text_loss": 0.5286803841590881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0005977917986706681, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7965631.0, + "repeat_count": 0.0, + "routers_loss": 0.005010952707380056, + "skip_count": 0.0, + "step": 4940, + "text_loss": 0.3507745563983917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005974882411177871, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7968516.0, + "repeat_count": 0.0, + "routers_loss": 0.0023964287247508764, + "skip_count": 0.0, + "step": 4942, + "text_loss": 0.9110504388809204 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000597184646207228, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 7971310.0, + "repeat_count": 0.0, + "routers_loss": 0.0026230409275740385, + "skip_count": 1.0, + "step": 4944, + "text_loss": 0.4131232798099518 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005968810140553292, + "loss": 0.0102, + "macro_f1": 0.3333333432674408, + "num_tokens": 7974809.0, + "repeat_count": 0.0, + "routers_loss": 0.0007397596491500735, + "skip_count": 0.0, + "step": 4946, + "text_loss": 0.5130466222763062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0005965773447784431, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 7977800.0, + "repeat_count": 0.0, + "routers_loss": 0.0009955473942682147, + "skip_count": 0.0, + "step": 4948, + "text_loss": 0.5366153717041016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01373291015625, + "learning_rate": 0.0005962736384929362, + "loss": 0.0026, + "macro_f1": 0.3333333432674408, + "num_tokens": 7981027.0, + "repeat_count": 0.0, + "routers_loss": 0.0049227322451770306, + "skip_count": 0.0, + "step": 4950, + "text_loss": 0.17266370356082916 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06201171875, + "learning_rate": 0.0005959698953151895, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 7983580.0, + "repeat_count": 0.0, + "routers_loss": 0.0009975163266062737, + "skip_count": 0.0, + "step": 4952, + "text_loss": 0.2474549114704132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0005956661153615979, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 7986711.0, + "repeat_count": 0.0, + "routers_loss": 0.0006475782720372081, + "skip_count": 0.0, + "step": 4954, + "text_loss": 0.5748327970504761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0005953622987485703, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 7990194.0, + "repeat_count": 0.0, + "routers_loss": 0.001449751085601747, + "skip_count": 0.0, + "step": 4956, + "text_loss": 0.5163559317588806 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0005950584455925301, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 7993050.0, + "repeat_count": 0.0, + "routers_loss": 0.0017087773885577917, + "skip_count": 0.0, + "step": 4958, + "text_loss": 0.15892620384693146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0005947545560099142, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 7996383.0, + "repeat_count": 0.0, + "routers_loss": 0.0044417232275009155, + "skip_count": 0.0, + "step": 4960, + "text_loss": 0.48022928833961487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 23.295861461696507, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031982421875, + "learning_rate": 0.0005944506301171734, + "loss": 0.0066, + "macro_f1": 0.5492662787437439, + "num_tokens": 7999843.0, + "repeat_count": 0.0, + "routers_loss": 0.010093312710523605, + "skip_count": 2.0, + "step": 4962, + "text_loss": 0.5050316452980042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005941466680307732, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8003504.0, + "repeat_count": 0.0, + "routers_loss": 0.009699694812297821, + "skip_count": 0.0, + "step": 4964, + "text_loss": 0.30474427342414856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 23.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0005938426698671922, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 8007427.0, + "repeat_count": 1.0, + "routers_loss": 0.0016759657301008701, + "skip_count": 0.0, + "step": 4966, + "text_loss": 0.25060293078422546 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005935386357429232, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8010265.0, + "repeat_count": 2.0, + "routers_loss": 0.006916914135217667, + "skip_count": 3.0, + "step": 4968, + "text_loss": 0.49084481596946716 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0005932345657744723, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 8013733.0, + "repeat_count": 1.0, + "routers_loss": 0.017182426527142525, + "skip_count": 5.0, + "step": 4970, + "text_loss": 0.2705717980861664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00059293046007836, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8017068.0, + "repeat_count": 0.0, + "routers_loss": 0.008485594764351845, + "skip_count": 2.0, + "step": 4972, + "text_loss": 0.18570218980312347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0005926263187711201, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8020185.0, + "repeat_count": 0.0, + "routers_loss": 0.0021750847809016705, + "skip_count": 2.0, + "step": 4974, + "text_loss": 0.4457069933414459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0005923221419693001, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 8023038.0, + "repeat_count": 0.0, + "routers_loss": 0.0020193420350551605, + "skip_count": 0.0, + "step": 4976, + "text_loss": 0.7394505143165588 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.0005920179297894613, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8026236.0, + "repeat_count": 0.0, + "routers_loss": 0.001450369250960648, + "skip_count": 1.0, + "step": 4978, + "text_loss": 0.5914503335952759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.000591713682348178, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8028765.0, + "repeat_count": 0.0, + "routers_loss": 0.0017808573320508003, + "skip_count": 0.0, + "step": 4980, + "text_loss": 0.19231407344341278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005914093997620388, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8032043.0, + "repeat_count": 0.0, + "routers_loss": 0.0018225493840873241, + "skip_count": 0.0, + "step": 4982, + "text_loss": 0.3567875325679779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005911050821476449, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8035086.0, + "repeat_count": 0.0, + "routers_loss": 0.0016285666497424245, + "skip_count": 0.0, + "step": 4984, + "text_loss": 0.34609633684158325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0005908007296216119, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8038193.0, + "repeat_count": 0.0, + "routers_loss": 0.0014699801104143262, + "skip_count": 0.0, + "step": 4986, + "text_loss": 0.4492359757423401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.000590496342300568, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8041099.0, + "repeat_count": 0.0, + "routers_loss": 0.002442725468426943, + "skip_count": 0.0, + "step": 4988, + "text_loss": 0.5162975788116455 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005901919203011548, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8044350.0, + "repeat_count": 0.0, + "routers_loss": 0.008624207228422165, + "skip_count": 2.0, + "step": 4990, + "text_loss": 0.2533033490180969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005898874637400279, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8047467.0, + "repeat_count": 0.0, + "routers_loss": 0.0015421364223584533, + "skip_count": 0.0, + "step": 4992, + "text_loss": 0.4890289306640625 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0005895829727338552, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 8050626.0, + "repeat_count": 1.0, + "routers_loss": 0.0024516626726835966, + "skip_count": 2.0, + "step": 4994, + "text_loss": 0.50797039270401 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005892784473993184, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 8053386.0, + "repeat_count": 0.0, + "routers_loss": 0.0018553845584392548, + "skip_count": 2.0, + "step": 4996, + "text_loss": 0.628828763961792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.000588973887853112, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8055941.0, + "repeat_count": 0.0, + "routers_loss": 0.004258487373590469, + "skip_count": 0.0, + "step": 4998, + "text_loss": 0.2643229067325592 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.474317581449956, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005886692942119441, + "loss": 0.0062, + "macro_f1": 0.8820862174034119, + "num_tokens": 8058638.0, + "repeat_count": 2.0, + "routers_loss": 0.019064312800765038, + "skip_count": 2.0, + "step": 5000, + "text_loss": 0.4925006031990051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005883646665925353, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 8062097.0, + "repeat_count": 0.0, + "routers_loss": 0.0007969749276526272, + "skip_count": 0.0, + "step": 5002, + "text_loss": 0.49412909150123596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005880600051116196, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8065202.0, + "repeat_count": 0.0, + "routers_loss": 0.005813780706375837, + "skip_count": 2.0, + "step": 5004, + "text_loss": 0.5681346654891968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0005877553098859439, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8068574.0, + "repeat_count": 0.0, + "routers_loss": 0.005012941546738148, + "skip_count": 0.0, + "step": 5006, + "text_loss": 0.2682424485683441 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005874505810322678, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 8071834.0, + "repeat_count": 0.0, + "routers_loss": 0.005859757773578167, + "skip_count": 3.0, + "step": 5008, + "text_loss": 0.6460036039352417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.000587145818667364, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 8074687.0, + "repeat_count": 0.0, + "routers_loss": 0.002868571551516652, + "skip_count": 2.0, + "step": 5010, + "text_loss": 0.2405751347541809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005868410229080181, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8077617.0, + "repeat_count": 0.0, + "routers_loss": 0.0021759893279522657, + "skip_count": 1.0, + "step": 5012, + "text_loss": 0.7455595135688782 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005865361938710286, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8080734.0, + "repeat_count": 0.0, + "routers_loss": 0.0008311949786730111, + "skip_count": 0.0, + "step": 5014, + "text_loss": 0.44876906275749207 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 23.549457000293515, + "f1_execute": 0.9756097793579102, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.0390625, + "learning_rate": 0.0005862313316732063, + "loss": 0.0054, + "macro_f1": 0.9615669250488281, + "num_tokens": 8085092.0, + "repeat_count": 2.0, + "routers_loss": 0.012511664070189, + "skip_count": 6.0, + "step": 5016, + "text_loss": 0.26010942459106445 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.000585926436431375, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 8088333.0, + "repeat_count": 0.0, + "routers_loss": 0.0035441694781184196, + "skip_count": 0.0, + "step": 5018, + "text_loss": 0.28225192427635193 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 23.568241855004402, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005856215082623711, + "loss": 0.0093, + "macro_f1": 0.8823530077934265, + "num_tokens": 8091298.0, + "repeat_count": 1.0, + "routers_loss": 0.023543989285826683, + "skip_count": 2.0, + "step": 5020, + "text_loss": 0.5757577419281006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0005853165472830439, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8094361.0, + "repeat_count": 0.0, + "routers_loss": 0.003124240320175886, + "skip_count": 0.0, + "step": 5022, + "text_loss": 0.4021305739879608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0005850115536102546, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8097514.0, + "repeat_count": 0.0, + "routers_loss": 0.008170558139681816, + "skip_count": 1.0, + "step": 5024, + "text_loss": 0.18926584720611572 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0005847065273608777, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 8100525.0, + "repeat_count": 1.0, + "routers_loss": 0.02127663604915142, + "skip_count": 5.0, + "step": 5026, + "text_loss": 0.18827557563781738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0005844014686517998, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8104016.0, + "repeat_count": 0.0, + "routers_loss": 0.00272122910246253, + "skip_count": 0.0, + "step": 5028, + "text_loss": 0.15534701943397522 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 23.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005840963775999199, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8106697.0, + "repeat_count": 5.0, + "routers_loss": 0.008979840204119682, + "skip_count": 4.0, + "step": 5030, + "text_loss": 0.8123718500137329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005837912543221493, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 8110986.0, + "repeat_count": 0.0, + "routers_loss": 0.005006929859519005, + "skip_count": 0.0, + "step": 5032, + "text_loss": 0.26128846406936646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0005834860989354121, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 8114010.0, + "repeat_count": 0.0, + "routers_loss": 0.0005531277856789529, + "skip_count": 0.0, + "step": 5034, + "text_loss": 0.5100266933441162 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.64338127384796, + "f1_execute": 0.9615384340286255, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.0005831809115566442, + "loss": 0.0073, + "macro_f1": 0.6538461446762085, + "num_tokens": 8117168.0, + "repeat_count": 2.0, + "routers_loss": 0.04978533461689949, + "skip_count": 1.0, + "step": 5036, + "text_loss": 0.41049885749816895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0005828756923027941, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8119900.0, + "repeat_count": 0.0, + "routers_loss": 0.0006322385743260384, + "skip_count": 0.0, + "step": 5038, + "text_loss": 0.5584380626678467 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0005825704412908225, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8123928.0, + "repeat_count": 0.0, + "routers_loss": 0.001000594231300056, + "skip_count": 0.0, + "step": 5040, + "text_loss": 0.6460791230201721 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005822651586377019, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 8127926.0, + "repeat_count": 0.0, + "routers_loss": 0.011595834977924824, + "skip_count": 2.0, + "step": 5042, + "text_loss": 0.3131820261478424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0005819598444604173, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 8131092.0, + "repeat_count": 0.0, + "routers_loss": 0.004449303261935711, + "skip_count": 3.0, + "step": 5044, + "text_loss": 0.2774372696876526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0005816544988759658, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8134051.0, + "repeat_count": 0.0, + "routers_loss": 0.0007877505850046873, + "skip_count": 0.0, + "step": 5046, + "text_loss": 0.39496293663978577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0005813491220013563, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 8138725.0, + "repeat_count": 0.0, + "routers_loss": 0.002868623472750187, + "skip_count": 0.0, + "step": 5048, + "text_loss": 0.3779948651790619 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06298828125, + "learning_rate": 0.0005810437139536098, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 8141913.0, + "repeat_count": 2.0, + "routers_loss": 0.006244937423616648, + "skip_count": 4.0, + "step": 5050, + "text_loss": 0.4512978494167328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06396484375, + "learning_rate": 0.0005807382748497592, + "loss": 0.0112, + "macro_f1": 0.3333333432674408, + "num_tokens": 8146193.0, + "repeat_count": 0.0, + "routers_loss": 0.0011013929033651948, + "skip_count": 0.0, + "step": 5052, + "text_loss": 0.6194499731063843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0005804328048068493, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8149701.0, + "repeat_count": 0.0, + "routers_loss": 0.005505079869180918, + "skip_count": 1.0, + "step": 5054, + "text_loss": 0.2932305335998535 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005801273039419368, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 8152861.0, + "repeat_count": 1.0, + "routers_loss": 0.0057641929015517235, + "skip_count": 1.0, + "step": 5056, + "text_loss": 0.2631317973136902 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 23.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005798217723720904, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 8155843.0, + "repeat_count": 1.0, + "routers_loss": 0.0021671492140740156, + "skip_count": 5.0, + "step": 5058, + "text_loss": 0.2889988422393799 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0005795162102143902, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8158812.0, + "repeat_count": 0.0, + "routers_loss": 0.004476628266274929, + "skip_count": 1.0, + "step": 5060, + "text_loss": 0.48028868436813354 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005792106175859283, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 8162719.0, + "repeat_count": 1.0, + "routers_loss": 0.0038497636560350657, + "skip_count": 3.0, + "step": 5062, + "text_loss": 0.4559471607208252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0005789049946038083, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8165692.0, + "repeat_count": 0.0, + "routers_loss": 0.004451582673937082, + "skip_count": 0.0, + "step": 5064, + "text_loss": 0.3782602548599243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005785993413851456, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8168900.0, + "repeat_count": 0.0, + "routers_loss": 0.002951978938654065, + "skip_count": 0.0, + "step": 5066, + "text_loss": 0.32392629981040955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.000578293658047067, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8171661.0, + "repeat_count": 0.0, + "routers_loss": 0.011171254329383373, + "skip_count": 2.0, + "step": 5068, + "text_loss": 0.24492619931697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0005779879447067109, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 8175075.0, + "repeat_count": 0.0, + "routers_loss": 0.0016067599644884467, + "skip_count": 0.0, + "step": 5070, + "text_loss": 0.7738823294639587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.000577682201481227, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8178515.0, + "repeat_count": 0.0, + "routers_loss": 0.009113503620028496, + "skip_count": 1.0, + "step": 5072, + "text_loss": 0.2082248032093048 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 23.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0005773764284877774, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8181790.0, + "repeat_count": 1.0, + "routers_loss": 0.007332196459174156, + "skip_count": 1.0, + "step": 5074, + "text_loss": 0.4557662904262543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0537109375, + "learning_rate": 0.0005770706258435342, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8184854.0, + "repeat_count": 0.0, + "routers_loss": 0.0016252279747277498, + "skip_count": 0.0, + "step": 5076, + "text_loss": 0.2888098657131195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0005767647936656818, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8187860.0, + "repeat_count": 0.0, + "routers_loss": 0.003406575648114085, + "skip_count": 0.0, + "step": 5078, + "text_loss": 0.6533790230751038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005764589320714158, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 8191683.0, + "repeat_count": 0.0, + "routers_loss": 0.0006520140450447798, + "skip_count": 0.0, + "step": 5080, + "text_loss": 0.6903796195983887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0005761530411779426, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8195109.0, + "repeat_count": 0.0, + "routers_loss": 0.01188349537551403, + "skip_count": 1.0, + "step": 5082, + "text_loss": 0.20460398495197296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.083984375, + "learning_rate": 0.0005758471211024804, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 8198340.0, + "repeat_count": 0.0, + "routers_loss": 0.004826809279620647, + "skip_count": 3.0, + "step": 5084, + "text_loss": 0.2203969657421112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.0005755411719622584, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8200882.0, + "repeat_count": 0.0, + "routers_loss": 0.0019170823507010937, + "skip_count": 0.0, + "step": 5086, + "text_loss": 0.6744595170021057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005752351938745167, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 8203777.0, + "repeat_count": 0.0, + "routers_loss": 0.002110893838107586, + "skip_count": 1.0, + "step": 5088, + "text_loss": 0.4137859046459198 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.000574929186956507, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 8207627.0, + "repeat_count": 0.0, + "routers_loss": 0.0018580821342766285, + "skip_count": 1.0, + "step": 5090, + "text_loss": 0.4830456078052521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.906369239800412, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0005746231513254912, + "loss": 0.0066, + "macro_f1": 0.3272727429866791, + "num_tokens": 8210263.0, + "repeat_count": 1.0, + "routers_loss": 0.0194723978638649, + "skip_count": 0.0, + "step": 5092, + "text_loss": 0.17383277416229248 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 23.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005743170870987433, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 8214166.0, + "repeat_count": 0.0, + "routers_loss": 0.006944256369024515, + "skip_count": 2.0, + "step": 5094, + "text_loss": 0.20003484189510345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0005740109943935472, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8217545.0, + "repeat_count": 0.0, + "routers_loss": 0.002044794149696827, + "skip_count": 1.0, + "step": 5096, + "text_loss": 0.5117167830467224 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 23.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0005737048733271986, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 8220673.0, + "repeat_count": 1.0, + "routers_loss": 0.009966124780476093, + "skip_count": 2.0, + "step": 5098, + "text_loss": 0.2705996036529541 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0005733987240170035, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8223796.0, + "repeat_count": 0.0, + "routers_loss": 0.0009675708715803921, + "skip_count": 0.0, + "step": 5100, + "text_loss": 0.7016357183456421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0005730925465802788, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8227048.0, + "repeat_count": 0.0, + "routers_loss": 0.0009548200177960098, + "skip_count": 0.0, + "step": 5102, + "text_loss": 0.30823078751564026 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005727863411343526, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8229971.0, + "repeat_count": 0.0, + "routers_loss": 0.0005767418188042939, + "skip_count": 0.0, + "step": 5104, + "text_loss": 0.6897505521774292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 23.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0005724801077965629, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8232758.0, + "repeat_count": 0.0, + "routers_loss": 0.009297889657318592, + "skip_count": 3.0, + "step": 5106, + "text_loss": 0.21293514966964722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 23.981508658643968, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005721738466842592, + "loss": 0.0079, + "macro_f1": 0.3272727429866791, + "num_tokens": 8238154.0, + "repeat_count": 1.0, + "routers_loss": 0.013964693062007427, + "skip_count": 0.0, + "step": 5108, + "text_loss": 0.7273620367050171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 23.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005718675579148014, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8240818.0, + "repeat_count": 3.0, + "routers_loss": 0.007218098267912865, + "skip_count": 1.0, + "step": 5110, + "text_loss": 0.5607150793075562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005715612416055598, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 8244048.0, + "repeat_count": 0.0, + "routers_loss": 0.007558444049209356, + "skip_count": 2.0, + "step": 5112, + "text_loss": 0.23694385588169098 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.009392427355444, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005712548978739154, + "loss": 0.0072, + "macro_f1": 0.6603773832321167, + "num_tokens": 8247240.0, + "repeat_count": 1.0, + "routers_loss": 0.015726923942565918, + "skip_count": 1.0, + "step": 5114, + "text_loss": 0.6032099723815918 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.01878485471089, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0005709485268372598, + "loss": 0.0046, + "macro_f1": 0.9262410998344421, + "num_tokens": 8250585.0, + "repeat_count": 3.0, + "routers_loss": 0.011148860678076744, + "skip_count": 2.0, + "step": 5116, + "text_loss": 0.6825997233390808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005706421286129948, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 8254240.0, + "repeat_count": 0.0, + "routers_loss": 0.006977916229516268, + "skip_count": 0.0, + "step": 5118, + "text_loss": 0.2532844543457031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0005703357033185328, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 8257133.0, + "repeat_count": 0.0, + "routers_loss": 0.006415650714188814, + "skip_count": 2.0, + "step": 5120, + "text_loss": 0.6132124066352844 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005700292510712967, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 8261076.0, + "repeat_count": 1.0, + "routers_loss": 0.0044475216418504715, + "skip_count": 1.0, + "step": 5122, + "text_loss": 0.4277699887752533 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0005697227719887194, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8264607.0, + "repeat_count": 0.0, + "routers_loss": 0.005743155721575022, + "skip_count": 2.0, + "step": 5124, + "text_loss": 0.2570968270301819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005694162661882444, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8267992.0, + "repeat_count": 0.0, + "routers_loss": 0.0007581565878354013, + "skip_count": 0.0, + "step": 5126, + "text_loss": 0.5850184559822083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005691097337873252, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 8271010.0, + "repeat_count": 0.0, + "routers_loss": 0.0036611228715628386, + "skip_count": 0.0, + "step": 5128, + "text_loss": 0.660999059677124 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0005688031749034258, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 8273638.0, + "repeat_count": 0.0, + "routers_loss": 0.0039906189776957035, + "skip_count": 0.0, + "step": 5130, + "text_loss": 0.5839648246765137 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0005684965896540198, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8276504.0, + "repeat_count": 1.0, + "routers_loss": 0.007539632264524698, + "skip_count": 3.0, + "step": 5132, + "text_loss": 0.27675092220306396 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 24.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005681899781565915, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 8279977.0, + "repeat_count": 2.0, + "routers_loss": 0.0026953567285090685, + "skip_count": 0.0, + "step": 5134, + "text_loss": 0.532974123954773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.000567883340528635, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 8282781.0, + "repeat_count": 0.0, + "routers_loss": 0.005754240322858095, + "skip_count": 1.0, + "step": 5136, + "text_loss": 0.31100207567214966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005675766768876542, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8286533.0, + "repeat_count": 0.0, + "routers_loss": 0.0051517849788069725, + "skip_count": 0.0, + "step": 5138, + "text_loss": 0.5734741687774658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005672699873511635, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 8289858.0, + "repeat_count": 0.0, + "routers_loss": 0.0025852699764072895, + "skip_count": 2.0, + "step": 5140, + "text_loss": 0.37045374512672424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005669632720366868, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8293038.0, + "repeat_count": 0.0, + "routers_loss": 0.0038520018570125103, + "skip_count": 0.0, + "step": 5142, + "text_loss": 0.25952374935150146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005666565310617577, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8295717.0, + "repeat_count": 0.0, + "routers_loss": 0.00026914477348327637, + "skip_count": 0.0, + "step": 5144, + "text_loss": 0.32531213760375977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0005663497645439203, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8299750.0, + "repeat_count": 0.0, + "routers_loss": 0.0055860537104308605, + "skip_count": 2.0, + "step": 5146, + "text_loss": 0.2520618438720703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005660429726007279, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8303075.0, + "repeat_count": 0.0, + "routers_loss": 0.004446739796549082, + "skip_count": 1.0, + "step": 5148, + "text_loss": 0.43672287464141846 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.17845611975345, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.07080078125, + "learning_rate": 0.000565736155349744, + "loss": 0.0076, + "macro_f1": 0.8814815282821655, + "num_tokens": 8306268.0, + "repeat_count": 2.0, + "routers_loss": 0.046915046870708466, + "skip_count": 4.0, + "step": 5150, + "text_loss": 0.35405927896499634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 24.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005654293129085412, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8310480.0, + "repeat_count": 0.0, + "routers_loss": 0.010549088008701801, + "skip_count": 4.0, + "step": 5152, + "text_loss": 0.3523249626159668 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005651224453947023, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8313367.0, + "repeat_count": 1.0, + "routers_loss": 0.002893900265917182, + "skip_count": 0.0, + "step": 5154, + "text_loss": 0.4503810703754425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0005648155529258195, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8318006.0, + "repeat_count": 0.0, + "routers_loss": 0.0018450213829055429, + "skip_count": 0.0, + "step": 5156, + "text_loss": 0.5687127113342285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0005645086356194943, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8320646.0, + "repeat_count": 0.0, + "routers_loss": 0.0026727779768407345, + "skip_count": 0.0, + "step": 5158, + "text_loss": 0.38920050859451294 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0005642016935933385, + "loss": 0.0035, + "macro_f1": 1.0, + "num_tokens": 8323915.0, + "repeat_count": 1.0, + "routers_loss": 0.00611621281132102, + "skip_count": 2.0, + "step": 5160, + "text_loss": 0.3003547787666321 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 24.234810683886117, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0005638947269649726, + "loss": 0.0063, + "macro_f1": 0.9619450569152832, + "num_tokens": 8327073.0, + "repeat_count": 1.0, + "routers_loss": 0.028447439894080162, + "skip_count": 6.0, + "step": 5162, + "text_loss": 0.24053414165973663 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0005635877358520268, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8330388.0, + "repeat_count": 0.0, + "routers_loss": 0.0013072624569758773, + "skip_count": 0.0, + "step": 5164, + "text_loss": 0.43772217631340027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005632807203721406, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 8333241.0, + "repeat_count": 0.0, + "routers_loss": 0.0009456822881475091, + "skip_count": 0.0, + "step": 5166, + "text_loss": 0.5217573046684265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.000562973680642963, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8337257.0, + "repeat_count": 0.0, + "routers_loss": 0.0023840824142098427, + "skip_count": 0.0, + "step": 5168, + "text_loss": 0.31814974546432495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0005626666167821521, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 8340143.0, + "repeat_count": 0.0, + "routers_loss": 0.0020231492817401886, + "skip_count": 3.0, + "step": 5170, + "text_loss": 0.5478505492210388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0162353515625, + "learning_rate": 0.0005623595289073755, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 8343566.0, + "repeat_count": 1.0, + "routers_loss": 0.01070715207606554, + "skip_count": 2.0, + "step": 5172, + "text_loss": 0.23213914036750793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005620524171363099, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8346836.0, + "repeat_count": 0.0, + "routers_loss": 0.003720001084730029, + "skip_count": 3.0, + "step": 5174, + "text_loss": 0.5114789009094238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005617452815866409, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 8349726.0, + "repeat_count": 1.0, + "routers_loss": 0.003322509117424488, + "skip_count": 1.0, + "step": 5176, + "text_loss": 0.4894506335258484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005614381223760635, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 8352478.0, + "repeat_count": 0.0, + "routers_loss": 0.00028752797516062856, + "skip_count": 0.0, + "step": 5178, + "text_loss": 0.6418307423591614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005611309396222817, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 8355766.0, + "repeat_count": 0.0, + "routers_loss": 0.0028724796138703823, + "skip_count": 0.0, + "step": 5180, + "text_loss": 0.23635952174663544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.328734957440563, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005608237334430085, + "loss": 0.0068, + "macro_f1": 0.6601307392120361, + "num_tokens": 8358888.0, + "repeat_count": 1.0, + "routers_loss": 0.058520980179309845, + "skip_count": 2.0, + "step": 5182, + "text_loss": 0.23434793949127197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.1015625, + "learning_rate": 0.000560516503955966, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8361761.0, + "repeat_count": 0.0, + "routers_loss": 0.0021356395445764065, + "skip_count": 1.0, + "step": 5184, + "text_loss": 0.40855672955513 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000560209251278885, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 8364376.0, + "repeat_count": 0.0, + "routers_loss": 0.0016185789136216044, + "skip_count": 0.0, + "step": 5186, + "text_loss": 0.6265131831169128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005599019755295053, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8367769.0, + "repeat_count": 0.0, + "routers_loss": 0.0031490204855799675, + "skip_count": 2.0, + "step": 5188, + "text_loss": 0.4716353118419647 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0005595946768255756, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8370705.0, + "repeat_count": 1.0, + "routers_loss": 0.003500689286738634, + "skip_count": 0.0, + "step": 5190, + "text_loss": 0.5467679500579834 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0005592873552848532, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 8374217.0, + "repeat_count": 2.0, + "routers_loss": 0.010764475911855698, + "skip_count": 3.0, + "step": 5192, + "text_loss": 0.4345340132713318 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 24.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005589800110251045, + "loss": 0.0087, + "macro_f1": 1.0, + "num_tokens": 8378182.0, + "repeat_count": 2.0, + "routers_loss": 0.0010365343187004328, + "skip_count": 1.0, + "step": 5194, + "text_loss": 0.46722909808158875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005586726441641044, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8381227.0, + "repeat_count": 0.0, + "routers_loss": 0.006349093746393919, + "skip_count": 2.0, + "step": 5196, + "text_loss": 0.35410359501838684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0005583652548196362, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8384886.0, + "repeat_count": 0.0, + "routers_loss": 0.00038166221929714084, + "skip_count": 0.0, + "step": 5198, + "text_loss": 0.5950250625610352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005580578431094924, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8388939.0, + "repeat_count": 0.0, + "routers_loss": 0.0023578559048473835, + "skip_count": 2.0, + "step": 5200, + "text_loss": 0.6553771495819092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0005577504091514735, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 8391629.0, + "repeat_count": 0.0, + "routers_loss": 0.0010771085508167744, + "skip_count": 0.0, + "step": 5202, + "text_loss": 0.4441985785961151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.000557442953063389, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8394440.0, + "repeat_count": 0.0, + "routers_loss": 0.005844325292855501, + "skip_count": 3.0, + "step": 5204, + "text_loss": 0.5807011723518372 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0005571354749630564, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8397731.0, + "repeat_count": 0.0, + "routers_loss": 0.006837233901023865, + "skip_count": 1.0, + "step": 5206, + "text_loss": 0.27780941128730774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.000556827974968302, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8400859.0, + "repeat_count": 0.0, + "routers_loss": 0.007656649220734835, + "skip_count": 3.0, + "step": 5208, + "text_loss": 0.4746324121952057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0005565204531969606, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8404164.0, + "repeat_count": 0.0, + "routers_loss": 0.0028129038400948048, + "skip_count": 1.0, + "step": 5210, + "text_loss": 0.8513513803482056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0005562129097668746, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8407196.0, + "repeat_count": 0.0, + "routers_loss": 0.00492360582575202, + "skip_count": 1.0, + "step": 5212, + "text_loss": 0.12255420535802841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005559053447958958, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8410633.0, + "repeat_count": 0.0, + "routers_loss": 0.0020713545382022858, + "skip_count": 0.0, + "step": 5214, + "text_loss": 0.6878522634506226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0005555977584018833, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8413414.0, + "repeat_count": 0.0, + "routers_loss": 0.0007216963567771018, + "skip_count": 0.0, + "step": 5216, + "text_loss": 0.845878541469574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.057861328125, + "learning_rate": 0.0005552901507027048, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8416817.0, + "repeat_count": 0.0, + "routers_loss": 0.002400130731984973, + "skip_count": 1.0, + "step": 5218, + "text_loss": 0.16753672063350677 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0005549825218162365, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 8419617.0, + "repeat_count": 0.0, + "routers_loss": 0.004563181661069393, + "skip_count": 0.0, + "step": 5220, + "text_loss": 0.26107168197631836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.000554674871860362, + "loss": 0.0086, + "macro_f1": 1.0, + "num_tokens": 8422686.0, + "repeat_count": 1.0, + "routers_loss": 0.006413881666958332, + "skip_count": 1.0, + "step": 5222, + "text_loss": 0.6333847045898438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005543672009529734, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 8425571.0, + "repeat_count": 0.0, + "routers_loss": 0.0057656955905258656, + "skip_count": 3.0, + "step": 5224, + "text_loss": 0.4552212357521057 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 24.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0005540595092119709, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 8429038.0, + "repeat_count": 2.0, + "routers_loss": 0.011755156330764294, + "skip_count": 2.0, + "step": 5226, + "text_loss": 0.16597330570220947 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0005537517967552626, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8432117.0, + "repeat_count": 0.0, + "routers_loss": 0.0007519085193052888, + "skip_count": 0.0, + "step": 5228, + "text_loss": 0.6283590197563171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.064453125, + "learning_rate": 0.000553444063700764, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 8435176.0, + "repeat_count": 0.0, + "routers_loss": 0.003066456411033869, + "skip_count": 0.0, + "step": 5230, + "text_loss": 0.2360922247171402 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.0005531363101663998, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8438515.0, + "repeat_count": 0.0, + "routers_loss": 0.002865589689463377, + "skip_count": 0.0, + "step": 5232, + "text_loss": 0.8075396418571472 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005528285362701011, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 8441731.0, + "repeat_count": 0.0, + "routers_loss": 0.0012521179160103202, + "skip_count": 0.0, + "step": 5234, + "text_loss": 0.584335446357727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0005525207421298077, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 8444535.0, + "repeat_count": 0.0, + "routers_loss": 0.005398475099354982, + "skip_count": 3.0, + "step": 5236, + "text_loss": 0.22711622714996338 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005522129278634669, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 8448337.0, + "repeat_count": 0.0, + "routers_loss": 0.002957914723083377, + "skip_count": 1.0, + "step": 5238, + "text_loss": 0.3157515823841095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 24.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0005519050935890335, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 8451530.0, + "repeat_count": 0.0, + "routers_loss": 0.007757039275020361, + "skip_count": 3.0, + "step": 5240, + "text_loss": 0.2815830111503601 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.610507778103905, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005515972394244704, + "loss": 0.0063, + "macro_f1": 0.6603773832321167, + "num_tokens": 8454171.0, + "repeat_count": 1.0, + "routers_loss": 0.021602008491754532, + "skip_count": 1.0, + "step": 5242, + "text_loss": 0.6024490594863892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005512893654877478, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8457544.0, + "repeat_count": 0.0, + "routers_loss": 0.006062488537281752, + "skip_count": 0.0, + "step": 5244, + "text_loss": 0.550110936164856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0005509814718968435, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 8460135.0, + "repeat_count": 0.0, + "routers_loss": 0.002793943975120783, + "skip_count": 0.0, + "step": 5246, + "text_loss": 0.4361286163330078 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0005506735587697433, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8463516.0, + "repeat_count": 0.0, + "routers_loss": 0.0016669550677761436, + "skip_count": 0.0, + "step": 5248, + "text_loss": 0.4642958641052246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0005503656262244395, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 8466406.0, + "repeat_count": 0.0, + "routers_loss": 0.0006051387754268944, + "skip_count": 0.0, + "step": 5250, + "text_loss": 0.3445641100406647 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 24.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0005500576743789329, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 8468838.0, + "repeat_count": 2.0, + "routers_loss": 0.00654293829575181, + "skip_count": 1.0, + "step": 5252, + "text_loss": 0.2842808663845062 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.666862342236573, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005497497033512309, + "loss": 0.0077, + "macro_f1": 0.8817967176437378, + "num_tokens": 8471815.0, + "repeat_count": 2.0, + "routers_loss": 0.03845973685383797, + "skip_count": 3.0, + "step": 5254, + "text_loss": 0.2597215175628662 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 24.676254769592017, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0005494417132593487, + "loss": 0.0047, + "macro_f1": 0.9452888369560242, + "num_tokens": 8475202.0, + "repeat_count": 1.0, + "routers_loss": 0.02252381667494774, + "skip_count": 4.0, + "step": 5256, + "text_loss": 0.32269927859306335 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0005491337042213088, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8478650.0, + "repeat_count": 0.0, + "routers_loss": 0.01232751365751028, + "skip_count": 2.0, + "step": 5258, + "text_loss": 0.6523372530937195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005488256763551408, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8481724.0, + "repeat_count": 0.0, + "routers_loss": 0.0028322834987193346, + "skip_count": 0.0, + "step": 5260, + "text_loss": 0.4212580621242523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.0005485176297788814, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 8485833.0, + "repeat_count": 0.0, + "routers_loss": 0.002623105887323618, + "skip_count": 2.0, + "step": 5262, + "text_loss": 0.16906329989433289 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005482095646105748, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 8489089.0, + "repeat_count": 1.0, + "routers_loss": 0.0007179114618338645, + "skip_count": 0.0, + "step": 5264, + "text_loss": 0.4523872137069702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0005479014809682721, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 8492905.0, + "repeat_count": 0.0, + "routers_loss": 0.005234059412032366, + "skip_count": 0.0, + "step": 5266, + "text_loss": 0.207139790058136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0005475933789700314, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 8495480.0, + "repeat_count": 0.0, + "routers_loss": 0.0023258263245224953, + "skip_count": 0.0, + "step": 5268, + "text_loss": 0.18060965836048126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005472852587339183, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8499070.0, + "repeat_count": 0.0, + "routers_loss": 0.0013497259933501482, + "skip_count": 0.0, + "step": 5270, + "text_loss": 0.7460769414901733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.056640625, + "learning_rate": 0.0005469771203780048, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 8502886.0, + "repeat_count": 0.0, + "routers_loss": 0.0003589815751183778, + "skip_count": 0.0, + "step": 5272, + "text_loss": 0.48119160532951355 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005466689640203701, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8506646.0, + "repeat_count": 0.0, + "routers_loss": 0.006619705818593502, + "skip_count": 1.0, + "step": 5274, + "text_loss": 0.15656520426273346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005463607897791005, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 8509450.0, + "repeat_count": 0.0, + "routers_loss": 0.002992175053805113, + "skip_count": 1.0, + "step": 5276, + "text_loss": 0.486930251121521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0005460525977722886, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8512851.0, + "repeat_count": 0.0, + "routers_loss": 0.0027784097474068403, + "skip_count": 0.0, + "step": 5278, + "text_loss": 0.19654682278633118 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0005457443881180345, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8516858.0, + "repeat_count": 0.0, + "routers_loss": 0.0017648129723966122, + "skip_count": 0.0, + "step": 5280, + "text_loss": 0.580982506275177 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005454361609344444, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 8519912.0, + "repeat_count": 2.0, + "routers_loss": 0.010817649774253368, + "skip_count": 3.0, + "step": 5282, + "text_loss": 0.2644204795360565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.000545127916339632, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8522396.0, + "repeat_count": 0.0, + "routers_loss": 0.001453282660804689, + "skip_count": 0.0, + "step": 5284, + "text_loss": 0.5014839172363281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005448196544517168, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8525326.0, + "repeat_count": 0.0, + "routers_loss": 0.006645771209150553, + "skip_count": 2.0, + "step": 5286, + "text_loss": 0.2983154058456421 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005445113753888254, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8528611.0, + "repeat_count": 0.0, + "routers_loss": 0.0005447337171062827, + "skip_count": 0.0, + "step": 5288, + "text_loss": 0.43598243594169617 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.000544203079269091, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8531571.0, + "repeat_count": 0.0, + "routers_loss": 0.0026976624503731728, + "skip_count": 0.0, + "step": 5290, + "text_loss": 0.6454944610595703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005438947662106533, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 8534565.0, + "repeat_count": 0.0, + "routers_loss": 0.002217630622908473, + "skip_count": 0.0, + "step": 5292, + "text_loss": 0.742935836315155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 29.0, + "epoch": 24.854710889345466, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.029052734375, + "learning_rate": 0.0005435864363316584, + "loss": 0.0073, + "macro_f1": 0.8820862174034119, + "num_tokens": 8537581.0, + "repeat_count": 2.0, + "routers_loss": 0.030740609392523766, + "skip_count": 2.0, + "step": 5294, + "text_loss": 0.48913639783859253 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005432780897502588, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8541271.0, + "repeat_count": 0.0, + "routers_loss": 0.005306888837367296, + "skip_count": 1.0, + "step": 5296, + "text_loss": 0.5820846557617188 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 24.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.0005429697265846137, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8545052.0, + "repeat_count": 1.0, + "routers_loss": 0.002255369909107685, + "skip_count": 0.0, + "step": 5298, + "text_loss": 0.565483808517456 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0005426613469528881, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8548605.0, + "repeat_count": 0.0, + "routers_loss": 0.0010787079809233546, + "skip_count": 0.0, + "step": 5300, + "text_loss": 0.40154510736465454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.000542352950973254, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8552581.0, + "repeat_count": 0.0, + "routers_loss": 0.0017972089117392898, + "skip_count": 0.0, + "step": 5302, + "text_loss": 0.5430748462677002 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04736328125, + "learning_rate": 0.0005420445387638891, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 8556360.0, + "repeat_count": 0.0, + "routers_loss": 0.0016180560924112797, + "skip_count": 2.0, + "step": 5304, + "text_loss": 0.544040322303772 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0005417361104429777, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 8559264.0, + "repeat_count": 1.0, + "routers_loss": 0.012688961811363697, + "skip_count": 2.0, + "step": 5306, + "text_loss": 0.2018517404794693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0005414276661287101, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8562169.0, + "repeat_count": 0.0, + "routers_loss": 0.0012141643092036247, + "skip_count": 0.0, + "step": 5308, + "text_loss": 0.5685747265815735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.059326171875, + "learning_rate": 0.0005411192059392826, + "loss": 0.0098, + "macro_f1": 0.3333333432674408, + "num_tokens": 8565231.0, + "repeat_count": 0.0, + "routers_loss": 0.0015626107342541218, + "skip_count": 0.0, + "step": 5310, + "text_loss": 0.8073471784591675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.0005408107299928979, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8568122.0, + "repeat_count": 0.0, + "routers_loss": 0.004773529712110758, + "skip_count": 0.0, + "step": 5312, + "text_loss": 0.22583355009555817 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 24.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0005405022384077644, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8571056.0, + "repeat_count": 0.0, + "routers_loss": 0.0025621228851377964, + "skip_count": 1.0, + "step": 5314, + "text_loss": 0.25274428725242615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 24.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0005401937313020967, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 8574300.0, + "repeat_count": 0.0, + "routers_loss": 0.009726752527058125, + "skip_count": 2.0, + "step": 5316, + "text_loss": 0.3283393979072571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 24.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005398852087941155, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8577424.0, + "repeat_count": 0.0, + "routers_loss": 0.012483839876949787, + "skip_count": 4.0, + "step": 5318, + "text_loss": 0.1876130849123001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.000539576671002047, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 8580309.0, + "repeat_count": 0.0, + "routers_loss": 0.0009830677881836891, + "skip_count": 0.0, + "step": 5320, + "text_loss": 0.6955490708351135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046875, + "learning_rate": 0.0005392681180441235, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 8583399.0, + "repeat_count": 0.0, + "routers_loss": 0.0010819481685757637, + "skip_count": 0.0, + "step": 5322, + "text_loss": 0.4708341956138611 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 24.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.000538959550038583, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8586259.0, + "repeat_count": 0.0, + "routers_loss": 0.005763369146734476, + "skip_count": 0.0, + "step": 5324, + "text_loss": 0.20463642477989197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005386509671036695, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8589067.0, + "repeat_count": 0.0, + "routers_loss": 0.0006229027640074492, + "skip_count": 0.0, + "step": 5326, + "text_loss": 0.6819888353347778 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 25.014088641033165, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.03466796875, + "learning_rate": 0.0005383423693576325, + "loss": 0.0087, + "macro_f1": 0.9619450569152832, + "num_tokens": 8592837.0, + "repeat_count": 1.0, + "routers_loss": 0.030066559091210365, + "skip_count": 6.0, + "step": 5328, + "text_loss": 0.24606549739837646 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0005380337569187272, + "loss": 0.0092, + "macro_f1": 0.6666666865348816, + "num_tokens": 8596293.0, + "repeat_count": 1.0, + "routers_loss": 0.007445990107953548, + "skip_count": 0.0, + "step": 5330, + "text_loss": 0.16730253398418427 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 25.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0005377251299052145, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 8599360.0, + "repeat_count": 1.0, + "routers_loss": 0.004563331138342619, + "skip_count": 1.0, + "step": 5332, + "text_loss": 0.6856988668441772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0005374164884353608, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8602376.0, + "repeat_count": 0.0, + "routers_loss": 0.0015491938684135675, + "skip_count": 0.0, + "step": 5334, + "text_loss": 1.3248854875564575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005371078326274382, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8605400.0, + "repeat_count": 0.0, + "routers_loss": 0.0016098044579848647, + "skip_count": 0.0, + "step": 5336, + "text_loss": 0.747150182723999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 25.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0005367991625997243, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 8608100.0, + "repeat_count": 0.0, + "routers_loss": 0.0034471298567950726, + "skip_count": 3.0, + "step": 5338, + "text_loss": 0.6443291902542114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005364904784705015, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8611768.0, + "repeat_count": 0.0, + "routers_loss": 0.007947597652673721, + "skip_count": 1.0, + "step": 5340, + "text_loss": 0.7768037915229797 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 25.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0005361817803580588, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 8614424.0, + "repeat_count": 2.0, + "routers_loss": 0.009964234195649624, + "skip_count": 2.0, + "step": 5342, + "text_loss": 0.22826914489269257 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005358730683806896, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8617826.0, + "repeat_count": 0.0, + "routers_loss": 0.0014116480015218258, + "skip_count": 0.0, + "step": 5344, + "text_loss": 0.49022090435028076 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 25.098620487232168, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005355643426566929, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 8621220.0, + "repeat_count": 1.0, + "routers_loss": 0.013940622098743916, + "skip_count": 2.0, + "step": 5346, + "text_loss": 0.26819515228271484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.000535255603304373, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8623957.0, + "repeat_count": 0.0, + "routers_loss": 0.0032230091746896505, + "skip_count": 2.0, + "step": 5348, + "text_loss": 0.46905452013015747 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005349468504420395, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8626760.0, + "repeat_count": 0.0, + "routers_loss": 0.002631337149068713, + "skip_count": 1.0, + "step": 5350, + "text_loss": 0.5312309861183167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005346380841880068, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8630207.0, + "repeat_count": 0.0, + "routers_loss": 0.004526057746261358, + "skip_count": 2.0, + "step": 5352, + "text_loss": 0.5810666084289551 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0005343293046605949, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8633241.0, + "repeat_count": 0.0, + "routers_loss": 0.0023941127583384514, + "skip_count": 0.0, + "step": 5354, + "text_loss": 0.18468725681304932 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0005340205119781288, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8636215.0, + "repeat_count": 1.0, + "routers_loss": 0.0017020340310409665, + "skip_count": 0.0, + "step": 5356, + "text_loss": 0.6665788888931274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005337117062589383, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 8639326.0, + "repeat_count": 0.0, + "routers_loss": 0.004964717663824558, + "skip_count": 2.0, + "step": 5358, + "text_loss": 0.19770404696464539 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005334028876213585, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8642157.0, + "repeat_count": 0.0, + "routers_loss": 0.006587155628949404, + "skip_count": 0.0, + "step": 5360, + "text_loss": 0.2295130044221878 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0005330940561837291, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8645355.0, + "repeat_count": 0.0, + "routers_loss": 0.0006586945964954793, + "skip_count": 0.0, + "step": 5362, + "text_loss": 0.2701159417629242 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005327852120643947, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8648911.0, + "repeat_count": 1.0, + "routers_loss": 0.0014281768817454576, + "skip_count": 0.0, + "step": 5364, + "text_loss": 0.8957229852676392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0005324763553817053, + "loss": 0.0027, + "macro_f1": 0.3333333432674408, + "num_tokens": 8652037.0, + "repeat_count": 0.0, + "routers_loss": 0.0005899337120354176, + "skip_count": 0.0, + "step": 5366, + "text_loss": 0.38642236590385437 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 25.20193718814206, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005321674862540154, + "loss": 0.0058, + "macro_f1": 0.9265305995941162, + "num_tokens": 8655381.0, + "repeat_count": 3.0, + "routers_loss": 0.024511313065886497, + "skip_count": 1.0, + "step": 5368, + "text_loss": 0.6439879536628723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000531858604799684, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 8658476.0, + "repeat_count": 0.0, + "routers_loss": 0.0012558114249259233, + "skip_count": 0.0, + "step": 5370, + "text_loss": 0.3227672874927521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06005859375, + "learning_rate": 0.0005315497111370752, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 8661982.0, + "repeat_count": 0.0, + "routers_loss": 0.0013541636290028691, + "skip_count": 0.0, + "step": 5372, + "text_loss": 0.6375321745872498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.230114470208395, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.051513671875, + "learning_rate": 0.0005312408053845575, + "loss": 0.0052, + "macro_f1": 0.5492662787437439, + "num_tokens": 8665071.0, + "repeat_count": 0.0, + "routers_loss": 0.010432626120746136, + "skip_count": 2.0, + "step": 5374, + "text_loss": 0.536924421787262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005309318876605042, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8668411.0, + "repeat_count": 0.0, + "routers_loss": 0.004450209904462099, + "skip_count": 1.0, + "step": 5376, + "text_loss": 0.2643466889858246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005306229580832933, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 8672088.0, + "repeat_count": 1.0, + "routers_loss": 0.011189920827746391, + "skip_count": 3.0, + "step": 5378, + "text_loss": 0.8259533047676086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.000530314016771307, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8675206.0, + "repeat_count": 0.0, + "routers_loss": 0.0020095291547477245, + "skip_count": 0.0, + "step": 5380, + "text_loss": 0.31364113092422485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.267684179630173, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0005300050638429324, + "loss": 0.0078, + "macro_f1": 0.3272727429866791, + "num_tokens": 8678289.0, + "repeat_count": 0.0, + "routers_loss": 0.010738557204604149, + "skip_count": 1.0, + "step": 5382, + "text_loss": 0.19013966619968414 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0005296960994165607, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8681555.0, + "repeat_count": 0.0, + "routers_loss": 0.0018534278497099876, + "skip_count": 1.0, + "step": 5384, + "text_loss": 0.762248694896698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0005293871236105877, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 8684413.0, + "repeat_count": 0.0, + "routers_loss": 0.009143726900219917, + "skip_count": 2.0, + "step": 5386, + "text_loss": 0.19994212687015533 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 25.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005290781365434134, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8687450.0, + "repeat_count": 2.0, + "routers_loss": 0.002034468576312065, + "skip_count": 0.0, + "step": 5388, + "text_loss": 0.5519160628318787 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0005287691383334425, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8690651.0, + "repeat_count": 1.0, + "routers_loss": 0.006834167055785656, + "skip_count": 0.0, + "step": 5390, + "text_loss": 0.5439304709434509 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0005284601290990832, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8693929.0, + "repeat_count": 1.0, + "routers_loss": 0.0022327799815684557, + "skip_count": 0.0, + "step": 5392, + "text_loss": 0.24108269810676575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0005281511089587491, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 8696727.0, + "repeat_count": 0.0, + "routers_loss": 0.002669565612450242, + "skip_count": 0.0, + "step": 5394, + "text_loss": 0.8659077286720276 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005278420780308568, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8700934.0, + "repeat_count": 0.0, + "routers_loss": 0.007252473384141922, + "skip_count": 0.0, + "step": 5396, + "text_loss": 0.5592793226242065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005275330364338276, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 8704449.0, + "repeat_count": 0.0, + "routers_loss": 0.001793015981093049, + "skip_count": 0.0, + "step": 5398, + "text_loss": 0.5211784243583679 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 25.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0005272239842860868, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 8707384.0, + "repeat_count": 5.0, + "routers_loss": 0.00963665172457695, + "skip_count": 4.0, + "step": 5400, + "text_loss": 0.6092788577079773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.36160845318462, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03515625, + "learning_rate": 0.0005269149217060642, + "loss": 0.0059, + "macro_f1": 0.5492662787437439, + "num_tokens": 8710453.0, + "repeat_count": 0.0, + "routers_loss": 0.01758105307817459, + "skip_count": 2.0, + "step": 5402, + "text_loss": 0.3423936069011688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0005266058488121926, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8713514.0, + "repeat_count": 0.0, + "routers_loss": 0.0025636721402406693, + "skip_count": 1.0, + "step": 5404, + "text_loss": 0.484171986579895 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.38039330789551, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0005262967657229095, + "loss": 0.0064, + "macro_f1": 0.9255813956260681, + "num_tokens": 8717051.0, + "repeat_count": 3.0, + "routers_loss": 0.022406045347452164, + "skip_count": 4.0, + "step": 5406, + "text_loss": 0.23368191719055176 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005259876725566563, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8719987.0, + "repeat_count": 0.0, + "routers_loss": 0.004114408977329731, + "skip_count": 2.0, + "step": 5408, + "text_loss": 0.20237496495246887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.000525678569431878, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 8723258.0, + "repeat_count": 0.0, + "routers_loss": 0.006741158664226532, + "skip_count": 2.0, + "step": 5410, + "text_loss": 0.7969435453414917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.0005253694564670233, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 8726294.0, + "repeat_count": 0.0, + "routers_loss": 0.0034468702506273985, + "skip_count": 0.0, + "step": 5412, + "text_loss": 0.5533816814422607 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.000525060333780545, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8729603.0, + "repeat_count": 0.0, + "routers_loss": 0.01086533535271883, + "skip_count": 2.0, + "step": 5414, + "text_loss": 0.31856611371040344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005247512014908998, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 8733423.0, + "repeat_count": 0.0, + "routers_loss": 0.00512756546959281, + "skip_count": 6.0, + "step": 5416, + "text_loss": 0.6710903644561768 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06103515625, + "learning_rate": 0.0005244420597165472, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 8736457.0, + "repeat_count": 0.0, + "routers_loss": 0.0026201079599559307, + "skip_count": 0.0, + "step": 5418, + "text_loss": 0.6469964981079102 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.0005241329085759514, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 8739617.0, + "repeat_count": 0.0, + "routers_loss": 0.004130818881094456, + "skip_count": 0.0, + "step": 5420, + "text_loss": 0.4868837296962738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005238237481875795, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8742653.0, + "repeat_count": 0.0, + "routers_loss": 0.003171122632920742, + "skip_count": 0.0, + "step": 5422, + "text_loss": 0.12026242166757584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0634765625, + "learning_rate": 0.0005235145786699021, + "loss": 0.0091, + "macro_f1": 0.3333333432674408, + "num_tokens": 8745835.0, + "repeat_count": 0.0, + "routers_loss": 0.0008553664083592594, + "skip_count": 0.0, + "step": 5424, + "text_loss": 0.601640522480011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0005232054001413941, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8749006.0, + "repeat_count": 0.0, + "routers_loss": 0.0006958908052183688, + "skip_count": 0.0, + "step": 5426, + "text_loss": 0.7083519101142883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 0.0005228962127205329, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 8752493.0, + "repeat_count": 0.0, + "routers_loss": 0.0012221037177368999, + "skip_count": 1.0, + "step": 5428, + "text_loss": 0.3949109613895416 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005225870165257997, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 8755294.0, + "repeat_count": 1.0, + "routers_loss": 0.003924673888832331, + "skip_count": 2.0, + "step": 5430, + "text_loss": 0.7487186789512634 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005222778116756793, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 8758043.0, + "repeat_count": 0.0, + "routers_loss": 0.002388258930295706, + "skip_count": 0.0, + "step": 5432, + "text_loss": 0.4092858135700226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0005219685982886594, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 8760618.0, + "repeat_count": 1.0, + "routers_loss": 0.0045886957086622715, + "skip_count": 0.0, + "step": 5434, + "text_loss": 0.5889580249786377 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0005216593764832311, + "loss": 0.0074, + "macro_f1": 1.0, + "num_tokens": 8764269.0, + "repeat_count": 1.0, + "routers_loss": 0.00704155582934618, + "skip_count": 2.0, + "step": 5436, + "text_loss": 0.2634117007255554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005213501463778889, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8767142.0, + "repeat_count": 0.0, + "routers_loss": 0.00368728069588542, + "skip_count": 2.0, + "step": 5438, + "text_loss": 0.3512301445007324 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05322265625, + "learning_rate": 0.0005210409080911304, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8770239.0, + "repeat_count": 0.0, + "routers_loss": 0.0012925115879625082, + "skip_count": 0.0, + "step": 5440, + "text_loss": 0.9330073595046997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0005207316617414561, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8772927.0, + "repeat_count": 0.0, + "routers_loss": 0.005604506935924292, + "skip_count": 0.0, + "step": 5442, + "text_loss": 0.23477613925933838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.55884942764896, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0005204224074473701, + "loss": 0.0049, + "macro_f1": 0.6601307392120361, + "num_tokens": 8776451.0, + "repeat_count": 1.0, + "routers_loss": 0.010945434682071209, + "skip_count": 2.0, + "step": 5444, + "text_loss": 0.6184295415878296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.0005201131453273789, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 8779481.0, + "repeat_count": 0.0, + "routers_loss": 0.0024414353538304567, + "skip_count": 0.0, + "step": 5446, + "text_loss": 0.16186967492103577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.57763428235985, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0005198038754999926, + "loss": 0.0052, + "macro_f1": 0.3272727429866791, + "num_tokens": 8782425.0, + "repeat_count": 1.0, + "routers_loss": 0.013872416689991951, + "skip_count": 0.0, + "step": 5448, + "text_loss": 0.42294546961784363 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0005194945980837237, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 8785466.0, + "repeat_count": 0.0, + "routers_loss": 0.0006147907115519047, + "skip_count": 0.0, + "step": 5450, + "text_loss": 0.6285432577133179 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0005191853131970881, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 8788461.0, + "repeat_count": 0.0, + "routers_loss": 0.0010585964191704988, + "skip_count": 0.0, + "step": 5452, + "text_loss": 0.6032317876815796 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005188760209586044, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 8791572.0, + "repeat_count": 0.0, + "routers_loss": 0.005267909727990627, + "skip_count": 1.0, + "step": 5454, + "text_loss": 0.3015609681606293 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005185667214867937, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 8794697.0, + "repeat_count": 0.0, + "routers_loss": 0.000532392121385783, + "skip_count": 0.0, + "step": 5456, + "text_loss": 0.9596265554428101 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0005182574149001805, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 8797880.0, + "repeat_count": 0.0, + "routers_loss": 0.0007176774088293314, + "skip_count": 0.0, + "step": 5458, + "text_loss": 0.5599364638328552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0005179481013172912, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 8801995.0, + "repeat_count": 0.0, + "routers_loss": 0.0022756673861294985, + "skip_count": 0.0, + "step": 5460, + "text_loss": 0.47327280044555664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005176387808566558, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 8805138.0, + "repeat_count": 0.0, + "routers_loss": 0.0025084633380174637, + "skip_count": 0.0, + "step": 5462, + "text_loss": 0.26674970984458923 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05078125, + "learning_rate": 0.0005173294536368061, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 8808102.0, + "repeat_count": 0.0, + "routers_loss": 0.0008814680040813982, + "skip_count": 0.0, + "step": 5464, + "text_loss": 0.5981299877166748 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005170201197762773, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8811431.0, + "repeat_count": 0.0, + "routers_loss": 0.0005443177651613951, + "skip_count": 0.0, + "step": 5466, + "text_loss": 1.037438988685608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0005167107793936065, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 8814256.0, + "repeat_count": 0.0, + "routers_loss": 0.000494555220939219, + "skip_count": 0.0, + "step": 5468, + "text_loss": 0.5005733966827393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005164014326073333, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 8817024.0, + "repeat_count": 0.0, + "routers_loss": 0.004793747793883085, + "skip_count": 2.0, + "step": 5470, + "text_loss": 0.6999614834785461 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005160920795360002, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 8819892.0, + "repeat_count": 0.0, + "routers_loss": 0.0020966180600225925, + "skip_count": 0.0, + "step": 5472, + "text_loss": 0.5536707043647766 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0005157827202981521, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 8822928.0, + "repeat_count": 0.0, + "routers_loss": 0.0020367507822811604, + "skip_count": 0.0, + "step": 5474, + "text_loss": 0.43655988574028015 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005154733550123356, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8825842.0, + "repeat_count": 0.0, + "routers_loss": 0.0020070383325219154, + "skip_count": 0.0, + "step": 5476, + "text_loss": 0.48149657249450684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0005151639837971004, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 8829534.0, + "repeat_count": 0.0, + "routers_loss": 0.0016327418852597475, + "skip_count": 0.0, + "step": 5478, + "text_loss": 0.6693689227104187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.000514854606770998, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 8833177.0, + "repeat_count": 0.0, + "routers_loss": 0.0012691980227828026, + "skip_count": 0.0, + "step": 5480, + "text_loss": 0.44926801323890686 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0005145452240525822, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 8836933.0, + "repeat_count": 1.0, + "routers_loss": 0.0007724820752628148, + "skip_count": 0.0, + "step": 5482, + "text_loss": 0.5759884119033813 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0005142358357604092, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 8840093.0, + "repeat_count": 1.0, + "routers_loss": 0.008331702090799809, + "skip_count": 7.0, + "step": 5484, + "text_loss": 0.47393685579299927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0005139264420130368, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8843918.0, + "repeat_count": 0.0, + "routers_loss": 0.003124477108940482, + "skip_count": 2.0, + "step": 5486, + "text_loss": 0.5298711061477661 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005136170429290259, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 8846558.0, + "repeat_count": 0.0, + "routers_loss": 0.0034127775579690933, + "skip_count": 2.0, + "step": 5488, + "text_loss": 0.43582668900489807 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0005133076386269383, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 8849724.0, + "repeat_count": 1.0, + "routers_loss": 0.0018056259723380208, + "skip_count": 0.0, + "step": 5490, + "text_loss": 0.8116800785064697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 25.784267684179632, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0005129982292253384, + "loss": 0.0063, + "macro_f1": 0.6589147448539734, + "num_tokens": 8852447.0, + "repeat_count": 1.0, + "routers_loss": 0.021452350541949272, + "skip_count": 6.0, + "step": 5492, + "text_loss": 0.31878748536109924 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0005126888148427927, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 8855886.0, + "repeat_count": 0.0, + "routers_loss": 0.0026911941822618246, + "skip_count": 0.0, + "step": 5494, + "text_loss": 0.4021807909011841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 25.80305253889052, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.025634765625, + "learning_rate": 0.0005123793955978693, + "loss": 0.007, + "macro_f1": 0.5492662787437439, + "num_tokens": 8859378.0, + "repeat_count": 0.0, + "routers_loss": 0.019764510914683342, + "skip_count": 2.0, + "step": 5496, + "text_loss": 0.21608132123947144 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0005120699716091379, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 8862310.0, + "repeat_count": 0.0, + "routers_loss": 0.0008988190093077719, + "skip_count": 0.0, + "step": 5498, + "text_loss": 0.34666743874549866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0005117605429951707, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 8865166.0, + "repeat_count": 0.0, + "routers_loss": 0.011137975379824638, + "skip_count": 2.0, + "step": 5500, + "text_loss": 0.25385144352912903 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 25.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0005114511098745412, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 8869923.0, + "repeat_count": 1.0, + "routers_loss": 0.006476947572082281, + "skip_count": 4.0, + "step": 5502, + "text_loss": 0.4503856301307678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.000511141672365825, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8872451.0, + "repeat_count": 0.0, + "routers_loss": 0.0022727579344063997, + "skip_count": 0.0, + "step": 5504, + "text_loss": 0.7522464990615845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005108322305875987, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8875968.0, + "repeat_count": 0.0, + "routers_loss": 0.0020014268811792135, + "skip_count": 0.0, + "step": 5506, + "text_loss": 0.30184176564216614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.0005105227846584414, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 8879705.0, + "repeat_count": 0.0, + "routers_loss": 0.001179999322630465, + "skip_count": 0.0, + "step": 5508, + "text_loss": 0.6187804937362671 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0005102133346969329, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 8883535.0, + "repeat_count": 1.0, + "routers_loss": 0.002946492750197649, + "skip_count": 0.0, + "step": 5510, + "text_loss": 0.5961501002311707 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0005099038808216555, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 8886683.0, + "repeat_count": 1.0, + "routers_loss": 0.004532935563474894, + "skip_count": 3.0, + "step": 5512, + "text_loss": 0.38462957739830017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0005095944231511922, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 8891049.0, + "repeat_count": 0.0, + "routers_loss": 0.00917842984199524, + "skip_count": 2.0, + "step": 5514, + "text_loss": 0.27541956305503845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0005092849618041279, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 8893604.0, + "repeat_count": 0.0, + "routers_loss": 0.0008756510796956718, + "skip_count": 0.0, + "step": 5516, + "text_loss": 0.681315541267395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005089754968990487, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 8898072.0, + "repeat_count": 0.0, + "routers_loss": 0.0008704439387656748, + "skip_count": 1.0, + "step": 5518, + "text_loss": 0.5060005187988281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005086660285545422, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 8901539.0, + "repeat_count": 0.0, + "routers_loss": 0.004750201944261789, + "skip_count": 1.0, + "step": 5520, + "text_loss": 0.6008047461509705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 25.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.000508356556889197, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 8904525.0, + "repeat_count": 0.0, + "routers_loss": 0.0026552649214863777, + "skip_count": 0.0, + "step": 5522, + "text_loss": 0.4539012908935547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 25.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0005080470820216037, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8907624.0, + "repeat_count": 0.0, + "routers_loss": 0.002621029270812869, + "skip_count": 1.0, + "step": 5524, + "text_loss": 0.20088370144367218 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 25.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005077376040703533, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 8910515.0, + "repeat_count": 3.0, + "routers_loss": 0.0028921898920089006, + "skip_count": 0.0, + "step": 5526, + "text_loss": 0.6575983166694641 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8888888955116272, + "avg_layers": 21.0, + "epoch": 25.953331376577633, + "f1_execute": 0.9729729890823364, + "f1_repeat": 1.0, + "f1_skip": 0.9411765336990356, + "grad_norm": 0.02734375, + "learning_rate": 0.0005074281231540384, + "loss": 0.0076, + "macro_f1": 0.9713832139968872, + "num_tokens": 8914419.0, + "repeat_count": 1.0, + "routers_loss": 0.024232301861047745, + "skip_count": 9.0, + "step": 5528, + "text_loss": 0.5435594916343689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 25.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0005071186393912527, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 8917543.0, + "repeat_count": 0.0, + "routers_loss": 0.003731841454282403, + "skip_count": 2.0, + "step": 5530, + "text_loss": 0.5152071118354797 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005068091529005909, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 8920728.0, + "repeat_count": 1.0, + "routers_loss": 0.005905418191105127, + "skip_count": 0.0, + "step": 5532, + "text_loss": 0.29741042852401733 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 25.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.000506499663800649, + "loss": 0.0096, + "macro_f1": 0.6666666865348816, + "num_tokens": 8924112.0, + "repeat_count": 1.0, + "routers_loss": 0.0021933517418801785, + "skip_count": 0.0, + "step": 5534, + "text_loss": 0.45704230666160583 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 25.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005061901722100235, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 8927323.0, + "repeat_count": 0.0, + "routers_loss": 0.009227502159774303, + "skip_count": 4.0, + "step": 5536, + "text_loss": 0.1968434453010559 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.0, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0005058806782473125, + "loss": 0.0053, + "macro_f1": 0.6601307392120361, + "num_tokens": 8931052.0, + "repeat_count": 1.0, + "routers_loss": 0.02054760232567787, + "skip_count": 2.0, + "step": 5538, + "text_loss": 0.23851273953914642 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0005055711820311144, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8934215.0, + "repeat_count": 0.0, + "routers_loss": 0.0008434011251665652, + "skip_count": 0.0, + "step": 5540, + "text_loss": 0.85942542552948 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0005052616836800288, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 8937173.0, + "repeat_count": 0.0, + "routers_loss": 0.011105241253972054, + "skip_count": 4.0, + "step": 5542, + "text_loss": 0.2614556849002838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0005049521833126561, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8940553.0, + "repeat_count": 0.0, + "routers_loss": 0.0006273435428738594, + "skip_count": 0.0, + "step": 5544, + "text_loss": 0.6430498957633972 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0005046426810475976, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 8943753.0, + "repeat_count": 0.0, + "routers_loss": 0.0023464353289455175, + "skip_count": 1.0, + "step": 5546, + "text_loss": 0.7015808820724487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06689453125, + "learning_rate": 0.0005043331770034547, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 8947149.0, + "repeat_count": 0.0, + "routers_loss": 0.0016024730866774917, + "skip_count": 1.0, + "step": 5548, + "text_loss": 0.5875257253646851 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005040236712988304, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 8950374.0, + "repeat_count": 0.0, + "routers_loss": 0.004096277989447117, + "skip_count": 0.0, + "step": 5550, + "text_loss": 0.1712338626384735 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0005037141640523275, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 8953256.0, + "repeat_count": 1.0, + "routers_loss": 0.00441550649702549, + "skip_count": 0.0, + "step": 5552, + "text_loss": 0.16560404002666473 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0005034046553825501, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 8956845.0, + "repeat_count": 4.0, + "routers_loss": 0.011712636798620224, + "skip_count": 6.0, + "step": 5554, + "text_loss": 0.24278216063976288 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005030951454081023, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 8961165.0, + "repeat_count": 0.0, + "routers_loss": 0.00235542468726635, + "skip_count": 1.0, + "step": 5556, + "text_loss": 0.17214511334896088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.093924273554446, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0005027856342475888, + "loss": 0.0037, + "macro_f1": 0.3272727429866791, + "num_tokens": 8965262.0, + "repeat_count": 0.0, + "routers_loss": 0.0160827673971653, + "skip_count": 1.0, + "step": 5558, + "text_loss": 0.40229740738868713 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.0005024761220196151, + "loss": 0.0091, + "macro_f1": 0.6666666865348816, + "num_tokens": 8968278.0, + "repeat_count": 1.0, + "routers_loss": 0.004786997567862272, + "skip_count": 0.0, + "step": 5560, + "text_loss": 0.24828575551509857 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.0005021666088427868, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 8971443.0, + "repeat_count": 1.0, + "routers_loss": 0.0015378865646198392, + "skip_count": 0.0, + "step": 5562, + "text_loss": 0.7269657254219055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.0005018570948357099, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 8975312.0, + "repeat_count": 0.0, + "routers_loss": 0.0015218508196994662, + "skip_count": 0.0, + "step": 5564, + "text_loss": 0.5198811292648315 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0005015475801169908, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 8977951.0, + "repeat_count": 0.0, + "routers_loss": 0.008865317329764366, + "skip_count": 1.0, + "step": 5566, + "text_loss": 0.1541406810283661 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005012380648052359, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 8981325.0, + "repeat_count": 1.0, + "routers_loss": 0.0055318837985396385, + "skip_count": 0.0, + "step": 5568, + "text_loss": 0.510314404964447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0005009285490190523, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 8984661.0, + "repeat_count": 0.0, + "routers_loss": 0.0035060355439782143, + "skip_count": 0.0, + "step": 5570, + "text_loss": 0.29421761631965637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000500619032877047, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 8987573.0, + "repeat_count": 0.0, + "routers_loss": 0.0050126477144658566, + "skip_count": 2.0, + "step": 5572, + "text_loss": 0.1984361708164215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0005003095164978271, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 8991136.0, + "repeat_count": 0.0, + "routers_loss": 0.0019407360814511776, + "skip_count": 0.0, + "step": 5574, + "text_loss": 0.42751404643058777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0005, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 8994198.0, + "repeat_count": 0.0, + "routers_loss": 0.0029819176997989416, + "skip_count": 2.0, + "step": 5576, + "text_loss": 0.20589640736579895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004996904835021729, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 8997907.0, + "repeat_count": 0.0, + "routers_loss": 0.000878945691511035, + "skip_count": 1.0, + "step": 5578, + "text_loss": 0.2801406979560852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000499380967122953, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9001141.0, + "repeat_count": 0.0, + "routers_loss": 0.005223734769970179, + "skip_count": 1.0, + "step": 5580, + "text_loss": 0.20542480051517487 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004990714509809478, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9004794.0, + "repeat_count": 0.0, + "routers_loss": 0.0015868612099438906, + "skip_count": 0.0, + "step": 5582, + "text_loss": 0.32094934582710266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 26.216025829175226, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004987619351947643, + "loss": 0.0064, + "macro_f1": 0.6122449040412903, + "num_tokens": 9009250.0, + "repeat_count": 0.0, + "routers_loss": 0.031923454254865646, + "skip_count": 4.0, + "step": 5584, + "text_loss": 0.609201967716217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0004984524198830095, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9013254.0, + "repeat_count": 0.0, + "routers_loss": 0.0033124545589089394, + "skip_count": 0.0, + "step": 5586, + "text_loss": 0.3698650300502777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0004981429051642903, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9016598.0, + "repeat_count": 0.0, + "routers_loss": 0.0017190382350236177, + "skip_count": 1.0, + "step": 5588, + "text_loss": 0.5306026935577393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.24420311124156, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004978333911572132, + "loss": 0.0059, + "macro_f1": 0.3272727429866791, + "num_tokens": 9019558.0, + "repeat_count": 0.0, + "routers_loss": 0.02051064372062683, + "skip_count": 1.0, + "step": 5590, + "text_loss": 0.23494470119476318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0004975238779803849, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 9023024.0, + "repeat_count": 0.0, + "routers_loss": 0.0010489600244909525, + "skip_count": 0.0, + "step": 5592, + "text_loss": 0.579275906085968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0004972143657524112, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9026161.0, + "repeat_count": 0.0, + "routers_loss": 0.0012039231369271874, + "skip_count": 0.0, + "step": 5594, + "text_loss": 0.5776295065879822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0004969048545918978, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9028814.0, + "repeat_count": 0.0, + "routers_loss": 0.0010212450288236141, + "skip_count": 1.0, + "step": 5596, + "text_loss": 0.6816855669021606 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 26.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00049659534461745, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9032243.0, + "repeat_count": 2.0, + "routers_loss": 0.0024297661148011684, + "skip_count": 0.0, + "step": 5598, + "text_loss": 0.743188202381134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0004962858359476726, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 9035493.0, + "repeat_count": 0.0, + "routers_loss": 0.002151754219084978, + "skip_count": 0.0, + "step": 5600, + "text_loss": 0.5213983654975891 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004959763287011698, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 9038213.0, + "repeat_count": 0.0, + "routers_loss": 0.0028108188416808844, + "skip_count": 2.0, + "step": 5602, + "text_loss": 0.5128397345542908 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004956668229965454, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9041152.0, + "repeat_count": 0.0, + "routers_loss": 0.004022551700472832, + "skip_count": 2.0, + "step": 5604, + "text_loss": 0.15361636877059937 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0004953573189524026, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9044503.0, + "repeat_count": 0.0, + "routers_loss": 0.0010689410846680403, + "skip_count": 1.0, + "step": 5606, + "text_loss": 0.6454885005950928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0004950478166873439, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 9047742.0, + "repeat_count": 0.0, + "routers_loss": 0.0025760293938219547, + "skip_count": 0.0, + "step": 5608, + "text_loss": 0.7654000520706177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0004947383163199713, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 9050349.0, + "repeat_count": 0.0, + "routers_loss": 0.0009846165776252747, + "skip_count": 0.0, + "step": 5610, + "text_loss": 0.41533342003822327 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0004944288179688858, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 9053667.0, + "repeat_count": 0.0, + "routers_loss": 0.0017193946987390518, + "skip_count": 1.0, + "step": 5612, + "text_loss": 1.0172475576400757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004941193217526875, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9056777.0, + "repeat_count": 0.0, + "routers_loss": 0.0026750199031084776, + "skip_count": 0.0, + "step": 5614, + "text_loss": 0.17584927380084991 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 26.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004938098277899765, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 9060609.0, + "repeat_count": 1.0, + "routers_loss": 0.005259076599031687, + "skip_count": 1.0, + "step": 5616, + "text_loss": 0.5522297024726868 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004935003361993511, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9063633.0, + "repeat_count": 0.0, + "routers_loss": 0.0006837095716036856, + "skip_count": 0.0, + "step": 5618, + "text_loss": 0.5212588310241699 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.38508952157323, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0004931908470994091, + "loss": 0.0059, + "macro_f1": 0.6603773832321167, + "num_tokens": 9067777.0, + "repeat_count": 1.0, + "routers_loss": 0.01067375484853983, + "skip_count": 1.0, + "step": 5620, + "text_loss": 0.5515062808990479 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 26.394481948928675, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.019775390625, + "learning_rate": 0.0004928813606087474, + "loss": 0.0043, + "macro_f1": 0.5934640765190125, + "num_tokens": 9070938.0, + "repeat_count": 0.0, + "routers_loss": 0.016635602340102196, + "skip_count": 3.0, + "step": 5622, + "text_loss": 0.3225076198577881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004925718768459617, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9074050.0, + "repeat_count": 0.0, + "routers_loss": 0.002216119086369872, + "skip_count": 0.0, + "step": 5624, + "text_loss": 0.32438889145851135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0004922623959296469, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 9076785.0, + "repeat_count": 1.0, + "routers_loss": 0.012125075794756413, + "skip_count": 5.0, + "step": 5626, + "text_loss": 0.39563658833503723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.0004919529179783965, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9080239.0, + "repeat_count": 0.0, + "routers_loss": 0.0026486809365451336, + "skip_count": 0.0, + "step": 5628, + "text_loss": 0.5401569604873657 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0004916434431108031, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9083935.0, + "repeat_count": 0.0, + "routers_loss": 0.0011849761940538883, + "skip_count": 0.0, + "step": 5630, + "text_loss": 0.4798774719238281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.000491333971445458, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9087174.0, + "repeat_count": 0.0, + "routers_loss": 0.002799210138618946, + "skip_count": 0.0, + "step": 5632, + "text_loss": 0.22488386929035187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004910245031009515, + "loss": 0.0096, + "macro_f1": 0.3333333432674408, + "num_tokens": 9089803.0, + "repeat_count": 0.0, + "routers_loss": 0.00139117450453341, + "skip_count": 0.0, + "step": 5634, + "text_loss": 0.6237335205078125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0004907150381958723, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9093075.0, + "repeat_count": 0.0, + "routers_loss": 0.006503603886812925, + "skip_count": 1.0, + "step": 5636, + "text_loss": 0.18781614303588867 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0004904055768488077, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9096355.0, + "repeat_count": 0.0, + "routers_loss": 0.0009764843271113932, + "skip_count": 0.0, + "step": 5638, + "text_loss": 0.6821450591087341 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004900961191783445, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9098994.0, + "repeat_count": 1.0, + "routers_loss": 0.00693159457296133, + "skip_count": 3.0, + "step": 5640, + "text_loss": 0.214790940284729 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0004897866653030671, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9102048.0, + "repeat_count": 0.0, + "routers_loss": 0.002469591563567519, + "skip_count": 0.0, + "step": 5642, + "text_loss": 0.1556607335805893 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004894772153415588, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9105379.0, + "repeat_count": 0.0, + "routers_loss": 0.0004824921488761902, + "skip_count": 0.0, + "step": 5644, + "text_loss": 0.499972403049469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004891677694124013, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9108240.0, + "repeat_count": 0.0, + "routers_loss": 0.0029356612358242273, + "skip_count": 1.0, + "step": 5646, + "text_loss": 0.5169754028320312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0174560546875, + "learning_rate": 0.0004888583276341751, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 9111381.0, + "repeat_count": 0.0, + "routers_loss": 0.009489183314144611, + "skip_count": 1.0, + "step": 5648, + "text_loss": 0.23630797863006592 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.017822265625, + "learning_rate": 0.0004885488901254588, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9114015.0, + "repeat_count": 0.0, + "routers_loss": 0.004154495894908905, + "skip_count": 1.0, + "step": 5650, + "text_loss": 0.3345947563648224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0546875, + "learning_rate": 0.0004882394570048294, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9117044.0, + "repeat_count": 0.0, + "routers_loss": 0.0018865863094106317, + "skip_count": 0.0, + "step": 5652, + "text_loss": 0.32814112305641174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.0004879300283908623, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9120035.0, + "repeat_count": 0.0, + "routers_loss": 0.0035278978757560253, + "skip_count": 1.0, + "step": 5654, + "text_loss": 0.4081386625766754 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00048762060440213096, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 9122955.0, + "repeat_count": 1.0, + "routers_loss": 0.0053498269990086555, + "skip_count": 0.0, + "step": 5656, + "text_loss": 0.31027838587760925 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004873111851572075, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9125635.0, + "repeat_count": 0.0, + "routers_loss": 0.004556098487228155, + "skip_count": 0.0, + "step": 5658, + "text_loss": 0.25703540444374084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004870017707746617, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 9128906.0, + "repeat_count": 0.0, + "routers_loss": 0.0031165245454758406, + "skip_count": 2.0, + "step": 5660, + "text_loss": 0.20663656294345856 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004866923613730617, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 9132030.0, + "repeat_count": 1.0, + "routers_loss": 0.004887583665549755, + "skip_count": 2.0, + "step": 5662, + "text_loss": 0.6062649488449097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004863829570709741, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 9135274.0, + "repeat_count": 0.0, + "routers_loss": 0.0021857863757759333, + "skip_count": 0.0, + "step": 5664, + "text_loss": 0.49644309282302856 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.601115350748458, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0004860735579869631, + "loss": 0.0088, + "macro_f1": 0.925203263759613, + "num_tokens": 9139735.0, + "repeat_count": 3.0, + "routers_loss": 0.05413912236690521, + "skip_count": 5.0, + "step": 5666, + "text_loss": 0.25161290168762207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00048576416423959097, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9142419.0, + "repeat_count": 0.0, + "routers_loss": 0.002229376696050167, + "skip_count": 0.0, + "step": 5668, + "text_loss": 0.5332949161529541 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0004854547759474179, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 9145443.0, + "repeat_count": 1.0, + "routers_loss": 0.005968933925032616, + "skip_count": 4.0, + "step": 5670, + "text_loss": 0.5282154083251953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.629292632814792, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0004851453932290021, + "loss": 0.0085, + "macro_f1": 0.3272727429866791, + "num_tokens": 9147754.0, + "repeat_count": 0.0, + "routers_loss": 0.04015754163265228, + "skip_count": 1.0, + "step": 5672, + "text_loss": 0.8564629554748535 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.63868506017024, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00048483601620289974, + "loss": 0.0058, + "macro_f1": 0.8820862174034119, + "num_tokens": 9151714.0, + "repeat_count": 2.0, + "routers_loss": 0.019172413274645805, + "skip_count": 2.0, + "step": 5674, + "text_loss": 0.4149441123008728 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 26.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0004845266449876645, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9154524.0, + "repeat_count": 1.0, + "routers_loss": 0.005025535821914673, + "skip_count": 0.0, + "step": 5676, + "text_loss": 0.26525792479515076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.000484217279701848, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9158546.0, + "repeat_count": 0.0, + "routers_loss": 0.0012200147612020373, + "skip_count": 0.0, + "step": 5678, + "text_loss": 0.5532271862030029 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0004839079204639998, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9161003.0, + "repeat_count": 0.0, + "routers_loss": 0.0013485675444826484, + "skip_count": 1.0, + "step": 5680, + "text_loss": 0.36826151609420776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0004835985673926668, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9164741.0, + "repeat_count": 0.0, + "routers_loss": 0.00532014574855566, + "skip_count": 2.0, + "step": 5682, + "text_loss": 0.16154609620571136 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0004832892206063938, + "loss": 0.0075, + "macro_f1": 1.0, + "num_tokens": 9168079.0, + "repeat_count": 2.0, + "routers_loss": 0.007782323285937309, + "skip_count": 3.0, + "step": 5684, + "text_loss": 0.4323575496673584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.0004829798802237228, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9171352.0, + "repeat_count": 0.0, + "routers_loss": 0.0024159469176083803, + "skip_count": 2.0, + "step": 5686, + "text_loss": 0.3163119852542877 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.000482670546363194, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 9175197.0, + "repeat_count": 0.0, + "routers_loss": 0.002455134643241763, + "skip_count": 0.0, + "step": 5688, + "text_loss": 0.59735506772995 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.713824479013795, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0004823612191433443, + "loss": 0.0042, + "macro_f1": 0.8820862174034119, + "num_tokens": 9177648.0, + "repeat_count": 2.0, + "routers_loss": 0.015524548478424549, + "skip_count": 2.0, + "step": 5690, + "text_loss": 0.759812593460083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00048205189868270887, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 9180694.0, + "repeat_count": 0.0, + "routers_loss": 0.002112736226990819, + "skip_count": 2.0, + "step": 5692, + "text_loss": 0.3516882061958313 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 26.732609333724685, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.025146484375, + "learning_rate": 0.00048174258509981973, + "loss": 0.0063, + "macro_f1": 0.9262410998344421, + "num_tokens": 9183502.0, + "repeat_count": 2.0, + "routers_loss": 0.03100527822971344, + "skip_count": 3.0, + "step": 5694, + "text_loss": 0.3722715973854065 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004814332785132064, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9186417.0, + "repeat_count": 0.0, + "routers_loss": 0.009176591411232948, + "skip_count": 2.0, + "step": 5696, + "text_loss": 0.33363673090934753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.751394188435572, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004811239790413958, + "loss": 0.0076, + "macro_f1": 0.3272727429866791, + "num_tokens": 9189478.0, + "repeat_count": 0.0, + "routers_loss": 0.023586507886648178, + "skip_count": 1.0, + "step": 5698, + "text_loss": 0.19698107242584229 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00048081468680291194, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9192115.0, + "repeat_count": 0.0, + "routers_loss": 0.005083440337330103, + "skip_count": 1.0, + "step": 5700, + "text_loss": 0.3476336896419525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004805054019162764, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9195176.0, + "repeat_count": 0.0, + "routers_loss": 0.007766073569655418, + "skip_count": 1.0, + "step": 5702, + "text_loss": 0.27114811539649963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0004801961245000076, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9199091.0, + "repeat_count": 0.0, + "routers_loss": 0.0009058842551894486, + "skip_count": 0.0, + "step": 5704, + "text_loss": 0.6249846816062927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0004798868546726212, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9202003.0, + "repeat_count": 0.0, + "routers_loss": 0.005479823332279921, + "skip_count": 0.0, + "step": 5706, + "text_loss": 0.47223609685897827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0166015625, + "learning_rate": 0.00047957759255263014, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9205277.0, + "repeat_count": 0.0, + "routers_loss": 0.001055705244652927, + "skip_count": 0.0, + "step": 5708, + "text_loss": 0.677215576171875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00047926833825854377, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9208844.0, + "repeat_count": 0.0, + "routers_loss": 0.003291431115940213, + "skip_count": 2.0, + "step": 5710, + "text_loss": 0.12439999729394913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06591796875, + "learning_rate": 0.0004789590919088696, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 9211619.0, + "repeat_count": 0.0, + "routers_loss": 0.005120242480188608, + "skip_count": 2.0, + "step": 5712, + "text_loss": 0.5771954655647278 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0004786498536221111, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 9214914.0, + "repeat_count": 1.0, + "routers_loss": 0.004877795465290546, + "skip_count": 2.0, + "step": 5714, + "text_loss": 0.6432198882102966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.00047834062351676893, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 9218186.0, + "repeat_count": 0.0, + "routers_loss": 0.0026507999282330275, + "skip_count": 0.0, + "step": 5716, + "text_loss": 0.23814935982227325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00047803140171134075, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9221754.0, + "repeat_count": 0.0, + "routers_loss": 0.002605629386380315, + "skip_count": 1.0, + "step": 5718, + "text_loss": 0.2910388708114624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 26.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0004777221883243208, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9224502.0, + "repeat_count": 0.0, + "routers_loss": 0.0048494706861674786, + "skip_count": 3.0, + "step": 5720, + "text_loss": 0.6195104122161865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004774129834742004, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 9227350.0, + "repeat_count": 0.0, + "routers_loss": 0.003092368133366108, + "skip_count": 0.0, + "step": 5722, + "text_loss": 0.35447990894317627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00047710378727946725, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 9230166.0, + "repeat_count": 0.0, + "routers_loss": 0.012780336663126945, + "skip_count": 2.0, + "step": 5724, + "text_loss": 0.27581867575645447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00047679459985860604, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9233029.0, + "repeat_count": 0.0, + "routers_loss": 0.005429140292108059, + "skip_count": 1.0, + "step": 5726, + "text_loss": 0.2636827826499939 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00047648542133009794, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9236317.0, + "repeat_count": 0.0, + "routers_loss": 0.0023909916635602713, + "skip_count": 0.0, + "step": 5728, + "text_loss": 0.4801979064941406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00047617625181242077, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9239796.0, + "repeat_count": 0.0, + "routers_loss": 0.003603481687605381, + "skip_count": 0.0, + "step": 5730, + "text_loss": 0.8374754786491394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0004758670914240488, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9243489.0, + "repeat_count": 0.0, + "routers_loss": 0.004478964954614639, + "skip_count": 2.0, + "step": 5732, + "text_loss": 0.3870154917240143 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000475557940283453, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9246758.0, + "repeat_count": 0.0, + "routers_loss": 0.00312575395219028, + "skip_count": 1.0, + "step": 5734, + "text_loss": 0.42341071367263794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 26.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00047524879850910026, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 9250053.0, + "repeat_count": 0.0, + "routers_loss": 0.010855631902813911, + "skip_count": 4.0, + "step": 5736, + "text_loss": 0.25729796290397644 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 26.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0004749396662194549, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 9253691.0, + "repeat_count": 0.0, + "routers_loss": 0.0009250419097952545, + "skip_count": 0.0, + "step": 5738, + "text_loss": 0.6151770949363708 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0004746305435329767, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 9256866.0, + "repeat_count": 1.0, + "routers_loss": 0.007521102204918861, + "skip_count": 3.0, + "step": 5740, + "text_loss": 0.3094986379146576 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004743214305681221, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9259790.0, + "repeat_count": 0.0, + "routers_loss": 0.0022241887636482716, + "skip_count": 1.0, + "step": 5742, + "text_loss": 0.5418204069137573 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 26.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00047401232744334376, + "loss": 0.0071, + "macro_f1": 1.0, + "num_tokens": 9263205.0, + "repeat_count": 1.0, + "routers_loss": 0.008611299097537994, + "skip_count": 2.0, + "step": 5744, + "text_loss": 0.35824623703956604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 26.976812444966246, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004737032342770906, + "loss": 0.0062, + "macro_f1": 0.5492662787437439, + "num_tokens": 9266126.0, + "repeat_count": 0.0, + "routers_loss": 0.010788857005536556, + "skip_count": 2.0, + "step": 5746, + "text_loss": 0.2172674983739853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0004733941511878074, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9269308.0, + "repeat_count": 0.0, + "routers_loss": 0.005309196189045906, + "skip_count": 2.0, + "step": 5748, + "text_loss": 0.1696814000606537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 26.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.00047308507829393594, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 9272801.0, + "repeat_count": 0.0, + "routers_loss": 0.009940510615706444, + "skip_count": 2.0, + "step": 5750, + "text_loss": 0.24295592308044434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00047277601571391314, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9276197.0, + "repeat_count": 0.0, + "routers_loss": 0.000687236781232059, + "skip_count": 0.0, + "step": 5752, + "text_loss": 0.8511804342269897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.014088641033165, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00047246696356617254, + "loss": 0.0059, + "macro_f1": 0.6603773832321167, + "num_tokens": 9278965.0, + "repeat_count": 1.0, + "routers_loss": 0.009816894307732582, + "skip_count": 1.0, + "step": 5754, + "text_loss": 0.45420053601264954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0004721579219691434, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9282076.0, + "repeat_count": 0.0, + "routers_loss": 0.0015747188590466976, + "skip_count": 0.0, + "step": 5756, + "text_loss": 0.21671754121780396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0004718488910412511, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9285465.0, + "repeat_count": 0.0, + "routers_loss": 0.008654040284454823, + "skip_count": 2.0, + "step": 5758, + "text_loss": 0.25920194387435913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00047153987090091674, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9288156.0, + "repeat_count": 0.0, + "routers_loss": 0.0011430777376517653, + "skip_count": 0.0, + "step": 5760, + "text_loss": 0.7655444741249084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004712308616665576, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 9291529.0, + "repeat_count": 0.0, + "routers_loss": 0.003674200503155589, + "skip_count": 2.0, + "step": 5762, + "text_loss": 0.269486665725708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0004709218634565866, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9294699.0, + "repeat_count": 0.0, + "routers_loss": 0.003249827306717634, + "skip_count": 1.0, + "step": 5764, + "text_loss": 0.5073734521865845 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00047061287638941235, + "loss": 0.0068, + "macro_f1": 1.0, + "num_tokens": 9297863.0, + "repeat_count": 1.0, + "routers_loss": 0.002763139782473445, + "skip_count": 2.0, + "step": 5766, + "text_loss": 0.2572014033794403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00047030390058343935, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9301124.0, + "repeat_count": 0.0, + "routers_loss": 0.007100266870111227, + "skip_count": 3.0, + "step": 5768, + "text_loss": 0.4147387742996216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0004699949361570676, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 9304330.0, + "repeat_count": 0.0, + "routers_loss": 0.005467240232974291, + "skip_count": 1.0, + "step": 5770, + "text_loss": 0.21510964632034302 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.000469685983228693, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9306882.0, + "repeat_count": 0.0, + "routers_loss": 0.003167890477925539, + "skip_count": 0.0, + "step": 5772, + "text_loss": 0.45717427134513855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.108012914587615, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00046937704191670675, + "loss": 0.0057, + "macro_f1": 0.6601307392120361, + "num_tokens": 9309767.0, + "repeat_count": 1.0, + "routers_loss": 0.014881107024848461, + "skip_count": 2.0, + "step": 5774, + "text_loss": 0.3464985191822052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004690681123394959, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9313045.0, + "repeat_count": 0.0, + "routers_loss": 0.00379011663608253, + "skip_count": 2.0, + "step": 5776, + "text_loss": 0.33194616436958313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00046875919461544265, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 9315736.0, + "repeat_count": 0.0, + "routers_loss": 0.0016733441734686494, + "skip_count": 0.0, + "step": 5778, + "text_loss": 0.5009998679161072 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00046845028886292493, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9318456.0, + "repeat_count": 0.0, + "routers_loss": 0.005318894516676664, + "skip_count": 1.0, + "step": 5780, + "text_loss": 0.17702752351760864 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.145582624009393, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.044921875, + "learning_rate": 0.00046814139520031615, + "loss": 0.006, + "macro_f1": 0.8820862174034119, + "num_tokens": 9323152.0, + "repeat_count": 2.0, + "routers_loss": 0.01133672520518303, + "skip_count": 2.0, + "step": 5782, + "text_loss": 0.2886650860309601 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0004678325137459845, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9326318.0, + "repeat_count": 0.0, + "routers_loss": 0.002458433620631695, + "skip_count": 0.0, + "step": 5784, + "text_loss": 0.5832745432853699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0004675236446182946, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9329779.0, + "repeat_count": 0.0, + "routers_loss": 0.0005402310052886605, + "skip_count": 0.0, + "step": 5786, + "text_loss": 0.5699237585067749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00046721478793560525, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 9333360.0, + "repeat_count": 0.0, + "routers_loss": 0.0002638917067088187, + "skip_count": 0.0, + "step": 5788, + "text_loss": 0.6555714011192322 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00046690594381627106, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9336498.0, + "repeat_count": 0.0, + "routers_loss": 0.003998351749032736, + "skip_count": 2.0, + "step": 5790, + "text_loss": 0.2076750248670578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00046659711237864157, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9339724.0, + "repeat_count": 0.0, + "routers_loss": 0.0045847659930586815, + "skip_count": 1.0, + "step": 5792, + "text_loss": 0.22027169167995453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.00046628829374106167, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 9342835.0, + "repeat_count": 0.0, + "routers_loss": 0.0014064523857086897, + "skip_count": 1.0, + "step": 5794, + "text_loss": 0.5120179057121277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004659794880218712, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9346757.0, + "repeat_count": 0.0, + "routers_loss": 0.0011155207175761461, + "skip_count": 1.0, + "step": 5796, + "text_loss": 0.6415372490882874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004656706953394051, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 9349652.0, + "repeat_count": 0.0, + "routers_loss": 0.0020385095849633217, + "skip_count": 0.0, + "step": 5798, + "text_loss": 0.5410398840904236 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0004653619158119933, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 9354286.0, + "repeat_count": 1.0, + "routers_loss": 0.0012847178149968386, + "skip_count": 0.0, + "step": 5800, + "text_loss": 0.4386860728263855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.00046505314955796074, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9357682.0, + "repeat_count": 0.0, + "routers_loss": 0.0035008061677217484, + "skip_count": 2.0, + "step": 5802, + "text_loss": 0.13655950129032135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00046474439669562715, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9361058.0, + "repeat_count": 0.0, + "routers_loss": 0.0020033426117151976, + "skip_count": 1.0, + "step": 5804, + "text_loss": 0.6293444037437439 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00046443565734330714, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9364173.0, + "repeat_count": 0.0, + "routers_loss": 0.0004935986362397671, + "skip_count": 0.0, + "step": 5806, + "text_loss": 0.2923166751861572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004641269316193104, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9366980.0, + "repeat_count": 0.0, + "routers_loss": 0.001654456602409482, + "skip_count": 0.0, + "step": 5808, + "text_loss": 0.7273373007774353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0004638182196419411, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 9370581.0, + "repeat_count": 0.0, + "routers_loss": 0.0017011919990181923, + "skip_count": 0.0, + "step": 5810, + "text_loss": 0.6029995083808899 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 27.286469034341064, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.038330078125, + "learning_rate": 0.0004635095215294984, + "loss": 0.0072, + "macro_f1": 0.9265305995941162, + "num_tokens": 9374233.0, + "repeat_count": 1.0, + "routers_loss": 0.01361197978258133, + "skip_count": 3.0, + "step": 5812, + "text_loss": 0.14051523804664612 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00046320083740027584, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 9377217.0, + "repeat_count": 0.0, + "routers_loss": 0.004597014281898737, + "skip_count": 0.0, + "step": 5814, + "text_loss": 0.2766880691051483 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 27.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00046289216737256184, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 9380336.0, + "repeat_count": 3.0, + "routers_loss": 0.006628422066569328, + "skip_count": 1.0, + "step": 5816, + "text_loss": 0.8092381954193115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.0004625835115646393, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9382968.0, + "repeat_count": 0.0, + "routers_loss": 0.002737772185355425, + "skip_count": 0.0, + "step": 5818, + "text_loss": 0.22090643644332886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 27.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0004622748700947856, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 9386203.0, + "repeat_count": 1.0, + "routers_loss": 0.004552177153527737, + "skip_count": 1.0, + "step": 5820, + "text_loss": 0.42869850993156433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0004619662430812729, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9388968.0, + "repeat_count": 0.0, + "routers_loss": 0.003149240743368864, + "skip_count": 2.0, + "step": 5822, + "text_loss": 0.45137661695480347 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0004616576306423677, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 9392487.0, + "repeat_count": 0.0, + "routers_loss": 0.0008133690571412444, + "skip_count": 0.0, + "step": 5824, + "text_loss": 0.638685941696167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0004613490328963307, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 9395665.0, + "repeat_count": 0.0, + "routers_loss": 0.00042717234464362264, + "skip_count": 0.0, + "step": 5826, + "text_loss": 0.8134317398071289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00046104044996141716, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 9398831.0, + "repeat_count": 0.0, + "routers_loss": 0.0084775285795331, + "skip_count": 2.0, + "step": 5828, + "text_loss": 0.19263958930969238 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0004607318819558768, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 9403118.0, + "repeat_count": 1.0, + "routers_loss": 0.0030239911284297705, + "skip_count": 0.0, + "step": 5830, + "text_loss": 0.45556432008743286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 27.38039330789551, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0458984375, + "learning_rate": 0.00046042332899795313, + "loss": 0.0075, + "macro_f1": 0.5492662787437439, + "num_tokens": 9406206.0, + "repeat_count": 0.0, + "routers_loss": 0.026389889419078827, + "skip_count": 2.0, + "step": 5832, + "text_loss": 0.26458361744880676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0004601147912058845, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 9409806.0, + "repeat_count": 0.0, + "routers_loss": 0.0013476534513756633, + "skip_count": 0.0, + "step": 5834, + "text_loss": 0.7443689107894897 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0004598062686979033, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9412737.0, + "repeat_count": 0.0, + "routers_loss": 0.004275512881577015, + "skip_count": 1.0, + "step": 5836, + "text_loss": 0.2808683514595032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00045949776159223563, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9415818.0, + "repeat_count": 0.0, + "routers_loss": 0.0027225434314459562, + "skip_count": 0.0, + "step": 5838, + "text_loss": 0.6283587217330933 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.0004591892700071022, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 9419119.0, + "repeat_count": 1.0, + "routers_loss": 0.01574302278459072, + "skip_count": 2.0, + "step": 5840, + "text_loss": 0.33239027857780457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00045888079406071746, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 9422257.0, + "repeat_count": 0.0, + "routers_loss": 0.0007227854221127927, + "skip_count": 0.0, + "step": 5842, + "text_loss": 0.6658740043640137 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00045857233387129, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9425071.0, + "repeat_count": 0.0, + "routers_loss": 0.0020696306601166725, + "skip_count": 2.0, + "step": 5844, + "text_loss": 0.5773820877075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0004582638895570224, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9427980.0, + "repeat_count": 0.0, + "routers_loss": 0.0019764541648328304, + "skip_count": 0.0, + "step": 5846, + "text_loss": 0.3388919532299042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.455532726739065, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.000457955461236111, + "loss": 0.0058, + "macro_f1": 0.3272727429866791, + "num_tokens": 9430733.0, + "repeat_count": 1.0, + "routers_loss": 0.04235004261136055, + "skip_count": 0.0, + "step": 5848, + "text_loss": 0.44346582889556885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004576470490267462, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 9433347.0, + "repeat_count": 0.0, + "routers_loss": 0.000801609072368592, + "skip_count": 0.0, + "step": 5850, + "text_loss": 0.5825944542884827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0004573386530471121, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 9436172.0, + "repeat_count": 0.0, + "routers_loss": 0.0018224078230559826, + "skip_count": 2.0, + "step": 5852, + "text_loss": 0.8111652135848999 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004570302734153866, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9439040.0, + "repeat_count": 0.0, + "routers_loss": 0.006614950485527515, + "skip_count": 2.0, + "step": 5854, + "text_loss": 0.31270334124565125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05859375, + "learning_rate": 0.0004567219102497412, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9442138.0, + "repeat_count": 0.0, + "routers_loss": 0.0012984242057427764, + "skip_count": 0.0, + "step": 5856, + "text_loss": 0.6126856803894043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004564135636683416, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9445600.0, + "repeat_count": 0.0, + "routers_loss": 0.0008388847345486283, + "skip_count": 0.0, + "step": 5858, + "text_loss": 0.8526380658149719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046875, + "learning_rate": 0.0004561052337893467, + "loss": 0.0108, + "macro_f1": 0.6666666865348816, + "num_tokens": 9449609.0, + "repeat_count": 0.0, + "routers_loss": 0.008125773631036282, + "skip_count": 2.0, + "step": 5860, + "text_loss": 0.2843833863735199 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.000455796920730909, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9452756.0, + "repeat_count": 0.0, + "routers_loss": 0.0019371749367564917, + "skip_count": 0.0, + "step": 5862, + "text_loss": 0.5293750166893005 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0004554886246111746, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9455467.0, + "repeat_count": 1.0, + "routers_loss": 0.005594742484390736, + "skip_count": 2.0, + "step": 5864, + "text_loss": 0.572329044342041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004551803455482833, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 9458953.0, + "repeat_count": 0.0, + "routers_loss": 0.005960086826235056, + "skip_count": 3.0, + "step": 5866, + "text_loss": 0.19459208846092224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00045487208366036807, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 9462130.0, + "repeat_count": 0.0, + "routers_loss": 0.0034781871363520622, + "skip_count": 1.0, + "step": 5868, + "text_loss": 0.20467053353786469 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00045456383906555554, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9465590.0, + "repeat_count": 0.0, + "routers_loss": 0.0012246103724464774, + "skip_count": 0.0, + "step": 5870, + "text_loss": 0.6086251735687256 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00045425561188196565, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9468092.0, + "repeat_count": 0.0, + "routers_loss": 0.002874316181987524, + "skip_count": 1.0, + "step": 5872, + "text_loss": 0.3430633544921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004539474022277115, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9471433.0, + "repeat_count": 0.0, + "routers_loss": 0.004340244457125664, + "skip_count": 2.0, + "step": 5874, + "text_loss": 0.28219133615493774 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0004536392102208997, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9474363.0, + "repeat_count": 0.0, + "routers_loss": 0.0007322742021642625, + "skip_count": 0.0, + "step": 5876, + "text_loss": 0.7305856943130493 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0004533310359796299, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9478469.0, + "repeat_count": 0.0, + "routers_loss": 0.0018631393322721124, + "skip_count": 0.0, + "step": 5878, + "text_loss": 0.5821442604064941 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 27.60581156442618, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0004530228796219952, + "loss": 0.0088, + "macro_f1": 0.9262410998344421, + "num_tokens": 9481200.0, + "repeat_count": 2.0, + "routers_loss": 0.026109615340828896, + "skip_count": 3.0, + "step": 5880, + "text_loss": 0.3962891101837158 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00045271474126608167, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9484200.0, + "repeat_count": 0.0, + "routers_loss": 0.0004716445691883564, + "skip_count": 0.0, + "step": 5882, + "text_loss": 0.31901776790618896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004524066210299685, + "loss": 0.0089, + "macro_f1": 0.3333333432674408, + "num_tokens": 9488939.0, + "repeat_count": 0.0, + "routers_loss": 0.0003797562967520207, + "skip_count": 0.0, + "step": 5884, + "text_loss": 0.3992912471294403 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004520985190317279, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 9492010.0, + "repeat_count": 0.0, + "routers_loss": 0.005681614391505718, + "skip_count": 1.0, + "step": 5886, + "text_loss": 0.5318995118141174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0004517904353894253, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9494770.0, + "repeat_count": 0.0, + "routers_loss": 0.0021422000136226416, + "skip_count": 0.0, + "step": 5888, + "text_loss": 0.435088187456131 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.652773701203404, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.0004514823702211187, + "loss": 0.0052, + "macro_f1": 0.8820862174034119, + "num_tokens": 9497327.0, + "repeat_count": 2.0, + "routers_loss": 0.01593884639441967, + "skip_count": 2.0, + "step": 5890, + "text_loss": 0.5068450570106506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.662166128558848, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00045117432364485927, + "loss": 0.0075, + "macro_f1": 0.6601307392120361, + "num_tokens": 9500488.0, + "repeat_count": 1.0, + "routers_loss": 0.0729660913348198, + "skip_count": 2.0, + "step": 5892, + "text_loss": 0.42718732357025146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.00045086629577869127, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9503593.0, + "repeat_count": 0.0, + "routers_loss": 0.007092897780239582, + "skip_count": 2.0, + "step": 5894, + "text_loss": 0.4264345169067383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.00045055828674065134, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9507188.0, + "repeat_count": 0.0, + "routers_loss": 0.004088073968887329, + "skip_count": 2.0, + "step": 5896, + "text_loss": 0.20932413637638092 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00045025029664876926, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9510126.0, + "repeat_count": 1.0, + "routers_loss": 0.0026970503386110067, + "skip_count": 0.0, + "step": 5898, + "text_loss": 0.47661110758781433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0164794921875, + "learning_rate": 0.0004499423256210673, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9513891.0, + "repeat_count": 0.0, + "routers_loss": 0.003428407246246934, + "skip_count": 0.0, + "step": 5900, + "text_loss": 0.18232668936252594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00044963437377556066, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9516718.0, + "repeat_count": 0.0, + "routers_loss": 0.0020270352251827717, + "skip_count": 0.0, + "step": 5902, + "text_loss": 0.16833586990833282 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.000449326441230257, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 9520248.0, + "repeat_count": 0.0, + "routers_loss": 0.0019144838443025947, + "skip_count": 0.0, + "step": 5904, + "text_loss": 0.44434574246406555 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00044901852810315634, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9523651.0, + "repeat_count": 0.0, + "routers_loss": 0.0044578867964446545, + "skip_count": 2.0, + "step": 5906, + "text_loss": 0.1248839721083641 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0004487106345122522, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9527235.0, + "repeat_count": 0.0, + "routers_loss": 0.000827222247608006, + "skip_count": 0.0, + "step": 5908, + "text_loss": 0.6052893996238708 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.74669797475785, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0004484027605755296, + "loss": 0.0065, + "macro_f1": 0.5492662787437439, + "num_tokens": 9530407.0, + "repeat_count": 2.0, + "routers_loss": 0.029739778488874435, + "skip_count": 0.0, + "step": 5910, + "text_loss": 0.7625715732574463 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.00044809490641096653, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 9533229.0, + "repeat_count": 0.0, + "routers_loss": 0.0025658784434199333, + "skip_count": 0.0, + "step": 5912, + "text_loss": 0.27842655777931213 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 27.76548282946874, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.042724609375, + "learning_rate": 0.00044778707213653324, + "loss": 0.0069, + "macro_f1": 0.9265305995941162, + "num_tokens": 9537397.0, + "repeat_count": 1.0, + "routers_loss": 0.010157953947782516, + "skip_count": 3.0, + "step": 5914, + "text_loss": 0.45196083188056946 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0004474792578701924, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9540564.0, + "repeat_count": 3.0, + "routers_loss": 0.011994685977697372, + "skip_count": 5.0, + "step": 5916, + "text_loss": 0.22617442905902863 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000447171463729899, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9543602.0, + "repeat_count": 0.0, + "routers_loss": 0.0022214490454643965, + "skip_count": 0.0, + "step": 5918, + "text_loss": 0.5089073777198792 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004468636898336003, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 9546829.0, + "repeat_count": 1.0, + "routers_loss": 0.009353389963507652, + "skip_count": 2.0, + "step": 5920, + "text_loss": 0.7560386657714844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.057373046875, + "learning_rate": 0.00044655593629923596, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 9550259.0, + "repeat_count": 0.0, + "routers_loss": 0.005637963302433491, + "skip_count": 0.0, + "step": 5922, + "text_loss": 0.17084793746471405 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00044624820324473766, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 9554376.0, + "repeat_count": 1.0, + "routers_loss": 0.008556432090699673, + "skip_count": 2.0, + "step": 5924, + "text_loss": 0.5906872749328613 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004459404907880292, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9558348.0, + "repeat_count": 1.0, + "routers_loss": 0.0016659445827826858, + "skip_count": 0.0, + "step": 5926, + "text_loss": 0.8197194933891296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 27.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048828125, + "learning_rate": 0.00044563279904702674, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9561139.0, + "repeat_count": 0.0, + "routers_loss": 0.01341368816792965, + "skip_count": 3.0, + "step": 5928, + "text_loss": 0.3264874815940857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.000445325128139638, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9564387.0, + "repeat_count": 0.0, + "routers_loss": 0.005023977253586054, + "skip_count": 2.0, + "step": 5930, + "text_loss": 0.9055862426757812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004450174781837635, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 9567053.0, + "repeat_count": 0.0, + "routers_loss": 0.0006051476229913533, + "skip_count": 0.0, + "step": 5932, + "text_loss": 0.6908539533615112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0004447098492972951, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9570036.0, + "repeat_count": 0.0, + "routers_loss": 0.003152312943711877, + "skip_count": 0.0, + "step": 5934, + "text_loss": 0.6321061849594116 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.0004444022415981167, + "loss": 0.0094, + "macro_f1": 0.6666666865348816, + "num_tokens": 9574146.0, + "repeat_count": 0.0, + "routers_loss": 0.004859412554651499, + "skip_count": 1.0, + "step": 5936, + "text_loss": 0.5905604958534241 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 27.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.00044409465520410426, + "loss": 0.0071, + "macro_f1": 1.0, + "num_tokens": 9577071.0, + "repeat_count": 1.0, + "routers_loss": 0.004376287572085857, + "skip_count": 1.0, + "step": 5938, + "text_loss": 0.6928377747535706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00044378709023312535, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9580537.0, + "repeat_count": 0.0, + "routers_loss": 0.004038849379867315, + "skip_count": 1.0, + "step": 5940, + "text_loss": 0.2686770558357239 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0004434795468030396, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9583225.0, + "repeat_count": 0.0, + "routers_loss": 0.005459951236844063, + "skip_count": 2.0, + "step": 5942, + "text_loss": 0.16855180263519287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 27.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.000443172025031698, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 9586018.0, + "repeat_count": 0.0, + "routers_loss": 0.0032985717989504337, + "skip_count": 2.0, + "step": 5944, + "text_loss": 0.20335732400417328 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0004428645250369437, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 9589321.0, + "repeat_count": 1.0, + "routers_loss": 0.003573323367163539, + "skip_count": 0.0, + "step": 5946, + "text_loss": 0.6318653225898743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00044255704693661117, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 9592518.0, + "repeat_count": 0.0, + "routers_loss": 0.002226749900728464, + "skip_count": 0.0, + "step": 5948, + "text_loss": 0.5320658683776855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0004422495908485265, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9595664.0, + "repeat_count": 0.0, + "routers_loss": 0.0007805621717125177, + "skip_count": 0.0, + "step": 5950, + "text_loss": 0.6330106258392334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0004419421568905077, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9598885.0, + "repeat_count": 0.0, + "routers_loss": 0.0017050127498805523, + "skip_count": 0.0, + "step": 5952, + "text_loss": 0.6098045706748962 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00044163474518036375, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9603021.0, + "repeat_count": 0.0, + "routers_loss": 0.0025974081363528967, + "skip_count": 0.0, + "step": 5954, + "text_loss": 0.2655932903289795 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 27.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.00044132735583589567, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 9605841.0, + "repeat_count": 1.0, + "routers_loss": 0.010364850051701069, + "skip_count": 2.0, + "step": 5956, + "text_loss": 0.3028552532196045 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 27.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.015869140625, + "learning_rate": 0.00044101998897489553, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 9608810.0, + "repeat_count": 1.0, + "routers_loss": 0.0015063622267916799, + "skip_count": 0.0, + "step": 5958, + "text_loss": 0.5602094531059265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 27.981508658643968, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.02880859375, + "learning_rate": 0.00044071264471514683, + "loss": 0.0051, + "macro_f1": 0.5934640765190125, + "num_tokens": 9611995.0, + "repeat_count": 0.0, + "routers_loss": 0.011538165621459484, + "skip_count": 3.0, + "step": 5960, + "text_loss": 0.14332173764705658 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 27.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00044040532317442455, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 9615434.0, + "repeat_count": 0.0, + "routers_loss": 0.004693889059126377, + "skip_count": 0.0, + "step": 5962, + "text_loss": 0.334369033575058 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00044009802447049474, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9618056.0, + "repeat_count": 1.0, + "routers_loss": 0.0045085870660841465, + "skip_count": 1.0, + "step": 5964, + "text_loss": 0.8163170218467712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00043979074872111507, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 9621428.0, + "repeat_count": 0.0, + "routers_loss": 0.0018220023484900594, + "skip_count": 0.0, + "step": 5966, + "text_loss": 0.2513850927352905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0004394834960440341, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 9625433.0, + "repeat_count": 4.0, + "routers_loss": 0.007051277905702591, + "skip_count": 5.0, + "step": 5968, + "text_loss": 0.6263421177864075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.00043917626655699154, + "loss": 0.0093, + "macro_f1": 0.3333333432674408, + "num_tokens": 9629508.0, + "repeat_count": 0.0, + "routers_loss": 0.0006454752874560654, + "skip_count": 0.0, + "step": 5970, + "text_loss": 0.645618736743927 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0004388690603777184, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 9632504.0, + "repeat_count": 0.0, + "routers_loss": 0.004847112577408552, + "skip_count": 1.0, + "step": 5972, + "text_loss": 0.47306978702545166 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00043856187762393665, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 9636685.0, + "repeat_count": 0.0, + "routers_loss": 0.0006580828921869397, + "skip_count": 0.0, + "step": 5974, + "text_loss": 0.42226532101631165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0004382547184133593, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 9639958.0, + "repeat_count": 0.0, + "routers_loss": 0.002188180573284626, + "skip_count": 0.0, + "step": 5976, + "text_loss": 0.4456600248813629 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004379475828636901, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 9643228.0, + "repeat_count": 1.0, + "routers_loss": 0.0017135308589786291, + "skip_count": 2.0, + "step": 5978, + "text_loss": 0.6295822262763977 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004376404710926244, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 9646746.0, + "repeat_count": 0.0, + "routers_loss": 0.0008841048111207783, + "skip_count": 0.0, + "step": 5980, + "text_loss": 0.5102712512016296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00043733338321784784, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9649452.0, + "repeat_count": 0.0, + "routers_loss": 0.0006229099817574024, + "skip_count": 0.0, + "step": 5982, + "text_loss": 0.6944046020507812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000437026319357037, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9652700.0, + "repeat_count": 0.0, + "routers_loss": 0.005293759983032942, + "skip_count": 2.0, + "step": 5984, + "text_loss": 0.6748214960098267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00043671927962785946, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 9655825.0, + "repeat_count": 0.0, + "routers_loss": 0.0013537590857595205, + "skip_count": 0.0, + "step": 5986, + "text_loss": 1.000306248664856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004364122641479733, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9658713.0, + "repeat_count": 0.0, + "routers_loss": 0.004548195283859968, + "skip_count": 0.0, + "step": 5988, + "text_loss": 0.24580086767673492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 28.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004361052730350275, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9661535.0, + "repeat_count": 0.0, + "routers_loss": 0.011149964295327663, + "skip_count": 4.0, + "step": 5990, + "text_loss": 0.5737863779067993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00043579830640666154, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 9664406.0, + "repeat_count": 1.0, + "routers_loss": 0.003783488878980279, + "skip_count": 1.0, + "step": 5992, + "text_loss": 0.7836558222770691 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.00043549136438050573, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 9669050.0, + "repeat_count": 0.0, + "routers_loss": 0.0050374288111925125, + "skip_count": 1.0, + "step": 5994, + "text_loss": 0.13072487711906433 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.150278837687114, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.00043518444707418076, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9672698.0, + "repeat_count": 0.0, + "routers_loss": 0.004047670867294073, + "skip_count": 2.0, + "step": 5996, + "text_loss": 0.4748993217945099 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00043487755460529796, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 9676159.0, + "repeat_count": 0.0, + "routers_loss": 0.008628991432487965, + "skip_count": 2.0, + "step": 5998, + "text_loss": 0.1921990066766739 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00043457068709145904, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 9679528.0, + "repeat_count": 3.0, + "routers_loss": 0.01094671618193388, + "skip_count": 3.0, + "step": 6000, + "text_loss": 0.3651769459247589 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 28.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00043426384465025604, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 9682677.0, + "repeat_count": 2.0, + "routers_loss": 0.0011284075444564223, + "skip_count": 0.0, + "step": 6002, + "text_loss": 0.28305181860923767 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.000433957027399272, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9685310.0, + "repeat_count": 0.0, + "routers_loss": 0.0030473743099719286, + "skip_count": 1.0, + "step": 6004, + "text_loss": 0.3650054931640625 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00043365023545607965, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 9687944.0, + "repeat_count": 1.0, + "routers_loss": 0.011621905490756035, + "skip_count": 2.0, + "step": 6006, + "text_loss": 0.5409000515937805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004333434689382423, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 9690932.0, + "repeat_count": 0.0, + "routers_loss": 0.0005297541501931846, + "skip_count": 0.0, + "step": 6008, + "text_loss": 0.4311029314994812 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.216025829175226, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.00043303672796331336, + "loss": 0.0058, + "macro_f1": 0.3272727429866791, + "num_tokens": 9693972.0, + "repeat_count": 1.0, + "routers_loss": 0.06166421249508858, + "skip_count": 0.0, + "step": 6010, + "text_loss": 0.2658997178077698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00043273001264883655, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 9697712.0, + "repeat_count": 0.0, + "routers_loss": 0.0018419031985104084, + "skip_count": 0.0, + "step": 6012, + "text_loss": 0.5813497304916382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004324233231123458, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 9700746.0, + "repeat_count": 0.0, + "routers_loss": 0.003635555040091276, + "skip_count": 0.0, + "step": 6014, + "text_loss": 0.24211904406547546 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 28.24420311124156, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.038330078125, + "learning_rate": 0.0004321166594713651, + "loss": 0.0048, + "macro_f1": 0.5492662787437439, + "num_tokens": 9704087.0, + "repeat_count": 0.0, + "routers_loss": 0.021067705005407333, + "skip_count": 2.0, + "step": 6016, + "text_loss": 0.5908042788505554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00043181002184340857, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 9708695.0, + "repeat_count": 0.0, + "routers_loss": 0.0008712753187865019, + "skip_count": 0.0, + "step": 6018, + "text_loss": 0.7788549661636353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.26298796595245, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0004315034103459803, + "loss": 0.0054, + "macro_f1": 0.3272727429866791, + "num_tokens": 9711631.0, + "repeat_count": 1.0, + "routers_loss": 0.03231092542409897, + "skip_count": 0.0, + "step": 6020, + "text_loss": 0.6127741932868958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.0004311968250965743, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9715526.0, + "repeat_count": 0.0, + "routers_loss": 0.0020149527117609978, + "skip_count": 2.0, + "step": 6022, + "text_loss": 0.49970078468322754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0004308902662126748, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9718475.0, + "repeat_count": 0.0, + "routers_loss": 0.0031795913819223642, + "skip_count": 0.0, + "step": 6024, + "text_loss": 0.3254713714122772 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.291165248018785, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00043058373381175567, + "loss": 0.004, + "macro_f1": 0.3272727429866791, + "num_tokens": 9722194.0, + "repeat_count": 0.0, + "routers_loss": 0.0148378387093544, + "skip_count": 1.0, + "step": 6026, + "text_loss": 0.17670343816280365 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0004302772280112806, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 9725489.0, + "repeat_count": 1.0, + "routers_loss": 0.005742347799241543, + "skip_count": 2.0, + "step": 6028, + "text_loss": 0.26184776425361633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00042997074892870335, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9729416.0, + "repeat_count": 0.0, + "routers_loss": 0.0023561837151646614, + "skip_count": 0.0, + "step": 6030, + "text_loss": 0.3026008605957031 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.31934253008512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0004296642966814673, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 9732559.0, + "repeat_count": 0.0, + "routers_loss": 0.0010108393616974354, + "skip_count": 1.0, + "step": 6032, + "text_loss": 0.43198078870773315 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00042935787138700525, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 9736324.0, + "repeat_count": 2.0, + "routers_loss": 0.005443581845611334, + "skip_count": 2.0, + "step": 6034, + "text_loss": 0.24883155524730682 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0004290514731627403, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 9739630.0, + "repeat_count": 1.0, + "routers_loss": 0.010645060800015926, + "skip_count": 2.0, + "step": 6036, + "text_loss": 0.24207182228565216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018798828125, + "learning_rate": 0.0004287451021260846, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9742221.0, + "repeat_count": 0.0, + "routers_loss": 0.0008162845042534173, + "skip_count": 0.0, + "step": 6038, + "text_loss": 0.33018553256988525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.356912239506897, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0004284387583944403, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9744925.0, + "repeat_count": 0.0, + "routers_loss": 0.003782407147809863, + "skip_count": 1.0, + "step": 6040, + "text_loss": 0.6600399613380432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0004281324420851987, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9748103.0, + "repeat_count": 0.0, + "routers_loss": 0.0009834285592660308, + "skip_count": 0.0, + "step": 6042, + "text_loss": 0.6402350664138794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0004278261533157409, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 9751128.0, + "repeat_count": 0.0, + "routers_loss": 0.004100334830582142, + "skip_count": 2.0, + "step": 6044, + "text_loss": 0.1545136719942093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0004275198922034372, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 9754140.0, + "repeat_count": 0.0, + "routers_loss": 0.0017166603356599808, + "skip_count": 1.0, + "step": 6046, + "text_loss": 0.5875935554504395 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00042721365886564766, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 9756945.0, + "repeat_count": 1.0, + "routers_loss": 0.00915827602148056, + "skip_count": 2.0, + "step": 6048, + "text_loss": 0.3885214328765869 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00042690745341972134, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9759738.0, + "repeat_count": 0.0, + "routers_loss": 0.0057020667009055614, + "skip_count": 2.0, + "step": 6050, + "text_loss": 0.3107164204120636 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.00042660127598299647, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 9762987.0, + "repeat_count": 0.0, + "routers_loss": 0.004196313209831715, + "skip_count": 2.0, + "step": 6052, + "text_loss": 0.3073577582836151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00042629512667280135, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 9765828.0, + "repeat_count": 0.0, + "routers_loss": 0.0023119752295315266, + "skip_count": 1.0, + "step": 6054, + "text_loss": 0.8228643536567688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004259890056064527, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 9769129.0, + "repeat_count": 0.0, + "routers_loss": 0.0021007524337619543, + "skip_count": 1.0, + "step": 6056, + "text_loss": 0.8334706425666809 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0004256829129012568, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 9771821.0, + "repeat_count": 1.0, + "routers_loss": 0.00671970471739769, + "skip_count": 2.0, + "step": 6058, + "text_loss": 0.17845536768436432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00042537684867450875, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 9774566.0, + "repeat_count": 0.0, + "routers_loss": 0.0014770646812394261, + "skip_count": 0.0, + "step": 6060, + "text_loss": 0.4445459246635437 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.46022894041679, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00042507081304349315, + "loss": 0.0067, + "macro_f1": 0.5492662787437439, + "num_tokens": 9777909.0, + "repeat_count": 2.0, + "routers_loss": 0.014822427183389664, + "skip_count": 0.0, + "step": 6062, + "text_loss": 0.45526158809661865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004247648061254833, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 9781159.0, + "repeat_count": 0.0, + "routers_loss": 0.00568385748192668, + "skip_count": 1.0, + "step": 6064, + "text_loss": 0.18535588681697845 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.479013795127678, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00042445882803774173, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 9784960.0, + "repeat_count": 1.0, + "routers_loss": 0.0179694052785635, + "skip_count": 0.0, + "step": 6066, + "text_loss": 0.23591181635856628 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00042415287889751966, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 9787941.0, + "repeat_count": 0.0, + "routers_loss": 0.0019039154285565019, + "skip_count": 0.0, + "step": 6068, + "text_loss": 0.9447930455207825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0004238469588220575, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 9791096.0, + "repeat_count": 0.0, + "routers_loss": 0.004039563238620758, + "skip_count": 0.0, + "step": 6070, + "text_loss": 0.3134256601333618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00042354106792858446, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 9794082.0, + "repeat_count": 0.0, + "routers_loss": 0.0018352365586906672, + "skip_count": 0.0, + "step": 6072, + "text_loss": 0.5681536197662354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.00042323520633431833, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 9797303.0, + "repeat_count": 0.0, + "routers_loss": 0.0019325513858348131, + "skip_count": 0.0, + "step": 6074, + "text_loss": 0.2835809290409088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00042292937415646574, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 9800435.0, + "repeat_count": 0.0, + "routers_loss": 0.002513401210308075, + "skip_count": 0.0, + "step": 6076, + "text_loss": 0.1931663602590561 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00042262357151222265, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 9803873.0, + "repeat_count": 0.0, + "routers_loss": 0.004864581860601902, + "skip_count": 0.0, + "step": 6078, + "text_loss": 0.25809767842292786 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004223177985187728, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9806438.0, + "repeat_count": 1.0, + "routers_loss": 0.004932792857289314, + "skip_count": 0.0, + "step": 6080, + "text_loss": 0.6409249305725098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00042201205529328925, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 9809400.0, + "repeat_count": 0.0, + "routers_loss": 0.00590938376262784, + "skip_count": 1.0, + "step": 6082, + "text_loss": 0.31158050894737244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00042170634195293314, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9813246.0, + "repeat_count": 0.0, + "routers_loss": 0.006805860437452793, + "skip_count": 0.0, + "step": 6084, + "text_loss": 0.32945963740348816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004214006586148545, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 9816513.0, + "repeat_count": 0.0, + "routers_loss": 0.0010186503641307354, + "skip_count": 0.0, + "step": 6086, + "text_loss": 0.48659923672676086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0004210950053961917, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9819908.0, + "repeat_count": 0.0, + "routers_loss": 0.00402973173186183, + "skip_count": 1.0, + "step": 6088, + "text_loss": 0.6249601244926453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00042078938241407174, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 9822950.0, + "repeat_count": 0.0, + "routers_loss": 0.00236532068811357, + "skip_count": 1.0, + "step": 6090, + "text_loss": 0.26589256525039673 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0004204837897856098, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 9826493.0, + "repeat_count": 1.0, + "routers_loss": 0.003072192659601569, + "skip_count": 2.0, + "step": 6092, + "text_loss": 0.5216912627220154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0004201782276279096, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 9829698.0, + "repeat_count": 0.0, + "routers_loss": 0.0027553171385079622, + "skip_count": 1.0, + "step": 6094, + "text_loss": 0.40127676725387573 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.61990020545935, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00041987269605806325, + "loss": 0.0045, + "macro_f1": 0.9442509412765503, + "num_tokens": 9833719.0, + "repeat_count": 4.0, + "routers_loss": 0.013845407404005527, + "skip_count": 4.0, + "step": 6096, + "text_loss": 0.23114071786403656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04150390625, + "learning_rate": 0.0004195671951931509, + "loss": 0.0116, + "macro_f1": 0.6666666865348816, + "num_tokens": 9838235.0, + "repeat_count": 0.0, + "routers_loss": 0.0019887303933501244, + "skip_count": 2.0, + "step": 6098, + "text_loss": 0.7467341423034668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004192617251502409, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 9840867.0, + "repeat_count": 0.0, + "routers_loss": 0.0007213905337266624, + "skip_count": 0.0, + "step": 6100, + "text_loss": 0.6283472180366516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00041895628604639036, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 9843827.0, + "repeat_count": 0.0, + "routers_loss": 0.003863139310851693, + "skip_count": 1.0, + "step": 6102, + "text_loss": 0.3602744936943054 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00041865087799864374, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 9846939.0, + "repeat_count": 0.0, + "routers_loss": 0.0013336286647245288, + "skip_count": 0.0, + "step": 6104, + "text_loss": 0.4182434678077698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0004183455011240341, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 9849827.0, + "repeat_count": 0.0, + "routers_loss": 0.00038455065805464983, + "skip_count": 0.0, + "step": 6106, + "text_loss": 0.7122722864151001 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 28.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0004180401555395826, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 9853487.0, + "repeat_count": 3.0, + "routers_loss": 0.0038226440083235502, + "skip_count": 1.0, + "step": 6108, + "text_loss": 0.2521185576915741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0004177348413622981, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 9856321.0, + "repeat_count": 0.0, + "routers_loss": 0.0015809801407158375, + "skip_count": 0.0, + "step": 6110, + "text_loss": 0.423979252576828 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004174295587091776, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 9859238.0, + "repeat_count": 0.0, + "routers_loss": 0.0007586454739794135, + "skip_count": 0.0, + "step": 6112, + "text_loss": 0.4720100462436676 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00041712430769720593, + "loss": 0.0091, + "macro_f1": 1.0, + "num_tokens": 9862282.0, + "repeat_count": 1.0, + "routers_loss": 0.0045816488564014435, + "skip_count": 1.0, + "step": 6114, + "text_loss": 0.279577374458313 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0004168190884433559, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 9865394.0, + "repeat_count": 1.0, + "routers_loss": 0.004728195257484913, + "skip_count": 1.0, + "step": 6116, + "text_loss": 0.3826395571231842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 28.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.0004165139010645881, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 9869165.0, + "repeat_count": 0.0, + "routers_loss": 0.006160226184874773, + "skip_count": 3.0, + "step": 6118, + "text_loss": 0.4668935537338257 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 24.0, + "epoch": 28.732609333724685, + "f1_execute": 0.9767441749572754, + "f1_repeat": 1.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.04736328125, + "learning_rate": 0.0004162087456778509, + "loss": 0.0074, + "macro_f1": 0.9619450569152832, + "num_tokens": 9872381.0, + "repeat_count": 1.0, + "routers_loss": 0.027831824496388435, + "skip_count": 6.0, + "step": 6120, + "text_loss": 0.28708913922309875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004159036224000804, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9875668.0, + "repeat_count": 0.0, + "routers_loss": 0.0030764432158321142, + "skip_count": 1.0, + "step": 6122, + "text_loss": 0.37078607082366943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0004155985313482002, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 9878533.0, + "repeat_count": 0.0, + "routers_loss": 0.00043521137558855116, + "skip_count": 0.0, + "step": 6124, + "text_loss": 0.34975379705429077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00041529347263912224, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 9881478.0, + "repeat_count": 0.0, + "routers_loss": 0.0016251741908490658, + "skip_count": 0.0, + "step": 6126, + "text_loss": 0.39166271686553955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.770179043146463, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00041498844638974535, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 9884252.0, + "repeat_count": 1.0, + "routers_loss": 0.019553523510694504, + "skip_count": 0.0, + "step": 6128, + "text_loss": 0.2309480905532837 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0004146834527169562, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 9887485.0, + "repeat_count": 1.0, + "routers_loss": 0.0036251386627554893, + "skip_count": 0.0, + "step": 6130, + "text_loss": 0.4464457631111145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00041437849173762894, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 9890711.0, + "repeat_count": 0.0, + "routers_loss": 0.0008515548543073237, + "skip_count": 0.0, + "step": 6132, + "text_loss": 0.5012133717536926 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0004140735635686251, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 9894458.0, + "repeat_count": 1.0, + "routers_loss": 0.001084602321498096, + "skip_count": 0.0, + "step": 6134, + "text_loss": 0.32015663385391235 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.0004137686683267938, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 9897634.0, + "repeat_count": 0.0, + "routers_loss": 0.0025203595869243145, + "skip_count": 0.0, + "step": 6136, + "text_loss": 0.15804508328437805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048828125, + "learning_rate": 0.0004134638061289715, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 9901157.0, + "repeat_count": 0.0, + "routers_loss": 0.0029381231870502234, + "skip_count": 0.0, + "step": 6138, + "text_loss": 0.14375236630439758 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0004131589770919819, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 9903958.0, + "repeat_count": 0.0, + "routers_loss": 0.002789110178127885, + "skip_count": 0.0, + "step": 6140, + "text_loss": 0.2474033683538437 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004128541813326361, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 9906799.0, + "repeat_count": 2.0, + "routers_loss": 0.010770512744784355, + "skip_count": 3.0, + "step": 6142, + "text_loss": 0.2304249256849289 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0004125494189677325, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 9909286.0, + "repeat_count": 1.0, + "routers_loss": 0.003122122259810567, + "skip_count": 0.0, + "step": 6144, + "text_loss": 0.3781827688217163 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 28.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.00041224469011405643, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 9912416.0, + "repeat_count": 1.0, + "routers_loss": 0.008443298749625683, + "skip_count": 1.0, + "step": 6146, + "text_loss": 0.3004767596721649 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0004119399948883806, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 9915290.0, + "repeat_count": 0.0, + "routers_loss": 0.0033219947945326567, + "skip_count": 1.0, + "step": 6148, + "text_loss": 0.748744547367096 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0004116353334074647, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 9918493.0, + "repeat_count": 1.0, + "routers_loss": 0.005501769948750734, + "skip_count": 0.0, + "step": 6150, + "text_loss": 0.330759733915329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.000411330705788056, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 9921027.0, + "repeat_count": 0.0, + "routers_loss": 0.0013694261433556676, + "skip_count": 0.0, + "step": 6152, + "text_loss": 0.43070924282073975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0164794921875, + "learning_rate": 0.000411026112146888, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 9924303.0, + "repeat_count": 0.0, + "routers_loss": 0.00046192589798010886, + "skip_count": 0.0, + "step": 6154, + "text_loss": 0.5674887895584106 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 28.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0004107215526006817, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9927065.0, + "repeat_count": 1.0, + "routers_loss": 0.004311304073780775, + "skip_count": 0.0, + "step": 6156, + "text_loss": 0.16138267517089844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004104170272661449, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 9930713.0, + "repeat_count": 0.0, + "routers_loss": 0.0035845425445586443, + "skip_count": 0.0, + "step": 6158, + "text_loss": 0.18728356063365936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00041011253625997227, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 9934393.0, + "repeat_count": 0.0, + "routers_loss": 0.00247366214171052, + "skip_count": 0.0, + "step": 6160, + "text_loss": 0.3624019920825958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0004098080796988452, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 9937457.0, + "repeat_count": 0.0, + "routers_loss": 0.003240241203457117, + "skip_count": 0.0, + "step": 6162, + "text_loss": 0.12348521500825882 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.0004095036576994321, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 9940523.0, + "repeat_count": 0.0, + "routers_loss": 0.001985874492675066, + "skip_count": 1.0, + "step": 6164, + "text_loss": 0.2688066363334656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 28.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.00040919927037838815, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 9943802.0, + "repeat_count": 0.0, + "routers_loss": 0.004264154937118292, + "skip_count": 3.0, + "step": 6166, + "text_loss": 0.49316367506980896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0556640625, + "learning_rate": 0.00040889491785235513, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 9946649.0, + "repeat_count": 0.0, + "routers_loss": 0.002545441733673215, + "skip_count": 0.0, + "step": 6168, + "text_loss": 0.4079313576221466 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.967420017610802, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004085906002379614, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 9949800.0, + "repeat_count": 0.0, + "routers_loss": 0.0009590961271896958, + "skip_count": 0.0, + "step": 6170, + "text_loss": 0.6166561245918274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 28.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004082863176518221, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 9954008.0, + "repeat_count": 0.0, + "routers_loss": 0.003795337164774537, + "skip_count": 2.0, + "step": 6172, + "text_loss": 0.4791361689567566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 28.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044921875, + "learning_rate": 0.0004079820702105388, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 9957153.0, + "repeat_count": 0.0, + "routers_loss": 0.0015634822193533182, + "skip_count": 0.0, + "step": 6174, + "text_loss": 0.7208777666091919 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 28.995597299677137, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0004076778580306999, + "loss": 0.0056, + "macro_f1": 0.8820862174034119, + "num_tokens": 9960060.0, + "repeat_count": 2.0, + "routers_loss": 0.03223998099565506, + "skip_count": 2.0, + "step": 6176, + "text_loss": 0.6617992520332336 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00040737368122887983, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 9963396.0, + "repeat_count": 0.0, + "routers_loss": 0.0033978577703237534, + "skip_count": 0.0, + "step": 6178, + "text_loss": 0.7339215278625488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00040706953992164, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 9966364.0, + "repeat_count": 0.0, + "routers_loss": 0.0005358994239941239, + "skip_count": 0.0, + "step": 6180, + "text_loss": 0.44187214970588684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.023481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00040676543422552767, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 9969813.0, + "repeat_count": 0.0, + "routers_loss": 0.0018544091144576669, + "skip_count": 1.0, + "step": 6182, + "text_loss": 0.6244927048683167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004064613642570769, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 9973015.0, + "repeat_count": 0.0, + "routers_loss": 0.005692692007869482, + "skip_count": 0.0, + "step": 6184, + "text_loss": 0.18860043585300446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00040615733013280784, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 9976201.0, + "repeat_count": 0.0, + "routers_loss": 0.0018737476784735918, + "skip_count": 0.0, + "step": 6186, + "text_loss": 0.21189232170581818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00040585333196922687, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 9979711.0, + "repeat_count": 0.0, + "routers_loss": 0.011945146135985851, + "skip_count": 2.0, + "step": 6188, + "text_loss": 0.2628154456615448 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00040554936988282663, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 9983003.0, + "repeat_count": 0.0, + "routers_loss": 0.0036045778542757034, + "skip_count": 1.0, + "step": 6190, + "text_loss": 0.5926038026809692 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0004052454439900861, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 9986841.0, + "repeat_count": 0.0, + "routers_loss": 0.004170368425548077, + "skip_count": 0.0, + "step": 6192, + "text_loss": 0.3088737726211548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00040494155440747015, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 9989596.0, + "repeat_count": 0.0, + "routers_loss": 0.002254750579595566, + "skip_count": 2.0, + "step": 6194, + "text_loss": 0.6309700012207031 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.089228059876724, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00040463770125142987, + "loss": 0.0087, + "macro_f1": 0.8814815282821655, + "num_tokens": 9992789.0, + "repeat_count": 2.0, + "routers_loss": 0.04092822223901749, + "skip_count": 4.0, + "step": 6196, + "text_loss": 0.09625697880983353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00040433388463840213, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 9995782.0, + "repeat_count": 0.0, + "routers_loss": 0.00029065192211419344, + "skip_count": 0.0, + "step": 6198, + "text_loss": 0.5600258111953735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0004040301046848105, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 9998712.0, + "repeat_count": 0.0, + "routers_loss": 0.0005865268758498132, + "skip_count": 0.0, + "step": 6200, + "text_loss": 0.6426429748535156 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 29.11740534194306, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0283203125, + "learning_rate": 0.0004037263615070638, + "loss": 0.0078, + "macro_f1": 0.9265305995941162, + "num_tokens": 10002020.0, + "repeat_count": 1.0, + "routers_loss": 0.025357060134410858, + "skip_count": 3.0, + "step": 6202, + "text_loss": 0.25125735998153687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.000403422655221557, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10005381.0, + "repeat_count": 0.0, + "routers_loss": 0.003139561740681529, + "skip_count": 1.0, + "step": 6204, + "text_loss": 0.3639419376850128 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00040311898594467085, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 10008348.0, + "repeat_count": 0.0, + "routers_loss": 0.004091196693480015, + "skip_count": 2.0, + "step": 6206, + "text_loss": 0.1602363884449005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00040281535379277204, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10011171.0, + "repeat_count": 0.0, + "routers_loss": 0.005771483760327101, + "skip_count": 0.0, + "step": 6208, + "text_loss": 0.5593504905700684 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.000402511758882213, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10014374.0, + "repeat_count": 0.0, + "routers_loss": 0.005212264601141214, + "skip_count": 1.0, + "step": 6210, + "text_loss": 0.15668229758739471 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0004022082013293319, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 10017327.0, + "repeat_count": 0.0, + "routers_loss": 0.0027585842180997133, + "skip_count": 1.0, + "step": 6212, + "text_loss": 0.21188466250896454 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.173759906075727, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00040190468125045255, + "loss": 0.0061, + "macro_f1": 0.3272727429866791, + "num_tokens": 10020518.0, + "repeat_count": 0.0, + "routers_loss": 0.013210589066147804, + "skip_count": 1.0, + "step": 6214, + "text_loss": 0.2551073729991913 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 29.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.00040160119876188436, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10023799.0, + "repeat_count": 1.0, + "routers_loss": 0.001590219559147954, + "skip_count": 0.0, + "step": 6216, + "text_loss": 0.5634782314300537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0004012977539799224, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 10027107.0, + "repeat_count": 0.0, + "routers_loss": 0.003917343448847532, + "skip_count": 0.0, + "step": 6218, + "text_loss": 0.6412819027900696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0004009943470208473, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 10030460.0, + "repeat_count": 0.0, + "routers_loss": 0.00874288845807314, + "skip_count": 2.0, + "step": 6220, + "text_loss": 0.13269923627376556 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.211329615497505, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.000400690978000925, + "loss": 0.0075, + "macro_f1": 0.8817967176437378, + "num_tokens": 10034086.0, + "repeat_count": 2.0, + "routers_loss": 0.03736349940299988, + "skip_count": 3.0, + "step": 6222, + "text_loss": 0.4956454336643219 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.220722042852948, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0004003876470364075, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10037312.0, + "repeat_count": 0.0, + "routers_loss": 0.008481289260089397, + "skip_count": 2.0, + "step": 6224, + "text_loss": 0.2148810178041458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0152587890625, + "learning_rate": 0.0004000843542435315, + "loss": 0.0028, + "macro_f1": 0.3333333432674408, + "num_tokens": 10040393.0, + "repeat_count": 0.0, + "routers_loss": 0.002235144842416048, + "skip_count": 0.0, + "step": 6226, + "text_loss": 0.17645306885242462 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 29.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0003997810997385195, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 10044386.0, + "repeat_count": 1.0, + "routers_loss": 0.004541373811662197, + "skip_count": 0.0, + "step": 6228, + "text_loss": 0.5098661184310913 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00039947788363757915, + "loss": 0.0088, + "macro_f1": 0.6666666865348816, + "num_tokens": 10049046.0, + "repeat_count": 0.0, + "routers_loss": 0.0019183673430234194, + "skip_count": 1.0, + "step": 6230, + "text_loss": 0.6953724026679993 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00039917470605690334, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 10051787.0, + "repeat_count": 2.0, + "routers_loss": 0.0032311067916452885, + "skip_count": 4.0, + "step": 6232, + "text_loss": 0.475127637386322 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 29.267684179630173, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00039887156711267043, + "loss": 0.0079, + "macro_f1": 0.5492662787437439, + "num_tokens": 10055396.0, + "repeat_count": 2.0, + "routers_loss": 0.03247373178601265, + "skip_count": 0.0, + "step": 6234, + "text_loss": 0.4239100515842438 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.00039856846692104363, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10058395.0, + "repeat_count": 0.0, + "routers_loss": 0.006287421099841595, + "skip_count": 3.0, + "step": 6236, + "text_loss": 0.24084535241127014 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.016357421875, + "learning_rate": 0.0003982654055981718, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10061302.0, + "repeat_count": 1.0, + "routers_loss": 0.0008686117362231016, + "skip_count": 1.0, + "step": 6238, + "text_loss": 0.4740419089794159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0003979623832601884, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10065318.0, + "repeat_count": 0.0, + "routers_loss": 0.0037686119321733713, + "skip_count": 2.0, + "step": 6240, + "text_loss": 0.43965795636177063 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0003976594000232123, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10068291.0, + "repeat_count": 0.0, + "routers_loss": 0.005804901942610741, + "skip_count": 0.0, + "step": 6242, + "text_loss": 0.24424348771572113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.00039735645600334714, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 10071645.0, + "repeat_count": 0.0, + "routers_loss": 0.002001055981963873, + "skip_count": 1.0, + "step": 6244, + "text_loss": 0.6524377465248108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0003970535513166815, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 10075136.0, + "repeat_count": 0.0, + "routers_loss": 0.001252001617103815, + "skip_count": 0.0, + "step": 6246, + "text_loss": 0.22803714871406555 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0003967506860792893, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 10078230.0, + "repeat_count": 0.0, + "routers_loss": 0.004913780372589827, + "skip_count": 1.0, + "step": 6248, + "text_loss": 0.9835516214370728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.000396447860407229, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 10080852.0, + "repeat_count": 0.0, + "routers_loss": 0.0037437966093420982, + "skip_count": 2.0, + "step": 6250, + "text_loss": 0.4021640121936798 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05078125, + "learning_rate": 0.00039614507441654393, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10084139.0, + "repeat_count": 0.0, + "routers_loss": 0.005433002021163702, + "skip_count": 2.0, + "step": 6252, + "text_loss": 0.23060470819473267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00039584232822326224, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10088501.0, + "repeat_count": 0.0, + "routers_loss": 0.0007705377647653222, + "skip_count": 0.0, + "step": 6254, + "text_loss": 0.5994830131530762 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0576171875, + "learning_rate": 0.0003955396219433969, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10091506.0, + "repeat_count": 0.0, + "routers_loss": 0.0012310115853324533, + "skip_count": 0.0, + "step": 6256, + "text_loss": 0.4639038145542145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0003952369556929455, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10096236.0, + "repeat_count": 0.0, + "routers_loss": 0.008964627049863338, + "skip_count": 2.0, + "step": 6258, + "text_loss": 0.24845287203788757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003949343295878903, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 10099213.0, + "repeat_count": 0.0, + "routers_loss": 0.0033088945783674717, + "skip_count": 0.0, + "step": 6260, + "text_loss": 0.6527073979377747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 29.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00039463174374419817, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10103160.0, + "repeat_count": 2.0, + "routers_loss": 0.003462672932073474, + "skip_count": 1.0, + "step": 6262, + "text_loss": 0.4209299683570862 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00039432919827782066, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 10105881.0, + "repeat_count": 2.0, + "routers_loss": 0.0027124532498419285, + "skip_count": 2.0, + "step": 6264, + "text_loss": 0.4442266821861267 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00039402669330469367, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 10108596.0, + "repeat_count": 0.0, + "routers_loss": 0.005055282264947891, + "skip_count": 2.0, + "step": 6266, + "text_loss": 0.3331456780433655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00039372422894073765, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10111673.0, + "repeat_count": 0.0, + "routers_loss": 0.0009340311517007649, + "skip_count": 0.0, + "step": 6268, + "text_loss": 0.7664456367492676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.436747872028178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.00039342180530185745, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10116141.0, + "repeat_count": 0.0, + "routers_loss": 0.00032052272581495345, + "skip_count": 0.0, + "step": 6270, + "text_loss": 0.47610244154930115 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00039311942250394274, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 10119151.0, + "repeat_count": 0.0, + "routers_loss": 0.0015820999396964908, + "skip_count": 0.0, + "step": 6272, + "text_loss": 0.3815282881259918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0003928170806628669, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10122684.0, + "repeat_count": 0.0, + "routers_loss": 0.0007423736387863755, + "skip_count": 0.0, + "step": 6274, + "text_loss": 0.4630914628505707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00039251477989448797, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10126751.0, + "repeat_count": 0.0, + "routers_loss": 0.0006216703332029283, + "skip_count": 0.0, + "step": 6276, + "text_loss": 0.4342454671859741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.00039221252031464816, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10129784.0, + "repeat_count": 0.0, + "routers_loss": 0.004239698871970177, + "skip_count": 3.0, + "step": 6278, + "text_loss": 0.24661089479923248 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 29.4837100088054, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.0003919103020391738, + "loss": 0.006, + "macro_f1": 0.8803418874740601, + "num_tokens": 10133066.0, + "repeat_count": 2.0, + "routers_loss": 0.027879100292921066, + "skip_count": 7.0, + "step": 6280, + "text_loss": 0.4705188274383545 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00039160812518387574, + "loss": 0.0099, + "macro_f1": 0.3333333432674408, + "num_tokens": 10136860.0, + "repeat_count": 0.0, + "routers_loss": 0.002533538034185767, + "skip_count": 0.0, + "step": 6282, + "text_loss": 0.1953880786895752 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00039130598986454845, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 10140066.0, + "repeat_count": 1.0, + "routers_loss": 0.002462630858644843, + "skip_count": 2.0, + "step": 6284, + "text_loss": 0.378487765789032 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.000391003896196971, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 10143646.0, + "repeat_count": 1.0, + "routers_loss": 0.011922914534807205, + "skip_count": 1.0, + "step": 6286, + "text_loss": 0.2467316836118698 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00039070184429690607, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 10146507.0, + "repeat_count": 1.0, + "routers_loss": 0.0059767309576272964, + "skip_count": 1.0, + "step": 6288, + "text_loss": 0.9603674411773682 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0003903998342801006, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10149301.0, + "repeat_count": 1.0, + "routers_loss": 0.0030056277755647898, + "skip_count": 2.0, + "step": 6290, + "text_loss": 0.36631715297698975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00039009786626228543, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 10152158.0, + "repeat_count": 0.0, + "routers_loss": 0.005298118572682142, + "skip_count": 3.0, + "step": 6292, + "text_loss": 0.2876455783843994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003897959403591751, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 10155852.0, + "repeat_count": 0.0, + "routers_loss": 0.004937763791531324, + "skip_count": 2.0, + "step": 6294, + "text_loss": 0.14649681746959686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0003894940566864683, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 10159164.0, + "repeat_count": 0.0, + "routers_loss": 0.0021474575623869896, + "skip_count": 0.0, + "step": 6296, + "text_loss": 0.5694304704666138 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 29.568241855004402, + "f1_execute": 0.9583333134651184, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.08251953125, + "learning_rate": 0.00038919221535984753, + "loss": 0.0073, + "macro_f1": 0.875, + "num_tokens": 10161806.0, + "repeat_count": 1.0, + "routers_loss": 0.040340203791856766, + "skip_count": 3.0, + "step": 6298, + "text_loss": 0.1574537754058838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.00038889041649497894, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10165669.0, + "repeat_count": 0.0, + "routers_loss": 0.0028486931696534157, + "skip_count": 0.0, + "step": 6300, + "text_loss": 0.9158071279525757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0003885886602075123, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10168945.0, + "repeat_count": 0.0, + "routers_loss": 0.006565484683960676, + "skip_count": 2.0, + "step": 6302, + "text_loss": 0.3530846834182739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.00038828694661308116, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10171914.0, + "repeat_count": 0.0, + "routers_loss": 0.0009084723424166441, + "skip_count": 0.0, + "step": 6304, + "text_loss": 0.4603337347507477 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0003879852758273029, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 10175737.0, + "repeat_count": 1.0, + "routers_loss": 0.004121702630072832, + "skip_count": 2.0, + "step": 6306, + "text_loss": 0.5294032096862793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00038768364796577814, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10178543.0, + "repeat_count": 0.0, + "routers_loss": 0.0013208909658715129, + "skip_count": 0.0, + "step": 6308, + "text_loss": 0.41084006428718567 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 29.62459641913707, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00038738206314409144, + "loss": 0.0079, + "macro_f1": 0.9247862696647644, + "num_tokens": 10181880.0, + "repeat_count": 3.0, + "routers_loss": 0.03674180060625076, + "skip_count": 6.0, + "step": 6310, + "text_loss": 0.6920746564865112 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0003870805214778106, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10185173.0, + "repeat_count": 0.0, + "routers_loss": 0.00221974472515285, + "skip_count": 2.0, + "step": 6312, + "text_loss": 0.1376657634973526 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0003867790230824869, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10188642.0, + "repeat_count": 0.0, + "routers_loss": 0.001809283159673214, + "skip_count": 0.0, + "step": 6314, + "text_loss": 0.5220870971679688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0003864775680736552, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 10191750.0, + "repeat_count": 0.0, + "routers_loss": 0.0013956360053271055, + "skip_count": 0.0, + "step": 6316, + "text_loss": 0.4109838902950287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00038617615656683356, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 10194578.0, + "repeat_count": 0.0, + "routers_loss": 0.002947692759335041, + "skip_count": 2.0, + "step": 6318, + "text_loss": 0.4818590581417084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.0003858747886775232, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10197131.0, + "repeat_count": 0.0, + "routers_loss": 0.0008140999125316739, + "skip_count": 2.0, + "step": 6320, + "text_loss": 0.4004709720611572 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.68095098326974, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.04638671875, + "learning_rate": 0.0003855734645212093, + "loss": 0.0089, + "macro_f1": 0.8820862174034119, + "num_tokens": 10199965.0, + "repeat_count": 2.0, + "routers_loss": 0.013056626543402672, + "skip_count": 2.0, + "step": 6322, + "text_loss": 0.3367139995098114 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00038527218421335977, + "loss": 0.0087, + "macro_f1": 1.0, + "num_tokens": 10203184.0, + "repeat_count": 1.0, + "routers_loss": 0.0038112467154860497, + "skip_count": 2.0, + "step": 6324, + "text_loss": 0.5747989416122437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0003849709478694255, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 10206436.0, + "repeat_count": 0.0, + "routers_loss": 0.001232540002092719, + "skip_count": 0.0, + "step": 6326, + "text_loss": 0.4981732964515686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00038466975560484115, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 10209889.0, + "repeat_count": 0.0, + "routers_loss": 0.004343799781054258, + "skip_count": 0.0, + "step": 6328, + "text_loss": 0.2160186469554901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.000384368607535024, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10212520.0, + "repeat_count": 0.0, + "routers_loss": 0.0014161963481456041, + "skip_count": 1.0, + "step": 6330, + "text_loss": 0.3556232154369354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0003840675037753745, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10215456.0, + "repeat_count": 0.0, + "routers_loss": 0.0014989010524004698, + "skip_count": 0.0, + "step": 6332, + "text_loss": 0.8510926961898804 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003837664444412762, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10218558.0, + "repeat_count": 0.0, + "routers_loss": 0.006702739745378494, + "skip_count": 0.0, + "step": 6334, + "text_loss": 0.3995226323604584 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0003834654296480958, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10221862.0, + "repeat_count": 0.0, + "routers_loss": 0.00826781615614891, + "skip_count": 2.0, + "step": 6336, + "text_loss": 0.3534671664237976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0003831644595111825, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10224820.0, + "repeat_count": 0.0, + "routers_loss": 0.002143894787877798, + "skip_count": 0.0, + "step": 6338, + "text_loss": 0.20216144621372223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 29.76548282946874, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04736328125, + "learning_rate": 0.0003828635341458687, + "loss": 0.0064, + "macro_f1": 0.5492662787437439, + "num_tokens": 10227479.0, + "repeat_count": 0.0, + "routers_loss": 0.012319118715822697, + "skip_count": 2.0, + "step": 6340, + "text_loss": 0.26248639822006226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.774875256824185, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0003825626536674697, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10231347.0, + "repeat_count": 0.0, + "routers_loss": 0.00334449321962893, + "skip_count": 0.0, + "step": 6342, + "text_loss": 0.6357201337814331 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.000382261818191283, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10234347.0, + "repeat_count": 0.0, + "routers_loss": 0.0027788348961621523, + "skip_count": 0.0, + "step": 6344, + "text_loss": 0.2813846468925476 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00038196102783258996, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 10237105.0, + "repeat_count": 0.0, + "routers_loss": 0.001545077539049089, + "skip_count": 0.0, + "step": 6346, + "text_loss": 0.47612661123275757 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0003816602827066537, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 10240249.0, + "repeat_count": 0.0, + "routers_loss": 0.005602670833468437, + "skip_count": 2.0, + "step": 6348, + "text_loss": 0.18197228014469147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0003813595829287204, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 10243417.0, + "repeat_count": 0.0, + "routers_loss": 0.0004317959537729621, + "skip_count": 0.0, + "step": 6350, + "text_loss": 0.3818575143814087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.0003810589286140186, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 10246824.0, + "repeat_count": 0.0, + "routers_loss": 0.002225276781246066, + "skip_count": 0.0, + "step": 6352, + "text_loss": 0.14129821956157684 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 29.831229820956853, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0003807583198777599, + "loss": 0.0062, + "macro_f1": 0.9265305995941162, + "num_tokens": 10249836.0, + "repeat_count": 3.0, + "routers_loss": 0.02445496805012226, + "skip_count": 1.0, + "step": 6354, + "text_loss": 0.3237064480781555 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.840622248312297, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00038045775683513786, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10252900.0, + "repeat_count": 0.0, + "routers_loss": 0.0009264222462661564, + "skip_count": 0.0, + "step": 6356, + "text_loss": 0.6777551174163818 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 29.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0003801572396013289, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 10255526.0, + "repeat_count": 1.0, + "routers_loss": 0.007189550437033176, + "skip_count": 5.0, + "step": 6358, + "text_loss": 0.25438982248306274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.859407103023187, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00037985676829149187, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10258865.0, + "repeat_count": 0.0, + "routers_loss": 0.0014201018493622541, + "skip_count": 0.0, + "step": 6360, + "text_loss": 0.5063154101371765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 29.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0003795563430207678, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 10261677.0, + "repeat_count": 0.0, + "routers_loss": 0.0035477925557643175, + "skip_count": 3.0, + "step": 6362, + "text_loss": 0.4815357029438019 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.878191957734078, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0003792559639042803, + "loss": 0.0049, + "macro_f1": 0.3272727429866791, + "num_tokens": 10264805.0, + "repeat_count": 0.0, + "routers_loss": 0.013723359443247318, + "skip_count": 1.0, + "step": 6364, + "text_loss": 0.5563676357269287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.0003789556310571351, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 10267885.0, + "repeat_count": 0.0, + "routers_loss": 0.0028159532230347395, + "skip_count": 0.0, + "step": 6366, + "text_loss": 0.7284183502197266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0003786553445944204, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10270934.0, + "repeat_count": 0.0, + "routers_loss": 0.0005918835522606969, + "skip_count": 0.0, + "step": 6368, + "text_loss": 0.7387746572494507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.906369239800412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0003783551046312067, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 10273818.0, + "repeat_count": 0.0, + "routers_loss": 0.0011416864581406116, + "skip_count": 0.0, + "step": 6370, + "text_loss": 0.5360285043716431 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 29.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.00037805491128254645, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 10276494.0, + "repeat_count": 2.0, + "routers_loss": 0.002382483799010515, + "skip_count": 1.0, + "step": 6372, + "text_loss": 0.7536854147911072 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.00037775476466347414, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 10279719.0, + "repeat_count": 0.0, + "routers_loss": 0.0021104486659169197, + "skip_count": 1.0, + "step": 6374, + "text_loss": 0.6807253956794739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0003774546648890066, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 10283000.0, + "repeat_count": 0.0, + "routers_loss": 0.003148776013404131, + "skip_count": 2.0, + "step": 6376, + "text_loss": 0.30774110555648804 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 29.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0003771546120741426, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 10285666.0, + "repeat_count": 1.0, + "routers_loss": 0.007700880523771048, + "skip_count": 1.0, + "step": 6378, + "text_loss": 0.4476076364517212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 29.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003768546063338631, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10289127.0, + "repeat_count": 0.0, + "routers_loss": 0.0023625255562365055, + "skip_count": 1.0, + "step": 6380, + "text_loss": 0.4350969195365906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.0003765546477831307, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10292485.0, + "repeat_count": 0.0, + "routers_loss": 0.001428726245649159, + "skip_count": 0.0, + "step": 6382, + "text_loss": 0.49078530073165894 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 29.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0003762547365368902, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 10295361.0, + "repeat_count": 0.0, + "routers_loss": 0.0027160397730767727, + "skip_count": 2.0, + "step": 6384, + "text_loss": 0.3476370573043823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 29.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00037595487271006807, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10298717.0, + "repeat_count": 0.0, + "routers_loss": 0.002456068294122815, + "skip_count": 0.0, + "step": 6386, + "text_loss": 0.3634916841983795 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 29.99090108599941, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.021240234375, + "learning_rate": 0.0003756550564175727, + "loss": 0.0049, + "macro_f1": 0.9265305995941162, + "num_tokens": 10302102.0, + "repeat_count": 1.0, + "routers_loss": 0.02546076290309429, + "skip_count": 3.0, + "step": 6388, + "text_loss": 0.2422582060098648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00037535528777429426, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10305060.0, + "repeat_count": 0.0, + "routers_loss": 0.001045907847583294, + "skip_count": 0.0, + "step": 6390, + "text_loss": 0.5563194155693054 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.009392427355444, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0003750555668951045, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 10307903.0, + "repeat_count": 1.0, + "routers_loss": 0.007391332648694515, + "skip_count": 2.0, + "step": 6392, + "text_loss": 0.3423991799354553 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00037475589389485744, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 10311396.0, + "repeat_count": 1.0, + "routers_loss": 0.0029360291082412004, + "skip_count": 1.0, + "step": 6394, + "text_loss": 0.9877024292945862 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.028177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00037445626888838807, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10314250.0, + "repeat_count": 0.0, + "routers_loss": 0.0014932662015780807, + "skip_count": 0.0, + "step": 6396, + "text_loss": 0.3978523313999176 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 30.037569709421778, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003741566919905133, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 10316894.0, + "repeat_count": 1.0, + "routers_loss": 0.007003722712397575, + "skip_count": 5.0, + "step": 6398, + "text_loss": 0.2945566475391388 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00037385716331603155, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 10319603.0, + "repeat_count": 1.0, + "routers_loss": 0.006710570305585861, + "skip_count": 1.0, + "step": 6400, + "text_loss": 0.2984389662742615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.00037355768297972275, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 10322670.0, + "repeat_count": 0.0, + "routers_loss": 0.00048738415353000164, + "skip_count": 0.0, + "step": 6402, + "text_loss": 0.483262300491333 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.065746991488112, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00037325825109634837, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 10326280.0, + "repeat_count": 1.0, + "routers_loss": 0.001625525183044374, + "skip_count": 1.0, + "step": 6404, + "text_loss": 0.42678722739219666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.07513941884356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0003729588677806513, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10329008.0, + "repeat_count": 0.0, + "routers_loss": 0.004408636130392551, + "skip_count": 0.0, + "step": 6406, + "text_loss": 0.2264070063829422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.084531846199003, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0003726595331473557, + "loss": 0.0032, + "macro_f1": 0.6666666865348816, + "num_tokens": 10332533.0, + "repeat_count": 0.0, + "routers_loss": 0.0038099216762930155, + "skip_count": 2.0, + "step": 6408, + "text_loss": 0.6670092940330505 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.093924273554446, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0003723602473111672, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10335643.0, + "repeat_count": 1.0, + "routers_loss": 0.003097689710557461, + "skip_count": 0.0, + "step": 6410, + "text_loss": 0.45228812098503113 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.00037206101038677274, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 10338522.0, + "repeat_count": 0.0, + "routers_loss": 0.005268602631986141, + "skip_count": 1.0, + "step": 6412, + "text_loss": 0.7288079857826233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.112709128265337, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0003717618224888405, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 10341516.0, + "repeat_count": 0.0, + "routers_loss": 0.004640138708055019, + "skip_count": 2.0, + "step": 6414, + "text_loss": 0.22850871086120605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00037146268373201954, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10344831.0, + "repeat_count": 0.0, + "routers_loss": 0.0006379318656399846, + "skip_count": 0.0, + "step": 6416, + "text_loss": 0.7864460945129395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.0003711635942309408, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 10348499.0, + "repeat_count": 0.0, + "routers_loss": 0.0004005273221991956, + "skip_count": 0.0, + "step": 6418, + "text_loss": 0.605839192867279 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0157470703125, + "learning_rate": 0.0003708645541002159, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10351722.0, + "repeat_count": 0.0, + "routers_loss": 0.001061634044162929, + "skip_count": 0.0, + "step": 6420, + "text_loss": 0.8226510286331177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 30.150278837687114, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0003705655634544374, + "loss": 0.0052, + "macro_f1": 0.5492662787437439, + "num_tokens": 10355275.0, + "repeat_count": 0.0, + "routers_loss": 0.013980664312839508, + "skip_count": 2.0, + "step": 6422, + "text_loss": 0.2709597647190094 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.159671265042558, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0003702666224081792, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10359702.0, + "repeat_count": 1.0, + "routers_loss": 0.0013196271611377597, + "skip_count": 0.0, + "step": 6424, + "text_loss": 0.6451483368873596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00036996773107599604, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10363364.0, + "repeat_count": 0.0, + "routers_loss": 0.0028023163322359324, + "skip_count": 1.0, + "step": 6426, + "text_loss": 0.2770799398422241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01373291015625, + "learning_rate": 0.0003696688895724235, + "loss": 0.0029, + "macro_f1": 0.3333333432674408, + "num_tokens": 10366554.0, + "repeat_count": 0.0, + "routers_loss": 0.0011023655533790588, + "skip_count": 0.0, + "step": 6428, + "text_loss": 0.5466503500938416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.187848547108892, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0003693700980119784, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10369733.0, + "repeat_count": 0.0, + "routers_loss": 0.00230707717128098, + "skip_count": 0.0, + "step": 6430, + "text_loss": 0.45667049288749695 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.19724097446434, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00036907135650915824, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 10373382.0, + "repeat_count": 0.0, + "routers_loss": 0.0036784098483622074, + "skip_count": 2.0, + "step": 6432, + "text_loss": 0.13856995105743408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.206633401819783, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00036877266517844115, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 10376202.0, + "repeat_count": 0.0, + "routers_loss": 0.0008461157558485866, + "skip_count": 0.0, + "step": 6434, + "text_loss": 0.27238601446151733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.216025829175226, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.0003684740241342863, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 10380748.0, + "repeat_count": 0.0, + "routers_loss": 0.0052765593864023685, + "skip_count": 0.0, + "step": 6436, + "text_loss": 0.6182295083999634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.225418256530673, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.00036817543349113355, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 10386148.0, + "repeat_count": 1.0, + "routers_loss": 0.005562922917306423, + "skip_count": 2.0, + "step": 6438, + "text_loss": 0.5591027140617371 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.234810683886117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0003678768933634033, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10389385.0, + "repeat_count": 0.0, + "routers_loss": 0.0008686366491019726, + "skip_count": 0.0, + "step": 6440, + "text_loss": 0.5158660411834717 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0003675784038654968, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 10391893.0, + "repeat_count": 0.0, + "routers_loss": 0.0022222092375159264, + "skip_count": 1.0, + "step": 6442, + "text_loss": 0.2865697741508484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.253595538597008, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0003672799651117958, + "loss": 0.0099, + "macro_f1": 0.6666666865348816, + "num_tokens": 10395082.0, + "repeat_count": 0.0, + "routers_loss": 0.0030799773521721363, + "skip_count": 2.0, + "step": 6444, + "text_loss": 0.21298295259475708 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003669815772166625, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10398015.0, + "repeat_count": 0.0, + "routers_loss": 0.0035721305757761, + "skip_count": 3.0, + "step": 6446, + "text_loss": 0.5286803841590881 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 30.272380393307895, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.00036668324029443975, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 10400749.0, + "repeat_count": 0.0, + "routers_loss": 0.00741040613502264, + "skip_count": 4.0, + "step": 6448, + "text_loss": 0.3922366201877594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.281772820663342, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.064453125, + "learning_rate": 0.0003663849544594507, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 10404439.0, + "repeat_count": 0.0, + "routers_loss": 0.002974750241264701, + "skip_count": 2.0, + "step": 6450, + "text_loss": 0.21894219517707825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0458984375, + "learning_rate": 0.00036608671982599927, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10408476.0, + "repeat_count": 0.0, + "routers_loss": 0.004810616374015808, + "skip_count": 0.0, + "step": 6452, + "text_loss": 0.3928622305393219 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0003657885365083694, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 10411533.0, + "repeat_count": 1.0, + "routers_loss": 0.005527745466679335, + "skip_count": 0.0, + "step": 6454, + "text_loss": 0.22816279530525208 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.309950102729672, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052734375, + "learning_rate": 0.00036549040462082556, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 10414501.0, + "repeat_count": 0.0, + "routers_loss": 0.0021297158673405647, + "skip_count": 0.0, + "step": 6456, + "text_loss": 0.20487719774246216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 30.31934253008512, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003651923242776124, + "loss": 0.0082, + "macro_f1": 0.6592592597007751, + "num_tokens": 10418296.0, + "repeat_count": 1.0, + "routers_loss": 0.046412210911512375, + "skip_count": 5.0, + "step": 6458, + "text_loss": 0.2890419065952301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.328734957440563, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00036489429559295484, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10421211.0, + "repeat_count": 0.0, + "routers_loss": 0.004002603702247143, + "skip_count": 0.0, + "step": 6460, + "text_loss": 0.23165544867515564 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.338127384796007, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003645963186810581, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 10424231.0, + "repeat_count": 0.0, + "routers_loss": 0.003480088198557496, + "skip_count": 1.0, + "step": 6462, + "text_loss": 0.6286683082580566 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003642983936561075, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 10427387.0, + "repeat_count": 0.0, + "routers_loss": 0.009358933195471764, + "skip_count": 2.0, + "step": 6464, + "text_loss": 0.3258316218852997 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.356912239506897, + "f1_execute": 0.9729729890823364, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00036400052063226816, + "loss": 0.0048, + "macro_f1": 0.9539539813995361, + "num_tokens": 10430813.0, + "repeat_count": 5.0, + "routers_loss": 0.03567950055003166, + "skip_count": 5.0, + "step": 6466, + "text_loss": 0.7278715968132019 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00036370269972368615, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 10434175.0, + "repeat_count": 1.0, + "routers_loss": 0.00226925453171134, + "skip_count": 2.0, + "step": 6468, + "text_loss": 0.5652450919151306 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.375697094217788, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0174560546875, + "learning_rate": 0.0003634049310444867, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10437393.0, + "repeat_count": 0.0, + "routers_loss": 0.0013644809368997812, + "skip_count": 0.0, + "step": 6470, + "text_loss": 0.5985191464424133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.38508952157323, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.0003631072147087753, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 10440412.0, + "repeat_count": 0.0, + "routers_loss": 0.0003114990540780127, + "skip_count": 0.0, + "step": 6472, + "text_loss": 0.5588209629058838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.394481948928675, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00036280955083063747, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 10443471.0, + "repeat_count": 0.0, + "routers_loss": 0.0005486322334036231, + "skip_count": 0.0, + "step": 6474, + "text_loss": 0.6969016194343567 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.403874376284122, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00036251193952413865, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 10446548.0, + "repeat_count": 1.0, + "routers_loss": 0.008256378583610058, + "skip_count": 2.0, + "step": 6476, + "text_loss": 0.27083566784858704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.0003622143809033239, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10449478.0, + "repeat_count": 0.0, + "routers_loss": 0.001008771825581789, + "skip_count": 0.0, + "step": 6478, + "text_loss": 0.1689433604478836 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00036191687508221827, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 10453017.0, + "repeat_count": 1.0, + "routers_loss": 0.0014678959269076586, + "skip_count": 0.0, + "step": 6480, + "text_loss": 0.9571998715400696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.432051658350456, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0003616194221748267, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10456061.0, + "repeat_count": 0.0, + "routers_loss": 0.001516164978966117, + "skip_count": 0.0, + "step": 6482, + "text_loss": 0.5750429034233093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.4414440857059, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0003613220222951335, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10459130.0, + "repeat_count": 0.0, + "routers_loss": 0.0031315975356847048, + "skip_count": 0.0, + "step": 6484, + "text_loss": 0.47120073437690735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.450836513061343, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0003610246755571029, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10462190.0, + "repeat_count": 0.0, + "routers_loss": 0.0006079549202695489, + "skip_count": 0.0, + "step": 6486, + "text_loss": 0.8426173329353333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.000360727382074679, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10465233.0, + "repeat_count": 0.0, + "routers_loss": 0.00596054969355464, + "skip_count": 0.0, + "step": 6488, + "text_loss": 0.18435880541801453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.469621367772234, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00036043014196178463, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 10468135.0, + "repeat_count": 0.0, + "routers_loss": 0.008584967814385891, + "skip_count": 1.0, + "step": 6490, + "text_loss": 0.3827758729457855 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.479013795127678, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.00036013295533232344, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10471032.0, + "repeat_count": 2.0, + "routers_loss": 0.005076571833342314, + "skip_count": 5.0, + "step": 6492, + "text_loss": 0.1215854063630104 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 30.488406222483125, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0003598358223001776, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 10474779.0, + "repeat_count": 3.0, + "routers_loss": 0.005972118582576513, + "skip_count": 0.0, + "step": 6494, + "text_loss": 0.22768665850162506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.49779864983857, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0003595387429792091, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 10478015.0, + "repeat_count": 0.0, + "routers_loss": 0.004733685404062271, + "skip_count": 1.0, + "step": 6496, + "text_loss": 0.5013535618782043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.507191077194012, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00035924171748325916, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10481113.0, + "repeat_count": 0.0, + "routers_loss": 0.01148980576545, + "skip_count": 2.0, + "step": 6498, + "text_loss": 0.3281762897968292 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.516583504549455, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0003589447459261487, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10484049.0, + "repeat_count": 0.0, + "routers_loss": 0.007726775947958231, + "skip_count": 2.0, + "step": 6500, + "text_loss": 0.46294569969177246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.525975931904902, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00035864782842167763, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 10487443.0, + "repeat_count": 1.0, + "routers_loss": 0.0013331319205462933, + "skip_count": 0.0, + "step": 6502, + "text_loss": 0.5122153759002686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.00035835096508362544, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10490535.0, + "repeat_count": 0.0, + "routers_loss": 0.0011629529763013124, + "skip_count": 0.0, + "step": 6504, + "text_loss": 0.40683525800704956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00035805415602575054, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10493575.0, + "repeat_count": 0.0, + "routers_loss": 0.004780632443726063, + "skip_count": 0.0, + "step": 6506, + "text_loss": 0.37263134121894836 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.554153213971237, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00035775740136179075, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10496193.0, + "repeat_count": 0.0, + "routers_loss": 0.0018355643842369318, + "skip_count": 0.0, + "step": 6508, + "text_loss": 0.2074306458234787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.56354564132668, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00035746070120546314, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10500135.0, + "repeat_count": 0.0, + "routers_loss": 0.004067617934197187, + "skip_count": 1.0, + "step": 6510, + "text_loss": 0.26313406229019165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.572938068682124, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00035716405567046383, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 10503533.0, + "repeat_count": 0.0, + "routers_loss": 0.005438363179564476, + "skip_count": 0.0, + "step": 6512, + "text_loss": 0.3448122441768646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.00035686746487046767, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 10506207.0, + "repeat_count": 0.0, + "routers_loss": 0.0012895528925582767, + "skip_count": 0.0, + "step": 6514, + "text_loss": 0.43096476793289185 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0003565709289191291, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10509257.0, + "repeat_count": 0.0, + "routers_loss": 0.003141741268336773, + "skip_count": 0.0, + "step": 6516, + "text_loss": 0.22349724173545837 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.601115350748458, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0003562744479300811, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10512554.0, + "repeat_count": 0.0, + "routers_loss": 0.0005669888923875988, + "skip_count": 0.0, + "step": 6518, + "text_loss": 0.5319190621376038 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.610507778103905, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00035597802201693587, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 10515720.0, + "repeat_count": 0.0, + "routers_loss": 0.0020814717281609774, + "skip_count": 0.0, + "step": 6520, + "text_loss": 0.20216144621372223 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.61990020545935, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0003556816512932841, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 10518517.0, + "repeat_count": 2.0, + "routers_loss": 0.010716461576521397, + "skip_count": 3.0, + "step": 6522, + "text_loss": 0.15843836963176727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.629292632814792, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.0003553853358726959, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 10521414.0, + "repeat_count": 0.0, + "routers_loss": 0.0014748790999874473, + "skip_count": 0.0, + "step": 6524, + "text_loss": 0.393892377614975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00035508907586871984, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 10524210.0, + "repeat_count": 0.0, + "routers_loss": 0.0004757299611810595, + "skip_count": 0.0, + "step": 6526, + "text_loss": 0.2557907700538635 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.648077487525683, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00035479287139488327, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 10527327.0, + "repeat_count": 1.0, + "routers_loss": 0.002445317106321454, + "skip_count": 0.0, + "step": 6528, + "text_loss": 0.48338422179222107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0003544967225646922, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 10530363.0, + "repeat_count": 0.0, + "routers_loss": 0.0015845977468416095, + "skip_count": 0.0, + "step": 6530, + "text_loss": 0.6474354267120361 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.666862342236573, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.00035420062949163166, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 10533444.0, + "repeat_count": 0.0, + "routers_loss": 0.002190655330196023, + "skip_count": 0.0, + "step": 6532, + "text_loss": 0.3789777457714081 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.676254769592017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0003539045922891649, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10536711.0, + "repeat_count": 0.0, + "routers_loss": 0.00317079434171319, + "skip_count": 0.0, + "step": 6534, + "text_loss": 0.25758084654808044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.68564719694746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00035360861107073394, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 10539849.0, + "repeat_count": 0.0, + "routers_loss": 0.0010938458144664764, + "skip_count": 0.0, + "step": 6536, + "text_loss": 0.9821014404296875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.695039624302908, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0003533126859497592, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10543004.0, + "repeat_count": 0.0, + "routers_loss": 0.003071998478844762, + "skip_count": 2.0, + "step": 6538, + "text_loss": 0.6314182281494141 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0003530168170396401, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 10545965.0, + "repeat_count": 0.0, + "routers_loss": 0.006067665759474039, + "skip_count": 2.0, + "step": 6540, + "text_loss": 0.5021927356719971 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0167236328125, + "learning_rate": 0.000352721004453754, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 10549188.0, + "repeat_count": 0.0, + "routers_loss": 0.0019109295681118965, + "skip_count": 0.0, + "step": 6542, + "text_loss": 0.3008780777454376 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.723216906369238, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00035242524830545683, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10552298.0, + "repeat_count": 0.0, + "routers_loss": 0.007457790896296501, + "skip_count": 3.0, + "step": 6544, + "text_loss": 0.5675695538520813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.732609333724685, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.0003521295487080829, + "loss": 0.0086, + "macro_f1": 0.6666666865348816, + "num_tokens": 10555123.0, + "repeat_count": 0.0, + "routers_loss": 0.007243642583489418, + "skip_count": 1.0, + "step": 6546, + "text_loss": 0.17955881357192993 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00035183390577494476, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 10559653.0, + "repeat_count": 0.0, + "routers_loss": 0.004024330526590347, + "skip_count": 0.0, + "step": 6548, + "text_loss": 0.2634682357311249 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.751394188435572, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.017578125, + "learning_rate": 0.0003515383196193336, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10563770.0, + "repeat_count": 1.0, + "routers_loss": 0.010837121866643429, + "skip_count": 0.0, + "step": 6550, + "text_loss": 0.1608252227306366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0003512427903545183, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10567117.0, + "repeat_count": 0.0, + "routers_loss": 0.003473864868283272, + "skip_count": 0.0, + "step": 6552, + "text_loss": 0.231611430644989 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.770179043146463, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0003509473180937464, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10570622.0, + "repeat_count": 0.0, + "routers_loss": 0.004441239405423403, + "skip_count": 1.0, + "step": 6554, + "text_loss": 0.3193909227848053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.779571470501907, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0654296875, + "learning_rate": 0.0003506519029502433, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 10573411.0, + "repeat_count": 0.0, + "routers_loss": 0.0008821079391054809, + "skip_count": 0.0, + "step": 6556, + "text_loss": 0.4478783905506134 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.788963897857354, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.0003503565450372128, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 10576422.0, + "repeat_count": 1.0, + "routers_loss": 0.0014448441797867417, + "skip_count": 0.0, + "step": 6558, + "text_loss": 0.46065983176231384 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.798356325212797, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0003500612444678365, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 10579879.0, + "repeat_count": 0.0, + "routers_loss": 0.007939066737890244, + "skip_count": 1.0, + "step": 6560, + "text_loss": 0.3299395740032196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.80774875256824, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.000349766001355274, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 10583067.0, + "repeat_count": 0.0, + "routers_loss": 0.010073966346681118, + "skip_count": 2.0, + "step": 6562, + "text_loss": 0.278255820274353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.817141179923688, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.00034947081581266335, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 10586276.0, + "repeat_count": 0.0, + "routers_loss": 0.0062315030954778194, + "skip_count": 1.0, + "step": 6564, + "text_loss": 0.22706018388271332 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003491756879531201, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10589257.0, + "repeat_count": 3.0, + "routers_loss": 0.0023778853937983513, + "skip_count": 4.0, + "step": 6566, + "text_loss": 0.5567800998687744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0003488806178897377, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 10592163.0, + "repeat_count": 0.0, + "routers_loss": 0.0004184350254945457, + "skip_count": 0.0, + "step": 6568, + "text_loss": 0.4027897119522095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.845318461990022, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003485856057355876, + "loss": 0.0027, + "macro_f1": 0.6666666865348816, + "num_tokens": 10595326.0, + "repeat_count": 0.0, + "routers_loss": 0.0035254736430943012, + "skip_count": 1.0, + "step": 6570, + "text_loss": 0.3044572174549103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.854710889345466, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000348290651603719, + "loss": 0.0029, + "macro_f1": 0.3333333432674408, + "num_tokens": 10598236.0, + "repeat_count": 0.0, + "routers_loss": 0.0030894684605300426, + "skip_count": 0.0, + "step": 6572, + "text_loss": 0.23021161556243896 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 30.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.00034799575560715896, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 10601653.0, + "repeat_count": 1.0, + "routers_loss": 0.0036557347048074007, + "skip_count": 0.0, + "step": 6574, + "text_loss": 0.5437754392623901 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.0003477009178589121, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10604581.0, + "repeat_count": 2.0, + "routers_loss": 0.021344119682908058, + "skip_count": 4.0, + "step": 6576, + "text_loss": 0.29078927636146545 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 30.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0003474061384719608, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10607676.0, + "repeat_count": 1.0, + "routers_loss": 0.0037169242277741432, + "skip_count": 1.0, + "step": 6578, + "text_loss": 1.1790896654129028 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.892280598767243, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0003471114175592649, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 10611269.0, + "repeat_count": 2.0, + "routers_loss": 0.005873420741409063, + "skip_count": 4.0, + "step": 6580, + "text_loss": 0.36204129457473755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.901673026122687, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0003468167552337624, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 10614335.0, + "repeat_count": 1.0, + "routers_loss": 0.01030842587351799, + "skip_count": 2.0, + "step": 6582, + "text_loss": 0.20400437712669373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.911065453478134, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.00034652215160836826, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 10617565.0, + "repeat_count": 0.0, + "routers_loss": 0.0025721401907503605, + "skip_count": 0.0, + "step": 6584, + "text_loss": 0.44676345586776733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 30.920457880833577, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00034622760679597507, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 10620706.0, + "repeat_count": 0.0, + "routers_loss": 0.005751762073487043, + "skip_count": 1.0, + "step": 6586, + "text_loss": 0.4733653664588928 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 30.92985030818902, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00034593312090945306, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 10623916.0, + "repeat_count": 0.0, + "routers_loss": 0.0029759553726762533, + "skip_count": 3.0, + "step": 6588, + "text_loss": 0.49876922369003296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.939242735544468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0003456386940616498, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10628093.0, + "repeat_count": 0.0, + "routers_loss": 0.0010031822603195906, + "skip_count": 0.0, + "step": 6590, + "text_loss": 0.42708611488342285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00034534432636539004, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10631739.0, + "repeat_count": 0.0, + "routers_loss": 0.0014793311711400747, + "skip_count": 0.0, + "step": 6592, + "text_loss": 0.18193726241588593 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0003450500179334762, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10634862.0, + "repeat_count": 0.0, + "routers_loss": 0.0059733521193265915, + "skip_count": 2.0, + "step": 6594, + "text_loss": 0.28596529364585876 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.967420017610802, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0003447557688786879, + "loss": 0.0043, + "macro_f1": 0.3272727429866791, + "num_tokens": 10637758.0, + "repeat_count": 0.0, + "routers_loss": 0.0076768649742007256, + "skip_count": 1.0, + "step": 6596, + "text_loss": 0.39428210258483887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.976812444966246, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00034446157931378185, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10640440.0, + "repeat_count": 0.0, + "routers_loss": 0.0015128811355680227, + "skip_count": 0.0, + "step": 6598, + "text_loss": 0.45584383606910706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 30.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.00034416744935149193, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 10643600.0, + "repeat_count": 0.0, + "routers_loss": 0.000757391273509711, + "skip_count": 0.0, + "step": 6600, + "text_loss": 0.503209114074707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 30.995597299677137, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.060302734375, + "learning_rate": 0.0003438733791045294, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10646907.0, + "repeat_count": 0.0, + "routers_loss": 0.0025944956578314304, + "skip_count": 2.0, + "step": 6602, + "text_loss": 0.4370735287666321 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.00469621367772, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00034357936868558255, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10649995.0, + "repeat_count": 0.0, + "routers_loss": 0.0006543452036567032, + "skip_count": 0.0, + "step": 6604, + "text_loss": 0.4125586748123169 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.014088641033165, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00034328541820731663, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10653251.0, + "repeat_count": 0.0, + "routers_loss": 0.00027016724925488234, + "skip_count": 1.0, + "step": 6606, + "text_loss": 0.7309898734092712 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 31.023481068388612, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.020751953125, + "learning_rate": 0.00034299152778237413, + "loss": 0.0062, + "macro_f1": 0.8823530077934265, + "num_tokens": 10657229.0, + "repeat_count": 1.0, + "routers_loss": 0.01905548945069313, + "skip_count": 2.0, + "step": 6608, + "text_loss": 0.42367079854011536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0003426976975233744, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10660524.0, + "repeat_count": 0.0, + "routers_loss": 0.0004718089767266065, + "skip_count": 0.0, + "step": 6610, + "text_loss": 0.6613664627075195 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00034240392754291343, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 10663908.0, + "repeat_count": 1.0, + "routers_loss": 0.0027069442439824343, + "skip_count": 0.0, + "step": 6612, + "text_loss": 0.859471321105957 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.051658350454947, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.000342110217953565, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10667814.0, + "repeat_count": 0.0, + "routers_loss": 0.0015497280983254313, + "skip_count": 0.0, + "step": 6614, + "text_loss": 0.18337638676166534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.06105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0003418165688678788, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10671630.0, + "repeat_count": 0.0, + "routers_loss": 0.0013396464055404067, + "skip_count": 0.0, + "step": 6616, + "text_loss": 0.860016405582428 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 31.070443205165834, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0003415229803983819, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 10675308.0, + "repeat_count": 0.0, + "routers_loss": 0.007542039267718792, + "skip_count": 3.0, + "step": 6618, + "text_loss": 0.15481022000312805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0003412294526575779, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 10678092.0, + "repeat_count": 0.0, + "routers_loss": 0.002029839437454939, + "skip_count": 2.0, + "step": 6620, + "text_loss": 0.5121933221817017 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00034093598575794706, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10681382.0, + "repeat_count": 0.0, + "routers_loss": 0.0013001341139897704, + "skip_count": 0.0, + "step": 6622, + "text_loss": 0.4555061161518097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.098620487232168, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00034064257981194655, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 10684255.0, + "repeat_count": 0.0, + "routers_loss": 0.0007926415419206023, + "skip_count": 0.0, + "step": 6624, + "text_loss": 0.7298227548599243 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.108012914587615, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003403492349320101, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 10686904.0, + "repeat_count": 0.0, + "routers_loss": 0.0021080176811665297, + "skip_count": 1.0, + "step": 6626, + "text_loss": 0.45434215664863586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.11740534194306, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.000340055951230548, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10690311.0, + "repeat_count": 0.0, + "routers_loss": 0.004011874087154865, + "skip_count": 0.0, + "step": 6628, + "text_loss": 0.15496443212032318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.126797769298502, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00033976272881994707, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10693395.0, + "repeat_count": 0.0, + "routers_loss": 0.0031893099658191204, + "skip_count": 2.0, + "step": 6630, + "text_loss": 0.5291517972946167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003394695678125708, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 10697046.0, + "repeat_count": 0.0, + "routers_loss": 0.0033124347683042288, + "skip_count": 1.0, + "step": 6632, + "text_loss": 0.2893230617046356 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.145582624009393, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00033917646832075886, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 10700111.0, + "repeat_count": 0.0, + "routers_loss": 0.002547801472246647, + "skip_count": 0.0, + "step": 6634, + "text_loss": 0.10363512486219406 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 31.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0003388834304568275, + "loss": 0.0079, + "macro_f1": 0.6666666865348816, + "num_tokens": 10703939.0, + "repeat_count": 2.0, + "routers_loss": 0.0019040531478822231, + "skip_count": 0.0, + "step": 6636, + "text_loss": 0.5185034275054932 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.164367478720283, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00033859045433306975, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 10707187.0, + "repeat_count": 0.0, + "routers_loss": 0.0074104927480220795, + "skip_count": 2.0, + "step": 6638, + "text_loss": 0.1618153154850006 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.173759906075727, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.0003382975400617543, + "loss": 0.0084, + "macro_f1": 0.6666666865348816, + "num_tokens": 10710029.0, + "repeat_count": 0.0, + "routers_loss": 0.0013861875049769878, + "skip_count": 1.0, + "step": 6640, + "text_loss": 0.6674485206604004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.18315233343117, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0003380046877551266, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 10713318.0, + "repeat_count": 0.0, + "routers_loss": 0.0034452753607183695, + "skip_count": 0.0, + "step": 6642, + "text_loss": 0.39299124479293823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.192544760786618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0003377118975254082, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 10716130.0, + "repeat_count": 0.0, + "routers_loss": 0.006802885327488184, + "skip_count": 2.0, + "step": 6644, + "text_loss": 0.12942606210708618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.20193718814206, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003374191694847968, + "loss": 0.0052, + "macro_f1": 0.6601307392120361, + "num_tokens": 10719400.0, + "repeat_count": 1.0, + "routers_loss": 0.03718209266662598, + "skip_count": 2.0, + "step": 6646, + "text_loss": 0.34327754378318787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0003371265037454663, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 10722108.0, + "repeat_count": 0.0, + "routers_loss": 0.006016947794705629, + "skip_count": 2.0, + "step": 6648, + "text_loss": 0.15644726157188416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.220722042852948, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00033683390041956663, + "loss": 0.0075, + "macro_f1": 0.6601307392120361, + "num_tokens": 10725709.0, + "repeat_count": 1.0, + "routers_loss": 0.04308273270726204, + "skip_count": 2.0, + "step": 6650, + "text_loss": 0.1875772923231125 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 31.230114470208395, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003365413596192243, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 10728717.0, + "repeat_count": 2.0, + "routers_loss": 0.006372809875756502, + "skip_count": 1.0, + "step": 6652, + "text_loss": 0.4948291778564453 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00033624888145654137, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10732082.0, + "repeat_count": 0.0, + "routers_loss": 0.0014530479675158858, + "skip_count": 0.0, + "step": 6654, + "text_loss": 0.44932305812835693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.248899324919282, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00033595646604359585, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10734663.0, + "repeat_count": 0.0, + "routers_loss": 0.001924810465425253, + "skip_count": 0.0, + "step": 6656, + "text_loss": 0.45626893639564514 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00033566411349244206, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 10737470.0, + "repeat_count": 1.0, + "routers_loss": 0.0040014320984482765, + "skip_count": 0.0, + "step": 6658, + "text_loss": 0.2700682580471039 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.267684179630173, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00033537182391510996, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10740228.0, + "repeat_count": 0.0, + "routers_loss": 0.0008573737577535212, + "skip_count": 0.0, + "step": 6660, + "text_loss": 0.5626822113990784 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.277076606985617, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003350795974236055, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 10742883.0, + "repeat_count": 0.0, + "routers_loss": 0.011166860349476337, + "skip_count": 1.0, + "step": 6662, + "text_loss": 0.23357805609703064 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 31.286469034341064, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00033478743412991037, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 10746459.0, + "repeat_count": 1.0, + "routers_loss": 0.01719980500638485, + "skip_count": 6.0, + "step": 6664, + "text_loss": 0.150017648935318 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.295861461696507, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00033449533414598223, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 10749984.0, + "repeat_count": 0.0, + "routers_loss": 0.0038280142471194267, + "skip_count": 2.0, + "step": 6666, + "text_loss": 0.6312657594680786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.30525388905195, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00033420329758375423, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 10752792.0, + "repeat_count": 0.0, + "routers_loss": 0.0007688060286454856, + "skip_count": 1.0, + "step": 6668, + "text_loss": 0.6794863939285278 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.314646316407398, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00033391132455513537, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10756125.0, + "repeat_count": 0.0, + "routers_loss": 0.003196930279955268, + "skip_count": 2.0, + "step": 6670, + "text_loss": 0.22897565364837646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0003336194151720102, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 10759296.0, + "repeat_count": 0.0, + "routers_loss": 0.0026212623342871666, + "skip_count": 0.0, + "step": 6672, + "text_loss": 0.5236268639564514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0003333275695462391, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10762574.0, + "repeat_count": 0.0, + "routers_loss": 0.007855101488530636, + "skip_count": 2.0, + "step": 6674, + "text_loss": 0.2971038818359375 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.342823598473732, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0003330357877896577, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 10765758.0, + "repeat_count": 0.0, + "routers_loss": 0.004191791173070669, + "skip_count": 2.0, + "step": 6676, + "text_loss": 0.17358586192131042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.352216025829176, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0003327440700140774, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 10769396.0, + "repeat_count": 0.0, + "routers_loss": 0.004101858474314213, + "skip_count": 1.0, + "step": 6678, + "text_loss": 0.28932204842567444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.000332452416331285, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 10772605.0, + "repeat_count": 0.0, + "routers_loss": 0.0008305918308906257, + "skip_count": 0.0, + "step": 6680, + "text_loss": 0.47090092301368713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0003321608268530427, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 10776576.0, + "repeat_count": 0.0, + "routers_loss": 0.003022305201739073, + "skip_count": 1.0, + "step": 6682, + "text_loss": 0.4467788338661194 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00033186930169108795, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10779648.0, + "repeat_count": 1.0, + "routers_loss": 0.0021474999375641346, + "skip_count": 0.0, + "step": 6684, + "text_loss": 0.6249470710754395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.389785735250953, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.054931640625, + "learning_rate": 0.00033157784095713417, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 10782665.0, + "repeat_count": 0.0, + "routers_loss": 0.0025120675563812256, + "skip_count": 1.0, + "step": 6686, + "text_loss": 0.6763803958892822 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.399178162606397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0003312864447628695, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 10785789.0, + "repeat_count": 0.0, + "routers_loss": 0.0013111691223457456, + "skip_count": 1.0, + "step": 6688, + "text_loss": 0.6609058380126953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.408570589961844, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00033099511321995744, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 10788846.0, + "repeat_count": 0.0, + "routers_loss": 0.0012354454956948757, + "skip_count": 0.0, + "step": 6690, + "text_loss": 0.4421829283237457 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.417963017317287, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0003307038464400368, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 10791611.0, + "repeat_count": 0.0, + "routers_loss": 0.0035219944547861814, + "skip_count": 2.0, + "step": 6692, + "text_loss": 0.16222824156284332 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.42735544467273, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00033041264453472153, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 10794868.0, + "repeat_count": 1.0, + "routers_loss": 0.0007216202793642879, + "skip_count": 0.0, + "step": 6694, + "text_loss": 0.37388721108436584 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 31.436747872028178, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.0003301215076156008, + "loss": 0.0063, + "macro_f1": 0.8803418874740601, + "num_tokens": 10797737.0, + "repeat_count": 2.0, + "routers_loss": 0.025403080508112907, + "skip_count": 7.0, + "step": 6696, + "text_loss": 0.5086690187454224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0003298304357942389, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 10800972.0, + "repeat_count": 0.0, + "routers_loss": 0.010532539337873459, + "skip_count": 2.0, + "step": 6698, + "text_loss": 0.22500646114349365 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00032953942918217494, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 10803654.0, + "repeat_count": 0.0, + "routers_loss": 0.0009591903653927147, + "skip_count": 0.0, + "step": 6700, + "text_loss": 0.6256277561187744 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.464925154094512, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0003292484878909232, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10807506.0, + "repeat_count": 0.0, + "routers_loss": 0.003801517654210329, + "skip_count": 2.0, + "step": 6702, + "text_loss": 0.522081196308136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.474317581449956, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00032895761203197317, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 10810163.0, + "repeat_count": 0.0, + "routers_loss": 0.002608039416372776, + "skip_count": 2.0, + "step": 6704, + "text_loss": 0.3600201904773712 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00032866680171678874, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10813202.0, + "repeat_count": 0.0, + "routers_loss": 0.0026464913971722126, + "skip_count": 0.0, + "step": 6706, + "text_loss": 0.2513798773288727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.493102436160846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00032837605705680895, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 10816484.0, + "repeat_count": 0.0, + "routers_loss": 0.0027157769072800875, + "skip_count": 0.0, + "step": 6708, + "text_loss": 0.34391456842422485 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.047607421875, + "learning_rate": 0.0003280853781634481, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 10819794.0, + "repeat_count": 1.0, + "routers_loss": 0.0016086180694401264, + "skip_count": 1.0, + "step": 6710, + "text_loss": 0.6535179615020752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003277947651480946, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 10823033.0, + "repeat_count": 0.0, + "routers_loss": 0.002368347719311714, + "skip_count": 0.0, + "step": 6712, + "text_loss": 0.5596423745155334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0003275042181221119, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 10826276.0, + "repeat_count": 0.0, + "routers_loss": 0.003124286886304617, + "skip_count": 0.0, + "step": 6714, + "text_loss": 0.6584402322769165 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.530672145582624, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0003272137371968382, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 10828846.0, + "repeat_count": 0.0, + "routers_loss": 0.0006088328082114458, + "skip_count": 0.0, + "step": 6716, + "text_loss": 0.4602710008621216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.540064572938068, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.00032692332248358645, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10832025.0, + "repeat_count": 0.0, + "routers_loss": 0.002511275466531515, + "skip_count": 2.0, + "step": 6718, + "text_loss": 0.42790886759757996 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.000326632974093644, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 10835110.0, + "repeat_count": 1.0, + "routers_loss": 0.01076667383313179, + "skip_count": 0.0, + "step": 6720, + "text_loss": 0.5659847855567932 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 0.0003263426921382728, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 10838279.0, + "repeat_count": 2.0, + "routers_loss": 0.004973042290657759, + "skip_count": 2.0, + "step": 6722, + "text_loss": 0.675341010093689 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.568241855004402, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.00032605247672870964, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 10841381.0, + "repeat_count": 0.0, + "routers_loss": 0.0013990222942084074, + "skip_count": 0.0, + "step": 6724, + "text_loss": 0.5389315485954285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.57763428235985, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00032576232797616554, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 10844583.0, + "repeat_count": 0.0, + "routers_loss": 0.003186358604580164, + "skip_count": 1.0, + "step": 6726, + "text_loss": 0.5603348016738892 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.587026709715293, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003254722459918261, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 10847670.0, + "repeat_count": 0.0, + "routers_loss": 0.001443870598450303, + "skip_count": 0.0, + "step": 6728, + "text_loss": 0.6922405362129211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.596419137070736, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0003251822308868512, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 10851479.0, + "repeat_count": 0.0, + "routers_loss": 0.004294445738196373, + "skip_count": 0.0, + "step": 6730, + "text_loss": 0.7145437002182007 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.00032489228277237514, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10854489.0, + "repeat_count": 0.0, + "routers_loss": 0.0032078945077955723, + "skip_count": 0.0, + "step": 6732, + "text_loss": 0.4077773094177246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.615203991781627, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.00032460240175950664, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 10856954.0, + "repeat_count": 1.0, + "routers_loss": 0.0038214854430407286, + "skip_count": 2.0, + "step": 6734, + "text_loss": 0.32071781158447266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0003243125879593286, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 10860016.0, + "repeat_count": 0.0, + "routers_loss": 0.0013407845981419086, + "skip_count": 0.0, + "step": 6736, + "text_loss": 0.45335495471954346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0003240228414828984, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 10863021.0, + "repeat_count": 0.0, + "routers_loss": 0.0010989385191351175, + "skip_count": 0.0, + "step": 6738, + "text_loss": 0.562619149684906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.0003237331624412473, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 10866548.0, + "repeat_count": 0.0, + "routers_loss": 0.006139552686363459, + "skip_count": 0.0, + "step": 6740, + "text_loss": 0.14510060846805573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.652773701203404, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00032344355094538087, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10869402.0, + "repeat_count": 0.0, + "routers_loss": 0.004785746335983276, + "skip_count": 0.0, + "step": 6742, + "text_loss": 0.5655979514122009 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.662166128558848, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00032315400710627876, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 10874165.0, + "repeat_count": 0.0, + "routers_loss": 0.0052397786639630795, + "skip_count": 0.0, + "step": 6744, + "text_loss": 0.4785873591899872 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 31.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0003228645310348948, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 10876919.0, + "repeat_count": 3.0, + "routers_loss": 0.00460197776556015, + "skip_count": 1.0, + "step": 6746, + "text_loss": 0.5683879256248474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.0003225751228421566, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10880179.0, + "repeat_count": 0.0, + "routers_loss": 0.0032690472435206175, + "skip_count": 0.0, + "step": 6748, + "text_loss": 0.5268497467041016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.690343410625182, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.00032228578263896607, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 10883711.0, + "repeat_count": 0.0, + "routers_loss": 0.0036305058747529984, + "skip_count": 0.0, + "step": 6750, + "text_loss": 0.16675594449043274 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.69973583798063, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.0003219965105361989, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10887041.0, + "repeat_count": 0.0, + "routers_loss": 0.002453352091833949, + "skip_count": 1.0, + "step": 6752, + "text_loss": 0.7010246515274048 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.709128265336073, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.00032170730664470465, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 10890053.0, + "repeat_count": 0.0, + "routers_loss": 0.0020381701178848743, + "skip_count": 0.0, + "step": 6754, + "text_loss": 0.46637895703315735 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.718520692691516, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003214181710753069, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 10893501.0, + "repeat_count": 0.0, + "routers_loss": 0.004525696858763695, + "skip_count": 0.0, + "step": 6756, + "text_loss": 0.1768684983253479 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.727913120046964, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003211291039388026, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 10896480.0, + "repeat_count": 1.0, + "routers_loss": 0.0038154330104589462, + "skip_count": 0.0, + "step": 6758, + "text_loss": 0.7908347845077515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.737305547402407, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.00032084010534596326, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 10899158.0, + "repeat_count": 0.0, + "routers_loss": 0.004711449146270752, + "skip_count": 2.0, + "step": 6760, + "text_loss": 0.37209007143974304 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0003205511754075335, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 10901791.0, + "repeat_count": 1.0, + "routers_loss": 0.0025003373157233, + "skip_count": 1.0, + "step": 6762, + "text_loss": 0.8081201314926147 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 31.756090402113298, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00032026231423423204, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 10904817.0, + "repeat_count": 0.0, + "routers_loss": 0.007387075573205948, + "skip_count": 3.0, + "step": 6764, + "text_loss": 0.30355480313301086 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.76548282946874, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0003199735219367507, + "loss": 0.0061, + "macro_f1": 0.5492662787437439, + "num_tokens": 10908018.0, + "repeat_count": 2.0, + "routers_loss": 0.04275592789053917, + "skip_count": 0.0, + "step": 6766, + "text_loss": 0.26562029123306274 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.774875256824185, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0003196847986257553, + "loss": 0.008, + "macro_f1": 0.9255813956260681, + "num_tokens": 10911264.0, + "repeat_count": 3.0, + "routers_loss": 0.034824032336473465, + "skip_count": 4.0, + "step": 6768, + "text_loss": 0.2761698067188263 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.784267684179632, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00031939614441188523, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 10915964.0, + "repeat_count": 0.0, + "routers_loss": 0.0011179742868989706, + "skip_count": 0.0, + "step": 6770, + "text_loss": 0.4107927083969116 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00031910755940575344, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 10918678.0, + "repeat_count": 0.0, + "routers_loss": 0.0011521469568833709, + "skip_count": 0.0, + "step": 6772, + "text_loss": 0.43064895272254944 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 31.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.000318819043717946, + "loss": 0.0055, + "macro_f1": 1.0, + "num_tokens": 10921757.0, + "repeat_count": 1.0, + "routers_loss": 0.002861087443307042, + "skip_count": 1.0, + "step": 6774, + "text_loss": 0.5945150852203369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.812444966245963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0003185305974590229, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 10924767.0, + "repeat_count": 0.0, + "routers_loss": 0.0011365334503352642, + "skip_count": 0.0, + "step": 6776, + "text_loss": 0.36615172028541565 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 31.82183739360141, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0003182422207395171, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 10927750.0, + "repeat_count": 1.0, + "routers_loss": 0.0034391419030725956, + "skip_count": 0.0, + "step": 6778, + "text_loss": 0.17081251740455627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.831229820956853, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003179539136699351, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 10930817.0, + "repeat_count": 0.0, + "routers_loss": 0.004941808991134167, + "skip_count": 2.0, + "step": 6780, + "text_loss": 0.7683762311935425 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 31.840622248312297, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.038330078125, + "learning_rate": 0.00031766567636075675, + "loss": 0.0061, + "macro_f1": 0.8823530077934265, + "num_tokens": 10933882.0, + "repeat_count": 1.0, + "routers_loss": 0.017502857372164726, + "skip_count": 2.0, + "step": 6782, + "text_loss": 0.38010457158088684 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0003173775089224353, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 10936909.0, + "repeat_count": 1.0, + "routers_loss": 0.0035372809506952763, + "skip_count": 2.0, + "step": 6784, + "text_loss": 0.5760656595230103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.859407103023187, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00031708941146539707, + "loss": 0.0061, + "macro_f1": 0.3272727429866791, + "num_tokens": 10940032.0, + "repeat_count": 1.0, + "routers_loss": 0.02229934185743332, + "skip_count": 0.0, + "step": 6786, + "text_loss": 0.5767728090286255 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00031680138410004123, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 10943217.0, + "repeat_count": 0.0, + "routers_loss": 0.0028649091254919767, + "skip_count": 1.0, + "step": 6788, + "text_loss": 0.9756367802619934 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.878191957734078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00031651342693674066, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 10947847.0, + "repeat_count": 0.0, + "routers_loss": 0.0039158593863248825, + "skip_count": 2.0, + "step": 6790, + "text_loss": 0.2504335045814514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.88758438508952, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.000316225540085841, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 10950879.0, + "repeat_count": 0.0, + "routers_loss": 0.0022091215942054987, + "skip_count": 0.0, + "step": 6792, + "text_loss": 0.525842547416687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.896976812444965, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00031593772365766105, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 10954960.0, + "repeat_count": 0.0, + "routers_loss": 0.0006841494468972087, + "skip_count": 0.0, + "step": 6794, + "text_loss": 0.6383582353591919 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 31.906369239800412, + "f1_execute": 0.9729729890823364, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003156499777624926, + "loss": 0.006, + "macro_f1": 0.9539539813995361, + "num_tokens": 10958278.0, + "repeat_count": 5.0, + "routers_loss": 0.03810702636837959, + "skip_count": 5.0, + "step": 6796, + "text_loss": 0.5901661515235901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.0003153623025106005, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 10962412.0, + "repeat_count": 0.0, + "routers_loss": 0.00046833412488922477, + "skip_count": 0.0, + "step": 6798, + "text_loss": 0.42693984508514404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00031507469801222233, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 10966037.0, + "repeat_count": 0.0, + "routers_loss": 0.006818041671067476, + "skip_count": 2.0, + "step": 6800, + "text_loss": 0.5326262712478638 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.934546521866746, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00031478716437756876, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 10969369.0, + "repeat_count": 0.0, + "routers_loss": 0.0029889161232858896, + "skip_count": 0.0, + "step": 6802, + "text_loss": 0.49028220772743225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.94393894922219, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0003144997017168232, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 10972016.0, + "repeat_count": 0.0, + "routers_loss": 0.0038266500923782587, + "skip_count": 2.0, + "step": 6804, + "text_loss": 0.43391722440719604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.953331376577633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0003142123101401417, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10975153.0, + "repeat_count": 0.0, + "routers_loss": 0.0005866789724677801, + "skip_count": 0.0, + "step": 6806, + "text_loss": 0.5888382196426392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00031392498975765353, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 10977881.0, + "repeat_count": 0.0, + "routers_loss": 0.002122384263202548, + "skip_count": 0.0, + "step": 6808, + "text_loss": 0.30313390493392944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0003136377406794604, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 10982025.0, + "repeat_count": 0.0, + "routers_loss": 0.0005535652744583786, + "skip_count": 0.0, + "step": 6810, + "text_loss": 0.5788959264755249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 31.981508658643968, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0003133505630156365, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 10985419.0, + "repeat_count": 0.0, + "routers_loss": 0.010623604990541935, + "skip_count": 2.0, + "step": 6812, + "text_loss": 0.18577243387699127 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 31.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.00031306345687622905, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 10989116.0, + "repeat_count": 0.0, + "routers_loss": 0.0004721239674836397, + "skip_count": 0.0, + "step": 6814, + "text_loss": 0.4818301200866699 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0167236328125, + "learning_rate": 0.0003127764223712575, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 10992064.0, + "repeat_count": 0.0, + "routers_loss": 0.0004238430701661855, + "skip_count": 0.0, + "step": 6816, + "text_loss": 0.7482771277427673 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003124894596107141, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 10994903.0, + "repeat_count": 1.0, + "routers_loss": 0.005224394146353006, + "skip_count": 2.0, + "step": 6818, + "text_loss": 0.186603844165802 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.00031220256870456356, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 10998692.0, + "repeat_count": 1.0, + "routers_loss": 0.0021751862950623035, + "skip_count": 2.0, + "step": 6820, + "text_loss": 0.45633986592292786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 32.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.00031191574976274284, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11001284.0, + "repeat_count": 0.0, + "routers_loss": 0.004747046157717705, + "skip_count": 4.0, + "step": 6822, + "text_loss": 0.5651670694351196 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0003116290028951617, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 11004293.0, + "repeat_count": 0.0, + "routers_loss": 0.0008316585444845259, + "skip_count": 0.0, + "step": 6824, + "text_loss": 0.3167279362678528 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.055419921875, + "learning_rate": 0.000311342328211702, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11007080.0, + "repeat_count": 0.0, + "routers_loss": 0.0004732926026917994, + "skip_count": 0.0, + "step": 6826, + "text_loss": 0.49171411991119385 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000311055725822218, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 11010078.0, + "repeat_count": 1.0, + "routers_loss": 0.004238729365170002, + "skip_count": 0.0, + "step": 6828, + "text_loss": 0.21484950184822083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0003107691958365361, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 11013368.0, + "repeat_count": 0.0, + "routers_loss": 0.0029175232630223036, + "skip_count": 2.0, + "step": 6830, + "text_loss": 0.3718266189098358 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0003104827383644555, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11016704.0, + "repeat_count": 0.0, + "routers_loss": 0.00191891985014081, + "skip_count": 0.0, + "step": 6832, + "text_loss": 0.28772637248039246 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00031019635351574705, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 11019651.0, + "repeat_count": 0.0, + "routers_loss": 0.004300855100154877, + "skip_count": 2.0, + "step": 6834, + "text_loss": 0.6583508849143982 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.000309910041400154, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11023847.0, + "repeat_count": 0.0, + "routers_loss": 0.00037701442488469183, + "skip_count": 0.0, + "step": 6836, + "text_loss": 0.36090534925460815 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 32.10331670090989, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0003096238021273917, + "loss": 0.0077, + "macro_f1": 0.9265305995941162, + "num_tokens": 11027804.0, + "repeat_count": 1.0, + "routers_loss": 0.03601725772023201, + "skip_count": 3.0, + "step": 6838, + "text_loss": 0.24180401861667633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.11270912826534, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00030933763580714757, + "loss": 0.0052, + "macro_f1": 0.6601307392120361, + "num_tokens": 11030778.0, + "repeat_count": 1.0, + "routers_loss": 0.023780640214681625, + "skip_count": 2.0, + "step": 6840, + "text_loss": 0.4978102743625641 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00030905154254908104, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 11034863.0, + "repeat_count": 1.0, + "routers_loss": 0.00565778324380517, + "skip_count": 0.0, + "step": 6842, + "text_loss": 0.558772623538971 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00030876552246282356, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11038488.0, + "repeat_count": 0.0, + "routers_loss": 0.010575232096016407, + "skip_count": 0.0, + "step": 6844, + "text_loss": 0.2955974340438843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.0003084795756579787, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11041796.0, + "repeat_count": 0.0, + "routers_loss": 0.0015910190995782614, + "skip_count": 0.0, + "step": 6846, + "text_loss": 0.5009704828262329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0003081937022441217, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 11045141.0, + "repeat_count": 0.0, + "routers_loss": 0.0008034126949496567, + "skip_count": 0.0, + "step": 6848, + "text_loss": 0.3965311646461487 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 32.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0003079079023307999, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11047814.0, + "repeat_count": 2.0, + "routers_loss": 0.00810160581022501, + "skip_count": 0.0, + "step": 6850, + "text_loss": 0.24341927468776703 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0003076221760275321, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 11051330.0, + "repeat_count": 1.0, + "routers_loss": 0.006590691395103931, + "skip_count": 0.0, + "step": 6852, + "text_loss": 0.5887606739997864 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00030733652344380936, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11055006.0, + "repeat_count": 0.0, + "routers_loss": 0.0005845054984092712, + "skip_count": 0.0, + "step": 6854, + "text_loss": 0.6621366739273071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003070509446890944, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 11058470.0, + "repeat_count": 0.0, + "routers_loss": 0.0041051446460187435, + "skip_count": 1.0, + "step": 6856, + "text_loss": 0.31603100895881653 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0003067654398728214, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 11061620.0, + "repeat_count": 1.0, + "routers_loss": 0.001603201380930841, + "skip_count": 0.0, + "step": 6858, + "text_loss": 0.5167516469955444 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.00030648000910439636, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11064727.0, + "repeat_count": 0.0, + "routers_loss": 0.0024816282093524933, + "skip_count": 0.0, + "step": 6860, + "text_loss": 0.5869330167770386 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00030619465249319693, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11068208.0, + "repeat_count": 1.0, + "routers_loss": 0.003121294779703021, + "skip_count": 0.0, + "step": 6862, + "text_loss": 0.3920222818851471 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.0003059093701485722, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11071315.0, + "repeat_count": 0.0, + "routers_loss": 0.0033239589538425207, + "skip_count": 1.0, + "step": 6864, + "text_loss": 0.4201887845993042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00030562416217984296, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 11074144.0, + "repeat_count": 0.0, + "routers_loss": 0.0016117560444399714, + "skip_count": 0.0, + "step": 6866, + "text_loss": 0.5283045172691345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.0003053390286963015, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11077152.0, + "repeat_count": 0.0, + "routers_loss": 0.003879208816215396, + "skip_count": 0.0, + "step": 6868, + "text_loss": 0.16188788414001465 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00030505396980721143, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 11080200.0, + "repeat_count": 0.0, + "routers_loss": 0.007632353343069553, + "skip_count": 1.0, + "step": 6870, + "text_loss": 0.25986847281455994 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00030476898562180793, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 11083356.0, + "repeat_count": 0.0, + "routers_loss": 0.004322016146034002, + "skip_count": 2.0, + "step": 6872, + "text_loss": 0.49556297063827515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0003044840762492974, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 11086354.0, + "repeat_count": 0.0, + "routers_loss": 0.0031272871419787407, + "skip_count": 2.0, + "step": 6874, + "text_loss": 0.1658666580915451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003041992417988577, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11088850.0, + "repeat_count": 0.0, + "routers_loss": 0.005371398758143187, + "skip_count": 2.0, + "step": 6876, + "text_loss": 0.22437214851379395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0003039144823796378, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 11091784.0, + "repeat_count": 0.0, + "routers_loss": 0.0025086402893066406, + "skip_count": 0.0, + "step": 6878, + "text_loss": 0.7293354868888855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0003036297981007581, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11095204.0, + "repeat_count": 0.0, + "routers_loss": 0.015590827912092209, + "skip_count": 1.0, + "step": 6880, + "text_loss": 0.6406328678131104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0003033451890713103, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11098367.0, + "repeat_count": 0.0, + "routers_loss": 0.0013142531970515847, + "skip_count": 0.0, + "step": 6882, + "text_loss": 0.5209086537361145 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 32.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003030606554003571, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 11101047.0, + "repeat_count": 2.0, + "routers_loss": 0.0018484699539840221, + "skip_count": 0.0, + "step": 6884, + "text_loss": 0.743188202381134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00030277619719693217, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11104269.0, + "repeat_count": 0.0, + "routers_loss": 0.0016667681047692895, + "skip_count": 0.0, + "step": 6886, + "text_loss": 0.7918420433998108 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0003024918145700406, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 11107248.0, + "repeat_count": 0.0, + "routers_loss": 0.0008098077378235757, + "skip_count": 0.0, + "step": 6888, + "text_loss": 0.3871288299560547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0003022075076286582, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 11111204.0, + "repeat_count": 0.0, + "routers_loss": 0.002324736909940839, + "skip_count": 0.0, + "step": 6890, + "text_loss": 0.3722921907901764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0003019232764817321, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 11114363.0, + "repeat_count": 0.0, + "routers_loss": 0.00254769716411829, + "skip_count": 0.0, + "step": 6892, + "text_loss": 0.418519526720047 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00030163912123818006, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11117718.0, + "repeat_count": 0.0, + "routers_loss": 0.000547234492842108, + "skip_count": 0.0, + "step": 6894, + "text_loss": 0.6087009310722351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.0003013550420068909, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 11120437.0, + "repeat_count": 0.0, + "routers_loss": 0.00015221568173728883, + "skip_count": 0.0, + "step": 6896, + "text_loss": 0.6013991832733154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 32.385089521573235, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.046142578125, + "learning_rate": 0.00030107103889672436, + "loss": 0.0085, + "macro_f1": 0.5492662787437439, + "num_tokens": 11123708.0, + "repeat_count": 0.0, + "routers_loss": 0.024048971012234688, + "skip_count": 2.0, + "step": 6898, + "text_loss": 0.3612423837184906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0003007871120165111, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 11127294.0, + "repeat_count": 0.0, + "routers_loss": 0.0013236473314464092, + "skip_count": 0.0, + "step": 6900, + "text_loss": 0.5277031064033508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00030050326147505226, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11130270.0, + "repeat_count": 0.0, + "routers_loss": 0.0028277861420065165, + "skip_count": 0.0, + "step": 6902, + "text_loss": 0.5726971626281738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0003002194873811197, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11132955.0, + "repeat_count": 0.0, + "routers_loss": 0.0022369837388396263, + "skip_count": 0.0, + "step": 6904, + "text_loss": 0.18510448932647705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00029993578984345673, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 11136387.0, + "repeat_count": 0.0, + "routers_loss": 0.0038351211696863174, + "skip_count": 0.0, + "step": 6906, + "text_loss": 0.28313153982162476 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0002996521689707764, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 11139740.0, + "repeat_count": 0.0, + "routers_loss": 0.00032925375853665173, + "skip_count": 0.0, + "step": 6908, + "text_loss": 0.7315025329589844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0002993686248717629, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 11142587.0, + "repeat_count": 0.0, + "routers_loss": 0.002886304398998618, + "skip_count": 0.0, + "step": 6910, + "text_loss": 0.677378237247467 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00029908515765507084, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 11145415.0, + "repeat_count": 1.0, + "routers_loss": 0.0038471966981887817, + "skip_count": 0.0, + "step": 6912, + "text_loss": 0.5207083225250244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0002988017674293254, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 11148524.0, + "repeat_count": 0.0, + "routers_loss": 0.0023522782139480114, + "skip_count": 0.0, + "step": 6914, + "text_loss": 0.42507871985435486 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0002985184543031222, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11152069.0, + "repeat_count": 0.0, + "routers_loss": 0.0012464249739423394, + "skip_count": 0.0, + "step": 6916, + "text_loss": 0.5694169998168945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.0002982352183850274, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 11155675.0, + "repeat_count": 0.0, + "routers_loss": 0.00828156154602766, + "skip_count": 2.0, + "step": 6918, + "text_loss": 0.22304373979568481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.00029795205978357754, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11158555.0, + "repeat_count": 0.0, + "routers_loss": 0.0019234733190387487, + "skip_count": 0.0, + "step": 6920, + "text_loss": 0.5519064664840698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.0002976689786072795, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11161407.0, + "repeat_count": 0.0, + "routers_loss": 0.0003542431222740561, + "skip_count": 0.0, + "step": 6922, + "text_loss": 0.6748810410499573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0002973859749646104, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11166007.0, + "repeat_count": 0.0, + "routers_loss": 0.0004024899681098759, + "skip_count": 0.0, + "step": 6924, + "text_loss": 0.6613664627075195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 32.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.000297103048964018, + "loss": 0.0076, + "macro_f1": 0.6666666865348816, + "num_tokens": 11169007.0, + "repeat_count": 0.0, + "routers_loss": 0.005519595462828875, + "skip_count": 3.0, + "step": 6926, + "text_loss": 0.3815552592277527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00029682020071392, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11172939.0, + "repeat_count": 0.0, + "routers_loss": 0.0016999440267682076, + "skip_count": 0.0, + "step": 6928, + "text_loss": 0.6727893352508545 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.535368359260346, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0002965374303227044, + "loss": 0.0055, + "macro_f1": 0.5492662787437439, + "num_tokens": 11176232.0, + "repeat_count": 2.0, + "routers_loss": 0.030950307846069336, + "skip_count": 0.0, + "step": 6930, + "text_loss": 0.5577763915061951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00029625473789872923, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11179775.0, + "repeat_count": 0.0, + "routers_loss": 0.00525702815502882, + "skip_count": 1.0, + "step": 6932, + "text_loss": 0.5860039591789246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.000295972123550323, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 11183262.0, + "repeat_count": 1.0, + "routers_loss": 0.0048187971115112305, + "skip_count": 2.0, + "step": 6934, + "text_loss": 0.7328732013702393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.016357421875, + "learning_rate": 0.00029568958738578364, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11186591.0, + "repeat_count": 0.0, + "routers_loss": 0.0015159632312133908, + "skip_count": 0.0, + "step": 6936, + "text_loss": 0.40563541650772095 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 32.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.017333984375, + "learning_rate": 0.0002954071295133801, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 11190056.0, + "repeat_count": 1.0, + "routers_loss": 0.011282073333859444, + "skip_count": 1.0, + "step": 6938, + "text_loss": 0.15986496210098267 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.062255859375, + "learning_rate": 0.0002951247500413504, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 11193504.0, + "repeat_count": 3.0, + "routers_loss": 0.010220487602055073, + "skip_count": 5.0, + "step": 6940, + "text_loss": 0.2604432702064514 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0002948424490779029, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 11196725.0, + "repeat_count": 0.0, + "routers_loss": 0.002620660001412034, + "skip_count": 1.0, + "step": 6942, + "text_loss": 0.48028868436813354 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00029456022673121597, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11199303.0, + "repeat_count": 0.0, + "routers_loss": 0.00042651945841498673, + "skip_count": 0.0, + "step": 6944, + "text_loss": 0.5135554671287537 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0002942780831094377, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11202319.0, + "repeat_count": 0.0, + "routers_loss": 0.005366047378629446, + "skip_count": 2.0, + "step": 6946, + "text_loss": 0.2809196710586548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0002939960183206861, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 11205622.0, + "repeat_count": 0.0, + "routers_loss": 0.0033479216508567333, + "skip_count": 0.0, + "step": 6948, + "text_loss": 0.2013140618801117 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00029371403247304887, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 11208637.0, + "repeat_count": 1.0, + "routers_loss": 0.0013508419506251812, + "skip_count": 0.0, + "step": 6950, + "text_loss": 0.4427332580089569 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0002934321256745833, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11211618.0, + "repeat_count": 0.0, + "routers_loss": 0.0020944071002304554, + "skip_count": 0.0, + "step": 6952, + "text_loss": 0.5406652688980103 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00029315029803331704, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11214432.0, + "repeat_count": 0.0, + "routers_loss": 0.0012655078899115324, + "skip_count": 0.0, + "step": 6954, + "text_loss": 0.7720552086830139 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.00029286854965724686, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 11218127.0, + "repeat_count": 0.0, + "routers_loss": 0.009041395038366318, + "skip_count": 0.0, + "step": 6956, + "text_loss": 0.258109986782074 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 32.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0002925868806543391, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 11221440.0, + "repeat_count": 1.0, + "routers_loss": 0.0034558263141661882, + "skip_count": 1.0, + "step": 6958, + "text_loss": 0.5378029942512512 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00029230529113253, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 11225391.0, + "repeat_count": 0.0, + "routers_loss": 0.005263930186629295, + "skip_count": 2.0, + "step": 6960, + "text_loss": 0.3616539537906647 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0002920237811997251, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 11228648.0, + "repeat_count": 0.0, + "routers_loss": 0.003730480559170246, + "skip_count": 1.0, + "step": 6962, + "text_loss": 0.46682238578796387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.00029174235096379963, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11231828.0, + "repeat_count": 0.0, + "routers_loss": 0.004831735976040363, + "skip_count": 1.0, + "step": 6964, + "text_loss": 0.5718355178833008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 32.70443205165835, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.046875, + "learning_rate": 0.0002914610005325981, + "loss": 0.0102, + "macro_f1": 0.5492662787437439, + "num_tokens": 11234984.0, + "repeat_count": 0.0, + "routers_loss": 0.03880132734775543, + "skip_count": 2.0, + "step": 6966, + "text_loss": 0.3139013946056366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0002911797300139345, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 11239153.0, + "repeat_count": 0.0, + "routers_loss": 0.0006673726020380855, + "skip_count": 0.0, + "step": 6968, + "text_loss": 0.6040399074554443 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00029089853951559235, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11242178.0, + "repeat_count": 1.0, + "routers_loss": 0.0028971200808882713, + "skip_count": 0.0, + "step": 6970, + "text_loss": 0.304967999458313 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00029061742914532427, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11245865.0, + "repeat_count": 0.0, + "routers_loss": 0.0010410466929897666, + "skip_count": 0.0, + "step": 6972, + "text_loss": 0.47892290353775024 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.0002903363990108524, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11248806.0, + "repeat_count": 0.0, + "routers_loss": 0.002133697969838977, + "skip_count": 0.0, + "step": 6974, + "text_loss": 0.2561415433883667 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 32.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.0002900554492198677, + "loss": 0.011, + "macro_f1": 0.6666666865348816, + "num_tokens": 11251807.0, + "repeat_count": 2.0, + "routers_loss": 0.002402493730187416, + "skip_count": 0.0, + "step": 6976, + "text_loss": 0.652428388595581 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0002897745798800311, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 11254615.0, + "repeat_count": 1.0, + "routers_loss": 0.006423915736377239, + "skip_count": 0.0, + "step": 6978, + "text_loss": 0.22414511442184448 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.000289493791098972, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 11257721.0, + "repeat_count": 0.0, + "routers_loss": 0.002536606043577194, + "skip_count": 0.0, + "step": 6980, + "text_loss": 0.1328018754720688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00028921308298428933, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11260840.0, + "repeat_count": 0.0, + "routers_loss": 0.000745086173992604, + "skip_count": 0.0, + "step": 6982, + "text_loss": 0.61724853515625 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05029296875, + "learning_rate": 0.0002889324556435509, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 11264279.0, + "repeat_count": 0.0, + "routers_loss": 0.005258981604129076, + "skip_count": 0.0, + "step": 6984, + "text_loss": 0.1664455235004425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00028865190918429356, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 11268096.0, + "repeat_count": 0.0, + "routers_loss": 0.0008756023598834872, + "skip_count": 0.0, + "step": 6986, + "text_loss": 0.45111921429634094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.00028837144371402336, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11270611.0, + "repeat_count": 0.0, + "routers_loss": 0.0008175788098014891, + "skip_count": 0.0, + "step": 6988, + "text_loss": 0.5332239270210266 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00028809105934021517, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11273826.0, + "repeat_count": 0.0, + "routers_loss": 0.003494064789265394, + "skip_count": 0.0, + "step": 6990, + "text_loss": 0.20264241099357605 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.82653360727913, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0002878107561703127, + "loss": 0.0056, + "macro_f1": 0.8817967176437378, + "num_tokens": 11276917.0, + "repeat_count": 2.0, + "routers_loss": 0.025257345288991928, + "skip_count": 3.0, + "step": 6992, + "text_loss": 0.18000070750713348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.835926034634575, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.0002875305343117289, + "loss": 0.0044, + "macro_f1": 0.6603773832321167, + "num_tokens": 11279637.0, + "repeat_count": 1.0, + "routers_loss": 0.019206687808036804, + "skip_count": 1.0, + "step": 6994, + "text_loss": 0.5872798562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00028725039387184504, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11282717.0, + "repeat_count": 0.0, + "routers_loss": 0.009358765557408333, + "skip_count": 1.0, + "step": 6996, + "text_loss": 0.3412095904350281 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 32.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00028697033495801163, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 11285433.0, + "repeat_count": 1.0, + "routers_loss": 0.0038775671273469925, + "skip_count": 1.0, + "step": 6998, + "text_loss": 0.4316727817058563 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 32.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0002866903576775475, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11288414.0, + "repeat_count": 1.0, + "routers_loss": 0.004292591474950314, + "skip_count": 0.0, + "step": 7000, + "text_loss": 0.45106515288352966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.873495744056356, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.046875, + "learning_rate": 0.0002864104621377409, + "loss": 0.007, + "macro_f1": 0.6601307392120361, + "num_tokens": 11291811.0, + "repeat_count": 1.0, + "routers_loss": 0.02195967361330986, + "skip_count": 2.0, + "step": 7002, + "text_loss": 0.29841285943984985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0002861306484458481, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11295179.0, + "repeat_count": 0.0, + "routers_loss": 0.0010119527578353882, + "skip_count": 0.0, + "step": 7004, + "text_loss": 0.5218569040298462 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00028585091670909436, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11298182.0, + "repeat_count": 0.0, + "routers_loss": 0.002615996403619647, + "skip_count": 0.0, + "step": 7006, + "text_loss": 0.20382621884346008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00028557126703467316, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 11301262.0, + "repeat_count": 0.0, + "routers_loss": 0.002726050792261958, + "skip_count": 0.0, + "step": 7008, + "text_loss": 0.26718559861183167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0002852916995297471, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 11304590.0, + "repeat_count": 0.0, + "routers_loss": 0.0005590448854491115, + "skip_count": 0.0, + "step": 7010, + "text_loss": 0.5392091274261475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00028501221430144667, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11307690.0, + "repeat_count": 0.0, + "routers_loss": 0.004541353322565556, + "skip_count": 2.0, + "step": 7012, + "text_loss": 0.16159705817699432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 32.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.00028473281145687137, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11310866.0, + "repeat_count": 0.0, + "routers_loss": 0.0029630991630256176, + "skip_count": 1.0, + "step": 7014, + "text_loss": 0.9148072600364685 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 28.0, + "epoch": 32.93924273554447, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0302734375, + "learning_rate": 0.0002844534911030888, + "loss": 0.0067, + "macro_f1": 0.9262410998344421, + "num_tokens": 11314517.0, + "repeat_count": 2.0, + "routers_loss": 0.023258809000253677, + "skip_count": 3.0, + "step": 7016, + "text_loss": 0.3853590488433838 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.94863516289991, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.060546875, + "learning_rate": 0.000284174253347135, + "loss": 0.0064, + "macro_f1": 0.3272727429866791, + "num_tokens": 11317526.0, + "repeat_count": 0.0, + "routers_loss": 0.010060093365609646, + "skip_count": 1.0, + "step": 7018, + "text_loss": 0.3412325382232666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00028389509829601444, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 11321684.0, + "repeat_count": 0.0, + "routers_loss": 0.0016713893273845315, + "skip_count": 0.0, + "step": 7020, + "text_loss": 0.9049796462059021 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00028361602605670003, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 11324709.0, + "repeat_count": 0.0, + "routers_loss": 0.004167001228779554, + "skip_count": 2.0, + "step": 7022, + "text_loss": 0.24364058673381805 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 32.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00028333703673613224, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 11327449.0, + "repeat_count": 0.0, + "routers_loss": 0.0027954576071351767, + "skip_count": 4.0, + "step": 7024, + "text_loss": 0.2872125506401062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 32.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00028305813044122096, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11330846.0, + "repeat_count": 0.0, + "routers_loss": 0.004644687287509441, + "skip_count": 0.0, + "step": 7026, + "text_loss": 0.1717570424079895 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 32.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06884765625, + "learning_rate": 0.00028277930727884336, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11333575.0, + "repeat_count": 0.0, + "routers_loss": 0.00557848671451211, + "skip_count": 2.0, + "step": 7028, + "text_loss": 0.3501792550086975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00028250056735584496, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11336899.0, + "repeat_count": 0.0, + "routers_loss": 0.0005694970604963601, + "skip_count": 0.0, + "step": 7030, + "text_loss": 0.5541794300079346 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00028222191077903946, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11340163.0, + "repeat_count": 0.0, + "routers_loss": 0.0032896639313548803, + "skip_count": 0.0, + "step": 7032, + "text_loss": 0.5618721842765808 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00028194333765520853, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11343494.0, + "repeat_count": 1.0, + "routers_loss": 0.005377276800572872, + "skip_count": 0.0, + "step": 7034, + "text_loss": 0.325153648853302 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.00028166484809110206, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11346126.0, + "repeat_count": 0.0, + "routers_loss": 0.001204605447128415, + "skip_count": 0.0, + "step": 7036, + "text_loss": 0.5016651749610901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.00028138644219343736, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 11348879.0, + "repeat_count": 0.0, + "routers_loss": 0.005026837810873985, + "skip_count": 2.0, + "step": 7038, + "text_loss": 0.2430499643087387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00028110812006890064, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 11352457.0, + "repeat_count": 0.0, + "routers_loss": 0.0019850607495754957, + "skip_count": 0.0, + "step": 7040, + "text_loss": 0.42376917600631714 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.00028082988182414524, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 11356602.0, + "repeat_count": 1.0, + "routers_loss": 0.003362950636073947, + "skip_count": 2.0, + "step": 7042, + "text_loss": 0.4165397882461548 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0002805517275657926, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 11359451.0, + "repeat_count": 0.0, + "routers_loss": 0.0019725612364709377, + "skip_count": 1.0, + "step": 7044, + "text_loss": 0.5597621202468872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.0002802736574004319, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 11363614.0, + "repeat_count": 0.0, + "routers_loss": 0.0013963640667498112, + "skip_count": 0.0, + "step": 7046, + "text_loss": 0.6112356185913086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00027999567143462015, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11367015.0, + "repeat_count": 0.0, + "routers_loss": 0.0005658161826431751, + "skip_count": 0.0, + "step": 7048, + "text_loss": 0.4920886754989624 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 33.09862048723217, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00027971776977488193, + "loss": 0.0064, + "macro_f1": 0.925203263759613, + "num_tokens": 11370489.0, + "repeat_count": 3.0, + "routers_loss": 0.03657131269574165, + "skip_count": 5.0, + "step": 7050, + "text_loss": 0.28003939986228943 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.00027943995252771017, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 11373614.0, + "repeat_count": 0.0, + "routers_loss": 0.004096088465303183, + "skip_count": 2.0, + "step": 7052, + "text_loss": 0.3145081400871277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00027916221979956457, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 11377631.0, + "repeat_count": 0.0, + "routers_loss": 0.0009888096246868372, + "skip_count": 0.0, + "step": 7054, + "text_loss": 0.4898056983947754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.126797769298506, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00027888457169687297, + "loss": 0.0065, + "macro_f1": 0.6603773832321167, + "num_tokens": 11380620.0, + "repeat_count": 1.0, + "routers_loss": 0.013347696512937546, + "skip_count": 1.0, + "step": 7056, + "text_loss": 0.7011964917182922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.00027860700832603056, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11383297.0, + "repeat_count": 0.0, + "routers_loss": 0.000849733711220324, + "skip_count": 1.0, + "step": 7058, + "text_loss": 0.4007014334201813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.0002783295297934003, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 11386460.0, + "repeat_count": 0.0, + "routers_loss": 0.001546313869766891, + "skip_count": 1.0, + "step": 7060, + "text_loss": 0.3992713689804077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0002780521362053123, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11389605.0, + "repeat_count": 0.0, + "routers_loss": 0.001045585609972477, + "skip_count": 0.0, + "step": 7062, + "text_loss": 0.4440680146217346 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.00027777482766806446, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 11392105.0, + "repeat_count": 1.0, + "routers_loss": 0.00752411549910903, + "skip_count": 0.0, + "step": 7064, + "text_loss": 0.20152349770069122 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 33.17375990607572, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.031982421875, + "learning_rate": 0.0002774976042879218, + "loss": 0.0088, + "macro_f1": 0.5934640765190125, + "num_tokens": 11396142.0, + "repeat_count": 0.0, + "routers_loss": 0.019917849451303482, + "skip_count": 3.0, + "step": 7066, + "text_loss": 0.24365149438381195 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.00027722046617111696, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 11398827.0, + "repeat_count": 1.0, + "routers_loss": 0.0015933843096718192, + "skip_count": 0.0, + "step": 7068, + "text_loss": 0.31948477029800415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00027694341342384977, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11402623.0, + "repeat_count": 0.0, + "routers_loss": 0.0018986845389008522, + "skip_count": 2.0, + "step": 7070, + "text_loss": 0.47721394896507263 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00027666644615228727, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 11405628.0, + "repeat_count": 0.0, + "routers_loss": 0.002975719515234232, + "skip_count": 1.0, + "step": 7072, + "text_loss": 0.3972358703613281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0002763895644625637, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 11409468.0, + "repeat_count": 0.0, + "routers_loss": 0.005657708737999201, + "skip_count": 1.0, + "step": 7074, + "text_loss": 0.6004229187965393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0002761127684607811, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 11412572.0, + "repeat_count": 0.0, + "routers_loss": 0.0038351903203874826, + "skip_count": 2.0, + "step": 7076, + "text_loss": 1.0837591886520386 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.00027583605825300795, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 11416831.0, + "repeat_count": 2.0, + "routers_loss": 0.005529445596039295, + "skip_count": 2.0, + "step": 7078, + "text_loss": 0.575986921787262 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00027555943394528014, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11420557.0, + "repeat_count": 0.0, + "routers_loss": 0.006243749521672726, + "skip_count": 0.0, + "step": 7080, + "text_loss": 0.606263279914856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.248899324919286, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00027528289564360064, + "loss": 0.0058, + "macro_f1": 0.6603773832321167, + "num_tokens": 11423471.0, + "repeat_count": 1.0, + "routers_loss": 0.031515009701251984, + "skip_count": 1.0, + "step": 7082, + "text_loss": 0.19393208622932434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0002750064434539394, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11426732.0, + "repeat_count": 0.0, + "routers_loss": 0.0005052287015132606, + "skip_count": 0.0, + "step": 7084, + "text_loss": 0.7202399969100952 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00027473007748223357, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11429391.0, + "repeat_count": 0.0, + "routers_loss": 0.005099403206259012, + "skip_count": 1.0, + "step": 7086, + "text_loss": 0.20651355385780334 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00027445379783438685, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 11432161.0, + "repeat_count": 0.0, + "routers_loss": 0.001447655027732253, + "skip_count": 0.0, + "step": 7088, + "text_loss": 0.34758952260017395 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00027417760461627037, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11435417.0, + "repeat_count": 0.0, + "routers_loss": 0.000808655982837081, + "skip_count": 0.0, + "step": 7090, + "text_loss": 0.7414838671684265 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00027390149793372177, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 11438313.0, + "repeat_count": 0.0, + "routers_loss": 0.005151710007339716, + "skip_count": 0.0, + "step": 7092, + "text_loss": 0.17792417109012604 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.00027362547789254574, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 11441681.0, + "repeat_count": 1.0, + "routers_loss": 0.0037353152874857187, + "skip_count": 3.0, + "step": 7094, + "text_loss": 0.5577781796455383 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.0002733495445985135, + "loss": 0.0026, + "macro_f1": 0.3333333432674408, + "num_tokens": 11444521.0, + "repeat_count": 0.0, + "routers_loss": 0.00038075417978689075, + "skip_count": 0.0, + "step": 7096, + "text_loss": 0.5052862167358398 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.32403874376284, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0002730736981573632, + "loss": 0.0033, + "macro_f1": 0.3272727429866791, + "num_tokens": 11448481.0, + "repeat_count": 0.0, + "routers_loss": 0.007313522044569254, + "skip_count": 1.0, + "step": 7098, + "text_loss": 0.5869139432907104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0002727979386748001, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11452164.0, + "repeat_count": 0.0, + "routers_loss": 0.0020673887338489294, + "skip_count": 0.0, + "step": 7100, + "text_loss": 0.4354212284088135 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0002725222662564954, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11455995.0, + "repeat_count": 0.0, + "routers_loss": 0.0008315460290759802, + "skip_count": 0.0, + "step": 7102, + "text_loss": 0.8714128732681274 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.35221602582917, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0002722466810080874, + "loss": 0.0053, + "macro_f1": 0.6603773832321167, + "num_tokens": 11458828.0, + "repeat_count": 1.0, + "routers_loss": 0.010913078673183918, + "skip_count": 1.0, + "step": 7104, + "text_loss": 0.6226683855056763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.36160845318462, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0002719711830351809, + "loss": 0.0076, + "macro_f1": 0.6603773832321167, + "num_tokens": 11462448.0, + "repeat_count": 1.0, + "routers_loss": 0.040428292006254196, + "skip_count": 1.0, + "step": 7106, + "text_loss": 0.2543688118457794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00027169577244334726, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 11465796.0, + "repeat_count": 0.0, + "routers_loss": 0.004473939072340727, + "skip_count": 1.0, + "step": 7108, + "text_loss": 0.12356872111558914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.043212890625, + "learning_rate": 0.00027142044933812424, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11469176.0, + "repeat_count": 0.0, + "routers_loss": 0.0017961655976250768, + "skip_count": 0.0, + "step": 7110, + "text_loss": 0.6800211668014526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.046142578125, + "learning_rate": 0.0002711452138250162, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 11471983.0, + "repeat_count": 2.0, + "routers_loss": 0.003279087832197547, + "skip_count": 2.0, + "step": 7112, + "text_loss": 0.340279757976532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.3991781626064, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00027087006600949403, + "loss": 0.0065, + "macro_f1": 0.6603773832321167, + "num_tokens": 11475656.0, + "repeat_count": 1.0, + "routers_loss": 0.017024178057909012, + "skip_count": 1.0, + "step": 7114, + "text_loss": 0.3556337058544159 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0002705950059969948, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 11479410.0, + "repeat_count": 0.0, + "routers_loss": 0.015487123280763626, + "skip_count": 1.0, + "step": 7116, + "text_loss": 0.4404350817203522 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.00027032003389292194, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 11483302.0, + "repeat_count": 0.0, + "routers_loss": 0.0011217560386285186, + "skip_count": 0.0, + "step": 7118, + "text_loss": 0.46771445870399475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.0002700451498026454, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11486212.0, + "repeat_count": 0.0, + "routers_loss": 0.0010832607513293624, + "skip_count": 0.0, + "step": 7120, + "text_loss": 0.6795281767845154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00026977035383150106, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 11489320.0, + "repeat_count": 0.0, + "routers_loss": 0.002290027216076851, + "skip_count": 1.0, + "step": 7122, + "text_loss": 0.5304523706436157 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 33.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00026949564608479164, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 11492056.0, + "repeat_count": 2.0, + "routers_loss": 0.009950211271643639, + "skip_count": 6.0, + "step": 7124, + "text_loss": 0.21328973770141602 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 33.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0002692210266677855, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 11495165.0, + "repeat_count": 0.0, + "routers_loss": 0.0079165268689394, + "skip_count": 3.0, + "step": 7126, + "text_loss": 0.19840657711029053 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00026894649568571724, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11497636.0, + "repeat_count": 0.0, + "routers_loss": 0.0013852717820554972, + "skip_count": 0.0, + "step": 7128, + "text_loss": 0.3360055088996887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00026867205324378776, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 11500806.0, + "repeat_count": 0.0, + "routers_loss": 0.0010151927126571536, + "skip_count": 0.0, + "step": 7130, + "text_loss": 0.6827390193939209 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00026839769944716373, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11504187.0, + "repeat_count": 0.0, + "routers_loss": 0.001110393786802888, + "skip_count": 0.0, + "step": 7132, + "text_loss": 0.5081584453582764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.49310243616085, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0002681234344009783, + "loss": 0.0071, + "macro_f1": 0.3272727429866791, + "num_tokens": 11507900.0, + "repeat_count": 0.0, + "routers_loss": 0.010587670840322971, + "skip_count": 1.0, + "step": 7134, + "text_loss": 0.28684356808662415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00026784925821033014, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 11510627.0, + "repeat_count": 0.0, + "routers_loss": 0.006658690981566906, + "skip_count": 0.0, + "step": 7136, + "text_loss": 0.24232104420661926 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.00026757517098028417, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 11513304.0, + "repeat_count": 0.0, + "routers_loss": 0.0014556109672412276, + "skip_count": 0.0, + "step": 7138, + "text_loss": 0.4718358516693115 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 33.52127971822718, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00026730117281587116, + "loss": 0.0062, + "macro_f1": 0.9265305995941162, + "num_tokens": 11516593.0, + "repeat_count": 1.0, + "routers_loss": 0.01590067707002163, + "skip_count": 3.0, + "step": 7140, + "text_loss": 0.2810344696044922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00026702726382208774, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 11519776.0, + "repeat_count": 0.0, + "routers_loss": 0.0014479428064078093, + "skip_count": 0.0, + "step": 7142, + "text_loss": 0.48876339197158813 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00026675344410389623, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11522499.0, + "repeat_count": 0.0, + "routers_loss": 0.003729258431121707, + "skip_count": 2.0, + "step": 7144, + "text_loss": 0.5350890755653381 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0002664797137662248, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 11525220.0, + "repeat_count": 1.0, + "routers_loss": 0.0015156447188928723, + "skip_count": 1.0, + "step": 7146, + "text_loss": 0.5742373466491699 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00026620607291396773, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 11527926.0, + "repeat_count": 2.0, + "routers_loss": 0.004842780064791441, + "skip_count": 2.0, + "step": 7148, + "text_loss": 0.4994547665119171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.00026593252165198455, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 11531622.0, + "repeat_count": 0.0, + "routers_loss": 0.0026556351222097874, + "skip_count": 0.0, + "step": 7150, + "text_loss": 0.1567893922328949 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00026565906008510064, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11535191.0, + "repeat_count": 0.0, + "routers_loss": 0.008135059848427773, + "skip_count": 1.0, + "step": 7152, + "text_loss": 0.289173424243927 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.000265385688318107, + "loss": 0.0083, + "macro_f1": 1.0, + "num_tokens": 11539060.0, + "repeat_count": 1.0, + "routers_loss": 0.0020754633005708456, + "skip_count": 1.0, + "step": 7154, + "text_loss": 0.35089045763015747 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0002651124064557602, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11541662.0, + "repeat_count": 1.0, + "routers_loss": 0.0023738413583487272, + "skip_count": 0.0, + "step": 7156, + "text_loss": 0.5026801228523254 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00026483921460278227, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 11544763.0, + "repeat_count": 0.0, + "routers_loss": 0.003311366541311145, + "skip_count": 1.0, + "step": 7158, + "text_loss": 0.22975654900074005 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.0002645661128638609, + "loss": 0.0072, + "macro_f1": 0.3333333432674408, + "num_tokens": 11547649.0, + "repeat_count": 0.0, + "routers_loss": 0.0008209354127757251, + "skip_count": 0.0, + "step": 7160, + "text_loss": 0.32840636372566223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00026429310134364926, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 11550648.0, + "repeat_count": 0.0, + "routers_loss": 0.0028574815951287746, + "skip_count": 0.0, + "step": 7162, + "text_loss": 0.23239612579345703 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.00026402018014676584, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 11553790.0, + "repeat_count": 0.0, + "routers_loss": 0.005469404626637697, + "skip_count": 1.0, + "step": 7164, + "text_loss": 0.22877025604248047 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.0002637473493777943, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 11556802.0, + "repeat_count": 1.0, + "routers_loss": 0.0032242932356894016, + "skip_count": 2.0, + "step": 7166, + "text_loss": 0.6376226544380188 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.00026347460914128443, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 11559607.0, + "repeat_count": 1.0, + "routers_loss": 0.0040627880953252316, + "skip_count": 2.0, + "step": 7168, + "text_loss": 0.6879657506942749 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.00026320195954175043, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 11562677.0, + "repeat_count": 2.0, + "routers_loss": 0.020494163036346436, + "skip_count": 4.0, + "step": 7170, + "text_loss": 0.3710069954395294 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06787109375, + "learning_rate": 0.00026292940068367224, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11565948.0, + "repeat_count": 0.0, + "routers_loss": 0.002662271959707141, + "skip_count": 0.0, + "step": 7172, + "text_loss": 0.15041157603263855 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00026265693267149494, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 11568836.0, + "repeat_count": 0.0, + "routers_loss": 0.0039914860390126705, + "skip_count": 1.0, + "step": 7174, + "text_loss": 0.5372130870819092 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 0.00026238455560962884, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11572542.0, + "repeat_count": 0.0, + "routers_loss": 0.0034708199091255665, + "skip_count": 0.0, + "step": 7176, + "text_loss": 0.2956286072731018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.00026211226960244914, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11575352.0, + "repeat_count": 0.0, + "routers_loss": 0.007794995326548815, + "skip_count": 2.0, + "step": 7178, + "text_loss": 0.3691073954105377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0002618400747542964, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 11579110.0, + "repeat_count": 0.0, + "routers_loss": 0.0009694626205600798, + "skip_count": 0.0, + "step": 7180, + "text_loss": 0.6523211598396301 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0002615679711694764, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 11582476.0, + "repeat_count": 0.0, + "routers_loss": 0.004227840341627598, + "skip_count": 1.0, + "step": 7182, + "text_loss": 0.1997286081314087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.00026129595895225965, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 11585685.0, + "repeat_count": 0.0, + "routers_loss": 0.00126146269030869, + "skip_count": 0.0, + "step": 7184, + "text_loss": 0.486299604177475 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 33.73730554740241, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.0002610240382068818, + "loss": 0.006, + "macro_f1": 0.8814815282821655, + "num_tokens": 11588804.0, + "repeat_count": 2.0, + "routers_loss": 0.04553814232349396, + "skip_count": 4.0, + "step": 7186, + "text_loss": 0.1622236669063568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.00026075220903754324, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11591822.0, + "repeat_count": 0.0, + "routers_loss": 0.002460496500134468, + "skip_count": 2.0, + "step": 7188, + "text_loss": 0.5573232173919678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0002604804715484095, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 11594899.0, + "repeat_count": 0.0, + "routers_loss": 0.006854622159153223, + "skip_count": 1.0, + "step": 7190, + "text_loss": 0.4753095507621765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00026020882584361094, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 11598333.0, + "repeat_count": 0.0, + "routers_loss": 0.001945660449564457, + "skip_count": 1.0, + "step": 7192, + "text_loss": 0.8912903666496277 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 31.0, + "epoch": 33.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.061767578125, + "learning_rate": 0.0002599372720272426, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 11601814.0, + "repeat_count": 4.0, + "routers_loss": 0.005749753676354885, + "skip_count": 1.0, + "step": 7194, + "text_loss": 0.6041871905326843 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0002596658102033643, + "loss": 0.0097, + "macro_f1": 0.6666666865348816, + "num_tokens": 11604661.0, + "repeat_count": 0.0, + "routers_loss": 0.0025942171923816204, + "skip_count": 1.0, + "step": 7196, + "text_loss": 0.4760607182979584 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 33.793660111535075, + "f1_execute": 0.9756097793579102, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00025939444047600114, + "loss": 0.0075, + "macro_f1": 0.8807588815689087, + "num_tokens": 11608459.0, + "repeat_count": 2.0, + "routers_loss": 0.020141327753663063, + "skip_count": 6.0, + "step": 7198, + "text_loss": 0.6670252084732056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0002591231629491423, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 11611489.0, + "repeat_count": 0.0, + "routers_loss": 0.005721202120184898, + "skip_count": 1.0, + "step": 7200, + "text_loss": 0.31318753957748413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00025885197772674174, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11615234.0, + "repeat_count": 0.0, + "routers_loss": 0.0027279339265078306, + "skip_count": 1.0, + "step": 7202, + "text_loss": 0.25728851556777954 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00025858088491271825, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11618892.0, + "repeat_count": 0.0, + "routers_loss": 0.0006987092201597989, + "skip_count": 0.0, + "step": 7204, + "text_loss": 0.5504243969917297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00025830988461095504, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11622237.0, + "repeat_count": 0.0, + "routers_loss": 0.0029056845232844353, + "skip_count": 0.0, + "step": 7206, + "text_loss": 0.5319080948829651 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.0002580389769253001, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 11624713.0, + "repeat_count": 4.0, + "routers_loss": 0.007346974220126867, + "skip_count": 5.0, + "step": 7208, + "text_loss": 0.8925374746322632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0002577681619595655, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11628689.0, + "repeat_count": 0.0, + "routers_loss": 0.0004166684520896524, + "skip_count": 0.0, + "step": 7210, + "text_loss": 0.37282413244247437 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.00025749743981752824, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 11631581.0, + "repeat_count": 0.0, + "routers_loss": 0.013194780796766281, + "skip_count": 2.0, + "step": 7212, + "text_loss": 0.220115065574646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.0002572268106029295, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 11634503.0, + "repeat_count": 0.0, + "routers_loss": 0.0009112557163462043, + "skip_count": 0.0, + "step": 7214, + "text_loss": 0.5631879568099976 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00025695627441947496, + "loss": 0.0075, + "macro_f1": 0.6666666865348816, + "num_tokens": 11637790.0, + "repeat_count": 0.0, + "routers_loss": 0.011178883723914623, + "skip_count": 2.0, + "step": 7216, + "text_loss": 0.24482154846191406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.887584385089525, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00025668583137083447, + "loss": 0.0047, + "macro_f1": 0.32098764181137085, + "num_tokens": 11640806.0, + "repeat_count": 0.0, + "routers_loss": 0.01877705194056034, + "skip_count": 2.0, + "step": 7218, + "text_loss": 0.2229214459657669 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0002564154815606422, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11644479.0, + "repeat_count": 0.0, + "routers_loss": 0.0030277224723249674, + "skip_count": 0.0, + "step": 7220, + "text_loss": 0.6025711894035339 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00025614522509249715, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11647340.0, + "repeat_count": 0.0, + "routers_loss": 0.002354414900764823, + "skip_count": 1.0, + "step": 7222, + "text_loss": 0.6497155427932739 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0002558750620699618, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 11650433.0, + "repeat_count": 1.0, + "routers_loss": 0.009801039472222328, + "skip_count": 2.0, + "step": 7224, + "text_loss": 0.32049307227134705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.0002556049925965632, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11654451.0, + "repeat_count": 0.0, + "routers_loss": 0.002949854824692011, + "skip_count": 0.0, + "step": 7226, + "text_loss": 0.17923395335674286 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00025533501677579254, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 11657440.0, + "repeat_count": 1.0, + "routers_loss": 0.0032915703486651182, + "skip_count": 1.0, + "step": 7228, + "text_loss": 0.60064297914505 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 33.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0002550651347111049, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 11660599.0, + "repeat_count": 1.0, + "routers_loss": 0.00594533933326602, + "skip_count": 1.0, + "step": 7230, + "text_loss": 0.32829397916793823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 33.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00025479534650591976, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 11663387.0, + "repeat_count": 0.0, + "routers_loss": 0.0014214308466762304, + "skip_count": 0.0, + "step": 7232, + "text_loss": 0.7317177653312683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 33.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00025452565226362036, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11666729.0, + "repeat_count": 0.0, + "routers_loss": 0.0056374757550656796, + "skip_count": 2.0, + "step": 7234, + "text_loss": 0.3394623398780823 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 33.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0152587890625, + "learning_rate": 0.00025425605208755406, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11669871.0, + "repeat_count": 0.0, + "routers_loss": 0.006422565318644047, + "skip_count": 3.0, + "step": 7236, + "text_loss": 0.1725512444972992 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 33.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0002539865460810322, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 11673008.0, + "repeat_count": 1.0, + "routers_loss": 0.0023537934757769108, + "skip_count": 0.0, + "step": 7238, + "text_loss": 0.8873519897460938 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 33.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00025371713434733, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 11675988.0, + "repeat_count": 0.0, + "routers_loss": 0.0026300614699721336, + "skip_count": 1.0, + "step": 7240, + "text_loss": 0.4877084195613861 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 34.0, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.028076171875, + "learning_rate": 0.0002534478169896864, + "loss": 0.0052, + "macro_f1": 0.9265305995941162, + "num_tokens": 11679068.0, + "repeat_count": 1.0, + "routers_loss": 0.019549336284399033, + "skip_count": 3.0, + "step": 7242, + "text_loss": 0.15101417899131775 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0002531785941113044, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 11682205.0, + "repeat_count": 0.0, + "routers_loss": 0.007769173942506313, + "skip_count": 1.0, + "step": 7244, + "text_loss": 0.4035153090953827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0002529094658153508, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 11685162.0, + "repeat_count": 0.0, + "routers_loss": 0.003636054927483201, + "skip_count": 0.0, + "step": 7246, + "text_loss": 0.21048080921173096 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048583984375, + "learning_rate": 0.00025264043220495606, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 11688512.0, + "repeat_count": 0.0, + "routers_loss": 0.0013363865436986089, + "skip_count": 0.0, + "step": 7248, + "text_loss": 0.6582038402557373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00025237149338321437, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 11691753.0, + "repeat_count": 0.0, + "routers_loss": 0.0005587349878624082, + "skip_count": 0.0, + "step": 7250, + "text_loss": 0.6899203658103943 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0002521026494531835, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11694689.0, + "repeat_count": 1.0, + "routers_loss": 0.006221035961061716, + "skip_count": 0.0, + "step": 7252, + "text_loss": 0.17377600073814392 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.000251833900517885, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 11697950.0, + "repeat_count": 0.0, + "routers_loss": 0.004368607886135578, + "skip_count": 1.0, + "step": 7254, + "text_loss": 0.4147649109363556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.000251565246680304, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11701214.0, + "repeat_count": 0.0, + "routers_loss": 0.0038269520737230778, + "skip_count": 2.0, + "step": 7256, + "text_loss": 0.42076823115348816 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00025129668804338906, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11703935.0, + "repeat_count": 0.0, + "routers_loss": 0.0011755652958527207, + "skip_count": 0.0, + "step": 7258, + "text_loss": 0.5484340190887451 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.084531846199, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00025102822471005247, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 11706818.0, + "repeat_count": 1.0, + "routers_loss": 0.00735129788517952, + "skip_count": 2.0, + "step": 7260, + "text_loss": 0.29214802384376526 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00025075985678316983, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 11709979.0, + "repeat_count": 1.0, + "routers_loss": 0.0011552777141332626, + "skip_count": 0.0, + "step": 7262, + "text_loss": 0.6514551639556885 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 34.10331670090989, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0002504915843655802, + "loss": 0.0067, + "macro_f1": 0.8814815282821655, + "num_tokens": 11714075.0, + "repeat_count": 2.0, + "routers_loss": 0.01438678614795208, + "skip_count": 4.0, + "step": 7264, + "text_loss": 0.5144859552383423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0002502234075600862, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11717610.0, + "repeat_count": 0.0, + "routers_loss": 0.0027831171173602343, + "skip_count": 0.0, + "step": 7266, + "text_loss": 0.6494308114051819 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00024995532646945336, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 11721415.0, + "repeat_count": 0.0, + "routers_loss": 0.0012327058939263225, + "skip_count": 0.0, + "step": 7268, + "text_loss": 0.5111991763114929 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 34.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.0002496873411964113, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 11724488.0, + "repeat_count": 2.0, + "routers_loss": 0.003060065908357501, + "skip_count": 1.0, + "step": 7270, + "text_loss": 0.5780492424964905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.0002494194518436523, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 11727708.0, + "repeat_count": 0.0, + "routers_loss": 0.001369593315757811, + "skip_count": 0.0, + "step": 7272, + "text_loss": 0.3151950240135193 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00024915165851383203, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 11730897.0, + "repeat_count": 0.0, + "routers_loss": 0.005724756047129631, + "skip_count": 0.0, + "step": 7274, + "text_loss": 0.5267965197563171 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00024888396130956947, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 11733870.0, + "repeat_count": 1.0, + "routers_loss": 0.010036137886345387, + "skip_count": 0.0, + "step": 7276, + "text_loss": 0.5330777168273926 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00024861636033344657, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11737413.0, + "repeat_count": 0.0, + "routers_loss": 0.008341848850250244, + "skip_count": 2.0, + "step": 7278, + "text_loss": 0.25949522852897644 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0002483488556880087, + "loss": 0.0061, + "macro_f1": 1.0, + "num_tokens": 11740691.0, + "repeat_count": 1.0, + "routers_loss": 0.008208763785660267, + "skip_count": 2.0, + "step": 7280, + "text_loss": 0.1867891401052475 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.000248081447475764, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 11743715.0, + "repeat_count": 0.0, + "routers_loss": 0.0038434381131082773, + "skip_count": 0.0, + "step": 7282, + "text_loss": 0.4835410416126251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0002478141357991838, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 11746818.0, + "repeat_count": 0.0, + "routers_loss": 0.0019067893736064434, + "skip_count": 0.0, + "step": 7284, + "text_loss": 0.5959038734436035 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00024754692076070256, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 11750160.0, + "repeat_count": 0.0, + "routers_loss": 0.007199060171842575, + "skip_count": 0.0, + "step": 7286, + "text_loss": 0.5068115592002869 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.0002472798024627175, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11752836.0, + "repeat_count": 0.0, + "routers_loss": 0.0014214382972568274, + "skip_count": 0.0, + "step": 7288, + "text_loss": 0.5742631554603577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.0002470127810075889, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11756276.0, + "repeat_count": 0.0, + "routers_loss": 0.0018025166355073452, + "skip_count": 0.0, + "step": 7290, + "text_loss": 0.6616888642311096 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00024674585649763983, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 11760235.0, + "repeat_count": 1.0, + "routers_loss": 0.0024077212437987328, + "skip_count": 0.0, + "step": 7292, + "text_loss": 0.7984768748283386 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06494140625, + "learning_rate": 0.00024647902903515614, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 11763430.0, + "repeat_count": 0.0, + "routers_loss": 0.007843999192118645, + "skip_count": 1.0, + "step": 7294, + "text_loss": 0.1943647861480713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0002462122987223869, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 11766583.0, + "repeat_count": 0.0, + "routers_loss": 0.0019727738108485937, + "skip_count": 0.0, + "step": 7296, + "text_loss": 0.43924200534820557 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6000000238418579, + "avg_layers": 27.0, + "epoch": 34.26298796595245, + "f1_execute": 0.9545454382896423, + "f1_repeat": 1.0, + "f1_skip": 0.75, + "grad_norm": 0.041015625, + "learning_rate": 0.0002459456656615436, + "loss": 0.0069, + "macro_f1": 0.9015151858329773, + "num_tokens": 11770360.0, + "repeat_count": 2.0, + "routers_loss": 0.04594529792666435, + "skip_count": 5.0, + "step": 7298, + "text_loss": 0.32582250237464905 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0002456791299548004, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 11773239.0, + "repeat_count": 1.0, + "routers_loss": 0.0011880286037921906, + "skip_count": 0.0, + "step": 7300, + "text_loss": 0.7723727226257324 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00024541269170429435, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11776945.0, + "repeat_count": 0.0, + "routers_loss": 0.0010577787179499865, + "skip_count": 0.0, + "step": 7302, + "text_loss": 0.8173839449882507 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0002451463510121252, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11780121.0, + "repeat_count": 0.0, + "routers_loss": 0.0019757342524826527, + "skip_count": 0.0, + "step": 7304, + "text_loss": 0.4015064239501953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000244880107980355, + "loss": 0.0106, + "macro_f1": 0.3333333432674408, + "num_tokens": 11783172.0, + "repeat_count": 0.0, + "routers_loss": 0.002577328821644187, + "skip_count": 0.0, + "step": 7306, + "text_loss": 0.5465171933174133 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 0.00024461396271100876, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 11788608.0, + "repeat_count": 0.0, + "routers_loss": 0.004162502940744162, + "skip_count": 0.0, + "step": 7308, + "text_loss": 0.2419646978378296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 0.0002443479153060735, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 11791912.0, + "repeat_count": 0.0, + "routers_loss": 0.003301614662632346, + "skip_count": 0.0, + "step": 7310, + "text_loss": 0.2568489909172058 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00024408196586749964, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 11794849.0, + "repeat_count": 0.0, + "routers_loss": 0.0019893983844667673, + "skip_count": 0.0, + "step": 7312, + "text_loss": 0.7044196128845215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0002438161144971992, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11797587.0, + "repeat_count": 0.0, + "routers_loss": 0.006637922488152981, + "skip_count": 1.0, + "step": 7314, + "text_loss": 0.6863232254981995 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.000243550361297047, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 11800173.0, + "repeat_count": 0.0, + "routers_loss": 0.003078785724937916, + "skip_count": 2.0, + "step": 7316, + "text_loss": 0.2868897616863251 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00024328470636888005, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11802889.0, + "repeat_count": 0.0, + "routers_loss": 0.0011882453691214323, + "skip_count": 0.0, + "step": 7318, + "text_loss": 0.5522798299789429 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0002430191498144979, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 11805607.0, + "repeat_count": 0.0, + "routers_loss": 0.0008720619371160865, + "skip_count": 0.0, + "step": 7320, + "text_loss": 0.5531370639801025 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00024275369173566236, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 11808838.0, + "repeat_count": 1.0, + "routers_loss": 0.003213440766558051, + "skip_count": 0.0, + "step": 7322, + "text_loss": 0.5252627730369568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.00024248833223409715, + "loss": 0.0102, + "macro_f1": 0.6666666865348816, + "num_tokens": 11811965.0, + "repeat_count": 0.0, + "routers_loss": 0.004736232105642557, + "skip_count": 1.0, + "step": 7324, + "text_loss": 0.6033701300621033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.00024222307141148907, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11814832.0, + "repeat_count": 0.0, + "routers_loss": 0.0007559265359304845, + "skip_count": 0.0, + "step": 7326, + "text_loss": 0.5607737302780151 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.00024195790936948626, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 11818802.0, + "repeat_count": 0.0, + "routers_loss": 0.005338212475180626, + "skip_count": 2.0, + "step": 7328, + "text_loss": 0.20618735253810883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 34.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0002416928462096994, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 11821998.0, + "repeat_count": 0.0, + "routers_loss": 0.001919696107506752, + "skip_count": 3.0, + "step": 7330, + "text_loss": 0.42486369609832764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00024142788203370107, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 11824505.0, + "repeat_count": 0.0, + "routers_loss": 0.0013797834981232882, + "skip_count": 0.0, + "step": 7332, + "text_loss": 0.48403388261795044 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.43205165835045, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00024116301694302621, + "loss": 0.0053, + "macro_f1": 0.3272727429866791, + "num_tokens": 11828504.0, + "repeat_count": 0.0, + "routers_loss": 0.008978237397968769, + "skip_count": 1.0, + "step": 7334, + "text_loss": 0.43872755765914917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00024089825103917152, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 11831171.0, + "repeat_count": 0.0, + "routers_loss": 0.004589964635670185, + "skip_count": 1.0, + "step": 7336, + "text_loss": 0.5126842260360718 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00024063358442359572, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11834387.0, + "repeat_count": 0.0, + "routers_loss": 0.002857893006876111, + "skip_count": 0.0, + "step": 7338, + "text_loss": 0.7521272301673889 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.0002403690171977197, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 11838693.0, + "repeat_count": 0.0, + "routers_loss": 0.0009023012826219201, + "skip_count": 0.0, + "step": 7340, + "text_loss": 0.6335242390632629 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00024010454946292586, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11841882.0, + "repeat_count": 1.0, + "routers_loss": 0.010992717929184437, + "skip_count": 0.0, + "step": 7342, + "text_loss": 0.64045649766922 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0002398401813205592, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11845181.0, + "repeat_count": 0.0, + "routers_loss": 0.002247930970042944, + "skip_count": 2.0, + "step": 7344, + "text_loss": 0.31022098660469055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00023957591287192577, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 11848537.0, + "repeat_count": 0.0, + "routers_loss": 0.003184020286425948, + "skip_count": 2.0, + "step": 7346, + "text_loss": 0.5709269642829895 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.00023931174421829376, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 11851437.0, + "repeat_count": 2.0, + "routers_loss": 0.006582654081285, + "skip_count": 4.0, + "step": 7348, + "text_loss": 0.3547070026397705 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.00023904767546089318, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 11854161.0, + "repeat_count": 1.0, + "routers_loss": 0.0022124287206679583, + "skip_count": 0.0, + "step": 7350, + "text_loss": 0.6984702348709106 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00023878370670091565, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 11856811.0, + "repeat_count": 1.0, + "routers_loss": 0.0029868825804442167, + "skip_count": 0.0, + "step": 7352, + "text_loss": 0.25389090180397034 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.00023851983803951444, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 11860110.0, + "repeat_count": 0.0, + "routers_loss": 0.0028468978125602007, + "skip_count": 1.0, + "step": 7354, + "text_loss": 0.5729252099990845 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00023825606957780454, + "loss": 0.0041, + "macro_f1": 1.0, + "num_tokens": 11863058.0, + "repeat_count": 1.0, + "routers_loss": 0.003115740604698658, + "skip_count": 2.0, + "step": 7356, + "text_loss": 0.60753333568573 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00023799240141686258, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 11865865.0, + "repeat_count": 0.0, + "routers_loss": 0.0022254586219787598, + "skip_count": 0.0, + "step": 7358, + "text_loss": 0.2568866014480591 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00023772883365772658, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 11869133.0, + "repeat_count": 0.0, + "routers_loss": 0.0017388637643307447, + "skip_count": 0.0, + "step": 7360, + "text_loss": 0.7657097578048706 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00023746536640139633, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11872988.0, + "repeat_count": 0.0, + "routers_loss": 0.002158832037821412, + "skip_count": 0.0, + "step": 7362, + "text_loss": 0.19717472791671753 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00023720199974883294, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11875810.0, + "repeat_count": 0.0, + "routers_loss": 0.001037398586049676, + "skip_count": 0.0, + "step": 7364, + "text_loss": 0.47334593534469604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 34.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00023693873380095876, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 11878558.0, + "repeat_count": 0.0, + "routers_loss": 0.011853457428514957, + "skip_count": 5.0, + "step": 7366, + "text_loss": 0.2567826211452484 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.00023667556865865824, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 11881473.0, + "repeat_count": 1.0, + "routers_loss": 0.0015339091187343001, + "skip_count": 0.0, + "step": 7368, + "text_loss": 0.40981143712997437 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00023641250442277655, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 11885033.0, + "repeat_count": 1.0, + "routers_loss": 0.010062574408948421, + "skip_count": 0.0, + "step": 7370, + "text_loss": 0.3183043301105499 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.00023614954119412042, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 11889136.0, + "repeat_count": 0.0, + "routers_loss": 0.0010769609361886978, + "skip_count": 0.0, + "step": 7372, + "text_loss": 0.5279555916786194 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 34.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.00023588667907345785, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11893102.0, + "repeat_count": 0.0, + "routers_loss": 0.0032862431835383177, + "skip_count": 3.0, + "step": 7374, + "text_loss": 0.5425930023193359 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 34.629292632814796, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.0341796875, + "learning_rate": 0.00023562391816151808, + "loss": 0.0057, + "macro_f1": 0.5934640765190125, + "num_tokens": 11895841.0, + "repeat_count": 0.0, + "routers_loss": 0.02405562624335289, + "skip_count": 3.0, + "step": 7376, + "text_loss": 0.26054954528808594 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00023536125855899153, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 11899594.0, + "repeat_count": 1.0, + "routers_loss": 0.008315852843225002, + "skip_count": 3.0, + "step": 7378, + "text_loss": 0.19068174064159393 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 34.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00023509870036652998, + "loss": 0.0065, + "macro_f1": 1.0, + "num_tokens": 11902843.0, + "repeat_count": 1.0, + "routers_loss": 0.006180883850902319, + "skip_count": 4.0, + "step": 7380, + "text_loss": 0.18461982905864716 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00023483624368474614, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 11905786.0, + "repeat_count": 0.0, + "routers_loss": 0.0008856299100443721, + "skip_count": 0.0, + "step": 7382, + "text_loss": 0.5216618180274963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.66686234223657, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00023457388861421397, + "loss": 0.0059, + "macro_f1": 0.32098764181137085, + "num_tokens": 11908706.0, + "repeat_count": 1.0, + "routers_loss": 0.04762765392661095, + "skip_count": 1.0, + "step": 7384, + "text_loss": 0.25329193472862244 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 34.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.00023431163525546833, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 11911862.0, + "repeat_count": 1.0, + "routers_loss": 0.000989250373095274, + "skip_count": 1.0, + "step": 7386, + "text_loss": 0.2657507658004761 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01708984375, + "learning_rate": 0.0002340494837090053, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 11915483.0, + "repeat_count": 0.0, + "routers_loss": 0.0008857969660311937, + "skip_count": 0.0, + "step": 7388, + "text_loss": 0.5136669874191284 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00023378743407528164, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 11918778.0, + "repeat_count": 0.0, + "routers_loss": 0.0041572838090360165, + "skip_count": 1.0, + "step": 7390, + "text_loss": 0.5212553143501282 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00023352548645471556, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11921916.0, + "repeat_count": 0.0, + "routers_loss": 0.0010537431808188558, + "skip_count": 0.0, + "step": 7392, + "text_loss": 0.48122525215148926 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00023326364094768576, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 11924273.0, + "repeat_count": 1.0, + "routers_loss": 0.004077036865055561, + "skip_count": 0.0, + "step": 7394, + "text_loss": 0.2128690630197525 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00023300189765453194, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 11927424.0, + "repeat_count": 0.0, + "routers_loss": 0.005371362902224064, + "skip_count": 2.0, + "step": 7396, + "text_loss": 0.19448284804821014 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00023274025667555464, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 11930919.0, + "repeat_count": 0.0, + "routers_loss": 0.002137752715498209, + "skip_count": 0.0, + "step": 7398, + "text_loss": 0.7537064552307129 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.06640625, + "learning_rate": 0.00023247871811101512, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 11933680.0, + "repeat_count": 0.0, + "routers_loss": 0.0002398790093138814, + "skip_count": 0.0, + "step": 7400, + "text_loss": 0.5589297413825989 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.751394188435576, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 0.00023221728206113546, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 11937090.0, + "repeat_count": 0.0, + "routers_loss": 0.019718777388334274, + "skip_count": 1.0, + "step": 7402, + "text_loss": 0.8014751672744751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0002319559486260985, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 11940581.0, + "repeat_count": 0.0, + "routers_loss": 0.001230534864589572, + "skip_count": 0.0, + "step": 7404, + "text_loss": 0.5218383073806763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.0002316947179060477, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 11943832.0, + "repeat_count": 0.0, + "routers_loss": 0.0016393321566283703, + "skip_count": 0.0, + "step": 7406, + "text_loss": 0.17122556269168854 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.00023143359000108704, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 11947025.0, + "repeat_count": 0.0, + "routers_loss": 0.005269679240882397, + "skip_count": 2.0, + "step": 7408, + "text_loss": 0.2015499323606491 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 34.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00023117256501128136, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 11950077.0, + "repeat_count": 1.0, + "routers_loss": 0.005140089895576239, + "skip_count": 2.0, + "step": 7410, + "text_loss": 0.39068636298179626 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00023091164303665592, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 11953800.0, + "repeat_count": 0.0, + "routers_loss": 0.005578748416155577, + "skip_count": 0.0, + "step": 7412, + "text_loss": 0.18851874768733978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.068359375, + "learning_rate": 0.00023065082417719624, + "loss": 0.008, + "macro_f1": 0.3333333432674408, + "num_tokens": 11956383.0, + "repeat_count": 0.0, + "routers_loss": 0.0006410991190932691, + "skip_count": 0.0, + "step": 7414, + "text_loss": 0.5663703083992004 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 34.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0002303901085328491, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 11959554.0, + "repeat_count": 0.0, + "routers_loss": 0.0005902954144403338, + "skip_count": 5.0, + "step": 7416, + "text_loss": 0.5225661993026733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0002301294962035209, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 11962582.0, + "repeat_count": 0.0, + "routers_loss": 0.00045644037891179323, + "skip_count": 0.0, + "step": 7418, + "text_loss": 0.40572360157966614 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 0.0002298689872890789, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 11965649.0, + "repeat_count": 0.0, + "routers_loss": 0.01017778366804123, + "skip_count": 2.0, + "step": 7420, + "text_loss": 0.12190715968608856 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00022960858188935052, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 11968850.0, + "repeat_count": 0.0, + "routers_loss": 0.0008010792662389576, + "skip_count": 0.0, + "step": 7422, + "text_loss": 0.5606820583343506 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0002293482801041236, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 11972064.0, + "repeat_count": 0.0, + "routers_loss": 0.001889281440526247, + "skip_count": 0.0, + "step": 7424, + "text_loss": 0.44142210483551025 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00022908808203314635, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 11975466.0, + "repeat_count": 0.0, + "routers_loss": 0.00647713290527463, + "skip_count": 2.0, + "step": 7426, + "text_loss": 0.23273423314094543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.0002288279877761271, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 11979875.0, + "repeat_count": 0.0, + "routers_loss": 0.004027119372040033, + "skip_count": 0.0, + "step": 7428, + "text_loss": 0.5608086585998535 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.0002285679974327345, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 11982808.0, + "repeat_count": 0.0, + "routers_loss": 0.0009015435934998095, + "skip_count": 0.0, + "step": 7430, + "text_loss": 0.3976539373397827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0002283081111025973, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 11985978.0, + "repeat_count": 0.0, + "routers_loss": 0.00047143330448307097, + "skip_count": 0.0, + "step": 7432, + "text_loss": 0.4280148446559906 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00022804832888530447, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 11988925.0, + "repeat_count": 0.0, + "routers_loss": 0.0004895820748060942, + "skip_count": 0.0, + "step": 7434, + "text_loss": 0.5137463808059692 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000227788650880405, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 11991631.0, + "repeat_count": 0.0, + "routers_loss": 0.0008349024574272335, + "skip_count": 0.0, + "step": 7436, + "text_loss": 0.4306720197200775 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00022752907718740807, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 11995476.0, + "repeat_count": 0.0, + "routers_loss": 0.0038723985198885202, + "skip_count": 0.0, + "step": 7438, + "text_loss": 0.6413722038269043 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 34.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.043701171875, + "learning_rate": 0.00022726960790578248, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 11998846.0, + "repeat_count": 1.0, + "routers_loss": 0.004433541093021631, + "skip_count": 0.0, + "step": 7440, + "text_loss": 0.6424159407615662 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.8333333134651184, + "avg_layers": 23.0, + "epoch": 34.93924273554447, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.0, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.036376953125, + "learning_rate": 0.0002270102431349579, + "loss": 0.0062, + "macro_f1": 0.6289562582969666, + "num_tokens": 12002228.0, + "repeat_count": 0.0, + "routers_loss": 0.023979803547263145, + "skip_count": 6.0, + "step": 7442, + "text_loss": 0.16657918691635132 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 34.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00022675098297432307, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 12005003.0, + "repeat_count": 3.0, + "routers_loss": 0.005645833443850279, + "skip_count": 1.0, + "step": 7444, + "text_loss": 0.6388722658157349 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00022649182752322705, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 12007657.0, + "repeat_count": 0.0, + "routers_loss": 0.001629356062039733, + "skip_count": 2.0, + "step": 7446, + "text_loss": 0.35670006275177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 34.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00022623277688097864, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 12010652.0, + "repeat_count": 0.0, + "routers_loss": 0.006375396624207497, + "skip_count": 2.0, + "step": 7448, + "text_loss": 0.24273613095283508 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0002259738311468466, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 12014042.0, + "repeat_count": 0.0, + "routers_loss": 0.003734540194272995, + "skip_count": 0.0, + "step": 7450, + "text_loss": 0.4262580871582031 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 34.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.0002257149904200592, + "loss": 0.0076, + "macro_f1": 1.0, + "num_tokens": 12016987.0, + "repeat_count": 1.0, + "routers_loss": 0.0027926203329116106, + "skip_count": 1.0, + "step": 7452, + "text_loss": 0.366216778755188 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 34.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.00022545625479980508, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 12021584.0, + "repeat_count": 0.0, + "routers_loss": 0.0008985420572571456, + "skip_count": 0.0, + "step": 7454, + "text_loss": 0.533937394618988 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00022519762438523205, + "loss": 0.0029, + "macro_f1": 0.6666666865348816, + "num_tokens": 12024142.0, + "repeat_count": 0.0, + "routers_loss": 0.005394646432250738, + "skip_count": 1.0, + "step": 7456, + "text_loss": 0.2401239275932312 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0002249390992754477, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 12027262.0, + "repeat_count": 0.0, + "routers_loss": 0.00275063537992537, + "skip_count": 0.0, + "step": 7458, + "text_loss": 0.21824975311756134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00022468067956951944, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 12030528.0, + "repeat_count": 0.0, + "routers_loss": 0.0008951274212449789, + "skip_count": 1.0, + "step": 7460, + "text_loss": 0.610903263092041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00022442236536647408, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 12033699.0, + "repeat_count": 0.0, + "routers_loss": 0.004062872380018234, + "skip_count": 2.0, + "step": 7462, + "text_loss": 0.26921433210372925 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00022416415676529823, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 12037402.0, + "repeat_count": 0.0, + "routers_loss": 0.0023089025635272264, + "skip_count": 1.0, + "step": 7464, + "text_loss": 0.4746153950691223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00022390605386493756, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12041129.0, + "repeat_count": 0.0, + "routers_loss": 0.0021355501376092434, + "skip_count": 2.0, + "step": 7466, + "text_loss": 0.4265538454055786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00022364805676429816, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 12044356.0, + "repeat_count": 0.0, + "routers_loss": 0.0061582159250974655, + "skip_count": 1.0, + "step": 7468, + "text_loss": 0.12020833045244217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00022339016556224467, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 12047158.0, + "repeat_count": 0.0, + "routers_loss": 0.003753372235223651, + "skip_count": 1.0, + "step": 7470, + "text_loss": 0.6406939625740051 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 35.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00022313238035760158, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 12050149.0, + "repeat_count": 1.0, + "routers_loss": 0.005371729377657175, + "skip_count": 5.0, + "step": 7472, + "text_loss": 0.5184400677680969 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.0002228747012491526, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12053560.0, + "repeat_count": 0.0, + "routers_loss": 0.000824139395263046, + "skip_count": 0.0, + "step": 7474, + "text_loss": 0.32644152641296387 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0002226171283356409, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12056309.0, + "repeat_count": 0.0, + "routers_loss": 0.0044801668263971806, + "skip_count": 1.0, + "step": 7476, + "text_loss": 0.7027081847190857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.00022235966171576887, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 12059191.0, + "repeat_count": 0.0, + "routers_loss": 0.007496353704482317, + "skip_count": 2.0, + "step": 7478, + "text_loss": 0.28705671429634094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0002221023014881982, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 12062365.0, + "repeat_count": 0.0, + "routers_loss": 0.0018641395727172494, + "skip_count": 1.0, + "step": 7480, + "text_loss": 0.715477466583252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.00022184504775154984, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12065508.0, + "repeat_count": 0.0, + "routers_loss": 0.0005825075786560774, + "skip_count": 0.0, + "step": 7482, + "text_loss": 0.7481293678283691 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00022158790060440394, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 12068043.0, + "repeat_count": 0.0, + "routers_loss": 0.0028906071092933416, + "skip_count": 0.0, + "step": 7484, + "text_loss": 0.6151962876319885 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00022133086014529968, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 12070897.0, + "repeat_count": 0.0, + "routers_loss": 0.0030862605199217796, + "skip_count": 1.0, + "step": 7486, + "text_loss": 0.4923575222492218 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00022107392647273527, + "loss": 0.009, + "macro_f1": 0.3333333432674408, + "num_tokens": 12074644.0, + "repeat_count": 0.0, + "routers_loss": 0.0011101154377683997, + "skip_count": 0.0, + "step": 7488, + "text_loss": 0.5217859148979187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00022081709968516867, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 12077718.0, + "repeat_count": 0.0, + "routers_loss": 0.004303969442844391, + "skip_count": 0.0, + "step": 7490, + "text_loss": 0.18933317065238953 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00022056037988101612, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12080509.0, + "repeat_count": 0.0, + "routers_loss": 0.0019941304344683886, + "skip_count": 1.0, + "step": 7492, + "text_loss": 0.6760565042495728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 0.00022030376715865313, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 12083580.0, + "repeat_count": 0.0, + "routers_loss": 0.0017090907786041498, + "skip_count": 0.0, + "step": 7494, + "text_loss": 0.4140956401824951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.0002200472616164142, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12086923.0, + "repeat_count": 0.0, + "routers_loss": 0.005131757352501154, + "skip_count": 1.0, + "step": 7496, + "text_loss": 0.43287888169288635 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00021979086335259269, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 12090003.0, + "repeat_count": 0.0, + "routers_loss": 0.0007472267607226968, + "skip_count": 0.0, + "step": 7498, + "text_loss": 0.6692602038383484 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00021953457246544095, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 12092936.0, + "repeat_count": 0.0, + "routers_loss": 0.0012374494690448046, + "skip_count": 0.0, + "step": 7500, + "text_loss": 0.5170100331306458 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00021927838905317016, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 12096395.0, + "repeat_count": 0.0, + "routers_loss": 0.006784295197576284, + "skip_count": 2.0, + "step": 7502, + "text_loss": 0.340880811214447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 0.00021902231321395017, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 12099743.0, + "repeat_count": 0.0, + "routers_loss": 0.0058755455538630486, + "skip_count": 1.0, + "step": 7504, + "text_loss": 0.5299809575080872 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00021876634504590985, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12103121.0, + "repeat_count": 0.0, + "routers_loss": 0.010622406378388405, + "skip_count": 2.0, + "step": 7506, + "text_loss": 0.1817338913679123 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 35.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00021851048464713662, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 12105883.0, + "repeat_count": 0.0, + "routers_loss": 0.004382388666272163, + "skip_count": 3.0, + "step": 7508, + "text_loss": 0.5718557834625244 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00021825473211567665, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 12108936.0, + "repeat_count": 0.0, + "routers_loss": 0.001638208981603384, + "skip_count": 0.0, + "step": 7510, + "text_loss": 0.4684678316116333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 0.00021799908754953468, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 12112060.0, + "repeat_count": 0.0, + "routers_loss": 0.0007894381997175515, + "skip_count": 2.0, + "step": 7512, + "text_loss": 0.5146099328994751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00021774355104667455, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 12115636.0, + "repeat_count": 0.0, + "routers_loss": 0.01400370616465807, + "skip_count": 2.0, + "step": 7514, + "text_loss": 0.19512294232845306 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 35.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00021748812270501805, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 12119116.0, + "repeat_count": 0.0, + "routers_loss": 0.005261222366243601, + "skip_count": 3.0, + "step": 7516, + "text_loss": 0.17316904664039612 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.0002172328026224459, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12122070.0, + "repeat_count": 0.0, + "routers_loss": 0.01021486520767212, + "skip_count": 2.0, + "step": 7518, + "text_loss": 0.2777172029018402 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00021697759089679713, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 12125386.0, + "repeat_count": 2.0, + "routers_loss": 0.005217147525399923, + "skip_count": 2.0, + "step": 7520, + "text_loss": 0.49744322896003723 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00021672248762586948, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 12128753.0, + "repeat_count": 0.0, + "routers_loss": 0.003868246916681528, + "skip_count": 0.0, + "step": 7522, + "text_loss": 0.4209211468696594 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 35.32403874376284, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00021646749290741895, + "loss": 0.009, + "macro_f1": 0.6598639488220215, + "num_tokens": 12132425.0, + "repeat_count": 1.0, + "routers_loss": 0.044205982238054276, + "skip_count": 3.0, + "step": 7524, + "text_loss": 0.4180344343185425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00021621260683916005, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 12135740.0, + "repeat_count": 0.0, + "routers_loss": 0.0032584366854280233, + "skip_count": 2.0, + "step": 7526, + "text_loss": 0.21219655871391296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.00021595782951876552, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 12139239.0, + "repeat_count": 0.0, + "routers_loss": 0.002418758114799857, + "skip_count": 2.0, + "step": 7528, + "text_loss": 0.40800613164901733 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0002157031610438665, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 12142572.0, + "repeat_count": 1.0, + "routers_loss": 0.005265383515506983, + "skip_count": 1.0, + "step": 7530, + "text_loss": 0.7539705634117126 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03515625, + "learning_rate": 0.0002154486015120525, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 12145737.0, + "repeat_count": 1.0, + "routers_loss": 0.006648020353168249, + "skip_count": 2.0, + "step": 7532, + "text_loss": 0.7824432253837585 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.371000880540066, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0002151941510208712, + "loss": 0.0049, + "macro_f1": 0.3272727429866791, + "num_tokens": 12149376.0, + "repeat_count": 1.0, + "routers_loss": 0.01692759431898594, + "skip_count": 0.0, + "step": 7534, + "text_loss": 0.4476291239261627 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0002149398096678283, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12152191.0, + "repeat_count": 1.0, + "routers_loss": 0.013883143663406372, + "skip_count": 0.0, + "step": 7536, + "text_loss": 0.14996720850467682 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.058837890625, + "learning_rate": 0.00021468557755038826, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 12155084.0, + "repeat_count": 2.0, + "routers_loss": 0.009390740655362606, + "skip_count": 2.0, + "step": 7538, + "text_loss": 0.23685340583324432 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0002144314547659731, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 12159366.0, + "repeat_count": 0.0, + "routers_loss": 0.0025363171007484198, + "skip_count": 0.0, + "step": 7540, + "text_loss": 0.6687407493591309 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.00021417744141196315, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12162545.0, + "repeat_count": 0.0, + "routers_loss": 0.004230613354593515, + "skip_count": 1.0, + "step": 7542, + "text_loss": 0.24885894358158112 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.01953125, + "learning_rate": 0.00021392353758569694, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 12165381.0, + "repeat_count": 1.0, + "routers_loss": 0.008058524690568447, + "skip_count": 0.0, + "step": 7544, + "text_loss": 0.15833988785743713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0002136697433844707, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 12168304.0, + "repeat_count": 0.0, + "routers_loss": 0.0018041770672425628, + "skip_count": 0.0, + "step": 7546, + "text_loss": 0.6046217083930969 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.00021341605890553894, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 12171040.0, + "repeat_count": 1.0, + "routers_loss": 0.008584463968873024, + "skip_count": 2.0, + "step": 7548, + "text_loss": 0.3001522719860077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.00021316248424611408, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 12174702.0, + "repeat_count": 0.0, + "routers_loss": 0.0010506469989195466, + "skip_count": 0.0, + "step": 7550, + "text_loss": 0.2998376488685608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0162353515625, + "learning_rate": 0.00021290901950336627, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 12178388.0, + "repeat_count": 0.0, + "routers_loss": 0.0012753128539770842, + "skip_count": 0.0, + "step": 7552, + "text_loss": 0.8125656843185425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 0.00021265566477442384, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 12181863.0, + "repeat_count": 0.0, + "routers_loss": 0.004343052394688129, + "skip_count": 2.0, + "step": 7554, + "text_loss": 0.14004671573638916 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00021240242015637268, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12185485.0, + "repeat_count": 1.0, + "routers_loss": 0.0005794052849523723, + "skip_count": 0.0, + "step": 7556, + "text_loss": 0.7116519808769226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.4837100088054, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.00021214928574625664, + "loss": 0.0063, + "macro_f1": 0.3272727429866791, + "num_tokens": 12188914.0, + "repeat_count": 1.0, + "routers_loss": 0.01066325418651104, + "skip_count": 0.0, + "step": 7558, + "text_loss": 0.4664429724216461 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.49310243616085, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00021189626164107718, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 12193042.0, + "repeat_count": 0.0, + "routers_loss": 0.0011769415577873588, + "skip_count": 0.0, + "step": 7560, + "text_loss": 0.672637403011322 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.00021164334793779388, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 12195675.0, + "repeat_count": 1.0, + "routers_loss": 0.008653911761939526, + "skip_count": 1.0, + "step": 7562, + "text_loss": 0.5301182866096497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00021139054473332357, + "loss": 0.0065, + "macro_f1": 0.3333333432674408, + "num_tokens": 12198638.0, + "repeat_count": 0.0, + "routers_loss": 0.0058176578022539616, + "skip_count": 0.0, + "step": 7564, + "text_loss": 0.1889677792787552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.000211137852124541, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 12202312.0, + "repeat_count": 0.0, + "routers_loss": 0.0004154018242843449, + "skip_count": 0.0, + "step": 7566, + "text_loss": 0.3610386848449707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00021088527020827848, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 12205112.0, + "repeat_count": 0.0, + "routers_loss": 0.0014722816413268447, + "skip_count": 0.0, + "step": 7568, + "text_loss": 0.15214823186397552 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.0002106327990813257, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 12208103.0, + "repeat_count": 0.0, + "routers_loss": 0.0015596678713336587, + "skip_count": 0.0, + "step": 7570, + "text_loss": 0.5034125447273254 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.00021038043884043022, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12211208.0, + "repeat_count": 1.0, + "routers_loss": 0.007482443004846573, + "skip_count": 0.0, + "step": 7572, + "text_loss": 0.6760116219520569 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00021012818958229696, + "loss": 0.0031, + "macro_f1": 0.6666666865348816, + "num_tokens": 12214463.0, + "repeat_count": 0.0, + "routers_loss": 0.003875598544254899, + "skip_count": 2.0, + "step": 7574, + "text_loss": 0.3278147876262665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.00020987605140358824, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12218199.0, + "repeat_count": 0.0, + "routers_loss": 0.007918627932667732, + "skip_count": 2.0, + "step": 7576, + "text_loss": 0.23850615322589874 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.00020962402440092388, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12221151.0, + "repeat_count": 0.0, + "routers_loss": 0.005424308590590954, + "skip_count": 1.0, + "step": 7578, + "text_loss": 0.5670642256736755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0002093721086708812, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 12224789.0, + "repeat_count": 1.0, + "routers_loss": 0.0066504343412816525, + "skip_count": 1.0, + "step": 7580, + "text_loss": 0.30404478311538696 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00020912030430999452, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 12228134.0, + "repeat_count": 1.0, + "routers_loss": 0.008815597742795944, + "skip_count": 0.0, + "step": 7582, + "text_loss": 0.32522889971733093 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 35.60581156442618, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.05126953125, + "learning_rate": 0.0002088686114147561, + "loss": 0.0098, + "macro_f1": 0.5492662787437439, + "num_tokens": 12231335.0, + "repeat_count": 0.0, + "routers_loss": 0.03785836696624756, + "skip_count": 2.0, + "step": 7584, + "text_loss": 0.6277920603752136 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00020861703008161504, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 12234619.0, + "repeat_count": 0.0, + "routers_loss": 0.0016183801926672459, + "skip_count": 0.0, + "step": 7586, + "text_loss": 0.38319316506385803 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.00020836556040697767, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 12237296.0, + "repeat_count": 1.0, + "routers_loss": 0.013077575713396072, + "skip_count": 1.0, + "step": 7588, + "text_loss": 0.297571063041687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00020811420248720769, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 12240633.0, + "repeat_count": 0.0, + "routers_loss": 0.002858756808564067, + "skip_count": 0.0, + "step": 7590, + "text_loss": 0.2506035268306732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.000207862956418626, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12244118.0, + "repeat_count": 0.0, + "routers_loss": 0.0032624071463942528, + "skip_count": 1.0, + "step": 7592, + "text_loss": 0.19843827188014984 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.056640625, + "learning_rate": 0.00020761182229751045, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 12247367.0, + "repeat_count": 1.0, + "routers_loss": 0.005885142367333174, + "skip_count": 3.0, + "step": 7594, + "text_loss": 0.3347153067588806 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 35.66216612855885, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00020736080022009602, + "loss": 0.0088, + "macro_f1": 0.9452888369560242, + "num_tokens": 12250487.0, + "repeat_count": 1.0, + "routers_loss": 0.021491389721632004, + "skip_count": 4.0, + "step": 7596, + "text_loss": 0.6777212619781494 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 35.671558555914295, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04052734375, + "learning_rate": 0.00020710989028257514, + "loss": 0.0061, + "macro_f1": 0.6595745086669922, + "num_tokens": 12253834.0, + "repeat_count": 1.0, + "routers_loss": 0.014164486899971962, + "skip_count": 4.0, + "step": 7598, + "text_loss": 0.741127610206604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0002068590925810968, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 12257289.0, + "repeat_count": 0.0, + "routers_loss": 0.0012773120542988181, + "skip_count": 0.0, + "step": 7600, + "text_loss": 0.5336982607841492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.0002066084072117672, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 12260825.0, + "repeat_count": 0.0, + "routers_loss": 0.013102042488753796, + "skip_count": 2.0, + "step": 7602, + "text_loss": 0.30410775542259216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00020635783427064942, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12264609.0, + "repeat_count": 0.0, + "routers_loss": 0.002602101070806384, + "skip_count": 0.0, + "step": 7604, + "text_loss": 0.29835572838783264 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00020610737385376348, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12267537.0, + "repeat_count": 0.0, + "routers_loss": 0.0053265830501914024, + "skip_count": 0.0, + "step": 7606, + "text_loss": 0.2095658779144287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00020585702605708628, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 12271175.0, + "repeat_count": 0.0, + "routers_loss": 0.000614096992649138, + "skip_count": 0.0, + "step": 7608, + "text_loss": 0.8146751523017883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00020560679097655137, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12274067.0, + "repeat_count": 0.0, + "routers_loss": 0.0013201923575252295, + "skip_count": 0.0, + "step": 7610, + "text_loss": 0.40818271040916443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.0002053566687080497, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 12276946.0, + "repeat_count": 0.0, + "routers_loss": 0.004304401110857725, + "skip_count": 1.0, + "step": 7612, + "text_loss": 0.7063660025596619 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0002051066593474284, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 12279760.0, + "repeat_count": 0.0, + "routers_loss": 0.0032060579396784306, + "skip_count": 1.0, + "step": 7614, + "text_loss": 0.23671887814998627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00020485676299049154, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 12282737.0, + "repeat_count": 0.0, + "routers_loss": 0.005103024188429117, + "skip_count": 2.0, + "step": 7616, + "text_loss": 0.17571020126342773 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00020460697973299986, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 12286290.0, + "repeat_count": 1.0, + "routers_loss": 0.007189507596194744, + "skip_count": 1.0, + "step": 7618, + "text_loss": 0.30872994661331177 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0002043573096706708, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 12289458.0, + "repeat_count": 0.0, + "routers_loss": 0.0010217712260782719, + "skip_count": 0.0, + "step": 7620, + "text_loss": 0.5155487060546875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.0002041077528991784, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 12292846.0, + "repeat_count": 0.0, + "routers_loss": 0.0022399788722395897, + "skip_count": 1.0, + "step": 7622, + "text_loss": 0.717949390411377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0002038583095141532, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 12295673.0, + "repeat_count": 0.0, + "routers_loss": 0.0018168877577409148, + "skip_count": 0.0, + "step": 7624, + "text_loss": 0.560361385345459 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0279541015625, + "learning_rate": 0.00020360897961118246, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 12298624.0, + "repeat_count": 0.0, + "routers_loss": 0.0008487844606861472, + "skip_count": 0.0, + "step": 7626, + "text_loss": 0.6391524076461792 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00020335976328580984, + "loss": 0.0083, + "macro_f1": 0.3333333432674408, + "num_tokens": 12302136.0, + "repeat_count": 0.0, + "routers_loss": 0.0006127831293269992, + "skip_count": 0.0, + "step": 7628, + "text_loss": 0.5932226777076721 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.07373046875, + "learning_rate": 0.00020311066063353556, + "loss": 0.0085, + "macro_f1": 0.3333333432674408, + "num_tokens": 12305152.0, + "repeat_count": 0.0, + "routers_loss": 0.0018765819258987904, + "skip_count": 0.0, + "step": 7630, + "text_loss": 0.37831631302833557 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00020286167174981618, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 12307771.0, + "repeat_count": 0.0, + "routers_loss": 0.0025384656619280577, + "skip_count": 0.0, + "step": 7632, + "text_loss": 0.34806445240974426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0002026127967300645, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12310921.0, + "repeat_count": 0.0, + "routers_loss": 0.008239032700657845, + "skip_count": 2.0, + "step": 7634, + "text_loss": 0.34859901666641235 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00020236403566965027, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12314200.0, + "repeat_count": 0.0, + "routers_loss": 0.0029505928978323936, + "skip_count": 2.0, + "step": 7636, + "text_loss": 0.2647531032562256 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 35.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.0002021153886638991, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 12319221.0, + "repeat_count": 1.0, + "routers_loss": 0.0014016951899975538, + "skip_count": 0.0, + "step": 7638, + "text_loss": 0.42428603768348694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 35.86879953037863, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.04248046875, + "learning_rate": 0.00020186685580809288, + "loss": 0.0059, + "macro_f1": 0.5492662787437439, + "num_tokens": 12322204.0, + "repeat_count": 0.0, + "routers_loss": 0.01761031709611416, + "skip_count": 2.0, + "step": 7640, + "text_loss": 0.25929757952690125 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.00020161843719746997, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 12324750.0, + "repeat_count": 0.0, + "routers_loss": 0.0023674629628658295, + "skip_count": 0.0, + "step": 7642, + "text_loss": 0.567159116268158 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0002013701329272248, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 12327933.0, + "repeat_count": 0.0, + "routers_loss": 0.004534341394901276, + "skip_count": 0.0, + "step": 7644, + "text_loss": 0.4765215516090393 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00020112194309250797, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12330847.0, + "repeat_count": 0.0, + "routers_loss": 0.003144246758893132, + "skip_count": 2.0, + "step": 7646, + "text_loss": 0.39837369322776794 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.00020087386778842642, + "loss": 0.0046, + "macro_f1": 1.0, + "num_tokens": 12333782.0, + "repeat_count": 1.0, + "routers_loss": 0.008137194439768791, + "skip_count": 1.0, + "step": 7648, + "text_loss": 0.42175763845443726 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00020062590711004296, + "loss": 0.0034, + "macro_f1": 1.0, + "num_tokens": 12336837.0, + "repeat_count": 1.0, + "routers_loss": 0.006499455776065588, + "skip_count": 1.0, + "step": 7650, + "text_loss": 0.18695278465747833 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0498046875, + "learning_rate": 0.00020037806115237667, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12340414.0, + "repeat_count": 0.0, + "routers_loss": 0.001548365456983447, + "skip_count": 0.0, + "step": 7652, + "text_loss": 0.1981094628572464 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 35.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00020013033001040255, + "loss": 0.0072, + "macro_f1": 0.6666666865348816, + "num_tokens": 12343209.0, + "repeat_count": 0.0, + "routers_loss": 0.008136926218867302, + "skip_count": 2.0, + "step": 7654, + "text_loss": 0.2231602668762207 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00019988271377905165, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12346158.0, + "repeat_count": 0.0, + "routers_loss": 0.00370375020429492, + "skip_count": 1.0, + "step": 7656, + "text_loss": 0.4809921383857727 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 35.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00019963521255321077, + "loss": 0.0069, + "macro_f1": 0.6666666865348816, + "num_tokens": 12349279.0, + "repeat_count": 0.0, + "routers_loss": 0.00690054427832365, + "skip_count": 3.0, + "step": 7658, + "text_loss": 0.40473970770835876 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 35.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.0001993878264277233, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 12352848.0, + "repeat_count": 1.0, + "routers_loss": 0.004367961548268795, + "skip_count": 1.0, + "step": 7660, + "text_loss": 0.3646799921989441 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049560546875, + "learning_rate": 0.00019914055549738775, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 12356737.0, + "repeat_count": 0.0, + "routers_loss": 0.000662159756757319, + "skip_count": 0.0, + "step": 7662, + "text_loss": 0.3703214228153229 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 35.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0001988933998569589, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 12360085.0, + "repeat_count": 0.0, + "routers_loss": 0.0023262565955519676, + "skip_count": 0.0, + "step": 7664, + "text_loss": 0.12910836935043335 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 35.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.0001986463596011473, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12363296.0, + "repeat_count": 0.0, + "routers_loss": 0.002686078194528818, + "skip_count": 1.0, + "step": 7666, + "text_loss": 0.39628392457962036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00019839943482461914, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 12366072.0, + "repeat_count": 0.0, + "routers_loss": 0.007100159768015146, + "skip_count": 1.0, + "step": 7668, + "text_loss": 0.6588287949562073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00019815262562199648, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12368940.0, + "repeat_count": 0.0, + "routers_loss": 0.004194926470518112, + "skip_count": 0.0, + "step": 7670, + "text_loss": 0.36411619186401367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00019790593208785713, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12372031.0, + "repeat_count": 0.0, + "routers_loss": 0.0041313013061881065, + "skip_count": 0.0, + "step": 7672, + "text_loss": 0.23270413279533386 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019765935431673444, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12375115.0, + "repeat_count": 1.0, + "routers_loss": 0.003343774238601327, + "skip_count": 0.0, + "step": 7674, + "text_loss": 0.1686355322599411 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.75, + "avg_layers": 25.0, + "epoch": 36.03756970942178, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.038330078125, + "learning_rate": 0.00019741289240311755, + "loss": 0.0058, + "macro_f1": 0.6122449040412903, + "num_tokens": 12379089.0, + "repeat_count": 0.0, + "routers_loss": 0.021328814327716827, + "skip_count": 4.0, + "step": 7676, + "text_loss": 0.9312577247619629 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00019716654644145104, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12383115.0, + "repeat_count": 0.0, + "routers_loss": 0.0004511173174250871, + "skip_count": 0.0, + "step": 7678, + "text_loss": 0.3305695056915283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.050048828125, + "learning_rate": 0.00019692031652613522, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 12386064.0, + "repeat_count": 0.0, + "routers_loss": 0.006190002430230379, + "skip_count": 0.0, + "step": 7680, + "text_loss": 0.4829687178134918 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 36.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00019667420275152575, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 12389743.0, + "repeat_count": 2.0, + "routers_loss": 0.004575030412524939, + "skip_count": 1.0, + "step": 7682, + "text_loss": 0.5751548409461975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.0001964282052119341, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12392481.0, + "repeat_count": 0.0, + "routers_loss": 0.002718796720728278, + "skip_count": 0.0, + "step": 7684, + "text_loss": 0.5349925756454468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.0001961823240016269, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 12395207.0, + "repeat_count": 0.0, + "routers_loss": 0.0027528523933142424, + "skip_count": 0.0, + "step": 7686, + "text_loss": 0.5322592258453369 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00019593655921482624, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12398232.0, + "repeat_count": 1.0, + "routers_loss": 0.008105970919132233, + "skip_count": 0.0, + "step": 7688, + "text_loss": 0.3192061185836792 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.10331670090989, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00019569091094570967, + "loss": 0.0069, + "macro_f1": 0.6603773832321167, + "num_tokens": 12400862.0, + "repeat_count": 1.0, + "routers_loss": 0.024075545370578766, + "skip_count": 1.0, + "step": 7690, + "text_loss": 0.3189752697944641 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 36.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.0001954453792884101, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 12404039.0, + "repeat_count": 0.0, + "routers_loss": 0.007513802964240313, + "skip_count": 3.0, + "step": 7692, + "text_loss": 0.5985093712806702 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.0001951999643370157, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 12407085.0, + "repeat_count": 1.0, + "routers_loss": 0.009606506675481796, + "skip_count": 2.0, + "step": 7694, + "text_loss": 0.2050790935754776 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00019495466618556996, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 12411377.0, + "repeat_count": 0.0, + "routers_loss": 0.0007978329667821527, + "skip_count": 0.0, + "step": 7696, + "text_loss": 0.4705570638179779 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00019470948492807154, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 12414427.0, + "repeat_count": 0.0, + "routers_loss": 0.0010737364646047354, + "skip_count": 0.0, + "step": 7698, + "text_loss": 0.6105324029922485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04296875, + "learning_rate": 0.00019446442065847448, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12417442.0, + "repeat_count": 0.0, + "routers_loss": 0.001762967323884368, + "skip_count": 0.0, + "step": 7700, + "text_loss": 0.5638618469238281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00019421947347068774, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12420862.0, + "repeat_count": 0.0, + "routers_loss": 0.0015798417152836919, + "skip_count": 0.0, + "step": 7702, + "text_loss": 0.1939864307641983 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00019397464345857562, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 12423876.0, + "repeat_count": 0.0, + "routers_loss": 0.005659835878759623, + "skip_count": 1.0, + "step": 7704, + "text_loss": 0.20829300582408905 + }, + { + "acc_repeat": 0.75, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 36.17845611975345, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.8571428656578064, + "f1_skip": 1.0, + "grad_norm": 0.052001953125, + "learning_rate": 0.00019372993071595723, + "loss": 0.0072, + "macro_f1": 0.9449735879898071, + "num_tokens": 12427639.0, + "repeat_count": 4.0, + "routers_loss": 0.018665846437215805, + "skip_count": 2.0, + "step": 7706, + "text_loss": 0.47913849353790283 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00019348533533660727, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 12431520.0, + "repeat_count": 0.0, + "routers_loss": 0.0006690093432553113, + "skip_count": 0.0, + "step": 7708, + "text_loss": 0.494870662689209 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00019324085741425511, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 12434213.0, + "repeat_count": 0.0, + "routers_loss": 0.004067352041602135, + "skip_count": 1.0, + "step": 7710, + "text_loss": 0.7631711959838867 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 0.00019299649704258504, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 12437437.0, + "repeat_count": 2.0, + "routers_loss": 0.01157623715698719, + "skip_count": 0.0, + "step": 7712, + "text_loss": 0.3145926296710968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04345703125, + "learning_rate": 0.0001927522543152364, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 12440507.0, + "repeat_count": 0.0, + "routers_loss": 0.001888492377474904, + "skip_count": 0.0, + "step": 7714, + "text_loss": 0.576301097869873 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019250812932580352, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 12443484.0, + "repeat_count": 0.0, + "routers_loss": 0.00042988534551113844, + "skip_count": 0.0, + "step": 7716, + "text_loss": 0.5716445446014404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00019226412216783557, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 12446460.0, + "repeat_count": 0.0, + "routers_loss": 0.005063199903815985, + "skip_count": 1.0, + "step": 7718, + "text_loss": 0.2700924873352051 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0001920202329348365, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 12449346.0, + "repeat_count": 0.0, + "routers_loss": 0.0010775640839710832, + "skip_count": 0.0, + "step": 7720, + "text_loss": 0.5162558555603027 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00019177646172026513, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12452680.0, + "repeat_count": 0.0, + "routers_loss": 0.0014514096546918154, + "skip_count": 0.0, + "step": 7722, + "text_loss": 0.5753642916679382 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.00019153280861753497, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12455348.0, + "repeat_count": 0.0, + "routers_loss": 0.002202774863690138, + "skip_count": 1.0, + "step": 7724, + "text_loss": 0.5751997232437134 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00019128927372001454, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 12458098.0, + "repeat_count": 0.0, + "routers_loss": 0.005171069409698248, + "skip_count": 0.0, + "step": 7726, + "text_loss": 0.22252975404262543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00019104585712102678, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 12460958.0, + "repeat_count": 0.0, + "routers_loss": 0.0041033923625946045, + "skip_count": 0.0, + "step": 7728, + "text_loss": 0.18611937761306763 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00019080255891384945, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 12463596.0, + "repeat_count": 1.0, + "routers_loss": 0.0012201941572129726, + "skip_count": 0.0, + "step": 7730, + "text_loss": 0.47347909212112427 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 36.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0001905593791917148, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 12467021.0, + "repeat_count": 2.0, + "routers_loss": 0.005837214644998312, + "skip_count": 2.0, + "step": 7732, + "text_loss": 0.2055564969778061 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.00019031631804780974, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 12469743.0, + "repeat_count": 0.0, + "routers_loss": 0.0010269953636452556, + "skip_count": 0.0, + "step": 7734, + "text_loss": 0.45995602011680603 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00019007337557527582, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 12473082.0, + "repeat_count": 0.0, + "routers_loss": 0.00436213007196784, + "skip_count": 1.0, + "step": 7736, + "text_loss": 0.4515823721885681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00018983055186720888, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 12476100.0, + "repeat_count": 0.0, + "routers_loss": 0.003051829058676958, + "skip_count": 2.0, + "step": 7738, + "text_loss": 0.12298467755317688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.0001895878470166597, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 12480231.0, + "repeat_count": 0.0, + "routers_loss": 0.008164191618561745, + "skip_count": 2.0, + "step": 7740, + "text_loss": 0.17456457018852234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.347519812151454, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.046630859375, + "learning_rate": 0.00018934526111663314, + "loss": 0.0069, + "macro_f1": 0.3272727429866791, + "num_tokens": 12483894.0, + "repeat_count": 0.0, + "routers_loss": 0.008653721772134304, + "skip_count": 1.0, + "step": 7742, + "text_loss": 0.7125775814056396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 36.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.00018910279426008857, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 12488077.0, + "repeat_count": 0.0, + "routers_loss": 0.005024447571486235, + "skip_count": 6.0, + "step": 7744, + "text_loss": 0.833778977394104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.00018886044653993966, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 12490999.0, + "repeat_count": 0.0, + "routers_loss": 0.002690888475626707, + "skip_count": 0.0, + "step": 7746, + "text_loss": 0.15594039857387543 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00018861821804905466, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 12494765.0, + "repeat_count": 0.0, + "routers_loss": 0.006087568122893572, + "skip_count": 0.0, + "step": 7748, + "text_loss": 0.2696777880191803 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00018837610888025586, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12497741.0, + "repeat_count": 0.0, + "routers_loss": 0.0014629303477704525, + "skip_count": 0.0, + "step": 7750, + "text_loss": 0.6801294684410095 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.11865234375, + "learning_rate": 0.00018813411912631996, + "loss": 0.0073, + "macro_f1": 0.3333333432674408, + "num_tokens": 12500585.0, + "repeat_count": 0.0, + "routers_loss": 0.001163579523563385, + "skip_count": 0.0, + "step": 7752, + "text_loss": 0.41069695353507996 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00018789224887997796, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12503579.0, + "repeat_count": 2.0, + "routers_loss": 0.009436148218810558, + "skip_count": 0.0, + "step": 7754, + "text_loss": 0.6993107795715332 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00018765049823391472, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 12506698.0, + "repeat_count": 1.0, + "routers_loss": 0.002098206663504243, + "skip_count": 2.0, + "step": 7756, + "text_loss": 0.5704247951507568 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018740886728077, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 12509869.0, + "repeat_count": 0.0, + "routers_loss": 0.002066673245280981, + "skip_count": 1.0, + "step": 7758, + "text_loss": 0.7605635523796082 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.00018716735611313707, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 12513433.0, + "repeat_count": 0.0, + "routers_loss": 0.0023439819924533367, + "skip_count": 1.0, + "step": 7760, + "text_loss": 0.4746153950691223 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.441444085705896, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.00018692596482356333, + "loss": 0.0057, + "macro_f1": 0.9255813956260681, + "num_tokens": 12516817.0, + "repeat_count": 3.0, + "routers_loss": 0.039019811898469925, + "skip_count": 4.0, + "step": 7762, + "text_loss": 0.3105330467224121 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00018668469350455048, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12519357.0, + "repeat_count": 0.0, + "routers_loss": 0.002269966993480921, + "skip_count": 0.0, + "step": 7764, + "text_loss": 0.3700210452079773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00018644354224855414, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 12522072.0, + "repeat_count": 0.0, + "routers_loss": 0.001265842467546463, + "skip_count": 0.0, + "step": 7766, + "text_loss": 0.6737633943557739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00018620251114798386, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 12524999.0, + "repeat_count": 0.0, + "routers_loss": 0.006547329016029835, + "skip_count": 1.0, + "step": 7768, + "text_loss": 0.24906545877456665 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 0.0001859616002952033, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 12527785.0, + "repeat_count": 2.0, + "routers_loss": 0.010791841894388199, + "skip_count": 3.0, + "step": 7770, + "text_loss": 0.3069820702075958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.0001857208097825299, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12530801.0, + "repeat_count": 0.0, + "routers_loss": 0.00492103723809123, + "skip_count": 2.0, + "step": 7772, + "text_loss": 0.2524295151233673 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.0001854801397022351, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12533919.0, + "repeat_count": 0.0, + "routers_loss": 0.001942967064678669, + "skip_count": 0.0, + "step": 7774, + "text_loss": 0.7855241894721985 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 36.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00018523959014654407, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 12537265.0, + "repeat_count": 2.0, + "routers_loss": 0.00987488217651844, + "skip_count": 2.0, + "step": 7776, + "text_loss": 0.2767317593097687 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041748046875, + "learning_rate": 0.00018499916120763582, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12539695.0, + "repeat_count": 0.0, + "routers_loss": 0.0054283770732581615, + "skip_count": 1.0, + "step": 7778, + "text_loss": 0.43287888169288635 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00018475885297764306, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 12542881.0, + "repeat_count": 2.0, + "routers_loss": 0.00797359924763441, + "skip_count": 0.0, + "step": 7780, + "text_loss": 0.3738224506378174 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0001845186655486527, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 12546530.0, + "repeat_count": 0.0, + "routers_loss": 0.0045951665379107, + "skip_count": 0.0, + "step": 7782, + "text_loss": 0.2511517107486725 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 36.54476078661579, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00018427859901270482, + "loss": 0.0055, + "macro_f1": 0.9452888369560242, + "num_tokens": 12549439.0, + "repeat_count": 1.0, + "routers_loss": 0.02312052994966507, + "skip_count": 4.0, + "step": 7784, + "text_loss": 0.3837030827999115 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 27.0, + "epoch": 36.55415321397123, + "f1_execute": 0.9795917868614197, + "f1_repeat": 1.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.059814453125, + "learning_rate": 0.00018403865346179344, + "loss": 0.0066, + "macro_f1": 0.9265305995941162, + "num_tokens": 12553211.0, + "repeat_count": 1.0, + "routers_loss": 0.014698561280965805, + "skip_count": 3.0, + "step": 7786, + "text_loss": 0.510159432888031 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 36.563545641326684, + "f1_execute": 0.9743589162826538, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0390625, + "learning_rate": 0.00018379882898786603, + "loss": 0.0075, + "macro_f1": 0.8803418874740601, + "num_tokens": 12556497.0, + "repeat_count": 2.0, + "routers_loss": 0.023926246911287308, + "skip_count": 7.0, + "step": 7788, + "text_loss": 0.44811317324638367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00018355912568282384, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 12559778.0, + "repeat_count": 0.0, + "routers_loss": 0.0011187797645106912, + "skip_count": 0.0, + "step": 7790, + "text_loss": 0.32099616527557373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00018331954363852166, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 12562610.0, + "repeat_count": 0.0, + "routers_loss": 0.0005356677575036883, + "skip_count": 0.0, + "step": 7792, + "text_loss": 0.9754356145858765 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0001830800829467677, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 12565886.0, + "repeat_count": 2.0, + "routers_loss": 0.0017101728590205312, + "skip_count": 0.0, + "step": 7794, + "text_loss": 0.4234761595726013 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.00018284074369932386, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12568728.0, + "repeat_count": 0.0, + "routers_loss": 0.0012841494753956795, + "skip_count": 0.0, + "step": 7796, + "text_loss": 0.41109147667884827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0001826015259879053, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 12572231.0, + "repeat_count": 0.0, + "routers_loss": 0.0022388407960534096, + "skip_count": 0.0, + "step": 7798, + "text_loss": 0.5459926128387451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02734375, + "learning_rate": 0.00018236242990418074, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 12574968.0, + "repeat_count": 0.0, + "routers_loss": 0.0019992550369352102, + "skip_count": 0.0, + "step": 7800, + "text_loss": 0.5028481483459473 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.0001821234555397722, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12579074.0, + "repeat_count": 0.0, + "routers_loss": 0.002936388598755002, + "skip_count": 2.0, + "step": 7802, + "text_loss": 0.2377086579799652 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00018188460298625503, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12581912.0, + "repeat_count": 1.0, + "routers_loss": 0.0026762608904391527, + "skip_count": 0.0, + "step": 7804, + "text_loss": 0.13887254893779755 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 30.0, + "epoch": 36.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00018164587233515824, + "loss": 0.0057, + "macro_f1": 1.0, + "num_tokens": 12585020.0, + "repeat_count": 3.0, + "routers_loss": 0.003901638789102435, + "skip_count": 1.0, + "step": 7806, + "text_loss": 0.35454171895980835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00018140726367796373, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 12588310.0, + "repeat_count": 0.0, + "routers_loss": 0.0031358697451651096, + "skip_count": 2.0, + "step": 7808, + "text_loss": 0.3567306697368622 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00018116877710610673, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12591735.0, + "repeat_count": 0.0, + "routers_loss": 0.002310588024556637, + "skip_count": 1.0, + "step": 7810, + "text_loss": 0.45357072353363037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00018093041271097582, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12595232.0, + "repeat_count": 0.0, + "routers_loss": 0.005600228440016508, + "skip_count": 2.0, + "step": 7812, + "text_loss": 0.4179847836494446 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.685647196947464, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00018069217058391267, + "loss": 0.006, + "macro_f1": 0.6603773832321167, + "num_tokens": 12598367.0, + "repeat_count": 1.0, + "routers_loss": 0.04015933722257614, + "skip_count": 1.0, + "step": 7814, + "text_loss": 0.17874565720558167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.00018045405081621214, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 12601864.0, + "repeat_count": 0.0, + "routers_loss": 0.005119446665048599, + "skip_count": 1.0, + "step": 7816, + "text_loss": 0.6867854595184326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00018021605349912207, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 12605268.0, + "repeat_count": 0.0, + "routers_loss": 0.0005990012432448566, + "skip_count": 0.0, + "step": 7818, + "text_loss": 0.9084970355033875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00017997817872384358, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 12608093.0, + "repeat_count": 0.0, + "routers_loss": 0.008712377399206161, + "skip_count": 1.0, + "step": 7820, + "text_loss": 0.19413328170776367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00017974042658153066, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 12611001.0, + "repeat_count": 0.0, + "routers_loss": 0.007535711396485567, + "skip_count": 1.0, + "step": 7822, + "text_loss": 0.2672932744026184 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0001795027971632905, + "loss": 0.0042, + "macro_f1": 1.0, + "num_tokens": 12614584.0, + "repeat_count": 1.0, + "routers_loss": 0.006770546548068523, + "skip_count": 3.0, + "step": 7824, + "text_loss": 0.22805163264274597 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00017926529056018297, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 12617519.0, + "repeat_count": 0.0, + "routers_loss": 0.0010458873584866524, + "skip_count": 0.0, + "step": 7826, + "text_loss": 0.385499507188797 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 0.00017902790686322102, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 12621566.0, + "repeat_count": 1.0, + "routers_loss": 0.00634258147329092, + "skip_count": 0.0, + "step": 7828, + "text_loss": 0.8044118285179138 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 36.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00017879064616337076, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 12624751.0, + "repeat_count": 0.0, + "routers_loss": 0.0053052278235554695, + "skip_count": 3.0, + "step": 7830, + "text_loss": 0.264322966337204 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.00017855350855155088, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 12628478.0, + "repeat_count": 0.0, + "routers_loss": 0.0028291696216911077, + "skip_count": 0.0, + "step": 7832, + "text_loss": 0.20611460506916046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 36.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00017831649411863287, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 12632027.0, + "repeat_count": 0.0, + "routers_loss": 0.0009586421074345708, + "skip_count": 1.0, + "step": 7834, + "text_loss": 0.4119716286659241 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00017807960295544118, + "loss": 0.0071, + "macro_f1": 0.6666666865348816, + "num_tokens": 12635144.0, + "repeat_count": 0.0, + "routers_loss": 0.012304541654884815, + "skip_count": 2.0, + "step": 7836, + "text_loss": 0.28647977113723755 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0001778428351527529, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 12638719.0, + "repeat_count": 0.0, + "routers_loss": 0.005212076939642429, + "skip_count": 2.0, + "step": 7838, + "text_loss": 0.630459189414978 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.0001776061908012979, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 12642119.0, + "repeat_count": 0.0, + "routers_loss": 0.00183707510586828, + "skip_count": 0.0, + "step": 7840, + "text_loss": 0.5905961990356445 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 36.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.0001773696699917588, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 12645077.0, + "repeat_count": 1.0, + "routers_loss": 0.0058263009414076805, + "skip_count": 0.0, + "step": 7842, + "text_loss": 0.41949576139450073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00017713327281477077, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12648964.0, + "repeat_count": 0.0, + "routers_loss": 0.001586507773026824, + "skip_count": 0.0, + "step": 7844, + "text_loss": 0.5048848390579224 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00017689699936092163, + "loss": 0.0076, + "macro_f1": 0.3333333432674408, + "num_tokens": 12651934.0, + "repeat_count": 0.0, + "routers_loss": 0.002397194504737854, + "skip_count": 0.0, + "step": 7846, + "text_loss": 0.23879878222942352 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 36.84531846199002, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0001766608497207518, + "loss": 0.0054, + "macro_f1": 0.5492662787437439, + "num_tokens": 12654907.0, + "repeat_count": 0.0, + "routers_loss": 0.016742069274187088, + "skip_count": 2.0, + "step": 7848, + "text_loss": 0.23400072753429413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0001764248239847544, + "loss": 0.0085, + "macro_f1": 0.6666666865348816, + "num_tokens": 12658765.0, + "repeat_count": 0.0, + "routers_loss": 0.007037387229502201, + "skip_count": 2.0, + "step": 7850, + "text_loss": 0.26165497303009033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 36.86410331670091, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.017822265625, + "learning_rate": 0.00017618892224337463, + "loss": 0.0044, + "macro_f1": 0.5492662787437439, + "num_tokens": 12662024.0, + "repeat_count": 0.0, + "routers_loss": 0.017352160066366196, + "skip_count": 2.0, + "step": 7852, + "text_loss": 0.23813043534755707 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 36.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017595314458701084, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 12665751.0, + "repeat_count": 0.0, + "routers_loss": 0.005349365528672934, + "skip_count": 3.0, + "step": 7854, + "text_loss": 0.14920757710933685 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00017571749110601337, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 12668823.0, + "repeat_count": 0.0, + "routers_loss": 0.0037689812015742064, + "skip_count": 2.0, + "step": 7856, + "text_loss": 0.2198697030544281 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00017548196189068506, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12672367.0, + "repeat_count": 0.0, + "routers_loss": 0.0006363615393638611, + "skip_count": 0.0, + "step": 7858, + "text_loss": 0.5338839888572693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00017524655703128112, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 12675217.0, + "repeat_count": 0.0, + "routers_loss": 0.002691479865461588, + "skip_count": 0.0, + "step": 7860, + "text_loss": 0.17463763058185577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00017501127661800908, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12678796.0, + "repeat_count": 0.0, + "routers_loss": 0.002262329449877143, + "skip_count": 0.0, + "step": 7862, + "text_loss": 0.4637797474861145 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03564453125, + "learning_rate": 0.00017477612074102899, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12681631.0, + "repeat_count": 0.0, + "routers_loss": 0.00115531450137496, + "skip_count": 0.0, + "step": 7864, + "text_loss": 0.6089238524436951 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00017454108949045295, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 12685647.0, + "repeat_count": 0.0, + "routers_loss": 0.00260268640704453, + "skip_count": 0.0, + "step": 7866, + "text_loss": 0.5876018404960632 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.93924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.00017430618295634514, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 12688995.0, + "repeat_count": 0.0, + "routers_loss": 0.002731681102886796, + "skip_count": 0.0, + "step": 7868, + "text_loss": 0.35076001286506653 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 36.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00017407140122872262, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 12692100.0, + "repeat_count": 1.0, + "routers_loss": 0.003314645728096366, + "skip_count": 1.0, + "step": 7870, + "text_loss": 0.5313478112220764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.958027590255355, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017383674439755393, + "loss": 0.0069, + "macro_f1": 0.3272727429866791, + "num_tokens": 12695117.0, + "repeat_count": 0.0, + "routers_loss": 0.010385016910731792, + "skip_count": 1.0, + "step": 7872, + "text_loss": 0.5092368125915527 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00017360221255276016, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 12697678.0, + "repeat_count": 0.0, + "routers_loss": 0.001273582922294736, + "skip_count": 0.0, + "step": 7874, + "text_loss": 0.5282881855964661 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 36.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.00017336780578421418, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 12702132.0, + "repeat_count": 0.0, + "routers_loss": 0.0007510313298553228, + "skip_count": 0.0, + "step": 7876, + "text_loss": 0.49093571305274963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 36.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001731335241817412, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 12705413.0, + "repeat_count": 0.0, + "routers_loss": 0.005138787440955639, + "skip_count": 2.0, + "step": 7878, + "text_loss": 0.7503541111946106 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 36.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001728993678351184, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 12708310.0, + "repeat_count": 2.0, + "routers_loss": 0.004379773512482643, + "skip_count": 0.0, + "step": 7880, + "text_loss": 0.5942456126213074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0400390625, + "learning_rate": 0.0001726653368340747, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 12711043.0, + "repeat_count": 0.0, + "routers_loss": 0.005271450616419315, + "skip_count": 2.0, + "step": 7882, + "text_loss": 0.348360538482666 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00017243143126829163, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 12714473.0, + "repeat_count": 1.0, + "routers_loss": 0.0015764752170071006, + "skip_count": 1.0, + "step": 7884, + "text_loss": 0.45971861481666565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.000172197651227402, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 12717832.0, + "repeat_count": 0.0, + "routers_loss": 0.00040649910806678236, + "skip_count": 0.0, + "step": 7886, + "text_loss": 0.5996841788291931 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 0.00017196399680099078, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 12720479.0, + "repeat_count": 0.0, + "routers_loss": 0.00473182974383235, + "skip_count": 2.0, + "step": 7888, + "text_loss": 0.40346208214759827 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00017173046807859483, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 12723104.0, + "repeat_count": 0.0, + "routers_loss": 0.0020138369873166084, + "skip_count": 0.0, + "step": 7890, + "text_loss": 0.6878634095191956 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.05165835045494, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.0001714970651497027, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 12725967.0, + "repeat_count": 0.0, + "routers_loss": 0.008381367661058903, + "skip_count": 1.0, + "step": 7892, + "text_loss": 0.9161711931228638 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00017126378810375498, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 12728819.0, + "repeat_count": 1.0, + "routers_loss": 0.0037658829241991043, + "skip_count": 0.0, + "step": 7894, + "text_loss": 0.4447716772556305 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00017103063703014372, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 12731806.0, + "repeat_count": 0.0, + "routers_loss": 0.0022742559667676687, + "skip_count": 0.0, + "step": 7896, + "text_loss": 0.9140825867652893 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00017079761201821298, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 12734649.0, + "repeat_count": 0.0, + "routers_loss": 0.002157264854758978, + "skip_count": 0.0, + "step": 7898, + "text_loss": 0.268303781747818 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.0001705647131572583, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 12737889.0, + "repeat_count": 1.0, + "routers_loss": 0.01064873393625021, + "skip_count": 1.0, + "step": 7900, + "text_loss": 0.36009490489959717 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.00017033194053652685, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 12740821.0, + "repeat_count": 1.0, + "routers_loss": 0.0062920586206018925, + "skip_count": 0.0, + "step": 7902, + "text_loss": 0.5301805138587952 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0615234375, + "learning_rate": 0.00017009929424521782, + "loss": 0.0063, + "macro_f1": 1.0, + "num_tokens": 12743876.0, + "repeat_count": 1.0, + "routers_loss": 0.0033694824669510126, + "skip_count": 1.0, + "step": 7904, + "text_loss": 1.026949167251587 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.117405341943055, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 0.00016986677437248155, + "loss": 0.0071, + "macro_f1": 0.8817967176437378, + "num_tokens": 12747623.0, + "repeat_count": 2.0, + "routers_loss": 0.05076088383793831, + "skip_count": 3.0, + "step": 7906, + "text_loss": 0.33465588092803955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00016963438100742014, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 12751255.0, + "repeat_count": 0.0, + "routers_loss": 0.0005921403644606471, + "skip_count": 0.0, + "step": 7908, + "text_loss": 0.3498881757259369 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.00016940211423908713, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 12754297.0, + "repeat_count": 0.0, + "routers_loss": 0.004132566973567009, + "skip_count": 0.0, + "step": 7910, + "text_loss": 0.2874198853969574 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.0001691699741564876, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12756969.0, + "repeat_count": 0.0, + "routers_loss": 0.0024724705144762993, + "skip_count": 1.0, + "step": 7912, + "text_loss": 0.10593545436859131 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016893796084857806, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 12760261.0, + "repeat_count": 0.0, + "routers_loss": 0.002991671208292246, + "skip_count": 0.0, + "step": 7914, + "text_loss": 0.1331545114517212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00016870607440426643, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12762971.0, + "repeat_count": 0.0, + "routers_loss": 0.0018167285015806556, + "skip_count": 0.0, + "step": 7916, + "text_loss": 0.496826171875 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02099609375, + "learning_rate": 0.00016847431491241207, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 12765949.0, + "repeat_count": 1.0, + "routers_loss": 0.0033364067785441875, + "skip_count": 0.0, + "step": 7918, + "text_loss": 0.43522849678993225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0001682426824618256, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 12769201.0, + "repeat_count": 0.0, + "routers_loss": 0.001313596498221159, + "skip_count": 0.0, + "step": 7920, + "text_loss": 0.8691539168357849 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.19254476078662, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.00016801117714126908, + "loss": 0.0108, + "macro_f1": 0.6603773832321167, + "num_tokens": 12773308.0, + "repeat_count": 1.0, + "routers_loss": 0.02579287625849247, + "skip_count": 1.0, + "step": 7922, + "text_loss": 0.275301069021225 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00016777979903945568, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 12776166.0, + "repeat_count": 0.0, + "routers_loss": 0.010501758195459843, + "skip_count": 1.0, + "step": 7924, + "text_loss": 0.32124993205070496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0001675485482450499, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 12779965.0, + "repeat_count": 0.0, + "routers_loss": 0.0063389060087502, + "skip_count": 2.0, + "step": 7926, + "text_loss": 0.2527695894241333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00016731742484666774, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 12783019.0, + "repeat_count": 0.0, + "routers_loss": 0.002796935848891735, + "skip_count": 0.0, + "step": 7928, + "text_loss": 0.18767669796943665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0001670864289328759, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 12786291.0, + "repeat_count": 0.0, + "routers_loss": 0.007973561994731426, + "skip_count": 2.0, + "step": 7930, + "text_loss": 0.29628485441207886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.027099609375, + "learning_rate": 0.00016685556059219253, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 12789566.0, + "repeat_count": 4.0, + "routers_loss": 0.011405733413994312, + "skip_count": 6.0, + "step": 7932, + "text_loss": 0.16635073721408844 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00016662481991308682, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 12792533.0, + "repeat_count": 0.0, + "routers_loss": 0.0012368770549073815, + "skip_count": 1.0, + "step": 7934, + "text_loss": 0.4196353852748871 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0263671875, + "learning_rate": 0.000166394206983979, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 12795619.0, + "repeat_count": 0.0, + "routers_loss": 0.0036002211272716522, + "skip_count": 1.0, + "step": 7936, + "text_loss": 0.17559808492660522 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00016616372189324035, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 12799702.0, + "repeat_count": 1.0, + "routers_loss": 0.0039332108572125435, + "skip_count": 0.0, + "step": 7938, + "text_loss": 0.603410542011261 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00016593336472919324, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 12802704.0, + "repeat_count": 0.0, + "routers_loss": 0.0008303318754769862, + "skip_count": 0.0, + "step": 7940, + "text_loss": 0.5331749320030212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.28646903434106, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00016570313558011098, + "loss": 0.0058, + "macro_f1": 0.6601307392120361, + "num_tokens": 12805630.0, + "repeat_count": 1.0, + "routers_loss": 0.05092398822307587, + "skip_count": 2.0, + "step": 7942, + "text_loss": 0.17398510873317719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00016547303453421774, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 12809065.0, + "repeat_count": 0.0, + "routers_loss": 0.0006886976188980043, + "skip_count": 0.0, + "step": 7944, + "text_loss": 0.3419797718524933 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.044677734375, + "learning_rate": 0.00016524306167968878, + "loss": 0.007, + "macro_f1": 1.0, + "num_tokens": 12812641.0, + "repeat_count": 1.0, + "routers_loss": 0.005634502973407507, + "skip_count": 3.0, + "step": 7946, + "text_loss": 0.5877651572227478 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00016501321710465005, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 12815527.0, + "repeat_count": 0.0, + "routers_loss": 0.0020598487462848425, + "skip_count": 0.0, + "step": 7948, + "text_loss": 0.3558528423309326 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0001647835008971783, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12819103.0, + "repeat_count": 0.0, + "routers_loss": 0.005946476943790913, + "skip_count": 2.0, + "step": 7950, + "text_loss": 0.5800213813781738 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00016455391314530154, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12822423.0, + "repeat_count": 0.0, + "routers_loss": 0.010360358282923698, + "skip_count": 2.0, + "step": 7952, + "text_loss": 0.278255820274353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00016432445393699802, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 12826180.0, + "repeat_count": 0.0, + "routers_loss": 0.003017681185156107, + "skip_count": 0.0, + "step": 7954, + "text_loss": 0.1571389138698578 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00016409512336019698, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12829196.0, + "repeat_count": 0.0, + "routers_loss": 0.0008854938205331564, + "skip_count": 0.0, + "step": 7956, + "text_loss": 0.2776578366756439 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 0.00016386592150277834, + "loss": 0.0092, + "macro_f1": 0.3333333432674408, + "num_tokens": 12831983.0, + "repeat_count": 0.0, + "routers_loss": 0.0023990103509277105, + "skip_count": 0.0, + "step": 7958, + "text_loss": 0.46686989068984985 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 37.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.0001636368484525727, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 12834889.0, + "repeat_count": 0.0, + "routers_loss": 0.009835032746195793, + "skip_count": 5.0, + "step": 7960, + "text_loss": 0.22224856913089752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00016340790429736118, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 12837950.0, + "repeat_count": 0.0, + "routers_loss": 0.0018618656322360039, + "skip_count": 0.0, + "step": 7962, + "text_loss": 0.5101882815361023 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.00016317908912487578, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 12840981.0, + "repeat_count": 1.0, + "routers_loss": 0.001275144051760435, + "skip_count": 1.0, + "step": 7964, + "text_loss": 0.40567103028297424 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00016295040302279873, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12844044.0, + "repeat_count": 0.0, + "routers_loss": 0.003117429558187723, + "skip_count": 2.0, + "step": 7966, + "text_loss": 0.6888198852539062 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.00016272184607876312, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 12847350.0, + "repeat_count": 2.0, + "routers_loss": 0.006585797294974327, + "skip_count": 4.0, + "step": 7968, + "text_loss": 0.19813506305217743 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.0001624934183803523, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 12850285.0, + "repeat_count": 1.0, + "routers_loss": 0.0043576788157224655, + "skip_count": 1.0, + "step": 7970, + "text_loss": 0.6108269691467285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 37.427355444672735, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00016226512001510024, + "loss": 0.0039, + "macro_f1": 0.5492662787437439, + "num_tokens": 12853993.0, + "repeat_count": 0.0, + "routers_loss": 0.011879517696797848, + "skip_count": 2.0, + "step": 7972, + "text_loss": 0.42478689551353455 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00016203695107049117, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 12857022.0, + "repeat_count": 0.0, + "routers_loss": 0.0016375730047002435, + "skip_count": 0.0, + "step": 7974, + "text_loss": 0.5130020976066589 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.045654296875, + "learning_rate": 0.0001618089116339601, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 12860764.0, + "repeat_count": 0.0, + "routers_loss": 0.0006649247952736914, + "skip_count": 0.0, + "step": 7976, + "text_loss": 1.0629136562347412 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.455532726739065, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00016158100179289208, + "loss": 0.0062, + "macro_f1": 0.6603773832321167, + "num_tokens": 12864066.0, + "repeat_count": 1.0, + "routers_loss": 0.03140667825937271, + "skip_count": 1.0, + "step": 7978, + "text_loss": 0.4241345226764679 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 37.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 0.0001613532216346226, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 12867555.0, + "repeat_count": 0.0, + "routers_loss": 0.010257012210786343, + "skip_count": 4.0, + "step": 7980, + "text_loss": 0.6085613369941711 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0001611255712464374, + "loss": 0.0037, + "macro_f1": 0.6666666865348816, + "num_tokens": 12871415.0, + "repeat_count": 0.0, + "routers_loss": 0.00783725269138813, + "skip_count": 1.0, + "step": 7982, + "text_loss": 0.15661844611167908 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.017578125, + "learning_rate": 0.00016089805071557256, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 12874195.0, + "repeat_count": 1.0, + "routers_loss": 0.0027650597039610147, + "skip_count": 2.0, + "step": 7984, + "text_loss": 0.4938865005970001 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.49310243616085, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016067066012921439, + "loss": 0.0083, + "macro_f1": 0.3272727429866791, + "num_tokens": 12878084.0, + "repeat_count": 1.0, + "routers_loss": 0.04647083953022957, + "skip_count": 0.0, + "step": 7986, + "text_loss": 0.2973119020462036 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 0.00016044339957449938, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 12881182.0, + "repeat_count": 0.0, + "routers_loss": 0.002192265819758177, + "skip_count": 0.0, + "step": 7988, + "text_loss": 0.2623208165168762 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00016021626913851418, + "loss": 0.0031, + "macro_f1": 0.3333333432674408, + "num_tokens": 12884028.0, + "repeat_count": 0.0, + "routers_loss": 0.0023096329532563686, + "skip_count": 0.0, + "step": 7990, + "text_loss": 0.3752247989177704 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.52127971822718, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00015998926890829562, + "loss": 0.0046, + "macro_f1": 0.3272727429866791, + "num_tokens": 12887759.0, + "repeat_count": 0.0, + "routers_loss": 0.03038526326417923, + "skip_count": 1.0, + "step": 7992, + "text_loss": 0.2609226405620575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0001597623989708306, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 12890976.0, + "repeat_count": 0.0, + "routers_loss": 0.0015199477784335613, + "skip_count": 0.0, + "step": 7994, + "text_loss": 0.6512867212295532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00015953565941305615, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 12894112.0, + "repeat_count": 0.0, + "routers_loss": 0.0024166766088455915, + "skip_count": 0.0, + "step": 7996, + "text_loss": 0.5539866089820862 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.05908203125, + "learning_rate": 0.0001593090503218591, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 12896857.0, + "repeat_count": 1.0, + "routers_loss": 0.005081235896795988, + "skip_count": 2.0, + "step": 7998, + "text_loss": 0.6631022691726685 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00015908257178407682, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 12900075.0, + "repeat_count": 1.0, + "routers_loss": 0.0024711282458156347, + "skip_count": 0.0, + "step": 8000, + "text_loss": 0.3309785723686218 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.5682418550044, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 0.00015885622388649617, + "loss": 0.0059, + "macro_f1": 0.6601307392120361, + "num_tokens": 12903845.0, + "repeat_count": 1.0, + "routers_loss": 0.04024988412857056, + "skip_count": 2.0, + "step": 8002, + "text_loss": 0.2384071946144104 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.045166015625, + "learning_rate": 0.00015863000671585405, + "loss": 0.008, + "macro_f1": 1.0, + "num_tokens": 12907694.0, + "repeat_count": 1.0, + "routers_loss": 0.001953886589035392, + "skip_count": 2.0, + "step": 8004, + "text_loss": 0.5001366138458252 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00015840392035883726, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 12910871.0, + "repeat_count": 0.0, + "routers_loss": 0.002982128644362092, + "skip_count": 2.0, + "step": 8006, + "text_loss": 0.2589346170425415 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0001581779649020827, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 12914484.0, + "repeat_count": 0.0, + "routers_loss": 0.0009384988807141781, + "skip_count": 0.0, + "step": 8008, + "text_loss": 0.5727795362472534 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.00015795214043217654, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 12917480.0, + "repeat_count": 0.0, + "routers_loss": 0.008854437619447708, + "skip_count": 2.0, + "step": 8010, + "text_loss": 0.24354904890060425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.00015772644703565563, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 12920383.0, + "repeat_count": 0.0, + "routers_loss": 0.001689503900706768, + "skip_count": 0.0, + "step": 8012, + "text_loss": 0.5372336506843567 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 0.00015750088479900588, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 12923886.0, + "repeat_count": 0.0, + "routers_loss": 0.002284591319039464, + "skip_count": 0.0, + "step": 8014, + "text_loss": 0.1708722710609436 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 37.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00015727545380866316, + "loss": 0.0042, + "macro_f1": 1.0, + "num_tokens": 12926998.0, + "repeat_count": 1.0, + "routers_loss": 0.004594483878463507, + "skip_count": 4.0, + "step": 8016, + "text_loss": 0.26784324645996094 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.0001570501541510131, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 12929726.0, + "repeat_count": 1.0, + "routers_loss": 0.0021998141892254353, + "skip_count": 0.0, + "step": 8018, + "text_loss": 0.8051869869232178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00015682498591239086, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 12932182.0, + "repeat_count": 0.0, + "routers_loss": 0.0032623414881527424, + "skip_count": 1.0, + "step": 8020, + "text_loss": 0.8431181907653809 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00015659994917908144, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 12935338.0, + "repeat_count": 0.0, + "routers_loss": 0.0014909361489117146, + "skip_count": 1.0, + "step": 8022, + "text_loss": 0.6168642640113831 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 0.0001563750440373191, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 12938484.0, + "repeat_count": 0.0, + "routers_loss": 0.0010295510292053223, + "skip_count": 0.0, + "step": 8024, + "text_loss": 0.2694014608860016 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 37.68095098326974, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.029296875, + "learning_rate": 0.00015615027057328828, + "loss": 0.0066, + "macro_f1": 0.5492662787437439, + "num_tokens": 12942045.0, + "repeat_count": 0.0, + "routers_loss": 0.018341995775699615, + "skip_count": 2.0, + "step": 8026, + "text_loss": 0.8151478171348572 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 37.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0001559256288731224, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 12945547.0, + "repeat_count": 2.0, + "routers_loss": 0.0023289949167519808, + "skip_count": 1.0, + "step": 8028, + "text_loss": 0.613464891910553 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.00015570111902290463, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 12949544.0, + "repeat_count": 0.0, + "routers_loss": 0.006635872647166252, + "skip_count": 2.0, + "step": 8030, + "text_loss": 0.17417465150356293 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.04931640625, + "learning_rate": 0.00015547674110866756, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 12952838.0, + "repeat_count": 1.0, + "routers_loss": 0.006023989990353584, + "skip_count": 1.0, + "step": 8032, + "text_loss": 0.4801837205886841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.00015525249521639319, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 12956329.0, + "repeat_count": 0.0, + "routers_loss": 0.005706884432584047, + "skip_count": 0.0, + "step": 8034, + "text_loss": 0.2028084248304367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0242919921875, + "learning_rate": 0.000155028381432013, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 12959122.0, + "repeat_count": 0.0, + "routers_loss": 0.003527123713865876, + "skip_count": 2.0, + "step": 8036, + "text_loss": 0.39474430680274963 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.00015480439984140776, + "loss": 0.0029, + "macro_f1": 1.0, + "num_tokens": 12962546.0, + "repeat_count": 1.0, + "routers_loss": 0.010415437631309032, + "skip_count": 2.0, + "step": 8038, + "text_loss": 0.20412345230579376 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0380859375, + "learning_rate": 0.0001545805505304077, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 12965861.0, + "repeat_count": 0.0, + "routers_loss": 0.001566931139677763, + "skip_count": 0.0, + "step": 8040, + "text_loss": 0.5129821300506592 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 31.0, + "epoch": 37.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0001543568335847923, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 12968677.0, + "repeat_count": 3.0, + "routers_loss": 0.0037196793127804995, + "skip_count": 0.0, + "step": 8042, + "text_loss": 0.755020260810852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.00015413324909029031, + "loss": 0.0086, + "macro_f1": 0.3333333432674408, + "num_tokens": 12972001.0, + "repeat_count": 0.0, + "routers_loss": 0.0010940275387838483, + "skip_count": 0.0, + "step": 8044, + "text_loss": 0.48672133684158325 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00015390979713257968, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 12974765.0, + "repeat_count": 0.0, + "routers_loss": 0.011106903664767742, + "skip_count": 1.0, + "step": 8046, + "text_loss": 0.1727766990661621 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 0.8333333134651184, + "avg_layers": 25.0, + "epoch": 37.78426768417963, + "f1_execute": 0.949999988079071, + "f1_repeat": 0.800000011920929, + "f1_skip": 0.9090909361839294, + "grad_norm": 0.048828125, + "learning_rate": 0.00015368647779728757, + "loss": 0.006, + "macro_f1": 0.886363685131073, + "num_tokens": 12979127.0, + "repeat_count": 3.0, + "routers_loss": 0.05134248360991478, + "skip_count": 6.0, + "step": 8048, + "text_loss": 0.33233317732810974 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00015346329116999057, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 12982812.0, + "repeat_count": 0.0, + "routers_loss": 0.0027500339783728123, + "skip_count": 0.0, + "step": 8050, + "text_loss": 0.8176849484443665 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.80305253889052, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00015324023733621412, + "loss": 0.005, + "macro_f1": 0.32098764181137085, + "num_tokens": 12985740.0, + "repeat_count": 0.0, + "routers_loss": 0.030734945088624954, + "skip_count": 2.0, + "step": 8052, + "text_loss": 0.38721024990081787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 37.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.00015301731638143285, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 12988646.0, + "repeat_count": 0.0, + "routers_loss": 0.002358534839004278, + "skip_count": 2.0, + "step": 8054, + "text_loss": 0.5656245946884155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.060791015625, + "learning_rate": 0.0001527945283910705, + "loss": 0.0074, + "macro_f1": 1.0, + "num_tokens": 12991518.0, + "repeat_count": 2.0, + "routers_loss": 0.007991814985871315, + "skip_count": 3.0, + "step": 8056, + "text_loss": 0.26438817381858826 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 37.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0439453125, + "learning_rate": 0.00015257187345049983, + "loss": 0.0079, + "macro_f1": 1.0, + "num_tokens": 12994847.0, + "repeat_count": 1.0, + "routers_loss": 0.011761264875531197, + "skip_count": 1.0, + "step": 8058, + "text_loss": 0.1801673173904419 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.5, + "avg_layers": 28.0, + "epoch": 37.8406222483123, + "f1_execute": 0.9803921580314636, + "f1_repeat": 1.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0001523493516450427, + "loss": 0.004, + "macro_f1": 0.8823530077934265, + "num_tokens": 12997874.0, + "repeat_count": 1.0, + "routers_loss": 0.021669765934348106, + "skip_count": 2.0, + "step": 8060, + "text_loss": 0.3278379738330841 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0001521269630599698, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 13000504.0, + "repeat_count": 0.0, + "routers_loss": 0.002388916676864028, + "skip_count": 0.0, + "step": 8062, + "text_loss": 0.5396623611450195 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00015190470778050086, + "loss": 0.007, + "macro_f1": 0.6666666865348816, + "num_tokens": 13003620.0, + "repeat_count": 0.0, + "routers_loss": 0.007719808723777533, + "skip_count": 1.0, + "step": 8064, + "text_loss": 0.1989232450723648 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00015168258589180462, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 13007410.0, + "repeat_count": 0.0, + "routers_loss": 0.0007461659261025488, + "skip_count": 0.0, + "step": 8066, + "text_loss": 0.5293997526168823 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 37.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.00015146059747899848, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 13010240.0, + "repeat_count": 1.0, + "routers_loss": 0.005515575874596834, + "skip_count": 0.0, + "step": 8068, + "text_loss": 0.2776186466217041 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00015123874262714892, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13012728.0, + "repeat_count": 0.0, + "routers_loss": 0.0026730166282504797, + "skip_count": 0.0, + "step": 8070, + "text_loss": 0.5902766585350037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04833984375, + "learning_rate": 0.00015101702142127088, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 13015616.0, + "repeat_count": 0.0, + "routers_loss": 0.002244985429570079, + "skip_count": 0.0, + "step": 8072, + "text_loss": 0.21447396278381348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.00015079543394632878, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 13019846.0, + "repeat_count": 0.0, + "routers_loss": 0.001963787479326129, + "skip_count": 0.0, + "step": 8074, + "text_loss": 0.22974267601966858 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 37.915761667155856, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.053955078125, + "learning_rate": 0.00015057398028723513, + "loss": 0.0064, + "macro_f1": 0.5492662787437439, + "num_tokens": 13023036.0, + "repeat_count": 0.0, + "routers_loss": 0.02271878905594349, + "skip_count": 2.0, + "step": 8076, + "text_loss": 0.26458361744880676 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.00015035266052885137, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 13025840.0, + "repeat_count": 0.0, + "routers_loss": 0.0011732397833839059, + "skip_count": 0.0, + "step": 8078, + "text_loss": 0.44129177927970886 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0001501314747559877, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 13030031.0, + "repeat_count": 1.0, + "routers_loss": 0.015655985102057457, + "skip_count": 2.0, + "step": 8080, + "text_loss": 0.28889161348342896 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00014991042305340286, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 13033603.0, + "repeat_count": 0.0, + "routers_loss": 0.0012988687958568335, + "skip_count": 0.0, + "step": 8082, + "text_loss": 0.16362667083740234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00014968950550580434, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 13036931.0, + "repeat_count": 0.0, + "routers_loss": 0.002425852930173278, + "skip_count": 0.0, + "step": 8084, + "text_loss": 0.35900676250457764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 37.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.0001494687221978482, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 13040637.0, + "repeat_count": 0.0, + "routers_loss": 0.004092676565051079, + "skip_count": 1.0, + "step": 8086, + "text_loss": 0.20662656426429749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00014924807321413893, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 13043855.0, + "repeat_count": 0.0, + "routers_loss": 0.0009040542645379901, + "skip_count": 0.0, + "step": 8088, + "text_loss": 0.30341213941574097 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0001490275586392296, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13046903.0, + "repeat_count": 0.0, + "routers_loss": 0.0019248841563239694, + "skip_count": 0.0, + "step": 8090, + "text_loss": 0.4299648702144623 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 37.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.000148807178557622, + "loss": 0.0087, + "macro_f1": 0.3333333432674408, + "num_tokens": 13050219.0, + "repeat_count": 0.0, + "routers_loss": 0.0008314658771269023, + "skip_count": 0.0, + "step": 8092, + "text_loss": 0.4521652162075043 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.0, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00014858693305376598, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 13053076.0, + "repeat_count": 0.0, + "routers_loss": 0.0007470731507055461, + "skip_count": 0.0, + "step": 8094, + "text_loss": 0.46265852451324463 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00014836682221206, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 13056170.0, + "repeat_count": 1.0, + "routers_loss": 0.003292408073320985, + "skip_count": 0.0, + "step": 8096, + "text_loss": 0.6483868956565857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00014814684611685124, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13059181.0, + "repeat_count": 0.0, + "routers_loss": 0.001357200788334012, + "skip_count": 0.0, + "step": 8098, + "text_loss": 0.43141183257102966 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0177001953125, + "learning_rate": 0.00014792700485243476, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 13062124.0, + "repeat_count": 0.0, + "routers_loss": 0.0030062920413911343, + "skip_count": 0.0, + "step": 8100, + "text_loss": 0.26022693514823914 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0001477072985030542, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 13065273.0, + "repeat_count": 0.0, + "routers_loss": 0.0006919128354638815, + "skip_count": 0.0, + "step": 8102, + "text_loss": 0.5927232503890991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00014748772715290144, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 13068346.0, + "repeat_count": 0.0, + "routers_loss": 0.005062389187514782, + "skip_count": 0.0, + "step": 8104, + "text_loss": 0.1255214959383011 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 0.00014726829088611664, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13071384.0, + "repeat_count": 0.0, + "routers_loss": 0.0005492564523592591, + "skip_count": 0.0, + "step": 8106, + "text_loss": 0.6445038914680481 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0205078125, + "learning_rate": 0.00014704898978678817, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13074667.0, + "repeat_count": 0.0, + "routers_loss": 0.002470226027071476, + "skip_count": 0.0, + "step": 8108, + "text_loss": 0.5019628405570984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.00014682982393895256, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 13077566.0, + "repeat_count": 0.0, + "routers_loss": 0.0008262090268544853, + "skip_count": 0.0, + "step": 8110, + "text_loss": 0.6075460314750671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.084531846199, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 0.00014661079342659467, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 13081042.0, + "repeat_count": 0.0, + "routers_loss": 0.00034181721275672317, + "skip_count": 0.0, + "step": 8112, + "text_loss": 0.7349393963813782 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0001463918983336474, + "loss": 0.0053, + "macro_f1": 1.0, + "num_tokens": 13084151.0, + "repeat_count": 1.0, + "routers_loss": 0.01406828872859478, + "skip_count": 2.0, + "step": 8114, + "text_loss": 0.3122454285621643 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017822265625, + "learning_rate": 0.00014617313874399173, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13086998.0, + "repeat_count": 0.0, + "routers_loss": 0.002714085392653942, + "skip_count": 0.0, + "step": 8116, + "text_loss": 0.6545852422714233 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00014595451474145677, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 13090017.0, + "repeat_count": 0.0, + "routers_loss": 0.0073202489875257015, + "skip_count": 0.0, + "step": 8118, + "text_loss": 0.5487201809883118 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.00014573602640981947, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 13093651.0, + "repeat_count": 0.0, + "routers_loss": 0.000667977670673281, + "skip_count": 0.0, + "step": 8120, + "text_loss": 0.672166109085083 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00014551767383280535, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 13097139.0, + "repeat_count": 0.0, + "routers_loss": 0.0020584615413099527, + "skip_count": 0.0, + "step": 8122, + "text_loss": 0.1996239423751831 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.14088641033167, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.00014529945709408726, + "loss": 0.0069, + "macro_f1": 0.6598639488220215, + "num_tokens": 13100493.0, + "repeat_count": 1.0, + "routers_loss": 0.013855135068297386, + "skip_count": 3.0, + "step": 8124, + "text_loss": 0.4099486768245697 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0001450813762772863, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13103488.0, + "repeat_count": 0.0, + "routers_loss": 0.0014984552981331944, + "skip_count": 0.0, + "step": 8126, + "text_loss": 0.6307108402252197 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.00014486343146597152, + "loss": 0.0054, + "macro_f1": 0.6666666865348816, + "num_tokens": 13106445.0, + "repeat_count": 1.0, + "routers_loss": 0.00430954247713089, + "skip_count": 0.0, + "step": 8128, + "text_loss": 0.6226127743721008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07177734375, + "learning_rate": 0.00014464562274365972, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 13109258.0, + "repeat_count": 0.0, + "routers_loss": 0.003711461555212736, + "skip_count": 1.0, + "step": 8130, + "text_loss": 0.17819052934646606 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.17845611975345, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00014442795019381567, + "loss": 0.0064, + "macro_f1": 0.6603773832321167, + "num_tokens": 13114206.0, + "repeat_count": 1.0, + "routers_loss": 0.015719098970294, + "skip_count": 1.0, + "step": 8132, + "text_loss": 0.28450697660446167 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00014421041389985184, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13117351.0, + "repeat_count": 0.0, + "routers_loss": 0.0013113922905176878, + "skip_count": 0.0, + "step": 8134, + "text_loss": 0.310830682516098 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0289306640625, + "learning_rate": 0.00014399301394512858, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 13120228.0, + "repeat_count": 1.0, + "routers_loss": 0.001965439412742853, + "skip_count": 1.0, + "step": 8136, + "text_loss": 0.8635116815567017 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00014377575041295393, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 13123380.0, + "repeat_count": 1.0, + "routers_loss": 0.004898902028799057, + "skip_count": 2.0, + "step": 8138, + "text_loss": 0.5302467346191406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0179443359375, + "learning_rate": 0.0001435586233865836, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13126875.0, + "repeat_count": 0.0, + "routers_loss": 0.00031845085322856903, + "skip_count": 0.0, + "step": 8140, + "text_loss": 0.5913560390472412 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0001433416329492213, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 13129563.0, + "repeat_count": 1.0, + "routers_loss": 0.00298812473192811, + "skip_count": 1.0, + "step": 8142, + "text_loss": 0.5153398513793945 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00014312477918401807, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 13132608.0, + "repeat_count": 0.0, + "routers_loss": 0.0026608197949826717, + "skip_count": 1.0, + "step": 8144, + "text_loss": 0.4554155766963959 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.00014290806217407272, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 13136204.0, + "repeat_count": 1.0, + "routers_loss": 0.0027651884593069553, + "skip_count": 1.0, + "step": 8146, + "text_loss": 0.6349515318870544 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00014269148200243148, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 13138895.0, + "repeat_count": 0.0, + "routers_loss": 0.0006579195614904165, + "skip_count": 0.0, + "step": 8148, + "text_loss": 0.4629364013671875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.26298796595245, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.00014247503875208846, + "loss": 0.0059, + "macro_f1": 0.3272727429866791, + "num_tokens": 13142500.0, + "repeat_count": 1.0, + "routers_loss": 0.023065708577632904, + "skip_count": 0.0, + "step": 8150, + "text_loss": 0.4962928593158722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.038330078125, + "learning_rate": 0.00014225873250598496, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 13146203.0, + "repeat_count": 0.0, + "routers_loss": 0.007397830951958895, + "skip_count": 1.0, + "step": 8152, + "text_loss": 0.3225953280925751 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00014204256334700988, + "loss": 0.0077, + "macro_f1": 0.6666666865348816, + "num_tokens": 13149517.0, + "repeat_count": 0.0, + "routers_loss": 0.004839105997234583, + "skip_count": 1.0, + "step": 8154, + "text_loss": 0.18435558676719666 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 38.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00014182653135799995, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 13152643.0, + "repeat_count": 0.0, + "routers_loss": 0.0028303388971835375, + "skip_count": 4.0, + "step": 8156, + "text_loss": 0.5836900472640991 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0001416106366217389, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 13155213.0, + "repeat_count": 0.0, + "routers_loss": 0.0004012314020656049, + "skip_count": 0.0, + "step": 8158, + "text_loss": 0.3723861575126648 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 20.0, + "epoch": 38.30995010272967, + "f1_execute": 0.9714285731315613, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 0.0001413948792209579, + "loss": 0.0065, + "macro_f1": 0.8793651461601257, + "num_tokens": 13158440.0, + "repeat_count": 2.0, + "routers_loss": 0.04377155378460884, + "skip_count": 9.0, + "step": 8160, + "text_loss": 0.32476910948753357 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035888671875, + "learning_rate": 0.0001411792592383357, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13162651.0, + "repeat_count": 0.0, + "routers_loss": 0.0011163362069055438, + "skip_count": 0.0, + "step": 8162, + "text_loss": 0.4890389144420624 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.32873495744057, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00014096377675649823, + "loss": 0.0055, + "macro_f1": 0.6603773832321167, + "num_tokens": 13165406.0, + "repeat_count": 1.0, + "routers_loss": 0.012117774225771427, + "skip_count": 1.0, + "step": 8164, + "text_loss": 0.7763246893882751 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.33812738479601, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00014074843185801883, + "loss": 0.004, + "macro_f1": 0.9262410998344421, + "num_tokens": 13168402.0, + "repeat_count": 3.0, + "routers_loss": 0.009951545856893063, + "skip_count": 2.0, + "step": 8166, + "text_loss": 0.5038266777992249 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.00014053322462541802, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 13171423.0, + "repeat_count": 1.0, + "routers_loss": 0.0021372761111706495, + "skip_count": 1.0, + "step": 8168, + "text_loss": 0.5634724497795105 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.00014031815514116354, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 13174713.0, + "repeat_count": 0.0, + "routers_loss": 0.0007417177548632026, + "skip_count": 0.0, + "step": 8170, + "text_loss": 0.4009707272052765 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.6666666865348816, + "avg_layers": 26.0, + "epoch": 38.36630466686234, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 0.800000011920929, + "grad_norm": 0.035888671875, + "learning_rate": 0.00014010322348767057, + "loss": 0.0077, + "macro_f1": 0.5934640765190125, + "num_tokens": 13178012.0, + "repeat_count": 0.0, + "routers_loss": 0.01619168184697628, + "skip_count": 3.0, + "step": 8172, + "text_loss": 0.29182371497154236 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00013988842974730137, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13181096.0, + "repeat_count": 0.0, + "routers_loss": 0.0037969043478369713, + "skip_count": 0.0, + "step": 8174, + "text_loss": 0.275851845741272 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.385089521573235, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00013967377400236515, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 13184116.0, + "repeat_count": 0.0, + "routers_loss": 0.0007759644067846239, + "skip_count": 0.0, + "step": 8176, + "text_loss": 0.7569663524627686 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00013945925633511848, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 13187319.0, + "repeat_count": 0.0, + "routers_loss": 0.002708743792027235, + "skip_count": 0.0, + "step": 8178, + "text_loss": 0.4733831286430359 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.00013924487682776492, + "loss": 0.0084, + "macro_f1": 0.3333333432674408, + "num_tokens": 13190796.0, + "repeat_count": 0.0, + "routers_loss": 0.0005060714902356267, + "skip_count": 0.0, + "step": 8180, + "text_loss": 0.5663171410560608 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.413266803639566, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0001390306355624551, + "loss": 0.0049, + "macro_f1": 0.3272727429866791, + "num_tokens": 13193705.0, + "repeat_count": 0.0, + "routers_loss": 0.02932601235806942, + "skip_count": 1.0, + "step": 8182, + "text_loss": 0.30700045824050903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0001388165326212867, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13196393.0, + "repeat_count": 0.0, + "routers_loss": 0.0011637522839009762, + "skip_count": 0.0, + "step": 8184, + "text_loss": 0.6897354125976562 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03271484375, + "learning_rate": 0.00013860256808630427, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 13199526.0, + "repeat_count": 0.0, + "routers_loss": 0.0017184355529025197, + "skip_count": 0.0, + "step": 8186, + "text_loss": 0.6246579885482788 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.00013838874203949954, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 13202963.0, + "repeat_count": 0.0, + "routers_loss": 0.0026622721925377846, + "skip_count": 0.0, + "step": 8188, + "text_loss": 0.506066083908081 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00013817505456281099, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13207408.0, + "repeat_count": 0.0, + "routers_loss": 0.000543750764336437, + "skip_count": 0.0, + "step": 8190, + "text_loss": 0.5192428231239319 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02392578125, + "learning_rate": 0.0001379615057381241, + "loss": 0.0034, + "macro_f1": 0.3333333432674408, + "num_tokens": 13211073.0, + "repeat_count": 0.0, + "routers_loss": 0.0010060713393613696, + "skip_count": 0.0, + "step": 8192, + "text_loss": 0.5640166401863098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03857421875, + "learning_rate": 0.00013774809564727104, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 13214203.0, + "repeat_count": 0.0, + "routers_loss": 0.005152868572622538, + "skip_count": 2.0, + "step": 8194, + "text_loss": 0.8643819689750671 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.47901379512768, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.022216796875, + "learning_rate": 0.0001375348243720312, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 13217748.0, + "repeat_count": 0.0, + "routers_loss": 0.0017722113989293575, + "skip_count": 2.0, + "step": 8196, + "text_loss": 0.40500834584236145 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 0.0001373216919941304, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 13221341.0, + "repeat_count": 1.0, + "routers_loss": 0.00999271310865879, + "skip_count": 3.0, + "step": 8198, + "text_loss": 0.2317391037940979 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00013710869859524143, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13224288.0, + "repeat_count": 0.0, + "routers_loss": 0.0016836341237649322, + "skip_count": 0.0, + "step": 8200, + "text_loss": 0.31873467564582825 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.507191077194015, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03955078125, + "learning_rate": 0.00013689584425698376, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13227342.0, + "repeat_count": 0.0, + "routers_loss": 0.002255793660879135, + "skip_count": 0.0, + "step": 8202, + "text_loss": 0.13513202965259552 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03173828125, + "learning_rate": 0.0001366831290609235, + "loss": 0.0067, + "macro_f1": 1.0, + "num_tokens": 13230912.0, + "repeat_count": 1.0, + "routers_loss": 0.0062925987876951694, + "skip_count": 4.0, + "step": 8204, + "text_loss": 0.3692396581172943 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00013647055308857353, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 13233961.0, + "repeat_count": 1.0, + "routers_loss": 0.0020471401512622833, + "skip_count": 0.0, + "step": 8206, + "text_loss": 0.5655510425567627 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.0001362581164213934, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 13237170.0, + "repeat_count": 0.0, + "routers_loss": 0.0009666495025157928, + "skip_count": 0.0, + "step": 8208, + "text_loss": 0.720582902431488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 0.00013604581914078922, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 13241020.0, + "repeat_count": 0.0, + "routers_loss": 0.0006306356517598033, + "skip_count": 0.0, + "step": 8210, + "text_loss": 0.5686481595039368 + }, + { + "acc_repeat": 0.5, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.55415321397123, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.6666666865348816, + "f1_skip": 0.0, + "grad_norm": 0.0194091796875, + "learning_rate": 0.00013583366132811374, + "loss": 0.0058, + "macro_f1": 0.5492662787437439, + "num_tokens": 13244491.0, + "repeat_count": 2.0, + "routers_loss": 0.016230134293437004, + "skip_count": 0.0, + "step": 8212, + "text_loss": 0.55678790807724 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00013562164306466624, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 13247551.0, + "repeat_count": 0.0, + "routers_loss": 0.003904943587258458, + "skip_count": 2.0, + "step": 8214, + "text_loss": 0.6521575450897217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.57293806868213, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00013540976443169244, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 13250863.0, + "repeat_count": 0.0, + "routers_loss": 0.002239734400063753, + "skip_count": 1.0, + "step": 8216, + "text_loss": 0.29757481813430786 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.00013519802551038452, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 13254215.0, + "repeat_count": 0.0, + "routers_loss": 0.004978829529136419, + "skip_count": 2.0, + "step": 8218, + "text_loss": 0.30598193407058716 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00013498642638188157, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13257269.0, + "repeat_count": 0.0, + "routers_loss": 0.0040260558016598225, + "skip_count": 0.0, + "step": 8220, + "text_loss": 0.39327144622802734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021484375, + "learning_rate": 0.00013477496712726862, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 13260573.0, + "repeat_count": 0.0, + "routers_loss": 0.002124674618244171, + "skip_count": 0.0, + "step": 8222, + "text_loss": 0.38342708349227905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00013456364782757718, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 13263684.0, + "repeat_count": 0.0, + "routers_loss": 0.00087209593039006, + "skip_count": 0.0, + "step": 8224, + "text_loss": 0.6338301301002502 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00013435246856378526, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 13266879.0, + "repeat_count": 1.0, + "routers_loss": 0.003183641703799367, + "skip_count": 0.0, + "step": 8226, + "text_loss": 0.6073583364486694 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0169677734375, + "learning_rate": 0.00013414142941681718, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 13270679.0, + "repeat_count": 0.0, + "routers_loss": 0.001859338372014463, + "skip_count": 0.0, + "step": 8228, + "text_loss": 0.5427029132843018 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.0001339305304675435, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13273275.0, + "repeat_count": 0.0, + "routers_loss": 0.000655558833386749, + "skip_count": 0.0, + "step": 8230, + "text_loss": 0.29442915320396423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 0.00013371977179678113, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 13276205.0, + "repeat_count": 0.0, + "routers_loss": 0.0011499621905386448, + "skip_count": 0.0, + "step": 8232, + "text_loss": 0.5601125359535217 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00013350915348529313, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 13279242.0, + "repeat_count": 0.0, + "routers_loss": 0.0019823790062218904, + "skip_count": 0.0, + "step": 8234, + "text_loss": 0.43674135208129883 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.04248046875, + "learning_rate": 0.00013329867561378888, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 13282531.0, + "repeat_count": 0.0, + "routers_loss": 0.005772443953901529, + "skip_count": 3.0, + "step": 8236, + "text_loss": 0.4838809072971344 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033935546875, + "learning_rate": 0.00013308833826292395, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 13286219.0, + "repeat_count": 0.0, + "routers_loss": 0.0038314659614115953, + "skip_count": 2.0, + "step": 8238, + "text_loss": 0.5002569556236267 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.75, + "avg_layers": 26.0, + "epoch": 38.685647196947464, + "f1_execute": 0.978723406791687, + "f1_repeat": 1.0, + "f1_skip": 0.8571428656578064, + "grad_norm": 0.031005859375, + "learning_rate": 0.00013287814151329987, + "loss": 0.0075, + "macro_f1": 0.9452888369560242, + "num_tokens": 13290348.0, + "repeat_count": 1.0, + "routers_loss": 0.04819172993302345, + "skip_count": 4.0, + "step": 8240, + "text_loss": 0.3099883198738098 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00013266808544546438, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 13293644.0, + "repeat_count": 0.0, + "routers_loss": 0.010334883816540241, + "skip_count": 2.0, + "step": 8242, + "text_loss": 0.17672912776470184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.00013245817013991164, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 13296721.0, + "repeat_count": 0.0, + "routers_loss": 0.00162201386410743, + "skip_count": 0.0, + "step": 8244, + "text_loss": 0.7664286494255066 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00013224839567708142, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 13299704.0, + "repeat_count": 0.0, + "routers_loss": 0.0039452011696994305, + "skip_count": 0.0, + "step": 8246, + "text_loss": 0.1827820986509323 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 38.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.028564453125, + "learning_rate": 0.00013203876213735972, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 13302553.0, + "repeat_count": 1.0, + "routers_loss": 0.006701917387545109, + "skip_count": 7.0, + "step": 8248, + "text_loss": 0.6020278930664062 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.73260933372468, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0001318292696010785, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13305875.0, + "repeat_count": 0.0, + "routers_loss": 0.00968079548329115, + "skip_count": 2.0, + "step": 8250, + "text_loss": 0.2693248987197876 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 38.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00013161991814851571, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 13309115.0, + "repeat_count": 2.0, + "routers_loss": 0.008890608325600624, + "skip_count": 2.0, + "step": 8252, + "text_loss": 0.6325297355651855 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.01953125, + "learning_rate": 0.00013141070785989517, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 13312219.0, + "repeat_count": 1.0, + "routers_loss": 0.00825794693082571, + "skip_count": 4.0, + "step": 8254, + "text_loss": 0.284396767616272 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00013120163881538677, + "loss": 0.0095, + "macro_f1": 0.6666666865348816, + "num_tokens": 13315214.0, + "repeat_count": 0.0, + "routers_loss": 0.003378969384357333, + "skip_count": 1.0, + "step": 8256, + "text_loss": 0.20296992361545563 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.77017904314646, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00013099271109510603, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 13319117.0, + "repeat_count": 1.0, + "routers_loss": 0.0164186954498291, + "skip_count": 0.0, + "step": 8258, + "text_loss": 0.21940068900585175 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037841796875, + "learning_rate": 0.0001307839247791145, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 13321631.0, + "repeat_count": 0.0, + "routers_loss": 0.0053979759104549885, + "skip_count": 3.0, + "step": 8260, + "text_loss": 0.19442199170589447 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 0.00013057527994741946, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 13324759.0, + "repeat_count": 0.0, + "routers_loss": 0.0024567479267716408, + "skip_count": 0.0, + "step": 8262, + "text_loss": 0.5528824925422668 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.0001303667766799741, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13327554.0, + "repeat_count": 0.0, + "routers_loss": 0.002819873159751296, + "skip_count": 1.0, + "step": 8264, + "text_loss": 0.4418395757675171 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.00013015841505667703, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 13331838.0, + "repeat_count": 0.0, + "routers_loss": 0.0030280952341854572, + "skip_count": 1.0, + "step": 8266, + "text_loss": 0.5263079404830933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 38.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0185546875, + "learning_rate": 0.0001299501951573731, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 13334968.0, + "repeat_count": 0.0, + "routers_loss": 0.001774887670762837, + "skip_count": 4.0, + "step": 8268, + "text_loss": 0.47985130548477173 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00012974211706185247, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 13338052.0, + "repeat_count": 0.0, + "routers_loss": 0.007027842104434967, + "skip_count": 1.0, + "step": 8270, + "text_loss": 0.6588287949562073 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00012953418084985107, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13341653.0, + "repeat_count": 0.0, + "routers_loss": 0.0026854060124605894, + "skip_count": 1.0, + "step": 8272, + "text_loss": 0.43156498670578003 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.84531846199002, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.00012932638660105038, + "loss": 0.0082, + "macro_f1": 0.3333333432674408, + "num_tokens": 13345173.0, + "repeat_count": 0.0, + "routers_loss": 0.0033325920812785625, + "skip_count": 0.0, + "step": 8274, + "text_loss": 0.1679086685180664 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00012911873439507766, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 13348635.0, + "repeat_count": 0.0, + "routers_loss": 0.0016183287370949984, + "skip_count": 0.0, + "step": 8276, + "text_loss": 0.5907418131828308 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03369140625, + "learning_rate": 0.00012891122431150549, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 13351120.0, + "repeat_count": 0.0, + "routers_loss": 0.0049970983527600765, + "skip_count": 1.0, + "step": 8278, + "text_loss": 0.5437678694725037 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.873495744056356, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048095703125, + "learning_rate": 0.00012870385642985222, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 13353774.0, + "repeat_count": 0.0, + "routers_loss": 0.0027123154141008854, + "skip_count": 0.0, + "step": 8280, + "text_loss": 0.5742796659469604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00012849663082958158, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 13358236.0, + "repeat_count": 0.0, + "routers_loss": 0.0062842960469424725, + "skip_count": 0.0, + "step": 8282, + "text_loss": 0.2340863049030304 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.00012828954759010265, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13360994.0, + "repeat_count": 0.0, + "routers_loss": 0.0006564505747519433, + "skip_count": 0.0, + "step": 8284, + "text_loss": 0.45432794094085693 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0001280826067907705, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13363665.0, + "repeat_count": 0.0, + "routers_loss": 0.001298630959354341, + "skip_count": 0.0, + "step": 8286, + "text_loss": 0.7439755201339722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00012787580851088493, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 13367412.0, + "repeat_count": 0.0, + "routers_loss": 0.00464112963527441, + "skip_count": 0.0, + "step": 8288, + "text_loss": 0.2854461669921875 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.92045788083358, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03125, + "learning_rate": 0.0001276691528296916, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 13370745.0, + "repeat_count": 0.0, + "routers_loss": 0.0006090773968026042, + "skip_count": 0.0, + "step": 8290, + "text_loss": 0.6663011312484741 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.040283203125, + "learning_rate": 0.00012746263982638123, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13373396.0, + "repeat_count": 0.0, + "routers_loss": 0.0038922233507037163, + "skip_count": 0.0, + "step": 8292, + "text_loss": 0.3858443796634674 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.93924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00012725626958009007, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 13376172.0, + "repeat_count": 0.0, + "routers_loss": 0.0016941255889832973, + "skip_count": 0.0, + "step": 8294, + "text_loss": 0.4758119285106659 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 38.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.0001270500421698994, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 13379002.0, + "repeat_count": 1.0, + "routers_loss": 0.001703770598396659, + "skip_count": 0.0, + "step": 8296, + "text_loss": 0.7464606165885925 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 38.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00012684395767483626, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 13382221.0, + "repeat_count": 0.0, + "routers_loss": 0.001474690856412053, + "skip_count": 1.0, + "step": 8298, + "text_loss": 0.37309199571609497 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 38.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00012663801617387245, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 13385276.0, + "repeat_count": 0.0, + "routers_loss": 0.004561704583466053, + "skip_count": 3.0, + "step": 8300, + "text_loss": 0.43284836411476135 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 38.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 0.00012643221774592518, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 13388321.0, + "repeat_count": 2.0, + "routers_loss": 0.005136100109666586, + "skip_count": 1.0, + "step": 8302, + "text_loss": 0.669730007648468 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 38.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 0.00012622656246985675, + "loss": 0.0101, + "macro_f1": 0.3333333432674408, + "num_tokens": 13391222.0, + "repeat_count": 0.0, + "routers_loss": 0.0028521555941551924, + "skip_count": 0.0, + "step": 8304, + "text_loss": 0.16773155331611633 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 38.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 0.00012602105042447471, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 13395297.0, + "repeat_count": 0.0, + "routers_loss": 0.0033424890134483576, + "skip_count": 2.0, + "step": 8306, + "text_loss": 0.1650846153497696 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001258156816885316, + "loss": 0.0047, + "macro_f1": 0.3333333432674408, + "num_tokens": 13398482.0, + "repeat_count": 0.0, + "routers_loss": 0.0012481207959353924, + "skip_count": 0.0, + "step": 8308, + "text_loss": 0.37225499749183655 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 39.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00012561045634072515, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 13402199.0, + "repeat_count": 0.0, + "routers_loss": 0.006243644282221794, + "skip_count": 3.0, + "step": 8310, + "text_loss": 0.16000206768512726 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00012540537445969807, + "loss": 0.0087, + "macro_f1": 0.6666666865348816, + "num_tokens": 13404950.0, + "repeat_count": 0.0, + "routers_loss": 0.004267443902790546, + "skip_count": 2.0, + "step": 8312, + "text_loss": 0.400174081325531 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00012520043612403815, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 13407883.0, + "repeat_count": 0.0, + "routers_loss": 0.005013707559555769, + "skip_count": 2.0, + "step": 8314, + "text_loss": 0.1331731230020523 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 39.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00012499564141227798, + "loss": 0.0047, + "macro_f1": 0.6666666865348816, + "num_tokens": 13410563.0, + "repeat_count": 1.0, + "routers_loss": 0.00463570561259985, + "skip_count": 0.0, + "step": 8316, + "text_loss": 0.5098661184310913 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.052978515625, + "learning_rate": 0.0001247909904028956, + "loss": 0.0078, + "macro_f1": 1.0, + "num_tokens": 13413730.0, + "repeat_count": 1.0, + "routers_loss": 0.007066591177135706, + "skip_count": 1.0, + "step": 8318, + "text_loss": 0.8059925436973572 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 39.061050777810394, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00012458648317431348, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 13416425.0, + "repeat_count": 0.0, + "routers_loss": 0.004210594110190868, + "skip_count": 3.0, + "step": 8320, + "text_loss": 0.6559522151947021 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.0001243821198048992, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 13419851.0, + "repeat_count": 1.0, + "routers_loss": 0.005613257177174091, + "skip_count": 2.0, + "step": 8322, + "text_loss": 0.2783811688423157 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00012417790037296523, + "loss": 0.0081, + "macro_f1": 0.6666666865348816, + "num_tokens": 13422588.0, + "repeat_count": 0.0, + "routers_loss": 0.00233642989769578, + "skip_count": 1.0, + "step": 8324, + "text_loss": 0.7659147381782532 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00012397382495676874, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 13425275.0, + "repeat_count": 0.0, + "routers_loss": 0.0013295465614646673, + "skip_count": 0.0, + "step": 8326, + "text_loss": 0.5693745017051697 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0001237698936345119, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 13428314.0, + "repeat_count": 1.0, + "routers_loss": 0.005712272133678198, + "skip_count": 1.0, + "step": 8328, + "text_loss": 0.8581340909004211 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00012356610648434153, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 13431453.0, + "repeat_count": 0.0, + "routers_loss": 0.0015835616504773498, + "skip_count": 0.0, + "step": 8330, + "text_loss": 0.1395341008901596 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02880859375, + "learning_rate": 0.00012336246358434928, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 13434566.0, + "repeat_count": 0.0, + "routers_loss": 0.0012973316479474306, + "skip_count": 0.0, + "step": 8332, + "text_loss": 0.7125005125999451 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.00012315896501257145, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 13438056.0, + "repeat_count": 0.0, + "routers_loss": 0.0005822008824907243, + "skip_count": 0.0, + "step": 8334, + "text_loss": 0.7730510234832764 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00012295561084698915, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 13441390.0, + "repeat_count": 0.0, + "routers_loss": 0.00547185679897666, + "skip_count": 1.0, + "step": 8336, + "text_loss": 0.3927873373031616 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 0.000122752401165528, + "loss": 0.0022, + "macro_f1": 0.3333333432674408, + "num_tokens": 13443864.0, + "repeat_count": 0.0, + "routers_loss": 0.0011191967641934752, + "skip_count": 0.0, + "step": 8338, + "text_loss": 0.3996548354625702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00012254933604605828, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 13447070.0, + "repeat_count": 0.0, + "routers_loss": 0.0005196621641516685, + "skip_count": 0.0, + "step": 8340, + "text_loss": 0.5597847104072571 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 0.00012234641556639508, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 13450522.0, + "repeat_count": 0.0, + "routers_loss": 0.003857341594994068, + "skip_count": 2.0, + "step": 8342, + "text_loss": 0.14400488138198853 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00012214363980429793, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 13453578.0, + "repeat_count": 1.0, + "routers_loss": 0.006664265412837267, + "skip_count": 3.0, + "step": 8344, + "text_loss": 0.27675092220306396 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0595703125, + "learning_rate": 0.00012194100883747078, + "loss": 0.0095, + "macro_f1": 0.3333333432674408, + "num_tokens": 13456480.0, + "repeat_count": 0.0, + "routers_loss": 0.003549816319718957, + "skip_count": 0.0, + "step": 8346, + "text_loss": 0.21776801347732544 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 0.00012173852274356217, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 13459859.0, + "repeat_count": 1.0, + "routers_loss": 0.00446992926299572, + "skip_count": 3.0, + "step": 8348, + "text_loss": 0.1828736811876297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021728515625, + "learning_rate": 0.00012153618160016527, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 13463104.0, + "repeat_count": 0.0, + "routers_loss": 0.0024826989974826574, + "skip_count": 1.0, + "step": 8350, + "text_loss": 0.15649555623531342 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0001213339854848175, + "loss": 0.005, + "macro_f1": 0.6666666865348816, + "num_tokens": 13467051.0, + "repeat_count": 0.0, + "routers_loss": 0.0021385846193879843, + "skip_count": 1.0, + "step": 8352, + "text_loss": 0.49281737208366394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0230712890625, + "learning_rate": 0.00012113193447500081, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 13470411.0, + "repeat_count": 0.0, + "routers_loss": 0.0014382716035470366, + "skip_count": 1.0, + "step": 8354, + "text_loss": 0.5984349846839905 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 0.00012093002864814151, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 13474666.0, + "repeat_count": 0.0, + "routers_loss": 0.008536498062312603, + "skip_count": 1.0, + "step": 8356, + "text_loss": 0.2851131856441498 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.026123046875, + "learning_rate": 0.00012072826808161036, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13477754.0, + "repeat_count": 0.0, + "routers_loss": 0.0027286717668175697, + "skip_count": 0.0, + "step": 8358, + "text_loss": 0.5987376570701599 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039306640625, + "learning_rate": 0.0001205266528527223, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 13481151.0, + "repeat_count": 0.0, + "routers_loss": 0.002780565759167075, + "skip_count": 1.0, + "step": 8360, + "text_loss": 0.1847199648618698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00012032518303873674, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 13484050.0, + "repeat_count": 0.0, + "routers_loss": 0.0006186611135490239, + "skip_count": 0.0, + "step": 8362, + "text_loss": 0.6229772567749023 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 39.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 0.00012012385871685716, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 13488551.0, + "repeat_count": 0.0, + "routers_loss": 0.00956071075052023, + "skip_count": 5.0, + "step": 8364, + "text_loss": 0.2810790538787842 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 0.00011992267996423162, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13491420.0, + "repeat_count": 0.0, + "routers_loss": 0.008410792797803879, + "skip_count": 2.0, + "step": 8366, + "text_loss": 0.20509617030620575 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0283203125, + "learning_rate": 0.00011972164685795212, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 13494736.0, + "repeat_count": 0.0, + "routers_loss": 0.00762166129425168, + "skip_count": 1.0, + "step": 8368, + "text_loss": 0.24739402532577515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.295861461696504, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.00011952075947505486, + "loss": 0.0051, + "macro_f1": 0.3272727429866791, + "num_tokens": 13498363.0, + "repeat_count": 0.0, + "routers_loss": 0.010674391873180866, + "skip_count": 1.0, + "step": 8370, + "text_loss": 0.31931644678115845 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 39.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 0.0001193200178925204, + "loss": 0.0036, + "macro_f1": 1.0, + "num_tokens": 13501029.0, + "repeat_count": 2.0, + "routers_loss": 0.0041843741200864315, + "skip_count": 1.0, + "step": 8372, + "text_loss": 0.5103049278259277 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00011911942218727312, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 13503854.0, + "repeat_count": 0.0, + "routers_loss": 0.0006344785797409713, + "skip_count": 0.0, + "step": 8374, + "text_loss": 0.4914432764053345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0262451171875, + "learning_rate": 0.00011891897243618183, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 13508316.0, + "repeat_count": 0.0, + "routers_loss": 0.0003527739318087697, + "skip_count": 0.0, + "step": 8376, + "text_loss": 0.5317551493644714 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00011871866871605913, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 13512603.0, + "repeat_count": 0.0, + "routers_loss": 0.001071247854270041, + "skip_count": 0.0, + "step": 8378, + "text_loss": 0.6693558096885681 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00011851851110366185, + "loss": 0.0065, + "macro_f1": 0.6666666865348816, + "num_tokens": 13515928.0, + "repeat_count": 0.0, + "routers_loss": 0.000924977008253336, + "skip_count": 1.0, + "step": 8380, + "text_loss": 0.8004939556121826 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0001183184996756908, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13518548.0, + "repeat_count": 0.0, + "routers_loss": 0.0017637151759117842, + "skip_count": 0.0, + "step": 8382, + "text_loss": 0.5012105107307434 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 39.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.00011811863450879063, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 13522155.0, + "repeat_count": 2.0, + "routers_loss": 0.0011129514314234257, + "skip_count": 0.0, + "step": 8384, + "text_loss": 0.3866073489189148 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 39.371000880540066, + "f1_execute": 0.9777777791023254, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0306396484375, + "learning_rate": 0.00011791891567955009, + "loss": 0.0046, + "macro_f1": 0.8814815282821655, + "num_tokens": 13525352.0, + "repeat_count": 2.0, + "routers_loss": 0.042801812291145325, + "skip_count": 4.0, + "step": 8386, + "text_loss": 0.18817944824695587 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018798828125, + "learning_rate": 0.00011771934326450173, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13528537.0, + "repeat_count": 0.0, + "routers_loss": 0.0006869474309496582, + "skip_count": 0.0, + "step": 8388, + "text_loss": 0.6407818794250488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0224609375, + "learning_rate": 0.00011751991734012229, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 13531650.0, + "repeat_count": 0.0, + "routers_loss": 0.0008001072565093637, + "skip_count": 0.0, + "step": 8390, + "text_loss": 0.5149344205856323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036865234375, + "learning_rate": 0.00011732063798283204, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 13535071.0, + "repeat_count": 0.0, + "routers_loss": 0.0006921148742549121, + "skip_count": 0.0, + "step": 8392, + "text_loss": 0.5906356573104858 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00011712150526899523, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 13537741.0, + "repeat_count": 0.0, + "routers_loss": 0.005221226718276739, + "skip_count": 2.0, + "step": 8394, + "text_loss": 0.3381146192550659 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 0.00011692251927491987, + "loss": 0.006, + "macro_f1": 1.0, + "num_tokens": 13541189.0, + "repeat_count": 1.0, + "routers_loss": 0.0023983579594641924, + "skip_count": 1.0, + "step": 8396, + "text_loss": 0.7345486283302307 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 0.00011672368007685774, + "loss": 0.0069, + "macro_f1": 1.0, + "num_tokens": 13545210.0, + "repeat_count": 1.0, + "routers_loss": 0.005362956319004297, + "skip_count": 2.0, + "step": 8398, + "text_loss": 0.6522865295410156 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 0.00011652498775100445, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 13548260.0, + "repeat_count": 0.0, + "routers_loss": 0.002955642296001315, + "skip_count": 0.0, + "step": 8400, + "text_loss": 0.3200102150440216 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.00011632644237349927, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13551519.0, + "repeat_count": 0.0, + "routers_loss": 0.001079231034964323, + "skip_count": 0.0, + "step": 8402, + "text_loss": 0.7251807451248169 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 39.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 0.00011612804402042509, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 13555241.0, + "repeat_count": 1.0, + "routers_loss": 0.013860360719263554, + "skip_count": 0.0, + "step": 8404, + "text_loss": 0.159539595246315 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.800000011920929, + "avg_layers": 25.0, + "epoch": 39.46492515409451, + "f1_execute": 0.9777777791023254, + "f1_repeat": 1.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.054931640625, + "learning_rate": 0.00011592979276780857, + "loss": 0.0055, + "macro_f1": 0.9555556178092957, + "num_tokens": 13558389.0, + "repeat_count": 1.0, + "routers_loss": 0.017025530338287354, + "skip_count": 5.0, + "step": 8406, + "text_loss": 0.5154430270195007 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 0.00011573168869162004, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 13561237.0, + "repeat_count": 1.0, + "routers_loss": 0.007349071092903614, + "skip_count": 2.0, + "step": 8408, + "text_loss": 0.20888492465019226 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 0.00011553373186777327, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 13564080.0, + "repeat_count": 1.0, + "routers_loss": 0.003303215140476823, + "skip_count": 2.0, + "step": 8410, + "text_loss": 0.21808166801929474 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.49310243616085, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00011533592237212558, + "loss": 0.0035, + "macro_f1": 0.6666666865348816, + "num_tokens": 13566649.0, + "repeat_count": 0.0, + "routers_loss": 0.005856195464730263, + "skip_count": 1.0, + "step": 8412, + "text_loss": 0.28037169575691223 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001151382602804782, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13570015.0, + "repeat_count": 0.0, + "routers_loss": 0.0007515792385675013, + "skip_count": 0.0, + "step": 8414, + "text_loss": 0.8517835736274719 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00011494074566857549, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 13573262.0, + "repeat_count": 0.0, + "routers_loss": 0.0043421462178230286, + "skip_count": 0.0, + "step": 8416, + "text_loss": 0.27418580651283264 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00011474337861210544, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 13576104.0, + "repeat_count": 1.0, + "routers_loss": 0.0108594736084342, + "skip_count": 2.0, + "step": 8418, + "text_loss": 0.4724268317222595 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.53067214558262, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.028076171875, + "learning_rate": 0.00011454615918669948, + "loss": 0.008, + "macro_f1": 0.3272727429866791, + "num_tokens": 13579138.0, + "repeat_count": 1.0, + "routers_loss": 0.04178442806005478, + "skip_count": 0.0, + "step": 8420, + "text_loss": 0.4065103530883789 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.00011434908746793238, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 13582818.0, + "repeat_count": 0.0, + "routers_loss": 0.004756448790431023, + "skip_count": 2.0, + "step": 8422, + "text_loss": 0.2932167947292328 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00011415216353132252, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 13586261.0, + "repeat_count": 0.0, + "routers_loss": 0.0033427432645112276, + "skip_count": 1.0, + "step": 8424, + "text_loss": 0.47670233249664307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0001139553874523313, + "loss": 0.003, + "macro_f1": 0.6666666865348816, + "num_tokens": 13589765.0, + "repeat_count": 0.0, + "routers_loss": 0.006597383879125118, + "skip_count": 1.0, + "step": 8426, + "text_loss": 0.31448885798454285 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.5682418550044, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042724609375, + "learning_rate": 0.00011375875930636403, + "loss": 0.005, + "macro_f1": 0.3272727429866791, + "num_tokens": 13592741.0, + "repeat_count": 0.0, + "routers_loss": 0.011398134753108025, + "skip_count": 1.0, + "step": 8428, + "text_loss": 0.17429469525814056 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 39.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.051025390625, + "learning_rate": 0.00011356227916876877, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 13595763.0, + "repeat_count": 1.0, + "routers_loss": 0.0038021153304725885, + "skip_count": 0.0, + "step": 8430, + "text_loss": 0.6043882966041565 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00011336594711483712, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 13598274.0, + "repeat_count": 0.0, + "routers_loss": 0.00044314167462289333, + "skip_count": 0.0, + "step": 8432, + "text_loss": 0.3818575143814087 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00011316976321980388, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13601510.0, + "repeat_count": 0.0, + "routers_loss": 0.001956664025783539, + "skip_count": 0.0, + "step": 8434, + "text_loss": 0.48483794927597046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.0001129737275588471, + "loss": 0.0063, + "macro_f1": 0.3333333432674408, + "num_tokens": 13604410.0, + "repeat_count": 0.0, + "routers_loss": 0.005170237272977829, + "skip_count": 0.0, + "step": 8436, + "text_loss": 0.21759741008281708 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00011277784020708803, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 13607207.0, + "repeat_count": 1.0, + "routers_loss": 0.002223948948085308, + "skip_count": 2.0, + "step": 8438, + "text_loss": 0.6877034306526184 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 0.00011258210123959089, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13610981.0, + "repeat_count": 0.0, + "routers_loss": 0.0017733481945469975, + "skip_count": 1.0, + "step": 8440, + "text_loss": 0.7250658273696899 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00011238651073136358, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 13614194.0, + "repeat_count": 1.0, + "routers_loss": 0.00155889883171767, + "skip_count": 1.0, + "step": 8442, + "text_loss": 0.6742649078369141 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 0.00011219106875735652, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 13618011.0, + "repeat_count": 0.0, + "routers_loss": 0.0011234934208914638, + "skip_count": 0.0, + "step": 8444, + "text_loss": 0.8105526566505432 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 39.65277370120341, + "f1_execute": 0.9811320900917053, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00011199577539246347, + "loss": 0.0055, + "macro_f1": 0.6603773832321167, + "num_tokens": 13621852.0, + "repeat_count": 1.0, + "routers_loss": 0.02346695400774479, + "skip_count": 1.0, + "step": 8446, + "text_loss": 0.22664032876491547 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02294921875, + "learning_rate": 0.0001118006307115213, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 13624711.0, + "repeat_count": 0.0, + "routers_loss": 0.012819754891097546, + "skip_count": 2.0, + "step": 8448, + "text_loss": 0.31696105003356934 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 39.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00011160563478930969, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 13627561.0, + "repeat_count": 0.0, + "routers_loss": 0.0060531035996973515, + "skip_count": 2.0, + "step": 8450, + "text_loss": 0.2935826778411865 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00011141078770055152, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13630445.0, + "repeat_count": 0.0, + "routers_loss": 0.004288572818040848, + "skip_count": 0.0, + "step": 8452, + "text_loss": 0.5720692873001099 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00011121608951991252, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 13633496.0, + "repeat_count": 0.0, + "routers_loss": 0.005682424642145634, + "skip_count": 1.0, + "step": 8454, + "text_loss": 0.28466710448265076 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00011102154032200146, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13635938.0, + "repeat_count": 0.0, + "routers_loss": 0.0009555552969686687, + "skip_count": 0.0, + "step": 8456, + "text_loss": 0.47744694352149963 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00011082714018136985, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 13638863.0, + "repeat_count": 0.0, + "routers_loss": 0.0023627313785254955, + "skip_count": 0.0, + "step": 8458, + "text_loss": 0.5212090611457825 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00011063288917251235, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 13641874.0, + "repeat_count": 1.0, + "routers_loss": 0.00791920255869627, + "skip_count": 2.0, + "step": 8460, + "text_loss": 0.31359919905662537 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.72791312004696, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00011043878736986607, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 13644970.0, + "repeat_count": 1.0, + "routers_loss": 0.0033252311404794455, + "skip_count": 1.0, + "step": 8462, + "text_loss": 0.33621230721473694 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.00011024483484781144, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 13648103.0, + "repeat_count": 1.0, + "routers_loss": 0.005567418877035379, + "skip_count": 2.0, + "step": 8464, + "text_loss": 0.48708856105804443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0235595703125, + "learning_rate": 0.00011005103168067143, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 13651085.0, + "repeat_count": 0.0, + "routers_loss": 0.00047958645154722035, + "skip_count": 0.0, + "step": 8466, + "text_loss": 0.4151248633861542 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00010985737794271161, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 13654175.0, + "repeat_count": 0.0, + "routers_loss": 0.0009806647431105375, + "skip_count": 0.0, + "step": 8468, + "text_loss": 0.7322396039962769 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 0.00010966387370814057, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13657058.0, + "repeat_count": 0.0, + "routers_loss": 0.0009820344857871532, + "skip_count": 0.0, + "step": 8470, + "text_loss": 0.6350769400596619 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 39.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00010947051905110945, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 13660203.0, + "repeat_count": 2.0, + "routers_loss": 0.002065197564661503, + "skip_count": 0.0, + "step": 8472, + "text_loss": 0.6025850176811218 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.78426768417963, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00010927731404571211, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 13664021.0, + "repeat_count": 0.0, + "routers_loss": 0.0009939799783751369, + "skip_count": 0.0, + "step": 8474, + "text_loss": 0.3040087819099426 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0234375, + "learning_rate": 0.0001090842587659851, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13667055.0, + "repeat_count": 0.0, + "routers_loss": 0.0008282510680146515, + "skip_count": 0.0, + "step": 8476, + "text_loss": 0.7306531667709351 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0001088913532859076, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13669940.0, + "repeat_count": 0.0, + "routers_loss": 0.0008349589770659804, + "skip_count": 0.0, + "step": 8478, + "text_loss": 0.32041916251182556 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00010869859767940133, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 13672955.0, + "repeat_count": 0.0, + "routers_loss": 0.0007435405277647078, + "skip_count": 0.0, + "step": 8480, + "text_loss": 0.5343614816665649 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03076171875, + "learning_rate": 0.00010850599202033051, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 13676173.0, + "repeat_count": 0.0, + "routers_loss": 0.002763360273092985, + "skip_count": 0.0, + "step": 8482, + "text_loss": 0.6071668267250061 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.034423828125, + "learning_rate": 0.00010831353638250213, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 13680121.0, + "repeat_count": 0.0, + "routers_loss": 0.00202178000472486, + "skip_count": 0.0, + "step": 8484, + "text_loss": 0.42487844824790955 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00010812123083966535, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 13683504.0, + "repeat_count": 0.0, + "routers_loss": 0.0056348275393247604, + "skip_count": 1.0, + "step": 8486, + "text_loss": 0.17678795754909515 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037353515625, + "learning_rate": 0.00010792907546551229, + "loss": 0.0079, + "macro_f1": 0.3333333432674408, + "num_tokens": 13686870.0, + "repeat_count": 0.0, + "routers_loss": 0.003331703832373023, + "skip_count": 0.0, + "step": 8488, + "text_loss": 0.32238465547561646 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 0.00010773707033367708, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 13690429.0, + "repeat_count": 0.0, + "routers_loss": 0.0011620528530329466, + "skip_count": 0.0, + "step": 8490, + "text_loss": 0.4141998291015625 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 39.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.041015625, + "learning_rate": 0.00010754521551773655, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 13693747.0, + "repeat_count": 1.0, + "routers_loss": 0.005236583761870861, + "skip_count": 0.0, + "step": 8492, + "text_loss": 0.557283878326416 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 39.878191957734074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 0.00010735351109120972, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 13696837.0, + "repeat_count": 0.0, + "routers_loss": 0.005507425405085087, + "skip_count": 6.0, + "step": 8494, + "text_loss": 0.7394861578941345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 0.00010716195712755821, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13700080.0, + "repeat_count": 0.0, + "routers_loss": 0.0008621517335996032, + "skip_count": 0.0, + "step": 8496, + "text_loss": 0.7079368233680725 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 0.00010697055370018572, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13704088.0, + "repeat_count": 0.0, + "routers_loss": 0.0004489862476475537, + "skip_count": 0.0, + "step": 8498, + "text_loss": 0.5672308206558228 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 0.00010677930088243847, + "loss": 0.0077, + "macro_f1": 1.0, + "num_tokens": 13707391.0, + "repeat_count": 1.0, + "routers_loss": 0.009171495214104652, + "skip_count": 2.0, + "step": 8500, + "text_loss": 0.6851600408554077 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029541015625, + "learning_rate": 0.00010658819874760495, + "loss": 0.0058, + "macro_f1": 0.6666666865348816, + "num_tokens": 13711238.0, + "repeat_count": 0.0, + "routers_loss": 0.0016714727971702814, + "skip_count": 1.0, + "step": 8502, + "text_loss": 0.7102733850479126 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00010639724736891576, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 13714553.0, + "repeat_count": 0.0, + "routers_loss": 0.0012916292762383819, + "skip_count": 0.0, + "step": 8504, + "text_loss": 0.4234752953052521 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 0.0001062064468195439, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 13718046.0, + "repeat_count": 0.0, + "routers_loss": 0.0005265420186333358, + "skip_count": 0.0, + "step": 8506, + "text_loss": 0.5576326251029968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0218505859375, + "learning_rate": 0.0001060157971726045, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13720687.0, + "repeat_count": 0.0, + "routers_loss": 0.0023503501433879137, + "skip_count": 1.0, + "step": 8508, + "text_loss": 0.5259605646133423 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 0.00010582529850115469, + "loss": 0.0066, + "macro_f1": 0.3333333432674408, + "num_tokens": 13723946.0, + "repeat_count": 0.0, + "routers_loss": 0.0007593657355755568, + "skip_count": 0.0, + "step": 8510, + "text_loss": 0.3795129954814911 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.05419921875, + "learning_rate": 0.00010563495087819419, + "loss": 0.0077, + "macro_f1": 0.3333333432674408, + "num_tokens": 13727589.0, + "repeat_count": 0.0, + "routers_loss": 0.0005672222469002008, + "skip_count": 0.0, + "step": 8512, + "text_loss": 0.685897946357727 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 39.972116231288524, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.029296875, + "learning_rate": 0.00010544475437666445, + "loss": 0.0049, + "macro_f1": 0.9262410998344421, + "num_tokens": 13730579.0, + "repeat_count": 3.0, + "routers_loss": 0.01708158478140831, + "skip_count": 2.0, + "step": 8514, + "text_loss": 0.8044925332069397 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 39.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0517578125, + "learning_rate": 0.00010525470906944917, + "loss": 0.0113, + "macro_f1": 1.0, + "num_tokens": 13733563.0, + "repeat_count": 1.0, + "routers_loss": 0.010253295302391052, + "skip_count": 2.0, + "step": 8516, + "text_loss": 0.3999447524547577 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 39.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0233154296875, + "learning_rate": 0.00010506481502937398, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 13736645.0, + "repeat_count": 0.0, + "routers_loss": 0.004293019883334637, + "skip_count": 0.0, + "step": 8518, + "text_loss": 0.3128681778907776 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 40.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 0.00010487507232920674, + "loss": 0.0039, + "macro_f1": 1.0, + "num_tokens": 13740080.0, + "repeat_count": 1.0, + "routers_loss": 0.0030790462624281645, + "skip_count": 1.0, + "step": 8520, + "text_loss": 0.39142900705337524 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 0.00010468548104165709, + "loss": 0.0069, + "macro_f1": 0.3333333432674408, + "num_tokens": 13743085.0, + "repeat_count": 0.0, + "routers_loss": 0.0007342757890000939, + "skip_count": 0.0, + "step": 8522, + "text_loss": 0.7652465105056763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 0.00010449604123937689, + "loss": 0.006, + "macro_f1": 0.3333333432674408, + "num_tokens": 13746513.0, + "repeat_count": 0.0, + "routers_loss": 0.0030496022664010525, + "skip_count": 0.0, + "step": 8524, + "text_loss": 0.6259746551513672 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 40.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0206298828125, + "learning_rate": 0.00010430675299495973, + "loss": 0.0044, + "macro_f1": 1.0, + "num_tokens": 13749391.0, + "repeat_count": 1.0, + "routers_loss": 0.010060965083539486, + "skip_count": 1.0, + "step": 8526, + "text_loss": 0.2266668826341629 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 0.0001041176163809413, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 13752449.0, + "repeat_count": 1.0, + "routers_loss": 0.002234962536022067, + "skip_count": 2.0, + "step": 8528, + "text_loss": 0.9742465019226074 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00010392863146979903, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 13755572.0, + "repeat_count": 0.0, + "routers_loss": 0.0003572004789020866, + "skip_count": 0.0, + "step": 8530, + "text_loss": 0.5757357478141785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.00010373979833395242, + "loss": 0.0088, + "macro_f1": 0.3333333432674408, + "num_tokens": 13759198.0, + "repeat_count": 0.0, + "routers_loss": 0.011161680333316326, + "skip_count": 0.0, + "step": 8532, + "text_loss": 0.6268131136894226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 0.00010355111704576236, + "loss": 0.004, + "macro_f1": 0.3333333432674408, + "num_tokens": 13761914.0, + "repeat_count": 0.0, + "routers_loss": 0.002053353004157543, + "skip_count": 0.0, + "step": 8534, + "text_loss": 0.22388778626918793 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00010336258767753232, + "loss": 0.0061, + "macro_f1": 0.6666666865348816, + "num_tokens": 13765371.0, + "repeat_count": 0.0, + "routers_loss": 0.003634720342233777, + "skip_count": 2.0, + "step": 8536, + "text_loss": 0.5802993178367615 + }, + { + "acc_repeat": 0.800000011920929, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.084531846199, + "f1_execute": 0.9729729890823364, + "f1_repeat": 0.888888955116272, + "f1_skip": 1.0, + "grad_norm": 0.031494140625, + "learning_rate": 0.00010317421030150692, + "loss": 0.0072, + "macro_f1": 0.9539539813995361, + "num_tokens": 13768276.0, + "repeat_count": 5.0, + "routers_loss": 0.053806692361831665, + "skip_count": 5.0, + "step": 8538, + "text_loss": 0.10888377577066422 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.09392427355445, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.07275390625, + "learning_rate": 0.00010298598498987266, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 13772369.0, + "repeat_count": 0.0, + "routers_loss": 0.00501362606883049, + "skip_count": 1.0, + "step": 8540, + "text_loss": 0.5794995427131653 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.024658203125, + "learning_rate": 0.00010279791181475795, + "loss": 0.0082, + "macro_f1": 1.0, + "num_tokens": 13776595.0, + "repeat_count": 1.0, + "routers_loss": 0.002230882178992033, + "skip_count": 2.0, + "step": 8542, + "text_loss": 0.5503702163696289 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.035400390625, + "learning_rate": 0.00010260999084823264, + "loss": 0.0061, + "macro_f1": 0.3333333432674408, + "num_tokens": 13779993.0, + "repeat_count": 0.0, + "routers_loss": 0.0012205395614728332, + "skip_count": 0.0, + "step": 8544, + "text_loss": 0.7248672842979431 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 0.00010242222216230856, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 13782683.0, + "repeat_count": 0.0, + "routers_loss": 0.0003966465883422643, + "skip_count": 0.0, + "step": 8546, + "text_loss": 0.7446619272232056 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 0.00010223460582893889, + "loss": 0.0036, + "macro_f1": 0.6666666865348816, + "num_tokens": 13785534.0, + "repeat_count": 0.0, + "routers_loss": 0.004968565888702869, + "skip_count": 1.0, + "step": 8548, + "text_loss": 0.22457796335220337 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020263671875, + "learning_rate": 0.00010204714192001863, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13788608.0, + "repeat_count": 0.0, + "routers_loss": 0.0033054195810109377, + "skip_count": 2.0, + "step": 8550, + "text_loss": 0.418837308883667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 0.00010185983050738434, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 13791553.0, + "repeat_count": 0.0, + "routers_loss": 0.001166256028227508, + "skip_count": 0.0, + "step": 8552, + "text_loss": 0.4060337543487549 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00010167267166281402, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 13795304.0, + "repeat_count": 0.0, + "routers_loss": 0.003844029037281871, + "skip_count": 2.0, + "step": 8554, + "text_loss": 0.17412975430488586 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023681640625, + "learning_rate": 0.00010148566545802718, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 13798445.0, + "repeat_count": 0.0, + "routers_loss": 0.0033507589250802994, + "skip_count": 0.0, + "step": 8556, + "text_loss": 0.24744336307048798 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.17845611975345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02783203125, + "learning_rate": 0.00010129881196468527, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 13801338.0, + "repeat_count": 0.0, + "routers_loss": 0.004076482728123665, + "skip_count": 0.0, + "step": 8558, + "text_loss": 0.6542767882347107 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01806640625, + "learning_rate": 0.00010111211125439069, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 13804157.0, + "repeat_count": 0.0, + "routers_loss": 0.0005654391716234386, + "skip_count": 0.0, + "step": 8560, + "text_loss": 0.527079701423645 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 0.00010092556339868758, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13807411.0, + "repeat_count": 0.0, + "routers_loss": 0.004915264435112476, + "skip_count": 1.0, + "step": 8562, + "text_loss": 0.721017599105835 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 0.00010073916846906139, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 13810489.0, + "repeat_count": 0.0, + "routers_loss": 0.005571382585912943, + "skip_count": 1.0, + "step": 8564, + "text_loss": 0.5802517533302307 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02587890625, + "learning_rate": 0.00010055292653693903, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 13813526.0, + "repeat_count": 0.0, + "routers_loss": 0.001321605988778174, + "skip_count": 0.0, + "step": 8566, + "text_loss": 0.5485247373580933 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.041259765625, + "learning_rate": 0.00010036683767368859, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 13817225.0, + "repeat_count": 0.0, + "routers_loss": 0.001876185997389257, + "skip_count": 0.0, + "step": 8568, + "text_loss": 0.08957820385694504 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00010018090195061997, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13820667.0, + "repeat_count": 0.0, + "routers_loss": 0.004593426361680031, + "skip_count": 0.0, + "step": 8570, + "text_loss": 0.24580086767673492 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 9.999511943898398e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 13824505.0, + "repeat_count": 0.0, + "routers_loss": 0.0022372701205313206, + "skip_count": 0.0, + "step": 8572, + "text_loss": 0.20976831018924713 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.027587890625, + "learning_rate": 9.980949020997276e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 13827623.0, + "repeat_count": 0.0, + "routers_loss": 0.0030519715510308743, + "skip_count": 0.0, + "step": 8574, + "text_loss": 0.7638732194900513 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.26298796595245, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.027587890625, + "learning_rate": 9.962401433471985e-05, + "loss": 0.0068, + "macro_f1": 0.6666666865348816, + "num_tokens": 13831013.0, + "repeat_count": 0.0, + "routers_loss": 0.005036211106926203, + "skip_count": 1.0, + "step": 8576, + "text_loss": 0.3791790306568146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.2723803933079, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 9.943869188429989e-05, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 13833611.0, + "repeat_count": 0.0, + "routers_loss": 0.002071794355288148, + "skip_count": 2.0, + "step": 8578, + "text_loss": 0.5480846166610718 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 40.28177282066334, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.024658203125, + "learning_rate": 9.925352292972884e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 13836678.0, + "repeat_count": 1.0, + "routers_loss": 0.008119060657918453, + "skip_count": 0.0, + "step": 8580, + "text_loss": 0.21605457365512848 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.291165248018785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 9.906850754196379e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 13839255.0, + "repeat_count": 0.0, + "routers_loss": 0.004017427563667297, + "skip_count": 2.0, + "step": 8582, + "text_loss": 0.4473285973072052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.30055767537423, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.045654296875, + "learning_rate": 9.888364579190285e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 13842034.0, + "repeat_count": 0.0, + "routers_loss": 0.005163116846233606, + "skip_count": 1.0, + "step": 8584, + "text_loss": 0.21627424657344818 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.30995010272967, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 9.869893775038557e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 13844648.0, + "repeat_count": 0.0, + "routers_loss": 0.0044358340092003345, + "skip_count": 1.0, + "step": 8586, + "text_loss": 0.5660704970359802 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.319342530085116, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 9.851438348819247e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13847629.0, + "repeat_count": 0.0, + "routers_loss": 0.00038135924842208624, + "skip_count": 1.0, + "step": 8588, + "text_loss": 0.6401235461235046 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.32873495744057, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.028076171875, + "learning_rate": 9.832998307604495e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13851409.0, + "repeat_count": 0.0, + "routers_loss": 0.004005341790616512, + "skip_count": 1.0, + "step": 8590, + "text_loss": 0.43975043296813965 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.33812738479601, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0296630859375, + "learning_rate": 9.814573658460562e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 13854031.0, + "repeat_count": 0.0, + "routers_loss": 0.006872966885566711, + "skip_count": 2.0, + "step": 8592, + "text_loss": 0.6000451445579529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.347519812151454, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 9.796164408447811e-05, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 13856813.0, + "repeat_count": 0.0, + "routers_loss": 0.0019872859120368958, + "skip_count": 0.0, + "step": 8594, + "text_loss": 0.6026073098182678 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.3569122395069, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0228271484375, + "learning_rate": 9.777770564620698e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 13859805.0, + "repeat_count": 0.0, + "routers_loss": 0.013098123483359814, + "skip_count": 2.0, + "step": 8596, + "text_loss": 0.3294500708580017 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 40.36630466686234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 9.759392134027783e-05, + "loss": 0.0049, + "macro_f1": 1.0, + "num_tokens": 13863119.0, + "repeat_count": 1.0, + "routers_loss": 0.001011171261779964, + "skip_count": 1.0, + "step": 8598, + "text_loss": 0.4078965187072754 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.375697094217784, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0252685546875, + "learning_rate": 9.741029123711708e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 13866239.0, + "repeat_count": 0.0, + "routers_loss": 0.003267963184043765, + "skip_count": 0.0, + "step": 8600, + "text_loss": 0.5064641833305359 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.385089521573235, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 9.722681540709228e-05, + "loss": 0.0045, + "macro_f1": 0.6601307392120361, + "num_tokens": 13869647.0, + "repeat_count": 1.0, + "routers_loss": 0.02431299351155758, + "skip_count": 2.0, + "step": 8602, + "text_loss": 0.2512950301170349 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.39448194892868, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0294189453125, + "learning_rate": 9.704349392051155e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13873128.0, + "repeat_count": 0.0, + "routers_loss": 0.0019577480852603912, + "skip_count": 1.0, + "step": 8604, + "text_loss": 0.425156831741333 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.40387437628412, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0201416015625, + "learning_rate": 9.686032684762408e-05, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 13876603.0, + "repeat_count": 0.0, + "routers_loss": 0.001554530463181436, + "skip_count": 1.0, + "step": 8606, + "text_loss": 0.3596082329750061 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.413266803639566, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01519775390625, + "learning_rate": 9.667731425861975e-05, + "loss": 0.0048, + "macro_f1": 0.3333333432674408, + "num_tokens": 13879602.0, + "repeat_count": 0.0, + "routers_loss": 0.0027400986291468143, + "skip_count": 0.0, + "step": 8608, + "text_loss": 0.12101534754037857 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.42265923099501, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 9.649445622362957e-05, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 13882204.0, + "repeat_count": 0.0, + "routers_loss": 0.001957559958100319, + "skip_count": 2.0, + "step": 8610, + "text_loss": 0.382834255695343 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.43205165835045, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.025146484375, + "learning_rate": 9.631175281272491e-05, + "loss": 0.0059, + "macro_f1": 1.0, + "num_tokens": 13886397.0, + "repeat_count": 1.0, + "routers_loss": 0.009613300673663616, + "skip_count": 3.0, + "step": 8612, + "text_loss": 0.24718235433101654 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.441444085705896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 9.612920409591813e-05, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 13889625.0, + "repeat_count": 0.0, + "routers_loss": 0.0015159029280766845, + "skip_count": 0.0, + "step": 8614, + "text_loss": 0.406452476978302 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 40.45083651306135, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0247802734375, + "learning_rate": 9.59468101431622e-05, + "loss": 0.0034, + "macro_f1": 0.6666666865348816, + "num_tokens": 13892518.0, + "repeat_count": 0.0, + "routers_loss": 0.008069832809269428, + "skip_count": 3.0, + "step": 8616, + "text_loss": 0.19740329682826996 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.46022894041679, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0157470703125, + "learning_rate": 9.576457102435082e-05, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 13895822.0, + "repeat_count": 0.0, + "routers_loss": 0.0024340536911040545, + "skip_count": 0.0, + "step": 8618, + "text_loss": 0.44761306047439575 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 40.469621367772234, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02880859375, + "learning_rate": 9.558248680931841e-05, + "loss": 0.0052, + "macro_f1": 1.0, + "num_tokens": 13898829.0, + "repeat_count": 2.0, + "routers_loss": 0.0053517078049480915, + "skip_count": 1.0, + "step": 8620, + "text_loss": 0.37335118651390076 + }, + { + "acc_repeat": 0.6666666865348816, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.47901379512768, + "f1_execute": 0.9767441749572754, + "f1_repeat": 0.800000011920929, + "f1_skip": 1.0, + "grad_norm": 0.021484375, + "learning_rate": 9.540055756783994e-05, + "loss": 0.0061, + "macro_f1": 0.9255813956260681, + "num_tokens": 13902122.0, + "repeat_count": 3.0, + "routers_loss": 0.03885587304830551, + "skip_count": 4.0, + "step": 8622, + "text_loss": 0.21311092376708984 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.48840622248312, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.051025390625, + "learning_rate": 9.521878336963108e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 13904874.0, + "repeat_count": 0.0, + "routers_loss": 0.007965708151459694, + "skip_count": 1.0, + "step": 8624, + "text_loss": 0.27229398488998413 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.497798649838565, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020263671875, + "learning_rate": 9.5037164284348e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 13907755.0, + "repeat_count": 0.0, + "routers_loss": 0.0019825168419629335, + "skip_count": 0.0, + "step": 8626, + "text_loss": 0.6535577178001404 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.507191077194015, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.042236328125, + "learning_rate": 9.485570038158747e-05, + "loss": 0.0085, + "macro_f1": 0.3272727429866791, + "num_tokens": 13910619.0, + "repeat_count": 1.0, + "routers_loss": 0.017803344875574112, + "skip_count": 0.0, + "step": 8628, + "text_loss": 0.26617178320884705 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.51658350454946, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0419921875, + "learning_rate": 9.467439173088687e-05, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 13914098.0, + "repeat_count": 0.0, + "routers_loss": 0.0025836096610873938, + "skip_count": 0.0, + "step": 8630, + "text_loss": 0.44465285539627075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.5259759319049, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 9.44932384017238e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 13917192.0, + "repeat_count": 0.0, + "routers_loss": 0.004438584204763174, + "skip_count": 2.0, + "step": 8632, + "text_loss": 0.33622798323631287 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.535368359260346, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0277099609375, + "learning_rate": 9.431224046351688e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 13920067.0, + "repeat_count": 0.0, + "routers_loss": 0.017312567681074142, + "skip_count": 2.0, + "step": 8634, + "text_loss": 0.31870952248573303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.54476078661579, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0255126953125, + "learning_rate": 9.413139798562476e-05, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 13922887.0, + "repeat_count": 0.0, + "routers_loss": 0.0019389945082366467, + "skip_count": 0.0, + "step": 8636, + "text_loss": 0.18223261833190918 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.55415321397123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 9.395071103734648e-05, + "loss": 0.0075, + "macro_f1": 0.3333333432674408, + "num_tokens": 13926545.0, + "repeat_count": 0.0, + "routers_loss": 0.0011485094437375665, + "skip_count": 0.0, + "step": 8638, + "text_loss": 0.48031774163246155 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 40.563545641326684, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0184326171875, + "learning_rate": 9.377017968792179e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 13931171.0, + "repeat_count": 1.0, + "routers_loss": 0.003448521951213479, + "skip_count": 0.0, + "step": 8640, + "text_loss": 0.7585139870643616 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 40.57293806868213, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.0213623046875, + "learning_rate": 9.35898040065305e-05, + "loss": 0.0048, + "macro_f1": 0.5492662787437439, + "num_tokens": 13934369.0, + "repeat_count": 0.0, + "routers_loss": 0.017959754914045334, + "skip_count": 2.0, + "step": 8642, + "text_loss": 0.49708613753318787 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.58233049603757, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.018310546875, + "learning_rate": 9.3409584062293e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 13938166.0, + "repeat_count": 0.0, + "routers_loss": 0.004092653747648001, + "skip_count": 1.0, + "step": 8644, + "text_loss": 0.20662656426429749 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.591722923393014, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 9.322951992426992e-05, + "loss": 0.0043, + "macro_f1": 0.3333333432674408, + "num_tokens": 13941922.0, + "repeat_count": 0.0, + "routers_loss": 0.0026206092443317175, + "skip_count": 0.0, + "step": 8646, + "text_loss": 0.4735889434814453 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 40.60111535074846, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 9.304961166146209e-05, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 13945569.0, + "repeat_count": 3.0, + "routers_loss": 0.005156307481229305, + "skip_count": 2.0, + "step": 8648, + "text_loss": 0.5630270838737488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.6105077781039, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02685546875, + "learning_rate": 9.286985934281079e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 13948357.0, + "repeat_count": 0.0, + "routers_loss": 0.004913610871881247, + "skip_count": 1.0, + "step": 8650, + "text_loss": 0.4053497016429901 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.619900205459345, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0186767578125, + "learning_rate": 9.26902630371974e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 13952543.0, + "repeat_count": 0.0, + "routers_loss": 0.003946282435208559, + "skip_count": 2.0, + "step": 8652, + "text_loss": 0.40166863799095154 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.629292632814796, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.030029296875, + "learning_rate": 9.251082281344358e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 13955917.0, + "repeat_count": 0.0, + "routers_loss": 0.0009605551022104919, + "skip_count": 0.0, + "step": 8654, + "text_loss": 0.20477983355522156 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 40.63868506017024, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0269775390625, + "learning_rate": 9.233153874031102e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 13960071.0, + "repeat_count": 0.0, + "routers_loss": 0.004408199340105057, + "skip_count": 3.0, + "step": 8656, + "text_loss": 0.3349814713001251 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.64807748752568, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0208740234375, + "learning_rate": 9.215241088650194e-05, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 13963125.0, + "repeat_count": 1.0, + "routers_loss": 0.005541396792978048, + "skip_count": 2.0, + "step": 8658, + "text_loss": 0.6602919697761536 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.657469914881126, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 9.197343932065843e-05, + "loss": 0.0058, + "macro_f1": 0.3333333432674408, + "num_tokens": 13966130.0, + "repeat_count": 0.0, + "routers_loss": 0.001636760076507926, + "skip_count": 0.0, + "step": 8660, + "text_loss": 0.7704628109931946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.66686234223657, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031494140625, + "learning_rate": 9.179462411136263e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 13969791.0, + "repeat_count": 0.0, + "routers_loss": 0.0006453761598095298, + "skip_count": 0.0, + "step": 8662, + "text_loss": 0.3898075520992279 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 40.67625476959201, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 9.161596532713695e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 13972987.0, + "repeat_count": 0.0, + "routers_loss": 0.005081792362034321, + "skip_count": 4.0, + "step": 8664, + "text_loss": 0.8477506041526794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.685647196947464, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0269775390625, + "learning_rate": 9.143746303644374e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 13976505.0, + "repeat_count": 0.0, + "routers_loss": 0.0032063762191683054, + "skip_count": 0.0, + "step": 8666, + "text_loss": 0.23729658126831055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.69503962430291, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0274658203125, + "learning_rate": 9.125911730768543e-05, + "loss": 0.0067, + "macro_f1": 0.3333333432674408, + "num_tokens": 13980061.0, + "repeat_count": 0.0, + "routers_loss": 0.00043821477447636425, + "skip_count": 0.0, + "step": 8668, + "text_loss": 0.4233637750148773 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.70443205165835, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0260009765625, + "learning_rate": 9.108092820920438e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 13983407.0, + "repeat_count": 0.0, + "routers_loss": 0.007779054809361696, + "skip_count": 2.0, + "step": 8670, + "text_loss": 0.5050316452980042 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.713824479013795, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03662109375, + "learning_rate": 9.090289580928307e-05, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 13986725.0, + "repeat_count": 0.0, + "routers_loss": 0.0018697676714509726, + "skip_count": 1.0, + "step": 8672, + "text_loss": 1.0568488836288452 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.72321690636924, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0302734375, + "learning_rate": 9.072502017614382e-05, + "loss": 0.0053, + "macro_f1": 0.3333333432674408, + "num_tokens": 13990765.0, + "repeat_count": 0.0, + "routers_loss": 0.002077789744362235, + "skip_count": 0.0, + "step": 8674, + "text_loss": 0.48911142349243164 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 40.73260933372468, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0419921875, + "learning_rate": 9.054730137794887e-05, + "loss": 0.0081, + "macro_f1": 0.6598639488220215, + "num_tokens": 13994083.0, + "repeat_count": 1.0, + "routers_loss": 0.044373031705617905, + "skip_count": 3.0, + "step": 8676, + "text_loss": 0.3420281708240509 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.74200176108013, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 9.036973948280048e-05, + "loss": 0.007, + "macro_f1": 0.3333333432674408, + "num_tokens": 13997500.0, + "repeat_count": 0.0, + "routers_loss": 0.0015431724023073912, + "skip_count": 0.0, + "step": 8678, + "text_loss": 0.21514096856117249 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.751394188435576, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 9.019233455874049e-05, + "loss": 0.0066, + "macro_f1": 0.6666666865348816, + "num_tokens": 14000460.0, + "repeat_count": 0.0, + "routers_loss": 0.006088062655180693, + "skip_count": 1.0, + "step": 8680, + "text_loss": 0.43932875990867615 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.76078661579102, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.026611328125, + "learning_rate": 9.001508667375107e-05, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 14003537.0, + "repeat_count": 2.0, + "routers_loss": 0.01006145216524601, + "skip_count": 3.0, + "step": 8682, + "text_loss": 0.2192728966474533 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.77017904314646, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029541015625, + "learning_rate": 8.983799589575393e-05, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 14005943.0, + "repeat_count": 0.0, + "routers_loss": 0.001044525415636599, + "skip_count": 0.0, + "step": 8684, + "text_loss": 0.8686383962631226 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.77957147050191, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 8.96610622926104e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14008954.0, + "repeat_count": 0.0, + "routers_loss": 0.004876079503446817, + "skip_count": 2.0, + "step": 8686, + "text_loss": 0.2513524889945984 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.78896389785735, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 8.948428593212193e-05, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 14012268.0, + "repeat_count": 1.0, + "routers_loss": 0.007909095846116543, + "skip_count": 2.0, + "step": 8688, + "text_loss": 0.17117907106876373 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.798356325212794, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0277099609375, + "learning_rate": 8.930766688202946e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 14015192.0, + "repeat_count": 0.0, + "routers_loss": 0.0022194553166627884, + "skip_count": 0.0, + "step": 8690, + "text_loss": 0.637697160243988 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 40.807748752568244, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0162353515625, + "learning_rate": 8.913120521001383e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 14018055.0, + "repeat_count": 1.0, + "routers_loss": 0.0023777696769684553, + "skip_count": 0.0, + "step": 8692, + "text_loss": 0.39099860191345215 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.81714117992369, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 8.895490098369535e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14021035.0, + "repeat_count": 0.0, + "routers_loss": 0.002676652278751135, + "skip_count": 1.0, + "step": 8694, + "text_loss": 0.6112156510353088 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.82653360727913, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0230712890625, + "learning_rate": 8.877875427063431e-05, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 14023759.0, + "repeat_count": 0.0, + "routers_loss": 0.001040685223415494, + "skip_count": 0.0, + "step": 8696, + "text_loss": 0.3562681972980499 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 40.835926034634575, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0341796875, + "learning_rate": 8.86027651383302e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 14026090.0, + "repeat_count": 1.0, + "routers_loss": 0.0011444527190178633, + "skip_count": 0.0, + "step": 8698, + "text_loss": 0.6152632236480713 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 40.84531846199002, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.04345703125, + "learning_rate": 8.842693365422266e-05, + "loss": 0.008, + "macro_f1": 0.8817967176437378, + "num_tokens": 14029570.0, + "repeat_count": 2.0, + "routers_loss": 0.024327632039785385, + "skip_count": 3.0, + "step": 8700, + "text_loss": 0.2170596867799759 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.85471088934546, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.029052734375, + "learning_rate": 8.825125988569061e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 14032418.0, + "repeat_count": 0.0, + "routers_loss": 0.00048010432510636747, + "skip_count": 0.0, + "step": 8702, + "text_loss": 0.4421340525150299 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.86410331670091, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033203125, + "learning_rate": 8.807574390005241e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 14035610.0, + "repeat_count": 0.0, + "routers_loss": 0.0010498231276869774, + "skip_count": 0.0, + "step": 8704, + "text_loss": 0.3656717538833618 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.873495744056356, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0272216796875, + "learning_rate": 8.790038576456627e-05, + "loss": 0.0045, + "macro_f1": 0.3272727429866791, + "num_tokens": 14039354.0, + "repeat_count": 0.0, + "routers_loss": 0.019302964210510254, + "skip_count": 1.0, + "step": 8706, + "text_loss": 0.6150856018066406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.8828881714118, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.023193359375, + "learning_rate": 8.772518554642972e-05, + "loss": 0.0029, + "macro_f1": 0.3333333432674408, + "num_tokens": 14042353.0, + "repeat_count": 0.0, + "routers_loss": 0.004211598541587591, + "skip_count": 0.0, + "step": 8708, + "text_loss": 0.17178772389888763 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.89228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.022705078125, + "learning_rate": 8.755014331277972e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14045704.0, + "repeat_count": 0.0, + "routers_loss": 0.0007902922225184739, + "skip_count": 0.0, + "step": 8710, + "text_loss": 0.6289885640144348 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.90167302612269, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03271484375, + "learning_rate": 8.737525913069277e-05, + "loss": 0.0062, + "macro_f1": 1.0, + "num_tokens": 14048743.0, + "repeat_count": 1.0, + "routers_loss": 0.007915202528238297, + "skip_count": 2.0, + "step": 8712, + "text_loss": 0.2778690457344055 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 40.91106545347813, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0380859375, + "learning_rate": 8.720053306718506e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14052762.0, + "repeat_count": 0.0, + "routers_loss": 0.0027877227403223515, + "skip_count": 3.0, + "step": 8714, + "text_loss": 0.3615926504135132 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.92045788083358, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0478515625, + "learning_rate": 8.702596518921175e-05, + "loss": 0.0086, + "macro_f1": 0.6603773832321167, + "num_tokens": 14056645.0, + "repeat_count": 1.0, + "routers_loss": 0.03460995852947235, + "skip_count": 1.0, + "step": 8716, + "text_loss": 0.19412031769752502 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.929850308189025, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02001953125, + "learning_rate": 8.685155556366763e-05, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 14059604.0, + "repeat_count": 1.0, + "routers_loss": 0.0026834046002477407, + "skip_count": 2.0, + "step": 8718, + "text_loss": 0.4414670169353485 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 40.93924273554447, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 8.667730425738679e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14062170.0, + "repeat_count": 0.0, + "routers_loss": 0.01547359861433506, + "skip_count": 4.0, + "step": 8720, + "text_loss": 0.2850716710090637 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.94863516289991, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02978515625, + "learning_rate": 8.650321133714267e-05, + "loss": 0.0074, + "macro_f1": 0.3333333432674408, + "num_tokens": 14065526.0, + "repeat_count": 0.0, + "routers_loss": 0.0020194994285702705, + "skip_count": 0.0, + "step": 8722, + "text_loss": 0.1776508241891861 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.958027590255355, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0225830078125, + "learning_rate": 8.632927686964798e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 14068525.0, + "repeat_count": 0.0, + "routers_loss": 0.0037195945624262094, + "skip_count": 0.0, + "step": 8724, + "text_loss": 0.2786005735397339 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 40.9674200176108, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0220947265625, + "learning_rate": 8.615550092155477e-05, + "loss": 0.0058, + "macro_f1": 1.0, + "num_tokens": 14071830.0, + "repeat_count": 1.0, + "routers_loss": 0.008169961161911488, + "skip_count": 4.0, + "step": 8726, + "text_loss": 0.43228310346603394 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.97681244496625, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02587890625, + "learning_rate": 8.598188355945424e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14074977.0, + "repeat_count": 0.0, + "routers_loss": 0.006407112814486027, + "skip_count": 1.0, + "step": 8728, + "text_loss": 0.24443474411964417 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 40.98620487232169, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0498046875, + "learning_rate": 8.580842484987689e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14078104.0, + "repeat_count": 0.0, + "routers_loss": 0.001878641895018518, + "skip_count": 1.0, + "step": 8730, + "text_loss": 0.4559098184108734 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 40.99559729967714, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.033447265625, + "learning_rate": 8.563512485929253e-05, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 14081934.0, + "repeat_count": 0.0, + "routers_loss": 0.0056114462204277515, + "skip_count": 0.0, + "step": 8732, + "text_loss": 0.3063429594039917 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.004696213677725, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 8.546198365411007e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 14085097.0, + "repeat_count": 1.0, + "routers_loss": 0.001542840269394219, + "skip_count": 0.0, + "step": 8734, + "text_loss": 0.7624274492263794 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.01408864103317, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 8.528900130067741e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 14088630.0, + "repeat_count": 0.0, + "routers_loss": 0.002677374053746462, + "skip_count": 0.0, + "step": 8736, + "text_loss": 0.18395234644412994 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.02348106838861, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 8.511617786528175e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 14091513.0, + "repeat_count": 1.0, + "routers_loss": 0.004059800878167152, + "skip_count": 0.0, + "step": 8738, + "text_loss": 0.4567817449569702 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 41.032873495744056, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.040771484375, + "learning_rate": 8.494351341414947e-05, + "loss": 0.0066, + "macro_f1": 1.0, + "num_tokens": 14094500.0, + "repeat_count": 1.0, + "routers_loss": 0.0023724427446722984, + "skip_count": 1.0, + "step": 8740, + "text_loss": 0.6925744414329529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.0422659230995, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0155029296875, + "learning_rate": 8.477100801344573e-05, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 14097518.0, + "repeat_count": 0.0, + "routers_loss": 0.0013842503540217876, + "skip_count": 2.0, + "step": 8742, + "text_loss": 0.6574832201004028 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.05165835045494, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 8.459866172927505e-05, + "loss": 0.0073, + "macro_f1": 0.6666666865348816, + "num_tokens": 14101219.0, + "repeat_count": 0.0, + "routers_loss": 0.003597316099330783, + "skip_count": 2.0, + "step": 8744, + "text_loss": 0.785912036895752 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.800000011920929, + "avg_layers": 24.0, + "epoch": 41.061050777810394, + "f1_execute": 0.978723406791687, + "f1_repeat": 0.0, + "f1_skip": 0.888888955116272, + "grad_norm": 0.027099609375, + "learning_rate": 8.442647462768082e-05, + "loss": 0.0066, + "macro_f1": 0.6225374937057495, + "num_tokens": 14104460.0, + "repeat_count": 0.0, + "routers_loss": 0.01929798349738121, + "skip_count": 5.0, + "step": 8746, + "text_loss": 0.2111714482307434 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.07044320516584, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 8.425444677464545e-05, + "loss": 0.005, + "macro_f1": 0.3333333432674408, + "num_tokens": 14107404.0, + "repeat_count": 0.0, + "routers_loss": 0.00048497592797502875, + "skip_count": 0.0, + "step": 8748, + "text_loss": 0.4764930307865143 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.07983563252128, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 8.408257823609033e-05, + "loss": 0.0048, + "macro_f1": 1.0, + "num_tokens": 14109917.0, + "repeat_count": 1.0, + "routers_loss": 0.007886217907071114, + "skip_count": 2.0, + "step": 8750, + "text_loss": 0.2771969735622406 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.089228059876724, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0201416015625, + "learning_rate": 8.391086907787587e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 14112649.0, + "repeat_count": 0.0, + "routers_loss": 0.006535434629768133, + "skip_count": 0.0, + "step": 8752, + "text_loss": 0.1550854742527008 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.09862048723217, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 8.373931936580114e-05, + "loss": 0.0062, + "macro_f1": 0.3333333432674408, + "num_tokens": 14116044.0, + "repeat_count": 0.0, + "routers_loss": 0.002130605047568679, + "skip_count": 0.0, + "step": 8754, + "text_loss": 0.4055478870868683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.10801291458761, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02197265625, + "learning_rate": 8.356792916560457e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14119097.0, + "repeat_count": 0.0, + "routers_loss": 0.0005611231899820268, + "skip_count": 0.0, + "step": 8756, + "text_loss": 0.47804903984069824 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 41.117405341943055, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0283203125, + "learning_rate": 8.339669854296316e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14122079.0, + "repeat_count": 2.0, + "routers_loss": 0.005650801584124565, + "skip_count": 0.0, + "step": 8758, + "text_loss": 0.1968296617269516 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.126797769298506, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 8.322562756349273e-05, + "loss": 0.0048, + "macro_f1": 0.6666666865348816, + "num_tokens": 14124910.0, + "repeat_count": 0.0, + "routers_loss": 0.0035948604345321655, + "skip_count": 1.0, + "step": 8760, + "text_loss": 0.4988253712654114 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.13619019665395, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03515625, + "learning_rate": 8.305471629274802e-05, + "loss": 0.0078, + "macro_f1": 0.3333333432674408, + "num_tokens": 14127767.0, + "repeat_count": 0.0, + "routers_loss": 0.0012090947711840272, + "skip_count": 0.0, + "step": 8762, + "text_loss": 0.6330704689025879 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.14558262400939, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.019287109375, + "learning_rate": 8.288396479622262e-05, + "loss": 0.0041, + "macro_f1": 0.6666666865348816, + "num_tokens": 14130766.0, + "repeat_count": 0.0, + "routers_loss": 0.0010853242129087448, + "skip_count": 1.0, + "step": 8764, + "text_loss": 0.43057000637054443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.154975051364836, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 8.271337313934868e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14133804.0, + "repeat_count": 0.0, + "routers_loss": 0.0037055034190416336, + "skip_count": 2.0, + "step": 8766, + "text_loss": 0.31973564624786377 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.16436747872028, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03662109375, + "learning_rate": 8.254294138749741e-05, + "loss": 0.0045, + "macro_f1": 0.3333333432674408, + "num_tokens": 14137164.0, + "repeat_count": 0.0, + "routers_loss": 0.005338407587260008, + "skip_count": 0.0, + "step": 8768, + "text_loss": 0.5066531896591187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.17375990607572, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.020751953125, + "learning_rate": 8.237266960597844e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 14140119.0, + "repeat_count": 0.0, + "routers_loss": 0.0014707009540870786, + "skip_count": 1.0, + "step": 8770, + "text_loss": 0.553493857383728 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.183152333431174, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0223388671875, + "learning_rate": 8.220255786004033e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 14143223.0, + "repeat_count": 0.0, + "routers_loss": 0.002113121096044779, + "skip_count": 0.0, + "step": 8772, + "text_loss": 0.40016281604766846 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.19254476078662, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0179443359375, + "learning_rate": 8.203260621487019e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 14146366.0, + "repeat_count": 0.0, + "routers_loss": 0.002210963051766157, + "skip_count": 1.0, + "step": 8774, + "text_loss": 0.44022905826568604 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.20193718814206, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0264892578125, + "learning_rate": 8.186281473559382e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 14150009.0, + "repeat_count": 0.0, + "routers_loss": 0.0011857844656333327, + "skip_count": 0.0, + "step": 8776, + "text_loss": 0.572823703289032 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.211329615497505, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02734375, + "learning_rate": 8.169318348727544e-05, + "loss": 0.0042, + "macro_f1": 0.6666666865348816, + "num_tokens": 14153343.0, + "repeat_count": 0.0, + "routers_loss": 0.0020397785119712353, + "skip_count": 1.0, + "step": 8778, + "text_loss": 0.5724276900291443 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.22072204285295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 8.152371253491841e-05, + "loss": 0.0044, + "macro_f1": 0.3333333432674408, + "num_tokens": 14156392.0, + "repeat_count": 0.0, + "routers_loss": 0.001745635992847383, + "skip_count": 0.0, + "step": 8780, + "text_loss": 0.14162923395633698 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.23011447020839, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.038818359375, + "learning_rate": 8.135440194346416e-05, + "loss": 0.0035, + "macro_f1": 0.3333333432674408, + "num_tokens": 14159616.0, + "repeat_count": 0.0, + "routers_loss": 0.002799858106300235, + "skip_count": 0.0, + "step": 8782, + "text_loss": 0.18205340206623077 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.23950689756384, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0250244140625, + "learning_rate": 8.118525177779284e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14163531.0, + "repeat_count": 1.0, + "routers_loss": 0.0029223538003861904, + "skip_count": 0.0, + "step": 8784, + "text_loss": 0.4107058644294739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.248899324919286, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.01904296875, + "learning_rate": 8.101626210272311e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14166776.0, + "repeat_count": 0.0, + "routers_loss": 0.001209643087349832, + "skip_count": 0.0, + "step": 8786, + "text_loss": 0.6441596746444702 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.25829175227473, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.036376953125, + "learning_rate": 8.084743298301211e-05, + "loss": 0.0059, + "macro_f1": 0.3333333432674408, + "num_tokens": 14169586.0, + "repeat_count": 0.0, + "routers_loss": 0.0015196573222056031, + "skip_count": 0.0, + "step": 8788, + "text_loss": 0.35585930943489075 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.26768417963017, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02685546875, + "learning_rate": 8.067876448335549e-05, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 14174180.0, + "repeat_count": 0.0, + "routers_loss": 0.0004388966190163046, + "skip_count": 0.0, + "step": 8790, + "text_loss": 0.31594613194465637 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.27707660698562, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.044189453125, + "learning_rate": 8.05102566683873e-05, + "loss": 0.008, + "macro_f1": 0.6666666865348816, + "num_tokens": 14177950.0, + "repeat_count": 1.0, + "routers_loss": 0.0031201441306620836, + "skip_count": 0.0, + "step": 8792, + "text_loss": 0.3161006569862366 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.28646903434106, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.031982421875, + "learning_rate": 8.034190960268012e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14180642.0, + "repeat_count": 0.0, + "routers_loss": 0.001848527928814292, + "skip_count": 0.0, + "step": 8794, + "text_loss": 0.47571417689323425 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.295861461696504, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.025634765625, + "learning_rate": 8.017372335074486e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 14183743.0, + "repeat_count": 0.0, + "routers_loss": 0.0043064444325864315, + "skip_count": 1.0, + "step": 8796, + "text_loss": 0.5976942777633667 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.305253889051954, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030029296875, + "learning_rate": 8.000569797703072e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 14187742.0, + "repeat_count": 0.0, + "routers_loss": 0.005383181851357222, + "skip_count": 2.0, + "step": 8798, + "text_loss": 0.2692606449127197 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.3146463164074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0206298828125, + "learning_rate": 7.983783354592544e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 14191211.0, + "repeat_count": 0.0, + "routers_loss": 0.001401974936015904, + "skip_count": 0.0, + "step": 8800, + "text_loss": 0.38108205795288086 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.32403874376284, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02490234375, + "learning_rate": 7.967013012175478e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14194992.0, + "repeat_count": 0.0, + "routers_loss": 0.001168998540379107, + "skip_count": 0.0, + "step": 8802, + "text_loss": 0.5201764106750488 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.333431171118285, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.05322265625, + "learning_rate": 7.950258776878332e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 14198059.0, + "repeat_count": 0.0, + "routers_loss": 0.0032015808392316103, + "skip_count": 2.0, + "step": 8804, + "text_loss": 0.6014752984046936 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.34282359847373, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025390625, + "learning_rate": 7.933520655121351e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14202313.0, + "repeat_count": 0.0, + "routers_loss": 0.0009403078584000468, + "skip_count": 0.0, + "step": 8806, + "text_loss": 0.54194176197052 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.35221602582917, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 7.916798653318607e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14205534.0, + "repeat_count": 0.0, + "routers_loss": 0.0027781077660620213, + "skip_count": 1.0, + "step": 8808, + "text_loss": 0.7181227803230286 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.36160845318462, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0203857421875, + "learning_rate": 7.900092777878004e-05, + "loss": 0.0049, + "macro_f1": 0.6666666865348816, + "num_tokens": 14209357.0, + "repeat_count": 0.0, + "routers_loss": 0.0034586815163493156, + "skip_count": 1.0, + "step": 8810, + "text_loss": 0.21651209890842438 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 41.371000880540066, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 7.883403035201265e-05, + "loss": 0.0056, + "macro_f1": 1.0, + "num_tokens": 14212328.0, + "repeat_count": 1.0, + "routers_loss": 0.01194343063980341, + "skip_count": 4.0, + "step": 8812, + "text_loss": 0.20523512363433838 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 41.38039330789551, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0157470703125, + "learning_rate": 7.866729431683938e-05, + "loss": 0.0038, + "macro_f1": 1.0, + "num_tokens": 14214979.0, + "repeat_count": 1.0, + "routers_loss": 0.0045132869854569435, + "skip_count": 1.0, + "step": 8814, + "text_loss": 0.4066837728023529 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.38978573525095, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0181884765625, + "learning_rate": 7.850071973715368e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 14219030.0, + "repeat_count": 0.0, + "routers_loss": 0.005109346006065607, + "skip_count": 2.0, + "step": 8816, + "text_loss": 0.12459450960159302 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.3991781626064, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0220947265625, + "learning_rate": 7.833430667678737e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 14222117.0, + "repeat_count": 0.0, + "routers_loss": 0.0036401136312633753, + "skip_count": 0.0, + "step": 8818, + "text_loss": 0.3759046494960785 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 41.40857058996184, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0240478515625, + "learning_rate": 7.816805519951008e-05, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 14225546.0, + "repeat_count": 2.0, + "routers_loss": 0.006177824921905994, + "skip_count": 1.0, + "step": 8820, + "text_loss": 0.4031941592693329 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 23.0, + "epoch": 41.41796301731729, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032470703125, + "learning_rate": 7.800196536902987e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 14228731.0, + "repeat_count": 0.0, + "routers_loss": 0.009549650363624096, + "skip_count": 5.0, + "step": 8822, + "text_loss": 0.2895966172218323 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.427355444672735, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.030517578125, + "learning_rate": 7.783603724899258e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14231796.0, + "repeat_count": 0.0, + "routers_loss": 0.005532847251743078, + "skip_count": 2.0, + "step": 8824, + "text_loss": 0.32433390617370605 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.43674787202818, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.032958984375, + "learning_rate": 7.767027090298206e-05, + "loss": 0.0051, + "macro_f1": 0.3333333432674408, + "num_tokens": 14235869.0, + "repeat_count": 0.0, + "routers_loss": 0.0011165215400978923, + "skip_count": 0.0, + "step": 8826, + "text_loss": 0.41239091753959656 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.44614029938362, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 7.750466639452059e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 14238830.0, + "repeat_count": 0.0, + "routers_loss": 0.0007845646468922496, + "skip_count": 0.0, + "step": 8828, + "text_loss": 0.5113243460655212 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.455532726739065, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0267333984375, + "learning_rate": 7.733922378706787e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 14241672.0, + "repeat_count": 0.0, + "routers_loss": 0.0029602700378745794, + "skip_count": 1.0, + "step": 8830, + "text_loss": 0.22004501521587372 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 29.0, + "epoch": 41.46492515409451, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 7.717394314402199e-05, + "loss": 0.0037, + "macro_f1": 1.0, + "num_tokens": 14244522.0, + "repeat_count": 2.0, + "routers_loss": 0.005297200754284859, + "skip_count": 1.0, + "step": 8832, + "text_loss": 0.6039504408836365 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.47431758144996, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.041015625, + "learning_rate": 7.700882452871872e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 14246964.0, + "repeat_count": 0.0, + "routers_loss": 0.0018059068825095892, + "skip_count": 2.0, + "step": 8834, + "text_loss": 0.46563026309013367 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.4837100088054, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0341796875, + "learning_rate": 7.684386800443177e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 14249387.0, + "repeat_count": 0.0, + "routers_loss": 0.005659483838826418, + "skip_count": 2.0, + "step": 8836, + "text_loss": 0.31516948342323303 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.49310243616085, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033447265625, + "learning_rate": 7.667907363437288e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 14252438.0, + "repeat_count": 0.0, + "routers_loss": 0.011170750483870506, + "skip_count": 1.0, + "step": 8838, + "text_loss": 0.22867503762245178 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.50249486351629, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0279541015625, + "learning_rate": 7.651444148169157e-05, + "loss": 0.0064, + "macro_f1": 0.6666666865348816, + "num_tokens": 14255490.0, + "repeat_count": 0.0, + "routers_loss": 0.004106760956346989, + "skip_count": 2.0, + "step": 8840, + "text_loss": 0.5757828950881958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.511887290871734, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0390625, + "learning_rate": 7.634997160947499e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 14258430.0, + "repeat_count": 0.0, + "routers_loss": 0.0008562540751881897, + "skip_count": 0.0, + "step": 8842, + "text_loss": 0.5166661143302917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.52127971822718, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0198974609375, + "learning_rate": 7.618566408074862e-05, + "loss": 0.0036, + "macro_f1": 0.3333333432674408, + "num_tokens": 14261275.0, + "repeat_count": 0.0, + "routers_loss": 0.0012901517329737544, + "skip_count": 0.0, + "step": 8844, + "text_loss": 0.7376981973648071 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.53067214558262, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0303955078125, + "learning_rate": 7.602151895847526e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 14264698.0, + "repeat_count": 0.0, + "routers_loss": 0.00267209205776453, + "skip_count": 0.0, + "step": 8846, + "text_loss": 0.5249470472335815 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 22.0, + "epoch": 41.54006457293807, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03955078125, + "learning_rate": 7.585753630555565e-05, + "loss": 0.009, + "macro_f1": 1.0, + "num_tokens": 14267887.0, + "repeat_count": 1.0, + "routers_loss": 0.015334542840719223, + "skip_count": 7.0, + "step": 8848, + "text_loss": 1.1539889574050903 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.549457000293515, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.017578125, + "learning_rate": 7.569371618482818e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 14271392.0, + "repeat_count": 0.0, + "routers_loss": 0.0010222389828413725, + "skip_count": 0.0, + "step": 8850, + "text_loss": 0.33968010544776917 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.55884942764896, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0322265625, + "learning_rate": 7.553005865906914e-05, + "loss": 0.0064, + "macro_f1": 0.3333333432674408, + "num_tokens": 14274658.0, + "repeat_count": 0.0, + "routers_loss": 0.0006116362637840211, + "skip_count": 0.0, + "step": 8852, + "text_loss": 0.7514221668243408 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.5682418550044, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.025634765625, + "learning_rate": 7.536656379099221e-05, + "loss": 0.0041, + "macro_f1": 0.3333333432674408, + "num_tokens": 14277763.0, + "repeat_count": 0.0, + "routers_loss": 0.0036474792286753654, + "skip_count": 0.0, + "step": 8854, + "text_loss": 0.3964846134185791 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.577634282359845, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0225830078125, + "learning_rate": 7.520323164324921e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14281165.0, + "repeat_count": 0.0, + "routers_loss": 0.005498840939253569, + "skip_count": 1.0, + "step": 8856, + "text_loss": 0.2235594391822815 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 30.0, + "epoch": 41.58702670971529, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0211181640625, + "learning_rate": 7.504006227842919e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 14284761.0, + "repeat_count": 2.0, + "routers_loss": 0.006513409782201052, + "skip_count": 0.0, + "step": 8858, + "text_loss": 0.45196816325187683 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.59641913707074, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03466796875, + "learning_rate": 7.48770557590589e-05, + "loss": 0.0071, + "macro_f1": 0.3333333432674408, + "num_tokens": 14287844.0, + "repeat_count": 0.0, + "routers_loss": 0.0013065916718915105, + "skip_count": 0.0, + "step": 8860, + "text_loss": 0.2188033014535904 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.60581156442618, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.031005859375, + "learning_rate": 7.471421214760287e-05, + "loss": 0.0083, + "macro_f1": 0.6666666865348816, + "num_tokens": 14291280.0, + "repeat_count": 1.0, + "routers_loss": 0.0016644994029775262, + "skip_count": 0.0, + "step": 8862, + "text_loss": 0.7049906253814697 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.61520399178163, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0263671875, + "learning_rate": 7.455153150646299e-05, + "loss": 0.0051, + "macro_f1": 0.6666666865348816, + "num_tokens": 14294330.0, + "repeat_count": 1.0, + "routers_loss": 0.002664943691343069, + "skip_count": 0.0, + "step": 8864, + "text_loss": 0.2160239815711975 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.62459641913707, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.02001953125, + "learning_rate": 7.43890138979788e-05, + "loss": 0.0039, + "macro_f1": 0.6666666865348816, + "num_tokens": 14298355.0, + "repeat_count": 1.0, + "routers_loss": 0.0035776710137724876, + "skip_count": 0.0, + "step": 8866, + "text_loss": 0.4922088384628296 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.633988846492514, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0257568359375, + "learning_rate": 7.422665938442741e-05, + "loss": 0.0033, + "macro_f1": 0.6666666865348816, + "num_tokens": 14301452.0, + "repeat_count": 0.0, + "routers_loss": 0.0029914912302047014, + "skip_count": 2.0, + "step": 8868, + "text_loss": 0.5828475952148438 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.64338127384796, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.029052734375, + "learning_rate": 7.406446802802331e-05, + "loss": 0.0045, + "macro_f1": 1.0, + "num_tokens": 14304667.0, + "repeat_count": 1.0, + "routers_loss": 0.0010031569981947541, + "skip_count": 2.0, + "step": 8870, + "text_loss": 0.657244861125946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.65277370120341, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0274658203125, + "learning_rate": 7.390243989091849e-05, + "loss": 0.0046, + "macro_f1": 0.6666666865348816, + "num_tokens": 14307397.0, + "repeat_count": 0.0, + "routers_loss": 0.007960405200719833, + "skip_count": 1.0, + "step": 8872, + "text_loss": 0.3147352635860443 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.66216612855885, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0244140625, + "learning_rate": 7.37405750352026e-05, + "loss": 0.004, + "macro_f1": 1.0, + "num_tokens": 14310687.0, + "repeat_count": 1.0, + "routers_loss": 0.007953251712024212, + "skip_count": 3.0, + "step": 8874, + "text_loss": 0.30315887928009033 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.671558555914295, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.018310546875, + "learning_rate": 7.357887352290227e-05, + "loss": 0.0033, + "macro_f1": 0.3333333432674408, + "num_tokens": 14314007.0, + "repeat_count": 0.0, + "routers_loss": 0.0012103051412850618, + "skip_count": 0.0, + "step": 8876, + "text_loss": 0.6356115341186523 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.68095098326974, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 7.341733541598217e-05, + "loss": 0.0063, + "macro_f1": 0.6666666865348816, + "num_tokens": 14316696.0, + "repeat_count": 0.0, + "routers_loss": 0.0017898730002343655, + "skip_count": 1.0, + "step": 8878, + "text_loss": 0.35877764225006104 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.69034341062518, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.047119140625, + "learning_rate": 7.325596077634383e-05, + "loss": 0.0068, + "macro_f1": 0.3333333432674408, + "num_tokens": 14320172.0, + "repeat_count": 0.0, + "routers_loss": 0.0007144945557229221, + "skip_count": 0.0, + "step": 8880, + "text_loss": 0.7939266562461853 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.699735837980626, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.039306640625, + "learning_rate": 7.309474966582635e-05, + "loss": 0.0052, + "macro_f1": 0.3333333432674408, + "num_tokens": 14323262.0, + "repeat_count": 0.0, + "routers_loss": 0.001255290349945426, + "skip_count": 0.0, + "step": 8882, + "text_loss": 0.7115976810455322 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.70912826533607, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 7.293370214620616e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14326826.0, + "repeat_count": 0.0, + "routers_loss": 0.0028131126891821623, + "skip_count": 2.0, + "step": 8884, + "text_loss": 0.24073036015033722 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.71852069269152, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0264892578125, + "learning_rate": 7.277281827919691e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14329658.0, + "repeat_count": 0.0, + "routers_loss": 0.0024797592777758837, + "skip_count": 1.0, + "step": 8886, + "text_loss": 0.47276070713996887 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 41.72791312004696, + "f1_execute": 0.9795917868614197, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037109375, + "learning_rate": 7.26120981264496e-05, + "loss": 0.0081, + "macro_f1": 0.6598639488220215, + "num_tokens": 14333584.0, + "repeat_count": 1.0, + "routers_loss": 0.023670634254813194, + "skip_count": 3.0, + "step": 8888, + "text_loss": 0.47537583112716675 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.73730554740241, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.04541015625, + "learning_rate": 7.245154174955254e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 14336850.0, + "repeat_count": 0.0, + "routers_loss": 0.0009583478095009923, + "skip_count": 0.0, + "step": 8890, + "text_loss": 0.5258943438529968 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 41.74669797475785, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.024169921875, + "learning_rate": 7.229114921003116e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 14339940.0, + "repeat_count": 0.0, + "routers_loss": 0.006664840504527092, + "skip_count": 3.0, + "step": 8892, + "text_loss": 0.20986922085285187 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.756090402113294, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.03857421875, + "learning_rate": 7.213092056934833e-05, + "loss": 0.0057, + "macro_f1": 0.3333333432674408, + "num_tokens": 14342737.0, + "repeat_count": 0.0, + "routers_loss": 0.0005362578085623682, + "skip_count": 0.0, + "step": 8894, + "text_loss": 0.5174402594566345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.76548282946874, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.037353515625, + "learning_rate": 7.197085588890383e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 14345769.0, + "repeat_count": 0.0, + "routers_loss": 0.006428950000554323, + "skip_count": 1.0, + "step": 8896, + "text_loss": 0.657136857509613 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.77487525682419, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.0257568359375, + "learning_rate": 7.181095523003478e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14348563.0, + "repeat_count": 1.0, + "routers_loss": 0.0015549053205177188, + "skip_count": 0.0, + "step": 8898, + "text_loss": 0.49799686670303345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.78426768417963, + "f1_execute": 0.9629629254341125, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0281982421875, + "learning_rate": 7.165121865401535e-05, + "loss": 0.0068, + "macro_f1": 0.32098764181137085, + "num_tokens": 14353134.0, + "repeat_count": 0.0, + "routers_loss": 0.030110027641057968, + "skip_count": 2.0, + "step": 8900, + "text_loss": 0.3644331693649292 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 41.793660111535075, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.03466796875, + "learning_rate": 7.149164622205712e-05, + "loss": 0.0072, + "macro_f1": 1.0, + "num_tokens": 14356031.0, + "repeat_count": 1.0, + "routers_loss": 0.0014812488807365298, + "skip_count": 1.0, + "step": 8902, + "text_loss": 0.46983054280281067 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.80305253889052, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 7.133223799530836e-05, + "loss": 0.0037, + "macro_f1": 0.3333333432674408, + "num_tokens": 14358941.0, + "repeat_count": 0.0, + "routers_loss": 0.001170543720945716, + "skip_count": 0.0, + "step": 8904, + "text_loss": 0.7030026316642761 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 41.81244496624596, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.034912109375, + "learning_rate": 7.117299403485466e-05, + "loss": 0.0085, + "macro_f1": 1.0, + "num_tokens": 14361807.0, + "repeat_count": 1.0, + "routers_loss": 0.0011649372754618526, + "skip_count": 1.0, + "step": 8906, + "text_loss": 0.44989535212516785 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.821837393601406, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0213623046875, + "learning_rate": 7.101391440171856e-05, + "loss": 0.0054, + "macro_f1": 0.3333333432674408, + "num_tokens": 14365464.0, + "repeat_count": 0.0, + "routers_loss": 0.0028165180701762438, + "skip_count": 0.0, + "step": 8908, + "text_loss": 0.487165629863739 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.83122982095686, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03759765625, + "learning_rate": 7.085499915685978e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 14368149.0, + "repeat_count": 0.0, + "routers_loss": 0.001956705003976822, + "skip_count": 2.0, + "step": 8910, + "text_loss": 0.3717629909515381 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.8406222483123, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.040283203125, + "learning_rate": 7.069624836117484e-05, + "loss": 0.0043, + "macro_f1": 0.6666666865348816, + "num_tokens": 14371440.0, + "repeat_count": 0.0, + "routers_loss": 0.0027164234779775143, + "skip_count": 1.0, + "step": 8912, + "text_loss": 0.3683965802192688 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.850014675667744, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.039794921875, + "learning_rate": 7.053766207549734e-05, + "loss": 0.009, + "macro_f1": 0.6666666865348816, + "num_tokens": 14374965.0, + "repeat_count": 0.0, + "routers_loss": 0.005999395158141851, + "skip_count": 2.0, + "step": 8914, + "text_loss": 0.6271854639053345 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.85940710302319, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0361328125, + "learning_rate": 7.037924036059789e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 14378445.0, + "repeat_count": 0.0, + "routers_loss": 0.000978486379608512, + "skip_count": 0.0, + "step": 8916, + "text_loss": 0.5927628874778748 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.86879953037863, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02978515625, + "learning_rate": 7.022098327718401e-05, + "loss": 0.0056, + "macro_f1": 0.6666666865348816, + "num_tokens": 14382851.0, + "repeat_count": 0.0, + "routers_loss": 0.012569266371428967, + "skip_count": 1.0, + "step": 8918, + "text_loss": 0.4092319905757904 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.5, + "avg_layers": 27.0, + "epoch": 41.878191957734074, + "f1_execute": 0.9811320900917053, + "f1_repeat": 0.0, + "f1_skip": 0.6666666865348816, + "grad_norm": 0.03564453125, + "learning_rate": 7.006289088590007e-05, + "loss": 0.0065, + "macro_f1": 0.5492662787437439, + "num_tokens": 14386959.0, + "repeat_count": 0.0, + "routers_loss": 0.011032132431864738, + "skip_count": 2.0, + "step": 8920, + "text_loss": 0.6553854942321777 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.887584385089525, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.048095703125, + "learning_rate": 6.990496324732737e-05, + "loss": 0.0046, + "macro_f1": 0.3333333432674408, + "num_tokens": 14390031.0, + "repeat_count": 0.0, + "routers_loss": 0.001376329455524683, + "skip_count": 0.0, + "step": 8922, + "text_loss": 0.7792862057685852 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.89697681244497, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0361328125, + "learning_rate": 6.974720042198396e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 14392966.0, + "repeat_count": 0.0, + "routers_loss": 0.005924372002482414, + "skip_count": 2.0, + "step": 8924, + "text_loss": 0.4466548562049866 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.90636923980041, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.033203125, + "learning_rate": 6.958960247032515e-05, + "loss": 0.0059, + "macro_f1": 0.6666666865348816, + "num_tokens": 14395619.0, + "repeat_count": 0.0, + "routers_loss": 0.010054769925773144, + "skip_count": 2.0, + "step": 8926, + "text_loss": 0.24784758687019348 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.915761667155856, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0208740234375, + "learning_rate": 6.943216945274255e-05, + "loss": 0.0039, + "macro_f1": 0.3333333432674408, + "num_tokens": 14398891.0, + "repeat_count": 0.0, + "routers_loss": 0.0006864808965474367, + "skip_count": 0.0, + "step": 8928, + "text_loss": 0.5154114961624146 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.9251540945113, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.021240234375, + "learning_rate": 6.927490142956489e-05, + "loss": 0.0042, + "macro_f1": 0.3333333432674408, + "num_tokens": 14402991.0, + "repeat_count": 0.0, + "routers_loss": 0.000996887218207121, + "skip_count": 0.0, + "step": 8930, + "text_loss": 0.5888006091117859 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 41.93454652186674, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04443359375, + "learning_rate": 6.911779846105753e-05, + "loss": 0.0062, + "macro_f1": 0.6666666865348816, + "num_tokens": 14406276.0, + "repeat_count": 1.0, + "routers_loss": 0.0007863475475460291, + "skip_count": 0.0, + "step": 8932, + "text_loss": 0.6862632632255554 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.943938949222186, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.02197265625, + "learning_rate": 6.896086060742262e-05, + "loss": 0.0055, + "macro_f1": 0.6666666865348816, + "num_tokens": 14409005.0, + "repeat_count": 0.0, + "routers_loss": 0.0020060581155121326, + "skip_count": 1.0, + "step": 8934, + "text_loss": 0.8998132348060608 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 41.95333137657764, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.036865234375, + "learning_rate": 6.880408792879905e-05, + "loss": 0.0047, + "macro_f1": 1.0, + "num_tokens": 14411902.0, + "repeat_count": 2.0, + "routers_loss": 0.008094016462564468, + "skip_count": 3.0, + "step": 8936, + "text_loss": 0.3411460518836975 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.96272380393308, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0247802734375, + "learning_rate": 6.864748048526237e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 14414683.0, + "repeat_count": 0.0, + "routers_loss": 0.004374993033707142, + "skip_count": 0.0, + "step": 8938, + "text_loss": 0.24222217500209808 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.972116231288524, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.043212890625, + "learning_rate": 6.84910383368249e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 14417740.0, + "repeat_count": 0.0, + "routers_loss": 0.003004335332661867, + "skip_count": 2.0, + "step": 8940, + "text_loss": 0.5524137020111084 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 41.98150865864397, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.032958984375, + "learning_rate": 6.83347615434356e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 14420678.0, + "repeat_count": 0.0, + "routers_loss": 0.007001105695962906, + "skip_count": 2.0, + "step": 8942, + "text_loss": 0.3124033212661743 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 41.99090108599941, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0216064453125, + "learning_rate": 6.817865016497993e-05, + "loss": 0.0055, + "macro_f1": 0.3333333432674408, + "num_tokens": 14424259.0, + "repeat_count": 0.0, + "routers_loss": 0.0038414683658629656, + "skip_count": 0.0, + "step": 8944, + "text_loss": 0.509667694568634 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.0, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.060791015625, + "learning_rate": 6.80227042612801e-05, + "loss": 0.0067, + "macro_f1": 0.6666666865348816, + "num_tokens": 14427084.0, + "repeat_count": 1.0, + "routers_loss": 0.008573584258556366, + "skip_count": 0.0, + "step": 8946, + "text_loss": 0.2533438205718994 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.00939242735544, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.035888671875, + "learning_rate": 6.786692389209482e-05, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 14429690.0, + "repeat_count": 1.0, + "routers_loss": 0.003758789971470833, + "skip_count": 2.0, + "step": 8948, + "text_loss": 0.14571085572242737 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.01878485471089, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.06640625, + "learning_rate": 6.771130911711953e-05, + "loss": 0.0078, + "macro_f1": 0.6666666865348816, + "num_tokens": 14432983.0, + "repeat_count": 0.0, + "routers_loss": 0.005996126215904951, + "skip_count": 2.0, + "step": 8950, + "text_loss": 0.24994049966335297 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.02817728206633, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 6.755585999598613e-05, + "loss": 0.0032, + "macro_f1": 0.3333333432674408, + "num_tokens": 14435772.0, + "repeat_count": 0.0, + "routers_loss": 0.0012271527666598558, + "skip_count": 0.0, + "step": 8952, + "text_loss": 0.3705698549747467 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 42.03756970942178, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0673828125, + "learning_rate": 6.740057658826293e-05, + "loss": 0.0081, + "macro_f1": 1.0, + "num_tokens": 14438912.0, + "repeat_count": 1.0, + "routers_loss": 0.0017618577694520354, + "skip_count": 1.0, + "step": 8954, + "text_loss": 0.6691124439239502 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.046962136777225, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0262451171875, + "learning_rate": 6.72454589534548e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 14441959.0, + "repeat_count": 0.0, + "routers_loss": 0.0016956349136307836, + "skip_count": 1.0, + "step": 8956, + "text_loss": 0.45412346720695496 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.05635456413267, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0250244140625, + "learning_rate": 6.709050715100324e-05, + "loss": 0.0038, + "macro_f1": 0.6666666865348816, + "num_tokens": 14444804.0, + "repeat_count": 0.0, + "routers_loss": 0.017321301624178886, + "skip_count": 2.0, + "step": 8958, + "text_loss": 0.2668265998363495 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.06574699148811, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0216064453125, + "learning_rate": 6.69357212402859e-05, + "loss": 0.0057, + "macro_f1": 0.6666666865348816, + "num_tokens": 14447390.0, + "repeat_count": 0.0, + "routers_loss": 0.005267233122140169, + "skip_count": 2.0, + "step": 8960, + "text_loss": 0.35546016693115234 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 25.0, + "epoch": 42.075139418843555, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.017578125, + "learning_rate": 6.67811012806172e-05, + "loss": 0.004, + "macro_f1": 0.6666666865348816, + "num_tokens": 14451286.0, + "repeat_count": 0.0, + "routers_loss": 0.0045175012201070786, + "skip_count": 3.0, + "step": 8962, + "text_loss": 0.14669834077358246 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.084531846199, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0211181640625, + "learning_rate": 6.662664733124768e-05, + "loss": 0.0064, + "macro_f1": 1.0, + "num_tokens": 14454335.0, + "repeat_count": 1.0, + "routers_loss": 0.004905698820948601, + "skip_count": 3.0, + "step": 8964, + "text_loss": 0.28777357935905457 + }, + { + "acc_repeat": 0.5, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 42.09392427355445, + "f1_execute": 0.9803921580314636, + "f1_repeat": 0.6666666865348816, + "f1_skip": 1.0, + "grad_norm": 0.0224609375, + "learning_rate": 6.647235945136442e-05, + "loss": 0.0074, + "macro_f1": 0.8823530077934265, + "num_tokens": 14457708.0, + "repeat_count": 2.0, + "routers_loss": 0.032136883586645126, + "skip_count": 1.0, + "step": 8966, + "text_loss": 0.2317836582660675 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 28.0, + "epoch": 42.10331670090989, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.022705078125, + "learning_rate": 6.631823770009088e-05, + "loss": 0.0054, + "macro_f1": 1.0, + "num_tokens": 14460721.0, + "repeat_count": 1.0, + "routers_loss": 0.0038611628115177155, + "skip_count": 1.0, + "step": 8968, + "text_loss": 0.28979742527008057 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.11270912826534, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.02294921875, + "learning_rate": 6.616428213648656e-05, + "loss": 0.0056, + "macro_f1": 0.3333333432674408, + "num_tokens": 14463467.0, + "repeat_count": 0.0, + "routers_loss": 0.0006560821202583611, + "skip_count": 0.0, + "step": 8970, + "text_loss": 0.3474387526512146 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.12210155562078, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.020751953125, + "learning_rate": 6.60104928195479e-05, + "loss": 0.0074, + "macro_f1": 0.6666666865348816, + "num_tokens": 14466586.0, + "repeat_count": 1.0, + "routers_loss": 0.0016879125032573938, + "skip_count": 0.0, + "step": 8972, + "text_loss": 0.5454491972923279 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.131493982976224, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0240478515625, + "learning_rate": 6.58568698082071e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 14470125.0, + "repeat_count": 0.0, + "routers_loss": 0.0004945555119775236, + "skip_count": 0.0, + "step": 8974, + "text_loss": 0.4728975296020508 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.14088641033167, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.021240234375, + "learning_rate": 6.570341316133272e-05, + "loss": 0.0043, + "macro_f1": 1.0, + "num_tokens": 14473887.0, + "repeat_count": 2.0, + "routers_loss": 0.010141569189727306, + "skip_count": 3.0, + "step": 8976, + "text_loss": 0.24756617844104767 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.15027883768712, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0322265625, + "learning_rate": 6.555012293772967e-05, + "loss": 0.0051, + "macro_f1": 1.0, + "num_tokens": 14477046.0, + "repeat_count": 1.0, + "routers_loss": 0.011950359679758549, + "skip_count": 2.0, + "step": 8978, + "text_loss": 0.25375646352767944 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.15967126504256, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.019775390625, + "learning_rate": 6.539699919613911e-05, + "loss": 0.0049, + "macro_f1": 0.3333333432674408, + "num_tokens": 14480638.0, + "repeat_count": 0.0, + "routers_loss": 0.0007824545609764755, + "skip_count": 0.0, + "step": 8980, + "text_loss": 0.6888379454612732 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.169063692398005, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.03125, + "learning_rate": 6.524404199523826e-05, + "loss": 0.006, + "macro_f1": 0.6666666865348816, + "num_tokens": 14483723.0, + "repeat_count": 0.0, + "routers_loss": 0.004318726249039173, + "skip_count": 1.0, + "step": 8982, + "text_loss": 0.3603152334690094 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.17845611975345, + "f1_execute": 0.9818181991577148, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.037109375, + "learning_rate": 6.509125139364058e-05, + "loss": 0.0064, + "macro_f1": 0.3272727429866791, + "num_tokens": 14486876.0, + "repeat_count": 0.0, + "routers_loss": 0.010652635246515274, + "skip_count": 1.0, + "step": 8984, + "text_loss": 0.43394285440444946 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.18784854710889, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0238037109375, + "learning_rate": 6.493862744989587e-05, + "loss": 0.0081, + "macro_f1": 0.3333333432674408, + "num_tokens": 14489944.0, + "repeat_count": 0.0, + "routers_loss": 0.0010475299786776304, + "skip_count": 0.0, + "step": 8986, + "text_loss": 0.5952020287513733 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.197240974464336, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.031982421875, + "learning_rate": 6.478617022248984e-05, + "loss": 0.0053, + "macro_f1": 0.6666666865348816, + "num_tokens": 14493094.0, + "repeat_count": 0.0, + "routers_loss": 0.004329503979533911, + "skip_count": 1.0, + "step": 8988, + "text_loss": 0.7284399271011353 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.20663340181978, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.023681640625, + "learning_rate": 6.463387976984437e-05, + "loss": 0.0052, + "macro_f1": 0.6666666865348816, + "num_tokens": 14496944.0, + "repeat_count": 0.0, + "routers_loss": 0.0019588395953178406, + "skip_count": 1.0, + "step": 8990, + "text_loss": 0.8103306889533997 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 27.0, + "epoch": 42.21602582917523, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0308837890625, + "learning_rate": 6.448175615031749e-05, + "loss": 0.0044, + "macro_f1": 0.6666666865348816, + "num_tokens": 14499997.0, + "repeat_count": 0.0, + "routers_loss": 0.008046228438615799, + "skip_count": 1.0, + "step": 8992, + "text_loss": 0.14758773148059845 + }, + { + "acc_repeat": 1.0, + "acc_skip": 0.0, + "avg_layers": 29.0, + "epoch": 42.22541825653067, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 0.0, + "grad_norm": 0.04638671875, + "learning_rate": 6.432979942220319e-05, + "loss": 0.0082, + "macro_f1": 0.6666666865348816, + "num_tokens": 14503247.0, + "repeat_count": 1.0, + "routers_loss": 0.0028899910394102335, + "skip_count": 0.0, + "step": 8994, + "text_loss": 0.2568151652812958 + }, + { + "acc_repeat": 0.0, + "acc_skip": 1.0, + "avg_layers": 26.0, + "epoch": 42.23481068388612, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 1.0, + "grad_norm": 0.0311279296875, + "learning_rate": 6.417800964373161e-05, + "loss": 0.0045, + "macro_f1": 0.6666666865348816, + "num_tokens": 14506244.0, + "repeat_count": 0.0, + "routers_loss": 0.0042211092077195644, + "skip_count": 2.0, + "step": 8996, + "text_loss": 0.3506850600242615 + }, + { + "acc_repeat": 0.0, + "acc_skip": 0.0, + "avg_layers": 28.0, + "epoch": 42.24420311124156, + "f1_execute": 1.0, + "f1_repeat": 0.0, + "f1_skip": 0.0, + "grad_norm": 0.0244140625, + "learning_rate": 6.402638687306872e-05, + "loss": 0.0038, + "macro_f1": 0.3333333432674408, + "num_tokens": 14510502.0, + "repeat_count": 0.0, + "routers_loss": 0.003309462917968631, + "skip_count": 0.0, + "step": 8998, + "text_loss": 0.5852319598197937 + }, + { + "acc_repeat": 1.0, + "acc_skip": 1.0, + "avg_layers": 24.0, + "epoch": 42.253595538597004, + "f1_execute": 1.0, + "f1_repeat": 1.0, + "f1_skip": 1.0, + "grad_norm": 0.0303955078125, + "learning_rate": 6.387493116831699e-05, + "loss": 0.005, + "macro_f1": 1.0, + "num_tokens": 14513679.0, + "repeat_count": 1.0, + "routers_loss": 0.015246274881064892, + "skip_count": 5.0, + "step": 9000, + "text_loss": 0.4266709089279175 + } + ], + "logging_steps": 2, + "max_steps": 10650, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.4642240251563846e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-9000/training_args.bin b/checkpoint-9000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a3d3ae372faf14539639f54454aa52b6ee730c4a --- /dev/null +++ b/checkpoint-9000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65fc67a0218c3f1c750719f090b09e231bab97de5077e612026ee330c5558dd8 +size 5880